L1 Prediction from ELL Writing Samples

Part 3: Models

Author: Ryan Harper

In [1]:
# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

import time

# Data processing
import pandas as pd
import plotly as plo
import seaborn as sns
from scipy import stats
from collections import Counter
import numpy as np
import itertools

from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib

# Stats
from sklearn.metrics import classification_report, roc_curve,roc_auc_score,accuracy_score
from sklearn import metrics

# Preparing Models
from sklearn.model_selection import train_test_split

# Decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Models
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB
from sklearn.neural_network import MLPClassifier

# Ensemble
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

from IPython.display import Image
import pydotplus
import graphviz
import matplotlib.pyplot as plt

# import altair as alt

Import Features + Target

In [2]:
loadf = True
if loadf:
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
    del features['Unnamed: 0']
    %store -r blog
    %store -r keepfeatures
    features = blog

lang = list(features.language.unique())
In [3]:
del features['id']
del features['content']
del features['pos']
del features['pos2']
del features['pos3']
del features['tokens']
features = features[features['sc']<15]
In [4]:
Japanese               9536
Traditional Chinese    3407
Name: language, dtype: int64


y = features['language'].values.reshape(-1, 1).ravel() X = features[features.columns[~features.columns.str.contains('language')]]

Select Features: Features with biggest differences between L1s

In [ ]:
# Choose feature selection method
selectfeatures = 'all'
In [46]:
# Feature selection
if selectfeatures == 'best':
    nonbinary = ['letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','freq_score','full_freq_score']
    best_features = keepfeatures
elif selectfeatures == 'all':
    best_features = features.columns[~features.columns.str.contains('language')]
elif selectfeatures == 'bnb':
    best_features = bnb_sorted[0:500]

Truncated SVD: Determine best features

In [7]:
svdbool = False
if svdbool:
    svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
    # print(svd.explained_variance_ratio_)
    # print(svd.explained_variance_ratio_.sum())  
    # print(svd.singular_values_)
    # svd.get_params
    # best_features = [X.columns[i] for i in svd.components_[0].argsort()[::-1]]

Split Data to Train/Test: Even Distribution Sampling

In [8]:
# evenly_distributed_test = [60 japanese,60 english, 60 chinese, 60 korean]
langlist = []
for l in lang:
    langlist.append(features[features['language'] == l].sample(n=200, random_state=rs))

testset = langlist
Traditional Chinese
In [9]:
test_data = pd.concat(testset)
y_test = test_data['language'].values.reshape(-1, 1).ravel()
X_test = test_data[best_features]
In [10]:
train_data = features[~features.isin(test_data)].dropna()
y_train = train_data['language'].values.reshape(-1, 1).ravel()
X_train = train_data[best_features]
# X_train = svd.transform(train_data[train_data.columns[~train_data.columns.str.contains('language')]])
CPU times: user 27.5 s, sys: 14.1 s, total: 41.6 s
Wall time: 45 s
Wall time: 45 s

Create Function for Comparing Models

In [11]:
cols = ['name','time','total','precision','recall','f1']

model_set = pd.DataFrame(columns=cols)
models_stored = []
pattern = "%.2f"
In [26]:
def run_model(model,name):
    global model_set
    m = model
    m.fit(X_train, y_train)
    start = time.time()

    total_score = m.score(X_test,y_test)
    pscore = [pattern % i for i in list(metrics.precision_score(y_test, m.predict(X_test),labels=lang,average=None))]
    rscore = [pattern % i for i in list(metrics.recall_score(y_test, m.predict(X_test),labels=lang,average=None))]
    fscore = [pattern % i for i in list(metrics.f1_score(y_test, m.predict(X_test),labels=lang,average=None))]
    end = time.time()
    t= pattern % (end - start)
    cvs = cross_val_score(m, X_test,y_test)

    r = dict(zip(cols,[name,t,total_score,pscore,rscore,fscore]))
    print('1. Check for Overfitting: {}\n'.format(m.score(X_train,y_train)))
    print('2. Test Score is: {}\n'.format(total_score))
    print('3. Classification Report:')
    print(classification_report(y_test, m.predict(X_test)))
    print('4. Cross Val Score: {} ==> Avg: {} '.format(cvs,cvs.sum()/len(cvs)))
    model_set = model_set.append(r,ignore_index=True)
    return r,m

A. LR - Logistic Regression (top)

Target is binary so logistic regression will operate on probabilities

In [27]:
lreg_data,lreg = run_model(linear_model.LogisticRegression(),'Logistic Regression')
1. Check for Overfitting: 0.8251614446304711

2. Test Score is: 0.67

3. Classification Report:
                     precision    recall  f1-score   support

           Japanese       0.61      0.96      0.75       200
Traditional Chinese       0.91      0.38      0.53       200

        avg / total       0.76      0.67      0.64       400

4. Cross Val Score: [0.74626866 0.67910448 0.71969697] ==> Avg: 0.7150233680084427 

CPU times: user 8.18 s, sys: 2.88 s, total: 11.1 s
Wall time: 9.79 s
Wall time: 9.79 s
In [14]:
lreg_coef = lreg.coef_[0][-20:]

# Plot the coefficients
plt.plot(range(len(X_train.columns[-20:])), lreg_coef)
plt.xticks(range(len(X_train.columns[-20:])), X_train.columns, rotation=60)

E. K Nearest Neighbors (top)

Can handle discrete values for target
Quantitative values are limited (not continuous) and might be problematic for nearest neighbors

In [15]:
neighbors_data,neighbors = run_model(KNeighborsClassifier(n_neighbors=8),'K Nearest Neighbor')
Check for Overfitting: 0.7514948576895479

Test Score is: 0.51

                     precision    recall  f1-score   support

           Japanese       0.51      0.99      0.67       200
Traditional Chinese       0.75      0.03      0.06       200

        avg / total       0.63      0.51      0.36       400

Cross Val Score: [0.47761194 0.53731343 0.5530303 ] | Avg: 0.5226518920548772 

CPU times: user 1min 10s, sys: 5.32 s, total: 1min 15s
Wall time: 1min 22s
Wall time: 1min 22s

F. Naive Bayes - Bernoulli (top)

In [16]:
bnb_data,bnb = run_model(BernoulliNB(),'Naive Bayes - Bernoulli')
Check for Overfitting: 0.7373036753567727

Test Score is: 0.6575

                     precision    recall  f1-score   support

           Japanese       0.62      0.81      0.70       200
Traditional Chinese       0.72      0.51      0.60       200

        avg / total       0.67      0.66      0.65       400

Cross Val Score: [0.6119403  0.59701493 0.70454545] | Avg: 0.6378335594753505 

CPU times: user 19.4 s, sys: 8.65 s, total: 28 s
Wall time: 25.4 s
Wall time: 25.4 s
In [45]:
if selectfeatures != 'bnb':
    importance = dict(list(zip(X_train.columns,bnb.coef_[0])))
    bnb_sorted = sorted(importance, key=importance.get, reverse=True)
    %store bnb_sorted

    bnb_coef = []
    for r in bnb_sorted:
        # print(importance[r])
    if selectfeatures == 'best':
        bnb_coef = bnb_coef[::-1]
    # Plot the coefficients
    plt.plot(range(len(bnb_coef[0:50])), bnb_coef[0:50])
    plt.xticks(range(len(bnb_coef[0:50])), bnb_sorted[0:50], rotation=60)
Stored 'bnb_sorted' (list)
[print(c) for c in X_train.columns];

G. Decision Tree (top)

In [18]:
dt_data,dt = run_model(tree.DecisionTreeClassifier(criterion='entropy',max_depth=4),'Decision Tree')
Check for Overfitting: 0.7921549868452523

Test Score is: 0.6175

                     precision    recall  f1-score   support

           Japanese       0.57      0.97      0.72       200
Traditional Chinese       0.90      0.27      0.41       200

        avg / total       0.73      0.62      0.56       400

Cross Val Score: [0.64925373 0.67910448 0.65151515] | Avg: 0.6599577868234584 

CPU times: user 3.81 s, sys: 1.01 s, total: 4.82 s
Wall time: 4.9 s
Wall time: 4.9 s
In [19]:
# Render tree.
dot_data = tree.export_graphviz(
    label= 'root',

graph = pydotplus.graph_from_dot_data(dot_data)

dimportance = list(zip(best_features,dt.feature_importances_)) dimportance = dict(dimportance) a1_sorted_keys = sorted(dimportance, key=dimportance.get, reverse=True) p = [] for r in a1_sorted_keys: if dimportance[r] != 0: p.append(r) # print(r, dimportance[r]) print(p)

Good visualization of important features and presentation of entropy weighting

H. Random Forest (top)

Runs decision tree multiple times for best output
Longest processing time

In [20]:
rf_data,rf = run_model(ensemble.RandomForestClassifier(n_estimators=150,
                                                       max_depth=6),'Random Forest')
Check for Overfitting: 0.825560073347684

Test Score is: 0.6375

                     precision    recall  f1-score   support

           Japanese       0.58      0.95      0.72       200
Traditional Chinese       0.88      0.32      0.47       200

        avg / total       0.73      0.64      0.60       400

Cross Val Score: [0.70149254 0.75373134 0.77272727] | Avg: 0.7426503844414292 

CPU times: user 7min 23s, sys: 5.43 s, total: 7min 28s
Wall time: 9min 50s
Wall time: 9min 50s
cvs = cross_val_score(rf, X_test, y_test, cv=5) print(cvs.sum()/len(cvs))
In [21]:
importance = dict(list(zip(X_train.columns,rf.feature_importances_)))
rf_sorted = sorted(importance, key=importance.get, reverse=True)
for r in rf_sorted[0:5]:
    if importance[r] >0:
        print(r, importance[r])
%store rf_sorted
let2_ja 0.2218376960916456
sc 0.1593812123378788
pos2_DT-JJ 0.06808873632280418
pos2_NN-MD 0.038436240913713136
cap_let 0.033210016321393435

Stored 'rf_sorted' (list)
i_tree = 0 for tree_in_forest in rf.estimators_: dot_data = tree.export_graphviz(tree_in_forest, out_file=None,feature_names=X_train.columns,label= 'root', proportion=False,rounded=True,class_names=lang,filled=True) graph = pydotplus.graph_from_dot_data(dot_data) Image(graph.create_png()) graph.write_png('decision_tree'+str(i_tree)+'.png') i_tree = i_tree+1

I. MLPC - Supervised Neural Network (top)

In [22]:
mlp_data,mlp = run_model(MLPClassifier(hidden_layer_sizes=(400,)),'Neural Network - MLPC')
Check for Overfitting: 0.744638443753488

Test Score is: 0.5

                     precision    recall  f1-score   support

           Japanese       0.50      1.00      0.67       200
Traditional Chinese       0.00      0.00      0.00       200

        avg / total       0.25      0.50      0.33       400

Cross Val Score: [0.5 0.5 0.5] | Avg: 0.5 

CPU times: user 3min 56s, sys: 53.7 s, total: 4min 50s
Wall time: 1h 12min 51s
Wall time: 1h 12min 51s

2. Model Comparison (top)

In [23]:
model_set.columns = ['name','time','total','prec: | JA | CH |','rec: | JA | CH |','f1: | JA | CH |']
name time total prec: | JA | CH | rec: | JA | CH | f1: | JA | CH |
0 Logistic Regression 0.20 0.6700 [0.61, 0.91] [0.96, 0.38] [0.75, 0.53]
1 K Nearest Neighbor 6.11 0.5100 [0.51, 0.75] [0.99, 0.03] [0.67, 0.06]
2 Naive Bayes - Bernoulli 0.71 0.6575 [0.62, 0.72] [0.81, 0.51] [0.70, 0.60]
3 Decision Tree 0.13 0.6175 [0.57, 0.90] [0.97, 0.27] [0.72, 0.41]
4 Random Forest 0.25 0.6375 [0.58, 0.88] [0.95, 0.32] [0.72, 0.47]
5 Neural Network - MLPC 0.37 0.5000 [0.50, 0.00] [1.00, 0.00] [0.67, 0.00]
In [24]:
model_save = model_set