L1 Prediction from ELL Writing Samples

Part 3: Models

Author: Ryan Harper


1. Models: (top)

In [1]:
# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

import time

# Data processing
import pandas as pd
import plotly as plo
import seaborn as sns
from scipy import stats
from collections import Counter
import numpy as np
import itertools

# NLP
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib

# Stats
from sklearn.metrics import classification_report, roc_curve,roc_auc_score,accuracy_score
from sklearn import metrics

# Preparing Models
from sklearn.model_selection import train_test_split

# Decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Models
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB
from sklearn.neural_network import MLPClassifier

# Ensemble
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#Visualization
from IPython.display import Image
import pydotplus
import graphviz
import matplotlib.pyplot as plt

# import altair as alt

Import Features + Target

In [2]:
loadf = True
if loadf:
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
    del features['Unnamed: 0']
else:
    %store -r blog
    %store -r keepfeatures
    features = blog

lang = list(features.language.unique())
In [3]:
del features['id']
del features['content']
del features['pos']
del features['pos2']
del features['pos3']
del features['tokens']
features = features[features['sc']<15]
In [4]:
features.language.value_counts()
Out[4]:
Japanese               9536
Traditional Chinese    3407
Name: language, dtype: int64

Set X,Y (UNUSED)

y = features['language'].values.reshape(-1, 1).ravel() X = features[features.columns[~features.columns.str.contains('language')]]

Select Features: Features with biggest differences between L1s

In [ ]:
# Choose feature selection method
selectfeatures = 'all'
In [46]:
# Feature selection
if selectfeatures == 'best':
    nonbinary = ['letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','freq_score','full_freq_score']
    best_features = keepfeatures
elif selectfeatures == 'all':
    best_features = features.columns[~features.columns.str.contains('language')]
elif selectfeatures == 'bnb':
    best_features = bnb_sorted[0:500]

Truncated SVD: Determine best features

In [7]:
svdbool = False
if svdbool:
    svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
    svd.fit(features[best_features])
    # print(svd.explained_variance_ratio_)
    # print(svd.explained_variance_ratio_.sum())  
    # print(svd.singular_values_)
    # svd.get_params
    # best_features = [X.columns[i] for i in svd.components_[0].argsort()[::-1]]

Split Data to Train/Test: Even Distribution Sampling

In [8]:
# evenly_distributed_test = [60 japanese,60 english, 60 chinese, 60 korean]
langlist = []
rs=87
for l in lang:
    print(l)
    langlist.append(features[features['language'] == l].sample(n=200, random_state=rs))

testset = langlist
Japanese
Traditional Chinese
In [9]:
test_data = pd.concat(testset)
y_test = test_data['language'].values.reshape(-1, 1).ravel()
X_test = test_data[best_features]
In [10]:
%%time
train_data = features[~features.isin(test_data)].dropna()
y_train = train_data['language'].values.reshape(-1, 1).ravel()
X_train = train_data[best_features]
# X_train = svd.transform(train_data[train_data.columns[~train_data.columns.str.contains('language')]])
CPU times: user 27.5 s, sys: 14.1 s, total: 41.6 s
Wall time: 45 s

Create Function for Comparing Models

In [11]:
cols = ['name','time','total','precision','recall','f1']

model_set = pd.DataFrame(columns=cols)
models_stored = []
pattern = "%.2f"
In [26]:
def run_model(model,name):
    global model_set
    m = model
    m.fit(X_train, y_train)
    start = time.time()

    total_score = m.score(X_test,y_test)
    pscore = [pattern % i for i in list(metrics.precision_score(y_test, m.predict(X_test),labels=lang,average=None))]
    rscore = [pattern % i for i in list(metrics.recall_score(y_test, m.predict(X_test),labels=lang,average=None))]
    fscore = [pattern % i for i in list(metrics.f1_score(y_test, m.predict(X_test),labels=lang,average=None))]
    end = time.time()
    t= pattern % (end - start)
    cvs = cross_val_score(m, X_test,y_test)

    r = dict(zip(cols,[name,t,total_score,pscore,rscore,fscore]))
    print('1. Check for Overfitting: {}\n'.format(m.score(X_train,y_train)))
    print('2. Test Score is: {}\n'.format(total_score))
    print('3. Classification Report:')
    print(classification_report(y_test, m.predict(X_test)))
    print('')
    print('4. Cross Val Score: {} ==> Avg: {} '.format(cvs,cvs.sum()/len(cvs)))
    print('')
    
    model_set = model_set.append(r,ignore_index=True)
    return r,m

A. LR - Logistic Regression (top)

Target is binary so logistic regression will operate on probabilities

In [27]:
%%time
lreg_data,lreg = run_model(linear_model.LogisticRegression(),'Logistic Regression')
1. Check for Overfitting: 0.8251614446304711

2. Test Score is: 0.67

3. Classification Report:
                     precision    recall  f1-score   support

           Japanese       0.61      0.96      0.75       200
Traditional Chinese       0.91      0.38      0.53       200

        avg / total       0.76      0.67      0.64       400


4. Cross Val Score: [0.74626866 0.67910448 0.71969697] ==> Avg: 0.7150233680084427 

CPU times: user 8.18 s, sys: 2.88 s, total: 11.1 s
Wall time: 9.79 s
In [14]:
lreg_coef = lreg.coef_[0][-20:]

# Plot the coefficients
plt.figure(figsize=(20,6))
plt.plot(range(len(X_train.columns[-20:])), lreg_coef)
plt.xticks(range(len(X_train.columns[-20:])), X_train.columns, rotation=60)
plt.margins(0.02)
plt.show()

E. K Nearest Neighbors (top)

Can handle discrete values for target
Quantitative values are limited (not continuous) and might be problematic for nearest neighbors

In [15]:
%%time
neighbors_data,neighbors = run_model(KNeighborsClassifier(n_neighbors=8),'K Nearest Neighbor')
Check for Overfitting: 0.7514948576895479

Test Score is: 0.51

                     precision    recall  f1-score   support

           Japanese       0.51      0.99      0.67       200
Traditional Chinese       0.75      0.03      0.06       200

        avg / total       0.63      0.51      0.36       400


Cross Val Score: [0.47761194 0.53731343 0.5530303 ] | Avg: 0.5226518920548772 

CPU times: user 1min 10s, sys: 5.32 s, total: 1min 15s
Wall time: 1min 22s

F. Naive Bayes - Bernoulli (top)

In [16]:
%%time
bnb_data,bnb = run_model(BernoulliNB(),'Naive Bayes - Bernoulli')
Check for Overfitting: 0.7373036753567727

Test Score is: 0.6575

                     precision    recall  f1-score   support

           Japanese       0.62      0.81      0.70       200
Traditional Chinese       0.72      0.51      0.60       200

        avg / total       0.67      0.66      0.65       400


Cross Val Score: [0.6119403  0.59701493 0.70454545] | Avg: 0.6378335594753505 

CPU times: user 19.4 s, sys: 8.65 s, total: 28 s
Wall time: 25.4 s
In [45]:
if selectfeatures != 'bnb':
    importance = dict(list(zip(X_train.columns,bnb.coef_[0])))
    bnb_sorted = sorted(importance, key=importance.get, reverse=True)
    %store bnb_sorted

    bnb_coef = []
    for r in bnb_sorted:
        # print(importance[r])
        bnb_coef.append(importance[r])
        
    if selectfeatures == 'best':
        bnb_coef = bnb_coef[::-1]
    
    # Plot the coefficients
    plt.figure(figsize=(18,4))
    plt.plot(range(len(bnb_coef[0:50])), bnb_coef[0:50])
    plt.xticks(range(len(bnb_coef[0:50])), bnb_sorted[0:50], rotation=60)
    plt.margins(0.02)
    plt.show()
Stored 'bnb_sorted' (list)
[print(c) for c in X_train.columns];

G. Decision Tree (top)

In [18]:
%%time
dt_data,dt = run_model(tree.DecisionTreeClassifier(criterion='entropy',max_depth=4),'Decision Tree')
Check for Overfitting: 0.7921549868452523

Test Score is: 0.6175

                     precision    recall  f1-score   support

           Japanese       0.57      0.97      0.72       200
Traditional Chinese       0.90      0.27      0.41       200

        avg / total       0.73      0.62      0.56       400


Cross Val Score: [0.64925373 0.67910448 0.65151515] | Avg: 0.6599577868234584 

CPU times: user 3.81 s, sys: 1.01 s, total: 4.82 s
Wall time: 4.9 s
In [19]:
# Render tree.
dot_data = tree.export_graphviz(
    dt, 
    out_file=None,
    feature_names=X_train.columns,
    label= 'root',
    proportion=False,
    rounded=True,
    class_names=lang,
    filled=True
)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

graph.write_png('decision_tree.png')
Out[19]:
True
dimportance = list(zip(best_features,dt.feature_importances_)) dimportance = dict(dimportance) a1_sorted_keys = sorted(dimportance, key=dimportance.get, reverse=True) p = [] for r in a1_sorted_keys: if dimportance[r] != 0: p.append(r) # print(r, dimportance[r]) print(p)

Good visualization of important features and presentation of entropy weighting

H. Random Forest (top)

Runs decision tree multiple times for best output
Longest processing time

In [20]:
%%time
rf_data,rf = run_model(ensemble.RandomForestClassifier(n_estimators=150,
                                                       criterion='entropy',
                                                       max_features=len(X_train.columns),
                                                       max_depth=6),'Random Forest')
Check for Overfitting: 0.825560073347684

Test Score is: 0.6375

                     precision    recall  f1-score   support

           Japanese       0.58      0.95      0.72       200
Traditional Chinese       0.88      0.32      0.47       200

        avg / total       0.73      0.64      0.60       400


Cross Val Score: [0.70149254 0.75373134 0.77272727] | Avg: 0.7426503844414292 

CPU times: user 7min 23s, sys: 5.43 s, total: 7min 28s
Wall time: 9min 50s
cvs = cross_val_score(rf, X_test, y_test, cv=5) print(cvs.sum()/len(cvs))
In [21]:
rf.feature_importances_
importance = dict(list(zip(X_train.columns,rf.feature_importances_)))
rf_sorted = sorted(importance, key=importance.get, reverse=True)
for r in rf_sorted[0:5]:
    if importance[r] >0:
        print(r, importance[r])
print('')
%store rf_sorted
let2_ja 0.2218376960916456
sc 0.1593812123378788
pos2_DT-JJ 0.06808873632280418
pos2_NN-MD 0.038436240913713136
cap_let 0.033210016321393435

Stored 'rf_sorted' (list)
i_tree = 0 for tree_in_forest in rf.estimators_: dot_data = tree.export_graphviz(tree_in_forest, out_file=None,feature_names=X_train.columns,label= 'root', proportion=False,rounded=True,class_names=lang,filled=True) graph = pydotplus.graph_from_dot_data(dot_data) Image(graph.create_png()) graph.write_png('decision_tree'+str(i_tree)+'.png') i_tree = i_tree+1

I. MLPC - Supervised Neural Network (top)

In [22]:
%%time
mlp_data,mlp = run_model(MLPClassifier(hidden_layer_sizes=(400,)),'Neural Network - MLPC')
Check for Overfitting: 0.744638443753488

Test Score is: 0.5

                     precision    recall  f1-score   support

           Japanese       0.50      1.00      0.67       200
Traditional Chinese       0.00      0.00      0.00       200

        avg / total       0.25      0.50      0.33       400


Cross Val Score: [0.5 0.5 0.5] | Avg: 0.5 

CPU times: user 3min 56s, sys: 53.7 s, total: 4min 50s
Wall time: 1h 12min 51s

2. Model Comparison (top)

In [23]:
model_set.columns = ['name','time','total','prec: | JA | CH |','rec: | JA | CH |','f1: | JA | CH |']
model_set
Out[23]:
name time total prec: | JA | CH | rec: | JA | CH | f1: | JA | CH |
0 Logistic Regression 0.20 0.6700 [0.61, 0.91] [0.96, 0.38] [0.75, 0.53]
1 K Nearest Neighbor 6.11 0.5100 [0.51, 0.75] [0.99, 0.03] [0.67, 0.06]
2 Naive Bayes - Bernoulli 0.71 0.6575 [0.62, 0.72] [0.81, 0.51] [0.70, 0.60]
3 Decision Tree 0.13 0.6175 [0.57, 0.90] [0.97, 0.27] [0.72, 0.41]
4 Random Forest 0.25 0.6375 [0.58, 0.88] [0.95, 0.32] [0.72, 0.47]
5 Neural Network - MLPC 0.37 0.5000 [0.50, 0.00] [1.00, 0.00] [0.67, 0.00]
In [24]:
model_save = model_set