Author: Ryan Harper
# IPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image
# Data processing
import pandas as pd
import plotly as plo
import seaborn as sns
from scipy import stats
from collections import Counter
import numpy as np
import itertools
# NLP
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib
# Stats
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score
from sklearn import metrics
# Preparing Models
from sklearn.model_selection import train_test_split
# Decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
# Models
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
# Ensemble
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
#Visualization
import pydotplus
import graphviz
import matplotlib.pyplot as plt
# import altair as alt
Import Features + Target
loadf = True
if loadf:
    # Load precomputed blog features from CSV and shuffle the rows
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
    del features['Unnamed: 0']  # drop the saved index column
else:
    # Recover the feature frame and feature list stored by an earlier notebook
    %store -r blog
    %store -r keepfeatures
    features = blog
lang = list(features.language.unique())
# Drop raw text and intermediate NLP columns that are not model features
features = features.drop(columns=['id', 'content', 'pos', 'pos2', 'pos3', 'tokens'])
features.language.value_counts()
Set X,Y (UNUSED)
Select features: keep the features that differ most between L1s (the writers' first languages)
# Choose feature selection method
selectfeatures = 'all'
# Feature selection
if selectfeatures == 'best':
    nonbinary = ['letters_per', 'wc', 'sc', 'sent_pol', 'sent_subj', 'cap_let',
                 'punc_count', 'freq_score', 'full_freq_score']
    best_features = keepfeatures
elif selectfeatures == 'all':
    best_features = features.columns[~features.columns.str.contains('language')]
elif selectfeatures == 'bnb':
    # Requires bnb_sorted from a previous Bernoulli NB run (%store -r bnb_sorted)
    best_features = bnb_sorted[0:500]
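For the 'best' option, one way to rank features by how strongly they separate the L1 groups is to compare per-language means, scaled by each feature's overall spread. A minimal sketch, not from the original notebook; diff_score is an illustrative name, and it assumes every remaining column besides language is numeric:
# Sketch: rank features by the spread of their per-language means.
numeric_cols = features.columns.drop('language')
per_lang_means = features.groupby('language')[numeric_cols].mean()
# A feature separates L1s well if its means differ a lot across languages
# relative to its overall standard deviation.
diff_score = (per_lang_means.max() - per_lang_means.min()) / features[numeric_cols].std()
print(diff_score.sort_values(ascending=False).head(10))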
Truncated SVD: Determine best features
svdbool = False
if svdbool:
    svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
    svd.fit(features[best_features])
    # print(svd.explained_variance_ratio_)
    # print(svd.explained_variance_ratio_.sum())
    # print(svd.singular_values_)
    # svd.get_params
    # best_features = [X.columns[i] for i in svd.components_[0].argsort()[::-1]]
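If the SVD route is taken, the number of components can be chosen from the cumulative explained variance rather than fixed at 20. A sketch, assuming the feature matrix has well over 50 columns; the 0.90 threshold is illustrative:
# Sketch: smallest number of components that explains 90% of the variance.
probe = TruncatedSVD(n_components=50, random_state=42)
probe.fit(features[best_features])
cumvar = np.cumsum(probe.explained_variance_ratio_)
n_components = int(np.searchsorted(cumvar, 0.90)) + 1
print(n_components, 'components explain 90% of the variance')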
Split Data to Train/Test: Even Distribution Sampling
# Evenly distributed test set: 200 samples per language
langlist = []
rs = 87
for l in lang:
    print(l)
    langlist.append(features[features['language'] == l].sample(n=200, random_state=rs))
testset = langlist
test_data = pd.concat(testset)
y_test = test_data['language'].values
X_test = test_data[best_features]
%%time
# Training set: every row not sampled into the test set
train_data = features[~features.isin(test_data)].dropna()
y_train = train_data['language'].values
X_train = train_data[best_features]
# X_train = svd.transform(train_data[train_data.columns[~train_data.columns.str.contains('language')]])
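An equivalent, less fragile way to carve out the training rows is to drop the sampled test rows by index, since sample() preserves the original index labels. A sketch, not from the original notebook:
# Sketch: index-based split avoids the elementwise isin()/dropna() round trip.
train_data_alt = features.drop(test_data.index)
X_train_alt = train_data_alt[best_features]
y_train_alt = train_data_alt['language'].values
print(X_train_alt.shape, X_test.shape)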
Create Function for Comparing Models
cols = ['name', 'time', 'total', 'precision', 'recall', 'f1']
model_set = pd.DataFrame(columns=cols)
models_stored = []
pattern = "%.2f"
def run_model(model, name):
    global model_set
    m = model
    # Time training plus test-set scoring
    start = time.time()
    m.fit(X_train, y_train)
    total_score = m.score(X_test, y_test)
    y_pred = m.predict(X_test)  # predict once, reuse for all metrics
    pscore = [pattern % i for i in metrics.precision_score(y_test, y_pred, labels=lang, average=None)]
    rscore = [pattern % i for i in metrics.recall_score(y_test, y_pred, labels=lang, average=None)]
    fscore = [pattern % i for i in metrics.f1_score(y_test, y_pred, labels=lang, average=None)]
    end = time.time()
    t = pattern % (end - start)
    cvs = cross_val_score(m, X_test, y_test)
    r = dict(zip(cols, [name, t, total_score, pscore, rscore, fscore]))
    print('1. Check for Overfitting: {}\n'.format(m.score(X_train, y_train)))
    print('2. Test Score is: {}\n'.format(total_score))
    print('3. Classification Report:')
    print(classification_report(y_test, y_pred))
    print('')
    print('4. Cross Val Score: {} ==> Avg: {}'.format(cvs, cvs.mean()))
    print('')
    model_set = pd.concat([model_set, pd.DataFrame([r])], ignore_index=True)
    return r, m
The target is categorical (one class per L1), so logistic regression operates on per-class probabilities rather than a single binary probability.
%%time
lreg_data, lreg = run_model(linear_model.LogisticRegression(), 'Logistic Regression')
lreg_coef = lreg.coef_[0][-20:]
# Plot the coefficients of the last 20 features (first class only)
plt.figure(figsize=(20, 6))
plt.plot(range(len(lreg_coef)), lreg_coef)
plt.xticks(range(len(lreg_coef)), X_train.columns[-20:], rotation=60)
plt.margins(0.02)
plt.show()
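Those per-class probabilities can be inspected directly with predict_proba, which returns one column per language in the order of lreg.classes_:
# Quick check: class probabilities for the first few test rows.
proba = pd.DataFrame(lreg.predict_proba(X_test[:5]), columns=lreg.classes_)
print(proba.round(3))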
K Nearest Neighbors can handle a discrete target. However, many of these features take only a limited set of quantitative values (they are not continuous), which can distort the distance metric that nearest neighbors relies on.
%%time
neighbors_data,neighbors = run_model(KNeighborsClassifier(n_neighbors=8),'K Nearest Neighbor')
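One common mitigation, not used in the original run, is to put the features on a comparable scale before computing distances, e.g. with a StandardScaler pipeline:
# Sketch: scale features so no single feature dominates the KNN distance.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
knn_scaled_data, knn_scaled = run_model(
    make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=8)),
    'K Nearest Neighbor (scaled)')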
%%time
bnb_data,bnb = run_model(BernoulliNB(),'Naive Bayes - Bernoulli')
if selectfeatures != 'bnb':
    # Rank features by the Bernoulli NB weights for the first class
    importance = dict(zip(X_train.columns, bnb.coef_[0]))
    bnb_sorted = sorted(importance, key=importance.get, reverse=True)
    %store bnb_sorted
bnb_coef = []
for r in bnb_sorted:
    # print(importance[r])
    bnb_coef.append(importance[r])
if selectfeatures == 'best':
    bnb_coef = bnb_coef[::-1]
# Plot the coefficients
plt.figure(figsize=(18,4))
plt.plot(range(len(bnb_coef[0:50])), bnb_coef[0:50])
plt.xticks(range(len(bnb_coef[0:50])), bnb_sorted[0:50], rotation=60)
plt.margins(0.02)
plt.show()
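Note that coef_ on the naive Bayes classifiers was a deprecated alias for feature_log_prob_ and is removed in recent scikit-learn releases; the same ranking can be computed per class from feature_log_prob_ directly. A sketch:
# Sketch: top features per language by Bernoulli NB log probability.
for idx, cls in enumerate(bnb.classes_):
    logp = dict(zip(X_train.columns, bnb.feature_log_prob_[idx]))
    print(cls, sorted(logp, key=logp.get, reverse=True)[:5])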
%%time
dt_data,dt = run_model(tree.DecisionTreeClassifier(criterion='entropy',max_depth=4),'Decision Tree')
# Render tree.
dot_data = tree.export_graphviz(
dt,
out_file=None,
feature_names=X_train.columns,
label= 'root',
proportion=False,
rounded=True,
class_names=list(dt.classes_),  # match the classifier's own class ordering
filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
graph.write_png('decision_tree.png')
Random Forest gives a good visualization of important features and of the entropy weighting. It fits many decision trees on random subsamples and averages their votes for a better output, at the cost of the longest processing time of the models tried here.
%%time
rf_data,rf = run_model(ensemble.RandomForestClassifier(n_estimators=150,
criterion='entropy',
max_features=len(X_train.columns),
max_depth=6),'Random Forest')
rf.feature_importances_
importance = dict(zip(X_train.columns, rf.feature_importances_))
rf_sorted = sorted(importance, key=importance.get, reverse=True)
for r in rf_sorted[0:5]:
    if importance[r] > 0:
        print(r, importance[r])
print('')
%store rf_sorted
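A quick generalization check that doesn't touch the held-out set, not used in the original run, is the out-of-bag score, computed from the samples each tree never saw:
# Sketch: out-of-bag estimate of generalization accuracy.
rf_oob = ensemble.RandomForestClassifier(n_estimators=150, criterion='entropy',
                                         max_depth=6, oob_score=True, random_state=87)
rf_oob.fit(X_train, y_train)
print('OOB score:', rf_oob.oob_score_)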
%%time
mlp_data,mlp = run_model(MLPClassifier(hidden_layer_sizes=(400,)),'Neural Network - MLPC')
# Per-language precision/recall/f1 lists follow the order of `lang`
model_set.columns = ['name', 'time', 'total',
                     'precision (per language)', 'recall (per language)', 'f1 (per language)']
model_set
model_save = model_set.copy()  # keep an independent copy of the results table