Author: Ryan Harper
# IPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image
# Data processing
import scipy
import pandas as pd
import plotly as plo
import numpy as np
import seaborn as sns
from collections import Counter
from functools import reduce
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Statistics
from scipy import stats
from statsmodels import robust
from scipy.stats import ttest_ind, mannwhitneyu, median_test, f_oneway, mood, shapiro
# NLP
import textblob
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib
from string import punctuation
# import altair as alt
# Jupyter Settings and Imports
%matplotlib inline
warnings.filterwarnings(action='once')
loadDF = False
if loadDF:
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
else:
    # Restore the preprocessed blog DataFrame stored by an earlier notebook
    %store -r blog
    features = blog
lang = list(features.language.unique())
Create Analysis Set
languagesDF = pd.DataFrame(index=['Traditional Chinese','Japanese','CCount','JCount'],columns=['NA'])
binary = features.copy()  # copy so the binarization below does not mutate features in place
nonbinary = ['language','Unnamed: 0','id','pos','pos1','pos2','pos3','tokens','content','letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','freq_score','full_freq_score']
for b in binary.columns:
    if b not in nonbinary:
        # Binarize the feature: 0 stays 0, any nonzero count becomes 1
        binary[b] = binary[b].apply(lambda val: 0 if val == 0 else 1)
        ll = []
        # Proportion of posts per language containing the feature
        for l in lang:
            ll.append(binary[b][binary['language'] == l].sum()/binary.language[binary['language'] == l].count())
        # Raw counts per language
        for l in lang:
            ll.append(int(binary[b][binary['language'] == l].sum()))
        languagesDF[b] = ll
del languagesDF['NA']
langDF = languagesDF.iloc[0:2].reset_index()
languagesDF
binary['cuw_god'][binary['language']=='Traditional Chinese'].sum()/(binary.language[binary['language'] == 'Traditional Chinese']).count()
languagesDF['cuw_god']
def check_differences(s):
    """Percent difference between the two language proportions in s."""
    a = s.iloc[0]
    b = s.iloc[1]
    top = abs(a - b)
    bot = (a + b) / 2
    perc_diff = top / bot
    return perc_diff
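A quick sanity check on check_differences with hypothetical proportions (not from the blog data): 0.30 and 0.20 average to 0.25 and differ by 0.10, giving a percent difference of 0.4.
# Hypothetical example: |0.30 - 0.20| / ((0.30 + 0.20) / 2) = 0.10 / 0.25
check_differences(pd.Series([0.30, 0.20]))  # ~0.4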
def graph_vis(f, show):  # renamed the second parameter to avoid shadowing the builtin bool
    if show:
        sns.factorplot(data=langDF, x='index', y=f, kind='bar')
        plt.title('% of {}'.format(f))
        blue_patch = mpatches.Patch(color='steelblue', label='# of Chinese: {}'.format(languagesDF[f].iloc[2]))
        orange_patch = mpatches.Patch(color='darkorange', label='# of Japanese: {}'.format(languagesDF[f].iloc[3]))
        plt.legend(handles=[blue_patch, orange_patch])
languagesDF['adv_still'].iloc[3]
from scipy.stats import binom_test
keepfeatures = []
for feat in list(languagesDF.columns):
    if feat != 'language':
        # Keep features with a >40% relative difference between languages and >50 total occurrences
        if (check_differences(languagesDF[feat]) > .4) and languagesDF[feat].sum() > 50:
            # graph_vis(feat, True)
            keepfeatures.append(feat)
plt.show()
%store keepfeatures
len(keepfeatures)
Reject Outliers
def MEDIAN_reject_outliers(data, m=3):
    # Drop points more than m median absolute deviations from the median
    data = data[abs(data - np.median(data)) < m*robust.mad(data)]
    return data[~np.isnan(data)].sort_values()

def MEAN_reject_outliers(data, m=3):
    # Drop points more than m standard deviations from the mean
    data = data[abs(data - np.mean(data)) < m*np.std(data)]
    return data[~np.isnan(data)].sort_values()
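A minimal sketch of why the MAD-based filter is preferred, on a synthetic series (not the blog data): a single extreme value inflates the mean and standard deviation enough to survive the MEAN filter, while the median/MAD filter drops it.
# Synthetic series with one extreme value
s = pd.Series([1.0, 2.0, 2.0, 3.0, 100.0])
MEDIAN_reject_outliers(s)  # drops 100.0 (|100 - 2| >> 3 * MAD)
MEAN_reject_outliers(s)    # keeps 100.0, because it inflates the std itself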
Create Analysis DF
analysis_features=['letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','full_freq_score']
blog_features =['language']
full_features = analysis_features + blog_features
analysis = features[full_features]
Plot Histograms
Decision Tree Truncated SVD Features
i = 1
fig = plt.figure(figsize=(20,10))
for feat in analysis_features:
    if feat != 'language':
        plt.subplot(2, 4, i)
        i = i + 1
        # Per-language samples with MAD-based outliers removed
        langfeats = [MEDIAN_reject_outliers(analysis[feat][analysis.language == l]) for l in lang]
        for g in langfeats:
            sns.kdeplot(g, legend=None, kernel='gau', shade=True)
        plt.title(feat, fontsize=18)
fig.suptitle('KDE Plots (Smoothed Histograms) of Continuous Features', fontsize=18, y=1.03)
plt.tight_layout()
plt.legend(lang)
plt.show()
from scipy.stats import ttest_ind
for c in analysis_features:
    g = [(blog[c][blog.language == l]) for l in lang]
    stat, p = ttest_ind(g[0], g[1])
    if (p < .05) and (p > 0):
        # Green: significant difference between the two languages
        print('{}:\x1b[92m{}\x1b[0m'.format(c, p))
    elif p > .05:
        # Red: no significant difference
        print('{}:\x1b[31m{}\x1b[0m'.format(c, p))
    else:
        # Yellow: degenerate p-value (0 or NaN)
        print('{}:\x1b[33m{}\x1b[0m'.format(c, p))
Correlations (of top features by decision tree importance)
%store -r bnb_sorted
# corr = analysis.corr(); print(corr)
i = 1
fig = plt.figure(figsize=(22,15))
for ind, g in enumerate([(features[bnb_sorted[0:5]][features.language == l]) for l in lang]):
    plt.subplot(2, 2, i)
    i = i + 1
    corr = g.corr()
    sns.heatmap(corr,
                xticklabels=corr.columns,
                yticklabels=corr.columns,
                annot=True)
    plt.title(lang[ind], fontsize=20)
    plt.xticks(rotation='vertical', fontsize=14)
    plt.yticks(rotation='horizontal', fontsize=14)
plt.show()
from scipy.stats import median_test
from statsmodels.stats.proportion import proportions_ztest
def mw_test(a, b):
    # Mann-Whitney U: nonparametric test for two independent samples
    stat, p = mannwhitneyu(a, b, use_continuity=True, alternative=None)
    return stat, p

def moods_median_test(a, b, c, d):
    # Mood's median test: compares medians across independent samples
    stat, p, med, tbl = median_test(a, b, c, d)
    return stat, p

def f1way_test(a, b, c, d):
    # One-way ANOVA: compares means across independent samples
    f, p = f_oneway(a, b, c, d)
    return f, p
import warnings
warnings.filterwarnings('ignore')
Proportion Z-Test:
CLen = len(features[features.language == 'Traditional Chinese'])
JLen = len(features[features.language == 'Japanese'])
for feat in keepfeatures[0:20]:
    CCount = languagesDF[feat].iloc[2]
    JCount = languagesDF[feat].iloc[3]
    counts = np.array([CCount, JCount])
    nobs = np.array([CLen, JLen])
    stat, p = proportions_ztest(counts, nobs)
    if (p < .05) and (p > 0):
        print('{}:\x1b[92m{}\x1b[0m'.format(feat, p), end='|')
    elif p > .05:
        print('{}:\x1b[31m{}\x1b[0m'.format(feat, p), end='|')
    else:
        print('{}:\x1b[33m{}\x1b[0m'.format(feat, p), end='|')
A. Mann-Whitney U test (2 Non-Normally Distributed Independent Samples)
Null Hypothesis: Assumes samples are from the same 1st-language population.
B. Mood's Median test (2+ Non-Normally Distributed Independent Samples)
Null Hypothesis: Assumes samples are from the same 1st-language population.
C. One-Way ANOVA test (2+ Normally Distributed Independent Samples)
Null Hypothesis: Assumes samples are from the same 1st-language population.
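A minimal sketch (not necessarily the exact procedure used here) of how the helper functions above might be applied: run the Mann-Whitney U test per continuous feature for the two language groups. Note that moods_median_test and f1way_test as defined above expect four samples (a, b, c, d), e.g. the same feature drawn from four language groups.
# Illustrative only: Mann-Whitney U per continuous feature
for c in analysis_features:
    a = analysis[c][analysis.language == lang[0]]
    b = analysis[c][analysis.language == lang[1]]
    stat, p = mw_test(a, b)
    print('{}: U={:.1f}, p={:.4f}'.format(c, stat, p))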