Part 1: Exploration
https://kimrharper.github.io/project-lang8-1.html

Part 2: Analysis
https://kimrharper.github.io/project-lang8-2.html

Part 3: Models
https://kimrharper.github.io/project-lang8-3.html

L1 Prediction from ELL Writing Samples¶

Part 2: Analysis¶

Author: Ryan Harper

# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

# Data processing
import scipy
import pandas as pd
import plotly as plo
import numpy as np
import seaborn as sns
from collections import Counter
from functools import reduce
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Statistics
from scipy import stats
from statsmodels import robust
from scipy.stats import ttest_ind, mannwhitneyu, median_test, f_oneway,mood, shapiro

# NLP
import textblob
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib
from string import punctuation

# import altair as alt

# Jupyter Settings and Imports
%pylab
%matplotlib inline 
warnings.filterwarnings(action='once')

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib

1. Looking at Language Samples (top)

loadDF = False
if loadDF:
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
else:
    %store -r blog
    features = blog
    lang = list(features.language.unique())

Create Analysis Set

languagesDF = pd.DataFrame(index=['Traditional Chinese','Japanese','CCount','JCount'],columns=['NA'])
binary = features
nonbinary = ['language','Unnamed: 0','id','pos','pos1','pos2','pos3','tokens','content','letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','freq_score','full_freq_score']
for b in binary.columns:
    if b not in nonbinary:
        binary[b] = binary[b].apply(lambda val: 0 if val == 0 else 1)
        ll = []
        for l in lang:
            ll.append(binary[b][binary['language'] == l].sum()/binary.language[binary['language'] == l].count())
        for l in lang:
            ll.append(int(binary[b][binary['language'] == l].sum()))
        languagesDF[b] = ll
        
del languagesDF['NA']
langDF = languagesDF.iloc[0:2].reset_index()
languagesDF

binary['cuw_god'][binary['language']=='Traditional Chinese'].sum()/(binary.language[binary['language'] == 'Traditional Chinese']).count()

0.03668916935720575

languagesDF['cuw_god']

Traditional Chinese      0.036689
Japanese                 0.008809
CCount                 125.000000
JCount                  84.000000
Name: cuw_god, dtype: float64

def check_differences(s):
    a = s.iloc[0]
    b = s.iloc[1]
    top = abs(a-b)
    bot = (a+b)/2
    perc_diff = top/bot
    return perc_diff

def graph_vis(f,bool):
    if bool:
        sns.factorplot(data=langDF, x='index', y=f, kind='bar')
        plt.title('% of {}'.format(f))
        blue_patch = mpatches.Patch(color='steelblue', label='# of Chinese: {}'.format(languagesDF[f].iloc[2]))
        orange_patch = mpatches.Patch(color='darkorange', label='# of Japanese: {}'.format(languagesDF[f].iloc[3]))
        plt.legend(handles=[blue_patch,orange_patch])

languagesDF['adv_still'].iloc[3]

616.0

from scipy.stats import binom_test

keepfeatures = []
for feat in list(languagesDF.columns):
    if feat != 'language':
        
        if (check_differences(languagesDF[feat]) > .4) and languagesDF[feat].sum()>50:
#             graph_vis(feat,True)
            keepfeatures.append(feat)       
plt.show()
%store keepfeatures

Stored 'keepfeatures' (list)

len(keepfeatures)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ffc9cdafe6ee> in <module>()
----> 1 len(keepfeatures)

NameError: name 'keepfeatures' is not defined

Reject Outliers

def MEDIAN_reject_outliers(data, m=3):
    data = data[abs(data - np.median(data)) < m*robust.mad(data)]
    return data[~np.isnan(data)].sort_values()


def MEAN_reject_outliers(data, m=3):
    data = data[abs(data - np.mean(data)) < m*np.std(data)]
    return data[~np.isnan(data)].sort_values()

Create Analysis DF

analysis_features=['letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','full_freq_score']
blog_features =['language']
full_features = analysis_features + blog_features
analysis = features[full_features]

Plot Hist

Decision Tree Truncated SVD Features

i = 1
fig=plt.figure(figsize=(20,10))
for feat in analysis_features:
    if feat != 'language':
        plt.subplot(2, 4, i)
        i = i + 1
        langfeats = [MEDIAN_reject_outliers(analysis[feat][analysis.language == l]) for l in lang]
        for g in langfeats:
            sns.kdeplot(g,legend=None,kernel='gau',shade=True,)
            plt.title(feat, fontsize=18)

fig.suptitle('KDeplot (Histograms) of Continuous Features', fontsize=18, y=1.03)
plt.tight_layout()
plt.legend(lang)
plt.show()

from scipy.stats import ttest_ind

for c in analysis_features:
    g = [(blog[c][blog.language == l]) for l in lang]
    stat,p = ttest_ind(g[0],g[1])

    if (p < .05) & (p > 0):
        print('{}:\x1b[92m{}\x1b[0m'.format(c,p))
    elif p > .05:
        print('{}:\x1b[31m{}\x1b[0m'.format(c,p))
    else:
        print('{}:\x1b[33m{}\x1b[0m'.format(c,p))

letters_per:3.658300613614781e-39
wc:6.378157218238824e-14
sc:6.100618488069283e-09
sent_pol:0.004090030379914499
sent_subj:0.48504616386813104
cap_let:1.2756773756042557e-67
punc_count:1.6397855918587436e-33
full_freq_score:1.882342916446436e-11

Correlations (of top features by decision tree importance)

%store -r bnb_sorted

# corr = analysis.corr(); print(corr)
i = 1
fig=plt.figure(figsize=(22,15))

for ind,g in enumerate([(features[bnb_sorted[0:5]][features.language == l]) for l in lang]):
    plt.subplot(2, 2, i)
    i = i + 1
    
    corr = g.corr()
    sns.heatmap(corr, 
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True)
    plt.title(lang[ind],fontsize = 20)
    plt.xticks(rotation='vertical',fontsize = 14)
    plt.yticks(rotation='horizontal',fontsize = 14)
    
plt.show()

C. Statistical Significance (top)

from scipy.stats import median_test
from statsmodels.stats.proportion import proportions_ztest

def mw_test(a,b):
    stat,p = mannwhitneyu(a,b, use_continuity=True, alternative=None)
    return stat,p

def moods_median_test(a,b,c,d):
    stat, p, med, tbl = median_test(a,b,c,d)
    return stat,p

def f1way_test(a,b,c,d):
    f,b = f_oneway(a,b,c,d)
    return f,b

import warnings
warnings.filterwarnings('ignore')

Proportion Z-Test:

CLen = len(features[features.language =='Traditional Chinese'])
JLen = len(features[features.language =='Traditional Chinese'])

for feat in keepfeatures[0:20]:
    CCount = languagesDF[feat].iloc[2]
    JCount = languagesDF[feat].iloc[3]
    counts = np.array([CCount,JCount])
    nobs = np.array([CLen,JLen])
    stat, p = proportions_ztest(counts, nobs)
    
    if (p < .05) & (p > 0):
        print('{}:\x1b[92m{}\x1b[0m'.format(feat,p),end='|')
    elif p > .05:
        print('{}:\x1b[31m{}\x1b[0m'.format(feat,p),end='|')
    else:
        print('{}:\x1b[33m{}\x1b[0m'.format(feat,p),end='|')

prp_us:2.1727948710394688e-13|prp_themselves:0.26859516439037034|prp_myself:2.227770490101894e-06|prp_herself:0.031798833029012236|prp_yourself:0.30901526618045183|prp_ourselves:0.12607830691962182|cc_yet:4.180954285713628e-30|prep_along:0.03986536843870174|prep_across:0.025983670860614816|prep_de:2.5725226517465718e-08|prep_behind:0.06702703872349279|prep_toward:1.340885251376464e-07|prep_near:2.258662335643635e-51|prep_unless:0.0262255204655606|prep_past:0.06818855261481119|prep_far:2.8870145644730413e-05|adv_just:1.3914137330310227e-15|adv_here:7.719205896364638e-29|adv_still:1.60873401994528e-06|adv_always:1.22480365668219e-12|

Mann Whitney U test (2 Non-Normally Distributed Independent Samples)

B. Mood’s Median test (2+ Non-Normally Distributed Independent Samples)

Null Hypothesis: Assumes samples are from same 1st language population.

B. One-Way AnovaTest (2+ Normally Distributed Independent Samples)