Author: Ryan Harper
Data Source:
NLTK Gutenberg
Summary:
This project explores variation in the vocabulary and writing structure of three authors by employing Word2Vec, TF-IDF, and a Gaussian mixture clustering model to examine the sentences of their novels.
Variables:
author: author of the novel
sentence: sentence from the novel/author
Hypothesis:
Authors will use different vocabulary and collocations that will help models predict differences between the authors in an unsupervised environment.
Observations:
Proper nouns (names and places) are the strongest indicators of different authors (via their specific novels)
Some frequently used one- and two-word expressions (e.g. 'Oh' and 'was said') are also good indicators of different authors
Clustering models fit on truncated-SVD-reduced TF-IDF vectors do not appear to produce meaningful results. Future experiments might explore other ways to vectorize sentences for clustering; one possibility is sketched below.
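One such alternative, as a minimal sketch only: average the Word2Vec vectors of each sentence's in-vocabulary words to get a dense, fixed-length sentence embedding. This assumes the word_vec model and tokens column built later in this notebook; sentence_vector is a hypothetical helper, not part of the analysis below.
import numpy as np

def sentence_vector(tokens, model, size=200):
    # Hypothetical helper: average the gensim 3.x word vectors of the
    # in-vocabulary tokens; out-of-vocabulary tokens are skipped.
    vecs = [model.wv[t] for t in tokens if t in model.wv.vocab]
    return np.mean(vecs, axis=0) if vecs else np.zeros(size)

# e.g.: sentence_matrix = np.vstack([sentence_vector(t, word_vec) for t in stories.tokens])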
Method:
Imported and merged the three novels into a data set; added a fourth novel by the first author to control for differences between novels rather than authors
Split up the novels by sentences and randomly sorted the samples.
Implemented Word2Vec on the sentences and visualized word similarities
Used TF-IDF and truncated SVD to assess important components and then matched them to the original sentences.
Used a Gaussian mixture model (clustering) to examine the combined samples and to determine whether author clusters could be recovered from the sentences.
Used v-measure, a crosstab, and the Akaike information criterion (AIC) to assess the effectiveness of the clustering.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import re
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import gutenberg
from IPython.display import display
import warnings
import seaborn as sns
from textblob import TextBlob
warnings.filterwarnings('ignore')
from colorama import Fore
green = Fore.GREEN
red = Fore.RED
black = Fore.RESET
print(gutenberg.fileids())
# Utility function for standard text cleaning.
def text_cleaner(text):
    text = str(text)
    # Visual inspection identifies a form of punctuation the tokenizer does not
    # handle: the double dash '--'. Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    text = re.sub(r'[\[].*?[\]]', '', text)  # drop bracketed editorial notes
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    return text
def mlist(story):
    # Strip chapter headings, then return the cleaned sentences of a novel.
    return [text_cleaner(i) for i in TextBlob(re.sub(r'Chapter \d+', '', story)).sentences]

def mtokens(story):
    # Same split, but return lowercased word tokens for each sentence.
    return [list(i.words.lower()) for i in TextBlob(re.sub(r'Chapter \d+', '', story)).sentences]
# Load and clean the data.
chestertonbrown = mlist(gutenberg.raw('chesterton-brown.txt'))
edgeworth = mlist(gutenberg.raw('edgeworth-parents.txt'))
alice = mlist(gutenberg.raw('carroll-alice.txt'))
sentences = chestertonbrown + alice + edgeworth
def nl(name, data):
    # Label helper: repeat the author's name once per sentence.
    return [name for i in range(len(data))]
name = nl('chesterton',chestertonbrown)+nl('edgeworth',edgeworth)+nl('carroll',alice)
# Add second book to control for book differences (instead of author differences)
chestertonball = mlist(gutenberg.raw('chesterton-ball.txt'))
sentences+=chestertonball
name+=nl('chesterton',chestertonball)
stories = pd.DataFrame([sentences,name]).T.sample(frac=1,random_state=42)
stories.columns = ['sentence','author']
from gensim.models import word2vec
from gensim.utils import tokenize
# Bracket assignment creates a real column (attribute-style assignment would not).
stories['tokens'] = stories.sentence.apply(tokenize).apply(list)
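As a quick, illustrative sanity check (the sample sentence is made up): gensim's tokenize yields alphabetic tokens, splitting on punctuation and hyphens.
print(list(tokenize("Down the Rabbit-Hole!")))  # ['Down', 'the', 'Rabbit', 'Hole']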
%%time
load_model = False
vec_size = 200
word_vec = word2vec.Word2Vec(
    stories.tokens,
    workers=4,      # Number of threads to run in parallel.
    min_count=5,    # Minimum word count threshold.
    window=6,       # Number of words around target word to consider.
    sg=0,           # Use CBOW because our corpus is small.
    sample=1e-3,    # Penalize frequent words.
    size=vec_size,  # Word vector length (`vector_size` in gensim >= 4.0).
    hs=1            # Use hierarchical softmax.
)
# List of words in model.
vocab = word_vec.wv.vocab.keys()
Cosine Similarity Function
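Cosine similarity measures the angle between two word vectors, ignoring their magnitudes:

$$\cos(\theta) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$$

Values near 1 indicate words that appear in similar contexts. The three versions below compute it with numpy, gensim, and scikit-learn.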
Version A. Cosine Similarity Function on Word2Vec Matrix (with numpy)
euclidean_norm = lambda m: np.sqrt(np.array([a*a for a in m]).sum())  # equivalent to np.linalg.norm

def similarity_vec(a, b):
    return np.dot(a, b) / (euclidean_norm(a) * euclidean_norm(b))
ladyship_miss = similarity_vec(word_vec['ladyship'],word_vec['Miss'])
ladyship_statue = similarity_vec(word_vec['ladyship'],word_vec['statue'])
statue_miss = similarity_vec(word_vec['statue'],word_vec['Miss'])
bore_hard = similarity_vec(word_vec['bore'],word_vec['hard'])
print(green+'ladyship - Miss: {}'.format(ladyship_miss)+black)
print(red+'ladyship - statue: {}'.format(ladyship_statue)+black)
print(red+'statue - Miss: {}'.format(statue_miss)+black)
print(green+'bore - hard: {}'.format(bore_hard)+black)
Version B. Gensim
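With positive and negative word lists, most_similar averages the positive vectors and subtracts the negative ones before ranking by cosine similarity, i.e. the classic analogy arithmetic (here roughly 'Miss' + 'ladyship' - 'men').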
w1,w2,w3 = 'Miss','ladyship','men'
print(word_vec.wv.most_similar(positive=[w1, w2], negative=[w3], topn=1))
w1 = 'statue'
print(word_vec.wv.most_similar(positive=w1,topn=3))
w1 = 'hard'
print(word_vec.wv.most_similar(positive=w1,topn=3))
Version C. SKlearn
from sklearn.metrics.pairwise import cosine_similarity
# Reshape to (1, n_features) so each word vector is treated as one sample.
cosine_similarity(word_vec['ladyship'].reshape(1,-1), word_vec['Miss'].reshape(1,-1))
Returns a 1x1 matrix containing the cosine similarity of the two word vectors
colors_dict = dict(zip(['statue','Miss','ladyship'],['red','green','green']))
from sklearn.manifold import TSNE
words = list(word_vec.wv.vocab)
X = word_vec[words]
graph_tsne = TSNE(n_components=2)
result = graph_tsne.fit_transform(X)
# create a scatter plot of the projection
plt.figure(figsize=(20,20))
plt.scatter(result[:, 0], result[:, 1],color='gray')
for i, word in enumerate(words):
    if word in colors_dict.keys():
        plt.annotate(word,
                     xy=(result[i, 0], result[i, 1]),
                     size=18,
                     backgroundcolor=colors_dict[word])
plt.show()
vectorizer = TfidfVectorizer(max_df=0.3,            # ignore terms in more than 30% of sentences
                             min_df=2,              # ignore terms in fewer than 2 sentences
                             stop_words='english',
                             lowercase=True,        # lowercase everything
                             use_idf=True,          # weight by inverse document frequency
                             norm=u'l1',            # longer and shorter sentences get treated equally
                             smooth_idf=True        # adds 1 to all document frequencies
                             )
X_train, y_train = stories.sentence,stories.author
#Applying the vectorizer
stories_paras_tfidf=vectorizer.fit_transform(X_train)
print("Number of features: %d" % stories_paras_tfidf.get_shape()[1])
# Reformat tfidf (Thinkful code)
X_train_tfidf_csr = stories_paras_tfidf.tocsr()
n = X_train_tfidf_csr.shape[0]
tfidf_bypara = [{} for _ in range(0,n)]
Examine Features
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, list the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]
#Keep in mind that the log base 2 of 1 is 0
print('Original sentence:', stories.sentence.iloc[10])
print('Tf_idf vector:', tfidf_bypara[10])
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
variances = []
r = range(1, 7001, 500)
for i in r:
    svd = TruncatedSVD(i)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    # Run SVD on the training data, then project the training data.
    X_train_lsa = lsa.fit_transform(X_train_tfidf_csr)
    variance_explained = svd.explained_variance_ratio_
    total_variance = variance_explained.sum()
    variances.append(total_variance)
    print('# of Features: {}'.format(i), end=' | ')
plt.scatter(r, variances)
plt.title('Truncated SVD: Explained Variance')
# Based on the scan above, 400 components balances retained variance against
# the dimensionality the downstream mixture model has to handle.
svd = TruncatedSVD(400)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf_csr)
variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
#Looking at what sorts of sentences our solution considers similar
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train.index)
paras_by_component['author'] = stories.author
paras_by_component['sentence'] = stories.sentence
pd.options.display.width = 100
pd.options.display.max_colwidth = 100
for i in range(5):
    component = i
    print('Component {}:'.format(component))
    df = pd.DataFrame(data=paras_by_component[[component,'author','sentence']]).sort_values(by=component, ascending=False)[0:40:5]
    df = df.reset_index(drop=True)
    df.columns = ['Similarity','Author','Sentence']
    display(df)
    print('')
r = range(0,20)
# Compute document similarity using LSA components
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 20 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[r[0]:r[-1]+1,r[0]:r[-1]+1]
#Making a plot
plt.figure(figsize=(10,8))
ax = sns.heatmap(sim_matrix,yticklabels=r)
plt.show()
#Generating a key for the plot.
print('Key:')
for i in r:
    j = i - r[0]
    print("{}-{}: {}".format(i, stories.author.iloc[i], sim_matrix.index[j][0:100]))
Because four novels (and three authors) were merged into a single data set, a mixture model is helpful for distinguishing between clusters that reflect authors and clusters that merely reflect individual novels.
%%time
from sklearn.mixture import GaussianMixture
gm_range = range(2,8,1)
gm_aic = []
gm_predict = []
gm_score = []
gm_x = X_train_lsa
gm_y = y_train
for i in gm_range:
    # Declare and fit the model.
    gm = GaussianMixture(n_components=i, n_init=10, init_params='random')
    gm.fit(gm_x)
    gm_aic.append(gm.aic(gm_x))
    # Predicted clusters.
    gm_predict.append(gm.predict(gm_x))
    gm_score.append(gm.score(gm_x))
    print('{} completed'.format(i), end=' | ')
plt.scatter(gm_range, gm_aic)
plt.title('Gaussian Mixture: AIC by number of components')
plt.savefig('aic.jpg')
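For reference, AIC rewards fit while penalizing complexity (lower is better):

$$\mathrm{AIC} = 2k - 2\ln(\hat{L})$$

where $k$ is the number of free parameters of the mixture and $\hat{L}$ is its maximized likelihood.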
from sklearn.metrics import v_measure_score
v_score = []
for score in gm_predict:
    v_score.append(v_measure_score(y_train, score))
plt.plot(v_score);
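The v-measure is the harmonic mean of homogeneity $h$ (each cluster contains members of a single author) and completeness $c$ (each author's sentences fall in a single cluster):

$$v = \frac{2hc}{h + c}$$

Scores near 0 mean the clusters carry little information about authorship.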
# Crosstab of true authors vs. the clusters of the 3-component model (gm_predict[1]).
table = pd.crosstab(y_train, gm_predict[1], margins=True)
table.index = list(y_train.unique()) + ['total']
table
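If the clusters aligned with authorship, each author's row would concentrate in a single column; per the Observations above, the clusters found here do not track authors in that way.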