Ryan Harper
import random
import numpy as np
from string import punctuation as punct
import matplotlib.pyplot as plt
import seaborn as sns
import os
import psutil
import operator
from math import ceil
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import v_measure_score
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from keras.layers import Dense, Input, CuDNNLSTM, LSTM, Embedding, Dropout, SpatialDropout1D, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from IPython.display import display, Markdown, Latex
def beep(audio):
    # macOS only: frog, blow, funk, glass, tink, submarine, purr, sosumi
    os.system('afplay /System/Library/Sounds/' + audio + '.aiff')
def memory_check():
    print('')
    print('CPU Percent: {} || Virtual Memory: {}'.format(psutil.cpu_percent(), psutil.virtual_memory()))
    print('')
import pandas as pd
boardgamereviews = pd.read_csv('../data/boardgame/boardgame-comments-english.csv')
boardgamereviews.columns = ['reviewer_id', 'game_id', 'rating', 'comment']
unique_id = boardgamereviews.reviewer_id.unique()
sample_id = random.sample(unique_id.tolist(), 100)
review = boardgamereviews[boardgamereviews['reviewer_id'].isin(sample_id)].copy()
# Center each user's ratings on their personal mean, then min-max scale to 0-1
review['rating_normed'] = review.rating - review['rating'].groupby(review['reviewer_id']).transform('mean')
review['rating_normed'] = (review['rating_normed'] - review['rating_normed'].min()) / (review['rating_normed'].max() - review['rating_normed'].min())
# Rescale to integers to match the original 1-10 rating scale
review['rating_normed'] = review['rating_normed'].apply(lambda val: val * 10).apply(ceil)
if 0 not in review['rating'].unique():
    # the original scale has no 0, so bump the normed floor from 0 up to 1
    review['rating_normed'] = review['rating_normed'].apply(lambda val: 1 if val == 0 else val)
# review['estimate'] = review['rating_normed'] + review['rating'].groupby(review['reviewer_id']).transform('mean')
review['rating'] = review['rating'].apply(round)
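As a sanity check, the centering-and-rescaling can be traced on a made-up two-user frame (the reviewer IDs and ratings below are purely illustrative):
toy = pd.DataFrame({'reviewer_id': ['a', 'a', 'b', 'b'],
                    'rating': [9, 7, 3, 1]})  # 'a' rates generously, 'b' harshly
centered = toy.rating - toy.rating.groupby(toy.reviewer_id).transform('mean')
scaled = (centered - centered.min()) / (centered.max() - centered.min())
print(scaled.apply(lambda v: ceil(v * 10)).tolist())  # [10, 0, 10, 0]: both users land on the same scale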
memory_check()
plt.figure(figsize=(9, 6))
plt.subplot(2, 2, 1)
plt.hist(review.rating, bins=10)
plt.title('Ratings')
plt.subplot(2, 2, 2)
plt.hist(review.rating_normed, bins=10)
plt.title('Ratings (Normed by User)');
print('Processing text dataset')
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
from string import punctuation, ascii_lowercase
import regex as re
from tqdm import tqdm
# replace urls
re_url = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+"
                    r"\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                    re.MULTILINE | re.UNICODE)
# replace ips
re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# setup tokenizer
tokenizer = WordPunctTokenizer()
vocab = Counter()
def text_to_wordlist(text, lower=False):
    # replace URLs
    text = re_url.sub("URL", text)
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)
    # tokenize
    text = tokenizer.tokenize(text)
    # optional: lower case
    if lower:
        text = [t.lower() for t in text]
    # track token counts, then return the list of words
    vocab.update(text)
    return text
def process_comments(list_sentences, lower=False):
    comments = []
    for text in tqdm(list_sentences):
        txt = text_to_wordlist(text, lower=lower)
        comments.append(txt)
    return comments
comments_train = list(review["comment"].fillna("NAN_WORD").values)
review['token'] = process_comments(comments_train, lower=True)
print("The vocabulary contains {} unique tokens".format(len(vocab)))
print('')
memory_check()
CREDIT: The methodology and some of the code in this section closely follow Kenny Liao's approach: treat each vocabulary word as a feature whose value for a given comment is that comment's rating when the word appears in it (missing otherwise).
word_list = set([v for sublist in review.token.tolist() for v in sublist])
vocab_ratings = review[['rating']].copy()
vocab_ratings.columns = ['Rating']
token_ratings = review[['rating', 'token']]
# One column per word: the comment's rating where the word appears, else NaN
for v in tqdm(word_list):
    vocab_ratings[v] = [rating if v in tokens else None
                        for rating, tokens in zip(token_ratings['rating'], token_ratings['token'])]
# sanity check: any word columns with no observed ratings at all?
[col for col, val in vocab_ratings.count().iteritems() if val < 1]
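For intuition, here is the same word-by-rating construction on a hypothetical two-comment corpus (the words and ratings are invented for illustration):
toy_tokens = [(8, ['fun', 'strategy']), (3, ['boring'])]
toy_matrix = pd.DataFrame({w: [r if w in toks else None for r, toks in toy_tokens]
                           for w in ['boring', 'fun', 'strategy']})
print(toy_matrix)
#    boring  fun  strategy
# 0     NaN  8.0       8.0
# 1     3.0  NaN       NaN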
# Split the data into a 67/33 train/test split
X_train, X_test, y_train, y_test = train_test_split(vocab_ratings.iloc[:, 1:], vocab_ratings['Rating'], test_size=0.33, random_state=42)
from math import sqrt
def rmse(true, predicted):
    return sqrt(((true - predicted) ** 2).mean())
Get the baseline RMSE if we predict that every comment's rating equals the median of all training ratings.
baseline_median = y_train.median(); baseline_median
rmse_baseline = rmse(y_test, baseline_median)
display(Markdown(f"__Baseline RMSE:__ _{rmse_baseline:.5f}_"))
# describe() gives per-word count, mean, and percentiles (50% is the median)
vocab_describe = X_train.describe().T
# Closure that builds a named percentile aggregator
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'Q_%s' % n
    return percentile_
# example: 25th percentile of ratings for comments containing "king"
percentile(25)(vocab_ratings.king.dropna())
# For each rating bucket i, take the 10 most frequent words whose median rating falls in (i-1, i]
wordsets = []
ratings_range = range(1, 11)
for i in ratings_range:
    q = vocab_describe[(vocab_describe['50%'] > i - 1) & (vocab_describe['50%'] <= i)]
    q = q.sort_values(by=['count'], ascending=False)
    q = q[0:10]
    wordsets.append(q.index.values.tolist())
wordsets[0]
ax = sns.distplot(vocab_describe["50%"].dropna(), kde=False, bins=40, hist_kws={'log':True})
# word -> median rating lookup
assign_values = dict(zip(vocab_describe.index, vocab_describe['50%']))
def isNaN(num):
    return num != num
# Replace each observed rating with that word's median; use copies so the originals stay intact
X_train_median = X_train.copy()
X_test_median = X_test.copy()
for col in X_train:
    X_train_median[col] = X_train_median[col].apply(lambda val: assign_values[col] if not isNaN(val) else None)
    X_test_median[col] = X_test_median[col].apply(lambda val: assign_values[col] if not isNaN(val) else None)
# Predict each comment's rating as the mean of its words' medians
y_train_predict = X_train_median.sum(axis=1) / X_train_median.count(axis=1)
y_test_predict = X_test_median.sum(axis=1) / X_test_median.count(axis=1)
train_median = rmse(y_train, y_train_predict)
display(Markdown(f"__Word-Median Model RMSE (train):__ _{train_median:.5f}_"))
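y_test_predict is computed above but never scored; the held-out RMSE is one more line (the name test_median is my own):
test_median = rmse(y_test, y_test_predict)
display(Markdown(f"__Word-Median Model RMSE (test):__ _{test_median:.5f}_"))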
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode(connected=True)  # required once for offline iplot in a notebook
trace0 = go.Scatter(
    x=y_train,
    y=y_train_predict,
    mode='markers')
trace1 = go.Scatter(
    x=[0, 10],
    y=[0, 10],
    name='Ideal Line')
layout = go.Layout(
    xaxis=dict(title='Actual'),
    yaxis=dict(title='Predicted'),
    hovermode='closest',
    margin=dict(t=50))
data = [trace0, trace1]
fig = go.Figure(data, layout)
plotly.offline.iplot(fig, filename='predictions-naive.html')
WV_DIM = 200
# CBOW (sg=0), 10-token window, negative sampling; min_count=1 keeps every token
model = Word2Vec(review['token'], size=WV_DIM, window=10, min_count=1, workers=4, sg=0, negative=3)
vec_size = model.layer1_size
word_vectors = model.wv
print("Number of word vectors: {}".format(len(word_vectors.vocab)))
Cosine Similarity Function
Version A. Raw Code
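The cosine similarity function itself doesn't appear above; a minimal raw-NumPy sketch consistent with the "Raw Code" label might look like this (the name cos_sim and the example words are my own assumptions):
def cos_sim(a, b):
    # cosine similarity: dot product divided by the product of the L2 norms
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# e.g., compare two learned embeddings (words assumed to be in the vocabulary)
# cos_sim(word_vectors['fun'], word_vectors['boring'])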
# Give each of the 10 rating buckets its own colour, then flatten so each word
# in wordsets inherits its bucket's colour (assumes 10 words per bucket)
cols_conv = ratings_range
colors = cm.rainbow(np.linspace(0, 1, len(cols_conv)))
fullcolors = []
for c in colors:
    for i in ratings_range:
        fullcolors.append(c)
fullwords = []
for wset in wordsets:
    for w in wset:
        fullwords.append(w)
colors_dict = dict(zip(fullwords, fullcolors))
X = model[model.wv.vocab]
words = list(model.wv.vocab)
graph_tsne = TSNE(n_components=2)
result = graph_tsne.fit_transform(X)
# create a scatter plot of the projection
plt.figure(figsize=(20, 20))
plt.scatter(result[:, 0], result[:, 1], color='gray')
for i, word in enumerate(words):
    if word in colors_dict.keys():
        plt.annotate(word,
                     xy=(result[i, 0],
                         result[i, 1]),
                     size=18, bbox=dict(facecolor='white', edgecolor=colors_dict[word], boxstyle='round', alpha=.4))
handle_set = []
for i in ratings_range:
    handle_set.append(mpatches.Patch(color=colors[i - 1], label='Rating {}'.format(i)))
plt.legend(handles=handle_set)
plt.show()
X = model[model.wv.vocab]
words = list(model.wv.vocab)
graph_tsne = TSNE(n_components=2)
result = graph_tsne.fit_transform(X)
# create a scatter plot of the projection
plt.figure(figsize=(20, 20))
plt.scatter(result[:, 0], result[:, 1], color='gray')
for i, word in enumerate(words):
    if word in colors_dict.keys():
        plt.annotate(word,
                     xy=(result[i, 0],
                         result[i, 1]),
                     size=18,
                     backgroundcolor=colors_dict[word], alpha=.4)
handle_set = []
for i in ratings_range:
    handle_set.append(mpatches.Patch(color=colors[i - 1], label='Rating {}'.format(i)))
plt.legend(handles=handle_set)
plt.show()
X = model[model.wv.vocab]
words = list(model.wv.vocab)
graph_tsne = TSNE(n_components=2)
result = graph_tsne.fit_transform(X)
# create a scatter plot of the projection
plt.figure(figsize=(20, 20))
plt.scatter(result[:, 0], result[:, 1], color='gray')
for i, word in enumerate(words):
    if word in colors_dict.keys():
        plt.annotate(word,
                     xy=(result[i, 0],
                         result[i, 1]),
                     size=18,
                     bbox=dict(facecolor=colors_dict[word], edgecolor=colors_dict[word], boxstyle='round', alpha=.8))
handle_set = []
for i in ratings_range:
    handle_set.append(mpatches.Patch(color=colors[i - 1], label='Rating {}'.format(i)))
plt.legend(handles=handle_set)
plt.savefig('word2vec_ratings2.png')
plt.show()