March 22, 2021
Code is adapted from "Google News and Leo Tolstoy: Visualizing Word2Vec Word Embeddings using t-SNE" by Sergey Smetanin.
We use Google's Word2Vec vectors from https://code.google.com/archive/p/word2vec/.
The word2vec project page describes these vectors as follows: "We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described by Mikolov et al., 2013."
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.
import gensim
import gensim.downloader
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import numpy as np
import tempfile
import imageio
import shutil
import os
from statistics import mean
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors
from IPython.display import Image
from IPython.display import display
# Do not cap the number of DataFrame columns shown when a table is displayed.
pd.options.display.max_columns = None
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
from matplotlib.axes._axes import _log as matplotlib_axes_logger
# Only let ERROR-level records from matplotlib's axes logger through;
# presumably to hide colour-argument warnings emitted by the scatter plots
# below — TODO confirm.
matplotlib_axes_logger.setLevel('ERROR')
# Number of nearest neighbours kept for every key word.
TOP_N = 30

# Key words to analyse. Each one maps to the set of word forms that must not
# be counted as its own neighbour; the sets start empty and are filled in
# right after this block (lemma, stem and the key word itself).
KEYS = {
    key: set()
    for key in (
        "offensive",
        "abusive",
        "cyberbullying",
        "vulgar",
        "racist",
        "homophobic",
        "profane",
        "slur",
        "harassment",  # was misspelled "harrasment", which queried the models for a typo
        "obscene",
        "threat",
        "discredit",
        "hateful",
        "insult",
        "hostile",
        "taboo",
    )
}

# Key words as a list, fixing the cluster/column order used by the helpers.
FIXED_KEYS = list(KEYS)
Words that are too similar to a key word (its lemma, its stem and the key word itself) are not counted as neighbouring words:
# For every key word, record the forms that identify a neighbour as "the same
# word": its WordNet lemma, its Porter stem and the key word itself.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
for key, variants in KEYS.items():
    lemma, stem = lemmatizer.lemmatize(key), ps.stem(key)
    # In-place union: mutates the set stored in KEYS.
    variants |= {lemma, stem, key}
# Bare expression: shows the filled mapping as the notebook cell's output.
KEYS
def same_word(similar_word, ommit_words):
    """Tell whether a neighbour is just another form of the key word.

    The candidate is normalised first: underscores and hyphens become
    spaces and the text is lower-cased. It counts as "the same word" when
    the normalised text is one of ``ommit_words`` or contains any of them
    as a substring. Every rejected candidate is printed together with the
    set that rejected it.

    Args:
        similar_word: candidate neighbour as returned by the model.
        ommit_words: collection of word forms to reject.

    Returns:
        True if the candidate must be omitted, False otherwise.
    """
    normalized = similar_word.translate(str.maketrans("_-", "  ")).lower()
    rejected = normalized in ommit_words or any(
        form in normalized for form in ommit_words
    )
    if rejected:
        print(f"{ommit_words} -- {normalized}")
    return rejected
def getSimilarWords(model_gn):
    """Collect the nearest neighbours of every key word in ``FIXED_KEYS``.

    For each key word the model is asked for ``TOP_N * 3`` candidates.
    Candidates rejected by ``same_word`` (forms of the key word itself, taken
    from ``KEYS[key]``) are skipped, and the first ``TOP_N`` survivors are
    kept together with their vectors.

    Args:
        model_gn: embedding model offering ``most_similar(word, topn=...)``
            and ``model_gn[word]`` vector lookup (presumably a gensim
            ``KeyedVectors`` instance).

    Returns:
        A ``(word_clusters, embedding_clusters)`` tuple of two lists that
        run parallel to ``FIXED_KEYS``: the kept neighbour words and their
        vectors for each key word. A cluster holds fewer than ``TOP_N``
        entries when too many candidates were rejected; this is reported on
        stdout.
    """
    embedding_clusters = []
    word_clusters = []
    for key in FIXED_KEYS:
        omit_words = KEYS[key]
        candidates = model_gn.most_similar(key, topn=TOP_N * 3)
        # Every candidate is filtered (not only until TOP_N are found) so
        # that same_word prints all rejected neighbours.
        words = [
            word for word, _ in candidates if not same_word(word, omit_words)
        ]
        if len(words) < TOP_N:
            # Say which key word came up short and by how much.
            print(
                f"ERROR: only {len(words)} of {TOP_N} neighbours left "
                f"for '{key}' after filtering"
            )
        words = words[:TOP_N]
        embeddings = [model_gn[word] for word in words]
        embedding_clusters.append(embeddings)
        word_clusters.append(words)
    return (word_clusters, embedding_clusters)
def displayDF(word_clusters):
    """Show the neighbour lists as a table with one column per key word.

    Args:
        word_clusters: neighbour lists, one per entry of ``FIXED_KEYS`` and
            in the same order.
    """
    columns = {key: cluster for key, cluster in zip(FIXED_KEYS, word_clusters)}
    table = pd.DataFrame(columns)
    display(table)
def plot_similar_words(title, labels, embedding_clusters, word_clusters, filename=None):
    """Scatter-plot 2-D word clusters, one colour per label.

    Every point is annotated with its word and every cluster gets its
    upper-cased label drawn at the cluster's mean position.

    Args:
        title: figure title.
        labels: one label per cluster; used for the legend and the
            cluster captions.
        embedding_clusters: per-cluster arrays indexed as ``[:, 0]`` (x) and
            ``[:, 1]`` (y).
        word_clusters: per-cluster word lists, parallel to the points.
        filename: if given, the figure is also saved there as a PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` instead of `c=`: a single RGBA row passed as `c` is
        # ambiguous (matplotlib warns about it, and maps it as per-point
        # values when the cluster happens to have as many points as the
        # row has entries).
        plt.scatter(x, y, color=color, alpha=0.7, label=label)
        for word, word_x, word_y in zip(words, x, y):
            plt.annotate(word, alpha=0.5, xy=(word_x, word_y), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
        # Cluster caption: white text with a black outline so it stays
        # readable on top of the points.
        plt.text(x.mean(), y.mean(), label.upper(), color='white', weight='bold', fontsize=13,
                 path_effects=[PathEffects.withStroke(linewidth=3, foreground="black", alpha=0.9)])
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(False)
    if filename:
        # Save before show(): the figure is written while it is still current.
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
def _project_and_plot(reducer, title, word_clusters, embedding_clusters, filename=None):
    """Reduce the clusters to 2-D with ``reducer`` and plot them.

    Args:
        reducer: object with a ``fit_transform`` method that returns two
            columns per input row.
        title: figure title.
        word_clusters: per-key lists of neighbour words.
        embedding_clusters: per-key lists of vectors; every cluster must
            hold the same number of equally sized vectors.
        filename: optional PNG path passed on to ``plot_similar_words``.

    Raises:
        ValueError: if the clusters do not form a regular 3-D array.
    """
    embedding_clusters = np.array(embedding_clusters)
    if embedding_clusters.ndim != 3:
        raise ValueError(
            "embedding_clusters must form a (keys, words, dims) array; "
            "clusters of unequal length cannot be projected"
        )
    n, m, k = embedding_clusters.shape
    # Fit all clusters together so they share one 2-D space, then restore
    # the per-key grouping.
    flat_2d = reducer.fit_transform(embedding_clusters.reshape(n * m, k))
    embeddings_en_2d = np.array(flat_2d).reshape(n, m, 2)
    plot_similar_words(title, FIXED_KEYS, embeddings_en_2d, word_clusters, filename)


def plotTSNE(title, word_clusters, embedding_clusters, filename=None):
    """Plot the word clusters after a t-SNE projection to two dimensions.

    Args:
        title: figure title.
        word_clusters: per-key lists of neighbour words.
        embedding_clusters: per-key lists of word vectors.
        filename: optional PNG path for saving the figure.
    """
    # NOTE(review): newer scikit-learn releases rename `n_iter` to
    # `max_iter` — confirm against the installed version.
    reducer = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    _project_and_plot(reducer, title, word_clusters, embedding_clusters, filename)


def plotMDS(title, word_clusters, embedding_clusters, filename=None):
    """Plot the word clusters after an MDS projection to two dimensions.

    Args:
        title: figure title.
        word_clusters: per-key lists of neighbour words.
        embedding_clusters: per-key lists of word vectors.
        filename: optional PNG path for saving the figure.
    """
    reducer = MDS(n_components=2, max_iter=3500, random_state=32)
    _project_and_plot(reducer, title, word_clusters, embedding_clusters, filename)


def plotPCA(title, word_clusters, embedding_clusters, filename=None):
    """Plot the word clusters after a PCA projection to two dimensions.

    Args:
        title: figure title.
        word_clusters: per-key lists of neighbour words.
        embedding_clusters: per-key lists of word vectors.
        filename: optional PNG path for saving the figure.
    """
    reducer = PCA(n_components=2, random_state=32)
    _project_and_plot(reducer, title, word_clusters, embedding_clusters, filename)
Loading the Word2Vec model (Google News, 300 dimensions)
# Fetch (or reuse the cached copy of) the Google News Word2Vec vectors.
model_gn = gensim.downloader.load('word2vec-google-news-300')
# gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `.vocab` so gensim 3.x keeps working.
vocabulary = getattr(model_gn, "key_to_index", None)
if vocabulary is None:
    vocabulary = model_gn.vocab
print(f"Vocabulary size: {len(vocabulary)}")
Getting similar words. The words printed out are neighbouring words that are omitted from the analysis.
# Neighbour words and vectors for every key word, using the loaded model.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
displayDF(word_clusters)

# Same clusters under three different 2-D projections; each figure is also
# written to a PNG file.
plotTSNE(
    title="Similar words - Word2Vec [t-SNE]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - word2vec - t-SNE.png",
)
plotMDS(
    title="Similar words - Word2Vec [MDS]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - word2vec - MDS.png",
)
plotPCA(
    title="Similar words - Word2Vec [PCA]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - word2vec - PCA.png",
)
Loading the GloVe model (Wikipedia + Gigaword, 300 dimensions)
# Fetch (or reuse the cached copy of) the GloVe Wikipedia + Gigaword vectors.
model_gn = gensim.downloader.load('glove-wiki-gigaword-300')
# gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `.vocab` so gensim 3.x keeps working.
vocabulary = getattr(model_gn, "key_to_index", None)
if vocabulary is None:
    vocabulary = model_gn.vocab
print(f"Vocabulary size: {len(vocabulary)}")
Getting similar words. The words printed out are neighbouring words that are omitted from the analysis.
# Neighbour words and vectors for every key word, using the loaded model.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
displayDF(word_clusters)

# Same clusters under three different 2-D projections; each figure is also
# written to a PNG file.
plotTSNE(
    title="Similar words - Glove [t-SNE]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - Glove - t-SNE.png",
)
plotMDS(
    title="Similar words - Glove [MDS]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - Glove - MDS.png",
)
plotPCA(
    title="Similar words - Glove [PCA]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - Glove - PCA.png",
)
Loading the fastText model (Common Crawl + Wikipedia, 300 dimensions)
# One-time setup: uncomment and run once to download the English model.
# import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore')  # English
# ft = fasttext.load_model('cc.en.300.bin')
# Model source (as used by Chaya): https://fasttext.cc/docs/en/crawl-vectors.html
# The file cc.en.300.bin includes both Common Crawl and Wikipedia.
model_gn = load_facebook_vectors("cc.en.300.bin")
# gensim 4 removed the `.vocab` attribute in favour of `key_to_index`;
# fall back to `.vocab` so gensim 3.x keeps working.
vocabulary = getattr(model_gn, "key_to_index", None)
if vocabulary is None:
    vocabulary = model_gn.vocab
print(f"Vocabulary size: {len(vocabulary)}")
Getting similar words. The words printed out are neighbouring words that are omitted from the analysis.
# Neighbour words and vectors for every key word, using the loaded model.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
displayDF(word_clusters)

# Same clusters under three different 2-D projections; each figure is also
# written to a PNG file.
plotTSNE(
    title="Similar words - fastText [t-SNE]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - fastText - t-SNE.png",
)
plotMDS(
    title="Similar words - fastText [MDS]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - fastText - MDS.png",
)
plotPCA(
    title="Similar words - fastText [PCA]",
    word_clusters=word_clusters,
    embedding_clusters=embedding_clusters,
    filename="SimilarWords - fastText - PCA.png",
)