Visualizing Offensive language keywords - Word2Vec and t-SNE

March 22, 2021

Code is adapted from the article "Google News and Leo Tolstoy: Visualizing Word2Vec Word Embeddings using t-SNE" by Sergey Smetanin.

We use Google's Word2Vec vectors from https://code.google.com/archive/p/word2vec/.

From the project page: "We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described by Mikolov et al., 2013."

Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.

Imports

In [1]:
import gensim
import gensim.downloader
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import numpy as np
import tempfile
import imageio
import shutil
import os
from statistics import mean
import pandas as pd 
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors

from IPython.display import Image
from IPython.display import display
pd.options.display.max_columns = None

import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

Constants

In [2]:
# Number of neighbouring words kept for each keyword.
TOP_N = 30

# Every keyword maps to its own (initially empty) set; the sets are filled
# later with the word forms that must not count as neighbours of the keyword.
# NOTE(review): "harrasment" looks like a misspelling of "harassment". It is
# kept as-is because the recorded outputs were produced with this spelling;
# confirm whether the misspelling is intentional before changing it.
KEYS = {
    keyword: set()
    for keyword in (
        "offensive",
        "abusive",
        "cyberbullying",
        "vulgar",
        "racist",
        "homophobic",
        "profane",
        "slur",
        "harrasment",
        "obscene",
        "threat",
        "discredit",
        "hateful",
        "insult",
        "hostile",
        "taboo",
    )
}

# Keyword order is frozen once so that tables and plots share the same order.
FIXED_KEYS = list(KEYS)

Words that are too similar to a keyword (the keyword itself, its lemma and its stem) are not counted as neighbouring words:

In [3]:
# Seed each keyword's omit-set with the keyword itself plus its WordNet lemma
# and its Porter stem.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

for key, variants in KEYS.items():
    lemma, stem = lemmatizer.lemmatize(key), ps.stem(key)
    # In-place union: mutates the set stored in KEYS rather than rebinding it.
    variants |= {lemma, stem, key}
KEYS
Out[3]:
{'offensive': {'offens', 'offensive'},
 'abusive': {'abus', 'abusive'},
 'cyberbullying': {'cyberbulli', 'cyberbullying'},
 'vulgar': {'vulgar'},
 'racist': {'racist'},
 'homophobic': {'homophob', 'homophobic'},
 'profane': {'profan', 'profane'},
 'slur': {'slur'},
 'harrasment': {'harras', 'harrasment'},
 'obscene': {'obscen', 'obscene'},
 'threat': {'threat'},
 'discredit': {'discredit'},
 'hateful': {'hate', 'hateful'},
 'insult': {'insult'},
 'hostile': {'hostil', 'hostile'},
 'taboo': {'taboo'}}

Functions

In [4]:
def same_word(similar_word, ommit_words):
    """Tell whether a candidate neighbour is just a variant of the keyword.

    The candidate is normalised (underscores and hyphens become spaces,
    everything lower-cased) and is treated as "the same word" when it equals
    one of ``ommit_words`` or contains one of them as a substring. Every
    match is printed as ``<ommit_words> -- <normalised candidate>``.

    Args:
        similar_word: Candidate neighbour word or phrase.
        ommit_words: Collection of word forms that disqualify a candidate.

    Returns:
        True if the candidate should be left out, False otherwise.
    """
    normalized = similar_word.replace("_", " ").replace("-", " ").lower()

    # Exact membership first, then substring containment.
    is_variant = normalized in ommit_words
    if not is_variant:
        is_variant = any(variant in normalized for variant in ommit_words)

    if is_variant:
        print(f"{ommit_words} -- {normalized}")
    return is_variant
In [5]:
def getSimilarWords(model_gn):
    """Collect the TOP_N nearest neighbours of every keyword in FIXED_KEYS.

    For each keyword the model is asked for ``TOP_N * 3`` candidates, so that
    enough remain once the near-duplicates of the keyword (as judged by
    ``same_word`` against ``KEYS[key]``) have been dropped. The surviving
    candidates are cut down to the first ``TOP_N``.

    If fewer than ``TOP_N`` candidates survive for a keyword, a diagnostic
    naming the keyword and the shortfall is printed and the shorter cluster
    is kept as-is.

    Args:
        model_gn: Embedding model that provides ``most_similar(word, topn=...)``
            and vector lookup via ``model_gn[word]``.

    Returns:
        A ``(word_clusters, embedding_clusters)`` tuple of two parallel lists
        with one entry per keyword, in FIXED_KEYS order: the neighbour words
        and their embedding vectors.
    """
    embedding_clusters = []
    word_clusters = []
    for key in FIXED_KEYS:
        ommit_words = KEYS[key]
        # Filter the whole candidate list first so every omitted candidate is
        # still reported by same_word, exactly as before.
        words = [
            similar_word
            for similar_word, _ in model_gn.most_similar(key, topn=TOP_N * 3)
            if not same_word(similar_word, ommit_words)
        ]

        if len(words) < TOP_N:
            # Name the keyword and the shortfall; a bare "ERROR" gave no clue
            # which of the keywords ran out of candidates.
            print(f"ERROR: only {len(words)} of {TOP_N} neighbours left for '{key}' after filtering")

        words = words[:TOP_N]
        # Look up vectors only for the words that are actually kept.
        embeddings = [model_gn[word] for word in words]

        embedding_clusters.append(embeddings)
        word_clusters.append(words)

    return (word_clusters, embedding_clusters)
In [6]:
def displayDF(word_clusters):
    """Show the neighbour words as a table with one column per keyword.

    Args:
        word_clusters: One list of words per keyword, in FIXED_KEYS order.
    """
    columns = {key: cluster for key, cluster in zip(FIXED_KEYS, word_clusters)}
    display(pd.DataFrame(columns))
In [7]:
def plot_similar_words(title, labels, embedding_clusters, word_clusters, filename=None):
    """Scatter-plot 2-D word embeddings, one colour per cluster.

    Each cluster is drawn in its own colour from the rainbow colormap, every
    point is annotated with its word, and the upper-cased cluster label is
    drawn at the mean position of the cluster.

    Args:
        title: Figure title.
        labels: One label per cluster; used for the legend and the centroid text.
        embedding_clusters: One 2-D array per cluster; column 0 is plotted as
            x and column 1 as y.
        word_clusters: One list of words per cluster, matching the points.
        filename: If truthy, the figure is also saved there as a 150-dpi PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the colour as a single-row 2-D sequence. A bare RGBA vector in
        # `c` makes matplotlib warn, and it is interpreted as per-point values
        # whenever the cluster happens to have as many points as the vector
        # has entries.
        plt.scatter(x, y, c=[color], alpha=0.7, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)

        # White text with a black outline so the label stays readable on top
        # of the points.
        plt.text(x.mean(), y.mean(), label.upper(), color='white', weight='bold', fontsize=13,
                 path_effects=[PathEffects.withStroke(linewidth=3, foreground="black", alpha=0.9)])
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(False)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
In [8]:
def plotTSNE(title, word_clusters, embedding_clusters, filename=None):
    """Project the embedding clusters to 2-D with t-SNE and plot them.

    Args:
        title: Figure title.
        word_clusters: One list of words per keyword, in FIXED_KEYS order.
        embedding_clusters: Nested sequence that converts to a 3-D array
            (clusters x words x embedding dimension).
        filename: Optional path passed on to ``plot_similar_words``.
    """
    clusters = np.array(embedding_clusters)
    n_clusters, n_words, dim = clusters.shape
    # t-SNE works on a flat list of points, so merge the first two axes.
    flat_points = clusters.reshape(n_clusters * n_words, dim)
    reducer = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    projected = reducer.fit_transform(flat_points)
    # Restore the per-cluster grouping for plotting.
    points_2d = np.array(projected).reshape(n_clusters, n_words, 2)
    plot_similar_words(title, FIXED_KEYS, points_2d, word_clusters, filename)
In [9]:
def plotMDS(title, word_clusters, embedding_clusters, filename=None):
    """Project the embedding clusters to 2-D with MDS and plot them.

    Args:
        title: Figure title.
        word_clusters: One list of words per keyword, in FIXED_KEYS order.
        embedding_clusters: Nested sequence that converts to a 3-D array
            (clusters x words x embedding dimension).
        filename: Optional path passed on to ``plot_similar_words``.
    """
    clusters = np.array(embedding_clusters)
    n_clusters, n_words, dim = clusters.shape
    # MDS works on a flat list of points, so merge the first two axes.
    flat_points = clusters.reshape(n_clusters * n_words, dim)
    reducer = MDS(n_components=2, max_iter=3500, random_state=32)
    projected = reducer.fit_transform(flat_points)
    # Restore the per-cluster grouping for plotting.
    points_2d = np.array(projected).reshape(n_clusters, n_words, 2)
    plot_similar_words(title, FIXED_KEYS, points_2d, word_clusters, filename)
In [10]:
def plotPCA(title, word_clusters, embedding_clusters, filename=None):
    """Project the embedding clusters to 2-D with PCA and plot them.

    Args:
        title: Figure title.
        word_clusters: One list of words per keyword, in FIXED_KEYS order.
        embedding_clusters: Nested sequence that converts to a 3-D array
            (clusters x words x embedding dimension).
        filename: Optional path passed on to ``plot_similar_words``.
    """
    clusters = np.array(embedding_clusters)
    n_clusters, n_words, dim = clusters.shape
    # PCA works on a flat list of points, so merge the first two axes.
    flat_points = clusters.reshape(n_clusters * n_words, dim)
    reducer = PCA(n_components=2, random_state=32)
    projected = reducer.fit_transform(flat_points)
    # Restore the per-cluster grouping for plotting.
    points_2d = np.array(projected).reshape(n_clusters, n_words, 2)
    plot_similar_words(title, FIXED_KEYS, points_2d, word_clusters, filename)

Word2Vec

Loading model

In [11]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader API.
model_gn = gensim.downloader.load('word2vec-google-news-300')
In [12]:
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; fall
# back to `.vocab` so the cell also runs on gensim 3.x.
print(f"Vocabulary size: {len(model_gn.key_to_index if hasattr(model_gn, 'key_to_index') else model_gn.vocab)}")
Vocabulary size: 3000000

Getting similar words. The words printed below are neighbouring words that are omitted from the analysis.

In [13]:
# Build the neighbour-word and embedding clusters for every keyword; omitted
# near-duplicates are printed as a side effect of the filtering.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
{'offensive', 'offens'} -- offensive
{'offensive', 'offens'} -- offensively
{'offensive', 'offens'} -- offense
{'offensive', 'offens'} -- offensively
{'offensive', 'offens'} -- offensive line
{'offensive', 'offens'} -- offensive firepower
{'abusive', 'abus'} -- abusive
{'abusive', 'abus'} -- verbally abusive
{'abusive', 'abus'} -- abuse
{'abusive', 'abus'} -- verbal abuse
{'abusive', 'abus'} -- abused
{'abusive', 'abus'} -- physically abused
{'abusive', 'abus'} -- abuser
{'abusive', 'abus'} -- abusive neglectful
{'abusive', 'abus'} -- verbal abuse
{'abusive', 'abus'} -- profane abusive
{'abusive', 'abus'} -- sexually abusive
{'abusive', 'abus'} -- verbally abused
{'abusive', 'abus'} -- abusing
{'abusive', 'abus'} -- sexually abused
{'abusive', 'abus'} -- abusive profane
{'abusive', 'abus'} -- verbally abusing
{'abusive', 'abus'} -- nonabusive
{'abusive', 'abus'} -- physically abusing
{'cyberbulli', 'cyberbullying'} -- cyberbullying
{'cyberbulli', 'cyberbullying'} -- cyberbullies
{'cyberbulli', 'cyberbullying'} -- bullying cyberbullying
{'cyberbulli', 'cyberbullying'} -- cyberbulling
{'cyberbulli', 'cyberbullying'} -- cyberbullied
{'cyberbulli', 'cyberbullying'} -- cyberbullying sexting
{'cyberbulli', 'cyberbullying'} -- cyberbullies
{'vulgar'} -- vulgarity
{'vulgar'} -- vulgar obscene
{'vulgar'} -- vulgar language
{'vulgar'} -- vulgarism
{'vulgar'} -- profane vulgar
{'vulgar'} -- vulgar language
{'racist'} -- racists
{'racist'} -- racist
{'racist'} -- overtly racist
{'racist'} -- racist sexist
{'racist'} -- blatantly racist
{'racist'} -- racist slur
{'racist'} -- racist remark
{'racist'} -- racist
{'racist'} -- racist overtones
{'racist'} -- racist bigoted
{'racist'} -- racist undertones
{'racist'} -- racist bigot
{'racist'} -- racist slurs
{'racist'} -- racist connotations
{'racist'} -- racist sexist homophobic
{'racist'} -- sexist racist
{'racist'} -- racist homophobic
{'racist'} -- racist taunts
{'racist'} -- racist bigots
{'racist'} -- racist epithets
{'homophobic', 'homophob'} -- homophobia
{'homophobic', 'homophob'} -- homophobic
{'homophobic', 'homophob'} -- homophobes
{'homophobic', 'homophob'} -- homophobe
{'homophobic', 'homophob'} -- homophobic slurs
{'homophobic', 'homophob'} -- homophobic insults
{'homophobic', 'homophob'} -- homophobic bullying
{'homophobic', 'homophob'} -- homophobic attitudes
{'homophobic', 'homophob'} -- homophobic taunts
{'homophobic', 'homophob'} -- sexist homophobic
{'homophobic', 'homophob'} -- virulently homophobic
{'homophobic', 'homophob'} -- blatant homophobia
{'homophobic', 'homophob'} -- racist sexist homophobic
{'homophobic', 'homophob'} -- homophobic misogynistic
{'homophobic', 'homophob'} -- homophobia
{'homophobic', 'homophob'} -- bigot homophobe
{'homophobic', 'homophob'} -- homophobic bigots
{'homophobic', 'homophob'} -- racist homophobic sexist
{'profan', 'profane'} -- profanity
{'profan', 'profane'} -- profanities
{'profan', 'profane'} -- profanity
{'profan', 'profane'} -- profane vulgar
{'profan', 'profane'} -- profane abusive
{'profan', 'profane'} -- obscene profane
{'profan', 'profane'} -- abusive profane
{'profan', 'profane'} -- vulgar profane
{'profan', 'profane'} -- profane
{'profan', 'profane'} -- profanity laced
{'profan', 'profane'} -- profanity laden
{'profan', 'profane'} -- profanity obscenity
{'profan', 'profane'} -- mild profanity
{'slur'} -- racial slur
{'slur'} -- racist slur
{'slur'} -- slurs
{'slur'} -- racial slurs
{'slur'} -- homophobic slurs
{'slur'} -- homophobic slur
{'slur'} -- sexist slurs
{'slur'} -- sexist slur
{'slur'} -- racist slurs
{'slur'} -- homosexual slur
{'slur'} -- ethnic slur
{'slur'} -- gay slur
{'slur'} -- derogatory slurs
{'slur'} -- antigay slur
{'slur'} -- racial slurs
{'slur'} -- derogatory slur
{'slur'} -- anti semitic slur
{'slur'} -- ethnic slurs
{'slur'} -- homosexual slurs
{'harras', 'harrasment'} -- harrassment
{'harras', 'harrasment'} -- sexual harrasment
{'harras', 'harrasment'} -- harrasing
{'harras', 'harrasment'} -- sexual harrassment
{'harras', 'harrasment'} -- harrassment
{'harras', 'harrasment'} -- harrassing
{'harras', 'harrasment'} -- harrased
{'obscene', 'obscen'} -- obscene
{'obscene', 'obscen'} -- vulgar obscene
{'obscene', 'obscen'} -- obscenity
{'obscene', 'obscen'} -- obscene pornographic
{'obscene', 'obscen'} -- profanity obscenity
{'obscene', 'obscen'} -- obscenities
{'obscene', 'obscen'} -- obscenity pornography
{'threat'} -- threats
{'threat'} -- threat posed
{'threat'} -- threat
{'threat'} -- existential threat
{'threat'} -- pose threat
{'threat'} -- graver threat
{'threat'} -- gravest threat
{'threat'} -- threats
{'threat'} -- threatened
{'threat'} -- cyberthreat
{'threat'} -- threaten
{'threat'} -- threatening
{'threat'} -- gravest threats
{'threat'} -- threatens
{'threat'} -- veiled threat
{'discredit'} -- discrediting
{'discredit'} -- discredit plame
{'discredit'} -- discrediting
{'discredit'} -- discredited
{'discredit'} -- discredits
{'hate', 'hateful'} -- vile hateful
{'hate', 'hateful'} -- hate mongering
{'hate', 'hateful'} -- hateful rhetoric
{'hate', 'hateful'} -- hatefully
{'hate', 'hateful'} -- hatefulness
{'hate', 'hateful'} -- hate
{'hate', 'hateful'} -- spew hateful
{'hate', 'hateful'} -- offensive hateful
{'hate', 'hateful'} -- hate mongers
{'hate', 'hateful'} -- hateful racist
{'hate', 'hateful'} -- hatemongering
{'hate', 'hateful'} -- hatemongers
{'hate', 'hateful'} -- bigoted hateful
{'hate', 'hateful'} -- hate speech
{'hate', 'hateful'} -- hatered
{'insult'} -- insulting
{'insult'} -- insulted
{'insult'} -- insult
{'insult'} -- gratuitous insult
{'insult'} -- insults
{'insult'} -- adding insult
{'insult'} -- grievous insult
{'insult'} -- gratuitously insulting
{'insult'} -- racially insulting
{'insult'} -- insulting
{'hostile', 'hostil'} -- hostile
{'hostile', 'hostil'} -- hostility
{'hostile', 'hostil'} -- nonhostile
{'hostile', 'hostil'} -- nonhostile causes
{'hostile', 'hostil'} -- hostile takeover
{'hostile', 'hostil'} -- racially hostile
{'hostile', 'hostil'} -- hostile takeover
{'hostile', 'hostil'} -- outright hostility
{'hostile', 'hostil'} -- overtly hostile
{'hostile', 'hostil'} -- nonhostile incidents
{'hostile', 'hostil'} -- hostilely
{'hostile', 'hostil'} -- overt hostility
{'hostile', 'hostil'} -- hostility toward
{'hostile', 'hostil'} -- hostile takeover bid
{'hostile', 'hostil'} -- implacably hostile
{'taboo'} -- taboos
{'taboo'} -- taboo subjects
{'taboo'} -- taboo topic
{'taboo'} -- tabooed
{'taboo'} -- cultural taboos
{'taboo'} -- taboos
{'taboo'} -- unspoken taboo
{'taboo'} -- societal taboos
In [14]:
# Show the Word2Vec neighbour words as a table, one column per keyword.
displayDF(word_clusters)
offensive abusive cyberbullying vulgar racist homophobic profane slur harrasment obscene threat discredit hateful insult hostile taboo
0 defensive graphically_depicts_physically cyber_bullying profane racism racist vulgar derogatory harassment vulgar danger malign bigoted affront unfriendly touchy_subject
1 coach_Bob_Palcic Luke_McNorton sexting obscene anti_Semitic gay_bashing obscenities racist_remark Harassment indecent imminent_danger besmirch racist disrespect antagonistic frowned_upon
2 guard_RJ_Mattes Dorian_Wesson bullying crass bigoted antigay vulgar_language epithet harassments pornographic menace embarrass vile disrespectful nipple_pinching verboten
3 coach_Jimmy_Heggins inappropriate Cyber_bullying rude homophobic transphobic foul_language derogatory_remark harassement lewd challenge delegitimize hurtful disgrace warlike stigma_attached
4 promoted_Pete_Metzelaars adequate_Tamberg cyberstalking demeaning hateful gay expletives word_nigger ASPEN_Colo._Actor sexually_explicit dangers demonize mean_spirited denigrate unwelcoming forbidden
5 Dave_Borbely rude Bullying politically_incorrect anti_semitic homosexual curse_words remark Verbal_abuse filthy_diatribe hazard smear anti_semetic demean mistaken_celebratory_gunfire touchy
6 coach_George_Yarno Mo'Nique_searing cyberbully sexist rascist bigoted hateful racial_epithet violance inappropriate nightmare_scenario marginalize bigotry humiliate confrontational touchy_subjects
7 coach_Dave_Magazu Advocate_Safehouse_Project Cyber_Bullying raunchy racialist anti_Semitic foul_mouthed derogatory_language sexual_harassments disgusting possibility vilify vitriolic dishonor inhospitable unmentionable
8 coach_Greg_Studrawa behaves_unreasonably_yells schoolyard_bullying culturally_insensitive_inappropriate racially_motivated hateful politically_incorrect racist_connotations verbal_abuse outrageous peril denigrate racist_sexist_homophobic disrespecting belligerent Broaching
9 coach_Dan_Roushar delete_inappropriate cyber_bulling obnoxious sexist anti_semitic containing_advertising_astroturfing racist harassing insulting concern belittle hatred belittle neocolonialist_enemies broach
10 defensively rude_disrespectful sexual_predators risqué racial racist_sexist scatological_references disparaging Mr._Lutfi_drugged innapropriate dangers_posed defame anti_Semitic taunt donned_riot_gear stigma
11 Pashtun_Zarghun_district Libellous cyber_bullies sexual_innuendos anti_semetic homosexuals Ad_Age_reserves racist_epithet Rathore_behest objectionable Sadequee_countered intimidate racist_bigoted belittling unsupportive genital_piercing
12 specialist_Damien_Groce profane Sexting misogynistic derogatory sexist vulgarity racially_derogatory homophobic_taunts sexually_suggestive risk mislead homophobic disparage bellicose socially_acceptable
13 tackle_Pat_McQuistan foul_language cyber_stalking insulting antisemitic antisemitic sarcastic derogatory_remarks slander vile undeterrable impugn bigots disrespects retaliatory broaching
14 tackle_Rob_Droege disrespectful Cyberstalking lewd blatant_racism anti_Semetic swearwords pejorative harassment_intimidation demeaning imminent undermine bigot_homophobe denigrating mutated_creatures socially_unacceptable
15 offfensive sexually_predatory bullies vile sexist_homophobic transphobic_bullying vulgar_obscene fagot brickbatting sexually_exploitive specter smear_campaign racists use_racial_slurs thinly_veiled_barb unspoken
16 coach_Larry_Beightol editor@kickoff.com bullying_cyber_bullying sexually_suggestive racially_insensitive slurs rude disparaging_remarks forcible_conversion pornography mortal_danger refute sexist_homophobic slur adversarial touchier
17 linebacking_crew demeaning cyberstalkers sexual_innuendoes xenophobic homosexuality cusswords nigger anti_semitic_tirade immoral scare debunk religiously_intolerant offend hospitable Female_genital_mutilation
18 linebacking_corp derogatory cybersafety mildly_suggestive racially_charged anti_semetic mildly_suggestive tirade discrimation offensive_hateful problem insinuate vitriol_spewed shame intimidating transexuality
19 bend_don't_break harassing identity_theft puerile racially_intolerant gays scatological racial_connotations non_cognisable_offense disseminating_pornographic chant_Omm humiliate racist_sexist travesty violent unwritten_rule
20 Andria_Hurley inapproriate relational_aggression overtly_sexual racial_slurs lesbian epithets n_****_r Carlos_Irwin_Estevez sexually_exploitative Naxalism_Maoism tarnish sexist_racist_homophobic demeaning intimidatory Forced_marriages
21 Chris_Kapilovic vulgar_language textual_harassment uncouth slurs racism vitriolic derogatory_slang brutalities_meted Libelous concerns cast_aspersions meanspirited absolute_disgrace friendly stigmas
22 playmakers behaving_violently bullycide hateful anti_Semetic blatantly_racist Avoid_lewd_obscene disparaging_remark Eve_teasing inapropriate photogs_snaps delegitimise bigotted unkind conciliatory politically_incorrect
23 coach_Bob_Bostad drunken_rages cyberharassment derogatory Islamophobic gay_epithets Foul_language unparliamentary_language eve_teasing vulgarity hazards divert_attention racist_bigot offends uncongenial Anal_sex
24 Chad_Germer Crystal_Kimberly_Elise inhalant_abuse tasteless racially_prejudiced bigotry No_vulgarity_racial homophobic_remark pro_cussers reprehensible risks rebut despicable disparaging Sayyed_Moqtada_al_Sadr Interracial_dating
25 offen_sive mother_Erica_Alphonse homophobic_bullying smutty insulting racist_sexist_anti_Semitic sexist_racist racial_epithets homophobic_insults Lewd abstracted_backdrop implicate vulgar disservice agressive Lesbianism
26 fensive vulgar cybercrime indecorous homophobic_sexist overtly_racist derogatory epithets sexual_harassment obsenity Fred_Daskoski_spokesman subvert anti_Semetic insensitive Panshir_Afghanistan stigma_associated
27 physically_outmatched harmfully_lax cybercrimes risque bigot_homophobe misogynist Satanic_symbols derogative harassment_meted profane warning misinform homophobic_sexist disrepectful decidedly_unfriendly touchy_topic
28 coach_Pete_Hoener sexist cyber_bullying_sexting overt_sexuality nigger LGBT laced_tirade homophobic_insults homophobic_slurs lude formidable_obstacle disinformation rascist mockery Christos_Kittas patriarchal_society
29 coach_Stacey_Searels unsupportive cyber misogynist racially_biased Islamophobic sexual_innuendos insulting uttering_seditious_words ridiculous jeopardy blacken homophobic_rants Jorge_Farinacci intolerant strictly_forbidden
In [15]:
# t-SNE projection of the Word2Vec clusters; the figure is also saved as a PNG.
plotTSNE("Similar words - Word2Vec [t-SNE]", word_clusters, embedding_clusters, "SimilarWords - word2vec - t-SNE.png")
In [16]:
# MDS projection of the Word2Vec clusters; the figure is also saved as a PNG.
plotMDS("Similar words - Word2Vec [MDS]", word_clusters, embedding_clusters, "SimilarWords - word2vec - MDS.png")
In [17]:
# PCA projection of the Word2Vec clusters; the figure is also saved as a PNG.
plotPCA("Similar words - Word2Vec [PCA]", word_clusters, embedding_clusters, "SimilarWords - word2vec - PCA.png")

Glove

Loading model

In [18]:
# Load the pre-trained 'glove-wiki-gigaword-300' vectors through gensim's
# downloader API. This rebinds model_gn to the newly loaded model.
model_gn = gensim.downloader.load('glove-wiki-gigaword-300')
In [19]:
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; fall
# back to `.vocab` so the cell also runs on gensim 3.x.
print(f"Vocabulary size: {len(model_gn.key_to_index if hasattr(model_gn, 'key_to_index') else model_gn.vocab)}")
Vocabulary size: 400000

Getting similar words. The words printed below are neighbouring words that are omitted from the analysis.

In [20]:
# Rebuild the neighbour-word and embedding clusters with the GloVe model;
# omitted near-duplicates are printed as a side effect of the filtering.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
{'offensive', 'offens'} -- offense
{'offensive', 'offens'} -- offensives
{'offensive', 'offens'} -- offensively
{'abusive', 'abus'} -- abuse
{'abusive', 'abus'} -- abused
{'abusive', 'abus'} -- abusing
{'vulgar'} -- vulgarity
{'racist'} -- racists
{'homophobic', 'homophob'} -- homophobia
{'profan', 'profane'} -- profanity
{'slur'} -- slurs
{'harras', 'harrasment'} -- harrassment
{'obscene', 'obscen'} -- obscenity
{'obscene', 'obscen'} -- obscenities
{'threat'} -- threats
{'threat'} -- threatening
{'threat'} -- threatened
{'threat'} -- threaten
{'threat'} -- threatens
{'discredit'} -- discrediting
{'discredit'} -- discredited
{'hate', 'hateful'} -- hate
{'insult'} -- insulting
{'insult'} -- insults
{'insult'} -- insulted
{'hostile', 'hostil'} -- hostility
{'taboo'} -- taboos
In [21]:
# Show the GloVe neighbour words as a table, one column per keyword.
displayDF(word_clusters)
offensive abusive cyberbullying vulgar racist homophobic profane slur harrasment obscene threat discredit hateful insult hostile taboo
0 defensive neglectful bullying profane homophobic sexist vulgar derogatory romish indecent danger undermine bigoted affront unfriendly homosexuality
1 lineman inappropriate cyberstalking obscene sexist racist raunchy homophobic pi96 lewd posed embarrass hurtful disrespect takeover topic
2 linemen sexually cyber-bullying raunchy semitic misogynistic obscene epithets nonfeasance vulgar pose intimidate racist offend belligerent sacrosanct
3 coordinator stepfather sexting risque xenophobic slurs vulgarity disparaging zety profane possibility defame vile humiliation antagonistic forbidden
4 attack behavior anti-gay crass anti-semitic misogynist misogynistic uttered depressurisation pornographic warned rebut homophobic shame enemy topics
5 quarterback verbally vigilantism sexist antisemitic bigoted ribald racist keyrates inappropriate warning humiliate sexist provocation takeovers touchy
6 blitz overbearing victimisation tasteless slurs antisemitic irreverent nigger jiwamol insulting poses tarnish despicable ridicule threats broached
7 forces bullying victimization suggestive racism anti-semitic scatological epithet demoralisation defamatory imminent smear intolerant embarrassment aggressive incest
8 fighting disrespectful self-harm derogatory hateful xenophobic rude sexist supunnabul gestures dangers destabilize idiotic disrespectful fend subject
9 tackle domineering harrassment disrespectful bigoted semitic bawdy semitic ryryryryryry immoral possible portray disgusting injustice enemies unspoken
10 attacking husbands institutionalised rude hate slur disrespectful stereotyping rw96 objectionable terrorism marginalize ignorant slur thwart celibacy
11 assault discriminatory falkoff slang racial hateful risque remark equidistance derogatory concern accuse spiteful outrageous intentions unwritten
12 patriots exploitative anti-muslim demeaning racially anti-gay demeaning obscenities kanoksilp pornography potential subvert vitriolic slander friendly sexuality
13 quarterbacks alcoholic autistics profanity slur bullying derogatory profanities continous blasphemous terrorist refute bellicose offended increasingly stigma
14 defenses practices self-injury disgusting discriminatory disparaging sarcastic racial _____________ disrespectful fears denigrate semitic indignity bid lesbianism
15 onslaught behaviour anti-spam racy derogatory disrespectful sexist insult suwannakij gesture risks demonize xenophobic dignity threatening subjects
16 attacks manipulative nanostructure bawdy insulting epithets impolite pejorative tom.fowler@chron.com sacrilegious serious mislead disrespectful humiliate engaging masturbation
17 raiders oppressive cybercrime impolite fascist intolerant hilariously uttering zilliacus subversive concerns disinformation profane mockery actions untouchable
18 troops immoral anti-bullying misogynistic insults derogatory tirade vulgar rw97 deemed risk destabilise anti-semitic compliment violent unthinkable
19 backfield corrupt texting scatological taunts insensitive hateful tirade rungfapaisarn libelous warnings ploy intemperate apologize overtures shunned
20 counterattack coercive licentiousness jokes misogynistic condescending playful expletives ufdots outrageous fear undermining vulgar disgrace abusive discussing
21 rebels priests rasted inappropriate epithets vulgar suggestive insensitive mongkolporn racist consequences accusation misogynistic racist engage euphemism
22 effort racist ragging homophobic violent bigotry epithets demeaning rosnazura hateful attacks undercut demeaning demeaning attempts divisive
23 nfl insulting homophobic lewd stereotypes transphobic outbursts insults cw96 demeaning any distract barbaric perceived opposing stereotypes
24 jets improper revisionary pretentious misogynist chauvinistic obnoxious bigoted compulsivity explicit terror depose murderous joke confrontational verboten
25 raids unscrupulous spamming hateful disrespectful misogyny boorish defamatory ooooooooooooooooooooooooooooooooooooooo suggestive facing orchestrated shameful mocking threat homosexual
26 army violent exhibitionism irreverent hatred prejudiced homophobic insulting aldingham lascivious alert deflect hatred remark outsiders frowned
27 rebel irresponsible galiardi obnoxious denouncing hypocritical expletives profiling westendorf disparaging looming vilify contemptuous gratuitous stance anathema
28 firepower demeaning islamophobia insensitive disparaging demeaning rants scapegoating misfeasance unlawful warn disprove bloodthirsty sexist suspicious openly
29 starting deceptive bullies outrageous insensitive taunts lewd taunts mo96 seditious concerned assassinate vindictive exaggeration attitude censorship
In [22]:
# t-SNE projection of the GloVe clusters; the figure is also saved as a PNG.
plotTSNE("Similar words - Glove [t-SNE]", word_clusters, embedding_clusters, "SimilarWords - Glove - t-SNE.png")
In [23]:
# MDS projection of the GloVe clusters; the figure is also saved as a PNG.
plotMDS("Similar words - Glove [MDS]", word_clusters, embedding_clusters, "SimilarWords - Glove - MDS.png")
In [24]:
# PCA projection of the GloVe clusters; the figure is also saved as a PNG.
plotPCA("Similar words - Glove [PCA]", word_clusters, embedding_clusters, "SimilarWords - Glove - PCA.png")

fastText

Loading model

In [25]:
# One-time setup: uncomment these lines on the first run to download the
# English fastText model file (cc.en.300.bin).
#import fasttext.util
#fasttext.util.download_model('en', if_exists='ignore')  # English
#ft = fasttext.load_model('cc.en.300.bin')

# Model source: https://fasttext.cc/docs/en/crawl-vectors.html
# The file cc.en.300.bin covers both Common Crawl and Wikipedia.

# Load the vectors from the local file; this rebinds model_gn to the fastText model.
model_gn = load_facebook_vectors("cc.en.300.bin")
In [26]:
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; fall
# back to `.vocab` so the cell also runs on gensim 3.x.
print(f"Vocabulary size: {len(model_gn.key_to_index if hasattr(model_gn, 'key_to_index') else model_gn.vocab)}")
Vocabulary size: 2000000

Getting similar words. The words printed below are neighbouring words that are omitted from the analysis.

In [27]:
# Rebuild the neighbour-word and embedding clusters with the fastText model;
# omitted near-duplicates are printed as a side effect of the filtering.
word_clusters, embedding_clusters = getSimilarWords(model_gn)
{'offensive', 'offens'} -- offensive
{'offensive', 'offens'} -- offensively
{'offensive', 'offens'} -- offensive
{'offensive', 'offens'} -- offensive 
{'offensive', 'offens'} -- offensiv
{'offensive', 'offens'} -- offensive.the
{'offensive', 'offens'} -- offensive.
{'offensive', 'offens'} -- non offensive
{'offensive', 'offens'} -- offense
{'offensive', 'offens'} -- offensive oriented
{'offensive', 'offens'} -- offensiveness
{'offensive', 'offens'} -- offensive.i
{'offensive', 'offens'} -- offensives
{'offensive', 'offens'} -- offensive line
{'offensive', 'offens'} -- offensive defensive
{'offensive', 'offens'} -- offensive minded
{'offensive', 'offens'} -- nonoffensive
{'offensive', 'offens'} -- offense.it
{'offensive', 'offens'} -- offens
{'offensive', 'offens'} -- offense first
{'offensive', 'offens'} -- counter offensive
{'offensive', 'offens'} -- counter offensives
{'offensive', 'offens'} -- offensiveness
{'offensive', 'offens'} -- offense.this
{'offensive', 'offens'} -- offense oriented
{'offensive', 'offens'} -- offense minded
{'offensive', 'offens'} -- offense.that
{'offensive', 'offens'} -- offense.but
{'offensive', 'offens'} -- offensively challenged
{'offensive', 'offens'} -- unoffensive
{'offensive', 'offens'} -- counteroffensives
{'offensive', 'offens'} -- offensively
{'abusive', 'abus'} -- abusiveness
{'abusive', 'abus'} -- abusive
{'abusive', 'abus'} -- abusive
{'abusive', 'abus'} -- abused
{'abusive', 'abus'} -- non abusive
{'abusive', 'abus'} -- abuse
{'abusive', 'abus'} -- abuser
{'abusive', 'abus'} -- abusive.
{'abusive', 'abus'} -- abusively
{'abusive', 'abus'} -- nonabusive
{'abusive', 'abus'} -- abusing
{'abusive', 'abus'} -- self abusive
{'abusive', 'abus'} -- abusers
{'abusive', 'abus'} -- abuses
{'abusive', 'abus'} -- abuser
{'cyberbulli', 'cyberbullying'} -- cyberbullying
{'cyberbulli', 'cyberbullying'} -- cyberbulling
{'cyberbulli', 'cyberbullying'} -- cyberbullies
{'cyberbulli', 'cyberbullying'} -- cyberbullied
{'cyberbulli', 'cyberbullying'} -- cyberbullies
{'cyberbulli', 'cyberbullying'} -- anti cyberbullying
{'cyberbulli', 'cyberbullying'} -- cyberbullying
{'cyberbulli', 'cyberbullying'} -- cyberbullying
{'vulgar'} -- vulgarity
{'vulgar'} -- vulgarities
{'vulgar'} -- vulgarism
{'vulgar'} -- vulgarisms
{'vulgar'} -- vulgarly
{'vulgar'} -- non vulgar
{'vulgar'} -- vulgarians
{'vulgar'} -- vulgar
{'racist'} -- racists
{'racist'} -- racist 
{'racist'} -- racist
{'racist'} -- racist.
{'racist'} -- quasi racist
{'racist'} -- racist
{'racist'} -- racistly
{'racist'} -- racist.the
{'racist'} -- non racists
{'racist'} -- racist.i
{'racist'} -- non racist
{'racist'} -- nonracist
{'racist'} -- anti racist
{'racist'} -- racists.
{'racist'} -- racists
{'homophobic', 'homophob'} -- homophobia
{'homophobic', 'homophob'} -- homophobe
{'homophobic', 'homophob'} -- homophobes
{'homophobic', 'homophob'} -- homophobic
{'homophobic', 'homophob'} -- anti homophobic
{'homophobic', 'homophob'} -- non homophobic
{'homophobic', 'homophob'} -- homophobics
{'homophobic', 'homophob'} -- homophobic
{'homophobic', 'homophob'} -- homophobia
{'homophobic', 'homophob'} -- homophobes
{'homophobic', 'homophob'} -- anti homophobia
{'profan', 'profane'} -- profanely
{'profan', 'profane'} -- profanity
{'profan', 'profane'} -- profanities
{'profan', 'profane'} -- profanity laden
{'profan', 'profane'} -- profane
{'profan', 'profane'} -- profanity filled
{'profan', 'profane'} -- profanes
{'profan', 'profane'} -- profanity laced
{'profan', 'profane'} -- profaned
{'profan', 'profane'} -- profanity free
{'profan', 'profane'} -- profaning
{'profan', 'profane'} -- profanation
{'slur'} -- slurs
{'slur'} -- slur
{'slur'} -- slurring
{'harras', 'harrasment'} -- harrassment
{'harras', 'harrasment'} -- harrasement
{'harras', 'harrasment'} -- harrassement
{'harras', 'harrasment'} -- harrasment
{'harras', 'harrasment'} -- harrassing
{'harras', 'harrasment'} -- harrasing
{'harras', 'harrasment'} -- harrassment
{'harras', 'harrasment'} -- harrasser
{'harras', 'harrasment'} -- harrased
{'harras', 'harrasment'} -- harrassers
{'harras', 'harrasment'} -- harrass
{'harras', 'harrasment'} -- harrassed
{'harras', 'harrasment'} -- harras
{'harras', 'harrasment'} -- harrasses
{'harras', 'harrasment'} -- harrassing
{'obscene', 'obscen'} -- non obscene
{'obscene', 'obscen'} -- obscene
{'obscene', 'obscen'} -- obscenity
{'obscene', 'obscen'} -- obscenely
{'obscene', 'obscen'} -- obscene
{'obscene', 'obscen'} -- obscenities
{'threat'} -- threats
{'threat'} -- threat.the
{'threat'} -- counter threat
{'threat'} -- threat.but
{'threat'} -- threat.as
{'threat'} -- non threat
{'threat'} -- threat.this
{'threat'} -- threat.
{'threat'} -- threat 
{'threat'} -- threat.a
{'threat'} -- threat.and
{'threat'} -- threat.it
{'threat'} -- threat.if
{'threat'} -- threat.in
{'threat'} -- threath
{'threat'} -- threat.i
{'threat'} -- threat
{'threat'} -- threatening
{'threat'} -- threats.the
{'threat'} -- cyber threat
{'threat'} -- threats
{'threat'} -- non threats
{'threat'} -- threatthe
{'threat'} -- threaten
{'threat'} -- threating
{'threat'} -- threat
{'threat'} -- cyber threats
{'threat'} -- threats.in
{'threat'} -- threats.
{'threat'} -- threatener
{'threat'} -- threats 
{'threat'} -- threatened
{'threat'} -- threats.i
{'threat'} -- threate
{'threat'} -- cyberthreat
{'threat'} -- threat based
{'threat'} -- world threatening
{'threat'} -- cyberthreats
{'threat'} -- no threat
{'threat'} -- threatsthe
{'threat'} -- threathening
{'threat'} -- threatining
{'threat'} -- threatens
{'threat'} -- threated
{'threat'} -- threathen
{'threat'} -- death threat
{'discredit'} -- discrediting
{'discredit'} -- discredits
{'discredit'} -- discrediting
{'discredit'} -- discreditation
{'discredit'} -- discredited
{'hate', 'hateful'} -- hate filled
{'hate', 'hateful'} -- hatefilled
{'hate', 'hateful'} -- hatefull
{'hate', 'hateful'} -- hate driven
{'hate', 'hateful'} -- hatefulness
{'hate', 'hateful'} -- hate fueled
{'hate', 'hateful'} -- hatemongering
{'hate', 'hateful'} -- hatefully
{'hate', 'hateful'} -- hate spewing
{'hate', 'hateful'} -- hate mongering
{'hate', 'hateful'} -- hate fuelled
{'hate', 'hateful'} -- hate filled
{'hate', 'hateful'} -- hate mongers
{'hate', 'hateful'} -- hate monger
{'hate', 'hateful'} -- hatemonger
{'hate', 'hateful'} -- hate speech
{'hate', 'hateful'} -- hatemongers
{'hate', 'hateful'} -- hate
{'insult'} -- insulting
{'insult'} -- insulter
{'insult'} -- insults
{'insult'} -- insult
{'insult'} -- insulted
{'insult'} -- insult.
{'insult'} -- insult
{'insult'} -- insult.i
{'insult'} -- insulters
{'insult'} -- insulting
{'insult'} -- insultive
{'insult'} -- insults.
{'insult'} -- insults
{'insult'} -- insultingly
{'insult'} -- non insulting
{'insult'} -- insults
{'insult'} -- insulting
{'hostile', 'hostil'} -- semi hostile
{'hostile', 'hostil'} -- hostile
{'hostile', 'hostil'} -- non hostile
{'hostile', 'hostil'} -- hostility
{'hostile', 'hostil'} -- hostilely
{'hostile', 'hostil'} -- hostile.
{'hostile', 'hostil'} -- hostil
{'hostile', 'hostil'} -- hostiles
{'hostile', 'hostil'} -- often hostile
{'hostile', 'hostil'} -- hostile
{'taboo'} -- taboos
{'taboo'} -- tabooed
{'taboo'} -- once taboo
{'taboo'} -- taboo.
{'taboo'} -- taboos
{'taboo'} -- taboo
{'taboo'} -- taboo breaking
{'taboo'} -- taboo busting
In [28]:
# Render the word_clusters table using the notebook's displayDF helper
# (defined outside this section). In the captured output below there is one
# column per keyword and rows 0-29, which matches TOP_N = 30.
# NOTE(review): the row order presumably reflects similarity rank - confirm
# against the code that builds word_clusters.
displayDF(word_clusters)
offensive abusive cyberbullying vulgar racist homophobic profane slur harrasment obscene threat discredit hateful insult hostile taboo
0 defensive neglectful cyber-bullying crass rascist anti-gay vulgar derogatory harassment vulgar menace denigrate bigoted affront unfriendly tabboo
1 ofensive hurtful Cyber-bullying uncouth bigoted anti-homosexual blasphemous pejorative harassement indecent danger defame vile disrespect antagonistic verboten
2 deffensive unloving bullying profane anti-white antigay irreverent derogative harasment obsene imminent impugn spiteful demean belligerent unmentionable
3 offencive harassing cyberbully obscene anti-black heterophobic scatological N-word harassment. lewd menaces undermine mean-spirited belittle inhospitable forbidden
4 fensive vindictive cyber-bullies puerile racism gay-bashing obscene disparaging harassments profane imminence besmirch vitriolic disparagement less-than-friendly off-limits
5 offesive hateful cyber-bullied lewd racialist bigoted vulgarities n-word acusations pornographic immenent villify hurtful belittling unwelcoming touchy
6 defensive-oriented disrespectful cyber-harassment vulger anti-semitic homphobic sacrilegious pejoratives harassing outrageous danger.But demonize racist put-down inimical tabu
7 run-game inappropriate Bullying raunchy homophobic anti-LGBT lewd racist assualt blasphemous dangers disparage slanderous redicule distrustful sexuality
8 run-based insultive cyber-bully boorish bigotted racist ribald perjorative Harassment X-rated pose belittle misogynistic denigration spiteful stigmatized
9 quick-strike demeaning cyberharassment indelicate xenophobic anti-Gay impious t-word harassment.The x-rated peril vilify bigotted ridicule antipathetic frowned-upon
10 play-callers violent bullying.The scatological bigot heterosexist obscenities homophobic assult crass looming de-legitimize vindictive denigrate contemptuous broaching
11 play-making exploitive bullying-related distasteful antiwhite biphobic sacred cussword descrimination salacious menance misrepresent homophobic disrepect non-friendly lesbianism
12 insulting spiteful cyberstalking vile anti-White gay-hating bawdy epithets discrimation inappropriate ever-looming marginalize biggoted disrespectful mistrustful controversial
13 defensively misogynistic bullying. sophomoric hateful anti-lesbian crass insult threatning unseemly danger.This refute vitrolic offend unsympathetic prudishness
14 play-makers passive-agressive bullycide indecorous anti-Black transphobic foul-mouthed bigot harasser semi-pornographic posed demean despicable putdown combative undiscussable
15 defencive non-loving sextortion ribald sexist misogynistic off-color nigger harass grotesque danger.The delegitimize unkind mockery belligerant unspoken
16 play-caller mean-spirited cyber-stalking low-minded race-baiting gay-basher foulmouthed h-word discimination disgusting attack insinuate mysoginistic denegrate disdainful homosexuality
17 pass-heavy uncivil cyber-safety crudity white-hating anti-queer scatalogical quasi-racist uncivility scandalous danger.It debunk mean-hearted slur bellicose broached
18 O-lines homophobic sexting derogatory supremacist misogynist potty-mouthed put-down HARASSMENT vile retaliation disinform hatred slander adversarial unsayable
19 derogatory manipulative Cyberbully demeaning anti-semetic pro-gay cussword swear-word acusation objectionable specter mischaracterize anger-filled disparage indifferent risqué
20 play-call exploitative Cyber-Bullying low-brow rascists homosexual raunchy J-word rudness vulgarity cyber-terror delegitimise insensative insinuation vindictive un-PC
21 Defensive self-destructive cybersafety risqué racis anti-LGBTQ unsacred jigaboo violance sacrilegious repercussions deligitimize xenophobic name-call ill-disposed hot-button
22 run-blocking unrespectful Cyberstalking misogynistic racially-motivated hateful vulgarity insinuation vicitim vulgarities risk denegrate loathsome belittlement untrusting nudity
23 vulgar dysfunctional anti-bullying tasteless anti-Semitic gay-bashers uncouth non-derogatory villification outrageously menacing mis-represent anti-semitic derogatory threatening eroticism
24 over-the-line passive-aggressive antibullying bawdy racially-based bigotted hateful tirade fruad immoral cyber-terrorism misinform misogynist affronts confrontational broach
25 defensive-minded domineering bulling licentious supremist mysoginistic non-obscene putdown vandelism distasteful terror demonise bigots umbrage hateful bi-sexuality
26 run-oriented insulting cyber-crimes ill-bred Anti-white gay intemperate demeaning intimidation scatological impending rebut demeaning uncalled dismissive necrophilia
27 pass-oriented foul-mouthed cybercrimes disrespectable rasist gay-baiting puerile disparagement discrmination smutty terrorism de-legitimise misogynic mocking vitriolic off-limit
28 tight-ends harassment Ask.fm off-color biggot mysogynistic indelicate derisive harassers lascivious cyberterror maligning disrespectful invective mean-spirited discussable
29 defensively-minded unkind Sexting profanity reverse-racism sexist prophane rascist bulling prurient danger.And mislead anti-Semitic disparaging truculent eroticization
In [29]:
# Plot the neighbour words and their embeddings with the notebook's plotTSNE
# helper (defined outside this section). The first argument is the plot title.
# NOTE(review): the last argument looks like the PNG file the figure is saved
# to - confirm against the helper's definition.
plotTSNE("Similar words - fastText [t-SNE]", word_clusters, embedding_clusters, "SimilarWords - fastText - t-SNE.png")
In [30]:
# Same inputs as the t-SNE cell, passed to the notebook's plotMDS helper
# (defined outside this section) with an MDS-specific title.
# NOTE(review): the last argument looks like the PNG file the figure is saved
# to - confirm against the helper's definition.
plotMDS("Similar words - fastText [MDS]", word_clusters, embedding_clusters, "SimilarWords - fastText - MDS.png")
In [31]:
# Same inputs as the t-SNE and MDS cells, passed to the notebook's plotPCA
# helper (defined outside this section) with a PCA-specific title.
# NOTE(review): the last argument looks like the PNG file the figure is saved
# to - confirm against the helper's definition.
plotPCA("Similar words - fastText [PCA]", word_clusters, embedding_clusters, "SimilarWords - fastText - PCA.png")
In [ ]: