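The cells in this notebook assume the usual third-party imports; a minimal sketch of what an earlier setup cell presumably contains (the project-specific helpers such as get_fpaths_subreddit, read_multi_comments_csvs and clean_comments come from the accompanying modules and are not reproduced here):
import numpy as np
import pandas as pd
from scipy import spatial
from sklearn.manifold import TSNE
from gensim.models.word2vec import Word2Vec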
# fpaths = get_fpaths_year('2020')
fpaths = get_fpaths_subreddit('conspiracy')
comments = read_multi_comments_csvs(fpaths)
comments_clean = clean_comments(comments)
conv_to_lowerc (2400, 5) 0:00:00.001485
rm_punct (2400, 5) 0:00:00.014857
tokenize (2400, 5) 0:00:00.003988
rem_short_comments (1695, 5) 0:00:00.002041
Corpus (docs)
An iterator that yields sentences (lists of str).
train_model (corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=5, EPOCHS=5)
load_models (model_names:list, models_dir:str='../models_test')
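A hedged sketch of how these helpers might be strung together (the 'body' column and the model names are illustrative assumptions; load_models is assumed to return a dict keyed by model name, as the later models['2019'] lookups suggest):
corpus = Corpus(comments_clean['body'])   # iterator over tokenised comments (lists of str); column name is an assumption
model = train_model(corpus, MIN_COUNT=5, SIZE=300, WORKERS=8, WINDOW=5, EPOCHS=5)
models = load_models(['2019', '2020'], models_dir='../models_test')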
def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (= sum of counts from both m1 and m2).
    These indices correspond to the new vectors arrays in both gensim models:
        -- so that row 0 of m1.wv.vectors is the vector for the same word as row 0 of m2.wv.vectors
        -- the index of any word can be looked up via model.wv.key_to_index[word]
    The key_to_index / index_to_key mappings are also updated for each model, preserving the count but updating the index.
    """
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # If no alignment is necessary because the vocabularies are identical...
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1, m2)

    # Otherwise sort by frequency (summed for both models)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)
    # print(len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Replace the old vectors array with a new one restricted to the common vocab
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        old_arr = m.wv.vectors
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors = new_arr

        # Replace the old key_to_index dictionary and index_to_key list
        # with new ones built from the common vocab
        new_key_to_index = {}
        new_index_to_key = []
        for new_index, key in enumerate(common_vocab):
            new_key_to_index[key] = new_index
            new_index_to_key.append(key)
        m.wv.key_to_index = new_key_to_index
        m.wv.index_to_key = new_index_to_key

        print(len(m.wv.key_to_index), len(m.wv.vectors))

    return (m1, m2)
intersection_align_gensim (m1, m2, words=None)
Intersect two gensim word2vec models, m1 and m2, keeping only their shared vocabulary (optionally further restricted to words) and re-ordering both models' vectors by summed frequency so that the same row index refers to the same word in each model.
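intersection_align_gensim is called internally by the Procrustes wrapper below, but it can also be used on its own; a minimal hedged sketch, assuming the models dict from above:
m1_aligned, m2_aligned = intersection_align_gensim(models['2019'], models['2020'])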
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes-align two gensim word2vec models (to allow comparison of the same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's vectors matrix with the aligned version.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """
    # Make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # Get the (length-normalised) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # Cross-covariance matrix: just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs)
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # The orthogonal Procrustes rotation
    ortho = u.dot(v)
    # Replace the original array with the rotated one, i.e. multiply the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed
smart_procrustes_align_gensim (base_embed, other_embed, words=None)
Procrustes-align two gensim word2vec models so that the same word can be compared across models (ported from HistWords, https://github.com/williamleif/histwords, by William Hamilton; original gist: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf). The vocabularies are first intersected (optionally restricted to words), the rotation is computed against base_embed, other_embed's vectors are replaced with the aligned version, and other_embed is returned.
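The two vocabulary counts and the Word2Vec repr printed below come from the alignment call; a hedged reconstruction of that cell (which model is treated as the base is an assumption; the alignment also mutates the other model in place):
smart_procrustes_align_gensim(models['2019'], models['2020'])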
1710 1710
1710 1710
<gensim.models.word2vec.Word2Vec>
def measure_distances(model_1, model_2):
    distances = pd.DataFrame(
        columns=('lex', 'dist_sem', "freq_1", "freq_2"),
        data=(
            # [w, spatial.distance.euclidean(model_1.wv[w], model_2.wv[w]),
            # [w, np.sum(model_1.wv[w] * model_2.wv[w]) / (np.linalg.norm(model_1.wv[w]) * np.linalg.norm(model_2.wv[w])),
            [w, spatial.distance.cosine(model_1.wv[w], model_2.wv[w]),
             model_1.wv.get_vecattr(w, "count"),
             model_2.wv.get_vecattr(w, "count")
             ] for w in model_1.wv.index_to_key
        )
    )
    return distances
measure_distances (model_1, model_2)
distances = measure_distances(models['2019'], models['2020'])
distances\
.sort_values('dist_sem', ascending=False)
 | lex | dist_sem | freq_1 | freq_2
---|---|---|---|---
130 | bot | 0.179056 | 106 | 123 |
147 | action | 0.172512 | 98 | 112 |
1348 | forget | 0.171441 | 8 | 9 |
62 | any | 0.155413 | 243 | 272 |
94 | am | 0.153691 | 158 | 174 |
... | ... | ... | ... | ... |
364 | posts | 0.000489 | 32 | 37 |
300 | big | 0.000456 | 41 | 47 |
227 | life | 0.000440 | 58 | 67 |
242 | once | 0.000392 | 55 | 62 |
265 | another | 0.000371 | 48 | 56 |
1710 rows × 4 columns
get_change_candidates (k:int, distances:pandas.core.frame.DataFrame, freq_min:int=100, propNouns:bool=True)
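The body of get_change_candidates is not shown; a minimal sketch of what it presumably does, given its signature and the distances frame above:
def get_change_candidates(k: int, distances: pd.DataFrame, freq_min: int = 100, propNouns: bool = True):
    # Keep only words that are reasonably frequent in both models...
    candidates = distances.query('freq_1 > @freq_min and freq_2 > @freq_min')
    # ...and return the k words with the largest semantic distance.
    # (How propNouns is handled is not shown in the notebook and is omitted here.)
    return candidates.nlargest(k, 'dist_sem')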
def get_nearest_neighbours_models(lex: str, freq_min: int, model_1, model_2, topn: int=100_000, k: int=10):
    nbs = []
    for count, model in enumerate([model_1, model_2]):
        for nb, sim in model.wv.most_similar(lex, topn=topn):
            if model.wv.get_vecattr(nb, 'count') > freq_min:
                d = {}
                d['Model'] = count + 1
                d['Word'] = nb
                d['SemDist'] = round(1 - sim, 2)
                d['Freq'] = model.wv.get_vecattr(nb, "count")
                d['vec'] = model.wv.get_vector(lex)  # note: stores the query word's vector, not the neighbour's, hence the repeated vec column in the output below
                nbs.append(d)
    nbs_df = pd.DataFrame(nbs)
    nbs_df = nbs_df\
        .query('Freq > @freq_min')\
        .groupby('Model', group_keys=False)\
        .apply(lambda group: group.nsmallest(k, 'SemDist'))
    nbs_model_1 = nbs_df.query('Model == 1')
    nbs_model_2 = nbs_df.query('Model == 2')
    return nbs_model_1, nbs_model_2
get_nearest_neighbours_models (lex:str, freq_min:int, model_1, model_2, topn:int=100000, k:int=10)
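The warning and the two tables that follow come from a call along these lines; the query word and frequency threshold here are illustrative assumptions, not taken from the notebook:
nbs_model_1, nbs_model_2 = get_nearest_neighbours_models(
    'vaccines',            # illustrative query word
    freq_min=30,           # illustrative threshold
    model_1=models['2019'],
    model_2=models['2020'],
)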
/var/folders/gp/dw55jb3d3gl6jn22rscvxjm40000gn/T/ipykernel_48107/1648447668.py:18: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda group: group.nsmallest(k, 'SemDist'))
 | Model | Word | SemDist | Freq | vec
---|---|---|---|---|---
0 | 1 | her | 0.0 | 166 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
1 | 1 | me | 0.0 | 332 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
2 | 1 | much | 0.0 | 150 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
3 | 1 | as | 0.0 | 487 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
4 | 1 | people | 0.0 | 293 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
5 | 1 | one | 0.0 | 280 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
6 | 1 | had | 0.0 | 178 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
7 | 1 | now | 0.0 | 150 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
8 | 1 | all | 0.0 | 374 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
9 | 1 | him | 0.0 | 146 | [-0.00916857, 0.2121922, -0.025869183, 0.07142... |
 | Model | Word | SemDist | Freq | vec
---|---|---|---|---|---
1709 | 2 | well | 0.0 | 144 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1710 | 2 | also | 0.0 | 184 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1711 | 2 | me | 0.0 | 371 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1712 | 2 | ve | 0.0 | 178 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1713 | 2 | means | 0.0 | 33 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1714 | 2 | much | 0.0 | 167 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1715 | 2 | yeah | 0.0 | 81 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1716 | 2 | idea | 0.0 | 41 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1717 | 2 | sure | 0.0 | 114 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
1718 | 2 | when | 0.0 | 296 | [-0.0038482083, 0.21741627, -0.03928126, 0.050... |
def get_pole_avg(model_name: str, model: Word2Vec, lex, k=10):
    words = []
    vecs = []
    vecs.append(model.wv[lex])
    df = (pd.read_csv(f"../pole-words/{model_name}_{lex}.csv")
          .query('Include != "f"')
          .nlargest(k, 'SemSim')
          )
    pole_words = df['Word'].tolist()
    for word in pole_words:
        if word in model.wv:
            vecs.append(model.wv[word])
            words.append(word)
    pole_avg = np.mean(vecs, axis=0)
    return pole_avg
get_pole_avg (model_name:str, model:gensim.models.word2vec.Word2Vec, lex, k=10)
make_sem_axis_avg (model_name:str, model:gensim.models.word2vec.Word2Vec, pole_word_1:str, pole_word_2:str, k=10)
get_axis_sim (lex:str, pole_word_1:str, pole_word_2:str, model_name, model, k=10)
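The similarity printed below comes from projecting a word onto a semantic axis spanned by two pole words; a hedged reconstruction of the call (the pole words are placeholders, since they are not shown in this section; the word and model are taken from the results table further down):
get_axis_sim('vaccines', 'pole_word_1', 'pole_word_2', 'Coronavirus', models['Coronavirus'], k=10)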
0.9172929190814479
def get_axis_sims(lexs: list, models: dict, pole_words: list, k=10):
    sims = []
    for lex in lexs:
        for name, model in models.items():
            sim = {}
            sim['model'] = name
            sim['lex'] = lex
            sim['sim'] = get_axis_sim(lex, pole_words[0], pole_words[1], name, model, k)
            sims.append(sim)
    sims_df = pd.DataFrame(sims)
    return sims_df
get_axis_sims (lexs:list, models:dict, pole_words:list, k=10)
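The table below is the output of a call along these lines (the pole words are again placeholders; at this point the models dict is keyed by subreddit):
sims = get_axis_sims(['vaccines', 'vaccine'], models, ['pole_word_1', 'pole_word_2'], k=10)
sims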
 | model | lex | sim
---|---|---|---
0 | Coronavirus | vaccines | 0.917293 |
1 | conspiracy | vaccines | 0.990792 |
2 | Coronavirus | vaccine | 0.933222 |
3 | conspiracy | vaccine | 0.977039 |
aggregate_proj_sims (proj_sims)
 | lex | Coronavirus | conspiracy | SimDiff
---|---|---|---|---
0 | vaccine | 0.933222 | 0.977039 | -0.043817 |
1 | vaccines | 0.917293 | 0.990792 | -0.073499 |
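aggregate_proj_sims itself is not shown; a minimal sketch of what it presumably does to produce the wide frame above, assuming it pivots the long frame returned by get_axis_sims and takes the difference between the two subreddit columns:
def aggregate_proj_sims(proj_sims):
    # Sketch only: spread the model rows into columns and add their difference.
    wide = proj_sims.pivot(index='lex', columns='model', values='sim').reset_index()
    wide['SimDiff'] = wide['Coronavirus'] - wide['conspiracy']
    return wide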
proj_sims_melted = proj_sims.melt(id_vars=['lex', 'SimDiff'], var_name='model', value_name='SemSim')
proj_sims_melted
 | lex | SimDiff | model | SemSim
---|---|---|---|---
0 | vaccine | -0.043817 | Coronavirus | 0.933222 |
1 | vaccines | -0.073499 | Coronavirus | 0.917293 |
2 | vaccine | -0.043817 | conspiracy | 0.977039 |
3 | vaccines | -0.073499 | conspiracy | 0.990792 |
plot_sem_axis (proj_sims_melted:pandas.core.frame.DataFrame, pole_words:list)
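The vocabulary counts printed below presumably come from Procrustes-aligning the two subreddit models before extracting nearest-neighbour vectors; a hedged reconstruction of that cell (which model serves as the base is an assumption):
smart_procrustes_align_gensim(models['Coronavirus'], models['conspiracy'])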
1173 1173
1173 1173
<gensim.models.word2vec.Word2Vec>
def get_nbs_vecs(lex: str, model_name: str, model: Word2Vec, k=50):
    lex_vecs = []
    lex_d = {}
    lex_d['lex'] = lex
    lex_d['type'] = 'center'
    lex_d['subreddit'] = model_name
    lex_d['vec'] = model.wv.get_vector(lex)
    lex_vecs.append(lex_d)
    for nb, sim in model.wv.most_similar(lex, topn=k):
        lex_d = {}
        lex_d['lex'] = nb
        lex_d['type'] = 'nb'
        lex_d['sim'] = sim
        lex_d['subreddit'] = model_name
        lex_d['vec'] = model.wv.get_vector(nb)
        lex_d['freq'] = model.wv.get_vecattr(nb, "count")
        lex_vecs.append(lex_d)
    lex_vecs_df = pd.DataFrame(lex_vecs)
    return lex_vecs_df
get_nbs_vecs (lex:str, model_name:str, model:gensim.models.word2vec.Word2Vec, k=50)
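The table below shows the centre word and its nearest neighbours for one subreddit; a hedged reconstruction of the call (the later grouping by subreddit suggests the frames for both subreddits are concatenated into nbs_vecs, which is an assumption):
nbs_vecs = pd.concat([
    get_nbs_vecs('vaccines', 'Coronavirus', models['Coronavirus'], k=50),
    get_nbs_vecs('vaccines', 'conspiracy', models['conspiracy'], k=50),
])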
 | lex | type | subreddit | vec | sim | freq
---|---|---|---|---|---|---
0 | vaccines | center | Coronavirus | [0.057227388, 0.10430427, 0.032422945, 0.06407... | NaN | NaN |
1 | health | nb | Coronavirus | [0.114723645, 0.21284297, 0.064497545, 0.12715... | 0.999692 | 46.0 |
2 | per | nb | Coronavirus | [0.090133496, 0.16385178, 0.045519162, 0.09794... | 0.999669 | 29.0 |
3 | deaths | nb | Coronavirus | [0.11426731, 0.20661092, 0.06342163, 0.1284676... | 0.999660 | 49.0 |
4 | 1 | nb | Coronavirus | [0.11298724, 0.20978343, 0.061724722, 0.128311... | 0.999638 | 54.0 |
5 | us | nb | Coronavirus | [0.11077853, 0.21411182, 0.06554103, 0.1271514... | 0.999608 | 123.0 |
6 | new | nb | Coronavirus | [0.1211073, 0.22653188, 0.069263056, 0.1391144... | 0.999603 | 76.0 |
7 | years | nb | Coronavirus | [0.09558547, 0.18013933, 0.05468028, 0.1110726... | 0.999591 | 52.0 |
8 | spread | nb | Coronavirus | [0.07730215, 0.14945872, 0.04962611, 0.0888780... | 0.999588 | 35.0 |
9 | 0 | nb | Coronavirus | [0.0645965, 0.1220492, 0.035898782, 0.07554879... | 0.999573 | 16.0 |
10 | 000 | nb | Coronavirus | [0.075567916, 0.1435639, 0.040953316, 0.084523... | 0.999569 | 29.0 |
11 | 7 | nb | Coronavirus | [0.06406919, 0.12866612, 0.03618963, 0.0769648... | 0.999565 | 17.0 |
12 | world | nb | Coronavirus | [0.08870022, 0.159735, 0.04859646, 0.09668462,... | 0.999564 | 74.0 |
13 | another | nb | Coronavirus | [0.06684075, 0.12619431, 0.037108395, 0.077369... | 0.999556 | 42.0 |
14 | public | nb | Coronavirus | [0.08594237, 0.16238208, 0.054486435, 0.097496... | 0.999552 | 39.0 |
15 | hours | nb | Coronavirus | [0.055532847, 0.11181068, 0.032346416, 0.06532... | 0.999545 | 15.0 |
16 | control | nb | Coronavirus | [0.0560417, 0.10506418, 0.030902166, 0.0663534... | 0.999544 | 31.0 |
17 | keep | nb | Coronavirus | [0.08326553, 0.15533412, 0.04887768, 0.0957071... | 0.999539 | 85.0 |
18 | reported | nb | Coronavirus | [0.07306963, 0.1333254, 0.045092944, 0.0796391... | 0.999537 | 19.0 |
19 | average | nb | Coronavirus | [0.058386106, 0.10160504, 0.028476257, 0.06527... | 0.999528 | 17.0 |
20 | sars | nb | Coronavirus | [0.054962628, 0.10801497, 0.030391451, 0.06244... | 0.999521 | 14.0 |
21 | ago | nb | Coronavirus | [0.05723345, 0.1124895, 0.03595097, 0.06608445... | 0.999521 | 29.0 |
22 | science | nb | Coronavirus | [0.06259699, 0.11655103, 0.03798895, 0.0715784... | 0.999516 | 24.0 |
23 | under | nb | Coronavirus | [0.08245373, 0.15575409, 0.054234445, 0.097904... | 0.999508 | 40.0 |
24 | early | nb | Coronavirus | [0.07038852, 0.12683387, 0.044630148, 0.081177... | 0.999506 | 19.0 |
25 | likely | nb | Coronavirus | [0.06962872, 0.13197441, 0.043262962, 0.075477... | 0.999506 | 36.0 |
26 | due | nb | Coronavirus | [0.052545387, 0.09022615, 0.028288025, 0.05750... | 0.999492 | 25.0 |
27 | schools | nb | Coronavirus | [0.07024904, 0.13720992, 0.042491872, 0.087135... | 0.999490 | 22.0 |
28 | state | nb | Coronavirus | [0.08707496, 0.16359739, 0.054630436, 0.101618... | 0.999488 | 47.0 |
29 | quality | nb | Coronavirus | [0.05661944, 0.10371776, 0.03314033, 0.0622458... | 0.999486 | 14.0 |
30 | 4 | nb | Coronavirus | [0.057619605, 0.117240496, 0.036980513, 0.0707... | 0.999483 | 26.0 |
31 | help | nb | Coronavirus | [0.074597746, 0.13663343, 0.040477894, 0.08401... | 0.999481 | 34.0 |
32 | available | nb | Coronavirus | [0.050409965, 0.097474575, 0.028630486, 0.0579... | 0.999480 | 13.0 |
33 | america | nb | Coronavirus | [0.055805784, 0.09872247, 0.029724134, 0.06394... | 0.999477 | 30.0 |
34 | response | nb | Coronavirus | [0.04918535, 0.08948149, 0.022905065, 0.054223... | 0.999472 | 12.0 |
35 | high | nb | Coronavirus | [0.10157195, 0.19419962, 0.06280887, 0.1199198... | 0.999469 | 39.0 |
36 | 10 | nb | Coronavirus | [0.08214546, 0.14870152, 0.04579945, 0.0888806... | 0.999461 | 32.0 |
37 | am | nb | Coronavirus | [0.096367285, 0.18651997, 0.044814415, 0.11034... | 0.999456 | 102.0 |
38 | city | nb | Coronavirus | [0.06082973, 0.11749765, 0.034887813, 0.073830... | 0.999453 | 19.0 |
39 | months | nb | Coronavirus | [0.09699332, 0.17284966, 0.057289552, 0.105755... | 0.999451 | 44.0 |
40 | currently | nb | Coronavirus | [0.062632866, 0.12303838, 0.03613432, 0.074787... | 0.999444 | 19.0 |
41 | already | nb | Coronavirus | [0.09850482, 0.1830824, 0.059671603, 0.1124184... | 0.999443 | 49.0 |
42 | fact | nb | Coronavirus | [0.063221715, 0.123731576, 0.04299859, 0.07519... | 0.999428 | 44.0 |
43 | places | nb | Coronavirus | [0.058041047, 0.11331363, 0.038818423, 0.06794... | 0.999427 | 18.0 |
44 | provide | nb | Coronavirus | [0.04588933, 0.095273994, 0.023019709, 0.05869... | 0.999418 | 14.0 |
45 | positive | nb | Coronavirus | [0.067284316, 0.12324461, 0.040740743, 0.07202... | 0.999414 | 19.0 |
46 | link | nb | Coronavirus | [0.04832779, 0.08898213, 0.022611374, 0.052865... | 0.999399 | 27.0 |
47 | based | nb | Coronavirus | [0.073736355, 0.14148337, 0.03525661, 0.082236... | 0.999397 | 24.0 |
48 | 20 | nb | Coronavirus | [0.09155286, 0.18733306, 0.051601738, 0.109539... | 0.999391 | 28.0 |
49 | than | nb | Coronavirus | [0.14099781, 0.26505557, 0.09407188, 0.1668624... | 0.999377 | 124.0 |
50 | open | nb | Coronavirus | [0.08959826, 0.16488104, 0.051867336, 0.098772... | 0.999374 | 36.0 |
def dim_red_nbs_vecs(nbs_vecs: pd.DataFrame, perplexity=50, n_iter=1000):
    Y_tsne = TSNE(
        perplexity=perplexity,
        method='exact',
        init='pca',
        verbose=False,
        learning_rate='auto',
        n_iter=n_iter
    ).fit_transform(np.array(list(nbs_vecs['vec'])))
    nbs_vecs['x_tsne'] = Y_tsne[:, [0]]
    nbs_vecs['y_tsne'] = Y_tsne[:, [1]]
    return nbs_vecs
dim_red_nbs_vecs (nbs_vecs:pandas.core.frame.DataFrame, perplexity=50, n_iter=1000)
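A one-line usage sketch, reducing the stacked neighbour vectors to two t-SNE dimensions before the grouping steps below (note that perplexity must stay below the number of rows in nbs_vecs):
nbs_vecs = dim_red_nbs_vecs(nbs_vecs, perplexity=50, n_iter=1000)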
nbs_sim = (nbs_vecs
           .groupby('subreddit')
           .apply(lambda df: df.nlargest(10, 'sim'))
           .reset_index(drop=True)
           )
/var/folders/gp/dw55jb3d3gl6jn22rscvxjm40000gn/T/ipykernel_48107/1291324649.py:3: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda df: df.nlargest(10, 'sim'))
nbs_diff = nbs_vecs.drop_duplicates(subset='lex', keep=False)
nbs_diff = (nbs_diff
            .groupby('subreddit')
            .apply(lambda df: df.nlargest(20, 'sim'))
            .reset_index(drop=True)
            )
/var/folders/gp/dw55jb3d3gl6jn22rscvxjm40000gn/T/ipykernel_48107/628690727.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda df: df.nlargest(20, 'sim'))