Commit bd2b7ee0 authored by Valentin Pelloin's avatar Valentin Pelloin
Browse files

bug trouvé : gensim ne veut pas plus de 10 000 mots par documents batch_words

parent 58ff01d2
%% Cell type:markdown id: tags:
# Using Gensim with `svd2vec` output
[Gensim](https://pypi.org/project/gensim/) is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.
Gensim can use `word2vec` to compute similarity (and more!) between words. `svd2vec` can save its vectors in a `word2vec` format that Gensim can process.
This notebook shows how you can use Gensim with vectors learnt from `svd2vec`. We also compare our results with the pure word2vec model.
%% Cell type:markdown id: tags:
---
## I - Preparation
%% Cell type:code id: tags:
``` python
from svd2vec import svd2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
```
%% Cell type:code id: tags:
``` python
# Gensim does not have any implementation of an analogy method, so we add one here (3CosAdd)
def analogy_keyed(self, a, b, c, topn=10):
    """Answer 'a is to b as c is to ?' with the 3CosAdd method.

    Written to be attached to ``Word2VecKeyedVectors`` as a method.

    Args:
        self: object exposing ``most_similar(positive, negative, topn)``.
        a: first word of the known pair (used as the negative example).
        b: second word of the known pair (used as a positive example).
        c: word whose counterpart is searched (used as a positive example).
        topn: number of candidate answers requested.

    Returns:
        Whatever ``self.most_similar`` returns for these arguments.
    """
    return self.most_similar(positive=[b, c], negative=[a], topn=topn)
Word2VecKeyedVectors.analogy = analogy_keyed
def analogy_w2v(self, a, b, c, topn=10):
    """Answer 'a is to b as c is to ?' with the 3CosAdd method.

    Written to be attached to ``Word2Vec`` as a method; the query is
    delegated to the model's keyed vectors (``self.wv``).

    Args:
        self: object whose ``wv`` attribute exposes
            ``most_similar(positive, negative, topn)``.
        a: first word of the known pair (used as the negative example).
        b: second word of the known pair (used as a positive example).
        c: word whose counterpart is searched (used as a positive example).
        topn: number of candidate answers requested.

    Returns:
        Whatever ``self.wv.most_similar`` returns for these arguments.
    """
    return self.wv.most_similar(positive=[b, c], negative=[a], topn=topn)
Word2Vec.analogy = analogy_w2v
```
%% Cell type:code id: tags:
``` python
# we load our previously made text8 document list
# The context manager closes the file handle as soon as the corpus is read.
with open("text8", "r") as text8_file:
    # The whole corpus becomes a single document (a list of words); the
    # first element produced by the split is dropped.
    documents = [text8_file.read().split(" ")[1:]]
```
%% Cell type:code id: tags:
``` python
from svd2vec import Utils
# gensim does not accept more than 10 000 words per document (batch_words),
# so the single text8 document is split into smaller ones before training.
# NOTE(review): 1701 is presumably the number of resulting documents — confirm against Utils.split.
documents = Utils.split(documents[0], 1701)
```
%% Cell type:markdown id: tags:
---
## II - Models construction
%% Cell type:markdown id: tags:
### SVD with svd2vec
%% Cell type:code id: tags:
``` python
#svd2vec_svd = svd2vec(documents, size=100, window=5, min_count=100, verbose=False)
svd2vec_svd = svd2vec.load("svd.svd2vec")
```
%% Cell type:markdown id: tags:
### SVD with Gensim from svd2vec
%% Cell type:code id: tags:
``` python
# we first need to export svd2vec_svd to the word2vec format
svd2vec_svd.save_word2vec_format("svd.word2vec")
# we then load the model using Gensim
gensim_svd = Word2VecKeyedVectors.load_word2vec_format("svd.word2vec")
```
%% Cell type:markdown id: tags:
### word2vec
%% Cell type:code id: tags:
``` python
# Load vectors stored in the word2vec format from "w2v.word2vec".
# NOTE(review): presumably produced beforehand by the reference word2vec tool — confirm.
word2vec_w2v = Word2VecKeyedVectors.load_word2vec_format("w2v.word2vec")
```
%% Cell type:markdown id: tags:
### word2vec with Gensim
%% Cell type:code id: tags:
``` python
import gensim
# Train the gensim word2vec model a single time; training it twice with the
# same arguments only discarded the first result.
# size / window / min_count mirror the values of the svd2vec call.
gensim_w2v = gensim.models.Word2Vec(documents, size=100, window=5, min_count=100, workers=16)
```
%% Cell type:code id: tags:
``` python
# Number of words in the trained gensim vocabulary.
len(gensim_w2v.wv.vocab)
```
%% Output
11815
%% Cell type:markdown id: tags:
---
## III - Cosine similarity comparison
%% Cell type:code id: tags:
``` python
def compare_similarity(w1, w2):
    """Print the cosine similarity of two words for each of the four models.

    Reads the notebook-level models ``svd2vec_svd``, ``gensim_svd``,
    ``gensim_w2v`` and ``word2vec_w2v``.

    Args:
        w1: first word.
        w2: second word.
    """
    print("cosine similarity between", w1, "and", w2, ":")
    print("\tsvd2vec_svd ", svd2vec_svd.similarity(w1, w2))
    print("\tgensim_svd ", gensim_svd.similarity(w1, w2))
    # gensim_w2v is a full Word2Vec model: its vectors live under ``.wv``.
    print("\tgensim_w2v ", gensim_w2v.wv.similarity(w1, w2))
    print("\tword2vec_w2v", word2vec_w2v.similarity(w1, w2))
def compare_analogy(w1, w2, w3, topn=3):
    """Print each model's best answers to 'w1 is to w2 as w3 is to ?'.

    Reads the notebook-level models ``svd2vec_svd``, ``gensim_svd``,
    ``gensim_w2v`` and ``word2vec_w2v``; each must expose an
    ``analogy(w1, w2, w3, topn=...)`` method.

    Args:
        w1: first word of the known pair.
        w2: second word of the known pair.
        w3: word whose counterpart is searched.
        topn: number of answers printed per model.
    """

    def analogy_str(model):
        # One answer per line: the word padded to 20 characters, then its score.
        answers = model.analogy(w1, w2, w3, topn=topn)
        lines = "\n\t\t".join("{: <20}".format(w) + str(c) for w, c in answers)
        return "\n\t\t" + lines

    print("analogy similaties :", w1, "is to", w2, "as", w3, "is to?")
    print("\tsvd2vec_svd", analogy_str(svd2vec_svd))
    print("\tgensim_svd", analogy_str(gensim_svd))
    print("\tgensim_w2v", analogy_str(gensim_w2v))
    print("\tword2vec_w2v", analogy_str(word2vec_w2v))
```
%% Cell type:code id: tags:
``` python
compare_similarity("good", "bad")
```
%% Output
cosine similarity between good and bad :
svd2vec_svd 0.4951483093832256
gensim_svd 0.4951475
gensim_w2v 0.7870999
gensim_w2v 0.7723463
word2vec_w2v 0.728928
%% Cell type:code id: tags:
``` python
compare_similarity("truck", "car")
```
%% Output
cosine similarity between truck and car :
svd2vec_svd 0.8725645794464922
gensim_svd 0.8725649
gensim_w2v 0.054074332
gensim_w2v 0.71462846
word2vec_w2v 0.6936528
%% Cell type:code id: tags:
``` python
compare_analogy("january", "month", "monday")
```
%% Output
analogy similaties : january is to month as monday is?
analogy similaties : january is to month as monday is to?
svd2vec_svd
friday 0.7990049263196153
holiday 0.7774813849657727
day 0.7696653269345999
gensim_svd
friday 0.7990041971206665
holiday 0.7774807810783386
day 0.7696648836135864
gensim_w2v
x 0.37333881855010986
cargo 0.3701249957084656
multi 0.3621957004070282
week 0.7143122553825378
evening 0.6310715675354004
weekend 0.6066169142723083
word2vec_w2v
week 0.7236202359199524
evening 0.5867935419082642
weekend 0.5843297839164734
%% Cell type:code id: tags:
``` python
compare_analogy("paris", "france", "berlin")
```
%% Output
analogy similaties : paris is to france as berlin is?
analogy similaties : paris is to france as berlin is to?
svd2vec_svd
germany 0.7687125088187668
reich 0.7243489014216623
sch 0.7123675101373064
gensim_svd
germany 0.7687125205993652
reich 0.7243496179580688
sch 0.712367594242096
gensim_w2v
arrive 0.5704855918884277
refers 0.5651636123657227
se 0.5631446838378906
germany 0.8262317180633545
finland 0.7536041140556335
austria 0.7173164486885071
word2vec_w2v
germany 0.840154767036438
austria 0.6982203722000122
poland 0.6571524143218994
%% Cell type:code id: tags:
``` python
compare_analogy("man", "king", "woman")
```
%% Output
analogy similaties : man is to king as woman is?
analogy similaties : man is to king as woman is to?
svd2vec_svd
crowned 0.623713716342001
isabella 0.6024687219275104
consort 0.6019050828977524
gensim_svd
crowned 0.6237134337425232
isabella 0.6024693846702576
consort 0.601904571056366
gensim_w2v
pieces 0.38315141201019287
labs 0.3812718987464905
wolfgang 0.33765164017677307
queen 0.7210809588432312
elizabeth 0.6706132888793945
isabella 0.6488653421401978
word2vec_w2v
queen 0.6623748540878296
regent 0.6608081459999084
consort 0.6403408050537109
%% Cell type:code id: tags:
``` python
compare_analogy("road", "cars", "rail")
```
%% Output
analogy similaties : road is to cars as rail is?
analogy similaties : road is to cars as rail is to?
svd2vec_svd
locomotives 0.7105197854472618
diesel 0.6920861316045748
locomotive 0.6578811562326874
gensim_svd
locomotives 0.7105196714401245
diesel 0.6920859813690186
locomotive 0.6578816175460815
gensim_w2v
regarded 0.5483688116073608
fraction 0.5402679443359375
disability 0.5247717499732971
vehicles 0.7365255355834961
locomotives 0.7124711275100708
automobiles 0.7065150737762451
word2vec_w2v
locomotives 0.6976078152656555
vehicles 0.6787285804748535
diesel 0.6171871423721313
%% Cell type:markdown id: tags:
---
## IV - Evaluations
%% Cell type:code id: tags:
``` python
def compare_similarity(datafile):
    """Print each model's Pearson correlation on a word-pair dataset.

    NOTE: this rebinds ``compare_similarity``; the two-word version defined
    earlier in the notebook is no longer callable once this cell has run.

    Args:
        datafile: file name resolved through ``gensim.test.utils.datapath``.
    """
    from gensim.test.utils import datapath
    contents = datapath(datafile)
    print("pearson correlation of", datafile)
    # The svd2vec result is indexed once, the gensim results twice (gensim
    # reports the Pearson result as a pair at index 0).
    print("\tsvd2vec_svd ", svd2vec_svd.evaluate_word_pairs(contents)[0])
    print("\tgensim_svd ", gensim_svd.evaluate_word_pairs(contents)[0][0])
    print("\tgensim_w2v ", gensim_w2v.wv.evaluate_word_pairs(contents)[0][0])
    print("\tword2vec_w2v ", word2vec_w2v.evaluate_word_pairs(contents)[0][0])
```
%% Cell type:code id: tags:
``` python
compare_similarity('wordsim353.tsv')
```
%% Output
pearson correlation of wordsim353.tsv
svd2vec_svd 0.6701752412518817
gensim_svd 0.6805493828205335
gensim_w2v -0.015837619806909214
gensim_w2v 0.6570723922031956
word2vec_w2v 0.6848196247009626
%% Cell type:code id: tags:
``` python
def compare_analogy(datafile):
    """Print each model's success rate on an analogy dataset.

    NOTE: this rebinds ``compare_analogy``; the three-word version defined
    earlier in the notebook is no longer callable once this cell has run.

    Args:
        datafile: file name resolved through ``gensim.test.utils.datapath``.
    """
    from gensim.test.utils import datapath
    contents = datapath(datafile)
    print("analogies success rate of", datafile)
    # Side effect: verbose stays enabled on the shared svd2vec model afterwards.
    svd2vec_svd.verbose = True
    # The svd2vec result is printed as returned; the gensim results are
    # indexed with [0] to print the score only.
    print("\tsvd2vec_svd ", svd2vec_svd.evaluate_word_analogies(contents))
    print("\tgensim_svd ", gensim_svd.evaluate_word_analogies(contents)[0])
    print("\tgensim_w2v ", gensim_w2v.wv.evaluate_word_analogies(contents)[0])
    print("\tword2vec_w2v ", word2vec_w2v.evaluate_word_analogies(contents)[0])
```
%% Cell type:code id: tags:
``` python
compare_analogy('questions-words.txt')
```
%% Output
analogies success rate of questions-words.txt
svd2vec_svd 0.31634891175974356
gensim_svd 0.31634891175974356
gensim_w2v 0.0003374388392103931
gensim_w2v 0.4552049940948203
word2vec_w2v 0.5129070355997976
%% Cell type:code id: tags:
``` python
```
......
......@@ -3,7 +3,6 @@
"""
import bz2
import heapq
import pickle
import numpy as np
import pandas as pd
......@@ -14,9 +13,7 @@ from scipy.sparse.linalg import svds
from scipy.stats import pearsonr
from joblib import Parallel, delayed
from collections import OrderedDict, Counter
from operator import itemgetter
from tqdm import tqdm, tqdm_notebook
from numba import jit
from .utils import Utils
from .window import WindowWeights
......@@ -86,7 +83,7 @@ class svd2vec:
eig_p_weight=0,
nrm_type=NRM_SCHEME_ROW,
sub_threshold=1e-5,
verbose=True,
verbose=False,
workers=MAX_CPU_CORES):
# -------------
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment