Commit ff2ae806 authored by Valentin Pelloin

corrections: windows, gensim, notebooks, and more

parent c39283d8
@@ -3,5 +3,6 @@ __pycache__/
 .ipynb_checkpoints/
 *.binary
 *.word2vec
+*.svd2vec
 text8
 text8.zip
%% Cell type:markdown id: tags:
# Getting started with `svd2vec`
%% Cell type:markdown id: tags:
## I - Installation
`svd2vec` can be installed using *pip*:
```shell
pip install svd2vec
```
%% Cell type:markdown id: tags:
## II - Usage
`svd2vec` can be used in much the same way as the `word2vec` implementation from [Gensim](https://pypi.org/project/gensim/).
The full documentation is available [here](#).
%% Cell type:markdown id: tags:
### A/ Corpus creation
The corpus parameter (`documents`) of `svd2vec` should be a list of documents, each document being a list of the word tokens it contains.
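%% Cell type:markdown id: tags:
For illustration, a tiny hand-built corpus (the variable name `toy_corpus` is just an example) could look like this:
%% Cell type:code id: tags:
``` python
# a toy corpus for illustration: two documents, each a list of word tokens
toy_corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["dogs", "and", "cats", "make", "good", "pets"],
]
```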
%% Cell type:code id: tags:
``` python
# downloading and extracting the word2vec text8 demo corpus locally
import requests, zipfile, io
url = "http://mattmahoney.net/dc/text8.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
```
%% Cell type:code id: tags:
``` python
# loading the text8 demo corpus as a single document
with open("text8", "r") as f:
    documents = [f.read().split(" ")]
```
%% Cell type:markdown id: tags:
### B/ Creation of the vectors
%% Cell type:code id: tags:
``` python
from svd2vec import svd2vec
```
%% Cell type:code id: tags:
``` python
# showing the first fifteen words of each document
[d[:15] + ['...'] for d in documents]
```
%% Output
[['',
'anarchism',
'originated',
'as',
'a',
'term',
'of',
'abuse',
'first',
'used',
'against',
'early',
'working',
'class',
'radicals',
'...']]
%% Cell type:code id: tags:
``` python
# creating the word representations (this can take a while)
svd = svd2vec(documents, window=5, min_count=100, verbose=False)
```
%% Cell type:markdown id: tags:
### C/ Similarity and distance
%% Cell type:code id: tags:
``` python
svd.similarity("bad", "good")
```
%% Output
0.5595044997663727
%% Cell type:code id: tags:
``` python
svd.similarity("monday", "friday")
```
%% Output
0.8000593208690482
%% Cell type:code id: tags:
``` python
svd.distance("apollo", "moon")
```
%% Output
0.51619968887672
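%% Cell type:markdown id: tags:
Presumably `distance` is the complement of `similarity` (one minus the cosine similarity); a quick sanity check under that assumption:
%% Cell type:code id: tags:
``` python
# assuming distance(x, y) == 1 - similarity(x, y), this should be close to 1.0
svd.distance("apollo", "moon") + svd.similarity("apollo", "moon")
```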
%% Cell type:code id: tags:
``` python
svd.most_similar(positive=["january"], topn=2)
```
%% Output
[('december', 0.7869627196261781), ('march', 0.7782765534824396)]
%% Cell type:markdown id: tags:
### D/ Analogy
%% Cell type:code id: tags:
``` python
svd.analogy("paris", "france", "berlin")
```
%% Output
[('germany', 0.7240066875926087),
('weimar', 0.6371445233683818),
('reich', 0.631414594126022),
('munich', 0.5917068813628168),
('sch', 0.5591401823289636),
('brandenburg', 0.5468138153874815),
('und', 0.541566598856033),
('hermann', 0.5411562914966189),
('adolf', 0.5394922186458038),
('otto', 0.5391901427839293)]
%% Cell type:code id: tags:
``` python
svd.analogy("road", "cars", "rail", topn=5)
```
%% Output
[('locomotives', 0.7626203484386807),
('locomotive', 0.7587259422633467),
('trucks', 0.7255470578340787),
('trains', 0.717637832883044),
('automobiles', 0.6737808582283374)]
%% Cell type:code id: tags:
``` python
svd.analogy("cow", "cows", "pig")
```
%% Output
[('sheep', 0.5829199353965691),
('pigs', 0.5629631047865382),
('goat', 0.5611478942276642),
('eat', 0.5592920869267609),
('cats', 0.523851442525088),
('goats', 0.5230269418385303),
('meat', 0.5202435333205421),
('animal', 0.5194570523705068),
('fish', 0.5131523388198542),
('dogs', 0.5125122379464395)]
%% Cell type:code id: tags:
``` python
svd.analogy("man", "men", "woman")
```
%% Output
[('women', 0.7754647153730071),
('couples', 0.6097503266776299),
('male', 0.5914266186445117),
('sex', 0.5782558939194317),
('female', 0.570068551351722),
('intercourse', 0.5302306678128059),
('heterosexual', 0.5222203608894108),
('children', 0.5139059481091136),
('lesbian', 0.5132646381911999),
('feminism', 0.5027363468750581)]
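%% Cell type:markdown id: tags:
An analogy query can presumably also be phrased with `most_similar`, assuming it accepts `positive` and `negative` word lists like the Gensim API does:
%% Cell type:code id: tags:
``` python
# hypothetical equivalent of svd.analogy("paris", "france", "berlin"),
# assuming most_similar supports a negative= parameter
svd.most_similar(positive=["france", "berlin"], negative=["paris"], topn=3)
```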
%% Cell type:markdown id: tags:
### E/ Saving and loading vectors
%% Cell type:code id: tags:
``` python
# saving to a binary format
svd.save("svd.binary")
svd.save("svd.svd2vec")
```
%% Cell type:code id: tags:
``` python
# loading from binary file
loaded = svd2vec.load("svd.binary")
loaded = svd2vec.load("svd.svd2vec")
loaded.similarity("bad", "good")
```
%% Output
0.5259838000029272
0.5595044997663727
%% Cell type:code id: tags:
``` python
# saving to a word2vec like representation
svd.save_word2vec_format("svd.word2vec")
```
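%% Cell type:markdown id: tags:
The word2vec-like file should also be readable from other tools. As a sketch (assuming the file is written in the plain-text word2vec format), it could be reloaded with Gensim:
%% Cell type:code id: tags:
``` python
# hypothetical interoperability check with Gensim (assumes a plain-text word2vec file)
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format("svd.word2vec", binary=False)
kv.similarity("bad", "good")
```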
......
 from .core import svd2vec
+from .window import WindowWeights
 
-__all__ = ["svd2vec"]
+__all__ = ["svd2vec", "WindowWeights"]
@@ -12,6 +12,7 @@ import multiprocessing
 from scipy.sparse import csc_matrix
 from scipy.sparse.linalg import svds
 from scipy.spatial.distance import cosine
+from scipy.stats import pearsonr
 from joblib import Parallel, delayed
 from collections import OrderedDict, Counter
 from operator import itemgetter
@@ -102,9 +103,9 @@ class svd2vec:
         # window type
         if isinstance(window, int):
-            window = WindowWeights.create_window(left=window, right=window, weighter=window_weighter)
+            window, window_size = WindowWeights.create_window(left=window, right=window, weighter=window_weighter)
         elif isinstance(window, tuple) and len(window) == 2 and all(map(lambda e: isinstance(e, int), window)):
-            window = WindowWeights.create_window(left=window[0], right=window[1], weighter=window_weighter)
+            window, window_size = WindowWeights.create_window(left=window[0], right=window[1], weighter=window_weighter)
         else:
             raise ValueError("'" + str(window) + "' not implemented as a window yielder")
@@ -127,6 +128,7 @@ class svd2vec:
         self.min_count = min_count
         self.size = size
         self.window = window
+        self.window_size = window_size
         self.cds_alpha = cds_alpha
         self.sub_threshold = sub_threshold
         self.neg_k_shift = neg_k_shift
@@ -221,7 +223,7 @@ class svd2vec:
         matrix = file.load(erase=True)
 
         for document in self.bar(self.documents, "co-occurrence counting"):
-            for word, context, weight in self.bar(self.window(document), "document co-occurrence counting", total=self.vocabulary_len * self.vocabulary_len, offset=1):
+            for word, context, weight in self.bar(self.window(document), "document co-occurrence counting", total=self.window_size(document), offset=1):
                 i_word = self.vocabulary[word]
                 i_context = self.vocabulary[context]
                 matrix[i_word, i_context] += weight
@@ -240,6 +242,7 @@ class svd2vec:
         # instance variable will stop us from using joblib parallelisation
         # because this cannot be saved as a pickle object
         delattr(self, "window")
+        delattr(self, "window_size")
 
     def pmi_matrix(self):
         # pointwise mutual information
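         # PMI(w, c) = log( P(w, c) / (P(w) * P(c)) )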
@@ -248,9 +251,6 @@ class svd2vec:
         pmi_list = Parallel(n_jobs=self.workers)(delayed(self.pmi_parallized)(slice, i) for i, slice in enumerate(slices) if slice != [])
         pmi = np.concatenate(pmi_list, axis=0)
-        if self.verbose:
-            print("")
         return pmi
 
     def pmi_parallized(self, slice, i):
@@ -400,10 +400,12 @@ class svd2vec:
     def cosine_similarity(self, wx, cx, wy, cy):
         # compute the cosine similarity of x (word x and context x) and y (word
         # y and context y)
-        top = np.dot(wx + cx, wy + cy)
-        bot = np.sqrt(np.dot(wx + cx, wx + cx)) * np.sqrt(np.dot(wy + cy, wy + cy))
-        #top = np.dot(wx, wy) + np.dot(cx, cy) + np.dot(wx, cy) + np.dot(cx, wy)
-        #bot = (2 * np.sqrt(np.dot(wx, cx) + 1)) * (np.sqrt(np.dot(wy, cy) + 1))
+        wxcx = wx + cx
+        wycy = wy + cy
+        top = np.dot(wxcx, wycy)
+        bot = np.sqrt(np.dot(wxcx, wxcx)) * np.sqrt(np.dot(wycy, wycy))
+        # top = np.dot(wx, wy) + np.dot(cx, cy) + np.dot(wx, cy) + np.dot(cx, wy)
+        # bot = (2 * np.sqrt(np.dot(wx, cx) + 1)) * (np.sqrt(np.dot(wy, cy) + 1))
         return top / bot
 
     def similarity(self, x, y):
@@ -506,7 +508,7 @@ class svd2vec:
         positives = [self.vectors(x) for x in positive]
         negatives = [self.vectors(x) for x in negative]
 
-        first_w, first_c = positives[0] if positive else negatives[0]
+        # first_w, first_c = positives[0] if positive else negatives[0]
 
         mean_w = []
         mean_c = []
@@ -576,6 +578,65 @@ class svd2vec:
         else:
             raise ValueError("Word '" + word + "' not in the vocabulary")
 
+    #####
+    # Evaluation
+    #####
+
+    def evaluate_word_pairs(self, pairs, delimiter='\t'):
+        """
+        Evaluates the model similarities against a pairs file of human
+        similarity judgments.
+
+        Parameters
+        ----------
+        pairs : string
+            A filepath of a csv file. Lines starting with '#' will be
+            ignored. The first and second columns are the words; the third
+            column is the human-judged similarity.
+        delimiter : string
+            The delimiter of the csv file
+
+        Returns
+        -------
+        tuple
+            The first value is the Pearson correlation coefficient between
+            the model similarities and the human judgments (1.0 is a perfect
+            correlation, 0.0 no correlation at all). The second value is the
+            two-tailed p-value.
+        """
+        file = Utils.parse_csv(pairs, delimiter)
+        x = []
+        y = []
+        for row in file:
+            w1 = row[0]
+            w2 = row[1]
+            hsim = float(row[2])
+            if w1 not in self.vocabulary or w2 not in self.vocabulary:
+                continue
+            msim = self.similarity(w1, w2)
+            x.append(hsim)
+            y.append(msim)
+        pearson = pearsonr(np.array(x), np.array(y))
+        return pearson
+
+    def evaluate_word_analogies(self, analogies, section_separator=":"):
+        # Returns the accuracy of the model on a file of 'a b c d' analogy
+        # lines (section headers starting with ':' are skipped)
+        total = 0
+        correct = 0
+        with open(analogies, "r") as file:
+            for line in file.read().splitlines():
+                if line.startswith(section_separator):
+                    continue
+                words = line.split(" ")
+                if any([w not in self.vocabulary for w in words]):
+                    continue
+                total += 1
+                # analogy() returns a list of (word, score) tuples, so we
+                # compare the top-ranked word to the expected answer
+                predicted = self.analogy(words[0], words[1], words[2], topn=1)
+                if predicted and predicted[0][0] == words[3]:
+                    correct += 1
+        # avoid a division by zero when no analogy line was evaluable
+        result = correct / total if total > 0 else 0.0
+        return result
+
+    #####
+    # Debug
+    #####
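
A quick usage sketch for these evaluation helpers (the file names below are hypothetical placeholders; the pairs file is expected to hold tab-separated `word1 word2 score` lines, and the analogies file space-separated `a b c d` lines with `:`-prefixed section headers):

``` python
# hypothetical evaluation run; wordsim353.tsv and questions-words.txt are placeholder file names
pearson_coeff, p_value = svd.evaluate_word_pairs("wordsim353.tsv", delimiter="\t")
accuracy = svd.evaluate_word_analogies("questions-words.txt")
print(pearson_coeff, p_value, accuracy)
```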
......
@@ -22,6 +22,8 @@ class Utils:
         return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
     def getsize(obj):
+        # Returns the size of the object in bytes (a dict with one entry per instance variable)
+        # Note: does not work well with np.memmap
         size = {}
         size["total"] = 0
         for var, inner_obj in obj.__dict__.items():
@@ -33,8 +35,21 @@ class Utils:
         return size
 
     def running_notebook():
+        # Returns True if the current code is running inside a Jupyter
+        # notebook, False otherwise
         if 'IPython' in sys.modules:
             from IPython import get_ipython
             return 'IPKernelApp' in get_ipython().config
         else:
             return False
+
+    def parse_csv(file_path, delimiter, comment="#"):
+        # Returns a list of lines, each line being a list of cells
+        output = []
+        with open(file_path, "r") as file:
+            for line in file.read().splitlines():
+                # skip comment lines and empty lines (line[0] would crash on an empty line)
+                if not line or line[0] == comment:
+                    continue
+                output.append(line.split(delimiter))
+        return output
@@ -8,17 +8,25 @@ class WindowWeights:
             for iW, word in enumerate(document):
                 for i in reversed(range(1, left)):
                     ictx = iW - i
-                    if ictx <= 0:
-                        break
+                    if ictx < 0:
+                        continue
                     ctx = document[ictx]
                     yield weighter(word, ctx, i, left)
                 for i in range(1, right):
                     ictx = iW + i
                     if ictx >= doc_len:
-                        break
+                        continue
                     ctx = document[ictx]
                     yield weighter(word, ctx, i, right)
-        return window
+
+        def window_size(document):
+            l1 = left - 1
+            r1 = right - 1
+            doc_len = len(document)
+            size = doc_len * (l1 + r1) - (l1 * (l1 + 1)) / 2 - (r1 * (r1 + 1)) / 2
+            return int(size)
+
+        return window, window_size
 
     def weight_harmonic(word, context, dist, windowSize):
         # the harmonic weighting
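
The closed form in `window_size` counts exactly the pairs the `window` generator yields: each of the `doc_len` words has up to `left - 1` left contexts and `right - 1` right contexts, and the words near the start (resp. end) of the document lose `1 + 2 + ... + (left - 1)` (resp. `1 + 2 + ... + (right - 1)`) pairs in total. A standalone sanity-check sketch (not part of the commit; assumes the document is longer than the window span):

``` python
# brute-force count of (word, context) pairs, mirroring the window() generator
def pairs_brute_force(doc_len, left, right):
    count = 0
    for iW in range(doc_len):
        count += sum(1 for i in range(1, left) if iW - i >= 0)        # left contexts
        count += sum(1 for i in range(1, right) if iW + i < doc_len)  # right contexts
    return count

# the closed form used by window_size()
def pairs_closed_form(doc_len, left, right):
    l1, r1 = left - 1, right - 1
    return int(doc_len * (l1 + r1) - l1 * (l1 + 1) / 2 - r1 * (r1 + 1) / 2)

assert pairs_brute_force(100, 5, 5) == pairs_closed_form(100, 5, 5)
```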
......