Valentin Pelloin / svd2vec

Commit ff2ae806, authored May 23, 2019 by Valentin Pelloin

correction windows, gensim, notebooks, and many more

parent c39283d8

Changes 7
.gitignore

```diff
@@ -3,5 +3,6 @@ __pycache__/
 .ipynb_checkpoints/
 *.binary
 *.word2vec
+*.svd2vec
 text8
 text8.zip
```
notebooks/Gensim comparison.ipynb (new file, 0 → 100644)

This diff is collapsed.
notebooks/Getting started.ipynb

```diff
@@ -340,39 +340,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [],
    "source": [
     "# saving to a binary format\n",
-    "svd.save(\"svd.binary\")"
+    "svd.save(\"svd.svd2vec\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [
    {
     "data": {
      "text/plain": [
-      "0.5259838000029272"
+      "0.5595044997663727"
     ]
    },
-   "execution_count": 13,
+   "execution_count": 59,
    "metadata": {},
    "output_type": "execute_result"
   }
   ],
   "source": [
    "# loading from binary file\n",
-   "loaded = svd2vec.load(\"svd.binary\")\n",
+   "loaded = svd2vec.load(\"svd.svd2vec\")\n",
    "loaded.similarity(\"bad\", \"good\")"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
```
%% Cell type:markdown id: tags:

# Getting started with `svd2vec`

%% Cell type:markdown id: tags:

## I - Installation

`svd2vec` can be installed using *pip*:

```shell
pip install svd2vec
```

%% Cell type:markdown id: tags:

## II - Usage

`svd2vec` can be used like the `word2vec` implementation of [Gensim](https://pypi.org/project/gensim/). The full documentation can be seen [here](#).

%% Cell type:markdown id: tags:

### A/ Corpus creation

The corpus (`documents`) parameter of `svd2vec` should be a list of documents. Each document should be a list of words representing that document.
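For illustration, a minimal corpus in that shape could look as follows (hypothetical toy documents, not part of the original notebook):

```python
# a corpus of two documents, each document being a list of word tokens
documents = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["dogs", "and", "cats", "are", "animals"],
]
```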
%% Cell type:code id: tags:

```python
# downloading and extracting the text8 demo corpus locally
import requests, zipfile, io
url = "http://mattmahoney.net/dc/text8.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
```

%% Cell type:code id: tags:

```python
# loading the word2vec demo corpus as a single document
documents = [open("text8", "r").read().split(" ")]
```
%% Cell type:markdown id: tags:

### B/ Creation of the vectors

%% Cell type:code id: tags:

```python
from svd2vec import svd2vec
```

%% Cell type:code id: tags:

```python
# showing the first fifteen words of each document
[d[:15] + ['...'] for d in documents]
```
%% Output
[['',
'anarchism',
'originated',
'as',
'a',
'term',
'of',
'abuse',
'first',
'used',
'against',
'early',
'working',
'class',
'radicals',
'...']]
%% Cell type:code id: tags:

```python
# creating the word representations (can take a while)
svd = svd2vec(documents, window=5, min_count=100, verbose=False)
```

%% Cell type:markdown id: tags:

### C/ Similarity and distance

%% Cell type:code id: tags:

```python
svd.similarity("bad", "good")
```
%% Output
0.5595044997663727
%% Cell type:code id: tags:

```python
svd.similarity("monday", "friday")
```
%% Output
0.8000593208690482
%% Cell type:code id: tags:

```python
svd.distance("apollo", "moon")
```
%% Output
0.51619968887672
%% Cell type:code id: tags:

```python
svd.most_similar(positive=["january"], topn=2)
```
%% Output
[('december', 0.7869627196261781), ('march', 0.7782765534824396)]
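Since the API mirrors Gensim's, `most_similar` presumably also accepts a `negative` list (the `positives`/`negatives` handling in `svd2vec/core.py` below points the same way). A hypothetical query under that assumption, not from the original notebook:

```python
# hypothetical offset query, assuming a Gensim-style `negative` parameter
svd.most_similar(positive=["king", "woman"], negative=["man"], topn=3)
```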
%% Cell type:markdown id: tags:

### D/ Analogy

%% Cell type:code id: tags:

```python
svd.analogy("paris", "france", "berlin")
```
%% Output
[('germany', 0.7240066875926087),
('weimar', 0.6371445233683818),
('reich', 0.631414594126022),
('munich', 0.5917068813628168),
('sch', 0.5591401823289636),
('brandenburg', 0.5468138153874815),
('und', 0.541566598856033),
('hermann', 0.5411562914966189),
('adolf', 0.5394922186458038),
('otto', 0.5391901427839293)]
%% Cell type:code id: tags:

```python
svd.analogy("road", "cars", "rail", topn=5)
```
%% Output
[('locomotives', 0.7626203484386807),
('locomotive', 0.7587259422633467),
('trucks', 0.7255470578340787),
('trains', 0.717637832883044),
('automobiles', 0.6737808582283374)]
%% Cell type:code id: tags:

```python
svd.analogy("cow", "cows", "pig")
```
%% Output
[('sheep', 0.5829199353965691),
('pigs', 0.5629631047865382),
('goat', 0.5611478942276642),
('eat', 0.5592920869267609),
('cats', 0.523851442525088),
('goats', 0.5230269418385303),
('meat', 0.5202435333205421),
('animal', 0.5194570523705068),
('fish', 0.5131523388198542),
('dogs', 0.5125122379464395)]
%% Cell type:code id: tags:

```python
svd.analogy("man", "men", "woman")
```
%% Output
[('women', 0.7754647153730071),
('couples', 0.6097503266776299),
('male', 0.5914266186445117),
('sex', 0.5782558939194317),
('female', 0.570068551351722),
('intercourse', 0.5302306678128059),
('heterosexual', 0.5222203608894108),
('children', 0.5139059481091136),
('lesbian', 0.5132646381911999),
('feminism', 0.5027363468750581)]
%% Cell type:markdown id: tags:

### E/ Saving and loading vectors

%% Cell type:code id: tags:

```python
# saving to a binary format
svd.save("svd.svd2vec")
```

%% Cell type:code id: tags:

```python
# loading from binary file
loaded = svd2vec.load("svd.svd2vec")
loaded.similarity("bad", "good")
```

%% Output

0.5595044997663727
%% Cell type:code id: tags:

```python
# saving to a word2vec-like representation
svd.save_word2vec_format("svd.word2vec")
```
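Assuming `save_word2vec_format` writes the plain-text word2vec format, the exported file should be loadable from Gensim. A sketch under that assumption:

```python
# reloading the exported vectors with Gensim (assumes text, not binary, format)
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("svd.word2vec", binary=False)
w2v.similarity("bad", "good")
```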
svd2vec/__init__.py

```diff
 from .core import svd2vec
 from .window import WindowWeights

-__all__ = ["svd2vec"]
+__all__ = ["svd2vec", "WindowWeights"]
```
svd2vec/core.py

```diff
@@ -12,6 +12,7 @@ import multiprocessing
 from scipy.sparse import csc_matrix
 from scipy.sparse.linalg import svds
 from scipy.spatial.distance import cosine
+from scipy.stats import pearsonr

 from joblib import Parallel, delayed
 from collections import OrderedDict, Counter
 from operator import itemgetter
```
```diff
@@ -102,9 +103,9 @@ class svd2vec:
         # window type
         if isinstance(window, int):
-            window = WindowWeights.create_window(left=window, right=window, weighter=window_weighter)
+            window, window_size = WindowWeights.create_window(left=window, right=window, weighter=window_weighter)
         elif isinstance(window, tuple) and len(window) == 2 and all(map(lambda e: isinstance(e, int), window)):
-            window = WindowWeights.create_window(left=window[0], right=window[1], weighter=window_weighter)
+            window, window_size = WindowWeights.create_window(left=window[0], right=window[1], weighter=window_weighter)
         else:
             raise ValueError("'" + str(window) + "' not implemented as a window yielder")
```
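Per the `isinstance` checks in this hunk, `window` may be either a single `int` (symmetric window) or a 2-tuple of ints (asymmetric window). A short sketch reusing the `documents` list built in the notebook above (the `min_count` value is chosen arbitrarily):

```python
from svd2vec import svd2vec

# symmetric window: 5 context words on each side of the focus word
svd_sym = svd2vec(documents, window=5, min_count=100)

# asymmetric window: 2 context words to the left, 8 to the right
svd_asym = svd2vec(documents, window=(2, 8), min_count=100)
```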
```diff
@@ -127,6 +128,7 @@ class svd2vec:
         self.min_count     = min_count
         self.size          = size
         self.window        = window
+        self.window_size   = window_size
         self.cds_alpha     = cds_alpha
         self.sub_threshold = sub_threshold
         self.neg_k_shift   = neg_k_shift
```
```diff
@@ -221,7 +223,7 @@ class svd2vec:
         matrix = file.load(erase=True)
         for document in self.bar(self.documents, "co-occurence counting"):
-            for word, context, weight in self.bar(self.window(document), "document co-occurence counting", total=self.vocabulary_len * self.vocabulary_len, offset=1):
+            for word, context, weight in self.bar(self.window(document), "document co-occurence counting", total=self.window_size(document), offset=1):
                 i_word    = self.vocabulary[word]
                 i_context = self.vocabulary[context]
                 matrix[i_word, i_context] += weight
```
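The progress-bar `total` changes from |V|², which is unrelated to a single document, to the exact number of (word, context) pairs the window yields for that document, computed by the new `window_size` closed form (see `svd2vec/window.py` below). A rough sense of scale, with hypothetical sizes:

```python
# hypothetical sizes, for scale only
vocabulary_len = 20_000
doc_len = 1_000_000
left = right = 5

old_total = vocabulary_len * vocabulary_len  # 400,000,000: far off for one document

l1, r1 = left - 1, right - 1
new_total = doc_len * (l1 + r1) - l1 * (l1 + 1) // 2 - r1 * (r1 + 1) // 2
print(new_total)  # 7999980: the number of pairs actually yielded
```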
```diff
@@ -240,6 +242,7 @@ class svd2vec:
         # instance variable will stop us from using joblib parallelisation
         # because this can not be saved as a pickle object
         delattr(self, "window")
+        delattr(self, "window_size")

     def pmi_matrix(self):
         # pointwise mutual information
```
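The second `delattr` is needed for the same reason as the first: both callables returned by `WindowWeights.create_window` are functions defined inside another function, and such local objects cannot be pickled, which joblib's parallelism relies on. A standalone illustration (toy function, not from the library):

```python
import pickle

def make_window():
    def window(document):  # local closure, like create_window's return values
        yield from ()
    return window

try:
    pickle.dumps(make_window())
except Exception as err:
    print(err)  # Can't pickle local object 'make_window.<locals>.window'
```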
```diff
@@ -248,9 +251,6 @@ class svd2vec:
         pmi_list = Parallel(n_jobs=self.workers)(delayed(self.pmi_parallized)(slice, i) for i, slice in enumerate(slices) if slice != [])
         pmi = np.concatenate(pmi_list, axis=0)

-        if self.verbose:
-            print("")
-
         return pmi

     def pmi_parallized(self, slice, i):
```
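For context on `pmi_matrix`: given the `cds_alpha` and `neg_k_shift` parameters stored above, the computation is presumably a shifted positive PMI with context-distribution smoothing, as is standard for SVD-based embeddings. A dense toy sketch of that formulation — an assumption about the exact variant; the real code works on sparse matrix slices in parallel:

```python
import numpy as np

def sppmi(cooc, cds_alpha=0.75, neg_k_shift=1):
    # cooc: dense (words x contexts) co-occurrence count matrix
    total = cooc.sum()
    p_wc = cooc / total                             # joint probabilities
    p_w  = cooc.sum(axis=1, keepdims=True) / total  # word marginals
    c    = cooc.sum(axis=0) ** cds_alpha            # smoothed context counts
    p_c  = c / c.sum()                              # smoothed context marginals
    with np.errstate(divide="ignore"):
        pmi = np.log(p_wc) - np.log(p_w) - np.log(p_c)  # -inf where count is 0
    return np.maximum(pmi - np.log(neg_k_shift), 0.0)   # shift by log(k), clip at 0
```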
```diff
@@ -400,10 +400,12 @@ class svd2vec:
     def cosine_similarity(self, wx, cx, wy, cy):
         # compute the cosine similarity of x (word x and context x) and y (word
         # y and context y)
-        top = np.dot(wx + cx, wy + cy)
-        bot = np.sqrt(np.dot(wx + cx, wx + cx)) * np.sqrt(np.dot(wy + cy, wy + cy))
-        #top = np.dot(wx, wy) + np.dot(cx, cy) + np.dot(wx, cy) + np.dot(cx, wy)
-        #bot = (2 * np.sqrt(np.dot(wx, cx) + 1)) * (np.sqrt(np.dot(wy, cy) + 1))
+        wxcx = wx + cx
+        wycy = wy + cy
+        top = np.dot(wxcx, wycy)
+        bot = np.sqrt(np.dot(wxcx, wxcx)) * np.sqrt(np.dot(wycy, wycy))
+        # top = np.dot(wx, wy) + np.dot(cx, cy) + np.dot(wx, cy) + np.dot(cx, wy)
+        # bot = (2 * np.sqrt(np.dot(wx, cx) + 1)) * (np.sqrt(np.dot(wy, cy) + 1))
         return top / bot

     def similarity(self, x, y):
```
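The rewrite is purely a refactor: the vector sums `wx + cx` and `wy + cy` are each computed once and reused instead of being rebuilt four times. A standalone check with arbitrary toy vectors:

```python
import numpy as np

rng = np.random.default_rng(0)
wx, cx, wy, cy = rng.normal(size=(4, 8))

wxcx, wycy = wx + cx, wy + cy
new = np.dot(wxcx, wycy) / (np.sqrt(np.dot(wxcx, wxcx)) * np.sqrt(np.dot(wycy, wycy)))
old = np.dot(wx + cx, wy + cy) / (np.sqrt(np.dot(wx + cx, wx + cx)) * np.sqrt(np.dot(wy + cy, wy + cy)))
assert np.isclose(new, old)  # same value, fewer vector additions
```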
```diff
@@ -506,7 +508,7 @@ class svd2vec:
         positives = [self.vectors(x) for x in positive]
         negatives = [self.vectors(x) for x in negative]

-        first_w, first_c = positives[0] if positive else negatives[0]
+        # first_w, first_c = positives[0] if positive else negatives[0]

         mean_w = []
         mean_c = []
```
```diff
@@ -576,6 +578,65 @@ class svd2vec:
         else:
             raise ValueError("Word '" + word + "' not in the vocabulary")

+    #####
+    # Evaluation
+    #####
+
+    def evaluate_word_pairs(self, pairs, delimiter='\t'):
+        """
+        Evaluates the model similarity using a pairs file of human judgments
+        of similarities.
+
+        Parameters
+        ----------
+        pairs : string
+            A filepath of a csv file. Lines starting with '#' will be ignored.
+            The first and second columns are the words. The third column is the
+            human-judged similarity.
+        delimiter : string
+            The delimiter of the csv file
+
+        Returns
+        -------
+        tuple
+            The first value is the Pearson coefficient (1.0 means the model's
+            similarities correlate perfectly with the human judgments, 0.0
+            means no correlation). The second value is the two-tailed p-value.
+        """
+        file = Utils.parse_csv(pairs, delimiter)
+        x = []
+        y = []
+        for row in file:
+            w1   = row[0]
+            w2   = row[1]
+            hsim = float(row[2])
+            if w1 not in self.vocabulary or w2 not in self.vocabulary:
+                continue
+            msim = self.similarity(w1, w2)
+            x.append(hsim)
+            y.append(msim)
+        pearson = pearsonr(np.array(x), np.array(y))
+        return pearson
+
+    def evaluate_word_analogies(self, analogies, section_separator=":"):
+        # Evaluates analogy accuracy on a Google-style analogies file
+        # (four words per line; section header lines start with ':')
+        total   = 0
+        correct = 0
+        with open(analogies, "r") as file:
+            for line in file.read().splitlines():
+                if line.startswith(section_separator):
+                    continue
+                words = line.split(" ")
+                if any([w not in self.vocabulary for w in words]):
+                    continue
+                total += 1
+                predicted = self.analogy(words[0], words[1], words[2], topn=1)
+                if predicted and predicted[0][0] == words[3]:
+                    correct += 1
+        result = correct / total
+        return result
+
     #####
     # Debug
     #####
```
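A hypothetical usage of the two new evaluation entry points (file names and contents are assumptions; the pairs file is delimiter-separated with a human score in the third column, the analogies file is Google-style with ':'-prefixed section headers):

```python
# word-pair similarity: returns (Pearson r, two-tailed p-value)
pearson_r, p_value = svd.evaluate_word_pairs("wordsim353.tsv", delimiter="\t")

# analogies: returns the fraction of four-word questions answered exactly
accuracy = svd.evaluate_word_analogies("questions-words.txt")

print(pearson_r, p_value, accuracy)
```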
svd2vec/utils.py

```diff
@@ -22,6 +22,8 @@ class Utils:
         return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

     def getsize(obj):
         # Returns the size of the object in bytes (dict for each instance var)
+        # Note: not working well with np.memmap
         size = {}
         size["total"] = 0
         for var, inner_obj in obj.__dict__.items():
@@ -33,8 +35,21 @@ class Utils:
         return size

     def running_notebook():
+        # Returns True if the current code is running in a Jupyter Notebook,
+        # False otherwise
         if 'IPython' in sys.modules:
             from IPython import get_ipython
             return 'IPKernelApp' in get_ipython().config
         else:
             return False
+
+    def parse_csv(file_path, delimiter, comment="#"):
+        # Returns a list of lines, each line being a list of cells
+        output = []
+        with open(file_path, "r") as file:
+            for line in file.read().splitlines():
+                if not line or line.startswith(comment):
+                    continue
+                else:
+                    output.append(line.split(delimiter))
+        return output
```
svd2vec/window.py

```diff
@@ -8,17 +8,25 @@ class WindowWeights:
             for iW, word in enumerate(document):
                 for i in reversed(range(1, left)):
                     ictx = iW - i
-                    if ictx <= 0:
-                        break
+                    if ictx < 0:
+                        continue
                     ctx = document[ictx]
                     yield weighter(word, ctx, i, left)
                 for i in range(1, right):
                     ictx = iW + i
                     if ictx >= doc_len:
-                        break
+                        continue
                     ctx = document[ictx]
                     yield weighter(word, ctx, i, right)
-        return window
+
+        def window_size(document):
+            l1 = left - 1
+            r1 = right - 1
+            doc_len = len(document)
+            size = doc_len * (l1 + r1) - (l1 * (l1 + 1)) / 2 - (r1 * (r1 + 1)) / 2
+            return int(size)
+
+        return window, window_size

     def weight_harmonic(word, context, dist, windowSize):
         # the harmonic weighting
```
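A quick sanity check of the new closed-form `window_size` against a direct count of what `window()` yields, assuming the iteration pattern above (standalone sketch):

```python
def count_pairs(doc_len, left, right):
    # count exactly what window() yields for a document of doc_len tokens
    count = 0
    for iW in range(doc_len):
        for i in reversed(range(1, left)):
            if iW - i >= 0:
                count += 1
        for i in range(1, right):
            if iW + i < doc_len:
                count += 1
    return count

def closed_form(doc_len, left, right):
    l1, r1 = left - 1, right - 1
    return doc_len * (l1 + r1) - l1 * (l1 + 1) // 2 - r1 * (r1 + 1) // 2

assert count_pairs(100, 5, 5) == closed_form(100, 5, 5)  # both 780
```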