Commit 153f314b authored by Valentin Pelloin

documentation + pip installation

parent 77dec450
%% Cell type:markdown id: tags:
# Getting started with `svd2vec`
%% Cell type:markdown id: tags:
## Installation
`svd2vec` can be installed using *pip*:
```shell
pip install svd2vec
```
%% Cell type:markdown id: tags:
## Usage
`svd2vec` can be used like the `word2vec` implementation of [Gensim](https://pypi.org/project/gensim/).
%% Cell type:markdown id: tags:
- `svd = svd2vec(documents, size=150, min_count=2, window=10, dyn_window_weight=svd2vec.WINDOW_WEIGHT_WORD2VEC, cds_alpha=0.75, neg_k_shift=5, eig_p_weight=0, nrm_type=svd2vec.NRM_SCHEME_ROW, sub_threshold=1e-5, verbose=True, workers=svd2vec.MAX_CPU_CORES)`
- `svd.similarity("hello", "nice")`
%% Cell type:code id: tags:
``` python
from svd2vec import svd2vec
```
%% Cell type:code id: tags:
``` python
# loading the word2vec demo corpus (text8) as a single document
with open("text8", "r") as f:
    documents = [f.read().split(" ")]
```
%% Cell type:code id: tags:
``` python
# showing the first fifteen words of each document
[d[:15] + ['...'] for d in documents]
```
%% Output
[['',
  'anarchism',
  'originated',
  'as',
  'a',
  'term',
  'of',
  'abuse',
  'first',
  'used',
  'against',
  'early',
  'working',
  'class',
  'radicals',
  '...']]
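%% Cell type:markdown id: tags:
The leading empty string in the output is an artifact of splitting on a single space: the `text8` file begins with a space, so `split(" ")` yields an empty first token.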
%% Cell type:code id: tags:
``` python
# creating the word representations (can take a while)
svd = svd2vec(documents, window=2, min_count=100, verbose=False)
```
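%% Cell type:markdown id: tags:
Once the representation is built, pairwise similarities can be queried directly. A small usage sketch (the exact score depends on the trained model):
%% Cell type:code id: tags:
``` python
# cosine similarity between two words seen during training
svd.similarity("paris", "france")
```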
%% Cell type:code id: tags:
``` python
svd.analogy("paris", "france", "berlin")
```
%% Output
[('germany', 0.7229600894363701),
 ('der', 0.6960269874634539),
 ('und', 0.6830549907853377),
 ('leipzig', 0.6791055342999746),
 ('verlag', 0.6656014798750752),
 ('weimar', 0.652113201909802),
 ('sch', 0.6481760373713031),
 ('ber', 0.6110703637613146),
 ('munich', 0.6102097105336671),
 ('frankfurt', 0.6036993618607962)]
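%% Cell type:markdown id: tags:
The vectors can also be exported in the plain word2vec text format and reloaded elsewhere, for instance with Gensim. A sketch, assuming Gensim is installed (the file name `svd.word2vec` is arbitrary):
%% Cell type:code id: tags:
``` python
from gensim.models import KeyedVectors

# write the vectors using the word2vec text format, then reload them with Gensim
svd.save_word2vec_format("svd.word2vec")
vectors = KeyedVectors.load_word2vec_format("svd.word2vec")
vectors.most_similar("berlin")
```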
setup.py:

from setuptools import setup

setup(name='svd2vec',
      version='0.1',
      description='A library that converts words to vectors using PMI and SVD',
      url='https://git-lium.univ-lemans.fr/vpelloin/svd2vec',
      author='Valentin Pelloin',
      author_email='valentin.pelloin.etu@univ-lemans.fr',
      license='MIT',
      packages=['svd2vec'],
      zip_safe=False)
svd2vec/__init__.py:

from .core import svd2vec

__all__ = ["svd2vec"]
"""
.. module:: argparse_actions
"""
import bz2
import heapq
@@ -12,22 +15,62 @@ from scipy.spatial.distance import cosine
from joblib import Parallel, delayed
from collections import OrderedDict, Counter
from operator import itemgetter
from tqdm import tqdm, tqdm_notebook
from .utils import Utils
from .window import WindowWeights
from .temporary_array import TemporaryArray
class svd2vec:
    """
    A vector representation of the words in the given documents.

    Parameters
    ----------
    documents : list of list of string
        The list of documents, each document being a list of words
    size : int
        Maximum number of extracted features for each word
    min_count : int
        Minimum number of occurrences of each word to be included in the model
    window : int or tuple of ints
        Window size used to collect the context words of each word.
        If an int is given, it is equivalent to the symmetric tuple (int, int).
    dyn_window_weight : WINDOW_WEIGHT_HARMONIC or WINDOW_WEIGHT_WORD2VEC
        The window weighting scheme.
    cds_alpha : float
        The context distribution smoothing constant that smooths the context
        frequency
    neg_k_shift : int
        The negative PMI log shift
    eig_p_weight : float
        The eigenvalue weighting applied to the eigenvalue matrix
    nrm_type : string
        The normalization scheme to use with the L2 normalization
    sub_threshold : float
        The threshold for subsampling (diluting very frequent words). A higher
        value means fewer words are removed.
    verbose : bool
        If True, displays progress during the init step
    workers : int
        The number of workers to use in parallel (should not exceed the
        available number of cores on the machine)
    """

    WINDOW_WEIGHT_HARMONIC = 0
    """The harmonic weighting scheme for context words *(1/5, 1/4, 1/3, 1/2, ...)*"""

    WINDOW_WEIGHT_WORD2VEC = 1
    """The word2vec weighting scheme for context words *(1/5, 2/5, 3/5, 4/5, ...)*"""

    NRM_SCHEME_NONE   = "none"
    NRM_SCHEME_ROW    = "row"
    NRM_SCHEME_COLUMN = "column"
    NRM_SCHEME_BOTH   = "both"
    NRM_SCHEMES = [NRM_SCHEME_NONE, NRM_SCHEME_ROW, NRM_SCHEME_COLUMN, NRM_SCHEME_BOTH]
    """Available normalization schemes"""

    MAX_CPU_CORES = -1
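For illustration, a minimal sketch contrasting the two window weighting schemes documented above. The helper below is hypothetical (not part of svd2vec); it only reproduces the weight sequences from the docstrings for a symmetric window of 5 words:

``` python
# Hypothetical helper (not part of the library) contrasting the two schemes
# for a symmetric window of `window` words around the focus word.
def window_weight(distance, window, scheme):
    # distance is 1 for the word immediately next to the focus word
    if scheme == "harmonic":
        return 1.0 / distance                    # 1, 1/2, 1/3, 1/4, 1/5
    if scheme == "word2vec":
        return (window - distance + 1) / window  # 1, 4/5, 3/5, 2/5, 1/5
    raise ValueError("unknown scheme: " + scheme)

print([round(window_weight(d, 5, "harmonic"), 2) for d in range(1, 6)])  # [1.0, 0.5, 0.33, 0.25, 0.2]
print([round(window_weight(d, 5, "word2vec"), 2) for d in range(1, 6)])  # [1.0, 0.8, 0.6, 0.4, 0.2]
```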
@@ -161,14 +204,17 @@ class svd2vec:
    def bar(self, yielder=None, desc=None, total=None, offset=0):
        disable = not self.verbose
        notebook = Utils.running_notebook()
        func = tqdm_notebook if notebook else tqdm
        format = None if notebook else "{desc: <30} {percentage:3.0f}% {bar}"
        return func(
            iterable=yielder,
            desc=desc,
            leave=False,
            total=total,
            disable=disable,
            position=offset,
            bar_format=format)
    def skipgram_weighted_count_matrix(self):
        file = TemporaryArray((self.vocabulary_len, self.vocabulary_len), np.dtype('float16'))
@@ -271,14 +317,48 @@ class svd2vec:
    #####

    def save(self, path):
        """
        Saves the svd2vec object to the given path.

        Parameters
        ----------
        path : string
            The file path to write the object to. The directories should exist.
        """
        with bz2.open(path, "wb") as file:
            pickle.dump(self, file)
    def load(path):
        """
        Loads a previously saved svd2vec object from a path.

        Parameters
        ----------
        path : string
            The file path to load the object from.

        Returns
        -------
        svd2vec
            A new `svd2vec` object
        """
        with bz2.open(path, "rb") as file:
            return pickle.load(file)
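A minimal usage sketch of the save/load pair above (the file name is arbitrary; `svd` is a trained model as in the notebook):

``` python
# Round-trip a trained model through a compressed pickle on disk.
svd.save("model.svd2vec")
restored = svd2vec.load("model.svd2vec")
print(restored.similarity("paris", "france"))
```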
    def save_word2vec_format(self, path):
        """
        Saves the word vectors to the given path using the same format as
        word2vec. The file can then be used by other modules or libraries
        able to load word2vec vectors.

        Parameters
        ----------
        path : string
            The file path to write the vectors to. The directories should exist.
        """
        with open(path, "w") as f:
            print(str(self.vocabulary_len) + " " + str(self.size), file=f)
            for word in self.vocabulary:
@@ -327,18 +407,95 @@ class svd2vec:
        return top / bot
    def similarity(self, x, y):
        """
        Computes and returns the cosine similarity of the two given words.

        Parameters
        ----------
        x : string
            The first word to compute the similarity of
        y : string
            The second word to compute the similarity of

        Returns
        -------
        float
            The cosine similarity between the two words

        Warning
        -------
        The two words ``x`` and ``y`` should have been trained during the
        initialization step.
        """
        wx, cx = self.vectors(x)
        wy, cy = self.vectors(y)
        sim = self.cosine_similarity(wx, cx, wy, cy)
        return sim
    def distance(self, x, y):
        """
        Computes and returns the cosine distance of the two given words.

        Parameters
        ----------
        x : string
            The first word to compute the distance of
        y : string
            The second word to compute the distance of

        Returns
        -------
        float
            The cosine distance between the two words

        Raises
        ------
        ValueError
            If either x or y has not been trained during the initialization step.

        Warning
        -------
        The two words ``x`` and ``y`` should have been trained during the
        initialization step.
        """
        sim = self.similarity(x, y)
        return 1 - sim
    def most_similar(self, positive=[], negative=[], topn=10):
        """
        Computes and returns the most similar words to those given in positive
        and negative.

        Parameters
        ----------
        positive : list of string
            Each word in positive contributes positively to the output words
        negative : list of string
            Each word in negative contributes negatively to the output words
        topn : int
            The number of similar words to output

        Returns
        -------
        list of ``(word, similarity)``
            Each tuple is a similar word with its similarity to the given words.

        Raises
        ------
        ValueError
            If no input is given in either positive or negative
        ValueError
            If some words have not been trained during the initialization step.

        Warning
        -------
        The input words should have been trained during the initialization
        step.
        """
        if not isinstance(positive, list) or not isinstance(negative, list):
@@ -376,10 +533,39 @@ class svd2vec:
        most_similar = heapq.nlargest(topn, similiarities.items(), key=itemgetter(1))
        return most_similar
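A brief usage sketch of `most_similar` (the scores depend on the trained model):

``` python
# Five nearest neighbours of "paris"; analogy-style queries add negative words.
svd.most_similar(positive=["paris"], topn=5)
```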
    def analogy(self, exampleA, answerA, exampleB, topn=10):
        """
        Returns the topn most probable answers to the analogy question
        "exampleA is to answerA as exampleB is to ?"

        Parameters
        ----------
        exampleA : string
            The first word to "train" the analogy on
        answerA : string
            The second word to "train" the analogy on
        exampleB : string
            The word the analogy answer is asked for
        topn : int
            The number of probable answers to output

        Returns
        -------
        list of (word, similarity)
            Each word and similarity is a probable answer to the analogy

        Raises
        ------
        ValueError
            If some words have not been trained during the initialization step.

        Warning
        -------
        The three input words should have been trained during the
        initialization step.
        """
        return self.most_similar(positive=[exampleB, answerA], negative=[exampleA], topn=topn)
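A note on the construction: passing `positive=[exampleB, answerA]` and `negative=[exampleA]` answers the analogy with the additive vector-offset scheme popularized by word2vec, i.e. it searches for the word whose vector is closest to `answerA - exampleA + exampleB`.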
    def vectors(self, word):
        if word in self.vocabulary:
svd2vec/utils.py:

@@ -31,3 +31,10 @@ class Utils:
        size[var] = asizeof.asizeof(inner_obj)
        size["total"] += size[var]
        return size

    def running_notebook():
        if 'IPython' in sys.modules:
            from IPython import get_ipython
            return 'IPKernelApp' in get_ipython().config
        else:
            return False
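For context, a standalone sketch of the notebook-detection idea this helper enables in `bar()` above: pick `tqdm_notebook` when running under a Jupyter kernel, plain `tqdm` otherwise. This is an illustrative rewrite, not the library code:

``` python
import sys

def pick_progress_bar():
    """Return a tqdm flavour suited to the current environment."""
    # Only probe IPython if it is already imported; a plain script never imports it.
    if 'IPython' in sys.modules:
        from IPython import get_ipython
        ipython = get_ipython()
        # 'IPKernelApp' appears in the config when running under a Jupyter kernel.
        if ipython is not None and 'IPKernelApp' in ipython.config:
            from tqdm import tqdm_notebook
            return tqdm_notebook
    from tqdm import tqdm
    return tqdm

bar = pick_progress_bar()
for _ in bar(range(1000), desc="demo"):
    pass
```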