Commit fe69ba87 authored by Gaëtan Caillaut
Browse files

modif pour compatibilité avec gensim

parent 1c5204d7
......@@ -53,12 +53,13 @@ class JsonDataset(Dataset):
class SncfDataset(IterableDataset):
def __init__(self, path, size, stride=None):
def __init__(self, path, size, stride=None, split_sentences=False):
self.path = Path(path).expanduser()
self.files = list(self.path.glob("**/*.txt"))
self.vocabulary = None
self.size = size
self.stride = stride or size
self.split_sentences = split_sentences
self._data = None
def __iter__(self):
......@@ -69,7 +70,11 @@ class SncfDataset(IterableDataset):
for l in f:
spl = l.strip().split()
for i in range(0, len(spl), self.stride):
self._data.append(" ".join(spl[i:i + self.size]))
sent = " ".join(spl[i:i + self.size])
if self.split_sentences:
self._data.append(sent.replace("(", " ( ").replace(")", " ) ").split())
else:
self._data.append(sent)
return iter(self._data)
def split(self, output_dir, train=0.6, dev=0.2, test=0.2):
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment