Commit 67715ef6 authored by Gaëtan Caillaut
Browse files

sort vocabulary in Corpus.vocabulary

parent 5ef82cee
......@@ -53,7 +53,7 @@ if __name__ == "__main__":
crps = load_corpus(args.corpus, args.simplify)
mask_token = "<mask>"
voc = sorted(crps.vocabulary().union({mask_token}))
voc = sorted(set(crps.vocabulary()).union({mask_token}))
voc2idx = {x: i for i, x in enumerate(voc)}
mask_idx = voc2idx[mask_token]
......
......@@ -21,10 +21,10 @@ class BaseCorpus():
res = set()
for s in self:
res.update(set(tokenizer(s)))
return res
return sorted(res)
def dtm(self, tokenizer=str.split):
voc = sorted(self.vocabulary(tokenizer=tokenizer))
voc = self.vocabulary(tokenizer=tokenizer)
voc2idx = {v: i for i, v in enumerate(voc)}
res = torch.zeros((len(self), len(voc)), dtype=torch.int)
for i, doc in enumerate(self):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment