Commit 13c4c2a6 authored by Gaëtan Caillaut

Initial commit

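# Preprocessing CLI for a gzipped corpus (one document per line):
# the "lemmatize" subcommand lemmatizes and POS-filters the corpus with spaCy's
# French model, "split" shuffles it into train/dev/test files, and
# "train-tokenizer" fits a byte-level BPE tokenizer with the HuggingFace
# tokenizers library.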
import gzip
import math
import random
from pathlib import Path
import spacy
from spacy.tokens import Token
from spacy import symbols as syms
from tqdm import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, normalizers, decoders, trainers, processors
from tokenizers.normalizers import NFD, StripAccents, Lowercase

# spaCy pipeline component that flags tokens whose part of speech should be
# ignored downstream (spaCy 2.x style: the component instance itself is added
# to the pipeline below).
class PosFilter:
    name = "pos_remover"

    def __init__(self, pos_to_remove):
        self.pos_to_remove = set(pos_to_remove)
        Token.set_extension("pos_filtered", default=False)

    def __call__(self, doc):
        for tok in doc:
            if tok.pos in self.pos_to_remove:
                tok._.set("pos_filtered", True)
        return doc


if __name__ == "__main__":
    import argparse
    from itertools import islice

    def _lemmatize(args):
        # Lemmatize the gzipped corpus with the French spaCy model, dropping
        # tokens whose POS belongs to the filtered classes below.
        nlp = spacy.load("fr_core_news_sm", disable=["ner"])
        pos_remover = PosFilter({
            syms.PUNCT,
            syms.CCONJ,
            syms.NUM,
            syms.DET,
            syms.PROPN,
            syms.INTJ,
            syms.SCONJ,
            syms.ADP,
            syms.SPACE
        })
        nlp.add_pipe(pos_remover, last=True)

        def _doc2str(doc):
            return " ".join(t.lemma_ for t in doc if not t._.pos_filtered)

        with gzip.open(args.input, "rt", encoding="UTF-8") as infile:
            file_iterator = infile
            if args.n is not None:
                # Only process the first n lines when -n is given.
                file_iterator = islice(file_iterator, args.n)
            docs = nlp.pipe(file_iterator, n_process=args.nprocess)
            with gzip.open(args.output, "wt", encoding="UTF-8") as outfile:
                outfile.writelines(f"{_doc2str(d)}\n" for d in tqdm(docs))

    def _split(args):
        # Split the corpus into train/dev/test sets according to the given ratios.
        train_ratio, dev_ratio, test_ratio = args.train, args.dev, args.test
        assert math.isclose(train_ratio + dev_ratio + test_ratio, 1)

        # Count the number of lines in the corpus.
        n = 0
        with gzip.open(args.input, "rt", encoding="UTF-8") as infile:
            for _ in infile:
                n += 1

        # Shuffle the line indices, then carve the permutation into three slices.
        ids = random.sample(range(n), n)
        train_end = math.ceil(n * train_ratio)
        dev_end = train_end + math.ceil(n * dev_ratio)
        train_ids, dev_ids, test_ids = ids[:train_end], ids[train_end:dev_end], ids[dev_end:]
        train_ids.sort()
        dev_ids.sort()
        test_ids.sort()

        outdir = Path(args.output).expanduser()
        outdir.mkdir(exist_ok=args.force)
        train_outfile = str(Path(outdir, "train.txt.gz"))
        dev_outfile = str(Path(outdir, "dev.txt.gz"))
        test_outfile = str(Path(outdir, "test.txt.gz"))

        with gzip.open(train_outfile, "wt", encoding="UTF-8") as trainfile, \
                gzip.open(dev_outfile, "wt", encoding="UTF-8") as devfile, \
                gzip.open(test_outfile, "wt", encoding="UTF-8") as testfile, \
                gzip.open(args.input, "rt", encoding="UTF-8") as infile:
            # The three sorted id lists partition range(n), so every input line
            # is written to exactly one of the output files.
            itrain, idev, itest = 0, 0, 0
            for i, l in enumerate(infile):
                outfile = None
                if itrain < len(train_ids) and train_ids[itrain] == i:
                    itrain += 1
                    outfile = trainfile
                elif idev < len(dev_ids) and dev_ids[idev] == i:
                    idev += 1
                    outfile = devfile
                elif itest < len(test_ids) and test_ids[itest] == i:
                    itest += 1
                    outfile = testfile
                outfile.write(l)

    def _train_tokenizer(args):
        # Train a byte-level BPE tokenizer on the gzipped corpus and save it
        # as a JSON file.
        special_tokens = ["<mask>", "<pad>", "<unk>", "<sep>"]
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        # Normalize to NFD, strip accents and lowercase before tokenization.
        tokenizer.normalizer = normalizers.Sequence([
            NFD(),
            StripAccents(),
            Lowercase()
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab,
            min_frequency=args.minfreq,
            special_tokens=special_tokens
        )
        with gzip.open(args.input, "rt", encoding="UTF-8") as infile:
            tokenizer.train_from_iterator(infile, trainer=trainer)
        tokenizer.save(args.output, pretty=True)
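    # A minimal sketch (not part of this commit) of loading the saved tokenizer
    # back with the tokenizers API; "tokenizer.json" is an illustrative output
    # path, not one used by the script:
    #   tok = Tokenizer.from_file("tokenizer.json")
    #   print(tok.encode("le chat dormir").tokens)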

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    lemmatizer_parser = subparsers.add_parser("lemmatize")
    lemmatizer_parser.add_argument("input")
    lemmatizer_parser.add_argument("output")
    lemmatizer_parser.add_argument("-n", required=False, type=int)
    lemmatizer_parser.add_argument("--nprocess", default=1, type=int)
    lemmatizer_parser.set_defaults(func=_lemmatize)

    split_parser = subparsers.add_parser("split")
    split_parser.add_argument("input")
    split_parser.add_argument("output")
    split_parser.add_argument("-t", "--train", default=0.8, type=float)
    split_parser.add_argument("-d", "--dev", default=0.1, type=float)
    split_parser.add_argument("-T", "--test", default=0.1, type=float)
    split_parser.add_argument("-f", "--force", action="store_true")
    split_parser.set_defaults(func=_split)

    train_tokenizer_parser = subparsers.add_parser("train-tokenizer")
    train_tokenizer_parser.add_argument("input")
    train_tokenizer_parser.add_argument("output")
    train_tokenizer_parser.add_argument("--vocab", default=20000, type=int)
    train_tokenizer_parser.add_argument("--minfreq", default=1, type=int)
    train_tokenizer_parser.set_defaults(func=_train_tokenizer)

    args = parser.parse_args()
    args.func(args)
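    # Example invocations, assuming the script is saved as preprocess.py
    # (script and file names here are illustrative, not from the repository):
    #   python preprocess.py lemmatize corpus.txt.gz lemmas.txt.gz --nprocess 4
    #   python preprocess.py split lemmas.txt.gz data/ --train 0.8 --dev 0.1 --test 0.1 --force
    #   python preprocess.py train-tokenizer data/train.txt.gz tokenizer.json --vocab 20000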