Commit 8115b3ad authored by Gaëtan Caillaut
Browse files

Add an option specifying input’s size

parent 13c4c2a6
......@@ -61,9 +61,12 @@ if __name__ == "__main__":
assert train_ratio + dev_ratio + test_ratio == 1
n = 0
with gzip.open(args.input, "rt", encoding="UTF-8") as infile:
for _ in infile:
n += 1
if args.n is None:
with gzip.open(args.input, "rt", encoding="UTF-8") as infile:
for _ in infile:
n += 1
else:
n = args.n
ids = random.sample(range(n), n)
train_end = math.ceil(n * train_ratio)
......@@ -137,6 +140,7 @@ if __name__ == "__main__":
split_parser.add_argument("-d", "--dev", default=0.1)
split_parser.add_argument("-T", "--test", default=0.1)
split_parser.add_argument("-f", "--force", action="store_true")
split_parser.add_argument("-n", required=False, type=int)
split_parser.set_defaults(func=_split)
train_tokenizer_parser = subparsers.add_parser("train-tokenizer")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment