#!/bin/sh
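#
# job-train-tokenizers.sh: train a minibert tokenizer for each hate-speech
# dataset, once on the Cleaned column and once on the Lemmatized column.
#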

# Make the local minibert sources importable by train_minibert.py.
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"

# Abort on the first failing command and trace each command as it runs.
set -e
set -x

# Hate-speech dataset (unsuffixed tokenizer files).
python train_minibert.py train-tokenizer --input data/lemmatized-hate-speech-dataset.csv --output data/tokenizer.json --column Cleaned
python train_minibert.py train-tokenizer --input data/lemmatized-hate-speech-dataset.csv --output data/tokenizer-lemmatized.json --column Lemmatized

# Davidson dataset.
python train_minibert.py train-tokenizer --input data/lemmatized-davidson-dataset.csv --output data/tokenizer-davidson.json --column Cleaned
python train_minibert.py train-tokenizer --input data/lemmatized-davidson-dataset.csv --output data/tokenizer-davidson-lemmatized.json --column Lemmatized

# Gibert dataset.
python train_minibert.py train-tokenizer --input data/lemmatized-gibert-dataset.csv --output data/tokenizer-gibert.json --column Cleaned
python train_minibert.py train-tokenizer --input data/lemmatized-gibert-dataset.csv --output data/tokenizer-gibert-lemmatized.json --column Lemmatized
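
# The same work expressed as a loop, kept commented out so the explicit
# commands above remain the ones that run. A sketch only: it assumes the
# lemmatized-<name>-dataset.csv / tokenizer[-<name>].json naming pattern
# holds for any dataset added later.
#
# for name in hate-speech davidson gibert; do
#     case $name in
#         hate-speech) out=tokenizer ;;
#         *)           out=tokenizer-$name ;;
#     esac
#     python train_minibert.py train-tokenizer \
#         --input "data/lemmatized-$name-dataset.csv" \
#         --output "data/$out.json" --column Cleaned
#     python train_minibert.py train-tokenizer \
#         --input "data/lemmatized-$name-dataset.csv" \
#         --output "data/$out-lemmatized.json" --column Lemmatized
# done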