Commit 79b8097d authored by Gaëtan Caillaut

trained models

parent d127dbfe
import pandas as pd
import torch
from tokenizers import Tokenizer
from torch.utils.data import DataLoader
from pathlib import Path

from data import *
from evaluation import *
from minibert import *
from train_minibert import *

if __name__ == "__main__":
    import argparse

    def _eval_minibert(args):
        # Default to CUDA when available, unless a device was given explicitly.
        if args.device is None:
            device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        else:
            device = torch.device(args.device)
        # Pin host memory only when batches will be copied to the GPU.
        pm = device == torch.device("cuda")

        tokenizer = Tokenizer.from_file(args.tokenizer)
        pad_id = tokenizer.token_to_id("<pad>")

        dataset = HateSpeechDataset(args.input)
        collater = HateSpeechCollater(tokenizer)
        class_names = collater.class_names()

        model = minibert_model_for_hatespeech(tokenizer).to(device)
        state_dict = torch.load(args.model, map_location=device)
        model.load_state_dict(state_dict)

        loader = DataLoader(dataset, collate_fn=collater,
                            shuffle=False, batch_size=128, pin_memory=pm)

        # One row per non-padding token of every evaluated sentence.
        res_dict = {
            "token_id": [],
            "token": [],
            "attention_output": [],
            "predicted": [],
            "predicted_id": [],
            "expected": [],
            "expected_id": [],
            "sentence": [],
        }

        model.eval()
        with torch.no_grad():
            for batch, (inputs, labels) in enumerate(loader, 1):
                x = inputs["input_ids"].to(device)
                attention_mask = inputs["attention_mask"].to(device)
                labels = labels.to(device)

                outputs, att, att_out = model(
                    x, attention_mask, return_attention=True)
                predictions = torch.argmax(outputs, dim=-1)

                for i in range(x.size(0)):
                    # Keep only the real tokens of the i-th sentence.
                    not_pad = x[i] != pad_id
                    ids = x[i, not_pad].tolist()
                    tokens = [tokenizer.id_to_token(id) for id in ids]
                    attention_output = att_out[i, not_pad].tolist()
                    # The sentence-level prediction and gold label are
                    # repeated so that every token row carries them.
                    predicted = [class_names[predictions[i].item()]] * len(tokens)
                    expected = [class_names[labels[i].item()]] * len(tokens)
                    predicted_id = [predictions[i].item()] * len(tokens)
                    expected_id = [labels[i].item()] * len(tokens)
                    sentence = [" ".join(tokens)] * len(tokens)

                    res_dict["token_id"].extend(ids)
                    res_dict["token"].extend(tokens)
                    res_dict["attention_output"].extend(attention_output)
                    res_dict["predicted"].extend(predicted)
                    res_dict["expected"].extend(expected)
                    res_dict["predicted_id"].extend(predicted_id)
                    res_dict["expected_id"].extend(expected_id)
                    res_dict["sentence"].extend(sentence)

        # Write the per-token table next to the evaluated checkpoint.
        output = Path(Path(args.model).parent, "attention-stats.csv")
        df = pd.DataFrame(res_dict)
        df.to_csv(str(output), index=False)

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    eval_parser = subparsers.add_parser("eval")
    eval_parser.add_argument(
        "-m", "--model", default="output/minibert/davidson-final/minibert-adam-fold1.pt")
    eval_parser.add_argument(
        "-i", "--input", default="data/cleaned-davidson-dataset.csv")
    eval_parser.add_argument(
        "-t", "--tokenizer", default="data/tokenizer-davidson.json")
    # NOTE: --column is parsed but not consumed by _eval_minibert.
    eval_parser.add_argument("-c", "--column", default="Cleaned")
    eval_parser.add_argument("--device", required=False)
    eval_parser.set_defaults(func=_eval_minibert)

    args = parser.parse_args()
    args.func(args)
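Since the eval script writes one CSV row per non-padding token, the resulting attention-stats.csv lends itself to plain pandas aggregation. A minimal follow-up sketch; the input path and the chosen analyses are illustrative assumptions, not part of this commit:

import pandas as pd

# Hypothetical inspection of the CSV written by _eval_minibert; this path
# matches the default Davidson output location and is an assumption.
df = pd.read_csv("output/minibert/davidson-final/attention-stats.csv")

# Rows are per token, so deduplicate on the sentence column to recover
# sentence-level predictions before scoring.
sentences = df.drop_duplicates(subset="sentence")
accuracy = (sentences["predicted_id"] == sentences["expected_id"]).mean()
print(f"sentence-level accuracy: {accuracy:.3f}")

# Confusion counts between gold and predicted classes.
print(pd.crosstab(sentences["expected"], sentences["predicted"]))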
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-davidson-dataset.csv \
--output output/minibert/davidson-lemmatized-final \
--epochs 10 \
--optimizer adam \
--device cpu \
--jobname minibert/davidson-lemmatized-final \
--tokenizer data/tokenizer-davidson-lemmatized.json \
--column Lemmatized \
--folds 1
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python eval_minibert.py eval --model output/minibert/davidson-final/minibert-adam-fold1.pt \
--input data/cleaned-davidson-dataset.csv \
--tokenizer data/tokenizer-davidson.json \
--column Cleaned \
--device cpu
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
set -x
set -e
python eval_minibert.py eval --model output/minibert/davidson-final/minibert-adam-fold1.pt \
--input data/cleaned-davidson-dataset.csv \
--tokenizer data/tokenizer-davidson.json \
--column Cleaned \
--device cpu
python eval_minibert.py eval --model output/minibert/gibert-final/minibert-adam-fold1.pt \
--input data/cleaned-gibert-dataset.csv \
--tokenizer data/tokenizer-gibert.json \
--column Cleaned \
--device cpu
python eval_minibert.py eval --model output/minibert/merged-final/minibert-adam-fold1.pt \
--input data/cleaned-hate-speech-dataset.csv \
--tokenizer data/tokenizer.json \
--column Cleaned \
--device cpu
python eval_minibert.py eval --model output/minibert/davidson-lemmatized-final/minibert-adam-fold1.pt \
--input data/lemmatized-davidson-dataset.csv \
--tokenizer data/tokenizer-davidson-lemmatized.json \
--column Lemmatized \
--device cpu
python eval_minibert.py eval --model output/minibert/gibert-lemmatized-final/minibert-adam-fold1.pt \
--input data/lemmatized-gibert-dataset.csv \
--tokenizer data/tokenizer-gibert-lemmatized.json \
--column Lemmatized \
--device cpu
python eval_minibert.py eval --model output/minibert/merged-lemmatized-final/minibert-adam-fold1.pt \
--input data/lemmatized-hate-speech-dataset.csv \
--tokenizer data/tokenizer-lemmatized.json \
--column Lemmatized \
--device cpu
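The six eval invocations above differ only in their model, input, tokenizer, and column arguments. As a sketch, the same runs could be driven from a single Python loop; the tuples below only restate the paths from the script, while the subprocess wiring is my assumption:

import subprocess

# The (model, input, tokenizer, column) combinations from the script above.
RUNS = [
    ("output/minibert/davidson-final/minibert-adam-fold1.pt",
     "data/cleaned-davidson-dataset.csv", "data/tokenizer-davidson.json", "Cleaned"),
    ("output/minibert/gibert-final/minibert-adam-fold1.pt",
     "data/cleaned-gibert-dataset.csv", "data/tokenizer-gibert.json", "Cleaned"),
    ("output/minibert/merged-final/minibert-adam-fold1.pt",
     "data/cleaned-hate-speech-dataset.csv", "data/tokenizer.json", "Cleaned"),
    ("output/minibert/davidson-lemmatized-final/minibert-adam-fold1.pt",
     "data/lemmatized-davidson-dataset.csv", "data/tokenizer-davidson-lemmatized.json", "Lemmatized"),
    ("output/minibert/gibert-lemmatized-final/minibert-adam-fold1.pt",
     "data/lemmatized-gibert-dataset.csv", "data/tokenizer-gibert-lemmatized.json", "Lemmatized"),
    ("output/minibert/merged-lemmatized-final/minibert-adam-fold1.pt",
     "data/lemmatized-hate-speech-dataset.csv", "data/tokenizer-lemmatized.json", "Lemmatized"),
]

for model, data, tokenizer, column in RUNS:
    subprocess.run(
        ["python", "eval_minibert.py", "eval",
         "--model", model, "--input", data,
         "--tokenizer", tokenizer, "--column", column,
         "--device", "cpu"],
        check=True,  # stop on the first failure, mirroring `set -e`
    )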
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-gibert-dataset.csv \
--output output/minibert/gibert-lemmatized-final \
--epochs 10 \
--optimizer adam \
--device cpu \
--jobname minibert/gibert-lemmatized-final \
--tokenizer data/tokenizer-gibert-lemmatized.json \
--column Lemmatized \
--folds 1
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-hate-speech-dataset.csv \
--output output/minibert/merged-lemmatized-final \
--epochs 10 \
--optimizer adam \
--device cpu \
--jobname minibert/merged-lemmatized-final \
--tokenizer data/tokenizer-lemmatized.json \
--column Lemmatized \
--folds 1