Commit cb65454a authored by Gaëtan Caillaut's avatar Gaëtan Caillaut

camembert v2

parent 164924ca
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name camembert-t1-lemmatized-v2
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t1/train.csv"
DEV="data/lemmatized/t1/dev.csv"
TEST="data/lemmatized/t1/test.csv"
TOKENIZER="output/tokenizer.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t1/lemmatized/camembert-v2"
BS=200
DEVICE="cuda"
LOGDIR="runs/t1/lemmatized/camembert-v2"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
python train.py camembert-t1-v2 ${TRAIN} ${TEST} ${DEV} --outdir ${OUT_DIR} --bs ${BS} -e 10 --epochs-between-save 1 --logdir ${LOGDIR} --device ${DEVICE}
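# Submission sketch (the filename below is an assumption; the diff does not name this file):
#   sbatch train_camembert_t1_lemmatized_v2.sh
# Slurm writes stdout/stderr to logs/out-<jobid>.txt and logs/err-<jobid>.txt,
# so logs/ must exist before submitting.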
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name camembert-t1-v2
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/cleaned/t1/train.csv"
DEV="data/cleaned/t1/dev.csv"
TEST="data/cleaned/t1/test.csv"
TOKENIZER="output/tokenizer.json"
PRETRAINED_DIR="models/cleaned"
OUT_DIR="models/t1/cleaned/camembert-v2"
BS=200
DEVICE="cuda"
LOGDIR="runs/t1/cleaned/camembert-v2"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
python train.py camembert-t1-v2 ${TRAIN} ${TEST} ${DEV} --outdir ${OUT_DIR} --bs ${BS} -e 10 --epochs-between-save 1 --logdir ${LOGDIR} --device ${DEVICE}
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name camembert-t2-lemmatized-v2
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t2/train.csv"
DEV="data/lemmatized/t2/dev.csv"
TEST="data/lemmatized/t2/test.csv"
TOKENIZER="output/tokenizer.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t2/lemmatized/camembert-v2"
BS=200
DEVICE="cuda"
LOGDIR="runs/t2/lemmatized/camembert-v2"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
python train.py camembert-t2-v2 ${TRAIN} ${TEST} ${DEV} --outdir ${OUT_DIR} --bs ${BS} -e 10 --epochs-between-save 1 --logdir ${LOGDIR} --device ${DEVICE}
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name camembert-t2-v2
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/cleaned/t2/train.csv"
DEV="data/cleaned/t2/dev.csv"
TEST="data/cleaned/t2/test.csv"
TOKENIZER="output/tokenizer.json"
PRETRAINED_DIR="models/cleaned"
OUT_DIR="models/t2/cleaned/camembert-v2"
BS=200
DEVICE="cuda"
LOGDIR="runs/t2/cleaned/camembert-v2"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
python train.py camembert-t2-v2 ${TRAIN} ${TEST} ${DEV} --outdir ${OUT_DIR} --bs ${BS} -e 10 --epochs-between-save 1 --logdir ${LOGDIR} --device ${DEVICE}
@@ -2,15 +2,48 @@ from pathlib import Path
from datetime import datetime, timedelta
from torch.utils.tensorboard import SummaryWriter
from tokenizers import Tokenizer
from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, CamembertModel
from datasets import *
from evaluation import *
import argparse
import sys
import torch
from minibert import *
class MyCamembertForSequenceClassification(torch.nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.camembert = CamembertModel.from_pretrained("camembert-base")
        # Two-layer classification head on top of the 768-d CamemBERT embeddings.
        self.l1 = torch.nn.Linear(768, 768 // 2, bias=True)
        self.l1_activation_fun = parse_activation_function("gelu")
        self.l2 = torch.nn.Linear(768 // 2, num_labels, bias=True)
        self.l2_activation_fun = parse_activation_function("none")

    def forward(self, input, attention_mask=None):
        # Last hidden state of the encoder: (batch, seq_len, 768).
        x = self.camembert(input_ids=input, attention_mask=attention_mask)[0]
        # Average tokens for sentence classification
        if attention_mask is None:
            x = torch.mean(x, dim=1)
        else:
            # Mean over non-padding tokens only.
            averaged = torch.zeros(
                (x.size(0), 768), dtype=torch.float, device=x.device)
            for i in range(x.size(0)):
                token_embs = x[i, attention_mask[i, :] > 0, :]
                averaged[i, :] = torch.mean(token_embs, dim=0)
            x = averaged
        x = self.l1(x)
        x = self.l1_activation_fun(x)
        x = self.l2(x)
        x = self.l2_activation_fun(x, dim=1)
        # Logits of shape (batch, num_labels), consumed by CrossEntropyLoss.
        return x
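# Usage sketch (illustrative only; `tok`, `clf` and the example sentence are
# hypothetical, not part of train.py):
#   tok = CamembertTokenizerFast.from_pretrained("camembert-base")
#   enc = tok(["Une phrase d'exemple."], return_tensors="pt", padding=True)
#   clf = MyCamembertForSequenceClassification(num_labels=2)
#   logits = clf(enc["input_ids"], attention_mask=enc["attention_mask"])
# `logits` has shape (batch, num_labels); the training loops below feed it to
# CrossEntropyLoss.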
def parse_position(s):
position_mapper = {
"none": PositionalEmbeddingType.NONE,
@@ -1098,6 +1131,153 @@ def finetune_t1_camembert(args):
torch.save(optimizer.state_dict(), str(optimizer_out))
def finetune_t1_camembert_v2(args):
device = args.device
pin_memory = device != "cpu"
    camembert_tokenizer = CamembertTokenizerFast.from_pretrained(
        "camembert-base")
if args.checkpoint is None:
model = MyCamembertForSequenceClassification(num_labels=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
prev_epoch = 0
else:
checkpoint = torch.load(args.checkpoint)
# configuration = checkpoint["configuration"]
device = checkpoint["device"]
        model = MyCamembertForSequenceClassification(num_labels=2)
        model.load_state_dict(checkpoint["model_state_dict"])
        # Move to the target device before (re)creating the optimizer.
        model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
prev_epoch = checkpoint["epoch"]
model.train()
deft_collater = DEFT2018CollaterForCamembert(
camembert_tokenizer, T1Dataset.labels_to_id())
train_dataset = T1Dataset.from_csv(args.train)
dev_dataset = T1Dataset.from_csv(args.dev)
test_dataset = T1Dataset.from_csv(args.test)
if args.sample:
train_dataset = train_dataset[:50]
test_dataset = test_dataset[:50]
train_loader = DataLoader(
train_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
dev_loader = DataLoader(
dev_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
outdir = Path(args.outdir)
if args.checkpoint is None:
outdir.mkdir(exist_ok=True)
writer = SummaryWriter(log_dir=args.logdir)
    loss_fun = torch.nn.CrossEntropyLoss()
print("BEGIN TRAINING", flush=True)
for epoch in range(prev_epoch + 1, prev_epoch + 1 + args.epochs):
model.train()
cumloss = 0
t0_epoch = datetime.now()
batch_cumulated_time = timedelta()
for batch_id, (x, attention_mask, labels) in enumerate(train_loader, 1):
t0_batch = datetime.now()
x = x.to(device)
attention_mask = attention_mask.to(device)
# wids = wids.to(device)
labels = labels.to(device)
optimizer.zero_grad()
output = model(x, attention_mask=attention_mask)
loss = loss_fun(output, labels)
loss.backward()
optimizer.step()
cumloss += loss.item()
t1_batch = datetime.now()
batch_time = t1_batch - t0_batch
batch_cumulated_time += batch_time
if batch_id % args.show_progress == 0:
print(
f"EPOCH {epoch} - BATCH {batch_id:05} - LOSS {loss.item()} - TIME {batch_cumulated_time}", flush=True)
batch_cumulated_time = timedelta()
mean_loss = cumloss / len(train_loader)
writer.add_scalar("Loss/train", mean_loss, epoch)
t1_epoch = datetime.now()
print(
f"EPOCH {epoch:04} - MEAN LOSS {mean_loss} - TIME {t1_epoch - t0_epoch}", flush=True)
if epoch % args.epochs_between_save == 0:
model.eval()
tp_dev, fp_dev, fn_dev, recall_dev, precision_dev, fmeasure_dev = fmeasure_deft2018_t1(
model, dev_loader, device)
tp_test, fp_test, fn_test, recall_test, precision_test, fmeasure_test = fmeasure_deft2018_t1(
model, test_loader, device)
writer.add_scalar("dev/true positives", tp_dev, epoch)
writer.add_scalar("dev/false positives", fp_dev, epoch)
writer.add_scalar("dev/false negatives", fn_dev, epoch)
writer.add_scalar("dev/recall", recall_dev, epoch)
writer.add_scalar("dev/precision", precision_dev, epoch)
writer.add_scalar("dev/fmeasure", fmeasure_dev, epoch)
writer.add_scalar("test/true positives", tp_test, epoch)
writer.add_scalar("test/false positives", fp_test, epoch)
writer.add_scalar("test/false negatives", fn_test, epoch)
writer.add_scalar("test/recall", recall_test, epoch)
writer.add_scalar("test/precision", precision_test, epoch)
writer.add_scalar("test/fmeasure", fmeasure_test, epoch)
checkpoint = {
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"device": device,
# "configuration": model.config,
"perf": {
"dev": {
"recall": recall_dev,
"precision": precision_dev,
"fmeasure": fmeasure_dev,
"tp": tp_dev,
"fp": fp_dev,
"fn": fn_dev
},
"test": {
"recall": recall_test,
"precision": precision_test,
"fmeasure": fmeasure_test,
"tp": tp_test,
"fp": fp_test,
"fn": fn_test
},
}
}
outfile = Path(outdir, f"checkpoint-{epoch:05}.tar")
torch.save(checkpoint, str(outfile))
writer.flush()
writer.close()
model_out = Path(outdir, "t1-model.pt")
optimizer_out = Path(outdir, "t1-optimizer.pt")
torch.save(model.state_dict(), str(model_out))
torch.save(optimizer.state_dict(), str(optimizer_out))
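# Resuming sketch (paths are hypothetical): any checkpoint-XXXXX.tar written above
# can be passed back through -c/--checkpoint to continue training, e.g.
#   python train.py camembert-t1-v2 train.csv test.csv dev.csv \
#       --outdir models/t1/... --logdir runs/t1/... -c models/t1/.../checkpoint-00005.tar
# The epoch counter, model weights and optimizer state are restored in the
# checkpoint branch at the top of this function.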
def finetune_t2_camembert(args):
device = args.device
pin_memory = device != "cpu"
@@ -1248,6 +1428,154 @@ def finetune_t2_camembert(args):
torch.save(optimizer.state_dict(), str(optimizer_out))
def finetune_t2_camembert_v2(args):
device = args.device
pin_memory = device != "cpu"
    camembert_tokenizer = CamembertTokenizerFast.from_pretrained(
        "camembert-base")
if args.checkpoint is None:
model = MyCamembertForSequenceClassification(num_labels=4)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
prev_epoch = 0
else:
checkpoint = torch.load(args.checkpoint)
# configuration = checkpoint["configuration"]
device = checkpoint["device"]
        model = MyCamembertForSequenceClassification(num_labels=4)
        model.load_state_dict(checkpoint["model_state_dict"])
        # Move to the target device before (re)creating the optimizer.
        model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
prev_epoch = checkpoint["epoch"]
model.train()
deft_collater = DEFT2018CollaterForCamembert(
camembert_tokenizer, T2Dataset.labels_to_id())
classes = set(T2Dataset.labels_to_id().values())
train_dataset = T2Dataset.from_csv(args.train)
dev_dataset = T2Dataset.from_csv(args.dev)
test_dataset = T2Dataset.from_csv(args.test)
if args.sample:
train_dataset = train_dataset[:10]
test_dataset = test_dataset[:10]
train_loader = DataLoader(
train_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
dev_loader = DataLoader(
dev_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
outdir = Path(args.outdir)
if args.checkpoint is None:
outdir.mkdir(exist_ok=True)
writer = SummaryWriter(log_dir=args.logdir)
    loss_fun = torch.nn.CrossEntropyLoss()
print("BEGIN TRAINING", flush=True)
for epoch in range(prev_epoch + 1, prev_epoch + 1 + args.epochs):
model.train()
cumloss = 0
t0_epoch = datetime.now()
batch_cumulated_time = timedelta()
for batch_id, (x, attention_mask, labels) in enumerate(train_loader, 1):
t0_batch = datetime.now()
x = x.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)
optimizer.zero_grad()
            output = model(x, attention_mask=attention_mask)
loss = loss_fun(output, labels)
loss.backward()
optimizer.step()
cumloss += loss.item()
t1_batch = datetime.now()
batch_time = t1_batch - t0_batch
batch_cumulated_time += batch_time
if batch_id % args.show_progress == 0:
print(
f"EPOCH {epoch} - BATCH {batch_id:05} - LOSS {loss.item()} - TIME {batch_cumulated_time}", flush=True)
batch_cumulated_time = timedelta()
mean_loss = cumloss / len(train_loader)
writer.add_scalar("Loss/train", mean_loss, epoch)
t1_epoch = datetime.now()
print(
f"EPOCH {epoch:04} - MEAN LOSS {mean_loss} - TIME {t1_epoch - t0_epoch}", flush=True)
if epoch % args.epochs_between_save == 0:
model.eval()
tp_dev, fp_dev, fn_dev, recall_dev, precision_dev, fmeasure_dev = fmeasure_deft2018_t2(
model, dev_loader, classes, device)
tp_test, fp_test, fn_test, recall_test, precision_test, fmeasure_test = fmeasure_deft2018_t2(
model, test_loader, classes, device)
writer.add_scalar("dev/true positives", tp_dev, epoch)
writer.add_scalar("dev/false positives", fp_dev, epoch)
writer.add_scalar("dev/false negatives", fn_dev, epoch)
writer.add_scalar("dev/recall", recall_dev, epoch)
writer.add_scalar("dev/precision", precision_dev, epoch)
writer.add_scalar("dev/fmeasure", fmeasure_dev, epoch)
writer.add_scalar("test/true positives", tp_test, epoch)
writer.add_scalar("test/false positives", fp_test, epoch)
writer.add_scalar("test/false negatives", fn_test, epoch)
writer.add_scalar("test/recall", recall_test, epoch)
writer.add_scalar("test/precision", precision_test, epoch)
writer.add_scalar("test/fmeasure", fmeasure_test, epoch)
checkpoint = {
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"device": device,
# "configuration": model.config,
"perf": {
"dev": {
"recall": recall_dev,
"precision": precision_dev,
"fmeasure": fmeasure_dev,
"tp": tp_dev,
"fp": fp_dev,
"fn": fn_dev
},
"test": {
"recall": recall_test,
"precision": precision_test,
"fmeasure": fmeasure_test,
"tp": tp_test,
"fp": fp_test,
"fn": fn_test
},
}
}
outfile = Path(outdir, f"checkpoint-{epoch:05}.tar")
torch.save(checkpoint, str(outfile))
writer.flush()
writer.close()
model_out = Path(outdir, "t2-model.pt")
optimizer_out = Path(outdir, "t2-optimizer.pt")
torch.save(model.state_dict(), str(model_out))
torch.save(optimizer.state_dict(), str(optimizer_out))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -1425,7 +1753,37 @@ if __name__ == "__main__":
cam_t2_parser.add_argument("--epochs-between-save", default=10, type=int)
cam_t2_parser.add_argument("--show-progress", default=50, type=int)
cam_t2_parser.add_argument("--sample", action="store_true")
    cam_t2_parser.set_defaults(func=finetune_t2_camembert_v2)
cam_t1_parser = subparsers.add_parser("camembert-t1-v2")
cam_t1_parser.add_argument("train")
cam_t1_parser.add_argument("test")
cam_t1_parser.add_argument("dev")
cam_t1_parser.add_argument("-o", "--outdir", type=str)
cam_t1_parser.add_argument("--bs", type=int, default=128)
cam_t1_parser.add_argument("-e", "--epochs", type=int, default=100)
cam_t1_parser.add_argument("--device", type=str, default="cpu")
cam_t1_parser.add_argument("--logdir", type=str, required=True)
cam_t1_parser.add_argument("-c", "--checkpoint", type=str, required=False)
cam_t1_parser.add_argument("--epochs-between-save", default=10, type=int)
cam_t1_parser.add_argument("--show-progress", default=50, type=int)
cam_t1_parser.add_argument("--sample", action="store_true")
    cam_t1_parser.set_defaults(func=finetune_t1_camembert_v2)
cam_t2_parser = subparsers.add_parser("camembert-t2-v2")
cam_t2_parser.add_argument("train")
cam_t2_parser.add_argument("test")
cam_t2_parser.add_argument("dev")
cam_t2_parser.add_argument("-o", "--outdir", type=str)
cam_t2_parser.add_argument("--bs", type=int, default=128)
cam_t2_parser.add_argument("-e", "--epochs", type=int, default=100)
cam_t2_parser.add_argument("--device", type=str, default="cpu")
cam_t2_parser.add_argument("--logdir", type=str, required=True)
cam_t2_parser.add_argument("-c", "--checkpoint", type=str, required=False)
cam_t2_parser.add_argument("--epochs-between-save", default=10, type=int)
cam_t2_parser.add_argument("--show-progress", default=50, type=int)
cam_t2_parser.add_argument("--sample", action="store_true")
cam_t2_parser.set_defaults(func=finetune_t2_camembert_v2)
args = parser.parse_args()
args.func(args)