Commit 42094045 authored by Gaëtan Caillaut

xp with attention on seq classif

parent 31c365cb
@@ -2,7 +2,7 @@
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
-#SBATCH --job-name camembert-t2
+#SBATCH --job-name camembert-t2-v2
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t1_fs-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t1/train.csv"
DEV="data/lemmatized/t1/dev.csv"
TEST="data/lemmatized/t1/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t1_fs/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t1_fs/lemmatized"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
for E in $(seq -f "%05g" 0 10 40); do
for D in 32; do
for ATT in "self-attention" "non-transforming" "semi-transforming"; do
for POS in "none" "fixed"; do
T1_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
if ((10#$E>0)); then
CHECKPOINT="${OUT_DIR}/${T1_RUN_NAME}/checkpoint-${E}.tar"
python train.py t1-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
else
python train.py t1-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
fi
done
done
done
done
\ No newline at end of file
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name 1-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t1/train.csv"
DEV="data/lemmatized/t1/dev.csv"
TEST="data/lemmatized/t1/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t1/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t1/lemmatized"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
for E in $(seq -f "%05g" 0 10 40); do
for D in 32; do
for ATT in "self-attention" "non-transforming" "semi-transforming"; do
for POS in "none" "fixed"; do
MLM_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax"
T1_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
if ((10#$E>0)); then
CHECKPOINT="${OUT_DIR}/${T1_RUN_NAME}/checkpoint-${E}.tar"
python train.py t1 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
else
python train.py t1 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
fi
done
done
done
done
\ No newline at end of file
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t2_fs-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t2/train.csv"
DEV="data/lemmatized/t2/dev.csv"
TEST="data/lemmatized/t2/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t2_fs/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t2_fs/lemmatized"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
for E in $(seq -f "%05g" 0 10 40); do
for D in 32; do
for ATT in "self-attention" "non-transforming" "semi-transforming"; do
for POS in "none" "fixed"; do
T2_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
if ((10#$E>0)); then
CHECKPOINT="${OUT_DIR}/${T2_RUN_NAME}/checkpoint-${E}.tar"
python train.py t2-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
else
python train.py t2-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
fi
done
done
done
done
\ No newline at end of file
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t2-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr
eval "$(conda shell.bash hook)"
conda activate polysemy
TRAIN="data/lemmatized/t2/train.csv"
DEV="data/lemmatized/t2/dev.csv"
TEST="data/lemmatized/t2/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t2/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t2/lemmatized"
for d in ${OUT_DIR} ${LOGDIR}; do
if [ ! -d ${d} ]; then
mkdir -p ${d}
fi
done
export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"
set -x
set -e
for E in $(seq -f "%05g" 0 10 40); do
for D in 32; do
for ATT in "self-attention" "non-transforming" "semi-transforming"; do
for POS in "none" "fixed"; do
MLM_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax"
T2_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
if ((10#$E>0)); then
CHECKPOINT="${OUT_DIR}/${T2_RUN_NAME}/checkpoint-${E}.tar"
python train.py t2 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
else
python train.py t2 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
fi
done
done
done
done
\ No newline at end of file
@@ -17,10 +17,10 @@ class MyCamembertForSequenceClassification(torch.nn.Module):
super(MyCamembertForSequenceClassification, self).__init__()
self.camembert = CamembertModel.from_pretrained("camembert-base")
-self.l1 = torch.nn.Linear(768, 768 / 2, bias=True)
+self.l1 = torch.nn.Linear(768, 768 // 2, bias=True)
self.l1_activation_fun = parse_activation_function("gelu")
-self.l2 = torch.nn.Linear(768/2, num_labels, bias=True)
+self.l2 = torch.nn.Linear(768//2, num_labels, bias=True)
self.l2_activation_fun = parse_activation_function("none")
def forward(self, input, attention_mask=None):
@@ -85,9 +85,15 @@ def run_name_from_params(args):
freeze_attention = args.freeze_attention
except AttributeError:
freeze_attention = False
+try:
+wa = args.with_attention
+except AttributeError:
+wa = False
if freeze_attention:
s = f"{s}_frozen"
+if wa:
+s = f"{s}_wa"
return s
@@ -143,7 +149,7 @@ def mlm_model_from_checkpoint(checkpoint_path, device="cpu"):
return model, optimizer, prev_epoch, configuration_dict
-def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax):
+def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax, model_type=MiniBertForSequenceClassification):
if checkpoint_path is None:
vocabulary = tokenizer.get_vocab()
configuration_dict = dict(
@@ -174,23 +180,23 @@ def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max
)
configuration = MiniBertForSequenceClassificationConfiguration(
**configuration_dict)
-model = MiniBertForSequenceClassification(configuration).to(device)
+model = model_type(configuration).to(device)
if pretrained_path is not None:
state_dict = torch.load(pretrained_path)
model.load_state_dict(state_dict, strict=False)
optimizer = torch.optim.Adam(model.parameters())
prev_epoch = 0
else:
-return t1_model_from_checkpoint(checkpoint_path, device=device)
+return t1_model_from_checkpoint(checkpoint_path, device=device, model_type=model_type)
return model, optimizer, prev_epoch, configuration_dict
-def t1_model_from_checkpoint(checkpoint_path, device="cpu"):
+def t1_model_from_checkpoint(checkpoint_path, device="cpu", model_type=MiniBertForSequenceClassification):
checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
configuration_dict = checkpoint["configuration"]
configuration = MiniBertForSequenceClassificationConfiguration(
**configuration_dict)
-model = MiniBertForSequenceClassification(configuration).to(device)
+model = model_type(configuration).to(device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
@@ -198,7 +204,7 @@ def t1_model_from_checkpoint(checkpoint_path, device="cpu"):
return model, optimizer, prev_epoch, configuration_dict
-def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax):
+def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax, model_type=MiniBertForSequenceClassification):
if checkpoint_path is None:
vocabulary = tokenizer.get_vocab()
configuration_dict = dict(
@@ -229,7 +235,7 @@ def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max
)
configuration = MiniBertForSequenceClassificationConfiguration(
**configuration_dict)
-model = MiniBertForSequenceClassification(configuration).to(device)
+model = model_type(configuration).to(device)
if pretrained_path is not None:
state_dict = torch.load(pretrained_path)
model.load_state_dict(state_dict, strict=False)
@@ -237,15 +243,15 @@ def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max
prev_epoch = 0
return model, optimizer, prev_epoch, configuration_dict
else:
-return t2_model_from_checkpoint(checkpoint_path, device=device)
+return t2_model_from_checkpoint(checkpoint_path, device=device, model_type=model_type)
-def t2_model_from_checkpoint(checkpoint_path, device="cpu"):
+def t2_model_from_checkpoint(checkpoint_path, device="cpu", model_type=MiniBertForSequenceClassification):
checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
configuration_dict = checkpoint["configuration"]
configuration = MiniBertForSequenceClassificationConfiguration(
**configuration_dict)
-model = MiniBertForSequenceClassification(configuration).to(device)
+model = model_type(configuration).to(device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
@@ -405,8 +411,13 @@ def finetune_t1(args):
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
+if args.with_attention:
+model_type = MiniBertForSequenceClassificationWithAttention
+else:
+model_type = MiniBertForSequenceClassification
model, optimizer, prev_epoch, config_dict = t1_model_from_params(
-args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)
run_name = run_name_from_params(args)
try:
@@ -562,8 +573,13 @@ def t1_from_scratch(args):
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
+if args.with_attention:
+model_type = MiniBertForSequenceClassificationWithAttention
+else:
+model_type = MiniBertForSequenceClassification
model, optimizer, prev_epoch, config_dict = t1_model_from_params(
-None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)
run_name = run_name_from_params(args)
if args.logdir is None:
@@ -709,8 +725,13 @@ def finetune_t2(args):
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
+if args.with_attention:
+model_type = MiniBertForSequenceClassificationWithAttention
+else:
+model_type = MiniBertForSequenceClassification
model, optimizer, prev_epoch, config_dict = t2_model_from_params(
-args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)
run_name = run_name_from_params(args)
try:
@@ -867,8 +888,13 @@ def t2_from_scratch(args):
test_loader = DataLoader(
test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)
+if args.with_attention:
+model_type = MiniBertForSequenceClassificationWithAttention
+else:
+model_type = MiniBertForSequenceClassification
model, optimizer, prev_epoch, config_dict = t2_model_from_params(
-None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)
run_name = run_name_from_params(args)
if args.logdir is None:
@@ -1624,6 +1650,7 @@ if __name__ == "__main__":
t1_parser.add_argument("--attention-scaling", type=str, default="softmax")
t1_parser.add_argument("--position", type=str, default="fixed")
t1_parser.add_argument("--dont-normalize", action="store_true")
t1_parser.add_argument("--with-attention", action="store_true")
t1_parser.add_argument("--activation", type=str, default="gelu")
t1_parser.add_argument("--device", type=str, default="cpu")
@@ -1654,6 +1681,7 @@ if __name__ == "__main__":
type=str, default="softmax")
t1fs_parser.add_argument("--position", type=str, default="fixed")
t1fs_parser.add_argument("--dont-normalize", action="store_true")
t1fs_parser.add_argument("--with-attention", action="store_true")
t1fs_parser.add_argument("--activation", type=str, default="gelu")
t1fs_parser.add_argument("--device", type=str, default="cpu")
@@ -1684,6 +1712,7 @@ if __name__ == "__main__":
t2_parser.add_argument("--attention-scaling", type=str, default="softmax")
t2_parser.add_argument("--position", type=str, default="fixed")
t2_parser.add_argument("--dont-normalize", action="store_true")
t2_parser.add_argument("--with-attention", action="store_true")
t2_parser.add_argument("--freeze-attention", action="store_true")
t2_parser.add_argument("--activation", type=str, default="gelu")
@@ -1714,6 +1743,7 @@ if __name__ == "__main__":
type=str, default="softmax")
t2fs_parser.add_argument("--position", type=str, default="fixed")
t2fs_parser.add_argument("--dont-normalize", action="store_true")
t2fs_parser.add_argument("--with-attention", action="store_true")
t2fs_parser.add_argument("--activation", type=str, default="gelu")
t2fs_parser.add_argument("--device", type=str, default="cpu")