Gaëtan Caillaut / minibert-deft2018 / Commits / 42094045

Commit 42094045 authored Apr 08, 2021 by Gaëtan Caillaut

xp with attention on seq classif

parent 31c365cb
Changes: 6 files
slurm_scripts/job_t2_camembert_v2.sh
...
@@ -2,7 +2,7 @@
 #SBATCH -N 1
 #SBATCH -p gpu
 #SBATCH --gres gpu:rtx6000:1
-#SBATCH --job-name camembert-t2
+#SBATCH --job-name camembert-t2-v2
 #SBATCH --time 10-0
 #SBATCH --mem 20G
 #SBATCH -o logs/out-%j.txt
...
slurm_scripts/with-attention/job_t1_fs_lemmatized.sh
0 → 100755
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t1_fs-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr

eval "$(conda shell.bash hook)"
conda activate polysemy

TRAIN="data/lemmatized/t1/train.csv"
DEV="data/lemmatized/t1/dev.csv"
TEST="data/lemmatized/t1/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t1_fs/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t1_fs/lemmatized"

for d in ${OUT_DIR} ${LOGDIR}; do
    if [ ! -d ${d} ]; then
        mkdir -p ${d}
    fi
done

export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"

set -x
set -e

for E in $(seq -f "%05g" 0 10 40); do
    for D in 32; do
        for ATT in "self-attention" "non-transforming" "semi-transforming"; do
            for POS in "none" "fixed"; do
                T1_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
                if (( 10#$E > 0 )); then
                    CHECKPOINT="${OUT_DIR}/${T1_RUN_NAME}/checkpoint-${E}.tar"
                    python train.py t1-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
                else
                    python train.py t1-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
                fi
            done
        done
    done
done
\ No newline at end of file
slurm_scripts/with-attention/job_t1_lemmatized.sh
0 → 100755
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name 1-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr

eval "$(conda shell.bash hook)"
conda activate polysemy

TRAIN="data/lemmatized/t1/train.csv"
DEV="data/lemmatized/t1/dev.csv"
TEST="data/lemmatized/t1/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t1/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t1/lemmatized"

for d in ${OUT_DIR} ${LOGDIR}; do
    if [ ! -d ${d} ]; then
        mkdir -p ${d}
    fi
done

export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"

set -x
set -e

for E in $(seq -f "%05g" 0 10 40); do
    for D in 32; do
        for ATT in "self-attention" "non-transforming" "semi-transforming"; do
            for POS in "none" "fixed"; do
                MLM_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax"
                T1_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
                if (( 10#$E > 0 )); then
                    CHECKPOINT="${OUT_DIR}/${T1_RUN_NAME}/checkpoint-${E}.tar"
                    python train.py t1 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
                else
                    python train.py t1 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
                fi
            done
        done
    done
done
\ No newline at end of file
slurm_scripts/with-attention/job_t2_fs_lemmatized.sh
0 → 100755
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t2_fs-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr

eval "$(conda shell.bash hook)"
conda activate polysemy

TRAIN="data/lemmatized/t2/train.csv"
DEV="data/lemmatized/t2/dev.csv"
TEST="data/lemmatized/t2/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t2_fs/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t2_fs/lemmatized"

for d in ${OUT_DIR} ${LOGDIR}; do
    if [ ! -d ${d} ]; then
        mkdir -p ${d}
    fi
done

export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"

set -x
set -e

for E in $(seq -f "%05g" 0 10 40); do
    for D in 32; do
        for ATT in "self-attention" "non-transforming" "semi-transforming"; do
            for POS in "none" "fixed"; do
                T2_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
                if (( 10#$E > 0 )); then
                    CHECKPOINT="${OUT_DIR}/${T2_RUN_NAME}/checkpoint-${E}.tar"
                    python train.py t2-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
                else
                    python train.py t2-fs ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
                fi
            done
        done
    done
done
\ No newline at end of file
slurm_scripts/with-attention/job_t2_lemmatized.sh
0 → 100755
#!/bin/bash
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH --gres gpu:rtx6000:1
#SBATCH --job-name t2-lemmatized-wa
#SBATCH --time 10-0
#SBATCH --mem 20G
#SBATCH -o logs/out-%j.txt
#SBATCH -e logs/err-%j.txt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=gaetan.caillaut@univ-lemans.fr

eval "$(conda shell.bash hook)"
conda activate polysemy

TRAIN="data/lemmatized/t2/train.csv"
DEV="data/lemmatized/t2/dev.csv"
TEST="data/lemmatized/t2/test.csv"
TOKENIZER="output/tokenizer_lemmatized.json"
PRETRAINED_DIR="models/lemmatized"
OUT_DIR="models/t2/lemmatized"
BS=512
DEVICE="cuda"
LOGDIR="runs/t2/lemmatized"

for d in ${OUT_DIR} ${LOGDIR}; do
    if [ ! -d ${d} ]; then
        mkdir -p ${d}
    fi
done

export PYTHONPATH="/lium/raid01_b/gcaillaut/polysemy/minibert:${PYTHONPATH}"

set -x
set -e

for E in $(seq -f "%05g" 0 10 40); do
    for D in 32; do
        for ATT in "self-attention" "non-transforming" "semi-transforming"; do
            for POS in "none" "fixed"; do
                MLM_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax"
                T2_RUN_NAME="d${D}_${ATT}_${POS}_gelu_norm_h1d1_softmax_wa"
                if (( 10#$E > 0 )); then
                    CHECKPOINT="${OUT_DIR}/${T2_RUN_NAME}/checkpoint-${E}.tar"
                    python train.py t2 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR} --checkpoint ${CHECKPOINT}
                else
                    python train.py t2 ${TRAIN} ${TEST} ${DEV} ${TOKENIZER} "${PRETRAINED_DIR}/${MLM_RUN_NAME}/minibert-model.pt" -o ${OUT_DIR} -d ${D} --attention ${ATT} --position ${POS} --epochs 10 --bs ${BS} --device ${DEVICE} --logdir ${LOGDIR}
                fi
            done
        done
    done
done
\ No newline at end of file
train.py
...
@@ -17,10 +17,10 @@ class MyCamembertForSequenceClassification(torch.nn.Module):
         super(MyCamembertForSequenceClassification, self).__init__()
         self.camembert = CamembertModel.from_pretrained("camembert-base")
-        self.l1 = torch.nn.Linear(768, 768/2, bias=True)
+        self.l1 = torch.nn.Linear(768, 768//2, bias=True)
         self.l1_activation_fun = parse_activation_function("gelu")
-        self.l2 = torch.nn.Linear(768/2, num_labels, bias=True)
+        self.l2 = torch.nn.Linear(768//2, num_labels, bias=True)
         self.l2_activation_fun = parse_activation_function("none")

     def forward(self, input, attention_mask=None):
...
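The change from 768/2 to 768//2 in this hunk is more than cosmetic: in Python 3, / is true division and returns a float, while torch.nn.Linear expects integer feature sizes. A quick standalone check (not part of the diff) illustrating why floor division is needed:

import torch

# "/" is true division in Python 3 and yields a float, even for exact halves.
assert 768 / 2 == 384.0 and isinstance(768 / 2, float)

# nn.Linear(in_features, out_features) wants ints; a float size is rejected.
try:
    torch.nn.Linear(768, 768 / 2, bias=True)
except TypeError as err:
    print("float size rejected:", err)

# Floor division keeps an integer, so this constructs a 768 -> 384 layer.
layer = torch.nn.Linear(768, 768 // 2, bias=True)
print(layer)  # Linear(in_features=768, out_features=384, bias=True)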
@@ -85,9 +85,15 @@ def run_name_from_params(args):
         freeze_attention = args.freeze_attention
     except AttributeError:
         freeze_attention = False

+    try:
+        wa = args.with_attention
+    except AttributeError:
+        wa = False
+
     if freeze_attention:
         s = f"{s}_frozen"

+    if wa:
+        s = f"{s}_wa"
+
     return s
...
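For context, the "_wa" suffix appended here is what makes the run names line up with the directories the new with-attention SLURM scripts expect (e.g. d32_self-attention_fixed_gelu_norm_h1d1_softmax_wa). A minimal sketch of that naming pattern, using a hypothetical build_run_name helper rather than the project's actual run_name_from_params:

# Hypothetical helper mirroring the run-name pattern visible in the job scripts;
# the real logic lives in run_name_from_params(args) in train.py.
def build_run_name(d, attention, position, with_attention=False, freeze_attention=False):
    s = f"d{d}_{attention}_{position}_gelu_norm_h1d1_softmax"
    if freeze_attention:
        s = f"{s}_frozen"
    if with_attention:
        s = f"{s}_wa"
    return s

# Matches T1_RUN_NAME / T2_RUN_NAME in the slurm_scripts/with-attention/ scripts.
print(build_run_name(32, "self-attention", "fixed", with_attention=True))
# d32_self-attention_fixed_gelu_norm_h1d1_softmax_wa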
@@ -143,7 +149,7 @@ def mlm_model_from_checkpoint(checkpoint_path, device="cpu"):
     return model, optimizer, prev_epoch, configuration_dict


-def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax):
+def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax, model_type=MiniBertForSequenceClassification):
     if checkpoint_path is None:
         vocabulary = tokenizer.get_vocab()
         configuration_dict = dict(
...
@@ -174,23 +180,23 @@ def t1_model_from_params(pretrained_path, d, attention, position, tokenizer, max
         )
         configuration = MiniBertForSequenceClassificationConfiguration(**configuration_dict)
-        model = MiniBertForSequenceClassification(configuration).to(device)
+        model = model_type(configuration).to(device)

         if pretrained_path is not None:
             state_dict = torch.load(pretrained_path)
             model.load_state_dict(state_dict, strict=False)

         optimizer = torch.optim.Adam(model.parameters())
         prev_epoch = 0
     else:
-        return t1_model_from_checkpoint(checkpoint_path, device=device)
+        return t1_model_from_checkpoint(checkpoint_path, device=device, model_type=model_type)

     return model, optimizer, prev_epoch, configuration_dict


-def t1_model_from_checkpoint(checkpoint_path, device="cpu"):
+def t1_model_from_checkpoint(checkpoint_path, device="cpu", model_type=MiniBertForSequenceClassification):
     checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     configuration_dict = checkpoint["configuration"]
     configuration = MiniBertForSequenceClassificationConfiguration(**configuration_dict)
-    model = MiniBertForSequenceClassification(configuration).to(device)
+    model = model_type(configuration).to(device)
     model.load_state_dict(checkpoint["model_state_dict"])
     optimizer = torch.optim.Adam(model.parameters())
     optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
...
@@ -198,7 +204,7 @@ def t1_model_from_checkpoint(checkpoint_path, device="cpu"):
     return model, optimizer, prev_epoch, configuration_dict


-def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax):
+def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=None, height=1, depth=1, attention_scaling=softmax, model_type=MiniBertForSequenceClassification):
     if checkpoint_path is None:
         vocabulary = tokenizer.get_vocab()
         configuration_dict = dict(
...
@@ -229,7 +235,7 @@ def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max
         )
         configuration = MiniBertForSequenceClassificationConfiguration(**configuration_dict)
-        model = MiniBertForSequenceClassification(configuration).to(device)
+        model = model_type(configuration).to(device)

         if pretrained_path is not None:
             state_dict = torch.load(pretrained_path)
             model.load_state_dict(state_dict, strict=False)
...
@@ -237,15 +243,15 @@ def t2_model_from_params(pretrained_path, d, attention, position, tokenizer, max
         prev_epoch = 0
         return model, optimizer, prev_epoch, configuration_dict
     else:
-        return t2_model_from_checkpoint(checkpoint_path, device=device)
+        return t2_model_from_checkpoint(checkpoint_path, device=device, model_type=model_type)


-def t2_model_from_checkpoint(checkpoint_path, device="cpu"):
+def t2_model_from_checkpoint(checkpoint_path, device="cpu", model_type=MiniBertForSequenceClassification):
     checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     configuration_dict = checkpoint["configuration"]
     configuration = MiniBertForSequenceClassificationConfiguration(**configuration_dict)
-    model = MiniBertForSequenceClassification(configuration).to(device)
+    model = model_type(configuration).to(device)
     model.load_state_dict(checkpoint["model_state_dict"])
     optimizer = torch.optim.Adam(model.parameters())
     optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
...
@@ -405,8 +411,13 @@ def finetune_t1(args):
     test_loader = DataLoader(test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)

+    if args.with_attention:
+        model_type = MiniBertForSequenceClassificationWithAttention
+    else:
+        model_type = MiniBertForSequenceClassification
+
     model, optimizer, prev_epoch, config_dict = t1_model_from_params(
-        args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+        args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)

     run_name = run_name_from_params(args)
     try:
...
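The model_type argument threaded through t1_model_from_params above is just a class object that the factory instantiates, so the same construction code serves both the plain classifier and the with-attention variant. A standalone sketch of the pattern with placeholder class names (not the project's real classes):

# Placeholder classes standing in for MiniBertForSequenceClassification and its
# with-attention variant; only the "pass a class, instantiate it later" idea matters.
class PlainClassifier:
    def __init__(self, config):
        self.config = config

class ClassifierWithAttentionOutput(PlainClassifier):
    pass

def model_from_params(config, model_type=PlainClassifier):
    # Same shape as t1_model_from_params / t2_model_from_params after this commit.
    return model_type(config)

with_attention = True
model_type = ClassifierWithAttentionOutput if with_attention else PlainClassifier
model = model_from_params({"d": 32}, model_type=model_type)
print(type(model).__name__)  # ClassifierWithAttentionOutput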
@@ -562,8 +573,13 @@ def t1_from_scratch(args):
     test_loader = DataLoader(test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)

+    if args.with_attention:
+        model_type = MiniBertForSequenceClassificationWithAttention
+    else:
+        model_type = MiniBertForSequenceClassification
+
     model, optimizer, prev_epoch, config_dict = t1_model_from_params(
-        None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+        None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)

     run_name = run_name_from_params(args)

     if args.logdir is None:
...
@@ -709,8 +725,13 @@ def finetune_t2(args):
     test_loader = DataLoader(test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)

+    if args.with_attention:
+        model_type = MiniBertForSequenceClassificationWithAttention
+    else:
+        model_type = MiniBertForSequenceClassification
+
     model, optimizer, prev_epoch, config_dict = t2_model_from_params(
-        args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+        args.model, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)

     run_name = run_name_from_params(args)
     try:
...
@@ -867,8 +888,13 @@ def t2_from_scratch(args):
     test_loader = DataLoader(test_dataset, collate_fn=deft_collater, batch_size=args.bs, pin_memory=pin_memory)

+    if args.with_attention:
+        model_type = MiniBertForSequenceClassificationWithAttention
+    else:
+        model_type = MiniBertForSequenceClassification
+
     model, optimizer, prev_epoch, config_dict = t2_model_from_params(
-        None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling)
+        None, args.d, attention_type, position_type, tokenizer, max_seq_size, mask_token, pad_token, device, checkpoint_path=args.checkpoint, height=args.height, depth=args.depth, attention_scaling=attention_scaling, model_type=model_type)

     run_name = run_name_from_params(args)

     if args.logdir is None:
...
@@ -1624,6 +1650,7 @@ if __name__ == "__main__":
     t1_parser.add_argument("--attention-scaling", type=str, default="softmax")
     t1_parser.add_argument("--position", type=str, default="fixed")
     t1_parser.add_argument("--dont-normalize", action="store_true")
+    t1_parser.add_argument("--with-attention", action="store_true")
     t1_parser.add_argument("--activation", type=str, default="gelu")
     t1_parser.add_argument("--device", type=str, default="cpu")
...
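Note that argparse turns the new --with-attention flag into the attribute args.with_attention (dashes become underscores), which is the name read by run_name_from_params and the finetune/from-scratch functions above. A tiny standalone demonstration of that behaviour:

import argparse

# Minimal parser containing only the flag added in this commit.
parser = argparse.ArgumentParser()
parser.add_argument("--with-attention", action="store_true")

print(parser.parse_args([]).with_attention)                    # False by default
print(parser.parse_args(["--with-attention"]).with_attention)  # True when passed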
@@ -1654,6 +1681,7 @@ if __name__ == "__main__":
         type=str, default="softmax")
     t1fs_parser.add_argument("--position", type=str, default="fixed")
     t1fs_parser.add_argument("--dont-normalize", action="store_true")
+    t1fs_parser.add_argument("--with-attention", action="store_true")
     t1fs_parser.add_argument("--activation", type=str, default="gelu")
     t1fs_parser.add_argument("--device", type=str, default="cpu")
...
@@ -1684,6 +1712,7 @@ if __name__ == "__main__":
     t2_parser.add_argument("--attention-scaling", type=str, default="softmax")
     t2_parser.add_argument("--position", type=str, default="fixed")
     t2_parser.add_argument("--dont-normalize", action="store_true")
+    t2_parser.add_argument("--with-attention", action="store_true")
     t2_parser.add_argument("--freeze-attention", action="store_true")
     t2_parser.add_argument("--activation", type=str, default="gelu")
...
@@ -1714,6 +1743,7 @@ if __name__ == "__main__":
         type=str, default="softmax")
     t2fs_parser.add_argument("--position", type=str, default="fixed")
     t2fs_parser.add_argument("--dont-normalize", action="store_true")
+    t2fs_parser.add_argument("--with-attention", action="store_true")
     t2fs_parser.add_argument("--activation", type=str, default="gelu")
     t2fs_parser.add_argument("--device", type=str, default="cpu")
...