Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Gaëtan Caillaut
MiniBert
Commits
8c8d7d84
Commit
8c8d7d84
authored
Nov 05, 2020
by
Gaëtan Caillaut
Browse files
init with w2v
parent
16aadbc4
Pipeline
#628
canceled with stages
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
train_semeval/init_with_w2v.py
0 → 100644
View file @
8c8d7d84
import
sys
import
torch
import
os
import
itertools
from
gensim.models
import
Word2Vec
from
corpus
import
*
try
:
from
minibert
import
*
except
:
sys
.
path
.
append
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))))
from
minibert
import
*
from
torch.utils.tensorboard
import
SummaryWriter
def build_batches(seqs, bs=5):
    """Group sequences into batches of equal-length items.

    Sequences are sorted by length so each batch holds only sequences of a
    single length (required to stack them into one rectangular tensor), and
    no batch holds more than ``bs`` items.

    :param seqs: iterable of sequences (e.g. token lists)
    :param bs: maximum number of sequences per batch
    :return: list of batches, each a non-empty list of same-length sequences

    Note: the original implementation appended the freshly-reset (empty)
    batch at each boundary, which dropped the first batch and duplicated the
    last one; this version appends the *completed* batch instead.
    """
    seqs = sorted(seqs, key=len)
    if not seqs:
        # Guard: the original indexed seqs[0] and raised IndexError here.
        return []
    res = []
    b = []
    prev_len = len(seqs[0])
    for x in seqs:
        # Close the current batch on a length change or when it is full.
        if len(x) != prev_len or len(b) >= bs:
            prev_len = len(x)
            if b:
                res.append(b)
            b = []
        b.append(x)
    if b:
        res.append(b)
    return res
def build_one_tensor_batch(b, voc2idx):
    """Turn one batch of same-length token sequences into an index tensor.

    Each token is mapped through ``voc2idx``; the result is a LongTensor of
    shape (batch, seq_len) with gradient tracking disabled.
    """
    rows = []
    for sentence in b:
        rows.append([voc2idx[token] for token in sentence])
    return torch.tensor(rows, dtype=torch.long, requires_grad=False)
def build_tensor_batches(batches, voc2idx):
    """Convert every batch of token sequences to its index tensor."""
    return [build_one_tensor_batch(batch, voc2idx) for batch in batches]
def eval_model(model, sentences, voc2idx, mask):
    """Masked-token accuracy of ``model`` over ``sentences``.

    For each sentence, every position is masked in turn (one masked copy per
    position) and the model's prediction at that position is compared to the
    true token. The model's training flag is restored before returning.

    :param model: minibert model exposing ``train`` / ``set_train``
    :param sentences: iterable of raw sentence strings
    :param voc2idx: token -> vocabulary-index mapping
    :param mask: the mask token string
    :return: (accuracy, number of correct predictions)
    """
    # NOTE(review): ``model.train`` is read as an attribute here (custom
    # minibert flag), not called like nn.Module.train() — confirm upstream.
    saved_train_flag = model.train
    model.set_train(False)
    correct = 0
    total = 0
    with torch.no_grad():
        for sentence in sentences:
            tokens = sentence.split()
            n = len(tokens)
            # Row i is the sentence with token i replaced by the mask.
            masked = [tokens[:i] + [mask] + tokens[i + 1:] for i in range(n)]
            batch = build_one_tensor_batch(masked, voc2idx)
            logits = model(batch)
            # The prediction for position i lives at logits[i, i, :]
            # (row i of the batch is the copy masked at position i).
            diag = torch.stack([logits[i, i, :] for i in range(n)])
            predicted = torch.argmax(diag, dim=1)
            expected = torch.tensor(
                [voc2idx[t] for t in tokens], dtype=torch.int
            )
            correct += torch.sum(expected == predicted).item()
            total += n
    model.set_train(saved_train_flag)
    return correct / total, correct
if __name__ == "__main__":
    src_dir = os.path.dirname(os.path.realpath(__file__))

    # Load the trial corpus (XML) and wrap it in the simplifier.
    crps = Corpus(os.path.join(src_dir, "trial_corpus.xml"))
    #crps = Corpus(os.path.join(src_dir, "test_corpus.xml"))
    crps = CorpusSimplifier(crps)

    # Vocabulary = corpus tokens plus the mask token; sorted so the
    # token -> index mapping is stable across runs.
    mask_token = "<mask>"
    voc = sorted(list(crps.compute_vocabulary().union({mask_token})))
    voc2idx = {x: i for i, x in enumerate(voc)}
    mask_idx = voc2idx[mask_token]

    # Tokenize once, then batch and convert to index tensors.
    tokenized = [sent.split() for sent in crps]
    batches = build_batches(tokenized)
    train_tensors = build_tensor_batches(batches, voc2idx)

    emb_dim = 64
    voc_size = len(voc)

    # BUGFIX: the original passed ``map(str.split, crps)`` here. Word2Vec
    # iterates over ``sentences`` more than once (vocabulary scan, then
    # training), and a one-shot ``map`` iterator escapes gensim's generator
    # check, so training silently ran on an exhausted iterator. Pass the
    # materialized token lists instead.
    # NOTE: ``size`` is the gensim<4 keyword (renamed ``vector_size`` in 4.0).
    w2v = Word2Vec(sentences=tokenized, size=emb_dim, workers=4, min_count=0)

    # Initialize the embedding matrix from word2vec vectors; tokens unknown
    # to word2vec (e.g. the mask token) keep an all-zero row.
    embs = torch.full((voc_size, emb_dim), 0.0, dtype=torch.float)
    for x, i in voc2idx.items():
        try:
            embs[i, :] = torch.from_numpy(w2v.wv.get_vector(x).copy())
        except KeyError:
            print(f"No embedding for '{x}'", file=sys.stderr)

    model = MiniBertForTraining(emb_dim, voc_size, mask_idx, hidden_dim=64)
    # Seed minibert's embedding table with the word2vec-initialized matrix.
    model.minibert.embedding.word_embeddings.weight = torch.nn.Parameter(embs)

    learning_rate = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)

    writer = SummaryWriter()
    for epoch in range(10000):
        cumloss = 0
        for x in train_tensors:
            output, loss = model(x)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            cumloss += loss.item()

        # Full-corpus masked-token evaluation once per epoch.
        precision, nb_pos = eval_model(model, crps, voc2idx, mask_token)
        writer.add_scalar("Cumulated loss/train", cumloss, epoch)
        writer.add_scalar("Averaged loss/train", cumloss / len(train_tensors), epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("True positives/train", nb_pos, epoch)

        # Log the embedding projector view every 100 epochs.
        if epoch % 100 == 0:
            writer.add_embedding(
                model.minibert.embedding.word_embeddings.weight,
                metadata=voc,
                global_step=epoch,
                tag="Embeddings",
            )
        writer.flush()
    writer.close()
    model.set_train(False)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment