Commit 8c8d7d84 authored by Gaëtan Caillaut

init with w2v

parent 16aadbc4
import sys
import torch
import os
import itertools
from gensim.models import Word2Vec
from corpus import *
try:
    from minibert import *
except ImportError:
    sys.path.append(os.path.dirname(
        os.path.dirname(os.path.realpath(__file__))))
    from minibert import *
from torch.utils.tensorboard import SummaryWriter
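
# Train a MiniBert model on a small corpus with a masked-token objective,
# initialising its word embeddings from a gensim word2vec model ("init with w2v").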

def build_batches(seqs, bs=5):
    # Group sequences of identical length together so each batch can later be
    # stacked into a single tensor; batches hold at most `bs` sequences.
    seqs = sorted(seqs, key=len)
    res = []
    b = []
    prev_len = len(seqs[0])
    i = 0
    for x in seqs:
        if len(x) != prev_len or i >= bs:
            # Close the current batch before starting a new one.
            res.append(b)
            prev_len = len(x)
            b = []
            i = 0
        b.append(x)
        i = i + 1
    if len(b) > 0:
        res.append(b)
    return res
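
# Illustrative sketch of the intended behaviour (example inputs are made up):
# with bs=2, build_batches([["a", "b"], ["c", "d"], ["e", "f", "g"]]) groups the
# two length-2 sentences into one batch and the length-3 sentence into its own:
#   [[["a", "b"], ["c", "d"]], [["e", "f", "g"]]]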

def build_one_tensor_batch(b, voc2idx):
    # Map every token to its vocabulary index; all sentences in `b` share the
    # same length, so the result is a (batch_size, seq_len) LongTensor.
    return torch.tensor([
        [voc2idx[x] for x in sent] for sent in b
    ], dtype=torch.long, requires_grad=False)

def build_tensor_batches(batches, voc2idx):
    # Convert every batch of token lists into its index tensor.
    res = []
    for b in batches:
        tensor_batch = build_one_tensor_batch(b, voc2idx)
        res.append(tensor_batch)
    return res
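
# For instance (hypothetical vocabulary): with voc2idx = {"<mask>": 0, "cat": 1,
# "sat": 2, "the": 3}, build_one_tensor_batch([["the", "cat"], ["the", "cat"]], voc2idx)
# returns tensor([[3, 1], [3, 1]]) with shape (batch_size=2, seq_len=2).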

def eval_model(model, sentences, voc2idx, mask):
    # Masked-token accuracy: every token of every sentence is masked in turn
    # and the model must recover it from its context.
    train_backup = model.train  # remember the current training mode so it can be restored below
    model.set_train(False)
    pos = 0
    total = 0
    with torch.no_grad():
        for sent in sentences:
            splitted = sent.split()
            # Build one input per position, with that position replaced by the mask token.
            test_tokens = []
            for i in range(len(splitted)):
                toks = splitted.copy()
                toks[i] = mask
                test_tokens.append(toks)
            test_tensors = build_one_tensor_batch(test_tokens, voc2idx)
            output = model(test_tensors)
            # Row i of the batch has position i masked, so read the prediction there.
            out_probs = torch.stack([
                output[i, i, :] for i in range(len(splitted))
            ])
            out_labels = torch.argmax(out_probs, dim=1)
            expected_labels = torch.tensor(
                [voc2idx[x] for x in splitted], dtype=torch.long)
            pos = pos + torch.sum(expected_labels == out_labels).item()
            total = total + len(splitted)
    model.set_train(train_backup)
    return pos / total, pos
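
# For a hypothetical sentence "the cat sat", eval_model builds three inputs,
# ["<mask>", "cat", "sat"], ["the", "<mask>", "sat"] and ["the", "cat", "<mask>"],
# and compares the prediction at the masked position of row i against the
# original token at position i.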

if __name__ == "__main__":
    src_dir = os.path.dirname(os.path.realpath(__file__))
    crps = Corpus(os.path.join(src_dir, "trial_corpus.xml"))
    # crps = Corpus(os.path.join(src_dir, "test_corpus.xml"))
    crps = CorpusSimplifier(crps)

    mask_token = "<mask>"
    voc = sorted(list(crps.compute_vocabulary().union({mask_token})))
    voc2idx = {x: i for i, x in enumerate(voc)}
    mask_idx = voc2idx[mask_token]

    tokenized = [sent.split() for sent in crps]
    batches = build_batches(tokenized)
    train_tensors = build_tensor_batches(batches, voc2idx)
    emb_dim = 64
    voc_size = len(voc)

    # gensim iterates over the corpus more than once (vocabulary building, then
    # training), so a materialised list is used instead of a one-shot map object.
    # Note: `size` is the gensim < 4.0 parameter name (renamed to `vector_size` in 4.x).
    crps_tokens = list(map(str.split, crps))
    w2v = Word2Vec(sentences=crps_tokens, size=emb_dim, workers=4, min_count=0)

    # Copy the word2vec vectors into the initial embedding matrix, one row per
    # vocabulary index; tokens never seen by word2vec (e.g. the mask token) keep a zero vector.
    embs = torch.full((voc_size, emb_dim), 0.0, dtype=torch.float)
    for x, i in voc2idx.items():
        try:
            embs[i, :] = torch.from_numpy(w2v.wv.get_vector(x).copy())
        except KeyError:
            print(f"No embedding for '{x}'", file=sys.stderr)

    model = MiniBertForTraining(emb_dim, voc_size, mask_idx, hidden_dim=64)
    model.minibert.embedding.word_embeddings.weight = torch.nn.Parameter(embs)
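
    # The embedding weight was just replaced with a fresh Parameter, so the optimizer
    # below is created afterwards and therefore updates the word2vec-initialised tensor.
    # A hypothetical sanity check one could add here:
    # assert model.minibert.embedding.word_embeddings.weight.shape == (voc_size, emb_dim)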
    learning_rate = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)

    writer = SummaryWriter()
    ibatch = 0
    for epoch in range(10000):
        cumloss = 0
        for x in train_tensors:
            output, loss = model(x)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            cumloss += loss.item()

        # Masked-token accuracy over the whole corpus.
        precision, nb_pos = eval_model(model, crps, voc2idx, mask_token)

        writer.add_scalar("Cumulated loss/train", cumloss, epoch)
        writer.add_scalar("Averaged loss/train", cumloss / len(train_tensors), epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("True positives/train", nb_pos, epoch)
        if epoch % 100 == 0:
            writer.add_embedding(model.minibert.embedding.word_embeddings.weight,
                                 metadata=voc, global_step=epoch, tag="Embeddings")
        writer.flush()

    writer.close()
    model.set_train(False)
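
    # To browse the logged curves and embedding projections afterwards (assuming the
    # default SummaryWriter output directory): tensorboard --logdir runs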