init_with_w2v.py
import sys
import torch
import os
import itertools
from gensim.models import Word2Vec
from corpus import *
try:
    from minibert import *
except ImportError:
    # Fall back to the repository root when minibert is not installed as a package.
    sys.path.append(os.path.dirname(
        os.path.dirname(os.path.realpath(__file__))))
    from minibert import *
from torch.utils.tensorboard import SummaryWriter


def build_batches(seqs, bs=5):
    # Group sequences of identical length into batches of at most `bs` elements,
    # so each batch can later be stacked into a single tensor without padding.
    seqs = sorted(seqs, key=len)

    res = []
    b = []
    prev_len = len(seqs[0])
    i = 0

    for x in seqs:
        if len(x) != prev_len or i >= bs:
            # Close the current batch before starting a new one, otherwise the
            # first batch is lost and the last one ends up duplicated.
            if len(b) > 0:
                res.append(b)
            prev_len = len(x)
            b = []
            i = 0

        b.append(x)
        i = i + 1
    if len(b) > 0:
        res.append(b)
    return res
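
# A quick illustration of the grouping on a hypothetical toy input (not part of the corpus):
# build_batches([[1], [2, 3], [4, 5], [6, 7]], bs=2)
# -> [[[1]], [[2, 3], [4, 5]], [[6, 7]]]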


def build_one_tensor_batch(b, voc2idx):
    return torch.tensor([
        [voc2idx[x] for x in sent] for sent in b
    ], dtype=torch.long, requires_grad=False)


def build_tensor_batches(batches, voc2idx):
    res = []
    for b in batches:
        tensor_batch = build_one_tensor_batch(b, voc2idx)
        res.append(tensor_batch)
    return res


def eval_model(model, sentences, voc2idx, mask):
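    # Masked-token evaluation (descriptive summary): every token of every sentence is
    # replaced in turn by the mask token and the model must recover the original token
    # at the masked position. Returns (accuracy over all tokens, number of correct hits).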
    train_backup = model.train
    model.set_train(False)

    pos = 0
    total = 0

    with torch.no_grad():
        for sent in sentences:
            splitted = sent.split()
            test_tokens = []
            for i in range(len(splitted)):
                toks = splitted.copy()
                toks[i] = mask
                test_tokens.append(toks)

            test_tensors = build_one_tensor_batch(test_tokens, voc2idx)
            # Run the batch on the same device as the model parameters.
            test_tensors = test_tensors.to(next(model.parameters()).device)
            output = model(test_tensors)
            out_probs = torch.stack([
                output[i, i, :] for i in range(len(splitted))
            ])
            out_labels = torch.argmax(out_probs, dim=1).cpu()
            expected_labels = torch.tensor(
                [voc2idx[x] for x in splitted], dtype=torch.long)
            pos = pos + torch.sum(expected_labels == out_labels).item()
            total = total + len(splitted)

    model.set_train(train_backup)
    return pos / total, pos


if __name__ == "__main__":
    src_dir = os.path.dirname(os.path.realpath(__file__))
    crps = Corpus(os.path.join(src_dir, "trial_corpus.xml"))
    #crps = Corpus(os.path.join(src_dir, "test_corpus.xml"))
    crps = CorpusSimplifier(crps)

    mask_token = "<mask>"
    voc = sorted(list(crps.compute_vocabulary().union({mask_token})))
    voc2idx = {x: i for i, x in enumerate(voc)}
    mask_idx = voc2idx[mask_token]

    tokenized = [sent.split() for sent in crps]
    batches = build_batches(tokenized)
    train_tensors = build_tensor_batches(batches, voc2idx)

    emb_dim = 64
    voc_size = len(voc)

    # Word2Vec iterates over the corpus several times (vocabulary scan + training), so it
    # needs a re-iterable sequence rather than a one-shot `map` object; reuse `tokenized`.
    # Note: gensim < 4.0 names this parameter `size`; from 4.0 onwards it is `vector_size`.
    w2v = Word2Vec(sentences=tokenized, size=emb_dim, workers=4, min_count=0)
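    # Copy the trained Word2Vec vectors into a (voc_size, emb_dim) matrix indexed by
    # voc2idx; tokens without a trained vector (here only the mask token, since
    # min_count=0) keep a zero embedding.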
    embs = torch.full((voc_size, emb_dim), 0.0, dtype=torch.float)
    for x, i in voc2idx.items():
        try:
            embs[i, :] = torch.from_numpy(w2v.wv.get_vector(x).copy())
        except KeyError:
            print(f"No embedding for '{x}'", file=sys.stderr)

    model = MiniBertForTraining(emb_dim, voc_size, mask_idx, hidden_dim=64)
    model.minibert.embedding.word_embeddings.weight = torch.nn.Parameter(embs)
    # Move the model and its input batches to the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    train_tensors = [t.to(device) for t in train_tensors]
    learning_rate = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)

    writer = SummaryWriter()
    ibatch = 0
    for epoch in range(10000):
        cumloss = 0
        for x in train_tensors:
            output, loss = model(x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            cumloss += loss.item()
        precision, nb_pos = eval_model(model, crps, voc2idx, mask_token)
        writer.add_scalar("Cumulated loss/train", cumloss, epoch)
        writer.add_scalar("Averaged loss/train", cumloss /
                          len(train_tensors), epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("True positives/train", nb_pos, epoch)
        if epoch % 100 == 0:
            writer.add_embedding(model.minibert.embedding.word_embeddings.weight,
                                 metadata=voc, global_step=epoch, tag="Embeddings")

    writer.flush()
    writer.close()

    model.set_train(False)