data.py 2.86 KB
from torch.utils.data import Dataset
import pandas as pd
from torch.nn.functional import one_hot
import torch
import random


class HateSpeechDataset(Dataset):
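    """Dataset of (sentence, label) pairs loaded from a CSV file.

    The CSV is expected to contain a text column (sent_column, "Cleaned" by
    default) and a "Class" column holding the labels.
    """
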
    def __init__(self, path, sent_column="Cleaned"):
        self.path = path
        # Load the CSV and drop rows with missing values
        df = pd.read_csv(self.path).dropna()
        self.sentences = df[sent_column].tolist()
        self.labels = df["Class"].tolist()

    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return len(self.labels)

    def iter_folds(self, k=10, shuffle=True):
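        """Yield (train, test) pairs for k-fold cross-validation.

        Each split is a list of (sentence, label) tuples, except for k == 1,
        where the whole dataset is returned for both splits.
        """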
        # If there is one fold, then train = test
        if k == 1:
            yield self, self
            return

        n = len(self.sentences)
        indices = list(range(n))
        if shuffle:
            random.shuffle(indices)
        sentences = [self.sentences[i] for i in indices]
        labels = [self.labels[i] for i in indices]

        fold_len = n // k
        for i in range(k):
            start = i * fold_len
            # The last fold absorbs the remainder when n is not divisible by k
            end = n if i == k - 1 else start + fold_len

            test_sentences = sentences[start:end]
            test_labels = labels[start:end]
            train_sentences = sentences[:start] + sentences[end:]
            train_labels = labels[:start] + labels[end:]

            train = list(zip(train_sentences, train_labels))
            test = list(zip(test_sentences, test_labels))
            yield train, test


class HateSpeechCollater:
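    """Collate (sentence, label) pairs into batched tensors.

    Works with callable Hugging Face transformers tokenizers as well as
    tokenizers.Tokenizer objects (handled in the TypeError fallback).
    """
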
    def __init__(self, tokenizer, device=None):
        self.tokenizer = tokenizer
        self.labels2id = {"Hate": 0, "Offensive": 1, "Other": 2}
        self.device = torch.device(device if device is not None else "cpu")

    def __call__(self, batch):
        sentences = []
        labels = []
        for sent, lab in batch:
            sentences.append(sent)
            labels.append(self.labels2id[lab])
        labels_tensor = torch.tensor(
            labels, dtype=torch.long, device=self.device)
        # labels_onehot = one_hot(labels_tensor, num_classes=3)

        # Hugging Face transformers tokenizers are callable and can return
        # padded PyTorch tensors directly
        try:
            inputs = self.tokenizer(
                sentences, return_tensors="pt", padding=True, truncation=True)
            inputs["input_ids"] = inputs["input_ids"].to(self.device)
            inputs["attention_mask"] = inputs["attention_mask"].to(self.device)
        except TypeError:
            # Fallback for tokenizers from the `tokenizers` library, which are
            # not callable on a list of strings: enable padding and batch-encode
            pad_id = self.tokenizer.token_to_id("<pad>")
            self.tokenizer.enable_padding(pad_id=pad_id)
            encoded = self.tokenizer.encode_batch(sentences)
            inputs = {
                "input_ids": torch.tensor([x.ids for x in encoded], device=self.device),
                "attention_mask": torch.tensor([x.attention_mask for x in encoded], device=self.device)
            }

        return inputs, labels_tensor

    def class_names(self):
        return sorted(self.labels2id.keys(), key=lambda x: self.labels2id[x])
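

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module). The
# CSV path and the tokenizer name below are placeholders; any Hugging Face
# tokenizer or tokenizers.Tokenizer instance should work with the collater.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    dataset = HateSpeechDataset("hate_speech.csv")  # placeholder path
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder
    collater = HateSpeechCollater(tokenizer)

    for fold, (train, test) in enumerate(dataset.iter_folds(k=10)):
        train_loader = DataLoader(train, batch_size=32, collate_fn=collater)
        test_loader = DataLoader(test, batch_size=32, collate_fn=collater)
        for inputs, labels in train_loader:
            # inputs["input_ids"], inputs["attention_mask"] and labels live on
            # the collater's device and can be fed to a classifier
            print(fold, inputs["input_ids"].shape, labels.shape)
            break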