Commit 18ccf33c authored by Gaëtan Caillaut's avatar Gaëtan Caillaut
Browse files

initial commit

parents
__pycache__
.ipynb_checkpoints
.vscode
\ No newline at end of file
[submodule "data/hate-speech-davidson"]
path = data/hate-speech-davidson
url = https://github.com/t-davidson/hate-speech-and-offensive-language.git
[submodule "data/hate-speech-gibert"]
path = data/hate-speech-gibert
url = https://github.com/Vicomtech/hate-speech-dataset.git
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([3, 6, 4])\n",
"torch.Size([3, 6, 4])\n",
"torch.Size([3, 6, 1])\n",
"torch.Size([3, 1, 6])\n",
"torch.Size([3, 4])\n",
"torch.Size([3, 8])\n",
"torch.Size([3, 8])\n",
"tensor([[ 0.2840, 0.0541, 0.0641, -0.0458, -0.2244, 0.2004, 0.4426, -0.2979],\n",
" [ 0.3696, 0.0677, 0.0477, -0.0572, -0.1058, 0.1365, 0.4910, -0.2979],\n",
" [ 0.3562, 0.0686, 0.0664, -0.0494, -0.1154, 0.1042, 0.4609, -0.3036]],\n",
" grad_fn=<AddmmBackward>)\n",
"tensor([6, 6, 6])\n"
]
}
],
"source": [
"import torch\n",
"\n",
"bs = 3\n",
"d = 4\n",
"seqlen = 6\n",
"outsize = 8\n",
"\n",
"l1 = torch.nn.Linear(d, d, bias=True)\n",
"attention_layer = torch.nn.Linear(d, 1, bias=False)\n",
"l2 = torch.nn.Linear(d, outsize, bias=True)\n",
"output_act = torch.nn.Sigmoid()\n",
"\n",
"x = torch.rand((bs, seqlen, d), dtype=torch.float)\n",
"print(x.size())\n",
"\n",
"x = l1(x)\n",
"print(x.size())\n",
"\n",
"aw = attention_layer(x)\n",
"print(aw.size())\n",
"\n",
"att = torch.nn.functional.softmax(aw, dim=-2).transpose(-2, -1)\n",
"print(att.size())\n",
"# print(aw)\n",
"# print(att)\n",
"\n",
"x = torch.matmul(att, x).squeeze()\n",
"print(x.size())\n",
"# print(x)\n",
"\n",
"x = l2(x)\n",
"print(x.size())\n",
"\n",
"logits = output_act(x)\n",
"print(logits.size())\n",
"\n",
"out = torch.argmax(logits, dim=-1)\n",
"\n",
"print(x)\n",
"print(out)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (PolysEmY)",
"language": "python",
"name": "polysemy"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
https://arxiv.org/pdf/1809.04444v1.pdf -> https://github.com/aitor-garcia-p/hate-speech-dataset
http://sdl.soc.cornell.edu/img/publication_pdf/hatespeechdetection.pdf -> https://github.com/t-davidson/hate-speech-and-offensive-language
\ No newline at end of file
import pandas as pd
import re
import html
from merge_data import *
def remove_urls(sent):
    """Strip every http/https URL from *sent* (each match is deleted outright)."""
    # Pattern taken from https://www.urlregex.com/
    return re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        "",
        sent,
    )
def remove_retweets(sent):
    """Replace an "RT @user:" retweet marker with the " <rt_user> " token."""
    retweet_re = re.compile(r"\bRT @\w+[:]?(\s|$)")
    return retweet_re.sub(" <rt_user> ", sent)
def remove_weird_html_entities(sent):
    """Blank out numeric HTML character references such as "&#128;"."""
    return re.sub(r"&#\d+;", " ", sent)
def remove_usernames(sent):
    r"""Replace "@user" mentions (with an optional trailing colon) by " <user> ".

    Fix: the original pattern ``@\w+[:]?\b`` required a word boundary after the
    optional colon, and ``\b`` only exists next to a word character — so in
    "@user: hi" the colon could not be consumed and a stray ":" was left
    behind. ``@\w+:?`` consumes the colon unconditionally.

    Double spaces introduced by the replacement token are collapsed later in
    the pipeline by ``remove_double_spaces``.
    """
    username_pattern = r"@\w+:?"
    return re.sub(username_pattern, " <user> ", sent)
def remove_double_spaces(sent):
    """Collapse every whitespace run (spaces, tabs, newlines) into one space."""
    collapsed = re.sub(r"\s+", " ", sent)
    return collapsed
def clean_sentence(sent):
    """Run *sent* through the whole cleaning pipeline and return the result.

    Steps, in order: URL removal, numeric-entity removal, HTML unescaping,
    retweet-marker replacement, username replacement, whitespace collapsing,
    and a final strip of leading/trailing whitespace.
    """
    steps = (
        remove_urls,
        remove_weird_html_entities,
        html.unescape,
        remove_retweets,
        remove_usernames,
        remove_double_spaces,
        str.strip,
    )
    for step in steps:
        sent = step(sent)
    return sent
def _clean_and_save(df, path):
    """Clean every sentence of *df* into a "Cleaned" column, drop NaN rows,
    and write the frame to *path* as CSV (no index column)."""
    df["Cleaned"] = [clean_sentence(sent) for sent in df["Sentence"]]
    df.dropna(inplace=True)
    df.to_csv(path, index=False)


if __name__ == "__main__":
    # The same clean/drop/save sequence was pasted three times in the
    # original; one helper removes the duplication.
    _clean_and_save(pd.read_csv("data/merged-hate-speech-dataset.csv"),
                    "data/cleaned-hate-speech-dataset.csv")
    _clean_and_save(read_davidson_data(), "data/cleaned-davidson-dataset.csv")
    _clean_and_save(read_gibert_data(), "data/cleaned-gibert-dataset.csv")
from torch.utils.data import Dataset
import pandas as pd
from torch.nn.functional import one_hot
import torch
import random
class HateSpeechDataset(Dataset):
    """Sentence/label pairs loaded from a cleaned hate-speech CSV.

    The CSV at *path* must contain "Cleaned" (text) and "Class" (label)
    columns; rows with missing values are dropped at load time.
    """

    def __init__(self, path):
        self.path = path
        df = pd.read_csv(self.path).dropna()
        self.sentences = df["Cleaned"].tolist()
        self.labels = df["Class"].tolist()

    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return len(self.labels)

    def iter_folds(self, k=10, shuffle=True):
        """Yield k (train, test) splits for cross-validation.

        Each split is a list of (sentence, label) tuples.

        Fixes two defects of the original: `sentences`/`labels` were only
        bound inside the `if shuffle:` branch, so `shuffle=False` raised
        NameError; and every fold used `fold_len = n // k` items, so the
        trailing `n % k` items never appeared in any test fold. The final
        fold now absorbs the remainder.
        """
        n = len(self.sentences)
        indices = list(range(n))
        if shuffle:
            random.shuffle(indices)
        sentences = [self.sentences[i] for i in indices]
        labels = [self.labels[i] for i in indices]
        fold_len = n // k
        for i in range(k):
            start = i * fold_len
            # Last fold runs to the end so every item is tested exactly once.
            end = n if i == k - 1 else start + fold_len
            test = list(zip(sentences[start:end], labels[start:end]))
            train = list(zip(sentences[:start] + sentences[end:],
                             labels[:start] + labels[end:]))
            yield train, test
class HateSpeechCollater:
    """Collate (sentence, label) pairs into model-ready batches.

    Labels are mapped to fixed integer ids ({"Hate": 0, "Offensive": 1,
    "Other": 2}). Tokenization first tries calling the tokenizer directly
    (transformers-style); if that raises TypeError it falls back to a raw
    ``tokenizers`` object via ``encode_batch`` with "<pad>" padding.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.labels2id = {"Hate": 0, "Offensive": 1, "Other": 2}

    def __call__(self, input):
        sentences = [sent for sent, _ in input]
        label_ids = [self.labels2id[lab] for _, lab in input]
        labels_tensor = torch.tensor(label_ids, dtype=torch.long)
        try:
            # transformers-style tokenizer: directly callable.
            inputs = self.tokenizer(
                sentences, return_tensors="pt", padding=True, truncation=True)
        except TypeError:
            # tokenizers-style tokenizer: enable padding, then batch-encode.
            pad_id = self.tokenizer.token_to_id("<pad>")
            self.tokenizer.enable_padding(pad_id=pad_id)
            encoded = self.tokenizer.encode_batch(sentences)
            inputs = {
                "input_ids": torch.tensor([e.ids for e in encoded]),
                "attention_mask": torch.tensor([e.attention_mask for e in encoded]),
            }
        return inputs, labels_tensor

    def class_names(self):
        """Label names ordered by their integer id."""
        return sorted(self.labels2id, key=self.labels2id.get)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Subproject commit 6d1505012da627f535f7ae2c99c712c639d6366b
Subproject commit b802137e8314c38a8d4faa965f9908d6c8e7a84d
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import torch
import numpy as np
import itertools
import matplotlib.pyplot as plt
def eval_model(model, dataloader, device):
    """Run *model* over *dataloader* and compute confusion statistics.

    Args:
        model: callable taking (input_ids, attention_mask) and returning
            per-class logits of shape (batch, num_classes).
        dataloader: iterable of (inputs, labels) batches where inputs is a
            dict holding "input_ids" and "attention_mask" tensors.
        device: torch device (or device string) tensors are moved to.

    Returns:
        (correct, accuracy, confusion_tensor, scores_per_class):
        ``confusion_tensor[i, j]`` counts items of gold class i predicted as
        class j; ``scores_per_class`` lists TP/FP/FN/TN counts per class, in
        sorted class order.

    Fixes over the original: inference runs under ``torch.no_grad()`` so no
    autograd graph is built; classes that only ever appear as predictions
    (never as gold) are kept as matrix columns instead of being silently
    dropped; an empty dataloader yields accuracy 0.0 instead of a
    ZeroDivisionError.
    """
    confusion = {}
    with torch.no_grad():
        for inputs, labels in dataloader:
            x = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            labels = labels.to(device)
            logits = model(x, attention_mask)
            predicted = torch.argmax(logits, dim=-1).tolist()
            for pred, gold in zip(predicted, labels.tolist()):
                row = confusion.setdefault(gold, {})
                row[pred] = row.get(pred, 0) + 1
    # Include predicted-only classes so the matrix stays square and no
    # counts are lost.
    classes = sorted(set(confusion) | {p for row in confusion.values() for p in row})
    confusion_lst = [
        [confusion.get(g, {}).get(p, 0) for p in classes]
        for g in classes
    ]
    confusion_tensor = torch.tensor(confusion_lst, dtype=torch.long)
    correct = torch.sum(torch.diag(confusion_tensor)).item()
    total = torch.sum(confusion_tensor).item()
    accuracy = correct / total if total else 0.0
    i = torch.arange(len(classes))
    lab2ind = {c: j for j, c in enumerate(classes)}
    scores_per_class = [
        {
            "true_positives": confusion_tensor[lab2ind[c], lab2ind[c]].item(),
            "false_positives": confusion_tensor[i != lab2ind[c], lab2ind[c]].sum().item(),
            "false_negatives": confusion_tensor[lab2ind[c], i != lab2ind[c]].sum().item(),
            "true_negatives": confusion_tensor[lab2ind[c] != i, i != lab2ind[c]].sum().item(),
        }
        for c in classes
    ]
    return correct, accuracy, confusion_tensor, scores_per_class
def plot_confusion_matrix(cm, class_names):
    """
    Returns a matplotlib figure containing the plotted confusion matrix.

    Args:
        cm (array, shape = [n, n]): a confusion matrix of integer classes
        class_names (array, shape = [n]): String names of the integer classes
    """
    figure = plt.figure(figsize=(8, 8))
    # The colour scale is driven by the raw counts in cm; the per-cell text
    # labels are the row-normalized values computed further down.
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Normalize the confusion matrix (row-wise: each row sums to 1).
    # NOTE(review): a gold class with zero examples makes its row sum 0 and
    # divides by zero here — confirm upstream guarantees every class appears.
    cm = np.around(cm.astype('float') / cm.sum(axis=1)
                   [:, np.newaxis], decimals=2)

    # Use white text if squares are dark; otherwise black.
    threshold = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure
#!/bin/sh
# Train MiniBERT on the cleaned Davidson dataset with 5-fold cross-validation.
# Requires the polysemy/minibert checkout on PYTHONPATH.
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/cleaned-davidson-dataset.csv \
    --output output/minibert/davidson \
    --epochs 50 \
    --optimizer adam \
    --device cpu \
    --jobname minibert/davidson \
    --tokenizer data/tokenizer-davidson.json \
    --folds 5
#!/bin/sh
# Train MiniBERT on the cleaned Gibert dataset with 5-fold cross-validation.
# Requires the polysemy/minibert checkout on PYTHONPATH.
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/cleaned-gibert-dataset.csv \
    --output output/minibert/gibert \
    --epochs 50 \
    --optimizer adam \
    --device cpu \
    --jobname minibert/gibert \
    --tokenizer data/tokenizer-gibert.json \
    --folds 5
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment