Commit d127dbfe authored by Gaëtan Caillaut

lemmatized datasets

parent 18ccf33c
https://arxiv.org/pdf/1809.04444v1.pdf -> https://github.com/aitor-garcia-p/hate-speech-dataset
http://sdl.soc.cornell.edu/img/publication_pdf/hatespeechdetection.pdf -> https://github.com/t-davidson/hate-speech-and-offensive-language
## Data

The _data_ folder contains the datasets described in the papers below:

- [Automated Hate Speech Detection and the Problem of Offensive Language](http://sdl.soc.cornell.edu/img/publication_pdf/hatespeechdetection.pdf): [git repository](https://github.com/t-davidson/hate-speech-and-offensive-language)
- [Hate Speech Dataset from a White Supremacy Forum](https://arxiv.org/pdf/1809.04444v1.pdf): [git repository](https://github.com/aitor-garcia-p/hate-speech-dataset)
## Usage

Merging the two datasets:

```
python merge_data.py
```
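The actual logic lives in merge_data.py; the sketch below is only a rough, hypothetical illustration, assuming each source has already been exported to a CSV with one text column and one class label (paths and column names are guesses, not the real script):

```
import pandas as pd

# Hypothetical sketch only; merge_data.py may read the sources differently.
davidson = pd.read_csv("davidson.csv")   # assumed path and columns
gibert = pd.read_csv("gibert.csv")       # assumed path and columns
merged = pd.concat([davidson, gibert], ignore_index=True)
merged.to_csv("data/hate-speech-dataset.csv", index=False)  # assumed output path
```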
Cleaning the datasets:

```
python clean_data.py
```

The cleaning step tries to strip usernames, HTML entities (such as `&amp;`), and URLs.
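The exact rules are in clean_data.py; as a rough, hypothetical illustration of the kind of substitutions involved, assuming mentions are replaced by the `<user>` / `<rt_user>` placeholders that lemmatize.py later matches (the regexes are assumptions, not the real script):

```
import html
import re

# Hypothetical cleaning of a single sentence; clean_data.py may differ.
def clean_sentence(sent):
    sent = html.unescape(sent)                          # decode HTML entities such as &amp;
    sent = re.sub(r"\bRT\s+@\w+", "<rt_user>", sent)    # retweeted user mentions
    sent = re.sub(r"@\w+", "<user>", sent)              # user mentions
    sent = re.sub(r"https?://\S+", "", sent)            # URLs
    return sent.strip()
```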
@@ -6,10 +6,10 @@ import random
 class HateSpeechDataset(Dataset):
-    def __init__(self, path):
+    def __init__(self, path, sent_column="Cleaned"):
         self.path = path
         df = pd.read_csv(self.path).dropna()
-        self.sentences = df["Cleaned"].tolist()
+        self.sentences = df[sent_column].tolist()
         self.labels = df["Class"].tolist()

     def __getitem__(self, i):
@@ -19,6 +19,11 @@ class HateSpeechDataset(Dataset):
         return len(self.labels)

     def iter_folds(self, k=10, shuffle=True):
+        # If there is one fold, then train = test
+        if k == 1:
+            yield self, self
+            return
+
         n = len(self.sentences)
         if shuffle:
             indices = list(range(n))
......
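For context, a hedged sketch of how iter_folds and the new sent_column argument might be used together, assuming each yielded pair behaves like a (train, test) split of the dataset (the real training loop lives in train_minibert.py and may differ):

```
# Hypothetical usage; assumes the yielded folds support len() like the dataset itself.
dataset = HateSpeechDataset("data/lemmatized-hate-speech-dataset.csv", sent_column="Lemmatized")
for i, (train, test) in enumerate(dataset.iter_folds(k=5)):
    print(f"fold {i}: {len(train)} train / {len(test)} test examples")
# With k=1 (the --folds 1 runs below), iter_folds yields the full dataset as both train and test.
```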
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/cleaned-davidson-dataset.csv \
--output output/minibert/davidson-final \
--epochs 15 \
--optimizer adam \
--device cpu \
--jobname minibert/davidson-final \
--tokenizer data/tokenizer-davidson.json \
--folds 1
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-davidson-dataset.csv \
--output output/minibert/davidson-lemmatized \
--epochs 50 \
--optimizer adam \
--device cpu \
--jobname minibert/davidson-lemmatized \
--tokenizer data/tokenizer-davidson-lemmatized.json \
--column Lemmatized \
--folds 5
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/cleaned-gibert-dataset.csv \
--output output/minibert/gibert-final \
--epochs 15 \
--optimizer adam \
--device cpu \
--jobname minibert/gibert-final \
--tokenizer data/tokenizer-gibert.json \
--folds 1
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-gibert-dataset.csv \
--output output/minibert/gibert-lemmatized \
--epochs 50 \
--optimizer adam \
--device cpu \
--jobname minibert/gibert-lemmatized \
--tokenizer data/tokenizer-gibert-lemmatized.json \
--column Lemmatized \
--folds 5
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/cleaned-hate-speech-dataset.csv \
--output output/minibert/merged-final \
--epochs 15 \
--optimizer adam \
--device cpu \
--jobname minibert/merged-final \
--tokenizer data/tokenizer.json \
--folds 1
#!/bin/sh
export PYTHONPATH="$HOME/polysemy/minibert:$PYTHONPATH"
python train_minibert.py train-minibert --input data/lemmatized-hate-speech-dataset.csv \
--output output/minibert/merged-lemmatized \
--epochs 50 \
--optimizer adam \
--device cpu \
--jobname minibert/merged-lemmatized \
--tokenizer data/tokenizer-lemmatized.json \
--column Lemmatized \
--folds 5
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher


def tokenize_data(df, nlp):
    lemmatized = []
    # Match the <user> / <rt_user> placeholder tokens so they can be merged back
    # into single tokens whose lemma is their literal text.
    matcher = PhraseMatcher(nlp.vocab, attr="ORTH")
    matcher.add("user", list(nlp.tokenizer.pipe(["<user>", "<rt_user>"])))
    for sent in df["Cleaned"]:
        doc = nlp(sent)
        with doc.retokenize() as retokenizer:
            for span in matcher(doc, as_spans=True):
                retokenizer.merge(span, attrs={"LEMMA": span.text})
        lemmatized_sent = " ".join([tok.lemma_ for tok in doc])
        lemmatized.append(lemmatized_sent)
    df["Lemmatized"] = lemmatized
    return df


if __name__ == "__main__":
    print("Loading model")
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    print("Merged dataset")
    df = pd.read_csv("data/cleaned-hate-speech-dataset.csv").dropna()
    tokenize_data(df, nlp)
    df.to_csv("data/lemmatized-hate-speech-dataset.csv", index=False)

    print("Davidson dataset")
    df = pd.read_csv("data/cleaned-davidson-dataset.csv").dropna()
    tokenize_data(df, nlp)
    df.to_csv("data/lemmatized-davidson-dataset.csv", index=False)

    print("Gibert dataset")
    df = pd.read_csv("data/cleaned-gibert-dataset.csv").dropna()
    tokenize_data(df, nlp)
    df.to_csv("data/lemmatized-gibert-dataset.csv", index=False)
@@ -123,7 +123,7 @@ if __name__ == "__main__":
 def _traintok(args):
     df = pd.read_csv(args.input).dropna()
-    sentences = df["Cleaned"].tolist()
+    sentences = df[args.column].tolist()
     train_tokenizer(sentences, args.output)


 def _trainmb(args):
@@ -220,6 +220,7 @@ if __name__ == "__main__":
         "-i", "--input", default="data/cleaned-hate-speech-dataset.csv")
     traintok_parser.add_argument(
         "-o", "--output", default="data/tokenizer.json")
+    traintok_parser.add_argument("--column", default="Cleaned")
     traintok_parser.set_defaults(func=_traintok)

     trainmb_parser = subparsers.add_parser("train-minibert")
@@ -233,6 +234,7 @@ if __name__ == "__main__":
     trainmb_parser.add_argument("--folds", default=10, type=int)
     trainmb_parser.add_argument(
         "-t", "--tokenizer", default="data/tokenizer.json")
+    trainmb_parser.add_argument("--column", default="Cleaned")
     trainmb_parser.set_defaults(func=_trainmb)

     args = parser.parse_args()
......