Commit 464b34e4 authored by Colleen Beaumard

Modification of the scoring scripts (the epoch argument is gone; any list of emotions is now accepted)

parent e30b391e
# IEMOCAP
This repository contains the framework for training emotion recognition models on IEMOCAP.
### Data preparation
The IEMOCAP dataset must be placed in `data/IEMOCAP`.
```bash
# Activate the environment (env.sh)
source ../../env.sh
# Install pip and ruamel (to be able to modify YAML files within a Python script)
python -m ensurepip --upgrade
python3 -m pip install ruamel.yaml
# Create the CSV files needed for training (they will be saved in list/)
python ./local/dataprep_iemocap.py --make-train-csv
# Sort the CSV files according to emotion (speaker_idx)
python ./local/csv_tri.py
```
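`ruamel.yaml` is used because it can rewrite a YAML file while preserving comments and formatting. As a rough sketch of the round-trip pattern involved (the file name and key below are hypothetical, not the repository's actual configs):
```python
from ruamel.yaml import YAML

# Round-trip mode keeps comments and layout intact,
# which is why ruamel.yaml is used instead of plain PyYAML.
yaml = YAML()  # round-trip mode by default
with open("cfg/train.yaml") as f:   # hypothetical config file
    cfg = yaml.load(f)
cfg["batch_size"] = 100             # hypothetical key
with open("cfg/train.yaml", "w") as f:
    yaml.dump(cfg, f)
```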
In order to use data augmentation, also run:
```bash
python ./local/dataprep_aug.py --save-path ./data --download
python ./local/dataprep_aug.py --from ./data/RIRS_NOISES --make-csv-augment-reverb
python ./local/dataprep_aug.py --from ./data/musan_split --make-csv-augment-noise
```
### Train from scratch
Multiple x-vector architectures are implemented; each of them has its own `train_<model_type>.sh` script.
You do not need to modify the YAML files manually, the script will do it automatically (unless you want to change another value).
Example (the `-s`, `-c`, `-b` and `-l` flags are all required):
```bash
./train_iemocap_half_resnet34.sh -s session_test -c nb_categories -b batch_size -l lr
```
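For instance, to hold out session 1 with 4 emotion categories, a batch size of 100 and a learning rate of 0.0001 (the values that produce directory names like `Sess1_test/4emo_100batch_lr-0.0001` below), the call would presumably be:
```bash
./train_iemocap_half_resnet34.sh -s 1 -c 4 -b 100 -l 0.0001
```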
During training, logs are written under `logs/<model_type>` and checkpoints are placed under `model_<model_type>/`.
To create a [TorchScript](https://pytorch.org/docs/stable/jit.html) compatible model use:
```bash
create_jit_model.py model_half_resnet34/best_model_half_resnet34.pt  # adapt the model name
# MODEL = half_resnet34
# Saving CPU TorchScript model to model_half_resnet34/best_model_half_resnet34_cpu_JIT.pt
# Saving GPU TorchScript model to model_half_resnet34/best_model_half_resnet34_cuda_JIT.pt
```
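To sanity-check an export, the resulting file can be loaded back with `torch.jit.load` (a minimal sketch; the wav path is a placeholder and the output layout depends on the model):
```python
import torch
import torchaudio

# Load the CPU TorchScript export produced above.
model = torch.jit.load("model_half_resnet34/best_model_half_resnet34_cpu_JIT.pt")
model.eval()

signal, sr = torchaudio.load("path/to/utterance.wav")  # placeholder wav file
with torch.no_grad():
    out = model(signal)
```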
To share models on Hugging Face's Model Hub:
```bash
release_model.sh model_half_resnet34/best_model_half_resnet34_cuda_JIT.pt model_half_resnet34/best_model_half_resnet34_cpu_JIT.pt model_half_resnet34/best_model_half_resnet34.pt
```
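Once released, a checkpoint can be fetched back with `huggingface_hub` (a sketch; the `repo_id` below is a placeholder, not an actual published repository):
```python
import torch
from huggingface_hub import hf_hub_download

# Download a JIT checkpoint from a (hypothetical) Hub repository and load it.
local_path = hf_hub_download(
    repo_id="your-org/iemocap-half_resnet34",  # placeholder
    filename="best_model_half_resnet34_cpu_JIT.pt",
)
model = torch.jit.load(local_path)
```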
### Evaluation
To launch the evaluation of the model, run:
```bash
# --emotions defaults to "neu ang sad hap+exc"; the order has to match the index:emotion mapping in list/emos_index.txt
# --freeze: pass it if parts of the model were frozen during training
python ./local/scoring.py <model> <session_test> <nb_categories> <batch> <lr> [--emotions ...] [--freeze ...]
```
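For example, for a `half_resnet34` model trained with session 1 held out, 4 emotions, a batch size of 100 and a learning rate of 0.0001:
```bash
python ./local/scoring.py half_resnet34 1 4 100 0.0001 --emotions "neu ang sad hap+exc"
```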
A confusion matrix and a loss plot will be generated, and all output files will be moved to a dedicated directory (example: `model_half_resnet34/Sess1_test/4emo_100batch_lr-0.0001`).
To launch the evaluation with cross-validation (all sessions must have a model trained with the same hyperparameters), run:
```bash
# --emotions defaults to "neu ang sad hap+exc"; the order has to match the index:emotion mapping in list/emos_index.txt
# --freeze: pass it if parts of the model were frozen during training
python ./local/scoring_cross_validation.py <model> <nb_categories> <batch> <lr> [--emotions ...] [--freeze ...]
```
Only a confusion matrix will be plotted; it is saved under a dedicated directory (example: `model_half_resnet34/Sess_all_cross-valid/4emo_100batch_lr-0.0001`).
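Continuing the same example, once all five sessions have a model trained with these hyperparameters:
```bash
python ./local/scoring_cross_validation.py half_resnet34 4 100 0.0001
```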
# local/scoring.py
import torch
from tqdm import tqdm
import os
import seaborn as sns
import pandas as pd
import numpy as np
import torchaudio
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sidekit.nnet.xvector import Xtractor
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("model", help="The model used")
parser.add_argument("session_test", help="The session considered as the test set (for IEMOCAP)")
parser.add_argument("categories", help="The number of emotion categories")
parser.add_argument("batchs", help="The batch size used during training")
parser.add_argument("lr", help="The learning rate used during training")
parser.add_argument("--emotions", help="The emotions considered during training (has to respect the order of the "
                                       "index:emotion mapping): neu ang sad hap+exc fea exc hap dis fru sur",
                    default="neu ang sad hap+exc")
parser.add_argument("--freeze", help="If some parts of the model were frozen")
args = parser.parse_args()
def load_model(model_path, device):
    """
    Load a model.

    :param model_path: path (str) to the model checkpoint
    :param device: cpu, cuda, etc.
    """
    device = torch.device(device)
    model_config = torch.load(model_path, map_location=device)
    model_opts = model_config["model_archi"]
    if "embedding_size" not in model_opts:
        model_opts["embedding_size"] = 256
    xtractor = Xtractor(
        model_config["speaker_number"],
        model_archi=model_opts["model_type"],
        loss=model_opts["loss"]["type"],
        embedding_size=model_opts["embedding_size"],
    )
    xtractor.load_state_dict(model_config["model_state_dict"], strict=True)
    xtractor.eval()
    return xtractor, model_config
# We store the prediction (the argmax value's index) to compare it later to the gold annotation
ses_nb = args.session_test
labels = args.emotions.split(" ")
nb_batch = str(args.batchs)
model_type = args.model
lr = str(args.lr)
cates = str(args.categories)
if args.freeze is not None:
    freeze = "_freeze"
else:
    freeze = ""

### For the confusion matrix ###
# 1st returned value is the model, 2nd is the checkpoint config (weights and all)
xtract, config = load_model("model_{}/best_{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt"
                            .format(model_type, model_type, cates, nb_batch, lr, ses_nb), "cuda")
predictions = []
gold_anno = []
path = "data/IEMOCAP/Session{}/sentences/wav".format(ses_nb)

# We open the file holding the gold annotations (and filter it to keep only the current session);
# judging from the parsing below, each line is tab-separated: identifier, wav file name, emotion label
recap = open("data/recap_emo_file.txt", "r")
recap_emo = recap.readlines()
recap_emo = [line for line in recap_emo if ses_nb in line.split("\t")[0]]
recap.close()

# We open the file mapping each emotion to its index
index = open("list/emos_index.txt", "r")
index_emo = index.readlines()
index.close()
dic_index_emo = {}
# Needed when model_type == custom
dico = {}
for elmt in index_emo:
    dic_index_emo[int(elmt.split(": ")[1].replace("\n", ""))] = elmt.split(": ")[0]
print("\nAll emotions:", dic_index_emo, "\n\n## Beginning of extraction ##")
for line in tqdm(recap_emo):
    # We retrieve the gold emotion and the index associated with it
    gold = line.split("\t")[2].replace("\n", "")
    if "hap+exc" in labels:
        if gold == "hap" or gold == "exc":
            gold = "hap+exc"
    if gold in labels:
        gold_anno.append(gold)
        # We extract the predicted emotion ONLY if the gold emotion is within the considered emotions
        file = line.split("\t")[1]
        folder = line.split("\t")[1].rsplit("_", 1)[0]
        signal, sr = torchaudio.load(os.path.join(path, folder, file))
        if model_type == "custom":
            # The custom model expects a dictionary with a "speech" key and the signal as value
            dico["speech"] = signal
            outModel = xtract(dico)
        else:
            outModel = xtract(signal)
        predictions.append(dic_index_emo[outModel[0][1].argmax().item()])

assert len(predictions) == len(gold_anno)
# We compare the predictions and gold_anno lists
UAR = metrics.recall_score(gold_anno, predictions, average="macro")
UARPercent = round(UAR * 100, 2)
print("\nUAR:", UARPercent, "%\n")
confMatrix = metrics.confusion_matrix(gold_anno, predictions, labels=labels)
print(confMatrix)

gold_dic = {key: 0 for key in labels}
# Each gold label needs its own counter dict ({key: gold_dic ...} would make every row alias the same object)
dico = {key: {k: 0 for k in labels} for key in labels}
for gold, pred in zip(gold_anno, predictions):
    dico[gold][pred] += 1
    gold_dic[gold] += 1
for key, value in gold_dic.items():
    print("Total", key, ":", value)
annot = []
em = list(gold_dic.values())
for i in range(len(confMatrix)):  # row
    annot.append([])
    tot = 0
    for j in range(len(confMatrix[i])):  # column
        nbr = confMatrix[i][j]
        percent = round(nbr / em[i], 2) * 100
        tot += percent
        # Adjust the last column so the rounded percentages of each row sum to exactly 100
        if j == len(confMatrix[i]) - 1:
            if tot > 100:
                percent -= (tot - 100)
            elif tot < 100:
                percent += (100 - tot)
        full = str(int(percent)) + "% (" + str(nbr) + ")"
        annot[i].append(full)
### For the losses ###
fil = open("logs/{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.log".format(model_type, cates, nb_batch, lr, ses_nb), "r")
file = fil.readlines()
fil.close()
# Search for "Loss:", "Validation Loss", "Epoch" and "reducing" in all lines
valid_loss = [line for line in file if "Validation Loss" in line]
# "Loss:" and "Epoch" appear on the same line
if model_type == "custom":
    loss_epoch = [line for line in file if "Epoch" in line]
    loss_epoch = loss_epoch[::2]
else:
    loss_epoch = [line for line in file if "Epoch" in line][2:]
reduce_lr = [line for line in file if "reducing" in line]  # Can be empty

vloss = []  # Validation losses
tloss = []  # Training losses
aepoch = []  # Epoch numbers
for linev, linel in zip(valid_loss, loss_epoch):
    linel = linel.split(":")
    linev = linev.split("=")
    vloss.append(round(float(linev[3].replace("\n", "").replace(" ", "")), 2))
    aepoch.append(linel[3].split(" ")[1])
    tloss.append(round(float(linel[4].split("\t")[0].replace(" ", "")), 2))
assert len(aepoch) == len(tloss) == len(vloss)
print("--------------")
for e, t, v in zip(aepoch, tloss, vloss):
    print("Epoch:", e, "\ttrain loss:", t, "\tvalid loss:", v)
### Plotting of confusion matrix and losses ###
path = "model_{}/Sess{}_test/{}emo_{}batch_lr-{}{}".format(model_type, ses_nb, cates, nb_batch, lr, freeze)
if not os.path.isdir(path.rsplit("/", 1)[0]):
    os.mkdir(path.rsplit("/", 1)[0])
if not os.path.isdir(path):
    os.mkdir(path)

# Plot the confusion matrix
sns.heatmap(confMatrix, annot=annot, fmt="10", cmap="Blues", vmin=0, vmax=350, xticklabels=labels, yticklabels=labels)
plt.title("Model: " + str(config["model_archi"]["model_type"]) + "{}_".format(freeze) + str(config["speaker_number"]) +
          "emo_{}batch\nepoch: {} lr: {} Data: Test-IEMOCAP {}".format(nb_batch, aepoch[-1], lr, ses_nb) +
          " UAR = " + str(UARPercent) + "%")
plt.xlabel("Prediction")
plt.ylabel("Ground truth")
plt.savefig(os.path.join(path, "confusion_matrix_{}{}_".format(model_type, freeze) + str(config["speaker_number"]) +
                         "emo_{}batch_epoch-{}_lr-{}_Test-IEMOCAP{}.png".format(nb_batch, aepoch[-1], lr, ses_nb)))
plt.show()
plt.clf()
print("\nConfusion matrix done!")
# Plot losses
ticks = list(range(0, 9))
eticks = list(range(0, len(aepoch), 5))
plt.plot(aepoch, tloss, label="Training loss")
plt.plot(aepoch, vloss, label="Validation loss")
plt.yticks(ticks)
plt.xticks(eticks)
if len(reduce_lr) != 0:
    colors = ["b", "g", "y", "c", "m", "r"]
    # reduce_lr holds raw log lines; assuming PyTorch's ReduceLROnPlateau message format
    # ("Epoch     N: reducing learning rate of group 0 to X."), recover the epoch and the new lr
    for i, line in enumerate(reduce_lr):
        epoch = line.split(":")[0].split()[-1]
        new_lr = line.rstrip(".\n").split()[-1]
        plt.axvline(x=epoch, color=colors[i % len(colors)], linestyle="--", label="lr: " + new_lr)
plt.legend()
plt.title("Model: {}{}_{}emo_{}batch\nEpoch: {} lr: {} Data: Test-IEMOCAP {}".format(model_type, freeze, cates,
                                                                                     nb_batch, aepoch[-1], lr, ses_nb))
plt.savefig(os.path.join(path, "losses_{}{}_{}emo_{}batch_epoch-{}_lr-{}_Test-IEMOCAP{}.png".format(model_type, freeze,
                                                                                                    cates, nb_batch,
                                                                                                    aepoch[-1], lr,
                                                                                                    ses_nb)))
plt.show()
plt.clf()
print("Losses plotted!\n")
### We move the checkpoints and the log file to the results folder ###
os.replace("model_{}/best_{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format(model_type, model_type, cates, nb_batch, lr,
                                                                           ses_nb),
           os.path.join(path, "best_{}{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format(model_type, freeze, cates,
                                                                                       nb_batch, lr, ses_nb)))
os.replace("model_{}/tmp_{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format(model_type, model_type, cates, nb_batch, lr,
                                                                          ses_nb),
           os.path.join(path, "tmp_{}{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format(model_type, freeze, cates,
                                                                                      nb_batch, lr, ses_nb)))
os.replace("logs/{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.log".format(model_type, cates, nb_batch, lr, ses_nb),
           os.path.join(path, "{}{}_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.log".format(model_type, freeze, cates,
                                                                                   nb_batch, lr, ses_nb)))
# local/scoring_cross_validation.py
import torch
from tqdm import tqdm
import os
import seaborn as sns
import pandas as pd
import numpy as np
import torchaudio
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sidekit.nnet.xvector import Xtractor
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("model", help="The model used")
parser.add_argument("categories", help="The number of emotion categories")
parser.add_argument("batchs", help="The batch size used during training")
parser.add_argument("lr", help="The learning rate used during training")
parser.add_argument("--emotions", help="The emotions considered during training (has to respect the order of the "
                                       "index:emotion mapping): neu ang sad hap+exc fea exc hap dis fru sur",
                    default="neu ang sad hap+exc")
parser.add_argument("--freeze", help="If some parts of the model were frozen")
args = parser.parse_args()
# We store the prediction (the argmax value's index) to compare it later to the gold annotation
labels = args.emotions.split(" ")
nb_batch = str(args.batchs)
model_type = args.model
lr = str(args.lr)
cates = str(args.categories)
if args.freeze is not None:
    freeze = "_freeze"
else:
    freeze = ""

# Path where the cross-validation confusion matrix will be saved
path = "model_{}/Sess_all_cross-valid/{}emo_{}batch_lr-{}{}".format(model_type, cates, nb_batch, lr, freeze)
if not os.path.isdir(path.rsplit("/", 1)[0]):
    os.mkdir(path.rsplit("/", 1)[0])
if not os.path.isdir(path):
    os.mkdir(path)
def load_model(model_path, device):
    """
    Load a model.

    :param model_path: path (str) to the model checkpoint
    :param device: cpu, cuda, etc.
    """
    device = torch.device(device)
    model_config = torch.load(model_path, map_location=device)
    model_opts = model_config["model_archi"]
    if "embedding_size" not in model_opts:
        model_opts["embedding_size"] = 256
    xtractor = Xtractor(
        model_config["speaker_number"],
        model_archi=model_opts["model_type"],
        loss=model_opts["loss"]["type"],
        embedding_size=model_opts["embedding_size"],
    )
    xtractor.load_state_dict(model_config["model_state_dict"], strict=True)
    xtractor.eval()
    return xtractor, model_config
predictions = []
gold_anno = []
# We open the file mapping each emotion to its index
index = open("list/emos_index.txt", "r")
index_emo = index.readlines()
index.close()
dic_index_emo = {}
dico = {}  # Needed when model_type == custom
for elmt in index_emo:
    dic_index_emo[int(elmt.split(": ")[1].replace("\n", ""))] = elmt.split(": ")[0]
print("\nAll emotions:", dic_index_emo, "\n\n## Beginning of extraction ##")
for i in range(1, 6):
    print("Session {} is processing...".format(i))
    # 1st returned value is the model, 2nd is the checkpoint config (weights and all)
    xtract, config = load_model("model_{}/Sess{}_test/{}emo_{}batch_lr-{}{}/best_{}{}_{}emo_{}batch_lr-{}_"
                                "Test-IEMOCAP{}.pt".format(model_type, i, cates, nb_batch, lr, freeze, model_type,
                                                           freeze, cates, nb_batch, lr, i), "cuda")
    path_wav = "data/IEMOCAP/Session{}/sentences/wav".format(i)
    # We open the file holding the gold annotations (and filter it to keep only the current session)
    recap = open("data/recap_emo_file.txt", "r")
    recap_emo = recap.readlines()
    recap_emo = [line for line in recap_emo if str(i) in line.split("\t")[0]]
    recap.close()
    for line in tqdm(recap_emo):
        # We retrieve the gold emotion and the index associated with it
        gold = line.split("\t")[2].replace("\n", "")
        if "hap+exc" in labels:
            if gold == "hap" or gold == "exc":
                gold = "hap+exc"
        if gold in labels:
            gold_anno.append(gold)
            # We extract the predicted emotion ONLY if the gold emotion is within the considered emotions
            file = line.split("\t")[1]
            folder = line.split("\t")[1].rsplit("_", 1)[0]
            signal, sr = torchaudio.load(os.path.join(path_wav, folder, file))
            if model_type == "custom":
                # The custom model expects a dictionary with a "speech" key and the signal as value
                dico["speech"] = signal
                outModel = xtract(dico)
            else:
                outModel = xtract(signal)
            predictions.append(dic_index_emo[outModel[0][1].argmax().item()])
    print("\n")
assert len(predictions) == len(gold_anno)
# We compare the predictions and gold_anno lists
UAR = metrics.recall_score(gold_anno, predictions, average="macro")
UARPercent = round(UAR * 100, 2)
print("UAR:", UARPercent, "%\n")
confMatrix = metrics.confusion_matrix(gold_anno, predictions, labels=labels)
print(confMatrix)

gold_dic = {key: 0 for key in labels}
# Each gold label needs its own counter dict ({key: gold_dic ...} would make every row alias the same object)
dico = {key: {k: 0 for k in labels} for key in labels}
for gold, pred in zip(gold_anno, predictions):
    dico[gold][pred] += 1
    gold_dic[gold] += 1
for key, value in gold_dic.items():
    print("Total", key, ":", value)
annot = []
em = list(gold_dic.values())
for i in range(len(confMatrix)):  # row
    annot.append([])
    tot = 0
    for j in range(len(confMatrix[i])):  # column
        nbr = confMatrix[i][j]
        percent = round(nbr / em[i], 2) * 100
        tot += percent
        # Adjust the last column so the rounded percentages of each row sum to exactly 100
        if j == len(confMatrix[i]) - 1:
            if tot > 100:
                percent -= (tot - 100)
            elif tot < 100:
                percent += (100 - tot)
        full = str(int(percent)) + "% (" + str(nbr) + ")"
        annot[i].append(full)
sns.heatmap(confMatrix, annot=annot, fmt="10", cmap="Blues", vmin=0, vmax=1000, xticklabels=labels, yticklabels=labels)
plt.title("Model: " + str(config["model_archi"]["model_type"]) + "{}_".format(freeze) + str(config["speaker_number"]) +
          "emo_{}batch\nlr: {} Data: Test-IEMOCAP-cross_validation".format(nb_batch, lr) +
          " UAR = " + str(UARPercent) + "%")
plt.xlabel("Prediction")
plt.ylabel("Ground truth")
plt.savefig(os.path.join(path, "confusion_matrix_{}{}_".format(model_type, freeze) + str(config["speaker_number"]) +
                         "emo_{}batch_lr-{}_IEMOCAP-cross_validation.png".format(nb_batch, lr)))
plt.show()
plt.clf()
print("\nConfusion matrix done!")