Commit 91dfba68 authored by Hubert Nourtel's avatar Hubert Nourtel
Browse files

Merge branch 'main' into 'main'

Main

See merge request !1
parents e30b391e 24ec5a6c
# Dataset description
# General options
data_path: /
data_file_extension: .wav
dataset_csv: list/iemocap_ses1-test.csv
sample_rate: 16000
validation_ratio: 0.02
batch_size: 4
# Training set
train:
duration: 3.
chunk_per_segment: -1
overlap: 3.
sampler:
examples_per_speaker: 1
samples_per_speaker: 100
augmentation_replica: 1
transform_number: 0
transformation:
pipeline: # no transformation
# pipeline: add_reverb,add_noise,filtering,phone_filtering,codec
add_noise:
noise_db_csv: list/musan.csv
data_path: /
add_reverb:
rir_db_csv: list/reverb.csv
data_path: /
# Validation set
valid:
duration: 3.
transformation:
pipeline: # no transformation
add_noise:
noise_db_csv: list/musan.csv
data_path: /
# Test set
test:
idmap: ./list/asv_test_libri/libri_test_idmap.h5
ndx: ./list/asv_test_libri/libri_test_ndx.h5
key: ./list/asv_test_libri/libri_test_key.h5
data_path: .
id2wav: ./data/asv_test_libri/libri_test.id2wav
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
import sidekit.nnet
def build():
    """Build and return the custom ``Net`` model class.

    Returns:
        The ``Net`` class itself (not an instance).

    Instead of assembling the layers by hand you can also inherit from
    ``sidekit.nnet.Xtractor`` directly (change ``model_archi``)::

        class Net(sidekit.nnet.Xtractor):
            def __init__(self, speaker_number, loss=None, embedding_size=256):
                super().__init__(speaker_number, model_archi="xvector", loss=loss, embedding_size=embedding_size)
                # add additional logic here

            def forward(self, x, target=None, norm_embedding=True):
                return super().forward(x, target, norm_embedding)
                # add additional logic here

        return Net
    """

    # Define your model, you can use building blocks from sidekit.nnet
    class Net(nn.Module):
        """Custom embedding extractor assembled from ``sidekit.nnet`` blocks.

        Pipeline (see ``forward``): MFCC front-end -> five Conv1d/LeakyReLU/
        BatchNorm1d stages -> mean/std pooling -> linear embedding layer ->
        ArcMargin classification layer.
        """

        def __init__(self, speaker_number, loss=None, embedding_size=256):
            """Create all layers of the network.

            You can change the parameter values through the
            'config/custom/model.yaml' config.

            Args:
                speaker_number: number of output classes of the ArcMargin layer.
                loss: name of the loss; only ``"aam"`` is supported.
                embedding_size: dimension of the produced embedding.

            Raises:
                NotImplementedError: if ``loss`` is anything other than "aam".
            """
            super().__init__()

            if loss not in ["aam"]:
                raise NotImplementedError(f"Loss {loss!r} not implemented")

            self.preprocessor = sidekit.nnet.MfccFrontEnd()
            feature_size = self.preprocessor.n_mfcc
            self.loss = loss
            self.speaker_number = speaker_number

            self.sequence_network = nn.Sequential(
                OrderedDict(
                    [
                        ("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
                        ("activation1", nn.LeakyReLU(0.2)),
                        ("batch_norm1", nn.BatchNorm1d(512)),
                        ("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
                        ("activation2", nn.LeakyReLU(0.2)),
                        ("batch_norm2", nn.BatchNorm1d(512)),
                        ("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
                        ("activation3", nn.LeakyReLU(0.2)),
                        ("batch_norm3", nn.BatchNorm1d(512)),
                        ("conv4", nn.Conv1d(512, 512, 1)),
                        ("activation4", nn.LeakyReLU(0.2)),
                        ("batch_norm4", nn.BatchNorm1d(512)),
                        ("conv5", nn.Conv1d(512, 1536, 1)),
                        ("activation5", nn.LeakyReLU(0.2)),
                        ("batch_norm5", nn.BatchNorm1d(1536)),
                    ]
                )
            )

            self.embedding_size = embedding_size
            self.stat_pooling = sidekit.nnet.MeanStdPooling()
            # 3072 = 2 * 1536; presumably mean and std of the 1536 conv5
            # channels concatenated by the pooling layer (TODO confirm).
            self.before_speaker_embedding = nn.Sequential(
                OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
            )

            # The final layer computes the loss
            if self.loss == "aam":
                self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
                    self.embedding_size,
                    int(self.speaker_number),
                    s=30.0,
                    m=0.2,
                    easy_margin=False,
                )

            # Auxiliary emotion head. It is created here but NOT used by
            # forward() at the moment (see the note in forward()).
            self.after_speaker_embedding_emotion = nn.Linear(
                self.embedding_size, 5
            )  # 5 -> 5 emotions
            self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()

        def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
            """Create the optimizer with one parameter group per sub-module.

            Args:
                _optimizer: callable invoked as ``_optimizer(param_list, **_options)``.
                _options: keyword options forwarded to ``_optimizer``.

            Returns:
                The created optimizer (also stored in ``self.optimizer``).
            """
            self._optimizer_option = _options
            self._optimizer = _optimizer

            # fmt: off
            param_list = [
                {"params": self.preprocessor.parameters(), "weight_decay": 0.0002},
                {"params": self.sequence_network.parameters(), "weight_decay": 0.0002},
                {"params": self.stat_pooling.parameters(), "weight_decay": 0},
                {"params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002},
                {"params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002},
                # EMOTION: {"params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002},
            ]
            # fmt: on

            self.optimizer = _optimizer(param_list, **_options)

            # example on applying different LR to different layers
            # self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2

            return self.optimizer

        def forward(self, args, target=None, norm_embedding=True):
            """Compute the classification output and the embedding.

            The forward method MUST take 3 arguments.
            The forward method MUST return 2 values:
              - a tuple of:
                  * loss: used to train the model; in testing (target==None)
                    you should return torch.tensor(torch.nan),
                  * cross-entropy prediction: raw output of the network used
                    to compute accuracy.
                In this example both values are produced by ArcMarginProduct.
              - the x-vector embedding.
            i.e., (loss, cce), x_vector = model([...])

            Args:
                args: dict produced by the data loading hook; only
                    ``args["speech"]`` is read here.
                target: class targets forwarded to the ArcMargin layer.
                norm_embedding: when true, L2-normalize the embedding (dim=1).
            """
            x = args["speech"]
            x = x.squeeze(1)
            x = self.preprocessor(x)
            x = self.sequence_network(x)
            x = self.stat_pooling(x)
            x = self.before_speaker_embedding(x)
            if norm_embedding:
                x = F.normalize(x, dim=1)

            speaker_loss, s_layer = self.after_speaker_embedding(x, target=target)

            # NOTE: the emotion head is intentionally not part of the output.
            # The statements below used to sit after the return and were
            # unreachable; they are kept as an example of how to switch to
            # (or add) the emotion task:
            #
            #   e_layer = self.after_speaker_embedding_emotion(x)
            #   emotion_loss = torch.tensor(torch.nan)
            #   if "emotion" in args:
            #       emotion_loss = self.after_speaker_embedding_emotion_loss(
            #           e_layer, args["emotion"]
            #       )
            #   return (emotion_loss, e_layer), x
            #
            # It is possible to add losses together for multitask training,
            # i.e.: emotion_loss + speaker_loss[0] * 0.2
            return (speaker_loss, s_layer), x

        def test(self, model_opts, dataset_opts, training_opts, device="cpu"):
            """Compute and print the EER on the test set described in ``dataset_opts``.

            You can tweak this to your own task (emotion recognition...).

            Args:
                model_opts: model options (not read by this method).
                dataset_opts: dataset options; reads the "test" section and
                    "data_file_extension".
                training_opts: training options; reads "num_cpu" and
                    "mixed_precision".
                device: device on which embeddings are extracted and scored.
            """
            file_extension = dataset_opts["data_file_extension"].replace(".", "")
            test_opts = dataset_opts["test"]

            enroll_dataset = sidekit.nnet.IdMapSet(
                idmap_name=test_opts["idmap"],
                data_path=test_opts["data_path"],
                file_extension=file_extension,
                transform_number=0,
                id_wavs_maps=test_opts["id2wav"],
                sliding_window=False,
                hook=get_data_loading_hook(dataset_opts),  # local usage of the hook
            )
            enroll_dataloader = torch.utils.data.DataLoader(
                enroll_dataset,
                batch_size=1,
                shuffle=False,
                drop_last=False,
                pin_memory=True,
                num_workers=training_opts["num_cpu"],
            )

            # reverse IdMap: build an IdMap of the test segments from the Ndx
            ndx = sidekit.bosaris.Ndx(test_opts["ndx"])
            key = sidekit.bosaris.Key(test_opts["key"])
            test_idmap = sidekit.bosaris.IdMap()
            test_idmap.leftids = ndx.segset
            test_idmap.rightids = ndx.segset
            test_idmap.start = [None] * len(ndx.segset)
            test_idmap.stop = [None] * len(ndx.segset)

            test_dataset = sidekit.nnet.IdMapSet(
                idmap_name=test_idmap,
                data_path=test_opts["data_path"],
                file_extension=file_extension,
                transform_number=0,
                id_wavs_maps=test_opts["id2wav"],
                sliding_window=False,
                hook=get_data_loading_hook(dataset_opts),  # local usage of the hook
            )
            test_dataloader = torch.utils.data.DataLoader(
                test_dataset,
                batch_size=1,
                shuffle=False,
                drop_last=False,
                pin_memory=True,
                num_workers=training_opts["num_cpu"],
            )

            def _format(data):
                # Keep only the waveform of the hook output and move it to device.
                data = data["speech"].to(device)
                return data

            def _pre_forward(x):
                # Re-wrap the waveform into the dict expected by forward().
                return {"speech": x}

            enrolls_stat = sidekit.nnet.extract_embeddings_from_dataloader(
                self,
                enroll_dataloader,
                device=device,
                format=_format,
                pre_forward=_pre_forward,
                mixed_precision=training_opts["mixed_precision"],
            )
            test_stat = sidekit.nnet.extract_embeddings_from_dataloader(
                self,
                test_dataloader,
                device=device,
                format=_format,
                pre_forward=_pre_forward,
                mixed_precision=training_opts["mixed_precision"],
            )

            # Compute cosine similarity
            cosine_scores = sidekit.iv_scoring.cosine_scoring(
                enrolls_stat, test_stat, ndx, device=device
            )

            # Keep only the scores of actual trials and split target / non-target.
            scores = cosine_scores.scoremat
            scores = scores[ndx.trialmask]
            key.tar = key.tar[ndx.trialmask]
            key.non = key.non[ndx.trialmask]

            pmiss, pfa = sidekit.bosaris.detplot.rocch(scores[key.tar], scores[key.non])
            eer = sidekit.bosaris.detplot.rocch2eer(pmiss, pfa)
            print(f"**Test metrics - Test EER = {eer * 100} %")

        def new_epoch_hook(self, current_epoch, total_epoch):
            """Hook for per-epoch adjustments; does nothing by default."""
            pass
            # example of modifying the optimizer / freezing some layers depending on the epoch
            #
            #   self.optimizer.param_groups[0]["lr"] = self._optimizer_option["lr"]
            #   if current_epoch < total_epoch * 0.40:
            #       self.optimizer.param_groups[0]["lr"] = self._optimizer_option["lr"] / 2
            #       switch_require_grad = False
            #       for name, param in self.named_parameters():
            #           if name.startswith("sequence_network.conv4"):
            #               switch_require_grad = True
            #           param.requires_grad = switch_require_grad

        @torch.no_grad()
        def validate_model(self):
            """Print the trainable parameter count and smoke-test ``forward``.

            A random batch of 16 signals of 32000 samples is pushed through
            the network and the embedding dimension is checked against
            ``self.embedding_size``.
            """

            def _trainable(module):
                # Number of scalar parameters that require gradients.
                return sum(p.numel() for p in module.parameters() if p.requires_grad)

            # fmt: off
            print("Model_parameters_count: {:d}".format(
                _trainable(self.sequence_network)
                + _trainable(self.before_speaker_embedding)
                + _trainable(self.stat_pooling)
            ))
            # fmt: on

            batch = torch.rand(16, 32000)
            indices = torch.randint(0, 5, size=(16,))
            _, x_vector = self.forward({"speech": batch, "emotion": indices})
            # Compare with the configured size rather than a hard-coded 256,
            # so models built with another embedding_size also validate.
            assert x_vector.shape[1] == self.embedding_size, (
                f"unexpected embedding size {x_vector.shape[1]}, "
                f"expected {self.embedding_size}"
            )

    return Net
# DATA loading hook to add your own data/target
# used by sidekit internally
# you can also call it on your own (i.e.: in the test function)
def get_data_loading_hook(sessions):
    """Return the hook that turns a loaded waveform into the model input dict.

    Args:
        sessions: options passed by the caller (not read by this example).

    Returns:
        A callable ``_hook(speech, csv_line, file_ext)`` returning a dict with
        the keys "speech", "F0" and "emotion".
    """
    # print(sessions)

    # This hook is executed during dataloading (Done by the CPU in parallel)
    def _hook(speech, csv_line, file_ext):
        # Make sure the waveform has a leading dimension of size 1.
        if speech.ndim == 1:
            speech = speech.unsqueeze(0)
        # print(speech.shape, csv_line, file_ext)
        # check for test dset with csv_line["dataset"] == "test"

        # Here you can modify what is returned to the model
        args = {}
        args["speech"] = speech
        args["F0"] = torch.rand((1, speech.size(1) // 320))  # fake F0 extractor

        # Fake emotion annotation
        n_emo = 5
        indice = torch.randint(0, n_emo, size=(1,))[0]  # (Either 0,1,2,3,4)
        args["emotion"] = indice  # fake emotion annotation
        return args

    return _hook
# Model description
speaker_number: 4
loss:
type: aam
aam_margin: 0.2
aam_s: 30
# Warning: this hook is experimental; it breaks some other scripts (extract_xvectors.py, scoring, ...)
data_loading_hook: ./config/custom/model.py
# Initialize model from file, reset and freeze parts of it
initial_model_name:
reset_parts: [after_speaker_embedding]
freeze_parts: [] #[preprocessor,sequence_network,stat_pooling,before_speaker_embedding]
# Model can be fastresnet34, resnet34, xvector, ..
model_type: ./config/custom/model.py
import ruamel.yaml
import argparse
# Command-line tool that rewrites the YAML files under config/custom/
# (model.yaml, training.yaml, Iemocap.yaml) for one training run.


def _rewrite_yaml(path, apply_changes):
    """Load the YAML file at *path*, let *apply_changes* mutate it, write it back."""
    yaml = ruamel.yaml.YAML()
    with open(path) as fp:
        data = yaml.load(fp)
    apply_changes(data)
    with open(path, "w") as fp:
        yaml.dump(data, fp)


parser = argparse.ArgumentParser()
parser.add_argument("session_test", help="The session used in test", type=int)
parser.add_argument("categories", help="The number of categories", type=int)
parser.add_argument("batch", help="The number of batch", type=int)
parser.add_argument("lr", help="The learning rate", type=float)
args = parser.parse_args()

# The batch is split evenly between categories, so the count must be positive
# (a value of 0 used to crash with ZeroDivisionError further down).
if args.categories <= 0:
    parser.error("categories must be a positive integer")

## Preparation of all arguments
# For model.yaml
nb_cate = args.categories

# For Iemocap.yaml
batch = args.batch
examples = batch // nb_cate  # integer division, no float round-trip
session_test = args.session_test

# For training.yaml
lr = args.lr
run_tag = f"{nb_cate}emo_{batch}batch_lr-{lr}_Test-IEMOCAP{session_test}"
tmp = f"model_custom/tmp_custom_{run_tag}.pt"
best = f"model_custom/best_custom_{run_tag}.pt"
# The log used to be named "half_resnet34_..." (copy-paste from the
# half_resnet34 script), so both scripts pointed at the same log file.
log = f"logs/custom_{run_tag}.log"


## Modification of variables in YAML files
def _update_model(data):
    data["speaker_number"] = int(nb_cate)


def _update_training(data):
    data["lr"] = lr
    data["tmp_model_name"] = tmp
    data["best_model_name"] = best
    data["log_file"] = log


def _update_dataset(data):
    data["batch_size"] = batch
    data["train"]["sampler"]["examples_per_speaker"] = examples
    data["dataset_csv"] = "list/iemocap_ses{}-test.csv".format(session_test)


# model.yaml
_rewrite_yaml("config/custom/model.yaml", _update_model)
# training.yaml
_rewrite_yaml("config/custom/training.yaml", _update_training)
# Iemocap.yaml
_rewrite_yaml("config/custom/Iemocap.yaml", _update_dataset)
# Training description
# General options
log_file: logs/custom.log
torch_seed: 42
numpy_seed: 42
random_seed: 42
deterministic: false
epochs: 100
lr: 0.0001
patience: 30
multi_gpu: false
num_cpu: 16
mixed_precision: true
clipping: false
# Optimizer and scheduler options
optimizer:
type: adam
options:
scheduler:
type: CyclicLR
mode: triangular2
base_lr: 1.0e-05
step_size_up: 40000
# Evaluation options
compute_test_eer: false
log_interval: 50
validation_frequency: 1
# Save options
tmp_model_name: model_custom/tmp_custom_4emo_4batch_lr-0.0001_Test-IEMOCAP1.pt
best_model_name: model_custom/best_custom_4emo_4batch_lr-0.0001_Test-IEMOCAP1.pt
checkpoint_frequency:
# Dataset description
# General options
data_path: / # path prepended to each wav file listed in dataset_csv
data_file_extension: .wav
dataset_csv: list/iemocap_ses2-test.csv
sample_rate: 16000
validation_ratio: 0.02
batch_size: 4
# Training set
train:
duration: 3
chunk_per_segment: -1
overlap: 3
sampler:
examples_per_speaker: 1
samples_per_speaker: 192
augmentation_replica: 1
transform_number: 1
transformation:
pipeline: add_reverb,add_noise,filtering,phone_filtering,codec
add_noise:
noise_db_csv: list/musan.csv
data_path: /
add_reverb:
rir_db_csv: list/reverb.csv
data_path: /
# Validation set
valid:
duration: 3
transformation:
pipeline: # no transformation
add_noise:
noise_db_csv: list/musan.csv
data_path: /
# Test set (set 'compute_test_eer' to true in training.yaml)
test:
idmap: ./list/asv_test/voxceleb1-O-clean_idmap.h5
ndx: ./list/asv_test/voxceleb1-O-clean_ndx.h5
key: ./list/asv_test/voxceleb1-O-clean_key.h5
data_path: .
id2wav: ./data/asv_test_voxceleb1/voxceleb1-O-clean.id2wav
# Model description
speaker_number: 4
loss:
type: aam
aam_margin: 0.2
aam_s: 30
# Initialize model from file, reset and freeze parts of it
initial_model_name: #/srv/storage/talc@talc-data.nancy/multispeech/calcul/users/hnourtel/sidekit/best_halp_clr_adam_aam0.2_30_b256_vox12.pt_epoch201
reset_parts: [after_speaker_embedding]
freeze_parts: []
# Model can be fastresnet34, resnet34, xvector, ..
model_type: halfresnet34
import ruamel.yaml
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("session_test", help="The session used in test", type=int)
parser.add_argument("categories", help="The number of categories", type=int)
parser.add_argument("batch", help="The number of batch", type=int)
parser.add_argument("lr", help="The learning rate", type=float)
args = parser.parse_args()
## Preparation of all arguments
# For model.yaml
nb_cate = args.categories
# For Iemocap.yaml
batch = args.batch
examples = int(batch/nb_cate)
session_test = args.session_test
# For training.yaml
lr = args.lr
tmp = "model_half_resnet34/tmp_half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format( nb_cate, batch, lr, session_test)
best = "model_half_resnet34/best_half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt".format(nb_cate, batch, lr, session_test)
log = "logs/half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.log".format(nb_cate, batch, lr, session_test)
## Modification of variables in YAML files
# model.yaml
yaml = ruamel.yaml.YAML()
with open("config/half_resnet34/model.yaml") as fp:
data = yaml.load(fp)
data['speaker_number'] = int(nb_cate)
with open("config/half_resnet34/model.yaml", 'w') as fp:
yaml.dump(data, fp)
# training.yaml
yaml = ruamel.yaml.YAML()
with open("config/half_resnet34/training.yaml") as fp:
data = yaml.load(fp)
data['lr'] = lr
data["tmp_model_name"] = tmp
data["best_model_name"] = best
data["log_file"] = log
with open("config/half_resnet34/training.yaml", 'w') as fp:
yaml.dump(data, fp)
# Iemocap.yaml
yaml = ruamel.yaml.YAML()
with open("config/half_resnet34/Iemocap.yaml") as fp:
data = yaml.load(fp)