Commit c7235553 authored by Hubert Nourtel's avatar Hubert Nourtel
Browse files

Adding collate hook for IEMOCAP for loading entire utterances

parent e6736976
......@@ -23,84 +23,89 @@ def build():
"""
# Define your model, you can use building blocks from sidekit.nnet
class Net(nn.Module):
class Net(sidekit.nnet.Xtractor):
def __init__(self, speaker_number, loss=None, embedding_size=256):
# You can change the parameters value by changing the 'config/custom/model.yaml' config
super().__init__()
if loss not in ["aam"]:
raise NotImplementedError(f"Loss not implemented")
self.preprocessor = sidekit.nnet.MfccFrontEnd()
feature_size = self.preprocessor.n_mfcc
self.loss = loss
self.speaker_number = speaker_number
self.sequence_network = nn.Sequential(
OrderedDict(
[
("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
("activation1", nn.LeakyReLU(0.2)),
("batch_norm1", nn.BatchNorm1d(512)),
("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
("activation2", nn.LeakyReLU(0.2)),
("batch_norm2", nn.BatchNorm1d(512)),
("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
("activation3", nn.LeakyReLU(0.2)),
("batch_norm3", nn.BatchNorm1d(512)),
("conv4", nn.Conv1d(512, 512, 1)),
("activation4", nn.LeakyReLU(0.2)),
("batch_norm4", nn.BatchNorm1d(512)),
("conv5", nn.Conv1d(512, 1536, 1)),
("activation5", nn.LeakyReLU(0.2)),
("batch_norm5", nn.BatchNorm1d(1536)),
]
)
)
self.embedding_size = embedding_size
self.stat_pooling = sidekit.nnet.MeanStdPooling()
self.before_speaker_embedding = nn.Sequential(
OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
)
# The final layer computes the loss
if self.loss == "aam":
self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
self.embedding_size,
int(self.speaker_number),
s=30.0,
m=0.2,
easy_margin=False,
)
self.after_speaker_embedding_emotion = nn.Linear(
self.embedding_size, 5
) # 5 -> 5 emotions
self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()
def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
self._optimizer_option = _options
self._optimizer = _optimizer
# fmt: off
param_list = []
param_list.append({"params": self.preprocessor.parameters(), "weight_decay": 0.0002})
param_list.append({"params": self.sequence_network.parameters(), "weight_decay": 0.0002})
param_list.append({ "params": self.stat_pooling.parameters(), "weight_decay": 0})
param_list.append({ "params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002})
param_list.append({ "params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002})
# EMOTION: param_list.append({ "params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002})
# fmt: on
self.optimizer = _optimizer(param_list, **_options)
# example on applying different LR to different layers
# self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2
return self.optimizer
super().__init__(speaker_number, model_archi="halfresnet34", loss=loss, embedding_size=embedding_size)
self.param_device_detection = nn.Parameter(
torch.empty(0)) # Empty parameter used to detect model device location
# # You can change the parameters value by changing the 'config/custom/model.yaml' config
# super().__init__()
#
# if loss not in ["aam"]:
# raise NotImplementedError(f"Loss not implemented")
#
# self.preprocessor = sidekit.nnet.MfccFrontEnd()
# feature_size = self.preprocessor.n_mfcc
# self.loss = loss
# self.speaker_number = speaker_number
#
# self.sequence_network = nn.Sequential(
# OrderedDict(
# [
# ("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
# ("activation1", nn.LeakyReLU(0.2)),
# ("batch_norm1", nn.BatchNorm1d(512)),
# ("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
# ("activation2", nn.LeakyReLU(0.2)),
# ("batch_norm2", nn.BatchNorm1d(512)),
# ("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
# ("activation3", nn.LeakyReLU(0.2)),
# ("batch_norm3", nn.BatchNorm1d(512)),
# ("conv4", nn.Conv1d(512, 512, 1)),
# ("activation4", nn.LeakyReLU(0.2)),
# ("batch_norm4", nn.BatchNorm1d(512)),
# ("conv5", nn.Conv1d(512, 1536, 1)),
# ("activation5", nn.LeakyReLU(0.2)),
# ("batch_norm5", nn.BatchNorm1d(1536)),
# ]
# )
# )
#
# self.embedding_size = embedding_size
#
# self.stat_pooling = sidekit.nnet.MeanStdPooling()
# self.before_speaker_embedding = nn.Sequential(
# OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
# )
#
# # The final layer computes the loss
# if self.loss == "aam":
# self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
# self.embedding_size,
# int(self.speaker_number),
# s=30.0,
# m=0.2,
# easy_margin=False,
# )
#
# self.after_speaker_embedding_emotion = nn.Linear(
# self.embedding_size, 5
# ) # 5 -> 5 emotions
# self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()
#
# def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
# self._optimizer_option = _options
# self._optimizer = _optimizer
#
# # fmt: off
# param_list = []
# param_list.append({"params": self.preprocessor.parameters(), "weight_decay": 0.0002})
# param_list.append({"params": self.sequence_network.parameters(), "weight_decay": 0.0002})
# param_list.append({ "params": self.stat_pooling.parameters(), "weight_decay": 0})
# param_list.append({ "params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002})
# param_list.append({ "params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002})
#
# # EMOTION: param_list.append({ "params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002})
# # fmt: on
#
# self.optimizer = _optimizer(param_list, **_options)
#
# # example on applying different LR to different layers
# # self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2
#
# return self.optimizer
def forward(self, args, target=None, norm_embedding=True):
"""
......@@ -112,7 +117,7 @@ def build():
- the x-vector embedding
i.e., (loss, cce), x_vector = model([...])
"""
x = args["speech"]
x = args["speech"].to(self.param_device_detection.device)
x = x.squeeze(1)
x = self.preprocessor(x)
x = self.sequence_network(x)
......@@ -138,6 +143,7 @@ def build():
# possible to add losses together for multitask training i.e.: emotion_loss + speaker_loss[0] * 0.2
def test(self, model_opts, dataset_opts, training_opts, device="cpu"):
return
# EER computation for the testing dataset
# you can tweak this to your own task (emotion reco...)
......@@ -280,3 +286,23 @@ def get_data_loading_hook(sessions):
return args
return _hook
# Batch-collate function that zero-pads variable-length utterances so that
# entire audio files (rather than fixed-size crops) can be stacked together.
def collate_hook(batch):
    """Collate a list of (data_dict, speaker_id) samples into padded batch tensors.

    Each sample's "speech" and "F0" tensors (shape (1, T), T varying) are
    squeezed, right-padded with zeros to the longest sequence in the batch,
    and re-expanded to (batch, 1, T_max). Emotion labels and speaker targets
    are gathered into 1-D tensors.
    """
    speech_seqs = []
    f0_seqs = []
    emotions = []
    speakers = []
    for sample, spk in batch:
        speech_seqs.append(sample["speech"].squeeze(0))
        f0_seqs.append(sample["F0"].squeeze(0))
        emotions.append(sample["emotion"])
        speakers.append(spk)

    # Zero-pad every sequence to the batch maximum length.
    padded_speech = nn.utils.rnn.pad_sequence(speech_seqs, batch_first=True, padding_value=0.0)
    padded_f0 = nn.utils.rnn.pad_sequence(f0_seqs, batch_first=True, padding_value=0.0)

    batched_data = {
        "speech": padded_speech.unsqueeze(1),
        "F0": padded_f0.unsqueeze(1),
        "emotion": torch.tensor(emotions),
    }
    return batched_data, torch.tensor(speakers)
\ No newline at end of file
......@@ -9,6 +9,7 @@ loss:
# Warning, this hook is experimental; it is breaking some other scripts (extract_xvectors.py, scoring, ...)
data_loading_hook: ./config/custom/model.py
collate_hook: ./config/custom/model.py
# Initialize model from file, reset and freeze parts of it
initial_model_name:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment