Speaker / sidekit · Commits · c7235553

Commit c7235553 authored May 23, 2022 by Hubert Nourtel

Adding collate hook for IEMOCAP for loading entire utterances

parent e6736976

Changes 2
egs/iemocap/config/custom/model.py
...
...
@@ -23,84 +23,89 @@ def build():
"""
# Define your model, you can use building blocks from sidekit.nnet
    class Net(nn.Module):
    class Net(sidekit.nnet.Xtractor):
        def __init__(self, speaker_number, loss=None, embedding_size=256):
            # You can change the parameters value by changing the 'config/custom/model.yaml' config
            super().__init__()

            if loss not in ["aam"]:
                raise NotImplementedError(f"Loss not implemented")

            self.preprocessor = sidekit.nnet.MfccFrontEnd()
            feature_size = self.preprocessor.n_mfcc
            self.loss = loss
            self.speaker_number = speaker_number

            self.sequence_network = nn.Sequential(
                OrderedDict(
                    [
                        ("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
                        ("activation1", nn.LeakyReLU(0.2)),
                        ("batch_norm1", nn.BatchNorm1d(512)),
                        ("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
                        ("activation2", nn.LeakyReLU(0.2)),
                        ("batch_norm2", nn.BatchNorm1d(512)),
                        ("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
                        ("activation3", nn.LeakyReLU(0.2)),
                        ("batch_norm3", nn.BatchNorm1d(512)),
                        ("conv4", nn.Conv1d(512, 512, 1)),
                        ("activation4", nn.LeakyReLU(0.2)),
                        ("batch_norm4", nn.BatchNorm1d(512)),
                        ("conv5", nn.Conv1d(512, 1536, 1)),
                        ("activation5", nn.LeakyReLU(0.2)),
                        ("batch_norm5", nn.BatchNorm1d(1536)),
                    ]
                )
            )

            self.embedding_size = embedding_size

            self.stat_pooling = sidekit.nnet.MeanStdPooling()
            self.before_speaker_embedding = nn.Sequential(
                OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
            )

            # The final layer computes the loss
            if self.loss == "aam":
                self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
                    self.embedding_size,
                    int(self.speaker_number),
                    s=30.0,
                    m=0.2,
                    easy_margin=False,
                )

            self.after_speaker_embedding_emotion = nn.Linear(
                self.embedding_size, 5
            )  # 5 -> 5 emotions
            self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()

        def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
            self._optimizer_option = _options
            self._optimizer = _optimizer

            # fmt: off
            param_list = []
            param_list.append({"params": self.preprocessor.parameters(), "weight_decay": 0.0002})
            param_list.append({"params": self.sequence_network.parameters(), "weight_decay": 0.0002})
            param_list.append({"params": self.stat_pooling.parameters(), "weight_decay": 0})
            param_list.append({"params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002})
            param_list.append({"params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002})

            # EMOTION: param_list.append({ "params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002})
            # fmt: on

            self.optimizer = _optimizer(param_list, **_options)

            # example on applying different LR to different layers
            # self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2

            return self.optimizer

            super().__init__(speaker_number, model_archi="halfresnet34", loss=loss, embedding_size=embedding_size)
            self.param_device_detection = nn.Parameter(torch.empty(0))  # Empty parameter used to detect model device location
# # You can change the parameters value by changing the 'config/custom/model.yaml' config
# super().__init__()
#
# if loss not in ["aam"]:
# raise NotImplementedError(f"Loss not implemented")
#
# self.preprocessor = sidekit.nnet.MfccFrontEnd()
# feature_size = self.preprocessor.n_mfcc
# self.loss = loss
# self.speaker_number = speaker_number
#
# self.sequence_network = nn.Sequential(
# OrderedDict(
# [
# ("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
# ("activation1", nn.LeakyReLU(0.2)),
# ("batch_norm1", nn.BatchNorm1d(512)),
# ("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
# ("activation2", nn.LeakyReLU(0.2)),
# ("batch_norm2", nn.BatchNorm1d(512)),
# ("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
# ("activation3", nn.LeakyReLU(0.2)),
# ("batch_norm3", nn.BatchNorm1d(512)),
# ("conv4", nn.Conv1d(512, 512, 1)),
# ("activation4", nn.LeakyReLU(0.2)),
# ("batch_norm4", nn.BatchNorm1d(512)),
# ("conv5", nn.Conv1d(512, 1536, 1)),
# ("activation5", nn.LeakyReLU(0.2)),
# ("batch_norm5", nn.BatchNorm1d(1536)),
# ]
# )
# )
#
# self.embedding_size = embedding_size
#
# self.stat_pooling = sidekit.nnet.MeanStdPooling()
# self.before_speaker_embedding = nn.Sequential(
# OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
# )
#
# # The final layer computes the loss
# if self.loss == "aam":
# self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
# self.embedding_size,
# int(self.speaker_number),
# s=30.0,
# m=0.2,
# easy_margin=False,
# )
#
# self.after_speaker_embedding_emotion = nn.Linear(
# self.embedding_size, 5
# ) # 5 -> 5 emotions
# self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()
#
# def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
# self._optimizer_option = _options
# self._optimizer = _optimizer
#
# # fmt: off
# param_list = []
# param_list.append({"params": self.preprocessor.parameters(), "weight_decay": 0.0002})
# param_list.append({"params": self.sequence_network.parameters(), "weight_decay": 0.0002})
# param_list.append({ "params": self.stat_pooling.parameters(), "weight_decay": 0})
# param_list.append({ "params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002})
# param_list.append({ "params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002})
#
# # EMOTION: param_list.append({ "params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002})
# # fmt: on
#
# self.optimizer = _optimizer(param_list, **_options)
#
# # example on applying different LR to different layers
# # self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2
#
# return self.optimizer
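The new __init__ above delegates the whole architecture to sidekit.nnet.Xtractor with model_archi="halfresnet34" and only registers param_device_detection, a zero-element parameter whose sole job is to expose the device the model currently lives on (it is used in forward below). A minimal standalone sketch of that trick, with illustrative names that are not part of sidekit:

import torch
import torch.nn as nn

class DeviceAwareModule(nn.Module):
    def __init__(self):
        super().__init__()
        # Zero-element parameter: adds no trainable weights, but .to()/.cuda()
        # moves it with the module, so its .device always reflects the module's device.
        self.param_device_detection = nn.Parameter(torch.empty(0))

    def forward(self, x):
        # Move incoming tensors to wherever the module currently is.
        return x.to(self.param_device_detection.device)

# model = DeviceAwareModule().to("cuda:0" if torch.cuda.is_available() else "cpu")
# y = model(torch.randn(4, 10))  # the input follows the model's device automatically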
        def forward(self, args, target=None, norm_embedding=True):
"""
...
...
@@ -112,7 +117,7 @@ def build():
- the x-vector embedding
i.e., (loss, cce), x_vector = model([...])
"""
            x = args["speech"]
            x = args["speech"].to(self.param_device_detection.device)
            x = x.squeeze(1)
            x = self.preprocessor(x)
            x = self.sequence_network(x)
...
...
@@ -138,6 +143,7 @@ def build():
# possible to add losses together for multitask training i.e.: emotion_loss + speaker_loss[0] * 0.2
        def test(self, model_opts, dataset_opts, training_opts, device="cpu"):
            return
            # EER computation for the testing dataset
            # you can tweak this to your own task (emotion reco...)
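The multitask note a few lines up ("emotion_loss + speaker_loss[0] * 0.2") can be made concrete with a tiny, self-contained sketch; the two losses here are dummy scalars standing in for what forward() would return, and the 0.2 weight simply mirrors that comment:

import torch

# Stand-ins for the losses produced by the two heads: the speaker (AAM) head
# returns (loss, cce), the emotion head a CrossEntropyLoss value.
speaker_loss = (torch.tensor(2.3, requires_grad=True), None)
emotion_loss = torch.tensor(1.1, requires_grad=True)

total_loss = emotion_loss + speaker_loss[0] * 0.2
total_loss.backward()  # a single backward pass trains both branches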
...
...
@@ -280,3 +286,23 @@ def get_data_loading_hook(sessions):
        return args

    return _hook
# Custom data collate for padding with zeroes
# when the whole audio file is considered
def collate_hook(batch):
    data_speech_list, data_f0_list, data_emotion_list, target_spk_list = [], [], [], []

    # Extract data from batch
    for data, target in batch:
        data_speech_list.append(data["speech"].squeeze(0))
        data_f0_list.append(data["F0"].squeeze(0))
        data_emotion_list.append(data["emotion"])
        target_spk_list.append(target)

    # Pad tensors lists if required and construct output data
    out_speech = nn.utils.rnn.pad_sequence(data_speech_list, batch_first=True, padding_value=0.0)
    out_f0 = nn.utils.rnn.pad_sequence(data_f0_list, batch_first=True, padding_value=0.0)
    out_data_dict = {"speech": out_speech.unsqueeze(1), "F0": out_f0.unsqueeze(1), "emotion": torch.tensor(data_emotion_list)}
    out_target = torch.tensor(target_spk_list)

    return out_data_dict, out_target
\ No newline at end of file
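The collate_hook added above exists because whole IEMOCAP utterances have different lengths, so PyTorch's default collate (which stacks equally sized tensors) would fail. A hedged usage sketch with a toy dataset showing the same padding idea plugged into a DataLoader; the dataset class and its field shapes are illustrative, not the sidekit IEMOCAP loader:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class ToyUtteranceDataset(Dataset):
    """Yields (data_dict, speaker_id) pairs with variable-length speech and F0 tracks."""
    def __init__(self, lengths=(16000, 24000, 32000)):
        self.lengths = lengths

    def __len__(self):
        return len(self.lengths)

    def __getitem__(self, idx):
        n = self.lengths[idx]
        data = {
            "speech": torch.randn(1, n),       # (1, num_samples) for one whole utterance
            "F0": torch.randn(1, n // 160),    # (1, num_frames), illustrative frame rate
            "emotion": idx % 5,                # one of 5 emotion classes
        }
        return data, idx                       # idx stands in for the speaker index

def collate_pad(batch):
    # Same logic as the collate_hook above: pad every field to the longest item in the batch.
    speech = nn.utils.rnn.pad_sequence([d["speech"].squeeze(0) for d, _ in batch], batch_first=True, padding_value=0.0)
    f0 = nn.utils.rnn.pad_sequence([d["F0"].squeeze(0) for d, _ in batch], batch_first=True, padding_value=0.0)
    emotion = torch.tensor([d["emotion"] for d, _ in batch])
    target = torch.tensor([t for _, t in batch])
    return {"speech": speech.unsqueeze(1), "F0": f0.unsqueeze(1), "emotion": emotion}, target

loader = DataLoader(ToyUtteranceDataset(), batch_size=3, collate_fn=collate_pad)
data, target = next(iter(loader))
print(data["speech"].shape)  # torch.Size([3, 1, 32000]): padded to the longest utterance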
egs/iemocap/config/custom/model.yaml
...
...
@@ -9,6 +9,7 @@ loss:
# Warning: this hook is experimental, it breaks some other scripts (extract_xvectors.py, scoring, ...)
data_loading_hook: ./config/custom/model.py
collate_hook: ./config/custom/model.py

# Initialize model from file, reset and freeze parts of it
initial_model_name:
...
...
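Both hooks in the YAML point at a Python file path (./config/custom/model.py) rather than an importable module name. A guess at how such a path can be resolved into a callable, using importlib; this loader is an assumption for illustration and may differ from sidekit's actual hook-loading code:

import importlib.util

def load_hook(module_path, hook_name):
    # Import a .py file by path and pull out the named hook function.
    spec = importlib.util.spec_from_file_location("custom_model", module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, hook_name)

# collate_fn = load_hook("./config/custom/model.py", "collate_hook")
# loader = torch.utils.data.DataLoader(dataset, batch_size=8, collate_fn=collate_fn)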