Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Speaker
sidekit
Commits
91dfba68
Commit
91dfba68
authored
Apr 26, 2022
by
Hubert Nourtel
Browse files
Merge branch 'main' into 'main'
Main See merge request
!1
parents
e30b391e
24ec5a6c
Changes
25
Expand all
Hide whitespace changes
Inline
Side-by-side
egs/iemocap/.gitkeep
0 → 100644
View file @
91dfba68
egs/iemocap/config/custom/Iemocap.yaml
0 → 100644
View file @
91dfba68
# Dataset description
# General options
data_path
:
/
data_file_extension
:
.wav
dataset_csv
:
list/iemocap_ses1-test.csv
sample_rate
:
16000
validation_ratio
:
0.02
batch_size
:
4
# Training set
train
:
duration
:
3.
chunk_per_segment
:
-1
overlap
:
3.
sampler
:
examples_per_speaker
:
1
samples_per_speaker
:
100
augmentation_replica
:
1
transform_number
:
0
transformation
:
pipeline
:
# no transformation
# pipeline: add_reverb,add_noise,filtering,phone_filtering,codec
add_noise
:
noise_db_csv
:
list/musan.csv
data_path
:
/
add_reverb
:
rir_db_csv
:
list/reverb.csv
data_path
:
/
# Validation set
valid
:
duration
:
3.
transformation
:
pipeline
:
# no transformation
add_noise
:
noise_db_csv
:
list/musan.csv
data_path
:
/
# Test set
test
:
idmap
:
./list/asv_test_libri/libri_test_idmap.h5
ndx
:
./list/asv_test_libri/libri_test_ndx.h5
key
:
./list/asv_test_libri/libri_test_key.h5
data_path
:
.
id2wav
:
./data/asv_test_libri/libri_test.id2wav
egs/iemocap/config/custom/model.py
0 → 100644
View file @
91dfba68
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
collections
import
OrderedDict
import
sidekit.nnet
def build():
    """Build and return the custom ``Net`` model class (the class, not an instance).

    Instead of defining the network from scratch you can also inherit
    ``sidekit.nnet.Xtractor`` directly (change ``model_archi``)::

        class Net(sidekit.nnet.Xtractor):
            def __init__(self, speaker_number, loss=None, embedding_size=256):
                super().__init__(speaker_number, model_archi="xvector", loss=loss, embedding_size=embedding_size)
                # add additional logic here

            def forward(self, x, target=None, norm_embedding=True):
                return super().forward(x, target, norm_embedding)
                # add additional logic here

        return Net
    """

    # Define your model, you can use building blocks from sidekit.nnet
    class Net(nn.Module):
        """Conv1d embedding extractor with an AAM speaker head and an emotion head."""

        def __init__(self, speaker_number, loss=None, embedding_size=256):
            """Create all layers.

            You can change the parameter values through the
            'config/custom/model.yaml' config.

            :param speaker_number: number of output classes of the AAM head
            :param loss: name of the loss; only ``"aam"`` is supported
            :param embedding_size: dimension of the produced embedding
            :raises NotImplementedError: if ``loss`` is not ``"aam"``
            """
            super().__init__()

            if loss not in ["aam"]:
                # Name the offending value so that a config typo is easy to spot.
                raise NotImplementedError(f"Loss not implemented: {loss!r}")

            self.preprocessor = sidekit.nnet.MfccFrontEnd()
            feature_size = self.preprocessor.n_mfcc

            self.loss = loss
            self.speaker_number = speaker_number

            self.sequence_network = nn.Sequential(
                OrderedDict(
                    [
                        ("conv1", nn.Conv1d(feature_size, 512, 5, dilation=1)),
                        ("activation1", nn.LeakyReLU(0.2)),
                        ("batch_norm1", nn.BatchNorm1d(512)),
                        ("conv2", nn.Conv1d(512, 512, 3, dilation=2)),
                        ("activation2", nn.LeakyReLU(0.2)),
                        ("batch_norm2", nn.BatchNorm1d(512)),
                        ("conv3", nn.Conv1d(512, 512, 3, dilation=3)),
                        ("activation3", nn.LeakyReLU(0.2)),
                        ("batch_norm3", nn.BatchNorm1d(512)),
                        ("conv4", nn.Conv1d(512, 512, 1)),
                        ("activation4", nn.LeakyReLU(0.2)),
                        ("batch_norm4", nn.BatchNorm1d(512)),
                        ("conv5", nn.Conv1d(512, 1536, 1)),
                        ("activation5", nn.LeakyReLU(0.2)),
                        ("batch_norm5", nn.BatchNorm1d(1536)),
                    ]
                )
            )

            self.embedding_size = embedding_size

            self.stat_pooling = sidekit.nnet.MeanStdPooling()
            # 3072 == 2 * 1536; presumably MeanStdPooling concatenates mean and std
            # of the 1536 conv5 channels -- TODO confirm against sidekit.
            self.before_speaker_embedding = nn.Sequential(
                OrderedDict([("linear6", nn.Linear(3072, self.embedding_size))])
            )

            # The final layer computes the loss
            if self.loss == "aam":
                self.after_speaker_embedding = sidekit.nnet.ArcMarginProduct(
                    self.embedding_size,
                    int(self.speaker_number),
                    s=30.0,
                    m=0.2,
                    easy_margin=False,
                )

            self.after_speaker_embedding_emotion = nn.Linear(self.embedding_size, 5)  # 5 -> 5 emotions
            self.after_speaker_embedding_emotion_loss = torch.nn.CrossEntropyLoss()

        def set_lr_weight_decay_layers_for_optim(self, _optimizer, _options):
            """Instantiate the optimizer with one parameter group per sub-network.

            :param _optimizer: optimizer class (or factory) called as
                ``_optimizer(param_groups, **_options)``
            :param _options: keyword arguments forwarded to ``_optimizer``
            :return: the created optimizer, also stored in ``self.optimizer``
            """
            self._optimizer_option = _options
            self._optimizer = _optimizer

            # fmt: off
            param_list = [
                {"params": self.preprocessor.parameters(), "weight_decay": 0.0002},
                {"params": self.sequence_network.parameters(), "weight_decay": 0.0002},
                {"params": self.stat_pooling.parameters(), "weight_decay": 0},
                {"params": self.before_speaker_embedding.parameters(), "weight_decay": 0.002},
                {"params": self.after_speaker_embedding.parameters(), "weight_decay": 0.002},
                # EMOTION: {"params": self.after_speaker_embedding_emotion.parameters(), "weight_decay": 0.002},
            ]
            # fmt: on

            self.optimizer = _optimizer(param_list, **_options)

            # example on applying different LR to different layers
            # self.optimizer.param_groups[0]["lr"] = _options["lr"] / 2

            return self.optimizer

        def forward(self, args, target=None, norm_embedding=True):
            """Compute the speaker loss/prediction and the embedding.

            The forward method MUST take 3 arguments.
            The forward method MUST return 2 values:
             - a tuple of: (loss: to train the model, in testing (target==None)
               you should return torch.tensor(torch.nan),
               cross-entropy prediction: raw output of the network to compute
               accuracy on)
               - In this example the returned value is handled by: ArcMarginProduct
             - the x-vector embedding
            i.e., (loss, cce), x_vector = model([...])

            :param args: dict produced by the data loading hook; ``args["speech"]``
                is read here
            :param target: labels forwarded to the AAM head
            :param norm_embedding: if True, L2-normalize the embedding along dim 1
            """
            x = args["speech"]
            x = x.squeeze(1)
            x = self.preprocessor(x)
            x = self.sequence_network(x)
            x = self.stat_pooling(x)

            x = self.before_speaker_embedding(x)

            if norm_embedding:
                x = F.normalize(x, dim=1)

            speaker_loss, s_layer = self.after_speaker_embedding(x, target=target)
            return (speaker_loss, s_layer), x

            # Emotion-recognition alternative. These statements used to sit after
            # the return above and were therefore never executed; to train the
            # emotion head, replace the return above with:
            #
            #   e_layer = self.after_speaker_embedding_emotion(x)
            #   emotion_loss = torch.tensor(torch.nan)
            #   if "emotion" in args:
            #       emotion_loss = self.after_speaker_embedding_emotion_loss(e_layer, args["emotion"])
            #   return (emotion_loss, e_layer), x
            #
            # possible to add losses together for multitask training i.e.: emotion_loss + speaker_loss[0] * 0.2

        def test(self, model_opts, dataset_opts, training_opts, device="cpu"):
            """Compute and print the EER on the test trials described in ``dataset_opts["test"]``.

            You can tweak this to your own task (emotion reco...).

            :param model_opts: model configuration (not used here)
            :param dataset_opts: dataset configuration; reads ``["test"]`` and
                ``["data_file_extension"]``
            :param training_opts: training configuration; reads ``["num_cpu"]``
                and ``["mixed_precision"]``
            :param device: device used for embedding extraction and scoring
            """

            def _make_dataloader(idmap):
                # One utterance per batch, in file order, with the local data loading hook.
                dataset = sidekit.nnet.IdMapSet(
                    idmap_name=idmap,
                    data_path=dataset_opts["test"]["data_path"],
                    file_extension=dataset_opts["data_file_extension"].replace(".", ""),
                    transform_number=0,
                    id_wavs_maps=dataset_opts["test"]["id2wav"],
                    sliding_window=False,
                    hook=get_data_loading_hook(dataset_opts),  # local usage of the hook
                )
                return torch.utils.data.DataLoader(
                    dataset,
                    batch_size=1,
                    shuffle=False,
                    drop_last=False,
                    pin_memory=True,
                    num_workers=training_opts["num_cpu"],
                )

            def _format(data):
                # Keep only the waveform of the hook output and move it to the device.
                data = data["speech"].to(device)
                return data

            def _pre_forward(x):
                # Re-wrap the waveform in the dict layout expected by forward().
                return {"speech": x}

            def _extract(dataloader):
                return sidekit.nnet.extract_embeddings_from_dataloader(
                    self,
                    dataloader,
                    device=device,
                    format=_format,
                    pre_forward=_pre_forward,
                    mixed_precision=training_opts["mixed_precision"],
                )

            enroll_dataloader = _make_dataloader(dataset_opts["test"]["idmap"])

            # reverse IdMap: every test segment of the ndx is mapped onto itself
            ndx = sidekit.bosaris.Ndx(dataset_opts["test"]["ndx"])
            key = sidekit.bosaris.Key(dataset_opts["test"]["key"])
            test_idmap = sidekit.bosaris.IdMap()
            test_idmap.leftids = ndx.segset
            test_idmap.rightids = ndx.segset
            test_idmap.start = [None] * len(ndx.segset)
            test_idmap.stop = [None] * len(ndx.segset)

            test_dataloader = _make_dataloader(test_idmap)

            enrolls_stat = _extract(enroll_dataloader)
            test_stat = _extract(test_dataloader)

            # Compute cosine similarity
            cosine_scores = sidekit.iv_scoring.cosine_scoring(enrolls_stat, test_stat, ndx, device=device)
            scores = cosine_scores.scoremat
            scores = scores[ndx.trialmask]
            key.tar = key.tar[ndx.trialmask]
            key.non = key.non[ndx.trialmask]
            pmiss, pfa = sidekit.bosaris.detplot.rocch(scores[key.tar], scores[key.non])
            eer = sidekit.bosaris.detplot.rocch2eer(pmiss, pfa)
            print(f"**Test metrics - Test EER = {eer * 100} %")

        def new_epoch_hook(self, current_epoch, total_epoch):
            """Hook taking the current and total epoch numbers; does nothing by default.

            Example of modifying the optimizer / freezing some layers depending
            on the epoch::

                self.optimizer.param_groups[0]["lr"] = self._optimizer_option["lr"]
                if current_epoch < total_epoch * 0.40:
                    self.optimizer.param_groups[0]["lr"] = self._optimizer_option["lr"] / 2
                    switch_require_grad = False
                    for name, param in self.named_parameters():
                        if name.startswith("sequence_network.conv4"):
                            switch_require_grad = True
                        param.requires_grad = switch_require_grad
            """
            pass

        @torch.no_grad()
        def validate_model(self):
            """Print a trainable-parameter count and smoke-test ``forward`` on random data.

            :raises AssertionError: if the embedding width returned by ``forward``
                differs from ``self.embedding_size``
            """
            # fmt: off
            counted_parts = (self.sequence_network, self.before_speaker_embedding, self.stat_pooling)
            print("Model_parameters_count: {:d}".format(
                sum(
                    sum(p.numel() for p in part.parameters() if p.requires_grad)
                    for part in counted_parts
                )
            ))
            # fmt: on

            batch = torch.rand(16, 32000)
            indices = torch.randint(0, 5, size=(16,))
            _, x_vector = self.forward({"speech": batch, "emotion": indices})
            # Compare with the configured size rather than the default 256 so that
            # models built with another embedding_size also validate.
            assert x_vector.shape[1] == self.embedding_size

    return Net
# DATA loading hook to add your own data/target
# used by sidekit internally
# you can also call it on your own (i.e.: in the test function)
def get_data_loading_hook(sessions):
    """Return the hook called for every loaded utterance.

    The hook is executed during data loading (done by the CPU in parallel).

    :param sessions: options handed over by the caller; not used by this
        example hook
    :return: a function ``_hook(speech, csv_line, file_ext)`` returning a dict
        with the keys ``"speech"``, ``"F0"`` and ``"emotion"``
    """
    # print(sessions)

    def _hook(speech, csv_line, file_ext):
        # Add a leading dimension to 1-D waveforms so that size(1) below is valid.
        if speech.ndim == 1:
            speech = speech.unsqueeze(0)

        # print(speech.shape, csv_line, file_ext)
        # check for test dset with csv_line["dataset"] == "test"

        # Here you can modify what is returned to the model
        args = {}
        args["speech"] = speech
        # fake F0 extractor: one random value per 320 samples
        args["F0"] = torch.rand((1, speech.size(1) // 320))

        # Fake emotion annotation: a random class index in [0, n_emo)
        n_emo = 5
        indice = torch.randint(0, n_emo, size=(1,))[0]  # (Either 0,1,2,3,4)
        args["emotion"] = indice

        return args

    return _hook
egs/iemocap/config/custom/model.yaml
0 → 100644
View file @
91dfba68
# Model description
speaker_number
:
4
loss
:
type
:
aam
aam_margin
:
0.2
aam_s
:
30
# Warning: this hook is experimental; it breaks some other scripts (extract_xvectors.py, scoring, ...)
data_loading_hook
:
./config/custom/model.py
# Initialize model from file, reset and freeze parts of it
initial_model_name
:
reset_parts
:
[
after_speaker_embedding
]
freeze_parts
:
[]
#[preprocessor,sequence_network,stat_pooling,before_speaker_embedding]
# Model can be fastresnet34, resnet34, xvector, ..
model_type
:
./config/custom/model.py
egs/iemocap/config/custom/modification_yaml.py
0 → 100644
View file @
91dfba68
import
ruamel.yaml
import
argparse
# Command-line tool rewriting the three YAML files of the "custom" configuration
# (model.yaml, training.yaml, Iemocap.yaml) in place for one experiment.
# It must be run from the directory that contains "config/".
parser = argparse.ArgumentParser()
parser.add_argument("session_test", help="The session used in test", type=int)
parser.add_argument("categories", help="The number of categories", type=int)
parser.add_argument("batch", help="The number of batch", type=int)
parser.add_argument("lr", help="The learning rate", type=float)
args = parser.parse_args()

## Preparation of all arguments
# For model.yaml
nb_cate = args.categories
if nb_cate <= 0:
    # Fail with a usage message instead of a ZeroDivisionError below.
    parser.error("categories must be a positive integer")

# For Iemocap.yaml
batch = args.batch
examples = batch // nb_cate  # integer division, no float round trip
session_test = args.session_test

# For training.yaml
lr = args.lr
run_name = f"custom_{nb_cate}emo_{batch}batch_lr-{lr}_Test-IEMOCAP{session_test}"
tmp = f"model_custom/tmp_{run_name}.pt"
best = f"model_custom/best_{run_name}.pt"
# The log uses the "custom" prefix like the model files; it previously used
# the "half_resnet34" prefix and so shared its path with the half_resnet34 runs.
log = f"logs/{run_name}.log"


## Modification of variables in YAML files
def _rewrite_yaml(path, update):
    """Load the YAML file at ``path``, apply ``update(data)`` and write it back in place."""
    yaml = ruamel.yaml.YAML()
    with open(path) as fp:
        data = yaml.load(fp)
    update(data)
    with open(path, "w") as fp:
        yaml.dump(data, fp)


def _update_model(data):
    data["speaker_number"] = int(nb_cate)


def _update_training(data):
    data["lr"] = lr
    data["tmp_model_name"] = tmp
    data["best_model_name"] = best
    data["log_file"] = log


def _update_dataset(data):
    data["batch_size"] = batch
    data["train"]["sampler"]["examples_per_speaker"] = examples
    data["dataset_csv"] = f"list/iemocap_ses{session_test}-test.csv"


# model.yaml
_rewrite_yaml("config/custom/model.yaml", _update_model)
# training.yaml
_rewrite_yaml("config/custom/training.yaml", _update_training)
# Iemocap.yaml
_rewrite_yaml("config/custom/Iemocap.yaml", _update_dataset)
egs/iemocap/config/custom/training.yaml
0 → 100644
View file @
91dfba68
# Training description
# General options
log_file
:
logs/custom.log
torch_seed
:
42
numpy_seed
:
42
random_seed
:
42
deterministic
:
false
epochs
:
100
lr
:
0.0001
patience
:
30
multi_gpu
:
false
num_cpu
:
16
mixed_precision
:
true
clipping
:
false
# Optimizer and scheduler options
optimizer
:
type
:
adam
options
:
scheduler
:
type
:
CyclicLR
mode
:
triangular2
base_lr
:
1.0e-05
step_size_up
:
40000
# Evaluation options
compute_test_eer
:
false
log_interval
:
50
validation_frequency
:
1
# Save options
tmp_model_name
:
model_custom/tmp_custom_4emo_4batch_lr-0.0001_Test-IEMOCAP1.pt
best_model_name
:
model_custom/best_custom_4emo_4batch_lr-0.0001_Test-IEMOCAP1.pt
checkpoint_frequency
:
egs/iemocap/config/half_resnet34/Iemocap.yaml
0 → 100644
View file @
91dfba68
# Dataset description
# General options
data_path
:
/
# path prepended to each wav file listed in dataset_csv
data_file_extension
:
.wav
dataset_csv
:
list/iemocap_ses2-test.csv
sample_rate
:
16000
validation_ratio
:
0.02
batch_size
:
4
# Training set
train
:
duration
:
3
chunk_per_segment
:
-1
overlap
:
3
sampler
:
examples_per_speaker
:
1
samples_per_speaker
:
192
augmentation_replica
:
1
transform_number
:
1
transformation
:
pipeline
:
add_reverb,add_noise,filtering,phone_filtering,codec
add_noise
:
noise_db_csv
:
list/musan.csv
data_path
:
/
add_reverb
:
rir_db_csv
:
list/reverb.csv
data_path
:
/
# Validation set
valid
:
duration
:
3
transformation
:
pipeline
:
# no transformation
add_noise
:
noise_db_csv
:
list/musan.csv
data_path
:
/
# Test set (set 'compute_test_eer' to true in training.yaml)
test
:
idmap
:
./list/asv_test/voxceleb1-O-clean_idmap.h5
ndx
:
./list/asv_test/voxceleb1-O-clean_ndx.h5
key
:
./list/asv_test/voxceleb1-O-clean_key.h5
data_path
:
.
id2wav
:
./data/asv_test_voxceleb1/voxceleb1-O-clean.id2wav
egs/iemocap/config/half_resnet34/model.yaml
0 → 100644
View file @
91dfba68
# Model description
speaker_number
:
4
loss
:
type
:
aam
aam_margin
:
0.2
aam_s
:
30
# Initialize model from file, reset and freeze parts of it
initial_model_name
:
#/srv/storage/talc@talc-data.nancy/multispeech/calcul/users/hnourtel/sidekit/best_halp_clr_adam_aam0.2_30_b256_vox12.pt_epoch201
reset_parts
:
[
after_speaker_embedding
]
freeze_parts
:
[]
# Model can be fastresnet34, resnet34, xvector, ..
model_type
:
halfresnet34
egs/iemocap/config/half_resnet34/modification_yaml.py
0 → 100644
View file @
91dfba68
import
ruamel.yaml
import
argparse
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"session_test"
,
help
=
"The session used in test"
,
type
=
int
)
parser
.
add_argument
(
"categories"
,
help
=
"The number of categories"
,
type
=
int
)
parser
.
add_argument
(
"batch"
,
help
=
"The number of batch"
,
type
=
int
)
parser
.
add_argument
(
"lr"
,
help
=
"The learning rate"
,
type
=
float
)
args
=
parser
.
parse_args
()
## Preparation of all arguments
# For model.yaml
nb_cate
=
args
.
categories
# For Iemocap.yaml
batch
=
args
.
batch
examples
=
int
(
batch
/
nb_cate
)
session_test
=
args
.
session_test
# For training.yaml
lr
=
args
.
lr
tmp
=
"model_half_resnet34/tmp_half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt"
.
format
(
nb_cate
,
batch
,
lr
,
session_test
)
best
=
"model_half_resnet34/best_half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.pt"
.
format
(
nb_cate
,
batch
,
lr
,
session_test
)
log
=
"logs/half_resnet34_{}emo_{}batch_lr-{}_Test-IEMOCAP{}.log"
.
format
(
nb_cate
,
batch
,
lr
,
session_test
)
## Modification of variables in YAML files
# model.yaml
yaml
=
ruamel
.
yaml
.
YAML
()
with
open
(
"config/half_resnet34/model.yaml"
)
as
fp
:
data
=
yaml
.
load
(
fp
)
data
[
'speaker_number'
]
=
int
(
nb_cate
)
with
open
(
"config/half_resnet34/model.yaml"
,
'w'
)
as
fp
:
yaml
.
dump
(
data
,
fp
)
# training.yaml
yaml
=
ruamel
.
yaml
.
YAML
()
with
open
(
"config/half_resnet34/training.yaml"
)
as
fp
:
data
=
yaml
.
load
(
fp
)
data
[
'lr'
]
=
lr
data
[
"tmp_model_name"
]
=
tmp
data
[
"best_model_name"
]
=
best
data
[
"log_file"
]
=
log
with
open
(
"config/half_resnet34/training.yaml"
,
'w'
)
as
fp
:
yaml
.
dump
(
data
,
fp
)
# Iemocap.yaml
yaml
=
ruamel
.
yaml
.
YAML
()
with
open
(
"config/half_resnet34/Iemocap.yaml"
)
as
fp
:
data
=
yaml
.
load
(
fp
)