Commit 4ac596f8 authored by Anthony Larcher's avatar Anthony Larcher
Browse files

WavLM + ECAPA

parent 51cc030f
......@@ -119,7 +119,7 @@ from .jfa_scoring import jfa_scoring
from .score_normalization import znorm
from .score_normalization import tnorm
from .score_normalization import ztnorm
from .score_normalization import snorm
from .score_normalization import asnorm
from .sidekit_io import write_norm_hdf5
from .sidekit_io import write_matrix_hdf5
......
......@@ -262,15 +262,15 @@ def data_augmentation(speech,
if "codec" in augmentations:
final_shape = speech.shape[1]
configs = [
({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
({"format": "wav", "encoding": 'ALAW', "bits_per_sample": 8}, "8 bit a-law"),
({"format": "mp3", "compression": -9}, "MP3"),
({"format": "vorbis", "compression": -1}, "Vorbis")
]
param, title = random.choice(configs)
speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)
speech = speech[:, :final_shape]
configs = [
({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
({"format": "wav", "encoding": 'ALAW', "bits_per_sample": 8}, "8 bit a-law"),
({"format": "mp3", "compression": -9}, "MP3"),
({"format": "vorbis", "compression": -1}, "Vorbis")
]
param, title = random.choice(configs)
speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)
speech = speech[:, :final_shape]
return speech
......
......@@ -305,7 +305,7 @@ def ECAPA_TDNN_SMALL(feat_dim,
sr=sr,
feature_selection=feature_selection,
update_extract=update_extract,
config_path=config_path):
config_path=config_path)
if __name__ == '__main__':
x = torch.zeros(2, 32000)
......
......@@ -45,6 +45,7 @@ from .ecapa_tdnn import ECAPA_TDNN_SMALL
from .pooling import MeanStdPooling
from .pooling import AttentivePooling, ChannelWiseCorrPooling
from .pooling import GruPooling
from .preprocessor import WavLmFrontEnd
from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd
from .preprocessor import RawPreprocessor
......@@ -253,9 +254,16 @@ def test_metrics(model,
cohort = model.module.after_speaker_embedding.weight.data
cohort_xv = torch.nn.functional.normalize(cohort, dim=1).cpu()
tsr = torch.nn.functional.normalize(torch.FloatTensor(xv_stat.stat1), dim=1)
s_enrol_test_scores = asnorm(tsr.cpu(), cohort_xv, ndx)
s_enrol_test_scores= s_enrol_test_scores[ndx.trialmask]
norm_pmiss, norm_pfa = rocch(s_enrol_test_scores[key.tar], s_enrol_test_scores[key.non])
return rocch2eer(pmiss, pfa), rocch2eer(norm_pmiss, norm_pfa)
else:
return rocch2eer(pmiss, pfa)
......@@ -432,6 +440,8 @@ class Xtractor(torch.nn.Module):
self.feature_size = None
self.norm_embedding = norm_embedding
print(f"MODEL = {model_archi}")
if model_archi == "xvector":
self.input_nbdim = 2
......@@ -580,11 +590,10 @@ class Xtractor(torch.nn.Module):
self.before_speaker_embedding_weight_decay = 0.00002
self.after_speaker_embedding_weight_decay = 0.000
elif model_archi == "wavlmEcapa":
elif model_archi == "wavlmecapa":
self.embedding_size = embedding_size
self.preprocessor = MelSpecFrontEnd()
self.sequence_network = ECAPA_TDNN(80,
emb_dim=self.embedding_size,
self.preprocessor = WavLmFrontEnd()
self.sequence_network = ECAPA_TDNN(1024,
feat_type='fbank',
sr=16000,
feature_selection="hidden_states",
......@@ -882,7 +891,7 @@ class Xtractor(torch.nn.Module):
else:
return self.after_speaker_embedding(x), x
elif self.loss in ['aam', 'aps']:
elif self.loss in ['aam', 'aps', 'circle']:
x = self.after_speaker_embedding(x, target=target), torch.nn.functional.normalize(x, dim=1)
elif self.loss == 'smn':
if not is_eval:
......@@ -1093,7 +1102,7 @@ def get_network(model_opts, local_rank):
:return: the neural network
"""
if model_opts["model_type"] in ["xvector", "rawnet2", "resnet34", "fastresnet34", "halfresnet34", "experimental"]:
if model_opts["model_type"] in ["xvector", "rawnet2", "resnet34", "fastresnet34", "halfresnet34", "wavlmecapa"]:
model = Xtractor(model_opts["speaker_number"], model_opts["model_type"], loss=model_opts["loss"]["type"], embedding_size=model_opts["embedding_size"])
else:
# Custom type of model
......@@ -1654,6 +1663,10 @@ def train_epoch(model,
output_tuple, _ = model(data, target=target)
output, no_margin_output = output_tuple
loss = criterion(output, target)
elif loss_criteria == "circle":
output_tuple, _ = model(data, target=target)
output, no_margin_output = output_tuple
loss = criterion(output, target)
elif loss_criteria == 'smn':
output_tuple, _ = model(data, target=target)
loss, output = output_tuple
......
......@@ -124,9 +124,7 @@ def asnorm(enrol_xv, cohort_xv, ndx):
:return:the normalized scores
"""
# Compute cosine similarity
enrol_xv = torch.nn.functional.normalize(enrol_xv, dim=1)
enrol_test_scores = torch.einsum('ij,kj', enrol_xv, enrol_xv).numpy()
enrol_test_scores = enrol_test_scores[ndx.trialmask]
cohort_xv = torch.nn.functional.normalize(cohort_xv, dim=1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment