Commit 39e464a5 authored by Anthony Larcher

refactoring

parent c73b1d5b
......@@ -28,6 +28,7 @@ Copyright 2014-2021 Anthony Larcher
import collections
import math
import numpy
from scipy import signal
import pandas
import random
import soundfile
......@@ -459,10 +460,7 @@ def data_augmentation(speech,
"""
# Select the data augmentation randomly
if len(transform_dict.keys()) >= transform_number:
aug_idx = numpy.arange(len(transform_dict.keys()))
else:
aug_idx = random.choice(numpy.arange(len(transform_dict.keys())), k=transform_number)
aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
augmentations = numpy.array(list(transform_dict.keys()))[aug_idx]
if "phone_filtering" in augmentations:
......@@ -481,12 +479,10 @@ def data_augmentation(speech,
speech = strech(speech, rate)
if "add_reverb" in augmentations:
rir_nfo = random.randrange(len(rir_df))
rir_fn = transform_dict["add_noise"]["data_path"] + "/" + rir_nfo + ".wav"
rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id
rir_fn = transform_dict["add_reverb"]["data_path"] + "/" + rir_nfo + ".wav"
rir, rir_fs = torchaudio.load(rir_fn)
rir = rir[rir_nfo[1], :] #keep selected channel
speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0))
speech = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]])
if "add_noise" in augmentations:
# Pick a noise type
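The add_reverb branch above now applies the impulse response with scipy's full convolution and truncates the result back to the input length. A minimal sketch of that operation, assuming (channels, samples) tensors and a made-up impulse response:

import numpy
import torch
from scipy import signal

speech = torch.randn(1, 16000)              # one second of placeholder speech at 16 kHz
rir = numpy.random.randn(1, 4000) * 0.01    # hypothetical room impulse response

# Full convolution lengthens the signal by rir.shape[1] - 1 samples,
# so the output is cut back to the original number of samples.
reverberated = signal.convolve(speech.numpy(), rir, mode='full')[:, :speech.shape[1]]
reverberated = torch.tensor(reverberated, dtype=torch.float32)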
......@@ -499,7 +495,7 @@ def data_augmentation(speech,
# TODO make SNRs configurable by noise type
snr_db = random.randint(13, 20)
pick_count = random.randint(3, 7)
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
index_list = random.sample(range(noise_df.loc['speech'].shape[0]), k=pick_count)
for idx in index_list:
noise_row = noise_df.loc['speech'].iloc[idx]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
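The noise branch above draws an SNR between 13 and 20 dB and sums several noise segments; load_noise_seg itself is collapsed in this view. A hedged sketch of the usual way a noise signal is scaled to a target SNR before mixing (the helper name below is illustrative, not the commit's):

import torch

def mix_at_snr(speech, noise, snr_db):
    # Scale the noise so that 10 * log10(P_speech / P_noise) equals snr_db, then add it.
    speech_power = speech.pow(2).mean()
    noise_power = noise.pow(2).mean().clamp(min=1e-10)
    gain = torch.sqrt(speech_power / (noise_power * 10 ** (snr_db / 10)))
    return speech + gain * noise

noisy = mix_at_snr(torch.randn(1, 16000), torch.randn(1, 16000), snr_db=15)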
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
"""
import os
import torch
os.environ['MKL_THREADING_LAYER'] = 'GNU'
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2021 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reS'
class MeanStdPooling(torch.nn.Module):
"""
Mean and Standard deviation pooling
"""
def __init__(self):
"""
"""
super(MeanStdPooling, self).__init__()
pass
def forward(self, x):
"""
:param x:
:return:
"""
mean = torch.mean(x, dim=2)
std = torch.std(x, dim=2)
return torch.cat([mean, std], dim=1)
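A short usage sketch for the class above: it collapses the time axis of a (batch, features, frames) tensor into concatenated mean and standard deviation statistics.

import torch

pooling = MeanStdPooling()
frames = torch.randn(8, 256, 200)   # (batch, features, frames)
embedding = pooling(frames)
print(embedding.shape)              # torch.Size([8, 512]): mean and std concatenated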
class AttentivePooling(torch.nn.Module):
"""
Mean and Standard deviation attentive pooling
"""
def __init__(self, num_channels, n_mels, reduction=2, global_context=False):
"""
"""
# TODO Make global_context configurable (True/False)
# TODO Make convolution parameters configurable
super(AttentivePooling, self).__init__()
in_factor = 3 if global_context else 1
self.attention = torch.nn.Sequential(
torch.nn.Conv1d(num_channels * (n_mels//8) * in_factor, num_channels//reduction, kernel_size=1),
torch.nn.ReLU(),
torch.nn.BatchNorm1d(num_channels//reduction),
torch.nn.Tanh(),
torch.nn.Conv1d(num_channels//reduction, num_channels * (n_mels//8), kernel_size=1),
torch.nn.Softmax(dim=2),
)
self.global_context = global_context
self.gc = MeanStdPooling()
def new_parameter(self, *size):
out = torch.nn.Parameter(torch.FloatTensor(*size))
torch.nn.init.xavier_normal_(out)
return out
def forward(self, x):
"""
:param x:
:return:
"""
if self.global_context:
w = self.attention(torch.cat([x, self.gc(x).unsqueeze(2).repeat(1, 1, x.shape[-1])], dim=1))
else:
w = self.attention(x)
mu = torch.sum(x * w, dim=2)
rh = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-5) )
x = torch.cat((mu, rh),1)
x = x.view(x.size()[0], -1)
return x
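Usage sketch for AttentivePooling: with num_channels=256 and n_mels=80 the module expects 256 * (80 // 8) = 2560 input channels, for instance the flattened feature maps of a 2D convolutional front-end; with global_context=True the mean/std context is concatenated before the attention weights are computed.

import torch

pooling = AttentivePooling(num_channels=256, n_mels=80, reduction=2, global_context=True)
x = torch.randn(4, 256 * (80 // 8), 150)   # (batch, channels, frames)
stats = pooling(x)
print(stats.shape)                          # torch.Size([4, 5120]): attention-weighted mean and std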
class GruPooling(torch.nn.Module):
"""
"""
def __init__(self, input_size, gru_node, nb_gru_layer):
"""
:param input_size:
:param gru_node:
:param nb_gru_layer:
"""
super(GruPooling, self).__init__()
self.lrelu_keras = torch.nn.LeakyReLU(negative_slope = 0.3)
self.bn_before_gru = torch.nn.BatchNorm1d(num_features = input_size)
self.gru = torch.nn.GRU(input_size = input_size,
hidden_size = gru_node,
num_layers = nb_gru_layer,
batch_first = True)
def forward(self, x):
"""
:param x:
:return:
"""
x = self.bn_before_gru(x)
x = self.lrelu_keras(x)
x = x.permute(0, 2, 1) #(batch, filt, time) >> (batch, time, filt)
self.gru.flatten_parameters()
x, _ = self.gru(x)
x = x[:, -1, :]
return x
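Usage sketch for GruPooling: the input is batch-normalised, passed through a leaky ReLU, reordered to (batch, time, filters) and summarised by the last hidden state of the GRU.

import torch

pooling = GruPooling(input_size=128, gru_node=256, nb_gru_layer=1)
x = torch.randn(4, 128, 300)   # (batch, filters, frames)
embedding = pooling(x)
print(embedding.shape)          # torch.Size([4, 256]): last GRU hidden state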
......@@ -242,8 +242,7 @@ class SideSet(Dataset):
self.rir_df = None
if "add_reverb" in self.transform:
# load the RIR database
tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
self.rir_df = zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist())
self.rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
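With the change above the RIR database is kept as a plain DataFrame rather than a zip iterator (which cannot be indexed and is exhausted after one pass), so a random row can be drawn on every augmentation call. A small sketch with made-up CSV content; the real file is expected to contain at least a file_id column:

import random
import pandas

rir_df = pandas.DataFrame({"file_id": ["rir_0001", "rir_0002", "rir_0003"]})   # stand-in for rir_db_csv
row = rir_df.iloc[random.randrange(rir_df.shape[0])]
rir_fn = "/path/to/rir_data" + "/" + row.file_id + ".wav"                      # hypothetical data_path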
def __getitem__(self, index):
"""
......@@ -314,7 +313,7 @@ class IdMapSet(Dataset):
window_len=24000,
window_shift=8000,
sample_rate=16000,
min_duration=0.150
min_duration=0.165
):
"""
......
......@@ -43,6 +43,9 @@ import yaml
from collections import OrderedDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from .pooling import MeanStdPooling
from .pooling import AttentivePooling
from .pooling import GruPooling
from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd
from .preprocessor import RawPreprocessor
......@@ -67,6 +70,8 @@ from .loss import ArcMarginProduct
from ..sidekit_io import init_logging
torch.backends.cudnn.benchmark = True
os.environ['MKL_THREADING_LAYER'] = 'GNU'
__license__ = "LGPL"
......@@ -78,17 +83,6 @@ __status__ = "Production"
__docformat__ = 'reS'
#logging.basicConfig(format='%(asctime)s %(message)s')
# Make PyTorch Deterministic
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
numpy.random.seed(0)
def eer(negatives, positives):
"""Logarithmic complexity EER computation
......@@ -424,63 +418,6 @@ class TrainingMonitor():
self.current_patience -= 1
class MeanStdPooling(torch.nn.Module):
"""
Mean and Standard deviation pooling
"""
def __init__(self):
"""
"""
super(MeanStdPooling, self).__init__()
pass
def forward(self, x):
"""
:param x:
:return:
"""
mean = torch.mean(x, dim=2)
std = torch.std(x, dim=2)
return torch.cat([mean, std], dim=1)
class GruPooling(torch.nn.Module):
"""
"""
def __init__(self, input_size, gru_node, nb_gru_layer):
"""
:param input_size:
:param gru_node:
:param nb_gru_layer:
"""
super(GruPooling, self).__init__()
self.lrelu_keras = torch.nn.LeakyReLU(negative_slope = 0.3)
self.bn_before_gru = torch.nn.BatchNorm1d(num_features = input_size)
self.gru = torch.nn.GRU(input_size = input_size,
hidden_size = gru_node,
num_layers = nb_gru_layer,
batch_first = True)
def forward(self, x):
"""
:param x:
:return:
"""
x = self.bn_before_gru(x)
x = self.lrelu_keras(x)
x = x.permute(0, 2, 1) #(batch, filt, time) >> (batch, time, filt)
self.gru.flatten_parameters()
x, _ = self.gru(x)
x = x[:, -1, :]
return x
class Xtractor(torch.nn.Module):
"""
    Class that defines an x-vector extractor based on 5 convolutional layers and a mean and standard deviation pooling
......@@ -614,11 +551,11 @@ class Xtractor(torch.nn.Module):
elif self.loss == 'aps':
self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number))
self.preprocessor_weight_decay = 0.000
self.sequence_network_weight_decay = 0.000
self.stat_pooling_weight_decay = 0.000
self.before_speaker_embedding_weight_decay = 0.00
self.after_speaker_embedding_weight_decay = 0.00
self.preprocessor_weight_decay = 0.00002
self.sequence_network_weight_decay = 0.00002
self.stat_pooling_weight_decay = 0.00002
self.before_speaker_embedding_weight_decay = 0.00002
self.after_speaker_embedding_weight_decay = 0.0002
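The per-module weight decay attributes changed above are typically consumed by building optimizer parameter groups. A hedged sketch of that pattern; the group names mirror this file, but the toy module below is purely illustrative:

import torch

class TinyXtractor(torch.nn.Module):
    # Illustrative stand-in with the same submodule names as Xtractor.
    def __init__(self):
        super().__init__()
        self.sequence_network = torch.nn.Conv1d(80, 256, kernel_size=5)
        self.before_speaker_embedding = torch.nn.Linear(512, 256)
        self.after_speaker_embedding = torch.nn.Linear(256, 100)

model = TinyXtractor()
optimizer = torch.optim.SGD([
    {"params": model.sequence_network.parameters(), "weight_decay": 0.00002},
    {"params": model.before_speaker_embedding.parameters(), "weight_decay": 0.00002},
    {"params": model.after_speaker_embedding.parameters(), "weight_decay": 0.0002},
], lr=0.01, momentum=0.9)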
elif model_archi == "rawnet2":
......@@ -855,18 +792,8 @@ class Xtractor(torch.nn.Module):
m=0.2,
easy_margin=True)
#self.after_speaker_embedding = ArcLinear(input_size,
# self.speaker_number,
# margin=self.aam_margin,
# s=self.aam_s)
#self.after_speaker_embedding = ArcFace(embedding_size=input_size,
# classnum=self.speaker_number,
# s=64.,
# m=0.5)
self.after_speaker_embedding_weight_decay = cfg["after_embedding"]["weight_decay"]
def forward(self, x, is_eval=False, target=None, extract_after_pooling=False):
"""
......@@ -888,9 +815,6 @@ class Xtractor(torch.nn.Module):
x = self.before_speaker_embedding(x)
if self.norm_embedding:
#x_norm = x.norm(p=2,dim=1, keepdim=True) / 10. # Why 10. ?
#x_norm = torch.linalg.norm(x, ord=2, dim=1, keepdim=True, out=None, dtype=None)
#x = torch.div(x, x_norm)
x = l2_norm(x)
if self.loss == "cce":
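norm_embedding now relies on a single l2_norm call instead of the commented-out manual normalisation. A sketch of what such a helper typically computes (the actual SIDEKIT implementation lives elsewhere in the package):

import torch

def l2_norm_sketch(x, eps=1e-10):
    # Scale every embedding to unit Euclidean length along the feature dimension.
    return x / (x.norm(p=2, dim=1, keepdim=True) + eps)

embeddings = torch.randn(4, 256)
print(l2_norm_sketch(embeddings).norm(dim=1))   # all values close to 1.0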
......@@ -1058,6 +982,7 @@ def update_training_dictionary(dataset_description,
training_opts["compute_test_eer"] = False
training_opts["log_interval"] = 10
training_opts["validation_frequency"] = 1
training_opts["tmp_model_name"] = "tmp_model.pt"
training_opts["best_model_name"] = "best_model.pt"
......@@ -1139,15 +1064,11 @@ def get_loaders(dataset_opts, training_opts, speaker_number):
    First we load the dataframe from the CSV file in order to split it into training and validation sets.
    Then we build the two corresponding SideSet datasets and their data loaders.
"""
#with open(dataset_yaml, "r") as fh:
# dataset_params = yaml.load(fh, Loader=yaml.FullLoader)
# df = pandas.read_csv(dataset_params["dataset_description"])
df = pandas.read_csv(dataset_opts["dataset_csv"])
training_df, validation_df = train_test_split(df, test_size=dataset_opts["validation_ratio"] , stratify=df["speaker_idx"])
torch.manual_seed(training_opts['seed'])
training_df, validation_df = train_test_split(df,
test_size=dataset_opts["validation_ratio"],
stratify=df["speaker_idx"])
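A quick sketch of the stratified split above with a toy dataframe: stratifying on speaker_idx keeps every speaker represented in both partitions.

import pandas
from sklearn.model_selection import train_test_split

df = pandas.DataFrame({"file_id": ["utt_%04d" % i for i in range(100)],
                       "speaker_idx": [i % 10 for i in range(100)]})
training_df, validation_df = train_test_split(df, test_size=0.1, stratify=df["speaker_idx"])
print(len(training_df), len(validation_df))   # 90 10, one validation utterance per speaker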
training_set = SideSet(dataset_opts,
set_type="train",
......@@ -1164,8 +1085,8 @@ def get_loaders(dataset_opts, training_opts, speaker_number):
side_sampler = SideSampler(training_set.sessions['speaker_idx'],
speaker_number,
1,
100,
dataset_opts["train"]["sampler"]["examples_per_speaker"],
dataset_opts["train"]["sampler"]["samples_per_speaker"],
dataset_opts["batch_size"])
training_loader = DataLoader(training_set,
......@@ -1192,9 +1113,6 @@ def get_loaders(dataset_opts, training_opts, speaker_number):
# Select a subset of non-target trials to reduce the number of tests
tar_non_ratio = numpy.sum(tar_indices)/numpy.sum(non_indices)
#non_indices *= numpy.random.choice([False, True],
# size=non_indices.shape,
# p=[1-tar_non_ratio, tar_non_ratio])
non_indices *= (numpy.random.rand(*non_indices.shape) < tar_non_ratio)
return training_loader, validation_loader, tar_indices, non_indices
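The vectorised mask above keeps each non-target trial with probability tar_non_ratio, so the retained non-target count roughly matches the number of target trials. A small self-contained sketch of the same idea:

import numpy

tar_indices = numpy.eye(50, dtype=bool)                  # 50 target trials on the diagonal
non_indices = ~tar_indices                               # 2450 non-target trials
tar_non_ratio = numpy.sum(tar_indices) / numpy.sum(non_indices)
non_indices *= (numpy.random.rand(*non_indices.shape) < tar_non_ratio)
print(numpy.sum(tar_indices), numpy.sum(non_indices))    # 50 targets and roughly 50 kept non-targets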
......@@ -1317,8 +1235,7 @@ def new_xtrain(dataset_description,
**kwargs):
"""
REFACTORING
    - test the logging
    - a function that takes the model and returns an optimizer and a scheduler
    - refine the logging
"""
dataset_opts, model_opts, training_opts = update_training_dictionary(dataset_description,
model_description,
......@@ -1333,23 +1250,33 @@ def new_xtrain(dataset_description,
best_eer=100,
compute_test_eer=training_opts["compute_test_eer"])
    # Display the full configuration as YAML dictionaries
monitor.logger.info(yaml.dump(dataset_opts, default_flow_style=False))
monitor.logger.info(yaml.dump(model_opts, default_flow_style=False))
monitor.logger.info(yaml.dump(training_opts, default_flow_style=False))
# Make PyTorch Deterministic
torch.backends.cudnn.deterministic = False
if training_opts["deterministic"]:
torch.backends.cudnn.deterministic = True
# Set all the seeds
numpy.random.seed(training_opts["seed"]) # Set the random seed of numpy for the data split.
torch.manual_seed(training_opts["seed"])
torch.cuda.manual_seed(training_opts["seed"])
    # Display the full configuration as YAML dictionaries
monitor.logger.info("\n*********************************\nDataset options\n*********************************\n")
monitor.logger.info(yaml.dump(dataset_opts, default_flow_style=False))
monitor.logger.info("\n*********************************\nModel options\n*********************************\n")
monitor.logger.info(yaml.dump(model_opts, default_flow_style=False))
monitor.logger.info("\n*********************************\nTraining options\n*********************************\n")
monitor.logger.info(yaml.dump(training_opts, default_flow_style=False))
# Test to optimize
torch.autograd.profiler.emit_nvtx(enabled=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize the model
model = get_network(model_opts)
speaker_number = model.speaker_number
embedding_size = model.embedding_size
if torch.cuda.device_count() > 1 and training_opts["multi_gpu"]:
model = torch.nn.DataParallel(model)
......@@ -1399,7 +1326,7 @@ def new_xtrain(dataset_description,
val_acc, val_loss, val_eer = cross_validation(model,
validation_loader,
device,
[validation_loader.dataset.__len__(), model_opts["embedding_size"]],
[validation_loader.dataset.__len__(), embedding_size],
validation_tar_indices,
validation_non_indices,
training_opts["mixed_precision"])
......@@ -2014,7 +1941,6 @@ def cross_validation(model, validation_loader, device, validation_shape, tar_ind
#classes[cursor:cursor + batch_size] = target.detach().cpu()
cursor += batch_size
#print(classes.shape[0])
local_device = "cpu" if embeddings.shape[0] > 3e4 else device
embeddings = embeddings.to(local_device)
scores = torch.einsum('ij,kj', embeddings, embeddings).cpu().numpy()
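The einsum above computes every pairwise dot product between validation embeddings in a single call, giving the full trial score matrix. A small sketch; with L2-normalised embeddings these dot products are cosine similarities:

import torch

embeddings = torch.nn.functional.normalize(torch.randn(5, 256), dim=1)
scores = torch.einsum('ij,kj', embeddings, embeddings)   # (5, 5) matrix of cosine similarities
print(scores.shape, scores.diag())                        # diagonal entries are close to 1.0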
......@@ -2028,7 +1954,6 @@ def cross_validation(model, validation_loader, device, validation_shape, tar_ind
equal_error_rate = rocch2eer(pmiss, pfa)
return (100. * accuracy.cpu().numpy() / validation_shape[0],
loss.cpu().numpy() / ((batch_idx + 1) * batch_size),
loss.cpu().numpy() / ((batch_idx + 1) * batch_size),
equal_error_rate)
......