Commit be3c8478 authored by Le Lan Gaël's avatar Le Lan Gaël
Browse files

Merge branch 'master' into dev-gl3lan

parents 9ab7a6b8 88f4d2b9
......@@ -162,12 +162,9 @@ if SIDEKIT_CONFIG["cuda"]:
if CUDA:
from .nnet import FForwardNetwork
from .nnet import kaldi_to_hdf5
from .nnet import Xtractor
from .nnet import xtrain
from .nnet import extract_embeddings
from .nnet import extract_sliding_embedding
from .nnet import ResBlock
from .nnet import SincNet
......@@ -190,5 +187,5 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__version__="1.3.8.5.2"
__version__="1.4"
......@@ -310,8 +310,7 @@ class FeaturesExtractor(object):
dir_name = os.path.dirname(feature_filename) # get the path
if not os.path.exists(dir_name) and not (dir_name == ''):
os.makedirs(dir_name)
h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')
h5f = h5py.File(feature_filename, 'w', backing_store=backing_store, driver='core')
if "cep" not in self.save_param:
cep = None
cep_mean = None
......
......@@ -221,7 +221,7 @@ class FeaturesServer(object):
feat = pca_dct(feat, self.dct_pca_config[0], self.dct_pca_config[1], self.dct_pca_config[2])
elif self.sdc:
feat = shifted_delta_cepstral(feat, d=self.sdc_config[0], p=self.sdc_config[1], k=self.sdc_config[2])
# Apply a mask on the features
if self.mask is not None:
feat = self._mask(feat)
......@@ -488,6 +488,7 @@ class FeaturesServer(object):
feat, label = self.post_processing(feat, label, global_mean, global_std)
else:
feat, label = self.post_processing(feat, label)
return feat, label
def get_features_per_speaker(self, show, idmap, channel=0, input_feature_filename=None, label=None):
......
......@@ -28,15 +28,12 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
from .feed_forward import FForwardNetwork
from .feed_forward import kaldi_to_hdf5
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSet
from .xsets import SideSampler
from .xvector import Xtractor
from .xvector import xtrain
from .xvector import extract_embeddings
from .xvector import extract_sliding_embedding
from .pooling import MeanStdPooling
from .pooling import AttentivePooling
from .pooling import GruPooling
......@@ -49,15 +46,6 @@ from .preprocessor import RawPreprocessor
from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd
has_pyroom = True
try:
import pyroomacoustics
except ImportError:
has_pyroom = False
if has_pyroom:
from .augmentation import AddReverb
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
......
......@@ -26,21 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""
import collections
import math
import numpy
from scipy import signal
import pandas
import random
import soundfile
import torch
import torchaudio
has_pyroom = True
try:
import pyroomacoustics
except ImportError:
has_pyroom = False
from scipy import signal
__author__ = "Anthony Larcher and Sylvain Meignier"
......@@ -55,8 +46,10 @@ __docformat__ = 'reStructuredText'
Noise = collections.namedtuple('Noise', 'type file_id duration')
class PreEmphasis(torch.nn.Module):
"""
Apply pre-emphasis filtering
"""
def __init__(self, coef: float = 0.97):
super().__init__()
......@@ -67,13 +60,18 @@ class PreEmphasis(torch.nn.Module):
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
)
def forward(self, input_signal: torch.Tensor) -> torch.Tensor:
    """
    Forward pass of the pre-emphasis filtering

    :param input_signal: the input signal; it must be a 2-dimensional tensor (checked by the assertion below)
    :return: the filtered signal, as a 2-dimensional tensor
    """
    assert len(input_signal.size()) == 2, 'The number of dimensions of input tensor must be 2!'
    # conv1d works on 3-dimensional input: insert a singleton dimension at position 1
    input_signal = input_signal.unsqueeze(1)
    # reflect padding (one sample on the left) to match lengths of in/out
    input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
    # Drop the singleton dimension that was inserted before the convolution
    return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
class FrequencyMask(object):
......@@ -115,24 +113,26 @@ class TemporalMask(object):
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
def normalize(wav):
    """
    Scale a waveform by its root mean square (RMS) value

    The waveform is only rescaled: its mean is not removed.

    :param wav: the input waveform
    :return: the normalized waveform
    """
    # The 1e-8 term keeps the division defined when the waveform is all zeros
    return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
def crop(input_signal, duration):
    """
    Select a chunk from an audio segment

    The starting position of the chunk is drawn at random.

    :param input_signal: signal to select a chunk from; the chunk is taken along its first axis
    :param duration: duration of the chunk to select, counted in elements of the first axis
    :return: the selected chunk
    :raises ValueError: raised by random.randint when duration is larger than the length of the signal
    """
    # randint includes both bounds, so the last possible start is (length - duration)
    start = random.randint(0, input_signal.shape[0] - duration)
    chunk = input_signal[start: start + duration]
    return chunk
......@@ -141,13 +141,23 @@ def data_augmentation(speech,
transform_dict,
transform_number,
noise_df=None,
rir_df=None):
rir_df=None,
babble_noise=True):
"""
Perform data augmentation on an input signal.
Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
dictionary of possible transformations.
:param speech:
:param transform_dict:
:param transform_number:
:return:
:param speech: the input signal to be augmented
:param sample_rate: sampling rate of the input signal to augment
:param transform_dict: the dictionary of possible augmentations to apply
:param transform_number: the number of transformations to apply on each chunk
:param rir_df: a pandas dataframe object including the list of RIR signals to choose from; default is None
:param noise_df: a pandas dataframe object including the list of NOISE signals to choose from; default is None
:param babble_noise: boolean that enables the use of babble noise, True by default (typically set to False when
the task includes overlapping speech detection).
:return: augmented signal
tranformation
pipeline: add_noise,add_reverb
......@@ -158,7 +168,6 @@ def data_augmentation(speech,
rir_db_csv: filename.csv
codec: true
phone_filtering: true
"""
# Select the data augmentation randomly
aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
......@@ -183,7 +192,10 @@ def data_augmentation(speech,
if "add_noise" in augmentations:
# Pick a noise type
noise = torch.zeros_like(speech)
noise_idx = random.randrange(3)
if not babble_noise:
noise_idx = random.randrange(1, 3)
else:
noise_idx = random.randrange(0, 4)
# speech
if noise_idx == 0:
......@@ -206,6 +218,19 @@ def data_augmentation(speech,
snr_db = random.randint(0, 15)
noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
# babble noise with different volume
elif noise_idx == 3:
snr_db = random.randint(13,20)
pick_count = random.randint(5,10) # Randomly select 5 to 10 speakers
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
noise = torch.zeros(1,speech.shape[1])
for idx in index_list:
noise_row = noise_df.loc['speech'].iloc[idx]
noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
transform = torchaudio.transforms.Vol(gain=random.randint(5,15),gain_type='db') # Randomly select volume level (5-15d)
noise += transform(noise_)
noise /= pick_count
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
......@@ -229,10 +254,10 @@ def data_augmentation(speech,
effects = [
["bandpass","2000","3500"],
["bandstop","200","500"]]
speech,sample_rate = torchaudio.sox_eefects.apply_effects_tensor(
speech, sample_rate = torchaudio.sox_eefects.apply_effects_tensor(
speech,
sample_rate,
effects = [effects[random.randint(0,1)]],
effects=[effects[random.randint(0, 1)]],
)
if "codec" in augmentations:
......@@ -251,6 +276,15 @@ def data_augmentation(speech,
return speech
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
"""
Pick a noise signal to add while performing data augmentation
:param noise_row: a row from a Pandas dataframe object
:param speech_shape: shape of the speech signal to be augmented
:param sample_rate: sampling rate of the speech signal to be augmented
:param data_path: directory where to load the noise file from
:return:
"""
noise_start = noise_row['start']
noise_duration = noise_row['duration']
noise_file_id = noise_row['file_id']
......
This diff is collapsed.
......@@ -26,17 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""
import h5py
import logging
import math
import sys
import numpy
import torch
import torch.optim as optim
import torch.multiprocessing as mp
from collections import OrderedDict
from ..bosaris import IdMap
from ..statserver import StatServer
from torch.nn import Parameter
......@@ -52,6 +47,9 @@ __docformat__ = 'reS'
class ArcMarginModel(torch.nn.Module):
"""
"""
def __init__(self, args):
super(ArcMarginModel, self).__init__()
......@@ -68,6 +66,12 @@ class ArcMarginModel(torch.nn.Module):
self.mm = math.sin(math.pi - self.m) * self.m
def forward(self, input, label):
"""
:param input:
:param label:
:return:
"""
x = F.normalize(input)
W = F.normalize(self.weight)
cosine = F.linear(x, W)
......@@ -85,12 +89,21 @@ class ArcMarginModel(torch.nn.Module):
def l2_norm(input, axis=1):
    """
    Divide a tensor by its L2 norm computed along one axis

    :param input: the tensor to normalize
    :param axis: the axis along which the norm is computed, 1 by default
    :return: the input divided by its L2 norm along the given axis
    """
    # keepdim=True keeps the reduced axis so that the norm broadcasts against the input
    scale = torch.norm(input, p=2, dim=axis, keepdim=True)
    return torch.div(input, scale)
class ArcFace(torch.nn.Module):
"""
"""
# implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
def __init__(self, embedding_size, classnum, s=64., m=0.5):
super(ArcFace, self).__init__()
......@@ -106,6 +119,12 @@ class ArcFace(torch.nn.Module):
self.threshold = math.cos(math.pi - m)
def forward(self, embbedings, target):
"""
:param embbedings:
:param target:
:return:
"""
# weights norm
nB = len(embbedings)
kernel_norm = l2_norm(self.kernel, axis=0)
......@@ -136,6 +155,9 @@ class ArcFace(torch.nn.Module):
################################## Cosface head #############################################################
class Am_softmax(torch.nn.Module):
"""
"""
# implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
def __init__(self, embedding_size=512, classnum=51332):
super(Am_softmax, self).__init__()
......@@ -147,6 +169,12 @@ class Am_softmax(torch.nn.Module):
self.s = 30. # see normface https://arxiv.org/abs/1704.06369
def forward(self, embbedings, label):
"""
:param embbedings:
:param label:
:return:
"""
kernel_norm = l2_norm(self.kernel, axis=0)
cos_theta = torch.mm(embbedings, kernel_norm)
cos_theta = cos_theta.clamp(-1, 1) # for numerical stability
......@@ -226,14 +254,15 @@ class ArcLinear(torch.nn.Module):
class ArcMarginProduct(torch.nn.Module):
r"""Implement of large margin arc distance: :
"""
Implementation of large margin arc distance:
Args:
in_features: size of each input sample
out_features: size of each output sample
s: norm of input feature
m: margin
cos(theta + m)
"""
"""
def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
super(ArcMarginProduct, self).__init__()
......@@ -250,10 +279,33 @@ class ArcMarginProduct(torch.nn.Module):
self.th = math.cos(math.pi - self.m)
self.mm = math.sin(math.pi - self.m) * self.m
def change_params(self, s=None, m=None):
    """
    Update s and/or m and recompute the values that are derived from m

    :param s: new value of s; the current value is kept when None
    :param m: new value of the margin m; the current value is kept when None
    """
    self.s = self.s if s is None else s
    self.m = self.m if m is None else m

    # Refresh the constants that depend on the margin
    margin = self.m
    self.cos_m = math.cos(margin)
    self.sin_m = math.sin(margin)
    self.th = math.cos(math.pi - margin)
    self.mm = math.sin(math.pi - margin) * margin
def forward(self, input, target=None):
"""
:param input:
:param target:
:return:
"""
# cos(theta)
cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
torch.nn.functional.normalize(self.weight))
torch.nn.functional.normalize(self.weight))
if target == None:
return cosine * self.s
# cos(theta + m)
......@@ -275,7 +327,10 @@ class ArcMarginProduct(torch.nn.Module):
class SoftmaxAngularProto(torch.nn.Module):
# from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
def __init__(self, spk_count, emb_dim=256, init_w=10.0, init_b=-5.0, **kwargs):
super(SoftmaxAngularProto, self).__init__()
......@@ -290,27 +345,38 @@ class SoftmaxAngularProto(torch.nn.Module):
]))
def forward(self, x, target=None):
"""
:param x:
:param target:
:return:
"""
assert x.size()[1] >= 2
cce_prediction = self.cce_backend(x)
if target==None:
if target is None:
return cce_prediction
x = x.reshape(-1,2,x.size()[-1]).squeeze(1)
x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:,1:,:],1)
out_positive = x[:,0,:]
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:,0,:]
cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))
cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
out_anchor.unsqueeze(-1).transpose(0, 2))
torch.clamp(self.w, 1e-6)
cos_sim_matrix = cos_sim_matrix * self.w + self.b
loss = self.criterion(cos_sim_matrix, torch.arange(0, cos_sim_matrix.shape[0], device=x.device)) + self.criterion(cce_prediction, target)
loss = self.criterion(cos_sim_matrix, torch.arange(0,
cos_sim_matrix.shape[0],
device=x.device)) + self.criterion(cce_prediction, target)
return loss, cce_prediction
class AngularProximityMagnet(torch.nn.Module):
# from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
super(AngularProximityMagnet, self).__init__()
......@@ -340,17 +406,22 @@ class AngularProximityMagnet(torch.nn.Module):
self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
def forward(self, x, target=None):
"""
:param x:
:param target:
:return:
"""
assert x.size()[1] >= 2
cce_prediction = self.cce_backend(x)
#x = self.magnitude(x) * torch.nn.functional.normalize(x)
if target==None:
if target is None:
return x, cce_prediction
x = x.reshape(-1,2,x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:,1:,:],1)
out_positive = x[:,0,:]
x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:, 0, :]
ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))
torch.clamp(self.w, 1e-6)
......
......@@ -111,7 +111,7 @@ class AttentivePooling(torch.nn.Module):
class GruPooling(torch.nn.Module):
"""
Pooling done by using a recurrent network
"""
def __init__(self, input_size, gru_node, nb_gru_layer):
"""
......@@ -136,7 +136,7 @@ class GruPooling(torch.nn.Module):
"""
x = self.bn_before_gru(x)
x = self.lrelu_keras(x)
x = x.permute(0, 2, 1) #(batch, filt, time) >> (batch, time, filt)
x = x.permute(0, 2, 1) # (batch, filt, time) >> (batch, time, filt)
self.gru.flatten_parameters()
x, _ = self.gru(x)
x = x[:, -1, :]
......
......@@ -27,39 +27,14 @@ Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
import logging
import math
import os
import numpy
import pandas
import pickle
import shutil
import time
import os
import torch
import torchaudio
import tqdm
import yaml
from collections import OrderedDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from .augmentation import PreEmphasis
from .xsets import SideSet
from .xsets import IdMapSet
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSampler
from .res_net import ResBlockWFMS
from .res_net import ResBlock
from .res_net import PreResNet34
from .res_net import PreFastResNet34
from ..bosaris import IdMap
from ..bosaris import Key
from ..bosaris import Ndx
from ..statserver import StatServer
from ..iv_scoring import cosine_scoring
from .sincnet import SincNet
from .loss import ArcLinear
from .loss import l2_norm
from .loss import ArcMarginProduct
from .sincnet import SincConv1d
from .res_net import LayerNorm
os.environ['MKL_THREADING_LAYER'] = 'GNU'
......@@ -83,12 +58,10 @@ torch.backends.cudnn.benchmark = False
numpy.random.seed(0)
class MfccFrontEnd(torch.nn.Module):
"""
Module that extracts MFCC coefficients
"""
def __init__(self,
pre_emphasis=0.97,
sample_rate=16000,
......@@ -153,18 +126,17 @@ class MfccFrontEnd(torch.nn.Module):
class MelSpecFrontEnd(torch.nn.Module):
"""
Module that computes the Mel spectrogram of an audio signal
"""
def __init__(self,
pre_emphasis=0.97,
sample_rate=16000,
n_fft=1024,
f_min=90,
f_max=7600,
win_length=1024,
win_length=400,
window_fn=torch.hann_window,
hop_length=256,
hop_length=160,
power=2.0,
n_mels=80):
......@@ -229,7 +201,8 @@ class MelSpecFrontEnd(torch.nn.Module):
class RawPreprocessor(torch.nn.Module):
"""
Pre-process the raw audio signal by using a SincNet architecture
[ADD REF]
"""
def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
"""
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#