Commit 4b5a23ba authored by Anthony Larcher's avatar Anthony Larcher
parents 5fe2945a 63fe771a
......@@ -162,8 +162,6 @@ if SIDEKIT_CONFIG["cuda"]:
if CUDA:
from .nnet import FForwardNetwork
from .nnet import kaldi_to_hdf5
from .nnet import Xtractor
from .nnet import xtrain
from .nnet import extract_embeddings
......
......@@ -28,8 +28,6 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
from .feed_forward import FForwardNetwork
from .feed_forward import kaldi_to_hdf5
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSet
from .xsets import SideSampler
......@@ -49,8 +47,6 @@ from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
......
......@@ -26,21 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""
import collections
import math
import numpy
from scipy import signal
import pandas
import random
import soundfile
import torch
import torchaudio
has_pyroom = True
try:
import pyroomacoustics
except ImportError:
has_pyroom = False
__author__ = "Anthony Larcher and Sylvain Meignier"
......@@ -55,8 +46,10 @@ __docformat__ = 'reStructuredText'
Noise = collections.namedtuple('Noise', 'type file_id duration')
class PreEmphasis(torch.nn.Module):
"""
Apply pre-emphasis filtering
"""
def __init__(self, coef: float = 0.97):
super().__init__()
......@@ -67,13 +60,18 @@ class PreEmphasis(torch.nn.Module):
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
)
def forward(self, input: torch.tensor) -> torch.tensor:
def forward(self, input_signal: torch.tensor) -> torch.tensor:
"""
Forward pass of the pre-emphasis filtering
:param input_signal: the input signal
:return: the filtered signal
"""
assert len(input_signal.size()) == 2, 'The number of dimensions of input tensor must be 2!'
# reflect padding to match lengths of in/out
input = input.unsqueeze(1)
input = torch.nn.functional.pad(input, (1, 0), 'reflect')
return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
input_signal = input_signal.unsqueeze(1)
input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
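# A minimal usage sketch for PreEmphasis, assuming a batch of mono waveforms shaped
# (batch, samples) as the assert above requires; sizes here are illustrative only.
pre_emphasis = PreEmphasis(coef=0.97)
waveform = torch.randn(8, 16000)          # eight one-second signals at 16 kHz
filtered = pre_emphasis(waveform)         # y[t] = x[t] - 0.97 * x[t-1]
assert filtered.shape == waveform.shape   # reflect padding preserves the length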
class FrequencyMask(object):
......@@ -115,24 +113,26 @@ class TemporalMask(object):
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
def normalize(wav):
"""
Normalize a waveform by its RMS energy
:param wav:
:return:
:param wav: the input waveform
:return: the normalized waveform
"""
return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
def crop(signal, duration):
def crop(input_signal, duration):
"""
Select a chunk from an audio segment
:param input_signal: signal to select a chunk from
:param duration: duration in samples of the chunk to select
:return: the selected chunk
"""
start =random.randint(0, signal.shape[0] - duration)
chunk = signal[start: start + duration]
start = random.randint(0, input_signal.shape[0] - duration)
chunk = input_signal[start: start + duration]
return chunk
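# A short sketch combining normalize() and crop() above; the waveform is a 1-D
# numpy array and the duration is expressed in samples (crop indexes the first axis).
wav = numpy.random.randn(32000)        # two seconds at 16 kHz
wav = normalize(wav)                   # roughly unit RMS energy (1e-8 guards the division)
chunk = crop(wav, 16000)               # random one-second chunk
assert chunk.shape[0] == 16000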
......@@ -144,11 +144,20 @@ def data_augmentation(speech,
rir_df=None,
babble_noise=True):
"""
Perform data augmentation on an input signal.
Each speech chunk is augmented with 'transform_number' transformations picked at random from a
dictionary of possible transformations.
:param speech:
:param transform_dict:
:param transform_number:
:return:
:param speech: the input signal to be augmented
:param sample_rate: sampling rate of the input signal to augment
:param transform_dict: the dictionary of possible augmentations to apply
:param transform_number: the number of transformations to apply on each chunk
:param rir_df: a pandas DataFrame listing the RIR signals to choose from; default is None
:param noise_df: a pandas DataFrame listing the NOISE signals to choose from; default is None
:param babble_noise: boolean that enables the use of babble noise; True by default (typically set to False when
the task includes overlapping speech detection)
:return: augmented signal

Example configuration:

transformation:
    pipeline: add_noise,add_reverb
......@@ -159,7 +168,6 @@ def data_augmentation(speech,
rir_db_csv: filename.csv
codec: true
phone_filtering: true
"""
# Select the data augmentation randomly
aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
......@@ -209,18 +217,17 @@ def data_augmentation(speech,
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
# babble noise with different volume
elif noise_idx == 3:
snr_db = random.randint(13,20)
pick_count = random.randint(5,10) # Randomly select 5 to 10 speakers
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
#noise_rows = transform_dict["add_noise"]["data_path"] + "/" + noise_df[noise_df["type"] == "speech"].sample(ns,replace=False)["file_id"].values + ".wav"
noise = torch.zeros(1,speech.shape[1])
for idx in index_list:
#noise_,noise_fs = torchaudio.load(noise_fn[idx],frame_offset=0,num_frames=speech.shape[1])
noise_row = noise_df.loc['speech'].iloc[idx]
noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
transform = torchaudio.transforms.Vol(gain=random.randint(5,15),gain_type='db') # Randomly select volume level (5-15d)
noise += transform(noise_)
noise /= pick_count
snr_db = random.randint(13, 20)
pick_count = random.randint(5, 10)  # randomly select 5 to 10 speakers
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
noise = torch.zeros(1, speech.shape[1])
for idx in index_list:
    noise_row = noise_df.loc['speech'].iloc[idx]
    noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
    # randomly select a volume level between 5 and 15 dB
    transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')
    noise += transform(noise_)
noise /= pick_count
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
......@@ -244,10 +251,10 @@ def data_augmentation(speech,
effects = [
    ["bandpass", "2000", "3500"],
    ["bandstop", "200", "500"]]
speech,sample_rate = torchaudio.sox_eefects.apply_effects_tensor(
speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
speech,
sample_rate,
effects = [effects[random.randint(0,1)]],
effects=[effects[random.randint(0, 1)]],
)
if "codec" in augmentations:
......@@ -267,11 +274,12 @@ def data_augmentation(speech,
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
"""
Pick a noise signal to add while performing data augmentation
:param noise_row:
:param speech_shape:
:param sample_rate:
:param data_path:
:param noise_row: a row from a pandas DataFrame listing the noise signals
:param speech_shape: shape of the speech signal to be augmented
:param sample_rate: sampling rate of the speech signal to be augmented
:param data_path: directory to load the noise file from
:return: the selected noise segment
"""
noise_start = noise_row['start']
......
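# A hedged sketch of calling data_augmentation() with a config like the one in its
# docstring; the exact transform_dict schema beyond "add_noise"/"data_path" is an
# assumption here, and "noise.csv" / "/data/musan" / "utt.wav" are placeholder paths.
speech, sample_rate = torchaudio.load("utt.wav")            # shape (1, samples)
noise_df = pandas.read_csv("noise.csv").set_index("type")   # indexed by type, cf. noise_df.loc['speech']
transform_dict = {"add_noise": {"noise_db_csv": "noise.csv", "data_path": "/data/musan"}}
augmented = data_augmentation(speech,
                              sample_rate=sample_rate,
                              transform_dict=transform_dict,
                              transform_number=1,
                              noise_df=noise_df,
                              babble_noise=False)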
......@@ -26,17 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""
import h5py
import logging
import math
import sys
import numpy
import torch
import torch.optim as optim
import torch.multiprocessing as mp
from collections import OrderedDict
from ..bosaris import IdMap
from ..statserver import StatServer
from torch.nn import Parameter
......@@ -52,6 +47,9 @@ __docformat__ = 'reS'
class ArcMarginModel(torch.nn.Module):
"""
"""
def __init__(self, args):
super(ArcMarginModel, self).__init__()
......@@ -68,6 +66,12 @@ class ArcMarginModel(torch.nn.Module):
self.mm = math.sin(math.pi - self.m) * self.m
def forward(self, input, label):
"""
:param input: batch of input embeddings
:param label: ground-truth class labels
:return: logits with the additive angular margin applied to the target classes
"""
x = F.normalize(input)
W = F.normalize(self.weight)
cosine = F.linear(x, W)
......@@ -85,12 +89,21 @@ class ArcMarginModel(torch.nn.Module):
def l2_norm(input, axis=1):
"""
:param input: the input tensor
:param axis: axis along which to compute the L2 norm
:return: the tensor normalized to unit L2 norm along the given axis
"""
norm = torch.norm(input, 2, axis, True)
output = torch.div(input, norm)
return output
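# l2_norm in action on a batch of row-vector embeddings (illustrative sizes):
emb = torch.randn(4, 256)
unit = l2_norm(emb, axis=1)            # each row divided by its own L2 norm
assert torch.allclose(unit.norm(dim=1), torch.ones(4), atol=1e-5)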
class ArcFace(torch.nn.Module):
"""
"""
# implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
def __init__(self, embedding_size, classnum, s=64., m=0.5):
super(ArcFace, self).__init__()
......@@ -106,6 +119,12 @@ class ArcFace(torch.nn.Module):
self.threshold = math.cos(math.pi - m)
def forward(self, embbedings, target):
"""
:param embbedings: batch of input embeddings
:param target: ground-truth class labels
:return: logits with the angular margin applied to the target classes
"""
# weights norm
nB = len(embbedings)
kernel_norm = l2_norm(self.kernel, axis=0)
......@@ -136,6 +155,9 @@ class ArcFace(torch.nn.Module):
################################## Cosface head #############################################################
class Am_softmax(torch.nn.Module):
"""
"""
# implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
def __init__(self, embedding_size=512, classnum=51332):
super(Am_softmax, self).__init__()
......@@ -147,6 +169,12 @@ class Am_softmax(torch.nn.Module):
self.s = 30. # see normface https://arxiv.org/abs/1704.06369
def forward(self, embbedings, label):
"""
:param embbedings: batch of input embeddings
:param label: ground-truth class labels
:return: logits with the additive margin applied to the target classes
"""
kernel_norm = l2_norm(self.kernel, axis=0)
cos_theta = torch.mm(embbedings, kernel_norm)
cos_theta = cos_theta.clamp(-1, 1) # for numerical stability
......@@ -226,14 +254,15 @@ class ArcLinear(torch.nn.Module):
class ArcMarginProduct(torch.nn.Module):
r"""Implement of large margin arc distance: :
"""
Implementation of the large margin arc distance:
Args:
in_features: size of each input sample
out_features: size of each output sample
s: norm of input feature
m: margin
cos(theta + m)
"""
"""
def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
super(ArcMarginProduct, self).__init__()
......@@ -298,7 +327,10 @@ class ArcMarginProduct(torch.nn.Module):
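# The margin in ArcMarginProduct, worked numerically: for the target class the
# logit s * cos(theta) is replaced by s * cos(theta + m), expanded with the
# angle-addition identity (a sketch with illustrative values, not library code).
s, m = 30.0, 0.50
cosine = 0.8                                      # cos(theta) for the target class
sine = math.sqrt(1.0 - cosine ** 2)
phi = cosine * math.cos(m) - sine * math.sin(m)   # cos(theta + m), about 0.414
logit = s * phi                                   # about 12.4, versus s * cosine = 24.0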
class SoftmaxAngularProto(torch.nn.Module):
# from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
def __init__(self, spk_count, emb_dim=256, init_w=10.0, init_b=-5.0, **kwargs):
super(SoftmaxAngularProto, self).__init__()
......@@ -313,27 +345,38 @@ class SoftmaxAngularProto(torch.nn.Module):
]))
def forward(self, x, target=None):
"""
:param x: batch of embeddings, arranged as consecutive same-speaker pairs
:param target: ground-truth speaker labels; None at inference time
:return: the classification prediction when target is None, otherwise the loss and the prediction
"""
assert x.size()[1] >= 2
cce_prediction = self.cce_backend(x)
if target==None:
if target is None:
return cce_prediction
x = x.reshape(-1,2,x.size()[-1]).squeeze(1)
x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:,1:,:],1)
out_positive = x[:,0,:]
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:, 0, :]
cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))
cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
out_anchor.unsqueeze(-1).transpose(0, 2))
self.w.data.clamp_(min=1e-6)  # clamp in place; a bare torch.clamp would discard its result
cos_sim_matrix = cos_sim_matrix * self.w + self.b
loss = self.criterion(cos_sim_matrix, torch.arange(0, cos_sim_matrix.shape[0], device=x.device)) + self.criterion(cce_prediction, target)
loss = self.criterion(cos_sim_matrix, torch.arange(0,
cos_sim_matrix.shape[0],
device=x.device)) + self.criterion(cce_prediction, target)
return loss, cce_prediction
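# A usage sketch, assuming embeddings arrive as consecutive same-speaker pairs
# (the reshape to (-1, 2, emb_dim) above implies that layout); sizes are illustrative.
loss_fn = SoftmaxAngularProto(spk_count=1000, emb_dim=256)
x = torch.randn(64, 256)                                    # 32 speakers x 2 utterances each
target = torch.randint(0, 1000, (32,)).repeat_interleave(2) # same label for each pair
loss, prediction = loss_fn(x, target)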
class AngularProximityMagnet(torch.nn.Module):
# from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
"""
def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
super(AngularProximityMagnet, self).__init__()
......@@ -363,17 +406,22 @@ class AngularProximityMagnet(torch.nn.Module):
self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
def forward(self, x, target=None):
"""
:param x: batch of embeddings, arranged as consecutive same-speaker pairs
:param target: ground-truth speaker labels; None at inference time
:return: the embeddings and the classification prediction when target is None, otherwise the loss
"""
assert x.size()[1] >= 2
cce_prediction = self.cce_backend(x)
#x = self.magnitude(x) * torch.nn.functional.normalize(x)
if target==None:
if target is None:
return x, cce_prediction
x = x.reshape(-1,2,x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:,1:,:],1)
out_positive = x[:,0,:]
x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:, 0, :]
ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                      out_anchor.unsqueeze(-1).transpose(0, 2))
self.w.data.clamp_(min=1e-6)  # clamp in place; a bare torch.clamp would discard its result
......
......@@ -111,7 +111,7 @@ class AttentivePooling(torch.nn.Module):
class GruPooling(torch.nn.Module):
"""
Pooling performed by a recurrent network
"""
def __init__(self, input_size, gru_node, nb_gru_layer):
"""
......@@ -136,7 +136,7 @@ class GruPooling(torch.nn.Module):
"""
x = self.bn_before_gru(x)
x = self.lrelu_keras(x)
x = x.permute(0, 2, 1) #(batch, filt, time) >> (batch, time, filt)
x = x.permute(0, 2, 1) # (batch, filt, time) >> (batch, time, filt)
self.gru.flatten_parameters()
x, _ = self.gru(x)
x = x[:, -1, :]
......
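# A sketch of GruPooling on a (batch, filt, time) feature map, as the permute
# comment above indicates; assuming the GRU hidden size equals gru_node, the
# returned last hidden state is shaped (batch, gru_node). Sizes are illustrative.
pooling = GruPooling(input_size=128, gru_node=256, nb_gru_layer=2)
features = torch.randn(8, 128, 300)    # (batch, filt, time)
embedding = pooling(features)          # (batch, 256)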
......@@ -27,39 +27,14 @@ Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
import logging
import math
import os
import numpy
import pandas
import pickle
import shutil
import time
import torch
import torchaudio
import tqdm
import yaml
from collections import OrderedDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from .augmentation import PreEmphasis
from .xsets import SideSet
from .xsets import IdMapSet
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSampler
from .res_net import ResBlockWFMS
from .res_net import ResBlock
from .res_net import PreResNet34
from .res_net import PreFastResNet34
from ..bosaris import IdMap
from ..bosaris import Key
from ..bosaris import Ndx
from ..statserver import StatServer
from ..iv_scoring import cosine_scoring
from .sincnet import SincNet
from .loss import ArcLinear
from .loss import l2_norm
from .loss import ArcMarginProduct
from .sincnet import SincConv1d
from .res_net import LayerNorm
os.environ['MKL_THREADING_LAYER'] = 'GNU'
......@@ -83,12 +58,10 @@ torch.backends.cudnn.benchmark = False
numpy.random.seed(0)
class MfccFrontEnd(torch.nn.Module):
"""
Module that extracts MFCC coefficients
"""
def __init__(self,
pre_emphasis=0.97,
sample_rate=16000,
......@@ -153,9 +126,8 @@ class MfccFrontEnd(torch.nn.Module):
class MelSpecFrontEnd(torch.nn.Module):
"""
Module that computes a Mel spectrogram from an audio signal
"""
def __init__(self,
pre_emphasis=0.97,
sample_rate=16000,
......@@ -230,7 +202,8 @@ class MelSpecFrontEnd(torch.nn.Module):
class RawPreprocessor(torch.nn.Module):
"""
Pre-process the raw audio signal using a SincNet architecture
(Ravanelli & Bengio, "Speaker Recognition from Raw Waveform with SincNet", arXiv:1808.00158)
"""
def __init__(self, nb_samp, in_channels, out_channels, kernel_size,
             stride=1, padding=0, dilation=1, bias=False, groups=1,
             min_low_hz=50, min_band_hz=50, sample_rate=16000):
"""
......
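# A hedged sketch of the front-ends above, assuming forward() takes a batch of raw
# waveforms shaped (batch, samples) at the default 16 kHz shown in __init__.
front_end = MelSpecFrontEnd()
waveform = torch.randn(4, 32000)       # two seconds per utterance at 16 kHz
features = front_end(waveform)         # Mel-spectrogram features per utterance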
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2021 Anthony Larcher
"""
import logging
import numpy
import torch
import pandas
import soundfile
import random
import h5py
import torch.optim as optim
import torch.multiprocessing as mp
from torchvision import transforms
from torch.utils.data import DataLoader
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import Dataset
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2021 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
"""
How to use:
vs = ds.Vox1Set("/lium/raid01_c/larcher/vox1_raw_wav_batches.h5", transform=transforms.Compose([PreEmphasis(),]))
vloader = DataLoader(vs, batch_size=32, shuffle=True, num_workers=5)
"""
def prepare_voxceleb1(vox1_root_dir, output_batch_file, seg_duration=4, samplerate=16000):
    """
    Collect VoxCeleb1 wav files and write one random segment per session into a single HDF5 batch file.

    :param vox1_root_dir: root directory of the VoxCeleb1 data
    :param output_batch_file: name of the HDF5 batch file to create
    :param seg_duration: duration in seconds of the segment kept for each session
    :param samplerate: expected sampling rate of the wav files
    """
    # List wav files in VoxCeleb1
vox1_wav_list = [str(f) for f in list(Path(vox1_root_dir).rglob("*.[wW][aA][vV]"))]
vox1_df = pandas.DataFrame(columns=("database", "speaker_id", "file_id", "duration", "speaker_idx", "set"))
print("*** Collect information from VoxCeleb1 data ***")
for fn in tqdm(vox1_wav_list):
file_id = '/'.join(fn.split('/')[-2:]).split('.')[0]
speaker_id = fn.split('/')[-3]
_set = fn.split('/')[-5]
# get the duration of the wav file (in samples)
data, _ = soundfile.read(fn)
duration = data.shape[0]
vox1_df = vox1_df.append(
{"database": "vox1", "speaker_id": speaker_id, "file_id": file_id, "duration": duration, "speaker_idx": -1,
"set": _set}, ignore_index=True)
print("\n\n*** Create a single HDF5 file with all training data ***")
# Create a HDF5 file and fill it with one segment of seg_duration seconds per session
nb_samp = seg_duration * samplerate  # target segment length in samples
with h5py.File(output_batch_file, 'w') as fh:
for index, row in tqdm(vox1_df.iterrows()):
session_id = row['speaker_id'] + '/' + row['file_id']
# Load the wav signal
fn = '/'.join((vox1_root_dir, row['set'], 'wav', session_id)) + ".wav"
data, samplerate = soundfile.read(fn, dtype='int16')
_nb_samp = len(data)
# Randomly select a segment of seg_duration seconds if the file is long enough
if _nb_samp > nb_samp:
    cut = numpy.random.randint(low=0, high=_nb_samp - nb_samp)
    # Write the segment in the HDF5 file
    fh.create_dataset(session_id,
                      data=data[cut:cut + nb_samp].astype('int16'),
                      maxshape=(None,),
                      fletcher32=True)
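# Reading a segment back from the batch file written above (a sketch; the dataset
# key is the "speaker_id/file_id" session id, and the id below is hypothetical).
with h5py.File("vox1_raw_wav_batches.h5", "r") as fh:
    segment = fh["id10001/video_id/00001"][()]    # int16 array of nb_samp samples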
def prepare_voxceleb2(vox2_root_dir, output_batch_file, seg_duration=4, samplerate=16000):
    """
    Collect VoxCeleb2 wav files and split the file list into five sublists for further batching.

    :param vox2_root_dir: root directory of the VoxCeleb2 data
    :param output_batch_file: name of the HDF5 batch file to create
    :param seg_duration: duration in seconds of the segments to keep
    :param samplerate: expected sampling rate of the wav files
    """
    # List wav files in VoxCeleb2
vox2_wav_list = [str(f) for f in list(Path(vox2_root_dir).rglob("*.[wW][aA][vV]"))]
vox2_dfs = [pandas.DataFrame(columns=("database", "speaker_id", "file_id", "duration", "speaker_idx"))
            for _ in range(5)]  # one DataFrame each; [DataFrame(...)] * 5 would alias a single object
lv2 = len(vox2_wav_list)
# Split the file list into five sublists of roughly equal size
vox2_sublists = [[] for _ in range(5)]
vox2_sublists[0] = vox2_wav_list[:lv2 // 5]
vox2_sublists[1] = vox2_wav_list[lv2 // 5: 2 * (lv2 // 5)]
vox2_sublists[2] = vox2_wav_list[2 * (lv2 // 5): 3 * (lv2 // 5)]
vox2_sublists[3] = vox2_wav_list[3 * (lv2 // 5): 4 * (lv2 // 5)]
vox2_sublists[4] = vox2_wav_list[4 * (lv2 // 5):]
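# An equivalent five-way split without manual index arithmetic (a sketch using
# numpy.array_split, which also balances the remainder across sublists):
vox2_sublists = [list(part) for part in numpy.array_split(vox2_wav_list, 5)]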