Commit ebb18b54 authored by Anthony Larcher

Merge branch 'dev_al'

# Conflicts:
#	nnet/res_net.py
#	nnet/xvector.py
parents 43927406 2aacbe26
......@@ -34,9 +34,8 @@ import os
import sys
# Read environment variable if it exists
SIDEKIT_CONFIG={"libsvm":True,
SIDEKIT_CONFIG={"libsvm":False,
"mpi":False,
"cuda":True
}
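The defaults above can be overridden through the environment variable mentioned in the comment. The parser itself is outside this hunk; the sketch below only illustrates one plausible way a comma-separated key=value override could be read (the variable name and format are assumptions, not taken from this diff):

import os

SIDEKIT_CONFIG = {"libsvm": False, "mpi": False, "cuda": True}

# e.g. export SIDEKIT="libsvm=true,mpi=false,cuda=false"  (assumed format)
if "SIDEKIT" in os.environ:
    for entry in os.environ["SIDEKIT"].split(","):
        key, value = entry.split("=")
        if key in SIDEKIT_CONFIG:
            SIDEKIT_CONFIG[key] = value.lower() == "true"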
......@@ -165,15 +164,10 @@ if SIDEKIT_CONFIG["cuda"]:
if CUDA:
from .nnet import FForwardNetwork
from .nnet import kaldi_to_hdf5
from .nnet import XvectorMultiDataset
from .nnet import XvectorDataset
from .nnet import StatDataset
from .nnet import Xtractor
from .nnet import xtrain
from .nnet import extract_embeddings
from .nnet import extract_sliding_embedding
from .nnet import ResBlock
from .nnet import ResNet18
from .nnet import SincNet
else:
......
......@@ -235,8 +235,8 @@ class Key:
with h5py.File(input_file_fame, "r") as f:
key = Key()
key.modelset = f.get("modelset")[()]
key.segset = f.get("segset")[()]
key.modelset = f["modelset"][()]
key.segset = f["segset"][()]
# if running python 3, need a conversion to unicode
if sys.version_info[0] == 3:
......
......@@ -189,15 +189,16 @@ class Ndx:
"""
with h5py.File(input_file_name, "r") as f:
ndx = Ndx()
ndx.modelset = f.get("modelset")[()]
ndx.segset = f.get("segset")[()]
ndx.modelset = f["modelset"][()]
ndx.segset = f["segset"][()]
# if running python 3, need a conversion to unicode
if sys.version_info[0] == 3:
ndx.modelset = ndx.modelset.astype('U100', copy=False)
ndx.segset = ndx.segset.astype('U100', copy=False)
ndx.modelset = ndx.modelset.astype('U100')
ndx.segset = ndx.segset.astype('U100')
ndx.trialmask = f.get("trial_mask")[()].astype('bool')
ndx.trialmask = numpy.zeros((ndx.modelset.shape[0], ndx.segset.shape[0]), dtype=numpy.bool)
f["trial_mask"].read_direct(ndx.trialmask)
assert ndx.validate(), "Error: wrong Ndx format"
return ndx
......
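The change above swaps the load-then-cast read of trial_mask for a preallocated array filled with read_direct, avoiding an intermediate copy on large trial masks. A self-contained comparison of the two styles (hypothetical file and dataset names, not part of this commit):

import h5py
import numpy

# build a tiny demo file
with h5py.File("demo_ndx.h5", "w") as f:
    f.create_dataset("trial_mask", data=numpy.array([[True, False], [False, True]]))

with h5py.File("demo_ndx.h5", "r") as f:
    # previous style: read the whole dataset, then cast to a fresh boolean array
    mask_a = f["trial_mask"][()].astype("bool")

    # style used by this commit: preallocate the destination and fill it in place
    mask_b = numpy.zeros(f["trial_mask"].shape, dtype=bool)
    f["trial_mask"].read_direct(mask_b)

assert (mask_a == mask_b).all()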
......@@ -163,12 +163,18 @@ class Scores:
:return: a vector of target scores.
:return: a vector of non-target scores.
"""
new_score = self.align_with_ndx(key)
tarndx = key.tar & new_score.scoremask
nonndx = key.non & new_score.scoremask
tar = new_score.scoremat[tarndx]
non = new_score.scoremat[nonndx]
return tar, non
if (key.modelset == self.modelset).all() \
and (key.segset == self.segset).all() \
and self.scoremask.shape[0] == key.tar.shape[0] \
and self.scoremask.shape[1] == key.tar.shape[1]:
return self.scoremat[key.tar & self.scoremask], self.scoremat[key.non & self.scoremask]
else:
new_score = self.align_with_ndx(key)
tarndx = key.tar & new_score.scoremask
nonndx = key.non & new_score.scoremask
tar = new_score.scoremat[tarndx]
non = new_score.scoremat[nonndx]
return tar, non
def align_with_ndx(self, ndx):
"""The ordering in the output Scores object corresponds to ndx, so
......
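For context, get_tar_non now returns directly when the score matrix is already aligned with the key, and only falls back to align_with_ndx otherwise. A small in-memory sketch of the call it optimizes (hypothetical data, not part of this commit):

import numpy
from sidekit.bosaris import Key, Scores

key = Key()
key.modelset = numpy.array(["m1", "m2"])
key.segset = numpy.array(["s1", "s2", "s3"])
key.tar = numpy.array([[True, False, False], [False, True, False]])
key.non = numpy.array([[False, True, True], [True, False, True]])

scores = Scores()
scores.modelset = key.modelset
scores.segset = key.segset
scores.scoremask = numpy.ones((2, 3), dtype=bool)
scores.scoremat = numpy.arange(6, dtype=float).reshape(2, 3)

# modelset/segset already match, so the new shortcut avoids align_with_ndx
tar, non = scores.get_tar_non(key)   # tar -> [0., 4.], non -> [1., 2., 3., 5.]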
......@@ -221,7 +221,7 @@ class FeaturesServer(object):
feat = pca_dct(feat, self.dct_pca_config[0], self.dct_pca_config[1], self.dct_pca_config[2])
elif self.sdc:
feat = shifted_delta_cepstral(feat, d=self.sdc_config[0], p=self.sdc_config[1], k=self.sdc_config[2])
# Apply a mask on the features
if self.mask is not None:
feat = self._mask(feat)
......@@ -488,6 +488,7 @@ class FeaturesServer(object):
feat, label = self.post_processing(feat, label, global_mean, global_std)
else:
feat, label = self.post_processing(feat, label)
return feat, label
def get_features_per_speaker(self, show, idmap, channel=0, input_feature_filename=None, label=None):
......
......@@ -27,13 +27,15 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
import copy
import logging
import sys
import numpy
import scipy
from sidekit.bosaris import Ndx
from sidekit.bosaris import Scores
import torch
from sidekit.bosaris import Ndx, Scores
from sidekit.statserver import StatServer
import sys
if sys.version_info.major > 2:
from functools import reduce
......@@ -58,7 +60,7 @@ def _check_missing_model(enroll, test, ndx):
return clean_ndx
def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True, device=None):
"""Compute the cosine similarities between to sets of vectors. The list of
trials to perform is given in an Ndx object.
......@@ -96,10 +98,15 @@ def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
enroll_copy.norm_stat1()
if enroll_copy != test_copy:
test_copy.norm_stat1()
s = numpy.dot(enroll_copy.stat1, test_copy.stat1.transpose())
s_size_in_bytes = enroll_copy.stat1.shape[0] * test_copy.stat1.shape[0] * 4
if device is None:
device = torch.device("cuda:0" if torch.cuda.is_available() and s_size_in_bytes < 3e9 else "cpu")
else:
device = device if torch.cuda.is_available() and s_size_in_bytes < 3e9 else torch.device("cpu")
score = Scores()
score.scoremat = s
score.scoremat = torch.einsum('ij,kj', torch.FloatTensor(enroll_copy.stat1).to(device),
torch.FloatTensor(test_copy.stat1).to(device)).cpu().numpy()
score.modelset = clean_ndx.modelset
score.segset = clean_ndx.segset
score.scoremask = clean_ndx.trialmask
......
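The scoring change replaces numpy.dot with a torch.einsum computed on a device chosen from the expected size of the score matrix (GPU only when the float32 matrix stays under roughly 3 GB). A sketch of the size check, and of the fact that einsum('ij,kj') is just enroll @ test.T (random data, not part of this commit):

import numpy
import torch

enroll = numpy.random.randn(5, 256).astype(numpy.float32)   # 5 enrolment vectors
test = numpy.random.randn(7, 256).astype(numpy.float32)     # 7 test vectors

# the score matrix needs rows * cols * 4 bytes in float32
s_size_in_bytes = enroll.shape[0] * test.shape[0] * 4
device = torch.device("cuda:0" if torch.cuda.is_available() and s_size_in_bytes < 3e9 else "cpu")

scores = torch.einsum("ij,kj",
                      torch.from_numpy(enroll).to(device),
                      torch.from_numpy(test).to(device)).cpu().numpy()
assert numpy.allclose(scores, enroll @ test.T, atol=1e-4)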
......@@ -28,23 +28,27 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
from .augmentation import AddNoise
from .feed_forward import FForwardNetwork
from .feed_forward import kaldi_to_hdf5
from .xsets import XvectorMultiDataset, XvectorDataset, StatDataset, IdMapSet_per_speaker
from .xvector import Xtractor, xtrain, extract_embeddings, extract_sliding_embedding, MeanStdPooling
from .res_net import ResBlock, ResNet18, PreResNet34
from .rawnet import prepare_voxceleb1, Vox1Set, PreEmphasis
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSet
from .xsets import SideSampler
from .xvector import Xtractor
from .xvector import xtrain
from .xvector import extract_embeddings
from .pooling import MeanStdPooling
from .pooling import AttentivePooling
from .pooling import GruPooling
from .res_net import ResBlock
from .res_net import PreResNet34
from .res_net import PreFastResNet34
from .res_net import PreHalfResNet34
from .sincnet import SincNet
from .preprocessor import RawPreprocessor
from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd
has_pyroom = True
try:
import pyroomacoustics
except ImportError:
has_pyroom = False
if has_pyroom:
from .augmentation import AddReverb
__author__ = "Anthony Larcher and Sylvain Meignier"
......
......@@ -26,10 +26,14 @@ Copyright 2014-2021 Anthony Larcher
"""
import collections
import math
import numpy
from scipy import signal
import pandas
import random
import soundfile
import torch
import torchaudio
has_pyroom = True
try:
......@@ -51,320 +55,242 @@ __docformat__ = 'reStructuredText'
Noise = collections.namedtuple('Noise', 'type file_id duration')
def normalize(wav):
"""
:param wav:
:return:
"""
return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
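As a quick illustration (not part of the commit), this scales any waveform to unit RMS:

import numpy

wav = 0.1 * numpy.random.randn(16000)                   # hypothetical 1 s waveform
out = wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
print(numpy.sqrt(numpy.mean(out ** 2)))                 # ~1.0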
class PreEmphasis(torch.nn.Module):
def __init__(self, coef: float = 0.97):
super().__init__()
self.coef = coef
# make kernel
# In PyTorch, conv1d actually computes cross-correlation, so the filter is flipped here.
self.register_buffer(
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
)
def crop(signal, duration):
"""
def forward(self, input: torch.tensor) -> torch.tensor:
assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
# reflect padding to match lengths of in/out
input = input.unsqueeze(1)
input = torch.nn.functional.pad(input, (1, 0), 'reflect')
return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
:return:
"""
start = random.randint(0, signal.shape[0] - duration)
chunk = signal[start: start + duration]
return chunk
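Returning to the PreEmphasis block above: because conv1d computes cross-correlation, the flipped kernel [-coef, 1.] combined with one sample of left reflect padding implements y[t] = x[t] - coef * x[t-1]. A quick check (illustrative, not part of this commit):

import torch

coef = 0.97
x = torch.randn(1, 100)
flipped = torch.FloatTensor([-coef, 1.]).unsqueeze(0).unsqueeze(0)

padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
y = torch.nn.functional.conv1d(padded, flipped).squeeze(1)

manual = x.clone()
manual[:, 1:] = x[:, 1:] - coef * x[:, :-1]
manual[:, 0] = x[:, 0] - coef * x[:, 1]   # reflect padding re-uses x[1] before x[0]
assert torch.allclose(y, manual, atol=1e-6)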
class FrequencyMask(object):
"""Crop randomly the image in a sample.
class AddNoise(object):
Args:
output_size (tuple or int): Desired output size. If int, square crop
is made.
"""
def __init__(self, max_size, feature_size):
self.max_size = max_size
self.feature_size = feature_size
"""
def __call__(self, sample):
data = sample[0]
if sample[2]:
size = numpy.random.randint(1, self.max_size)
f0 = numpy.random.randint(0, self.feature_size - self.max_size)
data[f0:f0+size, :] = 10.
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
def __init__(self, noise_db_csv, snr_min_max, noise_root_path, sample_rate=16000):
"""
"""
self.snr_min = snr_min_max[0]
self.snr_max = snr_min_max[1]
self.noise_root_path = noise_root_path
self.sample_rate = sample_rate
class TemporalMask(object):
"""Crop randomly the image in a sample.
df = pandas.read_csv(noise_db_csv)
self.noises = []
for index, row in df.iterrows():
self.noises.append(Noise(type=row["type"], file_id=row["file_id"], duration=row["duration"]))
Args:
output_size (tuple or int): Desired output size. If int, square crop
is made.
"""
def __init__(self, max_size):
self.max_size = max_size
def __call__(self, sample):
"""
:param original:
:param sample_rate:
:return:
"""
data = sample[0]
if sample[4]:
original_duration = len(data)
# accumulate enough noise to cover duration of original waveform
noises = []
left = original_duration
while left > 0:
# select noise file at random
file = random.choice(self.noises)
noise_signal, fs = soundfile.read(self.noise_root_path + "/" + file.file_id + ".wav")
if sample[3]:
size = numpy.random.randint(1, self.max_size)
t0 = numpy.random.randint(0, sample[0].shape[1] - self.max_size)
data[:, t0:t0+size] = 10.
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
# Load noise from file
if not fs == self.sample_rate:
print("Problem") # todo
duration = noise_signal.shape[0]
# if noise file is longer than what is needed, crop it
if duration > left:
noise = crop(noise_signal, left)
left = 0
# otherwise, take the whole file
else:
noise = noise_signal
left -= duration
# Todo Downsample if needed
# if sample_rate > fs:
#
def normalize(wav):
"""
noise = normalize(noise)
noises.append(noise.squeeze())
:param wav:
:return:
"""
return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
# concatenate
noise = numpy.hstack(noises)
# select SNR at random
snr = (self.snr_max - self.snr_min) * numpy.random.random_sample() + self.snr_min
alpha = numpy.exp(-numpy.log(10) * snr / 20)
def crop(signal, duration):
"""
data = normalize(data) + alpha * noise
:return:
"""
start = random.randint(0, signal.shape[0] - duration)
chunk = signal[start: start + duration]
return data.squeeze(), sample[1], sample[2], sample[3], sample[4], sample[5]
return chunk
class AddNoiseFromSilence(object):
"""
def data_augmentation(speech,
sample_rate,
transform_dict,
transform_number,
noise_df=None,
rir_df=None):
"""
def __init__(self, noise_db_csv, snr_min_max, noise_root_path, sample_rate=16000):
"""
"""
self.snr_min = snr_min_max[0]
self.snr_max = snr_min_max[1]
self.noise_root_path = noise_root_path
self.sample_rate = sample_rate
df = pandas.read_csv(noise_db_csv)
self.noises = []
for index, row in df.iterrows():
self.noises.append(Noise(type=row["type"], file_id=row["file_id"], duration=row["duration"]))
:param speech:
:param transform_dict:
:param transform_number:
:return:
def __call__(self, sample):
"""
transformation
pipeline: add_noise,add_reverb
add_noise:
noise_db_csv: filename.csv
snr: 5,6,7,8,9,10,11,12,13,14,15
add_reverb:
rir_db_csv: filename.csv
codec: true
phone_filtering: true
:param original:
:param sample_rate:
:return:
"""
data = sample[0]
if sample[4]:
original_duration = len(data)
# accumulate enough noise to cover duration of original waveform
noises = []
left = original_duration
while left > 0:
# select noise file at random
file = random.choice(self.noises)
noise_signal, fs = soundfile.read(self.noise_root_path + "/" + file.file_id + ".wav")
# Load noise from file
if not fs == self.sample_rate:
print("Problem") # todo
duration = noise_signal.shape[0]
# if noise file is longer than what is needed, crop it
if duration > left:
noise = crop(noise_signal, left)
left = 0
# otherwise, take the whole file
else:
noise = noise_signal
left -= duration
# Todo Downsample if needed
# if sample_rate > fs:
#
noise = normalize(noise)
noises.append(noise.squeeze())
# concatenate
noise = numpy.hstack(noises)
# select SNR at random
snr = (self.snr_max - self.snr_min) * numpy.random.random_sample() + self.snr_min
alpha = numpy.exp(-numpy.log(10) * snr / 20)
data = normalize(data) + alpha * noise
return data.squeeze(), sample[1], sample[2], sample[3], sample[4], sample[5]
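The SNR-to-gain conversion used above is the usual decibel formula: exp(-ln(10) * snr / 20) equals 10 ** (-snr / 20), so mixing normalize(speech) + alpha * normalize(noise), with both signals at unit RMS, yields roughly the requested SNR. A quick check (illustrative, not part of this commit):

import numpy

snr_db = 10.0
alpha = numpy.exp(-numpy.log(10) * snr_db / 20)
assert numpy.isclose(alpha, 10 ** (-snr_db / 20))
print(20 * numpy.log10(1.0 / alpha))   # == snr_db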
if has_pyroom:
class AddReverb(object):
"""Simulate indoor reverberation
Parameters
----------
depth : (float, float), optional
Minimum and maximum values for room depth (in meters).
Defaults to (2.0, 10.0).
width : (float, float), optional
Minimum and maximum values for room width (in meters).
Defaults to (1.0, 10.0).
height : (float, float), optional
Minimum and maximum values for room height (in meters).
Defaults to (2.0, 5.0).
absorption : (float, float), optional
Minimum and maximum values of walls absorption coefficient.
Defaults to (0.2, 0.9).
noise : str or list of str, optional
`pyannote.database` collection(s) used for adding noise.
Defaults to "MUSAN.Collection.BackgroundNoise"
snr : (float, float), optional
Minimum and maximum values of signal-to-noise ratio.
Defaults to (5.0, 15.0)
"""
def __init__(
self,
depth=(2.0, 10.0),
width=(1.0, 10.0),
height=(2.0, 5.0),
absorption=(0.2, 0.9),
noise=None,
snr=(5.0, 15.0)
):
super().__init__()
self.depth = depth
self.width = width
self.height = height
self.absorption = absorption
self.max_order_ = 17
self.noise = noise
self.snr = snr
self.noise_ = noise
self.n_rooms_ = 128
self.new_rooms_prob_ = 0.001
self.main_lock_ = threading.Lock()
self.rooms_ = collections.deque(maxlen=self.n_rooms_)
self.room_lock_ = [threading.Lock() for _ in range(self.n_rooms_)]
@staticmethod
def random(m, M):
"""
:param m:
:param M:
:return:
"""
return (M - m) * numpy.random.random_sample() + m
def new_room(self, sample_rate: int):
"""
:param sample_rate:
:return:
"""
# generate a room at random
depth = self.random(*self.depth)
width = self.random(*self.width)
height = self.random(*self.height)
absorption = self.random(*self.absorption)
room = pyroomacoustics.ShoeBox(
[depth, width, height],
fs=sample_rate,
absorption=absorption,
max_order=self.max_order_,
)
# play the original audio chunk at a random location
original = [
self.random(0, depth),
self.random(0, width),
self.random(0, height),
]
room.add_source(original)
# play the noise audio chunk at a random location
noise = [self.random(0, depth), self.random(0, width), self.random(0, height)]
room.add_source(noise)
# place the microphone at a random location
microphone = [
self.random(0, depth),
self.random(0, width),
self.random(0, height),
]
room.add_microphone_array(
pyroomacoustics.MicrophoneArray(numpy.c_[microphone, microphone], sample_rate)
)
room.compute_rir()
return room
def __call__(self, sample):
data = sample[0]
if sample[5]:
with self.main_lock_:
# initialize rooms (with 2 sources and 1 microphone)
while len(self.rooms_) < self.n_rooms_:
room = self.new_room(self.sample_rate)
self.rooms_.append(room)
# create new room with probability new_rooms_prob_
if numpy.random.rand() > 1.0 - self.new_rooms_prob_:
room = self.new_room(self.sample_rate)
self.rooms_.append(room)
# choose one room at random
index = numpy.random.choice(self.n_rooms_)
# lock chosen room to ensure room.sources are not updated concurrently
with self.room_lock_[index]:
room = self.rooms_[index]
# play normalized original audio chunk at source #1
n_samples = len(data)
data = normalize(data).squeeze()
room.sources[0].add_signal(data)
# generate noise with random SNR
noise = self.noise_(n_samples, self.sample_rate).squeeze()
snr = self.random(*self.snr)
alpha = numpy.exp(-numpy.log(10) * snr / 20)
noise *= alpha
# play noise at source #2
room.sources[1].add_signal(noise)
# simulate room and return microphone signal
room.simulate()
data = room.mic_array.signals[0, :n_samples, numpy.newaxis]
return data, sample[1], sample[2], sample[3] , sample[4], sample[5]
"""
# Select the data augmentation randomly
aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
augmentations = numpy.array(list(transform_dict.keys()))[aug_idx]
if "stretch" in augmentations:
strech = torchaudio.functional.TimeStretch()
rate = random.uniform(0.8,1.2)
speech = strech(speech