Commit 9f28b58f authored by Anthony Larcher's avatar Anthony Larcher
Browse files

massive cleaning

parent b04c99df
......@@ -26,10 +26,13 @@ Copyright 2014-2021 Anthony Larcher
"""
import collections
import math
import numpy
import pandas
import random
import soundfile
import torch
import torchaudio
has_pyroom = True
try:
......@@ -72,6 +75,46 @@ class PreEmphasis(torch.nn.Module):
class FrequencyMask(object):
    """Apply a random frequency mask to a spectral feature matrix.

    When the flag stored in ``sample[2]`` is truthy, a band of consecutive
    frequency bins (random width, random position) is overwritten with the
    constant value 10.

    Args:
        max_size (int): maximum width of the masked frequency band.
        feature_size (int): number of frequency bins in the feature matrix.
    """

    def __init__(self, max_size, feature_size):
        self.max_size = max_size
        self.feature_size = feature_size

    def __call__(self, sample):
        features = sample[0]
        if sample[2]:
            # Draw the band width first, then its starting bin; the start is
            # bounded by feature_size - max_size so the band always fits.
            band_width = numpy.random.randint(1, self.max_size)
            start = numpy.random.randint(0, self.feature_size - self.max_size)
            features[start:start + band_width, :] = 10.
        return features, sample[1], sample[2], sample[3], sample[4], sample[5]
class TemporalMask(object):
    """Apply a random temporal mask to a spectral feature matrix.

    When the flag stored in ``sample[3]`` is truthy, a block of consecutive
    time frames (random width, random position) is overwritten with the
    constant value 10.

    Args:
        max_size (int): maximum number of masked time frames.
    """

    def __init__(self, max_size):
        self.max_size = max_size

    def __call__(self, sample):
        features = sample[0]
        if sample[3]:
            # Draw the mask width first, then its starting frame; the start is
            # bounded so the mask always fits inside the time axis.
            mask_width = numpy.random.randint(1, self.max_size)
            start = numpy.random.randint(0, sample[0].shape[1] - self.max_size)
            features[:, start:start + mask_width] = 10.
        return features, sample[1], sample[2], sample[3], sample[4], sample[5]
def normalize(wav):
"""
......@@ -390,6 +433,81 @@ if has_pyroom:
return data, sample[1], sample[2], sample[3] , sample[4], sample[5]
def data_augmentation(speech, sample_rate, transform_dict, transform_number, noise_df=None, rir_df=None):
    """
    Apply at most `transform_number` augmentations, drawn at random from
    `transform_dict`, to a speech waveform.

    Example of a transformation pipeline described by `transform_dict`:
        add_noise:
            noise_db_csv: filename.csv
            snr: 5,6,7,8,9,10,11,12,13,14,15
        add_reverb:
            rir_db_csv: filename.csv
        codec: true
        phone_filtering: true

    :param speech: torch.Tensor waveform of shape (channel, time)
    :param sample_rate: sampling rate of `speech` in Hz
    :param transform_dict: dict of available augmentations and their parameters
    :param transform_number: maximum number of augmentations applied at once
    :param noise_df: pandas.DataFrame describing the noise files (for "add_noise")
    :param rir_df: pandas.DataFrame describing the impulse responses (for "add_reverb")
    :return: the augmented waveform as a torch.Tensor
    """
    # Select the augmentations randomly.
    # Fixes two bugs of the original code: `transform_dict.keys` was never
    # called (so len() raised TypeError on a bound method) and the comparison
    # was inverted, which applied every augmentation regardless of
    # `transform_number`.
    transforms = list(transform_dict.keys())
    if len(transforms) <= transform_number:
        augmentations = transforms
    else:
        # Sample without replacement so an augmentation is never applied twice
        aug_idx = numpy.random.choice(len(transforms), transform_number, replace=False)
        augmentations = [transforms[idx] for idx in aug_idx]

    if "phone_filtering" in augmentations:
        # Simulate a telephone channel: band limitation, companding, resampling
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[
                ["lowpass", "4000"],
                ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"],
                ["rate", "16000"],
            ])

    if "stretch" in augmentations:
        # NOTE(review): torchaudio.functional.TimeStretch operates on complex
        # spectrograms, not raw waveforms — confirm this branch is exercised.
        stretch = torchaudio.functional.TimeStretch()
        rate = numpy.random.uniform(0.8, 1.2)
        speech = stretch(speech, rate)

    if "add_reverb" in augmentations:
        # Pick one impulse response at random and convolve the speech with it.
        # NOTE(review): assumes rir_df provides "file_id" and "channel"
        # columns — confirm against the CSV schema. The original code used the
        # random integer index itself as the file name, read the data path from
        # the "add_noise" entry, and indexed that int, none of which could run.
        rir_nfo = rir_df.iloc[numpy.random.randint(0, len(rir_df))]
        rir_fn = transform_dict["add_reverb"]["data_path"] + "/" + str(rir_nfo["file_id"]) + ".wav"
        rir, rir_fs = torchaudio.load(rir_fn)
        rir = rir[int(rir_nfo["channel"]), :][None, :]  # keep the selected channel, stay 2D
        # Left-pad so the convolved output has the same length as the input
        speech_ = torch.nn.functional.pad(speech, (rir.shape[1] - 1, 0))
        speech = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]

    if "add_noise" in augmentations:
        # Pick a SNR level
        snr_db = random.choice(transform_dict["add_noise"]["snr"])
        # Pick a noise file from noise_df.
        # NOTE(review): assumes noise_df provides a "file_id" column — confirm.
        # The original `random.choice(noise_df)` picked a column label.
        noise_nfo = noise_df.iloc[numpy.random.randint(0, len(noise_df))]
        noise_fn = transform_dict["add_noise"]["data_path"] + "/" + str(noise_nfo["file_id"]) + ".wav"
        noise, noise_fs = torchaudio.load(noise_fn, frame_offset=0, num_frames=speech.shape[1])
        # Mix speech and noise at the requested SNR
        speech_power = speech.norm(p=2)
        noise_power = noise.norm(p=2)
        snr = math.exp(snr_db / 10)
        scale = snr * noise_power / speech_power
        speech = (scale * speech + noise) / 2

    if "codec" in augmentations:
        # Transcode with a randomly chosen lossy codec configuration
        configs = [
            {"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8},
            {"format": "gsm"},
            {"format": "mp3", "compression": -9},
            {"format": "vorbis", "compression": -1}
        ]
        # The original `param, title = random.choice(configs)` unpacked a dict
        # into its keys; choose the whole configuration dict instead.
        param = random.choice(configs)
        speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)

    return speech
"""
It might not be 100% on topic, but maybe this is interesting for you anyway. If you do not need to do real-time processing, things can be made easier. Limiting and dynamic compression can be seen as applying a dynamic transfer function. This function simply maps input values to output values. A linear function then returns the original audio, while a "curved" function performs compression or expansion. Applying a transfer function is as simple as
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
"""
import logging
import math
import os
import numpy
import pandas
import pickle
import shutil
import time
import torch
import torchaudio
import tqdm
import yaml
from collections import OrderedDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from .xsets import SideSet
from .xsets import FileSet
from .xsets import IdMapSet
from .xsets import IdMapSet_per_speaker
from .xsets import SideSampler
from .res_net import RawPreprocessor
from .res_net import ResBlockWFMS
from .res_net import ResBlock
from .res_net import PreResNet34
from .res_net import PreFastResNet34
from ..bosaris import IdMap
from ..bosaris import Key
from ..bosaris import Ndx
from ..statserver import StatServer
from ..iv_scoring import cosine_scoring
from .sincnet import SincNet
from .loss import ArcLinear
from .loss import l2_norm
from .loss import ArcMarginProduct
# Use the GNU threading layer for MKL to avoid clashes with other OpenMP runtimes
os.environ['MKL_THREADING_LAYER'] = 'GNU'

# Package metadata
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2021 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
# NOTE(review): 'reS' looks like a typo for 'reST'/'restructuredtext' — confirm
__docformat__ = 'reS'

logging.basicConfig(format='%(asctime)s %(message)s')

# Make PyTorch Deterministic: fixed seeds and deterministic cuDNN kernels
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
numpy.random.seed(0)
class MfccFrontEnd(torch.nn.Module):
    """Compute MFCC features from a raw waveform.

    Pipeline: pre-emphasis -> MFCC extraction (torchaudio) -> cepstral
    mean/variance normalization (InstanceNorm1d over the cepstral axis).
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=2048,
                 f_min=133.333,
                 f_max=6855.4976,
                 win_length=1024,
                 window_fn=torch.hann_window,
                 hop_length=512,
                 power=2.0,
                 n_mels=100,
                 n_mfcc=80):
        """
        :param pre_emphasis: pre-emphasis coefficient applied to the waveform
        :param sample_rate: sampling rate of the input signal in Hz
        :param n_fft: FFT size
        :param f_min: lowest frequency of the Mel filter bank in Hz
        :param f_max: highest frequency of the Mel filter bank in Hz
        :param win_length: analysis window length in samples
        :param window_fn: window function used for the STFT
        :param hop_length: hop between successive frames in samples
        :param power: exponent of the magnitude spectrogram
        :param n_mels: number of Mel filters
        :param n_mfcc: number of cepstral coefficients to keep
        """
        super(MfccFrontEnd, self).__init__()
        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        # Original code assigned self.window_fn twice; kept a single assignment.
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels
        self.n_mfcc = n_mfcc

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        self.melkwargs = {"n_fft": self.n_fft,
                          "f_min": self.f_min,
                          "f_max": self.f_max,
                          "win_length": self.win_length,
                          "window_fn": self.window_fn,
                          "hop_length": self.hop_length,
                          "power": self.power,
                          "n_mels": self.n_mels}

        self.MFCC = torchaudio.transforms.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=self.n_mfcc,
            dct_type=2,
            log_mels=True,
            melkwargs=self.melkwargs)

        # Cepstral mean/variance normalization along the time axis
        self.CMVN = torch.nn.InstanceNorm1d(self.n_mfcc)

    def forward(self, x):
        """Extract normalized MFCC from a batch of waveforms.

        :param x: input waveform tensor
        :return: CMVN-normalized MFCC tensor
        """
        with torch.no_grad():
            # Feature extraction stays in full precision even under AMP
            with torch.cuda.amp.autocast(enabled=False):
                mfcc = self.PreEmphasis(x)
                mfcc = self.MFCC(mfcc)
                mfcc = self.CMVN(mfcc)
        return mfcc
class MelSpecFrontEnd(torch.nn.Module):
    """Compute log-Mel-spectrogram features from a raw waveform.

    Pipeline: pre-emphasis -> Mel spectrogram (torchaudio) -> log ->
    cepstral mean/variance normalization (InstanceNorm1d over the Mel axis).
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=1024,
                 f_min=90,
                 f_max=7600,
                 win_length=1024,
                 window_fn=torch.hann_window,
                 hop_length=256,
                 power=2.0,
                 n_mels=80):
        """
        :param pre_emphasis: pre-emphasis coefficient applied to the waveform
        :param sample_rate: sampling rate of the input signal in Hz
        :param n_fft: FFT size
        :param f_min: lowest frequency of the Mel filter bank in Hz
        :param f_max: highest frequency of the Mel filter bank in Hz
        :param win_length: analysis window length in samples
        :param window_fn: window function used for the STFT
        :param hop_length: hop between successive frames in samples
        :param power: exponent of the magnitude spectrogram
        :param n_mels: number of Mel filters
        """
        super(MelSpecFrontEnd, self).__init__()
        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        # Original code assigned self.window_fn twice; kept a single assignment.
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        self.melkwargs = {"n_fft": self.n_fft,
                          "f_min": self.f_min,
                          "f_max": self.f_max,
                          "win_length": self.win_length,
                          "window_fn": self.window_fn,
                          "hop_length": self.hop_length,
                          "power": self.power,
                          "n_mels": self.n_mels}

        self.MelSpec = torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate,
                                                            n_fft=self.melkwargs['n_fft'],
                                                            f_min=self.melkwargs['f_min'],
                                                            f_max=self.melkwargs['f_max'],
                                                            win_length=self.melkwargs['win_length'],
                                                            hop_length=self.melkwargs['hop_length'],
                                                            window_fn=self.melkwargs['window_fn'],
                                                            power=self.melkwargs['power'],
                                                            n_mels=self.melkwargs['n_mels'])

        # Mean/variance normalization along the time axis
        self.CMVN = torch.nn.InstanceNorm1d(self.n_mels)

    def forward(self, x):
        """Extract normalized log-Mel-spectrogram from a batch of waveforms.

        :param x: input waveform tensor
        :return: CMVN-normalized log-Mel-spectrogram tensor
        """
        with torch.no_grad():
            # Feature extraction stays in full precision even under AMP
            with torch.cuda.amp.autocast(enabled=False):
                out = self.PreEmphasis(x)
                # epsilon avoids log(0) on silent frames
                out = self.MelSpec(out) + 1e-6
                out = torch.log(out)
                out = self.CMVN(out)
        return out
class RawPreprocessor(torch.nn.Module):
    """Front-end that turns a raw waveform into band-filtered activations.

    The input is layer-normalized, filtered by a bank of Sinc filters,
    rectified and max-pooled, batch-normalized, then passed through a
    LeakyReLU with Keras' default slope (0.3).
    """
    def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
        """
        :param nb_samp: number of samples of the input waveform
        :param in_channels: number of input channels
        :param out_channels: number of Sinc filters in the bank
        :param kernel_size: length of each Sinc filter
        :param stride: convolution stride
        :param padding: convolution padding
        :param dilation: convolution dilation
        :param bias: whether the convolution uses a bias term
        :param groups: number of convolution groups
        :param min_low_hz: lower bound of the filters' low cut-off frequency
        :param min_band_hz: minimum bandwidth of the filters
        :param sample_rate: sampling rate of the input signal in Hz
        """
        super(RawPreprocessor, self).__init__()
        self.ln = LayerNorm(nb_samp)
        self.first_conv = SincConv1d(in_channels=in_channels,
                                     out_channels=out_channels,
                                     kernel_size=kernel_size,
                                     sample_rate=sample_rate,
                                     stride=stride,
                                     padding=padding,
                                     dilation=dilation,
                                     bias=bias,
                                     groups=groups,
                                     min_low_hz=min_low_hz,
                                     min_band_hz=min_band_hz)
        self.first_bn = torch.nn.BatchNorm1d(num_features=out_channels)
        self.lrelu = torch.nn.LeakyReLU()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)

    def forward(self, x):
        """Run the Sinc front-end on a batch of raw waveforms.

        :param x: tensor of shape (batch, samples)
        :return: batch-normalized, rectified filter-bank activations
        """
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        # Normalize, then add a singleton channel dimension for conv1d
        normed = self.ln(x).view(batch_size, 1, seq_len)
        # Rectify the filter outputs and downsample by max-pooling (factor 3)
        pooled = torch.nn.functional.max_pool1d(torch.abs(self.first_conv(normed)), 3)
        return self.lrelu_keras(self.first_bn(pooled))
......@@ -193,53 +193,6 @@ class LayerNorm(torch.nn.Module):
return self.gamma * (x - mean) / (std + self.eps) + self.beta
class RawPreprocessor(torch.nn.Module):
    """Front-end that turns a raw waveform into band-filtered activations:
    layer normalization, Sinc filter bank, rectification + max-pooling,
    batch normalization and a LeakyReLU with Keras' default slope (0.3).
    """
    def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
        """
        :param nb_samp: number of samples of the input waveform
        :param in_channels: number of input channels
        :param out_channels: number of Sinc filters in the bank
        :param kernel_size: length of each Sinc filter
        :param stride: convolution stride
        :param padding: convolution padding
        :param dilation: convolution dilation
        :param bias: whether the convolution uses a bias term
        :param groups: number of convolution groups
        :param min_low_hz: lower bound of the filters' low cut-off frequency
        :param min_band_hz: minimum bandwidth of the filters
        :param sample_rate: sampling rate of the input signal in Hz
        """
        super(RawPreprocessor, self).__init__()
        self.ln = LayerNorm(nb_samp)
        self.first_conv = SincConv1d(in_channels = in_channels,
                                     out_channels = out_channels,
                                     kernel_size = kernel_size,
                                     sample_rate = sample_rate,
                                     stride=stride,
                                     padding=padding,
                                     dilation=dilation,
                                     bias=bias,
                                     groups=groups,
                                     min_low_hz=min_low_hz,
                                     min_band_hz=min_band_hz
                                     )
        self.first_bn = torch.nn.BatchNorm1d(num_features = out_channels)
        # NOTE(review): self.lrelu is defined but not used in forward() — confirm
        # whether it is kept for checkpoint compatibility.
        self.lrelu = torch.nn.LeakyReLU()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope = 0.3)

    def forward(self, x):
        """Run the Sinc front-end on a batch of raw waveforms.

        :param x: tensor of shape (batch, samples)
        :return: batch-normalized, rectified filter-bank activations
        """
        nb_samp = x.shape[0]
        len_seq = x.shape[1]
        # Normalize, then add a singleton channel dimension for conv1d
        out = self.ln(x)
        out = out.view(nb_samp, 1, len_seq)
        # Rectify the filter outputs and downsample by max-pooling (factor 3)
        out = torch.nn.functional.max_pool1d(torch.abs(self.first_conv(out)), 3)
        out = self.first_bn(out)
        out = self.lrelu_keras(out)
        return out
class ResBlock(torch.nn.Module):
"""
......
......@@ -416,7 +416,7 @@ class SincNet(torch.nn.Module):
#return output.transpose(1, 2)
return output
def dimension():
def dimension(self):
doc = "Output features dimension."
def fget(self):
......
This diff is collapsed.
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment