Commit 4497530e authored by Anthony Larcher's avatar Anthony Larcher

addnoise and addreverb

parent eaf0efbf
......@@ -27,7 +27,9 @@ Copyright 2014-2020 Anthony Larcher and Sylvain Meignier
:mod:`nnet` provides methods to manage Neural Networks using PyTorch
"""
#from sidekit.nnet.sad_rnn import SAD_RNN
from .augmentation import AddNoise
from .augmentation import AddReverb
from .feed_forward import FForwardNetwork
from .feed_forward import kaldi_to_hdf5
from .xsets import XvectorMultiDataset, XvectorDataset, StatDataset
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU LLesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher
"""
import collections
import numpy
import pandas
import random
import soundfile
import threading
import pyroomacoustics
Noise = collections.namedtuple('Noise', 'type file duration')
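# The noise database CSV consumed by AddNoise below is expected to provide
# "type", "file_id" and "duration" columns (see AddNoise.__init__). A minimal
# illustrative layout, with hypothetical file identifiers, could be:
#
#     type,file_id,duration
#     noise,free-sound/noise-free-sound-0000,20.0
#     music,fma-western-art/music-fma-wa-0000,15.5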
def normalize(wav):
    """
    Normalize a waveform to unit root-mean-square energy.
    :param wav: input waveform as a numpy array
    :return: the RMS-normalized waveform
    """
    return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
def crop(signal, duration):
    """
    Extract a chunk of a given length from a random position in a signal.
    :param signal: input signal as a numpy array
    :param duration: length of the chunk to extract, in samples
    :return: the extracted chunk
    """
    start = random.randint(0, signal.shape[0] - duration)
    chunk = signal[start: start + duration]
    return chunk
class AddNoise(object):
    """
    Add noise to an audio signal at a signal-to-noise ratio drawn uniformly
    at random between snr_min and snr_max. Noise files are picked at random
    from a database described in a CSV file.
    """
    def __init__(self, noise_db_csv, snr_min, snr_max, noise_root_path):
        """
        :param noise_db_csv: name of a CSV file listing the noise files (columns: type, file_id, duration)
        :param snr_min: minimum signal-to-noise ratio in dB
        :param snr_max: maximum signal-to-noise ratio in dB
        :param noise_root_path: root directory where the noise WAV files are stored
        """
        self.snr_min = snr_min
        self.snr_max = snr_max
        self.noise_root_path = noise_root_path
        df = pandas.read_csv(noise_db_csv)
        self.noises = []
        for index, row in df.iterrows():
            self.noises.append(Noise(type=row["type"], file=row["file_id"], duration=row["duration"]))
    def __call__(self, original, sample_rate):
        """
        Add a randomly selected noise to the original signal at a random SNR.
        :param original: original waveform as a numpy array
        :param sample_rate: sample rate of the original waveform
        :return: the noisy waveform, same length as the original
        """
        # accumulate enough noise to cover the duration of the original waveform
        noises = []
        left = original.shape[0]
        while left > 0:
            # select a noise file at random and load it
            file = random.choice(self.noises)
            noise_signal, fs = soundfile.read(self.noise_root_path + "/" + file.file + ".wav")
            # ToDo resample the noise if fs differs from sample_rate
            # if the noise file is longer than what is needed, crop it
            if noise_signal.shape[0] > left:
                noise = crop(noise_signal, left)
                left = 0
            # otherwise, take the whole file
            else:
                noise = noise_signal
                left -= noise_signal.shape[0]
            noise = normalize(noise)
            noises.append(noise)
        # concatenate the noise chunks to match the original duration
        noise = numpy.concatenate(noises)
        # select an SNR at random and scale the noise accordingly (alpha = 10 ** (-snr / 20))
        snr = (self.snr_max - self.snr_min) * numpy.random.random_sample() + self.snr_min
        alpha = numpy.exp(-numpy.log(10) * snr / 20)
        return normalize(original) + alpha * noise
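# Example usage of AddNoise (illustrative sketch only; the CSV path, noise root
# directory and utterance file below are hypothetical placeholders):
#
#     add_noise = AddNoise(noise_db_csv="musan_noise.csv",
#                          snr_min=5.0,
#                          snr_max=15.0,
#                          noise_root_path="/data/musan/noise")
#     signal, sample_rate = soundfile.read("utterance.wav")
#     noisy = add_noise(signal, sample_rate)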
class AddReverb(object):
"""Simulate indoor reverberation
Parameters
----------
depth : (float, float), optional
Minimum and maximum values for room depth (in meters).
Defaults to (2.0, 10.0).
width : (float, float), optional
Minimum and maximum values for room width (in meters).
Defaults to (1.0, 10.0).
height : (float, float), optional
        Minimum and maximum values for room height (in meters).
Defaults to (2.0, 5.0).
absorption : (float, float), optional
Minimum and maximum values of walls absorption coefficient.
Defaults to (0.2, 0.9).
    noise : callable, optional
        Callable that takes a number of samples and a sample rate and returns
        a noise waveform to play at the second source. Defaults to None.
snr : (float, float), optional
Minimum and maximum values of signal-to-noise ratio.
Defaults to (5.0, 15.0)
"""
def __init__(
self,
depth=(2.0, 10.0),
width=(1.0, 10.0),
height=(2.0, 5.0),
absorption=(0.2, 0.9),
noise=None,
snr=(5.0, 15.0)
):
super().__init__()
self.depth = depth
self.width = width
self.height = height
self.absorption = absorption
self.max_order_ = 17
self.noise = noise
self.snr = snr
self.noise_ = noise
self.n_rooms_ = 128
self.new_rooms_prob_ = 0.001
self.main_lock_ = threading.Lock()
self.rooms_ = collections.deque(maxlen=self.n_rooms_)
self.room_lock_ = [threading.Lock() for _ in range(self.n_rooms_)]
    @staticmethod
    def random(m, M):
        """
        Draw a value uniformly at random between m and M.
        :param m: minimum value
        :param M: maximum value
        :return: a random float in the interval [m, M)
        """
        return (M - m) * numpy.random.random_sample() + m
    def new_room(self, sample_rate: int):
        """
        Generate a random shoebox room with two sources and one microphone
        and pre-compute its room impulse responses.
        :param sample_rate: sample rate of the signals played in the room
        :return: a pyroomacoustics.ShoeBox object
        """
# generate a room at random
depth = self.random(*self.depth)
width = self.random(*self.width)
height = self.random(*self.height)
absorption = self.random(*self.absorption)
room = pyroomacoustics.ShoeBox(
[depth, width, height],
fs=sample_rate,
absorption=absorption,
max_order=self.max_order_,
)
# play the original audio chunk at a random location
original = [
self.random(0, depth),
self.random(0, width),
self.random(0, height),
]
room.add_source(original)
# play the noise audio chunk at a random location
noise = [self.random(0, depth), self.random(0, width), self.random(0, height)]
room.add_source(noise)
# place the microphone at a random location
microphone = [
self.random(0, depth),
self.random(0, width),
self.random(0, height),
]
room.add_microphone_array(
pyroomacoustics.MicrophoneArray(numpy.c_[microphone, microphone], sample_rate)
)
room.compute_rir()
return room
    def __call__(self, original: numpy.ndarray, sample_rate):
        """
        Add reverberation (and optionally noise) to the original signal by
        playing it in a randomly selected simulated room.
        :param original: original waveform as a numpy array
        :param sample_rate: sample rate of the original waveform
        :return: the reverberated signal, cropped to the original length
        """
        with self.main_lock_:
# initialize rooms (with 2 sources and 1 microphone)
while len(self.rooms_) < self.n_rooms_:
room = self.new_room(sample_rate)
self.rooms_.append(room)
# create new room with probability new_rooms_prob_
if numpy.random.rand() > 1.0 - self.new_rooms_prob_:
room = self.new_room(sample_rate)
self.rooms_.append(room)
# choose one room at random
index = numpy.random.choice(self.n_rooms_)
# lock chosen room to ensure room.sources are not updated concurrently
with self.room_lock_[index]:
room = self.rooms_[index]
# play normalized original audio chunk at source #1
n_samples = len(original)
original = normalize(original).squeeze()
room.sources[0].add_signal(original)
            # generate noise with a random SNR; if no noise generator was
            # provided, play silence at the second source
            if self.noise_ is not None:
                noise = self.noise_(n_samples, sample_rate).squeeze()
                snr = self.random(*self.snr)
                alpha = numpy.exp(-numpy.log(10) * snr / 20)
                noise = alpha * noise
            else:
                noise = numpy.zeros(n_samples)
            # play noise at source #2
            room.sources[1].add_signal(noise)
# simulate room and return microphone signal
room.simulate()
return room.mic_array.signals[0, :n_samples, numpy.newaxis]
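# Example usage of AddReverb (illustrative sketch only; the noise generator is a
# hypothetical callable taking a number of samples and a sample rate, since the
# class leaves `noise` as None by default and then plays silence at source #2):
#
#     add_reverb = AddReverb(noise=lambda n, sr: numpy.random.randn(n),
#                            snr=(5.0, 15.0))
#     signal, sample_rate = soundfile.read("utterance.wav")
#     reverberated = add_reverb(signal, sample_rate)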
......@@ -37,6 +37,8 @@ import torch
import soundfile
import yaml
from .augmentation import AddNoise
from .augmentation import AddReverb
from ..bosaris.idmap import IdMap
from ..frontend.vad import pre_emphasis
from ..frontend.features import trfbank
......@@ -226,6 +228,7 @@ class CMVN(object):
return data, sample[1], sample[2], sample[3]
class FrequencyMask(object):
"""Crop randomly the image in a sample.
......@@ -335,7 +338,19 @@ class SideSet(Dataset):
set_type="train",
chunk_per_segment=1,
overlap=0.,
dataset_df=None):
dataset_df=None,
noise_db_csv=None,
noise_root_db=None,
noisy_file_ratio=0.0,
noise_snr=(5.0, 15.0),
reverb_ratio=0.0,
                 reverb_depth=(2.0, 10.0),
                 reverb_width=(1.0, 10.0),
                 reverb_height=(2.0, 5.0),
                 reverb_absorption=(0.2, 0.9),
                 reverb_noise=None,
                 reverb_snr=(5.0, 15.0)
):
"""
:param dataset_yaml: name of the YAML file describing the dataset
......@@ -445,6 +460,18 @@ class SideSet(Dataset):
for t in trans:
if 'PreEmphasis' in t:
_transform.append(PreEmphasis())
if 'AddNoise' in t:
_transform.append(AddNoise(noise_db_csv=noise_db_csv,
snr_min=noise_snr[0],
snr_max=noise_snr[1],
noise_root_path=noise_root_db))
if 'AddReverb' in t:
                    _transform.append(AddReverb(noise=reverb_noise,
depth=reverb_depth,
width=reverb_width,
height=reverb_height,
absorption=reverb_absorption,
snr=reverb_snr))
if 'MFCC' in t:
_transform.append(MFCC())
if "CMVN" in t:
......
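For reference, a minimal sketch of how the new augmentation options could be passed when building a training SideSet, assuming a dataset YAML whose transformation pipeline lists 'AddNoise' and 'AddReverb'; every path and value below is a hypothetical placeholder rather than part of this commit:

    side_set = SideSet(dataset_yaml="dataset.yaml",        # hypothetical YAML describing the dataset
                       set_type="train",
                       noise_db_csv="musan_noise.csv",     # hypothetical CSV with type, file_id, duration columns
                       noise_root_db="/data/musan/noise",  # hypothetical root directory of the noise WAV files
                       noise_snr=(5.0, 15.0),
                       reverb_absorption=(0.2, 0.9),
                       reverb_snr=(5.0, 15.0))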