Commit 2e5aa5e9 authored by Anthony Larcher's avatar Anthony Larcher
Browse files

add noise and reverb

parent 7f3e22ac
......@@ -57,69 +57,84 @@ def crop(signal, duration):
return chunk
if 'AddReverb' in t:
_transform.append(AddReverb(ratio=dataset["train"]["augmentation"]["reverb_ratio"],
depth=dataset["train"]["augmentation"]["reverb_depth"],
width=dataset["train"]["augmentation"]["reverb_width"],
height=dataset["train"]["augmentation"]["reverb_height"],
absorption=dataset["train"]["augmentation"]["reverb_absorption"],
snr=dataset["train"]["augmentation"]["reverb_snr"]))
class AddNoise(object):
    """Augmentation transform that mixes additive noise into a waveform.

    Noise segments are drawn from a CSV-described noise database and mixed at
    a random SNR drawn uniformly from [snr_min, snr_max].
    Samples are 6-tuples (data, label, spec_aug, temp_aug, add_noise, add_reverb);
    noise is only added when the add_noise flag (sample[4]) is truthy.
    """
    def __init__(self, dataset_length, noisy_file_ratio, noise_db_csv, snr_min_max, noise_root_path, sample_rate=16000):
        """
        :param dataset_length: kept for interface compatibility (unused here)
        :param noisy_file_ratio: kept for interface compatibility (unused here;
            the caller shuffles the per-sample add_noise flags itself)
        :param noise_db_csv: CSV with one row per noise file
            (columns: type, file_id, duration)
        :param snr_min_max: (min, max) SNR bounds in dB
        :param noise_root_path: root directory containing the noise wav files
        :param sample_rate: sample rate of the input waveforms (default 16000)
        """
        self.snr_min = snr_min_max[0]
        self.snr_max = snr_min_max[1]
        self.noise_root_path = noise_root_path
        self.sample_rate = sample_rate
        df = pandas.read_csv(noise_db_csv)
        self.noises = []
        for index, row in df.iterrows():
            self.noises.append(Noise(type=row["type"], file=row["file_id"], duration=row["duration"]))

    def __call__(self, sample):
        """Add noise to sample[0] when sample[4] is set; pass through otherwise.

        :param sample: tuple (data, label, spec_aug, temp_aug, add_noise, add_reverb)
        :return: same 6-tuple with possibly noisy data in first position
        """
        data = sample[0]
        if sample[4]:
            original_duration = len(data) / self.sample_rate
            # accumulate enough noise to cover the duration of the original waveform
            noises = []
            left = original_duration
            while left > 0:
                # select a noise file at random
                file = random.choice(self.noises)
                # NOTE(review): the Noise record is constructed with keyword
                # `file=`, yet is read back here as `file.file_id` — one of the
                # two is wrong; confirm against the Noise definition.
                noise_signal, fs = soundfile.read(self.noise_root_path + "/" + file.file_id + ".wav")
                duration = noise_signal.shape[0] / fs
                if duration > left:
                    # crop to the REMAINING needed duration (`left`), not to the
                    # file's own duration (which would be a no-op)
                    # NOTE(review): assumes crop() takes a duration in seconds,
                    # like the values handled here — confirm against crop().
                    noise = crop(noise_signal, left)
                    left = 0
                else:
                    # otherwise take the whole file
                    noise = noise_signal
                    left -= duration
                # TODO: resample if fs != self.sample_rate
                noise = normalize(noise)
                noises.append(noise)
            # concatenate along time: chunks are 1-D signals of different
            # lengths, so vstack would raise; concatenate is the intended op
            noise = numpy.concatenate(noises)
            # select SNR at random and derive the mixing gain
            snr = (self.snr_max - self.snr_min) * numpy.random.random_sample() + self.snr_min
            alpha = numpy.exp(-numpy.log(10) * snr / 20)
            # trim noise to the data length before mixing: rounding and
            # sample-rate mismatches can make the accumulated noise longer
            data = normalize(data) + alpha * noise[:len(data)]
        return data, sample[1], sample[2], sample[3], sample[4], sample[5]
class AddReverb(object):
......@@ -229,42 +244,48 @@ class AddReverb(object):
return room
def __call__(self, sample):
    """Convolve sample[0] with a simulated room response when sample[5] is set.

    Plays the normalized chunk at source #1 and scaled noise at source #2 of a
    randomly chosen pyroomacoustics room, simulates, and returns the
    microphone signal.

    :param sample: tuple (data, label, spec_aug, temp_aug, add_noise, add_reverb)
    :return: same 6-tuple with possibly reverberated data in first position
    """
    data = sample[0]
    if sample[5]:
        with self.main_lock_:
            # initialize rooms (with 2 sources and 1 microphone)
            while len(self.rooms_) < self.n_rooms_:
                room = self.new_room(self.sample_rate)
                self.rooms_.append(room)

            # create new room with probability new_rooms_prob_
            if numpy.random.rand() > 1.0 - self.new_rooms_prob_:
                room = self.new_room(self.sample_rate)
                self.rooms_.append(room)

            # choose one room at random
            index = numpy.random.choice(self.n_rooms_)

        # lock chosen room to ensure room.sources are not updated concurrently
        with self.room_lock_[index]:
            room = self.rooms_[index]

            # play normalized original audio chunk at source #1
            n_samples = len(data)
            # fix: normalize the current data — the previous signature's
            # `original` no longer exists here and raised a NameError
            data = normalize(data).squeeze()
            room.sources[0].add_signal(data)

            # generate noise with random SNR
            noise = self.noise_(n_samples, self.sample_rate).squeeze()
            snr = self.random(*self.snr)
            alpha = numpy.exp(-numpy.log(10) * snr / 20)
            noise *= alpha

            # play noise at source #2
            room.sources[1].add_signal(noise)

            # simulate room and keep the microphone signal as a column vector
            room.simulate()
            data = room.mic_array.signals[0, :n_samples, numpy.newaxis]

    return data, sample[1], sample[2], sample[3], sample[4], sample[5]
......@@ -208,7 +208,7 @@ class PreEmphasis(object):
def __call__(self, sample):
data = numpy.asarray(sample[0][1:] - 0.97 * sample[0][:-1], dtype=numpy.float32)
return data, sample[1]
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
class CMVN(object):
......@@ -225,8 +225,7 @@ class CMVN(object):
m = sample[0].mean(axis=0)
s = sample[0].std(axis=0)
data = (sample[0] - m) / s
return data, sample[1], sample[2], sample[3]
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
class FrequencyMask(object):
......@@ -246,7 +245,7 @@ class FrequencyMask(object):
size = numpy.random.randint(1, self.max_size)
f0 = numpy.random.randint(0, self.feature_size - self.max_size)
data[f0:f0+size, :] = 10.
return data, sample[1], sample[2], sample[3]
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
class TemporalMask(object):
......@@ -265,7 +264,7 @@ class TemporalMask(object):
size = numpy.random.randint(1, self.max_size)
t0 = numpy.random.randint(0, sample[0].shape[1] - self.max_size)
data[:, t0:t0+size] = 10.
return data, sample[1], sample[2], sample[3]
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
class MFCC(object):
......@@ -327,7 +326,7 @@ class MFCC(object):
# The C0 term is removed as it is the constant term
mfcc = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:self.nceps + 1]
return mfcc.T, sample[1], sample[2], sample[3]
return mfcc.T, sample[1], sample[2], sample[3], sample[4], sample[5]
......@@ -338,18 +337,7 @@ class SideSet(Dataset):
set_type="train",
chunk_per_segment=1,
overlap=0.,
dataset_df=None,
noise_db_csv=None,
noise_root_db=None,
noisy_file_ratio=0.0,
noise_snr=(5.0, 15.0),
reverb_ratio=0.0,
reverb_depth = (2.0, 10.0),
reverb_width = (1.0, 10.0),
reverb_height = (2.0, 5.0),
reverb_absorption = (0.2, 0.9),
reverb_noise = None,
reverb_snr = (5.0, 15.0)
dataset_df=None
):
"""
......@@ -364,15 +352,15 @@ class SideSet(Dataset):
self.data_path = dataset["data_root_directory"]
self.sample_rate = int(dataset["sample_rate"])
self.data_file_extension = dataset["data_file_extension"]
self.transformation = ''
if set_type == "train":
self.duration = dataset["train"]["duration"]
self.transform_pipeline = dataset["train"]["transformation"]["pipeline"]
self.augmentation = dataset["train"]["augmentation"]
self.transformation = dataset["train"]["transformation"]
else:
self.duration = dataset["eval"]["duration"]
self.transform_pipeline = dataset["eval"]["transformation"]["pipeline"]
self.augmentation = dataset["eval"]["augmentation"]
self.transformation = dataset["eval"]["transformation"]
self.sample_number = int(self.duration * self.sample_rate)
......@@ -412,8 +400,8 @@ class SideSet(Dataset):
else:
chunk_nb = min(len(possible_starts), chunk_per_segment)
starts = numpy.random.permutation(possible_starts)[:chunk_nb] / self.sample_rate
# Once we know how many segments are selected, create the other fieds to fill the dataframe
# Once we know how many segments are selected, create the other fields to fill the DataFrame
for ii in range(chunk_nb):
df_dict["database"].append(df.iloc[idx].database)
df_dict["speaker_id"].append(df.iloc[idx].speaker_id)
......@@ -424,65 +412,62 @@ class SideSet(Dataset):
df_dict["gender"].append(df.iloc[idx].gender)
self.sessions = pandas.DataFrame.from_dict(df_dict)
self.len = len(self.sessions)
# OPTIONAL: Augment the list of segments by splitting or processing with a sliding window
# return a new pandas.DataFrame
# Get length of the dataset
self.spec_aug = numpy.zeros(self.len, dtype=bool)
self.temp_aug = numpy.zeros(self.len, dtype=bool)
if self.augmentation is not None:
if "spec_aug" in dataset["train"]["augmentation"]:
# Setup temporal and spectral augmentation if any
tmp = numpy.zeros(self.len, dtype=bool)
tmp[:int(self.len * dataset["train"]["augmentation"]["spec_aug"])] = 1
numpy.random.shuffle(tmp)
self.spec_aug = tmp
if "temp_aug" in dataset["train"]["augmentation"]:
tmp2 = numpy.zeros(self.len, dtype=bool)
tmp2[:int(self.len * dataset["train"]["augmentation"]["temp_aug"])] = 1
numpy.random.shuffle(tmp2)
self.temp_aug = tmp2
# Load CSV from the noise dataset
# Select noise files which are long enough
_transform = []
if not self.transformation["pipeline"] == '':
trans = self.transformation["pipeline"].split(',')
# Create data frame with noise info (file_id, start, SNR) aligned with the
# dataset dataframe
self.add_noise = numpy.zeros(self.len, dtype=bool)
self.add_reverb = numpy.zeros(self.len, dtype=bool)
self.spec_aug = numpy.zeros(self.len, dtype=bool)
self.temp_aug = numpy.zeros(self.len, dtype=bool)
_transform = []
if not self.transform_pipeline == '':
trans = self.transform_pipeline.split(',')
for t in trans:
if 'PreEmphasis' in t:
_transform.append(PreEmphasis())
if 'AddNoise' in t:
_transform.append(AddNoise(noise_db_csv=noise_db_csv,
snr_min=noise_snr[0],
snr_max=noise_snr[1],
noise_root_path=noise_root_db))
if 'AddReverb' in t:
_transform.append(AddReverb(ratio=reverb_ratio,
depth=reverb_depth,
width=reverb_width,
height=reverb_height,
absorption=reverb_absorption,
snr=reverb_snr))
if 'add_noise' in t:
self.add_noise[:int(self.len * self.transformation["noisy_file_ratio"])] = 1
numpy.random.shuffle(self.add_noise)
_transform.append(AddNoise(noisy_file_ratio=self.transformation["noise_file_ratio"],
noise_db_csv=self.transformation["noise_db_csv"],
snr_min_max=self.transformation["noise_snr"],
noise_root_path=self.transformation["noise_root_db"]))
if 'add_reverb' in t:
self.add_reverb[:int(self.len * self.transformation["reverb_file_ratio"])] = 1
numpy.random.shuffle(self.add_reverb)
_transform.append(AddReverb(ratio=self.transformation["reverb_ratio"],
depth=self.transformation["reverb_depth"],
width=self.transformation["reverb_width"],
height=self.transformation["reverb_height"],
absorption=self.transformation["reverb_absorption"],
snr=self.transformation["reverb_snr"]))
if 'MFCC' in t:
_transform.append(MFCC())
if "CMVN" in t:
_transform.append(CMVN())
if "FrequencyMask" in t:
# Setup temporal and spectral augmentation if any
self.spec_aug[:int(self.len * self.transformation["spec_aug"])] = 1
numpy.random.shuffle(self.spec_aug)
a = int(t.split('-')[0].split('(')[1])
b = int(t.split('-')[1].split(')')[0])
_transform.append(FrequencyMask(a, b))
if "TemporalMask" in t:
self.temp_aug[:int(self.len * self.transformation["temp_aug"])] = 1
numpy.random.shuffle(self.temp_aug)
a = int(t.split("(")[1].split(")")[0])
_transform.append(TemporalMask(a))
self.transforms = transforms.Compose(_transform)
......@@ -499,10 +484,14 @@ class SideSet(Dataset):
speaker_idx = self.sessions.iloc[index]["speaker_idx"]
# TODO: add data augmentation here!
if self.transform_pipeline:
sig, speaker_idx, _, __ = self.transforms((sig, speaker_idx, self.spec_aug[index], self.temp_aug[index]))
sig, speaker_idx, _, __ = self.transforms((sig,
speaker_idx,
self.spec_aug[index],
self.temp_aug[index],
self.add_noise[index],
self.add_reverb[index],
))
return torch.from_numpy(sig).type(torch.FloatTensor), speaker_idx
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment