Commit 51fa2e4c authored by Anthony Larcher

noise augmentation

parent 67ddbd8e
@@ -473,6 +473,16 @@ def data_augmentation(speech,
            ["rate", "16000"],
        ])
if "filtering" in augmentations:
effects = [
["bandpass","2000","3500"],
["bandstop","200","500"]]
speech,sample_rate = torchaudio.sox_eefects.apply_effects_tensor(
speech,
sample_rate,
effects = [effects[random.randint(0,1)]],
)
if "stretch" in augmentations: if "stretch" in augmentations:
strech = torchaudio.functional.TimeStretch() strech = torchaudio.functional.TimeStretch()
rate = random.uniform(0.8,1.2) rate = random.uniform(0.8,1.2)
@@ -480,14 +490,14 @@ def data_augmentation(speech,
if "add_reverb" in augmentations: if "add_reverb" in augmentations:
rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id
rir_fn = transform_dict["add_reverb"]["data_path"] + "/" + rir_nfo + ".wav" rir_fn = transform_dict["add_reverb"]["data_path"] + rir_nfo # TODO harmonize with noise
rir, rir_fs = torchaudio.load(rir_fn) rir, rir_fs = torchaudio.load(rir_fn)
speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]]) speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]])
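For context: scipy.signal.convolve with mode='full' returns speech_len + rir_len - 1 samples, which is why the result is sliced back to the input length. A self-contained toy example with a synthetic two-tap RIR (not from this repository):

import torch
from scipy import signal

speech = torch.randn(1, 16000)                   # one second of audio at 16 kHz
rir = torch.zeros(1, 801)                        # toy RIR: direct path plus one echo
rir[0, 0] = 1.0
rir[0, 800] = 0.5
wet = signal.convolve(speech, rir, mode='full')  # shape (1, 16000 + 801 - 1)
wet = torch.tensor(wet[:, :speech.shape[1]])     # truncate back to the input length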
if "add_noise" in augmentations: if "add_noise" in augmentations:
# Pick a noise type # Pick a noise type
noise = torch.zeros_like(speech) noise = torch.zeros_like(speech)
noise_idx = random.randrange(3) noise_idx = random.randrange(4)
# speech # speech
if noise_idx == 0: if noise_idx == 0:
@@ -510,6 +520,17 @@ def data_augmentation(speech,
            snr_db = random.randint(0, 15)
            noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
        # babble noise with different volume
        elif noise_idx == 3:
            snr_db = random.randint(13, 20)
            ns = random.randint(5, 10)  # Randomly select 5 to 10 speakers
            noise_fn = [transform_dict["add_noise"]["data_path"] + "/" + file_id + ".wav"
                        for file_id in noise_df[noise_df["type"] == "speech"].sample(ns, replace=False)["file_id"].values]
            noise = torch.zeros(1, speech.shape[1])
            for idx in range(ns):
                noise_, noise_fs = torchaudio.load(noise_fn[idx], frame_offset=0, num_frames=speech.shape[1])
                transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')  # Randomly select volume level (5-15 dB)
                noise += transform(noise_)
            noise /= ns
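torchaudio.transforms.Vol with gain_type='db' scales the waveform by 10 ** (gain / 20) and clamps the result to [-1, 1]; a minimal sanity check of the per-speaker gain used above:

import torch
import torchaudio

x = 0.1 * torch.ones(1, 4)
y = torchaudio.transforms.Vol(gain=6.0, gain_type='db')(x)
print(y[0, 0].item())  # ~0.1995, i.e. 0.1 * 10 ** (6 / 20)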
        speech_power = speech.norm(p=2)
        noise_power = noise.norm(p=2)
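The lines that actually apply the SNR fall in the elided part of the next hunk; a minimal sketch of the usual energy-ratio mixing, assuming it follows the standard torchaudio recipe (snr_db, speech_power and noise_power as above):

        snr = 10 ** (snr_db / 20)
        scale = snr * noise_power / speech_power
        speech = (scale * speech + noise) / 2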
@@ -529,7 +550,16 @@ def data_augmentation(speech,
    return speech
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
"""
:param noise_row:
:param speech_shape:
:param sample_rate:
:param data_path:
:return:
"""
    noise_start = noise_row['start']
    noise_duration = noise_row['duration']
    noise_file_id = noise_row['file_id']
@@ -547,56 +577,6 @@ def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
    else:
        noise_seg, _ = torchaudio.load(noise_fn, frame_offset=int(frame_offset), num_frames=int(noise_duration * sample_rate))
    #if numpy.random.randint(0, 2) == 1:
    #    noise = torch.flip(noise, dims=[0, 1])
    if noise_seg.shape[1] < speech_shape[1]:
        noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape))
    return noise_seg
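A hypothetical usage sketch; the row layout ('file_id', 'start', 'duration') is inferred from the fields accessed above, and the file id and data path are made up for illustration:

import pandas
import torch

noise_row = pandas.Series({"file_id": "free-sound/noise-0001", "start": 0.0, "duration": 2.5})
speech = torch.zeros(1, 16000 * 4)  # four seconds of silence as a stand-in
noise_seg = load_noise_seg(noise_row, speech.shape, 16000, "/data/musan")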
"""
It might not be 100% on topic, but maybe this is interesting for you anyway. If you do not need to do real-time processing, things can be made easier. Limiting and dynamic compression can be seen as applying a dynamic transfer function, which simply maps input values to output values: a linear function returns the original audio, while a "curved" function applies compression or expansion. Applying a transfer function is as simple as:
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile
def apply_transfer(signal, transfer, interpolation='linear'):
    constant = np.linspace(-1, 1, len(transfer))
    interpolator = interp1d(constant, transfer, interpolation)
    return interpolator(signal)
Limiting or compression then is just a case of choosing a different transfer function:
# hard limiting
def limiter(x, threshold=0.8):
    transfer_len = 1000
    transfer = np.concatenate([ np.repeat(-1, int(((1 - threshold) / 2) * transfer_len)),
                                np.linspace(-1, 1, int(threshold * transfer_len)),
                                np.repeat(1, int(((1 - threshold) / 2) * transfer_len)) ])
    return apply_transfer(x, transfer)
# smooth compression: if the factor is small it is nearly linear, the bigger
# the factor the stronger the compression
def arctan_compressor(x, factor=2):
    constant = np.linspace(-1, 1, 1000)
    transfer = np.arctan(factor * constant)
    transfer /= np.abs(transfer).max()
    return apply_transfer(x, transfer)
This example assumes 16-bit mono wav files as input:
sr, x = wavfile.read("input.wav")
x = x / np.abs(x).max()  # scale x between -1 and 1
x2 = limiter(x)
x2 = np.int16(x2 * 32767)
wavfile.write("output_limit.wav", sr, x2)
x3 = arctan_compressor(x)
x3 = np.int16(x3 * 32767)
wavfile.write("output_comp.wav", sr, x3)
"""
@@ -241,8 +241,11 @@ class SideSet(Dataset):
        self.rir_df = None
        if "add_reverb" in self.transform:
            # load the RIR database and keep only the simulated RIRs
            tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
            tmp_rir_df = tmp_rir_df.loc[tmp_rir_df["type"] == "simulated_rirs"]
            self.rir_df = tmp_rir_df.set_index(tmp_rir_df.type)
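A minimal sketch of the kind of rir_db_csv this expects; the column names are assumed from the code above and are not confirmed by this diff:

import io
import pandas

csv = io.StringIO("file_id,type\nsim/rir-0001,simulated_rirs\nreal/rir-0002,real_rirs\n")
tmp_rir_df = pandas.read_csv(csv)
tmp_rir_df = tmp_rir_df.loc[tmp_rir_df["type"] == "simulated_rirs"]
rir_df = tmp_rir_df.set_index(tmp_rir_df.type)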
    def __getitem__(self, index):
        """