Commit add6044d authored by Anthony Larcher's avatar Anthony Larcher
Browse files

New noise from Gael

parent cf1afe6c
......@@ -486,42 +486,31 @@ def data_augmentation(speech, sample_rate, transform_dict, transform_number, noi
# Pick a noise sample from the noise_df
noise_row = noise_df.iloc[random.randrange(noise_df.shape[0])]
noise_type = noise_row['type']
noise_start = noise_row['start']
noise_duration = noise_row['duration']
noise_file_id = noise_row['file_id']
# Pick a SNR level
# TODO make SNRs configurable by noise type
if noise_type == 'music':
# speech
if noise_idx == 0:
# Pick a SNR level
# TODO make SNRs configurable by noise type
snr_db = random.randint(13, 20)
pick_count = random.randint(3, 7)
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
for idx in index_list:
noise_row = noise_df.loc['speech'].iloc[idx]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
noise /= pick_count
# music
elif noise_idx == 1:
snr_db = random.randint(5, 15)
elif noise_type == 'noise':
noise_row = noise_df.loc['music'].iloc[random.randrange(noise_df.loc['music'].shape[0])]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
# noise
elif noise_idx == 2:
snr_db = random.randint(0, 15)
else:
snr_db = random.randint(13, 20)
if noise_duration * sample_rate > speech.shape[1]:
# We force frame_offset to stay in the 20 first seconds of the file, otherwise it takes too long to load
frame_offset = random.randrange(noise_start * sample_rate, min(int(20*sample_rate), int((noise_start + noise_duration) * sample_rate - speech.shape[1])))
else:
frame_offset = noise_start * sample_rate
noise_fn = transform_dict["add_noise"]["data_path"] + "/" + noise_file_id + ".wav"
if noise_duration * sample_rate > speech.shape[1]:
noise, noise_fs = torchaudio.load(noise_fn, frame_offset=int(frame_offset), num_frames=int(speech.shape[1]))
else:
noise, noise_fs = torchaudio.load(noise_fn, frame_offset=int(frame_offset), num_frames=int(noise_duration * sample_rate))
noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
#if numpy.random.randint(0, 2) == 1:
# noise = torch.flip(noise, dims=[0, 1])
if noise.shape[1] < speech.shape[1]:
noise = torch.tensor(numpy.resize(noise.numpy(), speech.shape))
snr = math.exp(snr_db / 10)
snr = 10 ** (snr_db / 20)
scale = snr * noise_power / speech_power
speech = (scale * speech + noise) / 2
......@@ -537,6 +526,31 @@ def data_augmentation(speech, sample_rate, transform_dict, transform_number, noi
return speech
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
    """Load a noise segment whose length matches a speech signal.

    If the annotated noise segment is longer than the speech, a random
    offset inside the segment is drawn and exactly ``speech_shape[1]``
    samples are loaded.  Otherwise the whole annotated segment is loaded
    and tiled (via ``numpy.resize``) to cover the speech duration.

    :param noise_row: mapping (e.g. pandas Series) with ``'start'`` and
        ``'duration'`` in seconds and ``'file_id'`` naming the WAV file.
    :param speech_shape: shape of the speech tensor, ``(channels, samples)``.
    :param sample_rate: sampling rate in Hz, assumed shared by speech and
        noise files.
    :param data_path: directory containing the noise WAV files.
    :return: a torch tensor of noise with shape ``speech_shape`` (or the
        loaded segment's own channel count when no tiling is needed —
        callers add it to a same-shaped speech tensor).
    """
    # Convert the annotation (seconds, possibly floats) to integer sample
    # counts once: random.randrange requires integer bounds.
    seg_first_frame = int(noise_row['start'] * sample_rate)
    seg_num_frames = int(noise_row['duration'] * sample_rate)
    noise_file_id = noise_row['file_id']

    if seg_num_frames > speech_shape[1]:
        # It is recommended to split noise files (especially the speech noise
        # type) into shorter subfiles: when frame_offset is too high, loading
        # the segment can take much longer.
        frame_offset = random.randrange(
            seg_first_frame,
            seg_first_frame + seg_num_frames - speech_shape[1])
        num_frames = speech_shape[1]
    else:
        frame_offset = seg_first_frame
        num_frames = seg_num_frames

    noise_fn = data_path + "/" + noise_file_id + ".wav"
    noise_seg, _ = torchaudio.load(noise_fn,
                                   frame_offset=frame_offset,
                                   num_frames=num_frames)

    if noise_seg.shape[1] < speech_shape[1]:
        # Tile the shorter noise segment so it covers the whole speech signal.
        noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape))
    return noise_seg
"""
It might not be 100% on topic, but maybe this is interesting for you anyway. If you do not need to do real-time processing, things can be made easier. Limiting and dynamic compression can be seen as applying a dynamic transfer function. This function just maps input to output values. A linear function then returns the original audio and a "curved" function does compression or expansion. Applying a transfer function is as simple as
......
......@@ -255,7 +255,8 @@ class ArcMarginProduct(torch.nn.Module):
assert input.size()[1] == self.in_features
# cos(theta)
cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input), torch.nn.functional.normalize(self.weight))
cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
torch.nn.functional.normalize(self.weight))
# cos(theta + m)
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment