Commit ec43acec authored by Anthony Larcher's avatar Anthony Larcher
Browse files

Merge branch 'dev_al' of https://git-lium.univ-lemans.fr/Larcher/sidekit into dev_al

parents f0ad2a1d f2de4f4e
......@@ -491,42 +491,31 @@ def data_augmentation(speech,
# Pick a noise sample from the noise_df
noise_row = noise_df.iloc[random.randrange(noise_df.shape[0])]
noise_type = noise_row['type']
noise_start = noise_row['start']
noise_duration = noise_row['duration']
noise_file_id = noise_row['file_id']
# Pick a SNR level
# TODO make SNRs configurable by noise type
if noise_type == 'music':
# speech
if noise_idx == 0:
# Pick a SNR level
# TODO make SNRs configurable by noise type
snr_db = random.randint(13, 20)
pick_count = random.randint(3, 7)
index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
for idx in index_list:
noise_row = noise_df.loc['speech'].iloc[idx]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
noise /= pick_count
# music
elif noise_idx == 1:
snr_db = random.randint(5, 15)
elif noise_type == 'noise':
noise_row = noise_df.loc['music'].iloc[random.randrange(noise_df.loc['music'].shape[0])]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
# noise
elif noise_idx == 2:
snr_db = random.randint(0, 15)
else:
snr_db = random.randint(13, 20)
if noise_duration * sample_rate > speech.shape[1]:
# We force frame_offset to stay in the 20 first seconds of the file, otherwise it takes too long to load
frame_offset = random.randrange(noise_start * sample_rate, min(int(20*sample_rate), int((noise_start + noise_duration) * sample_rate - speech.shape[1])))
else:
frame_offset = noise_start * sample_rate
noise_fn = transform_dict["add_noise"]["data_path"] + "/" + noise_file_id + ".wav"
if noise_duration * sample_rate > speech.shape[1]:
noise, noise_fs = torchaudio.load(noise_fn, frame_offset=int(frame_offset), num_frames=int(speech.shape[1]))
else:
noise, noise_fs = torchaudio.load(noise_fn, frame_offset=int(frame_offset), num_frames=int(noise_duration * sample_rate))
noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
#if numpy.random.randint(0, 2) == 1:
# noise = torch.flip(noise, dims=[0, 1])
if noise.shape[1] < speech.shape[1]:
noise = torch.tensor(numpy.resize(noise.numpy(), speech.shape))
snr = math.exp(snr_db / 10)
snr = 10 ** (snr_db / 20)
scale = snr * noise_power / speech_power
speech = (scale * speech + noise) / 2
......@@ -542,6 +531,31 @@ def data_augmentation(speech,
return speech
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
    """Load a noise segment whose length matches a speech signal.

    When the annotated noise span is longer than the speech, a window of
    ``speech_shape[1]`` samples is loaded from a random offset inside the
    span; otherwise the whole span is loaded and tiled to cover the speech.

    :param noise_row: pandas row with 'start', 'duration' (seconds, possibly
        float) and 'file_id' fields describing the noise annotation
    :param speech_shape: shape of the speech tensor, (channels, samples)
    :param sample_rate: sampling rate in Hz — assumed shared by the speech
        and the noise files (TODO confirm; no resampling is done here)
    :param data_path: root directory containing the noise wav files

    :return: a torch tensor of noise with at least speech_shape[1] samples
    """
    noise_start = noise_row['start']
    noise_duration = noise_row['duration']
    noise_file_id = noise_row['file_id']

    # Length of the annotated noise span and of the speech, in samples
    noise_samples = int(noise_duration * sample_rate)
    speech_samples = speech_shape[1]

    if noise_samples > speech_samples:
        # The annotation is longer than the speech: pick a random window.
        # It is recommended to split noise files (especially speech noise type)
        # in shorter subfiles: when frame_offset is too high, loading the
        # segment can take much longer.
        # randrange requires integer bounds; start/duration may be floats.
        frame_offset = random.randrange(int(noise_start * sample_rate),
                                        int((noise_start + noise_duration) * sample_rate) - speech_samples)
        num_frames = speech_samples
    else:
        frame_offset = int(noise_start * sample_rate)
        num_frames = noise_samples

    noise_fn = data_path + "/" + noise_file_id + ".wav"
    noise_seg, _ = torchaudio.load(noise_fn, frame_offset=frame_offset, num_frames=num_frames)

    if noise_seg.shape[1] < speech_samples:
        # Tile the short noise segment (numpy.resize repeats the data
        # cyclically) so it covers the whole speech duration.
        noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape))
    return noise_seg
"""
It might not be 100% on topic, but maybe this is interesting for you anyway. If you do not need to do real-time processing, things can be made easier. Limiting and dynamic compression can be seen as applying a dynamic transfer function. This function just maps input to output values. A linear function then returns the original audio, and a "curved" function does compression or expansion. Applying a transfer function is as simple as
......
......@@ -256,7 +256,8 @@ class ArcMarginProduct(torch.nn.Module):
assert input.size()[1] == self.in_features
# cos(theta)
cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input), torch.nn.functional.normalize(self.weight))
cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
torch.nn.functional.normalize(self.weight))
# cos(theta + m)
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m
......
......@@ -168,6 +168,7 @@ class SideSet(Dataset):
self.transformation = dataset["eval"]["transformation"]
self.sample_number = int(self.duration * self.sample_rate)
self.overlap = int(overlap * self.sample_rate)
# Load the dataset description as pandas.dataframe
if dataset_df is None:
......@@ -188,16 +189,18 @@ class SideSet(Dataset):
# Create lists for each column of the dataframe
df_dict = dict(zip(df.columns, [[], [], [], [], [], [], []]))
df_dict["file_start"] = list()
df_dict["file_duration"] = list()
# For each segment, get all possible segments with the current overlap
for idx in tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1):
for idx in tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1, disable=None):
current_session = tmp_sessions.iloc[idx]
# Compute possible starts
possible_starts = numpy.arange(0,
int(self.sample_rate * (current_session.duration - self.duration)),
self.sample_number - int(self.sample_rate * overlap)
)
self.sample_number
) + int(self.sample_rate * (current_session.duration % self.duration / 2))
possible_starts += int(self.sample_rate * current_session.start)
# Select max(seg_nb, possible_segments) segments
......@@ -206,7 +209,7 @@ class SideSet(Dataset):
chunk_nb = len(possible_starts)
else:
chunk_nb = min(len(possible_starts), chunk_per_segment)
starts = numpy.random.permutation(possible_starts)[:chunk_nb] / self.sample_rate
starts = numpy.random.permutation(possible_starts)[:chunk_nb]
# Once we know how many segments are selected, create the other fields to fill the DataFrame
for ii in range(chunk_nb):
......@@ -215,6 +218,8 @@ class SideSet(Dataset):
df_dict["file_id"].append(current_session.file_id)
df_dict["start"].append(starts[ii])
df_dict["duration"].append(self.duration)
df_dict["file_start"].append(current_session.start)
df_dict["file_duration"].append(current_session.duration)
df_dict["speaker_idx"].append(current_session.speaker_idx)
df_dict["gender"].append(current_session.gender)
......@@ -231,8 +236,9 @@ class SideSet(Dataset):
self.noise_df = None
if "add_noise" in self.transform:
# Load the noise dataset, filter according to the duration
self.noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
noise_df = noise_df.loc[noise_df.duration > self.duration]
self.noise_df = noise_df.set_index(noise_df.type)
self.rir_df = None
if "add_reverb" in self.transform:
......@@ -249,7 +255,18 @@ class SideSet(Dataset):
current_session = self.sessions.iloc[index]
nfo = soundfile.info(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}")
start_frame = int(current_session['start'])
original_start = int(current_session['start'])
if self.overlap > 0:
lowest_shift = self.overlap/2
highest_shift = self.overlap/2
if original_start < (current_session['file_start']*self.sample_rate + self.sample_number/2):
lowest_shift = int(original_start - current_session['file_start']*self.sample_rate)
if original_start + self.sample_number > (current_session['file_start'] + current_session['file_duration'])*self.sample_rate - self.sample_number/2:
highest_shift = int((current_session['file_start'] + current_session['file_duration'])*self.sample_rate - (original_start + self.sample_number))
start_frame = original_start + int(random.uniform(-lowest_shift, highest_shift))
else:
start_frame = original_start
if start_frame + self.sample_number >= nfo.frames:
start_frame = numpy.min(nfo.frames - self.sample_number - 1)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment