Commit 2899da3e authored by Ambuj Mehrish's avatar Ambuj Mehrish
Browse files

Replace xsets.py

parent 5632ab05
......@@ -168,6 +168,7 @@ class SideSet(Dataset):
self.transformation = dataset["eval"]["transformation"]
self.sample_number = int(self.duration * self.sample_rate)
self.overlap = int(overlap * self.sample_rate)
# Load the dataset description as pandas.dataframe
if dataset_df is None:
......@@ -188,16 +189,18 @@ class SideSet(Dataset):
# Create lists for each column of the dataframe
df_dict = dict(zip(df.columns, [[], [], [], [], [], [], []]))
df_dict["file_start"] = list()
df_dict["file_duration"] = list()
# For each segment, get all possible segments with the current overlap
for idx in tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1):
for idx in tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1, disable=None):
current_session = tmp_sessions.iloc[idx]
# Compute possible starts
possible_starts = numpy.arange(0,
int(self.sample_rate * (current_session.duration - self.duration)),
self.sample_number - int(self.sample_rate * overlap)
)
self.sample_number
) + int(self.sample_rate * (current_session.duration % self.duration / 2))
possible_starts += int(self.sample_rate * current_session.start)
# Select max(seg_nb, possible_segments) segments
......@@ -206,7 +209,7 @@ class SideSet(Dataset):
chunk_nb = len(possible_starts)
else:
chunk_nb = min(len(possible_starts), chunk_per_segment)
starts = numpy.random.permutation(possible_starts)[:chunk_nb] / self.sample_rate
starts = numpy.random.permutation(possible_starts)[:chunk_nb]
# Once we know how many segments are selected, create the other fields to fill the DataFrame
for ii in range(chunk_nb):
......@@ -215,6 +218,8 @@ class SideSet(Dataset):
df_dict["file_id"].append(current_session.file_id)
df_dict["start"].append(starts[ii])
df_dict["duration"].append(self.duration)
df_dict["file_start"].append(current_session.start)
df_dict["file_duration"].append(current_session.duration)
df_dict["speaker_idx"].append(current_session.speaker_idx)
df_dict["gender"].append(current_session.gender)
......@@ -231,8 +236,9 @@ class SideSet(Dataset):
self.noise_df = None
if "add_noise" in self.transform:
# Load the noise dataset, filter according to the duration
self.noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
noise_df = noise_df.loc[noise_df.duration > self.duration]
self.noise_df = noise_df.set_index(noise_df.type)
self.rir_df = None
if "add_reverb" in self.transform:
......@@ -249,7 +255,18 @@ class SideSet(Dataset):
current_session = self.sessions.iloc[index]
nfo = soundfile.info(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}")
start_frame = int(current_session['start'])
original_start = int(current_session['start'])
if self.overlap > 0:
lowest_shift = self.overlap/2
highest_shift = self.overlap/2
if original_start < (current_session['file_start']*self.sample_rate + self.sample_number/2):
lowest_shift = int(original_start - current_session['file_start']*self.sample_rate)
if original_start + self.sample_number > (current_session['file_start'] + current_session['file_duration'])*self.sample_rate - self.sample_number/2:
highest_shift = int((current_session['file_start'] + current_session['file_duration'])*self.sample_rate - (original_start + self.sample_number))
start_frame = original_start + int(random.uniform(-lowest_shift, highest_shift))
else:
start_frame = original_start
if start_frame + self.sample_number >= nfo.frames:
start_frame = numpy.min(nfo.frames - self.sample_number - 1)
......@@ -292,7 +309,8 @@ class IdMapSet(Dataset):
idmap_name,
data_path,
file_extension,
transform_pipeline="",
transform_pipeline={},
transform_number=1,
sliding_window=False,
window_len=24000,
window_shift=8000,
......@@ -318,20 +336,21 @@ class IdMapSet(Dataset):
self.sliding_window = sliding_window
self.window_len = window_len
self.window_shift = window_shift
self.transform_number = transform_number
self.transform = []
if self.transformation is not None:
self.transform_list = self.transformation.split(",")
#if self.transformation is not None:
# self.transform_list = self.transformation.split(",")
self.noise_df = None
if "add_noise" in self.transform:
if "add_noise" in self.transformation:
# Load the noise dataset, filter according to the duration
noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
tmp_df = noise_df.loc[noise_df['duration'] > self.duration]
self.noise_df = tmp_df['file_id'].tolist()
#tmp_df = noise_df.loc[noise_df['duration'] > self.duration]
self.noise_df = noise_df.set_index(noise_df.type)
self.rir_df = None
if "add_reverb" in self.transform:
if "add_reverb" in self.transformation:
# load the RIR database
tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
self.rir_df = zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist())
......@@ -344,18 +363,19 @@ class IdMapSet(Dataset):
"""
if self.idmap.start[index] is None:
start = 0
else:
start = int(self.idmap.start[index]) * 160
if self.idmap.stop[index] is None:
speech, speech_fs = torchaudio.load(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}")
duration = speech.shape[1] - start
duration = int(speech.shape[1] - start)
else:
start = int(self.idmap.start[index])
duration = int(self.idmap.stop[index]) - start
duration = int(self.idmap.stop[index]) * 160 - start
# add this in case the segment is too short
if duration <= self.min_sample_nb:
middle = start + duration // 2
start = max(0, int(middle - (self.min_sample_nb / 2)))
duration = self.min_sample_nb
duration = int(self.min_sample_nb)
speech, speech_fs = torchaudio.load(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}",
frame_offset=start,
......@@ -366,10 +386,10 @@ class IdMapSet(Dataset):
if self.sliding_window:
speech = speech.squeeze().unfold(0,self.window_len,self.window_shift)
if len(self.transform) > 0:
if len(self.transformation.keys()) > 0:
speech = data_augmentation(speech,
speech_fs,
self.transform,
self.transformation,
self.transform_number,
noise_df=self.noise_df,
rir_df=self.rir_df)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment