Commit 78266092 authored by Anthony Larcher's avatar Anthony Larcher
Browse files

Merge branch 'dev_al' of https://git-lium.univ-lemans.fr/Larcher/sidekit into dev_al

parents d59dbae1 856b99b1
......@@ -297,6 +297,7 @@ class SpkSet(Dataset):
def __init__(self,
data_set_yaml,
set_type="train",
transform_number=1,
overlap=0.,
dataset_df=None,
min_duration=0.165,
......@@ -313,6 +314,8 @@ class SpkSet(Dataset):
with open(data_set_yaml, "r") as fh:
dataset = yaml.load(fh, Loader=yaml.FullLoader)
self.transform_number = transform_number
self.data_path = dataset["data_root_directory"]
self.sample_rate = int(dataset["sample_rate"])
self.data_file_extension = dataset["data_file_extension"]
......@@ -396,54 +399,65 @@ class SpkSet(Dataset):
self._spk_dict[speaker]['p'] = numpy.ones((self._spk_dict[speaker]['num_segs'],))/self._spk_dict[speaker]['num_segs']
_transform = []
self.transform = None
if (self.transformation["pipeline"] != '') and (self.transformation["pipeline"] is not None):
trans = self.transformation["pipeline"].split(',')
print(trans)
for t in trans:
self.transform = self.transformation["pipeline"].split(',')
if 'PreEmphasis' in t:
_transform.append(PreEmphasis())
if 'add_noise' in t:
_transform.append(AddNoise(noise_db_csv=self.transformation["noise_db_csv"],
snr_min_max=self.transformation["noise_snr"],
noise_root_path=self.transformation["noise_root_db"]))
if "add_noise" in self.transform:
# Load the noise dataset, filter according to the duration
noise_df = pandas.read_csv(self.transformation["noise_db_csv"])
tmp_df = noise_df.loc[noise_df['duration'] > self.duration]
self.noise_df = tmp_df['file_id'].tolist()
if 'add_reverb' in t:
has_pyroom = True
try:
import pyroomacoustics
except ImportError:
has_pyroom = False
if has_pyroom:
_transform.append(AddReverb(depth=self.transformation["reverb_depth"],
width=self.transformation["reverb_width"],
height=self.transformation["reverb_height"],
absorption=self.transformation["reverb_absorption"],
noise=None,
snr=self.transformation["reverb_snr"]))
if 'MFCC' in t:
_transform.append(MFCC(lowfreq=self.lowfreq,
maxfreq=self.maxfreq,
nlogfilt=self.mfcc_nbfilter,
nceps=self.mfcc_nceps,
n_fft=self.n_fft))
if "add_reverb" in self.transform:
# load the RIR database
pass
if "CMVN" in t:
_transform.append(CMVN())
if "FrequencyMask" in t:
# Setup temporal and spectral augmentation if any
a = int(t.split('-')[0].split('(')[1])
b = int(t.split('-')[1].split(')')[0])
_transform.append(FrequencyMask(a, b))
print(self.transform)
"""
for t in trans:
if "TemporalMask" in t:
a = int(t.split("(")[1].split(")")[0])
_transform.append(TemporalMask(a))
self.transforms = transforms.Compose(_transform)
#if 'add_noise' in t:
# _transform.append(AddNoise(noise_db_csv=self.transformation["noise_db_csv"],
# snr_min_max=self.transformation["noise_snr"],
# noise_root_path=self.transformation["noise_root_db"]))
#if 'add_reverb' in t:
# has_pyroom = True
# try:
# import pyroomacoustics
# except ImportError:
# has_pyroom = False
# if has_pyroom:
# _transform.append(AddReverb(depth=self.transformation["reverb_depth"],
# width=self.transformation["reverb_width"],
# height=self.transformation["reverb_height"],
# absorption=self.transformation["reverb_absorption"],
# noise=None,
# snr=self.transformation["reverb_snr"]))
#if 'MFCC' in t:
# _transform.append(MFCC(lowfreq=self.lowfreq,
# maxfreq=self.maxfreq,
# nlogfilt=self.mfcc_nbfilter,
# nceps=self.mfcc_nceps,
# n_fft=self.n_fft))
#if "CMVN" in t:
# _transform.append(CMVN())
#if "FrequencyMask" in t:
# # Setup temporal and spectral augmentation if any
# a = int(t.split('-')[0].split('(')[1])
# b = int(t.split('-')[1].split(')')[0])
# _transform.append(FrequencyMask(a, b))
#if "TemporalMask" in t:
# a = int(t.split("(")[1].split(")")[0])
# _transform.append(TemporalMask(a))
#self.transforms = transforms.Compose(_transform)
"""
def __getitem__(self, index):
"""
......@@ -464,38 +478,50 @@ class SpkSet(Dataset):
if start_frame + self.sample_number >= nfo.frames:
start_frame = numpy.min(nfo.frames - self.sample_number - 1)
stop_frame = start_frame + self.sample_number
#start_frame = current_segment['start']
#stop_frame = start_frame + self.sample_number
else:
start_frame = int(current_segment['start'] * self.sample_rate)
stop_frame = int(current_segment['duration'] * self.sample_rate)
#sig, sample_rate2 = torchaudio.load(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
# frame_offset=start_frame,
# num_frames=stop_frame)
sig, _ = soundfile.read(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
start=start_frame,
stop=stop_frame,
dtype=wav_type
)
sig = sig.astype(numpy.float32)
sig += 0.0001 * numpy.random.randn(sig.shape[0])
speech, speech_fs = torchaudio.load(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
frame_offset=start_frame,
num_frames=stop_frame)
speaker_idx = self._spk_dict[current_speaker]["speaker_idx"]
# Select the data augmentation randomly
aug_idx = numpy.random.randint(0,len(self.transform), self.transform_number)
augmentations = list(numpy.array(self.transform)[aug_idx])
if self.transformation["pipeline"]:
sig, speaker_idx, _, __, _t, _s = self.transforms((sig,
speaker_idx,
False,#self.spec_aug[index],
False,#self.temp_aug[index],
True,#self.add_noise[index],
False#self.add_reverb[index]
))
if "add_noise" in augmentations:
# Pick a SNR level
snr_db = random.choice(self.transformation["noise_snr"])
# Pick a file name from the noise_df
noise_fn = random.choice(self.noise_df)
noise, noise_fs = torchaudio.load(noise_fn,
frame_offset=0,
num_frames=int(current_segment['duration'] * self.sample_rate))
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
snr = math.exp(snr_db / 10)
scale = snr * noise_power / speech_power
speech = (scale * speech + noise) / 2
if "add_reverb" in augmentations:
pass
if "codec" in augmentations:
pass
if "filter" in augmentations:
pass
speaker_idx = self._spk_dict[current_speaker]["speaker_idx"]
if self.output_format == "pytorch":
return torch.tensor(sig).type(torch.FloatTensor), torch.tensor(speaker_idx)
return speech, torch.tensor(speaker_idx)
else:
return sig.astype(numpy.float32), speaker_idx
return speech, speaker_idx
def __len__(self):
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment