Commit df8df81b authored by Anthony Larcher

mfcc front end

parent f2160a93
@@ -275,7 +275,6 @@ class MFCC(object):
         """
         sig = sample[0][:, numpy.newaxis]  # added
-        framed = framing(sample[0], self.window_length, win_shift=self.window_length - self.overlap).copy()
         framed = framing(sample[0], self.window_length, win_shift=self.window_length - self.overlap).copy()
         # Pre-emphasis filtering is applied after framing to be consistent with stream processing
         framed = pre_emphasis(framed, self.prefac)
         # Windowing has been changed to hanning which is supposed to have less noisy sidelobes
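Note on the hunk above: pre-emphasis is applied per frame, after framing, so a streaming front-end that sees one frame at a time produces the same output as offline processing. A minimal numpy sketch of that ordering (pre_emphasis_frames is an illustrative stand-in, not sidekit's API):

import numpy

def pre_emphasis_frames(framed, prefac=0.97):
    # y[t] = x[t] - prefac * x[t-1] inside each frame; the first sample of a
    # frame is differenced against itself, so no cross-frame context is needed
    out = framed.copy()
    out[..., 1:] -= prefac * framed[..., :-1]
    out[..., 0] *= 1. - prefac
    return out

frames = numpy.random.randn(10, 400)   # 10 frames of 400 samples each
emphasized = pre_emphasis_frames(frames)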
@@ -394,7 +393,7 @@ class SpkSet(Dataset):
         self._spk_dict = spk_dict
         self._spk_index = list(spk_dict.keys())
-        self.len = 10 * len(self._spk_index)
+        self.len = 100 * len(self._spk_index)
         for idx, speaker in enumerate(self._spk_index):
             self._spk_dict[speaker]['num_segs'] = len(self._spk_dict[speaker]['segments'])
@@ -416,50 +415,6 @@ class SpkSet(Dataset):
             pass
         print(self.transform)
-        """
-        for t in trans:
-            #if 'add_noise' in t:
-            #    _transform.append(AddNoise(noise_db_csv=self.transformation["noise_db_csv"],
-            #                               snr_min_max=self.transformation["noise_snr"],
-            #                               noise_root_path=self.transformation["noise_root_db"]))
-            #if 'add_reverb' in t:
-            #    has_pyroom = True
-            #    try:
-            #        import pyroomacoustics
-            #    except ImportError:
-            #        has_pyroom = False
-            #    if has_pyroom:
-            #        _transform.append(AddReverb(depth=self.transformation["reverb_depth"],
-            #                                    width=self.transformation["reverb_width"],
-            #                                    height=self.transformation["reverb_height"],
-            #                                    absorption=self.transformation["reverb_absorption"],
-            #                                    noise=None,
-            #                                    snr=self.transformation["reverb_snr"]))
-            #if 'MFCC' in t:
-            #    _transform.append(MFCC(lowfreq=self.lowfreq,
-            #                           maxfreq=self.maxfreq,
-            #                           nlogfilt=self.mfcc_nbfilter,
-            #                           nceps=self.mfcc_nceps,
-            #                           n_fft=self.n_fft))
-            #if "CMVN" in t:
-            #    _transform.append(CMVN())
-            #if "FrequencyMask" in t:
-            #    # Setup temporal and spectral augmentation if any
-            #    a = int(t.split('-')[0].split('(')[1])
-            #    b = int(t.split('-')[1].split(')')[0])
-            #    _transform.append(FrequencyMask(a, b))
-            #if "TemporalMask" in t:
-            #    a = int(t.split("(")[1].split(")")[0])
-            #    _transform.append(TemporalMask(a))
-            #self.transforms = transforms.Compose(_transform)
-        """

     def __getitem__(self, index):
         """
@@ -467,15 +422,18 @@ class SpkSet(Dataset):
         :return:
         """
-        current_speaker = self._spk_index[index % len(self._spk_index)]
+        current_speaker = self._spk_index[int(math.fmod(index, len(self._spk_index)))]
         segment_index = numpy.random.choice(self._spk_dict[current_speaker]['num_segs'], p=self._spk_dict[current_speaker]['p'])
-        self._spk_dict[current_speaker]['p'][segment_index] /= 2
+        self._spk_dict[current_speaker]['p'][segment_index] = 0  # /= 2
         current_segment = self._spk_dict[current_speaker]['segments'][segment_index]
-        self._spk_dict[current_speaker]['p'] = self._spk_dict[current_speaker]['p'] / numpy.sum(self._spk_dict[current_speaker]['p'])
+        if numpy.sum(self._spk_dict[current_speaker]['p']) > 0:
+            self._spk_dict[current_speaker]['p'] = self._spk_dict[current_speaker]['p'] / numpy.sum(self._spk_dict[current_speaker]['p'])
+        else:
+            self._spk_dict[current_speaker]['p'] += 1 / self._spk_dict[current_speaker]['num_segs']
         nfo = soundfile.info(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}")
         if self._windowed:
             start_frame = int(current_segment['start'] * self.sample_rate)
             if start_frame + self.sample_number >= nfo.frames:
                 start_frame = numpy.min(nfo.frames - self.sample_number - 1)
......
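The __getitem__ change above replaces probability halving with sampling without replacement: a drawn segment's probability is zeroed and the remaining mass renormalised, and once all of a speaker's segments have been drawn the distribution resets to uniform. A self-contained sketch of that scheme (names are illustrative, not sidekit's API):

import numpy

def draw_segment(p):
    # p: per-segment sampling probabilities for one speaker, summing to 1
    idx = numpy.random.choice(len(p), p=p)
    p[idx] = 0.                       # do not redraw this segment in this cycle
    if p.sum() > 0:
        p /= p.sum()                  # renormalise the remaining mass
    else:
        p += 1. / len(p)              # every segment seen: reset to uniform
    return idx

p = numpy.full(4, 0.25)
draws = [draw_segment(p) for _ in range(8)]   # two full passes over 4 segments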
@@ -302,6 +302,92 @@ class GruPooling(torch.nn.Module):
         return x

+class PreEmphasis(torch.nn.Module):
+
+    def __init__(self, coef: float = 0.97):
+        super().__init__()
+        self.coef = coef
+        # make kernel
+        # In pytorch, the convolution operation uses cross-correlation, so the filter is flipped.
+        self.register_buffer(
+            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
+        )
+
+    def forward(self, input: torch.tensor) -> torch.tensor:
+        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
+        # reflect padding to match lengths of in/out
+        input = input.unsqueeze(1)
+        input = torch.nn.functional.pad(input, (1, 0), 'reflect')
+        return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
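A quick illustrative check of the class above: the flipped [-coef, 1.] kernel plus one sample of left reflect padding implements y[t] = x[t] - coef * x[t-1], with y[0] = x[0] - coef * x[1] since reflect padding mirrors x[1] in front of x[0]:

import torch

x = torch.randn(2, 16000)                 # (batch, samples)
y = PreEmphasis(coef=0.97)(x)

ref = x.clone()
ref[:, 1:] = x[:, 1:] - 0.97 * x[:, :-1]
ref[:, 0] = x[:, 0] - 0.97 * x[:, 1]      # effect of the reflect padding
assert torch.allclose(y, ref, atol=1e-6)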
+class MfccFrontEnd(torch.nn.Module):
+    """
+    Module that computes MFCC coefficients from a batch of raw waveforms:
+    pre-emphasis, mel spectrogram, DCT and cepstral mean/variance normalization.
+    """
+    def __init__(self,
+                 pre_emphasis=0.97,
+                 sample_rate=16000,
+                 n_fft=2048,
+                 f_min=133.333,
+                 f_max=6855.4976,
+                 win_length=1024,
+                 window_fn=torch.hann_window,
+                 hop_length=512,
+                 power=2.0,
+                 n_mels=100,
+                 n_mfcc=80):
+        super(MfccFrontEnd, self).__init__()
+
+        self.pre_emphasis = pre_emphasis
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.f_min = f_min
+        self.f_max = f_max
+        self.win_length = win_length
+        self.window_fn = window_fn
+        self.hop_length = hop_length
+        self.power = power
+        self.n_mels = n_mels
+        self.n_mfcc = n_mfcc
+
+        self.PreEmphasis = PreEmphasis(self.pre_emphasis)
+
+        # melkwargs keys must be strings; sample_rate is passed to
+        # torchaudio.transforms.MFCC directly, not through melkwargs
+        self.melkwargs = {"n_fft": self.n_fft,
+                          "f_min": self.f_min,
+                          "f_max": self.f_max,
+                          "win_length": self.win_length,
+                          "window_fn": self.window_fn,
+                          "hop_length": self.hop_length,
+                          "power": self.power,
+                          "n_mels": self.n_mels}
+
+        self.MFCC = torchaudio.transforms.MFCC(
+            sample_rate=self.sample_rate,
+            n_mfcc=self.n_mfcc,
+            dct_type=2,
+            log_mels=True,
+            melkwargs=self.melkwargs)
+
+        self.CMVN = torch.nn.InstanceNorm1d(self.n_mfcc)
+
+    def forward(self, x):
+        """
+        :param x: batch of raw waveforms, shape (batch, samples)
+        :return: batch of MFCC features, shape (batch, n_mfcc, frames)
+        """
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(enabled=False):
+                mfcc = self.PreEmphasis(x)
+                mfcc = self.MFCC(mfcc)
+                mfcc = self.CMVN(mfcc)
+        return mfcc
 class Xtractor(torch.nn.Module):
     """
     Class that defines an x-vector extractor based on 5 convolutional layers and a mean standard deviation pooling
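Hedged usage sketch for the new front-end: MfccFrontEnd replaces the per-architecture MFCC/CMVN members removed in the hunks below, turning a batch of raw 16 kHz waveforms into 80 MFCCs (shapes follow torchaudio conventions; the frame count depends on hop_length=512):

import torch

frontend = MfccFrontEnd()                 # defaults from the diff above
waveform = torch.randn(8, 32000)          # (batch, samples), ~2 s at 16 kHz
features = frontend(waveform)             # (batch, n_mfcc=80, n_frames)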
@@ -337,20 +423,7 @@ class Xtractor(torch.nn.Module):
             self.feature_size = 80
             self.activation = torch.nn.LeakyReLU(0.2)
-            # Feature extraction
-            n_fft = 2048
-            win_length = None
-            hop_length = 128
-            n_mels = 80
-            n_mfcc = 80
-            self.MFCC = torchaudio.transforms.MFCC(
-                sample_rate=16000,
-                n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length})
-            self.CMVN = torch.nn.InstanceNorm1d(80)
-            self.preprocessor = None
+            self.preprocessor = MfccFrontEnd()
             self.sequence_network = torch.nn.Sequential(OrderedDict([
                 ("conv1", torch.nn.Conv1d(self.feature_size, 512, 5, dilation=1)),
@@ -401,22 +474,8 @@ class Xtractor(torch.nn.Module):
             self.embedding_size = 512
         elif model_archi == "resnet34":
             self.input_nbdim = 2
-            # Feature extraction
-            n_fft = 2048
-            win_length = None
-            hop_length = 128
-            n_mels = 80
-            n_mfcc = 80
-            self.MFCC = torchaudio.transforms.MFCC(
-                sample_rate=16000,
-                n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length})
-            self.CMVN = torch.nn.InstanceNorm1d(80)
-            self.preprocessor = None
+            self.preprocessor = MfccFrontEnd()
             self.sequence_network = PreResNet34()
             self.before_speaker_embedding = torch.nn.Linear(in_features = 5120,
@@ -441,22 +500,8 @@ class Xtractor(torch.nn.Module):
             self.after_speaker_embedding_weight_decay = 0.00
         elif model_archi == "fastresnet34":
             self.input_nbdim = 2
-            # Feature extraction
-            n_fft = 2048
-            win_length = None
-            hop_length = 128
-            n_mels = 80
-            n_mfcc = 80
-            self.MFCC = torchaudio.transforms.MFCC(
-                sample_rate=16000,
-                n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length})
-            self.CMVN = torch.nn.InstanceNorm1d(80)
-            self.preprocessor = None
+            self.preprocessor = MfccFrontEnd()
             self.sequence_network = PreFastResNet34()
             self.before_speaker_embedding = torch.nn.Linear(in_features = 2560,
@@ -743,9 +788,12 @@ class Xtractor(torch.nn.Module):
             x = self.preprocessor(x)
         else:
-            x = self.MFCC(x)
-            x = self.CMVN(x)
-            #x = x.unsqueeze(1)
+            with torch.no_grad():
+                with torch.cuda.amp.autocast(enabled=False):
+                    x = self.PreEmphasis(x)
+                    x = self.MFCC(x)
+                    x = self.CMVN(x).unsqueeze(1)
         x = self.sequence_network(x)
......
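The forward-pass hunk above wraps feature extraction in no_grad plus autocast(enabled=False). The pattern, shown standalone: when the surrounding training loop runs under torch.cuda.amp, the STFT-based front-end stays in float32 (fp16 spectrogram and log are prone to under/overflow) and outside the autograd graph:

import torch

def extract_features(frontend, signal):
    # frontend: e.g. the MfccFrontEnd above; signal: (batch, samples) waveform
    with torch.no_grad():                              # no gradients through MFCC
        with torch.cuda.amp.autocast(enabled=False):   # float32 island under AMP
            return frontend(signal.float())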