Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Anthony Larcher
sidekit
Commits
df8df81b
Commit
df8df81b
authored
Mar 12, 2021
by
Anthony Larcher
Browse files
mfcc frontend
parent
f2160a93
Changes
2
Hide whitespace changes
Inline
Side-by-side
nnet/xsets.py
View file @
df8df81b
...
...
@@ -275,7 +275,6 @@ class MFCC(object):
"""
sig
=
sample
[
0
][:,
numpy
.
newaxis
]
# ajout
framed
=
framing
(
sample
[
0
],
self
.
window_length
,
win_shift
=
self
.
window_length
-
self
.
overlap
).
copy
()
framed
=
framing
(
sample
[
0
],
self
.
window_length
,
win_shift
=
self
.
window_length
-
self
.
overlap
).
copy
()
# Pre-emphasis filtering is applied after framing to be consistent with stream processing
framed
=
pre_emphasis
(
framed
,
self
.
prefac
)
# Windowing has been changed to hanning which is supposed to have less noisy sidelobes
...
...
@@ -394,7 +393,7 @@ class SpkSet(Dataset):
self
.
_spk_dict
=
spk_dict
self
.
_spk_index
=
list
(
spk_dict
.
keys
())
self
.
len
=
10
*
len
(
self
.
_spk_index
)
self
.
len
=
10
0
*
len
(
self
.
_spk_index
)
for
idx
,
speaker
in
enumerate
(
self
.
_spk_index
):
self
.
_spk_dict
[
speaker
][
'num_segs'
]
=
len
(
self
.
_spk_dict
[
speaker
][
'segments'
])
...
...
@@ -416,50 +415,6 @@ class SpkSet(Dataset):
pass
print
(
self
.
transform
)
"""
for t in trans:
#if 'add_noise' in t:
# _transform.append(AddNoise(noise_db_csv=self.transformation["noise_db_csv"],
# snr_min_max=self.transformation["noise_snr"],
# noise_root_path=self.transformation["noise_root_db"]))
#if 'add_reverb' in t:
# has_pyroom = True
# try:
# import pyroomacoustics
# except ImportError:
# has_pyroom = False
# if has_pyroom:
# _transform.append(AddReverb(depth=self.transformation["reverb_depth"],
# width=self.transformation["reverb_width"],
# height=self.transformation["reverb_height"],
# absorption=self.transformation["reverb_absorption"],
# noise=None,
# snr=self.transformation["reverb_snr"]))
#if 'MFCC' in t:
# _transform.append(MFCC(lowfreq=self.lowfreq,
# maxfreq=self.maxfreq,
# nlogfilt=self.mfcc_nbfilter,
# nceps=self.mfcc_nceps,
# n_fft=self.n_fft))
#if "CMVN" in t:
# _transform.append(CMVN())
#if "FrequencyMask" in t:
# # Setup temporal and spectral augmentation if any
# a = int(t.split('-')[0].split('(')[1])
# b = int(t.split('-')[1].split(')')[0])
# _transform.append(FrequencyMask(a, b))
#if "TemporalMask" in t:
# a = int(t.split("(")[1].split(")")[0])
# _transform.append(TemporalMask(a))
#self.transforms = transforms.Compose(_transform)
"""
def
__getitem__
(
self
,
index
):
"""
...
...
@@ -467,15 +422,18 @@ class SpkSet(Dataset):
:return:
"""
current_speaker
=
self
.
_spk_index
[
index
%
len
(
self
.
_spk_index
)]
current_speaker
=
self
.
_spk_index
[
int
(
math
.
fmod
(
index
,
len
(
self
.
_spk_index
)
))
]
segment_index
=
numpy
.
random
.
choice
(
self
.
_spk_dict
[
current_speaker
][
'num_segs'
],
p
=
self
.
_spk_dict
[
current_speaker
][
'p'
])
self
.
_spk_dict
[
current_speaker
][
'p'
][
segment_index
]
/=
2
self
.
_spk_dict
[
current_speaker
][
'p'
][
segment_index
]
=
0
#
/= 2
current_segment
=
self
.
_spk_dict
[
current_speaker
][
'segments'
][
segment_index
]
self
.
_spk_dict
[
current_speaker
][
'p'
]
=
self
.
_spk_dict
[
current_speaker
][
'p'
]
/
numpy
.
sum
(
self
.
_spk_dict
[
current_speaker
][
'p'
])
if
numpy
.
sum
(
self
.
_spk_dict
[
current_speaker
][
'p'
])
>
0
:
self
.
_spk_dict
[
current_speaker
][
'p'
]
=
self
.
_spk_dict
[
current_speaker
][
'p'
]
/
numpy
.
sum
(
self
.
_spk_dict
[
current_speaker
][
'p'
])
else
:
self
.
_spk_dict
[
current_speaker
][
'p'
]
+=
1
/
self
.
_spk_dict
[
current_speaker
][
'num_segs'
]
nfo
=
soundfile
.
info
(
f
"
{
self
.
data_path
}
/
{
current_segment
[
'file_id'
]
}{
self
.
data_file_extension
}
"
)
if
self
.
_windowed
:
start_frame
=
int
(
current_segment
[
'start'
]
*
self
.
sample_rate
)
if
start_frame
+
self
.
sample_number
>=
nfo
.
frames
:
start_frame
=
numpy
.
min
(
nfo
.
frames
-
self
.
sample_number
-
1
)
...
...
nnet/xvector.py
View file @
df8df81b
...
...
@@ -302,6 +302,92 @@ class GruPooling(torch.nn.Module):
return
x
class PreEmphasis(torch.nn.Module):
    """Apply a pre-emphasis high-pass filter to a batch of raw waveforms.

    Each output sample is ``x[t] - coef * x[t - 1]``, implemented as a 1-D
    convolution so it runs on GPU and integrates with autograd-free
    front-end pipelines.

    :param coef: pre-emphasis coefficient (default 0.97)
    """

    def __init__(self, coef: float = 0.97):
        super().__init__()
        self.coef = coef
        # torch.nn.functional.conv1d performs cross-correlation, so the
        # FIR kernel [1, -coef] is stored flipped as [-coef, 1].
        kernel = torch.FloatTensor([-self.coef, 1.])
        self.register_buffer('flipped_filter', kernel.unsqueeze(0).unsqueeze(0))

    def forward(self, input: torch.tensor) -> torch.tensor:
        """Filter a batch of signals.

        :param input: tensor of shape (batch, samples)
        :return: pre-emphasized tensor of shape (batch, samples)
        """
        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
        # Add a channel dimension for conv1d, then reflect-pad one sample on
        # the left so the output length matches the input length.
        padded = torch.nn.functional.pad(input.unsqueeze(1), (1, 0), 'reflect')
        filtered = torch.nn.functional.conv1d(padded, self.flipped_filter)
        return filtered.squeeze(1)
class MfccFrontEnd(torch.nn.Module):
    """Compute MFCC features from a raw waveform.

    Pipeline: pre-emphasis -> torchaudio MFCC -> per-utterance
    cepstral mean/variance normalization (InstanceNorm1d).

    :param pre_emphasis: pre-emphasis coefficient
    :param sample_rate: waveform sampling rate in Hz
    :param n_fft: FFT size
    :param f_min: lowest mel filter-bank frequency in Hz
    :param f_max: highest mel filter-bank frequency in Hz
    :param win_length: analysis window length in samples
    :param window_fn: window function passed to the spectrogram
    :param hop_length: hop between successive frames in samples
    :param power: exponent of the magnitude spectrogram
    :param n_mels: number of mel filters
    :param n_mfcc: number of cepstral coefficients kept
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=2048,
                 f_min=133.333,
                 f_max=6855.4976,
                 win_length=1024,
                 window_fn=torch.hann_window,
                 hop_length=512,
                 power=2.0,
                 n_mels=100,
                 n_mfcc=80):
        super(MfccFrontEnd, self).__init__()

        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels
        self.n_mfcc = n_mfcc

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        # BUG FIX: the keys must be strings. The original code used the bare
        # parameter names, which builds a dict keyed by the parameter VALUES
        # (e.g. {16000: 16000, 2048: 2048, ...}) and crashes when torchaudio
        # unpacks melkwargs as keyword arguments. The sample_rate entry is
        # also dropped here: torchaudio.transforms.MFCC already forwards its
        # own sample_rate to the underlying MelSpectrogram, and supplying it
        # again through melkwargs raises a duplicate-keyword error.
        self.melkwargs = {"n_fft": self.n_fft,
                          "f_min": self.f_min,
                          "f_max": self.f_max,
                          "win_length": self.win_length,
                          "window_fn": self.window_fn,
                          "hop_length": self.hop_length,
                          "power": self.power,
                          "n_mels": self.n_mels}

        self.MFCC = torchaudio.transforms.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=self.n_mfcc,
            dct_type=2,
            log_mels=True,
            melkwargs=self.melkwargs)

        # InstanceNorm1d over the cepstral dimension acts as CMVN.
        self.CMVN = torch.nn.InstanceNorm1d(self.n_mfcc)

    def forward(self, x):
        """Extract normalized MFCC features.

        :param x: waveform tensor of shape (batch, samples)
        :return: MFCC tensor of shape (batch, n_mfcc, frames)
        """
        # The feature extraction is not trained, and is forced to float32
        # because the STFT is not stable under autocast/half precision.
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                mfcc = self.PreEmphasis(x)
                mfcc = self.MFCC(mfcc)
                mfcc = self.CMVN(mfcc)
        return mfcc
class
Xtractor
(
torch
.
nn
.
Module
):
"""
Class that defines an x-vector extractor based on 5 convolutional layers and a mean standard deviation pooling
...
...
@@ -337,20 +423,7 @@ class Xtractor(torch.nn.Module):
self
.
feature_size
=
80
self
.
activation
=
torch
.
nn
.
LeakyReLU
(
0.2
)
# Feature extraction
n_fft
=
2048
win_length
=
None
hop_length
=
128
n_mels
=
80
n_mfcc
=
80
self
.
MFCC
=
torchaudio
.
transforms
.
MFCC
(
sample_rate
=
16000
,
n_mfcc
=
n_mfcc
,
melkwargs
=
{
'n_fft'
:
n_fft
,
'n_mels'
:
n_mels
,
'hop_length'
:
hop_length
})
self
.
CMVN
=
torch
.
nn
.
InstanceNorm1d
(
80
)
self
.
preprocessor
=
None
self
.
preprocessor
=
MfccFrontEnd
()
self
.
sequence_network
=
torch
.
nn
.
Sequential
(
OrderedDict
([
(
"conv1"
,
torch
.
nn
.
Conv1d
(
self
.
feature_size
,
512
,
5
,
dilation
=
1
)),
...
...
@@ -401,22 +474,8 @@ class Xtractor(torch.nn.Module):
self
.
embedding_size
=
512
elif
model_archi
==
"resnet34"
:
self
.
input_nbdim
=
2
# Feature extraction
n_fft
=
2048
win_length
=
None
hop_length
=
128
n_mels
=
80
n_mfcc
=
80
self
.
MFCC
=
torchaudio
.
transforms
.
MFCC
(
sample_rate
=
16000
,
n_mfcc
=
n_mfcc
,
melkwargs
=
{
'n_fft'
:
n_fft
,
'n_mels'
:
n_mels
,
'hop_length'
:
hop_length
})
self
.
CMVN
=
torch
.
nn
.
InstanceNorm1d
(
80
)
self
.
preprocessor
=
None
self
.
preprocessor
=
MfccFrontEnd
()
self
.
sequence_network
=
PreResNet34
()
self
.
before_speaker_embedding
=
torch
.
nn
.
Linear
(
in_features
=
5120
,
...
...
@@ -441,22 +500,8 @@ class Xtractor(torch.nn.Module):
self
.
after_speaker_embedding_weight_decay
=
0.00
elif
model_archi
==
"fastresnet34"
:
self
.
input_nbdim
=
2
# Feature extraction
n_fft
=
2048
win_length
=
None
hop_length
=
128
n_mels
=
80
n_mfcc
=
80
self
.
MFCC
=
torchaudio
.
transforms
.
MFCC
(
sample_rate
=
16000
,
n_mfcc
=
n_mfcc
,
melkwargs
=
{
'n_fft'
:
n_fft
,
'n_mels'
:
n_mels
,
'hop_length'
:
hop_length
})
self
.
CMVN
=
torch
.
nn
.
InstanceNorm1d
(
80
)
self
.
preprocessor
=
None
self
.
preprocessor
=
MfccFrontEnd
()
self
.
sequence_network
=
PreFastResNet34
()
self
.
before_speaker_embedding
=
torch
.
nn
.
Linear
(
in_features
=
2560
,
...
...
@@ -743,9 +788,12 @@ class Xtractor(torch.nn.Module):
x
=
self
.
preprocessor
(
x
)
else
:
x
=
self
.
MFCC
(
x
)
x
=
self
.
CMVN
(
x
)
#x = x.unsqueeze(1)
with
torch
.
no_grad
():
with
torch
.
cuda
.
amp
.
autocast
(
enabled
=
False
):
x
=
self
.
PreEmphasis
(
x
)
x
=
self
.
MFCC
(
x
)
x
=
self
.
CMVN
(
x
).
unsqueeze
(
1
)
x
=
self
.
sequence_network
(
x
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment