Anthony Larcher / sidekit · Commits

Commit be3c8478
Authored Oct 13, 2021 by Le Lan Gaël

Merge branch 'master' into dev-gl3lan

Parents: 9ab7a6b8, 88f4d2b9
Changes: 15

__init__.py
@@ -162,12 +162,9 @@ if SIDEKIT_CONFIG["cuda"]:
if CUDA:
    from .nnet import FForwardNetwork
    from .nnet import kaldi_to_hdf5
    from .nnet import Xtractor
    from .nnet import xtrain
    from .nnet import extract_embeddings
    from .nnet import extract_sliding_embedding
    from .nnet import ResBlock
    from .nnet import SincNet
@@ -190,5 +187,5 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__version__ = "1.3.8.5.2"
__version__ = "1.4"

features_extractor.py
@@ -310,8 +310,7 @@ class FeaturesExtractor(object):
        dir_name = os.path.dirname(feature_filename)  # get the path
        if not os.path.exists(dir_name) and not (dir_name == ''):
            os.makedirs(dir_name)
        h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')
        h5f = h5py.File(feature_filename, 'w', backing_store=backing_store, driver='core')
        if "cep" not in self.save_param:
            cep = None
            cep_mean = None
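Note: the two h5py.File lines above are the before/after versions of the same call (append mode vs. write mode). A minimal illustrative sketch of h5py's in-memory 'core' driver pattern used here (not part of this commit; file and dataset names are made up):

    import h5py
    import numpy

    # With driver='core' the HDF5 file is built entirely in memory;
    # backing_store=True means it is written to disk only when closed.
    h5f = h5py.File("features.h5", "w", backing_store=True, driver="core")
    h5f.create_dataset("cep", data=numpy.zeros((100, 20), dtype="float32"))
    h5f.close()  # "features.h5" is flushed to disk at this point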

features_server.py
@@ -221,7 +221,7 @@ class FeaturesServer(object):
                feat = pca_dct(feat, self.dct_pca_config[0], self.dct_pca_config[1], self.dct_pca_config[2])
            elif self.sdc:
                feat = shifted_delta_cepstral(feat, d=self.sdc_config[0], p=self.sdc_config[1], k=self.sdc_config[2])

        # Apply a mask on the features
        if self.mask is not None:
            feat = self._mask(feat)
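For reference, self.sdc_config is unpacked as (d, p, k) in the shifted_delta_cepstral call above. A hedged sketch with the classic 7-1-3-7 SDC setup (the concrete values are an assumption, not taken from this commit):

    # hypothetical configuration: delta spread d=1, shift p=3, k=7 stacked blocks
    sdc_config = (1, 3, 7)
    feat = shifted_delta_cepstral(feat, d=sdc_config[0], p=sdc_config[1], k=sdc_config[2])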
@@ -488,6 +488,7 @@ class FeaturesServer(object):
            feat, label = self.post_processing(feat, label, global_mean, global_std)
        else:
            feat, label = self.post_processing(feat, label)

        return feat, label

    def get_features_per_speaker(self, show, idmap, channel=0, input_feature_filename=None, label=None):

nnet/__init__.py
@@ -28,15 +28,12 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""

from .feed_forward import FForwardNetwork
from .feed_forward import kaldi_to_hdf5
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSet
from .xsets import SideSampler
from .xvector import Xtractor
from .xvector import xtrain
from .xvector import extract_embeddings
from .xvector import extract_sliding_embedding
from .pooling import MeanStdPooling
from .pooling import AttentivePooling
from .pooling import GruPooling
@@ -49,15 +46,6 @@ from .preprocessor import RawPreprocessor
from .preprocessor import MfccFrontEnd
from .preprocessor import MelSpecFrontEnd

has_pyroom = True
try:
    import pyroomacoustics
except ImportError:
    has_pyroom = False

if has_pyroom:
    from .augmentation import AddReverb

__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"

nnet/augmentation.py
@@ -26,21 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""

import collections
import math
import numpy
from scipy import signal
import pandas
import random
import soundfile
import torch
import torchaudio

has_pyroom = True
try:
    import pyroomacoustics
except ImportError:
    has_pyroom = False

from scipy import signal

__author__ = "Anthony Larcher and Sylvain Meignier"
@@ -55,8 +46,10 @@ __docformat__ = 'reStructuredText'
Noise = collections.namedtuple('Noise', 'type file_id duration')


class PreEmphasis(torch.nn.Module):
    """
    Apply pre-emphasis filtering
    """

    def __init__(self, coef: float = 0.97):
        super().__init__()
@@ -67,13 +60,18 @@ class PreEmphasis(torch.nn.Module):
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, input: torch.tensor) -> torch.tensor:
        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = torch.nn.functional.pad(input, (1, 0), 'reflect')
        return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)

    def forward(self, input_signal: torch.tensor) -> torch.tensor:
        """
        Forward pass of the pre-emphasis filtering

        :param input_signal: the input signal
        :return: the filtered signal
        """
        assert len(input_signal.size()) == 2, 'The number of dimensions of input tensor must be 2!'
        # reflect padding to match lengths of in/out
        input_signal = input_signal.unsqueeze(1)
        input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
        return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
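A brief usage sketch for the PreEmphasis module above (illustrative only, not part of the diff): both forward variants expect a 2-D (batch, samples) tensor and return a tensor of the same shape, computing y[t] = x[t] - coef * x[t-1].

    pre_emphasis = PreEmphasis(coef=0.97)
    waveform = torch.randn(4, 16000)   # e.g. four one-second signals at 16 kHz
    filtered = pre_emphasis(waveform)  # same shape, (4, 16000)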

class FrequencyMask(object):
@@ -115,24 +113,26 @@ class TemporalMask(object):
        return data, sample[1], sample[2], sample[3], sample[4], sample[5]


def normalize(wav):
    """
    Center and reduce a waveform

    :param wav:
    :return:
    :param wav: the input waveform
    :return: the normalized waveform
    """
    return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)

def crop(signal, duration):
def crop(input_signal, duration):
    """
    Select a chunk from an audio segment

    :param input_signal: signal to select a chunk from
    :param duration: duration of the chunk to select
    :return:
    """
    start = random.randint(0, signal.shape[0] - duration)
    chunk = signal[start:start + duration]
    start = random.randint(0, input_signal.shape[0] - duration)
    chunk = input_signal[start:start + duration]

    return chunk
@@ -141,13 +141,23 @@ def data_augmentation(speech,
                      transform_dict,
                      transform_number,
                      noise_df=None,
                      rir_df=None):
                      rir_df=None,
                      babble_noise=True):
    """
    Perform data augmentation on an input signal.
    Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
    dictionary of possible transformations.

    :param speech:
    :param transform_dict:
    :param transform_number:
    :return:
    :param speech: the input signal to be augmented
    :param sample_rate: sampling rate of the input signal to augment
    :param transform_dict: the dictionary of possible augmentations to apply
    :param transform_number: the number of transformations to apply on each chunk
    :param rir_df: a pandas dataframe object including the list of RIR signals to choose from; default is None
    :param noise_df: a pandas dataframe object including the list of NOISE signals to choose from; default is None
    :param babble_noise: boolean that enables the use of babble noise, True by default (typically turned to False when
                         the task includes overlapping speech detection).
    :return: augmented signal

    transformation pipeline: add_noise,add_reverb
@@ -158,7 +168,6 @@ def data_augmentation(speech,
        rir_db_csv: filename.csv
    codec: true
    phone_filtering: true
    """
    # Select the data augmentation randomly
    aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
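For illustration, a hypothetical transform_dict matching the options listed in the docstring above; only transform_dict["add_noise"]["data_path"] is referenced directly in the code below, so the exact schema of the other entries is an assumption:

    transform_dict = {
        "add_noise": {"noise_db_csv": "noise.csv", "data_path": "./noise/"},
        "add_reverb": {"rir_db_csv": "rir.csv", "data_path": "./rir/"},
        "codec": {},
        "phone_filtering": {},
    }
    # noise_df / rir_df would be pandas DataFrames built from the csv files above, e.g.
    # augmented = data_augmentation(speech, sample_rate, transform_dict, transform_number=2,
    #                               noise_df=noise_df, rir_df=rir_df, babble_noise=True)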
@@ -183,7 +192,10 @@ def data_augmentation(speech,
    if "add_noise" in augmentations:
        # Pick a noise type
        noise = torch.zeros_like(speech)
        noise_idx = random.randrange(3)
        if not babble_noise:
            noise_idx = random.randrange(1, 3)
        else:
            noise_idx = random.randrange(0, 4)

        # speech
        if noise_idx == 0:
@@ -206,6 +218,19 @@ def data_augmentation(speech,
            snr_db = random.randint(0, 15)
            noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
        # babble noise with different volume
        elif noise_idx == 3:
            snr_db = random.randint(13, 20)
            pick_count = random.randint(5, 10)  # Randomly select 5 to 10 speakers
            index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
            noise = torch.zeros(1, speech.shape[1])
            for idx in index_list:
                noise_row = noise_df.loc['speech'].iloc[idx]
                noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
                transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')  # Randomly select volume level (5-15d)
                noise += transform(noise_)
            noise /= pick_count

        speech_power = speech.norm(p=2)
        noise_power = noise.norm(p=2)
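The actual mixing lines are collapsed in this view; a hedged sketch of the usual SNR-based mixing step that follows norms like these (a standard recipe, which may differ from the real code):

    snr = 10 ** (snr_db / 20)                 # target SNR as an amplitude ratio
    scale = snr * noise_power / speech_power  # rescale speech relative to the noise
    speech = (scale * speech + noise) / 2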
@@ -229,10 +254,10 @@ def data_augmentation(speech,
        effects = [["bandpass", "2000", "3500"], ["bandstop", "200", "500"]]
        speech, sample_rate = torchaudio.sox_eefects.apply_effects_tensor(speech,
        sample_rate = torchaudio.sox_eefects.apply_effects_tensor(speech,
                                                                   sample_rate,
                                                                   effects=[effects[random.randint(0, 1)]],
                                                                   effects=[effects[random.randint(0, 1)]],
                                                                   )

    if "codec" in augmentations:
@@ -251,6 +276,15 @@ def data_augmentation(speech,
    return speech


def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
    """
    Pick a noise signal to add while performing data augmentation

    :param noise_row: a row from a Pandas dataframe object
    :param speech_shape: shape of the speech signal to be augmented
    :param sample_rate: sampling rate of the speech signal to be augmented
    :param data_path: directory where to load the noise file from
    :return:
    """
    noise_start = noise_row['start']
    noise_duration = noise_row['duration']
    noise_file_id = noise_row['file_id']
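The rest of the function body is collapsed here; a hedged sketch of the segment read it typically performs with soundfile (the ".wav" extension and path layout are assumptions):

    start_frame = int(noise_start * sample_rate)
    frame_count = int(noise_duration * sample_rate)
    noise_seg, _ = soundfile.read(data_path + "/" + noise_file_id + ".wav",
                                  start=start_frame,
                                  frames=frame_count,
                                  dtype="float32")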

nnet/feed_forward.py
deleted 100755 → 0  (this diff is collapsed)

nnet/loss.py
@@ -26,17 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""

import h5py
import logging
import math
import sys
import numpy
import torch
import torch.optim as optim
import torch.multiprocessing as mp
from collections import OrderedDict
from ..bosaris import IdMap
from ..statserver import StatServer
from torch.nn import Parameter
@@ -52,6 +47,9 @@ __docformat__ = 'reS'

class ArcMarginModel(torch.nn.Module):
    """
    """
    def __init__(self, args):
        super(ArcMarginModel, self).__init__()
@@ -68,6 +66,12 @@ class ArcMarginModel(torch.nn.Module):
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, input, label):
        """
        :param input:
        :param label:
        :return:
        """
        x = F.normalize(input)
        W = F.normalize(self.weight)
        cosine = F.linear(x, W)
@@ -85,12 +89,21 @@ class ArcMarginModel(torch.nn.Module):
def l2_norm(input, axis=1):
    """
    :param input:
    :param axis:
    :return:
    """
    norm = torch.norm(input, 2, axis, True)
    output = torch.div(input, norm)
    return output


class ArcFace(torch.nn.Module):
    """
    """
    # implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
    def __init__(self, embedding_size, classnum, s=64., m=0.5):
        super(ArcFace, self).__init__()
@@ -106,6 +119,12 @@ class ArcFace(torch.nn.Module):
        self.threshold = math.cos(math.pi - m)

    def forward(self, embbedings, target):
        """
        :param embbedings:
        :param target:
        :return:
        """
        # weights norm
        nB = len(embbedings)
        kernel_norm = l2_norm(self.kernel, axis=0)
@@ -136,6 +155,9 @@ class ArcFace(torch.nn.Module):
################################## Cosface head #############################################################

class Am_softmax(torch.nn.Module):
    """
    """
    # implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599
    def __init__(self, embedding_size=512, classnum=51332):
        super(Am_softmax, self).__init__()

@@ -147,6 +169,12 @@ class Am_softmax(torch.nn.Module):
        self.s = 30.  # see normface https://arxiv.org/abs/1704.06369

    def forward(self, embbedings, label):
        """
        :param embbedings:
        :param label:
        :return:
        """
        kernel_norm = l2_norm(self.kernel, axis=0)
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
@@ -226,14 +254,15 @@ class ArcLinear(torch.nn.Module):

class ArcMarginProduct(torch.nn.Module):
    r"""Implement of large margin arc distance: :
    """
    Implement of large margin arc distance: :

        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
@@ -250,10 +279,33 @@ class ArcMarginProduct(torch.nn.Module):
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

    def change_params(self, s=None, m=None):
        """
        :param s:
        :param m:
        """
        if s is None:
            s = self.s
        if m is None:
            m = self.m
        self.s = s
        self.m = m
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, input, target=None):
        """
        :param input:
        :param target:
        :return:
        """
        # cos(theta)
        cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
                                            torch.nn.functional.normalize(self.weight))
                                            torch.nn.functional.normalize(self.weight))
        if target == None:
            return cosine * self.s
        # cos(theta + m)
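The lines after the "# cos(theta + m)" comment are collapsed here; a hedged sketch of the standard ArcFace margin step that uses the cos_m, sin_m, th and mm terms defined above (it may differ in detail from the actual code):

    sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
    phi = cosine * self.cos_m - sine * self.sin_m               # cos(theta + m)
    phi = torch.where(cosine > self.th, phi, cosine - self.mm)  # linear fallback past pi - m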
@@ -275,7 +327,10 @@ class ArcMarginProduct(torch.nn.Module):

class SoftmaxAngularProto(torch.nn.Module):
    # from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    def __init__(self, spk_count, emb_dim=256, init_w=10.0, init_b=-5.0, **kwargs):
        super(SoftmaxAngularProto, self).__init__()
@@ -290,27 +345,38 @@ class SoftmaxAngularProto(torch.nn.Module):
            ]))

    def forward(self, x, target=None):
        """
        :param x:
        :param target:
        :return:
        """
        assert x.size()[1] >= 2

        cce_prediction = self.cce_backend(x)

        if target == None:
        if target is None:
            return cce_prediction

        x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
        x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]

        cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                                out_anchor.unsqueeze(-1).transpose(0, 2))
        cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                                out_anchor.unsqueeze(-1).transpose(0, 2))
        torch.clamp(self.w, 1e-6)
        cos_sim_matrix = cos_sim_matrix * self.w + self.b

        loss = self.criterion(cos_sim_matrix, torch.arange(0, cos_sim_matrix.shape[0], device=x.device)) + self.criterion(cce_prediction, target)
        loss = self.criterion(cos_sim_matrix, torch.arange(0, cos_sim_matrix.shape[0], device=x.device)) + self.criterion(cce_prediction, target)
        return loss, cce_prediction

class AngularProximityMagnet(torch.nn.Module):
    # from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
        super(AngularProximityMagnet, self).__init__()
@@ -340,17 +406,22 @@ class AngularProximityMagnet(torch.nn.Module):
        self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    def forward(self, x, target=None):
        """
        :param x:
        :param target:
        :return:
        """
        assert x.size()[1] >= 2

        cce_prediction = self.cce_backend(x)
        #x = self.magnitude(x) * torch.nn.functional.normalize(x)

        if target == None:
        if target is None:
            return x, cce_prediction

        x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]

        ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                               out_anchor.unsqueeze(-1).transpose(0, 2))
        torch.clamp(self.w, 1e-6)

nnet/pooling.py
@@ -111,7 +111,7 @@ class AttentivePooling(torch.nn.Module):

class GruPooling(torch.nn.Module):
    """
    Pooling done by using a recurrent network
    """
    def __init__(self, input_size, gru_node, nb_gru_layer):
        """
@@ -136,7 +136,7 @@ class GruPooling(torch.nn.Module):
        """
        x = self.bn_before_gru(x)
        x = self.lrelu_keras(x)
        x = x.permute(0, 2, 1)  #(batch, filt, time) >> (batch, time, filt)
        x = x.permute(0, 2, 1)  # (batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]

nnet/preprocessor.py
@@ -27,39 +27,14 @@ Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
import logging
import math
import os
import numpy
import pandas
import pickle
import shutil
import time
import os
import torch
import torchaudio
import tqdm
import yaml

from collections import OrderedDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from .augmentation import PreEmphasis
from .xsets import SideSet
from .xsets import IdMapSet
from .xsets import IdMapSetPerSpeaker
from .xsets import SideSampler
from .res_net import ResBlockWFMS
from .res_net import ResBlock
from .res_net import PreResNet34
from .res_net import PreFastResNet34
from ..bosaris import IdMap
from ..bosaris import Key
from ..bosaris import Ndx
from ..statserver import StatServer
from ..iv_scoring import cosine_scoring
from .sincnet import SincNet
from .loss import ArcLinear
from .loss import l2_norm
from .loss import ArcMarginProduct
from .sincnet import SincConv1d
from .res_net import LayerNorm

os.environ['MKL_THREADING_LAYER'] = 'GNU'
@@ -83,12 +58,10 @@ torch.backends.cudnn.benchmark = False
numpy.random.seed(0)


class MfccFrontEnd(torch.nn.Module):
    """
    Module that extracts MFCC coefficients
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
@@ -153,18 +126,17 @@ class MfccFrontEnd(torch.nn.Module):

class MelSpecFrontEnd(torch.nn.Module):
    """
    Module that computes the Mel spectrogram of an audio signal
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=1024,
                 f_min=90,
                 f_max=7600,
                 win_length=1024,
                 win_length=400,
                 window_fn=torch.hann_window,
                 hop_length=256,
                 hop_length=160,
                 power=2.0,
                 n_mels=80):
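The 400-sample window and 160-sample hop correspond to 25 ms analysis frames and 10 ms hops at 16 kHz. An illustrative sketch of a torchaudio Mel-spectrogram transform built with these parameters (an assumption about how MelSpecFrontEnd wires them internally):

    mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=16000,
                                                    n_fft=1024,
                                                    f_min=90,
                                                    f_max=7600,
                                                    win_length=400,
                                                    hop_length=160,
                                                    window_fn=torch.hann_window,
                                                    power=2.0,
                                                    n_mels=80)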
@@ -229,7 +201,8 @@ class MelSpecFrontEnd(torch.nn.Module):

class RawPreprocessor(torch.nn.Module):
    """
    Pre-process the raw audio signal by using a SincNet architecture
    [ADD REF]
    """
    def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1,
                 bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
        """

nnet/rawnet.py
deleted 100644 → 0
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#