Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Anthony Larcher
sidekit
Commits
ebb18b54
Commit
ebb18b54
authored
Jun 17, 2021
by
Anthony Larcher
Browse files
Merge branch 'dev_al'
# Conflicts: # nnet/res_net.py # nnet/xvector.py
parents
43927406
2aacbe26
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
__init__.py
View file @
ebb18b54
...
...
@@ -34,9 +34,8 @@ import os
import
sys
# Read environment variable if it exists
SIDEKIT_CONFIG
=
{
"libsvm"
:
Tru
e
,
SIDEKIT_CONFIG
=
{
"libsvm"
:
Fals
e
,
"mpi"
:
False
,
"cuda"
:
True
}
...
...
@@ -165,15 +164,10 @@ if SIDEKIT_CONFIG["cuda"]:
if
CUDA
:
from
.nnet
import
FForwardNetwork
from
.nnet
import
kaldi_to_hdf5
from
.nnet
import
XvectorMultiDataset
from
.nnet
import
XvectorDataset
from
.nnet
import
StatDataset
from
.nnet
import
Xtractor
from
.nnet
import
xtrain
from
.nnet
import
extract_embeddings
from
.nnet
import
extract_sliding_embedding
from
.nnet
import
ResBlock
from
.nnet
import
ResNet18
from
.nnet
import
SincNet
else
:
...
...
bosaris/key.py
View file @
ebb18b54
...
...
@@ -235,8 +235,8 @@ class Key:
with
h5py
.
File
(
input_file_fame
,
"r"
)
as
f
:
key
=
Key
()
key
.
modelset
=
f
.
get
(
"modelset"
)
[()]
key
.
segset
=
f
.
get
(
"segset"
)
[()]
key
.
modelset
=
f
[
"modelset"
]
[()]
key
.
segset
=
f
[
"segset"
]
[()]
# if running python 3, need a conversion to unicode
if
sys
.
version_info
[
0
]
==
3
:
...
...
bosaris/ndx.py
View file @
ebb18b54
...
...
@@ -189,15 +189,16 @@ class Ndx:
"""
with
h5py
.
File
(
input_file_name
,
"r"
)
as
f
:
ndx
=
Ndx
()
ndx
.
modelset
=
f
.
get
(
"modelset"
)
[()]
ndx
.
segset
=
f
.
get
(
"segset"
)
[()]
ndx
.
modelset
=
f
[
"modelset"
]
[()]
ndx
.
segset
=
f
[
"segset"
]
[()]
# if running python 3, need a conversion to unicode
if
sys
.
version_info
[
0
]
==
3
:
ndx
.
modelset
=
ndx
.
modelset
.
astype
(
'U100'
,
copy
=
False
)
ndx
.
segset
=
ndx
.
segset
.
astype
(
'U100'
,
copy
=
False
)
ndx
.
modelset
=
ndx
.
modelset
.
astype
(
'U100'
)
ndx
.
segset
=
ndx
.
segset
.
astype
(
'U100'
)
ndx
.
trialmask
=
f
.
get
(
"trial_mask"
)[()].
astype
(
'bool'
)
ndx
.
trialmask
=
numpy
.
zeros
((
ndx
.
modelset
.
shape
[
0
],
ndx
.
segset
.
shape
[
0
]),
dtype
=
numpy
.
bool
)
f
[
"trial_mask"
].
read_direct
(
ndx
.
trialmask
)
assert
ndx
.
validate
(),
"Error: wrong Ndx format"
return
ndx
...
...
bosaris/scores.py
View file @
ebb18b54
...
...
@@ -163,12 +163,18 @@ class Scores:
:return: a vector of target scores.
:return: a vector of non-target scores.
"""
new_score
=
self
.
align_with_ndx
(
key
)
tarndx
=
key
.
tar
&
new_score
.
scoremask
nonndx
=
key
.
non
&
new_score
.
scoremask
tar
=
new_score
.
scoremat
[
tarndx
]
non
=
new_score
.
scoremat
[
nonndx
]
return
tar
,
non
if
(
key
.
modelset
==
self
.
modelset
).
all
()
\
and
(
key
.
segset
==
self
.
segset
).
all
()
\
and
self
.
scoremask
.
shape
[
0
]
==
key
.
tar
.
shape
[
0
]
\
and
self
.
scoremask
.
shape
[
1
]
==
key
.
tar
.
shape
[
1
]:
return
self
.
scoremat
[
key
.
tar
&
self
.
scoremask
],
self
.
scoremat
[
key
.
non
&
self
.
scoremask
]
else
:
new_score
=
self
.
align_with_ndx
(
key
)
tarndx
=
key
.
tar
&
new_score
.
scoremask
nonndx
=
key
.
non
&
new_score
.
scoremask
tar
=
new_score
.
scoremat
[
tarndx
]
non
=
new_score
.
scoremat
[
nonndx
]
return
tar
,
non
def
align_with_ndx
(
self
,
ndx
):
"""The ordering in the output Scores object corresponds to ndx, so
...
...
features_server.py
View file @
ebb18b54
...
...
@@ -221,7 +221,7 @@ class FeaturesServer(object):
feat
=
pca_dct
(
feat
,
self
.
dct_pca_config
[
0
],
self
.
dct_pca_config
[
1
],
self
.
dct_pca_config
[
2
])
elif
self
.
sdc
:
feat
=
shifted_delta_cepstral
(
feat
,
d
=
self
.
sdc_config
[
0
],
p
=
self
.
sdc_config
[
1
],
k
=
self
.
sdc_config
[
2
])
# Apply a mask on the features
if
self
.
mask
is
not
None
:
feat
=
self
.
_mask
(
feat
)
...
...
@@ -488,6 +488,7 @@ class FeaturesServer(object):
feat
,
label
=
self
.
post_processing
(
feat
,
label
,
global_mean
,
global_std
)
else
:
feat
,
label
=
self
.
post_processing
(
feat
,
label
)
return
feat
,
label
def
get_features_per_speaker
(
self
,
show
,
idmap
,
channel
=
0
,
input_feature_filename
=
None
,
label
=
None
):
...
...
iv_scoring.py
View file @
ebb18b54
...
...
@@ -27,13 +27,15 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
import
copy
import
logging
import
sys
import
numpy
import
scipy
from
sidekit.bosaris
import
Ndx
from
sidekit.bosaris
import
Scores
import
torch
from
sidekit.bosaris
import
Ndx
,
Scores
from
sidekit.statserver
import
StatServer
import
sys
if
sys
.
version_info
.
major
>
2
:
from
functools
import
reduce
...
...
@@ -58,7 +60,7 @@ def _check_missing_model(enroll, test, ndx):
return
clean_ndx
def
cosine_scoring
(
enroll
,
test
,
ndx
,
wccn
=
None
,
check_missing
=
True
):
def
cosine_scoring
(
enroll
,
test
,
ndx
,
wccn
=
None
,
check_missing
=
True
,
device
=
None
):
"""Compute the cosine similarities between to sets of vectors. The list of
trials to perform is given in an Ndx object.
...
...
@@ -96,10 +98,15 @@ def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
enroll_copy
.
norm_stat1
()
if
enroll_copy
!=
test_copy
:
test_copy
.
norm_stat1
()
s
=
numpy
.
dot
(
enroll_copy
.
stat1
,
test_copy
.
stat1
.
transpose
())
s_size_in_bytes
=
enroll_copy
.
stat1
.
shape
[
0
]
*
test_copy
.
stat1
.
shape
[
0
]
*
4
if
device
==
None
:
device
=
torch
.
device
(
"cuda:0"
if
torch
.
cuda
.
is_available
()
and
s_size_in_bytes
<
3e9
else
"cpu"
)
else
:
device
=
device
if
torch
.
cuda
.
is_available
()
and
s_size_in_bytes
<
3e9
else
torch
.
device
(
"cpu"
)
score
=
Scores
()
score
.
scoremat
=
s
score
.
scoremat
=
torch
.
einsum
(
'ij,kj'
,
torch
.
FloatTensor
(
enroll_copy
.
stat1
).
to
(
device
),
torch
.
FloatTensor
(
test_copy
.
stat1
).
to
(
device
)).
cpu
().
numpy
()
score
.
modelset
=
clean_ndx
.
modelset
score
.
segset
=
clean_ndx
.
segset
score
.
scoremask
=
clean_ndx
.
trialmask
...
...
nnet/__init__.py
View file @
ebb18b54
...
...
@@ -28,23 +28,27 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
from
.augmentation
import
AddNoise
from
.feed_forward
import
FForwardNetwork
from
.feed_forward
import
kaldi_to_hdf5
from
.xsets
import
XvectorMultiDataset
,
XvectorDataset
,
StatDataset
,
IdMapSet_per_speaker
from
.xvector
import
Xtractor
,
xtrain
,
extract_embeddings
,
extract_sliding_embedding
,
MeanStdPooling
from
.res_net
import
ResBlock
,
ResNet18
,
PreResNet34
from
.rawnet
import
prepare_voxceleb1
,
Vox1Set
,
PreEmphasis
from
.xsets
import
IdMapSetPerSpeaker
from
.xsets
import
SideSet
from
.xsets
import
SideSampler
from
.xvector
import
Xtractor
from
.xvector
import
xtrain
from
.xvector
import
extract_embeddings
from
.pooling
import
MeanStdPooling
from
.pooling
import
AttentivePooling
from
.pooling
import
GruPooling
from
.res_net
import
ResBlock
from
.res_net
import
PreResNet34
from
.res_net
import
PreFastResNet34
from
.res_net
import
PreHalfResNet34
from
.sincnet
import
SincNet
from
.preprocessor
import
RawPreprocessor
from
.preprocessor
import
MfccFrontEnd
from
.preprocessor
import
MelSpecFrontEnd
has_pyroom
=
True
try
:
import
pyroomacoustics
except
ImportError
:
has_pyroom
=
False
if
has_pyroom
:
from
.augmentation
import
AddReverb
__author__
=
"Anthony Larcher and Sylvain Meignier"
...
...
nnet/augmentation.py
View file @
ebb18b54
...
...
@@ -26,10 +26,14 @@ Copyright 2014-2021 Anthony Larcher
"""
import
collections
import
math
import
numpy
from
scipy
import
signal
import
pandas
import
random
import
soundfile
import
torch
import
torchaudio
has_pyroom
=
True
try
:
...
...
@@ -51,320 +55,242 @@ __docformat__ = 'reStructuredText'
Noise
=
collections
.
namedtuple
(
'Noise'
,
'type file_id duration'
)
def
normalize
(
wav
):
"""
:param wav:
:return:
"""
return
wav
/
(
numpy
.
sqrt
(
numpy
.
mean
(
wav
**
2
))
+
1e-8
)
class
PreEmphasis
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
coef
:
float
=
0.97
):
super
().
__init__
()
self
.
coef
=
coef
# make kernel
# In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
self
.
register_buffer
(
'flipped_filter'
,
torch
.
FloatTensor
([
-
self
.
coef
,
1.
]).
unsqueeze
(
0
).
unsqueeze
(
0
)
)
def
crop
(
signal
,
duration
):
"""
def
forward
(
self
,
input
:
torch
.
tensor
)
->
torch
.
tensor
:
assert
len
(
input
.
size
())
==
2
,
'The number of dimensions of input tensor must be 2!'
# reflect padding to match lengths of in/out
input
=
input
.
unsqueeze
(
1
)
input
=
torch
.
nn
.
functional
.
pad
(
input
,
(
1
,
0
),
'reflect'
)
return
torch
.
nn
.
functional
.
conv1d
(
input
,
self
.
flipped_filter
).
squeeze
(
1
)
:return:
"""
start
=
random
.
randint
(
0
,
signal
.
shape
[
0
]
-
duration
)
chunk
=
signal
[
start
:
start
+
duration
]
return
chunk
class
FrequencyMask
(
object
):
"""Crop randomly the image in a sample.
class
AddNoise
(
object
):
Args:
output_size (tuple or int): Desired output size. If int, square crop
is made.
"""
def
__init__
(
self
,
max_size
,
feature_size
):
self
.
max_size
=
max_size
self
.
feature_size
=
feature_size
"""
def
__call__
(
self
,
sample
):
data
=
sample
[
0
]
if
sample
[
2
]:
size
=
numpy
.
random
.
randint
(
1
,
self
.
max_size
)
f0
=
numpy
.
random
.
randint
(
0
,
self
.
feature_size
-
self
.
max_size
)
data
[
f0
:
f0
+
size
,
:]
=
10.
return
data
,
sample
[
1
],
sample
[
2
],
sample
[
3
],
sample
[
4
],
sample
[
5
]
def
__init__
(
self
,
noise_db_csv
,
snr_min_max
,
noise_root_path
,
sample_rate
=
16000
):
"""
"""
self
.
snr_min
=
snr_min_max
[
0
]
self
.
snr_max
=
snr_min_max
[
1
]
self
.
noise_root_path
=
noise_root_path
self
.
sample_rate
=
sample_rate
class
TemporalMask
(
object
):
"""Crop randomly the image in a sample.
df
=
pandas
.
read_csv
(
noise_db_csv
)
self
.
noises
=
[]
for
index
,
row
in
df
.
iterrows
():
self
.
noises
.
append
(
Noise
(
type
=
row
[
"type"
],
file_id
=
row
[
"file_id"
],
duration
=
row
[
"duration"
]))
Args:
output_size (tuple or int): Desired output size. If int, square crop
is made.
"""
def
__init__
(
self
,
max_size
):
self
.
max_size
=
max_size
def
__call__
(
self
,
sample
):
"""
:param original:
:param sample_rate:
:return:
"""
data
=
sample
[
0
]
if
sample
[
4
]:
original_duration
=
len
(
data
)
# accumulate enough noise to cover duration of original waveform
noises
=
[]
left
=
original_duration
while
left
>
0
:
# select noise file at random
file
=
random
.
choice
(
self
.
noises
)
noise_signal
,
fs
=
soundfile
.
read
(
self
.
noise_root_path
+
"/"
+
file
.
file_id
+
".wav"
)
if
sample
[
3
]:
size
=
numpy
.
random
.
randint
(
1
,
self
.
max_size
)
t0
=
numpy
.
random
.
randint
(
0
,
sample
[
0
].
shape
[
1
]
-
self
.
max_size
)
data
[:,
t0
:
t0
+
size
]
=
10.
return
data
,
sample
[
1
],
sample
[
2
],
sample
[
3
],
sample
[
4
],
sample
[
5
]
# Load noise from file
if
not
fs
==
self
.
sample_rate
:
print
(
"Problem"
)
# todo
duration
=
noise_signal
.
shape
[
0
]
# if noise file is longer than what is needed, crop it
if
duration
>
left
:
noise
=
crop
(
noise_signal
,
left
)
left
=
0
# otherwise, take the whole file
else
:
noise
=
noise_signal
left
-=
duration
# Todo Downsample if needed
# if sample_rate > fs:
#
def
normalize
(
wav
):
"""
noise
=
normalize
(
noise
)
noises
.
append
(
noise
.
squeeze
())
:param wav:
:return:
"""
return
wav
/
(
numpy
.
sqrt
(
numpy
.
mean
(
wav
**
2
))
+
1e-8
)
# concatenate
noise
=
numpy
.
hstack
(
noises
)
# select SNR at random
snr
=
(
self
.
snr_max
-
self
.
snr_min
)
*
numpy
.
random
.
random_sample
()
+
self
.
snr_min
alpha
=
numpy
.
exp
(
-
numpy
.
log
(
10
)
*
snr
/
20
)
def
crop
(
signal
,
duration
):
"""
data
=
normalize
(
data
)
+
alpha
*
noise
:return:
"""
start
=
random
.
randint
(
0
,
signal
.
shape
[
0
]
-
duration
)
chunk
=
signal
[
start
:
start
+
duration
]
return
data
.
squeeze
(),
sample
[
1
],
sample
[
2
],
sample
[
3
],
sample
[
4
],
sample
[
5
]
return
chunk
class
AddNoiseFromSilence
(
object
):
"""
def
data_augmentation
(
speech
,
sample_rate
,
transform_dict
,
transform_number
,
noise_df
=
None
,
rir_df
=
None
):
"""
def
__init__
(
self
,
noise_db_csv
,
snr_min_max
,
noise_root_path
,
sample_rate
=
16000
):
"""
"""
self
.
snr_min
=
snr_min_max
[
0
]
self
.
snr_max
=
snr_min_max
[
1
]
self
.
noise_root_path
=
noise_root_path
self
.
sample_rate
=
sample_rate
df
=
pandas
.
read_csv
(
noise_db_csv
)
self
.
noises
=
[]
for
index
,
row
in
df
.
iterrows
()
:
self
.
noises
.
append
(
Noise
(
type
=
row
[
"type"
],
file_id
=
row
[
"file_id"
],
duration
=
row
[
"duration"
]))
:param speech:
:param transform_dict:
:param transform_number
:
:return:
def
__call__
(
self
,
sample
):
"""
tranformation
pipeline: add_noise,add_reverb
add_noise:
noise_db_csv: filename.csv
snr: 5,6,7,8,9,10,11,12,13,14,15
add_reverb:
rir_db_csv: filename.csv
codec: true
phone_filtering: true
:param original:
:param sample_rate:
:return:
"""
data
=
sample
[
0
]
if
sample
[
4
]:
original_duration
=
len
(
data
)
# accumulate enough noise to cover duration of original waveform
noises
=
[]
left
=
original_duration
while
left
>
0
:
# select noise file at random
file
=
random
.
choice
(
self
.
noises
)
noise_signal
,
fs
=
soundfile
.
read
(
self
.
noise_root_path
+
"/"
+
file
.
file_id
+
".wav"
)
# Load noise from file
if
not
fs
==
self
.
sample_rate
:
print
(
"Problem"
)
# todo
duration
=
noise_signal
.
shape
[
0
]
# if noise file is longer than what is needed, crop it
if
duration
>
left
:
noise
=
crop
(
noise_signal
,
left
)
left
=
0
# otherwise, take the whole file
else
:
noise
=
noise_signal
left
-=
duration
# Todo Downsample if needed
# if sample_rate > fs:
#
noise
=
normalize
(
noise
)
noises
.
append
(
noise
.
squeeze
())
# concatenate
noise
=
numpy
.
hstack
(
noises
)
# select SNR at random
snr
=
(
self
.
snr_max
-
self
.
snr_min
)
*
numpy
.
random
.
random_sample
()
+
self
.
snr_min
alpha
=
numpy
.
exp
(
-
numpy
.
log
(
10
)
*
snr
/
20
)
data
=
normalize
(
data
)
+
alpha
*
noise
return
data
.
squeeze
(),
sample
[
1
],
sample
[
2
],
sample
[
3
],
sample
[
4
],
sample
[
5
]
if
has_pyroom
:
class
AddReverb
(
object
):
"""Simulate indoor reverberation
Parameters
----------
depth : (float, float), optional
Minimum and maximum values for room depth (in meters).
Defaults to (2.0, 10.0).
width : (float, float), optional
Minimum and maximum values for room width (in meters).
Defaults to (1.0, 10.0).
height : (float, float), optional
Minimum and maximum values for room heigth (in meters).
Defaults to (2.0, 5.0).
absorption : (float, float), optional
Minimum and maximum values of walls absorption coefficient.
Defaults to (0.2, 0.9).
noise : str or list of str, optional
`pyannote.database` collection(s) used for adding noise.
Defaults to "MUSAN.Collection.BackgroundNoise"
snr : (float, float), optional
Minimum and maximum values of signal-to-noise ratio.
Defaults to (5.0, 15.0)
"""
def
__init__
(
self
,
depth
=
(
2.0
,
10.0
),
width
=
(
1.0
,
10.0
),
height
=
(
2.0
,
5.0
),
absorption
=
(
0.2
,
0.9
),
noise
=
None
,
snr
=
(
5.0
,
15.0
)
):
super
().
__init__
()
self
.
depth
=
depth
self
.
width
=
width
self
.
height
=
height
self
.
absorption
=
absorption
self
.
max_order_
=
17
self
.
noise
=
noise
self
.
snr
=
snr
self
.
noise_
=
noise
self
.
n_rooms_
=
128
self
.
new_rooms_prob_
=
0.001
self
.
main_lock_
=
threading
.
Lock
()
self
.
rooms_
=
collections
.
deque
(
maxlen
=
self
.
n_rooms_
)
self
.
room_lock_
=
[
threading
.
Lock
()
for
_
in
range
(
self
.
n_rooms_
)]
@
staticmethod
def
random
(
m
,
M
):
"""
:param m:
:param M:
:return:
"""
return
(
M
-
m
)
*
numpy
.
random
.
random_sample
()
+
m
def
new_room
(
self
,
sample_rate
:
int
):
"""
:param sample_rate:
:return:
"""
# generate a room at random
depth
=
self
.
random
(
*
self
.
depth
)
width
=
self
.
random
(
*
self
.
width
)
height
=
self
.
random
(
*
self
.
height
)
absorption
=
self
.
random
(
*
self
.
absorption
)
room
=
pyroomacoustics
.
ShoeBox
(
[
depth
,
width
,
height
],
fs
=
sample_rate
,
absorption
=
absorption
,
max_order
=
self
.
max_order_
,
)
# play the original audio chunk at a random location
original
=
[
self
.
random
(
0
,
depth
),
self
.
random
(
0
,
width
),
self
.
random
(
0
,
height
),
]
room
.
add_source
(
original
)
# play the noise audio chunk at a random location
noise
=
[
self
.
random
(
0
,
depth
),
self
.
random
(
0
,
width
),
self
.
random
(
0
,
height
)]
room
.
add_source
(
noise
)
# place the microphone at a random location
microphone
=
[
self
.
random
(
0
,
depth
),
self
.
random
(
0
,
width
),
self
.
random
(
0
,
height
),
]
room
.
add_microphone_array
(
pyroomacoustics
.
MicrophoneArray
(
numpy
.
c_
[
microphone
,
microphone
],
sample_rate
)
)
room
.
compute_rir
()
return
room
def
__call__
(
self
,
sample
):
data
=
sample
[
0
]
if
sample
[
5
]:
with
self
.
main_lock_
:
# initialize rooms (with 2 sources and 1 microphone)
while
len
(
self
.
rooms_
)
<
self
.
n_rooms_
:
room
=
self
.
new_room
(
self
.
sample_rate
)
self
.
rooms_
.
append
(
room
)
# create new room with probability new_rooms_prob_
if
numpy
.
random
.
rand
()
>
1.0
-
self
.
new_rooms_prob_
:
room
=
self
.
new_room
(
self
.
sample_rate
)
self
.
rooms_
.
append
(
room
)
# choose one room at random
index
=
numpy
.
random
.
choice
(
self
.
n_rooms_
)
# lock chosen room to ensure room.sources are not updated concurrently
with
self
.
room_lock_
[
index
]:
room
=
self
.
rooms_
[
index
]
# play normalized original audio chunk at source #1
n_samples
=
len
(
data
)
data
=
normalize
(
original
).
squeeze
()
room
.
sources
[
0
].
add_signal
(
data
)
# generate noise with random SNR
noise
=
self
.
noise_
(
n_samples
,
self
.
sample_rate
).
squeeze
()
snr
=
self
.
random
(
*
self
.
snr
)
alpha
=
numpy
.
exp
(
-
numpy
.
log
(
10
)
*
snr
/
20
)
noise
*=
alpha
# play noise at source #2
room
.
sources
[
1
].
add_signal
(
noise
)
# simulate room and return microphone signal
room
.
simulate
()
data
=
room
.
mic_array
.
signals
[
0
,
:
n_samples
,
numpy
.
newaxis
]
return
data
,
sample
[
1
],
sample
[
2
],
sample
[
3
]
,
sample
[
4
],
sample
[
5
]
"""
# Select the data augmentation randomly
aug_idx
=
random
.
sample
(
range
(
len
(
transform_dict
.
keys
())),
k
=
transform_number
)
augmentations
=
numpy
.
array
(
list
(
transform_dict
.
keys
()))[
aug_idx
]
if
"stretch"
in
augmentations
:
strech
=
torchaudio
.
functional
.
TimeStretch
()
rate
=
random
.
uniform
(
0.8
,
1.2
)
speech
=
strech
(
speech