Anthony Larcher / sidekit · Commits

Commit 92f95258
authored Dec 13, 2021 by Anthony Larcher

    mostly doc

parent 88f4d2b9
12 changed files
__init__.py
...
...
@@ -4,7 +4,7 @@
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
...
...
@@ -50,8 +50,8 @@ if 'SIDEKIT' in os.environ:
        if val == "true":
            SIDEKIT_CONFIG["mpi"] = True
    if k == "cuda":
        if val == "true":
            SIDEKIT_CONFIG["cuda"] = True
        if val == "false":
            SIDEKIT_CONFIG["cuda"] = False

PARALLEL_MODULE = 'multiprocessing'  # can be: threading, multiprocessing; MPI is planned in the future
...
...
@@ -187,5 +187,5 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
-__version__ = "1.4"
+__version__ = "1.9"
bosaris/idmap.py
...
...
@@ -259,6 +259,14 @@ class IdMap:
        return ok

    def set(self, left, right, start=None, stop=None):
        """
        Fill the IdMap object with numpy arrays of leftids, rightids, and optionally starts and stops

        :param left: a numpy array for leftids
        :param right: a numpy array for rightids
        :param start: a numpy array for start times (optional)
        :param stop: a numpy array for stop times (optional)
        """
        self.leftids = copy.deepcopy(left)
        self.rightids = copy.deepcopy(right)
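
A minimal usage sketch for IdMap.set (the array contents are invented, and the validate() call assumes the checker whose `return ok` appears just above in this class):

    import numpy
    from sidekit.bosaris import IdMap

    idmap = IdMap()
    idmap.set(left=numpy.array(["spk1", "spk1", "spk2"]),          # model / speaker ids
              right=numpy.array(["s1.wav", "s2.wav", "s3.wav"]))   # segment ids
    assert idmap.validate()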
...
...
bosaris/key.py
...
...
@@ -108,6 +108,14 @@ class Key:
    @classmethod
    def create(cls, modelset, segset, tar, non):
        """
        Class method that creates a Key object

        :param modelset: a numpy array with model IDs
        :param segset: a numpy array with segment IDs
        :param tar: a boolean matrix, True where the trial is a target trial; dimensions must be number of models x number of segments
        :param non: a boolean matrix, True where the trial is an impostor trial; dimensions must be number of models x number of segments

        :return: a new Key object
        """
        key = Key()
        key.modelset = modelset
        key.segset = segset
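
A hedged sketch of Key.create with toy trials (all values are invented; here the non-target matrix is simply the complement of the target matrix):

    import numpy
    from sidekit.bosaris import Key

    models = numpy.array(["spk1", "spk2"])
    segs = numpy.array(["s1", "s2", "s3"])
    tar = numpy.array([[True, False, False],
                       [False, True, False]])
    key = Key.create(models, segs, tar, non=~tar)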
...
...
features_server.py
...
...
@@ -48,7 +48,7 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'


class FeaturesServer(object):
    """
...
...
@@ -494,12 +494,12 @@ class FeaturesServer(object):
    def get_features_per_speaker(self, show, idmap, channel=0, input_feature_filename=None, label=None):
        """
        Load a single file and return a dictionary with spk_ids as keys and (feature, label) as data

        :param show: name of the show
        :param channel: number of the audio channel
        :param input_feature_filename: name of the input file to read from
        :param label: voice activity detection labels (optional)
        :param idmap: idmap to select the features

        :return: a numpy array of acoustic features
        """
        if input_feature_filename is not None:
            self.feature_filename_structure = input_feature_filename
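
A hedged usage sketch (the constructor options shown are a minimal assumed configuration, and the file names are hypothetical):

    from sidekit import FeaturesServer
    from sidekit.bosaris import IdMap

    fs = FeaturesServer(feature_filename_structure="./features/{}.h5",
                        dataset_list=["cep"])
    idmap = IdMap("idmap.h5")                         # hypothetical idmap file
    per_spk = fs.get_features_per_speaker("show1", idmap)
    # keys are speaker ids from the idmap, values hold (features, label) for that speaker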
...
...
@@ -641,7 +641,6 @@ class FeaturesServer(object):
        return numpy.vstack(features_list)

    def _stack_features_worker(self, input_queue, output_queue):
...
...
@@ -660,8 +659,6 @@ class FeaturesServer(object):
                output_queue.put(self.load(*next_task)[0])
                input_queue.task_done()

    #@profile
    def stack_features_parallel(self,  # fileList, numThread=1):
                                show_list,
                                channel_list=None,
...
...
frontend/io.py
...
...
@@ -99,6 +99,12 @@ def write_pcm(data, output_file_name):
@check_path_existance
def write_wav(data, output_file_name, fs):
    """
    Write signal to single channel WAV 16 bits

    :param data: audio signal to write
    :param output_file_name: name of the file to write
    :param fs: sample rate in Hz
    """
    if data.dtype != numpy.int16:
        if data.dtype == numpy.float32:
            data /= numpy.abs(data).max()
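
A hedged example of calling write_wav (the file name is hypothetical; the float32 branch above normalizes the signal before the int16 conversion that presumably follows in the elided code):

    import numpy
    t = numpy.linspace(0., 1., 16000, dtype=numpy.float32)
    tone = 0.5 * numpy.sin(2 * numpy.pi * 440. * t)   # one second of 440 Hz
    write_wav(tone, "tone.wav", fs=16000)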
...
...
@@ -980,6 +986,7 @@ def read_htk_segment(input_file_name,
        m = numpy.r_[numpy.repeat(m[[0]], s - start, axis=0),
                     m,
                     numpy.repeat(m[[-1]], stop - e, axis=0)]
    return m.astype(numpy.float32)


def _add_dataset_header(fh,
                        dataset_id,
                        _min_val,
...
...
@@ -988,6 +995,12 @@ def _add_dataset_header(fh,
    """
    Create a dataset in the HDF5 file and write the data
    after compressing float to int

    :param fh: file handler in HDF5 format
    :param dataset_id: name of the new dataset to create
    :param _min_val: minimum value in the dataset (used for compression)
    :param _range: range of the values in the dataset (used for compression)
    :param _header: header of the dataset
    """
    _c_header = (_header - _min_val) / _range
    _c_header = numpy.clip(_c_header, 0., 1.)  # numpy.clip returns the result; without the assignment this would be a no-op
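
A toy illustration of this normalization step (values invented): header entries are mapped to [0, 1] with (x - min) / range before integer quantization.

    import numpy
    _header = numpy.array([1.0, 2.5, 4.0])
    _min_val, _range = _header.min(), _header.ptp()
    numpy.clip((_header - _min_val) / _range, 0., 1.)   # -> array([0. , 0.5, 1. ])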
...
...
@@ -1004,6 +1017,7 @@ def _add_dataset_header(fh,
                      compression="gzip",
                      fletcher32=True)


def _add_percentile_dataset(fh,
                            dataset_id,
                            data):
...
...
@@ -1011,6 +1025,10 @@ def _add_percentile_dataset(fh,
    Create the dataset in the HDF5 file, write the data
    compressed in int8 format and the header compressed in
    int format

    :param fh: file handler in HDF5 format
    :param dataset_id: name of the new dataset to create
    :param data: data to fill the dataset
    """
    _min_val = data.min()
    _range = data.ptp()
...
...
@@ -1044,28 +1062,71 @@ def _add_percentile_dataset(fh,
                      fletcher32=True)


def _read_dataset(h5f, dataset_id):
    """
    Read a dataset from an HDF5 file

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read

    :return: the data stored in the dataset
    """
    data = h5f[dataset_id][()]
    if data.ndim == 1:
        data = data[:, numpy.newaxis]
    return data


def _read_segment(h5f, dataset_id, s, e):
    """
    Read a sequence of features stored in an HDF5 dataset.

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read from
    :param s: start index of the sequence to read
    :param e: end index of the sequence to read

    :return: the sequence of features in a numpy array format
    """
    data = h5f[dataset_id][s:e]
    return data


def _read_dataset_htk(h5f, dataset_id):
    """
    Read a dataset from an HDF5 file

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read

    :return: the sequence of features in a numpy array format
    """
    (A, B) = h5f[dataset_id + "comp"][()]
    data = (h5f[dataset_id][()] + B) / A
    if data.ndim == 1:
        data = data[:, numpy.newaxis]
    return data


def _read_segment_htk(h5f, dataset_id, e, s):
    """
    Read a sequence of features stored in an HDF5 dataset written in HTK format

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read from
    :param s: start index of the sequence to read
    :param e: end index of the sequence to read

    :return: the sequence of features in a numpy array format
    """
    (A, B) = h5f[dataset_id + "comp"][()]
    data = (h5f[dataset_id][s:e, :] + B) / A
    return data


def read_dataset_percentile(h5f, dataset_id):
    """
    Read a compressed dataset from an HDF5 file

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read

    :return: the sequence of features in a numpy array format
    """
    # read the header
    (_min_val, _range) = h5f[dataset_id + "_min_range"][()]
    c_header = h5f[dataset_id + "_header"][()]
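
A hedged read-back sketch (assumes h5py and a file produced by the percentile writer; "show1/cep" is an illustrative dataset name):

    import h5py

    with h5py.File("features.h5", "r") as h5f:
        cep = read_dataset_percentile(h5f, "show1/cep")
        print(cep.shape, cep.dtype)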
...
...
@@ -1079,7 +1140,17 @@ def read_dataset_percentile(h5f, dataset_id):
    mat3 = (_header[:, [2]] + (_header[:, [3]] - _header[:, [2]]) * (c_data.T - 192) * (1 / 63)) * (c_data.T > 192)
    return (mat1 + mat2 + mat3).T


def _read_segment_percentile(h5f, dataset_id, s, e):
    """
    Read a sequence of features stored in a compressed HDF5 dataset

    :param h5f: file handler in HDF5 format
    :param dataset_id: name of the dataset to read from
    :param s: start index of the sequence to read
    :param e: end index of the sequence to read

    :return: the sequence of features in a numpy array format
    """
    # read the header
    (_min_val, _range) = h5f[dataset_id + "_min_range"][()]
    c_header = h5f[dataset_id + "_header"][()]
...
...
@@ -1100,6 +1171,25 @@ def _write_show(show,
                fb,
                fb_mean,
                fb_std,
                bnf,
                bnf_mean,
                bnf_std,
                label):
    """
    Write features for a given show in HDF5 format

    :param show: name of the show to write
    :param fh: file handler in HDF5 format
    :param cep: cepstral coefficients
    :param cep_mean: mean vector of the cepstral coefficients
    :param cep_std: standard deviation vector of the cepstral coefficients
    :param energy: energy value per frame
    :param energy_mean: mean of the energy
    :param energy_std: standard deviation of the energy
    :param fb: filterbank coefficients
    :param fb_mean: mean vector of the filterbank coefficients
    :param fb_std: standard deviation vector of the filterbank coefficients
    :param bnf: bottleneck features
    :param bnf_mean: mean vector of the bottleneck features
    :param bnf_std: standard deviation vector of the bottleneck features
    :param label: voice activity detection labels per frame
    """
    if cep is not None:
        fh.create_dataset(show + '/cep',
                          data=cep.astype('float32'),
                          maxshape=(None, None),
...
...
@@ -1161,6 +1251,7 @@ def _write_show(show,
                          compression="gzip",
                          fletcher32=True)


def _write_show_htk(show,
                    fh,
                    cep,
                    cep_mean,
                    cep_std,
...
@@ -1168,6 +1259,24 @@ def _write_show_htk(show,
                    fb,
                    fb_mean,
                    fb_std,
                    bnf,
                    bnf_mean,
                    bnf_std,
                    label):
    """
    Write features for a given show in HDF5 and HTK format

    :param fh: file handler in HDF5 format
    :param cep: cepstral coefficients
    :param cep_mean: mean vector of the cepstral coefficients
    :param cep_std: standard deviation vector of the cepstral coefficients
    :param energy: energy value per frame
    :param energy_mean: mean of the energy
    :param energy_std: standard deviation of the energy
    :param fb: filterbank coefficients
    :param fb_mean: mean vector of the filterbank coefficients
    :param fb_std: standard deviation vector of the filterbank coefficients
    :param bnf: bottleneck features
    :param bnf_mean: mean vector of the bottleneck features
    :param bnf_std: standard deviation vector of the bottleneck features
    :param label: voice activity detection labels per frame
    """
    if cep is not None:
        A_cep = 2 * 32767. / (cep.max() - cep.min())
        B_cep = (cep.max() + cep.min()) * 32767. / (cep.max() - cep.min())
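
For reference, a hedged round-trip check of this linear int16 coding (the encode step `A * x - B` is an assumption inferred from the `(data + B) / A` decode in _read_dataset_htk above):

    import numpy
    cep = numpy.array([[-3.0, 0.5], [2.0, 1.0]])
    A = 2 * 32767. / (cep.max() - cep.min())
    B = (cep.max() + cep.min()) * 32767. / (cep.max() - cep.min())
    coded = A * cep - B                       # spans [-32767, 32767] at the extremes
    assert numpy.allclose((coded + B) / A, cep)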
...
...
@@ -1253,6 +1362,7 @@ def _write_show_htk(show,
                          compression="gzip",
                          fletcher32=True)


def _write_show_percentile(show,
                           fh,
                           cep,
                           cep_mean,
                           cep_std,
...
...
@@ -1260,6 +1370,25 @@ def _write_show_percentile(show,
                           fb,
                           fb_mean,
                           fb_std,
                           bnf,
                           bnf_mean,
                           bnf_std,
                           label):
    """
    Write features for a given show in HDF5 format with percentile compression

    :param show: name of the show to write
    :param fh: file handler in HDF5 format
    :param cep: cepstral coefficients
    :param cep_mean: mean vector of the cepstral coefficients
    :param cep_std: standard deviation vector of the cepstral coefficients
    :param energy: energy value per frame
    :param energy_mean: mean of the energy
    :param energy_std: standard deviation of the energy
    :param fb: filterbank coefficients
    :param fb_mean: mean vector of the filterbank coefficients
    :param fb_std: standard deviation vector of the filterbank coefficients
    :param bnf: bottleneck features
    :param bnf_mean: mean vector of the bottleneck features
    :param bnf_std: standard deviation vector of the bottleneck features
    :param label: voice activity detection labels per frame
    """
    if cep is not None:
        _add_percentile_dataset(fh, show + '/cep', cep)
...
...
@@ -1318,7 +1447,6 @@ def _write_show_percentile(show,
                          fletcher32=True)


def write_hdf5(show,
               fh,
               cep,
               cep_mean,
               cep_std,
...
...
@@ -1343,7 +1471,7 @@ def write_hdf5(show,
    :param bnf_mean: pre-computed mean of the bottleneck features
    :param bnf_std: pre-computed standard deviation of the bottleneck features
    :param label: vad labels to store
-    :param compressed: boolean, default is False
+    :param compression: boolean, default is False
    :return:
    """
    # write the type of compression; could be:
...
...
@@ -1382,13 +1510,14 @@ def write_hdf5(show,
                                bnf,
                                bnf_mean,
                                bnf_std,
                                label)


def read_hdf5(h5f, show, dataset_list=("cep", "fb", "energy", "vad", "bnf")):
    """
    :param h5f: HDF5 file handler to read from
    :param show: identifier of the show to read
    :param dataset_list: list of datasets to read and concatenate

    :return: a numpy array with acoustic features and one with VAD labels
    """
    compression_type = {0: 'none', 1: 'htk', 2: 'percentile'}
    if "compression" not in h5f:
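
A hedged usage sketch for the reader (file and show names are illustrative):

    import h5py

    with h5py.File("features.h5", "r") as h5f:
        feat, label = read_hdf5(h5f, "show1", dataset_list=("cep", "energy"))
        # feat: float32 array of concatenated features; label: VAD labels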
...
...
@@ -1464,17 +1593,18 @@ def read_hdf5(h5f, show, dataset_list=("cep", "fb", "energy", "vad", "bnf")):
    return feat.astype(numpy.float32), label


def _rms_energy(x):
    return 10 * numpy.log10((1e-12 + x.dot(x)) / len(x))


def _add_noise(signal, noise_file_name, snr, sample_rate):
    """
    Add noise to a speech signal

    :param signal: the original signal to augment
    :param noise_file_name: the name of the noise file to use
    :param snr: signal to noise ratio

    :return: the signal augmented with noise
    """
    # Open noise file
    if isinstance(noise_file_name, numpy.ndarray):
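
A hedged sketch of SNR-based gain computation consistent with _rms_energy above (the exact scaling inside _add_noise may differ):

    import numpy

    def mix_at_snr(speech, noise, snr_db):
        # attenuate the noise so the speech-to-noise energy ratio equals snr_db
        gain_db = _rms_energy(speech) - _rms_energy(noise) - snr_db
        return speech + noise * 10 ** (gain_db / 20.)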
...
...
@@ -1502,6 +1632,7 @@ def _add_noise(signal, noise_file_name, snr, sample_rate):
    return (noisy - noisy.mean()) / noisy.std()


def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
    n_iter = 1
    if abs(upcount - upthr - margin) < tol:
...
...
@@ -1525,6 +1656,7 @@ def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
        diff = midcount - midthr - margin
    return midcount


def asl_meter(x, fs, nbits=16):
    '''Measure the Active Speech Level (ASL) of x following ITU-T P.56.
    If x is integer, it will be scaled to (-1, 1) according to nbits.
...
...
@@ -1587,6 +1719,7 @@ def asl_meter(x, fs, nbits=16):
    return asl


def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0):
    '''Adds reverb (convolutive noise) to a speech signal.
    The output speech level is normalized to asl_level.
...
...
@@ -1600,107 +1733,3 @@ def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0, ):
    return (y - y.mean()) / y.std()


def degrade_audio(input_path,
                  input_extension,
                  output_path,
                  output_extension,
                  input_filename,
                  output_filename,
                  sampling_frequency=16000,
                  noise_file_name=None,
                  snr=-10,
                  reverb_file_name=None,
                  reverb_level=-26.):
    """
    :param input_filename:
    :param output_filename:
    :return:
    """
    # Open audio file, get the signal and possibly the sampling frequency
    signal, sample_rate = read_audio(input_filename, sampling_frequency)
    if signal.ndim == 1:
        signal = signal[:, numpy.newaxis]

    for channel in range(signal.shape[1]):
        if noise_file_name is not None:
            signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sampling_frequency)
        if reverb_file_name is not None:
            signal[:, channel] = _add_reverb(signal[:, channel], reverb_file_name, sampling_frequency, reverb_level)

    write_wav(signal, output_filename, sample_rate)


@process_parallel_lists
def augment_list(input_path,
                 input_extension,
                 output_path,
                 output_extension,
                 sampling_frequency,
                 show_list,
                 channel_list,
                 audio_file_list=None,
                 feature_file_list=None,
                 noise_file_list=None,
                 snr_list=None,
                 reverb_file_list=None,
                 reverb_levels=None,
                 num_thread=1):
    """
    Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features)
    for a list of audio files and save them to disk in HDF5 format.
    The process is parallelized if num_thread is higher than 1.

    :param show_list: list of IDs of the shows to process
    :param channel_list: list of channel indices corresponding to each show
    :param audio_file_list: list of input audio files if the name is independent from the ID of the show
    :param feature_file_list: list of output files if the name is independent from the ID of the show
    :param num_thread: number of parallel processes to run

    :return:
    """
    # get the length of the longest list
    max_length = max([len(l) for l in [show_list, channel_list, audio_file_list, feature_file_list]
                      if l is not None])

    if show_list is None:
        show_list = numpy.empty(int(max_length), dtype='|O')
    if audio_file_list is None:
        audio_file_list = numpy.empty(int(max_length), dtype='|O')
    if feature_file_list is None:
        feature_file_list = numpy.empty(int(max_length), dtype='|O')
    if noise_file_list is None:
        noise_file_list = numpy.empty(int(max_length), dtype='|O')
        snr_list = numpy.empty(int(max_length), dtype='|O')
    elif snr_list is None:
        snr_list = numpy.full(int(max_length), 5.)
    if reverb_file_list is None:
        reverb_file_list = numpy.empty(int(max_length), dtype='|O')
        reverb_levels = numpy.empty(int(max_length), dtype='|O')
    elif reverb_levels is None:
        reverb_levels = numpy.full(int(max_length), -26.)

    for show, channel, input_file, output_file, noise_file, snr, reverb_file, reverb_level in zip(show_list,
                                                                                                  channel_list,
                                                                                                  audio_file_list,
                                                                                                  feature_file_list,
                                                                                                  noise_file_list,
                                                                                                  snr_list,
                                                                                                  reverb_file_list,
                                                                                                  reverb_levels):
        degrade_audio(input_path, input_extension, output_path, output_extension,
                      show, input_file, output_file,
                      sampling_frequency, noise_file, snr, reverb_file, reverb_level)
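
A hedged call sketch for the augmentation list above (all paths and lists are invented; the process_parallel_lists decorator is assumed to split the list arguments across num_thread workers):

    augment_list(input_path="./wav", input_extension="wav",
                 output_path="./wav_aug", output_extension="wav",
                 sampling_frequency=16000,
                 show_list=shows, channel_list=channels,
                 noise_file_list=noises, snr_list=snrs,
                 num_thread=4)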
nnet/xsets.py
...
...
@@ -26,15 +26,12 @@ Copyright 2014-2021 Anthony Larcher
"""

import math
import numpy
import pandas
import random
import torch
import torchaudio
import tqdm
import soundfile
import yaml

from .augmentation import data_augmentation
from ..bosaris.idmap import IdMap
...
...
@@ -64,15 +61,17 @@ class SideSampler(torch.utils.data.Sampler):
                 rank=0,
                 num_process=1,
                 num_replicas=1):
        """
        :param data_source:
        :param spk_count:
        :param examples_per_speaker:
        :param samples_per_speaker:
        :param batch_size:
        :param seed:
        :param rank:
        :param num_process:
        :param num_replicas: number of GPUs for parallel computing
        """
        self.train_sessions = data_source
        self.labels_to_indices = dict()
...
...
@@ -89,7 +88,6 @@ class SideSampler(torch.utils.data.Sampler):
        assert (self.samples_per_speaker * self.spk_count * self.examples_per_speaker) % self.num_process == 0

        self.batch_size = batch_size // (self.examples_per_speaker * self.num_replicas)

        # reference all segment indexes per speaker
        for idx in range(self.spk_count):
...
...
@@ -105,8 +103,11 @@ class SideSampler(torch.utils.data.Sampler):
        self.segment_cursors = numpy.zeros((len(self.labels_to_indices),), dtype=numpy.int)

    def __iter__(self):
        """
        :return:
        """
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        numpy.random.seed(self.seed + self.epoch)
...
...
@@ -159,14 +160,14 @@ class SideSampler(torch.utils.data.Sampler):
        #return (self.samples_per_speaker * self.spk_count * self.examples_per_speaker) // self.num_process
        return (self.samples_per_speaker * self.spk_count * self.examples_per_speaker * self.num_replicas) // self.num_process

    def set_epoch(self, epoch: int) -> None:
        self.epoch = epoch
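
A hedged wiring sketch for this sampler (argument values are invented and the DataLoader wiring is an assumption; real sidekit training scripts may configure this differently):

    sampler = SideSampler(data_source=session_indices,
                          spk_count=1000,
                          examples_per_speaker=2,
                          samples_per_speaker=100,
                          batch_size=128,
                          seed=1234)
    loader = torch.utils.data.DataLoader(train_set, sampler=sampler, batch_size=sampler.batch_size)
    for epoch in range(num_epochs):
        sampler.set_epoch(epoch)   # reshuffle deterministically each epoch
        for batch in loader:
            ...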
class SideSet(Dataset):
    """
    Dataset that loads the data for network training
    """
    def __init__(self,
                 dataset,
                 set_type="train",
...
...
@@ -179,10 +180,15 @@ class SideSet(Dataset):
                 ):
        """
        :param dataset: name of the YAML file describing the dataset
        :param set_type: string, can be "train" or "validation"
        :param chunk_per_segment: number of chunks to select for each segment;
                                  default is 1, and -1 means select all possible chunks
        :param transform_number:
        :param overlap:
        :param dataset_df:
        :param min_duration:
        :param output_format:
        """
        self.data_path = dataset["data_path"]
        self.sample_rate = int(dataset["sample_rate"])
...
...
@@ -283,7 +289,6 @@ class SideSet(Dataset):
            # load the RIR database
            self.rir_df = tmp_rir_df.set_index(tmp_rir_df.type)

    def __getitem__(self, index):
        """
...
...
@@ -337,13 +342,21 @@ class SideSet(Dataset):
    def __len__(self):
        """
        Return the length of the dataset

        :return:
        """
        return self.len


def get_sample(path, resample=None):
    """
    :param path:
    :param resample: