Anthony Larcher / sidekit / Commits

Commit 2b348c5d authored Jul 07, 2021 by Anthony Larcher

cleaning

parent 1c480d00

Changes 9
__init__.py
@@ -162,8 +162,6 @@ if SIDEKIT_CONFIG["cuda"]:

 if CUDA:
-    from .nnet import FForwardNetwork
-    from .nnet import kaldi_to_hdf5
     from .nnet import Xtractor
     from .nnet import xtrain
     from .nnet import extract_embeddings
nnet/__init__.py
@@ -28,8 +28,6 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier

 """
-from .feed_forward import FForwardNetwork
-from .feed_forward import kaldi_to_hdf5
 from .xsets import IdMapSetPerSpeaker
 from .xsets import SideSet
 from .xsets import SideSampler
@@ -49,8 +47,6 @@ from .preprocessor import MfccFrontEnd

 from .preprocessor import MelSpecFrontEnd

 __author__ = "Anthony Larcher and Sylvain Meignier"
 __copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
 __license__ = "LGPL"
nnet/augmentation.py
@@ -26,21 +26,12 @@ Copyright 2014-2021 Anthony Larcher

 """
 import collections
 import math
 import numpy
-from scipy import signal
 import pandas
 import random
 import soundfile
 import torch
 import torchaudio

 has_pyroom = True
 try:
     import pyroomacoustics
 except ImportError:
     has_pyroom = False

+from scipy import signal

 __author__ = "Anthony Larcher and Sylvain Meignier"
@@ -55,8 +46,10 @@ __docformat__ = 'reStructuredText'

 Noise = collections.namedtuple('Noise', 'type file_id duration')


 class PreEmphasis(torch.nn.Module):
     """
     Apply pre-emphasis filtering
     """

     def __init__(self, coef: float = 0.97):
         super().__init__()
@@ -67,13 +60,18 @@ class PreEmphasis(torch.nn.Module):

             'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
         )

-    def forward(self, input: torch.tensor) -> torch.tensor:
-        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
-        # reflect padding to match lengths of in/out
-        input = input.unsqueeze(1)
-        input = torch.nn.functional.pad(input, (1, 0), 'reflect')
-        return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
+    def forward(self, input_signal: torch.tensor) -> torch.tensor:
+        """
+        Forward pass of the pre-emphasis filtering
+
+        :param input_signal: the input signal
+        :return: the filtered signal
+        """
+        assert len(input_signal.size()) == 2, 'The number of dimensions of input tensor must be 2!'
+        # reflect padding to match lengths of in/out
+        input_signal = input_signal.unsqueeze(1)
+        input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
+        return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
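For context, the filter computes y[t] = x[t] - coef * x[t-1] via a 1-D convolution. A minimal usage sketch, assuming the class above is importable (shapes illustrative):

import torch

pre = PreEmphasis(coef=0.97)      # flipped_filter holds [-0.97, 1.]
wav = torch.randn(8, 16000)       # batch of 8 one-second signals at 16 kHz
emphasized = pre(wav)             # reflect padding keeps the shape (8, 16000)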
 class FrequencyMask(object):
@@ -115,24 +113,26 @@ class TemporalMask(object):

         return data, sample[1], sample[2], sample[3], sample[4], sample[5]
 def normalize(wav):
     """
     Center and reduce a waveform

-    :param wav:
-    :return:
+    :param wav: the input waveform
+    :return: the normalized waveform
     """
     return wav / (numpy.sqrt(numpy.mean(wav ** 2)) + 1e-8)
-def crop(signal, duration):
+def crop(input_signal, duration):
     """
     Select a chunk from an audio segment

     :param input_signal: signal to select a chunk from
     :param duration: duration of the chunk to select
     :return: the selected chunk
     """
-    start = random.randint(0, signal.shape[0] - duration)
-    chunk = signal[start: start + duration]
+    start = random.randint(0, input_signal.shape[0] - duration)
+    chunk = input_signal[start: start + duration]

     return chunk
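A hedged sketch of how these two helpers chain on a mono waveform (file name illustrative, and assuming the file is longer than the requested chunk):

import soundfile

wav, fs = soundfile.read("utterance.wav")   # 1-D numpy array
chunk = crop(wav, duration=4 * fs)          # random 4-second excerpt
chunk = normalize(chunk)                    # scale to unit RMS (1e-8 guards against /0)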
@@ -144,11 +144,20 @@ def data_augmentation(speech,

                       rir_df=None,
                       babble_noise=True):
     """
     Perform data augmentation on an input signal.
     Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
     dictionary of possible transformations.

-    :param speech:
-    :param transform_dict:
-    :param transform_number:
-    :return:
+    :param speech: the input signal to be augmented
+    :param sample_rate: sampling rate of the input signal to augment
+    :param transform_dict: the dictionary of possible augmentations to apply
+    :param transform_number: the number of transformations to apply on each chunk
+    :param rir_df: a pandas dataframe object including the list of RIR signals to choose from; default is None
+    :param noise_df: a pandas dataframe object including the list of NOISE signals to choose from; default is None
+    :param babble_noise: boolean that enables the use of babble noise, True by default (typically turned to False
+        when the task includes overlapping speech detection).
+    :return: augmented signal

     transformation:
         pipeline: add_noise,add_reverb
@@ -159,7 +168,6 @@ def data_augmentation(speech,

         rir_db_csv: filename.csv
     codec: true
     phone_filtering: true
     """
     # Select the data augmentation randomly
     aug_idx = random.sample(range(len(transform_dict.keys())), k=transform_number)
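For concreteness, a hedged sketch of a transform_dict mirroring the YAML example above, together with the selection step; the nested keys are illustrative, not the exact schema:

import random

transform_dict = {
    "add_noise": {"noise_db_csv": "noise.csv", "data_path": "/data/musan"},
    "add_reverb": {"rir_db_csv": "filename.csv", "data_path": "/data/rirs"},
    "codec": {},
    "phone_filtering": {},
}
aug_idx = random.sample(range(len(transform_dict.keys())), k=2)
augmentations = [list(transform_dict.keys())[idx] for idx in aug_idx]

The names here match how the function body later tests membership, e.g. "codec" in augmentations.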
@@ -209,18 +217,17 @@ def data_augmentation(speech,

             noise += load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
         # babble noise with different volume
         elif noise_idx == 3:
             snr_db = random.randint(13, 20)
             pick_count = random.randint(5, 10)
             # Randomly select 5 to 10 speakers
             index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
-            #noise_rows = transform_dict["add_noise"]["data_path"] + "/" + noise_df[noise_df["type"] == "speech"].sample(ns,replace=False)["file_id"].values + ".wav"
             noise = torch.zeros(1, speech.shape[1])
             for idx in index_list:
-                #noise_,noise_fs = torchaudio.load(noise_fn[idx],frame_offset=0,num_frames=speech.shape[1])
                 noise_row = noise_df.loc['speech'].iloc[idx]
                 noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, transform_dict["add_noise"]["data_path"])
                 # Randomly select volume level (5-15 dB)
                 transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')
                 noise += transform(noise_)
             noise /= pick_count

         speech_power = speech.norm(p=2)
         noise_power = noise.norm(p=2)
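These two norms feed the SNR scaling that follows; a hedged sketch of that step in the style of the common torchaudio recipe (not necessarily the commit's exact lines):

import math

snr = math.exp(snr_db / 10)
scale = snr * noise_power / speech_power
speech = (scale * speech + noise) / 2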
@@ -244,10 +251,10 @@ def data_augmentation(speech,

         effects = [["bandpass", "2000", "3500"], ["bandstop", "200", "500"]]
         speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
             speech,
             sample_rate,
             effects=[effects[random.randint(0, 1)]],
         )

     if "codec" in augmentations:
@@ -267,11 +274,12 @@ def data_augmentation(speech,

 def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
     """
     Pick a noise signal to add while performing data augmentation

-    :param noise_row:
-    :param speech_shape:
-    :param sample_rate:
-    :param data_path:
+    :param noise_row: a row from a Pandas dataframe object
+    :param speech_shape: shape of the speech signal to be augmented
+    :param sample_rate: sampling rate of the speech signal to be augmented
+    :param data_path: directory where to load the noise file from
     :return: the selected noise segment
     """
     noise_start = noise_row['start']
nnet/feed_forward.py
deleted 100755 → 0 (diff collapsed)
nnet/loss.py
@@ -26,17 +26,12 @@ Copyright 2014-2021 Anthony Larcher

 """
 import h5py
 import logging
 import math
 import sys
 import numpy
 import torch
 import torch.optim as optim
 import torch.multiprocessing as mp
 from collections import OrderedDict
 from ..bosaris import IdMap
 from ..statserver import StatServer
 from torch.nn import Parameter
@@ -52,6 +47,9 @@ __docformat__ = 'reS'


 class ArcMarginModel(torch.nn.Module):
     """
     """

     def __init__(self, args):
         super(ArcMarginModel, self).__init__()
@@ -68,6 +66,12 @@ class ArcMarginModel(torch.nn.Module):

         self.mm = math.sin(math.pi - self.m) * self.m

     def forward(self, input, label):
+        """
+        :param input:
+        :param label:
+        :return:
+        """
         x = F.normalize(input)
         W = F.normalize(self.weight)
         cosine = F.linear(x, W)
@@ -85,12 +89,21 @@ class ArcMarginModel(torch.nn.Module):


 def l2_norm(input, axis=1):
+    """
+    :param input:
+    :param axis:
+    :return:
+    """
     norm = torch.norm(input, 2, axis, True)
     output = torch.div(input, norm)

     return output
 class ArcFace(torch.nn.Module):
     """
     """
     # implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599

     def __init__(self, embedding_size, classnum, s=64., m=0.5):
         super(ArcFace, self).__init__()
@@ -106,6 +119,12 @@ class ArcFace(torch.nn.Module):

         self.threshold = math.cos(math.pi - m)

     def forward(self, embbedings, target):
+        """
+        :param embbedings:
+        :param target:
+        :return:
+        """
         # weights norm
         nB = len(embbedings)
         kernel_norm = l2_norm(self.kernel, axis=0)
@@ -136,6 +155,9 @@ class ArcFace(torch.nn.Module):

 ################################## Cosface head #############################################################

 class Am_softmax(torch.nn.Module):
     """
     """
     # implementation of additive margin softmax loss in https://arxiv.org/abs/1801.05599

     def __init__(self, embedding_size=512, classnum=51332):
         super(Am_softmax, self).__init__()
@@ -147,6 +169,12 @@ class Am_softmax(torch.nn.Module):

         self.s = 30.  # see normface https://arxiv.org/abs/1704.06369

     def forward(self, embbedings, label):
+        """
+        :param embbedings:
+        :param label:
+        :return:
+        """
         kernel_norm = l2_norm(self.kernel, axis=0)
         cos_theta = torch.mm(embbedings, kernel_norm)
         cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
@@ -226,14 +254,15 @@ class ArcLinear(torch.nn.Module):

 class ArcMarginProduct(torch.nn.Module):
-    r"""Implement of large margin arc distance: :
+    """
+    Implementation of large margin arc distance:

     Args:
         in_features: size of each input sample
         out_features: size of each output sample
         s: norm of input feature
         m: margin
         cos(theta + m)
     """

     def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
         super(ArcMarginProduct, self).__init__()
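For reference, the forward pass of such a head expands cos(theta + m) with the angle-addition identity. A hedged sketch of that computation; x, weight and one_hot are assumed names for illustration, not the committed code:

import math
import torch
import torch.nn.functional as F

def arc_margin(x, weight, one_hot, s=30.0, m=0.50):
    # cosine of the angle between normalized embeddings and class weights
    cosine = F.linear(F.normalize(x), F.normalize(weight))
    sine = torch.sqrt((1.0 - cosine.pow(2)).clamp(0, 1))
    phi = cosine * math.cos(m) - sine * math.sin(m)   # cos(theta + m)
    # apply the margin to the target class only, then scale by s
    return torch.where(one_hot.bool(), phi, cosine) * s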
@@ -298,7 +327,10 @@ class ArcMarginProduct(torch.nn.Module):

 class SoftmaxAngularProto(torch.nn.Module):
-    # from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
+    """
+    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
+    """

     def __init__(self, spk_count, emb_dim=256, init_w=10.0, init_b=-5.0, **kwargs):
         super(SoftmaxAngularProto, self).__init__()
@@ -313,27 +345,38 @@ class SoftmaxAngularProto(torch.nn.Module):

         ]))

     def forward(self, x, target=None):
+        """
+        :param x:
+        :param target:
+        :return:
+        """
         assert x.size()[1] >= 2

         cce_prediction = self.cce_backend(x)

-        if target == None:
+        if target is None:
             return cce_prediction

         x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
         out_anchor = torch.mean(x[:, 1:, :], 1)
         out_positive = x[:, 0, :]

         cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                                out_anchor.unsqueeze(-1).transpose(0, 2))
         torch.clamp(self.w, 1e-6)
         cos_sim_matrix = cos_sim_matrix * self.w + self.b
         loss = self.criterion(cos_sim_matrix, torch.arange(0, cos_sim_matrix.shape[0], device=x.device)) \
             + self.criterion(cce_prediction, target)

         return loss, cce_prediction
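The reshape above implies the batch stacks two consecutive embeddings per speaker (anchor then positive). A hedged usage sketch, with sizes illustrative and assuming the classifier backend built in __init__:

import torch

proto = SoftmaxAngularProto(spk_count=100, emb_dim=256)
x = torch.randn(32, 256)    # 16 speakers x 2 utterances, pairs in consecutive rows
logits = proto(x)           # with target=None, returns classifier logits only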
 class AngularProximityMagnet(torch.nn.Module):
-    # from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
+    """
+    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
+    """

     def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
         super(AngularProximityMagnet, self).__init__()
@@ -363,17 +406,22 @@ class AngularProximityMagnet(torch.nn.Module):

         self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

     def forward(self, x, target=None):
+        """
+        :param x:
+        :param target:
+        :return:
+        """
         assert x.size()[1] >= 2

         cce_prediction = self.cce_backend(x)
         #x = self.magnitude(x) * torch.nn.functional.normalize(x)

-        if target == None:
+        if target is None:
             return x, cce_prediction

         x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
         out_anchor = torch.mean(x[:, 1:, :], 1)
         out_positive = x[:, 0, :]

         ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                               out_anchor.unsqueeze(-1).transpose(0, 2))
         torch.clamp(self.w, 1e-6)
nnet/pooling.py
@@ -111,7 +111,7 @@ class AttentivePooling(torch.nn.Module):

 class GruPooling(torch.nn.Module):
     """
     Pooling done by using a recurrent network
     """

     def __init__(self, input_size, gru_node, nb_gru_layer):
         """
@@ -136,7 +136,7 @@ class GruPooling(torch.nn.Module):

         """
         x = self.bn_before_gru(x)
         x = self.lrelu_keras(x)
-        x = x.permute(0, 2, 1)  #(batch, filt, time) >> (batch, time, filt)
+        x = x.permute(0, 2, 1)  # (batch, filt, time) >> (batch, time, filt)
         self.gru.flatten_parameters()
         x, _ = self.gru(x)
         x = x[:, -1, :]
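Outside the class, the same pooling pattern looks like this; a minimal runnable sketch with assumed sizes:

import torch

gru = torch.nn.GRU(input_size=80, hidden_size=256, batch_first=True)
feats = torch.randn(4, 80, 200)       # (batch, filt, time)
out, _ = gru(feats.permute(0, 2, 1))  # GRU expects (batch, time, filt)
pooled = out[:, -1, :]                # keep the last time step: (batch, 256)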
nnet/preprocessor.py
@@ -27,39 +27,14 @@ Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo

 import logging
 import math
 import os
 import numpy
 import pandas
 import pickle
 import shutil
 import time
 import torch
 import torchaudio
 import tqdm
 import yaml

 from collections import OrderedDict
 from torch.utils.data import DataLoader
 from sklearn.model_selection import train_test_split
 from .augmentation import PreEmphasis
 from .xsets import SideSet
 from .xsets import IdMapSet
 from .xsets import IdMapSetPerSpeaker
 from .xsets import SideSampler
 from .res_net import ResBlockWFMS
 from .res_net import ResBlock
 from .res_net import PreResNet34
 from .res_net import PreFastResNet34
 from ..bosaris import IdMap
 from ..bosaris import Key
 from ..bosaris import Ndx
 from ..statserver import StatServer
 from ..iv_scoring import cosine_scoring
 from .sincnet import SincNet
 from .loss import ArcLinear
 from .loss import l2_norm
 from .loss import ArcMarginProduct
 from .sincnet import SincConv1d
 from .res_net import LayerNorm

 os.environ['MKL_THREADING_LAYER'] = 'GNU'
@@ -83,12 +58,10 @@ torch.backends.cudnn.benchmark = False

 numpy.random.seed(0)


 class MfccFrontEnd(torch.nn.Module):
     """
     Module that extracts MFCC coefficients
     """

     def __init__(self,
                  pre_emphasis=0.97,
                  sample_rate=16000,
@@ -153,9 +126,8 @@ class MfccFrontEnd(torch.nn.Module):


 class MelSpecFrontEnd(torch.nn.Module):
     """
     Module that computes a Mel spectrogram from an audio signal
     """

     def __init__(self,
                  pre_emphasis=0.97,
                  sample_rate=16000,
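Both front ends follow the same pattern: pre-emphasis followed by a torchaudio feature transform. A hedged sketch of that composition, with parameters illustrative rather than sidekit's exact defaults, and assuming the PreEmphasis class shown earlier:

import torch
import torchaudio

frontend = torch.nn.Sequential(
    PreEmphasis(coef=0.97),
    torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=24),
)
wav = torch.randn(1, 16000)           # (batch, samples)
feats = frontend(wav)                 # (batch, n_mfcc, frames)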
@@ -230,7 +202,8 @@ class MelSpecFrontEnd(torch.nn.Module):


 class RawPreprocessor(torch.nn.Module):
     """
     Pre-process the raw audio signal by using a SincNet architecture
     [ADD REF]
     """

     def __init__(self, nb_samp, in_channels, out_channels, kernel_size,
                  stride=1, padding=0, dilation=1, bias=False, groups=1,
                  min_low_hz=50, min_band_hz=50, sample_rate=16000):
         """
nnet/rawnet.py
deleted 100644 → 0
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.

"""
Copyright 2014-2021 Anthony Larcher
"""
import logging
import numpy
import torch
import pandas
import soundfile
import random
import h5py
import torch.optim as optim
import torch.multiprocessing as mp
from torchvision import transforms
from torch.utils.data import DataLoader
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import Dataset
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2021 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reS'


"""
How to use:
    vs = ds.Vox1Set("/lium/raid01_c/larcher/vox1_raw_wav_batches.h5", transform=transforms.Compose([PreEmphasis(),]))
    vloader = DataLoader(vs, batch_size=32, shuffle=True, num_workers=5)
"""
def prepare_voxceleb1(vox1_root_dir, output_batch_file, seg_duration=4, samplerate=16000):
    # Number of samples to keep per session
    nb_samp = seg_duration * samplerate

    # List wav files in VoxCeleb1
    vox1_wav_list = [str(f) for f in list(Path(vox1_root_dir).rglob("*.[wW][aA][vV]"))]

    vox1_df = pandas.DataFrame(columns=("database", "speaker_id", "file_id", "duration", "speaker_idx"))

    print("*** Collect information from VoxCeleb1 data ***")
    for fn in tqdm(vox1_wav_list):
        file_id = ('/').join(fn.split('/')[-2:]).split('.')[0]
        speaker_id = fn.split('/')[-3]
        _set = fn.split('/')[-5]
        # get the duration of the wav file
        data, _ = soundfile.read(fn)
        duration = data.shape[0]
        vox1_df = vox1_df.append({"database": "vox1",
                                  "speaker_id": speaker_id,
                                  "file_id": file_id,
                                  "duration": duration,
                                  "speaker_idx": -1,
                                  "set": _set},
                                 ignore_index=True)

    print("\n\n*** Create a single HDF5 file with all training data ***")
    # Create a HDF5 file and fill it with one 4s segment per session
    with h5py.File(output_batch_file, 'w') as fh:
        for index, row in tqdm(vox1_df.iterrows()):
            session_id = row['speaker_id'] + '/' + row['file_id']
            # Load the wav signal
            fn = '/'.join((vox1_root_dir, row['set'], 'wav', session_id)) + ".wav"
            data, samplerate = soundfile.read(fn, dtype='int16')
            _nb_samp = len(data)
            # Randomly select a segment of "duration" if it's long enough
            if _nb_samp > nb_samp:
                cut = numpy.random.randint(low=0, high=_nb_samp - nb_samp)
                # Write the segment in the HDF5 file
                fh.create_dataset(session_id,
                                  data=data[cut:cut + nb_samp].astype('int16'),
                                  maxshape=(None,),
                                  fletcher32=True)
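A hedged usage sketch of the helper above (paths illustrative):

prepare_voxceleb1(vox1_root_dir="/data/voxceleb1",
                  output_batch_file="vox1_raw_wav_batches.h5",
                  seg_duration=4,
                  samplerate=16000)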
def prepare_voxceleb2(vox2_root_dir, output_batch_file, seg_duration=4, samplerate=16000):
    # List wav files in VoxCeleb2
    vox2_wav_list = [str(f) for f in list(Path(vox2_root_dir).rglob("*.[wW][aA][vV]"))]

    # One dataframe per sublist (independent objects, not five references to one)
    vox2_dfs = [pandas.DataFrame(columns=("database", "speaker_id", "file_id", "duration", "speaker_idx"))
                for _ in range(5)]

    # Split the file list into five sublists of roughly equal size
    lv2 = len(vox2_wav_list)
    vox2_sublists = [vox2_wav_list[:lv2 // 5],
                     vox2_wav_list[lv2 // 5:2 * (lv2 // 5)],
                     vox2_wav_list[2 * (lv2 // 5):3 * (lv2 // 5)],
                     vox2_wav_list[3 * (lv2 // 5):4 * (lv2 // 5)],
                     vox2_wav_list[4 * (lv2 // 5):]]

    print("*** Collect information from VoxCeleb2 data ***")