Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Meysam Shamsi
s4d
Commits
1268a4ce
Commit
1268a4ce
authored
Mar 25, 2020
by
Anthony Larcher
Browse files
new nnet module
parent
cfcca3d4
Changes
6
Hide whitespace changes
Inline
Side-by-side
CHANGES.txt
View file @
1268a4ce
s4d-0.1.0, 23/01/2020 -- Repackaging and creation of CHANGES.txt
\ No newline at end of file
s4d-0.1.0, 23/01/2020 -- Repackaging and creation of CHANGES.txt
s4d-0.1.4.2 15/02/2020 -- Bug fixed in scoring due to sklearn deprecated method linear_assignment
s4d-0.1.4.4 17/02/2020 -- Change prototype of MODELIV.train_per_segment not to overwrite i-vectors
s4d/__init__.py
View file @
1268a4ce
...
...
@@ -57,4 +57,4 @@ __maintainer__ = "Sylvain Meignier"
__email__
=
"sylvain.meignierr@univ-lemans.fr"
__status__
=
"Production"
__docformat__
=
'reStructuredText'
__version__
=
"0.1.4.
3
"
__version__
=
"0.1.4.
4
"
s4d/diar.py
View file @
1268a4ce
...
...
@@ -769,8 +769,7 @@ class Diar():
if
not
diarization
.
_attributes
.
exist
(
'channel'
):
diarization
.
add_attribut
(
new_attribut
=
'channel'
,
default
=
'U'
)
try
:
for
line
in
fic
:
line
=
re
.
sub
(
'\s+'
,
' '
,
line
)
for
line
in
fic
line
=
re
.
sub
(
'\s+'
,
' '
,
line
)
line
=
line
.
strip
()
# logging.debug(line)
if
line
.
startswith
(
'#'
)
or
line
.
startswith
(
';;'
):
...
...
s4d/model_iv.py
View file @
1268a4ce
...
...
@@ -117,12 +117,12 @@ class ModelIV:
stat
.
accumulate_stat
(
ubm
=
self
.
ubm
,
feature_server
=
feature_server
,
seg_indices
=
range
(
stat
.
segset
.
shape
[
0
]),
num_thread
=
self
.
nb_thread
)
fa
=
FactorAnalyser
(
mean
=
self
.
tv_mean
,
Sigma
=
self
.
tv_sigma
,
F
=
self
.
tv
)
self
.
ivectors
=
fa
.
extract_ivectors_single
(
self
.
ubm
,
stat
)
ivectors
=
fa
.
extract_ivectors_single
(
self
.
ubm
,
stat
)
if
normalization
:
self
.
ivectors
.
spectral_norm_stat1
(
self
.
norm_mean
[:
1
],
self
.
norm_cov
[:
1
])
ivectors
.
spectral_norm_stat1
(
self
.
norm_mean
[:
1
],
self
.
norm_cov
[:
1
])
return
self
.
ivectors
return
ivectors
def
score_cosine
(
self
,
use_wccn
=
True
):
"""
...
...
s4d/nnet/seqtoseq.py
0 → 100644
View file @
1268a4ce
# -*- coding: utf-8 -*-
#
# This file is part of s4d.
#
# s4d is a python package for speaker diarization.
# Home page: http://www-lium.univ-lemans.fr/s4d/
#
# s4d is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# s4d is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with s4d. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher
"""
import
os
import
sys
import
numpy
import
random
import
h5py
import
torch
import
torch.nn
as
nn
from
torch
import
optim
from
torch.utils.data
import
Dataset
import
logging
from
sidekit.nnet.vad_rnn
import
BLSTM
# Module metadata for s4d/nnet/seqtoseq.py.
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2020 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
# The value was truncated to 'reS'; use the same markup name as the other s4d modules.
__docformat__ = 'reStructuredText'
class PreNet(nn.Module):
    """
    Pre-processing network made of a single 1-D convolution layer.
    """

    def __init__(self):
        """
        Create the convolution layer (1 input channel, 64 output channels, kernel size 200).

        The constructor used to be spelled ``__init``, so it was never called and
        ``self.conv1`` did not exist when ``forward`` was run.
        """
        super(PreNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1,
                               out_channels=64,
                               kernel_size=200,
                               stride=1,
                               padding=0,
                               dilation=1,
                               groups=1,
                               bias=True,
                               padding_mode='zeros')

    def forward(self, input):
        """
        Apply the convolution layer to the input.

        :param input: tensor given to the Conv1d layer; as the layer has one input channel,
            a batched input is expected to be shaped (batch, 1, samples)
        :return: the output of the convolution layer
        """
        output = self.conv1(input)
        return output
class preprocessingBLSTM(nn.Module):
    """
    Bi LSTM model used for voice activity detection or speaker turn detection
    """

    def __init__(self,
                 input_size,
                 lstm_1,
                 lstm_2,
                 linear_1,
                 linear_2,
                 output_size=1):
        """
        Build two stacked bidirectional LSTM layers followed by three linear layers.

        :param input_size: dimension of the input features
        :param lstm_1: output dimension of the first bidirectional LSTM (each direction gets half)
        :param lstm_2: output dimension of the second bidirectional LSTM (each direction gets half)
        :param linear_1: output dimension of the first linear layer
        :param linear_2: output dimension of the second linear layer
        :param output_size: dimension of the network output
        """
        # The original called super(BLSTM, self), which raises a TypeError because this class
        # does not derive from BLSTM; initialise the real parent (nn.Module) instead.
        super(preprocessingBLSTM, self).__init__()

        self.lstm_1 = nn.LSTM(input_size, lstm_1 // 2, bidirectional=True, batch_first=True)
        self.lstm_2 = nn.LSTM(lstm_1, lstm_2 // 2, bidirectional=True, batch_first=True)
        self.linear_1 = nn.Linear(lstm_2, linear_1)
        self.linear_2 = nn.Linear(linear_1, linear_2)
        self.output = nn.Linear(linear_2, output_size)
        # Hidden states of both LSTM layers, kept between two calls of forward.
        # Set it back to None to start from fresh states.
        self.hidden = None

    def forward(self, inputs):
        """
        Run the network on a batch of sequences.

        :param inputs: batch-first tensor of features (the LSTM layers use batch_first=True)
        :return: sigmoid activations of the output layer, one vector per input frame
        """
        if self.hidden is None:
            hidden_1, hidden_2 = None, None
        else:
            hidden_1, hidden_2 = self.hidden

        tmp, hidden_1 = self.lstm_1(inputs, hidden_1)
        x, hidden_2 = self.lstm_2(tmp, hidden_2)
        self.hidden = (hidden_1, hidden_2)

        x = torch.tanh(self.linear_1(x))
        x = torch.tanh(self.linear_2(x))
        x = torch.sigmoid(self.output(x))
        return x
class SeqToSeq(nn.Module):
    """
    Sequence to sequence model wrapping a BLSTM network.
    """

    def __init__(self):
        """
        Create the underlying BLSTM with an input of dimension 1, two LSTM layers
        of dimension 64 and 40 and two linear layers of dimension 40 and 10.
        """
        # nn.Module must be initialised before a sub-module can be assigned as an attribute,
        # otherwise PyTorch raises an AttributeError.
        super(SeqToSeq, self).__init__()
        self.model = BLSTM(input_size=1,
                           lstm_1=64,
                           lstm_2=40,
                           linear_1=40,
                           linear_2=10)
class VAD_RNN:
    """
    A VAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
    """

    def __init__(self, input_size, duration, step, batch_size, model_file_name=None):
        """
        :param input_size: size of the input features
        :param duration: duration in seconds of each batch of features
        :param step: duration in seconds of each step between two batches
        :param batch_size: batch size
        :param model_file_name: optional pytorch model to load If None, the default model is used.
        The default model is made of two BLSTM layers of dimension 64 and 40
        followed by two linear layers of dimension 40 and 10.
        """
        self.input_size = input_size
        # duration and step are given in seconds and stored as a number of frames (x 100)
        self.duration = int(duration * 100)
        self.step = int(step * 100)
        self.batch_size = batch_size

        # No global ``device`` exists in this module: select it here and keep it on the instance.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The network must exist before a state dictionary can be loaded into it
        # (the original called load_state_dict on a model that was never created).
        self.model = BLSTM(input_size=self.input_size,
                           lstm_1=64,
                           lstm_2=40,
                           linear_1=40,
                           linear_2=10)
        if model_file_name is not None:
            self.model.load_state_dict(torch.load(model_file_name, map_location=self.device))

        self.model.to(self.device)

    def _fit_batch(self, optimizer, criterion, x, y):
        """
        Internal method used to train the network

        :param optimizer: optimizer updating the parameters of the model
        :param criterion: loss function comparing the output of the model to y
        :param x: batch of input features
        :param y: batch of target labels
        :return: loss of current batch
        """
        x = x.to(self.device)
        y = y.to(self.device)
        # start every batch from fresh hidden states
        self.model.hidden = None
        optimizer.zero_grad()
        lstm_out = self.model(x)
        loss = criterion(lstm_out, y)
        loss.backward()
        optimizer.step()
        return float(loss.data)

    def get_scores(self, show, features_server, score_file_format=''):
        """
        Computes the scores for one show from the output of the network

        :param show: the show to extract
        :param features_server: a sidekit FeaturesServer object
        :param score_file_format: optional, used to save or load a score file
        :return: scores of the show, as an array of 0..1
        """
        if score_file_format == '':
            score_fn = ''
        else:
            score_fn = score_file_format.format(show)
            if os.path.exists(score_fn):
                print("Warning: loading existing scores")
                return numpy.load(score_fn)

        features, _ = features_server.load(show)

        # Cut the features into overlapping windows of self.duration frames
        x = []
        for i in range(0, len(features) - self.duration, self.step):
            x.append(features[i:i + self.duration])
            if i + self.step > len(features) - self.duration:
                # last window: complete the batch with windows of zeros
                pad_size = self.batch_size - len(x)
                pad = [[[0] * self.input_size] * self.duration] * pad_size
                x += pad

        x = torch.Tensor(x).to(self.device)
        self.model.hidden = None
        x = self.model(x)
        o = numpy.asarray(x.squeeze(2).tolist())

        # Average the scores of the overlapping windows: w counts how many windows cover a frame
        scores = numpy.zeros((len(o) * self.step + self.duration - self.step))
        w = numpy.zeros(scores.shape)
        start = 0
        for i, out in enumerate(o):
            scores[start:start + self.duration] += out
            w[start:start + self.duration] += 1
            start += self.step

        scores = scores / w
        scores = scores[:len(features)]

        if score_fn != '':
            numpy.save(score_fn, scores)
        return scores

    def train_network(self, nb_epochs, training_set, model_file_format):
        """
        Trains the network

        :param nb_epochs: number of epochs to do
        :param training_set: Dataset object to feed the training algorithm as keys. The start and stop are in
            centiseconds.
        :param model_file_format: file format to save the model. The format uses the current epoch
        """
        criterion = nn.BCELoss()
        optimizer = optim.RMSprop(self.model.parameters())

        losses = []
        for epoch in range(nb_epochs):
            losses.append([])
            for X, Y in training_set:
                batch_loss = self._fit_batch(optimizer, criterion, X, Y)
                losses[epoch].append(batch_loss)
                logging.critical("Epoch {}/{}, loss {:.5f}".format(
                    epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
            # one model file is written per epoch
            torch.save(self.model.state_dict(), model_file_format.format(epoch + 1))

    def vad_blstm(self, show, features_server, onset=0.8, offset=0.95, scores_fn=''):
        """
        Get the VAD labels for one show

        :param show: show to generate the SAD from
        :param features_server: a sidekit FeaturesServer object
        :param onset: score threshold above which a segment should start
        :param offset: score threshold under which a segment should stop
        :param scores_fn: optional file name to save the scores
        :return: an array with one label per score, 1 for speech and 0 otherwise
        """
        scores = self.get_scores(show, features_server, scores_fn)

        label = numpy.zeros(len(scores))
        start = 0
        segment = False
        for i, s in enumerate(scores):
            if not segment and s > onset:
                # speech segment begins
                start = i
                segment = True
            if segment and s < offset:
                # speech segment ends
                segment = False
                label[start:i] = 1
        if segment:
            # The segment is still open at the end of the show: label up to and including
            # the last frame (label[start:i] used to leave the last frame at 0).
            label[start:] = 1
        return label

    def write_vad(self, show_list, features_server, onset, offset, vad_file_format, scores_file_format=''):
        """
        Generates the SAD segment files from the trained model

        :param show_list: list of shows to generate the SAD from
        :param features_server: a sidekit FeaturesServer object
        :param onset: score threshold above which a segment will start
        :param offset: score threshold below which a segment will stop
        :param vad_file_format: file format for the segments
        :param scores_file_format: optional, used to save scores files
        """
        for show in sorted(show_list):
            scores = self.get_scores(show, features_server, scores_file_format)

            sad = []
            start = 0
            segment = False
            for i, s in enumerate(scores):
                if not segment and s > onset:
                    start = i
                    segment = True
                if segment and s < offset:
                    segment = False
                    sad.append([show, start, i])
            if segment or len(sad) == 0:
                # Segments closed in the loop stop at the first frame below ``offset``, i.e. the
                # stop index is exclusive. Use len(scores) here so that the last frame is kept
                # and so that an empty score array does not hit an undefined loop index.
                sad.append([show, start, len(scores)])

            with open(vad_file_format.format(show), 'w') as f:
                for l in sad:
                    f.write("{} 1 {} {} U U U speech\n".format(l[0], l[1], l[2]))
s4d/nnet/wavsets.py
0 → 100644
View file @
1268a4ce
# -*- coding: utf-8 -*-
#
# This file is part of s4d.
#
# s4d is a python package for speaker diarization.
# Home page: http://www-lium.univ-lemans.fr/s4d/
#
# s4d is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# s4d is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with s4d. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher
"""
# Module metadata for s4d/nnet/wavsets.py.

# People and contact
__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"

# Legal information
__license__ = "LGPL"
__copyright__ = "Copyright 2015-2020 Anthony Larcher and Sylvain Meignier"

# Status of the module and markup used in the docstrings
__status__ = "Production"
__docformat__ = 'reStructuredText'
import
numpy
import
scipy
import
sidekit
import
torch
from
..diar
import
Diar
from
pathlib
import
Path
from
torch.utils.data
import
Dataset
def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """
    Cut a signal into (possibly overlapping) frames using a sliding window.

    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in term of samples
    :param win_shift: shift of the sliding window in term of samples
    :param context: tuple of left and right context, in samples, added around every window
    :param pad: can be zeros or edge, selects how the signal is padded to provide the context
    :return: the frames, squeezed: (frames, win_size + sum(context)) for a mono signal.
        Without context the result is a strided view of the input and not a copy.
    :raise ValueError: if pad is neither 'zeros' nor 'edge'
    """
    if pad not in ('zeros', 'edge'):
        raise ValueError("pad parameter must be 'zeros' or 'edge'")

    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]
    sample_nb, dim_nb = sig.shape[0], sig.shape[1]

    # Manage padding: the context samples of the first and last windows are outside of the
    # signal, so they have to be created. Without this the strided view below reads memory
    # that does not belong to the signal.
    if any(context):
        c = (tuple(context), ) + (sig.ndim - 1) * ((0, 0), )
        if pad == 'zeros':
            sig = numpy.pad(sig, c, 'constant', constant_values=0)
        else:
            sig = numpy.pad(sig, c, 'edge')
    else:
        # the strides computed below are only valid for a C-contiguous array
        sig = numpy.ascontiguousarray(sig)

    dsize = sig.dtype.itemsize
    _win_size = win_size + sum(context)

    # Floor division clipped at zero: a signal shorter than the window gives no frame
    # instead of one frame running past the end of the signal.
    frame_nb = max(0, int((sample_nb - win_size) // win_shift) + 1)

    shape = (frame_nb, 1, _win_size, dim_nb)
    strides = tuple(x * dsize for x in (win_shift * dim_nb, 1, dim_nb, 1))
    return numpy.lib.stride_tricks.as_strided(sig, shape=shape, strides=strides).squeeze()
def load_wav_segment(wav_file_name, idx, duration, seg_shift, framerate=16000):
    """
    Read an audio file, cut it into overlapping segments and return one of them.

    :param wav_file_name: name of the audio file to read
    :param idx: index of the segment to return
    :param duration: duration of a segment, multiplied by framerate to get a number of samples
    :param seg_shift: shift between two segments, multiplied by framerate to get a number of samples
    :param framerate: framerate given to the audio reader and used to convert durations into samples
    :return: the segment of index idx
    """
    # The whole waveform is read, only the first element returned by read_audio is used
    waveform = sidekit.frontend.io.read_audio(wav_file_name, framerate)[0]

    samples_per_segment = int(framerate * duration)
    samples_per_shift = int(framerate * seg_shift)
    all_segments = framing(waveform,
                           samples_per_segment,
                           win_shift=samples_per_shift,
                           context=(0, 0),
                           pad='zeros')
    return all_segments[idx]
def mdtm_to_label(mdtm_filename, show_duration, framerate):
    """
    Read a MDTM file and create one integer speaker label per sample.

    :param mdtm_filename: name of the MDTM file to read
    :param show_duration: shape of the label array to create (an integer or a tuple)
    :param framerate: framerate used to convert the segment boundaries into samples
    :return: an array of integers holding the index of the speaker for every sample
    """
    diarization = Diar.read_mdtm(mdtm_filename)
    diarization.sort(['show', 'start'])

    # Create a dictionary of speakers
    # NOTE(review): indices start at 0, which is also the value of samples covered by no
    # segment, so the first speaker cannot be told apart from the background - confirm
    # whether indices should start at 1.
    speaker_set = diarization.unique('cluster')
    speaker_dict = {spk: idx for idx, spk in enumerate(speaker_set)}

    # Create the empty labels
    label = numpy.zeros(show_duration, dtype=int)

    # Fill the labels with spk_idx
    for segment in diarization:
        # Boundaries are divided by 100 and multiplied by framerate (presumably they are in
        # centiseconds - TODO confirm). The result must be an integer to be used in a slice,
        # the float produced by the division used to raise a TypeError.
        start = int(int(segment['start']) * framerate / 100.)
        stop = int(int(segment['stop']) * framerate / 100.)
        spk_idx = speaker_dict[segment['cluster']]
        label[start:stop] = spk_idx

    return label
def get_segment_label(label, seg_idx, mode, duration, framerate, seg_shift, collar_duration, filter_type="gate"):
    """
    Create the training labels of one segment from the speaker labels of a whole show.

    :param label: array of integer speaker labels, one per sample
    :param seg_idx: index of the segment to return
    :param mode: can be "vad", "spk_turn" or "overlap" ("overlap" is not implemented)
    :param duration: duration of a segment, multiplied by framerate to get a number of samples
    :param framerate: framerate used to convert durations into samples
    :param seg_shift: shift between two segments, multiplied by framerate to get a number of samples
    :param collar_duration: half duration of the shape placed around each speaker change
    :param filter_type: "gate" or "triangle", shape placed around each speaker change
    :return: the labels of the segment of index seg_idx
    :raise NotImplementedError: if mode is "overlap"
    :raise ValueError: if mode is not one of the three known values
    """
    # Create labels with Diracs at every speaker change detection
    spk_change = numpy.zeros(label.shape, dtype=int)
    spk_change[:-1] = label[:-1] ^ label[1:]
    spk_change = numpy.not_equal(spk_change, numpy.zeros(label.shape, dtype=int))

    # depending of the mode, generates the labels and select the segments
    if mode == "vad":
        # numpy.long does not exist anymore in NumPy 1.24+; int gives the default integer type
        output_label = (label > 0.5).astype(int)

    elif mode == "spk_turn":
        # Apply convolution to replace diracs by a chosen shape (gate or triangle)
        # The length of the filter must be an integer, it is kept odd to stay centered
        filter_sample = 2 * int(round(collar_duration * framerate)) + 1
        conv_filt = numpy.ones(filter_sample)
        if filter_type == "triangle":
            # imported here as a plain "import scipy" does not guarantee that the signal
            # sub-package is loaded and scipy.signal.triang is gone from recent SciPy
            from scipy.signal.windows import triang
            conv_filt = triang(filter_sample)
        output_label = numpy.convolve(conv_filt, spk_change, mode='same')

    elif mode == "overlap":
        raise NotImplementedError()

    else:
        raise ValueError("mode parameter must be 'vad', 'spk_turn' or 'overlap'")

    # Create segments with overlap
    segment_label = framing(output_label,
                            int(framerate * duration),
                            win_shift=int(framerate * seg_shift),
                            context=(0, 0),
                            pad='zeros')

    return segment_label[seg_idx]
class AlliesSet(Dataset):
    """
    Dataset of fixed-length waveform segments and their labels, built from a directory
    holding the audio files in "wav/" and the MDTM files in "mdtm/".
    """

    def __init__(self,
                 allies_dir,
                 mode,
                 duration=2.,
                 seg_shift=0.25,
                 filter_type="gate",
                 collar_duration=0.1,
                 framerate=16000):
        """
        Create batches of waveform samples for deep neural network training

        :param allies_dir: the root directory of ALLIES data
        :param mode: can be "vad", "spk_turn", "overlap"
        :param duration: duration of the segments in seconds
        :param seg_shift: shift to generate overlapping segments
        :param filter_type: "gate" or "triangle", given to get_segment_label
        :param collar_duration: given to get_segment_label
        :param framerate: framerate used to read the audio files
        """
        self.framerate = framerate
        # number of samples of every show, filled below and used by __getitem__
        self.show_duration = {}
        # list of (show, index of the segment in the show)
        self.segments = []
        self.duration = duration
        self.seg_shift = seg_shift
        self.input_dir = allies_dir
        self.mode = mode
        self.filter_type = filter_type
        self.collar_duration = collar_duration
        self.wav_name_format = allies_dir + '/wav/{}.wav'
        self.mdtm_name_format = allies_dir + '/mdtm/{}.mdtm'

        win_size = int(self.framerate * duration)
        win_shift = int(self.framerate * seg_shift)

        # load the list of training file names
        training_file_list = [str(f).split("/")[-1].split('.')[0]
                              for f in Path(allies_dir + "/wav/").rglob("*.[wW][aA][vV]")]

        for show in training_file_list:

            # Load waveform
            signal = sidekit.frontend.io.read_audio(self.wav_name_format.format(show), self.framerate)[0]
            self.show_duration[show] = signal.shape[0]

            # Get speaker labels from MDTM
            label = mdtm_to_label(self.mdtm_name_format.format(show), signal.shape, self.framerate)

            # Create labels with Diracs at every speaker change detection
            spk_change = numpy.zeros(signal.shape, dtype=int)
            spk_change[:-1] = label[:-1] ^ label[1:]
            spk_change = numpy.not_equal(spk_change, numpy.zeros(signal.shape, dtype=int))

            # Create short segments with overlap
            tmp = framing(spk_change,
                          win_size,
                          win_shift=win_shift,
                          context=(0, 0),
                          pad='zeros')
            # framing squeezes its output: restore the (segments, samples) layout so that a
            # show giving a single segment is handled like the others
            tmp = tmp.reshape(-1, win_size)

            # Select only segments with at least a speaker change
            keep_seg = numpy.not_equal(tmp.sum(1), 0)
            # flatnonzero always returns a 1-D array; argwhere(...).squeeze() gave a 0-d array,
            # which cannot be iterated, when exactly one segment was kept
            keep_idx = numpy.flatnonzero(keep_seg)

            for idx in keep_idx:
                self.segments.append((show, idx))

        self.len = len(self.segments)

    def __getitem__(self, index):
        """
        Load the waveform and the labels of one segment.

        :param index: index of the segment in the dataset
        :return: a tuple of tensors, the samples as FloatTensor and the labels as 64 bits integers
        """
        show, idx = self.segments[index]

        # load_wav_segment returns the samples only (unpacking it into two values used to
        # fail); the length of the show is the one stored by the constructor
        data = load_wav_segment(self.wav_name_format.format(show),
                                idx,
                                self.duration,
                                self.seg_shift,
                                framerate=self.framerate)
        total_duration = self.show_duration[show]

        tmp_label = mdtm_to_label(self.mdtm_name_format.format(show), total_duration, self.framerate)
        label = get_segment_label(tmp_label,
                                  idx,
                                  self.mode,
                                  self.duration,
                                  self.framerate,
                                  self.seg_shift,
                                  self.collar_duration,
                                  filter_type=self.filter_type)

        # int64 is the numpy type matching torch long on every platform
        return torch.from_numpy(data).type(torch.FloatTensor), torch.from_numpy(label.astype(numpy.int64))

    def __len__(self):
        """
        :return: the number of segments of the dataset
        """
        return self.len
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment