Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Anthony Larcher
sidekit
Commits
045ae1b1
Commit
045ae1b1
authored
Oct 04, 2019
by
Florent Desnous
Browse files
modified sad_rnn.py
parent
ba4b91f1
Changes
3
Show whitespace changes
Inline
Side-by-side
__init__.py
View file @
045ae1b1
...
...
@@ -168,6 +168,7 @@ if CUDA:
from
sidekit.nnet
import
extract_idmap
from
sidekit.nnet
import
extract_parallel
from
sidekit.nnet
import
SAD_RNN
from
sidekit.nnet
import
SAD_Dataset
else
:
print
(
"Don't import Torch"
)
...
...
nnet/__init__.py
View file @
045ae1b1
...
...
@@ -27,7 +27,7 @@ Copyright 2014-2019 Anthony Larcher and Sylvain Meignier
:mod:`nnet` provides methods to manage Neural Networks using PyTorch
"""
from
sidekit.nnet.sad_rnn
import
SAD_RNN
from
sidekit.nnet.sad_rnn
import
SAD_RNN
,
SAD_Dataset
from
sidekit.nnet.feed_forward
import
FForwardNetwork
from
sidekit.nnet.feed_forward
import
kaldi_to_hdf5
from
sidekit.nnet.xsets
import
XvectorMultiDataset
,
XvectorDataset
,
StatDataset
...
...
nnet/sad_rnn.py
View file @
045ae1b1
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2019 Anthony Larcher, Florent Desnous
The authors would like to thank the BUT Speech@FIT group (http://speech.fit.vutbr.cz) and Lukas BURGET
for sharing the source code that strongly inspired this module. Thank you for your valuable contribution.
"""
import
os
import
sys
import
numpy
import
random
import
h5py
import
torch
import
torch.nn
as
nn
from
torch
import
optim
from
torch.utils.data
import
Dataset
from
sidekit.frontend.io
import
_read_dataset_percentile
import
logging
__license__
=
"LGPL"
__author__
=
"Florent Desnous, Anthony Larcher"
__copyright__
=
"Copyright 2015-2019 Anthony Larcher, Florent Desnous"
__maintainer__
=
"Anthony Larcher"
__email__
=
"florent.desnous@univ-lemans.fr"
__status__
=
"Production"
__docformat__
=
'reS'
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
...
...
@@ -16,14 +53,25 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class
SAD_Dataset
(
Dataset
):
"""
Object that takes a list of files from a file and initialize a Dataset
"""
def
__init__
(
self
,
mdtm_file
,
feature_file
,
batch_size
=
512
,
duration
=
3.2
,
step
=
0.8
,
uem_file
=
None
,
shuffle
=
False
,
compressed
=
'percentile'
):
:param input_size: Size of the MFCC and BLSTM input
:param mdtm_file: File in MDTM format for the training segments
:param features_server: FeaturesServer instance for training MFCC
:param batch_size: Batch size
:param duration: Segment duration in seconds
:param step: in seconds
:param uem_file: File in UEM format for the training segments
:param shuffle: if True, performs a random shuffle on the dataset
:param use_ram: if True, keeps loaded MFCC in ram for faster training
"""
def
__init__
(
self
,
input_size
,
mdtm_file
,
features_server
,
batch_size
=
512
,
duration
=
3.2
,
step
=
0.8
,
uem_file
=
None
,
shuffle
=
False
,
use_ram
=
True
):
self
.
input_size
=
input_size
self
.
batch_size
=
batch_size
self
.
duration
=
int
(
duration
*
100
)
self
.
step
=
int
(
step
*
100
)
#self.feature_file = open(feature_file, 'r')
self
.
feature_file
=
h5py
.
File
(
feature_file
,
'r'
)
self
.
features_server
=
features_server
self
.
features
=
{}
self
.
use_ram
=
use_ram
train_list
=
{}
with
open
(
mdtm_file
,
'r'
)
as
f
:
...
...
@@ -49,20 +97,14 @@ class SAD_Dataset(Dataset):
for
show
in
train_list
.
keys
():
uem_list
[
show
].
append
({
"start"
:
None
,
"stop"
:
None
})
self
.
vad
=
{}
self
.
segments
=
[]
for
show
in
sorted
(
train_list
.
keys
()):
if
compressed
==
'percentile'
:
features
=
_read_dataset_percentile
(
self
.
feature_file
,
show
+
"/cep"
)
elif
compressed
==
'none'
:
features
=
self
.
feature_file
[
show
+
"/cep"
].
value
labels
=
numpy
.
zeros
((
len
(
features
),
1
),
dtype
=
numpy
.
int
)
show_len
=
train_list
[
show
][
-
1
][
'stop'
]
labels
=
numpy
.
zeros
((
show_len
,
1
),
dtype
=
numpy
.
int
)
speech_only_segments
=
[]
speech_nonspeech_segments
=
[]
if
show
in
train_list
and
show
in
uem_list
:
for
seg
in
train_list
[
show
]:
labels
[
seg
[
'start'
]:
seg
[
'stop'
]]
=
1
self
.
vad
[
show
]
=
labels
...
...
@@ -71,54 +113,40 @@ class SAD_Dataset(Dataset):
if
seg
[
'start'
]
is
not
None
:
start
,
stop
=
seg
[
'start'
],
seg
[
'stop'
]
else
:
start
,
stop
=
0
,
len
(
features
)
# cree les segments ne contenant QUE de la parole (sans recouvrement)
for
i
in
range
(
start
,
min
(
stop
,
len
(
features
))
-
self
.
duration
,
self
.
duration
):
if
labels
[
i
:
i
+
self
.
duration
].
sum
()
==
self
.
duration
:
speech_only_segments
.
append
((
show
,
i
,
i
+
self
.
duration
))
# cree les segments contenant de la PAROLE ET DU SILENCE (avec recouvrement pour equilibrer les classes)
for
i
in
range
(
start
,
min
(
stop
,
len
(
features
))
-
self
.
duration
,
self
.
step
):
#self.segments.append((show, i, i + self.duration))
if
labels
[
i
:
i
+
self
.
duration
].
sum
()
<
self
.
duration
-
1
:
speech_nonspeech_segments
.
append
((
show
,
i
,
i
+
self
.
duration
))
#for i in range(start, min(stop, len(features)) - self.duration, self.step):
# self.segments.append((show, i, i + self.duration))
tmp
=
speech_only_segments
+
speech_nonspeech_segments
random
.
shuffle
(
tmp
)
self
.
segments
+=
tmp
print
(
"Show {}, ratio S/NS = {}"
.
format
(
show
,
len
(
speech_only_segments
)
/
(
len
(
speech_nonspeech_segments
)
+
len
(
speech_only_segments
))))
# for i in range(start, min(stop, len(features)) - self.duration, self.step):
# self.segments.append((show, i, i + self.duration))
self
.
input_size
=
features
.
shape
[
1
]
start
,
stop
=
0
,
show_len
for
i
in
range
(
start
,
min
(
stop
,
show_len
)
-
self
.
duration
,
self
.
step
):
if
self
.
vad
[
show
][
i
:
i
+
self
.
duration
].
sum
()
==
self
.
duration
:
# no silence
continue
self
.
segments
.
append
((
show
,
i
,
i
+
self
.
duration
))
if
shuffle
:
random
.
shuffle
(
self
.
segments
)
self
.
len
=
len
(
self
.
segments
)
//
self
.
batch_size
print
(
len
(
self
.
segments
),
"segments,"
,
self
.
len
,
"segments/batch"
)
def
__getitem__
(
self
,
index
):
batch_X
=
numpy
.
zeros
((
self
.
batch_size
,
self
.
duration
,
self
.
input_size
))
batch_Y
=
numpy
.
zeros
((
self
.
batch_size
,
self
.
duration
,
1
))
for
i
in
range
(
self
.
batch_size
):
show
,
start
,
stop
=
self
.
segments
[
index
*
self
.
batch_size
+
i
]
#features = _read_dataset_percentile(self.feature_file, show + "/cep")
features
=
self
.
feature_file
[
show
+
"/cep"
].
value
m
=
features
.
mean
(
axis
=
0
)
s
=
features
.
std
(
axis
=
0
)
features
=
(
features
-
m
)
/
s
batch_X
[
i
]
=
features
[
start
:
stop
]
if
show
not
in
self
.
features
:
self
.
features
[
show
],
_
=
self
.
features_server
.
load
(
show
)
batch_X
[
i
]
=
self
.
features
[
show
][
start
:
stop
]
batch_Y
[
i
]
=
self
.
vad
[
show
][
start
:
stop
]
#batch_X[i] = features[start:stop]
#batch_Y[i] = self.vad[show][start:stop]
if
not
self
.
use_ram
:
self
.
features
=
{}
return
torch
.
Tensor
(
batch_X
),
torch
.
Tensor
(
batch_Y
)
def
__len__
(
self
):
return
self
.
len
class
SAD_RNN
():
"""
A SAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
...
...
@@ -149,6 +177,37 @@ class SAD_RNN():
else
:
self
.
model
=
model
def
_sad_generator
(
self
,
train_list
,
uem_list
,
features_server
):
"""
Internal method that generates batches of features
:param train_list:
:param uem_list:
:param features_server:
"""
batch_X
=
numpy
.
zeros
((
self
.
batch_size
,
self
.
duration
,
self
.
input_size
))
batch_Y
=
numpy
.
zeros
((
self
.
batch_size
,
self
.
duration
,
1
))
batch_i
=
0
for
show
in
sorted
(
train_list
.
keys
()):
features
,
_
=
features_server
.
load
(
show
)
labels
=
numpy
.
zeros
((
len
(
features
),
1
),
dtype
=
numpy
.
int
)
for
seg
in
train_list
[
show
]:
labels
[
seg
[
'start'
]:
seg
[
'stop'
]]
=
1
for
seg
in
uem_list
[
show
]:
start
,
stop
=
seg
[
'start'
],
seg
[
'stop'
]
for
i
in
range
(
start
,
min
(
stop
,
len
(
features
))
-
self
.
duration
,
self
.
step
):
batch_X
[
batch_i
]
=
features
[
i
:
i
+
self
.
duration
]
batch_Y
[
batch_i
]
=
labels
[
i
:
i
+
self
.
duration
]
batch_i
+=
1
if
batch_i
==
self
.
batch_size
:
X
=
torch
.
Tensor
(
batch_X
)
Y
=
torch
.
Tensor
(
batch_Y
)
yield
X
,
Y
batch_i
=
0
def
_fit_batch
(
self
,
optimizer
,
criterion
,
X
,
Y
):
"""
Internal method used to train the network
...
...
@@ -231,25 +290,25 @@ class SAD_RNN():
:param features_server: a sidekit FeaturesServer object
:param model_file_format: file format to save the model. The format uses the current epoch
"""
self
.
model
.
to
(
device
)
criterion
=
nn
.
BCELoss
()
optimizer
=
optim
.
RMSprop
(
self
.
model
.
parameters
())
losses
=
[]
est_it
=
training_set
.
len
//
self
.
batch_size
est_it
=
training_set
.
len
for
epoch
in
range
(
nb_epochs
):
it
=
1
losses
.
append
([])
for
batch_idx
,
(
X
,
Y
)
in
enumerate
(
training_set
):
batch_loss
=
self
.
_fit_batch
(
optimizer
,
criterion
,
X
,
Y
)
losses
[
epoch
].
append
(
batch_loss
)
logging
.
critical
(
"Epoch {}/{}, loss {:.5f}"
.
format
(
epoch
+
1
,
nb_epochs
,
numpy
.
mean
(
losses
[
epoch
])))
#sys.stdout.write("\rEpoch {}/{}, loss {:.5f}".format(
# epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
#sys.stdout.flush()
print
(
"
\r
Epoch {}/{} ({}/{}), loss {:.5f}"
.
format
(
epoch
+
1
,
nb_epochs
,
it
,
est_it
,
numpy
.
mean
(
losses
[
epoch
])))
it
+=
1
torch
.
save
(
self
.
model
.
state_dict
(),
model_file_format
.
format
(
epoch
+
1
))
return
losses
def
get_labels
(
self
,
model_fn
,
show
,
features_server
,
onset
=
0.8
,
offset
=
0.95
,
scores_fn
=
''
):
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment