Commit 045ae1b1 authored by Florent Desnous 's avatar Florent Desnous
Browse files

modified sad_rnn.py

parent ba4b91f1
......@@ -168,6 +168,7 @@ if CUDA:
from sidekit.nnet import extract_idmap
from sidekit.nnet import extract_parallel
from sidekit.nnet import SAD_RNN
from sidekit.nnet import SAD_Dataset
else:
print("Don't import Torch")
......
......@@ -27,7 +27,7 @@ Copyright 2014-2019 Anthony Larcher and Sylvain Meignier
:mod:`nnet` provides methods to manage Neural Networks using PyTorch
"""
from sidekit.nnet.sad_rnn import SAD_RNN
from sidekit.nnet.sad_rnn import SAD_RNN, SAD_Dataset
from sidekit.nnet.feed_forward import FForwardNetwork
from sidekit.nnet.feed_forward import kaldi_to_hdf5
from sidekit.nnet.xsets import XvectorMultiDataset, XvectorDataset, StatDataset
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2019 Anthony Larcher, Florent Desnous
The authors would like to thank the BUT Speech@FIT group (http://speech.fit.vutbr.cz) and Lukas BURGET
for sharing the source code that strongly inspired this module. Thank you for your valuable contribution.
"""
import os
import sys
import numpy
import random
import h5py
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset
from sidekit.frontend.io import _read_dataset_percentile
import logging
# Module metadata, following the convention used by the other SIDEKIT modules.
__license__ = "LGPL"
__author__ = "Florent Desnous, Anthony Larcher"
__copyright__ = "Copyright 2015-2019 Anthony Larcher, Florent Desnous"
__maintainer__ = "Anthony Larcher"
__email__ = "florent.desnous@univ-lemans.fr"
__status__ = "Production"
# Was 'reS' — a typo; docutils/PEP 258 expect 'reStructuredText',
# the value used elsewhere in SIDEKIT.
__docformat__ = 'reStructuredText'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
......@@ -16,14 +53,25 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class SAD_Dataset(Dataset):
"""
Object that takes a list of files from a file and initialize a Dataset
"""
def __init__(self, mdtm_file, feature_file, batch_size=512, duration=3.2, step=0.8, uem_file=None, shuffle=False, compressed='percentile'):
:param input_size: Size of the MFCC and BLSTM input
:param mdtm_file: File in MDTM format for the training segments
:param features_server: FeaturesServer instance for training MFCC
:param batch_size: Batch size
:param duration: Segment duration in seconds
:param step: in seconds
:param uem_file: File in UEM format for the training segments
:param shuffle: if True, performs a random shuffle on the dataset
:param use_ram: if True, keeps loaded MFCC in ram for faster training
"""
def __init__(self, input_size, mdtm_file, features_server, batch_size=512, duration=3.2, step=0.8, uem_file=None, shuffle=False, use_ram=True):
self.input_size = input_size
self.batch_size = batch_size
self.duration = int(duration * 100)
self.step = int(step * 100)
#self.feature_file = open(feature_file, 'r')
self.feature_file = h5py.File(feature_file, 'r')
self.features_server = features_server
self.features = {}
self.use_ram = use_ram
train_list = {}
with open(mdtm_file, 'r') as f:
......@@ -48,77 +96,57 @@ class SAD_Dataset(Dataset):
else:
for show in train_list.keys():
uem_list[show].append({"start": None, "stop": None})
self.vad = {}
self.segments = []
for show in sorted(train_list.keys()):
if compressed == 'percentile':
features = _read_dataset_percentile(self.feature_file, show+"/cep")
elif compressed == 'none':
features = self.feature_file[show+"/cep"].value
labels = numpy.zeros((len(features), 1), dtype=numpy.int)
speech_only_segments = []
speech_nonspeech_segments = []
if show in train_list and show in uem_list:
for seg in train_list[show]:
labels[seg['start']:seg['stop']] = 1
self.vad[show] = labels
for seg in uem_list[show]:
if seg['start'] is not None:
start, stop = seg['start'], seg['stop']
else:
start, stop = 0, len(features)
# cree les segments ne contenant QUE de la parole (sans recouvrement)
for i in range(start, min(stop, len(features)) - self.duration, self.duration):
if labels[i:i+self.duration].sum() == self.duration:
speech_only_segments.append((show, i, i + self.duration))
# cree les segments contenant de la PAROLE ET DU SILENCE (avec recouvrement pour equilibrer les classes)
for i in range(start, min(stop, len(features)) - self.duration, self.step):
#self.segments.append((show, i, i + self.duration))
if labels[i:i+self.duration].sum() < self.duration - 1:
speech_nonspeech_segments.append((show, i, i + self.duration))
#for i in range(start, min(stop, len(features)) - self.duration, self.step):
# self.segments.append((show, i, i + self.duration))
tmp = speech_only_segments + speech_nonspeech_segments
random.shuffle(tmp)
self.segments += tmp
print("Show {}, ratio S/NS = {}".format(show, len(speech_only_segments)/(len(speech_nonspeech_segments) + len(speech_only_segments))))
# for i in range(start, min(stop, len(features)) - self.duration, self.step):
# self.segments.append((show, i, i + self.duration))
self.input_size = features.shape[1]
show_len = train_list[show][-1]['stop']
labels = numpy.zeros((show_len, 1), dtype=numpy.int)
for seg in train_list[show]:
labels[seg['start']:seg['stop']] = 1
self.vad[show] = labels
for seg in uem_list[show]:
if seg['start'] is not None:
start, stop = seg['start'], seg['stop']
else:
start, stop = 0, show_len
for i in range(start, min(stop, show_len) - self.duration, self.step):
if self.vad[show][i:i + self.duration].sum() == self.duration: # no silence
continue
self.segments.append((show, i, i + self.duration))
if shuffle:
random.shuffle(self.segments)
self.len = len(self.segments) // self.batch_size
print(len(self.segments), "segments,", self.len, "segments/batch")
def __getitem__(self, index):
    """
    Build and return one batch of training data.

    The diff interleave left the superseded h5py read and per-show
    mean/variance normalization in place, immediately overwritten by the
    FeaturesServer cache path; only the new-side code is kept here.

    :param index: batch index, in [0, len(self) - 1]

    :return: a tuple (batch_X, batch_Y) of torch.Tensor with shapes
        (batch_size, duration, input_size) and (batch_size, duration, 1)
    """
    batch_X = numpy.zeros((self.batch_size, self.duration, self.input_size))
    batch_Y = numpy.zeros((self.batch_size, self.duration, 1))

    for i in range(self.batch_size):
        show, start, stop = self.segments[index * self.batch_size + i]
        # Lazily load each show's features once and cache them; the cache
        # survives across calls only when use_ram is True (see below).
        if show not in self.features:
            self.features[show], _ = self.features_server.load(show)
        batch_X[i] = self.features[show][start:stop]
        batch_Y[i] = self.vad[show][start:stop]

    if not self.use_ram:
        # Bound memory usage by dropping the cache, at the cost of
        # re-loading the features on the next batch that needs them.
        self.features = {}

    # torch.Tensor copies the numpy buffers, so reusing them is safe.
    return torch.Tensor(batch_X), torch.Tensor(batch_Y)
def __len__(self):
    """Number of full batches served, i.e. len(self.segments) // self.batch_size."""
    return self.len
class SAD_RNN():
"""
A SAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
......@@ -149,6 +177,37 @@ class SAD_RNN():
else:
self.model = model
def _sad_generator(self, train_list, uem_list, features_server):
    """
    Internal generator that yields fixed-size batches of features and
    frame-level speech/non-speech labels.

    :param train_list: dict mapping a show name to a list of
        {'start': ..., 'stop': ...} speech segments (frame indices)
    :param uem_list: dict mapping a show name to a list of
        {'start': ..., 'stop': ...} usable regions; a None start means
        the whole show is usable
    :param features_server: FeaturesServer instance used to load the
        acoustic features of each show

    :yields: (X, Y) pairs of torch.Tensor with shapes
        (batch_size, duration, input_size) and (batch_size, duration, 1)
    """
    batch_X = numpy.zeros((self.batch_size, self.duration, self.input_size))
    batch_Y = numpy.zeros((self.batch_size, self.duration, 1))
    batch_i = 0

    for show in sorted(train_list.keys()):
        features, _ = features_server.load(show)

        # Frame-level binary labels: 1 inside the annotated speech segments.
        # dtype=int instead of numpy.int: the alias was removed in NumPy 1.24.
        labels = numpy.zeros((len(features), 1), dtype=int)
        for seg in train_list[show]:
            labels[seg['start']:seg['stop']] = 1

        for seg in uem_list[show]:
            # Handle the "whole show" convention (start is None), exactly as
            # the UEM handling in SAD_Dataset.__init__ does; the original
            # would have crashed on range(None, ...).
            if seg['start'] is not None:
                start, stop = seg['start'], seg['stop']
            else:
                start, stop = 0, len(features)

            for i in range(start, min(stop, len(features)) - self.duration, self.step):
                batch_X[batch_i] = features[i:i + self.duration]
                batch_Y[batch_i] = labels[i:i + self.duration]
                batch_i += 1
                if batch_i == self.batch_size:
                    # torch.Tensor copies, so the numpy buffers can be reused.
                    yield torch.Tensor(batch_X), torch.Tensor(batch_Y)
                    batch_i = 0
    # NOTE(review): a trailing partial batch is dropped, as in the original.
def _fit_batch(self, optimizer, criterion, X, Y):
"""
Internal method used to train the network
......@@ -231,26 +290,26 @@ class SAD_RNN():
:param features_server: a sidekit FeaturesServer object
:param model_file_format: file format to save the model. The format uses the current epoch
"""
self.model.to(device)
criterion = nn.BCELoss()
optimizer = optim.RMSprop(self.model.parameters())
losses = []
est_it = training_set.len // self.batch_size
est_it = training_set.len
for epoch in range(nb_epochs):
it = 1
losses.append([])
for batch_idx, (X, Y) in enumerate(training_set):
batch_loss = self._fit_batch(optimizer, criterion, X, Y)
losses[epoch].append(batch_loss)
logging.critical("Epoch {}/{}, loss {:.5f}".format(
epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
#sys.stdout.write("\rEpoch {}/{}, loss {:.5f}".format(
# epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
#sys.stdout.flush()
print("\rEpoch {}/{} ({}/{}), loss {:.5f}".format(
epoch + 1, nb_epochs, it, est_it, numpy.mean(losses[epoch])))
it += 1
torch.save(self.model.state_dict(), model_file_format.format(epoch+1))
return losses
def get_labels(self, model_fn, show, features_server,
onset=0.8, offset=0.95, scores_fn=''):
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment