Commit 4d4c8d39 authored by Florent Desnous 's avatar Florent Desnous
Browse files

adding pytorch vad

parent 4f2f730b
import os
import sys
import numpy
import torch
import torch.nn as nn
from torch import optim
import pickle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class SAD_RNN():
    """
    A SAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
    """
    def __init__(self, input_size, duration, step, batch_size, model=None):
        """
        :param input_size: size of the input features
        :param duration: duration in seconds of each batch of features
        :param step: duration in seconds of each step between two batches
        :param batch_size: batch size
        :param model: optional pytorch model. If None, the default model is used.
        The default model is made of two BLSTM layers of dimension 64 and 40
        followed by two linear layers of dimension 40 and 10.
        """
        self.input_size = input_size
        # durations are given in seconds; features are assumed to be sampled
        # at 100 frames per second (centisecond frames), hence the * 100
        self.duration = duration * 100
        self.step = step * 100
        self.batch_size = batch_size
        if model is None:  # load default model
            self.model = BLSTM(
                input_size=self.input_size,
                lstm_1=64,
                lstm_2=40,
                linear_1=40,
                linear_2=10)
        else:
            self.model = model

    def _sad_generator(self, train_list, uem_list, features_server):
        """
        Internal method that generates batches of features.

        :param train_list: dict of speech segments per show; each value is a
            list of dicts with "start"/"stop" keys in centiseconds. Frames
            inside these segments are labelled 1 (speech), all others 0.
        :param uem_list: same structure as train_list; windows are only
            extracted inside these segments
        :param features_server: a sidekit FeaturesServer object
        :yield: (X, Y) torch.Tensor pairs of shapes
            (batch_size, duration, input_size) and (batch_size, duration, 1).
            A trailing partial batch is dropped.
        """
        batch_X = numpy.zeros((self.batch_size, self.duration, self.input_size))
        batch_Y = numpy.zeros((self.batch_size, self.duration, 1))
        batch_i = 0
        for show in sorted(train_list.keys()):
            features, _ = features_server.load(show)
            features = features[:, 1:]  # tmp TODO
            # numpy.int was removed in NumPy 1.20+; the builtin int is the
            # documented replacement
            labels = numpy.zeros((len(features), 1), dtype=int)
            for seg in train_list[show]:
                labels[seg['start']:seg['stop']] = 1
            for seg in uem_list[show]:
                start, stop = seg['start'], seg['stop']
                # slide a window of `duration` frames, advancing by `step`
                for i in range(start, min(stop, len(features)) - self.duration, self.step):
                    batch_X[batch_i] = features[i:i + self.duration]
                    batch_Y[batch_i] = labels[i:i + self.duration]
                    batch_i += 1
                    if batch_i == self.batch_size:
                        X = torch.Tensor(batch_X)
                        Y = torch.Tensor(batch_Y)
                        yield X, Y
                        batch_i = 0

    def _fit_batch(self, optimizer, criterion, X, Y):
        """
        Internal method used to train the network on one batch.

        :param optimizer: torch optimizer over self.model's parameters
        :param criterion: loss function (e.g. nn.BCELoss)
        :param X: input batch, shape (batch_size, duration, input_size)
        :param Y: target batch, shape (batch_size, duration, 1)
        :return: loss of current batch, as a float
        """
        X = X.to(device)
        Y = Y.to(device)
        # reset the recurrent state so each batch starts from scratch
        self.model.hidden = None
        optimizer.zero_grad()
        lstm_out = self.model(X)
        loss = criterion(lstm_out, Y)
        loss.backward()
        optimizer.step()
        return float(loss.data)

    def _get_scores(self, show, scores_fmt, features_server):
        """
        Internal method to compute the scores for one show from the output of the network

        :param show: name of the show to score
        :param scores_fmt: file name format used to cache the scores of a
            show; an empty string disables caching
        :param features_server: a sidekit FeaturesServer object
        :return: scores of the show, as an array of 0..1
        """
        # fixed: was 'scorces_fmt', a typo raising NameError on every call
        if scores_fmt == '':
            score_fn = ''
        else:
            score_fn = scores_fmt.format(show)
        if os.path.exists(score_fn):
            print("Warning: loading existing scores")
            return numpy.load(score_fn)
        features, _ = features_server.load(show)
        features = features[:, 1:]  # tmp TODO
        x = []
        X = torch.tensor([]).to(device)
        for i in range(0, len(features) - self.duration, self.step):
            x.append(features[i:i + self.duration])
            # fixed off-by-one: was '>', which on an exact multiple never
            # triggered and silently dropped the last partial batch.
            # '>=' is true exactly on the final loop iteration.
            if i + self.step >= len(features) - self.duration:
                pad_size = self.batch_size - len(x)
                pad = [[[0] * self.input_size] * self.duration] * pad_size
                x += pad
            if len(x) == self.batch_size:
                x = torch.Tensor(x)
                x = x.to(device)
                self.model.hidden = None
                X = torch.cat((X, self.model(x)))
                x = []
        o = numpy.asarray(X.squeeze(2).tolist())
        # overlap-add the per-window scores, then normalize by the number of
        # windows covering each frame
        scores = numpy.zeros((len(o) * self.step + self.duration - self.step))
        w = numpy.zeros(scores.shape)
        start = 0
        for i, out in enumerate(o):
            scores[start:start + self.duration] += out
            w[start:start + self.duration] += 1
            start += self.step
        scores = scores / w
        scores = scores[:len(features)]
        # fixed: was 'scores_fn', an undefined name raising NameError
        # whenever score caching was enabled
        if score_fn != '':
            numpy.save(score_fn, scores)
        return scores

    def train_network(self,
                      nb_epochs,
                      train_list, uem_list,
                      features_server,
                      model_file_format):
        """
        Trains the network
        :param nb_epochs: number of epochs to do
        :param train_list: list of training segment. It is a dictionary with shows as keys and
        arrays of dictionaries as values. The dictionaries are the segments with "start" and "stop"
        as keys. The start and stop are in centiseconds.
        :param uem_list: same as train_list, from uem file.
        :param features_server: a sidekit FeaturesServer object
        :param model_file_format: file format to save the model. The format uses the current epoch
        """
        self.model.to(device)
        criterion = nn.BCELoss()
        optimizer = optim.RMSprop(self.model.parameters())
        losses = []
        # rough estimate of iterations per epoch, used only for the progress
        # display of the first epoch (replaced by the real count afterwards)
        est_it = 0
        for show in uem_list:
            for seg in uem_list[show]:
                est_it += seg['stop'] - seg['start']
        est_it = est_it // self.batch_size // self.step
        for epoch in range(nb_epochs):
            it = 1
            losses.append([])
            gen = self._sad_generator(train_list, uem_list, features_server)
            for X, Y in gen:
                batch_loss = self._fit_batch(optimizer, criterion, X, Y)
                losses[epoch].append(batch_loss)
                sys.stdout.write("\rEpoch {}/{} ({}/{}), loss {:.5f}".format(
                    epoch + 1, nb_epochs, it, est_it, numpy.mean(losses[epoch])))
                sys.stdout.flush()
                it += 1
            print()
            est_it = len(losses[epoch])
            # one checkpoint per epoch
            torch.save(self.model.state_dict(), model_file_format.format(epoch + 1))

    def write_sad(self, model_fn, show_list, features_server,
                  onset, offset, sad_file_format, scores_file_format=''):
        """
        Generates the SAD segment files from the trained model
        :param model_fn: model file name
        :param show_list: list of shows to generate the SAD from
        :param features_server: a sidekit FeaturesServer object
        :param onset: score value above which a segment will start
        :param offset: score value below which a segment will stop
        :param sad_file_format: file format for the segments
        :param scores_file_format: optional, used to save scores files
        """
        self.model.load_state_dict(torch.load(model_fn))
        self.model.to(device)
        for show in sorted(show_list):
            scores = self._get_scores(show, scores_file_format, features_server)
            sad = []
            start = 0
            segment = False
            # hysteresis thresholding: open a segment above `onset`, close it
            # below `offset`
            for i, s in enumerate(scores):
                if not segment and s > onset:
                    start = i
                    segment = True
                if segment and s < offset:
                    segment = False
                    sad.append([show, start, i])
            # NOTE(review): if `scores` is empty this references an unbound
            # `i`; assumed unreachable since features are never empty — confirm
            if segment or len(sad) == 0:
                sad.append([show, start, i])
            with open(sad_file_format.format(show), 'w') as f:
                for l in sad:
                    f.write("{} 1 {} {} U U U speech\n".format(l[0], l[1], l[2]))
class BLSTM(nn.Module):
    """Two stacked bidirectional LSTM layers followed by three linear layers
    (tanh, tanh, sigmoid), producing one score in [0, 1] per frame."""

    def __init__(self,
                 input_size,
                 lstm_1, lstm_2,
                 linear_1, linear_2,
                 output_size=1):
        """
        :param input_size: dimension of the input features
        :param lstm_1: total width of the first BLSTM layer
        :param lstm_2: total width of the second BLSTM layer
        :param linear_1: width of the first linear layer
        :param linear_2: width of the second linear layer
        :param output_size: number of outputs per frame (default 1)
        """
        super(BLSTM, self).__init__()
        # each layer is bidirectional, so each direction gets half the width
        self.lstm_1 = nn.LSTM(input_size, lstm_1 // 2,
                              bidirectional=True, batch_first=True)
        self.lstm_2 = nn.LSTM(lstm_1, lstm_2 // 2,
                              bidirectional=True, batch_first=True)
        self.linear_1 = nn.Linear(lstm_2, linear_1)
        self.linear_2 = nn.Linear(linear_1, linear_2)
        self.output = nn.Linear(linear_2, output_size)
        # recurrent state carried across forward() calls; callers reset it by
        # assigning None
        self.hidden = None

    def forward(self, inputs):
        """
        :param inputs: batch of shape (batch, seq_len, input_size)
        :return: tensor of shape (batch, seq_len, output_size) in [0, 1]
        """
        h1, h2 = self.hidden if self.hidden is not None else (None, None)
        out, h1 = self.lstm_1(inputs, h1)
        out, h2 = self.lstm_2(out, h2)
        # stash the new recurrent state for a possible next call
        self.hidden = (h1, h2)
        out = torch.tanh(self.linear_1(out))
        out = torch.tanh(self.linear_2(out))
        return torch.sigmoid(self.output(out))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment