Commit 98980c62 authored by Anthony Larcher
Browse files


parent a54d0812
......@@ -134,7 +134,8 @@ class SeqToSeq(nn.Module):
def __init__(self):
self.preprocessor = PreNet(sample_rate=16000,
self.sequence_model = BLSTM(input_size=1,
......@@ -145,188 +146,3 @@ class SeqToSeq(nn.Module):
x = self.preprocessor(input)
output = self.sequence_model(x)
return output
class VAD_RNN:
A VAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
def __init__(self, input_size, duration, step, batch_size, model_file_name=None):
:param input_size: size of the input features
:param duration: duration in seconds of each batch of features
:param step: duration in seconds of each step between two batches
:param batch_size: batch size
:param model_file_name: optional pytorch model to load If None, the default model is used.
The default model is made of two BLSTM layers of dimension 64 and 40
followed by two linear layers of dimension 40 and 10.
self.input_size = input_size
self.duration = int(duration * 100)
self.step = int(step * 100)
self.batch_size = batch_size
if model_file_name is None:
self.model = BLSTM(input_size=self.input_size,
def _fit_batch(self, optimizer, criterion, x, y):
Internal method used to train the network
:param optimizer:
:param criterion:
:param X:
:param Y:
:return: loss of current batch
x =
y =
self.model.hidden = None
lstm_out = self.model(x)
loss = criterion(lstm_out, y)
return float(
def get_scores(self, show, features_server, score_file_format=''):
Computes the scores for one show from the output of the network
:param show: the show to extract
:param features_server: a sidekit FeaturesServer object
:param score_file_format: optional, used to save or load a score file
:return: scores of the show, as an array of 0..1
if score_file_format == '':
score_fn = ''
score_fn = score_file_format.format(show)
if os.path.exists(score_fn):
print("Warning: loading existing scores")
return numpy.load(score_fn)
features, _ = features_server.load(show)
x = []
for i in range(0, len(features) - self.duration, self.step):
x.append(features[i:i + self.duration])
if i + self.step > len(features) - self.duration:
pad_size = self.batch_size - len(x)
pad = [[[0] * self.input_size] * self.duration] * pad_size
x += pad
x = torch.Tensor(x).to(device)
self.model.hidden = None
x = self.model(x)
o = numpy.asarray(x.squeeze(2).tolist())
scores = numpy.zeros((len(o) * self.step + self.duration - self.step))
w = numpy.zeros(scores.shape)
start = 0
for i, out in enumerate(o):
scores[start:start + self.duration] += out
w[start:start + self.duration] += 1
start += self.step
scores = scores / w
scores = scores[:len(features)]
if score_fn != '':, scores)
return scores
def train_network(self,
Trains the network
:param nb_epochs: number of epochs to do
:param training_set: Dataset object to feed the training algorithm as keys. The start and stop are in
:param model_file_format: file format to save the model. The format uses the current epoch
criterion = nn.BCELoss()
optimizer = optim.RMSprop(self.model.parameters())
losses = []
for epoch in range(nb_epochs):
it = 1
for batch_idx, (X, Y) in enumerate(training_set):
batch_loss = self._fit_batch(optimizer, criterion, X, Y)
logging.critical("Epoch {}/{}, loss {:.5f}".format(
epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
it += 1, model_file_format.format(epoch + 1))
def vad_blstm(self, show, features_server, onset=0.8, offset=0.95, scores_fn=''):
    """
    Get the VAD labels for one show.

    The per-frame scores returned by ``self.get_scores`` are scanned once:
    a segment opens on the first frame whose score is above ``onset`` and
    closes on the first following frame whose score is below ``offset``.
    The closing frame itself is not labelled as speech.

    :param show: show to generate the SAD from
    :param features_server: a sidekit FeaturesServer object
    :param onset: score threshold above which a segment should start
    :param offset: score threshold under which a segment should stop
    :param scores_fn: optional file name to save the scores
    :return: numpy array with one label per score, 1 for speech and 0 otherwise
    """
    scores = self.get_scores(show, features_server, scores_fn)
    label = numpy.zeros(len(scores))
    start = 0
    segment = False
    for i, s in enumerate(scores):
        if not segment and s > onset:  # speech segment begins
            start = i
            segment = True
        # Deliberately a second `if` (not `elif`): a frame that opens a
        # segment may also close it at once when its score is below `offset`.
        if segment and s < offset:  # speech segment ends
            segment = False
            label[start:i] = 1
    if segment:
        # The scores ran out while still in speech: label through the very
        # last frame. Slicing up to the last index would drop that frame.
        label[start:] = 1
    return label
def write_vad(self, show_list, features_server, onset, offset, vad_file_format, scores_file_format=''):
    """
    Generates the SAD segment files from the trained model.

    For every show (processed in sorted order) the per-frame scores from
    ``self.get_scores`` are turned into ``[show, start, stop]`` segments and
    written, one per line, to ``vad_file_format.format(show)``. ``stop`` is
    exclusive: it is the index of the first frame after the segment.

    If no segment at all was found for a show, a single segment spanning the
    whole show is written instead of an empty file.

    :param show_list: list of shows to generate the SAD from
    :param features_server: a sidekit FeaturesServer object
    :param onset: score threshold above which a segment will start
    :param offset: score threshold below which a segment will stop
    :param vad_file_format: file format for the segments
    :param scores_file_format: optional, used to save scores files
    """
    for show in sorted(show_list):
        scores = self.get_scores(show, features_server, scores_file_format)
        nb_frames = len(scores)
        sad = []
        start = 0
        segment = False
        for i, s in enumerate(scores):
            if not segment and s > onset:
                start = i
                segment = True
            # Second `if` on purpose: the opening frame may also close the
            # segment when its score is already below `offset`.
            if segment and s < offset:
                segment = False
                sad.append([show, start, i])
        if segment or not sad:
            # Close the trailing (or fallback) segment at the number of
            # frames, not at the last loop index: this keeps the final frame
            # inside the segment and is well defined when there are no scores.
            sad.append([show, start, nb_frames])
        with open(vad_file_format.format(show), 'w') as f:
            f.writelines(
                "{} 1 {} {} U U U speech\n".format(name, first, stop)
                for name, first, stop in sad
            )
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment