Commit 1268a4ce authored by Anthony Larcher

new nnet module

parent cfcca3d4
s4d-0.1.0, 23/01/2020 -- Repackaging and creation of CHANGES.txt
s4d-0.1.4.2, 15/02/2020 -- Bug fixed in scoring due to the deprecated sklearn method linear_assignment
s4d-0.1.4.4, 17/02/2020 -- Change prototype of ModelIV.train_per_segment not to overwrite i-vectors
......
@@ -57,4 +57,4 @@ __maintainer__ = "Sylvain Meignier"
__email__ = "sylvain.meignierr@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__version__ = "0.1.4.3"
__version__ = "0.1.4.4"
......
@@ -769,8 +769,7 @@ class Diar():
        if not diarization._attributes.exist('channel'):
            diarization.add_attribut(new_attribut='channel', default='U')
        try:
            for line in fic:
                line = re.sub('\s+', ' ', line)
                line = line.strip()
                # logging.debug(line)
                if line.startswith('#') or line.startswith(';;'):
......
......
@@ -117,12 +117,12 @@ class ModelIV:
        stat.accumulate_stat(ubm=self.ubm, feature_server=feature_server, seg_indices=range(stat.segset.shape[0]),
                             num_thread=self.nb_thread)
        fa = FactorAnalyser(mean=self.tv_mean, Sigma=self.tv_sigma, F=self.tv)
-       self.ivectors = fa.extract_ivectors_single(self.ubm, stat)
+       ivectors = fa.extract_ivectors_single(self.ubm, stat)
        if normalization:
-           self.ivectors.spectral_norm_stat1(self.norm_mean[:1], self.norm_cov[:1])
+           ivectors.spectral_norm_stat1(self.norm_mean[:1], self.norm_cov[:1])
-       return self.ivectors
+       return ivectors
    def score_cosine(self, use_wccn=True):
        """
......
# -*- coding: utf-8 -*-
#
# This file is part of s4d.
#
# s4d is a python package for speaker diarization.
# Home page: http://www-lium.univ-lemans.fr/s4d/
#
# s4d is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# s4d is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with s4d. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher
"""
import os
import sys
import numpy
import random
import h5py
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset
import logging
from sidekit.nnet.vad_rnn import BLSTM

# Run on GPU when available; this module-level device is used by VAD_RNN below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2020 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
class PreNet(nn.Module):

    def __init__(self):
        super(PreNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1,
                               out_channels=64,
                               kernel_size=200,
                               stride=1,
                               padding=0,
                               dilation=1,
                               groups=1,
                               bias=True,
                               padding_mode='zeros')

    def forward(self, input):
        output = self.conv1(input)
        return output
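
# Illustrative usage sketch (assumption, not part of the commit): PreNet maps a
# raw waveform tensor of shape (batch, 1, samples) to 64 learned channels.
# With kernel_size=200 and stride=1, one second of 16 kHz audio gives
# 16000 - 200 + 1 = 15801 output frames:
#
#     net = PreNet()
#     waveform = torch.randn(8, 1, 16000)   # batch of 8 one-second signals
#     feats = net(waveform)                 # torch.Size([8, 64, 15801])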
class preprocessingBLSTM(nn.Module):
    """
    Bidirectional LSTM model used for voice activity detection or speaker turn detection
    """
    def __init__(self,
                 input_size,
                 lstm_1,
                 lstm_2,
                 linear_1,
                 linear_2,
                 output_size=1):
        """
        :param input_size: dimension of the input features
        :param lstm_1: output dimension of the first BLSTM layer
        :param lstm_2: output dimension of the second BLSTM layer
        :param linear_1: output dimension of the first linear layer
        :param linear_2: output dimension of the second linear layer
        :param output_size: dimension of the output, 1 by default
        """
        super(preprocessingBLSTM, self).__init__()
        self.lstm_1 = nn.LSTM(input_size, lstm_1 // 2, bidirectional=True, batch_first=True)
        self.lstm_2 = nn.LSTM(lstm_1, lstm_2 // 2, bidirectional=True, batch_first=True)
        self.linear_1 = nn.Linear(lstm_2, linear_1)
        self.linear_2 = nn.Linear(linear_1, linear_2)
        self.output = nn.Linear(linear_2, output_size)
        self.hidden = None

    def forward(self, inputs):
        """
        :param inputs: batch-first tensor of shape (batch, frames, input_size)
        :return: per-frame scores of shape (batch, frames, output_size)
        """
        if self.hidden is None:
            hidden_1, hidden_2 = None, None
        else:
            hidden_1, hidden_2 = self.hidden
        tmp, hidden_1 = self.lstm_1(inputs, hidden_1)
        x, hidden_2 = self.lstm_2(tmp, hidden_2)
        self.hidden = (hidden_1, hidden_2)
        x = torch.tanh(self.linear_1(x))
        x = torch.tanh(self.linear_2(x))
        x = torch.sigmoid(self.output(x))
        return x
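
# Illustrative shape check (assumption, not part of the commit): with
# batch-first input of shape (batch, frames, input_size), each bidirectional
# layer concatenates two directions of size lstm_x // 2, and the sigmoid head
# returns one score per frame:
#
#     net = preprocessingBLSTM(input_size=24, lstm_1=64, lstm_2=40,
#                              linear_1=40, linear_2=10)
#     x = torch.randn(8, 200, 24)
#     y = net(x)    # torch.Size([8, 200, 1]), values in (0, 1)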
class SeqToSeq(nn.Module):

    def __init__(self):
        super(SeqToSeq, self).__init__()
        self.model = BLSTM(input_size=1,
                           lstm_1=64,
                           lstm_2=40,
                           linear_1=40,
                           linear_2=10)
class VAD_RNN:
    """
    A VAD_RNN is meant to use a PyTorch RNN model for Speech Activity Detection
    """
    def __init__(self, input_size, duration, step, batch_size, model_file_name=None):
        """
        :param input_size: size of the input features
        :param duration: duration in seconds of each batch of features
        :param step: duration in seconds of each step between two batches
        :param batch_size: batch size
        :param model_file_name: optional PyTorch model to load. If None, the default model is used.
            The default model is made of two BLSTM layers of dimension 64 and 40
            followed by two linear layers of dimension 40 and 10.
        """
        self.input_size = input_size
        self.duration = int(duration * 100)
        self.step = int(step * 100)
        self.batch_size = batch_size

        self.model = BLSTM(input_size=self.input_size,
                           lstm_1=64,
                           lstm_2=40,
                           linear_1=40,
                           linear_2=10)
        if model_file_name is not None:
            # Load pre-trained weights into the default architecture
            self.model.load_state_dict(torch.load(model_file_name))
        self.model.to(device)
    def _fit_batch(self, optimizer, criterion, x, y):
        """
        Internal method used to train the network

        :param optimizer:
        :param criterion:
        :param x:
        :param y:
        :return: loss of current batch
        """
        x = x.to(device)
        y = y.to(device)
        self.model.hidden = None
        optimizer.zero_grad()
        lstm_out = self.model(x)
        loss = criterion(lstm_out, y)
        loss.backward()
        optimizer.step()
        return float(loss.data)
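
    # Illustrative single step (assumption, not part of the commit): BCELoss
    # expects targets with the same shape as the sigmoid output, here
    # (batch, frames, 1) with values in {0., 1.}:
    #
    #     vad = VAD_RNN(input_size=24, duration=2., step=0.5, batch_size=8)
    #     optimizer = optim.RMSprop(vad.model.parameters())
    #     criterion = nn.BCELoss()
    #     x = torch.randn(8, 200, 24)                    # 200 frames = 2 s at 100 frames/s
    #     y = torch.randint(0, 2, (8, 200, 1)).float()
    #     loss = vad._fit_batch(optimizer, criterion, x, y)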
    def get_scores(self, show, features_server, score_file_format=''):
        """
        Computes the scores for one show from the output of the network

        :param show: the show to compute scores for
        :param features_server: a sidekit FeaturesServer object
        :param score_file_format: optional, used to save or load a score file
        :return: scores of the show, as an array of 0..1
        """
        if score_file_format == '':
            score_fn = ''
        else:
            score_fn = score_file_format.format(show)
        if os.path.exists(score_fn):
            logging.warning("Loading existing scores")
            return numpy.load(score_fn)

        features, _ = features_server.load(show)

        # Cut the show into overlapping windows and pad the last batch
        x = []
        for i in range(0, len(features) - self.duration, self.step):
            x.append(features[i:i + self.duration])
            if i + self.step > len(features) - self.duration:
                pad_size = self.batch_size - len(x)
                pad = [[[0] * self.input_size] * self.duration] * pad_size
                x += pad

        x = torch.Tensor(x).to(device)
        self.model.hidden = None
        x = self.model(x)
        o = numpy.asarray(x.squeeze(2).tolist())

        # Average the overlapping windows: accumulate outputs and coverage counts
        scores = numpy.zeros((len(o) * self.step + self.duration - self.step))
        w = numpy.zeros(scores.shape)
        start = 0
        for i, out in enumerate(o):
            scores[start:start + self.duration] += out
            w[start:start + self.duration] += 1
            start += self.step
        scores = scores / w
        scores = scores[:len(features)]
        if score_fn != '':
            numpy.save(score_fn, scores)
        return scores
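
    # How the overlap averaging works (illustrative): with duration=200 frames
    # and step=50 frames, each frame is covered by up to 200 / 50 = 4 sliding
    # windows; `scores` accumulates the window outputs, `w` counts the
    # coverage, and dividing the two averages the predictions per frame.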
    def train_network(self,
                      nb_epochs,
                      training_set,
                      model_file_format):
        """
        Trains the network

        :param nb_epochs: number of epochs to do
        :param training_set: iterable of (X, Y) batches to feed the training algorithm
        :param model_file_format: file format to save the model. The format uses the current epoch
        """
        criterion = nn.BCELoss()
        optimizer = optim.RMSprop(self.model.parameters())
        losses = []
        for epoch in range(nb_epochs):
            it = 1
            losses.append([])
            for batch_idx, (X, Y) in enumerate(training_set):
                batch_loss = self._fit_batch(optimizer, criterion, X, Y)
                losses[epoch].append(batch_loss)
                logging.critical("Epoch {}/{}, loss {:.5f}".format(
                    epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
                it += 1
            torch.save(self.model.state_dict(), model_file_format.format(epoch + 1))
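
    # Illustrative training call (assumption, not part of the commit): any
    # iterable of (X, Y) batches works, and one checkpoint is written per
    # epoch through model_file_format:
    #
    #     vad.train_network(nb_epochs=10,
    #                       training_set=loader,            # yields (X, Y) pairs
    #                       model_file_format="model_ep{}.pt")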
    def vad_blstm(self, show, features_server, onset=0.8, offset=0.95, scores_fn=''):
        """
        Get the VAD labels for one show

        :param show: show to generate the SAD from
        :param features_server: a sidekit FeaturesServer object
        :param onset: score threshold above which a segment starts
        :param offset: score threshold below which a segment stops
        :param scores_fn: optional file name to save the scores
        :return: a numpy array of frame labels, 1 for speech and 0 for non-speech
        """
        scores = self.get_scores(show, features_server, scores_fn)

        label = numpy.zeros(len(scores))

        start = 0
        segment = False
        for i, s in enumerate(scores):
            if not segment and s > onset:  # speech segment begins
                start = i
                segment = True
            if segment and s < offset:  # speech segment ends
                segment = False
                label[start:i] = 1
        if segment:  # a segment is still open at the end of the show
            label[start:] = 1

        return label
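
    # Worked example (illustrative): with onset=0.8 and offset=0.95, the
    # scores [0.1, 0.85, 0.97, 0.96, 0.5, 0.2] open a segment at index 1
    # (0.85 > 0.8) and close it at index 4 (0.5 < 0.95), giving the labels
    # [0, 1, 1, 1, 0, 0].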
    def write_vad(self, show_list, features_server, onset, offset, vad_file_format, scores_file_format=''):
        """
        Generates the SAD segment files from the trained model

        :param show_list: list of shows to generate the SAD from
        :param features_server: a sidekit FeaturesServer object
        :param onset: score threshold above which a segment will start
        :param offset: score threshold below which a segment will stop
        :param vad_file_format: file format for the segments
        :param scores_file_format: optional, used to save scores files
        """
        for show in sorted(show_list):
            scores = self.get_scores(show, features_server, scores_file_format)

            sad = []
            start = 0
            segment = False
            for i, s in enumerate(scores):
                if not segment and s > onset:
                    start = i
                    segment = True
                if segment and s < offset:
                    segment = False
                    sad.append([show, start, i])
            if segment or len(sad) == 0:
                sad.append([show, start, i])

            with open(vad_file_format.format(show), 'w') as f:
                for seg in sad:
                    f.write("{} 1 {} {} U U U speech\n".format(seg[0], seg[1], seg[2]))
# -*- coding: utf-8 -*-
#
# This file is part of s4d.
#
# s4d is a python package for speaker diarization.
# Home page: http://www-lium.univ-lemans.fr/s4d/
#
# s4d is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# s4d is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with s4d. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2020 Anthony Larcher
"""
__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2015-2020 Anthony Larcher and Sylvain Meignier"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
import numpy
import scipy
import sidekit
import torch
from ..diar import Diar
from pathlib import Path
from torch.utils.data import Dataset
def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """
    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in terms of samples
    :param win_shift: shift of the sliding window in terms of samples
    :param context: tuple of left and right context
    :param pad: can be 'zeros' or 'edge'
    """
    dsize = sig.dtype.itemsize
    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]
    # Manage padding
    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
    _win_size = win_size + sum(context)
    shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
    strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
    # Apply the requested padding before creating the strided view
    if pad == 'zeros':
        sig = numpy.pad(sig, c, 'constant', constant_values=(0,))
    elif pad == 'edge':
        sig = numpy.pad(sig, c, 'edge')
    return numpy.lib.stride_tricks.as_strided(sig,
                                              shape=shape,
                                              strides=strides).squeeze()
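
# Illustrative call (assumption, not part of the commit): a mono signal of
# 1000 samples framed with win_size=100 and win_shift=50 yields
# (1000 - 100) / 50 + 1 = 19 overlapping frames:
#
#     sig = numpy.arange(1000, dtype=numpy.float32)
#     frames = framing(sig, win_size=100, win_shift=50)   # shape (19, 100)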
def load_wav_segment(wav_file_name, idx, duration, seg_shift, framerate=16000):
    """
    :param wav_file_name: name of the wav file to read
    :param idx: index of the segment to return
    :param duration: duration of the segments in seconds
    :param seg_shift: shift between segments in seconds
    :param framerate: sampling frequency in Hz
    :return: the segment waveform and the total duration of the show in samples
    """
    # Load waveform
    signal = sidekit.frontend.io.read_audio(wav_file_name, framerate)[0]
    tmp = framing(signal,
                  int(framerate * duration),
                  win_shift=int(framerate * seg_shift),
                  context=(0, 0),
                  pad='zeros')
    return tmp[idx], signal.shape[0]
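
# Illustrative call (assumption, not part of the commit; the file name is
# hypothetical): segment 3 of a show cut into 2-second windows shifted by
# 0.25 s:
#
#     data, total = load_wav_segment("show.wav", idx=3, duration=2., seg_shift=0.25)
#     # data.shape == (32000,) at 16 kHz; total is the show length in samples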
def mdtm_to_label(mdtm_filename,
                  show_duration,
                  framerate):
    """
    :param mdtm_filename: name of the MDTM file to read
    :param show_duration: duration of the show in samples
    :param framerate: sampling frequency in Hz
    :return: a numpy array of speaker indices, one per sample
    """
    diarization = Diar.read_mdtm(mdtm_filename)
    diarization.sort(['show', 'start'])

    # Create a dictionary of speakers
    speaker_set = diarization.unique('cluster')
    speaker_dict = {}
    for idx, spk in enumerate(speaker_set):
        speaker_dict[spk] = idx

    # Create the empty labels
    label = numpy.zeros(show_duration, dtype=int)

    # Fill the labels with spk_idx; MDTM boundaries are in centiseconds
    for segment in diarization:
        start = int(segment['start'] * framerate / 100.)
        stop = int(segment['stop'] * framerate / 100.)
        spk_idx = speaker_dict[segment['cluster']]
        label[start:stop] = spk_idx
    return label
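
# Worked example (illustrative): MDTM start/stop fields are in centiseconds,
# so at framerate=16000 a segment with start=100 and stop=250 labels samples
# 16000 to 40000 with its speaker index.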
def get_segment_label(label, seg_idx, mode, duration, framerate, seg_shift, collar_duration, filter_type="gate"):

    # Create labels with Diracs at every speaker change detection
    spk_change = numpy.zeros(label.shape, dtype=int)
    spk_change[:-1] = label[:-1] ^ label[1:]
    spk_change = numpy.not_equal(spk_change, numpy.zeros(label.shape, dtype=int))

    # Depending on the mode, generate the labels and select the segments
    if mode == "vad":
        output_label = (label > 0.5).astype(numpy.long)

    elif mode == "spk_turn":
        # Apply convolution to replace diracs with a chosen shape (gate or triangle)
        filter_sample = int(collar_duration * framerate * 2 + 1)
        conv_filt = numpy.ones(filter_sample)
        if filter_type == "triangle":
            conv_filt = scipy.signal.triang(filter_sample)
        output_label = numpy.convolve(conv_filt, spk_change, mode='same')

    elif mode == "overlap":
        raise NotImplementedError()

    else:
        raise ValueError("mode parameter must be 'vad', 'spk_turn' or 'overlap'")

    # Create segments with overlap
    segment_label = framing(output_label,
                            int(framerate * duration),
                            win_shift=int(framerate * seg_shift),
                            context=(0, 0),
                            pad='zeros')

    return segment_label[seg_idx]
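
# Illustrative collar width (assumption, not part of the commit): in
# "spk_turn" mode with collar_duration=0.1 and framerate=16000, each change
# point is widened by a filter of int(0.1 * 16000 * 2 + 1) = 3201 samples;
# "gate" keeps it rectangular while "triangle" tapers it linearly.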
class AlliesSet(Dataset):
    """
    Dataset of overlapping waveform segments extracted from the ALLIES data
    """
    def __init__(self,
                 allies_dir,
                 mode,
                 duration=2.,
                 seg_shift=0.25,
                 filter_type="gate",
                 collar_duration=0.1,
                 framerate=16000):
        """
        Create batches of waveform samples for deep neural network training

        :param allies_dir: the root directory of ALLIES data
        :param mode: can be "vad", "spk_turn", "overlap"
        :param duration: duration of the segments in seconds
        :param seg_shift: shift to generate overlapping segments
        :param filter_type: shape used to widen speaker changes, "gate" or "triangle"
        :param collar_duration: duration of the collar around speaker changes in seconds
        :param framerate: sampling frequency in Hz
        """
        self.framerate = framerate
        self.show_duration = {}
        self.segments = []
        self.duration = duration
        self.seg_shift = seg_shift
        self.input_dir = allies_dir
        self.mode = mode
        self.filter_type = filter_type
        self.collar_duration = collar_duration

        self.wav_name_format = allies_dir + '/wav/{}.wav'
        self.mdtm_name_format = allies_dir + '/mdtm/{}.mdtm'

        # Load the list of training file names
        training_file_list = [str(f).split("/")[-1].split('.')[0]
                              for f in list(Path(allies_dir + "/wav/").rglob("*.[wW][aA][vV]"))]

        for show in training_file_list:
            # Load waveform
            signal = sidekit.frontend.io.read_audio(self.wav_name_format.format(show), self.framerate)[0]

            # Get speaker labels from MDTM
            label = mdtm_to_label(self.mdtm_name_format.format(show), signal.shape[0], self.framerate)

            # Create labels with Diracs at every speaker change detection
            spk_change = numpy.zeros(signal.shape, dtype=int)
            spk_change[:-1] = label[:-1] ^ label[1:]
            spk_change = numpy.not_equal(spk_change, numpy.zeros(signal.shape, dtype=int))

            # Create short segments with overlap
            tmp = framing(spk_change,
                          int(self.framerate * duration),
                          win_shift=int(self.framerate * seg_shift),
                          context=(0, 0),
                          pad='zeros')

            # Select only segments with at least one speaker change
            keep_seg = numpy.not_equal(tmp.sum(1), 0)
            keep_idx = numpy.argwhere(keep_seg.squeeze()).squeeze()
            for idx in keep_idx:
                self.segments.append((show, idx))

        self.len = len(self.segments)

    def __getitem__(self, index):
        show, idx = self.segments[index]
        data, total_duration = load_wav_segment(self.wav_name_format.format(show),
                                                idx, self.duration, self.seg_shift, framerate=self.framerate)
        tmp_label = mdtm_to_label(self.mdtm_name_format.format(show), total_duration, self.framerate)
        label = get_segment_label(tmp_label, idx, self.mode, self.duration, self.framerate,
                                  self.seg_shift, self.collar_duration, filter_type=self.filter_type)

        return torch.from_numpy(data).type(torch.FloatTensor), torch.from_numpy(label.astype('long'))

    def __len__(self):
        return self.len
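
# Illustrative usage sketch (assumption, not part of the commit; the data
# directory is hypothetical): the dataset plugs into a standard DataLoader.
#
#     from torch.utils.data import DataLoader
#     dataset = AlliesSet("/path/to/allies", mode="spk_turn")
#     loader = DataLoader(dataset, batch_size=8, shuffle=True)
#     for data, label in loader:
#         pass    # data: FloatTensor segments, label: per-sample targets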