Commit 3476f72e authored by Anthony Larcher's avatar Anthony Larcher
Browse files

data augmentation

parent d1f90b71
......@@ -160,7 +160,7 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__version__="1.2.7"
__version__="1.2.9"
# __all__ = ["io",
# "vad",
......
......@@ -100,7 +100,7 @@
<span class="n">__email__</span> <span class="o">=</span> <span class="s2">&quot;anthony.larcher@univ-lemans.fr&quot;</span>
<span class="n">__status__</span> <span class="o">=</span> <span class="s2">&quot;Production&quot;</span>
<span class="n">__docformat__</span> <span class="o">=</span> <span class="s1">&#39;reStructuredText&#39;</span>
<span class="c1">#comment</span>
<div class="viewcode-block" id="FeaturesServer"><a class="viewcode-back" href="../api/featuresserver.html#features_server.FeaturesServer">[docs]</a><span class="k">class</span> <span class="nc">FeaturesServer</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
......@@ -733,7 +733,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -1206,7 +1206,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -1355,7 +1355,7 @@
<span class="k">if</span> <span class="s2">&quot;compression&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">fh</span><span class="p">:</span>
<span class="n">fh</span><span class="o">.</span><span class="n">create_dataset</span><span class="p">(</span><span class="s1">&#39;compression&#39;</span><span class="p">,</span> <span class="n">data</span><span class="o">=</span><span class="n">compression_type</span><span class="p">[</span><span class="n">compression</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span><span class="p">(</span><span class="n">fh</span><span class="p">[</span><span class="s1">&#39;compression&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="n">compression_type</span><span class="p">[</span><span class="n">compression</span><span class="p">])</span>
<span class="k">assert</span><span class="p">(</span><span class="n">fh</span><span class="p">[</span><span class="s1">&#39;compression&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="n">compression_type</span><span class="p">[</span><span class="n">compression</span><span class="p">])</span>
<span class="k">if</span> <span class="n">compression</span> <span class="o">==</span> <span class="s1">&#39;none&#39;</span><span class="p">:</span>
<span class="n">_write_show</span><span class="p">(</span><span class="n">show</span><span class="p">,</span>
......@@ -1492,7 +1492,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -303,7 +303,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -523,7 +523,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -1983,7 +1983,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
This diff is collapsed.
......@@ -137,7 +137,7 @@ acoustic features. The HDF5 format is the prefered serialization format in <stro
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -152,7 +152,7 @@
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2014-16, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
&#169; Copyright 2014-18, Anthony LARCHER &amp; Sylvain MEIGNIER &amp; Kong Aik LEE.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.9.
</div>
</body>
......
......@@ -52,14 +52,14 @@ master_doc = 'index'
# General information about the project.
project = u'SIDEKIT'
copyright = u'2014-16, Anthony LARCHER & Sylvain MEIGNIER & Kong Aik LEE'
copyright = u'2014-18, Anthony LARCHER & Sylvain MEIGNIER & Kong Aik LEE'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.1.6'
version = '1.2.9'
# The full version, including alpha/beta/rc tags.
......@@ -278,7 +278,7 @@ texinfo_documents = [
epub_title = u'SIDEKIT'
epub_author = u'Anthony LARCHER, Sylvain MEIGNIER & Kong Aik LEE'
epub_publisher = u'Anthony LARCHER, Sylvain MEIGNIER & Kong Aik LEE'
epub_copyright = u'2014-16, Anthony LARCHER, Sylvain MEIGNIER & Kong Aik LEE'
epub_copyright = u'2014-18, Anthony LARCHER, Sylvain MEIGNIER & Kong Aik LEE'
# The basename for the epub file. It defaults to the project name.
# epub_basename = u'SIDEKIT'
......
......@@ -6,7 +6,7 @@
.. |logo| image:: logo_lium.png
Welcome to SIDEKIT 1.2 documentation!
Welcome to SIDEKIT 1.2.9 documentation!
=======================================
| **SIDEKIT** is an open source package for Speaker and Language recognition.
......@@ -19,11 +19,11 @@ Welcome to SIDEKIT 1.2 documentation!
Kong Aik Lee \&
Sylvain Meignier
:Version: 1.2 of 2017/02/09
:Version: 1.2.9 of 2018/12/17
.. seealso::
News for **SIDEKIT** 1.2:
News for **SIDEKIT** 1.2.9:
- new ``sidekit_mpi`` module that allows parallel computing on several nodes (cluster)
MPI implementations are provided for GMM EM algorithm, TotalVariability matrix EM estimation
......
......@@ -32,6 +32,7 @@ import logging
import numpy
import os
from scipy.signal import lfilter
from sidekit import PARAM_TYPE
from sidekit.frontend.features import mfcc, plp
from sidekit.frontend.io import read_audio, read_label, write_hdf5
......@@ -49,6 +50,134 @@ __status__ = "Production"
__docformat__ = 'reStructuredText'
def _rms_energy(x):
return 10*numpy.log10((1e-12 + x.dot(x))/len(x))
def _add_noise(signal, noise_file_name, snr, sample_rate):
    """
    Add a randomly chosen excerpt of a noise recording to a signal at a given SNR.

    :param signal: numpy array of samples to corrupt
    :param noise_file_name: name of the audio file the noise is read from
    :param snr: target signal-to-noise ratio in dB
    :param sample_rate: sampling rate passed to read_audio when loading the noise
    :return: the signal plus the rescaled noise excerpt
    :raises ValueError: if the noise file is empty while the signal is not
    """
    # Open noise file; the sampling rate returned by read_audio is not needed here
    noise, _ = read_audio(noise_file_name, sample_rate)

    # Repeat the noise until it is strictly longer than the signal
    if len(noise) < len(signal):
        if len(noise) == 0:
            raise ValueError("Noise file {} contains no samples".format(noise_file_name))
        dup_factor = len(signal) // len(noise) + 1
        noise = numpy.tile(noise, dup_factor)

    # Generate random section of masker
    if len(noise) != len(signal):
        # randint excludes its upper bound: the + 1 makes the last valid offset reachable
        idx = numpy.random.randint(0, len(noise) - len(signal) + 1)
        noise = noise[idx:idx + len(signal)]

    # Compute energy of both signals (in dB)
    N_dB = _rms_energy(noise)
    S_dB = _rms_energy(signal)

    # Rescale the noise so that its level is snr dB below the level of the signal
    N_new = S_dB - snr
    noise_scaled = 10 ** (N_new / 20) * noise / 10 ** (N_dB / 20)

    return signal + noise_scaled
def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
    """
    Search between two (count, threshold) points for the count whose distance
    to its threshold equals the margin, within a tolerance.

    :param upcount: count value of the upper point
    :param lwcount: count value of the lower point
    :param upthr: threshold value of the upper point
    :param lwthr: threshold value of the lower point
    :param margin: target difference between count and threshold
    :param tol: accepted absolute error on that difference; it is enlarged by 10% at
        every pass once more than 20 passes have been made, so the search always ends
    :return: the count value retained by the search
    """
    # One of the two end points may already satisfy the margin
    if abs(upcount - upthr - margin) < tol:
        return upcount
    if abs(lwcount - lwthr - margin) < tol:
        return lwcount

    # Start from the middle point and move it halfway towards one end at each pass
    midcount = (upcount + lwcount) / 2
    midthr = (upthr + lwthr) / 2
    error = midcount - midthr - margin
    passes = 1
    while abs(error) > tol:
        passes += 1
        if passes > 20:
            tol *= 1.1
        if error > tol:
            midcount, midthr = (upcount + midcount) / 2, (upthr + midthr) / 2
        elif error < -tol:
            midcount, midthr = (lwcount + midcount) / 2, (lwthr + midthr) / 2
        error = midcount - midthr - margin
    return midcount
def asl_meter(x, fs, nbits=16):
    """
    Measure the Active Speech Level (ASL) of x following ITU-T P.56.

    If x has an integer dtype, it is first scaled to (-1, 1) according to nbits.

    :param x: numpy array of samples (processed sample by sample, so expected to be 1-D)
    :param fs: sampling frequency in Hz
    :param nbits: number of bits per sample; nbits - 1 thresholds are evaluated
    :return: the active speech level in dB, or -100 when no threshold meets the margin
    """
    if numpy.issubdtype(x.dtype, numpy.integer):
        x = x / 2**(nbits-1)

    # Constants
    T = 0.03  # Time constant of smoothing in seconds
    g = numpy.exp(-1/(T*fs))
    H = 0.20  # Hangover time in seconds
    I = int(numpy.ceil(H*fs))  # Hangover time in samples
    M = 15.9  # Margin between threshold and ASL in dB

    n_thr = nbits - 1
    a = numpy.zeros(n_thr)  # Activity count of each threshold
    c = 0.5**numpy.arange(n_thr, 0, step=-1)  # Threshold levels, in increasing order
    h = numpy.ones(n_thr)*I  # Hangover counts, initialised to "hangover elapsed"
    c_dB = 20*numpy.log10(c)

    asl = -100
    sq = numpy.sum(x**2)  # Total energy of the signal

    # Two-stage exponential smoothing of the rectified signal gives the envelope q;
    # a sample counts as active for a threshold while q reaches it or during the hangover
    p = 0
    q = 0
    for i in range(len(x)):
        p = g * p + (1-g) * abs(x[i])
        q = g * q + (1-g) * p

        for j in range(n_thr):
            if q >= c[j]:
                a[j] += 1
                h[j] = 0
            elif h[j] < I:
                a[j] += 1
                h[j] += 1

    # Active level of each threshold in dB (-100 when the threshold was never active)
    a_dB = -100 * numpy.ones(n_thr)
    active = a != 0
    a_dB[active] = 10*numpy.log10(sq/a[active])

    # Keep the first threshold whose active level is within the margin
    delta = a_dB - c_dB
    idx = numpy.where(delta <= M)[0]

    if len(idx) != 0:
        idx = idx[0]
        # NOTE(review): with 0-based indices "idx > 0" would already have a valid
        # predecessor to interpolate with; "idx > 1" is kept as found -- confirm intent
        if idx > 1:
            asl = bin_interp(a_dB[idx], a_dB[idx-1], c_dB[idx], c_dB[idx-1], M)
        else:
            asl = a_dB[idx]

    return asl
def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0):
    """
    Add reverberation (convolutive noise) to a speech signal.

    The signal is filtered with the content of the reverb file, then rescaled so
    that the level measured by asl_meter is brought to reverb_level.

    :param signal: numpy array of samples to process
    :param reverb_file_name: name of the audio file used as filter coefficients
    :param sample_rate: sampling rate passed to read_audio and to asl_meter
    :param reverb_level: target level of the output in dB
    :return: the filtered and rescaled signal
    """
    impulse_response, _ = read_audio(reverb_file_name, sample_rate)
    reverberated = lfilter(impulse_response, 1, signal)

    # Divide by the measured level, then apply the target level
    measured_level = asl_meter(reverberated, sample_rate)
    return reverberated / 10 ** (measured_level / 20) * 10 ** (reverb_level / 20)
class FeaturesExtractor(object):
"""
A FeaturesExtractor process an audio file in SPHERE, WAVE or RAW PCM format and extract filter-banks,
......@@ -183,7 +312,11 @@ class FeaturesExtractor(object):
def extract(self, show, channel,
input_audio_filename=None,
output_feature_filename=None,
backing_store=False):
backing_store=False,
noise_file_name=None,
snr=10,
reverb_file_name=None,
reverb_level=-26.):
"""
Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features
for a single channel from a given audio file.
......@@ -218,6 +351,15 @@ class FeaturesExtractor(object):
if signal.ndim == 1:
signal = signal[:, numpy.newaxis]
# Add noise and reverberation to the signal if required
if noise_file_name is not None:
signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sample_rate)
if reverb_file_name is not None:
signal[:, channel] = _add_reverb(signal[:, channel], reverb_file_name, sample_rate, reverb_level=-26.0)
#add_reverb(signal, reverb_file_name, fs, sample_rate, reverb_level=-26.0, )
# Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required
length, chan = signal.shape
......@@ -330,7 +472,15 @@ class FeaturesExtractor(object):
return h5f
def save(self, show, channel=0, input_audio_filename=None, output_feature_filename=None):
def save(self,
show,
channel=0,
input_audio_filename=None,
output_feature_filename=None,
noise_file_name=None,
snr=10,
reverb_file_name=None,
reverb_level=-26.):
"""
Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features
for a single channel from a given audio file and save them to disk in a HDF5 format
......@@ -342,7 +492,15 @@ class FeaturesExtractor(object):
:return:
"""
# Load the cepstral coefficients, energy, filter-banks, bnf and vad labels
h5f = self.extract(show, channel, input_audio_filename, output_feature_filename, backing_store=True)
h5f = self.extract(show,
channel,
input_audio_filename,
output_feature_filename,
backing_store=True,
noise_file_name=noise_file_name,
snr=snr,
reverb_file_name=reverb_file_name,
reverb_level=reverb_level)
logging.info(h5f.filename)
# Write the hdf5 file to disk
......@@ -549,6 +707,10 @@ class FeaturesExtractor(object):
channel_list,
audio_file_list=None,
feature_file_list=None,
noise_file_list=None,
snr_list=None,
reverb_file_list=None,
reverb_levels=None,
num_thread=1):
"""
Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features
......@@ -575,6 +737,25 @@ class FeaturesExtractor(object):
audio_file_list = numpy.empty(int(max_length), dtype='|O')
if feature_file_list is None:
feature_file_list = numpy.empty(int(max_length), dtype='|O')
for show, channel, audio_file, feature_file in zip(show_list, channel_list, audio_file_list, feature_file_list):
self.save(show, channel, audio_file, feature_file)
if noise_file_list is None:
noise_file_list = numpy.empty(int(max_length), dtype='|O')
snr_list = numpy.empty(int(max_length), dtype='|O')
elif snr_list is None:
snr_list = numpy.full(int(max_length), 5.)
if reverb_file_list is None:
reverb_file_list = numpy.empty(int(max_length), dtype='|O')
reverb_levels = numpy.empty(int(max_length), dtype='|O')
elif reverb_levels is None:
reverb_levels = numpy.full(int(max_length), -26.)
for show, channel, audio_file, feature_file, noise_file, snr, reverb_file, reverb_level in zip(show_list,
channel_list,
audio_file_list,
feature_file_list,
noise_file_list,
snr_list,
reverb_file_list,
reverb_levels):
self.save(show, channel, audio_file, feature_file, noise_file, snr, reverb_file, reverb_level)
......@@ -46,7 +46,7 @@ __maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
#comment
class FeaturesServer(object):
"""
......
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
"""
Copyright 2014-2018 Yevhenii Prokopalo, Anthony Larcher
The authors would like to thank the BUT Speech@FIT group (http://speech.fit.vutbr.cz) and Lukas BURGET
for sharing the source code that strongly inspired this module. Thank you for your valuable contribution.
"""
import copy
import ctypes
import h5py
import logging
import multiprocessing
import numpy
import os
import random
import torch
from torch.autograd import Variable
import json
import subprocess
import resource
import scipy.linalg as la
def GetListOfFiles(MainFolder):
    """
    Recursively list every entry below a folder that is not itself a directory.

    :param MainFolder: path of the folder to explore
    :return: list of paths, each one built by joining MainFolder with the relative
        location of the entry; entries appear in the order given by os.listdir
    """
    collected = []
    for entry in os.listdir(MainFolder):
        full_path = os.path.join(MainFolder, entry)
        if os.path.isdir(full_path):
            # Descend into the sub-folder and keep everything found there
            collected.extend(GetListOfFiles(full_path))
        else:
            collected.append(full_path)
    return collected
def get_gpu_memory_map():
    """Get the current gpu usage.

    The values are obtained by running ``nvidia-smi`` and parsing one integer
    per line of its output.

    Returns
    -------
    usage: dict
        Keys are device ids as integers.
        Values are memory usage as integers in MB.
    """
    command = ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader']
    raw_output = subprocess.check_output(command, encoding='utf-8')

    # One line per device; the position of the line is used as the device id
    used_per_device = [int(line) for line in raw_output.strip().split('\n')]
    return dict(enumerate(used_per_device))
class Xtractor(torch.nn.Module):
"""
"""
def __init__(self):
"""
"""
super(Xtractor, self).__init__()
self.conv0 = torch.nn.Conv1d(20, 512, 5)
self.conv1 = torch.nn.Conv1d(512, 512, 3, dilation=2)
self.conv2 = torch.nn.Conv1d(512, 512, 3, dilation=3)
self.conv3 = torch.nn.Conv1d(512, 512, 1)
self.conv4 = torch.nn.Conv1d(512, 1500, 1)
self.lin1 = torch.nn.Linear(3000, 512)
self.lin2 = torch.nn.Linear(512, 512)
self.lin3 = torch.nn.Linear(512, 1951)
self.norm1 = torch.nn.BatchNorm1d(512)
self.norm2 = torch.nn.BatchNorm1d(512)
self.norm3 = torch.nn.BatchNorm1d(512)
self.norm4 = torch.nn.BatchNorm1d(512)
self.norm5 = torch.nn.BatchNorm1d(1500)
self.norm7 = torch.nn.BatchNorm1d(512)
self.pooling = torch.nn.AvgPool1d(186)
self.rel = torch.nn.Softplus()
def init_weights(self, SpeakersCount):
"""
:param weights:
:param bias:
:return:
"""
self.conv0.weight.data = torch.FloatTensor(numpy.random.rand(512, 20, 5) - 0.5)
self.conv1.weight.data = torch.FloatTensor(numpy.random.rand(512, 512, 3) - 0.5)
self.conv2.weight.data = torch.FloatTensor(numpy.random.rand(512, 512, 3) - 0.5)
self.conv3.weight.data = torch.FloatTensor(numpy.random.rand(512, 512, 1) - 0.5)
self.conv4.weight.data = torch.FloatTensor(numpy.random.rand(1500, 512, 1) - 0.5)
self.lin1.weight.data = torch.FloatTensor(numpy.random.rand(512, 3000) - 0.5)
self.lin2.weight.data = torch.FloatTensor(numpy.random.rand(512, 512) - 0.5)
self.lin3.weight.data = torch.FloatTensor(numpy.random.rand(SpeakersCount, 512) - 0.5)
self.conv0.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.conv1.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.conv2.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.conv3.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.conv4.bias.data = torch.FloatTensor(numpy.random.rand(1500) - 0.5)
self.lin1.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.lin2.bias.data = torch.FloatTensor(numpy.random.rand(512) - 0.5)
self.lin3.bias.data = torch.FloatTensor(numpy.random.rand(SpeakersCount) - 0.5)
def forward(self, x):
"""
:param x:
:return:
"""
l1out = self.rel(self.conv0(x))
l1norm = self.norm1(l1out)
l1param = {"mean": self.norm1.running_mean.detach().cpu().numpy(),
"var": self.norm1.running_var.detach().cpu().numpy(),
"gamma": self.norm1.weight.detach().cpu().numpy(),
"beta": self.norm1.bias.detach().cpu().numpy(), "eps": self.norm1.eps}
l2out = self.rel(self.conv1(l1norm))
l2norm = self.norm2(l2out)
l2param = {"mean": self.norm2.running_mean.detach().cpu().numpy(),
"var": self.norm2.running_var.detach().cpu().numpy(),
"gamma": self.norm2.weight.detach().cpu().numpy(),
"beta": self.norm2.bias.detach().cpu().numpy(), "eps": self.norm2.eps}
l3out = self.rel(self.conv2(l2norm))
l3norm = self.norm3(l3out)
l3param = {"mean": self.norm3.running_mean.detach().cpu().numpy(),
"var": self.norm3.running_var.detach().cpu().numpy(),
"gamma": self.norm3.weight.detach().cpu().numpy(),
"beta": self.norm3.bias.detach().cpu().numpy(), "eps": self.norm3.eps}
l4out = self.rel(self.conv3(l3norm))
l4norm = self.norm4(l4out)
l4param = {"mean": self.norm4.running_mean.detach().cpu().numpy(),
"var": self.norm4.running_var.detach().cpu().numpy(),
"gamma": self.norm4.weight.detach().cpu().numpy(),
"beta": self.norm4.bias.detach().cpu().numpy(), "eps": self.norm4.eps}
l5out = self.rel(self.conv4(l4norm))
l5norm = self.norm5(l5out)
l5param = {"mean": self.norm5.running_mean.detach().cpu().numpy(),
"var": self.norm5.running_var.detach().cpu().numpy(),
"gamma": self.norm5.weight.detach().cpu().numpy(),
"beta": self.norm5.bias.detach().cpu().numpy(), "eps": self.norm5.eps}
mean = torch.mean(l5norm, dim=2)
std = torch.std(l5norm, dim=2)
l6inp = torch.cat([mean, std], dim=1)
l6out = self.rel(self.lin1(l6inp))
l7out = self.rel(self.lin2(l6out))
l7norm = self.norm7(l7out)