Commit 95406297 authored by Sulfyderz's avatar Sulfyderz
Browse files

[Update]:Updating the README.md file for Pypi.

parent 3467349f
PREREQUISITES
=============
*Sidekit for Diarization* requires the following software installed for your platform:
1. [Python](http://www.python.org)
2. [NumPy](http://www.numpy.org/)
3. [Scipy](http://www.scipy.org/)
4. [Pandas](http://pandas.pydata.org/)
5. [GLPK](https://www.gnu.org/software/glpk/)
6. [Sphinx 1.1.0 or newer](http://sphinx-doc.org/) to build the documentation
INSTALLATION
============
We recommend the use of a virtual environment (e.g. [Miniconda](https://conda.io/miniconda.html) or [Virtualenv](https://virtualenv.readthedocs.io/en/latest/)).
TUTORIALS
=========
Once your installation is complete, you can take a look at the [tutorials](https://git-lium.univ-lemans.fr/Meignier/s4d/tree/master/tutorials).
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Train model for Diarization Train model for Diarization
==== ====
This script trains UBM, TV and PLDA models for a diarization system. This script trains UBM, TV and PLDA models for a diarization system.
Initialization Initialization
--- ---
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
%matplotlib inline %matplotlib inline
from s4d.diar import Diar from s4d.diar import Diar
from s4d.utils import * from s4d.utils import *
from sidekit import Mixture, FactorAnalyser, StatServer, IdMap from sidekit import Mixture, FactorAnalyser, StatServer, IdMap
import numpy import numpy
import logging import logging
import re import re
import sidekit import sidekit
from sidekit.sidekit_io import * from sidekit.sidekit_io import *
try: try:
from sortedcontainers import SortedDict as dict from sortedcontainers import SortedDict as dict
except ImportError: except ImportError:
pass pass
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
init_logging(level=logging.INFO)
num_thread = 4

# Audio file name pattern: `{}` is replaced by the show name.
audio_dir = '../data/train/{}.wav'

# --- UBM training ---
ubm_seg_fn = './data/seg/ubm_ester.seg'
nb_gauss = 1024  # number of Gaussians in the UBM
mfcc_ubm_fn = './data/mfcc/ubm.h5'
ubm_idmap_fn = './data/mfcc/ubm_idmap.txt'
ubm_fn = './data/model/ester_ubm_' + str(nb_gauss) + '.h5'

# --- Total Variability matrix training ---
tv_seg_fn = './data/seg/train.tv.seg'
rank_tv = 300    # i-vector dimension
it_max_tv = 10   # EM iterations for TV training
mfcc_tv_fn = './data/mfcc/tv.h5'
tv_idmap_fn = './data/mfcc/tv_idmap.h5'
tv_stat_fn = './data/model/tv.stat.h5'
tv_fn = './data/model/tv_' + str(rank_tv) + '.h5'

# --- PLDA training ---
plda_seg_fn = './data/seg/train.plda.seg'
rank_plda = 150  # PLDA eigenvoice rank
it_max_plda = 10
mfcc_plda_fn = './data/mfcc/norm_plda.h5'
plda_idmap_fn = './data/mfcc/plda_idmap.h5'
plda_fn = './data/model/plda_' + str(rank_tv) + '_' + str(rank_plda) + '.h5'

# --- Normalization and merged-model outputs ---
norm_stat_fn = './data/model/norm.stat.h5'
norm_fn = './data/model/norm.h5'
norm_iv_fn = './data/model/norm.iv.h5'
matrices_fn = './data/model/matrices.h5'
model_fn = './data/model/ester_model_{}_{}_{}.h5'.format(nb_gauss, rank_tv, rank_plda)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 1: UBM Step 1: UBM
--- ---
Extract MFCC for the UBM Extract MFCC for the UBM
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Extract MFCC features for all UBM training segments and save the id map.
logging.info('Computing MFCC for UBM')
diar_ubm = Diar.read_seg(ubm_seg_fn, normalize_cluster=True)
fe = get_feature_extractor(audio_dir, 'sid')
# keep_all=False: only frames covered by the segmentation are stored.
ubm_idmap = fe.save_multispeakers(diar_ubm.id_map(), output_feature_filename=mfcc_ubm_fn, keep_all=False)
ubm_idmap.write_txt(ubm_idmap_fn)
``` ```
%% Output
2018-09-26 11:50:10,393 - INFO - Computing MFCC for UBM
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-11-b9765c5346e8> in <module>()
2 diar_ubm = Diar.read_seg(ubm_seg_fn, normalize_cluster=True)
3 fe = get_feature_extractor(audio_dir, 'sid')
----> 4 ubm_idmap = fe.save_multispeakers(diar_ubm.id_map(), output_feature_filename=mfcc_ubm_fn, keep_all=False)
5 ubm_idmap.write_txt(ubm_idmap_fn)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/features_extractor.py in save_multispeakers(self, idmap, channel, input_audio_filename, output_feature_filename, keep_all, skip_existing_file)
460 # logging.info('tmp file name: '+temp_file_name)
461 self.vad = None
--> 462 h5f = self.extract(show, channel, input_audio_filename, backing_store=False)
463 energy = h5f.get(show + '/energy').value
464 label = h5f.get(show + '/vad').value
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/features_extractor.py in extract(self, show, channel, input_audio_filename, output_feature_filename, backing_store)
215
216 # Open audio file, get the signal and possibly the sampling frequency
--> 217 signal, sample_rate = read_audio(audio_filename, self.sampling_frequency)
218 if signal.ndim == 1:
219 signal = signal[:, numpy.newaxis]
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/frontend/io.py in read_audio(input_file_name, framerate)
420 sig, read_framerate, sampwidth = read_sph(input_file_name, 'p')
421 elif ext.lower() == '.wav' or ext.lower() == '.wave':
--> 422 sig, read_framerate, sampwidth = read_wav(input_file_name)
423 elif ext.lower() == '.pcm' or ext.lower() == '.raw':
424 sig, read_framerate, sampwidth = read_pcm(input_file_name)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/frontend/io.py in read_wav(input_file_name)
117 :return:
118 """
--> 119 with wave.open(input_file_name, "r") as wfh:
120 (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
121 raw = wfh.readframes(nframes * nchannels)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/wave.py in open(f, mode)
497 mode = 'rb'
498 if mode in ('r', 'rb'):
--> 499 return Wave_read(f)
500 elif mode in ('w', 'wb'):
501 return Wave_write(f)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/wave.py in __init__(self, f)
157 self._i_opened_the_file = None
158 if isinstance(f, str):
--> 159 f = builtins.open(f, 'rb')
160 self._i_opened_the_file = f
161 # else, assume it is an open file object already
FileNotFoundError: [Errno 2] No such file or directory: '../data/train/19981207_0700_0800_inter_fm_dga.wav'
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Train the UBM by EM Train the UBM by EM
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Train the UBM by EM with binary splitting up to nb_gauss Gaussians.
ubm_idmap = IdMap.read_txt(ubm_idmap_fn)
fs = get_feature_server(mfcc_ubm_fn, 'sid')
spk_lst = ubm_idmap.rightids
ubm = Mixture()
# One entry per split level: number of EM iterations at that model size.
ubm.EM_split(fs, spk_lst, nb_gauss,
             iterations=(1, 2, 2, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8),
             num_thread=num_thread, llk_gain=0.01)
ubm.write(ubm_fn, prefix='ubm/')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 2: TV Step 2: TV
--- ---
Extract MFCC for TV Extract MFCC for TV
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Extract MFCC features for the Total Variability training segments.
logging.info('Computing MFCC for TV')
diar_tv = Diar.read_seg(tv_seg_fn, normalize_cluster=True)
fe = get_feature_extractor(audio_dir, 'sid')
tv_idmap = fe.save_multispeakers(diar_tv.id_map(), output_feature_filename=mfcc_tv_fn, keep_all=False)
tv_idmap.write(tv_idmap_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Train a Total Variability model using the FactorAnalyser class Train a Total Variability model using the FactorAnalyser class
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Train the Total Variability matrix with the FactorAnalyser class.
tv_idmap = IdMap.read(tv_idmap_fn)
ubm = Mixture()
ubm.read(ubm_fn, prefix='ubm/')
fs = get_feature_server(mfcc_tv_fn, 'sid')
# For TV training each segment is its own "speaker": copy rightids to leftids.
tv_idmap.leftids = numpy.copy(tv_idmap.rightids)
tv_stat = StatServer(tv_idmap, ubm.get_distrib_nb(), ubm.dim())
tv_stat.accumulate_stat(ubm=ubm, feature_server=fs,
                        seg_indices=range(tv_stat.segset.shape[0]), num_thread=num_thread)
tv_stat.write(tv_stat_fn)
fa = FactorAnalyser()
fa.total_variability(tv_stat_fn, ubm, rank_tv, nb_iter=it_max_tv, batch_size=1000, num_thread=num_thread)
write_tv_hdf5([fa.F, fa.mean, fa.Sigma], tv_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 3: PLDA Step 3: PLDA
--- ---
Extract the MFCC for the PLDA Extract the MFCC for the PLDA
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Extract MFCC features for the PLDA training segments.
logging.info('Computing MFCC for PLDA')
diar_plda = Diar.read_seg(plda_seg_fn, normalize_cluster=True)
fe = get_feature_extractor(audio_dir, 'sid')
plda_idmap = fe.save_multispeakers(diar_plda.id_map(), output_feature_filename=mfcc_plda_fn, keep_all=False)
plda_idmap.write(plda_idmap_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Accumulate statistics Accumulate statistics
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Accumulate zero- and first-order statistics for PLDA / normalization data.
plda_idmap = IdMap.read(plda_idmap_fn)
ubm = Mixture()
ubm.read(ubm_fn, prefix='ubm/')
tv, tv_mean, tv_sigma = read_tv_hdf5(tv_fn)
fs = get_feature_server(mfcc_plda_fn, 'sid')
plda_norm_stat = StatServer(plda_idmap, ubm.get_distrib_nb(), ubm.dim())
plda_norm_stat.accumulate_stat(ubm=ubm, feature_server=fs,
                               seg_indices=range(plda_norm_stat.segset.shape[0]),
                               num_thread=num_thread)
plda_norm_stat.write(norm_stat_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Extract i-vectors and compute norm Extract i-vectors and compute norm
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Extract i-vectors and compute the spherical-norm statistics.
fa = FactorAnalyser(F=tv, mean=tv_mean, Sigma=tv_sigma)
norm_iv = fa.extract_ivectors(ubm, norm_stat_fn, num_thread=num_thread)
norm_iv.write(norm_iv_fn)
# One iteration of spherical nuisance normalization.
norm_mean, norm_cov = norm_iv.estimate_spectral_norm_stat1(1, 'sphNorm')
write_norm_hdf5([norm_mean, norm_cov], norm_fn)
norm_iv.spectral_norm_stat1(norm_mean[:1], norm_cov[:1])
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Train the PLDA model Train the PLDA model
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Train the PLDA model on the normalized i-vectors and save it.
fa = FactorAnalyser()
fa.plda(norm_iv, rank_plda, nb_iter=it_max_plda)
# Empty (rank_tv, 0) matrix: no eigenchannel component in this PLDA.
write_plda_hdf5([fa.mean, fa.F, numpy.zeros((rank_tv, 0)), fa.Sigma], plda_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 4: Compute additional data (optional) Step 4: Compute additional data (optional)
--- ---
Adding matrices for additional scoring methods: Adding matrices for additional scoring methods:
* Mahalonobis matrix * Mahalonobis matrix
* Lower Choleski decomposition of the WCCN matrix * Lower Choleski decomposition of the WCCN matrix
* Within- and Between-class Covariance matrices * Within- and Between-class Covariance matrices
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Compute optional scoring matrices (Mahalanobis, WCCN Cholesky,
# within/between-class covariances) from the normalized i-vectors.
iv = StatServer(norm_iv_fn)
matrix_dict = {}
logging.info('compute mahalanobis_matrix')
mahalanobis_matrix = iv.get_mahalanobis_matrix_stat1()
matrix_dict['mahalanobis_matrix'] = mahalanobis_matrix
logging.info('compute wccn_choleski')
wccn_choleski = iv.get_wccn_choleski_stat1()
matrix_dict['wccn_choleski'] = wccn_choleski
logging.info('compute two_covariance')
within_covariance = iv.get_within_covariance_stat1()
matrix_dict['two_covariance/within_covariance'] = within_covariance
between_covariance = iv.get_between_covariance_stat1()
matrix_dict['two_covariance/between_covariance'] = between_covariance
write_dict_hdf5(matrix_dict, matrices_fn)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 5: Merge in one model Step 5: Merge in one model
--- ---
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Merge all trained model files into a single HDF5 model, skipping any that
# were not produced. NOTE(review): relies on `h5py` and `os` being available
# from the wildcard imports above — confirm (sidekit.sidekit_io imports both).
with h5py.File(model_fn, 'w') as model:
    for fn in [ubm_fn, tv_fn, norm_fn, plda_fn, matrices_fn]:
        if not os.path.exists(fn):
            continue
        with h5py.File(fn, 'r') as fh:
            for group in fh:
                logging.info(group)
                fh.copy(group, model)  # copy each top-level group into the merged file
``` ```
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Diarization for ASR Diarization for ASR
=================== ===================
This script performs a BIC diarization (usually for ASR decoding).
The proposed diarization system was inspired by the The proposed diarization system was inspired by the
system [1] which won the RT'04 fall evaluation system [1] which won the RT'04 fall evaluation
and the ESTER1 evaluation. It was developed during the ESTER2 and the ESTER1 evaluation. It was developed during the ESTER2
evaluation campaign for the transcription with the goal of minimizing evaluation campaign for the transcription with the goal of minimizing
word error rate. word error rate.
Automatic transcription requires accurate segment boundaries. Segment Automatic transcription requires accurate segment boundaries. Segment
boundaries have to be set within non-informative zones such as filler boundaries have to be set within non-informative zones such as filler
words. words.
Speaker diarization needs to produce homogeneous speech segments; Speaker diarization needs to produce homogeneous speech segments;
however, purity and coverage of the speaker clusters are the main however, purity and coverage of the speaker clusters are the main
objectives here. Errors such as having two distinct clusters (i.e., objectives here. Errors such as having two distinct clusters (i.e.,
detected speakers) corresponding to the same real speaker, or detected speakers) corresponding to the same real speaker, or
conversely, merging segments of two real speakers into only one cluster, conversely, merging segments of two real speakers into only one cluster,
get heavier penalty in the NIST time-based diarization metric than get heavier penalty in the NIST time-based diarization metric than
misplaced boundaries. misplaced boundaries.
The system is composed of acoustic BIC segmentation followed with BIC The system is composed of acoustic BIC segmentation followed with BIC
hierarchical clustering. Viterbi decoding is performed to adjust the hierarchical clustering. Viterbi decoding is performed to adjust the
segment boundaries. segment boundaries.
Music and jingle regions are not removed but a speech activity Music and jingle regions are not removed but a speech activity
diarization could be load before to segment and cluster the show. diarization could be load before to segment and cluster the show.
Optionally, long segments are cut to be shorter than 20 seconds. Optionally, long segments are cut to be shorter than 20 seconds.
[1] C. Barras, X. Zhu, S. Meignier, and J. L. Gauvain, “Multistage speaker diarization of broadcast news,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 14, no. 5, pp. 1505–1512, Sep. 2006. [1] C. Barras, X. Zhu, S. Meignier, and J. L. Gauvain, “Multistage speaker diarization of broadcast news,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 14, no. 5, pp. 1505–1512, Sep. 2006.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
%matplotlib inline %matplotlib inline
__license__ = "LGPL" __license__ = "LGPL"
__author__ = "Sylvain Meignier" __author__ = "Sylvain Meignier"
__copyright__ = "Copyright 2015-2016 Sylvain Meignier" __copyright__ = "Copyright 2015-2016 Sylvain Meignier"
__license__ = "LGPL" __license__ = "LGPL"
__maintainer__ = "Sylvain Meignier" __maintainer__ = "Sylvain Meignier"
__email__ = "sidekit@univ-lemans.fr" __email__ = "sidekit@univ-lemans.fr"
__status__ = "Production" __status__ = "Production"
__docformat__ = 'reStructuredText' __docformat__ = 'reStructuredText'
import argparse import argparse
import logging import logging
import matplotlib import matplotlib
import copy import copy
import os import os
from matplotlib import pyplot as plot from matplotlib import pyplot as plot
from s4d.utils import * from s4d.utils import *
from s4d.diar import Diar from s4d.diar import Diar
from s4d import viterbi, segmentation from s4d import viterbi, segmentation
from s4d.clustering import hac_bic from s4d.clustering import hac_bic
from sidekit.sidekit_io import init_logging from sidekit.sidekit_io import init_logging
from s4d.gui.dendrogram import plot_dendrogram from s4d.gui.dendrogram import plot_dendrogram
``` ```
%% Output
/Users/Sulfyderz/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/bosaris/detplot.py:39: UserWarning: matplotlib.pyplot as already been imported, this call will have no effect.
matplotlib.use('PDF')
WARNING:root:WARNNG: libsvm is not installed, please refer to the documentation if you intend to use SVM classifiers
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
BIC diarization BIC diarization
=============== ===============
Arguments, variables and logger Arguments, variables and logger
------------------------------- -------------------------------
Set the logger Set the logger
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Configure the sidekit logger at INFO level.
loglevel = logging.INFO
init_logging(level=loglevel)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Set the input audio or mfcc file and the speech activity detection file (optional). Set the input audio or mfcc file and the speech activity detection file (optional).
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Input audio file and (optional) speech activity detection segmentation.
data_dir = 'data'
show = '20041008_1800_1830_INFO_DGA'
input_show = os.path.join(data_dir, 'audio', show + '.wav')
input_sad = os.path.join(data_dir, 'sad', show + '.sad.seg')
# Set to None to start from a single full-length segment instead of the SAD.
#input_sad = None
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Size of left or right windows (step 2) Size of left or right windows (step 2)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
win_size=250 win_size=250
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Threshold for: Threshold for:
* Linear segmentation (step 3) * Linear segmentation (step 3)
* BIC HAC (step 4) * BIC HAC (step 4)
* Viterbi (step 5) * Viterbi (step 5)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Decision thresholds for the pipeline stages.
thr_l = 2       # linear BIC segmentation (step 3)
thr_h = 3       # BIC HAC clustering (step 4)
thr_vit = -250  # Viterbi inter-state penalty (step 5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
If ``save_all`` is ``True`` then all produced diarization are saved If ``save_all`` is ``True`` then all produced diarization are saved
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
save_all = True save_all = True
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Prepare various variables Prepare various variables
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Working directory for this show's outputs; created if missing.
wdir = os.path.join('out', show)
if not os.path.exists(wdir):
    os.makedirs(wdir)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 1: MFCC Step 1: MFCC
------------- -------------
Extract and load the MFCC Extract and load the MFCC
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 1: extract (or directly serve) MFCC features and load them.
logging.info('Make MFCC')
if save_all:
    # Extract once to an HDF5 file so intermediate features are kept.
    fe = get_feature_extractor(input_show, type_feature_extractor='basic')
    mfcc_filename = os.path.join(wdir, show + '.mfcc.h5')
    fe.save(show, output_feature_filename=mfcc_filename)
    fs = get_feature_server(mfcc_filename, feature_server_type='basic')
else:
    # Serve features straight from the audio file.
    fs = get_feature_server(input_show, feature_server_type='basic')
cep, _ = fs.load(show)
``` ```
%% Output %% Output
2018-06-11 10:46:17,143 - INFO - Make MFCC 2018-06-11 10:46:17,143 - INFO - Make MFCC
2018-06-11 10:46:17,144 - INFO - data/audio ## 20041008_1800_1830_INFO_DGA ## .wav 2018-06-11 10:46:17,144 - INFO - data/audio ## 20041008_1800_1830_INFO_DGA ## .wav
2018-06-11 10:46:17,145 - INFO - -------------------- 2018-06-11 10:46:17,145 - INFO - --------------------
2018-06-11 10:46:17,145 - INFO - show: empty keep_all_features: True 2018-06-11 10:46:17,145 - INFO - show: empty keep_all_features: True
audio_filename_structure: data/audio/20041008_1800_1830_INFO_DGA.wav audio_filename_structure: data/audio/20041008_1800_1830_INFO_DGA.wav
feature_filename_structure: {} feature_filename_structure: {}
pre-emphasis: 0.97 pre-emphasis: 0.97
lower_frequency: 133.3333 higher_frequency: 6855.4976 lower_frequency: 133.3333 higher_frequency: 6855.4976
sampling_frequency: 16000 sampling_frequency: 16000
filter bank: 40 filters of type log filter bank: 40 filters of type log
ceps_number: 13 ceps_number: 13
window_size: 0.025 shift: 0.01 window_size: 0.025 shift: 0.01
vad: None snr: None vad: None snr: None
2018-06-11 10:46:17,146 - INFO - -------------------- 2018-06-11 10:46:17,146 - INFO - --------------------
2018-06-11 10:46:17,147 - INFO - show: empty 2018-06-11 10:46:17,147 - INFO - show: empty
input_feature_filename: empty input_feature_filename: empty
feature_filename_structure: {} feature_filename_structure: {}
Post processing options: Post processing options:
mask: None mask: None
feat_norm: None feat_norm: None
dct_pca: False, dct_pca_config: (12, 12, None) dct_pca: False, dct_pca_config: (12, 12, None)
sdc: False, sdc_config: (1, 3, 7) sdc: False, sdc_config: (1, 3, 7)
delta: False, double_delta: False, delta_filter: [ 0.25 0.5 0.25 0. -0.25 -0.5 -0.25] delta: False, double_delta: False, delta_filter: [ 0.25 0.5 0.25 0. -0.25 -0.5 -0.25]
rasta: False rasta: False
keep_all_features: True keep_all_features: True
2018-06-11 10:46:21,768 - INFO - process part : 0.000000 1822.912125 1822.912125 2018-06-11 10:46:21,768 - INFO - process part : 0.000000 1822.912125 1822.912125
2018-06-11 10:46:24,513 - INFO - no vad 2018-06-11 10:46:24,513 - INFO - no vad
2018-06-11 10:46:24,518 - INFO - !! size of signal cep: 0.000050 len 13 type size 4 2018-06-11 10:46:24,518 - INFO - !! size of signal cep: 0.000050 len 13 type size 4
2018-06-11 10:46:24,602 - INFO - [ True True True ..., True True True] 2018-06-11 10:46:24,602 - INFO - [ True True True ..., True True True]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
cep.shape cep.shape
``` ```
%% Output %% Output
(182289, 14) (182289, 14)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 2: Initialization Step 2: Initialization
------ ------
The initial diarization is loaded from a speech activity detection The initial diarization is loaded from a speech activity detection
diarization (SAD) or a segment is created from the first to the last diarization (SAD) or a segment is created from the first to the last
MFCC feature. MFCC feature.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 2: initial diarization, either from the SAD file (packing gaps
# shorter than 50 frames) or a single segment covering all features.
logging.info('Check initial segmentation')
if input_sad is not None:
    init_diar = Diar.read_seg(input_sad)
    init_diar.pack(50)
else:
    init_diar = segmentation.init_seg(cep, show)
if save_all:
    init_filename = os.path.join(wdir, show + '.i.seg')
    Diar.write_seg(init_filename, init_diar)
``` ```
%% Output %% Output
2018-06-11 10:46:30,818 - INFO - Check initial segmentation 2018-06-11 10:46:30,818 - INFO - Check initial segmentation
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 3: Gaussian Divergence segmentation Step 3: Gaussian Divergence segmentation
---------------------------------------- ----------------------------------------
First segmentation: Segment each segment of ``init_diar`` using the First segmentation: Segment each segment of ``init_diar`` using the
Gaussian Divergence method Gaussian Divergence method
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 3: split each initial segment using the Gaussian Divergence measure.
logging.info('Gaussian Divergence segmentation')
seg_diar = segmentation.segmentation(cep, init_diar, win_size)
if save_all:
    seg_filename = os.path.join(wdir, show + '.s.seg')
    Diar.write_seg(seg_filename, seg_diar)
``` ```
%% Output %% Output
2018-06-11 10:46:30,828 - INFO - Gaussian Divergence segmentation 2018-06-11 10:46:30,828 - INFO - Gaussian Divergence segmentation
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 4: linear BIC segmentation Step 4: linear BIC segmentation
------------------------------- -------------------------------
This segmentation over the signal fuses consecutive segments of the same This segmentation over the signal fuses consecutive segments of the same
speaker from the start to the end of the record. The measure employs the speaker from the start to the end of the record. The measure employs the
$\Delta BIC$ based on Bayesian Information Criterion , using full $\Delta BIC$ based on Bayesian Information Criterion , using full
covariance Gaussians (see class ``gauss.GaussFull``). covariance Gaussians (see class ``gauss.GaussFull``).
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 4a: linear BIC pass fusing consecutive same-speaker segments.
logging.info('Linear BIC, alpha: %f', thr_l)
bicl_diar = segmentation.bic_linear(cep, seg_diar, thr_l, sr=False)
if save_all:
    bicl_filename = os.path.join(wdir, show + '.l.seg')
    Diar.write_seg(bicl_filename, bicl_diar)
``` ```
%% Output %% Output
2018-06-11 10:46:31,246 - INFO - Linear BIC, alpha: 2.000000 2018-06-11 10:46:31,246 - INFO - Linear BIC, alpha: 2.000000
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 5: BIC HAC Step 5: BIC HAC
--------------- ---------------
Perform a BIC HAC Perform a BIC HAC
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 4b: BIC hierarchical agglomerative clustering of the segments.
logging.info('BIC HAC, alpha: %f', thr_h)
bic = hac_bic.HAC_BIC(cep, bicl_diar, thr_h, sr=False)
bich_diar = bic.perform(to_the_end=True)
if save_all:
    bichac_filename = os.path.join(wdir, show + '.h.seg')
    Diar.write_seg(bichac_filename, bich_diar)
link, data = plot_dendrogram(bic.merge, 0, size=(25, 6), log=True)
``` ```
%% Output %% Output
2018-06-11 10:46:31,374 - INFO - BIC HAC, alpha: 3.000000 2018-06-11 10:46:31,374 - INFO - BIC HAC, alpha: 3.000000
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Step 5: re-segmentation Step 5: re-segmentation
----------------------- -----------------------
Viterbi decoding Viterbi decoding
* An HMM is trained: one GMM per speaker; each GMM has 8 components with diagonal covariance matrices. Only a penalty between states
is fixed.
* Emission is computed: the likelihood of each feature
* a Viterbi decoding is performed * a Viterbi decoding is performed
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Step 5: Viterbi re-segmentation to adjust segment boundaries.
logging.info('Viterbi decoding, penalties: %f', thr_vit)
vit_diar = viterbi.viterbi_decoding(cep, bich_diar, thr_vit)
if save_all:
    vit_filename = os.path.join(wdir, show + '.d.seg')
    Diar.write_seg(vit_filename, vit_diar)
``` ```
%% Output %% Output
2018-06-11 10:46:32,506 - INFO - Viterbi decoding, penalties: -250.000000 2018-06-11 10:46:32,506 - INFO - Viterbi decoding, penalties: -250.000000
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Compute the diarization error rate Compute the diarization error rate
---------------------------------- ----------------------------------
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Score the hypothesis diarization against the reference annotation.
from s4d import scoring
from tabulate import tabulate
from s4d.gui.viewer import PlotDiar
from s4d.gui.viewer_utils import *

# Reference annotation (MDTM) and evaluation map (UEM), then index both by show
# so we can pick out the segments of the current recording.
ref = Diar.read_mdtm(os.path.join(data_dir, 'seg', 'ester1.tst.mdtm'))
uem = Diar.read_uem(os.path.join(data_dir, 'seg', 'ester1.tst.uem'))
uem_show = uem.make_index(['show'])
ref_show = ref.make_index(['show'])

# DER with collar=25 (frame units; presumably 10 ms frames => 250 ms collar —
# confirm against the feature extraction step) and overlap regions scored.
der = scoring.compute_der(vit_diar, ref_show[show], uem=uem_show[show], collar=25, no_overlap=False)
tab = scoring.get_header()
tab += der.get_table(show, time=False)
print(tabulate(tab, tablefmt='psql', floatfmt='.2f', headers='firstrow'))

# Visual diff between hypothesis and reference (speaker labels matched first).
diff_diar = diar_diff(vit_diar, ref_show[show], match=True)
p = PlotDiar(diff_diar, size=(25, 6))
p.draw()
```
%% Output
2018-06-11 10:46:41,697 - INFO - append collar
+-----------------------------+--------+------+--------+-------+------+--------+--------+-----------+
| show | type | fa | miss | sns | fa | miss | conf | speaker |
|-----------------------------+--------+------+--------+-------+------+--------+--------+-----------|
| 20041008_1800_1830_INFO_DGA | rate | 0.19 | 0.45 | 0.64 | 0.19 | 1.01 | 12.12 | 13.33 |
+-----------------------------+--------+------+--------+-------+------+--------+--------+-----------+
uem from ref
100