Commit 95406297 authored by Sulfyderz's avatar Sulfyderz
Browse files

[Update]:Updating the README.md file for Pypi.

parent 3467349f
......@@ -11,24 +11,20 @@ PREREQUISITES
=============
*Sidekit for Diarization* requires the following software installed for your platform:
1. [Python](http://www.python.org)
2. [NumPy](http://www.numpy.org/)
3. [Scipy](http://www.scipy.org/)
4. [Pandas](http://pandas.pydata.org/)
5. [GLPK](https://www.gnu.org/software/glpk/)
6. If you want to build the documentation: [Sphinx 1.1.0 or newer](http://sphinx-doc.org/)
6. [Sphinx 1.1.0 or newer](http://sphinx-doc.org/) to build the documentation
INSTALLATION
============
We recommend the use of a virtual environment (e.g. [Miniconda](https://conda.io/miniconda.html) or [Virtualenv](https://virtualenv.readthedocs.io/en/latest/)).
After downloading the project, install the requirements with:
```
pip install -r requirements.txt
```
Then proceed to install s4d:
```
./install.sh
```
Once done, you can take a look at the [tutorials](https://git-lium.univ-lemans.fr/Meignier/s4d/tree/master/tutorials).
\ No newline at end of file
TUTORIALS
=========
Once your installation is complete, you can take a look at the [tutorials](https://git-lium.univ-lemans.fr/Meignier/s4d/tree/master/tutorials).
\ No newline at end of file
%% Cell type:markdown id: tags:
Train model for Diarization
====
This script trains UBM, TV and PLDA models for a diarization system.
Initialization
---
%% Cell type:code id: tags:
``` python
%matplotlib inline
# s4d: diarization data structures and feature helpers.
from s4d.diar import Diar
from s4d.utils import *
# sidekit: GMM/i-vector modelling toolkit used throughout this notebook.
from sidekit import Mixture, FactorAnalyser, StatServer, IdMap
import numpy
import logging
import re
import sidekit
from sidekit.sidekit_io import *
# NOTE(review): this deliberately shadows the builtin ``dict`` with
# SortedDict — presumably so dictionaries created below iterate in sorted
# key order (e.g. the matrices written at the end); confirm the intent.
# If sortedcontainers is missing, the plain builtin dict is kept.
try:
    from sortedcontainers import SortedDict as dict
except ImportError:
    pass
```
%% Cell type:code id: tags:
``` python
# Experiment configuration: logging, input locations, model sizes and the
# output paths for every artefact (UBM, TV, PLDA, normalization, merged model).
init_logging(level=logging.INFO)
num_thread = 4

# Input audio (filled with the show name) and UBM training segmentation.
audio_dir = '../data/train/{}.wav'
ubm_seg_fn = './data/seg/ubm_ester.seg'

# UBM: number of Gaussians, feature cache and id-map/model outputs.
nb_gauss = 1024
mfcc_ubm_fn = './data/mfcc/ubm.h5'
ubm_idmap_fn = './data/mfcc/ubm_idmap.txt'
ubm_fn = f'./data/model/ester_ubm_{nb_gauss}.h5'

# Total Variability: training segments, subspace rank, EM iterations.
tv_seg_fn = './data/seg/train.tv.seg'
rank_tv = 300
it_max_tv = 10
mfcc_tv_fn = './data/mfcc/tv.h5'
tv_idmap_fn = './data/mfcc/tv_idmap.h5'
tv_stat_fn = './data/model/tv.stat.h5'
tv_fn = f'./data/model/tv_{rank_tv}.h5'

# PLDA: training segments, rank, EM iterations.
plda_seg_fn = './data/seg/train.plda.seg'
rank_plda = 150
it_max_plda = 10
mfcc_plda_fn = './data/mfcc/norm_plda.h5'
plda_idmap_fn = './data/mfcc/plda_idmap.h5'
plda_fn = f'./data/model/plda_{rank_tv}_{rank_plda}.h5'

# Normalization statistics / i-vectors and optional extra scoring matrices.
norm_stat_fn = './data/model/norm.stat.h5'
norm_fn = './data/model/norm.h5'
norm_iv_fn = './data/model/norm.iv.h5'
matrices_fn = './data/model/matrices.h5'

# Final merged model file.
model_fn = f'./data/model/ester_model_{nb_gauss}_{rank_tv}_{rank_plda}.h5'
```
%% Cell type:markdown id: tags:
Step 1: UBM
---
Extract MFCC for the UBM
%% Cell type:code id: tags:
``` python
# Extract speaker-id MFCC features for every segment of the UBM training
# diarization and persist the resulting id-map as text.
logging.info('Computing MFCC for UBM')
ubm_diar = Diar.read_seg(ubm_seg_fn, normalize_cluster=True)
extractor = get_feature_extractor(audio_dir, 'sid')
ubm_idmap = extractor.save_multispeakers(
    ubm_diar.id_map(),
    output_feature_filename=mfcc_ubm_fn,
    keep_all=False,
)
ubm_idmap.write_txt(ubm_idmap_fn)
```
%%%% Output: stream
2018-09-26 11:50:10,393 - INFO - Computing MFCC for UBM
%%%% Output: error
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-11-b9765c5346e8> in <module>()
2 diar_ubm = Diar.read_seg(ubm_seg_fn, normalize_cluster=True)
3 fe = get_feature_extractor(audio_dir, 'sid')
----> 4 ubm_idmap = fe.save_multispeakers(diar_ubm.id_map(), output_feature_filename=mfcc_ubm_fn, keep_all=False)
5 ubm_idmap.write_txt(ubm_idmap_fn)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/features_extractor.py in save_multispeakers(self, idmap, channel, input_audio_filename, output_feature_filename, keep_all, skip_existing_file)
460 # logging.info('tmp file name: '+temp_file_name)
461 self.vad = None
--> 462 h5f = self.extract(show, channel, input_audio_filename, backing_store=False)
463 energy = h5f.get(show + '/energy').value
464 label = h5f.get(show + '/vad').value
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/features_extractor.py in extract(self, show, channel, input_audio_filename, output_feature_filename, backing_store)
215
216 # Open audio file, get the signal and possibly the sampling frequency
--> 217 signal, sample_rate = read_audio(audio_filename, self.sampling_frequency)
218 if signal.ndim == 1:
219 signal = signal[:, numpy.newaxis]
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/frontend/io.py in read_audio(input_file_name, framerate)
420 sig, read_framerate, sampwidth = read_sph(input_file_name, 'p')
421 elif ext.lower() == '.wav' or ext.lower() == '.wave':
--> 422 sig, read_framerate, sampwidth = read_wav(input_file_name)
423 elif ext.lower() == '.pcm' or ext.lower() == '.raw':
424 sig, read_framerate, sampwidth = read_pcm(input_file_name)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/frontend/io.py in read_wav(input_file_name)
117 :return:
118 """
--> 119 with wave.open(input_file_name, "r") as wfh:
120 (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
121 raw = wfh.readframes(nframes * nchannels)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/wave.py in open(f, mode)
497 mode = 'rb'
498 if mode in ('r', 'rb'):
--> 499 return Wave_read(f)
500 elif mode in ('w', 'wb'):
501 return Wave_write(f)
~/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/wave.py in __init__(self, f)
157 self._i_opened_the_file = None
158 if isinstance(f, str):
--> 159 f = builtins.open(f, 'rb')
160 self._i_opened_the_file = f
161 # else, assume it is an open file object already
FileNotFoundError: [Errno 2] No such file or directory: '../data/train/19981207_0700_0800_inter_fm_dga.wav'
%% Cell type:markdown id: tags:
Train the UBM by EM
%% Cell type:code id: tags:
``` python
# Train the UBM by split-EM: grow the mixture by successive splits up to
# ``nb_gauss`` components, then store it under the 'ubm/' HDF5 prefix.
ubm_idmap = IdMap.read_txt(ubm_idmap_fn)
feature_server = get_feature_server(mfcc_ubm_fn, 'sid')
speaker_list = ubm_idmap.rightids
ubm = Mixture()
ubm.EM_split(
    feature_server,
    speaker_list,
    nb_gauss,
    # Number of EM iterations after each split stage.
    iterations=(1, 2, 2, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8),
    num_thread=num_thread,
    llk_gain=0.01,
)
ubm.write(ubm_fn, prefix='ubm/')
```
%% Cell type:markdown id: tags:
Step 2: TV
---
Extract MFCC for TV
%% Cell type:code id: tags:
``` python
# Extract speaker-id MFCC features for the total-variability (TV) training
# segments and save the id-map in HDF5 format.
logging.info('Computing MFCC for TV')
tv_diar = Diar.read_seg(tv_seg_fn, normalize_cluster=True)
extractor = get_feature_extractor(audio_dir, 'sid')
tv_idmap = extractor.save_multispeakers(
    tv_diar.id_map(),
    output_feature_filename=mfcc_tv_fn,
    keep_all=False,
)
tv_idmap.write(tv_idmap_fn)
```
%% Cell type:markdown id: tags:
Train a Total Variability model using the FactorAnalyser class
%% Cell type:code id: tags:
``` python
# Train the Total Variability subspace: accumulate zero/first-order GMM
# statistics over the TV feature set, then estimate a rank-``rank_tv``
# matrix with the FactorAnalyser EM.
tv_idmap = IdMap.read(tv_idmap_fn)
ubm = Mixture()
ubm.read(ubm_fn, prefix='ubm/')
fs = get_feature_server(mfcc_tv_fn, 'sid')
# NOTE(review): copying the segment ids into the left column appears to make
# each segment its own "model" for statistics accumulation — confirm against
# the sidekit StatServer conventions.
tv_idmap.leftids = numpy.copy(tv_idmap.rightids)
tv_stat = StatServer(tv_idmap, ubm.get_distrib_nb(), ubm.dim())
tv_stat.accumulate_stat(ubm=ubm, feature_server=fs, seg_indices=range(tv_stat.segset.shape[0]), num_thread=num_thread)
tv_stat.write(tv_stat_fn)
fa = FactorAnalyser()
fa.total_variability(tv_stat_fn, ubm, rank_tv, nb_iter=it_max_tv, batch_size=1000, num_thread=num_thread)
# Persist the TV matrix (F), mean and residual covariance together.
write_tv_hdf5([fa.F, fa.mean, fa.Sigma], tv_fn)
```
%% Cell type:markdown id: tags:
Step 3: PLDA
---
Extract the MFCC for the PLDA
%% Cell type:code id: tags:
``` python
# Extract speaker-id MFCC features for PLDA training and save its id-map.
logging.info('Computing MFCC for PLDA')
plda_diar = Diar.read_seg(plda_seg_fn, normalize_cluster=True)
extractor = get_feature_extractor(audio_dir, 'sid')
plda_idmap = extractor.save_multispeakers(
    plda_diar.id_map(),
    output_feature_filename=mfcc_plda_fn,
    keep_all=False,
)
plda_idmap.write(plda_idmap_fn)
```
%% Cell type:markdown id: tags:
Accumulate statistics
%% Cell type:code id: tags:
``` python
# Accumulate sufficient statistics for the PLDA / normalization training set.
plda_idmap = IdMap.read(plda_idmap_fn)
ubm = Mixture()
ubm.read(ubm_fn, prefix='ubm/')
# tv, tv_mean and tv_sigma are reused by the i-vector extraction cell below.
tv, tv_mean, tv_sigma = read_tv_hdf5(tv_fn)
fs = get_feature_server(mfcc_plda_fn, 'sid')
plda_norm_stat = StatServer(plda_idmap, ubm.get_distrib_nb(), ubm.dim())
plda_norm_stat.accumulate_stat(ubm=ubm, feature_server=fs,
seg_indices=range(plda_norm_stat.segset.shape[0]), num_thread=num_thread)
# Cached so the extraction step can run without recomputing statistics.
plda_norm_stat.write(norm_stat_fn)
```
%% Cell type:markdown id: tags:
Extract i-vectors and compute norm
%% Cell type:code id: tags:
``` python
# Extract i-vectors for the normalization set and estimate spherical
# nuisance normalization parameters (one spectral-norm iteration).
fa = FactorAnalyser(F=tv, mean=tv_mean, Sigma=tv_sigma)
norm_iv = fa.extract_ivectors(ubm, norm_stat_fn, num_thread=num_thread)
norm_iv.write(norm_iv_fn)
norm_mean, norm_cov = norm_iv.estimate_spectral_norm_stat1(1, 'sphNorm')
write_norm_hdf5([norm_mean, norm_cov], norm_fn)
# NOTE(review): the return value is unused, so this call appears to apply
# the first normalization step to norm_iv in place before PLDA training —
# confirm against StatServer.spectral_norm_stat1.
norm_iv.spectral_norm_stat1(norm_mean[:1], norm_cov[:1])
```
%% Cell type:markdown id: tags:
Train the PLDA model
%% Cell type:code id: tags:
``` python
# Fit a rank-``rank_plda`` PLDA model on the normalized i-vectors, then store
# mean, eigenvoice matrix, an empty eigenchannel matrix and the residual
# covariance in one HDF5 file.
plda_trainer = FactorAnalyser()
plda_trainer.plda(norm_iv, rank_plda, nb_iter=it_max_plda)
write_plda_hdf5(
    [plda_trainer.mean, plda_trainer.F, numpy.zeros((rank_tv, 0)), plda_trainer.Sigma],
    plda_fn,
)
```
%% Cell type:markdown id: tags:
Step 4: Compute additional data (optional)
---
Adding matrices for additional scoring methods:
* Mahalonobis matrix
* Lower Choleski decomposition of the WCCN matrix
* Within- and Between-class Covariance matrices
%% Cell type:code id: tags:
``` python
# Compute the optional scoring matrices from the normalization i-vectors
# (Mahalanobis, WCCN Cholesky, within/between covariances) and bundle them
# into a single HDF5 file.
ivectors = StatServer(norm_iv_fn)
matrices = {}
logging.info('compute mahalanobis_matrix')
matrices['mahalanobis_matrix'] = ivectors.get_mahalanobis_matrix_stat1()
logging.info('compute wccn_choleski')
matrices['wccn_choleski'] = ivectors.get_wccn_choleski_stat1()
logging.info('compute two_covariance')
matrices['two_covariance/within_covariance'] = ivectors.get_within_covariance_stat1()
matrices['two_covariance/between_covariance'] = ivectors.get_between_covariance_stat1()
write_dict_hdf5(matrices, matrices_fn)
```
%% Cell type:markdown id: tags:
Step 5: Merge in one model
---
%% Cell type:code id: tags:
``` python
# Merge every available model artefact (UBM, TV, norm, PLDA, extra matrices)
# into one HDF5 model file, copying each top-level group verbatim.
# Fix: ``h5py`` and ``os`` were never imported by name in this notebook
# (they may only have been present via the wildcard imports above), so make
# the dependency explicit to avoid a NameError when run standalone.
import os
import h5py

with h5py.File(model_fn, 'w') as model:
    for fn in [ubm_fn, tv_fn, norm_fn, plda_fn, matrices_fn]:
        # Optional artefacts (e.g. the extra matrices) may be absent.
        if not os.path.exists(fn):
            continue
        with h5py.File(fn, 'r') as fh:
            for group in fh:
                logging.info(group)
                fh.copy(group, model)
```
......
%% Cell type:markdown id: tags:
Diarization for ASR
===================
This script performs a BIC diarization (usually used for ASR decoding)
The proposed diarization system was inspired by the
system [1] which won the RT'04 fall evaluation
and the ESTER1 evaluation. It was developed during the ESTER2
evaluation campaign for the transcription with the goal of minimizing
word error rate.
Automatic transcription requires accurate segment boundaries. Segment
boundaries have to be set within non-informative zones such as filler
words.
Speaker diarization needs to produce homogeneous speech segments;
however, purity and coverage of the speaker clusters are the main
objectives here. Errors such as having two distinct clusters (i.e.,
detected speakers) corresponding to the same real speaker, or
conversely, merging segments of two real speakers into only one cluster,
get heavier penalty in the NIST time-based diarization metric than
misplaced boundaries.
The system is composed of acoustic BIC segmentation followed with BIC
hierarchical clustering. Viterbi decoding is performed to adjust the
segment boundaries.
Music and jingle regions are not removed, but a speech activity
diarization can be loaded beforehand to segment and cluster the show.
Optionally, long segments are cut to be shorter than 20 seconds.
[1] C. Barras, X. Zhu, S. Meignier, and J. L. Gauvain, “Multistage speaker diarization of broadcast news,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 14, no. 5, pp. 1505–1512, Sep. 2006.
%% Cell type:code id: tags:
``` python
%matplotlib inline
__license__ = "LGPL"
__author__ = "Sylvain Meignier"
__copyright__ = "Copyright 2015-2016 Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Sylvain Meignier"
__email__ = "sidekit@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
import argparse
import logging
import matplotlib
import copy
import os
from matplotlib import pyplot as plot
from s4d.utils import *
from s4d.diar import Diar
from s4d import viterbi, segmentation
from s4d.clustering import hac_bic
from sidekit.sidekit_io import init_logging
from s4d.gui.dendrogram import plot_dendrogram
```
%%%% Output: stream
/Users/Sulfyderz/Desktop/Doctorat/Tools/Environments/miniconda/Python3/lib/python3.6/site-packages/sidekit/bosaris/detplot.py:39: UserWarning: matplotlib.pyplot as already been imported, this call will have no effect.
matplotlib.use('PDF')
WARNING:root:WARNNG: libsvm is not installed, please refer to the documentation if you intend to use SVM classifiers
%% Cell type:markdown id: tags:
BIC diarization
===============
Arguments, variables and logger
-------------------------------
Set the logger
%% Cell type:code id: tags:
``` python
# Configure the root logger for the rest of the notebook.
loglevel = logging.INFO
init_logging(level=loglevel)
```
%% Cell type:markdown id: tags:
Set the input audio or mfcc file and the speech activity detection file (optional).
%% Cell type:code id: tags:
``` python
# Input locations: the audio recording to diarize and an optional
# speech-activity-detection (SAD) segmentation for the same show.
data_dir = 'data'
show = '20041008_1800_1830_INFO_DGA'
input_show = os.path.join(data_dir, 'audio', '{}.wav'.format(show))
input_sad = os.path.join(data_dir, 'sad', '{}.sad.seg'.format(show))
# Uncomment to start from a single segment covering all features instead.
#input_sad = None
```
%% Cell type:markdown id: tags:
Size of left or right windows (step 2)
%% Cell type:code id: tags:
``` python
win_size=250
```
%% Cell type:markdown id: tags:
Threshold for:
* Linear segmentation (step 3)
* BIC HAC (step 4)
* Viterbi (step 5)
%% Cell type:code id: tags:
``` python
# Decision thresholds: linear BIC segment fusion (thr_l), BIC HAC stopping
# criterion (thr_h), and Viterbi re-segmentation penalty (thr_vit).
thr_l, thr_h, thr_vit = 2, 3, -250
```
%% Cell type:markdown id: tags:
If ``save_all`` is ``True``, all produced diarizations are saved
%% Cell type:code id: tags:
``` python
save_all = True
```
%% Cell type:markdown id: tags:
Prepare various variables
%% Cell type:code id: tags:
``` python
# Create the per-show working directory for all intermediate outputs.
wdir = os.path.join('out', show)
# exist_ok replaces the original check-then-create pattern, which was
# race-prone and more verbose; behavior is identical when the directory
# already exists.
os.makedirs(wdir, exist_ok=True)
```
%% Cell type:markdown id: tags:
Step 1: MFCC
-------------
Extract and load the MFCC
%% Cell type:code id: tags:
``` python
# Step 1: compute MFCC features.
# When save_all is set, features are extracted once, cached to an HDF5 file
# in the working directory and served from that cache; otherwise they are
# served directly from the audio file.
logging.info('Make MFCC')
if save_all:
    fe = get_feature_extractor(input_show, type_feature_extractor='basic')
    mfcc_filename = os.path.join(wdir, show + '.mfcc.h5')
    fe.save(show, output_feature_filename=mfcc_filename)
    fs = get_feature_server(mfcc_filename, feature_server_type='basic')
else:
    fs = get_feature_server(input_show, feature_server_type='basic')
# cep holds the per-frame cepstral features used by all following steps.
cep, _ = fs.load(show)
```
%%%% Output: stream
2018-06-11 10:46:17,143 - INFO - Make MFCC
2018-06-11 10:46:17,144 - INFO - data/audio ## 20041008_1800_1830_INFO_DGA ## .wav
2018-06-11 10:46:17,145 - INFO - --------------------
2018-06-11 10:46:17,145 - INFO - show: empty keep_all_features: True
audio_filename_structure: data/audio/20041008_1800_1830_INFO_DGA.wav
feature_filename_structure: {}
pre-emphasis: 0.97
lower_frequency: 133.3333 higher_frequency: 6855.4976
sampling_frequency: 16000
filter bank: 40 filters of type log
ceps_number: 13
window_size: 0.025 shift: 0.01
vad: None snr: None
2018-06-11 10:46:17,146 - INFO - --------------------
2018-06-11 10:46:17,147 - INFO - show: empty
input_feature_filename: empty
feature_filename_structure: {}
Post processing options:
mask: None
feat_norm: None
dct_pca: False, dct_pca_config: (12, 12, None)
sdc: False, sdc_config: (1, 3, 7)
delta: False, double_delta: False, delta_filter: [ 0.25 0.5 0.25 0. -0.25 -0.5 -0.25]
rasta: False
keep_all_features: True
2018-06-11 10:46:21,768 - INFO - process part : 0.000000 1822.912125 1822.912125
2018-06-11 10:46:24,513 - INFO - no vad
2018-06-11 10:46:24,518 - INFO - !! size of signal cep: 0.000050 len 13 type size 4
2018-06-11 10:46:24,602 - INFO - [ True True True ..., True True True]
%% Cell type:code id: tags:
``` python
cep.shape
```
%%%% Output: execute_result
(182289, 14)
%% Cell type:markdown id: tags:
Step 2: Initialization
------
The initial diarization is loaded from a speech activity detection
diarization (SAD) or a segment is created from the first to the last
MFCC feature.
%% Cell type:code id: tags:
``` python
# Step 2: initial segmentation.
# Start either from the provided SAD diarization or, when none is given,
# from a single segment spanning all MFCC features.
logging.info('Check initial segmentation')
if input_sad is not None:
    init_diar = Diar.read_seg(input_sad)
    # NOTE(review): pack(50) appears to merge/pack segments with a 50-frame
    # parameter — confirm its exact semantics against Diar.pack.
    init_diar.pack(50)
else:
    init_diar = segmentation.init_seg(cep, show)
if save_all:
    init_filename = os.path.join(wdir, show + '.i.seg')
    Diar.write_seg(init_filename, init_diar)
```
%%%% Output: stream
2018-06-11 10:46:30,818 - INFO - Check initial segmentation
%% Cell type:markdown id: tags:
Step 3: Gaussian Divergence segmentation
----------------------------------------
First segmentation: Segment each segment of ``init_diar`` using the
Gaussian Divergence method
%% Cell type:code id: tags:
``` python
# Step 3: split each initial segment at speaker-change points found with the
# Gaussian-divergence criterion over ``win_size`` windows.
logging.info('Gaussian Divergence segmentation')
seg_diar = segmentation.segmentation(cep, init_diar, win_size)
if save_all:
    seg_filename = os.path.join(wdir, show + '.s.seg')
    Diar.write_seg(seg_filename, seg_diar)
```
%%%% Output: stream
2018-06-11 10:46:30,828 - INFO - Gaussian Divergence segmentation
%% Cell type:markdown id: tags:
Step 4: linear BIC segmentation
-------------------------------
This segmentation over the signal fuses consecutive segments of the same
speaker from the start to the end of the record. The measure employs the
$\Delta BIC$ based on Bayesian Information Criterion , using full
covariance Gaussians (see class ``gauss.GaussFull``).
%% Cell type:code id: tags:
``` python
logging.info('Linear BIC, alpha: %f', thr_l)
bicl_diar = segmentation.bic_linear(cep, seg_diar, thr_l, sr=False)
if save_all:
bicl_filename = os.path.join(wdir, show + '.l.seg')