Commit 35f3a0a0 authored by Pierre Champion's avatar Pierre Champion
Browse files

🎉 Installation + bin + egs structure + kaldi example

parent d0270c20
Pipeline #643 canceled with stages
feerci
anonymization_metrics
s3prl
*.egg-info
.done-*
env.sh
venv
Miniconda3-latest-Linux-x86_64.sh
Miniconda3-py38_4.9.2-Linux-x86_64.sh
trials
utt2spk
cosine_score.txt
*.ark
*.scp
*.wav
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# sidekit
SIDEKIT
=======
SIDEKIT is an open source package for Speaker and Language recognition.
Authors: Anthony Larcher & .....
## Installation
```sh
git clone https://git-lium.univ-lemans.fr/speaker/sidekit
cd sidekit
# you might need to adjust $CUDAROOT in ./install.sh
# to match your cuda config. default /usr/local/cuda
./install.sh
```
## Usage
#### Extract x-vector for kaldi-like wav.scp
```sh
# activate the miniconda venv
. ./env.sh
# create wav.scp to extract (kaldi-like)
cat > ./wav_enrolls.scp <<EOF
374-180298-0000 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.100/0/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
374-180298-0023 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.100/23/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
1487-133273-0003 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.360/3/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
1487-133273-0005 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.360/5/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
EOF
# extract and store the x-vectors in scp,ark files
extract_xvectors.py --model PATH_TO_MODEL \
--wav-scp ./wav_enrolls.scp --out-scp ./x-vector-enrolls.scp # the "--vad" flag can be used to remove non speech
```
#### Compute EER for kaldi-like data
```sh
cat > ./utt2spk <<EOF
374-180298-0000 374
374-180298-0023 374
1487-133273-0003 1487
1487-133273-0005 1487
EOF
cat > ./wav_trials.scp <<EOF
374-180298-0029 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.100/29/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
374-180298-0033 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.100/33/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
1487-133273-0067 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.360/67/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
1487-133273-0044 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/train.360/44/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
2277-149896-0001 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/validation/1/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
2035-147960-0002 curl https://huggingface.co/proxy-datasets-preview/assets/librispeech_asr/--/clean/validation/97/audio/audio.mp3 --output - | ffmpeg -f mp3 -i pipe: -f wav - |
EOF
# extract and store the x-vectors in scp,ark files
extract_xvectors.py --model PATH_TO_MODEL \
--wav-scp ./wav_trials.scp --out-scp ./x-vector-trials.scp # the "--vad" flag can be used to remove non speech
cat > ./trials <<EOF
374 374-180298-0029 target
374 374-180298-0033 target
1487 1487-133273-0067 target
1487 1487-133273-0044 target
374 1487-133273-0067 nontarget
374 1487-133273-0044 nontarget
1487 374-180298-0029 nontarget
1487 374-180298-0033 nontarget
374 2277-149896-0001 nontarget
374 2035-147960-0002 nontarget
1487 2277-149896-0001 nontarget
1487 2035-147960-0002 nontarget
EOF
compute_spk_cosine.py ./trials ./utt2spk x-vector-trials.scp ./x-vector-enrolls.scp cosine_score.txt
compute_metrics.py -k ./trials -s cosine_score.txt
```
#### For Python
```python
import torch
import torchaudio
TODO
```
import os
import sys
_here = os.path.abspath(os.path.dirname(__file__))
_sidekit_install = os.path.dirname(os.path.dirname(_here))
sys.path.insert(0, os.path.join(_sidekit_install, "anonymization_metrics"))
from performance import linkability, draw_scores, cllr, min_cllr
import feerci
import argparse
import pandas
parser = argparse.ArgumentParser(
    description="Computing multiple speaker verification metrics from a kaldi scoring and key file."
)
parser.add_argument(
    "-s", dest="score_file", type=str, nargs=1, required=True, help="path to score file"
)
parser.add_argument(
    "-k", dest="key_file", type=str, nargs=1, required=True, help="path to key file"
)
# NOTE: no nargs=1 on the two options below.  With nargs=1 a user-supplied
# value arrives as a one-element list while the scalar default does not, so
# linkability() was handed inconsistent types depending on whether the flag
# was passed on the command line.
parser.add_argument(
    "--link_omega",
    dest="omega",
    type=float,
    required=False,
    default=1,
    help="linkability prior ratio (default is 1)",
)
parser.add_argument(
    "--link_bins",
    dest="bins",
    type=int,
    required=False,
    default=-1,
    help="#Bins for linkability estimation (default is min(len(matedScores) / 10, 100))",
)
parser.add_argument(
    "--link-d",
    dest="draw_scores",
    action="store_true",
    help="flag: draw the linkability distribution in a figure",
)
parser.add_argument(
    "-o",
    dest="output_file",
    type=str,
    nargs=1,
    required=False,
    help="output path of the png and pdf file (default is linkability_<score_file>)",
)
args = parser.parse_args()

# Pivot the "enroll trial score" triplets into an enroll x trial matrix; the
# key file is pivoted the same way so the two matrices are aligned.
scr = pandas.read_csv(args.score_file[0], sep=" ", header=None).pivot_table(
    index=0, columns=1, values=2
)
key = (
    pandas.read_csv(args.key_file[0], sep=" ", header=None)
    .replace("nontarget", False)
    .replace("target", True)
    .pivot_table(index=0, columns=1, values=2)
)
matedScores = scr.values[key.values == True]
nonMatedScores = scr.values[key.values == False]

# Renamed from `cllr` so the imported cllr() function is not shadowed.
cllr_act = cllr(matedScores, nonMatedScores)
cmin, eer, matedScores_opt, nonMatedScores_opt = min_cllr(
    matedScores, nonMatedScores, compute_eer=True, return_opt=True
)
Dsys, D, bin_centers, bin_edges = linkability(
    matedScores, nonMatedScores, args.omega, args.bins
)
if args.draw_scores:
    output_file = "linkability_" + args.score_file[0]
    if args.output_file is not None:
        output_file = args.output_file[0]
    draw_scores(
        matedScores, nonMatedScores, Dsys, D, bin_centers, bin_edges, output_file
    )
feer, ci_lower, ci_upper, bootstrapped_eers = feerci.feerci(
    nonMatedScores, matedScores, is_sorted=False
)
print(
    "EER_bootci: {:.2f} interval: [{:.2f}, {:.2f}]".format(
        feer * 100, ci_lower * 100, ci_upper * 100
    )
)
print("EER: {:.2f}".format(eer * 100))
print("Cllr (min/act): %f %f" % (cmin, cllr_act))
print("linkability: %f" % (Dsys))
import os
import argparse
import kaldiio
from scipy.spatial.distance import cosine
from sklearn.metrics import roc_curve
import numpy as np
def read_utt2spk_file(utt2spk_file):
    """Parse a kaldi utt2spk file into a dict mapping utterance id -> speaker id.

    Each line holds an utterance id followed by its speaker id; any extra
    whitespace-separated columns are ignored.
    """
    mapping = {}
    with open(utt2spk_file) as fh:
        for raw_line in fh:
            fields = raw_line.strip().split()
            mapping[fields[0]] = fields[1]
    return mapping
def cosine_scoring(embd1s, embd2s):
    """Pairwise cosine similarity between two aligned lists of embeddings.

    scipy's `cosine` returns the cosine *distance*; `1 - distance` converts it
    back to the similarity, so higher scores indicate higher affinity.
    """
    return [1 - cosine(a, b) for a, b in zip(embd1s, embd2s)]
def main(args):
    """Score speaker-verification trials with cosine similarity.

    Reads the kaldi-style trials file, averages (and L2-normalizes) the
    enrollment utterance x-vectors per speaker, scores each trial pair
    against the trial x-vectors and writes a kaldi-style
    "enroll trial score" text file to args.output.
    """
    # `with` so the trials file handle is closed (original leaked it).
    with open(args.trials) as trials_f:
        trials = [line.split() for line in trials_f]
    utt1s = [t[0] for t in trials]  # enrollment speaker ids
    utt2s = [t[1] for t in trials]  # trial utterance ids
    with kaldiio.ReadHelper(f'scp:{args.enroll_scp}') as reader:
        utt2embd_enroll = {utt: embd for utt, embd in reader}
    with kaldiio.ReadHelper(f'scp:{args.trial_scp}') as reader:
        utt2embd_trial = {utt: embd for utt, embd in reader}
    # Average the utterance-level x-vectors into speaker-level x-vectors.
    utt2spk = read_utt2spk_file(args.enroll_utt2spk)
    # Invert utt2spk -> spk2utt.  setdefault/append replaces the original
    # `spk2utt.get(v, []) + [k]`, which rebuilt the list on every utterance
    # (quadratic in utterances per speaker).
    spk2utt = {}
    for utt, spk in utt2spk.items():
        spk2utt.setdefault(spk, []).append(utt)
    spk2embd = {}
    for spk, utts in spk2utt.items():
        mean = np.mean([utt2embd_enroll[utt] for utt in utts], axis=0)
        # L2-normalize the speaker-level embedding.
        spk2embd[spk] = mean / np.linalg.norm(mean, ord=2)
    enroll_embds = [spk2embd[spk] for spk in utt1s]
    trial_embds = [utt2embd_trial[utt] for utt in utt2s]
    scores = cosine_scoring(enroll_embds, trial_embds)
    with open(args.output, "w") as txt_file:
        for enroll, trial, score in zip(utt1s, utt2s, scores):
            txt_file.write(" ".join([enroll, trial, str(score)]) + "\n")
if __name__ == '__main__':
    parser = argparse.ArgumentParser('Speaker Verification Trials/Enroll Cosine Calculation.')
    parser.add_argument('trials')           # kaldi trials file
    parser.add_argument('enroll_utt2spk')   # utt2spk used to average speaker-level x-vectors
    parser.add_argument('trial_scp')        # kaldi scp of trial x-vectors
    parser.add_argument('enroll_scp')       # kaldi scp of enrollment x-vectors
    parser.add_argument('output')           # destination score file
    args = parser.parse_args()
    # Fail fast on missing inputs (same order and messages as before).
    for required in (args.trials, args.enroll_utt2spk, args.enroll_scp, args.trial_scp):
        assert os.path.isfile(required), "NO SUCH FILE: %s" % required
    main(args)
import warnings
warnings.simplefilter("ignore", UserWarning)
import torch
import torchaudio
from sidekit.nnet.xvector import Xtractor
import os
import io
import argparse
import subprocess
import json
import kaldiio
import soundfile
def read_wav_scp(wav_scp):
    """Load a kaldi wav.scp file.

    Args:
        wav_scp: path to the wav.scp file
    Returns:
        dict mapping each utterance id (first column) to the remaining
        columns as a list of tokens (a plain path, or an embedded shell
        pipeline ending with "|")
    """
    utt2wav = {}
    with open(wav_scp) as scp_f:
        for raw_line in scp_f:
            tokens = raw_line.strip().split()
            utt2wav[tokens[0]] = tokens[1:]
    return utt2wav
def prepare(wav):
    """Read one wav.scp entry (kaldi style, possibly an embedded unix command)
    and return the signal as a pytorch tensor like torchaudio.load() would
    (within some tolerance due to numerical precision).

    Example:
        signal, _ = torchaudio.load("XX/1272-128104-0000.flac")
        signalv2 = prepare(['flac', '-c', '-d', '-s', 'XX/1272-128104-0000.flac', "|"])
        signalv3 = prepare(['XX/1272-128104-0000.flac'])

    Args:
        wav: a list containing the scp entry tokens
    Returns:
        (feats_torch, sr): float32 torch.Tensor and the sample rate
    """
    wav = ' '.join(wav)
    if wav.strip().endswith("|"):
        # Entry is a shell pipeline: run it and read the wav from stdout.
        try:
            # subprocess.DEVNULL replaces the original open(os.devnull, 'w'),
            # which leaked a file descriptor on every call.
            wav_read_process = subprocess.Popen(
                wav.strip()[:-1],
                stdout=subprocess.PIPE,
                shell=True,
                stderr=subprocess.DEVNULL,
            )
            sample, sr = soundfile.read(
                io.BytesIO(wav_read_process.communicate()[0]),
            )
        except Exception as e:
            raise IOError("Error processing wav file: {}\n{}".format(wav, e))
    else:
        sample, sr = soundfile.read(wav)
    feats_torch = torch.tensor(sample, dtype=torch.float32, requires_grad=False)
    return feats_torch, sr
def load_model(model_path, device):
    """Instantiate a sidekit Xtractor from a saved checkpoint.

    Args:
        model_path: path to the torch checkpoint file
        device: device string ("cpu", "cuda", ...) the model is mapped onto
    Returns:
        (xtractor, model_config): the model in eval mode and the raw checkpoint dict
    """
    device = torch.device(device)
    model_config = torch.load(model_path, map_location=device)
    archi_opts = model_config["model_archi"]
    # Older checkpoints may predate the configurable embedding size; like the
    # original code, this also records the default back into the config dict.
    archi_opts.setdefault("embedding_size", 256)
    xtractor = Xtractor(
        model_config["speaker_number"],
        model_archi=archi_opts["model_type"],
        loss=archi_opts["loss"]["type"],
        embedding_size=archi_opts["embedding_size"],
    )
    xtractor.load_state_dict(model_config["model_state_dict"], strict=True)
    xtractor = xtractor.to(device)
    xtractor.eval()
    return xtractor, model_config
@torch.no_grad()
def main(xtractor, kaldi_wav_scp, out_file, device, vad, num_samples_per_window, min_silence_samples, model_sample_rate):
    """Extract an x-vector for every entry of a kaldi wav.scp into ark/scp files.

    Args:
        xtractor: sidekit Xtractor model, already in eval mode
        kaldi_wav_scp: path to the kaldi-style wav.scp listing the utterances
        out_file: path of the output .scp file; a matching .ark is written next to it
        device: device string for inference ("cpu", "cuda", ...)
        vad: if True, run silero VAD and keep only speech chunks before extraction
        num_samples_per_window: VAD window size in samples (silero parameter)
        min_silence_samples: minimum silence between speech chunks in samples (silero parameter)
        model_sample_rate: sample rate the x-vector model expects; input is resampled to it
    """
    device = torch.device(device)
    utt2wav = read_wav_scp(kaldi_wav_scp)
    # The .ark is written next to the .scp with the same basename (absolute path).
    out_ark = os.path.realpath(os.path.join(os.path.dirname(out_file), os.path.splitext(os.path.basename(out_file))[0]))
    if vad:
        torch.set_num_threads(1)  # faster vad on cpu
        torch.backends.quantized.engine = 'qnnpack'  # compatibility
        # Pinned silero-vad revision fetched through torch.hub.
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad:a345715',
                                      model='silero_vad_mini')
        (get_speech_ts,
         get_speech_ts_adaptive,
         _,
         read_audio,
         _,
         _,
         collect_chunks) = utils
        # VAD timestamps are cached to a JSON next to the scp so re-runs skip the VAD pass.
        vad_cache = os.path.splitext(out_file)[0] + "_vad.json"
        cache_speech_timestamps = {}
        if os.path.isfile(vad_cache):
            with open(vad_cache, 'r') as vad_file:
                cache_speech_timestamps = json.load(vad_file)
    with kaldiio.WriteHelper(f'ark,scp:{out_ark}.ark,{os.path.realpath(out_file)}') as writer:
        for key, wav in utt2wav.items():
            signal, sr = prepare(wav)
            if vad:
                signal_for_vad = signal
                if sr != 16000:  # SR for vad (silero expects 16 kHz)
                    signal_for_vad = torchaudio.transforms.Resample(orig_freq=sr,
                                                                    new_freq=16000)(signal)
                    sr = 16000
                if key in cache_speech_timestamps:
                    speech_timestamps = cache_speech_timestamps[key]
                else:
                    # VAD runs on the channel-averaged signal.
                    speech_timestamps = get_speech_ts_adaptive(signal_for_vad.mean(dim=0, keepdim=True),
                                                               model,
                                                               step=500,
                                                               num_samples_per_window=num_samples_per_window,
                                                               min_silence_samples=min_silence_samples)
                if len(speech_timestamps) == 0:
                    # No speech detected: fall back to keeping the whole signal.
                    speech_timestamps = [{"start": 0, "end": len(signal)}]
                signal = collect_chunks(speech_timestamps, signal_for_vad)
                cache_speech_timestamps[key] = speech_timestamps
            signal = signal.to(device)
            if sr != model_sample_rate:
                signal = torchaudio.transforms.Resample(orig_freq=sr,
                                                        new_freq=model_sample_rate)(signal)
            _, vec = xtractor(signal, is_eval=True)
            writer(key, vec.detach().cpu().numpy())
    # Persist the VAD cache only on a first run (an existing cache is never overwritten).
    if vad and not os.path.isfile(vad_cache):
        with open(vad_cache, "w") as vad_file:
            json.dump(cache_speech_timestamps, vad_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Extract the x-vectors given a sidekit model")
    parser.add_argument("--model", type=str, help="SideKit model", required=True)
    parser.add_argument("--sample-rate", type=int, help="Must match SideKit SR model", default=16000)
    parser.add_argument("--vad", default=False, action='store_true', help="Apply vad before extracting the x-vector")
    parser.add_argument("--vad-num-samples-per-window", type=int, default=2000, help="Number of samples in each window, (2000 -> 125ms) per window. Check https://github.com/snakers4/silero-vad for more info")
    parser.add_argument("--vad-min-silence-samples", type=int, default=1500, help="Minimum silence duration in samples between two separate speech chunks, (1500). Check https://github.com/snakers4/silero-vad for more info")
    parser.add_argument("--wav-scp", type=str, required=True)
    parser.add_argument("--out-scp", type=str, required=True)
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", type=str, help="The device (cpu or cuda:0) to run the inference")
    args = parser.parse_args()
    assert os.path.isfile(args.model), "NO SUCH FILE: %s" % args.model
    assert os.path.isfile(args.wav_scp), "NO SUCH FILE: %s" % args.wav_scp
    # os.path.dirname() returns "" for a bare filename, which made the original
    # isdir() check fail for e.g. --out-scp out.scp; treat "" as the cwd.
    out_dir = os.path.dirname(args.out_scp) or "."
    assert os.path.isdir(out_dir), "NO SUCH DIRECTORY: %s" % args.out_scp
    # If a cuda device is requested, check that cuda is actually available.
    args.device = args.device.strip().lower()
    if args.device == "cuda":
        assert torch.cuda.is_available(), "CUDA is not available, check configuration or run on cpu (--device cpu)"
    xtractor, model_config = load_model(args.model, args.device)
    main(xtractor, args.wav_scp, args.out_scp, args.device, args.vad, args.vad_num_samples_per_window, args.vad_min_silence_samples, args.sample_rate)