Commit 5c37ba16 authored by Anthony Larcher

second pass in progress for xvectors

parent 0ff214ca
@@ -27,13 +27,15 @@ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
"""
import copy
import logging
import sys
import numpy
import scipy
from sidekit.bosaris import Ndx
from sidekit.bosaris import Scores
import torch
from sidekit.bosaris import Ndx, Scores
from sidekit.statserver import StatServer
import sys
if sys.version_info.major > 2:
from functools import reduce
@@ -96,7 +98,9 @@ def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
enroll_copy.norm_stat1()
if enroll_copy != test_copy:
test_copy.norm_stat1()
s = numpy.dot(enroll_copy.stat1, test_copy.stat1.transpose())
s_size_in_bytes = enroll_copy.stat1.shape[0] * test_copy.stat1.shape[0] * 4
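# Run the matrix product on GPU only if the float32 score matrix stays under roughly 1 GB, otherwise fall back to CPU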
device = torch.device("cuda:0" if torch.cuda.is_available() and s_size_in_bytes < 1e9 else "cpu")
s = torch.mm(torch.FloatTensor(enroll_copy.stat1).to(device), torch.FloatTensor(test_copy.stat1).to(device).T).cpu().numpy()
score = Scores()
score.scoremat = s
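# A minimal usage sketch, assuming enrollment/test x-vectors and the trial list are stored
# in hypothetical HDF5 files readable by StatServer and Ndx:
#     enroll = StatServer("enroll_xvectors.h5")
#     test = StatServer("test_xvectors.h5")
#     ndx = Ndx("trials.h5")
#     scores = cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True)
#     scores.write("cosine_scores.h5")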
@@ -223,6 +223,7 @@ class ArcLinear(torch.nn.Module):
# project margin differences into cosθj
return self.s * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi))
class ArcMarginProduct(torch.nn.Module):
r"""Implement of large margin arc distance: :
Args:
@@ -420,10 +420,10 @@ class BasicBlock(torch.nn.Module):
)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = torch.nn.functional.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
out = torch.nn.functional.relu(out)
return out
@@ -488,7 +488,7 @@ class ResNet(torch.nn.Module):
return torch.nn.Sequential(*layers)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = torch.nn.functional.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
@@ -534,7 +534,7 @@ class PreResNet34(torch.nn.Module):
return torch.nn.Sequential(*layers)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = torch.nn.functional.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
@@ -122,12 +122,13 @@ class SegSelection(Dataset):
self.len = len(sessions)
def __getitem__(self, index):
current_session = self.sessions.iloc[index]
# Compute possible starts
possible_starts = numpy.arange(0,
int(self.sample_rate * (self.sessions.iloc[index].duration - self.duration)),
int(self.sample_rate * (current_session.duration - self.duration)),
self.sample_number - int(self.sample_rate * self.overlap)
)
possible_starts += int(self.sample_rate * self.sessions.iloc[index].start)
possible_starts += int(self.sample_rate * current_session.start)
# Select max(seg_nb, possible_segments) segments
if self.chunk_per_segment == -1:
@@ -138,13 +139,13 @@ class SegSelection(Dataset):
starts = numpy.random.permutation(possible_starts)[:chunk_nb] / self.sample_rate
# Return the selected chunks as lists:
seg_database = [self.sessions.iloc[index].database,] * chunk_nb
seg_speaker_id = [self.sessions.iloc[index].speaker_id,] * chunk_nb
seg_file_id = [self.sessions.iloc[index].file_id,] * chunk_nb
seg_database = [current_session.database,] * chunk_nb
seg_speaker_id = [current_session.speaker_id,] * chunk_nb
seg_file_id = [current_session.file_id,] * chunk_nb
seg_start = starts
seg_duration = numpy.ones(starts.shape) * self.duration
seg_speaker_idx = [int(self.sessions.iloc[index].speaker_idx),] * chunk_nb
seg_gender = [self.sessions.iloc[index].gender,] * chunk_nb
seg_speaker_idx = [int(current_session.speaker_idx),] * chunk_nb
seg_gender = [current_session.gender,] * chunk_nb
return seg_database, seg_speaker_id, seg_file_id, seg_start, seg_duration, seg_speaker_idx, seg_gender
@@ -157,125 +158,7 @@ class SegSelection(Dataset):
return self.len
class XvectorDataset(Dataset):
"""
Object that reads a list of batch files from a text file and initializes a Dataset
"""
def __init__(self, batch_list, batch_path):
with open(batch_list, 'r') as fh:
self.batch_files = [batch_path + '/' + l.rstrip() for l in fh]
self.len = len(self.batch_files)
def __getitem__(self, index):
data, label = read_batch(self.batch_files[index])
return torch.from_numpy(data).type(torch.FloatTensor), torch.from_numpy(label.astype('long'))
def __len__(self):
return self.len
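# A minimal usage sketch, assuming "batch_list.txt" lists batch file names (one per line)
# readable by read_batch and stored under batch_path (names are hypothetical):
#     dataset = XvectorDataset(batch_list="batch_list.txt", batch_path="/data/xvector_batches")
#     loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)
#     for features, labels in loader:
#         ...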
class XvectorMultiDataset(Dataset):
"""
Object that takes a list of batch files as a Python list and initializes a Dataset
"""
def __init__(self, batch_list, batch_path):
self.batch_files = [batch_path + '/' + l for l in batch_list]
self.len = len(self.batch_files)
def __getitem__(self, index):
data, label = read_batch(self.batch_files[index])
return torch.from_numpy(data).type(torch.FloatTensor), torch.from_numpy(label.astype('long'))
def __len__(self):
return self.len
class StatDataset(Dataset):
"""
Object that initializes a Dataset from a sidekit.IdMap
"""
def __init__(self, idmap, fs_param):
self.idmap = idmap
self.fs = FeaturesServer(**fs_param)
self.len = self.idmap.leftids.shape[0]
def __getitem__(self, index):
data, _ = self.fs.load(self.idmap.rightids[index], start=self.idmap.start[index], stop=self.idmap.stop[index])
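# Mean/variance normalize the features, then reshape them to (1, feat_dim, num_frames)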
data = (data - data.mean(0)) / data.std(0)
data = data.reshape((1, data.shape[0], data.shape[1])).transpose(0, 2, 1).astype(numpy.float32)
return self.idmap.leftids[index], self.idmap.rightids[index], torch.from_numpy(data).type(torch.FloatTensor)
def __len__(self):
return self.len
class VoxDataset(Dataset):
"""
"""
def __init__(self, segment_df, speaker_dict, duration=500, transform = None, spec_aug_ratio=0.5, temp_aug_ratio=0.5):
"""
:param segment_df:
:param speaker_dict:
:param duration:
:param transform:
:param spec_aug_ratio:
:param temp_aug_ratio:
"""
self.segment_list = segment_df
self.speaker_dict = speaker_dict
self.len = len(self.segment_list)
self.duration = duration
self.transform = transform
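# Randomly flag a spec_aug_ratio / temp_aug_ratio fraction of the segments for spectral and temporal augmentation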
tmp = numpy.zeros(self.len, dtype=bool)
tmp[:int(self.len * spec_aug_ratio)] = 1
numpy.random.shuffle(tmp)
tmp2 = numpy.zeros(self.len, dtype=bool)
tmp2[:int(self.len * temp_aug_ratio)] = 1
numpy.random.shuffle(tmp2)
self.spec_aug = tmp
self.temp_aug = tmp2
def __getitem__(self, index):
"""
:param index: index of the segment to load
:return: feature chunk, label, spec_aug flag, temp_aug flag
"""
fh = h5py.File(self.segment_list.loc[index].hdf5_file, 'r')
feature_size = fh[self.segment_list.session_id[index]].shape[1]
start = int(self.segment_list.start[index])
data = read_dataset_percentile(fh, self.segment_list.session_id[index]).T
if self.duration is not None:
data = data[:, start:start + self.duration]
label = self.speaker_dict[self.segment_list.speaker_id[index]]
else:
label = self.segment_list.speaker_id[index]
fh.close()
spec_aug = False
temp_aug = False
if self.transform:
data, label, spec_aug, temp_aug = self.transform((data, label, self.spec_aug[index], self.temp_aug[index]))
if self.duration is not None:
label = torch.from_numpy(numpy.array([label, ]).astype('long'))
return torch.from_numpy(data).type(torch.FloatTensor), label, spec_aug, temp_aug
def __len__(self):
"""
:return: number of segments in the dataset
"""
return self.len
class PreEmphasis(object):
@@ -470,16 +353,17 @@ class SideSet(Dataset):
# Create lists for each column of the dataframe
df_dict = dict(zip(df.columns, [[], [], [], [], [], [], []]))
weight_dict = dict()
"""
# For each segment, get all possible segments with the current overlap
for idx in tqdm.trange(len(tmp_sessions)):
# Compute possible starts
current_session = tmp_sessions.iloc[idx]
# Compute possible starts
possible_starts = numpy.arange(0,
int(self.sample_rate * (tmp_sessions.iloc[idx].duration - self.duration)),
int(self.sample_rate * (current_session.duration - self.duration)),
self.sample_number - int(self.sample_rate * overlap)
)
possible_starts += int(self.sample_rate * tmp_sessions.iloc[idx].start)
possible_starts += int(self.sample_rate * current_session.start)
# Select max(seg_nb, possible_segments) segments
if chunk_per_segment == -1:
@@ -491,59 +375,21 @@ class SideSet(Dataset):
# Once we know how many segments are selected, create the other fields to fill the DataFrame
for ii in range(chunk_nb):
df_dict["database"].append(tmp_sessions.iloc[idx].database)
df_dict["speaker_id"].append(tmp_sessions.iloc[idx].speaker_id)
df_dict["file_id"].append(tmp_sessions.iloc[idx].file_id)
df_dict["database"].append(current_session.database)
df_dict["speaker_id"].append(current_session.speaker_id)
df_dict["file_id"].append(current_session.file_id)
df_dict["start"].append(starts[ii])
df_dict["duration"].append(self.duration)
df_dict["speaker_idx"].append(tmp_sessions.iloc[idx].speaker_idx)
df_dict["gender"].append(tmp_sessions.iloc[idx].gender)
"""
"""
New parallel version of segment selection
"""
segset = SegSelection(tmp_sessions, self.sample_rate, self.duration, self.sample_number, overlap, chunk_per_segment)
num_thread = multiprocessing.cpu_count()
print(f"Use {num_thread} cpus")
segloader = DataLoader(segset,
batch_size=1,
drop_last=False,
pin_memory=True,
num_workers=num_thread)
flatten = lambda t: [item for sublist in t for item in sublist]
for seg_database, seg_speaker_id, seg_file_id, seg_start, seg_duration, seg_speaker_idx, seg_gender in tqdm.tqdm(segloader):
seg_database = copy.deepcopy(flatten(seg_database))
seg_speaker_id = copy.deepcopy(flatten(seg_speaker_id))
seg_file_id = copy.deepcopy(flatten(seg_file_id))
seg_start = copy.deepcopy(seg_start[0].numpy())
seg_duration = copy.deepcopy(seg_duration[0].numpy())
seg_speaker_idx = copy.deepcopy(flatten(seg_speaker_idx))
seg_gender = copy.deepcopy(flatten(seg_gender))
df_dict["database"].extend(seg_database)
df_dict["speaker_id"].extend(seg_speaker_id)
df_dict["file_id"].extend(seg_file_id)
df_dict["start"].extend(seg_start)
df_dict["duration"].extend(seg_duration)
df_dict["speaker_idx"].extend(seg_speaker_idx)
df_dict["gender"].extend(seg_gender)
#df_dict["speaker_idx"] = numpy.array(df_dict["speaker_idx"]).astype('long')
#df_dict["speaker_id"].extend(flatten(seg_speaker_id))
#df_dict["speaker_id"].extend(flatten(seg_speaker_id))
#df_dict["file_id"].extend(flatten(seg_file_id))
#df_dict["start"].extend(seg_start[0].numpy())
#df_dict["duration"].extend(seg_duration[0].numpy())
#df_dict["speaker_idx"].extend(flatten(seg_speaker_idx))
#df_dict["gender"].extend(flatten(seg_gender))
df_dict["speaker_idx"].append(current_session.speaker_idx)
df_dict["gender"].append(current_session.gender)
if current_session.speaker_idx in weight_dict:
weight_dict[current_session.speaker_idx] += 1
else:
weight_dict[current_session.speaker_idx] = 1
self.sessions = pandas.DataFrame.from_dict(df_dict)
self.len = len(self.sessions)
self.weights = weight_dict
_transform = []
if (self.transformation["pipeline"] != '') and (self.transformation["pipeline"] is not None):
@@ -616,13 +462,15 @@ class SideSet(Dataset):
:return:
"""
# Check the size of the file
nfo = soundfile.info(f"{self.data_path}/{self.sessions.iloc[index]['file_id']}{self.data_file_extension}")
start_frame = int(self.sessions.iloc[index]['start'] * self.sample_rate)
current_session = self.sessions.iloc[index]
nfo = soundfile.info(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}")
start_frame = int(current_session['start'] * self.sample_rate)
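# If the requested chunk would run past the end of the file, move the start back so a full window can be read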
if start_frame + self.sample_number >= nfo.frames:
start_frame = numpy.min(nfo.frames - self.sample_number - 1)
stop_frame = start_frame + self.sample_number
sig, _ = soundfile.read(f"{self.data_path}/{self.sessions.iloc[index]['file_id']}{self.data_file_extension}",
sig, _ = soundfile.read(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}",
start=start_frame,
stop=stop_frame,
dtype=wav_type
@@ -630,8 +478,7 @@ class SideSet(Dataset):
sig = sig.astype(numpy.float32)
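# Add a small amount of random noise (dither) to the waveform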
sig += 0.0001 * numpy.random.randn(sig.shape[0])
speaker_idx = self.sessions.iloc[index]["speaker_idx"]
speaker_idx = current_session["speaker_idx"]
if self.transformation["pipeline"]:
sig, speaker_idx, _, __, _t, _s = self.transforms((sig,
@@ -643,9 +490,9 @@ class SideSet(Dataset):
))
if self.output_format == "pytorch":
return torch.from_numpy(sig).type(torch.FloatTensor), torch.from_numpy(speaker_idx).type(torch.LongTensor)
return torch.from_numpy(sig).type(torch.FloatTensor), torch.from_numpy(speaker_idx).type(torch.LongTensor), torch.tensor(1./self.weights[speaker_idx], dtype=torch.float32)
else:
return sig.astype(numpy.float32), speaker_idx
return sig.astype(numpy.float32), speaker_idx, 1./self.weights[speaker_idx]
def __len__(self):
"""
@@ -711,6 +558,34 @@ def createSideSets(data_set_yaml,
training_df)
class StatServerSet(Dataset):
def __init__(self, input_filename):
"""
:param input_filename: name of the StatServer file to read the statistics from
"""
self.Statserver = StatServer(input_filename)
# Assumption: the dataset iterates over the first-order statistics, one vector per segment
self.data = self.Statserver.stat1
self.len = self.data.shape[0]
def __getitem__(self, index):
"""
:param index: index of the segment to return
:return: the first-order statistics of the segment as a FloatTensor (assumed behaviour)
"""
return torch.from_numpy(self.data[index]).type(torch.FloatTensor)
def __len__(self):
"""
:return: number of segments in the StatServer
"""
return self.len
class IdMapSet(Dataset):
"""
Dataset that provides data according to a sidekit.IdMap object