Commit 2640573a authored by Anthony Larcher's avatar Anthony Larcher
Browse files

debug

parent 7f61cc0b
......@@ -60,7 +60,7 @@ def _check_missing_model(enroll, test, ndx):
return clean_ndx
def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True, device=None):
"""Compute the cosine similarities between to sets of vectors. The list of
trials to perform is given in an Ndx object.
......@@ -99,7 +99,10 @@ def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True):
if enroll_copy != test_copy:
test_copy.norm_stat1()
s_size_in_bytes = enroll_copy.stat1.shape[0] * test_copy.stat1.shape[0] * 4
device = torch.device("cuda:0" if torch.cuda.is_available() and s_size_in_bytes < 1e9 else "cpu")
if device == None:
device = torch.device("cuda:0" if torch.cuda.is_available() and s_size_in_bytes < 3e9 else "cpu")
else:
device = device if torch.cuda.is_available() and s_size_in_bytes < 3e9 else torch.device("cpu")
s = torch.mm(torch.FloatTensor(enroll_copy.stat1).to(device), torch.FloatTensor(test_copy.stat1).to(device).T).cpu().numpy()
score = Scores()
......
......@@ -378,7 +378,7 @@ class SpkSet(Dataset):
for jdx, possible_start in enumerate(possible_starts):
segment_dict = dict()
segment_dict['start'] = possible_start / self.sample_rate
segment_dict['start'] = possible_start
segment_dict['duration'] = self.duration
segment_dict['file_id'] = current_session.file_id
......@@ -434,18 +434,25 @@ class SpkSet(Dataset):
nfo = soundfile.info(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}")
if self._windowed:
start_frame = int(current_segment['start'] * self.sample_rate)
if start_frame + self.sample_number >= nfo.frames:
start_frame = numpy.min(nfo.frames - self.sample_number - 1)
start_frame = current_segment['start']
stop_frame = start_frame + self.sample_number
else:
start_frame = int(current_segment['start'] * self.sample_rate)
stop_frame = int(current_segment['duration'] * self.sample_rate)
speech, speech_fs = torchaudio.load(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
frame_offset=start_frame,
num_frames=self.sample_number)
#speech, speech_fs = torchaudio.load(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
# frame_offset=start_frame,
# num_frames=self.sample_number)
sig, _ = soundfile.read(f"{self.data_path}/{current_segment['file_id']}{self.data_file_extension}",
start=start_frame,
stop=stop_frame,
dtype=wav_type
)
sig = sig.astype(numpy.float32)
sig += 0.0001 * numpy.random.randn(sig.shape[0])
speech = torch.tensor(sig).type(torch.FloatTensor)
if len(self.transform) > 0:
# Select the data augmentation randomly
......
......@@ -502,6 +502,7 @@ class Xtractor(torch.nn.Module):
self.preprocessor = MfccFrontEnd()
self.sequence_network = PreFastResNet34()
self.embedding_size = 256
self.before_speaker_embedding = torch.nn.Linear(in_features = 2560,
out_features = 256)
......@@ -509,13 +510,11 @@ class Xtractor(torch.nn.Module):
self.stat_pooling = MeanStdPooling()
self.stat_pooling_weight_decay = 0
self.embedding_size = 256
self.loss = "aam"
self.after_speaker_embedding = ArcMarginProduct(256,
self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
int(self.speaker_number),
s = 30.0,
m = 0.20,
s = 30,
m = 0.2,
easy_margin = False)
self.preprocessor_weight_decay = 0.000
......@@ -996,7 +995,7 @@ def xtrain(speaker_number,
set_type="train",
dataset_df=training_df,
overlap=dataset_params['train']['overlap'],
output_format="pytorch",
output_format=output_format,
windowed=True)
validation_set = SideSet(dataset_yaml,
......@@ -1065,11 +1064,9 @@ def xtrain(speaker_number,
param_list.append({'params': model.module.after_speaker_embedding.parameters(), 'weight_decay': model.module.after_speaker_embedding_weight_decay})
optimizer = _optimizer(param_list, **_options)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
milestones=numpy.arange(50,10000,10),
gamma=0.95,
last_epoch=-1,
verbose=False)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=10 * training_loader.__len__(),
gamma=0.95)
if mixed_precision:
scaler = GradScaler()
......@@ -1099,7 +1096,7 @@ def xtrain(speaker_number,
clipping=clipping)
# Add the cross validation here
if math.fmod(epoch, 10) == 0:
if math.fmod(epoch, 1) == 0:
val_acc, val_loss, val_eer = cross_validation(model, validation_loader, device, [validation_set.__len__(), embedding_size], mixed_precision)
test_eer = test_metrics(model, device, speaker_number, num_thread, mixed_precision)
......@@ -1984,7 +1981,7 @@ def eer(negatives, positives):
n_index = n_index - next_n_jump
if next_p_jump == 0 and next_n_jump == 0:
break
p_score = positives[p_index]
n_score = negatives[n_index]
next_p_jump = next_p_jump//2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment