Commit b17d1364 authored by Anthony Larcher

cleaning

parent d7b14542
......@@ -196,59 +196,9 @@ def eer(negatives, positives):
def test_metrics(model,
device,
num_thread,
mixed_precision):
"""Compute model metrics
Args:
model ([type]): [description]
validation_loader ([type]): [description]
device ([type]): [description]
speaker_number ([type]): [description]
model_archi ([type]): [description]
Raises:
NotImplementedError: [description]
NotImplementedError: [description]
Returns:
[type]: [description]
"""
idmap_test_filename = '/lium/raid01_c/larcher/data/allies_dev_verif_idmap.h5'
ndx_test_filename = '/lium/raid01_c/larcher/data/allies_dev_verif_ndx.h5'
key_test_filename = '/lium/raid01_c/larcher/data/allies_dev_verif_key.h5'
data_root_name='/lium/corpus/base/ALLIES/wav'
transform_pipeline = dict()
xv_stat = extract_embeddings(idmap_name=idmap_test_filename,
model_filename=model,
data_root_name=data_root_name,
device=device,
loss="aam",
transform_pipeline=transform_pipeline,
num_thread=num_thread,
mixed_precision=mixed_precision)
tar, non = cosine_scoring(xv_stat,
xv_stat,
Ndx(ndx_test_filename),
wccn=None,
check_missing=True,
device=device
).get_tar_non(Key(key_test_filename))
#test_eer = eer(numpy.array(non).astype(numpy.double), numpy.array(tar).astype(numpy.double))
pmiss, pfa = rocch(tar, non)
return rocch2eer(pmiss, pfa)
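For reference, rocch/rocch2eer derive the EER from the ROC convex hull of the target and non-target score distributions. A minimal numpy sketch of the same quantity (a threshold-sweep approximation, not sidekit's convex-hull implementation):

import numpy

def approx_eer(tar, non):
    # Sweep every observed score as a decision threshold and return the
    # point where the miss rate and the false-alarm rate cross.
    thresholds = numpy.sort(numpy.concatenate([tar, non]))
    pmiss = numpy.array([(tar < t).mean() for t in thresholds])
    pfa = numpy.array([(non >= t).mean() for t in thresholds])
    idx = numpy.argmin(numpy.abs(pmiss - pfa))
    return (pmiss[idx] + pfa[idx]) / 2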
def new_test_metrics(model,
device,
model_opts,
data_opts,
train_opts):
def test_metrics(model,
device,
model_opts,
data_opts,
train_opts):
"""Compute model metrics
Args:
......@@ -1260,16 +1210,19 @@ def save_model(model, training_monitor, model_opts, training_opts, optimizer, sc
}, training_monitor.is_best, filename=training_opts["tmp_model_name"], best_filename=training_opts["best_model_name"])
def new_xtrain(dataset_description,
model_description,
training_description,
local_rank=-1,
**kwargs):
def xtrain(dataset_description,
model_description,
training_description,
**kwargs):
"""
REFACTORING
- refine the logging
- when restarting from an existing model, reload the optimizer and the scheduler
"""
local_rank = -1
if "RANK" in os.environ:
local_rank = int(os.environ['RANK'])
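Since os.environ values are strings, the int() cast above is needed before torch.cuda.set_device(). With a torchrun launcher, LOCAL_RANK is the usual per-node device index; a more defensive read (an assumption about the launcher, not part of this commit) would be:

# local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", -1)))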
# Test to optimize
torch.autograd.profiler.emit_nvtx(enabled=False)
......@@ -1311,7 +1264,6 @@ def new_xtrain(dataset_description,
embedding_size = model.embedding_size
# Set the device and manage parallel processing
#device = torch.cuda.device(local_rank)
torch.cuda.set_device(local_rank)
device = torch.device(local_rank)
......@@ -1380,14 +1332,14 @@ def new_xtrain(dataset_description,
if training_opts["multi_gpu"]:
torch.distributed.barrier()
model = new_train_epoch(model,
training_opts,
monitor,
training_loader,
optimizer,
scheduler,
device,
scaler=scaler)
model = train_epoch(model,
training_opts,
monitor,
training_loader,
optimizer,
scheduler,
device,
scaler=scaler)
# Cross validation
if math.fmod(epoch, training_opts["validation_frequency"]) == 0:
......@@ -1401,7 +1353,7 @@ def new_xtrain(dataset_description,
test_eer = None
if training_opts["compute_test_eer"] and local_rank < 1:
test_eer = new_test_metrics(model, device, model_opts, dataset_opts, training_opts)
test_eer = test_metrics(model, device, model_opts, dataset_opts, training_opts)
monitor.update(test_eer=test_eer,
val_eer=val_eer,
......@@ -1426,461 +1378,15 @@ def new_xtrain(dataset_description,
return monitor.best_eer
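A hypothetical invocation of the refactored entry point (the YAML file names are placeholders, not files from this repository):

best_eer = xtrain("dataset.yaml", "model.yaml", "training.yaml")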
def xtrain(speaker_number,
dataset_yaml,
epochs=None,
lr=None,
model_yaml=None,
model_name=None,
loss=None,
aam_margin=None,
aam_s=None,
patience=None,
tmp_model_name=None,
best_model_name=None,
multi_gpu=True,
device=None,
mixed_precision=False,
clipping=False,
opt=None,
reset_parts=[],
freeze_parts=[],
num_thread=None,
compute_test_eer=True):
"""
:param speaker_number:
:param dataset_yaml:
:param epochs:
:param lr:
:param model_yaml:
:param model_name:
:param loss:
:param aam_margin:
:param aam_s:
:param patience:
:param tmp_model_name:
:param best_model_name:
:param multi_gpu:
:param mixed_precision:
:param clipping:
:param opt:
:param reset_parts:
:param freeze_parts:
:param num_thread:
:param compute_test_eer:
:return:
"""
# Test to optimize
torch.autograd.profiler.emit_nvtx(enabled=False)
if num_thread is None:
import multiprocessing
num_thread = multiprocessing.cpu_count()
logging.critical(f"Use {num_thread} cpus")
logging.critical(f"Start process at {time.strftime('%H:%M:%S', time.localtime())}")
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use a predefined architecture
if model_yaml in ["xvector", "rawnet2", "resnet34", "fastresnet34", "halfresnet34"]:
if model_name is None:
model = Xtractor(speaker_number, model_yaml, loss=loss)
else:
logging.critical(f"*** Load model from = {model_name}")
checkpoint = torch.load(model_name)
model = Xtractor(speaker_number, model_yaml)
"""
Here we remove all layers that we don't want to reload
"""
pretrained_dict = checkpoint["model_state_dict"]
for part in reset_parts:
pretrained_dict = {k: v for k, v in pretrained_dict.items() if not k.startswith(part)}
new_model_dict = model.state_dict()
new_model_dict.update(pretrained_dict)
model.load_state_dict(new_model_dict)
# Freeze required layers
for name, param in model.named_parameters():
if name.split(".")[0] in freeze_parts:
param.requires_grad = False
model_archi = model_yaml
# Here use a config file to build the architecture
else:
with open(model_yaml, 'r') as fh:
model_archi = yaml.load(fh, Loader=yaml.FullLoader)
if epochs is None:
epochs = model_archi["training"]["epochs"]
if patience is None:
patience = model_archi["training"]["patience"]
if opt is None:
opt = model_archi["training"]["opt"]
if lr is None:
lr = model_archi["training"]["lr"]
if loss is None:
loss = model_archi["training"]["loss"]
if aam_margin is None and model_archi["training"]["loss"] == "aam":
aam_margin = model_archi["training"]["aam_margin"]
if aam_s is None and model_archi["training"]["loss"] == "aam":
aam_s = model_archi["training"]["aam_s"]
if tmp_model_name is None:
tmp_model_name = model_archi["training"]["tmp_model_name"]
if best_model_name is None:
best_model_name = model_archi["training"]["best_model_name"]
if multi_gpu is None:
multi_gpu = model_archi["training"]["multi_gpu"]
if clipping is None:
clipping = model_archi["training"]["clipping"]
if model_name is None:
model = Xtractor(speaker_number, model_yaml)
# If we start from an existing model
else:
# Load the model
logging.critical(f"*** Load model from = {model_name}")
checkpoint = torch.load(model_name, map_location=device)
model = Xtractor(speaker_number, model_yaml, loss=loss)
"""
Here we remove all layers that we don't want to reload
"""
pretrained_dict = checkpoint["model_state_dict"]
for part in reset_parts:
pretrained_dict = {k: v for k, v in pretrained_dict.items() if not k.startswith(part)}
new_model_dict = model.state_dict()
new_model_dict.update(pretrained_dict)
model.load_state_dict(new_model_dict)
# Freeze required layers
for name, param in model.named_parameters():
if name.split(".")[0] in freeze_parts:
param.requires_grad = False
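The partial-reload idiom above, isolated as a self-contained sketch (the part names "after_speaker_embedding" and "sequence_network" are illustrative assumptions):

import torch

checkpoint = torch.load("model.pt", map_location="cpu")
# Drop the layers we want to re-initialise rather than reload
pretrained_dict = {k: v for k, v in checkpoint["model_state_dict"].items()
                   if not k.startswith("after_speaker_embedding")}
new_model_dict = model.state_dict()
new_model_dict.update(pretrained_dict)  # reset parts keep their fresh weights
model.load_state_dict(new_model_dict)
# Freeze the selected parts
for name, param in model.named_parameters():
    if name.split(".")[0] in ["sequence_network"]:
        param.requires_grad = False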
logging.critical(model)
logging.critical("model_parameters_count: {:d}".format(
sum(p.numel()
for p in model.sequence_network.parameters()
if p.requires_grad) + \
sum(p.numel()
for p in model.before_speaker_embedding.parameters()
if p.requires_grad) + \
sum(p.numel()
for p in model.stat_pooling.parameters()
if p.requires_grad)))
embedding_size = model.embedding_size
if torch.cuda.device_count() > 1 and multi_gpu:
print("Let's use", torch.cuda.device_count(), "GPUs!")
model = torch.nn.DataParallel(model)
else:
print("Train on a single GPU")
model.to(device)
with open(dataset_yaml, "r") as fh:
dataset_params = yaml.load(fh, Loader=yaml.FullLoader)
df = pandas.read_csv(dataset_params["dataset_description"])
"""
Set the dataloaders according to the dataset_yaml
First we load the dataframe from CSV file in order to split it for training and validation purpose
Then we provide those two
"""
training_df, validation_df = train_test_split(df, test_size=dataset_params["validation_ratio"], stratify=df["speaker_idx"])
torch.manual_seed(dataset_params['seed'])
training_set = SideSet(dataset_yaml,
set_type="train",
chunk_per_segment=-1,
overlap=dataset_params['train']['overlap'],
dataset_df=training_df,
output_format="pytorch",
)
validation_set = SideSet(dataset_yaml,
set_type="validation",
chunk_per_segment=1,
dataset_df=validation_df,
output_format="pytorch")
side_sampler = SideSampler(training_set.sessions['speaker_idx'],
speaker_number,
1,
128,
dataset_params["batch_size"])
training_loader = DataLoader(training_set,
batch_size=dataset_params["batch_size"],
shuffle=False,
drop_last=True,
pin_memory=True,
sampler=side_sampler,
num_workers=num_thread,
persistent_workers=True)
validation_loader = DataLoader(validation_set,
batch_size=dataset_params["batch_size"],
drop_last=False,
pin_memory=True,
num_workers=num_thread,
persistent_workers=False)
"""
Set the training options
"""
if opt == 'adam':
_optimizer = torch.optim.Adam
_options = {'lr': lr}
elif opt == 'rmsprop':
_optimizer = torch.optim.RMSprop
_options = {'lr': lr}
else: # opt == 'sgd'
_optimizer = torch.optim.SGD
_options = {'lr': lr, 'momentum': 0.9}
param_list = []
if type(model) is Xtractor:
if model.preprocessor is not None:
param_list.append({'params': model.preprocessor.parameters(), 'weight_decay': model.preprocessor_weight_decay})
param_list.append({'params': model.sequence_network.parameters(), 'weight_decay': model.sequence_network_weight_decay})
param_list.append({'params': model.stat_pooling.parameters(), 'weight_decay': model.stat_pooling_weight_decay})
param_list.append({'params': model.before_speaker_embedding.parameters(), 'weight_decay': model.before_speaker_embedding_weight_decay})
param_list.append({'params': model.after_speaker_embedding.parameters(), 'weight_decay': model.after_speaker_embedding_weight_decay})
else:
if model.module.preprocessor is not None:
param_list.append({'params': model.module.preprocessor.parameters(), 'weight_decay': model.module.preprocessor_weight_decay})
param_list.append({'params': model.module.sequence_network.parameters(), 'weight_decay': model.module.sequence_network_weight_decay})
param_list.append({'params': model.module.stat_pooling.parameters(), 'weight_decay': model.module.stat_pooling_weight_decay})
param_list.append({'params': model.module.before_speaker_embedding.parameters(), 'weight_decay': model.module.before_speaker_embedding_weight_decay})
param_list.append({'params': model.module.after_speaker_embedding.parameters(), 'weight_decay': model.module.after_speaker_embedding_weight_decay})
optimizer = _optimizer(param_list, **_options)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=5 * training_loader.__len__(),
gamma=0.75)
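Assuming scheduler.step() runs once per batch (which the step_size of 5 * len(training_loader) implies), this decays the learning rate by 0.75 every 5 epochs. A toy check of that behaviour, with synthetic parameters:

import torch

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=3, gamma=0.75)
for step in range(7):
    opt.step()
    sched.step()
print(sched.get_last_lr())  # [0.05625]: the lr decayed twice (0.1 * 0.75 ** 2)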
if mixed_precision:
scaler = torch.cuda.amp.GradScaler()
else:
scaler = None
best_accuracy = 0.0
best_accuracy_epoch = 1
best_eer = 100
curr_patience = patience
test_eer = 100.
classes = torch.ShortTensor(validation_set.sessions['speaker_idx'].to_numpy())
mask = classes.unsqueeze(1) == classes.unsqueeze(1).T
tar_indices = torch.tril(mask, -1).numpy()
non_indices = torch.tril(~mask, -1).numpy()
tar_non_ratio = numpy.sum(tar_indices)/numpy.sum(non_indices)
non_indices *= numpy.random.choice([False, True], size=non_indices.shape, p=[1-tar_non_ratio, tar_non_ratio])
#tar_indices *= numpy.random.choice([False, True], size=tar_indices.shape, p=[0.9, 0.1])
#non_indices *= numpy.random.choice([False, True], size=non_indices.shape, p=[0.9, 0.1])
logging.critical("val tar count : {:d}, non count : {:d}".format(numpy.sum(tar_indices), numpy.sum(non_indices)))
for epoch in range(1, epochs + 1):
# Process one epoch and return the current model
if curr_patience == 0:
print(f"Stopping at epoch {epoch} for cause of patience")
break
model = train_epoch(model,
epoch,
training_loader,
optimizer,
scheduler,
dataset_params["log_interval"],
device,
scaler=scaler,
clipping=clipping)
# Add the cross validation here
if math.fmod(epoch, 1) == 0:
val_acc, val_loss, val_eer = cross_validation(model, validation_loader, device, [validation_set.__len__(), embedding_size], tar_indices, non_indices, mixed_precision)
logging.critical(f"***{time.strftime('%H:%M:%S', time.localtime())} Training metrics - Cross validation accuracy = {val_acc} %, EER = {val_eer * 100} %")
if compute_test_eer:
test_eer = test_metrics(model, device, speaker_number, num_thread, mixed_precision)
#logging.critical(f"***{time.strftime('%H:%M:%S', time.localtime())} Training metrics - Reversed Test EER = {rev_eer * 100} %")
logging.critical(f"***{time.strftime('%H:%M:%S', time.localtime())} Training metrics - Test EER = {test_eer * 100} %")
# remember best accuracy and save checkpoint
if compute_test_eer:
is_best = test_eer < best_eer
best_eer = min(test_eer, best_eer)
else:
is_best = val_eer < best_eer
best_eer = min(val_eer, best_eer)
best_accuracy = max(val_acc, best_accuracy)
if tmp_model_name is None:
tmp_model_name = "tmp_model"
if best_model_name is None:
best_model_name = "best_model"
if type(model) is Xtractor:
save_checkpoint({
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'accuracy': best_accuracy,
'scheduler': scheduler,
'speaker_number' : speaker_number,
'model_archi': model_archi,
'loss': loss
}, is_best, filename=tmp_model_name+".pt", best_filename=best_model_name+'.pt')
else:
save_checkpoint({
'epoch': epoch,
'model_state_dict': model.module.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'accuracy': best_accuracy,
'scheduler': scheduler,
'speaker_number': speaker_number,
'model_archi': model_archi,
'loss': loss
}, is_best, filename=tmp_model_name+".pt", best_filename=best_model_name+'.pt')
if is_best:
best_accuracy_epoch = epoch
curr_patience = patience
else:
curr_patience -= 1
for ii in range(torch.cuda.device_count()):
print(torch.cuda.memory_summary(ii))
logging.critical(f"Best accuracy {best_accuracy * 100.} obtained at epoch {best_accuracy_epoch}")
def train_epoch(model, epoch, training_loader, optimizer, scheduler, log_interval, device, scaler=None, clipping=False):
"""
:param model:
:param epoch:
:param training_loader:
:param optimizer:
:param log_interval:
:param device:
:param clipping:
:return:
"""
model.train()
criterion = torch.nn.CrossEntropyLoss(reduction='mean')
if isinstance(model, Xtractor):
loss_criteria = model.loss
else:
loss_criteria = model.module.loss
accuracy = 0.0
running_loss = 0.0
for batch_idx, (data, target) in enumerate(training_loader):
data = data.squeeze().to(device)
target = target.squeeze()
target = target.to(device)
optimizer.zero_grad(set_to_none=True)
if scaler is not None:
with torch.cuda.amp.autocast():
if loss_criteria == 'aam':
output, _ = model(data, target=target)
loss = criterion(output, target)
elif loss_criteria == 'smn':
output_tuple, _ = model(data, target=target)
loss, output = output_tuple
loss += criterion(output, target)
elif loss_criteria == 'aps':
output_tuple, _ = model(data, target=target)
cos_sim_matx, output = output_tuple
loss = criterion(cos_sim_matx, torch.arange(0, int(data.shape[0]/2), device=device)) + criterion(output, target)
else:
output, _ = model(data, target=None)
loss = criterion(output, target)
else:
if loss_criteria == 'aam':
output, _ = model(data, target=target)
loss = criterion(output, target)
elif loss_criteria == 'aps':
output_tuple, _ = model(data, target=target)
cos_sim_matx, output = output_tuple
loss = criterion(cos_sim_matx, torch.arange(0, int(data.shape[0]/2), device=device)) + criterion(output, target)
else:
output, _ = model(data, target=None)
loss = criterion(output, target)
if not torch.isnan(loss):
if scaler is not None:
scaler.scale(loss).backward()
if clipping:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
running_loss += loss.item()
accuracy += (torch.argmax(output.data, 1) == target).sum()
if math.fmod(batch_idx, log_interval) == 0:
batch_size = target.shape[0]
logging.critical('{}, Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.3f}'.format(
time.strftime('%H:%M:%S', time.localtime()),
epoch, batch_idx + 1, training_loader.__len__(),
100. * batch_idx / training_loader.__len__(), loss.item(),
100.0 * accuracy.item() / ((batch_idx + 1) * batch_size)))
else:
save_checkpoint({
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'accuracy': 0.0,
'scheduler': 0.0
}, False, filename="model_loss_NAN.pt", best_filename='toto.pt')
with open("batch_loss_NAN.pkl", "wb") as fh:
pickle.dump(data.cpu(), fh)
import sys
sys.exit()
running_loss = 0.0
scheduler.step()
return model
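The mixed-precision branch above follows the standard GradScaler recipe; isolated as a sketch (assumes model, optimizer, criterion, data and target are already in scope):

scaler = torch.cuda.amp.GradScaler()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.amp.autocast():
    output, _ = model(data, target=target)
    loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # unscale gradients before clipping them
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
scaler.step(optimizer)      # the step is skipped if gradients overflowed
scaler.update()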
def new_train_epoch(model,
training_opts,
training_monitor,
training_loader,
optimizer,
scheduler,
device,
scaler=None,
clipping=False):
def train_epoch(model,
training_opts,
training_monitor,
training_loader,
optimizer,
scheduler,
device,
scaler=None,
clipping=False):
"""
:param model:
......