Commit 6373b7e1 authored by Anthony Larcher's avatar Anthony Larcher
Browse files

merge branches Meysam

parents 4119c573 50adfcbc
......@@ -178,7 +178,7 @@ def combine_upper_and_bottom_linkage(bottom_linkage, link_clusters_tmp, bottomli
full_link_tmp = copy.deepcopy(bottom_linkage)
merging_cluster_list = list(hac_vec.modelset)
extend_link_clusters = numpy.zeros((link_clusters_tmp.shape[0], link_clusters_tmp.shape[1] + 1)) - 1
extend_link_clusters[:, :-1] = link_clusters_tmp
extend_link_clusters[:, :-1] = copy.deepcopy(link_clusters_tmp)
# idx_node = len(extend_link_clusters)
for idx_node, node in enumerate(extend_link_clusters):
......@@ -188,9 +188,13 @@ def combine_upper_and_bottom_linkage(bottom_linkage, link_clusters_tmp, bottomli
if new_node[0] < len(merging_cluster_list):
cls = bottomline_cluster_list_tmp[0]
for cls in bottomline_cluster_list_tmp:
if cls.startswith(merging_cluster_list[int(node[0])] + "_"):
break
ii = 0
while not bottomline_cluster_list_tmp[ii].startswith(merging_cluster_list[int(node[0])] + "_"):
cls = bottomline_cluster_list_tmp[ii]
ii += 1
# for cls in bottomline_cluster_list_tmp:
# if cls.startswith(merging_cluster_list[int(node[0])] + "_"):
# break
new_node[0] = bottomline_cluster_list_tmp.index(cls)
else:
......@@ -267,7 +271,8 @@ def question_quality(node,
init_diar,
temporary_link_list,
conditional_questioning,
question_type):
question_type,
verbos=False):
"""
Evaluate the question based on conditional_questioning
:param node_list2:
......@@ -284,21 +289,32 @@ def question_quality(node,
:param question_type: separation or clustering
:return: True or False as a proposition of the question to be asked
"""
if conditional_questioning == "calinski_harabasz":
if conditional_questioning in ['calinski_harabasz', 'silhouette', 'davies_bouldin', 'calinski_harabasz_invs',
'silhouette_invs', 'davies_bouldin_invs']:
link_tmp = copy.deepcopy(temporary_link_list)
diar_tmp = copy.deepcopy(init_diar)
current_diar = apply_link_on_diar(diar_tmp, bottomline_cluster_list, link_tmp)
if len(current_diar.unique("cluster")) < 2:
print("calinski_harabasz is not possible to be calculated for one cluster -> True")
if verbos:
print(f"{conditional_questioning} is not possible to be calculated for one cluster -> True")
return True
current_labels = []
for seg in current_diar:
current_labels.append(seg['cluster'])
current_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, current_labels)
if "calinski_harabasz" in conditional_questioning:
current_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, current_labels)
elif "silhouette" in conditional_questioning:
current_score = sklearn.metrics.silhouette_score(hac_vec_per_seg.stat1, current_labels)
elif "davies_bouldin" in conditional_questioning:
current_score = sklearn.metrics.davies_bouldin_score(hac_vec_per_seg.stat1, current_labels)
else:
if verbos:
print(f"{conditional_questioning} is not valid -> True")
return True
if question_type == "separation":
......@@ -314,13 +330,24 @@ def question_quality(node,
new_diar = apply_link_on_diar(diar_tmp, bottomline_cluster_list, link_tmp)
if len(new_diar.unique("cluster")) < 2:
print("calinski_harabasz is not possible to be calculated for one cluster (after separation) -> True")
if verbos:
print(f"{conditional_questioning} is not possible to be calculated for one cluster (after separation) -> True")
return True
changed_labels = []
for seg in new_diar:
changed_labels.append(seg['cluster'])
changed_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, changed_labels)
if "calinski_harabasz" in conditional_questioning:
changed_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, changed_labels)
elif "silhouette" in conditional_questioning:
changed_score = sklearn.metrics.silhouette_score(hac_vec_per_seg.stat1, changed_labels)
elif "davies_bouldin" in conditional_questioning:
changed_score = sklearn.metrics.davies_bouldin_score(hac_vec_per_seg.stat1, changed_labels)
else:
if verbos:
print(f"{conditional_questioning} is not valid -> True")
return True
elif question_type == "clustering":
......@@ -331,62 +358,41 @@ def question_quality(node,
new_diar = apply_link_on_diar(diar_tmp, bottomline_cluster_list, link_tmp)
if len(new_diar.unique("cluster")) < 2:
print("calinski_harabasz is not possible to be calculated for one cluster (after clustering) -> True")
if verbos:
print(f"{conditional_questioning} is not possible to be calculated for one cluster (after clustering) -> True")
return True
changed_labels = []
for seg in new_diar:
changed_labels.append(seg['cluster'])
changed_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, changed_labels)
if "calinski_harabasz" in conditional_questioning:
changed_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, changed_labels)
elif "silhouette" in conditional_questioning:
changed_score = sklearn.metrics.silhouette_score(hac_vec_per_seg.stat1, changed_labels)
elif "davies_bouldin" in conditional_questioning:
changed_score = sklearn.metrics.davies_bouldin_score(hac_vec_per_seg.stat1, changed_labels)
else:
if verbos:
print(f"{conditional_questioning} is not valid -> True")
return True
# changed_score = sklearn.metrics.calinski_harabasz_score(hac_vec_per_seg.stat1, changed_labels)
else:
print(f"Invalid question_type ({question_type}) shoud be separation or clustering"
if verbos:
print(f"Invalid question_type ({question_type}) shoud be separation or clustering"
f"-> ignore conditional questioning ")
return True
# NEVER USE A VARIABLE WITH THE SAME NAME AS THE METHOD
q_quality = current_score < changed_score
print(f"{question_type}: current_score({current_score}) < changed_score({changed_score}) -> {q_quality}")
elif conditional_questioning == "std":
spk_list1 = []
for n in node_list1:
spk_list1.append(bottomline_cluster_list[n])
spk_list2 = []
for n in node_list2:
spk_list2.append(bottomline_cluster_list[n])
segs_cluster_mrg = {}
for i in range(len(hac_vec_per_seg.modelset)):
if hac_vec_per_seg.modelset[i] in spk_list1 or hac_vec_per_seg.modelset[i] in spk_list2:
segs_cluster_mrg[i] = hac_vec_per_seg.stat1[i]
mrg_situation = numpy.mean(numpy.std(list(segs_cluster_mrg.values()), axis=0))
segs_cluster1_sep = {}
for i in range(len(hac_vec_per_seg.modelset)):
if hac_vec_per_seg.modelset[i] in spk_list1:
segs_cluster1_sep[i] = hac_vec_per_seg.stat1[i]
std_sep1 = numpy.std(list(segs_cluster1_sep.values()), axis=0)
segs_cluster2_sep = {}
for i in range(len(hac_vec_per_seg.modelset)):
if hac_vec_per_seg.modelset[i] in spk_list2:
segs_cluster2_sep[i] = hac_vec_per_seg.stat1[i]
std_sep2 = numpy.std(list(segs_cluster2_sep.values()), axis=0)
sep_situation = numpy.mean(numpy.mean([std_sep1, std_sep2], axis=0))
if question_type == "separation":
q_quality = sep_situation < mrg_situation
print(f"separation: sep_situation({sep_situation}) < mrg_situation({mrg_situation}) -> {q_quality}")
elif question_type == "clustering":
q_quality = sep_situation > mrg_situation
print(f"clustering: sep_situation({sep_situation}) > mrg_situation({mrg_situation}) -> {q_quality}")
if 'invs' in conditional_questioning:
q_quality = current_score < changed_score
else:
print("Question type is not valid ...")
q_quality = True
q_quality = current_score > changed_score
if verbos:
print(f"{question_type}: current_score({current_score}) < changed_score({changed_score}) -> {q_quality}")
else:
q_quality = True
return q_quality
......@@ -954,7 +960,8 @@ def apply_ideal_correction(bottomline_diar, ref, uem, der_track_show):
:return:
"""
hyp = s4d_to_allies(copy.deepcopy(bottomline_diar))
der, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der, fa_rate, miss_rate, conf_rate, error, time, newspkmap = compute_der([ref], [hyp], [uem], collar = 0.250)
# der, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der_track_cs = {"time": time, "der_log": [der], "correction": ["initial"]}
removelist = []
......@@ -976,7 +983,8 @@ def apply_ideal_correction(bottomline_diar, ref, uem, der_track_show):
new_diar = copy.deepcopy(first_pass_diar)
hyp = s4d_to_allies(new_diar)
der, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der, fa_rate, miss_rate, conf_rate, error, time, newspkmap = compute_der([ref], [hyp], [uem], collar = 0.250)
# der, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der_track_cs["der_log"].append(der)
der_track_cs["correction"].append("ideal")
der_track_show['ideal'] = der_track_cs
......@@ -1078,7 +1086,9 @@ def run_active_learning_tree(link,
temporary_link_list,
conditional_questioning,
question_type="separation"):
print(f"Conditional questioning ({conditional_questioning}) does not advice separation")
pass
# stop_separation_list += subnodes_list
# print(f"Conditional questioning ({conditional_questioning}) does not advice separation")
# otherwise, ask a question to the human about this node
else:
......@@ -1150,7 +1160,10 @@ def run_active_learning_tree(link,
temporary_link_list,
conditional_questioning,
question_type="clustering"):
print(f"Conditional questioning ({conditional_questioning}) does not advice clustering")
# stop_clustering_list += get_roots_nodes(node[:4], number_cluster, link)
pass
# print(f"Conditional questioning ({conditional_questioning}) does not advice clustering")
# otherwise, ask a question to the human about this node
else:
......@@ -1233,6 +1246,12 @@ def allies_within_show_hal(model_cfg,
:param ref:
:param user:
:param c2s:
:param conditional_clustering: 'calinski_harabasz',
'silhouette',
'davies_bouldin',
'calinski_harabasz_invs',
'silhouette_invs',
'davies_bouldin_invs'
:return: new diarization object
"""
der_track_show = {}
......@@ -1247,10 +1266,13 @@ def allies_within_show_hal(model_cfg,
# Create a dendrogram using the upper level of the current clustering and the segments in each cluster######
#####################################################################################################
if len(init_diar.unique("cluster")) < 2:
link_clusters = np.array([])
link_clusters = numpy.array([])
else:
scores_clusters, link_clusters, th_clusters = vec2link(model_cfg, vec_per_cluster, init_diar, model)
model_cfg["model"]["vad"]["type"] = "reference"
model_cfg['first_seg']['thr_h']=model_cfg['within_show']['thr_h']
bottomline_diar, link_within_clusters_dic, vec_within_clusters_dic = create_bottomline_clustering(model,
model_cfg,
show,
......@@ -1266,11 +1288,11 @@ def allies_within_show_hal(model_cfg,
########################################################################################
# check whether bottomline_diar plus the sub-cluster linkage reproduces current_diar #
hyp = s4d_to_allies(init_diar)
der_init, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der_init, fa_rate, miss_rate, conf_rate, error, time, newspkmap = compute_der([ref], [hyp], [uem], collar = 0.250)
print("Current DER (original) : ", der_init)
hyp = s4d_to_allies(bottomline_diar)
der, fa_rate, miss_rate, conf_rate, time, newspkmap = compute_der(ref, hyp, uem, {}, 0.250)
der, fa_rate, miss_rate, conf_rate, error, time, newspkmap = compute_der([ref], [hyp], [uem], collar = 0.250)
print("DER of bottomline_diar: ", der)
vec_per_seg.modelset = vec_per_seg.modelset.astype(object)
......@@ -1320,7 +1342,7 @@ def allies_within_show_hal(model_cfg,
der_track_cs["time"] = time
der_track_show[c2s] = der_track_cs
der_track_show[f"{str(c2s)}_{model_cfg['within_show']['conditional_questioning']}"] = der_track_cs
#prefix = f"{dir_output_system}/HAL_c2s_{c2s}_th_h_{model_cfg['within_show']['hal_seg']}"
#if model_cfg['within_show']['conditional_questioning']:
......
......@@ -93,7 +93,7 @@ def init_clustering(init_diar, cep, model_cfg, vad_type="none"):
cluster = s4d.clustering.hac_bic.HAC_BIC(cep, output_diar, model_cfg['first_seg']['thr_h'], sr=False)
output_diar = cluster.perform()
# Viterbi devoding is only applied when starting from a VAD segmentation or from scratch
# Viterbi decoding is only applied when starting from a VAD segmentation or from scratch
if vad_type != "reference":
output_diar = s4d.viterbi.viterbi_decoding(cep, output_diar, model_cfg['first_seg']['thr_vit'])
......@@ -127,10 +127,12 @@ def vec2link_xv(model_cfg, xv_vec, current_diar):
"""
th_w = model_cfg["within_show"]["th_w"]
within_iv_mean = xv_vec.mean_stat_per_model()
# Compute scores
ndx = sidekit.Ndx(models=xv_vec.modelset, testsegs=xv_vec.modelset)
ndx = sidekit.Ndx(models=within_iv_mean.modelset, testsegs=within_iv_mean.modelset)
scores = sidekit.iv_scoring.cosine_scoring(xv_vec, xv_vec, ndx,
scores = sidekit.iv_scoring.cosine_scoring(within_iv_mean, within_iv_mean, ndx,
wccn=None,
check_missing=False)
......@@ -142,7 +144,6 @@ def vec2link_xv(model_cfg, xv_vec, current_diar):
# Make the cluster names consistent
for idx in range(len(scores.modelset)):
#scores.modelset[idx] = current_diar[idx]["cluster"]
scores.modelset[idx] = xv_vec.modelset[idx]
######################################################################################
......@@ -180,7 +181,7 @@ def vec2link_iv(model, model_cfg, iv_vec, current_diar):
scores.scoremat = 0.5 * (scores.scoremat + scores.scoremat.transpose())
for idx in range(len(scores.modelset)):
scores.modelset[idx] = current_diar[idx]["cluster"]
scores.modelset[idx] = xv_vec.modelset[idx]
# Get the linkage matrix from the scores
distances, th = s4d.clustering.hac_utils.scores2distance(scores, model_cfg['within_show']['th_w'])
......@@ -227,22 +228,7 @@ def extract_vectors(current_diar, root_folder, model_cfg, show, model=None):
normalization=False)
elif model_cfg["model"]["type"] == "lium_xv":
current_im = current_diar.id_map()
# current_im.write(f"{file_path}/{show}.idmap.h5")
xtractor_name = model_cfg['tmp_dir'] + "model/best_xtractor.pt"
current_vec_per_segment = sidekit.nnet.xvector.extract_embeddings(idmap_name=current_im,
model_filename=xtractor_name,
data_root_name=f"{root_folder}/wav/",
device=torch.device("cuda"),
file_extension="wav",
transform_pipeline={},
sliding_window=False,
sample_rate=16000,
mixed_precision=False)
"""
current_vec_per_cluster = current_vec_per_segment.mean_stat_per_model()
#current_vec_per_cluster.norm_stat1()
#current_vec_per_cluster= sidekit.nnet.xvector.extract_embeddings_per_speaker(idmap_name=current_im,
......@@ -266,6 +252,47 @@ def extract_vectors(current_diar, root_folder, model_cfg, show, model=None):
# device=torch.device("cuda"),
# transform_pipeline=model_cfg["model"]["vectors"]["xvectors"]["transforms"],
# num_thread=5)
current_vec = current_vec_per_segment.mean_stat_per_model()
"""
current_im = current_diar.id_map()
current_im.start = current_im.start * 160
current_im.stop = current_im.stop * 160
if os.path.exists(f"{model_cfg['tmp_dir']}/{show}.idmap.h5"):
os.remove(f"{model_cfg['tmp_dir']}/{show}.idmap.h5")
current_im.write(f"{model_cfg['tmp_dir']}/{show}.idmap.h5")
current_vec_per_cluster= sidekit.nnet.xvector.extract_embeddings_per_speaker(idmap_name=f"{model_cfg['tmp_dir']}/{show}.idmap.h5",
model_filename=f"{model_cfg['model_dir']}/best_xtractor.pt",
# model_filename="../Baseline_LIUM_HAC/cfg/models/Evallies_xv/best_xtractor.pt",
data_root_name=f"{model_cfg['wav_dir']}",
device=torch.device("cuda"),
transform_pipeline={},
num_thread=5)
# current_vec_per_cluster= sidekit.nnet.xvector.extract_embeddings_per_speaker(idmap_name=current_im,
# # model_filename=model_cfg['tmp_dir'] + "model/best_xtractor.pt",
# model_filename="../Baseline_LIUM_HAC/cfg/models/Evallies_xv/best_xtractor.pt",
# data_root_name=f"{root_folder}/wav/",
# device=torch.device("cuda"),
# num_thread=5)
diar_seg=copy.deepcopy(current_diar)
for i in range(len(diar_seg)):
diar_seg[i]["cluster"]="tmp_"+str(i)
current_im = diar_seg.id_map()
current_im.start = current_im.start * 160
current_im.stop = current_im.stop * 160
if os.path.exists(f"{model_cfg['tmp_dir']}/{show}_seg.idmap.h5"):
os.remove(f"{model_cfg['tmp_dir']}/{show}_seg.idmap.h5")
current_im.write(f"{model_cfg['tmp_dir']}/{show}_seg.idmap.h5")
current_vec_per_segment= sidekit.nnet.xvector.extract_embeddings_per_speaker(idmap_name=f"{model_cfg['tmp_dir']}/{show}_seg.idmap.h5",
model_filename=f"{model_cfg['model_dir']}/best_xtractor.pt",
data_root_name=f"{model_cfg['wav_dir']}",
device=torch.device("cuda"),
transform_pipeline={},
num_thread=5)
return current_vec_per_cluster, current_vec_per_segment
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment