Commit d857996d authored by Anthony Larcher's avatar Anthony Larcher
Browse files

First commit with a clean implementation of cross-show clustering with HAL (human-assisted learning)

parent 56e053b3
......@@ -34,7 +34,11 @@ import scipy
import sidekit
import torch
from ..user_simulation import MessageToUser
from ..user_simulation import Request
from s4d import Diar
from .utils import s4d_to_allies
from .utils import rename_models
from .utils import concat_statservers
def cross_show(previous_vec,
               previous_diar,
               within_vec,
               within_diar,
               th_x):
    """
    Merge speaker clusters of the current show into previously seen ones by
    running a fully automatic HAC (complete linkage) between previous and
    within speaker models, without any human interaction.

    :param previous_vec: StatServer of accumulated speaker vectors from previous shows
    :param previous_diar: Diar object with the accumulated previous diarization
    :param within_vec: StatServer of speaker vectors for the current show
    :param within_diar: Diar object for the current show
    :param th_x: clustering distance threshold

    :return: tuple (new_previous_vec, new_previous_diar, within_diar) with
        cluster names unified across shows
    """
    within_vec_backup = copy.deepcopy(within_vec)
    previous_vec_backup = copy.deepcopy(previous_vec)

    # Get the mean vector per model for previous and within shows
    within_vec_mean = within_vec.mean_stat_per_model()
    previous_vec_mean = previous_vec.mean_stat_per_model()

    # Compute the distance matrix to perform HAC between previous and within
    # clusters.  The matrix (and the threshold) are shifted so that the
    # normalization can enable/disable clustering between previous/previous
    # and within/within clusters.
    ll_vec, scores = compute_distance_cross_show(previous_vec_mean, previous_diar, within_vec_mean)

    scores.scoremat += 1.
    th_x += 1.
    numpy.fill_diagonal(scores.scoremat, 0.0)
    squareform_plda = scipy.spatial.distance.squareform(scores.scoremat)
    Z = fastcluster.linkage(squareform_plda, method='complete', preserve_input=True)
    T = scipy.cluster.hierarchy.fcluster(Z, th_x, 'distance')

    # Don't allow to modify the names of previously existing clusters:
    # build a dictionary with old_model_name as key and new_cluster as value,
    # keeping the first model name seen for each cluster index.
    cluster_dict = dict()
    clusters_by_index = dict()
    for ii in range(T.shape[0]):
        if T[ii] not in clusters_by_index:
            clusters_by_index[T[ii]] = ll_vec.modelset[ii]
        cluster_dict[ll_vec.modelset[ii]] = clusters_by_index[T[ii]]

    # Concatenate previous_vec and within_vec
    new_previous_vec = concat_statservers(previous_vec_backup, within_vec_backup)
    new_previous_diar = copy.deepcopy(previous_diar)
    new_previous_diar.segments += within_diar.segments

    # Rename the models and segment clusters according to the new clustering
    for ii, mod in enumerate(new_previous_vec.modelset):
        new_previous_vec.modelset[ii] = cluster_dict[mod]
    for ii, seg in enumerate(new_previous_diar.segments):
        new_previous_diar.segments[ii]['cluster'] = cluster_dict[seg['cluster']]
    for ii, seg in enumerate(within_diar.segments):
        within_diar.segments[ii]['cluster'] = cluster_dict[seg['cluster']]

    # NOTE(review): the diff residue returned a 4th element (None); the
    # in-file caller unpacks 3 values, so return exactly 3 here.
    return new_previous_vec, new_previous_diar, within_diar
def cross_show_HAL(previous_vec,
                   previous_diar,
                   within_vec,
                   within_diar,
                   th_x,
                   lim,
                   user,
                   file_info):
    """
    Merge speaker clusters of the current show into previously seen ones with
    a Human Assisted Learning (HAL) loop: for each new speaker whose closest
    previous speaker is under the threshold, ask the user whether the two
    longest segments belong to the same speaker and link the clusters when
    the answer is positive.

    :param previous_vec: StatServer of accumulated speaker vectors from previous shows
    :param previous_diar: Diar object with the accumulated previous diarization
    :param within_vec: StatServer of speaker vectors for the current show
    :param within_diar: Diar object for the current show
    :param th_x: distance threshold under which a question is worth asking
    :param lim: maximum number of questions allowed per current speaker
    :param user: user simulation object answering the questions
    :param file_info: file information forwarded to the user in each message

    :return: tuple (new_previous_vec, new_previous_diar, within_diar) with
        cluster names unified across shows
    """
    within_vec_backup = copy.deepcopy(within_vec)
    previous_vec_backup = copy.deepcopy(previous_vec)

    # Get the mean vector per model for previous and within shows
    within_vec_mean = within_vec.mean_stat_per_model()
    previous_vec_mean = previous_vec.mean_stat_per_model()

    ll_vec, scores = compute_distance_cross_show(previous_vec_mean, previous_diar, within_vec_mean)

    linkage_speaker_dict = {}
    # For each speaker of the current show (indexed after the previous
    # speakers in ll_vec / scores)
    for ii in range(previous_vec_mean.modelset.shape[0], ll_vec.modelset.shape[0]):
        # Current name of the speaker under investigation
        current_speaker_name = scores.modelset[ii]
        # Sort previous speakers from the closest to the farthest
        sorted_idx = numpy.argsort(scores.scoremat[ii][:previous_vec_mean.modelset.shape[0]])
        # Only start questioning if the closest previous speaker is close enough
        if scores.scoremat[ii, sorted_idx[0]] < th_x:
            # Keep asking while the user agrees to answer, fewer than "lim"
            # questions have been asked and candidate speakers remain
            keep_questioning = True
            previous_spk_idx = 0
            already_asked_questions = 0
            already_questioned_speakers = []
            while keep_questioning \
                    and already_asked_questions < lim \
                    and previous_spk_idx < previous_vec_mean.modelset.shape[0]:
                # Name of the previous speaker being investigated
                previous_spk_name = scores.modelset[sorted_idx[previous_spk_idx]]
                if previous_spk_name not in already_questioned_speakers:
                    # Time of the middle of the longest segment of the
                    # current speaker in within_diar
                    tmp_diar = copy.deepcopy(within_diar)
                    tmp_diar.filter("cluster", "==", current_speaker_name).add_duration().sort("duration", reverse=True)
                    t1 = (tmp_diar[0]["stop"] - tmp_diar[0]["start"]) // 2
                    # Time of the middle of the longest segment of the
                    # previous speaker in previous_diar
                    tmp_diar = copy.deepcopy(previous_diar)
                    tmp_diar.filter("cluster", "==", previous_spk_name).add_duration().sort("duration", reverse=True)
                    show2 = tmp_diar[0]["show"]
                    t2 = (tmp_diar[0]["stop"] - tmp_diar[0]["start"]) // 2
                    # Ask the question to the user
                    complete_hyp = copy.deepcopy(previous_diar).append_diar(within_diar)
                    message_to_user = MessageToUser(file_info,
                                                    s4d_to_allies(complete_hyp),
                                                    Request('same', t1, t2, show2))
                    keep_questioning, answer = user.validate(message_to_user)
                    # Increment the question counter
                    already_asked_questions += 1
                    if answer.answer:
                        # The current speaker is linked to this previous one;
                        # no further question is needed for this speaker.
                        # BUGFIX: the original never stopped nor advanced on a
                        # positive answer, re-asking the same question.
                        linkage_speaker_dict[current_speaker_name] = previous_spk_name
                        break
                    else:
                        already_questioned_speakers.append(previous_spk_name)
                        # BUGFIX: the original had a bare "previous_spk_idx"
                        # statement here (a no-op), so a negative answer never
                        # moved on to the next candidate speaker
                        previous_spk_idx += 1
                else:
                    previous_spk_idx += 1

    # Concatenate previous_vec and within_vec
    new_previous_vec = concat_statservers(previous_vec_backup, within_vec_backup)
    new_previous_diar = copy.deepcopy(previous_diar)
    new_previous_diar.segments += within_diar.segments

    # Rename linked models and segment clusters after the previous speakers,
    # never modifying the names of previously existing clusters
    for ii, mod in enumerate(new_previous_vec.modelset):
        if mod in linkage_speaker_dict:
            new_previous_vec.modelset[ii] = linkage_speaker_dict[mod]
    for ii, seg in enumerate(new_previous_diar.segments):
        if seg['cluster'] in linkage_speaker_dict:
            new_previous_diar.segments[ii]['cluster'] = linkage_speaker_dict[seg['cluster']]
    for ii, seg in enumerate(within_diar.segments):
        if seg['cluster'] in linkage_speaker_dict:
            within_diar.segments[ii]['cluster'] = linkage_speaker_dict[seg['cluster']]

    return new_previous_vec, new_previous_diar, within_diar
def allies_cross_show_clustering(show_idx,
                                 archive_vectors,
                                 current_vec,
                                 current_diar,
                                 th_x,
                                 lim,
                                 user,
                                 file_info,
                                 uem=None,
                                 ref=None,
                                 human_in_the_loop=False):
    """
    Perform cross-show speaker clustering for one show of a sequence,
    linking the speakers of the current show to the archive of speakers
    accumulated over the previous shows.

    :param show_idx: index of the current show in the sequence (0 = first)
    :param archive_vectors: dict holding "previous_vec" and "previous_diar",
        updated in place with the result of this show
    :param current_vec: StatServer of speaker vectors for the current show
    :param current_diar: Diar object for the current show
    :param th_x: clustering / questioning distance threshold
    :param lim: maximum number of questions per speaker (HAL mode only)
    :param user: user simulation object (HAL mode only)
    :param file_info: file information forwarded to the user messages
    :param uem: unused, kept for backward compatibility with older callers
    :param ref: unused, kept for backward compatibility with older callers
    :param human_in_the_loop: when True use cross_show_HAL (interactive),
        otherwise use the automatic cross_show clustering

    :return: tuple (archive_vectors, current_diar) with unified cluster names
    """
    if show_idx == 0:
        # First show of the sequence: nothing to link against yet,
        # simply archive the current vectors and diarization
        archive_vectors["previous_vec"] = copy.deepcopy(current_vec)
        archive_vectors["previous_diar"] = current_diar
    else:
        if human_in_the_loop:
            # BUGFIX: the original also passed uem= and ref= keywords that
            # cross_show_HAL does not accept, raising a TypeError
            previous_vec, previous_diar, current_diar = cross_show_HAL(
                previous_vec=archive_vectors["previous_vec"],
                previous_diar=archive_vectors["previous_diar"],
                within_vec=current_vec,
                within_diar=current_diar,
                th_x=th_x,
                lim=lim,
                user=user,
                file_info=file_info)
        else:
            previous_vec, previous_diar, current_diar = cross_show(
                previous_vec=archive_vectors["previous_vec"],
                previous_diar=archive_vectors["previous_diar"],
                within_vec=current_vec,
                within_diar=current_diar,
                th_x=th_x)
        archive_vectors["previous_vec"] = previous_vec
        archive_vectors["previous_diar"] = previous_diar

    return archive_vectors, current_diar
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment