Commit 5de3290e authored by Anthony Larcher's avatar Anthony Larcher
Browse files

sad

parents f5bde857 9cfbec14
......@@ -167,7 +167,9 @@ if CUDA:
from sidekit.nnet import xtrain
from sidekit.nnet import extract_idmap
from sidekit.nnet import extract_parallel
from sidekit.nnet import SAD_RNN
else:
print("Don't import Torch")
if SIDEKIT_CONFIG["mpi"]:
found_mpi4py = importlib.find_loader('mpi4py') is not None
......
......@@ -24,7 +24,8 @@ class SAD_Dataset(Dataset):
train_list = {}
with open(mdtm_file, 'r') as f:
for line in f:
lines = [l for l in f]
for line in lines[:500]:
show, _, start, dur, _, _, _, _ = line.rstrip().split()
if show not in train_list:
train_list[show] = []
......@@ -49,9 +50,15 @@ class SAD_Dataset(Dataset):
self.vad = {}
self.segments = []
#speech_only_segments = []
#speech_nonspeech_segments = []
for show in sorted(train_list.keys()):
features, _ = features_server.load(show)
labels = numpy.zeros((len(features), 1), dtype=numpy.int)
speech_only_segments = []
speech_nonspeech_segments = []
if show in train_list and show in uem_list:
for seg in train_list[show]:
......@@ -65,10 +72,36 @@ class SAD_Dataset(Dataset):
start, stop = 0, len(features)
for i in range(start, min(stop, len(features)) - self.duration, self.step):
self.segments.append((show, i, i + self.duration))
# cree les segments ne contenant QUE de la parole (sans recouvrement)
for i in range(start, min(stop, len(features)) - self.duration, self.duration):
if labels[i:i+self.duration].sum() == self.duration:
speech_only_segments.append((show, i, i + self.duration))
# cree les segments contenant de la PAROLE ET DU SILENCE (avec recouvrement pour equilibrer les classes)
for i in range(start, min(stop, len(features)) - self.duration, self.step):
if labels[i:i+self.duration].sum() < self.duration - 1:
speech_nonspeech_segments.append((show, i, i + self.duration))
#for i in range(start, min(stop, len(features)) - self.duration, self.step):
# self.segments.append((show, i, i + self.duration))
tmp = speech_only_segments + speech_nonspeech_segments
random.shuffle(tmp)
self.segments += tmp
print("Show {}, ratio S/NS = {}".format(show, len(speech_only_segments)/(len(speech_nonspeech_segments) + len(speech_only_segments))))
#tmp = speech_only_segments + speech_nonspeech_segments
#if shuffle:
# print("taille de tmp: {}".format(len(tmp)))
# random.shuffle(tmp)
# print("taille de tmp: {}".format(len(tmp)))
# print(tmp[0])
# for t in tmp:
# self.segments.append(t)
#self.segments = tmp.copy()
self.input_size = features.shape[1]
if shuffle:
random.shuffle(self.segments)
print("Final ratio S/NS = {}".format(len(speech_only_segments)/(len(speech_nonspeech_segments) + len(speech_only_segments))))
self.len = len(self.segments) // self.batch_size
......@@ -290,7 +323,6 @@ class SAD_RNN():
:param features_server: a sidekit FeaturesServer object
:param model_file_format: file format to save the model. The format uses the current epoch
"""
self.model.to(device)
criterion = nn.BCELoss()
optimizer = optim.RMSprop(self.model.parameters())
......@@ -303,9 +335,11 @@ class SAD_RNN():
for batch_idx, (X, Y) in enumerate(training_set):
batch_loss = self._fit_batch(optimizer, criterion, X, Y)
losses[epoch].append(batch_loss)
sys.stdout.write("\rEpoch {}/{}, loss {:.5f}".format(
print("Epoch {}/{}, loss {:.5f}\n".format(
epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
sys.stdout.flush()
#sys.stdout.write("\rEpoch {}/{}, loss {:.5f}".format(
# epoch + 1, nb_epochs, numpy.mean(losses[epoch])))
#sys.stdout.flush()
it += 1
torch.save(self.model.state_dict(), model_file_format.format(epoch+1))
......
......@@ -230,7 +230,9 @@ def xtrain(args):
# Decrease learning rate after every epoch
#args.lr = args.lr * 0.9
#args.lr = args.lr * 0.9
args.lr = args.lr * 0.9
print(" Decrease learning rate: {}".format(args.lr))
def train_epoch(epoch, args, initial_model_file_name):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment