Commit b826fc7b authored by Félix Michaud's avatar Félix Michaud
Browse files

1 species training

parent 1ca52594
......@@ -85,11 +85,8 @@ def create_mask(mag):
# grid_warp = torch.from_numpy(warpgrid_log(264, 52, warp=True))
magim = F.grid_sample(magim, grid_warp)
return torch.from_numpy(np.flipud(magim).copy())
#kernel size:5, padding:3, image size:[256, 44]
#kernel size:3, padding:1, image size[256, 44]
#depends on the overlap of the stft
#create a band of zeros in the spectrogram on the frequency range
def freq_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
......@@ -104,6 +101,7 @@ def freq_mask(spec):
masked = spec * mask
return masked
#create a band of zeros in the spectrogram on the time range
def time_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
......@@ -177,7 +175,6 @@ def _add_noise(signal, noise_file_name, snr, sample_rate):
#create a new signal of length = max_time
def time_elong(sr, audio, max_time=2):
final_audio = np.zeros((1, sr*max_time))
if len(audio) > sr*max_time:
print('the new audio file has to be longer then the original')
else:
......@@ -193,11 +190,12 @@ def time_elong(sr, audio, max_time=2):
class Dataset(data.Dataset):
'Characterizes a dataset for PyTorch'
def __init__(self, path, nb_classes=1, nb_classes_noise=1, augmentation=True, path_background="./noises"):
def __init__(self, path, name_cl, nb_classes=1, nb_classes_noise=1, augmentation=True, path_background="./noises"):
self.dict_classes = self.load_data(path)
print(self.dict_classes)
self.nb_classes = nb_classes
self.nb_classes_noise = nb_classes_noise
self.name_cl = name_cl
self.augmentation = augmentation
self.path_background = path_background
if self.augmentation:
......@@ -216,30 +214,26 @@ class Dataset(data.Dataset):
print("** WARNING ** No data loaded from " + path)
return dict_classes
def mixed_audio_augmentation(self, audio, sampling_rate):
def get_noise(self):
classe_noise = random.randint(0, len(list(self.dict_noises.keys()))-1)
classe_noise = list(self.dict_noises.keys())[classe_noise]
#random natural noise augmentation
filename_noise = self.dict_noises[classe_noise][random.randint(0, len(self.dict_noises[classe_noise])-1)]
# audio_noise, sr = load_file(filename_noise)
# coeff = int(np.ceil(np.max(audio)*random.choice([1, 2, 3, 4, 5, 6, 7])))
# noisy_audio = audio + (audio_noise)*coeff
return filename_noise
def data_augment(audio, sampling_rate):
#random pitch shifting
# step_pitch = random.uniform(-0.001, 0.001)
# mod_audio = librosa.effects.pitch_shift(noisy_audio, sampling_rate, n_steps=step_pitch)
#ramdom time shifting
# final_audio = manipulate(noisy_audio, sampling_rate, 0.1, 'both')
return filename_noise
step_pitch = random.uniform(-0.001, 0.001)
augment_audio = librosa.effects.pitch_shift(audio, sampling_rate, n_steps=step_pitch)
return augment_audio
#apply randomly at list 1 band on the spectrogram
def spec_augmentation(self, spec):
n = random.randint(0, 2)
if n == 0:
t = random.randint(0, 1)
if t == 1:
spec = time_mask(spec)
if t == 0:
if n == 1:
spec = freq_mask(spec)
else:
for ii in range(n):
......@@ -259,20 +253,38 @@ class Dataset(data.Dataset):
files = []
for cl in range(nb_classes):
'Load audio file'
classe_name = list(self.dict_classes.keys())[cl]
#pick a class in the order of the dict
rand_class = random.randint(0, len(self.dict_classes)-1)
classe_name = list(self.dict_classes.keys())[rand_class]
print(classe_name, 'class_name')
#select a random file in the class
idx = int(random.random() * len(self.dict_classes[classe_name]) )
filename = self.dict_classes[classe_name][idx]
files.append([classe_name, filename])
return files
def load_class(self, classe_name):
    """Pick one random audio file from the class `classe_name`.

    Parameters
    ----------
    classe_name : key into self.dict_classes (a class name whose value is
        the list of audio file paths for that class).

    Returns
    -------
    list
        A one-element list [[classe_name, filename]], shaped like the
        output of load_files() so callers can iterate it the same way.
    """
    files = []
    # random.randrange draws an integer index directly; it replaces the
    # old int(random.random() * len(...)) float round-trip idiom.
    idx = random.randrange(len(self.dict_classes[classe_name]))
    filename = self.dict_classes[classe_name][idx]
    files.append([classe_name, filename])
    return files
'[class_name, filename, [mask], [magnitude], [phase] ]'
def __getitem__(self, index):
'Load audio file for each classe'
files = self.load_files(self.nb_classes)
file = self.load_class(self.name_cl)
print(file, 'file')
audio_mix = None
max_time = 2
for f in files:
for f in file:
audio_raw, sr = load_file(f[1])
audio_raw = self.data_augment(audio_raw, sr)
new = time_elong(sr, audio_raw, max_time)
audio = filt(new, sr)
mag, phase = _stft(audio)
......@@ -284,27 +296,30 @@ class Dataset(data.Dataset):
if audio_mix is None:
audio_mix = audio
else:
audio_mix += audio
#add calls as noise to the longer file
audio_mix += audio
'add calls as noise from a random class'
classes_noise = self.load_files(self.nb_classes_noise)
noisy_mix = None
for fn in classes_noise:
audio_raw, sr = load_file(fn[1])
new = time_elong(sr, audio_raw, max_time)
audion = filt(new, sr)
audio_mix += audion
'Build mixed mask'
'Randomly add either gaussian noise or natural noise'
if self.augmentation:
# file_noise = self.mixed_audio_augmentation(audio_mix, sr)
n_noise = np.random.normal(loc=0, scale=1, size=(1, max_time*sr))
n_noise = librosa.to_mono(n_noise)
snr = np.random.randint(-10, 5) #-10/5 for natural noise, 30/50
audio_mix = _add_noise(audio_mix, n_noise, snr, sr)
if random.randint(0, 1) == 1:
n_noise = self.get_noise()
snr = np.random.randint(-10, 0)
else:
n_noise = np.random.normal(loc=0, scale=1, size=(1, max_time*sr))
n_noise = librosa.to_mono(n_noise)
snr = np.random.randint(7, 20) #-10/5 for natural noise, 30/50
audio_mix = _add_noise(audio_mix, n_noise, snr, sr)
mag_mix, phase_mix = _stft(audio_mix)
mag_mix = mag_mix.squeeze(0).squeeze(0)
mag_mix = self.spec_augmentation(mag_mix)
mags_mix = create_im(mag_mix)
mags_mix = mags_mix.squeeze(0)
return [mags_mix, phase_mix, files]
return [mags_mix, phase_mix, file]
......@@ -7,7 +7,6 @@ Created on Wed Jun 26 22:14:38 2019
"""
import os
import random
import time
import fnmatch
import csv
......@@ -16,14 +15,12 @@ from arguments import ArgParser
from unet import UNet5
import torch.nn as nn
#from tensorboardX import SummaryWriter
from matplotlib import image as mpimg
from Dataloader import Dataset
import matplotlib
from Dataloader_solo import Dataset
import numpy as np
import collections
import scipy
#organize the name files according to their number
import librosa
#organize the name files according to their number
def create_list(path, ext):
list_names = []
for root, dirnames, filenames in os.walk(path):
......@@ -100,36 +97,36 @@ def train(net, loader_train, optimizer, path, args):
# #writing of the Loss values and elapsed time for every batch
batchtime = (time.time() - args.starting_training_time)/60 #minutes
# #Writing of the elapsed time and loss for every batch
with open(path + "/loss_times.csv", "a") as f:
with open(args.path + "/loss_times.csv", "a") as f:
writer = csv.writer(f)
writer.writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
if ii%args.save_per_batchs == 0:
torch.save({ 'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict()},
path + '/Saved_models/model_batchs{}.pth.tar'.format(num_batch))
args.path + '/Saved_models/model_batchs{}.pth.tar'.format(num_batch))
def evaluation(net, loader, args):
    """Run one evaluation pass of `net` over `loader`.

    For every batch: forward the mixed magnitude spectrogram through the
    network, compare the predicted masks against the ground-truth masks
    with binary cross-entropy, and append [loss, elapsed minutes, batch
    index] to a per-model CSV log file.

    Parameters
    ----------
    net : the mask-prediction network (e.g. UNet5).
    loader : torch DataLoader yielding batches where batch_data[0] is the
        mixed magnitude and batch_data[2] holds the mask/file info
        consumed by unwrap_mask (defined elsewhere in this file).
    args : parsed arguments; reads device, saved_model and
        starting_training_time, and sets out_threshold.
    """
    # No gradient bookkeeping during evaluation.
    torch.set_grad_enabled(False)
    num_batch = 0
    criterion = nn.BCELoss()
    args.out_threshold = 0.4
    for ii, batch_data in enumerate(loader):
        # forward pass
        magmix = batch_data[0]
        magmix = magmix.to(args.device)
        masks = unwrap_mask(batch_data[2])
        masks = masks.to(args.device)
        num_batch += 1
        masks_pred = net(magmix)
        # loss
        loss = criterion(masks_pred, masks)
        # BUG FIX: `batchtime` was used below but never defined in this
        # function (it only existed in train()), so the first writerow
        # raised a NameError. Compute it the same way train() does.
        batchtime = (time.time() - args.starting_training_time) / 60  # minutes
        # Append loss/time for this batch to the evaluation log.
        with open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "a") as f:
            writer = csv.writer(f)
            writer.writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
#def evaluation(net, loader, args):
##no upgrade over the gradient
# torch.set_grad_enabled(False)
# num_batch = 0
# criterion = nn.BCELoss()
# args.out_threshold = 0.4
# for ii, batch_data in enumerate(loader):
# # forward pass
# magmix = batch_data[0]
# magmix = magmix.to(args.device)
# masks = unwrap_mask(batch_data[2])
# masks = masks.to(args.device)
# num_batch += 1
# masks_pred = net(magmix)
## #loss
# loss = criterion(masks_pred, masks)
# #Visualization
# with open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "a") as f:
# writer = csv.writer(f)
# writer.writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
#***************************************************
......@@ -139,11 +136,11 @@ if __name__ == '__main__':
# arguments
parser = ArgParser()
args = parser.parse_train_arguments()
args.batch_size = 16
args.batch_size = 2
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.starting_training_time = time.time()
args.save_per_batchs = 500
args.nb_classes = 3
args.nb_classes = 1
args.mode = 'train'
args.lr_sounds = 1e-5
args.saved_model = '5_5000'
......@@ -163,9 +160,9 @@ if __name__ == '__main__':
fichierLoss = open(args.path+"/loss_times.csv", "w")
fichierLoss.close()
#Dataset loading
root = './data_sound/trainset/'
root = './data_sound/trainset'
ext = '.wav'
train_classes = Dataset(root, nb_classes=args.nb_classes, path_background="./data_sound/noises/")
train_classes = Dataset(root,'flicker', nb_classes=args.nb_classes, nb_classes_noise=3, path_background="./data_sound/noises/")
loader_train = torch.utils.data.DataLoader(
train_classes,
batch_size = args.batch_size,
......@@ -177,27 +174,27 @@ if __name__ == '__main__':
torch.save({
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict()},
path+'/Saved_models/model_epoch{}.pth.tar'.format(epoch))
args.path+'/Saved_models/model_epoch{}.pth.tar'.format(epoch))
###########################################################
################### EVALUATION ############################
###########################################################
if args.mode == 'eval':
#OverWrite the Files for loss saving and time saving
fichierLoss = open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "w")
fichierLoss.close()
#Dataset loading
root = './data_sound/valset/'
ext = '.wav'
val_classes = Dataset(root, nb_classes=args.nb_classes, path_background="./data_sound/noises/")
#inisialization of the model from the saved model
checkpoint = torch.load('Saved_models2/model{}.pth.tar'.format(args.saved_model))
net.load_state_dict(checkpoint['model_state_dict'])
loader_eval = torch.utils.data.DataLoader(
val_classes,
batch_size = args.batch_size,
shuffle=True,
num_workers=20)
for epoch in range(0, 1):
evaluation(net, loader_eval, optimizer, args)
# if args.mode == 'eval':
# #OverWrite the Files for loss saving and time saving
# fichierLoss = open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "w")
# fichierLoss.close()
# #Dataset loading
# root = './data_sound/valset/'
# ext = '.wav'
# val_classes = Dataset(root, nb_classes=args.nb_classes, path_background="./data_sound/noises/")
# #inisialization of the model from the saved model
# checkpoint = torch.load('Saved_models2/model{}.pth.tar'.format(args.saved_model))
# net.load_state_dict(checkpoint['model_state_dict'])
#
# loader_eval = torch.utils.data.DataLoader(
# val_classes,
# batch_size = args.batch_size,
# shuffle=True,
# num_workers=20)
#
# for epoch in range(0, 1):
# evaluation(net, loader_eval, optimizer, args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment