Commit 4653a0da authored by Félix Michaud's avatar Félix Michaud
Browse files

infos pour prog amélioré

parent adb5aebe
num_mix2
batch0
save_per_batchs500
out_threshold0.5
audLen24000
audRate48000
stft_frame1022
stft_hop256
beta10.9
modetrain
path./Article/shift
augment
species['crow', 'eastern_wood_pewee', 'flicker']
name_classes['crow', 'eastern_wood_pewee', 'flicker']
lr_sound0.001
batch_size16
devicecuda:0
starting_training_time1568882799.0132837
nb_classes3
lr_sounds1e-05
_augmentUnet5_3masks output for 3 species_base de donnees9species_for 3s training audio_2calls en noise avec seulement pitch shifting
This diff is collapsed.
num_mix2
batch0
save_per_batchs500
out_threshold0.5
audLen24000
audRate48000
stft_frame1022
stft_hop256
beta10.9
modetrain
path./Article/shift_stretch12
augment
species['crow', 'eastern_wood_pewee', 'flicker']
name_classes['crow', 'eastern_wood_pewee', 'flicker']
lr_sound0.001
batch_size16
devicecuda:0
starting_training_time1568884203.838934
nb_classes3
lr_sounds1e-05
_augmentUnet5_3masks output for 3 species_base de donnees9species_for 3s training audio_2calls en noise avec pitch shifting et time stretch(0.8-1.2), -10,0 SNR pour natural noise et 3,50 pour gaussian noise
This diff is collapsed.
num_mix2
batch0
save_per_batchs500
out_threshold0.5
audLen24000
audRate48000
stft_frame1022
stft_hop256
beta10.9
modetrain
path./Article/shift_stretch13
augment
species['crow', 'eastern_wood_pewee', 'flicker']
name_classes['crow', 'eastern_wood_pewee', 'flicker']
lr_sound0.001
batch_size16
devicecuda:0
starting_training_time1568883817.4891977
nb_classes3
lr_sounds1e-05
_augmentUnet5_3masks output for 3 species_base de donnees9species_for 3s training audio_2calls en noise avec pitch shifting et time stretch(0.7-1.3)
This diff is collapsed.
......@@ -94,7 +94,7 @@ def freq_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
#width of the band
fact1 = random.randint(int(rows/60), int((rows/80)))
fact1 = np.random.randint(int(rows/100), int(rows/60))
frame = np.zeros([fact1, columns])
#position of the band on the y axis
pos = random.randint(10, rows-fact1-1)
......@@ -108,7 +108,7 @@ def time_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
#width of the band
fact1 = random.randint(int(columns/60), int((columns/80)))
fact1 = np.random.randint(int(columns/100), int(columns/60))
frame = np.zeros([rows, fact1])
#position of the band on the x axis
pos = random.randint(10, columns-fact1-1)
......@@ -160,7 +160,7 @@ def _add_noise(signal, noise_file_name, snr, sample_rate):
noise = np.tile(noise, dup_factor)
if len(noise) != len(signal):
idx = np.random.randint(0, len(noise) - len(signal))
idx = np.random.randint(1, len(noise) - len(signal))
noise = noise[idx:idx + len(signal)]
# Compute energy of both signals
......@@ -176,36 +176,36 @@ def _add_noise(signal, noise_file_name, snr, sample_rate):
#create a new signal of length = max_time
def time_elong(sr, audio, max_time=2):
    """Embed *audio* at a random offset inside a silent signal of max_time seconds.

    The call is tapered with a Hann window so it blends into the silence,
    then zero-padded left and right to reach exactly sr * max_time samples.

    Parameters
    ----------
    sr : int
        Sample rate of *audio* in Hz.
    audio : np.ndarray
        1-D audio signal; must be at most sr * max_time samples long.
    max_time : int, optional
        Duration of the output signal in seconds (default 2).

    Returns
    -------
    np.ndarray
        Mono signal of length sr * max_time containing the padded call.

    Raises
    ------
    ValueError
        If *audio* is longer than the target length.  (The previous version
        only printed a warning and then crashed with a NameError on the
        return statement.)
    """
    target_len = sr * max_time
    dim = len(audio)
    if dim > target_len:
        raise ValueError('the new audio file has to be longer then the original')
    # Hann window tapers the edges of the call before it is pasted in.
    audio = audio * np.hanning(dim)
    # Random left offset.  Guard the degenerate case where the call (almost)
    # fills the target: np.random.randint requires high > low.
    max_offset = target_len - dim - 1
    blockl = np.random.randint(0, max_offset) if max_offset > 0 else 0
    blockr = blockl + dim
    left = np.zeros(blockl)
    right = np.zeros(target_len - blockr)
    new = np.concatenate((left, audio, right), axis=0)
    return librosa.to_mono(new)
class Dataset(data.Dataset):
'Characterizes a dataset for PyTorch'
def __init__(self, path, name_classes='', nb_class_noise=1, augmentation=True, path_background="./noises"):
    """Build the on-the-fly mixing dataset.

    Parameters
    ----------
    path : str
        Root directory of the class-labelled wav files.
    name_classes : iterable of str
        Names of the classes the network is trained on; these are removed
        from the pool of calls that can be added as unknown-call noise.
    nb_class_noise : int
        Number of extra calls mixed in as noise for each sample.
    augmentation : bool
        If True, natural background noises from *path_background* are
        loaded for noise augmentation.
    path_background : str
        Directory holding the background-noise wav files.
    """
    self.dict_classes = self.load_data(path)
    # A second, independent index of the same tree; the trained classes
    # are deleted from it so the remaining species serve as "unknown"
    # calls added as noise.
    self.dict_noise_calls = self.load_data(path)
    self.name_classes = name_classes
    #delete name of birds from the calls to add as noise
    self.create_noise_calls()
    self.nb_class_noise = nb_class_noise
    self.augmentation = augmentation
    self.path_background = path_background
    if self.augmentation:
        self.dict_noises = self.load_data(path_background)
def load_data(self, path, ext='wav'):
dict_classes = collections.OrderedDict()
......@@ -220,20 +220,30 @@ class Dataset(data.Dataset):
print("** WARNING ** No data loaded from " + path)
return dict_classes
def create_noise_calls(self):
    """Remove the trained classes from the noise-call pool.

    Calls from the species that remain in self.dict_noise_calls are later
    mixed into samples as unknown-call noise.
    """
    for name in self.name_classes:
        # pop with a default tolerates a class name that is absent from
        # the loaded tree (plain `del` raised KeyError).
        self.dict_noise_calls.pop(name, None)
def get_noise(self):
    """Return the path of a randomly chosen background-noise file.

    A noise class is drawn uniformly from self.dict_noises, then a file is
    drawn uniformly within that class.  Only the file path is returned;
    the caller passes it on to _add_noise, which loads the audio itself.
    (Leftover lines from an older version loaded the file here and
    referenced an undefined `audio` variable; they have been removed.)
    """
    #random natural noise augmentation
    classe_noise = random.choice(list(self.dict_noises.keys()))
    filename_noise = random.choice(self.dict_noises[classe_noise])
    return filename_noise
def pitch_shift(self, audio, sampling_rate):
    """Return *audio* pitch-shifted by a tiny random amount.

    NOTE(review): n_steps is drawn from +/-0.001 semitones, which is an
    almost inaudible shift — confirm this range is intentional.
    """
    #random pitch shifting
    n_steps = random.uniform(-0.001, 0.001)
    return librosa.effects.pitch_shift(audio, sampling_rate, n_steps=n_steps)
def time_stretch(self, audio):
    """Return *audio* time-stretched by a random factor in [0.8, 1.2]."""
    # factor < 1 slows the call down, > 1 speeds it up
    rate = random.uniform(0.8, 1.2)
    return librosa.effects.time_stretch(audio, rate)
#apply randomly at list 1 band on the spectrogram
def spec_augmentation(self, spec):
......@@ -254,31 +264,50 @@ class Dataset(data.Dataset):
def __len__(self):
    """Denotes the total number of samples."""
    # Samples are mixed on the fly, so the epoch length is a fixed virtual
    # count rather than the number of files (len(self.dict_classes)).
    return 150000
def load_classes(self, classe_name):
    """Pick one random audio file for each requested class.

    Parameters
    ----------
    classe_name : iterable of str
        Class names to draw from (keys of self.dict_classes).

    Returns
    -------
    list
        One [class_name, filename] pair per requested class.
    """
    files = []
    for cl in classe_name:
        #select a random file in the class
        filename = random.choice(self.dict_classes[cl])
        files.append([cl, filename])
    return files
def load_noise_files(self, nb_noise):
    """Pick *nb_noise* random call files to mix in as unknown-call noise.

    Classes are drawn uniformly at random, with replacement, from
    self.dict_noise_calls (the species the network is NOT trained on),
    then a file is drawn uniformly within each chosen class.

    Parameters
    ----------
    nb_noise : int
        Number of noise calls to pick.

    Returns
    -------
    list
        nb_noise pairs of [class_name, filename].
    """
    files = []
    for _ in range(nb_noise):
        #pick a random noise class
        classe_name = random.choice(list(self.dict_noise_calls.keys()))
        #select a random file in the class
        filename = random.choice(self.dict_noise_calls[classe_name])
        files.append([classe_name, filename])
    return files
'[class_name, filename, [mask], [magnitude], [phase] ]'
def __getitem__(self, index):
'Load audio file for each classe'
files = self.load_files(self.nb_classes)
files = self.load_classes(self.name_classes)
audio_mix = None
m_time = 3
for f in files:
audio_raw, rate = load_file(f[1])
audio = filt(audio_raw, rate)
audio_raw, sr = load_file(f[1])
audio_raw = self.pitch_shift(audio_raw, sr)
audio_raw = self.time_stretch(audio_raw)
new = time_elong(sr, audio_raw, max_time=m_time)
audio = filt(new, sr)
mag, phase = _stft(audio)
mag = create_mask(mag.squeeze(0).squeeze(0))
mask = threshold(mag)
mag = create_mask(mag.squeeze(0).squeeze(0))
mask = threshold(mag)
f.append(mask)
f.append(mag)
f.append(phase)
......@@ -286,9 +315,27 @@ class Dataset(data.Dataset):
audio_mix = audio
else:
audio_mix += audio
'Build mixed mask'
'add calls as noise from a random class'
classes_noise = self.load_noise_files(self.nb_class_noise)
for fn in classes_noise:
audio_raw, sr = load_file(fn[1])
new = time_elong(sr, audio_raw, m_time)
audion = filt(new, sr)
audio_mix += audion
'Augment Data'
if self.augmentation:
audio_mix = self.mixed_audio_augmentation(audio_mix, rate)
if random.randint(0, 1) == 1:
n_noise = self.get_noise()
snr = np.random.randint(-10, 0)
else:
n_noise = np.random.normal(loc=0, scale=1, size=(1, m_time*sr))
n_noise = librosa.to_mono(n_noise)
snr = np.random.randint(3, 50) #-10/5 for natural noise, 30/50
audio_mix = _add_noise(audio_mix, n_noise, snr, sr)
mag_mix, phase_mix = _stft(audio_mix)
mag_mix = mag_mix.squeeze(0).squeeze(0)
......
......@@ -46,7 +46,7 @@ def init_weights(m):
def unwrap_mask(infos):
#"for a kernel of 5"
gt_masks = torch.empty(args.batch_size, args.nb_classes, 256, 44, dtype=torch.float)
gt_masks = torch.empty(args.batch_size, args.nb_classes, 256, 259, dtype=torch.float)
for ii in range(args.batch_size):
for jj in range(args.nb_classes):
gt_masks[ii, jj] = infos[jj][2][ii]
......@@ -75,6 +75,29 @@ def save_arguments(args, path):
file1.close()
def evaluation(net, loader, args):
    """Run *net* over *loader* without gradients, logging BCE loss per batch.

    Appends one [loss, elapsed_minutes, batch_index] row per batch to
    args.path + "/loss_times_eval.csv".  Also sets args.out_threshold to
    0.4 as a side effect.
    """
    #no upgrade over the gradient
    torch.set_grad_enabled(False)
    criterion = nn.BCELoss()
    args.out_threshold = 0.4
    num_batch = 0
    for batch_data in loader:
        num_batch += 1
        # forward pass: mixed-spectrogram magnitude in, per-class masks out
        magmix = batch_data[0].to(args.device)
        masks = unwrap_mask(batch_data[2]).to(args.device)
        masks_pred = net(magmix)
        loss = criterion(masks_pred, masks)
        #writing of the Loss values and elapsed time for every batch
        batchtime = (time.time() - args.starting_training_time)/60 #minutes
        with open(args.path + "/loss_times_eval.csv", "a") as f:
            csv.writer(f).writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
def train(net, loader_train, optimizer, path, args):
torch.set_grad_enabled(True)
num_batch = 0
......@@ -100,36 +123,13 @@ def train(net, loader_train, optimizer, path, args):
# #writing of the Loss values and elapsed time for every batch
batchtime = (time.time() - args.starting_training_time)/60 #minutes
# #Writing of the elapsed time and loss for every batch
with open(path + "/loss_times.csv", "a") as f:
with open(args.path + "/loss_times.csv", "a") as f:
writer = csv.writer(f)
writer.writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
if ii%args.save_per_batchs == 0:
torch.save({ 'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict()},
path + '/Saved_models/model_batchs{}.pth.tar'.format(num_batch))
def evaluation(net, loader, args):
#no upgrade over the gradient
torch.set_grad_enabled(False)
num_batch = 0
criterion = nn.BCELoss()
args.out_threshold = 0.4
for ii, batch_data in enumerate(loader):
# forward pass
magmix = batch_data[0]
magmix = magmix.to(args.device)
masks = unwrap_mask(batch_data[2])
masks = masks.to(args.device)
num_batch += 1
masks_pred = net(magmix)
# #loss
loss = criterion(masks_pred, masks)
#Visualization
with open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "a") as f:
writer = csv.writer(f)
writer.writerow([str(loss.cpu().detach().numpy()), batchtime, num_batch])
args.path + '/Saved_models/model_batchs{}.pth.tar'.format(num_batch))
#***************************************************
......@@ -143,18 +143,18 @@ if __name__ == '__main__':
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.starting_training_time = time.time()
args.save_per_batchs = 500
args.nb_classes = 3
args.nb_classes = 3
args.name_classes = ['crow', 'eastern_wood_pewee', 'flicker']
args.mode = 'train'
args.lr_sounds = 1e-5
args.saved_model = '5_5000'
#model definition
net = UNet5(n_channels=1, n_classes=args.nb_classes)
net.apply(init_weights)
net = net.to(args.device)
# Set up optimizer
optimizer = create_optimizer(net, args)
args.path = "./Unet5/nm"
args._augment = 'nm'
args.path = "./Article/shift_stretch12"
args._augment = 'Unet5_3masks output for 3 species_base de donnees9species_for 3s training audio_2calls en noise avec pitch shifting et time stretch(0.8-1.2), -10,0 SNR pour natural noise et 3,50 pour gaussian noise'
###########################################################
################### TRAINING ##############################
###########################################################
......@@ -163,9 +163,9 @@ if __name__ == '__main__':
fichierLoss = open(args.path+"/loss_times.csv", "w")
fichierLoss.close()
#Dataset loading
root = './data_sound/trainset/'
root = './data_sound/trainset9/'
ext = '.wav'
train_classes = Dataset(root, nb_classes=args.nb_classes, path_background="./data_sound/noises/")
train_classes = Dataset(root, name_classes=args.name_classes, nb_class_noise =2, path_background="./data_sound/noises/")
loader_train = torch.utils.data.DataLoader(
train_classes,
batch_size = args.batch_size,
......@@ -177,27 +177,4 @@ if __name__ == '__main__':
torch.save({
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict()},
path+'/Saved_models/model_epoch{}.pth.tar'.format(epoch))
###########################################################
################### EVALUATION ############################
###########################################################
if args.mode == 'eval':
#OverWrite the Files for loss saving and time saving
fichierLoss = open("./losses/loss_eval/loss_times_eval{}.csv".format(args.saved_model), "w")
fichierLoss.close()
#Dataset loading
root = './data_sound/valset/'
ext = '.wav'
val_classes = Dataset(root, nb_classes=args.nb_classes, path_background="./data_sound/noises/")
#inisialization of the model from the saved model
checkpoint = torch.load('Saved_models2/model{}.pth.tar'.format(args.saved_model))
net.load_state_dict(checkpoint['model_state_dict'])
loader_eval = torch.utils.data.DataLoader(
val_classes,
batch_size = args.batch_size,
shuffle=True,
num_workers=20)
for epoch in range(0, 1):
evaluation(net, loader_eval, optimizer, args)
args.path+'/Saved_models/model_epoch{}.pth.tar'.format(epoch))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment