Commit b9dd2052 authored by Félix Michaud

spencer comparison

parent 35171878
num_mix: 2
batch: 0
save_per_batchs: 500
out_threshold: 0.5
audLen: 24000
audRate: 48000
stft_frame: 1022
stft_hop: 256
beta1: 0.9
mode: train
path: ./Article/training3
augment
species: ['crow', 'eastern_wood_pewee']
name_classes: ['crow', 'eastern_wood_pewee']
lr_sound: 0.001
batch_size: 16
device: cuda:0
starting_training_time: 1569422085.0147443
nb_classes: 2
lr_sounds: 1e-05
_augment: Unet5 output for 2 species_database of 9 species_for 3s training audio_3 calls as noise with pitch shifting and time stretch (0.7-1.3), -10 to 0 SNR for natural noise and 3 to 50 for gaussian noise
num_mix: 2
batch: 0
save_per_batchs: 500
out_threshold: 0.5
audLen: 24000
audRate: 48000
stft_frame: 1022
stft_hop: 256
beta1: 0.9
mode: train
path: ./Article/training4
augment
species: ['crow', 'eastern_wood_pewee']
name_classes: ['crow', 'eastern_wood_pewee']
lr_sound: 0.001
batch_size: 16
device: cuda:0
starting_training_time: 1569422126.3140655
nb_classes: 2
lr_sounds: 1e-05
_augment: Unet5 output for 2 species_database of 9 species_for 3s training audio_4 calls as noise with pitch shifting and time stretch (0.7-1.3), -10 to 0 SNR for natural noise and 3 to 50 for gaussian noise
num_mix: 2
batch: 0
save_per_batchs: 500
out_threshold: 0.5
audLen: 24000
audRate: 48000
stft_frame: 1022
stft_hop: 256
beta1: 0.9
mode: train
path: ./Article/training5
augment
species: ['crow', 'eastern_wood_pewee']
name_classes: ['crow', 'eastern_wood_pewee']
lr_sound: 0.001
batch_size: 16
device: cuda:0
starting_training_time: 1569422239.9374652
nb_classes: 2
lr_sounds: 1e-05
_augment: Unet5 output for 2 species_database of 9 species_for 3s training audio_5 calls as noise with pitch shifting and time stretch (0.7-1.3), -10 to 0 SNR for natural noise and 3 to 50 for gaussian noise
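As a rough, hedged sanity check (assuming _stft uses n_fft = stft_frame and hop_length = stft_hop, and the 22050 Hz resampling done in load_file below), these settings relate to the [256, 44] spectrogram images mentioned in the code comments as follows:
audLen, audRate = 24000, 48000
stft_frame, stft_hop = 1022, 256
call_duration = audLen / audRate                     # 0.5 s per call
freq_bins = stft_frame // 2 + 1                      # 512 linear-frequency bins, later warped to 256 rows by create_mask
frames = 1 + int(call_duration * 22050 // stft_hop)  # ~44 time frames
print(call_duration, freq_bins, frames)              # 0.5 512 44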
......@@ -14,7 +14,7 @@ def load_file(file):
audio_raw, rate = librosa.load(file, sr=22050, mono=True)
return audio_raw, rate
#cleaning the input audio
def filt(audio_raw, rate):
band = [800, 7000] # Desired pass band, Hz
trans_width = 100 # Width of transition from pass band to stop band, Hz
......@@ -40,16 +40,19 @@ def _stft(audio):
tch_phase[0, 0, :, :] = torch.from_numpy(phase)
return tch_mag, tch_phase
#threshold to clean the spectrogram and get the reference mask
#returns one torch matrix with the dimensions of the STFT
def threshold(mag, mag_noise):
gt_mask = torch.zeros(mag.shape[2], mag.shape[3])
av = np.mean(mag[0, 0].numpy())
vari = np.var(mag[0, 0].numpy())
param = av + np.sqrt(vari)*2 #threshold
gt_mask = (mag[0, 0] > param).float()
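#keep only the bins where the thresholded call magnitude also exceeds the
#noise magnitude passed in as mag_noise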
final_mask = (gt_mask*mag > mag_noise).float()
return final_mask
#create the sampling grid for the image
#if warp, go to a Mel-like (log) frequency scale; if not, go from Mel back to the linear scale
def warpgrid_log(HO, WO, warp=True):
# meshgrid
x = np.linspace(-1, 1, WO)
......@@ -74,7 +77,6 @@ def create_im(mag):
m = torch.mean(magim)
magim = magim - m
grid_warp = torch.from_numpy(warpgrid_log(256, magim.shape[3], warp=True))
# grid_warp = torch.from_numpy(warpgrid_log(384, 192, warp=True))
magim = F.grid_sample(magim, grid_warp)
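#grid_sample resamples the linear-frequency magnitude onto the 256-row
#warped (log/Mel-like) frequency grid built by warpgrid_log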
return torch.from_numpy(np.flipud(magim).copy())
......@@ -82,14 +84,11 @@ def create_im(mag):
def create_mask(mag):
magim = mag.unsqueeze(0).unsqueeze(0)
grid_warp = torch.from_numpy(warpgrid_log(256, magim.shape[3], warp=True))
# grid_warp = torch.from_numpy(warpgrid_log(264, 52, warp=True))
magim = F.grid_sample(magim, grid_warp)
return torch.from_numpy(np.flipud(magim).copy())
#kernel size:5, padding:3, image size:[256, 44]
#kernel size:3, padding:1, image size[256, 44]
#depends on the overlap of the stft
#remove a band in the horizontal direction in the spectrogram
def freq_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
......@@ -104,6 +103,7 @@ def freq_mask(spec):
masked = spec * mask
return masked
#remove a band in the vertical direction in the spectrogram
def time_mask(spec):
fbank_size = np.shape(spec)
rows , columns = fbank_size[0], fbank_size[1]
......@@ -118,7 +118,7 @@ def time_mask(spec):
masked = spec * mask
return masked
#randomly shift the calls within the time-domain audio signal
def manipulate(data, sampling_rate, shift_max, shift_direction):
shift = np.random.randint(sampling_rate * shift_max)
if shift_direction == 'right':
......@@ -182,6 +182,7 @@ def time_elong(sr, audio, max_time=2):
print('the new audio file has to be longer than the original')
else:
dim = len(audio)
#apply a Hann window to avoid discontinuities at the edges of the inserted call
audio = audio*np.hanning(dim)
blockl = np.random.randint(0, sr*max_time -dim-1)
blockr = blockl + dim
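#the Hann-windowed call is placed at the random slice [blockl:blockr] of a
#longer buffer (assumed to be max_time seconds long and zero elsewhere)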
......@@ -194,37 +195,37 @@ def time_elong(sr, audio, max_time=2):
class Dataset(data.Dataset):
'Characterizes a dataset for PyTorch'
def __init__(self, path, name_classes='', nb_class_noise =1, augmentation=True, path_background="./noises"):
self.dict_classes = self.load_data(path)
self.dict_noise_calls = self.load_data(path)
self.name_classes = name_classes
#delete the names of the trained species from the calls that can be added as noise
self.create_noise_calls()
self.nb_class_noise = nb_class_noise
self.augmentation = augmentation
self.path_background = path_background
if self.augmentation:
self.dict_noises = self.load_data(path_background)
def load_data(self, path, ext='wav'):
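#expects a directory layout of path/<class_name>/<file>.wav and returns an
#OrderedDict mapping each class name to the list of its audio files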
dict_classes = collections.OrderedDict()
for root, dirnames, filenames in os.walk(path):
for filename in fnmatch.filter(filenames, '*' + ext):
classe = root.split("/")[-1]
if classe in dict_classes.keys():
dict_classes[classe].append(os.path.join(root, filename))
else:
dict_classes[classe] = [os.path.join(root, filename)]
if len(list(dict_classes.keys() )) == 0:
print("** WARNING ** No data loaded from " + path)
return dict_classes
#remove names of the classes on which the network is training from the dataset
#used to add unknown calls as noise
def create_noise_calls(self):
for name in self.name_classes:
del self.dict_noise_calls[name]
def get_noise(self):
......@@ -241,19 +242,19 @@ class Dataset(data.Dataset):
return augment_audio
def time_stretch(self, audio):
speed_factor = random.uniform(0.7, 1.3)
return librosa.effects.time_stretch(audio, speed_factor)
#randomly apply at least 1 mask band to the spectrogram
def spec_augmentation(self, spec):
n = random.randint(0, 2)
if n == 0:
t = random.randint(0, 1)
if t == 1:
spec = time_mask(spec)
if t == 0:
spec = freq_mask(spec)
else:
for ii in range(n):
spec = time_mask(spec)
......@@ -262,34 +263,34 @@ class Dataset(data.Dataset):
def __len__(self):
'Denotes the total number of samples'
# return len(self.dict_classes)
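#mixtures are generated on the fly, so the epoch length is a fixed nominal
#value rather than the number of files on disk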
nb_samples = 150000
return nb_samples
def load_classes(self, classe_name):
files = []
'Load audio file'
for cl in classe_name:
#select a random file in the class
idx = int(random.random() * len(self.dict_classes[cl]) )
filename = self.dict_classes[cl][idx]
files.append([cl, filename])
return files
def load_noise_files(self, nb_noise):
files = []
for cl in range(nb_noise):
'Load audio file'
#pick a random class from the noise-call dict (indexed in dict order)
rand_class = random.randint(0, len(self.dict_noise_calls)-1)
classe_name = list( self.dict_noise_calls.keys())[rand_class]
#select a random file in the class
idx = int(random.random() * len( self.dict_noise_calls[classe_name]))
filename = self.dict_noise_calls[classe_name][idx]
files.append([classe_name, filename])
return files
......@@ -298,45 +299,53 @@ class Dataset(data.Dataset):
'Load an audio file for each class'
files = self.load_classes(self.name_classes)
audio_mix = None
#audio length (s)
m_time = 3
################## Creation of the noise #############################
'add calls as noise from a random class'
classes_noise = self.load_noise_files(self.nb_class_noise)
audio_noise = None
for fn in classes_noise:
audio_raw, sr = load_file(fn[1])
new = time_elong(sr, audio_raw, m_time)
audion = filt(new, sr)
#accumulate all the noise calls into a single noise signal
audio_noise = audion if audio_noise is None else audio_noise + audion
'Augment Data'
if self.augmentation:
if random.randint(0, 1) == 1:
#natural noise
n_noise = self.get_noise()
snr = np.random.randint(-10, 0)
else:
#gaussian noise
n_noise = np.random.normal(loc=0, scale=1, size=(1, m_time*sr))
n_noise = librosa.to_mono(n_noise)
snr = np.random.randint(3, 50) #-10 to 0 for natural noise, 3 to 50 for gaussian noise
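#_add_noise is defined elsewhere in this file; from its call signature
#(signal, noise, snr, sr) it is assumed to scale the noise to the requested
#SNR in dB relative to the signal before summing the two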
final_audio_noise = _add_noise(audio_noise, n_noise, snr, sr)
mag_noise, phase = _stft(final_audio_noise)
mag_noise = create_mask(mag_noise.squeeze(0).squeeze(0))
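#mag_noise is the warped magnitude of the augmented noise track; it serves as
#the reference level each call mask is compared against in threshold()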
################## Creation of the audio #############################
for f in files:
audio_raw, sr = load_file(f[1])
audio_raw = self.pitch_shift(audio_raw, sr)
audio_raw = self.time_stretch(audio_raw)
new = time_elong(sr, audio_raw, max_time=m_time)
audio = filt(new, sr)
mag, phase = _stft(audio)
mag = create_mask(mag.squeeze(0).squeeze(0))
mask = threshold(mag, mag_noise)
f.append(mask)
f.append(mag)
f.append(phase)
'[class_name, filename, [mask], [magnitude], [phase] ]'
if audio_mix is None:
audio_mix = audio
else:
audio_mix += audio
######################################################################
'final audio'
audio_mix = audio_mix + final_audio_noise
mag_mix, phase_mix = _stft(audio_mix)
mag_mix = mag_mix.squeeze(0).squeeze(0)
mag_mix = self.spec_augmentation(mag_mix)
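#spec_augmentation (time/frequency masking) is applied to the mixture
#magnitude only; the reference masks computed above are left untouched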
......
......@@ -143,8 +143,10 @@ if __name__ == '__main__':
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.starting_training_time = time.time()
args.save_per_batchs = 500
#number of classes to train the net on
args.nb_classes = 2
#names of the species the net is trained on
args.name_classes = ['crow', 'eastern_wood_pewee']
args.mode = 'train'
args.lr_sounds = 1e-5
#model definition
......@@ -153,8 +155,9 @@ if __name__ == '__main__':
net = net.to(args.device)
# Set up optimizer
optimizer = create_optimizer(net, args)
args.path = "./Article/shift_stretch12"
args._augment = 'Unet5_3masks output for 3 species_base de donnees9species_for 3s training audio_2calls en noise avec pitch shifting et time stretch(0.8-1.2), -10,0 SNR pour natural noise et 3,50 pour gaussian noise'
#path to the repertory where to save everything
args.path = "./Article/spencer_modif"
args._augment = 'Unet5 output for 2 species_base de donnees10species_for 3s training audio_3calls en noise avec pitch shifting et time stretch(0.7-1.3), -10,0 SNR pour natural noise et 3,50 pour gaussian noise avec comparaison mag et mag noise pour mask'
###########################################################
################### TRAINING ##############################
###########################################################
......@@ -162,10 +165,14 @@ if __name__ == '__main__':
#OverWrite the Files for loss saving and time saving
fichierLoss = open(args.path+"/loss_times.csv", "w")
fichierLoss.close()
#Dataset loading of the bird calls for n different species
#the diversity of the bird calls used as noise makes the task
#harder for the network
root = './data_sound/trainset10/'
ext = '.wav'
#path_background is the directory for the noise added to the
#background of all the bird calls
train_classes = Dataset(root, name_classes=args.name_classes, nb_class_noise =3, path_background="./data_sound/noises/")
loader_train = torch.utils.data.DataLoader(
train_classes,
batch_size = args.batch_size,
......