Commit 257e002d authored by Anthony Larcher

new augmentation

parent 5c37ba16
@@ -368,3 +368,50 @@ if has_pyroom:
return data, sample[1], sample[2], sample[3], sample[4], sample[5]
"""
It might not be 100% on topic, but maybe this is interesting for you anyway. If you do not need to do real-time processing, things can be made easier. Limiting and dynamic compression can be seen as applying a dynamic transfer function: a function that simply maps input to output values. A linear function returns the original audio unchanged, while a "curved" function does compression or expansion. Applying a transfer function is as simple as:
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile

def apply_transfer(signal, transfer, interpolation='linear'):
    # spread the transfer curve over the input range [-1, 1] and
    # evaluate it at every sample of the signal
    constant = np.linspace(-1, 1, len(transfer))
    interpolator = interp1d(constant, transfer, interpolation)
    return interpolator(signal)
Limiting or compression is then just a matter of choosing a different transfer function:
# hard limiting
def limiter(x, threshold=0.8):
    # piecewise-linear transfer: flat at -1/+1 outside the threshold,
    # a linear ramp (with makeup gain) inside it
    transfer_len = 1000
    transfer = np.concatenate([ np.repeat(-1, int(((1 - threshold) / 2) * transfer_len)),
                                np.linspace(-1, 1, int(threshold * transfer_len)),
                                np.repeat(1, int(((1 - threshold) / 2) * transfer_len)) ])
    return apply_transfer(x, transfer)
# smooth compression: if factor is small the curve is nearly linear,
# the bigger it is the stronger the compression
def arctan_compressor(x, factor=2):
    constant = np.linspace(-1, 1, 1000)
    transfer = np.arctan(factor * constant)
    transfer /= np.abs(transfer).max()  # renormalise so the curve still peaks at +/-1
    return apply_transfer(x, transfer)
This example assumes 16-bit mono WAV files as input:

sr, x = wavfile.read("input.wav")
x = x / np.abs(x).max()  # scale x between -1 and 1

x2 = limiter(x)
x2 = np.int16(x2 * 32767)  # back to 16-bit integer PCM
wavfile.write("output_limit.wav", sr, x2)

x3 = arctan_compressor(x)
x3 = np.int16(x3 * 32767)
wavfile.write("output_comp.wav", sr, x3)
"""
\ No newline at end of file
@@ -66,6 +66,7 @@ from .loss import ArcMarginProduct
os.environ['MKL_THREADING_LAYER'] = 'GNU'
__license__ = "LGPL"
@@ -381,10 +382,11 @@ class Xtractor(torch.nn.Module):
self.embedding_size = 256
self.loss = "aam"
self.after_speaker_embedding = ArcLinear(256,
                                         int(self.speaker_number),
                                         margin=aam_margin, s=aam_s)
self.after_speaker_embedding = ArcMarginProduct(256,
                                                int(self.speaker_number),
                                                s=30.0,
                                                m=0.50,
                                                easy_margin=False)
self.preprocessor_weight_decay = 0.000
self.sequence_network_weight_decay = 0.000
self.stat_pooling_weight_decay = 0.000
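For context: the hunk above replaces ArcLinear with ArcMarginProduct(s=30.0, m=0.50). Below is a minimal sketch of the AAM-softmax (ArcFace) logit computation that such a layer typically performs; it illustrates the technique only and is not sidekit's actual ArcMarginProduct implementation:

import math
import torch
import torch.nn.functional as F

class ArcMarginSketch(torch.nn.Module):
    # Illustrative re-implementation of the ArcFace idea (assumption:
    # sidekit's ArcMarginProduct may differ in its details).
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super().__init__()
        self.s, self.m, self.easy_margin = s, m, easy_margin
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))

    def forward(self, x, target):
        # cosine similarity between L2-normalised embeddings and class centres
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - cosine.pow(2)).clamp(0, 1))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * math.cos(self.m) - sine * math.sin(self.m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            # keep the logit monotonic when theta + m would exceed pi
            phi = torch.where(cosine > math.cos(math.pi - self.m),
                              phi, cosine - math.sin(math.pi - self.m) * self.m)
        one_hot = F.one_hot(target, cosine.size(1)).to(cosine.dtype)
        # additive angular margin on the target class only, then scale by s
        return self.s * (one_hot * phi + (1.0 - one_hot) * cosine)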
@@ -885,7 +887,6 @@ def xtrain(speaker_number,
pin_memory=True,
num_workers=num_thread)
"""
Set the training options
"""
@@ -899,20 +900,6 @@ def xtrain(speaker_number,
_optimizer = torch.optim.SGD
_options = {'lr': lr, 'momentum': 0.9}
#params = [
# {
# 'params': [
# param for name, param in model.named_parameters() if 'bn' not in name
# ]
# },
# {
# 'params': [
# param for name, param in model.named_parameters() if 'bn' in name
# ],
# 'weight_decay': 0
# },
#]
param_list = []
if type(model) is Xtractor:
if model.preprocessor is not None:
@@ -932,14 +919,13 @@ def xtrain(speaker_number,
optimizer = _optimizer(param_list, **_options)
#optimizer = torch.optim.SGD(params,
# lr=lr,
# momentum=0.9,
# weight_decay=0.0005)
#print(f"Learning rate = {lr}")
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=[50000, 60000, 70000, 80000, 90000,
                                                             100000, 110000, 120000, 130000,
                                                             140000, 150000],
                                                 gamma=0.1,
                                                 last_epoch=-1,
                                                 verbose=True)
best_accuracy = 0.0
best_accuracy_epoch = 1
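The new MultiStepLR scheduler multiplies the learning rate by gamma each time a milestone is reached; given the magnitudes of the milestones above, it is presumably stepped once per training iteration rather than once per epoch (an assumption, not stated in the diff). A tiny self-contained illustration of the decay behaviour, with shrunken milestones:

import torch

# Demo of MultiStepLR: lr is multiplied by gamma at each milestone.
demo_param = torch.nn.Parameter(torch.zeros(1))
demo_opt = torch.optim.SGD([demo_param], lr=0.1)
demo_sched = torch.optim.lr_scheduler.MultiStepLR(demo_opt, milestones=[3, 6], gamma=0.1)
for step in range(8):
    demo_opt.step()
    demo_sched.step()
    print(step, demo_sched.get_last_lr())  # 0.1, then 0.01 after 3 steps, 0.001 after 6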