Commit aa50412d authored by Anthony Larcher's avatar Anthony Larcher
Browse files

major cleaning

parent c5253816
......@@ -51,6 +51,27 @@ __docformat__ = 'reStructuredText'
# Lightweight record describing one noise entry: its category, source file id and duration.
Noise = collections.namedtuple('Noise', 'type file_id duration')
class PreEmphasis(torch.nn.Module):
    """
    Apply a pre-emphasis filter y[t] = x[t] - coef * x[t-1] to a batch of
    waveforms, implemented as a 1-D convolution.  Reflect padding on the left
    keeps the output the same length as the input.
    """

    def __init__(self, coef: float = 0.97):
        """
        :param coef: pre-emphasis coefficient
        """
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        self.register_buffer(
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    # Fix: annotations used torch.tensor (a factory function); the type is torch.Tensor.
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        :param input: (batch, samples) waveform tensor
        :return: (batch, samples) pre-emphasized waveform tensor
        """
        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = torch.nn.functional.pad(input, (1, 0), 'reflect')
        return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
def normalize(wav):
"""
......
......@@ -317,88 +317,6 @@ class ResBlock(torch.nn.Module):
return out
class ResNet18(torch.nn.Module):
    """
    ResNet-style speaker classification network built from residual mega-blocks.

    Takes 4-D input (batch, 1, feat, time) -- assumed from the Conv2d entry
    layer, TODO confirm exact feature layout -- and returns (batch, spk_number)
    class scores.
    """

    def __init__(self, spk_number,
                 entry_conv_kernel_size=(3, 3),
                 entry_conv_out_channels=128,
                 megablock_out_channels=(64, 128, 256, 512),
                 megablock_size=(2, 2, 2, 2),
                 block_type=ResBlock
                 ):
        """
        :param spk_number: number of speakers, i.e. size of the classification output
        :param entry_conv_kernel_size: kernel size of the first convolution
        :param entry_conv_out_channels: channels produced by the first convolution
        :param megablock_out_channels: output channel count of each mega-block
        :param megablock_size: number of residual blocks inside each mega-block
        :param block_type: residual block class used to build the mega-blocks
        """
        super(ResNet18, self).__init__()
        self.spk_number = spk_number
        self.activation = torch.nn.LeakyReLU()

        # First convolution layer for input
        self.entry_conv = torch.nn.Conv2d(in_channels=1,
                                          out_channels=entry_conv_out_channels,
                                          kernel_size=entry_conv_kernel_size,
                                          padding=1,
                                          stride=1)
        self.entry_batch_norm = torch.nn.BatchNorm2d(entry_conv_out_channels)
        self.top_channel_number = entry_conv_out_channels

        # Add ResBlocks.
        # BUG FIX: a plain Python list hides the sub-modules from PyTorch, so
        # their parameters were not returned by .parameters() (hence never
        # trained), not saved in state_dict() and not moved by .to(device).
        # torch.nn.ModuleList registers them properly.
        self.mega_blocks = torch.nn.ModuleList()
        for mb_size, mb_out in zip(megablock_size, megablock_out_channels):
            self.mega_blocks.append(self._add_megablock(block_type, mb_size, mb_out))
            self.top_channel_number = mb_out

        # Top layers for classification and embeddings extraction
        # NOTE(review): the linear input size hard-codes a frequency dimension
        # of 40 after pooling -- verify against the actual feature extraction.
        self.top_lin1 = torch.nn.Linear(megablock_out_channels[-1] * 2 * 40, 512)
        self.top_batch_norm1 = torch.nn.BatchNorm1d(512)
        self.top_lin2 = torch.nn.Linear(512, spk_number)

    def forward(self, x):
        """
        :param x: input tensor of shape (batch, 1, feat, time)
        :return: (batch, spk_number) class scores
        """
        x = self.entry_conv(x)
        x = self.entry_batch_norm(x)
        x = self.activation(x)

        for layer in self.mega_blocks:
            x = layer(x)

        # Pooling done as for x-vectors: mean and std over dim 2
        mean = torch.mean(x, dim=2)
        mean = torch.flatten(mean, 1)
        std = torch.std(x, dim=2)
        std = torch.flatten(std, 1)
        x = torch.cat([mean, std], dim=1)

        # Classification layers
        x = self.top_lin1(x)
        x = self.top_batch_norm1(x)
        x = self.activation(x)
        x = self.top_lin2(x)
        return x

    def _add_megablock(self, block_type, block_nb, out_channels, is_first=False):
        # Stack block_nb residual blocks; only the first changes the channel count.
        rblocks = [block_type(self.top_channel_number, out_channels, is_first=is_first), ]
        for _ in range(1, block_nb):
            rblocks.append(block_type(out_channels, out_channels, is_first=False))
        return torch.nn.Sequential(*rblocks)
class BasicBlock(torch.nn.Module):
expansion = 1
......
......@@ -84,81 +84,6 @@ torch.backends.cudnn.benchmark = False
# Seed NumPy's global RNG so runs are reproducible.
numpy.random.seed(0)
class GuruMeditation (torch.autograd.detect_anomaly):
    """
    Context manager wrapping torch's autograd anomaly detection: when the
    managed block exits with a RuntimeError, print the traceback and drop
    into pdb for post-mortem inspection.
    """

    def __init__(self):
        super(GuruMeditation, self).__init__()

    def __enter__(self):
        super(GuruMeditation, self).__enter__()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        super(GuruMeditation, self).__exit__()
        # Only intercept RuntimeError (what autograd raises on anomalies);
        # other exceptions propagate normally.
        if isinstance(exc_value, RuntimeError):
            traceback.print_tb(exc_traceback)
            self.halt(str(exc_value))

    def halt(self, msg):
        # BUG FIX: 'self' was missing from the signature, so the call
        # self.halt(str(value)) raised a TypeError instead of starting pdb.
        print(msg)
        pdb.set_trace()
def select_n_random(data, labels, n=100):
    '''
    Selects n random datapoints and their corresponding labels from a dataset
    '''
    assert len(data) == len(labels)
    shuffle = torch.randperm(len(data))[:n]
    return data[shuffle], labels[shuffle]
def matplotlib_imshow(img, one_channel=False):
    """Display an image tensor with matplotlib, undoing the [-1, 1] normalization."""
    if one_channel:
        img = img.mean(dim=0)
    pixels = (img / 2 + 0.5).cpu().numpy()  # unnormalize
    if one_channel:
        plt.imshow(pixels, cmap="Greys")
    else:
        # channels-first tensor -> channels-last image
        plt.imshow(numpy.transpose(pixels, (1, 2, 0)))
def speech_to_probs(model, speech):
    '''
    Generates predictions and corresponding probabilities from a trained
    network and a list of images
    '''
    output = model(speech)
    # convert output probabilities to predicted class
    preds_tensor = torch.max(output, 1)[1]
    preds = numpy.squeeze(preds_tensor.cpu().numpy())
    probs = [torch.nn.functional.softmax(row, dim=0)[p].item()
             for p, row in zip(preds, output)]
    return preds, probs
def plot_classes_preds(model, speech, labels):
    '''
    Generates matplotlib Figure using a trained network, along with images
    and labels from a batch, that shows the network's top prediction along
    with its probability, alongside the actual label, coloring this
    information based on whether the prediction was correct or not.
    Uses the "speech_to_probs" function.
    '''
    preds, probs = speech_to_probs(model, speech)
    # plot the images in the batch, along with predicted and true labels
    fig = plt.figure(figsize=(12, 48))
    for idx in numpy.arange(4):
        axis = fig.add_subplot(1, 4, idx+1, xticks=[], yticks=[])
        #matplotlib_imshow(speech[idx], one_channel=True)
        is_correct = preds[idx] == labels[idx].item()
        axis.set_title("{0}, {1:.1f}%\n(label: {2})".format(
                       preds[idx],
                       probs[idx] * 100.0,
                       labels[idx]),
                       color=("green" if is_correct else "red"))
    return fig
def test_metrics(model,
device,
speaker_number,
......@@ -186,16 +111,6 @@ def test_metrics(model,
data_root_name='/data/larcher/voxceleb1/test/wav'
transform_pipeline = dict()
#mfcc_config = dict()
#mfcc_config['nb_filters'] = 81
#mfcc_config['nb_ceps'] = 80
#mfcc_config['lowfreq'] = 133.333
#mfcc_config['maxfreq'] = 6855.4976
#mfcc_config['win_time'] = 0.025
#mfcc_config['shift'] = 0.01
#mfcc_config['n_fft'] = 2048
#transform_pipeline['MFCC'] = mfcc_config
#transform_pipeline['CMVN'] = {}
xv_stat = extract_embeddings(idmap_name=idmap_test_filename,
speaker_number=speaker_number,
......@@ -213,22 +128,10 @@ def test_metrics(model,
check_missing=True)
tar, non = scores.get_tar_non(Key(key_test_filename))
test_eer = eer(numpy.array(non).astype(numpy.double), numpy.array(tar).astype(numpy.double))
return test_eer
def get_lr(optimizer):
    """
    Return the learning rate of the optimizer's first parameter group
    (or None if the optimizer has no parameter groups).

    :param optimizer: a torch.optim optimizer
    :return: the learning rate of the first parameter group
    """
    groups = optimizer.param_groups
    return groups[0]['lr'] if groups else None
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', best_filename='model_best.pth.tar'):
"""
......@@ -299,24 +202,6 @@ class GruPooling(torch.nn.Module):
return x
class PreEmphasis(torch.nn.Module):
    """
    Apply a pre-emphasis filter y[t] = x[t] - coef * x[t-1] to a batch of
    waveforms, implemented as a 1-D convolution.  Reflect padding on the left
    keeps the output the same length as the input.
    """

    def __init__(self, coef: float = 0.97):
        """
        :param coef: pre-emphasis coefficient
        """
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        self.register_buffer(
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    # Fix: annotations used torch.tensor (a factory function); the type is torch.Tensor.
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        :param input: (batch, samples) waveform tensor
        :return: (batch, samples) pre-emphasized waveform tensor
        """
        assert len(input.size()) == 2, 'The number of dimensions of input tensor must be 2!'
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = torch.nn.functional.pad(input, (1, 0), 'reflect')
        return torch.nn.functional.conv1d(input, self.flipped_filter).squeeze(1)
class MfccFrontEnd(torch.nn.Module):
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment