Commit 39a1d2c0 authored by Anthony Larcher
Browse files

read wav issue

parent 4a967e19
......@@ -223,7 +223,6 @@ class FeaturesExtractor(object):
signal, sample_rate = read_audio(audio_filename, self.sampling_frequency)
if signal.ndim == 1:
signal = signal[:, numpy.newaxis]
# AJOUTER LE BRUITAGE ET REVERB DU SIGNAL SI NECESSAIRE
if noise_file_name is not None:
signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sample_rate)
......@@ -332,7 +331,6 @@ class FeaturesExtractor(object):
if "vad" not in self.save_param:
label = None
logging.info(label)
write_hdf5(show, h5f,
cep, cep_mean, cep_std,
energy, energy_mean, energy_std,
......
......@@ -377,10 +377,8 @@ def power_spectrum(input_sig,
window_length = int(round(win_time * fs))
overlap = window_length - int(shift * fs)
framed = framing(input_sig, window_length, win_shift=window_length-overlap).copy()
# Pre-emphasis filtering is applied after framing to be consistent with stream processing
framed = pre_emphasis(framed, prefac)
l = framed.shape[0]
n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
# Windowing has been changed to hanning which is supposed to have less noisy sidelobes
......@@ -454,13 +452,11 @@ def mfcc(input_sig,
win_time=nwin,
shift=shift,
prefac=prefac)
# Filter the spectrum through the triangle filter-bank
n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
mspec = numpy.log(numpy.dot(spec, fbank.T)) # A tester avec log10 et log
# Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
# The C0 term is removed as it is the constant term
ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
......
......@@ -307,23 +307,22 @@ class MFCC(object):
:param sample:
:return:
"""
sig = sample[0][:, numpy.newaxis] # ajout
framed = framing(sample[0], self.window_length, win_shift=self.window_length - self.overlap).copy()
framed = framing(sample[0], self.window_length, win_shift=self.window_length - self.overlap).copy()
# Pre-emphasis filtering is applied after framing to be consistent with stream processing
framed = pre_emphasis(framed, self.prefac)
# Windowing has been changed to hanning which is supposed to have less noisy sidelobes
# ham = numpy.hamming(window_length)
window = numpy.hanning(self.window_length)
log_energy = numpy.log((framed ** 2).sum(axis=1))
mag = numpy.fft.rfft(framed * window, self.n_fft, axis=-1)
spec = mag.real ** 2 + mag.imag ** 2
# Filter the spectrum through the triangle filter-bank
mspec = numpy.log(numpy.dot(spec, self.fbank)) # A tester avec log10 et log
# Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
# The C0 term is removed as it is the constant term
mfcc = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:self.nceps + 1]
return mfcc.T, sample[1], sample[2], sample[3], sample[4], sample[5]
......@@ -594,7 +593,8 @@ class IdMapSet(Dataset):
start = 0.0
if self.idmap.start[index] is None and self.idmap.stop[index] is None:
sig, sample_rate = soundfile.read(f"{self.data_root_path}/{self.idmap.rightids[index]}.{self.file_extension}")
sig, sample_rate = soundfile.read(f"{self.data_root_path}/{self.idmap.rightids[index]}.{self.file_extension}", dtype="int16")
sig = sig.astype(numpy.float32)
start = 0
stop = len(sig)
else:
......@@ -609,7 +609,9 @@ class IdMapSet(Dataset):
stop = int(start + self.min_duration * nfo.samplerate)
sig, _ = soundfile.read(f"{self.data_root_path}/{self.idmap.rightids[index]}.{self.file_extension}",
start=start,
stop=stop)
stop=stop,
dtype='int16')
sig = sig.astype(numpy.float32)
sig += 0.0001 * numpy.random.randn(sig.shape[0])
if self.transform_pipeline is not None:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment