Commit 48c71672 authored by Marie Tahon

update

parent ab17afad
# configuration for musical audio processing
# general: load WAV
audio:
    filename: '../XP_musico/free jazz 1st take.wav'
    begin: 0.0 # start reading the analysed signal after this time (in seconds)
    duration: 120 # duration of the analysed signal (in seconds), or None
    percu: True # if method=onset, extract onsets on the percussive part of the signal; also creates the percussive and harmonic wav files
    tuning: True
    begin_ref: 0.0 # beginning of the reference signal for computing the cosine distance between clusters (in seconds)
    end_ref: 0.05 # end of the reference signal for computing the cosine distance between clusters (in seconds)
seg: # short-scale segmentation at the frame level
    NFFT: 2048 # (> 2**10) duration of the analysis window in samples, for feature extraction only
    STEP: 1024 # (> 2**6) and (STEP < NFFT): 50% overlap between time windows; also the sub-frequency used when analyzing the spectral structure
    type: 'cqt' # 'cqt' or 'mel' spectrogram representation
    nmels: 128 # if type is mel
    fmax: 8000 # if type is mel
    output: True # True means onsets are stored in a txt file as an annotation file
features:
    feat: ['chroma', 'spectral'] # feat = ['spectral', 'chroma', 'cepstral']
    BINS_PER_OCTAVE: 36
    N_OCTAVES: 7
    display: False
longseg: # long-scale segmentation
    method: 'fixed' # how onsets are extracted: 'fixed' (regular onset extraction), 'onset' (onset detection), 'beat' (beat tracking), 'manual' (manual onset annotations)
    ref: 'manual_onsets_annotation.txt' # reference annotation file if method=manual
    nframe: 40 # window duration in number of frames if method=fixed
    display: False # display spectrograms synchronized on the detected onsets
cluster:
    method: 'fixed' # method for choosing the number of clusters: 'fixed' (fixed number of clusters), 'max' (find the right number within a range), 'evals' (use the eigenvalues), 'silhouette', 'davies_bouldin' or 'calinski_harabaz' (use the corresponding score)
    nb: [6, 8] # [3, 4, 6, 8] # if method == 'fixed'
    max: 10 # if method == 'max'
    cluster_dist: True # add the cosine distance between clusters on the final plot
    cluster_nb_max: 5 # maximum number of clusters in 1 s
    mu: 'optimize' # either 'optimize' or a numerical value
    display: True # plot the similarity matrices
    display_dist: False
    display_struct: False
density: # for music signals only
    norm_density_win: 60 # duration in seconds of the sliding window used to normalize the density
    alpha: 0.6 # a chroma bin is counted when its energy exceeds this threshold (alpha * max)
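The scripts below consume these settings as a nested dictionary named param (for example param['seg']['STEP'] or param['cluster']['nb']). A minimal loading sketch, assuming the configuration above is saved as config.yaml (the filename is an assumption) and PyYAML is available:

import yaml

# parse the YAML configuration into the nested `param` dictionary used below
with open('config.yaml', 'r') as f:
    param = yaml.safe_load(f)

print(param['audio']['filename'], param['seg']['NFFT'], param['seg']['STEP'])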
import sys
import librosa
import librosa.display
import numpy as np
from matplotlib import pyplot as plt
def detect_onsets(sr, C, param):
# detect onsets
oenv = librosa.onset.onset_strength(S=C, sr=sr)
# Detect events without backtracking
onset_raw = librosa.onset.onset_detect(onset_envelope=oenv, backtrack=False)
# Backtrack the events using the onset envelope
onset_bt = librosa.onset.onset_backtrack(onset_raw, oenv)
# we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
onset_frames = librosa.util.fix_frames(onset_raw[:-1],
x_min=0,
x_max=C.shape[1] - 1)
onset_times = librosa.frames_to_time(librosa.util.fix_frames(onset_raw, x_min=0, x_max=C.shape[1]),
sr=sr,
hop_length=param['seg']['STEP'])
# To reduce dimensionality, we'll synchronize the CQT on the detected onsets
Csync = librosa.util.sync(C, onset_raw, aggregate=np.median)
if param['longseg']['display']:
plt.figure(figsize=(12, 4))
plt.plot(oenv, label='Onset strength')
plt.vlines(onset_raw, 0, oenv.max(), label='Raw onsets')
plt.vlines(onset_bt, 0, oenv.max(), label='Backtracked', color='r')
plt.legend(frameon=True, framealpha=0.75)
plt.show()
plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
plt.title('CQT spectrogram')
librosa.display.specshow(C,
y_axis='cqt_hz',
sr=sr,
hop_length=param['seg']['STEP'],
bins_per_octave=param['features']['BINS_PER_OCTAVE'],
x_axis='time')
plt.tight_layout()
plt.subplot(2, 1, 2)
plt.title('CQT spectrogram synchronized on onsets')
librosa.display.specshow(Csync,
bins_per_octave=param['features']['BINS_PER_OCTAVE'],
y_axis='cqt_hz',
x_axis='time',
x_coords=onset_times)
plt.tight_layout()
plt.show()
return onset_frames, onset_times, Csync
##############################################
def detect_beats(y, sr, M, param):
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=param['seg']['STEP'], trim=False)
print('Detected tempo: {0:.2f} bpm'.format(tempo))
beat_period = np.diff(librosa.frames_to_time(beats, sr=sr, hop_length=param['seg']['STEP']))
print('mean tempo from beat periods: {0:.2f} bpm ; std of beat period: {1:.2f} s'.format(60 / np.mean(beat_period), np.std(beat_period)))
beats_frames = librosa.util.fix_frames(beats[:-1],
x_min=0,
x_max=M.shape[1] - 1)
beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats, x_min=0, x_max=M.shape[1]),
sr=sr,
hop_length=param['seg']['STEP'])
# beat_times = librosa.frames_to_time(beats_frames, sr=sr, hop_length=param['seg']['STEP'])
Msync = librosa.util.sync(M, beats_frames, aggregate=np.median)
if param['longseg']['display']:
plot_onsets(M, Msync, sr, beat_times, param)
return beats_frames, beat_times, Msync
##############################################
def fixed_onsets(sr, M, param):
# build regularly spaced onsets, one every param['longseg']['nframe'] frames
frames = np.arange(0, M.shape[1], param['longseg']['nframe'])
onsets = librosa.util.fix_frames(frames[:-1],
x_min=0,
x_max=M.shape[1] - 1)
Msync = librosa.util.sync(M, onsets, aggregate=np.median)
onset_times = librosa.frames_to_time(librosa.util.fix_frames(frames, x_min=0, x_max=M.shape[1]),
sr=sr,
hop_length=param['seg']['STEP'])
if param['longseg']['display']:
plot_onsets(M, Msync, sr, onset_times, param)
return onsets, onset_times, Msync
def get_manual_beats(sr, M, param):
with open(param['longseg']['ref'], 'r') as f:
data = f.readlines()
times = np.array([float(x.strip()) for x in data[1:]])
frames = np.array([int(x * sr / param['seg']['STEP']) for x in times])
onsets = librosa.util.fix_frames(frames[:-1],
x_min=0,
x_max=M.shape[1] - 1)
onset_times = librosa.frames_to_time(librosa.util.fix_frames(frames, x_min=0, x_max=M.shape[1]),
sr=sr,
hop_length=param['seg']['STEP'])
Msync = librosa.util.sync(M, onsets, aggregate=np.median)
if param['longseg']['display']:
plot_onsets(M, Msync, sr, onset_times, param)
return onsets, onset_times, Msync
def extract_onsets(y, sr, param):
method = param['longseg']['method']
if param['seg']['type'] == 'cqt':
C = librosa.amplitude_to_db(librosa.core.magphase(
librosa.cqt(y=y,
sr=sr,
bins_per_octave=param['features']['BINS_PER_OCTAVE'],
n_bins=param['features']['N_OCTAVES'] * param['features']['BINS_PER_OCTAVE'],
hop_length=param['seg']['STEP']))[0], ref=np.max)
elif param['seg']['type'] == 'mel':
C = librosa.power_to_db(librosa.feature.melspectrogram(y=y,
sr=sr,
n_mels=param['seg']['nmels'],
fmax=param['seg']['fmax'],
hop_length=param['seg']['STEP'],
n_fft=param['seg']['NFFT']), ref=np.max)
# to reduce dimensionality, we'll synchronize the CQT (or mel spectrogram) on the extracted onsets
# onset is a vector of onset indexes np.array((N+1,)) including 0
# onset_times is a vector of onset times np.array((N+1,)) including 0
# Csync is the CQT transform synchronized on onsets np.array((252, N))
if method == 'fixed':
onset, onset_times, Csync = fixed_onsets(sr, C, param)
elif method == 'onset':
onset, onset_times, Csync = detect_onsets(sr, C, param)
elif method == 'beat':
onset, onset_times, Csync = detect_beats(y, sr, C, param)
elif method == 'manual':
onset, onset_times, Csync = get_manual_beats(sr, C, param)
else:
print('onset parameter is not well-defined')
sys.exit()
print(Csync.shape, onset_times.shape, onset.shape)
# careful: for plotting purposes, onset_times contains the segment edges while onset contains only the onsets.
# for example, with fixed onsets:
# Csync: (252, 517); onset: (517,); onset_times: (518,)
return onset, onset_times, Csync
################################################
def feature_extraction(y, sr, param):
NFFT = param['seg']['NFFT']
STEP = param['seg']['STEP']
if param['audio']['tuning']:
# extraction of tuning
A440 = librosa.estimate_tuning(y=y, sr=sr, resolution=1e-3)
print('Deviation from A440 is : {0:.2f}'.format(A440))
else:
A440 = 0.0
features = param['features']['feat']
print('Features for local similarity: ', ' '.join(features))
full = []
idx_chroma = 0
if 'cepstral' in features:
mfcc = librosa.feature.mfcc(y=y,
sr=sr,
n_mfcc=20,
n_fft=NFFT,
hop_length=STEP)
mfcc_delta = librosa.feature.delta(mfcc)
fcep = np.concatenate((mfcc, mfcc_delta), axis=0)
full.append(fcep)
if 'chroma' in features:
chroma = librosa.feature.chroma_cqt(y=y,
sr=sr,
n_chroma=12,
n_octaves=param['features']['N_OCTAVES'],
hop_length=STEP,
norm=None,
tuning=A440)
chroma_delta = librosa.feature.delta(chroma)
fchr = np.concatenate((chroma, chroma_delta), axis=0)
idx_chroma = len(full)
full.append(fchr)
if 'spectral' in features:
centroid = librosa.feature.spectral_centroid(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP)
contrast = librosa.feature.spectral_contrast(y=y,
sr=sr,
n_fft=NFFT,
n_bands=6,
hop_length=STEP)
flatness = librosa.feature.spectral_flatness(y=y,
n_fft=NFFT,
hop_length=STEP)
rolloff05 = librosa.feature.spectral_rolloff(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP,
roll_percent=0.05)
rolloff25 = librosa.feature.spectral_rolloff(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP,
roll_percent=0.25)
rolloff50 = librosa.feature.spectral_rolloff(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP,
roll_percent=0.50)
rolloff75 = librosa.feature.spectral_rolloff(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP,
roll_percent=0.75)
rolloff95 = librosa.feature.spectral_rolloff(y=y,
sr=sr,
n_fft=NFFT,
hop_length=STEP,
roll_percent=0.95)
spec = np.concatenate((centroid, contrast, flatness, rolloff05, rolloff25, rolloff50, rolloff75, rolloff95),
axis=0)
spec_delta = librosa.feature.delta(spec)
fspec = np.concatenate((spec, spec_delta), axis=0)
full.append(fspec)
full = np.array(full, dtype=object)[0]
print('feature shape', full.shape)
return full, idx_chroma
def extract_time_boundaries(cluster_ids, onsets, nb_frames, sr, param):
# Locate segment boundaries from the label sequence
bound_beats = 1 + np.flatnonzero(cluster_ids[:-1] != cluster_ids[1:])
# Count beat 0 as a boundary
bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)
# Compute the segment label for each boundary
bound_labels = list(cluster_ids[bound_beats])
# Convert beat indices to frames
bound_frames = onsets[bound_beats]
# Make sure we cover to the end of the track
bound_frames = librosa.util.fix_frames(bound_frames, x_min=None, x_max=nb_frames - 1)
bound_times = librosa.frames_to_time(bound_frames, sr=sr, hop_length=param['seg']['STEP'])
return bound_times, bound_labels
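# Hypothetical helper (not part of this commit): when param['seg']['output'] is True,
# the boundaries returned above can be stored as a plain-text annotation file; the
# function name and default filename are assumptions.
def write_boundaries(bound_times, bound_labels, filename='boundaries.txt'):
    with open(filename, 'w') as f:
        # one line per boundary: time in seconds and cluster label
        # (zip stops at the shorter sequence when the final boundary has no label)
        for t, lab in zip(bound_times, bound_labels):
            f.write('{0:.3f}\t{1}\n'.format(t, lab))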
#################################################
def compute_musical_density(C, onset_times, w, alpha):
density = []
for n in range(C.shape[1]):
t1 = np.min([onset_times[-1], onset_times[n] + w])
t2 = np.min([onset_times[-1] - w, onset_times[n]])
idw = np.where((onset_times < t1) & (onset_times >= t2))
threshold_chroma = np.max(C[:, idw])
idx = np.where(C[:, n] > alpha * threshold_chroma)
density.append(len(idx[0]))
return density
def plot_features(X, onsets, onset_times, param):
Xsync = librosa.util.sync(X, onsets, aggregate=np.median)
# print(X.shape, Xsync.shape)
# print(onset_times)
# TODO: problem with pcolormesh
if param['features']['feat'][0] == 'chroma':
fig_c = plt.figure(figsize=(12, 6))
ax0_c = fig_c.add_subplot(3, 1, 1)
ax0_c.set_title('onset-synchronous chroma (12)')
# ax0_c.pcolor(distance, cmap = 'plasma')
librosa.display.specshow(Xsync[:12, :],
y_axis='chroma',
x_axis='time',
x_coords=onset_times,
cmap='OrRd')
# plt.colorbar()
ax1_c = fig_c.add_subplot(3, 1, 2, sharex=ax0_c)
ax1_c.set_title('onset-synchronous delta chroma (12)')
librosa.display.specshow(np.abs(Xsync[12:, :]),
y_axis='chroma',
x_axis='time',
x_coords=onset_times,
cmap='OrRd')
# plt.colorbar()
density = compute_musical_density(Xsync[:12, :],
onset_times[:-1],
param['density']['norm_density_win'],
param['density']['alpha'])
print(len(onset_times), len(density))
ax2_c = fig_c.add_subplot(3, 1, 3, sharex=ax0_c)
ax2_c.set_title('musical density')
ax2_c.plot(onset_times[:-1], density)
plt.tight_layout()
plt.show()
elif param['features']['feat'][0] == 'cepstral':
fig_s = plt.figure(figsize=(12, 6))
ax0_s = fig_s.add_subplot(3, 1, 1)
ax0_s.set_title('onset-synchronous MFCC (20)')
librosa.display.specshow(Xsync[:20, :],
x_axis='time',
x_coords=onset_times)
# plt.colorbar()
# plt.tight_layout()
ax1_s = fig_s.add_subplot(3, 1, 2, sharex=ax0_s)
ax1_s.set_title('onset-synchronous delta MFCC (20)')
librosa.display.specshow(np.abs(Xsync[20:, :]),
x_axis='time',
x_coords=onset_times)
# plt.colorbar()
density = compute_musical_density(Xsync[:20, :],
onset_times[:-1],
param['density']['norm_density_win'],
param['density']['alpha'])
ax2_s = fig_s.add_subplot(3, 1, 3, sharex=ax0_s)
ax2_s.set_title('musical density')
ax2_s.plot(onset_times[:-1], density)
plt.tight_layout()
plt.show()
else:
print('these features cannot be plotted')
def plot_onsets(M, Msync, sr, onset_times, param):
print(M.shape, onset_times.shape)
plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
if param['seg']['type'] == 'cqt':
plt.title('CQT spectrogram')
librosa.display.specshow(M,
y_axis='cqt_hz',
sr=sr,
hop_length=param['seg']['STEP'],
bins_per_octave=param['features']['BINS_PER_OCTAVE'],
x_axis='time')
elif param['seg']['type'] == 'mel':
plt.title('MEL spectrogram')
librosa.display.specshow(M,
y_axis='mel',
sr=sr,
fmax=param['seg']['fmax'],
hop_length=param['seg']['STEP'],
x_axis='time')
plt.tight_layout()
# For plotting purposes, we'll need the timing of the beats
# we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
plt.subplot(2, 1, 2)
if param['seg']['type'] == 'cqt':
plt.title('CQT spectrogram synchronized on onsets')
librosa.display.specshow(Msync,
y_axis='cqt_hz',
bins_per_octave=param['features']['BINS_PER_OCTAVE'],
x_coords=onset_times,
x_axis='time')
elif param['seg']['type'] == 'mel':
plt.title('MEL spectrogram synchronized on onsets')
librosa.display.specshow(Msync,
y_axis='mel',
sr=sr,
fmax=param['seg']['fmax'],
x_coords=onset_times,
x_axis='time')
plt.tight_layout()
plt.show()
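# Hypothetical driver (not part of this commit) showing how the functions above are
# typically chained: `param` is the YAML configuration loaded as sketched earlier,
# and the call order is an assumption based on the function signatures.
def run_segmentation(param):
    # read the audio excerpt defined in the configuration
    y, sr = librosa.load(param['audio']['filename'],
                         offset=param['audio']['begin'],
                         duration=param['audio']['duration'])
    # frame-level features (chroma / spectral / cepstral, as configured)
    full, idx_chroma = feature_extraction(y, sr, param)
    # long-scale onsets and the onset-synchronous CQT (or mel) representation
    onset, onset_times, Csync = extract_onsets(y, sr, param)
    # optional diagnostic plot of the synchronized features
    plot_features(full, onset, onset_times, param)
    return full, idx_chroma, onset, onset_times, Csync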
nameaudio.wav
0.345
0.56
0.64530
1.24
1.56
20
50
70
90
import sys
import librosa
import scipy.ndimage
import scipy.sparse.csgraph
import numpy as np
import sklearn.cluster
import sklearn.metrics
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
#######################################
def build_weighted_rec_matrix(M):
# Let's build a weighted recurrence affinity matrix using onset-synchronous CQT
# the similarity matrix is filtered to prevent linkage errors and fill the gaps
# the filter corresponds to a width=3 time window and a majority vote.
R = librosa.segment.recurrence_matrix(M, width=3, mode='affinity', sym=True)
# Enhance diagonals with a median filter
df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
Rf = df(R, size=(1, 7))
return Rf
def build_seq_matrix(M, x):
# build the sequence matrix using feature-similarity
# R_path[i, i±1] = exp(-|Msync[i] - Msync[i±1]|^2 / sigma^2)
# synchronize features with onsets
Msync = librosa.util.sync(M, x, aggregate=np.median)
# Msync = M  # no synchronization
# normalize (rescale) features between 0 and 1
Msync_normed = minmax_scale(Msync)
# constant scaling
path_distance = np.sum(np.diff(Msync_normed, axis=1) ** 2, axis=0)
# sigma is the median distance between successive beats/onsets.
sigma = np.median(path_distance)
path_sim = np.exp(-path_distance / sigma)
# local scaling from A Spectral Clustering Approach to Speaker Diarization, Huazhong Ning, Ming Liu, Hao Tang, Thomas Huang
R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)
return R_path
def build_laplacian_and_evec(Rf, R_path, onsets, param):
if param['cluster']['mu'] == 'optimize':
# And compute the balanced combination A of the two similarity matrices Rf and R_path
deg_path = np.sum(R_path, axis=1)
deg_rec = np.sum(Rf, axis=1)
mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec) ** 2)
print('Optimal weight value (mu): {0:.2f}'.format(mu))
else:
mu = param['cluster']['mu']
print('Given weight value (mu): {0:.2f}'.format(mu))
A = mu * Rf + (1 - mu) * R_path
# Plot the resulting graphs
if param['cluster']['display']:
plot_similarity(Rf, R_path, A, onsets)
# L: symmetric normalized Laplacian
L = scipy.sparse.csgraph.laplacian(A, normed=True)
# and its spectral decomposition (Find eigenvalues w and optionally eigenvectors v of matrix L)
evals, evecs = np.linalg.eigh(L)
print('L shape:', L.shape)
# We can clean this up further with a median filter.
# This can help smooth over small discontinuities
evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))
# cumulative normalization is needed for the eigenvectors of the symmetric normalized Laplacian
Cnorm = np.cumsum(evecs ** 2, axis=1) ** 0.5
return Cnorm, evals, evecs
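# Hypothetical helper (not part of this commit) showing how the normalized eigenvectors
# returned above are typically turned into segment boundaries: embed the first k
# components, cluster them with KMeans (as in the metric branch of compute_nb_clusters
# below), then map label changes back to times with extract_time_boundaries from the
# segmentation module above (assumed importable here).
def cluster_onsets_sketch(evecs, Cnorm, k, onsets, nb_frames, sr, param):
    X = evecs[:, :k] / Cnorm[:, k - 1:k]  # k-dimensional embedding of each onset
    seg_ids = sklearn.cluster.KMeans(n_clusters=k).fit_predict(X)
    return extract_time_boundaries(seg_ids, onsets, nb_frames, sr, param)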
################################################
def compute_nb_clusters(evals, evecs, param):
Cnorm = np.cumsum(evecs ** 2, axis=1) ** 0.5 # cumulative normalization of the eigenvectors
method = param['cluster']['method']
if method == 'fixed':
c = param['cluster']['nb'] # list
elif method == 'max':
nc = []
for it in range(param['cluster']['max']):
nc.append(cluster_rotate.cluster_rotate(evecs / Cnorm, evals, range(1, 10), 1, False))
c = [int(np.mean(nc)) + 1]
elif method == 'evals':
ind = np.where(1 - evals > 0.75)[0]
# print(ind)
return [len(ind) + 1]
elif method in ['silhouette', 'davies_bouldin', 'calinski_harabaz']:
list_k = range(2, 50, 2)
score = [] # one score per candidate k (the metric branches below append to it)
for k in list_k:
print('nb of clusters:', k)
X = evecs[:, :k] / Cnorm[:, k - 1:k]
# Let's use these k components to cluster beats into segments
# (Algorithm 1)
KM = sklearn.cluster.KMeans(n_clusters=k)
seg_ids = KM.fit_predict(X)
if method == 'silhouette':
score.append(sklearn.metrics.silhouette_score(X, seg_ids, metric='euclidean')) # max (close to 1)
elif method == 'davies_bouldin':
score.append(davies_bouldin_score(X, seg_ids)) # min
elif method == 'calinski_harabaz':
score.append(sklearn.metrics.calinski_harabaz_score(X, seg_ids)) # max
if method == 'silhouette':
return list_k[np.argmax(score)]
elif method == 'davies_bouldin':
return list_k[np.argmin(score)]
elif method == 'calinski_harabaz':
return list_k[np.argmax(score)]
else:
print('method for finding the right number of clusters is unknown')
sys.exit()
print('nb of clusters:', c)
return c
def davies_bouldin_score(X, labels):
"""Computes the Davies-Bouldin score.
The score is defined as the ratio of within-cluster distances to
between-cluster distances.
Read more in the :ref:`User Guide <davies-bouldin_index>`.
Parameters
----------
X : array-like, shape (``n_samples``, ``n_features``)
List of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like, shape (``n_samples``,)