augmentation.py 11.7 KB
Newer Older
Anthony Larcher's avatar
Anthony Larcher committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT.  If not, see <http://www.gnu.org/licenses/>.

"""
Copyright 2014-2021 Anthony Larcher
"""

import collections
import numpy
import random
Anthony Larcher's avatar
Anthony Larcher committed
31
32
import torch
import torchaudio
Anthony Larcher's avatar
Anthony Larcher committed
33

Anthony Larcher's avatar
Anthony Larcher committed
34
from scipy import signal
Anthony Larcher's avatar
Anthony Larcher committed
35
36


Anthony Larcher's avatar
v1.3.7    
Anthony Larcher committed
37
38
39
40
41
42
43
44
45
# Module-level metadata: authorship, copyright, license, maintainer contact,
# release status and the markup format used by the docstrings of this file.
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'


46
# Immutable record describing one noise entry through three fields:
# its type, the identifier of its file and its duration.
Noise = collections.namedtuple(typename='Noise', field_names='type file_id duration')
Anthony Larcher's avatar
Anthony Larcher committed
47
48


Anthony Larcher's avatar
Anthony Larcher committed
49
class PreEmphasis(torch.nn.Module):
    """
    Apply pre-emphasis filtering

    Each row of the 2-D input is filtered as ``y[t] = x[t] - coef * x[t-1]``.
    The input is reflect-padded by one sample on the left, so the first output
    sample is ``x[0] - coef * x[1]`` and the output has the same length as the input.
    """

    def __init__(self, coef: float = 0.97):
        """
        Build the pre-emphasis kernel

        :param coef: weight applied to the previous sample before it is subtracted
        """
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        kernel = torch.tensor([-self.coef, 1.], dtype=torch.float32)
        # Stored as a buffer of shape (out_channels=1, in_channels=1, kernel_size=2)
        # so that it follows the module across devices without being a trainable parameter.
        self.register_buffer('flipped_filter', kernel.view(1, 1, -1))

    def forward(self, input_signal: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the pre-emphasis filtering

        :param input_signal: the input signal, a 2-dimensional tensor
        :return: the filtered signal, with the same shape as the input
        """
        assert input_signal.dim() == 2, 'The number of dimensions of input tensor must be 2!'
        # conv1d expects a channel axis: insert a singleton one in second position
        input_signal = input_signal.unsqueeze(1)
        # reflect padding to match lengths of in/out
        input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
        return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
Anthony Larcher's avatar
Anthony Larcher committed
75
76


Anthony Larcher's avatar
Anthony Larcher committed
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class FrequencyMask(object):
    """Mask a random band of consecutive rows in the features of a sample.

    When the flag ``sample[2]`` is true, between 1 and ``max_size`` consecutive
    rows (first axis) of ``sample[0]`` are overwritten in place with the value 10.
    The band is drawn so that it always fits within the ``feature_size`` rows.

    Args:
        max_size (int): maximum number of consecutive rows to mask.
        feature_size (int): number of rows of the feature matrix.
    """
    def __init__(self, max_size, feature_size):
        self.max_size = max_size
        self.feature_size = feature_size

    def __call__(self, sample):
        """Return the 6 elements of ``sample`` with ``sample[0]`` possibly masked in place."""
        data = sample[0]
        if sample[2]:
            # The band can never be wider than the feature matrix itself
            widest = min(self.max_size, self.feature_size)
            if widest >= 1:
                # numpy.random.randint excludes its upper bound, hence the "+ 1"
                size = numpy.random.randint(1, widest + 1)
                # Use the drawn size (not max_size) so that the last rows can be reached
                f0 = numpy.random.randint(0, self.feature_size - size + 1)
                data[f0:f0+size, :] = 10.
        return data, sample[1], sample[2], sample[3], sample[4], sample[5]


class TemporalMask(object):
    """Mask a random band of consecutive columns in the features of a sample.

    When the flag ``sample[3]`` is true, between 1 and ``max_size`` consecutive
    columns (second axis) of ``sample[0]`` are overwritten in place with the value 10.
    The band is drawn so that it always fits within the available columns.

    Args:
        max_size (int): maximum number of consecutive columns to mask.
    """
    def __init__(self, max_size):
        self.max_size = max_size

    def __call__(self, sample):
        """Return the 6 elements of ``sample`` with ``sample[0]`` possibly masked in place."""
        data = sample[0]
        if sample[3]:
            n_columns = data.shape[1]
            # The band can never be wider than the data itself
            widest = min(self.max_size, n_columns)
            if widest >= 1:
                # numpy.random.randint excludes its upper bound, hence the "+ 1"
                size = numpy.random.randint(1, widest + 1)
                # Use the drawn size (not max_size) so that the last columns can be reached
                t0 = numpy.random.randint(0, n_columns - size + 1)
                data[:, t0:t0+size] = 10.
        return data, sample[1], sample[2], sample[3], sample[4], sample[5]


Anthony Larcher's avatar
Anthony Larcher committed
116
117
def normalize(wav):
    """
    Scale a waveform to unit root-mean-square amplitude

    The waveform is divided by its RMS value; the mean is not removed.
    A small constant (1e-8) is added to the RMS so that a silent (all-zero)
    input does not trigger a division by zero.

    :param wav: the input waveform
    :return: the normalized waveform
    """
    rms = numpy.sqrt(numpy.mean(wav ** 2))
    return wav / (rms + 1e-8)


Anthony Larcher's avatar
Anthony Larcher committed
126
def crop(input_signal, duration):
    """
    Select a chunk from an audio segment

    The chunk starts at a position drawn uniformly among all the positions
    that leave room for `duration` elements along the first axis.

    :param input_signal: signal to select a chunk from
    :param duration: duration of the chunk to select, in number of elements along the first axis
    :return: a chunk of `duration` consecutive elements of the input signal
    :raises ValueError: if `duration` is longer than the input signal
    """
    last_start = input_signal.shape[0] - duration
    if last_start < 0:
        raise ValueError(
            "Cannot crop a chunk of {} elements from a signal of {} elements".format(
                duration, input_signal.shape[0]))
    # random.randint includes both bounds, so the last valid start can be drawn
    start = random.randint(0, last_start)
    return input_signal[start: start + duration]

Anthony Larcher's avatar
Anthony Larcher committed
138

Anthony Larcher's avatar
debug    
Anthony Larcher committed
139
140
141
142
143
def data_augmentation(speech,
                      sample_rate,
                      transform_dict,
                      transform_number,
                      noise_df=None,
                      rir_df=None,
                      babble_noise=True):
    """
    Perform data augmentation on an input signal.
    Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
    dictionary of possible transformations.

    The selected transformations are always applied in the same order, whatever the order of the draw:
    stretch, add_reverb, add_noise, phone_filtering, filtering, codec. Selecting "none" applies nothing.

    :param speech: the input signal to be augmented; it is indexed as (channel, sample) by this function
    :param sample_rate: sampling rate of the input signal to augment
    :param transform_dict: the dictionary of possible augmentations to apply; its keys are the names of the
    augmentations and the "add_reverb" and "add_noise" entries must provide a "data_path"
    :param transform_number: the number of transformations to apply on each chunk
    :param noise_df: a pandas dataframe object including the list of NOISE signals to chose from; default is None
    :param rir_df: a pandas dataframe object including the list of RIR signals to chose from; default is None
    :param babble_noise: boolean that enable the use of babble noise, True by default (typically turned to False when
    the task includes overlapping speech detection).

    :return: augmented signal

    Example of configuration:

    transformation
        pipeline: add_noise,add_reverb
        add_noise:
            noise_db_csv: filename.csv
            snr: 5,6,7,8,9,10,11,12,13,14,15
        add_reverb:
            rir_db_csv: filename.csv
        codec: true
        phone_filtering: true
    """
    # Select the data augmentation randomly (distinct transformations, without replacement)
    augmentations = random.sample(list(transform_dict), k=transform_number)

    if "stretch" in augmentations:
        # Tempo change that keeps the pitch; NOTE: the length of the signal changes with the drawn rate
        rate = random.uniform(0.8, 1.2)
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[["tempo", str(rate)]],
        )

    if "add_reverb" in augmentations:
        rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id
        rir_fn = transform_dict["add_reverb"]["data_path"] + rir_nfo  # TODO harmonize with noise
        rir, rir_fs = torchaudio.load(rir_fn)
        assert rir_fs == sample_rate
        # Convolve with the room impulse response and drop the tail to keep the original length
        speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]])

    if "add_noise" in augmentations:
        noise_path = transform_dict["add_noise"]["data_path"]
        noise = torch.zeros_like(speech)

        # Pick a noise type: 0 = speech, 1 = music, 2 = noise, 3 = babble with different volumes
        if not babble_noise:
            noise_idx = random.randrange(1, 3)
        else:
            noise_idx = random.randrange(0, 4)

        # speech
        if noise_idx == 0:
            # Pick a SNR level
            # TODO make SNRs configurable by noise type
            snr_db = random.randint(13, 20)
            pick_count = random.randint(3, 7)
            speech_df = noise_df.loc['speech']
            index_list = random.sample(range(speech_df.shape[0]), k=pick_count)
            for idx in index_list:
                noise += load_noise_seg(speech_df.iloc[idx], speech.shape, sample_rate, noise_path)
            noise /= pick_count
        # music
        elif noise_idx == 1:
            snr_db = random.randint(5, 15)
            music_df = noise_df.loc['music']
            noise_row = music_df.iloc[random.randrange(music_df.shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
        # noise
        elif noise_idx == 2:
            snr_db = random.randint(0, 15)
            other_df = noise_df.loc['noise']
            noise_row = other_df.iloc[random.randrange(other_df.shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
        # babble noise with different volume
        elif noise_idx == 3:
            snr_db = random.randint(13, 20)
            pick_count = random.randint(5, 10)  # Randomly select 5 to 10 speakers
            speech_df = noise_df.loc['speech']
            # Drawn with replacement: the same speaker segment may be picked several times
            index_list = random.choices(range(speech_df.shape[0]), k=pick_count)

            noise = torch.zeros(1, speech.shape[1])
            for idx in index_list:
                noise_ = load_noise_seg(speech_df.iloc[idx], speech.shape, sample_rate, noise_path)
                # Randomly select volume level (5-15 dB)
                transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')
                noise += transform(noise_)
            noise /= pick_count

        # The speech (not the noise) is rescaled so that the ratio between the L2 norms of
        # the scaled speech and of the noise matches the drawn SNR; the sum is then halved.
        speech_power = speech.norm(p=2)
        noise_power = noise.norm(p=2)
        snr = 10 ** (snr_db / 20)
        scale = snr * noise_power / speech_power
        speech = (scale * speech + noise) / 2

    if "phone_filtering" in augmentations:
        final_shape = speech.shape[1]
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[
                ["lowpass", "4000"],
                ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"],
                ["rate", "16000"],
            ])
        speech = speech[:, :final_shape]

    if "filtering" in augmentations:
        # One of the two filters is applied; "bandreject" is the SoX name of the band-stop filter
        effects = [
            ["bandpass", "2000", "3500"],
            ["bandreject", "200", "500"]]
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[effects[random.randint(0, 1)]],
        )

    if "codec" in augmentations:
        final_shape = speech.shape[1]
        # Each entry pairs the codec parameters with a human readable name (the name is not used)
        configs = [
            ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
            ({"format": "wav", "encoding": 'ALAW', "bits_per_sample": 8}, "8 bit a-law"),
            ({"format": "gsm"}, "GSM-FR"),
            ({"format": "mp3", "compression": -9}, "MP3"),
            ({"format": "vorbis", "compression": -1}, "Vorbis")
        ]
        param, _ = random.choice(configs)
        speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)
        speech = speech[:, :final_shape]

    return speech

Anthony Larcher's avatar
Anthony Larcher committed
278

Anthony Larcher's avatar
Anthony Larcher committed
279
def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
    """
    Pick a noise signal to add while performing data augmentation

    When the noise segment is longer than the speech, a window of the length of the speech is
    read at a random position inside the segment. Otherwise the whole segment is read and
    repeated until it fills the shape of the speech.

    :param noise_row: a row from a Pandas dataframe object; its 'start', 'duration' and 'file_id' entries are read,
    'start' and 'duration' being converted to frames by a multiplication with `sample_rate`
    :param speech_shape: shape of the speech signal to be augmented
    :param sample_rate: sampling rate of the speech signal to be augmented
    :param data_path: directory where to load the noise file from
    :return: the noise segment
    """
    noise_start = noise_row['start']
    noise_duration = noise_row['duration']
    noise_file_id = noise_row['file_id']

    # Offsets are cast to int up front: random.randrange rejects non-integer bounds
    first_offset = int(noise_start * sample_rate)

    if noise_duration * sample_rate > speech_shape[1]:
        # It is recommended to split noise files (especially speech noise type) in shorter subfiles
        # When frame_offset is too high, loading the segment can take much longer
        last_offset = int((noise_start + noise_duration) * sample_rate - speech_shape[1])
        if last_offset > first_offset:
            frame_offset = random.randrange(first_offset, last_offset)
        else:
            # The integer truncation left no room to move the window
            frame_offset = first_offset
        num_frames = int(speech_shape[1])
    else:
        frame_offset = first_offset
        num_frames = int(noise_duration * sample_rate)

    noise_fn = data_path + "/" + noise_file_id + ".wav"
    noise_seg, noise_sr = torchaudio.load(noise_fn, frame_offset=frame_offset, num_frames=num_frames)
    assert noise_sr == sample_rate

    if noise_seg.shape[1] < speech_shape[1]:
        # numpy.resize repeats the samples as many times as needed to fill the target shape
        noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape))
    return noise_seg