io.py 66.8 KB
Newer Older
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#    
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License, 
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT.  If not, see <http://www.gnu.org/licenses/>.

"""
Anthony Larcher's avatar
v1.3.7    
Anthony Larcher committed
25
Copyright 2014-2021 Anthony Larcher
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
26
27
28
29

:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""
Anthony Larcher's avatar
Anthony Larcher committed
30
import audioop
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
31
import decimal
Anthony Larcher's avatar
Anthony Larcher committed
32
import h5py
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
33
import logging
Anthony Larcher's avatar
Anthony Larcher committed
34
35
36
import math
import numpy
import os
Anthony Larcher's avatar
Anthony Larcher committed
37
import soundfile
Anthony Larcher's avatar
Anthony Larcher committed
38
39
import struct
import warnings
Anthony Larcher's avatar
Anthony Larcher committed
40
import wave
41
import scipy.signal
Anthony Larcher's avatar
Anthony Larcher committed
42
43
import scipy.io.wavfile
from scipy.signal import lfilter
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
44
from scipy.signal import decimate
Anthony Larcher's avatar
Anthony Larcher committed
45
from ..sidekit_wrappers import check_path_existance, process_parallel_lists
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
46
47
48


__author__ = "Anthony Larcher"
Anthony Larcher's avatar
v1.3.7    
Anthony Larcher committed
49
__copyright__ = "Copyright 2014-2021 Anthony Larcher"
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
50
51
52
53
54
55
56
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'


Anthony Larcher's avatar
Anthony Larcher committed
57
58
wav_flag = "float32"    # dtype passed to soundfile.read() by read_wav; could be "int16"

Anthony Larcher's avatar
Anthony Larcher committed
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# HTK parameters: codes of the base parameter kinds
WAVEFORM = 0
LPC = 1
LPCREFC = 2
LPCEPSTRA = 3
LPCDELCEP = 4
IREFC = 5
MFCC = 6
FBANK = 7
MELSPEC = 8
USER = 9
DISCRETE = 10
PLP = 11
ANON = 12

# Qualifier flags, written in octal; every value is a distinct single bit
_E = 0o000100  # has energy
_N = 0o000200  # absolute energy suppressed
_D = 0o000400  # has delta coefficients
_A = 0o001000  # has acceleration coefficients
_C = 0o002000  # is compressed
_Z = 0o004000  # has zero mean static coefficients
_K = 0o010000  # has CRC checksum
_0 = 0o020000  # has 0th cepstral coefficient
_V = 0o040000  # has VQ data
_T = 0o100000  # has third differential coefficients

# NOTE(review): presumably the parameter kinds whose samples are stored on
# 16 bits -- confirm against the HTK readers/writers that use this list.
parms16bit = [WAVEFORM, IREFC, DISCRETE]


Anthony Larcher's avatar
Initial  
Anthony Larcher committed
88
@check_path_existance
Anthony Larcher's avatar
Anthony Larcher committed
89
def write_pcm(data, output_file_name):
    """Write a signal to a single channel, headerless, little-endian 16-bit PCM file.

    A signal whose absolute peak is strictly below 1.0 is treated as a
    normalised signal and multiplied by 16384 before being rounded.
    Any other signal is rounded (if floating point), clipped to the int16
    range and written as it is.

    :param data: audio signal to write in a RAW PCM file (any array-like).
    :param output_file_name: name of the file to write
    """
    samples = numpy.asarray(data)

    # An empty signal has no peak: it simply produces an empty file.
    if samples.size and numpy.abs(samples).max() < 1.:
        samples = samples * 16384

    if samples.dtype.kind == 'f':
        # Round instead of truncating when converting floats to integers.
        samples = numpy.around(samples, decimals=0)

    int16_range = numpy.iinfo(numpy.int16)
    samples = numpy.clip(samples, int16_range.min, int16_range.max)

    with open(output_file_name, 'wb') as of:
        # '<i2' forces little-endian 16-bit samples whatever the platform.
        of.write(samples.astype('<i2').tobytes())

Anthony Larcher's avatar
Anthony Larcher committed
100
101
102
103
104
105
106
107
108
109
110
@check_path_existance
def write_wav(data, output_file_name, fs):
    """Write a signal to a 16-bit WAV file.

    - int16 data is written unchanged;
    - other integer data is assumed to already be on the 16-bit scale;
    - float32 data is peak-normalised to 0.9 and scaled by 2**15;
    - other floating point data is scaled by 2**15 without normalisation.

    Samples falling outside the int16 range raise a warning and are clipped.
    The array given by the caller is never modified.

    :param data: audio signal to write
    :param output_file_name: name of the file to write
    :param fs: sampling frequency in Hz
    """
    samples = numpy.asarray(data)

    if samples.dtype != numpy.int16:
        # Work on a private copy so that the normalisation below does not
        # alter the caller's array.
        samples = numpy.array(samples, copy=True)

        if samples.dtype == numpy.float32:
            peak = numpy.abs(samples).max() if samples.size else 0.
            if peak > 0:  # an all-zero signal must stay all-zero, not become NaN
                samples /= peak
                samples *= 0.9

        if samples.dtype.kind == 'f':
            samples = samples * 2 ** 15

        # Detect clipping BEFORE the cast: once the data is int16 it can no
        # longer exceed the int16 range.
        int16_range = numpy.iinfo(numpy.int16)
        if numpy.any(samples > int16_range.max) or numpy.any(samples < int16_range.min):
            warnings.warn('Warning: clipping detected when writing {}'.format(output_file_name))
            samples = numpy.clip(samples, int16_range.min, int16_range.max)

        samples = samples.astype(numpy.int16)

    scipy.io.wavfile.write(output_file_name, fs, samples)

Anthony Larcher's avatar
Initial  
Anthony Larcher committed
111

Anthony Larcher's avatar
Anthony Larcher committed
112
def read_pcm(input_file_name):
    """Read a signal from a single channel, headerless, little-endian 16-bit PCM file.

    A trailing odd byte, if any, does not form a complete sample and is ignored.

    :param input_file_name: name of the PCM file to read.

    :return: a tuple made of the audio signal as a float32 ndarray (holding the
        16-bit sample values), None (a raw file carries no sampling frequency)
        and 2 (depth of the encoding in bytes)
    """
    # numpy.fromfile only reads whole items, so a file with an odd number of
    # bytes no longer raises as the struct-based reader did.
    data = numpy.fromfile(input_file_name, dtype='<i2')
    return data.astype(numpy.float32), None, 2
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
126
127


Anthony Larcher's avatar
Anthony Larcher committed
128
def read_wav(input_file_name):
    """Read an audio file through the soundfile library.

    The samples are requested from soundfile with the dtype given by the
    module-level ``wav_flag``, arranged as (frames, channels), squeezed to
    drop singleton dimensions and converted to float32.

    :param input_file_name: name of the audio file to read
    :return: a tuple made of the signal as a float32 ndarray, the sample rate
        reported by soundfile and the constant 4
    """
    channel_count = soundfile.info(input_file_name).channels
    samples, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
    samples = numpy.array(samples).reshape(-1, channel_count).squeeze()
    return samples.astype(numpy.float32), sample_rate, 4

Anthony Larcher's avatar
Initial  
Anthony Larcher committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

def pcmu2lin(p, s=4004.189931):
    """Convert Mu-law PCM to linear X=(P,S)
    lin = pcmu2lin(pcmu) where pcmu contains a scalar or a vector
    of mu-law values in the range 0 to 255.
    No checking is performed to see that numbers are in this range.

    Output values are divided by the scale factor s:

        s		Output Range
        1		+-8031	(integer values)
        4004.2	+-2.005649 (default)
        8031		+-1
        8159		+-0.9843118 (+-1 nominal full scale)

    The default scaling factor 4004.189931 is equal to
    sqrt((2207^2 + 5215^2)/2) this follows ITU standard G.711.
    The sine wave with PCM-Mu values [158 139 139 158 30 11 11 30]
    has a mean square value of unity corresponding to 0 dBm0.

    :param p: input signal encoded in PCM mu-law to convert (scalar or array-like)
    :param s: conversion value from mu-scale to linear scale

    :return: the linear signal as float64 value(s)
    """
    # Compute in float64: with the natural uint8 storage of mu-law codes,
    # "127 - p" would silently wrap around for every code above 127.
    p = numpy.asarray(p, dtype=numpy.float64)
    t = 4 / s
    m = 15 - (p % 16)
    q = numpy.floor(p / 128)
    e = (127 - p - m + 128 * q) / 16
    x = (m + 16.5) * numpy.power(2, e) - 16.5
    z = (q - 0.5) * x * t
    return z


Anthony Larcher's avatar
Anthony Larcher committed
176
def _ulaw_to_float(codes):
    """Expand G.711 mu-law bytes into linear 16-bit sample values divided by 32768."""
    # Work on the unsigned, bit-inverted code whatever the signedness of the input.
    inverted = 0xFF - (numpy.asarray(codes).astype(numpy.int64) & 0xFF)
    magnitude = (((inverted & 0x0F) << 3) + 0x84) << ((inverted & 0x70) >> 4)
    linear = numpy.where(inverted & 0x80, 0x84 - magnitude, magnitude - 0x84)
    return linear / 32768.


def _read_sph_header(fid):
    """Parse the text header of an open SPHERE file into (format id, header length, fields)."""
    fid.seek(0, 0)
    l1 = fid.readline().decode("utf-8")
    l2 = fid.readline().decode("utf-8")
    if l1 != 'NIST_1A\n' or l2 != '   1024\n':
        logging.warning('File does not begin with a SPHERE header')
    hlen = int(l2[3:7])

    hdr = {}
    while True:
        raw = fid.readline()
        if not raw:
            # End of file reached without meeting "end_head".
            logging.warning('Missing end_head in SPHERE header')
            break
        st = raw.decode("utf-8").rstrip()
        if not st or st[0] == ';':  # skip blank lines and comments
            continue
        elt = st.split(' ')
        if elt[0] == 'end_head':
            break
        if len(elt) < 3 or not elt[1].startswith('-'):
            logging.warning("Missing '-' in SPHERE header")
            break
        kind = elt[1][1:2]
        if kind == 's':
            hdr[elt[0]] = elt[2]
        elif kind == 'i':
            hdr[elt[0]] = int(elt[2])
        else:
            hdr[elt[0]] = float(elt[2])
    return l1.rstrip(), hlen, hdr


def read_sph(input_file_name, mode='p'):
    """
    Read a SPHERE audio file

    :param input_file_name: name of the file to read (the ".sph" extension is
        appended if the name alone does not exist). A five-element FFX list
        whose fourth item is an already filled "info" list is also accepted.
    :param mode: string of one-letter flags (\\* =default)

    .. note::

        - Scaling (the first scaling flag found is used):

            - 's'    Auto scale to make data peak = +-1
            - 'r'    Raw unscaled data (integer values)
            - 'p'    Scaled to make +-1 equal full scale (\\*)
            - 'o'    Scale to bin centre rather than bin edge (e.g. 127 rather than 127.5 for 8 bit values,
                     can be combined with n+p,r,s modes)
            - 'n'    Scale to negative peak rather than positive peak (e.g. 128.5 rather than 127.5 for 8 bit values,
                     can be combined with o+p,r,s modes)

        - Format

           - 'l'    Little endian data (Intel,DEC) (overrides indication in file)
           - 'b'    Big endian data (non Intel/DEC) (overrides indication in file)

        - Any other flag is ignored. In particular the file is always closed
          before returning, since the file handle is not part of the returned values.

    :return: a tuple (Y, FS, NBYTES)

    .. note::

        - Y       float32 signal, of dimension (samples,) for one channel
                  or (samples, channels) otherwise
        - FS      sample frequency in Hz
        - NBYTES  number of bytes per sample in the file

    :raise Exception: if the file cannot be found or if the signal is
        compressed (shorten, wavpack, shortpack), as the decoder is
        not released with this toolkit.
    """
    codings = {'pcm': 1, 'ulaw': 2}
    compressions = {',embedded-shorten-': 1,
                    ',embedded-wavpack-': 2,
                    ',embedded-shortpack-': 3}
    endianess = {'l': '<', 'b': '>'}

    # One flag per character; 'p' is appended so that a scaling flag always exists.
    flags = list(mode) + ['p']
    sc = next(m for m in flags if 'p' <= m <= 's')
    # scale to input limits not output limits
    mno = 'o' not in flags

    # Get byte order (little/big endian) when it is forced by the caller
    byteorder = 'l'
    byteorder_forced = True
    if 'l' in flags:
        byteorder = 'l'
    elif 'b' in flags:
        byteorder = 'b'
    else:
        byteorder_forced = False

    if isinstance(input_file_name, (str, os.PathLike)):
        path = os.fspath(input_file_name)
        if not os.path.exists(path):
            if not os.path.exists(path + '.sph'):
                raise Exception('Cannot find file {}'.format(input_file_name))
            path += '.sph'
        ffx = [path, '', '', '', '']
        fid = open(path, 'rb')
    else:
        ffx = input_file_name
        if ffx[3] == '':
            raise Exception('Cannot read a SPHERE file from an FFX list without file information')
        fid = ffx[3][0]

    try:
        # Read the header
        if ffx[3] == '':
            ffx[2], hlen, hdr = _read_sph_header(fid)

            if 'sample_byte_format' in hdr:
                bord = 'l' if hdr['sample_byte_format'][0] == '0' else 'b'
                if not byteorder_forced:
                    byteorder = bord

            icode = 0  # Get encoding, default is PCM
            if 'sample_coding' in hdr:
                icode = -1  # unknown code
                sample_coding = hdr['sample_coding']
                for coding in codings:
                    if not sample_coding.startswith(coding):
                        continue
                    suffix = sample_coding[len(coding):]
                    if not suffix:  # the signal is not compressed
                        icode = codings[coding] - 1
                    else:
                        # The compression name is followed by a version string,
                        # so it has to be matched as a prefix of the suffix.
                        for compression in compressions:
                            if suffix.startswith(compression):
                                icode = 10 * compressions[compression] + codings[coding] - 1
                                break
                    break

            # initialize info of the files with default values
            info = [fid, 0, hlen, ord(byteorder), 0, 1, 2, 16, 1, 1, -1, icode]
            # Get existing info from the header
            for position, key in ((4, 'sample_count'),
                                  (5, 'channel_count'),
                                  (6, 'sample_n_bytes'),
                                  (7, 'sample_sig_bits'),
                                  (8, 'sample_rate'),
                                  (9, 'sample_min'),
                                  (10, 'sample_max')):
                if key in hdr:
                    info[position] = hdr[key]
            if not info[4]:  # if no info sample_count or zero
                # Deduce the sample count from the size of the file; this must
                # come after channel_count and sample_n_bytes have been read.
                fid.seek(0, 2)
                info[4] = int(math.floor((fid.tell() - info[2]) / (info[5] * info[6])))

            ffx[1] = hdr
            ffx[3] = info

        info = ffx[3]
        icode = info[11]
        if not byteorder_forced:
            byteorder = chr(info[3])
        ksamples = info[4]

        if ksamples > 0:
            if icode >= 10 and ffx[4] == '':  # read compressed signal
                # need to use a script with SHORTEN
                raise Exception('compressed signal, need to unpack in a script with SHORTEN')
            info[1] = ksamples
            # use modes o and n to determine effective peak
            pk = 2 ** (8 * info[6] - 1) * (1 + (float(mno) / 2 - int('n' not in flags)) / 2 ** info[7])
            fid.seek(info[2])  # jump after the header
            nsamples = info[5] * ksamples
            prefix = endianess[byteorder]

            if info[6] < 2:
                logging.debug('Sphere i1 PCM')
                y = numpy.fromfile(fid, prefix + "i1", -1)
                if icode % 10 == 1:
                    y = _ulaw_to_float(y)
                    pk = 1.
                else:
                    # Widen first: 128 does not fit in int8.
                    # NOTE(review): the bytes are read as signed before the
                    # offset is removed, as in the original code -- confirm.
                    y = y.astype(numpy.int16) - 128
            elif info[6] < 3:
                logging.debug('Sphere i2')
                y = numpy.fromfile(fid, prefix + "i2", -1)
            elif info[6] < 4:  # 24-bit samples: this branch has not been verified
                y = numpy.fromfile(fid, prefix + "i1", -1)
                y = y.reshape(nsamples, 3).transpose()
                y = (numpy.dot(numpy.array([1, 256, 65536]), y)
                     - (numpy.dot(y[2, :], 2 ** (-7)).astype(int) * 2 ** 24))
            else:  # not verified either
                y = numpy.fromfile(fid, prefix + "i4", -1)

            if sc != 'r':
                if sc == 's':
                    if info[9] > info[10]:
                        info[9] = numpy.min(y)
                        info[10] = numpy.max(y)
                    peak = max(abs(info[9]), abs(info[10]))
                    sf = 1 / peak if peak else 1.  # leave a null signal untouched
                else:
                    sf = 1 / pk
                y = sf * y

            if info[5] > 1:
                y = y.reshape(ksamples, info[5])
        else:
            y = numpy.array([])
    finally:
        # Never leak the file handle, even when the header or data is invalid.
        fid.close()
        if ffx[3] != '':
            ffx[3][0] = -1

    return y.astype(numpy.float32), int(info[8]), int(info[6])
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
423
424


Anthony Larcher's avatar
Anthony Larcher committed
425
def read_audio(input_file_name, framerate=None):
    """ Read a 1 or 2-channel audio file in SPHERE, WAVE or RAW PCM format.
    The format is determined from the file extension.
    If the sample rate read from the file is a multiple of the one given
    as parameter, we apply a decimation function to subsample the signal.
    In every other case the signal is returned at the rate it was read.

    :param input_file_name: name of the file to read from
    :param framerate: expected frame rate (mandatory despite its default value).
        If it is an integer divisor of the one read from the file, subsampling
        is applied. For RAW PCM files, which carry no sampling frequency,
        this value is taken as the rate of the file.

    :return: the signal as a float32 numpy array and the sampling frequency
        of the returned signal

    :raise TypeError: if framerate is not given or if the file extension is unknown
    """
    if framerate is None:
        raise TypeError("Expected sampling frequency required in sidekit.frontend.io.read_audio")

    ext = os.path.splitext(input_file_name)[-1].lower()
    if ext == '.sph':
        sig, read_framerate, _ = read_sph(input_file_name, 'p')
    elif ext in ('.wav', '.wave'):
        try:
            sig, read_framerate, _ = read_wav(input_file_name)
        except Exception:
            # Deliberate best effort: fall back on pydub for the files that
            # the default reader cannot decode (first channel only).
            import pydub
            audio = pydub.AudioSegment.from_wav(input_file_name)
            read_framerate = audio.frame_rate
            sig = numpy.array(audio.split_to_mono()[0].get_array_of_samples())
    elif ext in ('.pcm', '.raw'):
        sig, _, _ = read_pcm(input_file_name)
        read_framerate = framerate
    else:
        raise TypeError("Unknown extension of audio file")

    # The rate reported to the caller must be the one of the returned signal:
    # it only becomes the requested rate when the decimation really happens.
    output_framerate = read_framerate
    if framerate > read_framerate:
        logging.warning("Warning in read_audio, up-sampling function is not implemented yet!")
    elif framerate != read_framerate:
        if read_framerate % float(framerate) == 0:
            logging.info("downsample {}".format(input_file_name))
            sig = scipy.signal.decimate(sig, int(read_framerate / float(framerate)),
                                        n=None, ftype='iir', axis=0)
            output_framerate = framerate
        else:
            logging.warning("Warning in read_audio, cannot downsample {} from {} to {}: "
                            "only integer ratios are supported".format(input_file_name,
                                                                       read_framerate,
                                                                       framerate))
    return sig.astype(numpy.float32), output_framerate
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
465

Anthony Larcher's avatar
modif    
Anthony Larcher committed
466

Anthony Larcher's avatar
Initial  
Anthony Larcher committed
467
468
@check_path_existance
def write_label(label,
                output_file_name,
                selected_label='speech',
                frame_per_second=100,
                show=None,
                format="mdtm"):
    """Save the segments of a frame-level label, in "lab" or MDTM format.

    One line is written for each run of consecutive True frames.
    Nothing is written (and no file is created) if label is empty.

    :param label: label to write in the file given as a ndarray of boolean
    :param output_file_name: name of the file to write to
    :param selected_label: label to write to the file. Default is 'speech'.
    :param frame_per_second: number of frame per seconds. Used to convert
            the frame number into time. Default is 100.
    :param show: name of the show written in the first column of the MDTM
            format. Default is the name of the output file without extension.
    :param format: "lab" to write lines of "start stop label", any other value
            writes MDTM lines. Default is "mdtm".
    """
    label = numpy.asarray(label, dtype=bool)
    if label.shape[0] == 0:
        return

    # True wherever two consecutive frames differ, i.e. at each segment boundary
    bits = label[:-1] ^ label[1:]
    # convert true value into a list of feature indexes
    # append 0 at the beginning of the list, append the last index to the list
    idx = [0] + (numpy.flatnonzero(bits) + 1).tolist() + [len(label)]
    # Decimal keeps the time stamps free of binary rounding noise
    frame_duration = decimal.Decimal(1) / decimal.Decimal(frame_per_second)

    # Runs alternate between True and False: the True runs start at the even
    # positions of idx when the first frame is True, at the odd ones otherwise.
    first = 0 if label[0] else 1
    segments = [(idx[i] * frame_duration, idx[i + 1] * frame_duration)
                for i in range(first, len(idx) - 1, 2)]

    if format == "lab":
        lines = ['{} {} {}\n'.format(str(start), str(stop), selected_label)
                 for start, stop in segments]
    else:
        # write in MDTM format
        if show is None:
            show = os.path.splitext(os.path.basename(output_file_name))[0]
        gender = 'U'
        env = 'U'
        channel = 'U'
        lines = ['{:s} 1 {:.2f} {:.2f} {:s} {:s} {:s} {:s}\n'.format(
            str(show), start, stop - start, gender, channel, env, selected_label)
            for start, stop in segments]

    with open(output_file_name, 'w', encoding="utf8") as fid:
        fid.writelines(lines)
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
511
512


Anthony Larcher's avatar
Anthony Larcher committed
513
def read_label(input_file_name, selected_label='speech', frame_per_second=100):
    """Read a label file made of lines "start stop label" with times in seconds

    Blank lines are ignored.

    :param input_file_name: the label file name
    :param selected_label: the label to return. Default is 'speech'.
    :param frame_per_second: number of frame per seconds. Used to convert
            the time into frame number. Default is 100.

    :return: a logical array with one value per frame, True for the frames
        covered by a segment of the selected label. Its length is given by
        the end of the last segment of the file.
    """
    with open(input_file_name) as f:
        segments = [line.split() for line in f if line.strip()]

    if not segments:
        return numpy.zeros(0).astype(bool)

    def to_frame(time_stamp):
        # Round rather than truncate: 0.29 * 100 is 28.999999999999996
        return int(round(float(time_stamp) * frame_per_second))

    # initialize the length from the last segment's end
    _, last_stop, _ = segments[-1]
    lbl = numpy.zeros(to_frame(last_stop)).astype(bool)

    for start, stop, label in segments:
        if label == selected_label:
            # Slice bounds must be integers, not numpy floats
            lbl[to_frame(start):to_frame(stop)] = True
    return lbl


Anthony Larcher's avatar
Anthony Larcher committed
545
546
547
548
def read_spro4(input_file_name,
               label_file_name="",
               selected_label="",
               frame_per_second=100):
    """Read a feature stream in SPRO4 format.

    The file may start with an (empty) 19-byte ``<header>`` block. It is then
    made of a 10-byte fixed part (feature dimension, 4 flag bytes and the
    frame rate) followed by the frames stored as 32-bit floats.

    :param input_file_name: name of the feature file to read from
    :param label_file_name: name of the label file to read if required.
        By default, no label file is read and every frame is returned.
    :param selected_label: label to select in the label file. Default is none.
    :param frame_per_second: number of frames per second, forwarded to
        ``read_label`` together with the label file. Default is 100.

    :return: the selected features in a float32 numpy array, one row per frame
    """
    with open(input_file_name, 'rb') as f:
        # Compare raw bytes: the first bytes of a header-less file are binary
        # data and are not guaranteed to be decodable as text.
        has_header = f.read(8) == b'<header>'

        f.seek(0, 2)  # go to the end of the file
        size = f.tell()  # total size in bytes
        f.seek(0, 0)  # go back to the beginning of the file

        head_size = 0
        if has_header:
            # Only the empty header is supported: skip its 19 bytes
            struct.unpack("19b", f.read(19))
            head_size = 19

        # "=" keeps the native byte order but disables alignment padding so
        # that the fixed part is exactly 10 bytes; flags and rate are unused.
        dim = struct.unpack("=H4bf", f.read(10))[0]
        n_frames = (size - 10 - head_size) // (4 * dim)

        features = numpy.frombuffer(f.read(4 * n_frames * dim), dtype='f')
        features = features.reshape((n_frames, dim))

    lbl = numpy.ones(features.shape[0]).astype(bool)
    if label_file_name != "":
        lbl = read_label(label_file_name, selected_label, frame_per_second)

    # Boolean indexing returns a copy, so the read-only buffer is never exposed
    features = features[lbl, :]
    return features.astype(numpy.float32)
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
591
592


Anthony Larcher's avatar
Anthony Larcher committed
593
594
595
596
597
598
def read_hdf5_segment(file_handler,
                      show,
                      dataset_list,
                      label,
                      start=None, stop=None,
                      global_cmvn=False):
    """Read a segment from a stream in HDF5 format. Return the features in the
    range start:stop.
    In case the start and stop cannot be reached, the first or last feature are
    copied so that the length of the returned segment is always stop-start.

    :param file_handler: opened HDF5 file (or mapping with the same layout)
        to read from
    :param show: identifier of the show (group) to read in the HDF5 file
    :param dataset_list: names of the datasets to load; among 'energy', 'cep',
        'fb' and 'bnf'. They are always stacked in that order.
    :param label: frame selector to return; if None, the "vad" dataset of the
        show is used when present, otherwise all frames are selected
    :param start: index of the first frame to read; None means 0, a negative
        value pads the beginning by repeating the first frame
    :param stop: index following the last frame to read; None means the end of
        the show, a value past the end pads by repeating the last frame
    :param global_cmvn: flag returned to the caller as last element of the
        result

    :return: a tuple (feat, label, global_mean, global_std, global_cmvn)
    :raise Exception: if the show or one of the requested datasets is missing
    """
    h5f = file_handler

    compression_type = {0: 'none', 1: 'htk', 2: 'percentile'}
    if "compression" not in h5f:
        compression = 'none'
        print("Warning, default feature storage mode is now using compression")
    else:
        compression_flag = h5f["compression"]
        if isinstance(compression_flag, h5py.Dataset):
            # stored as a scalar dataset: read its value
            compression = compression_type[compression_flag[()]]
        else:
            compression = compression_type[compression_flag]

    if show not in h5f:
        raise Exception('show {} is not in the HDF5 file'.format(show))

    # Length of the show, taken from the first dataset found in its group
    dataset_length = h5f[show + "/" + next(iter(h5f[show]))].shape[0]

    # Deal with the case where start < 0 or stop > feat.shape[0]
    if start is None:
        start = 0
    pad_begining = -start if start < 0 else 0
    start = max(start, 0)

    if stop is None:
        stop = dataset_length
    pad_end = stop - dataset_length if stop > dataset_length else 0
    stop = min(stop, dataset_length)
    # NOTE(review): start and stop have already been replaced by their
    # defaults here, so the second operand is always True - confirm the
    # intended behaviour when no segment is requested.
    global_cmvn = global_cmvn and not (start is None or stop is None)

    # Get the data between start and stop
    # Concatenate all required datasets
    feat = []
    global_mean = []
    global_std = []

    for data_id in ['energy', 'cep', 'fb', 'bnf']:
        if data_id not in dataset_list:
            continue
        if "/".join((show, data_id)) not in h5f:
            raise Exception('{} is not in the HDF5 file'.format(data_id))

        dataset_id = show + '/{}'.format(data_id)
        if compression == 'none':
            data = _read_segment(h5f, dataset_id, start, stop)
            if data.ndim == 1:
                # make single-coefficient streams stackable column-wise
                data = data[:, numpy.newaxis]
            feat.append(data)
        elif compression == 'htk':
            feat.append(_read_segment_htk(h5f, dataset_id, start, stop))
        else:
            feat.append(_read_segment_percentile(h5f, dataset_id, start, stop))
        global_mean.append(h5f["/".join((show, "{}_mean".format(data_id)))][()])
        global_std.append(h5f["/".join((show, "{}_std".format(data_id)))][()])

    feat = numpy.hstack(feat)
    global_mean = numpy.hstack(global_mean)
    global_std = numpy.hstack(global_std)

    if label is None:
        if "/".join((show, "vad")) in h5f:
            label = h5f.get("/".join((show, "vad")))[()].astype('bool').squeeze()[start:stop]
        else:
            label = numpy.ones(feat.shape[0], dtype='bool')

    # Pad the segment if needed
    feat = numpy.pad(feat, ((pad_begining, pad_end), (0, 0)), mode='edge')
    label = numpy.pad(label, (pad_begining, pad_end), mode='edge')

    return feat, label, global_mean, global_std, global_cmvn
Anthony Larcher's avatar
Anthony Larcher committed
683
684


Anthony Larcher's avatar
Anthony Larcher committed
685
def read_spro4_segment(input_file_name, start=0, end=None):
    """Read a segment from a stream in SPRO4 format. Return the features in the
    range start:end.
    In case the start and end cannot be reached, the first or last feature are
    copied so that the length of the returned segment is always end-start.

    :param input_file_name: name of the feature file to read from
    :param start: index of the first frame to read (start at zero)
    :param end: index of the last frame following the segment to read.
       end < 0 means that end is the value of the right_context to add
       at the end of the file. None means the end of the file.

    :return: a sequence of features in a float32 ndarray of length end-start
    :raise ValueError: if the requested range does not contain any frame
    """
    with open(input_file_name, 'rb') as f:
        # Compare raw bytes: the first bytes of a header-less file are binary
        # data and are not guaranteed to be decodable as text.
        has_header = f.read(8) == b'<header>'

        f.seek(0, 2)  # go to the end of the file
        size = f.tell()  # total size in bytes

        # Only the empty 19-byte header is supported
        head_size = 19 if has_header else 0
        f.seek(head_size, 0)

        # "=" keeps the native byte order but disables alignment padding so
        # that the fixed part is exactly 10 bytes; flags and rate are unused.
        dim = struct.unpack("=H4bf", f.read(10))[0]
        n_frames = (size - 10 - head_size) // (4 * dim)

        if end is None:
            end = n_frames
        elif end < 0:
            # negative end: number of frames of right context to add
            end = n_frames - end

        first, last = max(0, start), min(n_frames, end)
        if last <= first:
            raise ValueError("empty segment [{}, {}) for a file of {} frames".format(start, end, n_frames))

        # The data offset must account for the optional header as well
        f.seek(head_size + 10 + dim * 4 * first, 0)
        features = numpy.fromfile(f, '<f', (last - first) * dim)
        features = features.reshape(last - first, dim)

    if start != first or end != last:  # repeat first or/and last frame as required
        features = numpy.r_[numpy.repeat(features[[0]], first - start, axis=0),
                            features,
                            numpy.repeat(features[[-1]], end - last, axis=0)]

    return features.astype(numpy.float32)
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
734
735
736


@check_path_existance
def write_spro4(features, output_file_name):
    """Write a feature stream in SPRO4 format.

    The file is made of the feature dimension (unsigned short), four flag
    bytes, the frame rate (float, fixed to 100.0) and the frames stored as
    32-bit floats, without any ``<header>`` block.

    :param features: sequence of features to write, one row per frame
    :param output_file_name: name of the file to write to
    """
    _, dim = numpy.shape(features)  # get feature stream's dimensions
    # Flatten before opening the file so that invalid input does not leave a
    # truncated file behind; asarray also accepts nested sequences.
    data = numpy.asarray(features).flatten()

    # The context manager closes the file even if a write fails
    with open(output_file_name, 'wb') as f:
        f.write(struct.pack("H", dim))  # write feature dimension
        f.write(struct.pack("4b", 25, 0, 0, 0))  # write flag (not important)
        f.write(struct.pack("f", 100.0))  # write frequency of feature extraction
        # A repeat count avoids building a format string of len(data) characters
        f.write(struct.pack('{}f'.format(len(data)), *data))


@check_path_existance
def write_htk(features,
              output_file_name,
              framerate=100,
              dt=9):
    """Write a HTK feature file.

    :param features: vector for waveforms, one row per frame for other types
    :param output_file_name: name of the file to write to
    :param framerate: feature sample in Hz
    :param dt: data type (also includes Voicebox code for generating data),
        optionally combined with HTK qualifier bits such as ``_C``

            0. WAVEFORM Acoustic waveform
            1.  LPC Linear prediction coefficients
            2.  LPREFC LPC Reflection coefficients: -lpcar2rf([1 LPC]);LPREFC(1)=[];
            3.  LPCEPSTRA    LPC Cepstral coefficients
            4. LPDELCEP     LPC cepstral+delta coefficients (obsolete)
            5.  IREFC        LPC Reflection coefficients (16 bit fixed point)
            6.  MFCC         Mel frequency cepstral coefficients
            7.  FBANK        Log Fliter bank energies
            8.  MELSPEC      linear Mel-scaled spectrum
            9.  USER         User defined features
            10.  DISCRETE     Vector quantised codebook
            11.  PLP          Perceptual Linear prediction
            12.  ANON
    """
    # The header stores the sample period as an integer number of 100 ns
    # units; struct refuses to pack a float into an integer field.
    sampling_period = int(round(1e7 / framerate))

    pk = dt & 0x3f  # base parameter kind (low six bits)
    dt &= ~_K  # clear unsupported CRC bit
    features = numpy.atleast_2d(features)
    if pk == 0:
        # waveforms are stored as a single column
        features = features.reshape(-1, 1)

    compressed = bool(dt & _C)
    sixteen_bit = pk in parms16bit
    # Compressed files count the scale and bias vectors as 4 extra samples
    n_samples = len(features) + (4 if compressed else 0)
    sample_size = features.shape[1] * (2 if (sixteen_bit or compressed) else 4)

    with open(output_file_name, 'wb') as fh:
        fh.write(struct.pack(">IIHH", n_samples, sampling_period, sample_size, dt))
        if pk == 5:
            # Not in place: keep the caller's array untouched and accept
            # integer input
            features = features * 32767.0
        if sixteen_bit:
            features = features.astype('>h')
        elif compressed:
            mmax, mmin = features.max(axis=0), features.min(axis=0)
            # Widen the range of constant coefficients to avoid a division
            # by zero when computing the scale
            mmax[mmax == mmin] += 32767
            scale = 2 * 32767. / (mmax - mmin)
            bias = 0.5 * scale * (mmax + mmin)
            features = features * scale - bias
            numpy.array([scale]).astype('>f').tofile(fh)
            numpy.array([bias]).astype('>f').tofile(fh)
            features = features.astype('>h')
        else:
            features = features.astype('>f')
        features.tofile(fh)

Anthony Larcher's avatar
Anthony Larcher committed
820
821
822
823
def read_htk(input_file_name,
             label_file_name="",
             selected_label="",
             frame_per_second=100):
    """Read a sequence of features in HTK format

    :param input_file_name: name of the file to read from
    :param label_file_name: name of the label file to read from; by default
        no label file is read and every frame is returned
    :param selected_label: label to select
    :param frame_per_second: number of frames per second

    :return: a tupple (d, fp, dt, tc, t) described below
    :raise ValueError: if the header of a compressed file announces fewer
        samples than the compression constants require

    .. note::

        - d = data: column vector for waveforms, 1 row per frame for other types
        - fp = frame period in seconds
        - dt = data type (also includes Voicebox code for generating data)

            0. WAVEFORM Acoustic waveform
            1.  LPC Linear prediction coefficients
            2.  LPREFC LPC Reflection coefficients: -lpcar2rf([1 LPC]);LPREFC(1)=[];
            3.  LPCEPSTRA    LPC Cepstral coefficients
            4. LPDELCEP     LPC cepstral+delta coefficients (obsolete)
            5.  IREFC        LPC Reflection coefficients (16 bit fixed point)
            6.  MFCC         Mel frequency cepstral coefficients
            7.  FBANK        Log Fliter bank energies
            8.  MELSPEC      linear Mel-scaled spectrum
            9.  USER         User defined features
            10.  DISCRETE     Vector quantised codebook
            11.  PLP          Perceptual Linear prediction
            12.  ANON

        - tc = full type code = dt plus (optionally)
                one or more of the following modifiers

            - 64  _E  Includes energy terms
            - 128  _N  Suppress absolute energy
            - 256  _D  Include delta coefs
            - 512  _A  Include acceleration coefs
            - 1024  _C  Compressed
            - 2048  _Z  Zero mean static coefs
            - 4096  _K  CRC checksum (not implemented yet)
            - 8192  _0  Include 0'th cepstral coef
            - 16384  _V  Attach VQ index
            - 32768  _T  Attach delta-delta-delta index

        - t = text version of type code e.g. LPC_C_K

    This function is a translation of the Matlab code from
    VOICEBOX is a MATLAB toolbox for speech processing.
    by  Mike Brookes
    Home page: `VOICEBOX <http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html>`
    """
    kinds = ['WAVEFORM', 'LPC', 'LPREFC', 'LPCEPSTRA', 'LPDELCEP', 'IREFC',
             'MFCC', 'FBANK', 'MELSPEC', 'USER', 'DISCRETE', 'PLP', 'ANON',
             '???']
    header_size = 12

    with open(input_file_name, 'rb') as fid:
        # number of frames, frame interval (100 ns units), bytes per frame and
        # type code; the type code is read unsigned to keep all 16 flag bits
        nf, period, by, tc = struct.unpack(">llhH", fid.read(header_size))
        fp = period * 1.e-7  # frame interval (in seconds)

        dt = tc & 0x3f  # low six bits of tc represent data type
        compressed = bool(tc & 1024)  # _C modifier

        if dt == 5:
            fid.seek(0, 2)  # go to the end of the file
            flen = fid.tell()  # get the position
            fid.seek(header_size, 0)  # go back to the first byte of data
            if flen > 14 + by * nf:  # if file too long
                dt = 2  # change type to LPREFC
                compressed = True  # set compressed flag
                nf += 4  # frame count doesn't include
                # compression constants in this case

        # 16 bit data for waveforms, IREFC and DISCRETE
        if dt in (0, 5, 10):
            n_col = by // 2
            data = numpy.frombuffer(fid.read(2 * n_col * nf), dtype='>h')
            # convert to float so that IREFC values can be scaled
            d = data.reshape(nf, n_col).astype(numpy.float64)
            if dt == 5:
                d = d / 32767  # scale IREFC
        elif compressed:  # compressed data - first read scales
            nf -= 4  # frame count includes compression constants
            if nf < 0:
                raise ValueError("invalid number of samples in compressed HTK file")
            n_col = by // 2
            scales = numpy.frombuffer(fid.read(4 * n_col), dtype='>f').astype(numpy.float64)
            biases = numpy.frombuffer(fid.read(4 * n_col), dtype='>f').astype(numpy.float64)
            data = numpy.frombuffer(fid.read(2 * n_col * nf), dtype='>h')
            d = (data.reshape(nf, n_col) + biases) / scales
        else:
            n_col = by // 4
            data = numpy.frombuffer(fid.read(4 * n_col * nf), dtype='>f')
            d = data.reshape(nf, n_col)

    t = kinds[min(dt, len(kinds) - 1)]

    lbl = numpy.ones(d.shape[0]).astype(bool)
    if label_file_name != "":
        lbl = read_label(label_file_name, selected_label, frame_per_second)

    d = d[lbl, :]

    return d.astype(numpy.float32), fp, dt, tc, t
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
936
937


Anthony Larcher's avatar
Anthony Larcher committed
938
def read_htk_segment(input_file_name,
                     start=0,
                     stop=None):
    """Read a segment from a stream in HTK format. Return the features in the
    range start:stop.
    In case the start and stop cannot be reached, the first or last feature are
    copied so that the length of the returned segment is always stop-start.

    :param input_file_name: name of the feature file to read from or file-like
        object alowing to seek in the file
    :param start: index of the first frame to read (start at zero)
    :param stop: index of the last frame following the segment to read.
       stop < 0 means that stop is the value of the right_context to add
       at the end of the file. None means the end of the file.

    :return: a sequence of features in a float32 ndarray of length stop-start
    :raise ValueError: if the requested range does not contain any frame
    """
    try:
        fh = open(input_file_name, 'rb')
    except TypeError:
        # not a path: assume an already opened, seekable binary file object
        fh = input_file_name
    try:
        fh.seek(0)
        n_samples, _, sample_size, parm_kind = struct.unpack(">IIHH", fh.read(12))
        pk = parm_kind & 0x3f  # base parameter kind (low six bits)
        compressed = bool(parm_kind & _C)
        if compressed:
            # scale and bias vectors, one 32-bit float per coefficient each;
            # read through the file object so that file-likes are supported
            n_coef = sample_size // 2
            scale, bias = numpy.frombuffer(fh.read(2 * 4 * n_coef), dtype='>f').reshape(2, n_coef)
            n_samples -= 4  # sample count includes the compression constants

        if stop is None:
            stop = n_samples
        elif stop < 0:
            # negative stop: number of frames of right context to add
            stop = n_samples - stop

        s, e = max(0, start), min(n_samples, stop)
        if e <= s:
            raise ValueError("empty segment [{}, {}) for a file of {} frames".format(start, stop, n_samples))

        fh.seek(s * sample_size, 1)
        dtype, _bytes = ('>h', 2) if compressed or pk in parms16bit else ('>f', 4)
        n_col = sample_size // _bytes
        m = numpy.frombuffer(fh.read((e - s) * sample_size), dtype=dtype).reshape(e - s, n_col)
        if compressed:
            m = (m + bias) / scale
        if pk == IREFC:
            # not in place: 16-bit integer data cannot hold the scaled values
            m = m / 32767.0
        if pk == WAVEFORM:
            m = m.ravel()
    finally:
        if fh is not input_file_name:
            fh.close()
    if start != s or stop != e:  # repeat first or/and last frame as required
        m = numpy.r_[numpy.repeat(m[[0]], s - start, axis=0),
                     m,
                     numpy.repeat(m[[-1]], stop - e, axis=0)]
    return m.astype(numpy.float32)
Anthony Larcher's avatar
Initial  
Anthony Larcher committed
982

Anthony Larcher's avatar
Anthony Larcher committed
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
def _add_dataset_header(fh,
                        dataset_id,
                        _min_val,
                        _range,
                        _header):
    """
    Create a dataset in the HDF5 file and write the data
    after compressing float to int
    """
    _c_header = (_header - _min_val) / _range
    numpy.clip(_c_header, 0., 1.)
    _c_header = (_c_header * 65535 + 0.499).astype(int)

    fh.create_dataset(dataset_id + '_header',
                      data=_c_header,
                      maxshape=(None, None),
                      compression="gzip",
                      fletcher32=True)