idmap.py 15.2 KB
Newer Older
Anthony Larcher's avatar
Anthony Larcher committed
1
2
# -*- coding: utf-8 -*-

3
4
5
6
# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
Anthony Larcher's avatar
Anthony Larcher committed
7
#
8
9
10
11
12
13
14
15
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
Anthony Larcher's avatar
Anthony Larcher committed
16
#
17
18
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.
Anthony Larcher's avatar
Anthony Larcher committed
19
20
21
22
23

"""
This is the 'idmap' module
"""
import sys
Anthony Larcher's avatar
Anthony Larcher committed
24
import numpy
Anthony Larcher's avatar
Anthony Larcher committed
25
import logging
Anthony Larcher's avatar
Anthony Larcher committed
26
import copy
Anthony Larcher's avatar
Anthony Larcher committed
27
import h5py
Anthony Larcher's avatar
modif    
Anthony Larcher committed
28

Anthony Larcher's avatar
Anthony Larcher committed
29
from ..sidekit_wrappers import check_path_existance
Anthony Larcher's avatar
Anthony Larcher committed
30
31


32
33
34
35
36
37
38
39
# Module metadata: authorship, contact address, release status,
# docstring markup format and credits.
__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


Anthony Larcher's avatar
Anthony Larcher committed
40
41
42
43
class IdMap:
    """A class that stores a map between identifiers (strings).  One
    list is called 'leftids' and the other 'rightids'.  The class
    provides methods that convert a sequence of left ids to a
    sequence of right ids and vice versa.  If `leftids` or `rightids`
    contains duplicates, the mapping methods keep only the last
    entry associated with each duplicated identifier.

    :attr leftids: a list of classes in a ndarray
    :attr rightids: a list of segments in a ndarray
    :attr start: index of the first frame of the segment
    :attr stop: index of the last frame of the segment
    """

Anthony Larcher's avatar
vad    
Anthony Larcher committed
54
    def __init__(self, idmap_filename=''):
Anthony Larcher's avatar
Anthony Larcher committed
55
56
        """Initialize an IdMap object

Sylvain Meignier's avatar
Sylvain Meignier committed
57
58
        :param idmap_filename: name of a file to load. Default is ''.
        In case the idmap_filename is empty, initialize an empty IdMap object.
Anthony Larcher's avatar
Anthony Larcher committed
59
        """
Anthony Larcher's avatar
Anthony Larcher committed
60
61
62
63
        self.leftids = numpy.empty(0, dtype="|O")
        self.rightids = numpy.empty(0, dtype="|O")
        self.start = numpy.empty(0, dtype="|O")
        self.stop = numpy.empty(0, dtype="|O")
Anthony Larcher's avatar
Anthony Larcher committed
64

Sylvain Meignier's avatar
Sylvain Meignier committed
65
        if idmap_filename == '':
Anthony Larcher's avatar
Anthony Larcher committed
66
            pass
Anthony Larcher's avatar
vad    
Anthony Larcher committed
67
68
69
70
71
72
        else:
            tmp = IdMap.read(idmap_filename)
            self.leftids = tmp.leftids
            self.rightids = tmp.rightids
            self.start = tmp.start
            self.stop = tmp.stop
Anthony Larcher's avatar
Anthony Larcher committed
73

74
75
76
77
78
79
80
    def __repr__(self):
        ch = '-' * 30 + '\n'
        ch += 'left ids:' + self.leftids.__repr__() + '\n'
        ch += 'right ids:' + self.rightids.__repr__() + '\n'
        ch += 'seg start:' + self.start.__repr__() + '\n'
        ch += 'seg stop:' + self.stop.__repr__() + '\n'
        ch += '-' * 30 + '\n'
Anthony Larcher's avatar
Anthony Larcher committed
81
        return ch
82

83
    @check_path_existance
    def write(self, output_file_name):
        """Save the IdMap in HDF5 format.

        Four datasets are created: "leftids" and "rightids", cast to byte
        strings, then "start" and "stop", cast to int32 after replacing by -1
        every entry that is NaN once converted to float.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong IdMap format"

        # Storage options shared by the four datasets.
        storage = dict(maxshape=(None,), compression="gzip", fletcher32=True)

        def as_int32(bounds):
            # Work on a copy so that the arrays held by the IdMap are untouched.
            encoded = copy.deepcopy(bounds)
            encoded[numpy.isnan(bounds.astype('float'))] = -1
            return encoded.astype('int32', copy=False)

        with h5py.File(output_file_name, "w") as h5_file:
            h5_file.create_dataset("leftids", data=self.leftids.astype('S'), **storage)
            h5_file.create_dataset("rightids", data=self.rightids.astype('S'), **storage)

            # Convert both boundary arrays before writing either of them.
            first_frames = as_int32(self.start)
            last_frames = as_int32(self.stop)

            h5_file.create_dataset("start", data=first_frames, **storage)
            h5_file.create_dataset("stop", data=last_frames, **storage)

117
    @check_path_existance
    def write_txt(self, output_file_name):
        """Save the IdMap to a text file, one entry per line.

        Each line holds the space-separated fields ``left right [start stop]``.
        Empty identifiers are skipped, and so are `start` / `stop` values that
        are None: an IdMap without segment boundaries is written with two
        columns instead of ending every line with the text "None None".

        :param output_file_name: name of the output text file
        """
        with open(output_file_name, 'w') as output_file:
            for left, right, start, stop in zip(self.leftids, self.rightids, self.start, self.stop):
                fields = [left, right]
                # Compare with None explicitly: str(None) is the truthy string
                # 'None', while a boundary equal to 0 is a value worth writing.
                fields.extend(str(bound) for bound in (start, stop) if bound is not None)
                output_file.write(' '.join(filter(None, fields)) + '\n')

    def map_left_to_right(self, leftidlist):
        """Maps an array of ids to a new array of ids using the given map.  
        The input ids are matched against the leftids of the map and the
        output ids are taken from the corresponding rightids of the map.
        
        Beware: if leftids are not unique in the IdMap, only the last value 
        corresponding is kept

        :param leftidlist: an array of strings to be matched against the
            leftids of the idmap.  The rightids corresponding to these
            leftids will be returned.

        :return: an array of strings that are the mappings of the
            strings in leftidlist.
        """
Sylvain Meignier's avatar
Sylvain Meignier committed
143
        tmp_dict = dict(zip(self.leftids, self.rightids))
Anthony Larcher's avatar
Anthony Larcher committed
144
145
        inter = numpy.intersect1d(self.leftids, leftidlist)
        rightids = numpy.empty(inter.shape[0], '|O')
Anthony Larcher's avatar
Anthony Larcher committed
146
147
148
149
        
        idx = 0
        for left in leftidlist:
            if left in inter:
Sylvain Meignier's avatar
Sylvain Meignier committed
150
                rightids[idx] = tmp_dict[left]
Anthony Larcher's avatar
Anthony Larcher committed
151
152
                idx += 1

Sylvain Meignier's avatar
Sylvain Meignier committed
153
154
155
        lost_ids = numpy.unique(leftidlist).shape[0] - inter.shape[0]
        if lost_ids:
            logging.warning('{} ids could not be mapped'.format(lost_ids))
Anthony Larcher's avatar
Anthony Larcher committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

        return rightids

    def map_right_to_left(self, rightidlist):
        """Maps an array of ids to a new array of ids using the given map.  
        The input ids are matched against the rightids of the map and the
        output ids are taken from the corresponding leftids of the map.

        Beware: if rightids are not unique in the IdMap, only the last value 
        corresponding is kept

        :param rightidlist: An array of strings to be matched against the
            rightids of the idmap.  The leftids corresponding to these
            rightids will be returned.

        :return: an array of strings that are the mappings of the
            strings in rightidlist.
        """
Sylvain Meignier's avatar
Sylvain Meignier committed
174
        tmp_dict = dict(zip(self.rightids, self.leftids))
Anthony Larcher's avatar
Anthony Larcher committed
175
176
        inter = numpy.intersect1d(self.rightids, rightidlist)
        leftids = numpy.empty(inter.shape[0], '|O')
Anthony Larcher's avatar
Anthony Larcher committed
177
178
179
180
        
        idx = 0
        for right in rightidlist:
            if right in inter:
Sylvain Meignier's avatar
Sylvain Meignier committed
181
                leftids[idx] = tmp_dict[right]
Anthony Larcher's avatar
Anthony Larcher committed
182
183
                idx += 1        
        
Sylvain Meignier's avatar
Sylvain Meignier committed
184
185
186
        lost_ids = numpy.unique(rightidlist).shape[0] - inter.shape[0]
        if lost_ids:
            logging.warning('{} ids could not be mapped'.format(lost_ids))
Anthony Larcher's avatar
Anthony Larcher committed
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

        return leftids

    def filter_on_left(self, idlist, keep):
        """Removes some of the information in an idmap.  Depending on the
        value of 'keep', the idlist indicates the strings to retain or
        the strings to discard.

        :param idlist: an array of strings which will be compared with
            the leftids of the current.
        :param keep: A boolean indicating whether idlist contains the ids to
            keep or to discard.

        :return: a filtered version of the current IdMap.
        """
        # get the list of ids to keep
        if keep:
Anthony Larcher's avatar
Anthony Larcher committed
204
            keepids = numpy.unique(idlist)
Anthony Larcher's avatar
Anthony Larcher committed
205
        else:
Anthony Larcher's avatar
Anthony Larcher committed
206
            keepids = numpy.setdiff1d(self.leftids, idlist)
Anthony Larcher's avatar
Anthony Larcher committed
207
        
Anthony Larcher's avatar
Anthony Larcher committed
208
        keep_idx = numpy.in1d(self.leftids, keepids)
Anthony Larcher's avatar
Anthony Larcher committed
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
        out_idmap = IdMap()
        out_idmap.leftids = self.leftids[keep_idx]
        out_idmap.rightids = self.rightids[keep_idx]
        out_idmap.start = self.start[keep_idx]
        out_idmap.stop = self.stop[keep_idx]
        
        return out_idmap

    def filter_on_right(self, idlist, keep):
        """Removes some of the information in an idmap.  Depending on the
        value of 'keep', the idlist indicates the strings to retain or
        the strings to discard.

        :param idlist: an array of strings which will be compared with
            the rightids of the current IdMap.
        :param keep: a boolean indicating whether idlist contains the ids to
            keep or to discard.

        :return: a filtered version of the current IdMap.
        """
        # get the list of ids to keep
        if keep:
Anthony Larcher's avatar
Anthony Larcher committed
231
            keepids = numpy.unique(idlist)
Anthony Larcher's avatar
Anthony Larcher committed
232
        else:
Anthony Larcher's avatar
Anthony Larcher committed
233
            keepids = numpy.setdiff1d(self.rightids, idlist)
Anthony Larcher's avatar
Anthony Larcher committed
234
        
Anthony Larcher's avatar
Anthony Larcher committed
235
        keep_idx = numpy.in1d(self.rightids, keepids)
Anthony Larcher's avatar
Anthony Larcher committed
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
        out_idmap = IdMap()
        out_idmap.leftids = self.leftids[keep_idx]
        out_idmap.rightids = self.rightids[keep_idx]
        out_idmap.start = self.start[keep_idx]
        out_idmap.stop = self.stop[keep_idx]
        return out_idmap

    def validate(self, warn=False):
        """Checks that an object of type Id_Map obeys certain rules that
        must alows be true.
        
        :param warn: boolean. If True, print a warning if strings are
            duplicated in either left or right array

        :return: a boolean value indicating whether the object is valid.

        """
Anthony Larcher's avatar
Anthony Larcher committed
253
254
        ok = (self.leftids.shape == self.rightids.shape == self.start.shape == self.stop.shape) & self.leftids.ndim == 1

Anthony Larcher's avatar
Anthony Larcher committed
255
        if warn & (self.leftids.shape != numpy.unique(self.leftids).shape):
Anthony Larcher's avatar
Anthony Larcher committed
256
            logging.warning('The left id list contains duplicate identifiers')
Anthony Larcher's avatar
Anthony Larcher committed
257
        if warn & (self.rightids.shape != numpy.unique(self.rightids).shape):
Anthony Larcher's avatar
Anthony Larcher committed
258
259
260
            logging.warning('The right id list contains duplicate identifiers')
        return ok

Sylvain Meignier's avatar
Sylvain Meignier committed
261
    def set(self, left, right, start=None, stop=None):
Anthony Larcher's avatar
Anthony Larcher committed
262
263
264
265
266
267
268
269
        """
        Fill the IdMap object with numpy array of leftids, rightids, and optionally starts and stops

        :param left: a numpy array for leftids
        :param right: a numpy array for rightids
        :param start: a numpy array for start time (optional)
        :param stop: a numpy array for stop time (optional)
        """
Sylvain Meignier's avatar
Sylvain Meignier committed
270
271
272
273
274
275
276
277
278
279
280
281
        self.leftids = copy.deepcopy(left)
        self.rightids = copy.deepcopy(right)

        if start is not None:
            self.start = copy.deepcopy(start)
        else:
            self.start = numpy.empty(self.rightids.shape, '|O')

        if stop is not None:
            self.stop = copy.deepcopy(stop)
        else:
            self.stop = numpy.empty(self.rightids.shape, '|O')
Anthony Larcher's avatar
Anthony Larcher committed
282

Sylvain Meignier's avatar
Sylvain Meignier committed
283
    @staticmethod
    def read(input_file_name):
        """Read IdMap in hdf5 format.

        The datasets "leftids", "rightids", "start" and "stop" are loaded.
        Boundaries stored as -1 are not copied, so the matching entries keep
        the initial content of the freshly created object arrays.

        :param input_file_name: name of the file to read from

        :return: the IdMap read from the file
        """
        with h5py.File(input_file_name, "r") as h5_file:
            loaded = IdMap()

            left = h5_file.get("leftids")[()]
            right = h5_file.get("rightids")[()]

            # if running python 3, need a conversion to unicode
            # (note that 'U255' truncates identifiers longer than 255 characters)
            if sys.version_info[0] == 3:
                left = left.astype('U255', copy=False)
                right = right.astype('U255', copy=False)
            loaded.leftids = left
            loaded.rightids = right

            raw_start = h5_file.get("start")[()]
            raw_stop = h5_file.get("stop")[()]
            loaded.start = numpy.empty(h5_file["start"].shape, '|O')
            loaded.stop = numpy.empty(h5_file["stop"].shape, '|O')

            # Copy only the boundaries that are really defined (different from -1).
            has_start = raw_start != -1
            has_stop = raw_stop != -1
            loaded.start[has_start] = raw_start[has_start]
            loaded.stop[has_stop] = raw_stop[has_stop]

            assert loaded.validate(), "Error: wrong IdMap format"
            return loaded
Anthony Larcher's avatar
Anthony Larcher committed
309

Sylvain Meignier's avatar
Sylvain Meignier committed
310
311
    @classmethod
    @check_path_existance
    def read_txt(cls, input_file_name):
        """Read IdMap in text format.

        The number of whitespace-separated columns of the first line selects
        the layout: two columns (``left right``) or four columns
        (``left right start stop``). A file whose first line is empty yields
        an empty IdMap.

        :param input_file_name: name of the file to read from

        :return: the IdMap read from the file

        :raises Exception: if the first line has an unsupported number of
            columns or if the resulting IdMap is not valid
        """
        idmap = IdMap()

        with open(input_file_name, "r") as f:
            # split() without argument splits on any run of whitespace, so tabs,
            # repeated spaces and trailing blanks do not create spurious columns.
            columns = len(f.readline().split())

        if columns == 2:
            idmap.leftids, idmap.rightids = numpy.loadtxt(input_file_name,
                                                          dtype={'names': ('left', 'right'), 'formats': ('|O', '|O')},
                                                          usecols=(0, 1), unpack=True)
            idmap.start = numpy.empty(idmap.rightids.shape, '|O')
            idmap.stop = numpy.empty(idmap.rightids.shape, '|O')

        # If four columns
        elif columns == 4:
            idmap.leftids, idmap.rightids, idmap.start, idmap.stop = numpy.loadtxt(
                input_file_name,
                dtype={'names': ('left', 'right', 'start', 'stop'),
                       'formats': ('|O', '|O', 'int', 'int')}, unpack=True)

        elif columns != 0:
            # Fail loudly instead of silently returning an empty IdMap.
            raise Exception('Wrong format of IdMap')

        if not idmap.validate():
            raise Exception('Wrong format of IdMap')
        return idmap

Anthony Larcher's avatar
Anthony Larcher committed
341
342
343
344
345
346
347
348
349
350
    def merge(self, idmap2):
        """ Merges the current IdMap with another IdMap or a list of IdMap objects..

        :param idmap2: Another Id_Map object.

        :return: an Id_Map object that contains the information from the two
            input Id_Maps.
        """
        idmap = IdMap()
        if self.validate() & idmap2.validate():
Anthony Larcher's avatar
Anthony Larcher committed
351
352
353
354
355
356
357
358
            # create tuples of (model,seg) for both IdMaps for quick comparaison
            tup1 = [(mod, seg) for mod, seg in zip(self.leftids, self.rightids)]
            tup2 = [(mod, seg) for mod, seg in zip(idmap2.leftids, idmap2.rightids)]

            # Get indices of common sessions
            existing_sessions = set(tup1).intersection(set(tup2))
            # Get indices of sessions which are not common in idmap2
            idx_new = numpy.sort(numpy.array([idx for idx, sess in enumerate(tup2) if sess not in tup1]))
Anthony Larcher's avatar
Anthony Larcher committed
359
360
            if len(idx_new) == 0:
                idx_new = numpy.zeros(idmap2.leftids.shape[0], dtype='bool')
Anthony Larcher's avatar
Anthony Larcher committed
361
362
363
364
365
366

            idmap.leftids = numpy.concatenate((self.leftids, idmap2.leftids[idx_new]), axis=0)
            idmap.rightids = numpy.concatenate((self.rightids, idmap2.rightids[idx_new]), axis=0)
            idmap.start = numpy.concatenate((self.start, idmap2.start[idx_new]), axis=0)
            idmap.stop = numpy.concatenate((self.stop, idmap2.stop[idx_new]), axis=0)

Anthony Larcher's avatar
Anthony Larcher committed
367
368
369
370
371
372
373
        else:
            raise Exception('Cannot merge IdMaps, wrong type')

        if not idmap.validate():
            raise Exception('Wrong format of IdMap')

        return idmap
Anthony Larcher's avatar
Anthony Larcher committed
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

    def split(self, N):
        """
        Split an IdMap object into N IdMap of same size (if possible)

        :param N: the number of IdMap to generate
        :return: a list of IdMap
        """
        session_nb = self.leftids.shape[0]
        sub_indices = numpy.array_split(numpy.arange(session_nb), N)

        im_list = []
        for ii in range(N):
            im_list.append(IdMap())
            im_list[ii].leftids = self.leftids[sub_indices[ii]]
            im_list[ii].rightids = self.rightids[sub_indices[ii]]
Anthony Larcher's avatar
Anthony Larcher committed
390
            im_list[ii].start = self.start[sub_indices[ii]]
Anthony Larcher's avatar
Anthony Larcher committed
391
392
393
            im_list[ii].stop = self.stop[sub_indices[ii]]
            assert im_list[ii].validate(), "Error: wrong IdMap format"

Anthony Larcher's avatar
Anthony Larcher committed
394
        return im_list