Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Ambuj Mehrish
sidekit
Commits
2899da3e
Commit
2899da3e
authored
Apr 01, 2021
by
Ambuj Mehrish
Browse files
Replace xsets.py
parent
5632ab05
Changes
1
Hide whitespace changes
Inline
Side-by-side
nnet/xsets.py
View file @
2899da3e
...
...
@@ -168,6 +168,7 @@ class SideSet(Dataset):
self
.
transformation
=
dataset
[
"eval"
][
"transformation"
]
self
.
sample_number
=
int
(
self
.
duration
*
self
.
sample_rate
)
self
.
overlap
=
int
(
overlap
*
self
.
sample_rate
)
# Load the dataset description as pandas.dataframe
if
dataset_df
is
None
:
...
...
@@ -188,16 +189,18 @@ class SideSet(Dataset):
# Create lists for each column of the dataframe
df_dict
=
dict
(
zip
(
df
.
columns
,
[[],
[],
[],
[],
[],
[],
[]]))
df_dict
[
"file_start"
]
=
list
()
df_dict
[
"file_duration"
]
=
list
()
# For each segment, get all possible segments with the current overlap
for
idx
in
tqdm
.
trange
(
len
(
tmp_sessions
),
desc
=
'indexing all '
+
set_type
+
' segments'
,
mininterval
=
1
):
for
idx
in
tqdm
.
trange
(
len
(
tmp_sessions
),
desc
=
'indexing all '
+
set_type
+
' segments'
,
mininterval
=
1
,
disable
=
None
):
current_session
=
tmp_sessions
.
iloc
[
idx
]
# Compute possible starts
possible_starts
=
numpy
.
arange
(
0
,
int
(
self
.
sample_rate
*
(
current_session
.
duration
-
self
.
duration
)),
self
.
sample_number
-
int
(
self
.
sample_rate
*
overlap
)
)
self
.
sample_number
)
+
int
(
self
.
sample_rate
*
(
current_session
.
duration
%
self
.
duration
/
2
))
possible_starts
+=
int
(
self
.
sample_rate
*
current_session
.
start
)
# Select max(seg_nb, possible_segments) segments
...
...
@@ -206,7 +209,7 @@ class SideSet(Dataset):
chunk_nb
=
len
(
possible_starts
)
else
:
chunk_nb
=
min
(
len
(
possible_starts
),
chunk_per_segment
)
starts
=
numpy
.
random
.
permutation
(
possible_starts
)[:
chunk_nb
]
/
self
.
sample_rate
starts
=
numpy
.
random
.
permutation
(
possible_starts
)[:
chunk_nb
]
# Once we know how many segments are selected, create the other fields to fill the DataFrame
for
ii
in
range
(
chunk_nb
):
...
...
@@ -215,6 +218,8 @@ class SideSet(Dataset):
df_dict
[
"file_id"
].
append
(
current_session
.
file_id
)
df_dict
[
"start"
].
append
(
starts
[
ii
])
df_dict
[
"duration"
].
append
(
self
.
duration
)
df_dict
[
"file_start"
].
append
(
current_session
.
start
)
df_dict
[
"file_duration"
].
append
(
current_session
.
duration
)
df_dict
[
"speaker_idx"
].
append
(
current_session
.
speaker_idx
)
df_dict
[
"gender"
].
append
(
current_session
.
gender
)
...
...
@@ -231,8 +236,9 @@ class SideSet(Dataset):
self
.
noise_df
=
None
if
"add_noise"
in
self
.
transform
:
# Load the noise dataset, filter according to the duration
self
.
noise_df
=
pandas
.
read_csv
(
self
.
transformation
[
"add_noise"
][
"noise_db_csv"
])
noise_df
=
pandas
.
read_csv
(
self
.
transformation
[
"add_noise"
][
"noise_db_csv"
])
noise_df
=
noise_df
.
loc
[
noise_df
.
duration
>
self
.
duration
]
self
.
noise_df
=
noise_df
.
set_index
(
noise_df
.
type
)
self
.
rir_df
=
None
if
"add_reverb"
in
self
.
transform
:
...
...
@@ -249,7 +255,18 @@ class SideSet(Dataset):
current_session
=
self
.
sessions
.
iloc
[
index
]
nfo
=
soundfile
.
info
(
f
"
{
self
.
data_path
}
/
{
current_session
[
'file_id'
]
}{
self
.
data_file_extension
}
"
)
start_frame
=
int
(
current_session
[
'start'
])
original_start
=
int
(
current_session
[
'start'
])
if
self
.
overlap
>
0
:
lowest_shift
=
self
.
overlap
/
2
highest_shift
=
self
.
overlap
/
2
if
original_start
<
(
current_session
[
'file_start'
]
*
self
.
sample_rate
+
self
.
sample_number
/
2
):
lowest_shift
=
int
(
original_start
-
current_session
[
'file_start'
]
*
self
.
sample_rate
)
if
original_start
+
self
.
sample_number
>
(
current_session
[
'file_start'
]
+
current_session
[
'file_duration'
])
*
self
.
sample_rate
-
self
.
sample_number
/
2
:
highest_shift
=
int
((
current_session
[
'file_start'
]
+
current_session
[
'file_duration'
])
*
self
.
sample_rate
-
(
original_start
+
self
.
sample_number
))
start_frame
=
original_start
+
int
(
random
.
uniform
(
-
lowest_shift
,
highest_shift
))
else
:
start_frame
=
original_start
if
start_frame
+
self
.
sample_number
>=
nfo
.
frames
:
start_frame
=
numpy
.
min
(
nfo
.
frames
-
self
.
sample_number
-
1
)
...
...
@@ -292,7 +309,8 @@ class IdMapSet(Dataset):
idmap_name
,
data_path
,
file_extension
,
transform_pipeline
=
""
,
transform_pipeline
=
{},
transform_number
=
1
,
sliding_window
=
False
,
window_len
=
24000
,
window_shift
=
8000
,
...
...
@@ -318,20 +336,21 @@ class IdMapSet(Dataset):
self
.
sliding_window
=
sliding_window
self
.
window_len
=
window_len
self
.
window_shift
=
window_shift
self
.
transform_number
=
transform_number
self
.
transform
=
[]
if
self
.
transformation
is
not
None
:
self
.
transform_list
=
self
.
transformation
.
split
(
","
)
#if self.transformation is not None:
# self.transform_list = self.transformation.split(",")
self
.
noise_df
=
None
if
"add_noise"
in
self
.
transform
:
if
"add_noise"
in
self
.
transform
ation
:
# Load the noise dataset, filter according to the duration
noise_df
=
pandas
.
read_csv
(
self
.
transformation
[
"add_noise"
][
"noise_db_csv"
])
tmp_df
=
noise_df
.
loc
[
noise_df
[
'duration'
]
>
self
.
duration
]
self
.
noise_df
=
tmp_df
[
'file_id'
].
tolist
(
)
#
tmp_df = noise_df.loc[noise_df['duration'] > self.duration]
self
.
noise_df
=
noise_df
.
set_index
(
noise_df
.
type
)
self
.
rir_df
=
None
if
"add_reverb"
in
self
.
transform
:
if
"add_reverb"
in
self
.
transform
ation
:
# load the RIR database
tmp_rir_df
=
pandas
.
read_csv
(
self
.
transformation
[
"add_reverb"
][
"rir_db_csv"
])
self
.
rir_df
=
zip
(
tmp_rir_df
[
'file_id'
].
tolist
(),
tmp_rir_df
[
'channel'
].
tolist
())
...
...
@@ -344,18 +363,19 @@ class IdMapSet(Dataset):
"""
if
self
.
idmap
.
start
[
index
]
is
None
:
start
=
0
else
:
start
=
int
(
self
.
idmap
.
start
[
index
])
*
160
if
self
.
idmap
.
stop
[
index
]
is
None
:
speech
,
speech_fs
=
torchaudio
.
load
(
f
"
{
self
.
data_path
}
/
{
self
.
idmap
.
rightids
[
index
]
}
.
{
self
.
file_extension
}
"
)
duration
=
speech
.
shape
[
1
]
-
start
duration
=
int
(
speech
.
shape
[
1
]
-
start
)
else
:
start
=
int
(
self
.
idmap
.
start
[
index
])
duration
=
int
(
self
.
idmap
.
stop
[
index
])
-
start
duration
=
int
(
self
.
idmap
.
stop
[
index
])
*
160
-
start
# add this in case the segment is too short
if
duration
<=
self
.
min_sample_nb
:
middle
=
start
+
duration
//
2
start
=
max
(
0
,
int
(
middle
-
(
self
.
min_sample_nb
/
2
)))
duration
=
self
.
min_sample_nb
duration
=
int
(
self
.
min_sample_nb
)
speech
,
speech_fs
=
torchaudio
.
load
(
f
"
{
self
.
data_path
}
/
{
self
.
idmap
.
rightids
[
index
]
}
.
{
self
.
file_extension
}
"
,
frame_offset
=
start
,
...
...
@@ -366,10 +386,10 @@ class IdMapSet(Dataset):
if
self
.
sliding_window
:
speech
=
speech
.
squeeze
().
unfold
(
0
,
self
.
window_len
,
self
.
window_shift
)
if
len
(
self
.
transform
)
>
0
:
if
len
(
self
.
transform
ation
.
keys
()
)
>
0
:
speech
=
data_augmentation
(
speech
,
speech_fs
,
self
.
transform
,
self
.
transform
ation
,
self
.
transform_number
,
noise_df
=
self
.
noise_df
,
rir_df
=
self
.
rir_df
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment