data_preprocess.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Modified from https://github.com/JanhHyun/Speaker_Verification
import glob
import os
import librosa
import numpy as np
from hparam import hparam as hp
# speaker folders of the downloaded dataset (one folder per speaker)
audio_path = glob.glob(os.path.dirname(hp.unprocessed_data))
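# Illustrative example (the actual pattern comes from the hparam config): if
# hp.unprocessed_data were './TIMIT/*/*/*/*.wav', os.path.dirname() would yield
# './TIMIT/*/*/*' and glob.glob() would return one directory per speaker.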
def save_spectrogram_tisv():
    """ Full preprocessing of text-independent utterances. The log-mel-spectrogram of each
        partial utterance is saved as a numpy file. Each utterance is split into voiced
        intervals by dB-based voice activity detection, and the first and last 180 frames
        of each sufficiently long interval are kept.
        Needs: utterance dataset (VCTK)
    """
    print("start text independent utterance feature extraction")
    os.makedirs(hp.data.train_path, exist_ok=True)   # folder for training features
    os.makedirs(hp.data.test_path, exist_ok=True)    # folder for test features
    # lower bound of utterance length in samples: tisv_frame hops plus one window,
    # with hop and window given in seconds
    utter_min_len = (hp.data.tisv_frame * hp.data.hop + hp.data.window) * hp.data.sr
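    # Illustrative check (hypothetical values; the real ones come from the hparam config):
    # with sr=16000, hop=0.01 s, window=0.025 s, and tisv_frame=180,
    # utter_min_len = (180 * 0.01 + 0.025) * 16000 = 29200 samples, i.e. ~1.83 s of audio.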
    total_speaker_num = len(audio_path)
    train_speaker_num = (total_speaker_num // 10) * 9   # 90/10 train/test split by speaker
    print("total speaker number : %d" % total_speaker_num)
    print("train : %d, test : %d" % (train_speaker_num, total_speaker_num - train_speaker_num))
    for i, folder in enumerate(audio_path):
        print("%dth speaker processing..." % i)
        utterances_spec = []
        for utter_name in os.listdir(folder):
            if utter_name[-4:] == '.WAV':
                utter_path = os.path.join(folder, utter_name)          # path of each utterance
                utter, sr = librosa.load(utter_path, sr=hp.data.sr)    # load utterance audio
                intervals = librosa.effects.split(utter, top_db=30)    # voice activity detection
                # top_db=30 works fine for TIMIT; if you get an array of shape 0 for
                # other audio, adjust top_db (e.g. use top_db=100 for the VCTK dataset)
                for interval in intervals:
                    if (interval[1] - interval[0]) > utter_min_len:    # if the voiced interval is sufficiently long,
                        utter_part = utter[interval[0]:interval[1]]    # keep the first and last 180 spectrogram frames
                        S = librosa.stft(y=utter_part, n_fft=hp.data.nfft,
                                         win_length=int(hp.data.window * sr), hop_length=int(hp.data.hop * sr))
                        S = np.abs(S) ** 2                             # power spectrogram
                        mel_basis = librosa.filters.mel(sr=hp.data.sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
                        S = np.log10(np.dot(mel_basis, S) + 1e-6)      # log-mel spectrogram, shape (nmels, n_frames)
                        utterances_spec.append(S[:, :hp.data.tisv_frame])   # first 180 frames of partial utterance
                        utterances_spec.append(S[:, -hp.data.tisv_frame:])  # last 180 frames of partial utterance
        utterances_spec = np.array(utterances_spec)
        print(utterances_spec.shape)
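        # Each saved array has shape (n_partial_utterances, hp.data.nmels, hp.data.tisv_frame):
        # two fixed-length slices per voiced interval that passed the length check.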
        if i < train_speaker_num:   # save spectrogram as numpy file
            np.save(os.path.join(hp.data.train_path, "speaker%d.npy" % i), utterances_spec)
        else:
            np.save(os.path.join(hp.data.test_path, "speaker%d.npy" % (i - train_speaker_num)), utterances_spec)
if __name__ == "__main__":
    save_spectrogram_tisv()
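
# Minimal usage sketch for the saved features (hypothetical file name; assumes the
# same hparam config as above and is not part of the original script):
#
#   import numpy as np
#   spec = np.load(os.path.join(hp.data.train_path, "speaker0.npy"))
#   print(spec.shape)   # (n_partial_utterances, nmels, tisv_frame)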