-
Notifications
You must be signed in to change notification settings - Fork 5
/
synthesis.py
190 lines (154 loc) · 6.17 KB
/
synthesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# coding: utf-8
"""
Synthesis waveform from trained WaveNet.
usage: synthesis.py [options] <checkpoint> <dst_dir>
options:
--hparams=<parmas> Hyper parameters [default: ].
--length=<T> Steps to generate [default: 32000].
--initial-value=<n> Initial value for the WaveNet decoder.
--conditional=<p> Conditional features path.
--file-name-suffix=<s> File name suffix [default: ].
--speaker-id=<id> Speaker ID (for multi-speaker model).
--output-html Output html for blog post.
-h, --help Show help message.
"""
from docopt import docopt
import sys
import os
from os.path import dirname, join, basename, splitext
import torch
from torch.autograd import Variable
import numpy as np
from nnmnkwii import preprocessing as P
from keras.utils import np_utils
from tqdm import tqdm
import librosa
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw
import audio
from hparams import hparams
# Probed once at import time; wavegen() moves the model and its inputs to the
# GPU when this is true.
use_cuda = torch.cuda.is_available()
def _to_numpy(x):
    """Return ``x`` in a NumPy-compatible form.

    ``None``, NumPy arrays and scalars (as judged by ``np.isscalar``) are
    handed back unchanged. Anything else is treated as a tensor-like object
    exposing ``dim()``, ``squeeze()`` and ``numpy()``.

    Args:
        x: ``None``, a scalar, a ``numpy.ndarray`` or a tensor-like object.

    Returns:
        ``x`` itself for the pass-through cases, otherwise ``x.numpy()``
        (after ``squeeze(0)`` when the input is 3-dimensional).
    """
    passthrough = x is None or isinstance(x, np.ndarray) or np.isscalar(x)
    if passthrough:
        return x

    # A 3-dim input is assumed to carry a leading batch axis; squeeze(0) only
    # drops it when that axis has size 1.
    tensor = x.squeeze(0) if x.dim() == 3 else x
    return tensor.numpy()
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples with a WaveNet decoder.

    Args:
        model (nn.Module): WaveNet decoder. As a side effect it is put in
            eval mode and, when CUDA is available, moved to the GPU.
        length (int): Time steps to generate. Required when ``c`` is None.
            If conditional features are given, this value is ignored and
            replaced by ``c.shape[0] * audio.get_hop_size()``.
        c (numpy.ndarray): Conditional features of shape T x C, or None.
        g (scalar): Speaker ID, or None.
        initial_value (int or float): First input fed to the decoder. When
            None, it defaults to the mu-law quantized value of 0 for
            mulaw-quantize input and to 0.0 otherwise.
        fast (bool): When true, ``model.make_generation_fast_()`` is called
            before generation (the original docs describe this as removing
            weight normalization).
        tqdm (callable): Progress wrapper forwarded to
            ``model.incremental_forward``.

    Returns:
        numpy.ndarray: Generated waveform samples.

    Raises:
        RuntimeError: If ``c`` is given but is not 2-dimensional.
        AssertionError: If neither ``c`` nor ``length`` is given, or if
            ``initial_value`` is outside ``[0, hparams.quantize_channels)``
            for mulaw-quantize input.
    """
    # Imported lazily, as in the original, rather than at module import time.
    from train import sanity_check
    sanity_check(model, c, g)

    c, g = _to_numpy(c), _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is not None:
        # Conditional features must be laid out as (Tc, D).
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape))
        hop_size = audio.get_hop_size()
        # The feature length dictates how many samples are generated; any
        # caller-supplied length is discarded.
        length = c.shape[0] * hop_size
        # (Tc, D) -> (Tc', D): repeat frames here unless the hyper parameters
        # say the features are upsampled elsewhere.
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, hop_size, axis=0)
        # (T, C) -> B x C x T with B == 1
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))
    else:
        assert length is not None

    quantized_input = is_mulaw_quantize(hparams.input_type)

    if initial_value is None:
        initial_value = (
            P.mulaw_quantize(0, hparams.quantize_channels)
            if quantized_input else 0.0)

    if quantized_input:
        assert 0 <= initial_value < hparams.quantize_channels
        # One-hot encode the starting class: 1 x 1 x quantize_channels.
        one_hot = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(one_hot)).view(
            1, 1, hparams.quantize_channels)
    else:
        # Scalar input: a 1 x 1 x 1 tensor holding the starting amplitude.
        initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

    if g is not None:
        g = Variable(torch.LongTensor([g]))

    if use_cuda:
        initial_input = initial_input.cuda()
        if g is not None:
            g = g.cuda()
        if c is not None:
            c = c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
        log_scale_min=hparams.log_scale_min)

    if quantized_input:
        # Pick the most likely class along dim 1, then undo the quantization.
        class_ids = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        return P.inv_mulaw_quantize(class_ids, hparams.quantize_channels)

    samples = y_hat.view(-1).cpu().data.numpy()
    if is_mulaw(hparams.input_type):
        return P.inv_mulaw(samples, hparams.quantize_channels)
    return samples
if __name__ == "__main__":
    # Command-line entry point: load a trained checkpoint, generate a waveform
    # with wavegen() and write it as a .wav file into <dst_dir>.
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_path = args["<checkpoint>"]
    dst_dir = args["<dst_dir>"]

    length = int(args["--length"])
    initial_value = args["--initial-value"]
    initial_value = None if initial_value is None else float(initial_value)
    conditional_path = args["--conditional"]
    file_name_suffix = args["--file-name-suffix"]
    # Parsed from the command line but not used anywhere in this script.
    output_html = args["--output-html"]
    speaker_id = args["--speaker-id"]
    speaker_id = None if speaker_id is None else int(speaker_id)

    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    # Presets: a non-empty preset name selects a dict of overrides that is
    # applied on top of the values parsed above.
    if hparams.preset is not None and hparams.preset != "":
        preset = hparams.presets[hparams.preset]
        import json
        hparams.parse_json(json.dumps(preset))
        print("Override hyper parameters with preset \"{}\": {}".format(
            hparams.preset, json.dumps(preset, indent=4)))

    # Load conditional features
    if conditional_path is not None:
        c = np.load(conditional_path)
    else:
        c = None

    # Imported only after the hyper parameters have been finalized above.
    from train import build_model

    # Model
    model = build_model()

    # Load checkpoint
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    print("Load checkpoint from {}".format(checkpoint_path))
    if use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        # A checkpoint written on a GPU machine stores CUDA tensors; remap
        # every storage to the CPU so it still loads without a CUDA device.
        checkpoint = torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    os.makedirs(dst_dir, exist_ok=True)
    dst_wav_path = join(dst_dir, "{}{}.wav".format(checkpoint_name, file_name_suffix))

    # DO generate
    waveform = wavegen(model, length, c=c, g=speaker_id, initial_value=initial_value, fast=True)

    # save
    # NOTE(review): librosa.output was removed in librosa 0.8; this call needs
    # an older librosa -- confirm the pinned version.
    librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate)

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)