-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathoperative_config.gin
349 lines (304 loc) · 14.3 KB
/
operative_config.gin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset as mesh_tensorflow2
import mesh_tensorflow.transformer.learning_rate_schedules as mesh_tensorflow3
import mesh_tensorflow.transformer.t2t_vocabulary as mesh_tensorflow4
import mesh_tensorflow.transformer.transformer as mesh_tensorflow5
import mesh_tensorflow.transformer.transformer_layers as mesh_tensorflow6
import mesh_tensorflow.transformer.utils as mesh_tensorflow7
import t5.models.mesh_transformer
# Macros:
# ==============================================================================
d_ff = 3072
d_kv = 64
d_model = 768
dropout_rate = 0.1
init_checkpoint = 'gs://t5-zahraa/models/docs.query/model.ckpt-1002900'
MIXTURE_NAME = 'all_mix'
num_heads = 12
num_layers = 12
tokens_per_batch = 131072
# Parameters for adafactor_decay_rate_pow:
# ==============================================================================
adafactor_decay_rate_pow.exponent = 0.8
adafactor_decay_rate_pow.offset = 0
# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.exclude_from_parameter_scale = None
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True
AdafactorOptimizer.stacked_dim_names = None
# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True
# Parameters for constant_learning_rate:
# ==============================================================================
constant_learning_rate.learning_rate = 0.001
# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff
decoder/DenseReluDense.use_bias = False
# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff
encoder/DenseReluDense.use_bias = False
# Parameters for enc_dec_attention:
# ==============================================================================
# None.
# Parameters for enc_dec_attention_bias:
# ==============================================================================
# None.
# Parameters for decoder/EncDecAttention:
# ==============================================================================
decoder/EncDecAttention.relative_attention_type = None
# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'
# Parameters for get_vocab_embedding_cls:
# ==============================================================================
# None.
# Parameters for get_vocabulary:
# ==============================================================================
get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
# Parameters for init_checkpoint_variable_mapping:
# ==============================================================================
init_checkpoint_variable_mapping.mapping_fn = None
# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = %dropout_rate
decoder/LayerStack.norm_epsilon = 1e-06
decoder/LayerStack.recompute_grads = False
decoder/LayerStack.sublayers_final = \
[@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
decoder/LayerStack.sublayers_per_layer = \
[@transformer.sublayer_rms_norm,
@transformer.sublayer_call_layer,
@transformer.sublayer_dropout,
@transformer.sublayer_residual]
# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = %dropout_rate
encoder/LayerStack.norm_epsilon = 1e-06
encoder/LayerStack.recompute_grads = False
encoder/LayerStack.sublayers_final = \
[@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
encoder/LayerStack.sublayers_per_layer = \
[@transformer.sublayer_rms_norm,
@transformer.sublayer_call_layer,
@transformer.sublayer_dropout,
@transformer.sublayer_residual]
# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'
# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
[@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
@mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
@mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers
# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
[@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
@mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers
# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False
# Parameters for pack_or_pad:
# ==============================================================================
pack_or_pad.ensure_eos = False
pack_or_pad.feature_keys = None
pack_or_pad.pack = True
# Parameters for packed_parallel_tsv_dataset:
# ==============================================================================
packed_parallel_tsv_dataset.batch_size = None
packed_parallel_tsv_dataset.max_encoded_len = 0
# Parameters for rewrite_stack_variables:
# ==============================================================================
rewrite_stack_variables.max_combined_variable_size = 536870912
# Parameters for run:
# ==============================================================================
run.autostack = True
run.batch_size = ('tokens_per_batch', %tokens_per_batch)
run.checkpoint_input_pipeline = False
run.dataset_split = 'train'
run.ensemble_inputs = None
run.eval_checkpoint_step = None
run.eval_dataset_fn = None
run.eval_dir_suffix = None
run.eval_summary_dir = None
run.export_checkpoint_step = None
run.export_path = ''
run.init_checkpoint = %init_checkpoint
run.iterations_per_loop = 100
run.keep_checkpoint_max = None
run.layout_rules = \
'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
run.learning_rate_schedule = @learning_rate_schedules.constant_learning_rate
run.mesh_devices = None
run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
run.mode = 'train'
run.model_type = 'bitransformer'
run.optimizer = @optimize.AdafactorOptimizer
run.output_eval_examples = True
run.perplexity_eval_steps = 10
run.predict_fn = None
run.save_checkpoints_steps = 2400
run.seen_data_init_step = 0
run.sequence_length = {'inputs': 2048, 'targets': 32}
run.skip_seen_data = False
run.total_run_steps = None
run.train_dataset_fn = @t5.models.mesh_transformer.tsv_dataset_fn
run.train_steps = 1003900
run.variable_filter = None
# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.fold_scaling_into_initializer = True
decoder/SelfAttention.hyperprompt_hidden_dim = None
decoder/SelfAttention.hyperprompt_length_decoder = None
decoder/SelfAttention.hyperprompt_length_encoder = None
decoder/SelfAttention.hyperprompt_mtlshare = False
decoder/SelfAttention.hyperprompt_task_num = 8
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False
decoder/SelfAttention.use_hyperprompt = False
decoder/SelfAttention.z_loss_coeff = None
# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.fold_scaling_into_initializer = True
encoder/SelfAttention.hyperprompt_hidden_dim = None
encoder/SelfAttention.hyperprompt_length_decoder = None
encoder/SelfAttention.hyperprompt_length_encoder = None
encoder/SelfAttention.hyperprompt_mtlshare = False
encoder/SelfAttention.hyperprompt_task_num = 8
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False
encoder/SelfAttention.use_hyperprompt = False
encoder/SelfAttention.z_loss_coeff = None
# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
# Parameters for should_load_variable:
# ==============================================================================
should_load_variable.filter_fn = None
# Parameters for SimdMeshImpl:
# ==============================================================================
SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
# Parameters for sublayer_call_layer:
# ==============================================================================
# None.
# Parameters for sublayer_dropout:
# ==============================================================================
# None.
# Parameters for sublayer_legacy_dropout:
# ==============================================================================
# None.
# Parameters for sublayer_legacy_final_rms_norm:
# ==============================================================================
# None.
# Parameters for sublayer_legacy_rms_norm:
# ==============================================================================
# None.
# Parameters for sublayer_mask_padding:
# ==============================================================================
# None.
# Parameters for sublayer_residual:
# ==============================================================================
# None.
# Parameters for sublayer_rms_norm:
# ==============================================================================
sublayer_rms_norm.epsilon = 1e-06
# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.hierarchical_tiling_spec = None
tpu_estimator_model_fn.init_variable_filter = ''
tpu_estimator_model_fn.model_info_file = None
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False
tpu_estimator_model_fn.weight_decay_checkpoint = None
# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None
tpu_mesh_shape.model_parallelism = 1
tpu_mesh_shape.tpu_topology = 'v3-8'
# Parameters for tsv_dataset_fn:
# ==============================================================================
tsv_dataset_fn.filename = 'gs://t5-zahraa/orcas/docs.query.document.train.tsv'
tsv_dataset_fn.shuffle_buffer_size = 10000
# Parameters for unit_scaling_convention:
# ==============================================================================
unit_scaling_convention.value = False
# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = 233472
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001
# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001
# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0
# Parameters for VocabEmbedding:
# ==============================================================================
VocabEmbedding.scale_variable_like_classifier_weights = False