---
# Training / model configuration.
#
# NOTE(review): this file was recovered from a rendered table view in which
# every line was wrapped in "|" cell markers and all indentation was lost.
# The nesting below is reconstructed from the order of the section headers
# and from keys that would otherwise be duplicated (sr, hidden_dim,
# style_condition, content_codebook_size). Confirm the hierarchy against the
# code that consumes this file -- in particular that `spect_params` belongs
# under `preprocess_params` and that `wavenet` is a sibling of `DiT`.

log_dir: "./runs/run_dit_mel_seed"
save_freq: 1
log_interval: 10
save_interval: 1000
device: "cuda"
epochs: 1000  # number of epochs for first stage training (pre-training)
batch_size: 4
batch_length: 100  # maximum duration of audio in a batch (in seconds)
max_len: 80  # maximum number of frames
pretrained_model: ""
pretrained_encoder: ""
# set to true to skip loading epoch numbers and optimizer parameters
load_only_params: false

F0_path: "modules/JDC/bst.t7"

preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80

model_params:
  dit_type: "DiT"  # uDiT or DiT
  reg_loss_type: "l2"  # l1 or l2

  speech_tokenizer:
    path: "speech_tokenizer_v1.onnx"

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  DAC:
    encoder_dim: 64
    encoder_rates: [2, 5, 5, 6]
    decoder_dim: 1536
    decoder_rates: [6, 5, 5, 2]
    sr: 24000

  length_regulator:
    channels: 768
    is_discrete: true
    content_codebook_size: 4096
    in_frame_rate: 50
    out_frame_rate: 80
    sampling_ratios: [1, 1, 1, 1]

  DiT:
    hidden_dim: 768
    num_heads: 12
    depth: 12
    class_dropout_prob: 0.1
    block_size: 4096
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'  # mel or codec
    content_dim: 768
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    # for prompt component, do not input corresponding speech token
    zero_prompt_speech_token: false

  wavenet:
    hidden_dim: 768
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

loss_params:
  base_lr: 0.0001