Spaces:
Running
on
Zero
Running
on
Zero
Hunyuan3D-2.1/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml
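| name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" # NOTE(review): label appears stale - first_stage_config sets num_latents: 4096 and the image encoder config (hidden_size 1024, 24 layers, 16 heads) is commented "dino large"; confirm before relying on it | |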
| training: | |
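| steps: 10_0000_0000 # 1,000,000,000 when the loader treats "_" as a digit separator (YAML 1.1 / PyYAML) - effectively "train until stopped" | |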
| use_amp: true | |
| amp_type: "bf16" | |
| base_lr: 1e-5 | |
| gradient_clip_val: 1.0 | |
| gradient_clip_algorithm: "norm" | |
| every_n_train_steps: 2000 # 5000 | |
| val_check_interval: 50 # 4096 | |
| limit_val_batches: 16 | |
| dataset: | |
| target: hy3dshape.data.dit_asl.AlignedShapeLatentModule | |
| params: | |
| #! Base setting | |
| batch_size: 4 | |
| num_workers: 8 | |
| val_num_workers: 4 | |
| # Data | |
| train_data_list: tools/mini_trainset/preprocessed | |
| val_data_list: tools/mini_trainset/preprocessed | |
| #! Image loading | |
| cond_stage_key: "image" # image / text / image_text | |
| image_size: 518 | |
| mean: &mean [0.5, 0.5, 0.5] | |
| std: &std [0.5, 0.5, 0.5] | |
| #! Point cloud sampling | |
| pc_size: &pc_size 81920 | |
| pc_sharpedge_size: &pc_sharpedge_size 0 | |
| sharpedge_label: &sharpedge_label true | |
| return_normal: true | |
| #! Augmentation | |
| padding: true | |
| model: | |
| target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser | |
| params: | |
| first_stage_key: "surface" | |
| cond_stage_key: "image" | |
| scale_by_std: false | |
| z_scale_factor: &z_scale_factor 1.0039506158752403 | |
| torch_compile: false | |
| # ema_config: | |
| # ema_model: LitEma | |
| # ema_decay: 0.999 | |
| # ema_inference: false | |
| first_stage_config: | |
| target: hy3dshape.models.autoencoders.ShapeVAE | |
| from_pretrained: tencent/Hunyuan3D-2.1 | |
| params: | |
| num_latents: &num_latents 4096 | |
| embed_dim: 64 | |
| num_freqs: 8 | |
| include_pi: false | |
| heads: 16 | |
| width: 1024 | |
| num_encoder_layers: 8 | |
| num_decoder_layers: 16 | |
| qkv_bias: false | |
| qk_norm: true | |
| scale_factor: *z_scale_factor | |
| geo_decoder_mlp_expand_ratio: 4 | |
| geo_decoder_downsample_ratio: 1 | |
| geo_decoder_ln_post: true | |
| point_feats: 4 | |
| pc_size: *pc_size | |
| pc_sharpedge_size: *pc_sharpedge_size | |
| cond_stage_config: | |
| target: hy3dshape.models.conditioner.SingleImageEncoder | |
| params: | |
| main_image_encoder: | |
| type: DinoImageEncoder # dino large | |
| kwargs: | |
| config: | |
| attention_probs_dropout_prob: 0.0 | |
| drop_path_rate: 0.0 | |
| hidden_act: gelu | |
| hidden_dropout_prob: 0.0 | |
| hidden_size: 1024 | |
| image_size: 518 | |
| initializer_range: 0.02 | |
| layer_norm_eps: 1.e-6 | |
| layerscale_value: 1.0 | |
| mlp_ratio: 4 | |
| model_type: dinov2 | |
| num_attention_heads: 16 | |
| num_channels: 3 | |
| num_hidden_layers: 24 | |
| patch_size: 14 | |
| qkv_bias: true | |
| torch_dtype: float32 | |
| use_swiglu_ffn: false | |
| image_size: 518 | |
| use_cls_token: true | |
| denoiser_cfg: | |
| target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain | |
| params: | |
| input_size: *num_latents | |
| in_channels: 64 | |
| hidden_size: 2048 | |
| context_dim: 1024 | |
| depth: 21 | |
| num_heads: 16 | |
| qk_norm: true | |
| text_len: 1370 | |
| with_decoupled_ca: false | |
| use_attention_pooling: false | |
| qk_norm_type: 'rms' | |
| qkv_bias: false | |
| use_pos_emb: false | |
| num_moe_layers: 6 | |
| num_experts: 8 | |
| moe_top_k: 2 | |
| scheduler_cfg: | |
| transport: | |
| target: hy3dshape.models.diffusion.transport.create_transport | |
| params: | |
| path_type: Linear | |
| prediction: velocity | |
| sampler: | |
| target: hy3dshape.models.diffusion.transport.Sampler | |
| params: {} | |
| ode_params: | |
| sampling_method: euler # dopri5 ... | |
| num_steps: &num_steps 50 | |
| optimizer_cfg: | |
| optimizer: | |
| target: torch.optim.AdamW | |
| params: | |
| betas: [0.9, 0.99] | |
| eps: 1.e-6 | |
| weight_decay: 1.e-2 | |
| scheduler: | |
| target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler | |
| params: | |
| warm_up_steps: 50 # 5000 | |
| f_start: 1.e-6 | |
| f_min: 1.e-3 | |
| f_max: 1.0 | |
| pipeline_cfg: | |
| target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline | |
| image_processor_cfg: | |
| target: hy3dshape.preprocessors.ImageProcessorV2 | |
| params: {} | |
| callbacks: | |
| logger: | |
| target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger | |
| params: | |
| step_frequency: 100 # 10000 | |
| num_samples: 1 | |
| sample_times: 1 | |
| mean: *mean | |
| std: *std | |
| bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01] | |
| octree_depth: 8 | |
| num_chunks: 50000 | |
| mc_level: 0.0 | |
| file_loggers: | |
| target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger | |
| params: | |
| step_frequency: 50 # 5000 | |
| test_data_path: "tools/mini_testset/images.json" | |