Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,953 Bytes
600759a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# Free-text description of this configuration.
# NOTE(review): the label says "VAE: 1024 token length", while `num_latents`
# further down in this file is 512 - confirm which one is current.
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
# Training-loop settings. The keys below are nested under `training:`; at
# column 0 each of them would be parsed as a separate top-level key and
# `training` itself would be null.
training:
  # 10^9 steps. Written without `_` digit grouping (was 10_0000_0000) so the
  # value is an integer under YAML 1.2 loaders as well as YAML 1.1 ones.
  steps: 1000000000
  use_amp: true
  amp_type: "bf16"
  # Keep the explicit dot in `1.e-5`: PyYAML's YAML 1.1 resolver only treats
  # exponent notation as a float when a dot is present.
  base_lr: 1.e-5
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  # The trailing `# N` values are kept from the original; presumably they are
  # the settings used for full-scale runs - TODO confirm.
  every_n_train_steps: 2000  # 5000
  val_check_interval: 50  # 4096
  limit_val_batches: 16
# Data module definition: `target` is a dotted import path and `params` holds
# the values listed for it (presumably passed as constructor arguments -
# verify against the config loader).
# The anchors defined here (&mean, &std, &pc_size, &pc_sharpedge_size) are
# aliased later in this file, so this section must stay above those uses.
dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
    batch_size: 4
    num_workers: 8
    val_num_workers: 4

    # Data
    # Train and validation point at the same directory.
    train_data_list: tools/mini_trainset/preprocessed
    val_data_list: tools/mini_trainset/preprocessed

    #! Image loading
    cond_stage_key: "image"  # image / text / image_text
    image_size: 518
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]

    #! Point cloud sampling
    pc_size: &pc_size 30720
    pc_sharpedge_size: &pc_sharpedge_size 30720
    # NOTE(review): &sharpedge_label is never aliased in this file.
    sharpedge_label: &sharpedge_label true
    return_normal: true

    #! Augmentation
    padding: true
# Model definition. Nesting is restored so that every `target` / `params`
# pair belongs to exactly one component; with all keys at column 0 they were
# duplicate top-level keys, of which most parsers silently keep only the last.
model:
  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
  params:
    first_stage_key: "surface"
    cond_stage_key: "image"
    scale_by_std: false
    z_scale_factor: &z_scale_factor 0.9990943042622529  # 1 / 1.0009065167661184
    torch_compile: false

    # ema_config:
    #   ema_model: LitEma
    #   ema_decay: 0.999
    #   ema_inference: false

    # Shape VAE (first stage).
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
      params:
        # Aliased below as the denoiser's `input_size`.
        num_latents: &num_latents 512
        embed_dim: 64
        num_freqs: 8
        include_pi: false
        heads: 16
        width: 1024
        point_feats: 4
        num_decoder_layers: 16
        # Same values as the dataset's point-cloud sampling sizes (aliases).
        pc_size: *pc_size
        pc_sharpedge_size: *pc_sharpedge_size
        qkv_bias: false
        qk_norm: true
        scale_factor: *z_scale_factor
        geo_decoder_mlp_expand_ratio: 4
        geo_decoder_downsample_ratio: 1
        geo_decoder_ln_post: true

    # Image conditioner.
    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
        main_image_encoder:
          type: DinoImageEncoder  # dino giant
          kwargs:
            config:
              attention_probs_dropout_prob: 0.0
              drop_path_rate: 0.0
              hidden_act: gelu
              hidden_dropout_prob: 0.0
              hidden_size: 1536
              image_size: 518
              initializer_range: 0.02
              layer_norm_eps: 1.e-6
              layerscale_value: 1.0
              mlp_ratio: 4
              model_type: dinov2
              num_attention_heads: 24
              num_channels: 3
              num_hidden_layers: 40
              patch_size: 14
              qkv_bias: true
              torch_dtype: float32
              use_swiglu_ffn: true
            # `config` already defines `image_size`, so this second entry has
            # to live outside it; placed as a sibling of `config` under
            # `kwargs` - TODO confirm against the encoder's signature.
            image_size: 518

    # Denoiser network.
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
      params:
        # A leading `~/` is an ordinary string to YAML; home-directory
        # expansion is up to the consuming code - verify.
        ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
        input_size: *num_latents
        # Same value as the image encoder's `hidden_size` above; presumably
        # the two must stay in sync - TODO confirm.
        context_in_dim: 1536
        hidden_size: 1024
        mlp_ratio: 4.0
        num_heads: 16
        depth: 16
        depth_single_blocks: 32
        axes_dim: [64]
        theta: 10000
        qkv_bias: true
        use_pe: false
        force_norm_fp32: true

    # Transport / sampler setup.
    scheduler_cfg:
      transport:
        target: hy3dshape.models.diffusion.transport.create_transport
        params:
          path_type: Linear
          prediction: velocity
      sampler:
        target: hy3dshape.models.diffusion.transport.Sampler
        params: {}
        # Placed as a sibling of the (empty) `params` mapping - TODO confirm
        # against the code that reads `sampler`.
        ode_params:
          sampling_method: euler  # dopri5 ...
          # NOTE(review): &num_steps is never aliased in this file.
          num_steps: &num_steps 50

    # Optimizer and learning-rate schedule.
    optimizer_cfg:
      optimizer:
        target: torch.optim.AdamW
        params:
          betas: [0.9, 0.99]
          # Explicit dot kept so PyYAML's YAML 1.1 resolver reads these as
          # floats rather than strings.
          eps: 1.e-6
          weight_decay: 1.e-2

      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
          warm_up_steps: 50  # 5000
          f_start: 1.e-6
          f_min: 1.e-3
          f_max: 1.0

    # Inference pipeline used with this model.
    pipeline_cfg:
      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline

    image_processor_cfg:
      target: hy3dshape.preprocessors.ImageProcessorV2
      params: {}
# Training callbacks. Each entry is a `target` import path plus its own
# `params`; nesting keeps the two `step_frequency` values separate instead of
# colliding as duplicate top-level keys.
callbacks:
  logger:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
    params:
      step_frequency: 100  # 10000
      num_samples: 1
      sample_times: 1
      # Aliases of the dataset's image normalisation values.
      mean: *mean
      std: *std
      # Six values; presumably [xmin, ymin, zmin, xmax, ymax, zmax] - TODO confirm.
      bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
      octree_depth: 8
      num_chunks: 50000
      mc_level: 0.0

  file_loggers:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
    params:
      step_frequency: 50  # 5000
      test_data_path: "tools/mini_testset/images.json"
|