Open-Sora Plan v1.5.0采用mindspeed-mm套件训练。 ### 前置要求 Open-Sora Plan v1.5.0在CANN 8.0.1版本完成训练,请参照[CANN 系列 昇腾计算 8.0.1 软件补丁下载](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software/264595017?idAbsPath=fixnode01|23710424|251366513|22892968|252309113|251168373)安装。 ### 环境安装 1、安装torch、Mindspeed ```python # python3.8 conda create -n osp python=3.8 conda activate osp # 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl # apex for Ascend 参考 https://gitee.com/ascend/apex # 建议从原仓编译安装 # 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh # 安装加速库 git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed git checkout 59b4e983b7dc1f537f8c6b97a57e54f0316fafb0 pip install -r requirements.txt pip3 install -e . cd .. # 安装其余依赖库 pip install -e . ``` 2、安装decord ```bash git clone --recursive https://github.com/dmlc/decord mkdir build && cd build cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -DFFMPEG_DIR=/usr/local/ffmpeg make cd ../python pwd=$PWD echo "PYTHONPATH=$PYTHONPATH:$pwd" >> ~/.bashrc source ~/.bashrc python3 setup.py install --user ``` ### 权重下载 魔乐社区: https://modelers.cn/models/PKU-YUAN-Group/Open-Sora-Plan-v1.5.0 huggingface: https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.5.0 T5: [google/t5-v1_1-xl · Hugging Face](https://huggingface.co/google/t5-v1_1-xl) CLIP: [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k · Hugging Face](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) ### Train Text-to-Video 需要设置好data.json和model_opensoraplan1_5.json。 #### data.json: ``` { "dataset_param": { "dataset_type": "t2v", "basic_parameters": { "data_path": "./examples/opensoraplan1.5/data.txt", # 数据路径 "data_folder": "", "data_storage_mode": "combine" }, "preprocess_parameters": { "video_reader_type": "decoder", "image_reader_type": "Image", "num_frames": 121, "frame_interval": 1, "max_height": 576, # 开启固定分辨率时的样本高度,在开启多分辨率时无效 "max_width": 1024, # 开启固定分辨率时的样本宽度,在开启多分辨率时无效 "max_hxw": 589824, # 开启多分辨率时的最大token数 "min_hxw": 589824, # 开启多分辨率时的最小token数。此外,min_hxw需要在开启force_resolution时设置为max_height * max_width以过滤低分辨率样本,或自定义更严格的筛选标准 "force_resolution": true, # 开启固定分辨率训练 "force_5_ratio": false, # 开启5宽高比多分辨率策略训练 "max_h_div_w_ratio": 1.0, # 筛选最大高宽比 "min_h_div_w_ratio": 0.42, # 筛选最小高宽比 "hw_stride": 16, "ae_stride_t": 8, "train_fps": 24, # 训练时采样fps,会将不同fps的视频都重采样到train_fps "speed_factor": 1.0, "drop_short_ratio": 1.0, "min_num_frames": 29, "cfg": 0.1, "batch_size": 1, "gradient_accumulation_size": 4, "use_aesthetic": false, "train_pipeline": { "video": [{ "trans_type": "ToTensorVideo" }, { "trans_type": "CenterCropResizeVideo", "param": { "size": [576, 1024], "interpolation_mode": "bicubic" } }, { "trans_type": "ae_norm" } ], "image": [{ "trans_type": "ToTensorVideo" }, { "trans_type": "CenterCropResizeVideo", "param": { "size": [576, 1024], "interpolation_mode": "bicubic" } }, { "trans_type": "ae_norm" } ] } }, "use_text_processer": true, "enable_text_preprocess": true, "model_max_length": 512, "tokenizer_config": { "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl" }, "tokenizer_config_2": { "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189" }, "use_feature_data": false, "use_img_from_vid": false }, "dataloader_param": { "dataloader_mode": "sampler", "sampler_type": "LengthGroupedSampler", # 开启Group Data策略,默认指定 "batch_size": 1, "num_workers": 4, "shuffle": false, "drop_last": true, "pin_memory": false, "group_data": true, "initial_global_step_for_sampler": 0, "gradient_accumulation_size": 4, "collate_param": { "model_name": "GroupLength", # 开启Group Data对应的Collate,默认指定 "batch_size": 1, "num_frames": 121, "group_data": true, "ae_stride": 8, "ae_stride_t": 8, "patch_size": 2, "patch_size_t": 1 } } } ``` #### model_opensoraplan1_5.json ``` { "frames": 121, "allow_tf32": false, "allow_internal_format": false, "load_video_features": false, "load_text_features": false, "enable_encoder_dp": true, # mindspeed架构优化,在TP并行度大于1时起作用 "weight_dtype": "bf16", "ae": { "model_id": "wfvae", "base_channels": 160, "connect_res_layer_num": 1, "decoder_energy_flow_hidden_size": 128, "decoder_num_resblocks": 2, "dropout": 0.0, "encoder_energy_flow_hidden_size": 128, "encoder_num_resblocks": 2, "l1_dowmsample_block": "Spatial2xTime2x3DDownsample", "l1_downsample_wavelet": "HaarWaveletTransform3D", "l1_upsample_block": "Spatial2xTime2x3DUpsample", "l1_upsample_wavelet": "InverseHaarWaveletTransform3D", "l2_dowmsample_block": "Spatial2xTime2x3DDownsample", "l2_downsample_wavelet": "HaarWaveletTransform3D", "l2_upsample_block": "Spatial2xTime2x3DUpsample", "l2_upsample_wavelet": "InverseHaarWaveletTransform3D", "latent_dim": 32, "norm_type": "layernorm", "scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266, 1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328, 0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656, 0.8242, 1.2344, 1.0312, 1.7266, 0.9492], "shift": [-0.2129, 0.1226, 1.6328, 0.6211, -0.8750, 0.6172, -0.5703, 0.1348, -0.2178, -0.9375, 0.3184, 0.3281, -0.0544, -0.1826, -0.2812, 0.4355, 0.1621, -0.2578, 0.7148, -0.7422, -0.2295, -0.2324, -1.4922, 0.6328, 1.1250, -0.2578, -2.1094, 1.0391, 1.1797, -1.2422, -0.2988, -0.9570], "t_interpolation": "trilinear", "use_attention": true, "use_tiling": true, # 是否开启tiling策略 "from_pretrained": "/work/share/checkpoint/pretrained/vae/Middle888/merged.ckpt", "dtype": "fp32" }, "text_encoder": { "hub_backend": "hf", "model_id": "T5", "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl", "low_cpu_mem_usage": false }, "text_encoder_2":{ "hub_backend": "hf", "model_id": "CLIPWithProjection", "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189", "low_cpu_mem_usage": false }, "predictor": { "model_id": "SparseUMMDiT", "num_layers": [2, 4, 6, 8, 6, 4, 2], # 每个stage的层数 "sparse_n": [1, 2, 4, 8, 4, 2, 1], # 每个stage的稀疏度 "double_ff": true, # 采用visual和text共享FFN还是各自独立FFN "sparse1d": true, # 是否采用Skiparse策略,设置为false则为dense dit "num_heads": 24, "head_dim": 128, "in_channels": 32, "out_channels": 32, "timestep_embed_dim": 1024, "caption_channels": 2048, "pooled_projection_dim": 1280, "skip_connection": true, # 是否添加skip connection "dropout": 0.0, "attention_bias": true, "patch_size": 2, "patch_size_t": 1, "activation_fn": "gelu-approximate", "norm_elementwise_affine": false, "norm_eps": 1e-06, "from_pretrained": null # 预训练权重路径,需采用合并后的权重 }, "diffusion": { "model_id": "OpenSoraPlan", "weighting_scheme": "logit_normal", "use_dynamic_shifting": true } } ``` 进入Open-Sora Plan目录下,运行 ``` bash examples/opensoraplan1.5/pretrain_opensoraplan1_5.sh ``` 参数解析: `--optimizer-selection fused_ema_adamw` 选择使用的优化器,我们这里需要选择fused_ema_adamw以获得EMA版本权重。 `--model_custom_precision` 不同组件使用不同的精度,而不是采用megatron默认的整网bf16精度。例如对VAE使用fp32精度,对text encoder、dit使用bf16精度。 `--clip_grad_ema_decay 0.99` 设置adaptive grad clipping中使用的EMA衰减率。 `--selective_recom` `--recom_ffn_layers 32` 是否开启选择性重加算及选择性重计算的层数。在开启选择性重计算时,我们只对FFN进行重计算而不对Attention进行重计算,以获得加速训练效果。该参数与`--recompute-granularity full` `--recompute-method block` `--recompute-num-layers 0` 互斥,当开启选择性重计算时,默认全重计算已关闭。 ### Sample Text-to-Video 由于模型训练时进行了TP切分,所以我们需要先将切分后的权重进行合并,然后再进行推理。 #### 合并权重 ``` python examples/opensoraplan1.5/convert_mm_to_ckpt.py --load_dir $load_dir --save_dir $save_dir --ema ``` 参数解析: `--load_dir` 训练时经过megatron切分后保存的权重路径 `--save_dir` 合并后的权重路径 `--ema` 是否采用EMA权重 ### 推理 需要配置好inference_t2v_model1_5.json。 ``` { "ae": { "model_id": "wfvae", "base_channels": 160, "connect_res_layer_num": 1, "decoder_energy_flow_hidden_size": 128, "decoder_num_resblocks": 2, "dropout": 0.0, "encoder_energy_flow_hidden_size": 128, "encoder_num_resblocks": 2, "l1_dowmsample_block": "Spatial2xTime2x3DDownsample", "l1_downsample_wavelet": "HaarWaveletTransform3D", "l1_upsample_block": "Spatial2xTime2x3DUpsample", "l1_upsample_wavelet": "InverseHaarWaveletTransform3D", "l2_dowmsample_block": "Spatial2xTime2x3DDownsample", "l2_downsample_wavelet": "HaarWaveletTransform3D", "l2_upsample_block": "Spatial2xTime2x3DUpsample", "l2_upsample_wavelet": "InverseHaarWaveletTransform3D", "latent_dim": 32, "vae_scale_factor": [8, 8, 8], "norm_type": "layernorm", "scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266, 1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328, 0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656, 0.8242, 1.2344, 1.0312, 1.7266, 0.9492], "shift": [-0.2129, 0.1226, 1.6328, 0.6211, -0.8750, 0.6172, -0.5703, 0.1348, -0.2178, -0.9375, 0.3184, 0.3281, -0.0544, -0.1826, -0.2812, 0.4355, 0.1621, -0.2578, 0.7148, -0.7422, -0.2295, -0.2324, -1.4922, 0.6328, 1.1250, -0.2578, -2.1094, 1.0391, 1.1797, -1.2422, -0.2988, -0.9570], "t_interpolation": "trilinear", "use_attention": true, "use_tiling": true, # 是否开启tiling策略,推理时默认开启节省显存 "from_pretrained": "/work/share/checkpoint/pretrained/vae/Middle888/merged.ckpt", "dtype": "fp16" }, "text_encoder": { "hub_backend": "hf", "model_id": "T5", "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl", "low_cpu_mem_usage": false }, "text_encoder_2":{ "hub_backend": "hf", "model_id": "CLIPWithProjection", "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189", "low_cpu_mem_usage": false }, "tokenizer":{ "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl", "low_cpu_mem_usage": false }, "tokenizer_2":{ "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189", "low_cpu_mem_usage": false }, "predictor": { "model_id": "SparseUMMDiT", "num_layers": [2, 4, 6, 8, 6, 4, 2], "sparse_n": [1, 2, 4, 8, 4, 2, 1], "double_ff": true, "sparse1d": true, "num_heads": 24, "head_dim": 128, "in_channels": 32, "out_channels": 32, "timestep_embed_dim": 1024, "caption_channels": 2048, "pooled_projection_dim": 1280, "skip_connection": true, "skip_connection_zero_init": true, "dropout": 0.0, "attention_bias": true, "patch_size": 2, "patch_size_t": 1, "activation_fn": "gelu-approximate", "norm_elementwise_affine": true, "norm_eps": 1e-06, "from_pretrained": "/path/to/pretrained/model" }, "diffusion": { "model_id": "OpenSoraPlan", "num_inference_steps": 50, # 推理步数 "guidance_scale": 8.0, # CFG强度,我们推荐较大的CFG,8.0是较好的值 "guidance_rescale": 0.7, # guidance rescale强度,如认为采样饱和度过高,我们推荐将gudance_rescale增大,而非调整CFG "use_linear_quadratic_schedule": false, # 采用线性——平方采样策略 "use_dynamic_shifting": false, "shift": 7.0 # 采用shifting采样策略 }, "pipeline_config": { "use_attention_mask": true, "input_size": [121, 576, 1024], "version": "v1.5", "model_type": "t2v" }, "micro_batch_size": 1, "frame_interval":1, "model_max_length": 512, "save_path":"./opensoraplan_samples/test_samples", "fps":24, "prompt":"./examples/opensoraplan1.5/sora.txt", "device":"npu", "weight_dtype": "fp16" } ``` 进入Open-Sora Plan目录下,运行 ``` bash examples/opensoraplan1.5/inference_t2v_1_5.sh ``` 实测TP=1即不开启并行策略能够运行121x576x1024推理,如需加快推理速度请自行调节TP并行度。