MultiPerson

Running on Zero

App Files Files Community

C4G-HKUST committed 13 days ago

Commit

89822bc

1 Parent(s): 00ecfed

feat: time out check

Browse files

Files changed (3) hide show

app.py +28 -28
wan/audio2video_multiID.py +1 -1
wan/utils/infer_utils.py +20 -20

app.py CHANGED Viewed

@@ -483,12 +483,12 @@ def run_graio_demo(args):
                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                # Fast模式：如果trim_to_6s为True，强制限制为6秒对应的帧数
                 if trim_to_6s:
-                    # 6秒固定为145帧（4n+1格式）
-                    max_frames_6s = 145
-                    current_frame_num = min(calculated_frame_num, max_frames_6s)
-                    logging.warning(f"Fast mode: Audio duration exceeds 6 seconds. Trimming to 6 seconds ({max_frames_6s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
@@ -678,13 +678,13 @@ def run_graio_demo(args):
     # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
     # @spaces.GPU 装饰器会自动处理 GPU 初始化，不需要手动初始化
-    # 快速生成模式：220秒，固定8步去噪
-    @spaces.GPU(duration=220)
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
-        # 固定使用10步去噪，通过关键字参数传递
         kwargs['fixed_steps'] = 8
-        # Fast模式音频长度检测：检查是否超过6秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
@@ -718,19 +718,19 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过6秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 6秒固定为145帧
-                    max_frames_6s = 145
-                    if calculated_frame_num > max_frames_6s:
-                        # 超过6秒，设置trim_to_6s标记
                         kwargs['trim_to_6s'] = True
                         calculated_duration = calculated_frame_num / fps
-                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 6 seconds limit. Will trim to 6 seconds.")
                     else:
                         kwargs['trim_to_6s'] = False
                 except Exception as e:
@@ -823,7 +823,7 @@ def run_graio_demo(args):
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
-                            Fast Mode: Maximum video duration shoule be less than 6 seconds. Videos longer than 6 seconds will timeout.<br>
                         </div>
                     </div>
@@ -899,7 +899,7 @@ def run_graio_demo(args):
                 with gr.Row():
                     run_i2v_button_fast = gr.Button(
-                        "Generate Video (Fast - 220s, 8 steps)",
                         variant="secondary",
                         scale=1
                     )
@@ -910,10 +910,10 @@ def run_graio_demo(args):
                     )
                 gr.Markdown("""
                 **Generation Modes:**
-                - **Fast Mode (220s GPU budget)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 6 seconds.
                 - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
-                *Note: Fast mode has a fixed 220s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
                 """)
             with gr.Column(scale=2):
@@ -923,7 +923,7 @@ def run_graio_demo(args):
                 gr.Markdown("""
                 ### Example Cases
-                *Note: Generation time (tested on NVIDIA H220 GPU with 40 denoising steps) may vary depending on GPU specifications and system load.*
                 """)
                 # 创建一个函数来处理 examples 选择
@@ -951,7 +951,7 @@ def run_graio_demo(args):
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
-            # 在开始生成前先检测音频长度，如果超过6秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
@@ -970,18 +970,18 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过6秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 6秒固定为145帧
-                    max_frames_6s = 145
-                    if calculated_frame_num > max_frames_6s:
-                        # 超过6秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
-                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 6-second limit for Fast Mode. The audio will be automatically trimmed to 6 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
@@ -1049,7 +1049,7 @@ def run_graio_demo(args):
             else:
                 return result
-        # 快速生成按钮：220秒，固定10步
         run_i2v_button_fast.click(
             fn=handle_fast_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],

                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                # Fast模式：如果trim_to_6s为True，强制限制为5秒对应的帧数
                 if trim_to_6s:
+                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                    max_frames_5s = 121
+                    current_frame_num = min(calculated_frame_num, max_frames_5s)
+                    logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
     # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
     # @spaces.GPU 装饰器会自动处理 GPU 初始化，不需要手动初始化
+    # 快速生成模式：120秒，固定8步去噪
+    @spaces.GPU(duration=120)
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
+        # 固定使用8步去噪，通过关键字参数传递
         kwargs['fixed_steps'] = 8
+        # Fast模式音频长度检测：检查是否超过5秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                    max_frames_5s = 121
+                    if calculated_frame_num > max_frames_5s:
+                        # 超过5秒，设置trim_to_6s标记
                         kwargs['trim_to_6s'] = True
                         calculated_duration = calculated_frame_num / fps
+                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
                     else:
                         kwargs['trim_to_6s'] = False
                 except Exception as e:
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
+                            Fast Mode: Maximum video duration should be less than 5 seconds. Audio inputs longer than 5 seconds will be trimmed to 5 seconds.<br>
                         </div>
                     </div>
                 with gr.Row():
                     run_i2v_button_fast = gr.Button(
+                        "Generate Video (Fast - 120s, 8 steps)",
                         variant="secondary",
                         scale=1
                     )
                     )
                 gr.Markdown("""
                 **Generation Modes:**
+                - **Fast Mode (120s GPU budget)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 5 seconds.
                 - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
+                *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
                 """)
             with gr.Column(scale=2):
                 gr.Markdown("""
                 ### Example Cases
+                *Note: Generation time (tested on NVIDIA H100 GPU with 40 denoising steps) may vary depending on GPU specifications and system load.*
                 """)
                 # 创建一个函数来处理 examples 选择
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
+            # 在开始生成前先检测音频长度，如果超过5秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                    max_frames_5s = 121
+                    if calculated_frame_num > max_frames_5s:
+                        # 超过5秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
+                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")
             else:
                 return result
+        # 快速生成按钮：120秒，固定8步
         run_i2v_button_fast.click(
             fn=handle_fast_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],

wan/audio2video_multiID.py CHANGED Viewed

@@ -199,7 +199,7 @@ class WanAF2V:
         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
-        trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.

         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
+        trim_to_6s=False,  # Fast mode: trim audio to 5 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.

wan/utils/infer_utils.py CHANGED Viewed

@@ -118,7 +118,7 @@ def process_audio_features(
     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
-    trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
 ):
     """
     Process audio files and extract audio features.
@@ -203,20 +203,20 @@ def process_audio_features(
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
-            # Fast mode: trim to 6 seconds if trim_to_6s is True
             if trim_to_6s:
-                # 6秒固定为145帧
-                max_frames_6s = 145
-                if total_length > max_frames_6s:
-                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_6s} frames (6 seconds)")
                     # Truncate each audio proportionally
-                    scale_factor = max_frames_6s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
-                            remaining_space = max_frames_6s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
@@ -294,14 +294,14 @@ def process_audio_features(
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
-                        # Fast mode: if trim_to_6s, limit to 6 seconds
                         target_frames = F
                         if trim_to_6s:
-                            # 6秒固定为145帧
-                            max_frames_6s = 145
-                            target_frames = min(F, max_frames_6s)
-                            if F > max_frames_6s:
-                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_6s} frames (6 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
@@ -343,14 +343,14 @@ def process_audio_features(
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
-                # Fast mode: if trim_to_6s, limit to 6 seconds
                 target_frames = F
                 if trim_to_6s:
-                    # 6秒固定为145帧
-                    max_frames_6s = 145
-                    target_frames = min(F, max_frames_6s)
-                    if F > max_frames_6s:
-                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_6s} frames (6 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,

     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
+    trim_to_6s=False,  # Fast mode: trim audio to 5 seconds
 ):
     """
     Process audio files and extract audio features.
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
+            # Fast mode: trim to 5 seconds if trim_to_6s is True
             if trim_to_6s:
+                # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                max_frames_5s = 121
+                if total_length > max_frames_5s:
+                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
                     # Truncate each audio proportionally
+                    scale_factor = max_frames_5s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
+                            remaining_space = max_frames_5s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
+                        # Fast mode: if trim_to_6s, limit to 5 seconds
                         target_frames = F
                         if trim_to_6s:
+                            # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                            max_frames_5s = 121
+                            target_frames = min(F, max_frames_5s)
+                            if F > max_frames_5s:
+                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
+                # Fast mode: if trim_to_6s, limit to 5 seconds
                 target_frames = F
                 if trim_to_6s:
+                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
+                    max_frames_5s = 121
+                    target_frames = min(F, max_frames_5s)
+                    if F > max_frames_5s:
+                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,