C4G-HKUST committed on
Commit
89822bc
·
1 Parent(s): 00ecfed

feat: time out check

Browse files
Files changed (3) hide show
  1. app.py +28 -28
  2. wan/audio2video_multiID.py +1 -1
  3. wan/utils/infer_utils.py +20 -20
app.py CHANGED
@@ -483,12 +483,12 @@ def run_graio_demo(args):
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
- # Fast模式:如果trim_to_6s为True,强制限制为6秒对应的帧数
487
  if trim_to_6s:
488
- # 6秒固定为145帧(4n+1格式)
489
- max_frames_6s = 145
490
- current_frame_num = min(calculated_frame_num, max_frames_6s)
491
- logging.warning(f"Fast mode: Audio duration exceeds 6 seconds. Trimming to 6 seconds ({max_frames_6s} frames). Original: {calculated_frame_num} frames")
492
  else:
493
  current_frame_num = calculated_frame_num
494
 
@@ -678,13 +678,13 @@ def run_graio_demo(args):
678
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
679
  # @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
680
 
681
- # 快速生成模式:220秒,固定8步去噪
682
- @spaces.GPU(duration=220)
683
  def gpu_wrapped_generate_video_fast(*args, **kwargs):
684
- # 固定使用10步去噪,通过关键字参数传递
685
  kwargs['fixed_steps'] = 8
686
 
687
- # Fast模式音频长度检测:检查是否超过6
688
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
689
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
690
  if len(args) >= 11:
@@ -718,19 +718,19 @@ def run_graio_demo(args):
718
  if img2vid_audio_3:
719
  audio_paths.append(img2vid_audio_3)
720
 
721
- # 检测音频长度是否超过6
722
  if audio_paths and len(audio_paths) > 0:
723
  fps = getattr(cfg, 'fps', 24)
724
  try:
725
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
726
- # 6秒固定为145帧
727
- max_frames_6s = 145
728
 
729
- if calculated_frame_num > max_frames_6s:
730
- # 超过6秒,设置trim_to_6s标记
731
  kwargs['trim_to_6s'] = True
732
  calculated_duration = calculated_frame_num / fps
733
- logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 6 seconds limit. Will trim to 6 seconds.")
734
  else:
735
  kwargs['trim_to_6s'] = False
736
  except Exception as e:
@@ -823,7 +823,7 @@ def run_graio_demo(args):
823
  ⚠️ Important Video Duration Limits
824
  </div>
825
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
826
- Fast Mode: Maximum video duration shoule be less than 6 seconds. Videos longer than 6 seconds will timeout.<br>
827
  </div>
828
  </div>
829
 
@@ -899,7 +899,7 @@ def run_graio_demo(args):
899
 
900
  with gr.Row():
901
  run_i2v_button_fast = gr.Button(
902
- "Generate Video (Fast - 220s, 8 steps)",
903
  variant="secondary",
904
  scale=1
905
  )
@@ -910,10 +910,10 @@ def run_graio_demo(args):
910
  )
911
  gr.Markdown("""
912
  **Generation Modes:**
913
- - **Fast Mode (220s GPU budget)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 6 seconds.
914
  - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
915
 
916
- *Note: Fast mode has a fixed 220s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
917
  """)
918
 
919
  with gr.Column(scale=2):
@@ -923,7 +923,7 @@ def run_graio_demo(args):
923
  gr.Markdown("""
924
  ### Example Cases
925
 
926
- *Note: Generation time (tested on NVIDIA H220 GPU with 40 denoising steps) may vary depending on GPU specifications and system load.*
927
  """)
928
 
929
  # 创建一个函数来处理 examples 选择
@@ -951,7 +951,7 @@ def run_graio_demo(args):
951
  # 包装函数:处理警告信息显示
952
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
953
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
954
- # 在开始生成前先检测音频长度,如果超过6秒立即显示警告
955
  # 根据人数收集音频路径
956
  audio_paths = []
957
  if person_num_selector == "1 Person":
@@ -970,18 +970,18 @@ def run_graio_demo(args):
970
  if img2vid_audio_3:
971
  audio_paths.append(img2vid_audio_3)
972
 
973
- # 检测音频长度是否超过6
974
  if audio_paths and len(audio_paths) > 0:
975
  fps = getattr(cfg, 'fps', 24)
976
  try:
977
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
978
- # 6秒固定为145帧
979
- max_frames_6s = 145
980
 
981
- if calculated_frame_num > max_frames_6s:
982
- # 超过6秒,立即显示警告
983
  calculated_duration = calculated_frame_num / fps
984
- warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 6-second limit for Fast Mode. The audio will be automatically trimmed to 6 seconds to prevent timeout."
985
  gr.Warning(warning_msg, duration=5)
986
  except Exception as e:
987
  logging.warning(f"Failed to check audio duration: {e}")
@@ -1049,7 +1049,7 @@ def run_graio_demo(args):
1049
  else:
1050
  return result
1051
 
1052
- # 快速生成按钮:220秒,固定10
1053
  run_i2v_button_fast.click(
1054
  fn=handle_fast_generation,
1055
  inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
 
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
+ # Fast模式:如果trim_to_6s为True,强制限制为5秒对应的帧数
487
  if trim_to_6s:
488
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
489
+ max_frames_5s = 121
490
+ current_frame_num = min(calculated_frame_num, max_frames_5s)
491
+ logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
492
  else:
493
  current_frame_num = calculated_frame_num
494
 
 
678
  # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
679
  # @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
680
 
681
+ # 快速生成模式:120秒,固定8步去噪
682
+ @spaces.GPU(duration=120)
683
  def gpu_wrapped_generate_video_fast(*args, **kwargs):
684
+ # 固定使用8步去噪,通过关键字参数传递
685
  kwargs['fixed_steps'] = 8
686
 
687
+ # Fast模式音频长度检测:检查是否超过5
688
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
689
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
690
  if len(args) >= 11:
 
718
  if img2vid_audio_3:
719
  audio_paths.append(img2vid_audio_3)
720
 
721
+ # 检测音频长度是否超过5
722
  if audio_paths and len(audio_paths) > 0:
723
  fps = getattr(cfg, 'fps', 24)
724
  try:
725
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
726
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
727
+ max_frames_5s = 121
728
 
729
+ if calculated_frame_num > max_frames_5s:
730
+ # 超过5秒,设置trim_to_6s标记
731
  kwargs['trim_to_6s'] = True
732
  calculated_duration = calculated_frame_num / fps
733
+ logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
734
  else:
735
  kwargs['trim_to_6s'] = False
736
  except Exception as e:
 
823
  ⚠️ Important Video Duration Limits
824
  </div>
825
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
826
+ Fast Mode: Maximum video duration should be less than 5 seconds. Audio inputs longer than 5 seconds will be trimmed to 5 seconds.<br>
827
  </div>
828
  </div>
829
 
 
899
 
900
  with gr.Row():
901
  run_i2v_button_fast = gr.Button(
902
+ "Generate Video (Fast - 120s, 8 steps)",
903
  variant="secondary",
904
  scale=1
905
  )
 
910
  )
911
  gr.Markdown("""
912
  **Generation Modes:**
913
+ - **Fast Mode (120s GPU budget)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 5 seconds.
914
  - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
915
 
916
+ *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
917
  """)
918
 
919
  with gr.Column(scale=2):
 
923
  gr.Markdown("""
924
  ### Example Cases
925
 
926
+ *Note: Generation time (tested on NVIDIA H100 GPU with 40 denoising steps) may vary depending on GPU specifications and system load.*
927
  """)
928
 
929
  # 创建一个函数来处理 examples 选择
 
951
  # 包装函数:处理警告信息显示
952
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
953
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
954
+ # 在开始生成前先检测音频长度,如果超过5秒立即显示警告
955
  # 根据人数收集音频路径
956
  audio_paths = []
957
  if person_num_selector == "1 Person":
 
970
  if img2vid_audio_3:
971
  audio_paths.append(img2vid_audio_3)
972
 
973
+ # 检测音频长度是否超过5
974
  if audio_paths and len(audio_paths) > 0:
975
  fps = getattr(cfg, 'fps', 24)
976
  try:
977
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
978
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
979
+ max_frames_5s = 121
980
 
981
+ if calculated_frame_num > max_frames_5s:
982
+ # 超过5秒,立即显示警告
983
  calculated_duration = calculated_frame_num / fps
984
+ warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
985
  gr.Warning(warning_msg, duration=5)
986
  except Exception as e:
987
  logging.warning(f"Failed to check audio duration: {e}")
 
1049
  else:
1050
  return result
1051
 
1052
+ # 快速生成按钮:120秒,固定8
1053
  run_i2v_button_fast.click(
1054
  fn=handle_fast_generation,
1055
  inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
wan/audio2video_multiID.py CHANGED
@@ -199,7 +199,7 @@ class WanAF2V:
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
- trim_to_6s=False, # Fast mode: trim audio to 6 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
 
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
+ trim_to_6s=False, # Fast mode: trim audio to 5 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
wan/utils/infer_utils.py CHANGED
@@ -118,7 +118,7 @@ def process_audio_features(
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
- trim_to_6s=False, # Fast mode: trim audio to 6 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
@@ -203,20 +203,20 @@ def process_audio_features(
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
- # Fast mode: trim to 6 seconds if trim_to_6s is True
207
  if trim_to_6s:
208
- # 6秒固定为145帧
209
- max_frames_6s = 145
210
- if total_length > max_frames_6s:
211
- print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_6s} frames (6 seconds)")
212
  # Truncate each audio proportionally
213
- scale_factor = max_frames_6s / total_length
214
  cumulative_length = 0
215
  for i, audio_len in enumerate(audio_lengths):
216
  if audio_len > 0:
217
  new_audio_len = int(audio_len * scale_factor)
218
  # Ensure it fits within remaining space
219
- remaining_space = max_frames_6s - cumulative_length
220
  new_audio_len = min(new_audio_len, remaining_space)
221
  audio_lengths[i] = new_audio_len
222
  # Truncate the corresponding raw audio feature
@@ -294,14 +294,14 @@ def process_audio_features(
294
  with torch.no_grad():
295
  print(f"wav2vec_model: {wav2vec_model}")
296
  print(f"cache_dir:{cache_dir}")
297
- # Fast mode: if trim_to_6s, limit to 6 seconds
298
  target_frames = F
299
  if trim_to_6s:
300
- # 6秒固定为145帧
301
- max_frames_6s = 145
302
- target_frames = min(F, max_frames_6s)
303
- if F > max_frames_6s:
304
- print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_6s} frames (6 seconds)")
305
  # Use dynamically determined frame number
306
  audio_emb, audio_length = preprocess_audio(
307
  wav_path=target_resampled_audio_path,
@@ -343,14 +343,14 @@ def process_audio_features(
343
  target_resampled_audio_path,
344
  )
345
  with torch.no_grad():
346
- # Fast mode: if trim_to_6s, limit to 6 seconds
347
  target_frames = F
348
  if trim_to_6s:
349
- # 6秒固定为145帧
350
- max_frames_6s = 145
351
- target_frames = min(F, max_frames_6s)
352
- if F > max_frames_6s:
353
- print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_6s} frames (6 seconds)")
354
  # Use dynamically determined frame number
355
  audio_emb, audio_length = preprocess_audio(
356
  wav_path=audio,
 
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
+ trim_to_6s=False, # Fast mode: trim audio to 5 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
 
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
+ # Fast mode: trim to 5 seconds if trim_to_6s is True
207
  if trim_to_6s:
208
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
209
+ max_frames_5s = 121
210
+ if total_length > max_frames_5s:
211
+ print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
212
  # Truncate each audio proportionally
213
+ scale_factor = max_frames_5s / total_length
214
  cumulative_length = 0
215
  for i, audio_len in enumerate(audio_lengths):
216
  if audio_len > 0:
217
  new_audio_len = int(audio_len * scale_factor)
218
  # Ensure it fits within remaining space
219
+ remaining_space = max_frames_5s - cumulative_length
220
  new_audio_len = min(new_audio_len, remaining_space)
221
  audio_lengths[i] = new_audio_len
222
  # Truncate the corresponding raw audio feature
 
294
  with torch.no_grad():
295
  print(f"wav2vec_model: {wav2vec_model}")
296
  print(f"cache_dir:{cache_dir}")
297
+ # Fast mode: if trim_to_6s, limit to 5 seconds
298
  target_frames = F
299
  if trim_to_6s:
300
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
301
+ max_frames_5s = 121
302
+ target_frames = min(F, max_frames_5s)
303
+ if F > max_frames_5s:
304
+ print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
305
  # Use dynamically determined frame number
306
  audio_emb, audio_length = preprocess_audio(
307
  wav_path=target_resampled_audio_path,
 
343
  target_resampled_audio_path,
344
  )
345
  with torch.no_grad():
346
+ # Fast mode: if trim_to_6s, limit to 5 seconds
347
  target_frames = F
348
  if trim_to_6s:
349
+ # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
350
+ max_frames_5s = 121
351
+ target_frames = min(F, max_frames_5s)
352
+ if F > max_frames_5s:
353
+ print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
354
  # Use dynamically determined frame number
355
  audio_emb, audio_length = preprocess_audio(
356
  wav_path=audio,