Spaces:
Running
on
Zero
Running
on
Zero
feat: time out check
Browse files
- app.py +28 -28
- wan/audio2video_multiID.py +1 -1
- wan/utils/infer_utils.py +20 -20
app.py
CHANGED
|
@@ -483,12 +483,12 @@ def run_graio_demo(args):
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
-
# Fast模式:如果trim_to_6s为True,强制限制为
|
| 487 |
if trim_to_6s:
|
| 488 |
-
#
|
| 489 |
-
|
| 490 |
-
current_frame_num = min(calculated_frame_num,
|
| 491 |
-
logging.warning(f"Fast mode: Audio duration exceeds
|
| 492 |
else:
|
| 493 |
current_frame_num = calculated_frame_num
|
| 494 |
|
|
@@ -678,13 +678,13 @@ def run_graio_demo(args):
|
|
| 678 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
|
| 679 |
# @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
|
| 680 |
|
| 681 |
-
# 快速生成模式:
|
| 682 |
-
@spaces.GPU(duration=
|
| 683 |
def gpu_wrapped_generate_video_fast(*args, **kwargs):
|
| 684 |
-
# 固定使用
|
| 685 |
kwargs['fixed_steps'] = 8
|
| 686 |
|
| 687 |
-
# Fast模式音频长度检测:检查是否超过
|
| 688 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 689 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 690 |
if len(args) >= 11:
|
|
@@ -718,19 +718,19 @@ def run_graio_demo(args):
|
|
| 718 |
if img2vid_audio_3:
|
| 719 |
audio_paths.append(img2vid_audio_3)
|
| 720 |
|
| 721 |
-
# 检测音频长度是否超过
|
| 722 |
if audio_paths and len(audio_paths) > 0:
|
| 723 |
fps = getattr(cfg, 'fps', 24)
|
| 724 |
try:
|
| 725 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 726 |
-
#
|
| 727 |
-
|
| 728 |
|
| 729 |
-
if calculated_frame_num >
|
| 730 |
-
# 超过
|
| 731 |
kwargs['trim_to_6s'] = True
|
| 732 |
calculated_duration = calculated_frame_num / fps
|
| 733 |
-
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds
|
| 734 |
else:
|
| 735 |
kwargs['trim_to_6s'] = False
|
| 736 |
except Exception as e:
|
|
@@ -823,7 +823,7 @@ def run_graio_demo(args):
|
|
| 823 |
⚠️ Important Video Duration Limits
|
| 824 |
</div>
|
| 825 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 826 |
-
Fast Mode: Maximum video duration
|
| 827 |
</div>
|
| 828 |
</div>
|
| 829 |
|
|
@@ -899,7 +899,7 @@ def run_graio_demo(args):
|
|
| 899 |
|
| 900 |
with gr.Row():
|
| 901 |
run_i2v_button_fast = gr.Button(
|
| 902 |
-
"Generate Video (Fast -
|
| 903 |
variant="secondary",
|
| 904 |
scale=1
|
| 905 |
)
|
|
@@ -910,10 +910,10 @@ def run_graio_demo(args):
|
|
| 910 |
)
|
| 911 |
gr.Markdown("""
|
| 912 |
**Generation Modes:**
|
| 913 |
-
- **Fast Mode (
|
| 914 |
- **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
|
| 915 |
|
| 916 |
-
*Note: Fast mode has a fixed
|
| 917 |
""")
|
| 918 |
|
| 919 |
with gr.Column(scale=2):
|
|
@@ -923,7 +923,7 @@ def run_graio_demo(args):
|
|
| 923 |
gr.Markdown("""
|
| 924 |
### Example Cases
|
| 925 |
|
| 926 |
-
*Note: Generation time (tested on NVIDIA
|
| 927 |
""")
|
| 928 |
|
| 929 |
# 创建一个函数来处理 examples 选择
|
|
@@ -951,7 +951,7 @@ def run_graio_demo(args):
|
|
| 951 |
# 包装函数:处理警告信息显示
|
| 952 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 953 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 954 |
-
# 在开始生成前先检测音频长度,如果超过
|
| 955 |
# 根据人数收集音频路径
|
| 956 |
audio_paths = []
|
| 957 |
if person_num_selector == "1 Person":
|
|
@@ -970,18 +970,18 @@ def run_graio_demo(args):
|
|
| 970 |
if img2vid_audio_3:
|
| 971 |
audio_paths.append(img2vid_audio_3)
|
| 972 |
|
| 973 |
-
# 检测音频长度是否超过
|
| 974 |
if audio_paths and len(audio_paths) > 0:
|
| 975 |
fps = getattr(cfg, 'fps', 24)
|
| 976 |
try:
|
| 977 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 978 |
-
#
|
| 979 |
-
|
| 980 |
|
| 981 |
-
if calculated_frame_num >
|
| 982 |
-
# 超过
|
| 983 |
calculated_duration = calculated_frame_num / fps
|
| 984 |
-
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the
|
| 985 |
gr.Warning(warning_msg, duration=5)
|
| 986 |
except Exception as e:
|
| 987 |
logging.warning(f"Failed to check audio duration: {e}")
|
|
@@ -1049,7 +1049,7 @@ def run_graio_demo(args):
|
|
| 1049 |
else:
|
| 1050 |
return result
|
| 1051 |
|
| 1052 |
-
# 快速生成按钮:
|
| 1053 |
run_i2v_button_fast.click(
|
| 1054 |
fn=handle_fast_generation,
|
| 1055 |
inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
|
|
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
+
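# Fast mode: when trim_to_6s is True, cap the frame count at the 5-second limit (the flag name says "6s", but the limit applied here is 5 seconds)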
|
| 487 |
if trim_to_6s:
|
| 488 |
+
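# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121). NOTE: this constant assumes 24 fps, while fps above is read from cfg.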
|
| 489 |
+
max_frames_5s = 121
|
| 490 |
+
current_frame_num = min(calculated_frame_num, max_frames_5s)
|
| 491 |
+
logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
|
| 492 |
else:
|
| 493 |
current_frame_num = calculated_frame_num
|
| 494 |
|
|
|
|
| 678 |
# 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
|
| 679 |
# @spaces.GPU 装饰器会自动处理 GPU 初始化,不需要手动初始化
|
| 680 |
|
| 681 |
+
# Fast generation mode: 120-second GPU duration, fixed 8 denoising steps
|
| 682 |
+
@spaces.GPU(duration=120)
|
| 683 |
def gpu_wrapped_generate_video_fast(*args, **kwargs):
|
| 684 |
+
# Always use 8 denoising steps, passed along as a keyword argument
|
| 685 |
kwargs['fixed_steps'] = 8
|
| 686 |
|
| 687 |
+
# Fast mode audio-length check: detect whether the audio exceeds 5 seconds
|
| 688 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 689 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 690 |
if len(args) >= 11:
|
|
|
|
| 718 |
if img2vid_audio_3:
|
| 719 |
audio_paths.append(img2vid_audio_3)
|
| 720 |
|
| 721 |
+
# Check whether the audio length exceeds 5 seconds
|
| 722 |
if audio_paths and len(audio_paths) > 0:
|
| 723 |
fps = getattr(cfg, 'fps', 24)
|
| 724 |
try:
|
| 725 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 726 |
+
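# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121). NOTE: this constant assumes 24 fps, while fps above is read from cfg.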
|
| 727 |
+
max_frames_5s = 121
|
| 728 |
|
| 729 |
+
if calculated_frame_num > max_frames_5s:
|
| 730 |
+
# Over 5 seconds: set the trim_to_6s flag (the name says "6s", but the limit enforced is 5 seconds)
|
| 731 |
kwargs['trim_to_6s'] = True
|
| 732 |
calculated_duration = calculated_frame_num / fps
|
| 733 |
+
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
|
| 734 |
else:
|
| 735 |
kwargs['trim_to_6s'] = False
|
| 736 |
except Exception as e:
|
|
|
|
| 823 |
⚠️ Important Video Duration Limits
|
| 824 |
</div>
|
| 825 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 826 |
+
Fast Mode: Maximum video duration should be less than 5 seconds. Audio inputs longer than 5 seconds will be trimmed to 5 seconds.<br>
|
| 827 |
</div>
|
| 828 |
</div>
|
| 829 |
|
|
|
|
| 899 |
|
| 900 |
with gr.Row():
|
| 901 |
run_i2v_button_fast = gr.Button(
|
| 902 |
+
"Generate Video (Fast - 120s, 8 steps)",
|
| 903 |
variant="secondary",
|
| 904 |
scale=1
|
| 905 |
)
|
|
|
|
| 910 |
)
|
| 911 |
gr.Markdown("""
|
| 912 |
**Generation Modes:**
|
| 913 |
+
- **Fast Mode (120s GPU budget)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 5 seconds.
|
| 914 |
- **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
|
| 915 |
|
| 916 |
+
*Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
|
| 917 |
""")
|
| 918 |
|
| 919 |
with gr.Column(scale=2):
|
|
|
|
| 923 |
gr.Markdown("""
|
| 924 |
### Example Cases
|
| 925 |
|
| 926 |
+
*Note: Generation time (tested on NVIDIA H100 GPU with 40 denoising steps) may vary depending on GPU specifications and system load.*
|
| 927 |
""")
|
| 928 |
|
| 929 |
# 创建一个函数来处理 examples 选择
|
|
|
|
| 951 |
# 包装函数:处理警告信息显示
|
| 952 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 953 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 954 |
+
# Check the audio length before generation starts and show a warning immediately if it exceeds 5 seconds
|
| 955 |
# 根据人数收集音频路径
|
| 956 |
audio_paths = []
|
| 957 |
if person_num_selector == "1 Person":
|
|
|
|
| 970 |
if img2vid_audio_3:
|
| 971 |
audio_paths.append(img2vid_audio_3)
|
| 972 |
|
| 973 |
+
# Check whether the audio length exceeds 5 seconds
|
| 974 |
if audio_paths and len(audio_paths) > 0:
|
| 975 |
fps = getattr(cfg, 'fps', 24)
|
| 976 |
try:
|
| 977 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 978 |
+
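# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121). NOTE: this constant assumes 24 fps, while fps above is read from cfg.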
|
| 979 |
+
max_frames_5s = 121
|
| 980 |
|
| 981 |
+
if calculated_frame_num > max_frames_5s:
|
| 982 |
+
# Over 5 seconds: show the warning immediately
|
| 983 |
calculated_duration = calculated_frame_num / fps
|
| 984 |
+
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
|
| 985 |
gr.Warning(warning_msg, duration=5)
|
| 986 |
except Exception as e:
|
| 987 |
logging.warning(f"Failed to check audio duration: {e}")
|
|
|
|
| 1049 |
else:
|
| 1050 |
return result
|
| 1051 |
|
| 1052 |
+
# Fast generation button: 120 seconds, fixed 8 steps
|
| 1053 |
run_i2v_button_fast.click(
|
| 1054 |
fn=handle_fast_generation,
|
| 1055 |
inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
|
wan/audio2video_multiID.py
CHANGED
|
@@ -199,7 +199,7 @@ class WanAF2V:
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
-
trim_to_6s=False, # Fast mode: trim audio to
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
|
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
+
trim_to_6s=False, # Fast mode: trim audio to 5 seconds
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
wan/utils/infer_utils.py
CHANGED
|
@@ -118,7 +118,7 @@ def process_audio_features(
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
-
trim_to_6s=False, # Fast mode: trim audio to
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
@@ -203,20 +203,20 @@ def process_audio_features(
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
-
# Fast mode: trim to
|
| 207 |
if trim_to_6s:
|
| 208 |
-
#
|
| 209 |
-
|
| 210 |
-
if total_length >
|
| 211 |
-
print(f"Fast mode: Trimming audio from {total_length} frames to {
|
| 212 |
# Truncate each audio proportionally
|
| 213 |
-
scale_factor =
|
| 214 |
cumulative_length = 0
|
| 215 |
for i, audio_len in enumerate(audio_lengths):
|
| 216 |
if audio_len > 0:
|
| 217 |
new_audio_len = int(audio_len * scale_factor)
|
| 218 |
# Ensure it fits within remaining space
|
| 219 |
-
remaining_space =
|
| 220 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 221 |
audio_lengths[i] = new_audio_len
|
| 222 |
# Truncate the corresponding raw audio feature
|
|
@@ -294,14 +294,14 @@ def process_audio_features(
|
|
| 294 |
with torch.no_grad():
|
| 295 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 296 |
print(f"cache_dir:{cache_dir}")
|
| 297 |
-
# Fast mode: if trim_to_6s, limit to
|
| 298 |
target_frames = F
|
| 299 |
if trim_to_6s:
|
| 300 |
-
#
|
| 301 |
-
|
| 302 |
-
target_frames = min(F,
|
| 303 |
-
if F >
|
| 304 |
-
print(f"Fast mode: Trimming audio {i} from {F} frames to {
|
| 305 |
# Use dynamically determined frame number
|
| 306 |
audio_emb, audio_length = preprocess_audio(
|
| 307 |
wav_path=target_resampled_audio_path,
|
|
@@ -343,14 +343,14 @@ def process_audio_features(
|
|
| 343 |
target_resampled_audio_path,
|
| 344 |
)
|
| 345 |
with torch.no_grad():
|
| 346 |
-
# Fast mode: if trim_to_6s, limit to
|
| 347 |
target_frames = F
|
| 348 |
if trim_to_6s:
|
| 349 |
-
#
|
| 350 |
-
|
| 351 |
-
target_frames = min(F,
|
| 352 |
-
if F >
|
| 353 |
-
print(f"Fast mode: Trimming single audio from {F} frames to {
|
| 354 |
# Use dynamically determined frame number
|
| 355 |
audio_emb, audio_length = preprocess_audio(
|
| 356 |
wav_path=audio,
|
|
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
+
trim_to_6s=False, # Fast mode: trim audio to 5 seconds
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
+
# Fast mode: trim to 5 seconds if trim_to_6s is True
|
| 207 |
if trim_to_6s:
|
| 208 |
+
# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121)
|
| 209 |
+
max_frames_5s = 121
|
| 210 |
+
if total_length > max_frames_5s:
|
| 211 |
+
print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
|
| 212 |
# Truncate each audio proportionally
|
| 213 |
+
scale_factor = max_frames_5s / total_length
|
| 214 |
cumulative_length = 0
|
| 215 |
for i, audio_len in enumerate(audio_lengths):
|
| 216 |
if audio_len > 0:
|
| 217 |
new_audio_len = int(audio_len * scale_factor)
|
| 218 |
# Ensure it fits within remaining space
|
| 219 |
+
remaining_space = max_frames_5s - cumulative_length
|
| 220 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 221 |
audio_lengths[i] = new_audio_len
|
| 222 |
# Truncate the corresponding raw audio feature
|
|
|
|
| 294 |
with torch.no_grad():
|
| 295 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 296 |
print(f"cache_dir:{cache_dir}")
|
| 297 |
+
# Fast mode: if trim_to_6s, limit to 5 seconds
|
| 298 |
target_frames = F
|
| 299 |
if trim_to_6s:
|
| 300 |
+
# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121)
|
| 301 |
+
max_frames_5s = 121
|
| 302 |
+
target_frames = min(F, max_frames_5s)
|
| 303 |
+
if F > max_frames_5s:
|
| 304 |
+
print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
|
| 305 |
# Use dynamically determined frame number
|
| 306 |
audio_emb, audio_length = preprocess_audio(
|
| 307 |
wav_path=target_resampled_audio_path,
|
|
|
|
| 343 |
target_resampled_audio_path,
|
| 344 |
)
|
| 345 |
with torch.no_grad():
|
| 346 |
+
# Fast mode: if trim_to_6s, limit to 5 seconds
|
| 347 |
target_frames = F
|
| 348 |
if trim_to_6s:
|
| 349 |
+
# 5 seconds is fixed at 121 frames (4n+1 format: 5 s * 24 fps = 120 frames, rounded up to the next 4n+1 value, 121)
|
| 350 |
+
max_frames_5s = 121
|
| 351 |
+
target_frames = min(F, max_frames_5s)
|
| 352 |
+
if F > max_frames_5s:
|
| 353 |
+
print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
|
| 354 |
# Use dynamically determined frame number
|
| 355 |
audio_emb, audio_length = preprocess_audio(
|
| 356 |
wav_path=audio,
|