feat(video): add text-to-video generation capabilities
- [feat] Add new file with `generate_video` and `handle_video_generation` functions (video_handler.py)
- [feat] Implement `create_video_tab` function for the UI (ui_components.py:create_video_tab())
- [feat] Integrate video tab into the main application (app.py:create_app())
- [add] Define `DEFAULT_VIDEO_MODEL`, `VIDEO_MODEL_PRESETS`, and `VIDEO_EXAMPLE_PROMPTS` (utils.py)
- [docs] Update main header description to include text-to-video (app.py:create_main_header())
- [docs] Add "Text-to-Video Tab" section to the footer (app.py:create_footer())
- app.py +5 -0
- ui_components.py +130 -3
- utils.py +21 -0
- video_handler.py +152 -0
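
Taken together, the changes follow the app's existing tab-plus-handler pattern: `create_video_tab()` builds the tab UI and wires its Generate button to `handle_video_generation`, which fetches a proxy-managed token and calls `InferenceClient.text_to_video()`. A minimal wiring sketch (the `gr.Blocks`/`launch` scaffolding below is simplified for illustration and is not the app's actual `create_app()` code):

```python
# Simplified wiring sketch based on the diffs below.
import gradio as gr

from ui_components import create_video_tab          # new tab builder (this commit)
from video_handler import handle_video_generation   # new handler (this commit)

with gr.Blocks() as demo:
    # The tab builder creates the prompt/model/provider controls and connects
    # the Generate button to the handler, which requests a token from the
    # HF-Inferoxy proxy and calls InferenceClient.text_to_video().
    create_video_tab(handle_video_generation)

demo.launch()
```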
app.py
CHANGED

@@ -6,12 +6,14 @@ A comprehensive AI platform with chat and image generation capabilities.
 import gradio as gr
 from chat_handler import handle_chat_submit, handle_chat_retry
 from image_handler import handle_image_generation, handle_image_to_image_generation
+from video_handler import handle_video_generation
 from tts_handler import handle_text_to_speech_generation
 from ui_components import (
     create_main_header,
     create_chat_tab,
     create_image_tab,
     create_image_to_image_tab,
+    create_video_tab,
     create_tts_tab,
     create_footer
 )

@@ -41,6 +43,9 @@ def create_app():
         # Image-to-image tab
         create_image_to_image_tab(handle_image_to_image_generation)

+        # Text-to-Video tab
+        create_video_tab(handle_video_generation)
+
         # Text-to-speech tab
         create_tts_tab(handle_text_to_speech_generation)

ui_components.py
CHANGED

@@ -10,7 +10,8 @@ from utils import (
     DEFAULT_TTS_MODEL,
     CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
     IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES, TTS_MODEL_CONFIGS,
-    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS
+    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS,
+    DEFAULT_VIDEO_MODEL, VIDEO_MODEL_PRESETS, VIDEO_EXAMPLE_PROMPTS
 )


@@ -561,6 +562,126 @@ def create_tts_tab(handle_tts_generation_fn):
         gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)


+def create_video_tab(handle_video_generation_fn):
+    """
+    Create the text-to-video tab interface.
+    """
+    with gr.Tab("🎬 Text-to-Video", id="video"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Video output
+                output_video = gr.Video(
+                    label="Generated Video",
+                    interactive=False,
+                    show_download_button=True,
+                    height=480,
+                )
+                status_text = gr.Textbox(
+                    label="Generation Status",
+                    interactive=False,
+                    lines=2
+                )
+
+            with gr.Column(scale=1):
+                # Model and provider inputs
+                with gr.Group():
+                    gr.Markdown("**🤖 Model & Provider**")
+                    vid_model_name = gr.Textbox(
+                        value=DEFAULT_VIDEO_MODEL,
+                        label="Model Name",
+                        placeholder="e.g., tencent/HunyuanVideo, Wan-AI/Wan2.2-T2V-A14B"
+                    )
+                    vid_provider = gr.Dropdown(
+                        choices=IMAGE_PROVIDERS,
+                        value=DEFAULT_PROVIDER,
+                        label="Provider",
+                        interactive=True
+                    )
+
+                # Generation parameters
+                with gr.Group():
+                    gr.Markdown("**📝 Prompt**")
+                    vid_prompt = gr.Textbox(
+                        value=VIDEO_EXAMPLE_PROMPTS[0],
+                        label="Prompt",
+                        lines=3,
+                        placeholder="Describe the video you want to generate..."
+                    )
+
+                with gr.Group():
+                    gr.Markdown("**⚙️ Generation Settings (optional)**")
+                    with gr.Row():
+                        vid_steps = gr.Slider(
+                            minimum=10, maximum=100, value=20, step=1,
+                            label="Inference Steps"
+                        )
+                        vid_guidance = gr.Slider(
+                            minimum=1.0, maximum=20.0, value=7.5, step=0.5,
+                            label="Guidance Scale"
+                        )
+                        vid_seed = gr.Slider(
+                            minimum=-1, maximum=999999, value=-1, step=1,
+                            label="Seed", info="-1 for random"
+                        )
+
+                # Generate and Stop buttons
+                with gr.Row():
+                    generate_btn = gr.Button(
+                        "🎬 Generate Video",
+                        variant="primary",
+                        size="lg",
+                        scale=2
+                    )
+                    stop_generate_btn = gr.Button("⏹ Stop", variant="secondary", visible=False)
+
+                # Quick model presets
+                with gr.Group():
+                    gr.Markdown("**🎯 Popular Presets**")
+                    for name, model, provider in VIDEO_MODEL_PRESETS:
+                        btn = gr.Button(name, size="sm")
+                        btn.click(
+                            lambda m=model, p=provider: (m, p),
+                            outputs=[vid_model_name, vid_provider]
+                        )
+
+                # Examples for video generation
+                with gr.Group():
+                    gr.Markdown("**📋 Example Prompts**")
+                    gr.Examples(
+                        examples=[[prompt] for prompt in VIDEO_EXAMPLE_PROMPTS],
+                        inputs=vid_prompt
+                    )
+
+        # Connect video generation events
+        generate_btn.click(
+            fn=lambda: gr.update(visible=True),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            queue=False
+        )
+
+        gen_event = generate_btn.click(
+            fn=handle_video_generation_fn,
+            inputs=[
+                vid_prompt, vid_model_name, vid_provider,
+                vid_steps, vid_guidance, vid_seed
+            ],
+            outputs=[output_video, status_text]
+        )
+
+        # Stop current video generation
+        stop_generate_btn.click(
+            fn=lambda: gr.update(visible=False),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            cancels=[gen_event],
+            queue=False
+        )
+
+        # Hide stop after generation completes
+        gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
+
+
 def create_image_to_image_presets(img2img_model_name, img2img_provider):
     """Create quick model presets for image-to-image generation."""
     with gr.Group():

@@ -645,13 +766,14 @@ def create_main_header():
     gr.Markdown("""
     # 🚀 HF-Inferoxy AI Hub

-    A comprehensive AI platform combining chat, image generation, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
+    A comprehensive AI platform combining chat, image generation, image-to-image, text-to-video, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.

     **Features:**
     - 💬 **Smart Chat**: Conversational AI with streaming responses
     - 🎨 **Image Generation**: Text-to-image creation with multiple providers
     - 🖼️ **Image-to-Image**: Transform and modify existing images with AI
+    - 🎬 **Text-to-Video**: Generate short videos from text prompts
+    - 🎤 **Text-to-Speech**: Convert text to natural-sounding speech
     - 🔄 **Intelligent Token Management**: Automatic token rotation and error handling
     - 🌐 **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
     """)

@@ -681,6 +803,11 @@ def create_footer():
     - Perfect for style transfers, object additions, and image transformations
     - Works great with models like Qwen Image Edit and FLUX.1 Kontext

+    **Text-to-Video Tab:**
+    - Write a concise prompt describing the motion you want
+    - Choose a model and provider (default: `auto`)
+    - Some models may take several minutes to render
+
     **Text-to-Speech Tab:**
     - Enter text you want to convert to speech
     - Choose from various English voices (US and UK accents)

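One subtlety in the preset loop above: each button's callback uses default arguments (`lambda m=model, p=provider: (m, p)`) so that every button captures its own preset rather than the loop's final values. A small standalone illustration of the difference:

```python
# Late binding vs. default-argument capture for callbacks created in a loop.
presets = [("A", "model-a", "prov-a"), ("B", "model-b", "prov-b")]

# Plain closures all see the loop variables' final values.
late = [lambda: (model, provider) for name, model, provider in presets]
# Default arguments snapshot each iteration's values at definition time.
bound = [lambda m=model, p=provider: (m, p) for name, model, provider in presets]

print(late[0]())   # ('model-b', 'prov-b') -- late binding
print(bound[0]())  # ('model-a', 'prov-a') -- value captured per iteration
```
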
utils.py
CHANGED

@@ -13,6 +13,7 @@ DEFAULT_CHAT_MODEL = "openai/gpt-oss-20b"
 DEFAULT_IMAGE_MODEL = "Qwen/Qwen-Image"
 DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
 DEFAULT_TTS_MODEL = "hexgrad/Kokoro-82M"
+DEFAULT_VIDEO_MODEL = "Wan-AI/Wan2.2-T2V-A14B"

 # Unified default provider used by all non-chat tasks
 DEFAULT_PROVIDER = "auto"

@@ -119,6 +120,26 @@ TTS_MODEL_CONFIGS = {
     }
 }

+# -----------------------------
+# Text-to-Video configuration
+# -----------------------------
+
+# Model presets for text-to-video generation
+VIDEO_MODEL_PRESETS = [
+    ("Wan 2.2 (Replicate)", "Wan-AI/Wan2.2-T2V-A14B", "fal-ai"),
+    ("LTX-Video 0.9.7 (Fal.ai)", "Lightricks/LTX-Video-0.9.7-dev", "fal-ai"),
+    ("HunyuanVideo (Auto)", "tencent/HunyuanVideo", "auto"),
+    ("CogVideoX-5b (Fal.ai)", "zai-org/CogVideoX-5b", "fal-ai"),
+]
+
+# Example prompts for text-to-video generation
+VIDEO_EXAMPLE_PROMPTS = [
+    "A young man walking on the street",
+    "A corgi puppy running through a field of flowers, cinematic",
+    "A futuristic city skyline at sunset with flying cars, 4k",
+    "A serene beach with gentle waves and palm trees swaying",
+]
+
 # Voice options for Kokoro TTS (based on the reference app)
 TTS_VOICES = {
     '🇺🇸 🚺 Heart ❤️': 'af_heart',

video_handler.py
ADDED

"""
Text-to-video functionality handler for HF-Inferoxy AI Hub.
Handles text-to-video generation with multiple providers.
"""

import os
import gradio as gr
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    validate_proxy_key,
    format_error_message,
    format_success_message,
    check_org_access,
    format_access_denied_message,
)


# Timeout configuration for video generation
VIDEO_GENERATION_TIMEOUT = 600  # up to 10 minutes, videos can be slow


def generate_video(
    prompt: str,
    model_name: str,
    provider: str,
    num_inference_steps: int | None = None,
    guidance_scale: float | None = None,
    seed: int | None = None,
):
    """
    Generate a video using the specified model and provider through HF-Inferoxy.
    Returns (video_bytes_or_url, status_message)
    """
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        return None, error_msg

    proxy_api_key = os.getenv("PROXY_KEY")

    token_id = None
    try:
        # Get token from HF-Inferoxy proxy server with timeout handling
        print(f"🔑 Video: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"✅ Video: Got token: {token_id}")

        print(f"🎬 Video: Using model='{model_name}', provider='{provider}'")

        # Create client with specified provider
        client = InferenceClient(
            provider=provider,
            api_key=token
        )

        # Prepare generation parameters
        generation_params: dict = {
            "model": model_name,
            "prompt": prompt,
        }
        if num_inference_steps is not None:
            generation_params["num_inference_steps"] = num_inference_steps
        if guidance_scale is not None:
            generation_params["guidance_scale"] = guidance_scale
        if seed is not None and seed != -1:
            generation_params["seed"] = seed

        print(f"📡 Video: Making generation request with {VIDEO_GENERATION_TIMEOUT}s timeout...")

        # Create generation function for timeout handling
        def generate_video_task():
            return client.text_to_video(**generation_params)

        # Execute with timeout using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(generate_video_task)
            try:
                video = future.result(timeout=VIDEO_GENERATION_TIMEOUT)
            except FutureTimeoutError:
                future.cancel()
                raise TimeoutError(f"Video generation timed out after {VIDEO_GENERATION_TIMEOUT} seconds")

        print(f"🎞️ Video: Generation completed! Type: {type(video)}")

        # Report successful token usage
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key)

        return video, format_success_message("Video generated", f"using {model_name} on {provider}")

    except ConnectionError as e:
        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
        print(f"🔌 Video connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")

    except TimeoutError as e:
        error_msg = f"Video generation timed out: {str(e)}"
        print(f"⏰ Video timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Timeout Error", f"Video generation took too long (>{VIDEO_GENERATION_TIMEOUT//60} minutes). Try a shorter prompt.")

    except HfHubHTTPError as e:
        error_msg = str(e)
        print(f"🤗 Video HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        if "401" in error_msg:
            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            return None, format_error_message("HuggingFace API Error", error_msg)

    except Exception as e:
        error_msg = str(e)
        print(f"❌ Video unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


def handle_video_generation(prompt_val, model_val, provider_val, steps_val, guidance_val, seed_val, hf_token: gr.OAuthToken = None):
    """
    Handle text-to-video generation request with validation and org access.
    """
    if not prompt_val or not prompt_val.strip():
        return None, format_error_message("Validation Error", "Please enter a prompt for video generation")

    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
    is_allowed, access_msg, _username, _matched = check_org_access(access_token)
    if not is_allowed:
        return None, format_access_denied_message(access_msg)

    return generate_video(
        prompt=prompt_val.strip(),
        model_name=model_val,
        provider=provider_val,
        num_inference_steps=steps_val if steps_val is not None else None,
        guidance_scale=guidance_val if guidance_val is not None else None,
        seed=seed_val if seed_val is not None else None,
    )

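For a quick smoke test outside the Gradio UI, the new handler can be driven directly. A hypothetical example (not part of this commit), assuming a reachable HF-Inferoxy proxy and a valid `PROXY_KEY` already set in the environment:

```python
# Hypothetical manual test for video_handler.generate_video (not part of this commit).
# Assumes the HF-Inferoxy proxy is running and PROXY_KEY is exported.
from video_handler import generate_video
from utils import DEFAULT_VIDEO_MODEL, DEFAULT_PROVIDER

video, status = generate_video(
    prompt="A corgi puppy running through a field of flowers, cinematic",
    model_name=DEFAULT_VIDEO_MODEL,   # "Wan-AI/Wan2.2-T2V-A14B"
    provider=DEFAULT_PROVIDER,        # "auto"
    num_inference_steps=20,
    guidance_scale=7.5,
    seed=-1,                          # treated as "random"; generate_video omits it
)
print(status)        # formatted success or error message
print(type(video))   # video bytes/URL on success, None on failure
```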