nazdridoy committed (verified)
Commit 6244d01 · Parent(s): 79662c9

feat(video): add text-to-video generation capabilities


- [feat] Add new file with `generate_video` and `handle_video_generation` functions (video_handler.py)
- [feat] Implement `create_video_tab` function for the UI (ui_components.py:create_video_tab())
- [feat] Integrate video tab into the main application (app.py:create_app())
- [add] Define `DEFAULT_VIDEO_MODEL`, `VIDEO_MODEL_PRESETS`, and `VIDEO_EXAMPLE_PROMPTS` (utils.py)
- [docs] Update main header description to include text-to-video (app.py:create_main_header())
- [docs] Add "Text-to-Video Tab" section to the footer (app.py:create_footer())
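
Taken together, the wiring is small: `app.py` imports the new handler and tab factory, and `create_app` registers the video tab between the image-to-image and TTS tabs. A minimal sketch of the call path (names as introduced in the diff below; the surrounding `gr.Blocks` layout is assumed and not shown in full):

```python
import gradio as gr

from video_handler import handle_video_generation  # added by this commit
from ui_components import create_video_tab         # added by this commit

with gr.Blocks() as demo:
    # ... chat, image, and image-to-image tabs are created here ...
    create_video_tab(handle_video_generation)  # new Text-to-Video tab
    # ... TTS tab and footer follow ...
```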

Files changed (4)
  1. app.py +5 -0
  2. ui_components.py +130 -3
  3. utils.py +21 -0
  4. video_handler.py +152 -0
app.py CHANGED
@@ -6,12 +6,14 @@ A comprehensive AI platform with chat and image generation capabilities.
 import gradio as gr
 from chat_handler import handle_chat_submit, handle_chat_retry
 from image_handler import handle_image_generation, handle_image_to_image_generation
+from video_handler import handle_video_generation
 from tts_handler import handle_text_to_speech_generation
 from ui_components import (
     create_main_header,
     create_chat_tab,
     create_image_tab,
     create_image_to_image_tab,
+    create_video_tab,
     create_tts_tab,
     create_footer
 )
@@ -41,6 +43,9 @@ def create_app():
         # Image-to-image tab
         create_image_to_image_tab(handle_image_to_image_generation)
 
+        # Text-to-Video tab
+        create_video_tab(handle_video_generation)
+
         # Text-to-speech tab
         create_tts_tab(handle_text_to_speech_generation)
 
ui_components.py CHANGED
@@ -10,7 +10,8 @@ from utils import (
     DEFAULT_TTS_MODEL,
     CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
     IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES, TTS_MODEL_CONFIGS,
-    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS
+    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS,
+    DEFAULT_VIDEO_MODEL, VIDEO_MODEL_PRESETS, VIDEO_EXAMPLE_PROMPTS
 )
@@ -561,6 +562,126 @@ def create_tts_tab(handle_tts_generation_fn):
     gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
 
 
+def create_video_tab(handle_video_generation_fn):
+    """
+    Create the text-to-video tab interface.
+    """
+    with gr.Tab("🎬 Text-to-Video", id="video"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Video output
+                output_video = gr.Video(
+                    label="Generated Video",
+                    interactive=False,
+                    show_download_button=True,
+                    height=480,
+                )
+                status_text = gr.Textbox(
+                    label="Generation Status",
+                    interactive=False,
+                    lines=2
+                )
+
+            with gr.Column(scale=1):
+                # Model and provider inputs
+                with gr.Group():
+                    gr.Markdown("**🤖 Model & Provider**")
+                    vid_model_name = gr.Textbox(
+                        value=DEFAULT_VIDEO_MODEL,
+                        label="Model Name",
+                        placeholder="e.g., tencent/HunyuanVideo, Wan-AI/Wan2.2-T2V-A14B"
+                    )
+                    vid_provider = gr.Dropdown(
+                        choices=IMAGE_PROVIDERS,
+                        value=DEFAULT_PROVIDER,
+                        label="Provider",
+                        interactive=True
+                    )
+
+                # Generation parameters
+                with gr.Group():
+                    gr.Markdown("**📝 Prompt**")
+                    vid_prompt = gr.Textbox(
+                        value=VIDEO_EXAMPLE_PROMPTS[0],
+                        label="Prompt",
+                        lines=3,
+                        placeholder="Describe the video you want to generate..."
+                    )
+
+                with gr.Group():
+                    gr.Markdown("**⚙️ Generation Settings (optional)**")
+                    with gr.Row():
+                        vid_steps = gr.Slider(
+                            minimum=10, maximum=100, value=20, step=1,
+                            label="Inference Steps"
+                        )
+                        vid_guidance = gr.Slider(
+                            minimum=1.0, maximum=20.0, value=7.5, step=0.5,
+                            label="Guidance Scale"
+                        )
+                    vid_seed = gr.Slider(
+                        minimum=-1, maximum=999999, value=-1, step=1,
+                        label="Seed", info="-1 for random"
+                    )
+
+                # Generate and Stop buttons
+                with gr.Row():
+                    generate_btn = gr.Button(
+                        "🎬 Generate Video",
+                        variant="primary",
+                        size="lg",
+                        scale=2
+                    )
+                    stop_generate_btn = gr.Button("⏹ Stop", variant="secondary", visible=False)
+
+                # Quick model presets
+                with gr.Group():
+                    gr.Markdown("**🎯 Popular Presets**")
+                    for name, model, provider in VIDEO_MODEL_PRESETS:
+                        btn = gr.Button(name, size="sm")
+                        btn.click(
+                            lambda m=model, p=provider: (m, p),
+                            outputs=[vid_model_name, vid_provider]
+                        )
+
+                # Examples for video generation
+                with gr.Group():
+                    gr.Markdown("**🌟 Example Prompts**")
+                    gr.Examples(
+                        examples=[[prompt] for prompt in VIDEO_EXAMPLE_PROMPTS],
+                        inputs=vid_prompt
+                    )
+
+        # Connect video generation events
+        generate_btn.click(
+            fn=lambda: gr.update(visible=True),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            queue=False
+        )
+
+        gen_event = generate_btn.click(
+            fn=handle_video_generation_fn,
+            inputs=[
+                vid_prompt, vid_model_name, vid_provider,
+                vid_steps, vid_guidance, vid_seed
+            ],
+            outputs=[output_video, status_text]
+        )
+
+        # Stop current video generation
+        stop_generate_btn.click(
+            fn=lambda: gr.update(visible=False),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            cancels=[gen_event],
+            queue=False
+        )
+
+        # Hide stop after generation completes
+        gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
+
+
 def create_image_to_image_presets(img2img_model_name, img2img_provider):
     """Create quick model presets for image-to-image generation."""
     with gr.Group():
@@ -645,13 +766,14 @@ def create_main_header():
     gr.Markdown("""
     # 🚀 HF-Inferoxy AI Hub
 
-    A comprehensive AI platform combining chat, image generation, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
+    A comprehensive AI platform combining chat, image generation, image-to-image, text-to-video, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
 
     **Features:**
     - 💬 **Smart Chat**: Conversational AI with streaming responses
     - 🎨 **Image Generation**: Text-to-image creation with multiple providers
     - 🖼️ **Image-to-Image**: Transform and modify existing images with AI
-    - 🎤 **Text-to-Speech**: Convert text to natural-sounding speech with Kokoro
+    - 🎬 **Text-to-Video**: Generate short videos from text prompts
+    - 🎤 **Text-to-Speech**: Convert text to natural-sounding speech
     - 🔄 **Intelligent Token Management**: Automatic token rotation and error handling
     - 🌐 **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
     """)
@@ -681,6 +803,11 @@ def create_footer():
     - Perfect for style transfers, object additions, and image transformations
     - Works great with models like Qwen Image Edit and FLUX.1 Kontext
 
+    **Text-to-Video Tab:**
+    - Write a concise prompt describing the motion you want
+    - Choose a model and provider (default: `auto`)
+    - Some models may take several minutes to render
+
     **Text-to-Speech Tab:**
     - Enter text you want to convert to speech
     - Choose from various English voices (US and UK accents)
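
One detail in `create_video_tab` worth calling out: the preset buttons bind `model` and `provider` through lambda default arguments (`lambda m=model, p=provider: (m, p)`). Without the defaults, every button created in the loop would close over the same loop variables and return the last preset. A self-contained sketch of the difference (illustrative names, not from the repo):

```python
presets = [("A", "model-a"), ("B", "model-b")]

# Late binding: all callbacks share one loop variable, so every
# callback sees its final value.
broken = [lambda: m for _, m in presets]
print([f() for f in broken])  # ['model-b', 'model-b']

# Default-argument binding, as used in create_video_tab: each
# callback captures the value current at definition time.
fixed = [lambda m=m: m for _, m in presets]
print([f() for f in fixed])   # ['model-a', 'model-b']
```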
utils.py CHANGED
@@ -13,6 +13,7 @@ DEFAULT_CHAT_MODEL = "openai/gpt-oss-20b"
 DEFAULT_IMAGE_MODEL = "Qwen/Qwen-Image"
 DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
 DEFAULT_TTS_MODEL = "hexgrad/Kokoro-82M"
+DEFAULT_VIDEO_MODEL = "Wan-AI/Wan2.2-T2V-A14B"
 
 # Unified default provider used by all non-chat tasks
 DEFAULT_PROVIDER = "auto"
@@ -119,6 +120,26 @@ TTS_MODEL_CONFIGS = {
     }
 }
 
+# -----------------------------
+# Text-to-Video configuration
+# -----------------------------
+
+# Model presets for text-to-video generation
+VIDEO_MODEL_PRESETS = [
+    ("Wan 2.2 (Replicate)", "Wan-AI/Wan2.2-T2V-A14B", "fal-ai"),
+    ("LTX-Video 0.9.7 (Fal.ai)", "Lightricks/LTX-Video-0.9.7-dev", "fal-ai"),
+    ("HunyuanVideo (Auto)", "tencent/HunyuanVideo", "auto"),
+    ("CogVideoX-5b (Fal.ai)", "zai-org/CogVideoX-5b", "fal-ai"),
+]
+
+# Example prompts for text-to-video generation
+VIDEO_EXAMPLE_PROMPTS = [
+    "A young man walking on the street",
+    "A corgi puppy running through a field of flowers, cinematic",
+    "A futuristic city skyline at sunset with flying cars, 4k",
+    "A serene beach with gentle waves and palm trees swaying",
+]
+
 # Voice options for Kokoro TTS (based on the reference app)
 TTS_VOICES = {
     '🇺🇸 🚺 Heart ❤️': 'af_heart',
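
Each preset is a `(label, model_id, provider)` tuple, so UI code can unpack it directly. A quick sanity check one might run against this module (assumes `utils.py` is importable from the working directory):

```python
from utils import DEFAULT_VIDEO_MODEL, VIDEO_MODEL_PRESETS, VIDEO_EXAMPLE_PROMPTS

for label, model_id, provider in VIDEO_MODEL_PRESETS:
    # Model ids are "namespace/name" repo references; providers are the
    # lowercase identifiers passed to InferenceClient(provider=...).
    assert "/" in model_id, f"bad model id in {label!r}"
    print(f"{label}: {model_id} via {provider}")

# The default model and first example prompt seed the video tab's inputs.
print("default model:", DEFAULT_VIDEO_MODEL)
print("first example prompt:", VIDEO_EXAMPLE_PROMPTS[0])
```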
video_handler.py ADDED
@@ -0,0 +1,152 @@
+"""
+Text-to-video functionality handler for HF-Inferoxy AI Hub.
+Handles text-to-video generation with multiple providers.
+"""
+
+import os
+import gradio as gr
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
+from huggingface_hub import InferenceClient
+from huggingface_hub.errors import HfHubHTTPError
+from requests.exceptions import ConnectionError
+from hf_token_utils import get_proxy_token, report_token_status
+from utils import (
+    validate_proxy_key,
+    format_error_message,
+    format_success_message,
+    check_org_access,
+    format_access_denied_message,
+)
+
+
+# Timeout configuration for video generation
+VIDEO_GENERATION_TIMEOUT = 600  # up to 10 minutes, videos can be slow
+
+
+def generate_video(
+    prompt: str,
+    model_name: str,
+    provider: str,
+    num_inference_steps: int | None = None,
+    guidance_scale: float | None = None,
+    seed: int | None = None,
+):
+    """
+    Generate a video using the specified model and provider through HF-Inferoxy.
+    Returns (video_bytes_or_url, status_message)
+    """
+    # Validate proxy API key
+    is_valid, error_msg = validate_proxy_key()
+    if not is_valid:
+        return None, error_msg
+
+    proxy_api_key = os.getenv("PROXY_KEY")
+
+    token_id = None
+    try:
+        # Get token from HF-Inferoxy proxy server with timeout handling
+        print("🔑 Video: Requesting token from proxy...")
+        token, token_id = get_proxy_token(api_key=proxy_api_key)
+        print(f"✅ Video: Got token: {token_id}")
+
+        print(f"🎬 Video: Using model='{model_name}', provider='{provider}'")
+
+        # Create client with specified provider
+        client = InferenceClient(
+            provider=provider,
+            api_key=token
+        )
+
+        # Prepare generation parameters
+        generation_params: dict = {
+            "model": model_name,
+            "prompt": prompt,
+        }
+        if num_inference_steps is not None:
+            generation_params["num_inference_steps"] = num_inference_steps
+        if guidance_scale is not None:
+            generation_params["guidance_scale"] = guidance_scale
+        if seed is not None and seed != -1:
+            generation_params["seed"] = seed
+
+        print(f"📡 Video: Making generation request with {VIDEO_GENERATION_TIMEOUT}s timeout...")
+
+        # Create generation function for timeout handling
+        def generate_video_task():
+            return client.text_to_video(**generation_params)
+
+        # Execute with timeout using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            future = executor.submit(generate_video_task)
+            try:
+                video = future.result(timeout=VIDEO_GENERATION_TIMEOUT)
+            except FutureTimeoutError:
+                future.cancel()
+                raise TimeoutError(f"Video generation timed out after {VIDEO_GENERATION_TIMEOUT} seconds")
+
+        print(f"🎞️ Video: Generation completed! Type: {type(video)}")
+
+        # Report successful token usage
+        if token_id:
+            report_token_status(token_id, "success", api_key=proxy_api_key)
+
+        return video, format_success_message("Video generated", f"using {model_name} on {provider}")
+
+    except ConnectionError as e:
+        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
+        print(f"🔌 Video connection error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
+
+    except TimeoutError as e:
+        error_msg = f"Video generation timed out: {str(e)}"
+        print(f"⏰ Video timeout: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Timeout Error", f"Video generation took too long (>{VIDEO_GENERATION_TIMEOUT//60} minutes). Try a shorter prompt.")
+
+    except HfHubHTTPError as e:
+        error_msg = str(e)
+        print(f"🤗 Video HF error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        if "401" in error_msg:
+            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
+        elif "402" in error_msg:
+            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
+        elif "429" in error_msg:
+            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
+        else:
+            return None, format_error_message("HuggingFace API Error", error_msg)
+
+    except Exception as e:
+        error_msg = str(e)
+        print(f"❌ Video unexpected error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")
+
+
+def handle_video_generation(prompt_val, model_val, provider_val, steps_val, guidance_val, seed_val, hf_token: gr.OAuthToken = None):
+    """
+    Handle text-to-video generation request with validation and org access.
+    """
+    if not prompt_val or not prompt_val.strip():
+        return None, format_error_message("Validation Error", "Please enter a prompt for video generation")
+
+    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
+    is_allowed, access_msg, _username, _matched = check_org_access(access_token)
+    if not is_allowed:
+        return None, format_access_denied_message(access_msg)
+
+    return generate_video(
+        prompt=prompt_val.strip(),
+        model_name=model_val,
+        provider=provider_val,
+        num_inference_steps=steps_val if steps_val is not None else None,
+        guidance_scale=guidance_val if guidance_val is not None else None,
+        seed=seed_val if seed_val is not None else None,
+    )
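
The timeout handling in `generate_video` is the standard `concurrent.futures` pattern: submit the blocking call, wait with `future.result(timeout=...)`, and translate the timeout into a user-facing error. One caveat: `future.cancel()` cannot stop a future that is already running, so the worker thread keeps executing until the inference call returns. A standalone sketch of the same pattern, with `time.sleep` standing in for the inference call:

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError

def slow_task(seconds: float) -> str:
    time.sleep(seconds)  # stand-in for client.text_to_video(...)
    return "done"

executor = ThreadPoolExecutor(max_workers=1)
future = executor.submit(slow_task, 5.0)
try:
    print(future.result(timeout=1.0))
except FutureTimeoutError:
    # cancel() is a no-op for a running future; it only prevents
    # queued work from starting. The caller is unblocked regardless.
    future.cancel()
    print("timed out; worker still running in the background")
finally:
    # wait=False returns immediately instead of joining the worker.
    executor.shutdown(wait=False)
```

Worth noting as a design trade-off: because `generate_video` uses `with ThreadPoolExecutor(...)`, exiting the block implicitly calls `shutdown(wait=True)`, so a timed-out request still waits for the underlying inference call to finish before the error is returned; the timeout bounds how long a success is awaited, not how fast the thread is reclaimed.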