nazdridoy committed (verified)
Commit 6244d01 · Parent(s): 79662c9

feat(video): add text-to-video generation capabilities


- [feat] Add new file with `generate_video` and `handle_video_generation` functions (video_handler.py)
- [feat] Implement `create_video_tab` function for the UI (ui_components.py:create_video_tab())
- [feat] Integrate video tab into the main application (app.py:create_app())
- [add] Define `DEFAULT_VIDEO_MODEL`, `VIDEO_MODEL_PRESETS`, and `VIDEO_EXAMPLE_PROMPTS` (utils.py)
- [docs] Update main header description to include text-to-video (app.py:create_main_header())
- [docs] Add "Text-to-Video Tab" section to the footer (app.py:create_footer())
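
Taken together, the wiring is small: `app.py` imports the new handler and tab factory, and `create_app` registers the video tab between the image-to-image and TTS tabs. A minimal sketch of the call path (names as introduced in the diff below; the surrounding `gr.Blocks` layout is assumed and not shown in full):

```python
import gradio as gr

from video_handler import handle_video_generation  # added by this commit
from ui_components import create_video_tab         # added by this commit

with gr.Blocks() as demo:
    # ... chat, image, and image-to-image tabs are created here ...
    create_video_tab(handle_video_generation)  # new Text-to-Video tab
    # ... TTS tab and footer follow ...
```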

Files changed (4)
  1. app.py +5 -0
  2. ui_components.py +130 -3
  3. utils.py +21 -0
  4. video_handler.py +152 -0
app.py CHANGED
@@ -6,12 +6,14 @@ A comprehensive AI platform with chat and image generation capabilities.
 import gradio as gr
 from chat_handler import handle_chat_submit, handle_chat_retry
 from image_handler import handle_image_generation, handle_image_to_image_generation
+from video_handler import handle_video_generation
 from tts_handler import handle_text_to_speech_generation
 from ui_components import (
     create_main_header,
     create_chat_tab,
     create_image_tab,
     create_image_to_image_tab,
+    create_video_tab,
     create_tts_tab,
     create_footer
 )
@@ -41,6 +43,9 @@ def create_app():
         # Image-to-image tab
         create_image_to_image_tab(handle_image_to_image_generation)
 
+        # Text-to-Video tab
+        create_video_tab(handle_video_generation)
+
         # Text-to-speech tab
         create_tts_tab(handle_text_to_speech_generation)
 
ui_components.py CHANGED
@@ -10,7 +10,8 @@ from utils import (
     DEFAULT_TTS_MODEL,
     CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
     IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES, TTS_MODEL_CONFIGS,
-    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS
+    IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS, TTS_EXAMPLE_AUDIO_URLS,
+    DEFAULT_VIDEO_MODEL, VIDEO_MODEL_PRESETS, VIDEO_EXAMPLE_PROMPTS
 )
@@ -561,6 +562,126 @@ def create_tts_tab(handle_tts_generation_fn):
     gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
 
 
+def create_video_tab(handle_video_generation_fn):
+    """
+    Create the text-to-video tab interface.
+    """
+    with gr.Tab("🎬 Text-to-Video", id="video"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Video output
+                output_video = gr.Video(
+                    label="Generated Video",
+                    interactive=False,
+                    show_download_button=True,
+                    height=480,
+                )
+                status_text = gr.Textbox(
+                    label="Generation Status",
+                    interactive=False,
+                    lines=2
+                )
+
+            with gr.Column(scale=1):
+                # Model and provider inputs
+                with gr.Group():
+                    gr.Markdown("**🤖 Model & Provider**")
+                    vid_model_name = gr.Textbox(
+                        value=DEFAULT_VIDEO_MODEL,
+                        label="Model Name",
+                        placeholder="e.g., tencent/HunyuanVideo, Wan-AI/Wan2.2-T2V-A14B"
+                    )
+                    vid_provider = gr.Dropdown(
+                        choices=IMAGE_PROVIDERS,
+                        value=DEFAULT_PROVIDER,
+                        label="Provider",
+                        interactive=True
+                    )
+
+                # Generation parameters
+                with gr.Group():
+                    gr.Markdown("**📝 Prompt**")
+                    vid_prompt = gr.Textbox(
+                        value=VIDEO_EXAMPLE_PROMPTS[0],
+                        label="Prompt",
+                        lines=3,
+                        placeholder="Describe the video you want to generate..."
+                    )
+
+                with gr.Group():
+                    gr.Markdown("**⚙️ Generation Settings (optional)**")
+                    with gr.Row():
+                        vid_steps = gr.Slider(
+                            minimum=10, maximum=100, value=20, step=1,
+                            label="Inference Steps"
+                        )
+                        vid_guidance = gr.Slider(
+                            minimum=1.0, maximum=20.0, value=7.5, step=0.5,
+                            label="Guidance Scale"
+                        )
+                    vid_seed = gr.Slider(
+                        minimum=-1, maximum=999999, value=-1, step=1,
+                        label="Seed", info="-1 for random"
+                    )
+
+                # Generate and Stop buttons
+                with gr.Row():
+                    generate_btn = gr.Button(
+                        "🎬 Generate Video",
+                        variant="primary",
+                        size="lg",
+                        scale=2
+                    )
+                    stop_generate_btn = gr.Button("⏹ Stop", variant="secondary", visible=False)
+
+                # Quick model presets
+                with gr.Group():
+                    gr.Markdown("**🎯 Popular Presets**")
+                    for name, model, provider in VIDEO_MODEL_PRESETS:
+                        btn = gr.Button(name, size="sm")
+                        btn.click(
+                            lambda m=model, p=provider: (m, p),
+                            outputs=[vid_model_name, vid_provider]
+                        )
+
+                # Examples for video generation
+                with gr.Group():
+                    gr.Markdown("**🌟 Example Prompts**")
+                    gr.Examples(
+                        examples=[[prompt] for prompt in VIDEO_EXAMPLE_PROMPTS],
+                        inputs=vid_prompt
+                    )
+
+        # Connect video generation events
+        generate_btn.click(
+            fn=lambda: gr.update(visible=True),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            queue=False
+        )
+
+        gen_event = generate_btn.click(
+            fn=handle_video_generation_fn,
+            inputs=[
+                vid_prompt, vid_model_name, vid_provider,
+                vid_steps, vid_guidance, vid_seed
+            ],
+            outputs=[output_video, status_text]
+        )
+
+        # Stop current video generation
+        stop_generate_btn.click(
+            fn=lambda: gr.update(visible=False),
+            inputs=None,
+            outputs=[stop_generate_btn],
+            cancels=[gen_event],
+            queue=False
+        )
+
+        # Hide stop after generation completes
+        gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
+
+
 def create_image_to_image_presets(img2img_model_name, img2img_provider):
     """Create quick model presets for image-to-image generation."""
     with gr.Group():
@@ -645,13 +766,14 @@ def create_main_header():
     gr.Markdown("""
     # 🚀 HF-Inferoxy AI Hub
 
-    A comprehensive AI platform combining chat, image generation, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
+    A comprehensive AI platform combining chat, image generation, image-to-image, text-to-video, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
 
     **Features:**
     - 💬 **Smart Chat**: Conversational AI with streaming responses
     - 🎨 **Image Generation**: Text-to-image creation with multiple providers
     - 🖼️ **Image-to-Image**: Transform and modify existing images with AI
-    - 🎤 **Text-to-Speech**: Convert text to natural-sounding speech with Kokoro
+    - 🎬 **Text-to-Video**: Generate short videos from text prompts
+    - 🎤 **Text-to-Speech**: Convert text to natural-sounding speech
     - 🔄 **Intelligent Token Management**: Automatic token rotation and error handling
     - 🌐 **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
     """)
@@ -681,6 +803,11 @@ def create_footer():
     - Perfect for style transfers, object additions, and image transformations
     - Works great with models like Qwen Image Edit and FLUX.1 Kontext
 
+    **Text-to-Video Tab:**
+    - Write a concise prompt describing the motion you want
+    - Choose a model and provider (default: `auto`)
+    - Some models may take several minutes to render
+
     **Text-to-Speech Tab:**
     - Enter text you want to convert to speech
     - Choose from various English voices (US and UK accents)
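
One detail in `create_video_tab` worth calling out: the preset buttons bind `model` and `provider` through lambda default arguments (`lambda m=model, p=provider: (m, p)`). Without the defaults, every button created in the loop would close over the same loop variables and return the last preset. A self-contained sketch of the difference (illustrative names, not from the repo):

```python
presets = [("A", "model-a"), ("B", "model-b")]

# Late binding: all callbacks share one loop variable, so every
# callback sees its final value.
broken = [lambda: m for _, m in presets]
print([f() for f in broken])  # ['model-b', 'model-b']

# Default-argument binding, as used in create_video_tab: each
# callback captures the value current at definition time.
fixed = [lambda m=m: m for _, m in presets]
print([f() for f in fixed])   # ['model-a', 'model-b']
```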
utils.py CHANGED
@@ -13,6 +13,7 @@ DEFAULT_CHAT_MODEL = "openai/gpt-oss-20b"
 DEFAULT_IMAGE_MODEL = "Qwen/Qwen-Image"
 DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
 DEFAULT_TTS_MODEL = "hexgrad/Kokoro-82M"
+DEFAULT_VIDEO_MODEL = "Wan-AI/Wan2.2-T2V-A14B"
 
 # Unified default provider used by all non-chat tasks
 DEFAULT_PROVIDER = "auto"
@@ -119,6 +120,26 @@ TTS_MODEL_CONFIGS = {
     }
 }
 
+# -----------------------------
+# Text-to-Video configuration
+# -----------------------------
+
+# Model presets for text-to-video generation
+VIDEO_MODEL_PRESETS = [
+    ("Wan 2.2 (Replicate)", "Wan-AI/Wan2.2-T2V-A14B", "fal-ai"),
+    ("LTX-Video 0.9.7 (Fal.ai)", "Lightricks/LTX-Video-0.9.7-dev", "fal-ai"),
+    ("HunyuanVideo (Auto)", "tencent/HunyuanVideo", "auto"),
+    ("CogVideoX-5b (Fal.ai)", "zai-org/CogVideoX-5b", "fal-ai"),
+]
+
+# Example prompts for text-to-video generation
+VIDEO_EXAMPLE_PROMPTS = [
+    "A young man walking on the street",
+    "A corgi puppy running through a field of flowers, cinematic",
+    "A futuristic city skyline at sunset with flying cars, 4k",
+    "A serene beach with gentle waves and palm trees swaying",
+]
+
 # Voice options for Kokoro TTS (based on the reference app)
 TTS_VOICES = {
     '🇺🇸 🚺 Heart ❤️': 'af_heart',
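
Each preset is a `(label, model_id, provider)` tuple, so UI code can unpack it directly. A quick sanity check one might run against this module (assumes `utils.py` is importable from the working directory):

```python
from utils import DEFAULT_VIDEO_MODEL, VIDEO_MODEL_PRESETS, VIDEO_EXAMPLE_PROMPTS

for label, model_id, provider in VIDEO_MODEL_PRESETS:
    # Model ids are "namespace/name" repo references; providers are the
    # lowercase identifiers passed to InferenceClient(provider=...).
    assert "/" in model_id, f"bad model id in {label!r}"
    print(f"{label}: {model_id} via {provider}")

# The default model and first example prompt seed the video tab's inputs.
print("default model:", DEFAULT_VIDEO_MODEL)
print("first example prompt:", VIDEO_EXAMPLE_PROMPTS[0])
```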
video_handler.py ADDED
@@ -0,0 +1,152 @@
+"""
+Text-to-video functionality handler for HF-Inferoxy AI Hub.
+Handles text-to-video generation with multiple providers.
+"""
+
+import os
+import gradio as gr
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
+from huggingface_hub import InferenceClient
+from huggingface_hub.errors import HfHubHTTPError
+from requests.exceptions import ConnectionError
+from hf_token_utils import get_proxy_token, report_token_status
+from utils import (
+    validate_proxy_key,
+    format_error_message,
+    format_success_message,
+    check_org_access,
+    format_access_denied_message,
+)
+
+
+# Timeout configuration for video generation
+VIDEO_GENERATION_TIMEOUT = 600  # up to 10 minutes, videos can be slow
+
+
+def generate_video(
+    prompt: str,
+    model_name: str,
+    provider: str,
+    num_inference_steps: int | None = None,
+    guidance_scale: float | None = None,
+    seed: int | None = None,
+):
+    """
+    Generate a video using the specified model and provider through HF-Inferoxy.
+    Returns (video_bytes_or_url, status_message)
+    """
+    # Validate proxy API key
+    is_valid, error_msg = validate_proxy_key()
+    if not is_valid:
+        return None, error_msg
+
+    proxy_api_key = os.getenv("PROXY_KEY")
+
+    token_id = None
+    try:
+        # Get token from HF-Inferoxy proxy server with timeout handling
+        print("🔑 Video: Requesting token from proxy...")
+        token, token_id = get_proxy_token(api_key=proxy_api_key)
+        print(f"✅ Video: Got token: {token_id}")
+
+        print(f"🎬 Video: Using model='{model_name}', provider='{provider}'")
+
+        # Create client with specified provider
+        client = InferenceClient(
+            provider=provider,
+            api_key=token
+        )
+
+        # Prepare generation parameters
+        generation_params: dict = {
+            "model": model_name,
+            "prompt": prompt,
+        }
+        if num_inference_steps is not None:
+            generation_params["num_inference_steps"] = num_inference_steps
+        if guidance_scale is not None:
+            generation_params["guidance_scale"] = guidance_scale
+        if seed is not None and seed != -1:
+            generation_params["seed"] = seed
+
+        print(f"📡 Video: Making generation request with {VIDEO_GENERATION_TIMEOUT}s timeout...")
+
+        # Create generation function for timeout handling
+        def generate_video_task():
+            return client.text_to_video(**generation_params)
+
+        # Execute with timeout using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            future = executor.submit(generate_video_task)
+            try:
+                video = future.result(timeout=VIDEO_GENERATION_TIMEOUT)
+            except FutureTimeoutError:
+                future.cancel()
+                raise TimeoutError(f"Video generation timed out after {VIDEO_GENERATION_TIMEOUT} seconds")
+
+        print(f"🎞️ Video: Generation completed! Type: {type(video)}")
+
+        # Report successful token usage
+        if token_id:
+            report_token_status(token_id, "success", api_key=proxy_api_key)
+
+        return video, format_success_message("Video generated", f"using {model_name} on {provider}")
+
+    except ConnectionError as e:
+        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
+        print(f"🔌 Video connection error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
+
+    except TimeoutError as e:
+        error_msg = f"Video generation timed out: {str(e)}"
+        print(f"⏰ Video timeout: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Timeout Error", f"Video generation took too long (>{VIDEO_GENERATION_TIMEOUT//60} minutes). Try a shorter prompt.")
+
+    except HfHubHTTPError as e:
+        error_msg = str(e)
+        print(f"🤗 Video HF error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        if "401" in error_msg:
+            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
+        elif "402" in error_msg:
+            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
+        elif "429" in error_msg:
+            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
+        else:
+            return None, format_error_message("HuggingFace API Error", error_msg)
+
+    except Exception as e:
+        error_msg = str(e)
+        print(f"❌ Video unexpected error: {error_msg}")
+        if token_id:
+            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
+        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")
+
+
+def handle_video_generation(prompt_val, model_val, provider_val, steps_val, guidance_val, seed_val, hf_token: gr.OAuthToken = None):
+    """
+    Handle text-to-video generation request with validation and org access.
+    """
+    if not prompt_val or not prompt_val.strip():
+        return None, format_error_message("Validation Error", "Please enter a prompt for video generation")
+
+    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
+    is_allowed, access_msg, _username, _matched = check_org_access(access_token)
+    if not is_allowed:
+        return None, format_access_denied_message(access_msg)
+
+    return generate_video(
+        prompt=prompt_val.strip(),
+        model_name=model_val,
+        provider=provider_val,
+        num_inference_steps=steps_val if steps_val is not None else None,
+        guidance_scale=guidance_val if guidance_val is not None else None,
+        seed=seed_val if seed_val is not None else None,
+    )
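
The timeout handling in `generate_video` is the standard `concurrent.futures` pattern: submit the blocking call, wait with `future.result(timeout=...)`, and translate the timeout into a user-facing error. One caveat: `future.cancel()` cannot stop a future that is already running, so the worker thread keeps executing until the inference call returns. A standalone sketch of the same pattern, with `time.sleep` standing in for the inference call:

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError

def slow_task(seconds: float) -> str:
    time.sleep(seconds)  # stand-in for client.text_to_video(...)
    return "done"

executor = ThreadPoolExecutor(max_workers=1)
future = executor.submit(slow_task, 5.0)
try:
    print(future.result(timeout=1.0))
except FutureTimeoutError:
    # cancel() is a no-op for a running future; it only prevents
    # queued work from starting. The caller is unblocked regardless.
    future.cancel()
    print("timed out; worker still running in the background")
finally:
    # wait=False returns immediately instead of joining the worker.
    executor.shutdown(wait=False)
```

Worth noting as a design trade-off: because `generate_video` uses `with ThreadPoolExecutor(...)`, exiting the block implicitly calls `shutdown(wait=True)`, so a timed-out request still waits for the underlying inference call to finish before the error is returned; the timeout bounds how long a success is awaited, not how fast the thread is reclaimed.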