chingshuai committed
Commit f6152b4 · 1 Parent(s): 848f72a
gradio_app.py CHANGED
@@ -1,4 +1,3 @@
- # we should use gradio==5.38.2
  import argparse
  import codecs as cs
  import json
@@ -10,47 +9,33 @@ import textwrap
  from typing import List, Optional, Tuple, Union

  import gradio as gr
- import torch
- from huggingface_hub import snapshot_download
-
- def try_to_download_model():
-     repo_id = "tencent/HY-Motion-1.0"
-     target_folder = "HY-Motion-1.0-Lite"
-     print(f">>> start download ", repo_id, target_folder)
-     local_dir = snapshot_download(
-         repo_id=repo_id,
-         allow_patterns=f"{target_folder}/*",
-         local_dir="./downloaded_models"
-     )
-     final_model_path = os.path.join(local_dir, target_folder)
-     print(f">>> Final model path: {final_model_path}")
-     return final_model_path
-
-
+ from hymotion.utils.gradio_runtime import ModelInference
+ from hymotion.utils.gradio_utils import try_to_download_model, try_to_download_text_encoder
+ from hymotion.utils.gradio_css import get_placeholder_html, APP_CSS, HEADER_BASE_MD, FOOTER_MD
  # Import spaces for Hugging Face Zero GPU support
- try:
-     import spaces
-     SPACES_AVAILABLE = True
- except ImportError:
-     SPACES_AVAILABLE = False
-     # Create a dummy decorator when spaces is not available
-     class spaces:
-         @staticmethod
-         def GPU(func=None, duration=None):
-             def decorator(fn):
-                 return fn
-             if func is not None:
-                 return func
-             return decorator
+ import spaces

- from hymotion.utils.t2m_runtime import T2MRuntime
+ # Apply @spaces.GPU decorator to run_inference method
+ # This ensures GPU operations happen in isolated subprocess
+ # Model loading and inference will occur in GPU subprocess, not main process
+ original_run_inference = ModelInference.run_inference

- NUM_WORKERS = torch.cuda.device_count() if torch.cuda.is_available() else 1
+ @spaces.GPU(duration=120)  # Request GPU for up to 120 seconds per inference
+ def gpu_run_inference(self, *args, **kwargs):
+     """
+     GPU-accelerated inference with Spaces decorator.

- # Global runtime instance for Zero GPU lazy loading
- _global_runtime = None
- _global_args = None
+     This function runs in a GPU subprocess where:
+     - Model is loaded and moved to GPU (safe)
+     - CUDA operations are allowed
+     - All CUDA tensors are moved to CPU before return (for pickle safety)
+     """
+     return original_run_inference(self, *args, **kwargs)
+
+ # Replace the original method with the GPU-decorated version
+ ModelInference.run_inference = gpu_run_inference
+
+ from hymotion.utils.t2m_runtime import T2MRuntime

  def _init_runtime_if_needed():
      """Initialize runtime lazily for Zero GPU support."""
@@ -81,267 +66,18 @@ def _init_runtime_if_needed():
          ckpt_name=ckpt,
          skip_text=skip_text,
          device_ids=None,
-         prompt_engineering_host=args.prompt_engineering_host,
          skip_model_loading=skip_model_loading,
+         disable_prompt_engineering=args.disable_prompt_engineering,
+         prompt_engineering_host=args.prompt_engineering_host,
+         prompt_engineering_model_path=args.prompt_engineering_model_path,
      )
      return _global_runtime

-
- @spaces.GPU(duration=120)
- def generate_motion_on_gpu(
-     text: str,
-     seeds_csv: str,
-     motion_duration: float,
-     cfg_scale: float,
-     output_format: str,
-     original_text: str,
-     output_dir: str,
- ) -> Tuple[str, List[str]]:
-     """
-     GPU-decorated function for motion generation.
-     This function will request GPU allocation on Hugging Face Zero GPU.
-     """
-     runtime = _init_runtime_if_needed()
-
-     html_content, fbx_files, _ = runtime.generate_motion(
-         text=text,
-         seeds_csv=seeds_csv,
-         duration=motion_duration,
-         cfg_scale=cfg_scale,
-         output_format=output_format,
-         original_text=original_text,
-         output_dir=output_dir,
-     )
-     return html_content, fbx_files
-
-
  # define data sources
  DATA_SOURCES = {
      "example_prompts": "examples/example_prompts/example_subset.json",
  }

- # create interface
- APP_CSS = """
- :root{
-     --primary-start:#667eea; --primary-end:#764ba2;
-     --secondary-start:#4facfe; --secondary-end:#00f2fe;
-     --accent-start:#f093fb; --accent-end:#f5576c;
-     --page-bg:linear-gradient(135deg,#f5f7fa 0%,#c3cfe2 100%);
-     --card-bg:linear-gradient(135deg,#ffffff 0%,#f8f9fa 100%);
-     --radius:12px;
-     --iframe-bg:#ffffff;
- }
-
- /* Dark mode variables */
- [data-theme="dark"], .dark {
-     --page-bg:linear-gradient(135deg,#1a1a1a 0%,#2d3748 100%);
-     --card-bg:linear-gradient(135deg,#2d3748 0%,#374151 100%);
-     --text-primary:#f7fafc;
-     --text-secondary:#e2e8f0;
-     --border-color:#4a5568;
-     --input-bg:#374151;
-     --input-border:#4a5568;
-     --iframe-bg:#1a1a2e;
- }
-
- /* Page and card */
- .gradio-container{
-     background:var(--page-bg) !important;
-     min-height:100vh !important;
-     color:var(--text-primary, #333) !important;
- }
-
- .main-header{
-     background:transparent !important; border:none !important; box-shadow:none !important;
-     padding:0 !important; margin:10px 0 16px !important;
-     text-align:center !important;
- }
-
- .main-header h1, .main-header p, .main-header li {
-     color:var(--text-primary, #333) !important;
- }
-
- .left-panel,.right-panel{
-     background:var(--card-bg) !important;
-     border:1px solid var(--border-color, #e9ecef) !important;
-     border-radius:15px !important;
-     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
-     padding:24px !important;
- }
-
- .gradio-accordion{
-     border:1px solid var(--border-color, #e1e5e9) !important;
-     border-radius:var(--radius) !important;
-     margin:12px 0 !important; background:transparent !important;
- }
-
- .gradio-accordion summary{
-     background:transparent !important;
-     padding:14px 18px !important;
-     font-weight:600 !important;
-     color:var(--text-primary, #495057) !important;
- }
-
- .gradio-group{
-     background:transparent !important; border:none !important;
-     border-radius:8px !important; padding:12px 0 !important; margin:8px 0 !important;
- }
-
- /* Input class style - dark mode adaptation */
- .gradio-textbox input,.gradio-textbox textarea,.gradio-dropdown .wrap{
-     border-radius:8px !important;
-     border:2px solid var(--input-border, #e9ecef) !important;
-     background:var(--input-bg, #fff) !important;
-     color:var(--text-primary, #333) !important;
-     transition:.2s all !important;
- }
-
- .gradio-textbox input:focus,.gradio-textbox textarea:focus,.gradio-dropdown .wrap:focus-within{
-     border-color:var(--primary-start) !important;
-     box-shadow:0 0 0 3px rgba(102,126,234,.1) !important;
- }
-
- .gradio-slider input[type="range"]{
-     background:linear-gradient(to right,var(--primary-start),var(--primary-end)) !important;
-     border-radius:10px !important;
- }
-
- .gradio-checkbox input[type="checkbox"]{
-     border-radius:4px !important;
-     border:2px solid var(--input-border, #e9ecef) !important;
-     transition:.2s all !important;
- }
-
- .gradio-checkbox input[type="checkbox"]:checked{
-     background:linear-gradient(45deg,var(--primary-start),var(--primary-end)) !important;
-     border-color:var(--primary-start) !important;
- }
-
- /* Label text color adaptation */
- .gradio-textbox label, .gradio-dropdown label, .gradio-slider label,
- .gradio-checkbox label, .gradio-html label {
-     color:var(--text-primary, #333) !important;
- }
-
- .gradio-textbox .info, .gradio-dropdown .info, .gradio-slider .info,
- .gradio-checkbox .info {
-     color:var(--text-secondary, #666) !important;
- }
-
- /* Status information - dark mode adaptation */
- .gradio-textbox[data-testid*="状态信息"] input{
-     background:var(--input-bg, linear-gradient(135deg,#f8f9fa 0%,#e9ecef 100%)) !important;
-     border:2px solid var(--input-border, #dee2e6) !important;
-     color:var(--text-primary, #495057) !important;
-     font-weight:500 !important;
- }
-
- /* Button base class and variant */
- .generate-button,.rewrite-button,.dice-button{
-     border:none !important; color:#fff !important; font-weight:600 !important;
-     border-radius:8px !important; transition:.3s all !important;
-     box-shadow:0 4px 15px rgba(0,0,0,.12) !important;
- }
-
- .generate-button{ background:linear-gradient(45deg,var(--primary-start),var(--primary-end)) !important; }
- .rewrite-button{ background:linear-gradient(45deg,var(--secondary-start),var(--secondary-end)) !important; }
- .dice-button{
-     background:linear-gradient(45deg,var(--accent-start),var(--accent-end)) !important;
-     height:40px !important;
- }
-
- .generate-button:hover,.rewrite-button:hover{ transform:translateY(-2px) !important; }
- .dice-button:hover{
-     transform:scale(1.05) !important;
-     box-shadow:0 4px 12px rgba(240,147,251,.28) !important;
- }
-
- .dice-container{
-     display:flex !important;
-     align-items:flex-end !important;
-     justify-content:center !important;
- }
-
- /* Right panel clipping overflow, avoid double scrollbars */
- .right-panel{
-     background:var(--card-bg) !important;
-     border:1px solid var(--border-color, #e9ecef) !important;
-     border-radius:15px !important;
-     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
-     padding:24px !important; overflow:hidden !important;
- }
-
- /* Main content row - ensure equal heights */
- .main-row {
-     display: flex !important;
-     align-items: stretch !important;
- }
-
- /* Flask area - match left panel height */
- .flask-display{
-     padding:0 !important; margin:0 !important; border:none !important;
-     box-shadow:none !important; background:var(--iframe-bg) !important;
-     border-radius:10px !important; position:relative !important;
-     height:100% !important; min-height:750px !important;
-     display:flex !important; flex-direction:column !important;
- }
-
- .flask-display iframe{
-     width:100% !important; flex:1 !important; min-height:750px !important;
-     border:none !important; border-radius:10px !important; display:block !important;
-     background:var(--iframe-bg) !important;
- }
-
- /* Right panel should stretch to match left panel */
- .right-panel{
-     background:var(--card-bg) !important;
-     border:1px solid var(--border-color, #e9ecef) !important;
-     border-radius:15px !important;
-     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
-     padding:24px !important; overflow:hidden !important;
-     display:flex !important; flex-direction:column !important;
- }
-
- /* Ensure dropdown menu is visible in dark mode */
- [data-theme="dark"] .gradio-dropdown .wrap,
- .dark .gradio-dropdown .wrap {
-     background:var(--input-bg) !important;
-     color:var(--text-primary) !important;
- }
-
- [data-theme="dark"] .gradio-dropdown .option,
- .dark .gradio-dropdown .option {
-     background:var(--input-bg) !important;
-     color:var(--text-primary) !important;
- }
-
- [data-theme="dark"] .gradio-dropdown .option:hover,
- .dark .gradio-dropdown .option:hover {
-     background:var(--border-color) !important;
- }
-
- .footer{
-     text-align:center !important;
-     margin-top:20px !important;
-     padding:10px !important;
-     color:var(--text-secondary, #666) !important;
- }
- """
-
- HEADER_BASE_MD = "# HY-Motion-1.0: Text-to-Motion Playground"
-
- FOOTER_MD = "*This is a Beta version, any issues or feedback are welcome!*"
-
- HTML_OUTPUT_PLACEHOLDER = """
- <div style='height: 750px; width: 100%; border-radius: 8px; border-color: #e5e7eb; border-style: solid; border-width: 1px; display: flex; justify-content: center; align-items: center;'>
-     <div style='text-align: center; font-size: 16px; color: #6b7280;'>
-         <p style="color: #8d8d8d;">Welcome to HY-Motion-1.0!</p>
-         <p style="color: #8d8d8d;">No motion visualization here yet.</p>
-     </div>
- </div>
- """
-
-
  def load_examples_from_txt(txt_path: str, example_record_fps=20, max_duration=12):
      """Load examples from txt file."""

@@ -393,20 +129,12 @@ def load_examples_from_txt(txt_path: str, example_record_fps=20, max_duration=12


  class T2MGradioUI:
-     def __init__(self, runtime: T2MRuntime, args: argparse.Namespace):
-         self.runtime = runtime
-         self.args = args
-
-         # Check if rewrite is available:
-         # - prompt_engineering_host must be provided
-         # - disable_rewrite must not be set
-         print(f">>> args: {vars(args)}")
-         self.rewrite_available = (
-             args.prompt_engineering_host is not None
-             and args.prompt_engineering_host.strip() != ""
-             and not args.disable_rewrite
-         )
-
+     def __init__(self, args):
+         self.output_dir = args.output_dir
+         print(f"[{self.__class__.__name__}] output_dir: {self.output_dir}")
+         self.model_inference = ModelInference(args.model_path, use_prompt_engineering=args.use_prompt_engineering, use_text_encoder=args.use_text_encoder)
+         # self.args = args
+         self.prompt_engineering_available = args.use_prompt_engineering
          self.all_example_data = {}
          self._init_example_data()

@@ -449,7 +177,7 @@ class T2MGradioUI:
          else:
              print(f"\t>>> Using LLM to estimate duration/rewrite text...")
              try:
-                 predicted_duration, rewritten_text = self.runtime.rewrite_text_and_infer_time(text=text)
+                 predicted_duration, rewritten_text = self.model_inference.rewrite_text_and_infer_time(text=text)
              except Exception as e:
                  print(f"\t>>> Text rewriting/duration prediction failed: {e}")
                  return (
@@ -473,7 +201,7 @@
          cfg_scale: float,
      ) -> Tuple[str, List[str]]:
          # When rewrite is not available, use original_text directly
-         if not self.rewrite_available:
+         if not self.prompt_engineering_available:
              text_to_use = original_text.strip()
              if not text_to_use:
                  return "Error: Input text is empty, please enter text first", []
@@ -484,31 +212,30 @@

          try:
              # Use runtime from global if available (for Zero GPU), otherwise use self.runtime
-             runtime = _global_runtime if _global_runtime is not None else self.runtime
-             fbx_ok = getattr(runtime, "fbx_available", False)
+             fbx_ok = getattr(self.model_inference, "fbx_available", False)
              req_format = "fbx" if fbx_ok else "dict"

              # Use GPU-decorated function for Zero GPU support
-             html_content, fbx_files = generate_motion_on_gpu(
+             html_content, fbx_files = self.model_inference.run_inference(
                  text=text_to_use,
                  seeds_csv=seed_input,
-                 motion_duration=duration,
+                 duration=duration,
                  cfg_scale=cfg_scale,
                  output_format=req_format,
                  original_text=original_text,
-                 output_dir=self.args.output_dir,
+                 output_dir=self.output_dir
              )
              # Escape HTML content for srcdoc attribute
-             escaped_html = html_content.replace('"', '&quot;')
+             escaped_html = html_content.replace('"', "&quot;")
              # Return iframe with srcdoc - directly embed HTML content
-             iframe_html = f'''
+             iframe_html = f"""
              <iframe
                  srcdoc="{escaped_html}"
                  width="100%"
                  height="750px"
                  style="border: none; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1);"
              ></iframe>
-             '''
+             """
              return iframe_html, fbx_files
          except Exception as e:
              print(f"\t>>> Motion generation failed: {e}")
@@ -549,9 +276,14 @@
          # Left control panel
          with gr.Column(scale=2, elem_classes=["left-panel"]):
              # Input textbox
+             if self.prompt_engineering_available:
+                 input_place_holder = "Enter text to generate motion, support Chinese and English text input."
+             else:
+                 input_place_holder = "Enter text to generate motion, please use `A person ...` format to describe the motion"
+
              self.text_input = gr.Textbox(
                  label="📝 Input Text",
-                 placeholder="Enter text to generate motion, support Chinese and English text input.",
+                 placeholder=input_place_holder,
              )
              # Rewritten textbox
              self.rewritten_text = gr.Textbox(
@@ -572,7 +304,7 @@

              # Execute buttons
              with gr.Row():
-                 if self.rewrite_available:
+                 if self.prompt_engineering_available:
                      self.rewrite_btn = gr.Button(
                          "🔄 Rewrite Text",
                          variant="secondary",
@@ -595,17 +327,14 @@
                  variant="primary",
                  size="lg",
                  elem_classes=["generate-button"],
-                 interactive=not self.rewrite_available,  # Enable directly if rewrite not available
+                 interactive=not self.prompt_engineering_available,  # Enable directly if rewrite not available
              )

-             if not self.rewrite_available:
+             if not self.prompt_engineering_available:
                  gr.Markdown(
                      "> ⚠️ **Prompt engineering is not available.** Text rewriting and duration estimation are disabled. Your input text and duration will be used directly."
                  )

-             # Advanced settings
-             with gr.Accordion("🔧 Advanced Settings", open=False):
-                 self._build_advanced_settings()

              # Example selection dropdown
              self.example_dropdown = gr.Dropdown(
@@ -616,8 +345,12 @@
                  interactive=True,
              )

+             # Advanced settings
+             with gr.Accordion("🔧 Advanced Settings", open=False):
+                 self._build_advanced_settings()
+
              # Status message depends on whether rewrite is available
-             if self.rewrite_available:
+             if self.prompt_engineering_available:
                  status_msg = "Please click the [🔄 Rewrite Text] button to rewrite the text first"
              else:
                  status_msg = "Enter your text and click [🚀 Generate Motion] directly."
@@ -629,7 +362,7 @@

              # FBX Download section
              with gr.Row(visible=False) as self.fbx_download_row:
-                 if getattr(self.runtime, "fbx_available", False):
+                 if getattr(self.model_inference, "fbx_available", False):
                      self.fbx_files = gr.File(
                          label="📦 Download FBX Files",
                          file_count="multiple",
@@ -641,7 +374,7 @@
          # Right display area
          with gr.Column(scale=3):
              self.output_display = gr.HTML(
-                 value=HTML_OUTPUT_PLACEHOLDER,
+                 value=get_placeholder_html(),
                  show_label=False,
                  elem_classes=["flask-display"]
              )
@@ -655,7 +388,7 @@

      def _build_advanced_settings(self):
          # Only show rewrite options if rewrite is available
-         if self.rewrite_available:
+         if self.prompt_engineering_available:
              with gr.Group():
                  gr.Markdown("### 🔄 Text Rewriting Options")
                  with gr.Row():
@@ -730,7 +463,7 @@
          )

          # Rewrite text logic (only bind when rewrite is available)
-         if self.rewrite_available:
+         if self.prompt_engineering_available:
              self.rewrite_btn.click(fn=lambda: "Rewriting text, please wait...", outputs=[self.status_output]).then(
                  self._prompt_engineering,
                  inputs=[
@@ -750,7 +483,7 @@

          # Generate motion logic
          self.generate_btn.click(
-             fn=lambda: "Generating motion, please wait... (It takes some extra time to start the renderer for the first generation)",
+             fn=lambda: "Generating motion, please wait... (It takes some extra time for the first generation)",
              outputs=[self.status_output],
          ).then(
              self._generate_motion,
@@ -761,8 +494,7 @@
                  self.duration_slider,
                  self.cfg_slider,
              ],
-             outputs=[self.output_display, self.fbx_files],
-             concurrency_limit=NUM_WORKERS,
+             outputs=[self.output_display, self.fbx_files]
          ).then(
              fn=lambda fbx_list: (
                  (
@@ -777,7 +509,7 @@
          )

          # Reset logic - different behavior based on rewrite availability
-         if self.rewrite_available:
+         if self.prompt_engineering_available:
              self.text_input.change(
                  fn=lambda: (
                      gr.update(visible=False),
@@ -802,7 +534,7 @@
              outputs=[self.rewritten_text, self.generate_btn, self.status_output],
          )
          # Only bind rewritten_text change when rewrite is available
-         if self.rewrite_available:
+         if self.prompt_engineering_available:
              self.rewritten_text.change(
                  fn=lambda text: (
                      gr.update(interactive=bool(text.strip())),
@@ -819,16 +551,17 @@

  def create_demo(final_model_path):
      """Create the Gradio demo with Zero GPU support."""
-     global _global_runtime, _global_args

      class Args:
          model_path = final_model_path
          output_dir = "output/gradio"
+         use_prompt_engineering = False
+         use_text_encoder = True
          prompt_engineering_host = os.environ.get("PROMPT_HOST", None)
-         disable_rewrite = False
+         prompt_engineering_model_path = os.environ.get("PROMPT_MODEL_PATH", None)
+         disable_prompt_engineering = os.environ.get("DISABLE_PROMPT_ENGINEERING", False)

      args = Args()
-     _global_args = args  # Set global args for lazy loading

      # Check required files:
      cfg = osp.join(args.model_path, "config.yml")
@@ -841,55 +574,19 @@ def create_demo(final_model_path):

      # For Zero GPU: Don't load model at startup, use lazy loading
      # Create a minimal runtime for UI initialization (without model loading)
-     if SPACES_AVAILABLE:
-         print(">>> Hugging Face Spaces detected. Using Zero GPU lazy loading.")
-         print(">>> Model will be loaded on first GPU request.")
-
-         # Create a placeholder runtime with minimal initialization for UI
-         class PlaceholderRuntime:
-             def __init__(self):
-                 self.fbx_available = False
-                 self.prompt_engineering_host = args.prompt_engineering_host
-
-             def rewrite_text_and_infer_time(self, text: str):
-                 # For prompt rewriting, we don't need GPU
-                 from hymotion.prompt_engineering.prompt_rewrite import PromptRewriter
-                 rewriter = PromptRewriter(host=self.prompt_engineering_host)
-                 return rewriter.rewrite_prompt_and_infer_time(text)
-
-         runtime = PlaceholderRuntime()
-     else:
-         # Local development: load model immediately
-         print(">>> Local environment detected. Loading model at startup.")
-         skip_model_loading = False
-         if not os.path.exists(ckpt):
-             print(f">>> [WARNING] Checkpoint file not found: {ckpt}")
-             print(f">>> [WARNING] Model loading will be skipped. Motion generation will not be available.")
-             skip_model_loading = True
-
-         print(">>> Initializing T2MRuntime...")
-         if "USE_HF_MODELS" not in os.environ:
-             os.environ["USE_HF_MODELS"] = "1"
-
-         skip_text = False
-         runtime = T2MRuntime(
-             config_path=cfg,
-             ckpt_name=ckpt,
-             skip_text=skip_text,
-             device_ids=None,
-             prompt_engineering_host=args.prompt_engineering_host,
-             skip_model_loading=skip_model_loading,
-         )
-         _global_runtime = runtime  # Set global runtime for GPU function
-
-     ui = T2MGradioUI(runtime=runtime, args=args)
+     ui = T2MGradioUI(args=args)
      demo = ui.build_ui()
      return demo


  # Create demo at module level for Hugging Face Spaces
- final_model_path = try_to_download_model()
- demo = create_demo(final_model_path)
+ # Pre-download text encoder models first (without loading)
+

  if __name__ == "__main__":
+     # Create demo at module level for Hugging Face Spaces
+     try_to_download_text_encoder()
+     # Then download the main model
+     final_model_path = try_to_download_model()
+     demo = create_demo(final_model_path)
      demo.launch()
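Note on the pattern above: instead of a standalone `@spaces.GPU` function, the commit re-binds `ModelInference.run_inference` to a decorated wrapper so every call runs in a Zero GPU subprocess. Below is a minimal, self-contained sketch of that method-patching pattern (not part of the commit): the `Worker` class is a hypothetical stand-in for `ModelInference`, and a no-op decorator is substituted when `spaces` is unavailable locally.

```python
# Sketch only: `Worker.run` stands in for ModelInference.run_inference.
try:
    import spaces
    gpu = spaces.GPU(duration=120)  # decorator factory, as used in the diff
except ImportError:
    def gpu(fn):  # local fallback: identity decorator
        return fn

class Worker:
    def run(self, x):
        return x * 2  # placeholder for the real GPU-bound work

_original_run = Worker.run  # keep a reference to the undecorated method

@gpu
def _gpu_run(self, *args, **kwargs):
    # On Spaces this body executes in an isolated GPU subprocess;
    # locally it is a plain function call.
    return _original_run(self, *args, **kwargs)

Worker.run = _gpu_run  # re-bind: all instances now go through the wrapper

if __name__ == "__main__":
    print(Worker().run(21))  # -> 42
```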
hymotion/prompt_engineering/prompt_rewrite.py CHANGED
@@ -13,8 +13,10 @@ import uuid
  from dataclasses import dataclass
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union

+ import torch
  from openai import OpenAI
  from requests import exceptions as req_exc
+ from transformers import AutoModelForCausalLM, AutoTokenizer

  from .model_constants import REWRITE_AND_INFER_TIME_PROMPT_FORMAT

@@ -242,18 +244,39 @@ class ResponseParser:


  class PromptRewriter:
-     def __init__(self, host: Optional[str] = None, parser: Optional[ResponseParser] = None):
+     def __init__(
+         self, host: Optional[str] = None, model_path: Optional[str] = None, parser: Optional[ResponseParser] = None, device="auto"
+     ):
          self.parser = parser or ResponseParser()
          self.logger = logging.getLogger(__name__)
-         self.api = OpenAIChatApi(
-             ApiConfig(
-                 host=host,
-                 user="",
-                 apikey="EMPTY",
-                 model="Qwen3-30B-A3B-SFT",
-                 api_version="",
-             )
-         )
+         self.host = host
+         if host:
+             self.api = OpenAIChatApi(
+                 ApiConfig(
+                     host=host,
+                     user="",
+                     apikey="EMPTY",
+                     model="Qwen3-30B-A3B-SFT",
+                     api_version="",
+                 )
+             )
+         else:
+             self.model_path = model_path or "Text2MotionPrompter/Text2MotionPrompter"
+             self.tokenizer = None
+             self.model = None
+             self._load_model(device)
+
+     def _load_model(self, device="auto"):
+         if self.model is None:
+             print(f">>> Loading prompter model from {self.model_path}")
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 self.model_path,
+                 torch_dtype=torch.float16,
+                 device_map=device,
+                 load_in_4bit=True,
+             )
+             self.model.eval()

      def rewrite_prompt_and_infer_time(
          self,
@@ -261,17 +284,36 @@
          prompt_format: str = REWRITE_AND_INFER_TIME_PROMPT_FORMAT,
          retry_config: Optional[RetryConfig] = None,
      ) -> Tuple[float, str]:
-         self.logger.info("Start rewriting prompt...")
-         try:
-             result, cost, elapsed = self.parser.call_data_eval_with_retry(
-                 self.api, prompt_format.format(text), retry_config
-             )
-             self.logger.info(f"Rewriting completed - cost: {cost:.6f}, time: {elapsed:.2f}s")
-             return round(float(result["duration"]) / 30.0, 2), result["short_caption"]
-
-         except Exception as e:
-             self.logger.error(f"Prompt rewriting failed: {e}")
-             raise
+         if self.host:
+             self.logger.info("Start rewriting prompt...")
+             try:
+                 result, cost, elapsed = self.parser.call_data_eval_with_retry(
+                     self.api, prompt_format.format(text), retry_config
+                 )
+                 self.logger.info(f"Rewriting completed - cost: {cost:.6f}, time: {elapsed:.2f}s")
+                 return round(float(result["duration"]) / 30.0, 2), result["short_caption"]
+
+             except Exception as e:
+                 self.logger.error(f"Prompt rewriting failed: {e}")
+                 raise
+         else:
+             messages = [{"role": "user", "content": prompt_format.format(text)}]
+             full_prompt = self.tokenizer.apply_chat_template(
+                 messages,
+                 tokenize=False,
+                 add_generation_prompt=True,
+             )
+             inputs = self.tokenizer([full_prompt], return_tensors="pt").to(self.model.device)
+             with torch.no_grad():
+                 outputs = self.model.generate(**inputs, max_new_tokens=8192)
+             response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:].tolist(), skip_special_tokens=True)

+             try:
+                 json_str = re.search(r"\{.*\}", response, re.DOTALL).group()
+                 result = json.loads(json_str)
+                 return round(float(result["duration"]) / 30.0, 2), result["short_caption"]
+             except:
+                 return 5.0, text


  if __name__ == "__main__":
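The reworked `PromptRewriter` now has two code paths: with `host` set it keeps the original OpenAI-compatible API client, and with `host=None` it loads a local causal LM (`Text2MotionPrompter/Text2MotionPrompter` by default, fp16 with `load_in_4bit=True`, which requires `bitsandbytes` and a GPU) and parses a JSON object out of the generation. A hedged usage sketch (not from the commit; the endpoint URL is hypothetical):

```python
from hymotion.prompt_engineering.prompt_rewrite import PromptRewriter

# Remote path: rewrite via an OpenAI-compatible server (hypothetical URL).
remote = PromptRewriter(host="http://localhost:8000/v1")

# Local path: host=None triggers _load_model() with the default checkpoint.
local = PromptRewriter(host=None, model_path=None, device="auto")

# Returns (duration_seconds, rewritten_caption); duration is frames / 30.0,
# with a (5.0, original_text) fallback if the local model emits no valid JSON.
duration_s, caption = local.rewrite_prompt_and_infer_time("A person waves with the right hand.")
print(duration_s, caption)
```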
hymotion/utils/gradio_css.py ADDED
@@ -0,0 +1,250 @@
+ import os.path as osp
+
+ # create interface
+ APP_CSS = """
+ :root{
+     --primary-start:#667eea; --primary-end:#764ba2;
+     --secondary-start:#4facfe; --secondary-end:#00f2fe;
+     --accent-start:#f093fb; --accent-end:#f5576c;
+     --page-bg:linear-gradient(135deg,#f5f7fa 0%,#c3cfe2 100%);
+     --card-bg:linear-gradient(135deg,#ffffff 0%,#f8f9fa 100%);
+     --radius:12px;
+     --iframe-bg:#ffffff;
+ }
+
+ /* Dark mode variables */
+ [data-theme="dark"], .dark {
+     --page-bg:linear-gradient(135deg,#1a1a1a 0%,#2d3748 100%);
+     --card-bg:linear-gradient(135deg,#2d3748 0%,#374151 100%);
+     --text-primary:#f7fafc;
+     --text-secondary:#e2e8f0;
+     --border-color:#4a5568;
+     --input-bg:#374151;
+     --input-border:#4a5568;
+     --iframe-bg:#1a1a2e;
+ }
+
+ /* Page and card */
+ .gradio-container{
+     background:var(--page-bg) !important;
+     min-height:100vh !important;
+     color:var(--text-primary, #333) !important;
+ }
+
+ .main-header{
+     background:transparent !important; border:none !important; box-shadow:none !important;
+     padding:0 !important; margin:10px 0 16px !important;
+     text-align:center !important;
+ }
+
+ .main-header h1, .main-header p, .main-header li {
+     color:var(--text-primary, #333) !important;
+ }
+
+ .left-panel,.right-panel{
+     background:var(--card-bg) !important;
+     border:1px solid var(--border-color, #e9ecef) !important;
+     border-radius:15px !important;
+     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
+     padding:24px !important;
+ }
+
+ .gradio-accordion{
+     border:1px solid var(--border-color, #e1e5e9) !important;
+     border-radius:var(--radius) !important;
+     margin:12px 0 !important; background:transparent !important;
+ }
+
+ .gradio-accordion summary{
+     background:transparent !important;
+     padding:14px 18px !important;
+     font-weight:600 !important;
+     color:var(--text-primary, #495057) !important;
+ }
+
+ .gradio-group{
+     background:transparent !important; border:none !important;
+     border-radius:8px !important; padding:12px 0 !important; margin:8px 0 !important;
+ }
+
+ /* Input class style - dark mode adaptation */
+ .gradio-textbox input,.gradio-textbox textarea,.gradio-dropdown .wrap{
+     border-radius:8px !important;
+     border:2px solid var(--input-border, #e9ecef) !important;
+     background:var(--input-bg, #fff) !important;
+     color:var(--text-primary, #333) !important;
+     transition:.2s all !important;
+ }
+
+ .gradio-textbox input:focus,.gradio-textbox textarea:focus,.gradio-dropdown .wrap:focus-within{
+     border-color:var(--primary-start) !important;
+     box-shadow:0 0 0 3px rgba(102,126,234,.1) !important;
+ }
+
+ .gradio-slider input[type="range"]{
+     background:linear-gradient(to right,var(--primary-start),var(--primary-end)) !important;
+     border-radius:10px !important;
+ }
+
+ .gradio-checkbox input[type="checkbox"]{
+     border-radius:4px !important;
+     border:2px solid var(--input-border, #e9ecef) !important;
+     transition:.2s all !important;
+ }
+
+ .gradio-checkbox input[type="checkbox"]:checked{
+     background:linear-gradient(45deg,var(--primary-start),var(--primary-end)) !important;
+     border-color:var(--primary-start) !important;
+ }
+
+ /* Label text color adaptation */
+ .gradio-textbox label, .gradio-dropdown label, .gradio-slider label,
+ .gradio-checkbox label, .gradio-html label {
+     color:var(--text-primary, #333) !important;
+ }
+
+ .gradio-textbox .info, .gradio-dropdown .info, .gradio-slider .info,
+ .gradio-checkbox .info {
+     color:var(--text-secondary, #666) !important;
+ }
+
+ /* Status information - dark mode adaptation */
+ .gradio-textbox[data-testid*="状态信息"] input{
+     background:var(--input-bg, linear-gradient(135deg,#f8f9fa 0%,#e9ecef 100%)) !important;
+     border:2px solid var(--input-border, #dee2e6) !important;
+     color:var(--text-primary, #495057) !important;
+     font-weight:500 !important;
+ }
+
+ /* Button base class and variant */
+ .generate-button,.rewrite-button,.dice-button{
+     border:none !important; color:#fff !important; font-weight:600 !important;
+     border-radius:8px !important; transition:.3s all !important;
+     box-shadow:0 4px 15px rgba(0,0,0,.12) !important;
+ }
+
+ .generate-button{ background:linear-gradient(45deg,var(--primary-start),var(--primary-end)) !important; }
+ .rewrite-button{ background:linear-gradient(45deg,var(--secondary-start),var(--secondary-end)) !important; }
+ .dice-button{
+     background:linear-gradient(45deg,var(--accent-start),var(--accent-end)) !important;
+     height:40px !important;
+ }
+
+ .generate-button:hover,.rewrite-button:hover{ transform:translateY(-2px) !important; }
+ .dice-button:hover{
+     transform:scale(1.05) !important;
+     box-shadow:0 4px 12px rgba(240,147,251,.28) !important;
+ }
+
+ .dice-container{
+     display:flex !important;
+     align-items:flex-end !important;
+     justify-content:center !important;
+ }
+
+ /* Right panel clipping overflow, avoid double scrollbars */
+ .right-panel{
+     background:var(--card-bg) !important;
+     border:1px solid var(--border-color, #e9ecef) !important;
+     border-radius:15px !important;
+     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
+     padding:24px !important; overflow:hidden !important;
+ }
+
+ /* Main content row - ensure equal heights */
+ .main-row {
+     display: flex !important;
+     align-items: stretch !important;
+ }
+
+ /* Flask area - match left panel height */
+ .flask-display{
+     padding:0 !important; margin:0 !important; border:none !important;
+     box-shadow:none !important; background:var(--iframe-bg) !important;
+     border-radius:10px !important; position:relative !important;
+     height:100% !important; min-height:750px !important;
+     display:flex !important; flex-direction:column !important;
+ }
+
+ .flask-display iframe{
+     width:100% !important; flex:1 !important; min-height:750px !important;
+     border:none !important; border-radius:10px !important; display:block !important;
+     background:var(--iframe-bg) !important;
+ }
+
+ /* Right panel should stretch to match left panel */
+ .right-panel{
+     background:var(--card-bg) !important;
+     border:1px solid var(--border-color, #e9ecef) !important;
+     border-radius:15px !important;
+     box-shadow:0 4px 20px rgba(0,0,0,.08) !important;
+     padding:24px !important; overflow:hidden !important;
+     display:flex !important; flex-direction:column !important;
+ }
+
+ /* Ensure dropdown menu is visible in dark mode */
+ [data-theme="dark"] .gradio-dropdown .wrap,
+ .dark .gradio-dropdown .wrap {
+     background:var(--input-bg) !important;
+     color:var(--text-primary) !important;
+ }
+
+ [data-theme="dark"] .gradio-dropdown .option,
+ .dark .gradio-dropdown .option {
+     background:var(--input-bg) !important;
+     color:var(--text-primary) !important;
+ }
+
+ [data-theme="dark"] .gradio-dropdown .option:hover,
+ .dark .gradio-dropdown .option:hover {
+     background:var(--border-color) !important;
+ }
+
+ .footer{
+     text-align:center !important;
+     margin-top:20px !important;
+     padding:10px !important;
+     color:var(--text-secondary, #666) !important;
+ }
+ """
+
+ HEADER_BASE_MD = "# HY-Motion-1.0: Text-to-Motion Playground\n### *Tencent Hunyuan 3D Digital Human Team*"
+
+ FOOTER_MD = "*This is a Beta version, any issues or feedback are welcome!*"
+
+ # Path to placeholder scene HTML template
+ PLACEHOLDER_SCENE_TEMPLATE = osp.join(osp.dirname(__file__), "..", "..", "scripts/gradio/templates/placeholder_scene.html")
+
+
+ def get_placeholder_html() -> str:
+     """
+     Load the placeholder scene HTML and wrap it in an iframe for display.
+     Returns an iframe HTML string with the embedded placeholder scene.
+     """
+     try:
+         with open(PLACEHOLDER_SCENE_TEMPLATE, "r", encoding="utf-8") as f:
+             html_content = f.read()
+         # Escape HTML content for srcdoc attribute
+         escaped_html = html_content.replace('"', '&quot;')
+         iframe_html = f'''
+         <iframe
+             srcdoc="{escaped_html}"
+             width="100%"
+             height="750px"
+             style="border: none; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1);"
+         ></iframe>
+         '''
+         return iframe_html
+     except Exception as e:
+         print(f">>> Failed to load placeholder scene HTML: {e}")
+         # Fallback to simple placeholder
+         return """
+         <div style='height: 750px; width: 100%; border-radius: 8px; border-color: #e5e7eb; border-style: solid; border-width: 1px; display: flex; justify-content: center; align-items: center; background: #424242;'>
+             <div style='text-align: center; font-size: 16px; color: #a0aec0;'>
+                 <p>Welcome to HY-Motion-1.0!</p>
+                 <p>Enter a text description and generate motion to see the 3D visualization here.</p>
+             </div>
+         </div>
+         """
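`get_placeholder_html()` embeds a full HTML page into Gradio via `<iframe srcdoc="...">` and escapes only double quotes, which is sufficient because `srcdoc` is a double-quoted attribute here. A stricter variant (a sketch, not from the commit) would escape `&` first so literal entities in the template survive the round trip:

```python
# Hedged sketch of a stricter srcdoc escape than the one in get_placeholder_html().
def wrap_in_iframe(html: str, height_px: int = 750) -> str:
    # Escape '&' before '"' so existing entities are not corrupted.
    escaped = html.replace("&", "&amp;").replace('"', "&quot;")
    return (
        f'<iframe srcdoc="{escaped}" width="100%" height="{height_px}px" '
        f'style="border: none; border-radius: 12px;"></iframe>'
    )

print(wrap_in_iframe("<p>Hello &amp; welcome</p>"))
```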
hymotion/utils/gradio_runtime.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import threading
3
+ import time
4
+ import uuid
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import yaml
9
+
10
+ from ..prompt_engineering.prompt_rewrite import PromptRewriter
11
+ from .loaders import load_object
12
+ from .visualize_mesh_web import save_visualization_data, generate_static_html_content
13
+
14
+ try:
15
+ import fbx
16
+
17
+ FBX_AVAILABLE = True
18
+ print(">>> FBX module found.")
19
+ except ImportError:
20
+ FBX_AVAILABLE = False
21
+ print(">>> FBX module not found.")
22
+
23
+
24
+ def _now():
25
+ t = time.time()
26
+ ms = int((t - int(t)) * 1000)
27
+ return time.strftime("%Y%m%d_%H%M%S", time.localtime(t)) + f"{ms:03d}"
28
+
29
+ _MODEL_CACHE = None
30
+
31
+
32
+ class SimpleRuntime(torch.nn.Module):
33
+ def __init__(self, config_path, ckpt_name, load_prompt_engineering=False, load_text_encoder=False):
34
+ super().__init__()
35
+ self.load_prompt_engineering = load_prompt_engineering
36
+ self.load_text_encoder = load_text_encoder
37
+ # prompt engineering
38
+ if self.load_prompt_engineering:
39
+ print(f"[{self.__class__.__name__}] Loading prompt engineering...")
40
+ self.prompt_rewriter = PromptRewriter(
41
+ host=None, model_path=None, device="cpu"
42
+ )
43
+ else:
44
+ self.prompt_rewriter = None
45
+ # text encoder
46
+ if self.load_text_encoder:
47
+ print(f"[{self.__class__.__name__}] Loading text encoder...")
48
+ _text_encoder_module = "hymotion/network/text_encoders/text_encoder.HYTextModel"
49
+ _text_encoder_cfg = {
50
+ "llm_type": "qwen3",
51
+ "max_length_llm": 128
52
+ }
53
+ text_encoder = load_object(_text_encoder_module, _text_encoder_cfg)
54
+ else:
55
+ text_encoder = None
56
+ # 2. load model
57
+ print(f"[{self.__class__.__name__}] Loading model...")
58
+ with open(config_path, "r") as f:
59
+ config = yaml.load(f, Loader=yaml.FullLoader)
60
+ pipeline = load_object(
61
+ config["train_pipeline"],
62
+ config["train_pipeline_args"],
63
+ network_module=config["network_module"],
64
+ network_module_args=config["network_module_args"],
65
+ )
66
+ print(f"[{self.__class__.__name__}] Loading ckpt: {ckpt_name}")
67
+ pipeline.load_in_demo(
68
+ os.path.join(os.path.dirname(config_path), ckpt_name),
69
+ "stats",
70
+ build_text_encoder=False,
71
+ allow_empty_ckpt=False,
72
+ )
73
+ pipeline.text_encoder = text_encoder
74
+ self.pipeline = pipeline
75
+ #
76
+ self.fbx_available = FBX_AVAILABLE
77
+ if self.fbx_available:
78
+ try:
79
+ from .smplh2woodfbx import SMPLH2WoodFBX
80
+
81
+ self.fbx_converter = SMPLH2WoodFBX()
82
+ except Exception as e:
83
+ print(f">>> Failed to initialize FBX converter: {e}")
84
+ self.fbx_available = False
85
+ self.fbx_converter = None
86
+ else:
87
+ self.fbx_converter = None
88
+ print(">>> FBX module not found. FBX export will be disabled.")
89
+
90
+
91
+ def _generate_html_content(
92
+ self,
93
+ timestamp: str,
94
+ file_path: str,
95
+ output_dir: Optional[str] = None,
96
+ ) -> str:
97
+ """
98
+ Generate static HTML content with embedded data for iframe srcdoc.
99
+ All JavaScript code is embedded directly in the HTML, no external static resources needed.
100
+
101
+ Args:
102
+ timestamp: Timestamp string for logging
103
+ file_path: Base filename (without extension)
104
+ output_dir: Directory where NPZ/meta files are stored
105
+
106
+ Returns:
107
+ HTML content string (to be used in iframe srcdoc)
108
+ """
109
+ print(f">>> Generating static HTML content, timestamp: {timestamp}")
110
+ gradio_dir = output_dir if output_dir is not None else "output/gradio"
111
+
112
+ try:
113
+ # Generate static HTML content with embedded data (all JS is embedded in template)
114
+ html_content = generate_static_html_content(
115
+ folder_name=gradio_dir,
116
+ file_name=file_path,
117
+ hide_captions=False,
118
+ )
119
+
120
+ print(f">>> Static HTML content generated for: {file_path}")
121
+ return html_content
122
+
123
+ except Exception as e:
124
+ print(f">>> Failed to generate static HTML content: {e}")
125
+ import traceback
126
+
127
+ traceback.print_exc()
128
+ # Return error HTML
129
+ return f"<html><body><h1>Error generating visualization</h1><p>{str(e)}</p></body></html>"
130
+
131
+
132
+ def _generate_fbx_files(
133
+ self,
134
+ visualization_data: dict,
135
+ output_dir: Optional[str] = None,
136
+ fbx_filename: Optional[str] = None,
137
+ ) -> List[str]:
138
+ assert "smpl_data" in visualization_data, "smpl_data not found in visualization_data"
139
+ fbx_files = []
140
+ if output_dir is None:
141
+ root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
142
+ output_dir = os.path.join(root_dir, "output", "gradio")
143
+
144
+ smpl_data_list = visualization_data["smpl_data"]
145
+
146
+ unique_id = str(uuid.uuid4())[:8]
147
+ text = visualization_data["text"]
148
+ timestamp = visualization_data["timestamp"]
149
+ for bb in range(len(smpl_data_list)):
150
+ smpl_data = smpl_data_list[bb]
151
+ if fbx_filename is None:
152
+ fbx_filename_bb = f"{timestamp}_{unique_id}_{bb:03d}.fbx"
153
+ else:
154
+ fbx_filename_bb = f"{fbx_filename}_{bb:03d}.fbx"
155
+ fbx_path = os.path.join(output_dir, fbx_filename_bb)
156
+ success = self.fbx_converter.convert_npz_to_fbx(smpl_data, fbx_path)
157
+ if success:
158
+ fbx_files.append(fbx_path)
159
+ print(f"\t>>> FBX file generated: {fbx_path}")
160
+ txt_path = fbx_path.replace(".fbx", ".txt")
161
+ with open(txt_path, "w", encoding="utf-8") as f:
162
+ f.write(text)
163
+ fbx_files.append(txt_path)
164
+
165
+ return fbx_files
166
+
167
+ def generate_motion(
168
+ self,
169
+ text: str,
170
+ seeds_csv: str,
171
+ duration: float,
172
+ cfg_scale: float,
173
+ output_format: str = "fbx",
174
+ output_dir: Optional[str] = None,
175
+ output_filename: Optional[str] = None,
176
+ original_text: Optional[str] = None,
177
+ use_special_game_feat: bool = False,
178
+ ) -> Tuple[Union[str, list[str]], dict]:
179
+ seeds = [int(s.strip()) for s in seeds_csv.split(",") if s.strip() != ""]
180
+
181
+ print(f"[{self.__class__.__name__}] Generating motion...")
182
+ print(f"[{self.__class__.__name__}] text: {text}")
183
+ if self.load_prompt_engineering:
184
+ duration, rewritten_text = self.prompt_rewriter.rewrite_prompt_and_infer_time(f"{text}")
185
+ else:
186
+ rewritten_text = text
187
+ duration = duration
188
+
189
+ pipeline = self.pipeline
190
+ pipeline.eval()
191
+
192
+ # When skip_text=True (debug mode), use blank text features
193
+ if not self.load_text_encoder:
194
+ print(">>> [Debug Mode] Using blank text features (skip_text=True)")
195
+ device = next(pipeline.parameters()).device
196
+ batch_size = len(seeds) if seeds else 1
197
+ # Create blank hidden_state_dict using null features
198
+ hidden_state_dict = {
199
+ "text_vec_raw": pipeline.null_vtxt_feat.expand(batch_size, -1, -1).to(device),
200
+ "text_ctxt_raw": pipeline.null_ctxt_input.expand(batch_size, -1, -1).to(device),
201
+ "text_ctxt_raw_length": torch.tensor([1] * batch_size, device=device),
202
+ }
203
+ # Disable CFG in debug mode (use cfg_scale=1.0)
204
+ model_output = pipeline.generate(
205
+ rewritten_text,
206
+ seeds,
207
+ duration,
208
+ cfg_scale=1.0,
209
+ use_special_game_feat=False,
210
+ hidden_state_dict=hidden_state_dict,
211
+ )
212
+ else:
213
+ model_output = pipeline.generate(
214
+ rewritten_text, seeds, duration, cfg_scale=cfg_scale, use_special_game_feat=use_special_game_feat
215
+ )
216
+
217
+ ts = _now()
218
+ save_data, base_filename = save_visualization_data(
219
+ output=model_output,
220
+ text=text if original_text is None else original_text,
221
+ rewritten_text=rewritten_text,
222
+ timestamp=ts,
223
+ output_dir=output_dir,
224
+             output_filename=output_filename,
+         )
+
+         html_content = self._generate_html_content(
+             timestamp=ts,
+             file_path=base_filename,
+             output_dir=output_dir,
+         )
+
+         if output_format == "fbx" and not self.fbx_available:
+             print(">>> Warning: FBX export requested but the FBX SDK is not available. Falling back to dict format.")
+             output_format = "dict"
+
+         if output_format == "fbx" and self.fbx_available:
+             fbx_files = self._generate_fbx_files(
+                 visualization_data=save_data,
+                 output_dir=output_dir,
+                 fbx_filename=output_filename,
+             )
+             return html_content, fbx_files, model_output
+         elif output_format == "dict":
+             # Return the HTML content and an empty fbx_files list for the dict format
+             return html_content, [], model_output
+         else:
+             raise ValueError(f">>> Invalid output format: {output_format}")
+
+
+ class ModelInference:
+     """Handles model inference and data processing for HY-Motion.
+
+     Note: the model is deliberately not stored in an instance variable, to
+     avoid cross-process state issues with the @spaces.GPU decorator; it is
+     cached in a module-level global instead.
+     """
+
+     def __init__(self, model_path, use_prompt_engineering, use_text_encoder):
+         self.model_path = model_path
+         self.use_prompt_engineering = use_prompt_engineering
+         self.use_text_encoder = use_text_encoder
+         self.fbx_available = FBX_AVAILABLE
+
+     def initialize_model(self, device: str = "cuda"):
+         """Initialize the HY-Motion runtime using the global cache.
+
+         Optimization: load the model to CPU first, then move it to the GPU
+         when needed; this is faster than reloading from disk each time.
+         Using a global variable is safe because @spaces.GPU runs in an
+         isolated subprocess, each with its own global namespace.
+
+         Args:
+             device: Device to run inference on (the model is moved there).
+
+         Returns:
+             Model instance ready for inference on the specified device.
+         """
+         global _MODEL_CACHE
+
+         if _MODEL_CACHE is None:
+             # First load in this subprocess: build the runtime once, then
+             # keep it cached on CPU so later calls can reuse it.
+             _MODEL_CACHE = SimpleRuntime(
+                 config_path=os.path.join(self.model_path, "config.yml"),
+                 ckpt_name="latest.ckpt",
+                 load_prompt_engineering=self.use_prompt_engineering,
+                 load_text_encoder=self.use_text_encoder,
+             )
+             _MODEL_CACHE = _MODEL_CACHE.to("cpu")
+             _MODEL_CACHE.eval()
+             print("✅ Model loaded to CPU memory (cached in subprocess)")
+
+         # Move to the target device for inference
+         if device != "cpu" and next(_MODEL_CACHE.parameters()).device.type != device:
+             print(f"🚀 Moving model from {next(_MODEL_CACHE.parameters()).device} to {device}...")
+             _MODEL_CACHE = _MODEL_CACHE.to(device)
+             print(f"✅ Model ready on {device}")
+
+         return _MODEL_CACHE
+
+     def run_inference(self, *args, **kwargs):
+         """Run HY-Motion inference.
+
+         All positional and keyword arguments are forwarded to
+         SimpleRuntime.generate_motion.
+
+         Returns:
+             Tuple of (html_content, fbx_files).
+         """
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Initialize the model if needed; the instance comes from the global
+         # cache and is intentionally not stored on self.
+         model = self.initialize_model(device)
+
+         with torch.no_grad():
+             print(f"[{self.__class__.__name__}] Running inference...")
+             html_content, fbx_files, model_output = model.generate_motion(*args, **kwargs)
+             # CRITICAL: move all CUDA tensors to CPU before returning.
+             # This prevents CUDA initialization in the main process during unpickling.
+             for k, val in model_output.items():
+                 if isinstance(val, torch.Tensor):
+                     model_output[k] = val.detach().cpu()
+             # Clean up
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+         return html_content, fbx_files
+
+
+ if __name__ == "__main__":
+     # python -m hymotion.utils.gradio_runtime
+     runtime = SimpleRuntime(
+         config_path="assets/config_simplified.yml",
+         ckpt_name="latest.ckpt",
+         load_prompt_engineering=False,
+         load_text_encoder=False,
+     )
+     print(runtime.pipeline)
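For context, the caching contract that ModelInference depends on boils down to a few lines. The sketch below is illustrative, not part of this commit: `factory` is a stand-in for the SimpleRuntime construction above, and `_MODEL_CACHE` mirrors the module-level global referenced in the diff. Because each @spaces.GPU call runs in its own subprocess with its own global namespace, the cache is rebuilt at most once per worker:

_MODEL_CACHE = None  # one cache per @spaces.GPU subprocess


def get_cached_model(factory, device: str = "cuda"):
    """Build the model once per subprocess; park it on CPU, move it on demand.

    `factory` is any zero-argument callable returning an nn.Module-like
    object (hypothetical stand-in for the SimpleRuntime construction).
    """
    global _MODEL_CACHE
    if _MODEL_CACHE is None:
        _MODEL_CACHE = factory().to("cpu").eval()  # load once, keep on CPU
    if next(_MODEL_CACHE.parameters()).device.type != device:
        _MODEL_CACHE = _MODEL_CACHE.to(device)  # hop devices only when needed
    return _MODEL_CACHE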
hymotion/utils/gradio_utils.py ADDED
@@ -0,0 +1,72 @@
+ import os
+
+ from huggingface_hub import snapshot_download
+
+ # Local model paths (if the models are already downloaded, use these paths directly)
+ QWEN_LOCAL_PATH = "ckpts/Qwen3-8B"
+ CLIP_LOCAL_PATH = "ckpts/clip-vit-large-patch14"
+
+
+ def try_to_download_text_encoder():
+     """
+     Pre-download the text encoder models (Qwen3-8B and CLIP) to the local cache.
+     This ensures the models are cached locally before they are needed,
+     so later loading will not require downloading again.
+
+     If the models already exist in the local paths (ckpts/), skip downloading.
+     """
+     # Text encoder model IDs (same as in hymotion/network/text_encoders/text_encoder.py)
+     QWEN_REPO_ID = "Qwen/Qwen3-8B"
+     CLIP_REPO_ID = "openai/clip-vit-large-patch14"
+
+     token = os.environ.get("HF_TOKEN") or None  # treat a missing/empty token as anonymous
+
+     # Check whether Qwen3-8B already exists locally
+     if os.path.isdir(QWEN_LOCAL_PATH):
+         print(f">>> Found local Qwen model at: {QWEN_LOCAL_PATH}, skipping download.")
+     else:
+         print(f">>> Pre-downloading text encoder: {QWEN_REPO_ID} to {QWEN_LOCAL_PATH}")
+         try:
+             snapshot_download(
+                 repo_id=QWEN_REPO_ID,
+                 local_dir=QWEN_LOCAL_PATH,
+                 token=token,
+             )
+             print(f">>> Successfully pre-downloaded: {QWEN_REPO_ID}")
+         except Exception as e:
+             print(f">>> [WARNING] Failed to pre-download {QWEN_REPO_ID}: {e}")
+
+     # Check whether CLIP already exists locally
+     if os.path.isdir(CLIP_LOCAL_PATH):
+         print(f">>> Found local CLIP model at: {CLIP_LOCAL_PATH}, skipping download.")
+     else:
+         print(f">>> Pre-downloading text encoder: {CLIP_REPO_ID} to {CLIP_LOCAL_PATH}")
+         try:
+             snapshot_download(
+                 repo_id=CLIP_REPO_ID,
+                 local_dir=CLIP_LOCAL_PATH,
+                 token=token,
+             )
+             print(f">>> Successfully pre-downloaded: {CLIP_REPO_ID}")
+         except Exception as e:
+             print(f">>> [WARNING] Failed to pre-download {CLIP_REPO_ID}: {e}")
+
+     print(">>> Text encoder pre-download complete.")
+
+
+ def try_to_download_model():
+     repo_id = "tencent/HY-Motion-1.0"
+     target_folder = "HY-Motion-1.0-Lite"
+     print(f">>> Start downloading {repo_id}/{target_folder}")
+     token = os.environ.get("HF_TOKEN") or None  # treat a missing/empty token as anonymous
+     local_dir = snapshot_download(
+         repo_id=repo_id,
+         allow_patterns=f"{target_folder}/*",
+         local_dir="./downloaded_models",
+         token=token,
+     )
+     final_model_path = os.path.join(local_dir, target_folder)
+     print(f">>> Final model path: {final_model_path}")
+     return final_model_path
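One plausible way to wire these helpers into app startup (the import path matches the file added above; the `__main__` guard and the final print are illustrative, not part of this commit) is to resolve all weights before building the UI, so the first request never blocks on a download:

from hymotion.utils.gradio_utils import try_to_download_model, try_to_download_text_encoder

if __name__ == "__main__":
    # Pre-fetch text encoders and the motion model before the UI comes up.
    try_to_download_text_encoder()
    model_path = try_to_download_model()
    print(f"Models ready under: {model_path}")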
hymotion/utils/smplh2fbx.py DELETED
@@ -1,585 +0,0 @@
- import glob
- import os
- import shutil
- import sys
- import tempfile
-
- import fbx
- import numpy as np
- import torch
- from transforms3d.euler import mat2euler
-
- from .geometry import angle_axis_to_rotation_matrix, rot_mat2trans_mat, trans2trans_mat
-
- # yapf: disable
- SMPLH_JOINT2NUM = {
-     "Pelvis": 0, "L_Hip": 1, "R_Hip": 2, "Spine1": 3,
-     "L_Knee": 4, "R_Knee": 5, "Spine2": 6,
-     "L_Ankle": 7, "R_Ankle": 8,
-     "Spine3": 9,
-     "L_Foot": 10, "R_Foot": 11,
-     "Neck": 12, "L_Collar": 13, "R_Collar": 14, "Head": 15,
-     "L_Shoulder": 16, "R_Shoulder": 17,
-     "L_Elbow": 18, "R_Elbow": 19,
-     "L_Wrist": 20, "R_Wrist": 21,
-     # "Jaw": 22, "L_Eye": 23, "R_Eye": 24,
-     "L_Index1": 22, "L_Index2": 23, "L_Index3": 24,
-     "L_Middle1": 25, "L_Middle2": 26, "L_Middle3": 27,
-     "L_Pinky1": 28, "L_Pinky2": 29, "L_Pinky3": 30,
-     "L_Ring1": 31, "L_Ring2": 32, "L_Ring3": 33,
-     "L_Thumb1": 34, "L_Thumb2": 35, "L_Thumb3": 36,
-     "R_Index1": 37, "R_Index2": 38, "R_Index3": 39,
-     "R_Middle1": 40, "R_Middle2": 41, "R_Middle3": 42,
-     "R_Pinky1": 43, "R_Pinky2": 44, "R_Pinky3": 45,
-     "R_Ring1": 46, "R_Ring2": 47, "R_Ring3": 48,
-     "R_Thumb1": 49, "R_Thumb2": 50, "R_Thumb3": 51,
- }
- # yapf: enable
-
-
- def _parse_obj_file(obj_path):
-     vertices = []
-     uv_coords = []
-     faces = []
-     uv_faces = []
-
-     with open(obj_path, "r") as f:
-         for line in f:
-             line = line.strip()
-             if line.startswith("v "):
-                 parts = line.split()
-                 vertices.append([float(parts[1]), float(parts[2]), float(parts[3])])
-             elif line.startswith("vt "):
-                 parts = line.split()
-                 uv_coords.append([float(parts[1]), float(parts[2])])
-             elif line.startswith("f "):
-                 parts = line.split()
-                 face_vertices = []
-                 face_uvs = []
-                 for part in parts[1:]:
-                     indices = part.split("/")
-                     face_vertices.append(int(indices[0]) - 1)
-                     if len(indices) > 1 and indices[1]:
-                         face_uvs.append(int(indices[1]) - 1)
-
-                 if len(face_vertices) == 3:
-                     faces.append(face_vertices)
-                 if len(face_uvs) == 3:
-                     uv_faces.append(face_uvs)
-
-     return np.array(vertices), np.array(uv_coords), np.array(faces), np.array(uv_faces)
-
-
- def _blend_shapes(betas: torch.Tensor, shape_disps: torch.Tensor) -> torch.Tensor:
-     """Calculates the per vertex displacement due to the blend shapes.
-
-     Parameters
-     ----------
-     betas : torch.tensor Bx(num_betas)
-         Blend shape coefficients
-     shape_disps: torch.tensor Vx3x(num_betas)
-         Blend shapes
-
-     Returns
-     -------
-     torch.tensor BxVx3
-         The per-vertex displacement due to shape deformation
-     """
-
-     # Displacement[b, m, k] = sum_{l} betas[b, l] * shape_disps[m, k, l]
-     # i.e. Multiply each shape displacement by its corresponding beta and
-     # then sum them.
-     blend_shape = torch.einsum("bl,mkl->bmk", [betas, shape_disps])
-     return blend_shape
-
-
- def _vertices2joints(J_regressor: torch.Tensor, vertices: torch.Tensor) -> torch.Tensor:
-     """Calculates the 3D joint locations from the vertices.
-
-     Parameters
-     ----------
-     J_regressor : torch.tensor JxV
-         The regressor array that is used to calculate the joints from the
-         position of the vertices
-     vertices : torch.tensor BxVx3
-         The tensor of mesh vertices
-
-     Returns
-     -------
-     torch.tensor BxJx3
-         The location of the joints
-     """
-
-     return torch.einsum("bik,ji->bjk", [vertices, J_regressor])
-
-
- def _addSmplXMesh(fbxScene, v_posed, faces, uv_coords=None, uv_faces=None):
-     # Obtain a reference to the scene's root node.
-     rootNode = fbxScene.GetRootNode()
-
-     # Create a new node in the scene.
-     geometryNode = fbx.FbxNode.Create(fbxScene, "Geometry")
-     rootNode.AddChild(geometryNode)
-
-     # Create a new mesh node attribute in the scene, and
-     # set it as the new node's attribute
-     mesh = fbx.FbxMesh.Create(fbxScene, "body")
-     geometryNode.SetNodeAttribute(mesh)
-
-     # Define the new mesh's control points.
-     # v_posed, faces = smplx['v_posed'], smplx['faces']
-     v_posed = np.array(v_posed)
-     faces = np.array(faces)
-
-     minValue = np.min(v_posed)
-     maxValue = np.max(v_posed)
-     # print(f"min = {minValue}, max = {maxValue}")
-     # print("min = {}, max = {}".format(minValue, maxValue))
-
-     # m = axangle2mat((1, 0, 0), np.radians(180))
-
-     mesh.InitControlPoints(v_posed.shape[0])
-     for i in range(v_posed.shape[0]):
-         v = v_posed[i, :]
-         # v = np.matmul(m, v)
-         vertex = fbx.FbxVector4(v[0], v[1], v[2])
-         mesh.SetControlPointAt(vertex, i)
-
-     for i in range(faces.shape[0]):
-         mesh.BeginPolygon(i)
-         mesh.AddPolygon(faces[i, 0])
-         mesh.AddPolygon(faces[i, 1])
-         mesh.AddPolygon(faces[i, 2])
-         mesh.EndPolygon()
-
-     if uv_coords is not None and uv_faces is not None:
-         uv_layer = mesh.CreateElementUV("UVSet")
-         uv_layer.SetMappingMode(fbx.FbxLayerElement.EMappingMode.eByPolygonVertex)
-         uv_layer.SetReferenceMode(fbx.FbxLayerElement.EReferenceMode.eIndexToDirect)
-
-         uv_array = uv_layer.GetDirectArray()
-         for i in range(len(uv_coords)):
-             uv_array.Add(fbx.FbxVector2(uv_coords[i][0], uv_coords[i][1]))
-
-         uv_index_array = uv_layer.GetIndexArray()
-         for i in range(len(uv_faces)):
-             for j in range(3):
-                 uv_index_array.Add(uv_faces[i][j])
-     return geometryNode
-
-
- def _addSmplXSkeleton(fbxManager, fbxScene, trans, joint2num, kintree_table):
-     num2joint = ["" for key in joint2num]
-     for key, value in joint2num.items():
-         num2joint[value] = key
-
-     # trans = np.array(trans)
-
-     # Obtain a reference to the scene's root node.
-     rootNode = fbxScene.GetRootNode()
-
-     # Create a new node in the scene.
-     referenceNode = fbx.FbxNode.Create(fbxScene, "Reference")
-     rootNode.AddChild(referenceNode)
-
-     # Create skeletons
-     skeletonNodes = []
-     for nth in range(len(kintree_table)):
-         skeleton = fbx.FbxSkeleton.Create(fbxManager, "")
-         skeleton.SetSkeletonType(fbx.FbxSkeleton.EType.eRoot if nth == -1 else fbx.FbxSkeleton.EType.eLimbNode)
-
-         node = fbx.FbxNode.Create(fbxScene, num2joint[nth])
-         node.SetNodeAttribute(skeleton)
-
-         node.LclTranslation.Set(fbx.FbxDouble3(trans[nth, 0], trans[nth, 1], trans[nth, 2]))
-
-         skeletonNodes.append(node)
-
-         if kintree_table[nth] != -1:
-             skeletonNodes[kintree_table[nth]].AddChild(node)
-
-     referenceNode.AddChild(skeletonNodes[0])
-     return referenceNode, skeletonNodes
-
-
- def _addSkiningWeight(fbxScene, lbs_weights, geometryNode, skeletonNodes):
-     clusters = []
-     for i in range(lbs_weights.shape[1]):
-         cluster = fbx.FbxCluster.Create(fbxScene, "")
-         cluster.SetLink(skeletonNodes[i])
-         cluster.SetLinkMode(fbx.FbxCluster.ELinkMode.eTotalOne)
-
-         for j in range(lbs_weights.shape[0]):
-             weight = lbs_weights[j, i]
-             if weight > 0:
-                 cluster.AddControlPointIndex(j, weight)
-
-         clusters.append(cluster)
-
-     # Now we have the Geometry and the skeleton correctly positioned,
-     # set the transform and TransformLink matrix accordingly.
-     matrix = fbxScene.GetAnimationEvaluator().GetNodeGlobalTransform(geometryNode)
-     for cluster in clusters:
-         cluster.SetTransformMatrix(matrix)
-
-     for i in range(len(skeletonNodes)):
-         matrix = fbxScene.GetAnimationEvaluator().GetNodeGlobalTransform(skeletonNodes[i])
-         clusters[i].SetTransformLinkMatrix(matrix)
-
-     # Add the clusters to the patch by creating a skin and adding those clusters to that skin.
-     skin = fbx.FbxSkin.Create(fbxScene, "")
-     for cluster in clusters:
-         skin.AddCluster(cluster)
-     geometryNode.GetNodeAttribute().AddDeformer(skin)
-
-
- def _storeBindPose(fbxScene, geometryNode):
-     # In the bind pose, we must store all the link's global matrix at the
-     # time of the bind.
-     # Plus, we must store all the parent(s) global matrix of a link, even
-     # if they are not themselves deforming any model.
-
-     clusteredNodes = []
-     if geometryNode and geometryNode.GetNodeAttribute():
-         skinCount = 0
-         clusterCount = 0
-         attributeType = geometryNode.GetNodeAttribute().GetAttributeType()
-         if attributeType in (
-             fbx.FbxNodeAttribute.EType.eMesh,
-             fbx.FbxNodeAttribute.EType.eNurbs,
-             fbx.FbxNodeAttribute.EType.ePatch,
-         ):
-             skinCount = geometryNode.GetNodeAttribute().GetDeformerCount(fbx.FbxDeformer.EDeformerType.eSkin)
-             for i in range(skinCount):
-                 skin = geometryNode.GetNodeAttribute().GetDeformer(i, fbx.FbxDeformer.EDeformerType.eSkin)
-                 clusterCount += skin.GetClusterCount()
-
-         if clusterCount:
-             for i in range(skinCount):
-                 skin = geometryNode.GetNodeAttribute().GetDeformer(i, fbx.FbxDeformer.EDeformerType.eSkin)
-                 clusterCount = skin.GetClusterCount()
-                 for j in range(clusterCount):
-                     link = skin.GetCluster(j).GetLink()
-                     _addNodeRecursively(clusteredNodes, link)
-
-         # Add the geometry to the pose
-         clusteredNodes += [geometryNode]
-
-     # Now create a bind pose with the link list
-     if len(clusteredNodes):
-         # A pose must be named. Arbitrarily use the name of the geometry node.
-         pose = fbx.FbxPose.Create(fbxScene, geometryNode.GetName())
-         pose.SetIsBindPose(True)
-
-         for node in clusteredNodes:
-             bindMatrix = fbxScene.GetAnimationEvaluator().GetNodeGlobalTransform(node)
-             pose.Add(node, fbx.FbxMatrix(bindMatrix))
-
-         fbxScene.AddPose(pose)
-
-
- def _addNodeRecursively(nodeArray, node):
-     """Add the specified node to the node array.
-
-     Also, add recursively all the parent node of the specified node to the array.
-     """
-     if node:
-         _addNodeRecursively(nodeArray, node.GetParent())
-         found = False
-         if node in nodeArray:
-             if node.GetName() == node.GetName():
-                 found = True
-         if not found:
-             nodeArray += [node]
-
-
- def _animateGlobalTransformsFromTransMat(animLayer, referenceNode, global_translation, frameDuration):
-     _animateSingleChannel(animLayer, referenceNode.LclTranslation, "X", global_translation, frameDuration)
-     _animateSingleChannel(animLayer, referenceNode.LclTranslation, "Y", global_translation, frameDuration)
-     _animateSingleChannel(animLayer, referenceNode.LclTranslation, "Z", global_translation, frameDuration)
-
-
- def _animateSingleChannel(animLayer, component, name, values, frameDuration):
-     ncomp = 0
-
-     if name == "X":
-         ncomp = 0
-     elif name == "Y":
-         ncomp = 1
-     elif name == "Z":
-         ncomp = 2
-
-     time = fbx.FbxTime()
-     curve = component.GetCurve(animLayer, name, True)
-     curve.KeyModifyBegin()
-     for nth in range(len(values)):
-         time.SetSecondDouble(nth * frameDuration)
-         keyIndex = curve.KeyAdd(time)[0]
-         curve.KeySetValue(keyIndex, values[nth][ncomp])
-         curve.KeySetInterpolation(
-             keyIndex, fbx.FbxAnimCurveDef.EInterpolationType.eInterpolationConstant
-         )  # NOTE: using eInterpolationCubic to do interpolation causes error.
-     curve.KeyModifyEnd()
-
-
- def _animateRotationKeyFrames(animLayer, node, transforms_mat, frameDuration):
-     rotations = []
-     for nth in range(len(transforms_mat)):
-         rotations.append(np.rad2deg(mat2euler(transforms_mat[nth][0:3, 0:3], axes="sxyz")))
-
-     _animateSingleChannel(animLayer, node.LclRotation, "X", rotations, frameDuration)
-     _animateSingleChannel(animLayer, node.LclRotation, "Y", rotations, frameDuration)
-     _animateSingleChannel(animLayer, node.LclRotation, "Z", rotations, frameDuration)
-
-
- def _animateTranslationKeyFrames(animLayer, node, transforms_mat, frameDuration):
-     translations = []
-     for nth in range(len(transforms_mat)):
-         translations.append(transforms_mat[nth][0:3, 3])
-
-     _animateSingleChannel(animLayer, node.LclTranslation, "X", translations, frameDuration)
-     _animateSingleChannel(animLayer, node.LclTranslation, "Y", translations, frameDuration)
-     _animateSingleChannel(animLayer, node.LclTranslation, "Z", translations, frameDuration)
-
-
- def _animateScalingKeyFrames(animLayer, node, transforms_mat, frameDuration):
-     scalings = []
-     for nth in range(len(transforms_mat)):
-         scalings.append(
-             np.array(
-                 (
-                     transforms_mat[nth][0, 0],
-                     transforms_mat[nth][1, 1],
-                     transforms_mat[nth][2, 2],
-                 )
-             )
-         )
-
-     _animateSingleChannel(animLayer, node.LclTranslation, "X", scalings, frameDuration)
-     _animateSingleChannel(animLayer, node.LclTranslation, "Y", scalings, frameDuration)
-     _animateSingleChannel(animLayer, node.LclTranslation, "Z", scalings, frameDuration)
-
-
- def _animateSkeleton(fbxScene, skeletonNodes, frames, frameRate, name="Take1"):
-     frameDuration = 1.0 / frameRate
-
-     if name != "Take1":
-         subs = name.split("/")
-         name = subs[-1][:-5]
-
-     animStack = fbx.FbxAnimStack.Create(fbxScene, name)
-     animLayer = fbx.FbxAnimLayer.Create(fbxScene, "Base Layer")
-     animStack.AddMember(animLayer)
-     _animateGlobalTransformsFromTransMat(
-         animLayer=animLayer,
-         referenceNode=skeletonNodes[0],
-         global_translation=frames[:, 0, :3, 3],
-         frameDuration=frameDuration,
-     )
-
-     for nId in range(len(skeletonNodes)):
-         _animateRotationKeyFrames(
-             animLayer=animLayer,
-             node=skeletonNodes[nId],
-             transforms_mat=frames[:, nId],
-             frameDuration=frameDuration,
-         )
-
-
- def _saveScene(filename, fbxManager, fbxScene):
-     exporter = fbx.FbxExporter.Create(fbxManager, "")
-     isInitialized = exporter.Initialize(filename)
-
-     if isInitialized is False:
-         raise Exception(
-             "Exporter failed to initialized. Error returned: {}".format(exporter.GetStatus().GetErrorString())
-         )
-
-     exporter.Export(fbxScene)
-     exporter.Destroy()
-
-
- def _get_offsets_from_beta(beta, smplx_params, return_template_mesh=True):
-     v_template = torch.FloatTensor(smplx_params["v_template"]).unsqueeze(0)
-     shape_dirs = torch.FloatTensor(smplx_params["shapedirs"])
-     J_regressor = torch.FloatTensor(smplx_params["J_regressor"])
-
-     v_shaped = v_template + _blend_shapes(beta, shape_dirs)
-     J = _vertices2joints(J_regressor, v_shaped).squeeze(0).numpy()
-
-     parents = smplx_params["kintree_table"][()][0]
-     parents[0] = -1
-     Translates = J[()].copy()
-     Translates[1:] -= J[parents[1:]]
-     if not return_template_mesh:
-         return Translates
-     else:
-         return Translates, v_shaped
-
-
- def _preprocess_smplx(smplx_params, source_anim_data, scale=1, debug=False):
-     Translates, v_shaped = _get_offsets_from_beta(
-         torch.FloatTensor(source_anim_data["betas"]),
-         smplx_params,
-         return_template_mesh=True,
-     )
-
-     parents = smplx_params["kintree_table"][()][0]
-     parents[0] = -1
-
-     poses = torch.FloatTensor(source_anim_data["poses"])
-     source_LclRotation = angle_axis_to_rotation_matrix(poses).numpy()
-     source_LclTranslation = np.tile(Translates, (source_LclRotation.shape[0], 1, 1))
-     source_LclTranslation[:, 0] += source_anim_data["trans"]
-
-     source_skeleton = {
-         "parent": parents,
-         "LclRotation": source_LclRotation,
-         "LclTranslation": source_LclTranslation * scale,
-         "Translate": Translates * scale,
-         "v_shaped": v_shaped.squeeze(0).numpy() * scale,
-     }
-     return source_skeleton
-
-
- def _convert_npz_to_fbx(smplh_params, npz_data, save_fn, fps=30, uv_coords=None, uv_faces=None):
-     kintree = smplh_params["kintree_table"][0]
-     kintree[0] = -1
-
-     source_anim_data = {
-         "betas": npz_data["betas"],
-         "poses": npz_data["poses"].reshape(npz_data["poses"].shape[0], -1, 3),
-         "trans": npz_data["trans"],
-     }
-     source_skeleton = _preprocess_smplx(smplh_params, source_anim_data, scale=100)
-     rot = rot_mat2trans_mat(source_skeleton["LclRotation"])
-     trans = trans2trans_mat(source_skeleton["LclTranslation"])
-     frame_data = np.einsum("Btnk,Btkm ->Btnm", trans, rot)
-
-     fbxManager = fbx.FbxManager.Create()
-     fbxScene = fbx.FbxScene.Create(fbxManager, "")
-     timeMode = fbx.FbxTime().ConvertFrameRateToTimeMode(fps)
-     fbxScene.GetGlobalSettings().SetTimeMode(timeMode)
-
-     geometryNode = _addSmplXMesh(
-         fbxScene,
-         source_skeleton["v_shaped"],
-         smplh_params["f"],
-         uv_coords=uv_coords,
-         uv_faces=uv_faces,
-     )
-     referenceNode, skeletonNodes = _addSmplXSkeleton(
-         fbxManager,
-         fbxScene=fbxScene,
-         trans=source_skeleton["Translate"],
-         joint2num=SMPLH_JOINT2NUM,
-         kintree_table=kintree,
-     )
-
-     _addSkiningWeight(fbxScene, smplh_params["weights"], geometryNode, skeletonNodes)
-     _storeBindPose(fbxScene, geometryNode)
-     _animateSkeleton(
-         fbxScene=fbxScene,
-         skeletonNodes=skeletonNodes,
-         frames=frame_data,
-         frameRate=fps,
-     )
-
-     with tempfile.NamedTemporaryFile(suffix=".fbx", delete=False) as tmp_f:
-         temp_file = tmp_f.name
-
-     try:
-         # Save to temporary location
-         _saveScene(temp_file, fbxManager, fbxScene)
-         # If successful, copy to final destination
-         shutil.copy2(temp_file, save_fn)
-     except Exception as e:
-         print(f"Error saving FBX file: {e}")
-     finally:
-         # Remove temporary file
-         if os.path.exists(temp_file):
-             os.remove(temp_file)
-
-     # CLEANUP
-     fbxManager.Destroy()
-     del fbxManager, fbxScene
-
-
- def _read_uv(obj_template):
-     uv_coords = None
-     uv_faces = None
-     if obj_template and os.path.isfile(obj_template):
-         try:
-             print("Loading UV coordinates from OBJ template: {}".format(obj_template))
-             obj_vertices, uv_coords, obj_faces, uv_faces = _parse_obj_file(obj_template)
-             print("Loaded {} UV coordinates and {} UV faces".format(len(uv_coords), len(uv_faces)))
-         except Exception as e:
-             print("Warning: Failed to load UV coordinates from OBJ file: {}".format(e))
-             uv_coords = None
-             uv_faces = None
-     return uv_coords, uv_faces
-
-
- class SMPLH2FBX:
-     def __init__(
-         self,
-         obj_template="./assets/smpl_family_models/smplh/textures/male_smplh.obj",
-         smplh_model_path="./assets/body_models/smplh/neutral/model.npz",
-     ):
-         print(f"[{self.__class__.__name__}] Load obj_template: {obj_template}")
-         self.uv_coords, self.uv_faces = _read_uv(obj_template)
-         print(f"[{self.__class__.__name__}] Load smplh_model_path: {smplh_model_path}")
-         self.smplh_params = dict(np.load(smplh_model_path, allow_pickle=True))
-
-     def convert_npz_to_fbx(self, npz_file, outname, fps=30):
-         os.makedirs(os.path.dirname(outname), exist_ok=True)
-         if isinstance(npz_file, str) and os.path.isfile(npz_file):
-             npz_data = dict(np.load(npz_file, allow_pickle=True))
-         else:
-             npz_data = npz_file
-         _convert_npz_to_fbx(
-             self.smplh_params,
-             npz_data,
-             outname,
-             uv_coords=self.uv_coords,
-             uv_faces=self.uv_faces,
-         )
-         return os.path.exists(outname)
-
-     def convert_params_to_fbx(self, params, outname):
-         fps = params.get("mocap_framerate", 30)
-         os.makedirs(os.path.dirname(outname), exist_ok=True)
-         assert len(params["poses"].shape) == 3, f"poses shape should be (F, 52, 3), but got {params['poses'].shape}"
-         assert len(params["betas"].shape) == 2, f"betas shape should be (1, 16), but got {params['betas'].shape}"
-         assert len(params["trans"].shape) == 2, f"trans shape should be (1, 3), but got {params['trans'].shape}"
-         _convert_npz_to_fbx(
-             self.smplh_params,
-             params,
-             outname,
-             fps=fps,
-             uv_coords=self.uv_coords,
-             uv_faces=self.uv_faces,
-         )
-         return os.path.exists(outname)
-
-
- if __name__ == "__main__":
-     # python hymotion/utils/smplh2fbx.py
-     import argparse
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument("root", type=str)
-     args = parser.parse_args()
-
-     converter = SMPLH2FBX()
-
-     if os.path.isdir(args.root):
-         npzfiles = sorted(glob.glob(os.path.join(args.root, "*.npz")))
-     else:
-         if args.root.endswith(".npz"):
-             npzfiles = [args.root]
-         else:
-             raise ValueError(f"Unknown file type: {args.root}")
-
-     for npzfile in npzfiles:
-         converter.convert_npz_to_fbx(npzfile, npzfile.replace(".npz", ".fbx").replace("motions", "motions_fbx"))
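With this converter deleted, the FBX path becomes optional at runtime: the ModelInference class above branches on an FBX_AVAILABLE flag and falls back to dict output when the SDK is missing. A plausible definition of that flag (assumed, not shown in this commit) probes the import once at module load:

# Hypothetical guard matching the `self.fbx_available = FBX_AVAILABLE`
# check in gradio_runtime: probe the Autodesk FBX Python SDK once and
# expose a flag the runtime can branch on.
try:
    import fbx  # often unavailable on hosted Spaces
    FBX_AVAILABLE = True
except ImportError:
    fbx = None
    FBX_AVAILABLE = False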
hymotion/utils/t2m_runtime.py CHANGED
@@ -46,14 +46,18 @@ class T2MRuntime:
         ckpt_name: str = "latest.ckpt",
         skip_text: bool = False,
         device_ids: Union[list[int], None] = None,
-        prompt_engineering_host: Optional[str] = None,
         skip_model_loading: bool = False,
         force_cpu: bool = False,
+        disable_prompt_engineering: bool = False,
+        prompt_engineering_host: Optional[str] = None,
+        prompt_engineering_model_path: Optional[str] = None,
     ):
         self.config_path = config_path
         self.ckpt_name = ckpt_name
         self.skip_text = skip_text
         self.prompt_engineering_host = prompt_engineering_host
+        self.prompt_engineering_model_path = prompt_engineering_model_path
+        self.disable_prompt_engineering = disable_prompt_engineering
         self.skip_model_loading = skip_model_loading
         self.local_ip = _get_local_ip()
 
@@ -71,7 +75,12 @@ class T2MRuntime:
         self._lock = threading.Lock()
         self._loaded = False
 
-        self.prompt_rewriter = PromptRewriter(host=self.prompt_engineering_host)
+        if self.disable_prompt_engineering:
+            self.prompt_rewriter = None
+        else:
+            self.prompt_rewriter = PromptRewriter(
+                host=self.prompt_engineering_host, model_path=self.prompt_engineering_model_path
+            )
         # Skip model loading if checkpoint not found
         if self.skip_model_loading:
             print(">>> [WARNING] Checkpoint not found, will use randomly initialized model weights")
@@ -92,7 +101,9 @@ class T2MRuntime:
 
         device_info = self.device_ids if self.device_ids else "cpu"
         if self.skip_model_loading:
-            print(f">>> T2MRuntime initialized (using randomly initialized weights) in IP {self.local_ip}, devices={device_info}")
+            print(
+                f">>> T2MRuntime initialized (using randomly initialized weights) in IP {self.local_ip}, devices={device_info}"
+            )
         else:
             print(f">>> T2MRuntime loaded in IP {self.local_ip}, devices={device_info}")
 
@@ -116,7 +127,10 @@ class T2MRuntime:
             )
             device = torch.device("cpu")
             pipeline.load_in_demo(
-                self.ckpt_name, os.path.dirname(self.ckpt_name), build_text_encoder=not self.skip_text, allow_empty_ckpt=allow_empty_ckpt
+                self.ckpt_name,
+                os.path.dirname(self.ckpt_name),
+                build_text_encoder=not self.skip_text,
+                allow_empty_ckpt=allow_empty_ckpt,
             )
             pipeline.to(device)
             self.pipelines = [pipeline]
@@ -129,7 +143,12 @@ class T2MRuntime:
                 network_module=config["network_module"],
                 network_module_args=config["network_module_args"],
             )
-            p.load_in_demo(self.ckpt_name, os.path.dirname(self.ckpt_name), build_text_encoder=not self.skip_text, allow_empty_ckpt=allow_empty_ckpt)
+            p.load_in_demo(
+                self.ckpt_name,
+                os.path.dirname(self.ckpt_name),
+                build_text_encoder=not self.skip_text,
+                allow_empty_ckpt=allow_empty_ckpt,
+            )
             p.to(torch.device(f"cuda:{gid}"))
             self.pipelines.append(p)
             self._gpu_load = [0] * len(self.pipelines)
@@ -360,6 +379,7 @@ class T2MRuntime:
         except Exception as e:
             print(f">>> Failed to generate static HTML content: {e}")
             import traceback
+
            traceback.print_exc()
             # Return error HTML
             return f"<html><body><h1>Error generating visualization</h1><p>{str(e)}</p></body></html>"
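Taken together, the new keyword arguments give the runtime three prompt-engineering modes: disabled, remote host, or local model. A minimal construction sketch (the paths below are placeholders, not values from this commit):

from hymotion.utils.t2m_runtime import T2MRuntime

runtime = T2MRuntime(
    config_path="path/to/config.yml",                 # placeholder
    ckpt_name="latest.ckpt",
    disable_prompt_engineering=False,                 # True skips the rewriter entirely
    prompt_engineering_host=None,                     # or "http://host:port" for a remote rewriter
    prompt_engineering_model_path="ckpts/Qwen3-8B",   # local rewriter weights (placeholder)
)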
scripts/gradio/templates/placeholder_scene.html ADDED
@@ -0,0 +1,331 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Motion Visualization</title>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
+     <style>
+         html, body {
+             background: #424242 !important;
+             color: #e2e8f0;
+             margin: 0;
+             padding: 0;
+             width: 100%;
+             height: 100%;
+             overflow: hidden;
+         }
+         * {
+             margin: 0;
+             padding: 0;
+             box-sizing: border-box;
+         }
+         .fullscreen-container {
+             position: fixed;
+             top: 0;
+             left: 0;
+             width: 100vw;
+             height: 100vh;
+             background: #424242;
+             overflow: hidden;
+         }
+         #vis3d {
+             position: absolute;
+             top: 0;
+             left: 0;
+             width: 100%;
+             height: 100%;
+             background: #424242;
+         }
+         #vis3d canvas {
+             display: block;
+             width: 100% !important;
+             height: 100% !important;
+         }
+         .welcome-overlay {
+             position: absolute;
+             top: 50%;
+             left: 50%;
+             transform: translate(-50%, -50%);
+             background: rgba(0, 0, 0, 0.6);
+             backdrop-filter: blur(10px);
+             -webkit-backdrop-filter: blur(10px);
+             color: white;
+             padding: 30px 50px;
+             border-radius: 16px;
+             font-size: 16px;
+             z-index: 200;
+             text-align: center;
+             box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
+         }
+         .welcome-overlay h2 {
+             font-size: 20px;
+             font-weight: 600;
+             margin-bottom: 12px;
+             color: #4a9eff;
+         }
+         .welcome-overlay p {
+             color: #a0aec0;
+             font-size: 14px;
+             line-height: 1.6;
+         }
+         .control-overlay {
+             position: absolute;
+             bottom: 30px;
+             left: 50%;
+             transform: translateX(-50%);
+             width: 80%;
+             max-width: 600px;
+             z-index: 100;
+             background: rgba(0, 0, 0, 0.4);
+             backdrop-filter: blur(8px);
+             -webkit-backdrop-filter: blur(8px);
+             padding: 15px 20px;
+             border-radius: 12px;
+         }
+         .control-row-minimal {
+             display: flex;
+             align-items: center;
+             gap: 20px;
+         }
+         .progress-container {
+             flex: 1;
+         }
+         .progress-slider-minimal {
+             width: 100%;
+             height: 8px;
+             border-radius: 4px;
+             background: rgba(255, 255, 255, 0.3);
+             outline: none;
+             cursor: not-allowed;
+             -webkit-appearance: none;
+             appearance: none;
+             opacity: 0.5;
+         }
+         .progress-slider-minimal::-webkit-slider-runnable-track {
+             width: 100%;
+             height: 8px;
+             border-radius: 4px;
+             background: rgba(255, 255, 255, 0.3);
+         }
+         .progress-slider-minimal::-webkit-slider-thumb {
+             -webkit-appearance: none;
+             appearance: none;
+             width: 20px;
+             height: 20px;
+             border-radius: 50%;
+             background: #4a9eff;
+             cursor: not-allowed;
+             border: 2px solid white;
+             box-shadow: 0 2px 8px rgba(0, 0, 0, 0.4);
+             margin-top: -6px;
+         }
+         .progress-slider-minimal::-moz-range-track {
+             width: 100%;
+             height: 8px;
+             border-radius: 4px;
+             background: rgba(255, 255, 255, 0.3);
+         }
+         .progress-slider-minimal::-moz-range-thumb {
+             width: 20px;
+             height: 20px;
+             border-radius: 50%;
+             background: #4a9eff;
+             cursor: not-allowed;
+             border: 2px solid white;
+             box-shadow: 0 2px 8px rgba(0, 0, 0, 0.4);
+         }
+         .frame-counter {
+             font-family: 'SF Mono', 'Consolas', monospace;
+             font-size: 14px;
+             font-weight: 500;
+             color: rgba(255, 255, 255, 0.5);
+             text-shadow: 0 1px 3px rgba(0, 0, 0, 0.5);
+             white-space: nowrap;
+             min-width: 80px;
+             text-align: right;
+         }
+     </style>
+ </head>
+ <body>
+     <div class="fullscreen-container">
+         <div id="vis3d"></div>
+         <div class="welcome-overlay">
+             <h2>Welcome to HY-Motion-1.0!</h2>
+             <p>Enter a text description and generate motion<br>to see the 3D visualization here.</p>
+         </div>
+         <div class="control-overlay">
+             <div class="control-row-minimal">
+                 <div class="progress-container">
+                     <input type="range" class="progress-slider-minimal" min="0" max="100" value="0" disabled>
+                 </div>
+                 <div class="frame-counter">
+                     <span>0</span> / <span>0</span>
+                 </div>
+             </div>
+         </div>
+     </div>
+
+     <script type="importmap">
+     {
+         "imports": {
+             "three": "https://cdn.jsdelivr.net/npm/[email protected]/build/three.module.js",
+             "three/addons/": "https://cdn.jsdelivr.net/npm/[email protected]/examples/jsm/"
+         }
+     }
+     </script>
+
+     <script type="module">
+         import * as THREE from 'three';
+         import { OrbitControls } from 'three/addons/controls/OrbitControls.js';
+
+         function createBaseChessboard(
+             grid_size = 50,
+             divisions = 50,
+             white = "#ffffff",
+             black = "#3a3a3a",
+             texture_size = 1024
+         ) {
+             var adjusted_texture_size = Math.floor(texture_size / divisions) * divisions;
+             var canvas = document.createElement("canvas");
+             canvas.width = canvas.height = adjusted_texture_size;
+             var context = canvas.getContext("2d");
+             context.imageSmoothingEnabled = false;
+
+             var step = adjusted_texture_size / divisions;
+             for (var i = 0; i < divisions; i++) {
+                 for (var j = 0; j < divisions; j++) {
+                     context.fillStyle = (i + j) % 2 === 0 ? white : black;
+                     context.fillRect(i * step, j * step, step, step);
+                 }
+             }
+
+             var texture = new THREE.CanvasTexture(canvas);
+             texture.wrapS = THREE.RepeatWrapping;
+             texture.wrapT = THREE.RepeatWrapping;
+             texture.magFilter = THREE.NearestFilter;
+             texture.minFilter = THREE.NearestFilter;
+             texture.generateMipmaps = false;
+
+             var planeGeometry = new THREE.PlaneGeometry(grid_size, grid_size);
+
+             var planeMaterial = new THREE.MeshStandardMaterial({
+                 map: texture,
+                 side: THREE.DoubleSide,
+                 transparent: true,
+                 opacity: 0.85,
+                 roughness: 0.9,
+                 metalness: 0.1,
+                 emissiveIntensity: 0.05,
+             });
+
+             var plane = new THREE.Mesh(planeGeometry, planeMaterial);
+             plane.receiveShadow = true;
+
+             return plane;
+         }
+
+         function getChessboardXZ() {
+             var plane = createBaseChessboard();
+             plane.rotation.x = -Math.PI / 2;
+             plane.name = 'ground';
+             plane.receiveShadow = true;
+             return plane;
+         }
+
+         let scene, camera, renderer, controls;
+
+         function init() {
+             const width = window.innerWidth;
+             const height = window.innerHeight;
+
+             scene = new THREE.Scene();
+             camera = new THREE.PerspectiveCamera(45, width / height, 0.1, 50);
+             renderer = new THREE.WebGLRenderer({ antialias: true, logarithmicDepthBuffer: true });
+
+             // Camera setup
+             camera.up.set(0, 1, 0);
+             camera.position.set(3, 2.5, 5);
+             camera.lookAt(new THREE.Vector3(0, 1, 0));
+
+             // Scene background and fog
+             scene.background = new THREE.Color(0x424242);
+             scene.fog = new THREE.FogExp2(0x424242, 0.06);
+
+             // Renderer setup
+             renderer.shadowMap.enabled = true;
+             renderer.shadowMap.type = THREE.PCFSoftShadowMap;
+             renderer.toneMapping = THREE.ACESFilmicToneMapping;
+             renderer.toneMappingExposure = 1.0;
+             renderer.outputColorSpace = THREE.SRGBColorSpace;
+             renderer.setPixelRatio(window.devicePixelRatio);
+             renderer.setSize(width, height);
+
+             // Lights
+             const hemisphereLight = new THREE.HemisphereLight(0xffffff, 0x444444, 1.2);
+             hemisphereLight.position.set(0, 2, 0);
+             scene.add(hemisphereLight);
+
+             const directionalLight = new THREE.DirectionalLight(0xffffff, 1.5);
+             directionalLight.position.set(3, 5, 4);
+             directionalLight.castShadow = true;
+             directionalLight.shadow.mapSize.width = 2048;
+             directionalLight.shadow.mapSize.height = 2048;
+             directionalLight.shadow.camera.near = 0.5;
+             directionalLight.shadow.camera.far = 50;
+             directionalLight.shadow.camera.left = -10;
+             directionalLight.shadow.camera.right = 10;
+             directionalLight.shadow.camera.top = 10;
+             directionalLight.shadow.camera.bottom = -10;
+             directionalLight.shadow.bias = -0.0001;
+             scene.add(directionalLight);
+
+             const fillLight = new THREE.DirectionalLight(0xaaccff, 0.5);
+             fillLight.position.set(-3, 3, -2);
+             scene.add(fillLight);
+
+             const rimLight = new THREE.DirectionalLight(0xffeedd, 0.4);
+             rimLight.position.set(0, 4, -5);
+             scene.add(rimLight);
+
+             // Ground
+             scene.add(getChessboardXZ());
+
+             // Add to DOM
+             var container = document.getElementById('vis3d');
+             container.appendChild(renderer.domElement);
+
+             // Controls
+             controls = new OrbitControls(camera, renderer.domElement);
+             controls.minDistance = 1;
+             controls.maxDistance = 15;
+             controls.enableDamping = true;
+             controls.dampingFactor = 0.05;
+             controls.target.set(0, 0.5, 0);
+             controls.update();
+
+             window.addEventListener('resize', onWindowResize);
+             animate();
+         }
+
+         function animate() {
+             requestAnimationFrame(animate);
+             if (controls && controls.enableDamping) {
+                 controls.update();
+             }
+             renderer.render(scene, camera);
+         }
+
+         function onWindowResize() {
+             const width = window.innerWidth;
+             const height = window.innerHeight;
+             camera.aspect = width / height;
+             camera.updateProjectionMatrix();
+             renderer.setSize(width, height);
+         }
+
+         init();
+     </script>
+ </body>
+ </html>
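A plausible way for the Gradio app to surface this template before any motion has been generated (the file path is from this commit; the iframe wiring itself is an assumption, since the self-contained <script type="module"> needs a real document to run in):

import base64

import gradio as gr

# Read the static placeholder scene and embed it as a data-URI iframe so the
# module script executes in its own document.
with open("scripts/gradio/templates/placeholder_scene.html", "r", encoding="utf-8") as f:
    placeholder_html = f.read()

encoded = base64.b64encode(placeholder_html.encode("utf-8")).decode("ascii")
iframe = f'<iframe src="data:text/html;base64,{encoded}" style="width:100%;height:520px;border:none;"></iframe>'

with gr.Blocks() as demo:
    viewer = gr.HTML(iframe)  # replaced by the generated scene after the first run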