PenPaperKeyCode committed on
Commit
3169f6c
0 Parent(s):
Files changed (45)
  1. .gitattributes +8 -0
  2. LICENSE +122 -0
  3. README.md +459 -0
  4. chat_template.jinja +173 -0
  5. config.json +320 -0
  6. configuration_hyperclovax.py +228 -0
  7. configuration_vlm.py +169 -0
  8. cosyvoice.py +516 -0
  9. decoder/audio/NCCosybigvganDecoder.mar +3 -0
  10. decoder/audio/NCZSCosybigvganDecoder.mar +3 -0
  11. decoder/vision/model_index.json +25 -0
  12. decoder/vision/scheduler/scheduler_config.json +18 -0
  13. decoder/vision/token_embedder/config.json +7 -0
  14. decoder/vision/token_embedder/diffusion_pytorch_model.safetensors +3 -0
  15. decoder/vision/transformer/config.json +21 -0
  16. decoder/vision/transformer/diffusion_pytorch_model.safetensors +3 -0
  17. decoder/vision/transformer2/config.json +21 -0
  18. decoder/vision/transformer2/diffusion_pytorch_model.safetensors +3 -0
  19. decoder/vision/vae/config.json +38 -0
  20. decoder/vision/vae/diffusion_pytorch_model.safetensors +3 -0
  21. generation_config.json +6 -0
  22. mambamia_videoaudio_compressor.py +803 -0
  23. model-00001-of-00010.safetensors +3 -0
  24. model-00002-of-00010.safetensors +3 -0
  25. model-00003-of-00010.safetensors +3 -0
  26. model-00004-of-00010.safetensors +3 -0
  27. model-00005-of-00010.safetensors +3 -0
  28. model-00006-of-00010.safetensors +3 -0
  29. model-00007-of-00010.safetensors +3 -0
  30. model-00008-of-00010.safetensors +3 -0
  31. model-00009-of-00010.safetensors +3 -0
  32. model-00010-of-00010.safetensors +3 -0
  33. model.safetensors.index.json +0 -0
  34. modeling_hyperclovax.py +1866 -0
  35. modeling_vlm.py +0 -0
  36. patch_vuvlm.py +1085 -0
  37. preprocessor.py +0 -0
  38. preprocessor_config.json +32 -0
  39. processing_vlm.py +963 -0
  40. processor_config.json +6 -0
  41. special_tokens_map.json +30 -0
  42. ta_tok.py +379 -0
  43. tokenizer.json +3 -0
  44. tokenizer_config.json +3 -0
  45. video_preprocessor_config.json +89 -0
.gitattributes ADDED
@@ -0,0 +1,8 @@
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
5
+ decoder/audio/NCCosybigvganDecoder.mar filter=lfs diff=lfs merge=lfs -text
6
+ decoder/audio/NCZSCosybigvganDecoder.mar filter=lfs diff=lfs merge=lfs -text
7
+ tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
8
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,122 @@
1
+ HyperCLOVA X SEED 8B Omni Model License Agreement
2
+
3
+ Model Release Date: December 29, 2025
4
+
5
+ This HyperCLOVA X SEED 8B Omni Model License Agreement (the “Agreement”) is a legal agreement between you and NAVER Corporation (“Naver Corp.”) and NAVER Cloud Corporation (“Naver Cloud Corp.”) (Naver Corp. and Naver Cloud Corp. are collectively referred to as “NAVER”) and governs your use of the Models that NAVER provides to You under this Agreement.
6
+
7
+ NAVER Corp., as the holder of the intellectual property of the Model, and its affiliate, NAVER Cloud Corp., as the exclusive business operator of HyperCLOVA X, enter into this Agreement with you. NAVER and you are each a “party” and collectively the “parties.”
8
+
9
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement. You represent to us that you are lawfully able to enter into contracts, and if you are entering into this Agreement for an entity, that you have legal authority to bind that entity.
10
+
11
+ 1. Definitions.
12
+
13
+ 1.1. "Affiliate” means any entity directly or indirectly controlling, controlled by or under common control with either party, where “control” means the possession, directly or indirectly, of the power to independently direct or cause the direction of the management and policies of an entity, whether through ownership of more than fifty percent (50%) of the stock or other equity interests entitled to vote for representation on its board of directors, or body performing similar functions, by contract or otherwise.
14
+
15
+ 1.2. “Derivative Model” means all (i) modifications to the Model, (ii) works based on the Model, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of the Model, to that model in order to cause that model to perform similarly to the Model, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by the Model for training that Model. For clarity, Outputs are not deemed Derivative Model.
16
+
17
+ 1.3. “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
18
+
19
+ 1.4. “Model” means the foundational large language models and software and algorithms, including machine-learning model code and trained model weights distributed by NAVER.
20
+
21
+
22
+ 1.5. “Output” means the information content output of the Model or a Derivative Model that results from operating or otherwise using the Model or Derivative Model.
23
+
24
+ 2. Conditions for Use, License Grant and Restrictions
25
+
26
+ 2.1. Conditions for Use. The Model and any Derivative Model are subject to the terms of this Agreement, which govern your use. If You institute copyright or patent litigation against any entity (including a crossclaim or counterclaim in a lawsuit) alleging that the Model or Derivative Model constitutes direct or contributory copyright or patent infringement, then any license granted to you under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. NAVER may update this Agreement to comply with legal and regulatory requirements at any time and You agree to either comply with any updated license or cease your copying, use, and distribution of the Model and any Derivative Model.
27
+
28
+ 2.2. License Grant. Subject to the terms and conditions of this Agreement, NAVER hereby grants to you a non-exclusive, worldwide, non-transferable, revocable and royalty-free limited license under NAVER’s intellectual property or other rights owned by NAVER embodied in the Model to access, download, install, copy, use, reproduce, distribute, create derivative works of, and make modifications to the Model.
29
+
30
+ 2.3. Prohibited Use Policy. NAVER is committed to ensuring safety, trust, and transparency in the development and use of AI technologies. Accordingly, your use of the Model and any Derivative Models is subject to the following conditions:
31
+ (i) You must ensure that any product or service you develop, use, offer as a service, or distribute complies with all applicable laws and regulations, and is operated appropriately for the relevant industry or use case.
32
+ (ii) You must comply with the Acceptable Use Policy applicable to the Model and any Derivative Models, which is attached hereto as Addendum A and incorporated by reference into this Agreement.
33
+ (iii) NAVER expressly prohibits the use of its products or services for any purpose in violation of applicable law and regulation, including but not limited to:
34
+ (a) illegal surveillance,
35
+ (b) illegal collection or processing of biometric information without the consent of the subject which is required under applicable law, or
36
+ (c) illegal harassment, abuse, threatening or bullying of individuals or groups of individuals or intentionally misleading or deceiving others.
37
+ (iv) You must take reasonable measures to address unintended bias and to mitigate harm to others, including underrepresented or vulnerable groups.
38
+
39
+
40
+ 3. Redistribution.
41
+
42
+ 3.1. You may reproduce, distribute or make available the Model or Derivative Models thereof, or a product or service (including another AI model) that contains any of them, if you meet all of the following conditions: you must (i) include the Prohibited Use Policy referenced in Section 2.3. as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of the Model or Derivative Model and you must provide notice to subsequent users to whom you distribute that the Model or Derivative Models are subject to the use restrictions in Section 2.3., (ii) provide all third party recipients of the Model or Derivative Models a copy of this Agreement, (iii) cause any modified files to carry prominent notices stating that you modified the files; (iv) include the following attribution notice within a “Notice” text file distributed as part of such copies: “HyperCLOVA X SEED 8B Omni Model is licensed under the HyperCLOVA X SEED 8B Omni Model License Agreement, Copyright © NAVER Corp. All Rights Reserved.”, and (v) prominently display “Powered by HyperCLOVA X” on a related website, user interface, blogpost, about page, or product documentation. If you use the Model or any Outputs of the Model to create, train, fine-tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “HyperCLOVA X” at the beginning of any such AI model name.
43
+ 3.2. You may add your own copyright statement to your modifications and, except as set forth in this Section, may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such Derivative Models as a whole, provided your use, reproduction, and distribution of the Model or Derivative Models otherwise comply with the terms and conditions stated in this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
44
+
45
+ 4. Additional Commercial Terms. If (i) as of the Model Release Date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s Affiliates, is greater than 10 million monthly active users in the preceding calendar month, or (ii) the Licensee or its Affiliate distributes or makes available any product or service, which is substantially similar to or directly competes with any product and service provided by NAVER, then the Licensee must request a license from NAVER. Such a license may be granted by NAVER at its sole discretion, and the Licensee is not authorized to exercise any rights under this Agreement unless and until NAVER expressly grants you such rights.
46
+
47
+ 5. Generated Output. NAVER claims no rights in Outputs you generate using the Model. You are solely responsible for Outputs and their subsequent uses.
48
+
49
+ 6. DISCLAIMER OF WARRANTY. UNLESS REQUIRED BY APPLICABLE LAW, THE MODEL AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND NAVER DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS, OUTPUTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL AND ANY OUTPUTS AND RESULTS AND YOUR EXERCISE OF PERMISSION UNDER THIS AGREEMENT.
50
+
51
+ 7. LIMITATION OF LIABILITY. IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS IN CASES OF DELIBERATE AND GROSSLY NEGLIGENT ACTS), WILL NAVER BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND, ARISING FROM OR RELATED TO THIS AGREEMENT, OR RESULTING FROM THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS OR, OUTPUTS (INCLUDING, BUT NOT LIMITED TO, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGES, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF NAVER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
52
+
53
+ 8. Indemnity. You will indemnify and hold harmless NAVER from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Model or Outputs.
54
+
55
+ 9. Intellectual Property.
56
+
57
+ 9.1. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NAVER, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the “Notice” text file.
58
+
59
+ 9.2. NAVER Corp. owns the Model and any Derivative Model created by NAVER Corp. Except as expressly granted in this Agreement, NAVER Corp. reserves all rights, interests and remedies in connection with the Model and Derivative Model created by NAVER Corp. and no other license or right is granted to you by implication, estoppel or otherwise. Subject to NAVER Corp.’s ownership of the Model and any Derivative Model made by or for NAVER Corp., with respect to any derivative works and modifications of the Model that are made by you, as between you and NAVER Corp., you are and will be the owner of such derivative works and modifications.
60
+
61
+ 10. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Model and will continue in full force and effect until terminated in accordance with the terms and conditions of this Agreement. NAVER may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Model and Derivative Model. Section 5, 6, 7 and 10 shall survive the termination of this Agreement.
62
+
63
+ 11. Governing Law and Jurisdiction.
64
+
65
+ 11.1. This Agreement will be governed by and construed in accordance with the laws of the Republic of Korea, without regard to its conflicts of laws principles.
66
+
67
+ 11.2. Any disputes, controversies, or claims arising out of or relating to this Agreement, including its existence, validity, interpretation, performance, breach, or termination, shall be referred to and finally resolved by arbitration administered by the Korean Commercial Arbitration Board (KCAB) in accordance with the International Arbitration Rules of the Korean Commercial Arbitration Board in force at the time of the commencement of the arbitration. The seat of arbitration shall be Seoul, Republic of Korea. The tribunal shall consist of one arbitrator. The language of the arbitration shall be English. Either party may seek interim or provisional relief from a court of competent jurisdiction and doing so shall not be considered a waiver of any provision in this section. The arbitral tribunal also has the authority to issue orders for interim or provisional relief.
68
+
69
+ 12. Modifications. NAVER reserves the right to modify or amend this Agreement at any time, in its sole discretion. Any modifications will be effective upon posting the updated Agreement on our website or through other means of communication. You are responsible for reviewing the Agreement periodically for changes.
70
+
71
+ 13. No Waiver. NAVER will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
72
+
73
+
74
+
75
+ Addendum A – Acceptable Use Policy
76
+
77
+ NAVER is committed to promoting safe and responsible use of its AI technologies, including the HyperCLOVA X SEED 8B Omni Model (the “Model”). By accessing or using the Model and Derivative Model (as defined in the Model License Agreement) (the Model and Derivative Model are collectively referred to as the “Models”), you agree to this Acceptable Use Policy (“Policy”).
78
+
79
+ We want everyone to use the Models safely, legally, and ethically. You agree that you will not use, or allow others to use, the Models to:
80
+
81
+ 1. Violate applicable laws or the rights of others, including by:
82
+ a. Engaging in, promoting, contributing to, encouraging, planning, inciting, or furthering illegal or unlawful activity or content, such as:
83
+ - Violence or terrorism
84
+ - Exploitation or harm to children, including the creation or dissemination of child exploitative content
85
+ - Human trafficking, exploitation, or sexual violence
86
+ - The unlawful distribution of obscene or harmful material to minors, or failure to apply legally required age restrictions
87
+ - Sexual solicitation or sexually exploitative behavior
88
+ - Any other criminal activity
89
+ b. Engaging in, promoting, inciting, or facilitating the harassment, abuse, threatening, or bullying of individuals or groups
90
+ c. Engaging in, promoting, inciting, or facilitating discrimination or other unlawful or harmful conduct in the provision of employment, credit, housing, or access to essential goods and services
91
+ d. Providing unauthorized or unlicensed professional services, including but not limited to financial, legal, medical/health, or related services
92
+ e. Collecting, processing, disclosing, generating, or inferring private or sensitive personal information, including identity, health, or demographic data, unless lawfully permitted under applicable laws
93
+ f. Infringing, misappropriating, or otherwise violating third-party rights, including through the generation or use of outputs derived from the Models
94
+ g. Creating, generating, or facilitating malicious code, malware, or computer viruses, or interfering with the functioning, security, or integrity of a website, application, or system
95
+ h. Intentionally bypassing or disabling usage restrictions, safety measures, or access controls imposed by NAVER
96
+
97
+ 2. Engage in or promote use cases that may pose a risk of death, bodily harm, or significant safety hazard to individuals, including use of the Models in connection with:
98
+ a. Military, warfare, nuclear technology or espionage
99
+ b. The development or distribution of firearms or illegal weapons
100
+ c. Illegal drugs or regulated controlled substances
101
+ d. Operation of critical infrastructure, transportation systems, or heavy machinery
102
+ e. Content promoting self-harm, including suicide, or eating disorders
103
+ f. Any other use intended to incite or cause physical harm
104
+
105
+ 3. Intentionally deceive or mislead others, including by:
106
+ a. Generating, promoting, or disseminating fraudulent or misleading content
107
+ b. Creating or sharing defamatory content
108
+ c. Generating or distributing spam
109
+ d. Impersonating another individual or entity without proper authorization
110
+ e. Representing Model output as human-generated
111
+ f. Generating or enabling fake online engagement, such as fake reviews or fake users
112
+
113
+ 4. Fail to disclose to end users any known risks or limitations of an AI system that incorporates the Models.
114
+
115
+ 5. Use the Models in conjunction with third-party tools, models, or software designed to generate unlawful content or conduct, or falsely represent outputs from such tools as associated with NAVER or HyperCLOVA X.
116
+
117
+ If you become aware of a violation of this Policy, a bug, or any behavior that could result in a breach of this Policy, please report it to us:
118
+
119
+ Reporting risky outputs: [email protected]
120
+ Reporting policy violations or unauthorized use: [email protected]
121
+
122
+
README.md ADDED
@@ -0,0 +1,459 @@
1
+ ---
2
+ license: other
3
+ license_name: hyperclovax
4
+ license_link: LICENSE
5
+ library_name: transformers
6
+ ---
7
+
8
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/3gaPG3_F4Fxn-SOZWrmfU.png)
9
+
10
+ # Overview
11
+ HyperCLOVA X SEED 8B Omni is a unified multimodal model that brings text, vision, and speech together, based on an auto-regressive Transformer architecture, enabling consistent multimodal understanding and generation. SEED 8B Omni aligns textual, visual, and audio representations in a shared semantic space and supports bidirectional interactions across modalities, including established text capabilities as well as vision–language QA, text-to-image generation and editing, speech recognition and translation, and text-to-speech, within a 32K context window. As an early pathfinding milestone of HyperCLOVA X toward **Any-to-Any-Korean-First** intelligence, SEED 8B Omni serves as a practical exploration of unified multimodal modeling and provides a reference point for future development and scaling.
12
+
13
+ ---
14
+
15
+ # Basic Information
16
+
17
+ - **Architecture**: Transformer-based omni-model architecture (Dense Model)
18
+ - **Parameters**: 8B
19
+ - **Input Format**: Text/Image/Video/Audio(Speech)
20
+ - **Output Format**: Text/Image/Audio(Speech)
21
+ - **Context Length**: 32K
22
+ - **Knowledge Cutoff**: May 2025
23
+
24
+ ---
25
+
26
+ # Benchmarks
27
+ ![Technical Report 05_2@2x](https://cdn-uploads.huggingface.co/production/uploads/646acf46086023e36edce4c4/x1IvD9Rt_NK71CklecpN2.png)
28
+
29
+
30
+ - **Text-to-Text**: MMLU-Pro, GSM8K, KMMLU-Pro, HAERAE 1.0
31
+ - **Vision-to-Text**: SEED-IMG, AI2D, K-MMBench
32
+ - **Text-to-Vision**: GenEval, ImgEdit
33
+ - **Audio-to-Text**: Librispeech, Ksponspeech
34
+ - **Audio-to-Audio**: Fleurs en2ko, Fleurs ko2en
35
+
36
+ ---
37
+
38
+ # Examples
39
+ ## Text-to-Image Generation
40
+ ![hf_img01](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/6fRekMbt_9ab5I80GTkdG.png)
41
+ ## Text-based Image Editing
42
+ ![hf_img02](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/aoecU357A0fVvR8uerozh.png)
43
+ ![hf_img03](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/0fpcq--rj1kqPa9m8DYgt.png)
44
+ ![hf_img04](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/Z24JUQZSmeaVNrhDMYG6K.png)
45
+
46
+ ---
47
+
48
+ # Inference
49
+
50
+ We provide [OmniServe](https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe), a production-ready multimodal inference system with an OpenAI-compatible API.
51
+
52
+ ## Capabilities
53
+
54
+ - **Inputs**: Text, Image, Audio, Video
55
+ - **Outputs**: Text, Image, Audio (no video generation)
56
+
57
+ ## Requirements
58
+
59
+ - 4x NVIDIA A100 80GB
60
+ - Docker & Docker Compose
61
+ - NVIDIA Driver 525+, CUDA 12.1+
62
+ - S3-compatible storage (for image/audio output)
63
+
64
+ ## Installation
65
+
66
+ ```bash
67
+ # Clone OmniServe
68
+ git clone https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe.git
69
+ cd OmniServe
70
+
71
+ # Install dependencies
72
+ pip install huggingface_hub safetensors torch openai easydict
73
+
74
+ # Download model (~16GB)
75
+ huggingface-cli download naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B \
76
+ --local-dir ./models/HyperCLOVAX-SEED-Omni-8B
77
+
78
+ # Convert model to component format
79
+ python convert_model.py \
80
+ --input ./models/HyperCLOVAX-SEED-Omni-8B \
81
+ --output ./track_b \
82
+ --track b
83
+
84
+ # Configure environment
85
+ cp .env.example .env
86
+ # Edit .env with model paths and S3 credentials
87
+
88
+ # Build and run (Track B only - OMNI model)
89
+ docker compose --profile track-b build
90
+ docker compose --profile track-b up -d
91
+
92
+ # Wait for model loading (~5 minutes)
93
+ docker compose logs -f omni
94
+
95
+ # Note: To run both VLM and OMNI models together:
96
+ # docker compose --profile track-a --profile track-b up -d
97
+ ```
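Once the containers are up, the model can take several minutes to finish loading (see the log-follow step above). The snippet below is an illustrative sketch, not part of OmniServe: it simply polls the OpenAI-compatible endpoint used throughout this README until it starts answering. The base URL, model name, and `skip_reasoning` flag are the same ones used in the examples that follow.

```python
import time

from openai import OpenAI

# Same endpoint and model name as in the usage examples below.
client = OpenAI(base_url="http://localhost:8000/b/v1", api_key="not-needed")


def wait_until_ready(retries: int = 20, delay_s: float = 15.0) -> bool:
    """Poll the chat endpoint until the model responds, or give up."""
    for attempt in range(1, retries + 1):
        try:
            client.chat.completions.create(
                model="track_b_model",
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=8,
                extra_body={"chat_template_kwargs": {"skip_reasoning": True}},
            )
            return True  # the server answered, so the model is loaded
        except Exception as exc:  # connection refused / 5xx while still loading
            print(f"[{attempt}/{retries}] not ready yet: {exc}")
            time.sleep(delay_s)
    return False


if __name__ == "__main__":
    print("ready" if wait_until_ready() else "gave up waiting")
```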
98
+
99
+ ## Basic Usage
100
+
101
+ ```python
102
+ from openai import OpenAI
103
+
104
+ client = OpenAI(
105
+ base_url="http://localhost:8000/b/v1",
106
+ api_key="not-needed"
107
+ )
108
+
109
+ # Image understanding
110
+ response = client.chat.completions.create(
111
+ model="track_b_model",
112
+ messages=[
113
+ {
114
+ "role": "user",
115
+ "content": [
116
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
117
+ {"type": "text", "text": "What is in this image?"}
118
+ ]
119
+ }
120
+ ],
121
+ max_tokens=256,
122
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
123
+ )
124
+
125
+ print(response.choices[0].message.content)
126
+ ```
127
+
128
+ ## More Examples
129
+
130
+ <details>
131
+ <summary>Text to Image</summary>
132
+
133
+ ```python
134
+ import json
135
+
136
+ SYSTEM_PROMPT = """You are an AI assistant that generates images. When asked to draw or create an image, you MUST use the t2i_model_generation tool to generate the image. Always respond by calling the tool."""
137
+
138
+ response = client.chat.completions.create(
139
+ model="track_b_model",
140
+ messages=[
141
+ {"role": "system", "content": SYSTEM_PROMPT},
142
+ {"role": "user", "content": "Draw a sunset over mountains"}
143
+ ],
144
+ tools=[{
145
+ "type": "function",
146
+ "function": {
147
+ "name": "t2i_model_generation",
148
+ "description": "Generates an RGB image based on the provided discrete image representation.",
149
+ "parameters": {
150
+ "type": "object",
151
+ "required": ["discrete_image_token"],
152
+ "properties": {
153
+ "discrete_image_token": {
154
+ "type": "string",
155
+ "description": "A serialized string of discrete vision tokens, encapsulated by special tokens. The format must be strictly followed: <|discrete_image_start|><|vision_ratio_4:3|><|vision_token|><|visionaaaaa|><|visionbbbbb|>... <|visionzzzzz|><|vision_eol|><|vision_eof|><|discrete_image_end|>.",
156
+ "minLength": 1
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }],
162
+ max_tokens=7000,
163
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
164
+ )
165
+
166
+ if response.choices[0].message.tool_calls:
167
+ args = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
168
+ print(f"Generated image: {args['discrete_image_token']}")
169
+ ```
170
+
171
+ </details>
172
+
173
+ <details>
174
+ <summary>Text to Audio</summary>
175
+
176
+ ```python
177
+ import base64
178
+
179
+ # Prompt should explicitly request speech/audio output
180
+ response = client.chat.completions.create(
181
+ model="track_b_model",
182
+ messages=[{
183
+ "role": "user",
184
+ "content": "Read this text aloud in a cheerful female voice:\nHello! How are you today?"
185
+ }],
186
+ max_tokens=1000,
187
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
188
+ )
189
+
190
+ if response.choices[0].message.audio:
191
+ audio_url = base64.b64decode(response.choices[0].message.audio.data).decode()
192
+ print(f"Generated audio: {audio_url}")
193
+ ```
194
+
195
+ </details>
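The decoded value above is a URL rather than raw audio bytes; generated files are uploaded to the configured S3 bucket (see the Architecture and S3 Configuration sections below). A minimal follow-up sketch for saving the result locally, assuming the decoded value is a plain, publicly reachable HTTPS URL:

```python
import urllib.request

# Placeholder: use the `audio_url` decoded in the Text-to-Audio example above.
audio_url = "https://your-bucket.example.com/generated/output.wav"

# Download the generated file and write it to disk.
with urllib.request.urlopen(audio_url) as resp, open("output.wav", "wb") as out:
    out.write(resp.read())
print("saved output.wav")
```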
196
+
197
+ <details>
198
+ <summary>Audio Input</summary>
199
+
200
+ ```python
201
+ import base64
202
+
203
+ audio_url = "https://example.com/audio.mp3"
204
+ audio_data = base64.b64encode(audio_url.encode()).decode()
205
+
206
+ response = client.chat.completions.create(
207
+ model="track_b_model",
208
+ messages=[
209
+ {
210
+ "role": "user",
211
+ "content": [
212
+ {"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
213
+ {"type": "text", "text": "What is being said?"}
214
+ ]
215
+ }
216
+ ],
217
+ max_tokens=256,
218
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
219
+ )
220
+
221
+ print(response.choices[0].message.content)
222
+ ```
223
+
224
+ </details>
225
+
226
+ <details>
227
+ <summary>Video Input</summary>
228
+
229
+ ```python
230
+ response = client.chat.completions.create(
231
+ model="track_b_model",
232
+ messages=[
233
+ {
234
+ "role": "user",
235
+ "content": [
236
+ {"type": "image_url", "image_url": {"url": "https://example.com/video.mp4"}},
237
+ {"type": "text", "text": "Describe this video."}
238
+ ]
239
+ }
240
+ ],
241
+ max_tokens=512,
242
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
243
+ )
244
+
245
+ print(response.choices[0].message.content)
246
+ ```
247
+
248
+ </details>
249
+
250
+ <details>
251
+ <summary>Image to Image</summary>
252
+
253
+ ```python
254
+ import json
255
+
256
+ SYSTEM_PROMPT = """You are an AI assistant that transforms images. When asked to transform, edit, or stylize an image, you MUST use the t2i_model_generation tool to generate the new image. Always respond by calling the tool."""
257
+
258
+ response = client.chat.completions.create(
259
+ model="track_b_model",
260
+ messages=[
261
+ {"role": "system", "content": SYSTEM_PROMPT},
262
+ {
263
+ "role": "user",
264
+ "content": [
265
+ {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
266
+ {"type": "text", "text": "Transform to watercolor style"}
267
+ ]
268
+ }
269
+ ],
270
+ tools=[{
271
+ "type": "function",
272
+ "function": {
273
+ "name": "t2i_model_generation",
274
+ "description": "Generates an RGB image based on the provided discrete image representation.",
275
+ "parameters": {
276
+ "type": "object",
277
+ "required": ["discrete_image_token"],
278
+ "properties": {
279
+ "discrete_image_token": {
280
+ "type": "string",
281
+ "description": "A serialized string of discrete vision tokens, encapsulated by special tokens. The format must be strictly followed: <|discrete_image_start|><|vision_ratio_4:3|><|vision_token|><|visionaaaaa|><|visionbbbbb|>... <|visionzzzzz|><|vision_eol|><|vision_eof|><|discrete_image_end|>.",
282
+ "minLength": 1
283
+ }
284
+ }
285
+ }
286
+ }
287
+ }],
288
+ max_tokens=7000,
289
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
290
+ )
291
+
292
+ if response.choices[0].message.tool_calls:
293
+ args = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
294
+ print(f"Generated image: {args['discrete_image_token']}")
295
+ ```
296
+
297
+ </details>
298
+
299
+ <details>
300
+ <summary>Audio to Audio</summary>
301
+
302
+ ```python
303
+ import base64
304
+
305
+ # Input audio (URL encoded as base64)
306
+ audio_url = "https://example.com/input.mp3"
307
+ audio_data = base64.b64encode(audio_url.encode()).decode()
308
+
309
+ response = client.chat.completions.create(
310
+ model="track_b_model",
311
+ messages=[
312
+ {
313
+ "role": "user",
314
+ "content": [
315
+ {"type": "input_audio", "input_audio": {"data": audio_data, "format": "mp3"}},
316
+ {"type": "text", "text": "Listen to this and respond with speech"}
317
+ ]
318
+ }
319
+ ],
320
+ max_tokens=2000,
321
+ extra_body={"chat_template_kwargs": {"skip_reasoning": True}}
322
+ )
323
+
324
+ if response.choices[0].message.audio:
325
+ audio_url = base64.b64decode(response.choices[0].message.audio.data).decode()
326
+ print(f"Generated audio: {audio_url}")
327
+ ```
328
+
329
+ </details>
330
+
331
+ <details>
332
+ <summary>Using curl</summary>
333
+
334
+ ```bash
335
+ # Image understanding
336
+ curl -X POST http://localhost:8000/b/v1/chat/completions \
337
+ -H "Content-Type: application/json" \
338
+ -d '{
339
+ "model": "track_b_model",
340
+ "messages": [{"role": "user", "content": [
341
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
342
+ {"type": "text", "text": "Describe this image."}
343
+ ]}],
344
+ "max_tokens": 256,
345
+ "extra_body": {"chat_template_kwargs": {"skip_reasoning": true}}
346
+ }'
347
+
348
+ # Text to audio
349
+ curl -X POST http://localhost:8000/b/v1/chat/completions \
350
+ -H "Content-Type: application/json" \
351
+ -d '{
352
+ "model": "track_b_model",
353
+ "messages": [{"role": "user", "content": "Say hello"}],
354
+ "max_tokens": 1000,
355
+ "extra_body": {"chat_template_kwargs": {"skip_reasoning": true}}
356
+ }'
357
+ ```
358
+
359
+ </details>
360
+
361
+
362
+ ## Architecture
363
+
364
+ ```
365
+ User Request
366
+ (Image/Audio/Video/Text)
367
+
368
+
369
+ ┌─────────────────────────────────────────────────────────────────────────┐
370
+ │ OmniServe │
371
+ │ POST /b/v1/chat/completions │
372
+ │ │
373
+ │ ┌──────────────────────────────────────────────────────────────────┐ │
374
+ │ │ [1] INPUT ENCODING │ │
375
+ │ │ │ │
376
+ │ │ ┌─────────────────┐ ┌─────────────────┐ │ │
377
+ │ │ │ Vision Encoder │ │ Audio Encoder │ │ │
378
+ │ │ └────────┬────────┘ └────────┬────────┘ │ │
379
+ │ │ │ │ │ │
380
+ │ │ └────────────┬────────────────────┘ │ │
381
+ │ │ │ embeddings │ │
382
+ │ └──────────────────────────┼───────────────────────────────────────┘ │
383
+ │ ▼ │
384
+ │ ┌──────────────┐ │
385
+ │ │ LLM (8B) │◀──── text │
386
+ │ └──────┬───────┘ │
387
+ │ │ │
388
+ │ ┌─────────────────────────┼────────────────────────────────────────┐ │
389
+ │ │ [2] OUTPUT DECODING │ │
390
+ │ │ │ │ │
391
+ │ │ ┌──────────────┼──────────────┐ │ │
392
+ │ │ ▼ ▼ ▼ │ │
393
+ │ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
394
+ │ │ │ Text │ │ Vision │ │ Audio │ │ │
395
+ │ │ │ │ │ Decoder │ │ Decoder │ │ │
396
+ │ │ └───────────┘ └─────┬─────┘ └─────┬─────┘ │ │
397
+ │ │ │ │ │ │
398
+ │ │ ▼ ▼ │ │
399
+ │ │ Image URL Audio URL │ │
400
+ │ │ (S3) (S3) │ │
401
+ │ └──────────────────────────────────────────────────────────────────┘ │
402
+ │ │
403
+ └─────────────────────────────────────────────────────────────────────────┘
404
+
405
+
406
+ Response
407
+ (Text / Image URL / Audio URL)
408
+ ```
409
+
410
+ ## Hardware Requirements
411
+
412
+ | Component | GPU | VRAM |
413
+ |-----------|-----|------|
414
+ | Vision Encoder | 1x | ~8GB |
415
+ | Audio Encoder | (shared) | ~4GB |
416
+ | LLM (8B) | 1x | ~16GB |
417
+ | Vision Decoder | 1x | ~16GB |
418
+ | Audio Decoder | (shared) | ~4GB |
419
+ | **Total** | **3x** | **~48GB** |
420
+
421
+ ## Key Parameters
422
+
423
+ | Parameter | Description | Default |
424
+ |-----------|-------------|---------|
425
+ | `chat_template_kwargs.skip_reasoning` | Skip the reasoning (`<think>`) block in the output | `true` |
426
+ | `max_tokens` | Max output tokens | - |
427
+ | `temperature` | Sampling temperature | 0.7 |
428
+ | `tools` | Required for image generation | - |
429
+
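As a consolidated illustration (not an additional API surface), the request below combines the parameters from the table, using the same endpoint and model name as the earlier examples:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/b/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="track_b_model",
    messages=[{"role": "user", "content": "Give a two-sentence summary of what an omni-modal model is."}],
    max_tokens=256,   # cap on generated tokens
    temperature=0.7,  # default sampling temperature from the table
    # skip_reasoning skips the <think> block; a `tools` list would be added here for image generation
    extra_body={"chat_template_kwargs": {"skip_reasoning": True}},
)
print(response.choices[0].message.content)
```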
430
+ ## S3 Configuration
431
+
432
+ Required for image/audio generation:
433
+
434
+ ```bash
435
+ NCP_S3_ENDPOINT=https://your-s3-endpoint.com
436
+ NCP_S3_ACCESS_KEY=your-access-key
437
+ NCP_S3_SECRET_KEY=your-secret-key
438
+ NCP_S3_BUCKET_NAME=your-bucket-name
439
+ ```
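To sanity-check these credentials before starting the stack, here is a small sketch using `boto3` against the S3-compatible endpoint (boto3 is an assumption here, not an OmniServe dependency; any S3-compatible client should work):

```python
import os

import boto3

# Read the same variables that OmniServe expects in .env
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["NCP_S3_ENDPOINT"],
    aws_access_key_id=os.environ["NCP_S3_ACCESS_KEY"],
    aws_secret_access_key=os.environ["NCP_S3_SECRET_KEY"],
)

bucket = os.environ["NCP_S3_BUCKET_NAME"]
# head_bucket raises an error if the bucket is missing or the credentials are wrong.
s3.head_bucket(Bucket=bucket)
print(f"bucket '{bucket}' is reachable")
```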
440
+
441
+ For more details, see [OmniServe documentation](https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe).
442
+
443
+
444
+ ---
445
+
446
+ # Citation
447
+ TBU (Technical Report)
448
+
449
+ ---
450
+
451
+ # Questions
452
+ For any other questions, please feel free to contact us at [email protected].
453
+
454
+
455
+ ---
456
+
457
+
458
+ # License
459
+ The model is licensed under the [HyperCLOVA X SEED 8B Omni Model License Agreement](./LICENSE).
chat_template.jinja ADDED
@@ -0,0 +1,173 @@
1
+ {%- set ns_img = namespace(count=0) %}
2
+ {%- set ns_aud = namespace(count=0) %}
3
+ {%- set ns_vid = namespace(count=0) %}
4
+ {%- if tools %}
5
+ {{- '<|im_start|>system\n' }}
6
+ {%- if messages[0].role == 'system' %}
7
+ {%- if messages[0].content is string %}
8
+ {{- messages[0].content + '\n\n' }}
9
+ {%- elif messages[0].content is sequence %}
10
+ {%- for part in messages[0].content %}
11
+ {%- if part.type == 'text' %}
12
+ {{- part.text }}
13
+ {%- endif %}
14
+ {%- endfor %}
15
+ {{- '\n\n' }}
16
+ {%- endif %}
17
+ {%- endif %}
18
+ {{- '# Tools\n\n' }}
19
+ {{- 'You may call one or more functions to assist with the user query.\n\n' }}
20
+ {{- 'You are provided with function signatures within <tools></tools> XML tags:\n' }}
21
+ {{- '<tools>\n' }}
22
+ {%- for tool in tools %}
23
+ {{- tool | tojson }}
24
+ {%- endfor %}
25
+ {{- '\n</tools>\n\n' }}
26
+ {{- 'For each function call, output the function name and arguments within the following XML format:\n' }}
27
+ {{- '<tool_call>{function-name}\n' }}
28
+ {{- '<arg_key>{arg-key-1}</arg_key>\n' }}
29
+ {{- '<arg_value>{arg-value-1}</arg_value>\n' }}
30
+ {{- '<arg_key>{arg-key-2}</arg_key>\n' }}
31
+ {{- '<arg_value>{arg-value-2}</arg_value>\n' }}
32
+ {{- '...\n' }}
33
+ {{- '</tool_call><|im_end|>\n' }}
34
+ {%- else %}
35
+ {%- if messages[0].role == 'system' %}
36
+ {{- '<|im_start|>system\n' }}
37
+ {%- if messages[0].content is string %}
38
+ {{- messages[0].content }}
39
+ {%- elif messages[0].content is sequence %}
40
+ {%- for part in messages[0].content %}
41
+ {%- if part.type == 'text' %}
42
+ {{- part.text }}
43
+ {%- endif %}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- endif %}
48
+ {%- endif %}
49
+ {%- set ns = namespace(last_user_index=-1) %}
50
+ {%- for m in messages %}
51
+ {%- if m.role == 'user' %}
52
+ {%- set ns.last_user_index = loop.index0 %}
53
+ {%- endif %}
54
+ {%- endfor %}
55
+ {%- for message in messages %}
56
+ {%- set content = message.content %}
57
+ {%- if (message.role == 'system' and not loop.first) %}
58
+ {{- '<|im_start|>' + message.role + '\n' }}
59
+ {%- if content is string %}
60
+ {{- content }}
61
+ {%- elif content is sequence %}
62
+ {%- for part in content %}
63
+ {%- if part.type == 'text' %}
64
+ {{- part.text }}
65
+ {%- endif %}
66
+ {%- endfor %}
67
+ {%- endif %}
68
+ {{- '<|im_end|>' + '\n' }}
69
+ {%- elif message.role == 'user' %}
70
+ {{- '<|im_start|>user\n' }}
71
+ {%- if message['content'] is string %}
72
+ {{- message['content'] + '<|im_end|>\n' }}
73
+ {%- elif message['content'] is sequence %}
74
+ {%- for content in message['content'] %}
75
+ {%- if not loop.first %}
76
+ {{- '\n' }}
77
+ {%- endif %}
78
+ {%- if content['type'] == 'image_url' %}
79
+ {%- set media_url = content.get('image_url', {}).get('url', '') %}
80
+ {%- set url_lower = media_url.lower() %}
81
+ {%- set image_extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".svg"] %}
82
+ {%- set video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv", ".m4v"] %}
83
+ {%- set ns_check = namespace(is_video=False) %}
84
+ {%- for ext in video_extensions %}
85
+ {%- if url_lower.endswith(ext) %}
86
+ {%- set ns_check.is_video = True %}
87
+ {%- endif %}
88
+ {%- endfor %}
89
+ {%- if ns_check.is_video %}
90
+ {%- set video_id = 'video_%02d' % ns_vid.count %}
91
+ {%- set ns_vid.count = ns_vid.count + 1 %}
92
+ {{- '<|mime_start|>{"id": "' + video_id + '", "type": "video/mp4", "filename": "video.mp4"}<|mime_end|>\n' }}
93
+ {{- '<|video_aux_start|>다음 중 video_duration은 비디오 길이 정보입니다. 참고하여 답변하세요. {"video_duration": "<|video_meta_duration|>"}<|video_aux_end|>\n' }}
94
+ {{- '<|video_start|><|VIDEO_PAD|><|video_end|>' }}
95
+ {%- else %}
96
+ {%- set image_id = 'image_%02d' % ns_img.count %}
97
+ {%- set ns_img.count = ns_img.count + 1 %}
98
+ {{- '<|mime_start|>{"id": "' + image_id + '", "type": "image/jpeg", "filename": "image.jpg"}<|mime_end|>\n' }}
99
+ {{- '<|discrete_image_start|><|DISCRETE_IMAGE_PAD|><|discrete_image_end|>\n' }}
100
+ {{- '<|image_start|><|IMAGE_PAD|><|image_end|>' }}
101
+ {%- endif %}
102
+ {%- elif content['type'] == 'input_audio' %}
103
+ {%- set audio_id = 'audio_%02d' % ns_aud.count %}
104
+ {%- set ns_aud.count = ns_aud.count + 1 %}
105
+ {%- set input_audio = content.get('input_audio', {}) %}
106
+ {{- '<|mime_start|>{"id": "' + audio_id + '", "type": "audio/mpeg", "filename": "user_query.wav"}<|mime_end|>\n' }}
107
+ {{- '<|audio_aux_start|>다음 중 audio_duration은 오디오 길이 정보입니다. 참고하여 답변하세요. {"audio_duration": "<|audio_meta_duration|>"}<|audio_aux_end|>\n'}}
108
+ {{- '<|discrete_audio_start|><|DISCRETE_AUDIO_PAD|><|discrete_audio_end|>\n'}}
109
+ {{- '<|audio_start|><|AUDIO_PAD|><|audio_end|>'}}
110
+ {%- elif content['type'] == 'text' %}
111
+ {{- content['text'] }}
112
+ {%- endif %}
113
+ {%- endfor %}
114
+ {{- '<|im_end|>\n'}}
115
+ {%- endif %}
116
+ {%- elif message.role == 'assistant' %}
117
+ {%- set reasoning_content = '' %}
118
+ {%- if message.get('reasoning_content') is string %}
119
+ {%- set reasoning_content = message.get('reasoning_content') %}
120
+ {%- else %}
121
+ {%- if '</think>' in content %}
122
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
123
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
124
+ {%- endif %}
125
+ {%- endif %}
126
+ {%- if loop.index0 > ns.last_user_index %}
127
+ {%- if loop.last or reasoning_content %}
128
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' }}
129
+ {%- else %}
130
+ {{- '<|im_start|>' + message.role + '\n' }}
131
+ {%- endif %}
132
+ {%- else %}
133
+ {{- '<|im_start|>' + message.role + '\n' }}
134
+ {%- endif %}
135
+ {{- content }}
136
+ {%- if message.get('tool_calls') %}
137
+ {%- for tool_call in message.get('tool_calls', []) %}
138
+ {%- if not loop.first or content %}
139
+ {{- '\n' }}
140
+ {%- endif %}
141
+ {%- if tool_call.function %}
142
+ {%- set tool_call = tool_call.function %}
143
+ {%- endif %}
144
+ {{- '<tool_call>' + tool_call.name + '\n' }}
145
+ {%- set _args = tool_call.arguments %}
146
+ {%- for k, v in _args.items() %}
147
+ {{- '<arg_key>' + k + '</arg_key>\n' }}
148
+ {{- '<arg_value>' + (v | tojson if v is not string else v) + '</arg_value>\n' }}
149
+ {%- endfor %}
150
+ {{- '</tool_call>' }}
151
+ {%- endfor %}
152
+ {%- endif %}
153
+ {{- '<|im_end|>\n' }}
154
+ {%- elif message.role == 'tool' %}
155
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
156
+ {{- '<|im_start|>tool' }}
157
+ {%- endif %}
158
+ {{- '\n<tool_response>' + message.get('name', '') + '\n' }}
159
+ {%- if message['content'] is string %}
160
+ {{- content }}
161
+ {%- endif %}
162
+ {{- '\n</tool_response>' }}
163
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
164
+ {{- '<|im_end|>\n' }}
165
+ {%- endif %}
166
+ {%- endif %}
167
+ {%- endfor %}
168
+ {%- if add_generation_prompt %}
169
+ {{- '<|im_start|>assistant\n<think>\n' }}
170
+ {%- if skip_reasoning is defined and skip_reasoning is true %}
171
+ {{- '\n</think>\n\n' }}
172
+ {%- endif %}
173
+ {%- endif %}
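For reference, a minimal sketch of rendering this chat template offline with `transformers`. It assumes the tokenizer in this repository bundles the template and that you are willing to pass `trust_remote_code=True`; the repo id is the one used in the README, and `skip_reasoning` is the template variable exercised by the README examples.

```python
from transformers import AutoTokenizer

# Load the tokenizer (and its bundled chat template) from this repository.
tok = AutoTokenizer.from_pretrained(
    "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", trust_remote_code=True
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# skip_reasoning=True makes the template close the <think> block immediately,
# matching the chat_template_kwargs used in the README examples.
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    skip_reasoning=True,
)
print(prompt)
```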
config.json ADDED
@@ -0,0 +1,320 @@
1
+ {
2
+ "anyres": false,
3
+ "architectures": [
4
+ "HCXVisionV2ForCausalLM"
5
+ ],
6
+ "audio_config": {
7
+ "activation_dropout": 0.0,
8
+ "activation_function": "gelu",
9
+ "add_cross_attention": false,
10
+ "architectures": [
11
+ "Qwen2AudioEncoder"
12
+ ],
13
+ "attention_dropout": 0.0,
14
+ "bad_words_ids": null,
15
+ "begin_suppress_tokens": null,
16
+ "bos_token_id": null,
17
+ "chunk_size_feed_forward": 0,
18
+ "cross_attention_hidden_size": null,
19
+ "d_model": 1280,
20
+ "decoder_start_token_id": null,
21
+ "diversity_penalty": 0.0,
22
+ "do_sample": false,
23
+ "dropout": 0.0,
24
+ "early_stopping": false,
25
+ "encoder_attention_heads": 20,
26
+ "encoder_ffn_dim": 5120,
27
+ "encoder_layerdrop": 0.0,
28
+ "encoder_layers": 32,
29
+ "encoder_no_repeat_ngram_size": 0,
30
+ "eos_token_id": null,
31
+ "exponential_decay_length_penalty": null,
32
+ "finetuning_task": null,
33
+ "forced_bos_token_id": null,
34
+ "forced_eos_token_id": null,
35
+ "id2label": {
36
+ "0": "LABEL_0",
37
+ "1": "LABEL_1"
38
+ },
39
+ "init_std": 0.02,
40
+ "initializer_range": 0.02,
41
+ "is_decoder": false,
42
+ "is_encoder_decoder": false,
43
+ "label2id": {
44
+ "LABEL_0": 0,
45
+ "LABEL_1": 1
46
+ },
47
+ "length_penalty": 1.0,
48
+ "max_length": 20,
49
+ "max_source_positions": 1500,
50
+ "min_length": 0,
51
+ "model_type": "qwen2_audio_encoder",
52
+ "no_repeat_ngram_size": 0,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_hidden_layers": 32,
56
+ "num_mel_bins": 128,
57
+ "num_return_sequences": 1,
58
+ "output_attentions": false,
59
+ "output_hidden_states": false,
60
+ "output_scores": false,
61
+ "pad_token_id": null,
62
+ "prefix": null,
63
+ "problem_type": null,
64
+ "pruned_heads": {},
65
+ "remove_invalid_values": false,
66
+ "repetition_penalty": 1.0,
67
+ "return_dict": true,
68
+ "return_dict_in_generate": false,
69
+ "scale_embedding": false,
70
+ "sep_token_id": null,
71
+ "suppress_tokens": null,
72
+ "task_specific_params": null,
73
+ "temperature": 1.0,
74
+ "tf_legacy_loss": false,
75
+ "tie_encoder_decoder": false,
76
+ "tie_word_embeddings": true,
77
+ "tokenizer_class": null,
78
+ "top_k": 50,
79
+ "top_p": 1.0,
80
+ "torch_dtype": "float32",
81
+ "torchscript": false,
82
+ "typical_p": 1.0,
83
+ "use_bfloat16": false
84
+ },
85
+ "audio_model_name_or_path": null,
86
+ "audio_projector_type": "mlp",
87
+ "audio_start_id": 128071,
88
+ "audio_token_id": 128071,
89
+ "auto_map": {
90
+ "AutoConfig": "configuration_vlm.HCXVisionConfig",
91
+ "AutoModelForCausalLM": "modeling_vlm.HCXVisionForCausalLM",
92
+ "AutoModelForSequenceClassification": "modeling_vlm.HCXVisionForSequenceClassification"
93
+ },
94
+ "discrete_audio_config": {
95
+ "model_name_or_path": null,
96
+ "model_type": "cosyvoice2",
97
+ "torch_dtype": "float32"
98
+ },
99
+ "discrete_audio_model_name_or_path": null,
100
+ "discrete_audio_start_id": 128074,
101
+ "discrete_audio_token_id": 128074,
102
+ "discrete_audio_unit_0_id": 128606,
103
+ "discrete_image_start_id": 128069,
104
+ "discrete_image_token_id": 128069,
105
+ "discrete_image_unit_0_id": 135168,
106
+ "discrete_vision_config": {
107
+ "model_name_or_path": null,
108
+ "model_type": "ta_tok",
109
+ "torch_dtype": "float32"
110
+ },
111
+ "discrete_vision_model_name_or_path": null,
112
+ "end_token_id": 128001,
113
+ "eos_token_id": 128001,
114
+ "freeze_audio_projector": true,
115
+ "freeze_before_sampler": false,
116
+ "freeze_decoder": false,
117
+ "freeze_encoder": true,
118
+ "freeze_mm_projector": false,
119
+ "freeze_video_audio_compressor": false,
120
+ "hidden_size": 4096,
121
+ "ignore_index": -100,
122
+ "image_token_id": 128062,
123
+ "img_start_id": 128062,
124
+ "is_safetensor_save": true,
125
+ "max_num_grids": -1,
126
+ "mm_projector_type": "linear",
127
+ "model_type": "vlm",
128
+ "num_queries_vis_abstractor": -1,
129
+ "possible_resolutions": [],
130
+ "proj_pos_emb": true,
131
+ "proj_prenorm": false,
132
+ "q_former_model_name_or_path": null,
133
+ "text_config": {
134
+ "add_cross_attention": false,
135
+ "architectures": [
136
+ "LlamaForCausalLM"
137
+ ],
138
+ "attention_bias": false,
139
+ "attention_dropout": 0.0,
140
+ "bad_words_ids": null,
141
+ "begin_suppress_tokens": null,
142
+ "bos_token_id": 128000,
143
+ "chunk_size_feed_forward": 0,
144
+ "cross_attention_hidden_size": null,
145
+ "decoder_start_token_id": null,
146
+ "diversity_penalty": 0.0,
147
+ "do_sample": false,
148
+ "early_stopping": false,
149
+ "encoder_no_repeat_ngram_size": 0,
150
+ "eos_token_id": 128001,
151
+ "exponential_decay_length_penalty": null,
152
+ "finetuning_task": null,
153
+ "forced_bos_token_id": null,
154
+ "forced_eos_token_id": null,
155
+ "head_dim": 128,
156
+ "hidden_act": "silu",
157
+ "hidden_size": 4096,
158
+ "id2label": {
159
+ "0": "LABEL_0",
160
+ "1": "LABEL_1"
161
+ },
162
+ "initializer_range": 0.02,
163
+ "intermediate_size": 12288,
164
+ "is_decoder": false,
165
+ "is_encoder_decoder": false,
166
+ "label2id": {
167
+ "LABEL_0": 0,
168
+ "LABEL_1": 1
169
+ },
170
+ "length_penalty": 1.0,
171
+ "logits_scaling": 1.0,
172
+ "max_length": 20,
173
+ "max_position_embeddings": 8192,
174
+ "min_length": 0,
175
+ "mlp_bias": false,
176
+ "model_type": "llama",
177
+ "no_repeat_ngram_size": 0,
178
+ "num_attention_heads": 32,
179
+ "num_beam_groups": 1,
180
+ "num_beams": 1,
181
+ "num_hidden_layers": 36,
182
+ "num_key_value_heads": 8,
183
+ "num_return_sequences": 1,
184
+ "output_attentions": false,
185
+ "output_hidden_states": false,
186
+ "output_scores": false,
187
+ "pad_token_id": null,
188
+ "prefix": null,
189
+ "pretraining_tp": 1,
190
+ "problem_type": null,
191
+ "pruned_heads": {},
192
+ "remove_invalid_values": false,
193
+ "repetition_penalty": 1.0,
194
+ "return_dict": true,
195
+ "return_dict_in_generate": false,
196
+ "rms_norm_eps": 1e-06,
197
+ "rope_scaling": null,
198
+ "rope_theta": 5000000,
199
+ "sep_token_id": null,
200
+ "suppress_tokens": null,
201
+ "task_specific_params": null,
202
+ "temperature": 1.0,
203
+ "tf_legacy_loss": false,
204
+ "tie_encoder_decoder": false,
205
+ "tie_word_embeddings": false,
206
+ "tokenizer_class": null,
207
+ "top_k": 50,
208
+ "top_p": 1.0,
209
+ "torch_dtype": "float32",
210
+ "torchscript": false,
211
+ "typical_p": 1.0,
212
+ "use_bfloat16": false,
213
+ "use_cache": true,
214
+ "vocab_size": 200704
215
+ },
216
+ "text_model_name_or_path": null,
217
+ "torch_dtype": "float32",
218
+ "transformers_version": "4.52.4",
219
+ "unpad": false,
220
+ "use_1x1_grid": false,
221
+ "use_nth_layer": -2,
222
+ "video_audio_compressor_type": "mambamia",
223
+ "video_audio_start_id": 128070,
224
+ "video_audio_token_id": 128070,
225
+ "video_first_last_frames_slows": null,
226
+ "video_max_num_frames": 120,
227
+ "video_num_queries_fast": null,
228
+ "video_num_queries_slow": null,
229
+ "video_start_id": 128063,
230
+ "video_token_id": 128063,
231
+ "vision_config": {
232
+ "add_cross_attention": false,
233
+ "anyres": false,
234
+ "architectures": [
235
+ "Qwen2_5_VisionTransformerPretrainedModel"
236
+ ],
237
+ "bad_words_ids": null,
238
+ "begin_suppress_tokens": null,
239
+ "bos_token_id": null,
240
+ "chunk_size_feed_forward": 0,
241
+ "cross_attention_hidden_size": null,
242
+ "decoder_start_token_id": null,
243
+ "depth": 32,
244
+ "diversity_penalty": 0.0,
245
+ "do_sample": false,
246
+ "early_stopping": false,
247
+ "encoder_no_repeat_ngram_size": 0,
248
+ "eos_token_id": null,
249
+ "exponential_decay_length_penalty": null,
250
+ "finetuning_task": null,
251
+ "forced_bos_token_id": null,
252
+ "forced_eos_token_id": null,
253
+ "fullatt_block_indexes": [
254
+ 7,
255
+ 15,
256
+ 23,
257
+ 31
258
+ ],
259
+ "hidden_act": "silu",
260
+ "hidden_size": 1280,
261
+ "id2label": {
262
+ "0": "LABEL_0",
263
+ "1": "LABEL_1"
264
+ },
265
+ "in_channels": 3,
266
+ "in_chans": 3,
267
+ "initializer_range": 0.02,
268
+ "intermediate_size": 3456,
269
+ "is_decoder": false,
270
+ "is_encoder_decoder": false,
271
+ "label2id": {
272
+ "LABEL_0": 0,
273
+ "LABEL_1": 1
274
+ },
275
+ "length_penalty": 1.0,
276
+ "max_length": 20,
277
+ "max_num_grids": -1,
278
+ "min_length": 0,
279
+ "model_type": "qwen2_5_vl",
280
+ "no_repeat_ngram_size": 0,
281
+ "num_beam_groups": 1,
282
+ "num_beams": 1,
283
+ "num_heads": 16,
284
+ "num_return_sequences": 1,
285
+ "out_hidden_size": 5120,
286
+ "output_attentions": false,
287
+ "output_hidden_states": false,
288
+ "output_scores": false,
289
+ "pad_token_id": null,
290
+ "patch_size": 14,
291
+ "prefix": null,
292
+ "problem_type": null,
293
+ "pruned_heads": {},
294
+ "remove_invalid_values": false,
295
+ "repetition_penalty": 1.0,
296
+ "return_dict": true,
297
+ "return_dict_in_generate": false,
298
+ "sep_token_id": null,
299
+ "spatial_merge_size": 2,
300
+ "spatial_patch_size": 14,
301
+ "suppress_tokens": null,
302
+ "task_specific_params": null,
303
+ "temperature": 1.0,
304
+ "temporal_patch_size": 2,
305
+ "tf_legacy_loss": false,
306
+ "tie_encoder_decoder": false,
307
+ "tie_word_embeddings": true,
308
+ "tokenizer_class": null,
309
+ "tokens_per_second": 2,
310
+ "top_k": 50,
311
+ "top_p": 1.0,
312
+ "torch_dtype": "float32",
313
+ "torchscript": false,
314
+ "typical_p": 1.0,
315
+ "use_bfloat16": false,
316
+ "window_size": 112
317
+ },
318
+ "vision_input_chunk_size": null,
319
+ "vision_model_name_or_path": null
320
+ }
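The `auto_map` entries above route loading through the custom classes shipped in this repository (`configuration_vlm.HCXVisionConfig`, `modeling_vlm.HCXVisionForCausalLM`), so the config is intended to be loaded with `trust_remote_code=True`. A minimal sketch, using the repo id from the README:

```python
from transformers import AutoConfig

# auto_map in config.json resolves this to configuration_vlm.HCXVisionConfig.
config = AutoConfig.from_pretrained(
    "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", trust_remote_code=True
)

print(type(config).__name__)  # the custom config class from configuration_vlm.py
print(config.model_type)      # "vlm"
print(config.hidden_size)     # 4096 (top-level value above)
```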
configuration_hyperclovax.py ADDED
@@ -0,0 +1,228 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """LLaMA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+
24
+ # from transformers.modeling_rope_utils import rope_config_validation
25
+ # from transformers import PretrainedConfig, rope_config_validation
26
+
27
+
28
+ class HyperCLOVAXConfig(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
31
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
32
+ defaults will yield a similar configuration to that of the LLaMA-7B.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ vocab_size (`int`, *optional*, defaults to 32000):
40
+ Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
41
+ `inputs_ids` passed when calling [`LlamaModel`]
42
+ hidden_size (`int`, *optional*, defaults to 4096):
43
+ Dimension of the hidden representations.
44
+ intermediate_size (`int`, *optional*, defaults to 11008):
45
+ Dimension of the MLP representations.
46
+ num_hidden_layers (`int`, *optional*, defaults to 32):
47
+ Number of hidden layers in the Transformer decoder.
48
+ num_attention_heads (`int`, *optional*, defaults to 32):
49
+ Number of attention heads for each attention layer in the Transformer decoder.
50
+ num_key_value_heads (`int`, *optional*):
51
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
53
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
54
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
55
+ by meanpooling all the original heads within that group. For more details checkout [this
56
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
57
+ `num_attention_heads`.
58
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
+ The non-linear activation function (function or string) in the decoder.
60
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
61
+ The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
62
+ Llama 2 up to 4096, CodeLlama up to 16384.
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
66
+ The epsilon used by the rms normalization layers.
67
+ use_cache (`bool`, *optional*, defaults to `True`):
68
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
69
+ relevant if `config.is_decoder=True`.
70
+ pad_token_id (`int`, *optional*):
71
+ Padding token id.
72
+ bos_token_id (`int`, *optional*, defaults to 1):
73
+ Beginning of stream token id.
74
+ eos_token_id (`int`, *optional*, defaults to 2):
75
+ End of stream token id.
76
+ pretraining_tp (`int`, *optional*, defaults to 1):
77
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
78
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
79
+ understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
80
+ results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
81
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
+ Whether to tie weight embeddings
83
+ rope_theta (`float`, *optional*, defaults to 10000.0):
84
+ The base period of the RoPE embeddings.
85
+ rope_scaling (`Dict`, *optional*):
86
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
87
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
88
+ accordingly.
89
+ Expected contents:
90
+ `rope_type` (`str`):
91
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
92
+ 'llama3'], with 'default' being the original RoPE implementation.
93
+ `factor` (`float`, *optional*):
94
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
95
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
96
+ original maximum pre-trained length.
97
+ `original_max_position_embeddings` (`int`, *optional*):
98
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
99
+ pretraining.
100
+ `attention_factor` (`float`, *optional*):
101
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
102
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
103
+ `factor` field to infer the suggested value.
104
+ `beta_fast` (`float`, *optional*):
105
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
106
+ ramp function. If unspecified, it defaults to 32.
107
+ `beta_slow` (`float`, *optional*):
108
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
109
+ ramp function. If unspecified, it defaults to 1.
110
+ `short_factor` (`List[float]`, *optional*):
111
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
112
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
113
+ size divided by the number of attention heads divided by 2
114
+ `long_factor` (`List[float]`, *optional*):
115
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
116
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
117
+ size divided by the number of attention heads divided by 2
118
+ `low_freq_factor` (`float`, *optional*):
119
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
120
+ `high_freq_factor` (`float`, *optional*):
121
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
122
+ attention_bias (`bool`, *optional*, defaults to `False`):
123
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
124
+ attention_dropout (`float`, *optional*, defaults to 0.0):
125
+ The dropout ratio for the attention probabilities.
126
+ mlp_bias (`bool`, *optional*, defaults to `False`):
127
+ Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
128
+ head_dim (`int`, *optional*):
129
+ The attention head dimension. If None, it will default to hidden_size // num_heads
130
+
131
+ ```python
132
+ >>> from transformers import LlamaModel, LlamaConfig
133
+
134
+ >>> # Initializing a LLaMA llama-7b style configuration
135
+ >>> configuration = LlamaConfig()
136
+
137
+ >>> # Initializing a model from the llama-7b style configuration
138
+ >>> model = LlamaModel(configuration)
139
+
140
+ >>> # Accessing the model configuration
141
+ >>> configuration = model.config
142
+ ```"""
143
+
144
+ model_type = "hyperclovax"
145
+ keys_to_ignore_at_inference = ["past_key_values"]
146
+
147
+ def __init__(
148
+ self,
149
+ vocab_size=32000,
150
+ hidden_size=4096,
151
+ intermediate_size=11008,
152
+ num_hidden_layers=32,
153
+ num_attention_heads=32,
154
+ num_key_value_heads=None,
155
+ hidden_act="silu",
156
+ max_position_embeddings=2048,
157
+ initializer_range=0.02,
158
+ rms_norm_eps=1e-6,
159
+ use_cache=True,
160
+ pad_token_id=None,
161
+ bos_token_id=1,
162
+ eos_token_id=2,
163
+ pretraining_tp=1,
164
+ tie_word_embeddings=False,
165
+ rope_theta=10000.0,
166
+ rope_scaling=None,
167
+ attention_bias=False,
168
+ attention_dropout=0.0,
169
+ mlp_bias=False,
170
+ head_dim=None,
171
+ embedding_multiplier=1.0, # mup
172
+ logits_scaling=1.0, # mup
173
+ attention_multiplier=1.0, # mup
174
+ residual_multiplier=1.0, # mup
175
+ use_post_norm=False, # post-norm
176
+ auto_map={
177
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
178
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
179
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM",
180
+ },
181
+ **kwargs,
182
+ ):
183
+ self.vocab_size = vocab_size
184
+ self.max_position_embeddings = max_position_embeddings
185
+ self.hidden_size = hidden_size
186
+ self.intermediate_size = intermediate_size
187
+ self.num_hidden_layers = num_hidden_layers
188
+ self.num_attention_heads = num_attention_heads
189
+
190
+ # for backward compatibility
191
+ if num_key_value_heads is None:
192
+ num_key_value_heads = num_attention_heads
193
+
194
+ self.num_key_value_heads = num_key_value_heads
195
+ self.hidden_act = hidden_act
196
+ self.initializer_range = initializer_range
197
+ self.rms_norm_eps = rms_norm_eps
198
+ self.pretraining_tp = pretraining_tp
199
+ self.use_cache = use_cache
200
+ self.rope_theta = rope_theta
201
+ self.rope_scaling = rope_scaling
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ self.mlp_bias = mlp_bias
205
+ self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
206
+ # Validate the correctness of rotary position embeddings parameters
207
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
208
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
209
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
210
+ # rope_config_validation(self)
211
+
212
+ # mup
213
+ self.embedding_multiplier = embedding_multiplier
214
+ self.logits_scaling = logits_scaling
215
+ self.attention_multiplier = attention_multiplier
216
+ self.residual_multiplier = residual_multiplier
217
+
218
+ # post-norm (dual-norm)
219
+ self.use_post_norm = use_post_norm
220
+
221
+ super().__init__(
222
+ pad_token_id=pad_token_id,
223
+ bos_token_id=bos_token_id,
224
+ eos_token_id=eos_token_id,
225
+ tie_word_embeddings=tie_word_embeddings,
226
+ auto_map=auto_map,
227
+ **kwargs,
228
+ )
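A minimal construction sketch for the config class above, assuming this file is importable from the working directory; the concrete values are illustrative defaults, not the released checkpoint's settings (those live in config.json):

from configuration_hyperclovax import HyperCLOVAXConfig

# Illustrative values only; the released checkpoint stores its real settings in config.json.
cfg = HyperCLOVAXConfig(
    vocab_size=32000,
    hidden_size=4096,
    num_attention_heads=32,
    num_hidden_layers=32,
    embedding_multiplier=1.0,  # mup scaling knobs exposed by this config
    logits_scaling=1.0,
    attention_multiplier=1.0,
    residual_multiplier=1.0,
    use_post_norm=False,       # dual-norm (post-norm) toggle
)
print(cfg.model_type)  # "hyperclovax"
print(cfg.head_dim)    # hidden_size // num_attention_heads = 128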
configuration_vlm.py ADDED
@@ -0,0 +1,169 @@
1
+ import transformers
2
+ from transformers import AutoConfig, PretrainedConfig
3
+
4
+
5
+ class HCXVisionConfig(PretrainedConfig):
6
+ model_type = "vlm"
7
+ keys_to_ignore_at_inference = ["past_key_values"]
8
+
9
+ def __init__(
10
+ self,
11
+ text_config=None,
12
+ vision_config=None,
13
+ discrete_vision_config=None,
14
+ audio_config=None,
15
+ discrete_audio_config=None,
16
+ text_model_name_or_path=None,
17
+ vision_model_name_or_path=None,
18
+ discrete_vision_model_name_or_path=None,
19
+ audio_model_name_or_path=None,
20
+ discrete_audio_model_name_or_path=None,
21
+ q_former_model_name_or_path=None,
22
+ mm_projector_type="mlp",
23
+ audio_projector_type="mlp",
24
+ video_audio_compressor_type=None,
25
+ use_nth_layer=-2,
26
+ img_start_id=128062, # <|IMAGE_PAD|> # Manually adjusted value from previous checkpoint
27
+ discrete_image_start_id=128250, # <|DISCRETE_AUDIO_PAD|>
28
+ discrete_image_unit_0_id=135166, # <|vision00000|>
29
+ video_start_id=128063, # <|VIDEO_PAD|>
30
+ video_audio_start_id=None, # <|VIDEO_AUDIO_PAD|> - will be set dynamically
31
+ audio_start_id=128253, # <|AUDIO_PAD|>
32
+ discrete_audio_start_id=128250, # <|DISCRETE_AUDIO_PAD|>
33
+ discrete_audio_unit_0_id=128604, # <|audio0000|>
34
+ freeze_encoder=False,
35
+ freeze_decoder=False,
36
+ freeze_mm_projector=False,
37
+ freeze_audio_projector=False,
38
+ freeze_video_audio_compressor=False,
39
+ anyres=False,
40
+ unpad=False,
41
+ max_num_grids=-1,
42
+ num_queries_vis_abstractor=-1,
43
+ video_num_queries_fast=None,
44
+ video_num_queries_slow=None,
45
+ video_first_last_frames_slows=None,
46
+ video_max_num_frames=None,
47
+ ignore_index=-100,
48
+ proj_pos_emb=True,
49
+ proj_prenorm=False,
50
+ use_1x1_grid=False,
51
+ possible_resolutions=[],
52
+ **kwargs,
53
+ ):
54
+ from transformers import CONFIG_MAPPING
55
+
56
+ if kwargs.get("language_config", None) is not None: # for bc
57
+ text_config = CONFIG_MAPPING[kwargs["language_config"]["model_type"]](**kwargs["language_config"])
58
+ elif text_config is None and text_model_name_or_path is not None:
59
+ text_config = AutoConfig.from_pretrained(text_model_name_or_path, trust_remote_code=True)
60
+ if vision_config is None and vision_model_name_or_path is not None:
61
+ vision_config = AutoConfig.from_pretrained(vision_model_name_or_path, trust_remote_code=True)
62
+ if discrete_vision_config is None and discrete_vision_model_name_or_path is not None:
63
+ discrete_vision_config = {
64
+ "model_type": "ta_tok",
65
+ "model_name_or_path": discrete_vision_model_name_or_path,
66
+ }
67
+ if audio_config is None and audio_model_name_or_path is not None:
68
+ audio_config = AutoConfig.from_pretrained(audio_model_name_or_path)
69
+ if discrete_audio_config is None and discrete_audio_model_name_or_path is not None:
70
+ discrete_audio_config = {
71
+ "model_type": "cosyvoice2",
72
+ "model_name_or_path": discrete_audio_model_name_or_path,
73
+ }
74
+
75
+ if isinstance(text_config, dict):
76
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
77
+
78
+ if isinstance(vision_config, dict):
79
+ if vision_config["model_type"] == "qwen2_5_vl":
80
+ vision_config["model_type"] = "qwen2_5_vl_visual"
81
+ assert transformers.__version__ >= "4.52.4", "please upgrade transformers to 4.52.4 or higher"
82
+ vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
83
+
84
+ if isinstance(audio_config, dict):
85
+ audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
86
+
87
+ self.text_config = text_config
88
+ self.vision_config = vision_config
89
+ self.discrete_vision_config = discrete_vision_config
90
+ self.audio_config = audio_config
91
+ self.discrete_audio_config = discrete_audio_config
92
+
93
+ if text_config is not None:
94
+ # DeepSpeed ZeRO-3 sizes its memory partitions automatically from the config's hidden_size.
95
+ self.hidden_size = text_config.hidden_size if hasattr(text_config, "hidden_size") else text_config.n_embd
96
+ # add VLM configs
97
+ self.text_model_name_or_path = text_model_name_or_path
98
+ self.vision_model_name_or_path = vision_model_name_or_path
99
+ self.discrete_vision_model_name_or_path = discrete_vision_model_name_or_path
100
+ self.audio_model_name_or_path = audio_model_name_or_path
101
+ self.discrete_audio_model_name_or_path = discrete_audio_model_name_or_path
102
+ self.q_former_model_name_or_path = q_former_model_name_or_path
103
+ self.mm_projector_type = mm_projector_type
104
+ self.audio_projector_type = audio_projector_type
105
+ self.video_audio_compressor_type = video_audio_compressor_type
106
+ self.use_nth_layer = use_nth_layer
107
+ self.freeze_encoder = freeze_encoder
108
+ self.freeze_decoder = freeze_decoder
109
+ self.freeze_mm_projector = freeze_mm_projector
110
+ self.freeze_audio_projector = freeze_audio_projector
111
+ self.freeze_video_audio_compressor = freeze_video_audio_compressor
112
+ self.anyres = anyres
113
+ self.unpad = unpad
114
+ self.max_num_grids = max_num_grids
115
+ self.num_queries_vis_abstractor = num_queries_vis_abstractor
116
+ self.video_num_queries_fast = video_num_queries_fast
117
+ self.video_num_queries_slow = video_num_queries_slow
118
+ self.video_first_last_frames_slows = video_first_last_frames_slows
119
+ self.video_max_num_frames = video_max_num_frames
120
+
121
+ self.img_start_id = img_start_id
122
+ self.image_token_id = img_start_id
123
+
124
+ self.discrete_image_start_id = discrete_image_start_id
125
+ self.discrete_image_token_id = discrete_image_start_id
126
+ self.discrete_image_unit_0_id = discrete_image_unit_0_id
127
+
128
+ self.video_start_id = video_start_id
129
+ self.video_token_id = video_start_id
130
+
131
+ self.video_audio_start_id = video_audio_start_id
132
+ self.video_audio_token_id = video_audio_start_id
133
+
134
+ self.audio_start_id = audio_start_id
135
+ self.audio_token_id = audio_start_id
136
+
137
+ self.discrete_audio_start_id = discrete_audio_start_id
138
+ self.discrete_audio_token_id = discrete_audio_start_id
139
+ self.discrete_audio_unit_0_id = discrete_audio_unit_0_id
140
+
141
+ self.ignore_index = ignore_index
142
+ self.proj_pos_emb = proj_pos_emb
143
+ self.proj_prenorm = proj_prenorm
144
+ self.use_1x1_grid = use_1x1_grid
145
+ self.possible_resolutions = possible_resolutions
146
+ super().__init__(**kwargs)
147
+ if self.text_config is not None: # needed for HCXVisionForSequenceClassification
148
+ self.pad_token_id = self.text_config.pad_token_id
149
+
150
+
151
+ AutoConfig.register("vlm", HCXVisionConfig)
152
+ try:
153
+ from .configuration_hyperclovax import HyperCLOVAXConfig
154
+
155
+ AutoConfig.register("hyperclovax", HyperCLOVAXConfig)
156
+ except Exception:  # configuration_hyperclovax may be absent or already registered
157
+ pass
158
+ try:
159
+ from transformers import CONFIG_MAPPING, MODEL_MAPPING
160
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
161
+ Qwen2_5_VisionTransformerPretrainedModel,
162
+ Qwen2_5_VLPatchMerger,
163
+ Qwen2_5_VLVisionConfig,
164
+ )
165
+
166
+ MODEL_MAPPING.register(Qwen2_5_VLVisionConfig, Qwen2_5_VisionTransformerPretrainedModel)
167
+ CONFIG_MAPPING.register("qwen2_5_vl_visual", Qwen2_5_VLVisionConfig)
168
+ except Exception:  # older transformers versions may not expose the Qwen2.5-VL vision classes
169
+ pass
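A rough sketch of how the composite config above can be assembled from a text-backbone config; the values are placeholders, and in the released checkpoint the nested configs come straight from config.json rather than being built by hand:

from configuration_hyperclovax import HyperCLOVAXConfig
from configuration_vlm import HCXVisionConfig

text_cfg = HyperCLOVAXConfig(vocab_size=32000, hidden_size=4096)

vlm_cfg = HCXVisionConfig(
    text_config=text_cfg,
    mm_projector_type="mlp",
    audio_projector_type="mlp",
)
print(vlm_cfg.hidden_size)     # mirrored from text_config.hidden_size (used by DeepSpeed ZeRO-3)
print(vlm_cfg.image_token_id)  # 128062, alias of img_start_id (<|IMAGE_PAD|>)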
cosyvoice.py ADDED
@@ -0,0 +1,516 @@
1
+ # Copyright (c) (Mddct: Dinghao Zhou)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Optional, Tuple
17
+
18
+ import librosa
19
+ import torch
20
+ import torch.nn.functional as F
21
+ from einops import rearrange
22
+ from torch import nn
23
+
24
+ DEFAULT_SAMPLE_RATE = 16000  # NOTE: fixed at 16 kHz for the time being.
25
+ MIN_DISCRETE_AUDIO_CHUNK_SAMPLES = 1600  # 0.1 s; guarantees code_len >= 1 even after CosyVoice's two conv downsampling steps
26
+
27
+
28
+ @dataclass
29
+ class ModelConfig:
30
+ n_mels: int = 128
31
+ n_audio_ctx: int = 1500
32
+ n_audio_state: int = 1280
33
+ n_audio_head: int = 20
34
+ n_audio_layer: int = 6
35
+ n_codebook_size: int = 3**8
36
+
37
+ use_sdpa: bool = True
38
+
39
+
40
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, scaling=None):
41
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
42
+ t = torch.arange(end, device=freqs.device) # type: ignore
43
+ if scaling is not None:
44
+ t = t * scaling
45
+ freqs = torch.outer(t, freqs).float() # type: ignore
46
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
47
+
48
+ return torch.cat((freqs_cis, freqs_cis), dim=-1)
49
+
50
+
51
+ def apply_rotary_emb(
52
+ xq: torch.Tensor,
53
+ xk: torch.Tensor,
54
+ freqs_cis: torch.Tensor,
55
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
56
+ real = torch.view_as_real(freqs_cis)
57
+ cos, sin = real[:, :, 0], real[:, :, 1]
58
+ cos = cos.unsqueeze(0).unsqueeze(2)
59
+ sin = sin.unsqueeze(0).unsqueeze(2)
60
+
61
+ D = xq.shape[-1]
62
+ half_l, half_r = xq[:, :, :, : D // 2], xq[:, :, :, D // 2 :]
63
+ xq_r = torch.cat((-half_r, half_l), dim=-1)
64
+
65
+ D = xk.shape[-1]
66
+
67
+ half_l, half_r = xk[:, :, :, : D // 2], xk[:, :, :, D // 2 :]
68
+ xk_r = torch.cat((-half_r, half_l), dim=-1)
69
+
70
+ return xq * cos + xq_r * sin, xk * cos + xk_r * sin
71
+
72
+
73
+ class LayerNorm(nn.LayerNorm):
74
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
75
+ return super().forward(x.float()).type(x.dtype)
76
+
77
+
78
+ class Linear(nn.Linear):
79
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
80
+ return F.linear(
81
+ x,
82
+ self.weight.to(x.dtype),
83
+ None if self.bias is None else self.bias.to(x.dtype),
84
+ )
85
+
86
+
87
+ class Conv1d(nn.Conv1d):
88
+ def _conv_forward(self, x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor:
89
+ return super()._conv_forward(x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype))
90
+
91
+
92
+ class MultiHeadAttention(nn.Module):
93
+ def __init__(self, n_state: int, n_head: int, use_sdpa: bool = True):
94
+ super().__init__()
95
+ self.n_head = n_head
96
+ self.query = Linear(n_state, n_state)
97
+ self.key = Linear(n_state, n_state, bias=False)
98
+ self.value = Linear(n_state, n_state)
99
+ self.out = Linear(n_state, n_state)
100
+
101
+ self.use_sdpa = use_sdpa
102
+
103
+ def forward(
104
+ self,
105
+ x: torch.Tensor,
106
+ mask: Optional[torch.Tensor] = None,
107
+ ):
108
+ q = self.query(x)
109
+ k = self.key(x)
110
+ v = self.value(x)
111
+
112
+ wv, qk = self.qkv_attention(q, k, v, mask)
113
+ return self.out(wv), qk
114
+
115
+ def qkv_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None):
116
+ _, _, D = q.shape
117
+ scale = (D // self.n_head) ** -0.25
118
+ q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
119
+ k = k.view(*k.shape[:2], self.n_head, -1)
120
+ v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
121
+
122
+ if not self.use_sdpa:
123
+ k = k.permute(0, 2, 3, 1) * scale
124
+ qk = q @ k # (B, n_head, T, T)
125
+ if mask is not None:
126
+ qk = qk + mask
127
+ qk = qk.float()
128
+ w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
129
+ return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
130
+ else:
131
+ k = k.permute(0, 2, 1, 3) * scale
132
+ assert mask is not None
133
+ output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, scale=1.0)
134
+ output = output.transpose(1, 2).contiguous().view(q.size(0), -1, D) # (batch, time1, d_model)
135
+ return output, None
136
+
137
+
138
+ class FSQCodebook(torch.nn.Module):
139
+ def __init__(self, dim: int, level: int = 3):
140
+ super().__init__()
141
+ self.project_down = torch.nn.Linear(dim, 8)
142
+ self.level = level
143
+ self.embed = None
144
+
145
+ @torch.inference_mode()
146
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
147
+ x = rearrange(x, "... d -> (...) d")
148
+ return x
149
+
150
+ @torch.inference_mode()
151
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
152
+ x_shape = x.shape
153
+ # pre-process
154
+ x = self.preprocess(x)
155
+ # quantize
156
+ h = self.project_down(x).float()
157
+ h = h.tanh()
158
+ h = h * 0.9990000128746033
159
+ h = h.round() + 1
160
+ # h = ((self.level - 1) * h).round() # range [-k, k]
161
+ powers = torch.pow(self.level, torch.arange(2**self.level, device=x.device, dtype=h.dtype))
162
+ mu = torch.sum(h * powers.unsqueeze(0), dim=-1)
163
+ ind = mu.reshape(x_shape[0], x_shape[1]).int()
164
+ return ind
165
+
166
+ @torch.inference_mode()
167
+ def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
168
+ raise NotImplementedError("There is no official up project component provided")
169
+
170
+
171
+ class FSQVectorQuantization(torch.nn.Module):
172
+ """Vector quantization implementation (inference-only).
173
+ Args:
174
+ dim (int): Dimension
175
+ codebook_size (int): Codebook size
176
+ """
177
+
178
+ def __init__(
179
+ self,
180
+ dim: int,
181
+ codebook_size: int,
182
+ ):
183
+ super().__init__()
184
+ assert 3**8 == codebook_size
185
+ self._codebook = FSQCodebook(dim=dim, level=3)
186
+ self.codebook_size = codebook_size
187
+
188
+ @property
189
+ def codebook(self):
190
+ return self._codebook.embed
191
+
192
+ @torch.inference_mode()
193
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
194
+ return self._codebook.encode(x)
195
+
196
+ @torch.inference_mode()
197
+ def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
198
+ quantize = self._codebook.decode(embed_ind)
199
+ quantize = rearrange(quantize, "b n d -> b d n")
200
+ return quantize
201
+
202
+
203
+ class FSMNMultiHeadAttention(MultiHeadAttention):
204
+ def __init__(
205
+ self,
206
+ n_state: int,
207
+ n_head: int,
208
+ kernel_size: int = 31,
209
+ use_sdpa: bool = True,
210
+ ):
211
+ super().__init__(n_state, n_head)
212
+
213
+ self.fsmn_block = torch.nn.Conv1d(
214
+ n_state, n_state, kernel_size, stride=1, padding=0, groups=n_state, bias=False
215
+ )
216
+ self.left_padding = (kernel_size - 1) // 2
217
+ self.right_padding = kernel_size - 1 - self.left_padding
218
+ self.pad_fn = torch.nn.ConstantPad1d((self.left_padding, self.right_padding), 0.0)
219
+
220
+ self.use_sdpa = use_sdpa
221
+
222
+ def forward_fsmn(self, inputs: torch.Tensor, mask: Optional[torch.Tensor] = None):
223
+ b, t, _, _ = inputs.size()
224
+ inputs = inputs.view(b, t, -1)
225
+ if mask is not None and mask.size(2) > 0: # time2 > 0
226
+ inputs = inputs * mask
227
+ x = inputs.transpose(1, 2)
228
+ x = self.pad_fn(x)
229
+ x = self.fsmn_block(x)
230
+ x = x.transpose(1, 2)
231
+ x += inputs
232
+ return x * mask
233
+
234
+ def qkv_attention(
235
+ self,
236
+ q: torch.Tensor,
237
+ k: torch.Tensor,
238
+ v: torch.Tensor,
239
+ mask: Optional[torch.Tensor] = None,
240
+ mask_pad: Optional[torch.Tensor] = None,
241
+ freqs_cis: Optional[torch.Tensor] = None,
242
+ ):
243
+ _, _, D = q.shape
244
+ scale = (D // self.n_head) ** -0.25
245
+ q = q.view(*q.shape[:2], self.n_head, -1)
246
+ k = k.view(*k.shape[:2], self.n_head, -1)
247
+ v = v.view(*v.shape[:2], self.n_head, -1)
248
+
249
+ if freqs_cis is not None:
250
+ q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis)
251
+
252
+ fsm_memory = self.forward_fsmn(v, mask_pad)
253
+
254
+ q = q.permute(0, 2, 1, 3) * scale
255
+ v = v.permute(0, 2, 1, 3)
256
+
257
+ if not self.use_sdpa:
258
+ k = k.permute(0, 2, 3, 1) * scale
259
+ qk = q @ k # (B, n_head, T, T)
260
+ if mask is not None:
261
+ qk = qk + mask
262
+ qk = qk.float()
263
+ w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
264
+ return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach(), fsm_memory
265
+ else:
266
+ k = k.permute(0, 2, 1, 3) * scale
267
+ assert mask is not None
268
+ output = torch.nn.functional.scaled_dot_product_attention(
269
+ q,
270
+ k,
271
+ v,
272
+ attn_mask=mask,
273
+ dropout_p=0.0,
274
+ scale=1.0,
275
+ )
276
+ output = output.transpose(1, 2).contiguous().view(q.size(0), -1, D) # (batch, time1, d_model)
277
+ return output, None, fsm_memory
278
+
279
+ def forward(
280
+ self,
281
+ x: torch.Tensor,
282
+ mask: Optional[torch.Tensor] = None,
283
+ mask_pad: Optional[torch.Tensor] = None,
284
+ freqs_cis: Optional[torch.Tensor] = None,
285
+ ):
286
+ q = self.query(x)
287
+ k = self.key(x)
288
+ v = self.value(x)
289
+
290
+ wv, qk, fsm_memory = self.qkv_attention(q, k, v, mask, mask_pad, freqs_cis)
291
+ return self.out(wv) + fsm_memory, qk
292
+
293
+
294
+ class ResidualAttentionBlock(torch.nn.Module):
295
+ def __init__(
296
+ self,
297
+ n_state: int,
298
+ n_head: int,
299
+ kernel_size: int = 31,
300
+ use_sdpa: bool = False,
301
+ ):
302
+ super().__init__()
303
+
304
+ self.attn = FSMNMultiHeadAttention(n_state, n_head, kernel_size, use_sdpa=use_sdpa)
305
+ self.attn_ln = LayerNorm(n_state, eps=1e-6)
306
+
307
+ n_mlp = n_state * 4
308
+
309
+ self.mlp = torch.nn.Sequential(Linear(n_state, n_mlp), torch.nn.GELU(), Linear(n_mlp, n_state))
310
+ self.mlp_ln = LayerNorm(n_state)
311
+
312
+ def forward(
313
+ self,
314
+ x: torch.Tensor,
315
+ mask: Optional[torch.Tensor] = None,
316
+ mask_pad: Optional[torch.Tensor] = None,
317
+ freqs_cis: Optional[torch.Tensor] = None,
318
+ ):
319
+ x = x + self.attn(self.attn_ln(x), mask=mask, mask_pad=mask_pad, freqs_cis=freqs_cis)[0]
320
+
321
+ x = x + self.mlp(self.mlp_ln(x))
322
+ return x
323
+
324
+
325
+ class AudioEncoderV2(torch.nn.Module):
326
+ def __init__(
327
+ self,
328
+ n_mels: int,
329
+ n_state: int,
330
+ n_head: int,
331
+ n_layer: int,
332
+ stride: int,
333
+ use_sdpa: bool,
334
+ ):
335
+ super().__init__()
336
+ self.stride = stride
337
+
338
+ self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, stride=stride, padding=1)
339
+ self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
340
+ self.freqs_cis = precompute_freqs_cis(64, 1024 * 2)
341
+ self.blocks = torch.nn.ModuleList(
342
+ [ResidualAttentionBlock(n_state, n_head, use_sdpa=use_sdpa) for _ in range(n_layer)]
343
+ )
344
+
345
+ def forward(self, x: torch.Tensor, x_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
346
+ """
347
+ x : torch.Tensor, shape = (batch_size, n_mels, T)
348
+ the mel spectrogram of the audio
349
+ x_len: torch.Tensor, shape = (batch_size,)
350
+ length of each audio in x
351
+ """
352
+ mask = self.make_non_pad_mask(x_len).unsqueeze(1)
353
+ x = torch.nn.functional.gelu(self.conv1(x * mask))
354
+ x_len = (x_len + 2 - 1 * (3 - 1) - 1) // self.stride + 1
355
+ mask = self.make_non_pad_mask(x_len).unsqueeze(1)
356
+ x = torch.nn.functional.gelu(self.conv2(x * mask))
357
+ x_len = (x_len + 2 - 1 * (3 - 1) - 1) // 2 + 1
358
+ mask = self.make_non_pad_mask(x_len).unsqueeze(1)
359
+ x = x.permute(0, 2, 1) # (B, T // 2, n_state)
360
+ freqs_cis = self.freqs_cis.to(x.device)
361
+ mask_pad = mask.transpose(1, 2)
362
+ mask = self.mask_to_bias(mask, x.dtype)
363
+
364
+ tmp = torch.view_as_real(freqs_cis)
365
+ cos, sin = tmp[:, :, 0], tmp[:, :, 1]
366
+
367
+ cos = torch.cat((cos, cos), dim=-1)
368
+ sin = torch.cat((sin, sin), dim=-1)
369
+ cos = cos.unsqueeze(0).unsqueeze(2)
370
+ sin = sin.unsqueeze(0).unsqueeze(2)
371
+
372
+ for block in self.blocks:
373
+ x = block(x, mask.unsqueeze(1), mask_pad, freqs_cis[: x.size(1)])
374
+
375
+ return x, x_len
376
+
377
+ @staticmethod
378
+ def make_non_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
379
+ """Make mask tensor containing indices of non-padded part.
380
+ The sequences in a batch may have different lengths. To enable
381
+ batch computing, padding is needed to make all sequences the same
382
+ size. To prevent the padded part from passing values to context-dependent
383
+ blocks such as attention or convolution, the padded part is
384
+ masked.
385
+ 1 for non-padded part and 0 for padded part.
386
+ Parameters
387
+ ----------
388
+ lengths (torch.Tensor): Batch of lengths (B,).
389
+ Returns:
390
+ -------
391
+ torch.Tensor: Mask tensor with 1 at non-padded positions and 0 at padded positions (B, max_T).
392
+ Examples:
393
+ >>> import torch
394
+ >>> import s3tokenizer
395
+ >>> lengths = torch.tensor([5, 3, 2])
396
+ >>> masks = s3tokenizer.make_non_pad_mask(lengths)
397
+ masks = [[1, 1, 1, 1, 1],
398
+ [1, 1, 1, 0, 0],
399
+ [1, 1, 0, 0, 0]]
400
+ """
401
+ batch_size = lengths.size(0)
402
+ max_len = max_len if max_len > 0 else lengths.max().item()
403
+ seq_range = torch.arange(0, max_len, dtype=torch.int64, device=lengths.device)
404
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
405
+ seq_length_expand = lengths.unsqueeze(-1)
406
+ mask = seq_range_expand >= seq_length_expand
407
+ return ~mask
408
+
409
+ @staticmethod
410
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
411
+ """Convert bool-tensor to float-tensor for flash attention.
412
+ Parameters
413
+ ----------
414
+ mask (torch.Tensor): Bool mask tensor (B, ?). dtype (torch.dtype): Target floating-point dtype.
415
+ Returns:
416
+ -------
417
+ torch.Tensor: Additive attention-bias tensor (B, ?): 0 at valid positions, -1e10 at padded positions.
418
+ Examples:
419
+ >>> import torch
420
+ >>> import s3tokenizer
421
+ >>> lengths = torch.tensor([5, 3, 2])
422
+ >>> masks = self.make_non_pad_mask(lengths)
423
+ masks = [[1, 1, 1, 1, 1],
424
+ [1, 1, 1, 0, 0],
425
+ [1, 1, 0, 0, 0]]
426
+ >>> new_masks = self.mask_to_bias(masks, torch.float32)
427
+ new_masks =
428
+ [[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00],
429
+ [-0.0000e+00, -0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10],
430
+ [-0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10]]
431
+ """
432
+ assert mask.dtype == torch.bool
433
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
434
+ mask = mask.to(dtype)
435
+
436
+ # attention mask bias
437
+ # NOTE(Mddct): torch.finfo jit issues
438
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
439
+ mask = (1.0 - mask) * -1.0e10
440
+ return mask
441
+
442
+
443
+ class CosyvoiceEncoder(nn.Module):
444
+ """S3 tokenizer of the CosyVoice2 implementation (inference-only).
445
+ Args:
446
+ config (ModelConfig): Config
447
+ """
448
+
449
+ def __init__(self, config: ModelConfig = ModelConfig()):
450
+ super().__init__()
451
+ self.config = config
452
+ self.encoder = AudioEncoderV2(
453
+ self.config.n_mels,
454
+ self.config.n_audio_state,
455
+ self.config.n_audio_head,
456
+ self.config.n_audio_layer,
457
+ 2,
458
+ self.config.use_sdpa,
459
+ )
460
+ self.quantizer = FSQVectorQuantization(
461
+ self.config.n_audio_state,
462
+ self.config.n_codebook_size,
463
+ )
464
+
465
+ def forward(self, wav: torch.Tensor) -> torch.Tensor:
466
+ mel = self.mel_spectrogram(wav, n_mels=self.config.n_mels)
467
+ mel_len = torch.tensor([mel.shape[-1]]).to(self.device)
468
+ return self.quantize(mel, mel_len)
469
+
470
+ @torch.inference_mode()
471
+ def quantize(self, mel: torch.Tensor, mel_len: torch.Tensor) -> torch.Tensor:
472
+ hidden, code_len = self.encoder(mel, mel_len)
473
+ code = self.quantizer.encode(hidden)
474
+ return code
475
+
476
+ @staticmethod
477
+ def mel_spectrogram(
478
+ wav: torch.Tensor,
479
+ n_mels: int = 80,
480
+ padding: int = 0,
481
+ ) -> torch.Tensor:
482
+ """
483
+ This method is based on the whisper.log_mel_spectrogram().
484
+ Do not use it as a general-purpose mel spectrogram function.
485
+ """
486
+ device = wav.device
487
+ if padding > 0:
488
+ wav = torch.nn.functional.pad(wav, (0, padding))
489
+
490
+ window = torch.hann_window(400).to(device)
491
+ stft = torch.stft(wav, 400, 160, window=window, return_complex=True)
492
+ mag = stft[..., :-1].abs() ** 2
493
+
494
+ filters = torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=n_mels)).to(device)
495
+ mel_spec = filters @ mag
496
+
497
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
498
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
499
+ log_spec = (log_spec + 4.0) / 4.0
500
+ return log_spec
501
+
502
+ @property
503
+ def device(self):
504
+ return next(self.parameters()).device
505
+
506
+ def freeze(self):
507
+ for p in self.parameters():
508
+ p.requires_grad = False
509
+
510
+ @classmethod
511
+ def from_pretrained(cls, model_path: str):
512
+ model = cls()
513
+ model.load_state_dict(torch.load(model_path, map_location="cpu"), strict=True)
514
+ model.eval()
515
+ model.freeze()
516
+ return model
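A hedged end-to-end sketch of the tokenizer above. The randomly initialized encoder is used purely to illustrate shapes and the code range; in practice the weights are loaded by the surrounding VLM code (or via CosyvoiceEncoder.from_pretrained with a state-dict path), and audio is expected as 16 kHz mono per DEFAULT_SAMPLE_RATE:

import torch
from cosyvoice import CosyvoiceEncoder, DEFAULT_SAMPLE_RATE  # assumes this file is on the Python path

tokenizer = CosyvoiceEncoder()  # random init here; real weights come from the checkpoint
tokenizer.eval()

wav = torch.randn(1, DEFAULT_SAMPLE_RATE)  # 1 second of 16 kHz mono audio, shape (batch, samples)
with torch.inference_mode():
    codes = tokenizer(wav)                 # mel (100 fps) -> two stride-2 convs -> FSQ codes at ~25 Hz

print(codes.shape)               # roughly (1, 25): about 25 discrete codes per second of audio
print(int(codes.max()) < 3**8)   # True: codes index the 3**8 = 6561-entry FSQ codebook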
decoder/audio/NCCosybigvganDecoder.mar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b71bace1a8ed9f1eac40d98e99ebe9978a25b2de68c25d89674743d550d9abec
3
+ size 517187360
decoder/audio/NCZSCosybigvganDecoder.mar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d3cf46e024952093e19dded52befc61c751e3a759138cc055f8b008d1da34a0
3
+ size 539807544
decoder/vision/model_index.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "_class_name": "VisionTokenToImagePipeline",
3
+ "_diffusers_version": "0.32.2",
4
+ "_custom_pipeline": "pipeline",
5
+ "transformer": [
6
+ "pipeline",
7
+ "VisionTransformer"
8
+ ],
9
+ "vae": [
10
+ "diffusers",
11
+ "AutoencoderKL"
12
+ ],
13
+ "scheduler": [
14
+ "diffusers",
15
+ "FlowMatchEulerDiscreteScheduler"
16
+ ],
17
+ "token_embedder": [
18
+ "pipeline",
19
+ "VisionTokenEmbedder"
20
+ ],
21
+ "transformer2": [
22
+ "pipeline",
23
+ "VisionTransformer"
24
+ ]
25
+ }
decoder/vision/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.35.2",
4
+ "base_image_seq_len": 256,
5
+ "base_shift": 0.5,
6
+ "invert_sigmas": false,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 1.15,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 1.0,
11
+ "shift_terminal": null,
12
+ "stochastic_sampling": false,
13
+ "time_shift_type": "exponential",
14
+ "use_beta_sigmas": false,
15
+ "use_dynamic_shifting": false,
16
+ "use_exponential_sigmas": false,
17
+ "use_karras_sigmas": false
18
+ }
decoder/vision/token_embedder/config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_class_name": "VisionTokenEmbedder",
3
+ "_diffusers_version": "0.35.2",
4
+ "embedding_dim": 1536,
5
+ "token_length": 729,
6
+ "vocab_size": 65536
7
+ }
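For orientation: this config describes a discrete vision vocabulary of 65,536 entries embedded into 1,536-dim vectors over 729 token positions (likely a 27x27 grid from the discrete vision tokenizer). A rough shape sketch with a plain nn.Embedding stand-in, not the actual VisionTokenEmbedder implementation:

import torch
import torch.nn as nn

embedder = nn.Embedding(num_embeddings=65536, embedding_dim=1536)  # vocab_size, embedding_dim
vision_tokens = torch.randint(0, 65536, (1, 729))                  # token_length = 729 = 27 * 27
context = embedder(vision_tokens)                                  # (1, 729, 1536) conditioning sequence
print(context.shape)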
decoder/vision/token_embedder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d84815b2f681f45dbc5dbbad513b04f7fe3fa4444ca6410a9354555bb3410c7f
3
+ size 201329872
decoder/vision/transformer/config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "VisionTransformer",
3
+ "_diffusers_version": "0.35.2",
4
+ "axes_dim": [
5
+ 8,
6
+ 36,
7
+ 36
8
+ ],
9
+ "context_in_dim": 1536,
10
+ "depth": 0,
11
+ "depth_single_blocks": 35,
12
+ "guidance_embed": false,
13
+ "hidden_size": 1920,
14
+ "in_channels": 16,
15
+ "mlp_ratio": 4.0,
16
+ "num_heads": 24,
17
+ "qkv_bias": true,
18
+ "theta": 10000,
19
+ "use_patchify": false,
20
+ "vec_in_dim": 1536
21
+ }
decoder/vision/transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07c014f8575090455e9d3c024b4f58a8ad480b957f2c45fc6eec4fc08edbe94
3
+ size 3914661840
decoder/vision/transformer2/config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "_class_name": "VisionTransformer",
3
+ "_diffusers_version": "0.35.2",
4
+ "axes_dim": [
5
+ 6,
6
+ 18,
7
+ 18
8
+ ],
9
+ "context_in_dim": 1536,
10
+ "depth": 0,
11
+ "depth_single_blocks": 25,
12
+ "guidance_embed": false,
13
+ "hidden_size": 1008,
14
+ "in_channels": 16,
15
+ "mlp_ratio": 4.0,
16
+ "num_heads": 24,
17
+ "qkv_bias": true,
18
+ "theta": 10000,
19
+ "use_patchify": false,
20
+ "vec_in_dim": 1536
21
+ }
decoder/vision/transformer2/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca76187d0f52336f6d385c4d56f43f0e631a4030343b27647b30baf188bcbc96
3
+ size 777545632
decoder/vision/vae/config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.35.2",
4
+ "_name_or_path": "black-forest-labs/FLUX.1-schnell",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 16,
21
+ "latents_mean": null,
22
+ "latents_std": null,
23
+ "layers_per_block": 2,
24
+ "mid_block_add_attention": true,
25
+ "norm_num_groups": 32,
26
+ "out_channels": 3,
27
+ "sample_size": 1024,
28
+ "scaling_factor": 0.3611,
29
+ "shift_factor": 0.1159,
30
+ "up_block_types": [
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D",
34
+ "UpDecoderBlock2D"
35
+ ],
36
+ "use_post_quant_conv": false,
37
+ "use_quant_conv": false
38
+ }
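The scaling_factor / shift_factor above are FLUX-style latent normalization constants. A minimal sketch of how diffusers pipelines typically apply them around this AutoencoderKL (standard diffusers usage, not code shipped in this repository; the path assumes the directory layout shown here):

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("decoder/vision", subfolder="vae")

image = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed image in [-1, 1]
with torch.no_grad():
    latents = vae.encode(image).latent_dist.sample()
    latents = (latents - vae.config.shift_factor) * vae.config.scaling_factor  # normalize for the DiT
    # ... the diffusion transformer denoises in this normalized 16-channel latent space ...
    latents = latents / vae.config.scaling_factor + vae.config.shift_factor    # undo before decoding
    pixels = vae.decode(latents).sample                                        # (1, 3, 512, 512)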
decoder/vision/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b59a26851551b67ae1fe58d32e76486e1e812def4696a4bea97f16604d40a3
3
+ size 167666902
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": 0,
5
+ "transformers_version": "4.52.4"
6
+ }
mambamia_videoaudio_compressor.py ADDED
@@ -0,0 +1,803 @@
1
+ # -*- coding: utf-8 -*-
2
+ # This module is integrated into 'HyperCLOVAX-SEED-Omni-8B' to mitigate
3
+ # audio stream token explosion during hour-long video understanding.
4
+ # It utilizes the MambaMia architecture (AAAI-26 Oral) to
5
+ # effectively compress high-frequency audio tokens into a manageable
6
+ # context for the LLM.
7
+ # Research Context:
8
+ # - MambaMia: https://github.com/naver-ai/mambamia
9
+ # - LLaVA-AV-SSM: https://github.com/naver-ai/LLaVA-AV-SSM
10
+ # Acknowledgements:
11
+ # This implementation is heavily modified and extended from the following
12
+ # foundational repositories:
13
+ # - Transformers: https://github.com/huggingface/transformers (Apache License v2.0)
14
+ # - Mamba: https://github.com/state-spaces/mamba (Apache License v2.0)
15
+
16
+ import math
17
+ from dataclasses import dataclass
18
+ from typing import Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from torch import nn
23
+ from transformers.activations import ACT2FN
24
+ from transformers.modeling_utils import PreTrainedModel
25
+ from transformers.configuration_utils import PretrainedConfig
26
+ from transformers.utils import ModelOutput, logging
27
+ from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ # ============================================================================
34
+ # Check for fast path availability
35
+ # ============================================================================
36
+ if is_mamba_2_ssm_available():
37
+ from mamba_ssm.ops.triton.selective_state_update import selective_state_update
38
+ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
39
+ else:
40
+ selective_state_update = None
41
+ mamba_split_conv1d_scan_combined = None
42
+
43
+ if is_causal_conv1d_available():
44
+ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
45
+ else:
46
+ causal_conv1d_update, causal_conv1d_fn = None, None
47
+
48
+ is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
49
+
50
+
51
+ # ============================================================================
52
+ # MambaMia2Config (Simplified for v04 only)
53
+ # ============================================================================
54
+ class MambaMia2Config(PretrainedConfig):
55
+ """
56
+ Simplified MambaMia2 configuration for v04 version only.
57
+ """
58
+ model_type = "mamba2"
59
+
60
+ def __init__(
61
+ self,
62
+ num_heads=128,
63
+ head_dim=64,
64
+ vocab_size=32768,
65
+ hidden_size=4096,
66
+ state_size=128,
67
+ num_hidden_layers=64,
68
+ layer_norm_epsilon=1e-5,
69
+ pad_token_id=1,
70
+ bos_token_id=0,
71
+ eos_token_id=2,
72
+ expand=2,
73
+ conv_kernel=4,
74
+ n_groups=8,
75
+ use_bias=False,
76
+ use_conv_bias=True,
77
+ hidden_act="silu",
78
+ initializer_range=0.1,
79
+ residual_in_fp32=False,
80
+ time_step_rank="auto",
81
+ time_step_min=0.001,
82
+ time_step_max=0.1,
83
+ time_step_floor=1e-4,
84
+ time_step_limit=(0.0, float("inf")),
85
+ rescale_prenorm_residual=False,
86
+ use_cache=True,
87
+ norm_before_gate=True,
88
+ rms_norm=True,
89
+ chunk_size=256,
90
+ tie_word_embeddings=False,
91
+ mambamia_chunk_size=10,
92
+ **kwargs,
93
+ ):
94
+ self.vocab_size = vocab_size
95
+ self.hidden_size = hidden_size
96
+ self.state_size = state_size
97
+ self.num_hidden_layers = num_hidden_layers
98
+ self.layer_norm_epsilon = layer_norm_epsilon
99
+ self.conv_kernel = conv_kernel
100
+ self.expand = expand
101
+
102
+ self.bos_token_id = bos_token_id
103
+ self.eos_token_id = eos_token_id
104
+ self.pad_token_id = pad_token_id
105
+ self.use_bias = use_bias
106
+ self.use_conv_bias = use_conv_bias
107
+ self.hidden_act = hidden_act
108
+ self.initializer_range = initializer_range
109
+ self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
110
+ self.time_step_min = time_step_min
111
+ self.time_step_max = time_step_max
112
+ self.time_step_floor = time_step_floor
113
+ self.rescale_prenorm_residual = rescale_prenorm_residual
114
+ self.residual_in_fp32 = residual_in_fp32
115
+ self.use_cache = use_cache
116
+ self.n_groups = n_groups
117
+ self.num_heads = num_heads
118
+ self.head_dim = head_dim
119
+ self.norm_before_gate = norm_before_gate
120
+ self.rms_norm = rms_norm
121
+ self.state_size = state_size
122
+ self.chunk_size = chunk_size
123
+ self.time_step_limit = time_step_limit
124
+ self.tie_word_embeddings = tie_word_embeddings
125
+ self.mambamia_chunk_size = mambamia_chunk_size
126
+ self.output_hidden_states = False
127
+ self.output_deltas = False
128
+
129
+ super().__init__(
130
+ bos_token_id=bos_token_id,
131
+ eos_token_id=eos_token_id,
132
+ pad_token_id=pad_token_id,
133
+ tie_word_embeddings=tie_word_embeddings,
134
+ **kwargs,
135
+ )
136
+
137
+
138
+ # ============================================================================
139
+ # Helper Modules
140
+ # ============================================================================
141
+ class MambaRMSNormGated(nn.Module):
142
+ def __init__(self, hidden_size, eps=1e-6):
143
+ super().__init__()
144
+ self.weight = nn.Parameter(torch.ones(hidden_size))
145
+ self.variance_epsilon = eps
146
+
147
+ def forward(self, hidden_states, gate=None):
148
+ input_dtype = hidden_states.dtype
149
+ hidden_states = hidden_states.to(torch.float32)
150
+ if gate is not None:
151
+ hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32))
152
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
153
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
154
+ return self.weight * hidden_states.to(input_dtype)
155
+
156
+
157
+ class MambaMia2RMSNorm(nn.Module):
158
+ def __init__(self, hidden_size, eps=1e-6):
159
+ super().__init__()
160
+ self.weight = nn.Parameter(torch.ones(hidden_size))
161
+ self.variance_epsilon = eps
162
+
163
+ def forward(self, hidden_states):
164
+ input_dtype = hidden_states.dtype
165
+ hidden_states = hidden_states.to(torch.float32)
166
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
167
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
168
+ return self.weight * hidden_states.to(input_dtype)
169
+
170
+
171
+ # ============================================================================
172
+ # MambaMia2Mixer (v04 version - unidirectional with GPA)
173
+ # ============================================================================
174
+ class MambaMia2Mixer(nn.Module):
175
+ """
176
+ Unidirectional Mamba2 Mixer for v04 version.
177
+ v04 = v0 (unidirectional Mamba) + GPA (Gated Pooling Attention in Block)
178
+ """
179
+
180
+ def __init__(self, config: MambaMia2Config, layer_idx: int):
181
+ super().__init__()
182
+ self.num_heads = config.num_heads
183
+ self.hidden_size = config.hidden_size
184
+ self.ssm_state_size = config.state_size
185
+ self.conv_kernel_size = config.conv_kernel
186
+ self.intermediate_size = int(config.expand * self.hidden_size)
187
+ self.time_step_rank = int(config.time_step_rank)
188
+ self.layer_idx = layer_idx
189
+ self.use_conv_bias = config.use_conv_bias
190
+ self.activation = config.hidden_act
191
+ self.act = ACT2FN[config.hidden_act]
192
+
193
+ self.norm_before_gate = config.norm_before_gate
194
+ self.layer_norm_epsilon = config.layer_norm_epsilon
195
+ self.rms_norm = config.rms_norm
196
+
197
+ self.n_groups = config.n_groups
198
+ self.head_dim = config.head_dim
199
+ self.chunk_size = config.chunk_size
200
+
201
+ self.time_step_limit = config.time_step_limit
202
+ self.time_step_min = config.time_step_min
203
+ self.time_step_max = config.time_step_max
204
+
205
+ self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
206
+
207
+ # Conv1d for SSM
208
+ self.conv1d = nn.Conv1d(
209
+ in_channels=self.conv_dim,
210
+ out_channels=self.conv_dim,
211
+ bias=config.use_conv_bias,
212
+ kernel_size=config.conv_kernel,
213
+ groups=self.conv_dim,
214
+ padding=config.conv_kernel - 1,
215
+ )
216
+
217
+ # projection of the input hidden states
218
+ projection_size = self.intermediate_size + self.conv_dim + self.num_heads
219
+ self.in_proj = nn.Linear(self.hidden_size, projection_size, bias=config.use_bias)
220
+
221
+ # time step projection
222
+ self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
223
+
224
+ # S4D real initialization
225
+ A = torch.arange(1, self.num_heads + 1)
226
+ self.A_log = nn.Parameter(torch.log(A))
227
+ self.A_log._no_weight_decay = True
228
+ self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon)
229
+ self.D = nn.Parameter(torch.ones(self.num_heads))
230
+ self.D._no_weight_decay = True
231
+
232
+ self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
233
+ self.use_bias = config.use_bias
234
+
235
+ if not is_fast_path_available:
236
+ logger.warning_once(
237
+ "The fast path is not available because one of "
238
+ "`(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. "
239
+ "Falling back to the naive implementation. To install follow "
240
+ "https://github.com/state-spaces/mamba/#installation and "
241
+ "https://github.com/Dao-AILab/causal-conv1d"
242
+ )
243
+
244
+ def forward(
245
+ self,
246
+ hidden_states: torch.Tensor,
247
+ attention_mask: Optional[torch.Tensor] = None,
248
+ ):
249
+ """
250
+ v04 unidirectional forward pass using CUDA kernels.
251
+ """
252
+ import os
253
+ rank = int(os.environ.get("RANK", -1))
254
+ debug = False # (rank <= 0)
255
+
256
+ assert is_fast_path_available and "cuda" in self.in_proj.weight.device.type, \
257
+ "CUDA kernels required for MambaMia2Mixer"
258
+
259
+ dtype = hidden_states.dtype
260
+ batch_size, seq_len, _ = hidden_states.shape
261
+
262
+ if debug:
263
+ print(f"[Mixer DEBUG] input: min={hidden_states.min().item():.6f}, max={hidden_states.max().item():.6f}, nan={torch.isnan(hidden_states).any().item()}, seq_len={seq_len}, chunk_size={self.chunk_size}")
264
+
265
+ if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
266
+ hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
267
+
268
+ # Gated MLP's linear projection
269
+ projected_states = self.in_proj(hidden_states)
270
+
271
+ if debug:
272
+ print(f"[Mixer DEBUG] after in_proj: min={projected_states.min().item():.6f}, max={projected_states.max().item():.6f}, nan={torch.isnan(projected_states).any().item()}")
273
+ print(f"[Mixer DEBUG] A_log: {self.A_log[:5].tolist()}, dt_bias: {self.dt_bias[:5].tolist()}, D: {self.D[:5].tolist()}")
274
+
275
+ dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
276
+
277
+ # Unidirectional forward pass (same as v0)
278
+ outputs = mamba_split_conv1d_scan_combined(
279
+ projected_states,
280
+ self.conv1d.weight.squeeze(1),
281
+ self.conv1d.bias,
282
+ self.dt_bias,
283
+ -torch.exp(self.A_log.float()),
284
+ D=self.D,
285
+ chunk_size=self.chunk_size,
286
+ seq_idx=None,
287
+ activation=self.activation,
288
+ rmsnorm_weight=self.norm.weight,
289
+ rmsnorm_eps=self.norm.variance_epsilon,
290
+ outproj_weight=self.out_proj.weight,
291
+ outproj_bias=self.out_proj.bias,
292
+ headdim=self.head_dim,
293
+ ngroups=self.n_groups,
294
+ norm_before_gate=self.norm_before_gate,
295
+ return_final_states=False,
296
+ **dt_limit_kwargs,
297
+ )
298
+
299
+ if debug:
300
+ print(f"[Mixer DEBUG] after mamba_kernel: min={outputs.min().item():.6f}, max={outputs.max().item():.6f}, nan={torch.isnan(outputs).any().item()}")
301
+
302
+ return outputs.to(dtype)
303
+
304
+
305
+ # ============================================================================
306
+ # MambaMia2Block (v04 version only)
307
+ # ============================================================================
308
+ class MambaMia2Block(nn.Module):
309
+ """
310
+ Single MambaMia2 block with v04 gated pooling attention mechanism.
311
+ """
312
+
313
+ def __init__(self, config: MambaMia2Config, layer_idx: int):
314
+ super().__init__()
315
+ self.config = config
316
+ self.layer_idx = layer_idx
317
+ self.residual_in_fp32 = config.residual_in_fp32
318
+ self.mambamia_chunk_size = config.mambamia_chunk_size
319
+
320
+ self.norm = MambaMia2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
321
+ self.mixer = MambaMia2Mixer(config, layer_idx=layer_idx)
322
+
323
+ # v04 specific: Gated Pooling Attention (GPA)
324
+ self.drop = nn.Dropout(p=0.1)
325
+
326
+ # Per-frame weight prediction
327
+ self.weight_fc = nn.Linear(config.hidden_size, self.mambamia_chunk_size)
328
+ nn.init.zeros_(self.weight_fc.bias)
329
+ with torch.no_grad():
330
+ self.weight_fc.weight.mul_(1e-3)
331
+
332
+ # Query vs aggregator gating
333
+ self.gate_fc = nn.Linear(config.hidden_size, 1)
334
+ nn.init.zeros_(self.gate_fc.bias)
335
+ with torch.no_grad():
336
+ self.gate_fc.weight.mul_(1e-3)
337
+
338
+ def forward(
339
+ self,
340
+ hidden_states: torch.Tensor,
341
+ attention_mask: Optional[torch.Tensor] = None,
342
+ ):
343
+ input_dtype = hidden_states.dtype
344
+ residual = hidden_states
345
+ hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
346
+ if self.residual_in_fp32:
347
+ residual = residual.to(torch.float32)
348
+
349
+ # v04 Gated Pooling Attention
350
+ assert hidden_states.dim() == 3, f"hidden_states.dim()={hidden_states.dim()} != 3"
351
+ bsz, seq_len, hidden_dim = hidden_states.shape
352
+ mambamia_chunk_size = self.mambamia_chunk_size
353
+ chunk_with_query = mambamia_chunk_size + 1
354
+
355
+ if seq_len % chunk_with_query != 0:
356
+ raise ValueError(
357
+ f"seq_len={seq_len} must be divisible by (mambamia_chunk_size+1)={chunk_with_query}"
358
+ )
359
+ n_chunk = seq_len // chunk_with_query
360
+
361
+ # Reshape to (bsz, n_chunk, chunk_size+1, hidden_dim)
362
+ hidden_4d = hidden_states.view(bsz, n_chunk, chunk_with_query, hidden_dim)
363
+
364
+ frames = hidden_4d[:, :, :mambamia_chunk_size, :] # (bsz, n_chunk, chunk_size, hidden_dim)
365
+ queries = hidden_4d[:, :, mambamia_chunk_size, :] # (bsz, n_chunk, hidden_dim)
366
+
367
+ # Weight prediction for frames (computed in float32 for numerical stability)
368
+ w_in = self.drop(queries)
369
+ raw_weights = self.weight_fc(w_in)
370
+ alpha = torch.softmax(raw_weights.float(), dim=-1).to(input_dtype) # (bsz, n_chunk, chunk_size)
371
+
372
+ # Weighted average: aggregator
373
+ aggregator = (frames * alpha.unsqueeze(-1)).sum(dim=2) # (bsz, n_chunk, hidden_dim)
374
+
375
+ # Gating between queries and aggregator (computed in float32)
376
+ gating_in = self.drop(queries)
377
+ gating = torch.sigmoid(self.gate_fc(gating_in).float()).to(input_dtype) # (bsz, n_chunk, 1)
378
+ epsilon = 0.01
379
+ gating = gating * (1 - 2 * epsilon) + epsilon # [0.01, 0.99]
380
+
381
+ gating_broad = gating.expand(-1, -1, hidden_dim)
382
+ aggregator = aggregator * gating_broad
383
+ queries = queries * (1 - gating_broad)
384
+ queries_new = queries + aggregator
385
+
386
+ # Update query positions
387
+ hidden_4d = hidden_4d.clone()
388
+ hidden_4d[:, :, mambamia_chunk_size, :] = queries_new
389
+ hidden_states = hidden_4d.view(bsz, seq_len, hidden_dim)
390
+
391
+ # Mixer forward
392
+ hidden_states = self.mixer(hidden_states, attention_mask=attention_mask)
393
+
394
+ # Residual connection
395
+ hidden_states = hidden_states + residual
396
+
397
+ return hidden_states
398
+
399
+
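Editorial note (not part of the commit): the gated pooling above splits the sequence into chunks of `mambamia_chunk_size` frames followed by one query position, pools the frames into the query with softmax weights, and gates the result against the original query. A minimal sketch of just that arithmetic, with toy sizes and random stand-ins for the `weight_fc`/`gate_fc` predictions:

import torch

bsz, n_chunk, chunk, dim = 2, 3, 25, 8                              # hypothetical sizes
hidden = torch.randn(bsz, n_chunk * (chunk + 1), dim)               # [25 frames, 1 query] per chunk
hidden_4d = hidden.view(bsz, n_chunk, chunk + 1, dim)
frames, queries = hidden_4d[:, :, :chunk, :], hidden_4d[:, :, chunk, :]

alpha = torch.softmax(torch.randn(bsz, n_chunk, chunk), dim=-1)     # stand-in for the weight_fc output
aggregator = (frames * alpha.unsqueeze(-1)).sum(dim=2)              # weighted frame average
gate = torch.sigmoid(torch.randn(bsz, n_chunk, 1)) * 0.98 + 0.01    # stand-in for gate_fc, clamped to [0.01, 0.99]
queries_new = queries * (1 - gate) + aggregator * gate              # gated mix written back to the query slot
print(queries_new.shape)                                            # torch.Size([2, 3, 8])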
400
+ # ============================================================================
401
+ # MambaMia2Model (Simplified)
402
+ # ============================================================================
403
+ @dataclass
404
+ class MambaMia2Output(ModelOutput):
405
+ """Output class for MambaMia2Model."""
406
+ last_hidden_state: Optional[torch.FloatTensor] = None
407
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
408
+
409
+
410
+ class MambaMia2PreTrainedModel(PreTrainedModel):
411
+ """Base class for MambaMia2 models."""
412
+ config_class = MambaMia2Config
413
+ base_model_prefix = "backbone"
414
+ _no_split_modules = ["MambaMia2Block"]
415
+ supports_gradient_checkpointing = True
416
+
417
+ def _init_weights(self, module):
418
+ if isinstance(module, MambaMia2Mixer):
419
+ module.A_log._no_weight_decay = True
420
+ module.D._no_weight_decay = True
421
+
422
+ dt = torch.exp(
423
+ torch.rand(self.config.num_heads)
424
+ * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
425
+ + math.log(self.config.time_step_min)
426
+ ).clamp(min=self.config.time_step_floor)
427
+
428
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
429
+ with torch.no_grad():
430
+ module.dt_bias.copy_(inv_dt)
431
+ module.dt_bias._no_reinit = True
432
+
433
+ if isinstance(module, nn.Linear):
434
+ if module.bias is not None:
435
+ if not getattr(module.bias, "_no_reinit", False):
436
+ nn.init.zeros_(module.bias)
437
+ elif isinstance(module, nn.Embedding):
438
+ nn.init.normal_(module.weight, std=self.config.initializer_range)
439
+
440
+ if self.config.rescale_prenorm_residual:
441
+ for name, p in module.named_parameters():
442
+ if name in ["out_proj.weight"]:
443
+ nn.init.kaiming_uniform_(p, a=math.sqrt(5))
444
+ with torch.no_grad():
445
+ p /= math.sqrt(self.config.num_hidden_layers)
446
+
447
+
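Editorial note (not part of the commit): `inv_dt` above is the softplus inverse of the sampled time step, so `softplus(dt_bias)` inside the SSM kernel recovers `dt`. A quick sanity check:

import torch
import torch.nn.functional as F

dt = torch.tensor([0.001, 0.01, 0.1])          # sampled time steps
inv_dt = dt + torch.log(-torch.expm1(-dt))     # equals log(exp(dt) - 1), i.e. softplus^-1(dt)
assert torch.allclose(F.softplus(inv_dt), dt, atol=1e-6)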
448
+ class MambaMia2Model(MambaMia2PreTrainedModel):
449
+ """
450
+ Simplified MambaMia2 Model for v04 version.
451
+ Takes inputs_embeds directly (no embedding layer used for audio/video).
452
+ """
453
+
454
+ def __init__(self, config: MambaMia2Config):
455
+ super().__init__(config)
456
+ self.layers = nn.ModuleList([
457
+ MambaMia2Block(config, layer_idx=idx)
458
+ for idx in range(config.num_hidden_layers)
459
+ ])
460
+ self.gradient_checkpointing = False
461
+ self.norm_f = MambaMia2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
462
+ self.post_init()
463
+
464
+ def forward(
465
+ self,
466
+ inputs_embeds: torch.Tensor,
467
+ attention_mask: Optional[torch.Tensor] = None,
468
+ output_hidden_states: Optional[bool] = None,
469
+ return_dict: Optional[bool] = None,
470
+ ) -> Union[Tuple, MambaMia2Output]:
471
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
472
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
473
+
474
+ hidden_states = inputs_embeds
475
+ all_hidden_states = () if output_hidden_states else None
476
+
477
+ for mixer_block in self.layers:
478
+ if self.gradient_checkpointing and self.training:
479
+ hidden_states = self._gradient_checkpointing_func(
480
+ mixer_block.__call__, hidden_states, attention_mask
481
+ )
482
+ else:
483
+ hidden_states = mixer_block(hidden_states, attention_mask=attention_mask)
484
+
485
+ if output_hidden_states:
486
+ all_hidden_states = all_hidden_states + (hidden_states,)
487
+
488
+ hidden_states = self.norm_f(hidden_states)
489
+
490
+ if output_hidden_states:
491
+ all_hidden_states = all_hidden_states + (hidden_states,)
492
+
493
+ if not return_dict:
494
+ return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
495
+
496
+ return MambaMia2Output(
497
+ last_hidden_state=hidden_states,
498
+ hidden_states=all_hidden_states,
499
+ )
500
+
501
+
502
+ # ============================================================================
503
+ # MambaMiaVideoAudioCompressorConfig
504
+ # ============================================================================
505
+ class MambaMiaVideoAudioCompressorConfig(PretrainedConfig):
506
+ """
507
+ Configuration for MambaMiaVideoAudioCompressor.
508
+
509
+ Args:
510
+ input_size: Input embedding dimension (e.g., 1280 for Whisper)
511
+ output_size: Output embedding dimension (e.g., 2048 for LLM)
512
+ chunk_size: Number of tokens per chunk (default: 25, i.e., 1 second at 25Hz)
513
+ num_hidden_layers: Number of MambaMia2 layers (default: 1)
514
+ hidden_size: Internal hidden size (default: 3072, must be divisible by 24)
515
+ """
516
+ model_type = "mambamia_videoaudio_compressor"
517
+
518
+ def __init__(
519
+ self,
520
+ input_size: int = 1280,
521
+ output_size: int = 2048,
522
+ chunk_size: int = 25,
523
+ num_hidden_layers: int = 1,
524
+ hidden_size: int = 3072,
525
+ **kwargs,
526
+ ):
527
+ super().__init__(**kwargs)
528
+ self.input_size = input_size
529
+ self.output_size = output_size
530
+ self.chunk_size = chunk_size
531
+ self.num_hidden_layers = num_hidden_layers
532
+ self.hidden_size = hidden_size
533
+
534
+
535
+ # ============================================================================
536
+ # MambaMiaVideoAudioCompressor - Main Interface (based on PreTrainedModel)
537
+ # ============================================================================
538
+ class MambaMiaVideoAudioCompressor(PreTrainedModel):
539
+ """
540
+ Video/Audio Compressor using MambaMia2 (v04 bidirectional version).
541
+
542
+ This module compresses sequential embeddings (e.g., audio frames at 25Hz)
543
+ by inserting learnable query tokens and extracting them after processing.
544
+
545
+ Args:
546
+ config: MambaMiaVideoAudioCompressorConfig
547
+
548
+ Input:
549
+ inputs_embeds: (batch_size, num_frames, hidden_dim) where num_frames is
550
+ typically the audio length and hidden_dim matches input_size
551
+
552
+ Output:
553
+ compressed_embeds: (batch_size, num_queries, output_size) where
554
+ num_queries = num_frames // chunk_size
555
+ """
556
+
557
+ config_class = MambaMiaVideoAudioCompressorConfig
558
+ base_model_prefix = "mambamia_compressor"
559
+ _no_split_modules = ["MambaMia2Block"]
560
+
561
+ def __init__(self, config: MambaMiaVideoAudioCompressorConfig):
562
+ super().__init__(config)
563
+
564
+ self.input_size = config.input_size
565
+ self.output_size = config.output_size
566
+ self.chunk_size = config.chunk_size
567
+ self.hidden_size = config.hidden_size
568
+
569
+ # Input projection: input_size -> hidden_size
570
+ self.input_proj = nn.Linear(config.input_size, config.hidden_size)
571
+
572
+ # Learnable query token
573
+ self.query_token = nn.Parameter(torch.randn(config.hidden_size))
574
+
575
+ # MambaMia2 backbone
576
+ # Important: chunk_size here is the SSM kernel chunk size and must be smaller than the sequence length
577
+ # mambamia_chunk_size is the compression ratio (25:1)
578
+ # Sequences can be fairly short (e.g., ~390 tokens), so the kernel chunk size is chosen with that in mind
579
+ mamba_config = MambaMia2Config(
580
+ vocab_size=0,
581
+ hidden_size=config.hidden_size,
582
+ num_hidden_layers=config.num_hidden_layers,
583
+ head_dim=64,
584
+ num_heads=config.hidden_size * 2 // 64, # e.g., 3072*2/64 = 96
585
+ n_groups=1,
586
+ expand=2.0,
587
+ use_cache=False,
588
+ chunk_size=256, # SSM kernel chunk size
589
+ mambamia_chunk_size=config.chunk_size, # compression ratio (25)
590
+ residual_in_fp32=False,
591
+ )
592
+ self.model = MambaMia2Model(mamba_config)
593
+
594
+ # LayerNorm before Mamba2 to normalize input scales
595
+ # This ensures query_token and input_proj outputs are on the same scale
596
+ self.input_norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
597
+
598
+ # Output projection: hidden_size -> output_size
599
+ self.output_proj = nn.Linear(config.hidden_size, config.output_size)
600
+
601
+ # Initialize weights (transformers style)
602
+ self.post_init()
603
+
604
+ def _init_weights(self, module):
605
+ """
606
+ Initialize weights - called by post_init() for all submodules.
607
+ Note: weights inside MambaMia2Model are left untouched (handled by its own post_init).
608
+ """
609
+ # Initialize query_token with std=1.0 to match the scale of the input_proj outputs
610
+ # (a small std pushes the LayerNorm variance toward zero, which produces inf)
611
+ if module is self:
612
+ with torch.no_grad():
613
+ self.query_token.data.normal_(mean=0.0, std=1.0)
614
+
615
+ # Xavier-initialize only input_proj and output_proj (leave MambaMia2 internals untouched)
616
+ if module is self.input_proj or module is self.output_proj:
617
+ nn.init.xavier_uniform_(module.weight)
618
+ if module.bias is not None:
619
+ nn.init.zeros_(module.bias)
620
+
621
+ def _init_all_weights(self):
622
+ """
623
+ Force re-initialize all weights. Call after dtype conversion for FSDP compatibility.
624
+ This ensures weights are properly initialized even after model transformations.
625
+ """
626
+ # 1. Initialize input_proj and output_proj
627
+ nn.init.xavier_uniform_(self.input_proj.weight)
628
+ if self.input_proj.bias is not None:
629
+ nn.init.zeros_(self.input_proj.bias)
630
+ nn.init.xavier_uniform_(self.output_proj.weight)
631
+ if self.output_proj.bias is not None:
632
+ nn.init.zeros_(self.output_proj.bias)
633
+
634
+ # 2. Initialize query_token with std=1.0 to match the input_proj output scale
635
+ self.query_token.data.normal_(mean=0.0, std=1.0)
636
+
637
+ # 3. Initialize input_norm (LayerNorm)
638
+ nn.init.ones_(self.input_norm.weight)
639
+ nn.init.zeros_(self.input_norm.bias)
640
+
641
+ # 4. Initialize MambaMia2Model internals (important!)
642
+ for name, module in self.model.named_modules():
643
+ if isinstance(module, nn.Linear):
644
+ nn.init.xavier_uniform_(module.weight)
645
+ if module.bias is not None:
646
+ nn.init.zeros_(module.bias)
647
+ elif isinstance(module, nn.Conv1d):
648
+ nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
649
+ if module.bias is not None:
650
+ nn.init.zeros_(module.bias)
651
+
652
+ # 5. Special initialization for the MambaMia2Block heads (weight_fc, gate_fc)
653
+ for layer in self.model.layers:
654
+ if hasattr(layer, 'weight_fc'):
655
+ nn.init.xavier_uniform_(layer.weight_fc.weight)
656
+ layer.weight_fc.weight.data.mul_(0.01) # Scale down
657
+ nn.init.zeros_(layer.weight_fc.bias)
658
+ if hasattr(layer, 'gate_fc'):
659
+ nn.init.xavier_uniform_(layer.gate_fc.weight)
660
+ layer.gate_fc.weight.data.mul_(0.01) # Scale down
661
+ nn.init.zeros_(layer.gate_fc.bias)
662
+
663
+ # 6. Initialize the A_log, D, and dt_bias parameters (SSM-specific)
664
+ for layer in self.model.layers:
665
+ if hasattr(layer, 'mixer'):
666
+ mixer = layer.mixer
667
+ # A_log: S4D real initialization
668
+ A = torch.arange(1, mixer.num_heads + 1, dtype=mixer.A_log.dtype, device=mixer.A_log.device)
669
+ mixer.A_log.data.copy_(torch.log(A))
670
+ # D: scaling factor
671
+ mixer.D.data.fill_(1.0)
672
+ # dt_bias: time step bias (important!)
673
+ mixer.dt_bias.data.fill_(1.0)
674
+
675
+ # 7. Initialize RMSNorm weights (MambaRMSNormGated)
676
+ for layer in self.model.layers:
677
+ if hasattr(layer, 'mixer') and hasattr(layer.mixer, 'norm'):
678
+ layer.mixer.norm.weight.data.fill_(1.0)
679
+ if hasattr(layer, 'norm') and hasattr(layer.norm, 'weight'):
680
+ layer.norm.weight.data.fill_(1.0)
681
+
682
+ # 8. Initialize the final norm_f of MambaMia2Model
683
+ if hasattr(self.model, 'norm_f') and hasattr(self.model.norm_f, 'weight'):
684
+ self.model.norm_f.weight.data.fill_(1.0)
685
+
686
+ def forward(
687
+ self,
688
+ inputs_embeds: torch.Tensor,
689
+ ) -> torch.Tensor:
690
+ """
691
+ Forward pass.
692
+
693
+ Args:
694
+ inputs_embeds: (batch_size, seq_len, input_size) or
695
+ (batch_size, num_frames, chunk_size, input_size)
696
+
697
+ Returns:
698
+ compressed: (batch_size, num_queries, output_size)
699
+ """
700
+ import os
701
+ rank = int(os.environ.get("RANK", -1))
702
+ debug = False # True if (rank <= 0) else False
703
+
704
+ # Handle different input shapes
705
+ if inputs_embeds.dim() == 4:
706
+ # (batch_size, num_frames, chunk_size, input_size)
707
+ bsz, num_frames, chunk_size, _ = inputs_embeds.shape
708
+ assert chunk_size == self.chunk_size, \
709
+ f"Input chunk_size {chunk_size} != expected {self.chunk_size}"
710
+ inputs_embeds = inputs_embeds.view(bsz, -1, self.input_size)
711
+
712
+ bsz, seq_len, _ = inputs_embeds.shape
713
+
714
+ # Ensure seq_len is divisible by chunk_size
715
+ if seq_len % self.chunk_size != 0:
716
+ # Pad to make divisible
717
+ pad_len = self.chunk_size - (seq_len % self.chunk_size)
718
+ inputs_embeds = F.pad(inputs_embeds, (0, 0, 0, pad_len))
719
+ seq_len = inputs_embeds.shape[1]
720
+
721
+ n_chunk = seq_len // self.chunk_size
722
+
723
+ # Project input
724
+ hidden_states = self.input_proj(inputs_embeds) # (bsz, seq_len, hidden_size)
725
+
726
+ if debug:
727
+ print(f"[MambaMia DEBUG] input_proj output: min={hidden_states.min().item():.6f}, max={hidden_states.max().item():.6f}, has_nan={torch.isnan(hidden_states).any().item()}")
728
+
729
+ # Reshape to (bsz, n_chunk, chunk_size, hidden_size)
730
+ hidden_4d = hidden_states.view(bsz, n_chunk, self.chunk_size, self.hidden_size)
731
+
732
+ # Add query token to each chunk
733
+ # query_token: (hidden_size,) -> (1, 1, 1, hidden_size)
734
+ query_expanded = self.query_token.view(1, 1, 1, -1).expand(bsz, n_chunk, 1, self.hidden_size)
735
+
736
+ if debug:
737
+ print(f"[MambaMia DEBUG] query_token: min={self.query_token.min().item():.6f}, max={self.query_token.max().item():.6f}, has_nan={torch.isnan(self.query_token).any().item()}")
738
+
739
+ # Concatenate: (bsz, n_chunk, chunk_size+1, hidden_size)
740
+ hidden_with_query = torch.cat([hidden_4d, query_expanded], dim=2)
741
+
742
+ # Flatten for model: (bsz, n_chunk * (chunk_size+1), hidden_size)
743
+ model_input = hidden_with_query.view(bsz, -1, self.hidden_size)
744
+
745
+ # Apply LayerNorm to normalize input scales before Mamba2
746
+ model_input = self.input_norm(model_input)
747
+
748
+ if debug:
749
+ print(f"[MambaMia DEBUG] model_input (after LayerNorm, before Mamba2): min={model_input.min().item():.6f}, max={model_input.max().item():.6f}, has_nan={torch.isnan(model_input).any().item()}")
750
+
751
+ # Forward through MambaMia2
752
+ outputs = self.model(inputs_embeds=model_input)
753
+ hidden_states = outputs.last_hidden_state # (bsz, n_chunk * (chunk_size+1), hidden_size)
754
+
755
+ if debug:
756
+ print(f"[MambaMia DEBUG] model output (after Mamba2): min={hidden_states.min().item():.6f}, max={hidden_states.max().item():.6f}, has_nan={torch.isnan(hidden_states).any().item()}")
757
+
758
+ # Check for NaN and replace with zeros if found (defensive)
759
+ if torch.isnan(hidden_states).any():
760
+ hidden_states = torch.nan_to_num(hidden_states, nan=0.0)
761
+
762
+ # Reshape back: (bsz, n_chunk, chunk_size+1, hidden_size)
763
+ hidden_out_4d = hidden_states.view(bsz, n_chunk, self.chunk_size + 1, self.hidden_size)
764
+
765
+ # Extract query positions (last position in each chunk)
766
+ query_outputs = hidden_out_4d[:, :, self.chunk_size, :] # (bsz, n_chunk, hidden_size)
767
+
768
+ if debug:
769
+ print(f"[MambaMia DEBUG] query_outputs (extracted): min={query_outputs.min().item():.6f}, max={query_outputs.max().item():.6f}, has_nan={torch.isnan(query_outputs).any().item()}")
770
+
771
+ # Project to output size
772
+ compressed = self.output_proj(query_outputs) # (bsz, n_chunk, output_size)
773
+
774
+ if debug:
775
+ print(f"[MambaMia DEBUG] output_proj output: min={compressed.min().item():.6f}, max={compressed.max().item():.6f}, has_nan={torch.isnan(compressed).any().item()}")
776
+
777
+ return compressed
778
+
779
+
780
+ # ============================================================================
781
+ # Convenience function for quick instantiation
782
+ # ============================================================================
783
+ def create_mambamia_compressor(
784
+ input_size: int,
785
+ output_size: int,
786
+ chunk_size: int = 25,
787
+ num_hidden_layers: int = 2,
788
+ hidden_size: int = 3072,
789
+ ) -> MambaMiaVideoAudioCompressor:
790
+ """
791
+ Create a MambaMiaVideoAudioCompressor with default settings.
792
+
793
+ Example:
794
+ compressor = create_mambamia_compressor(1280, 2048, chunk_size=25)
795
+ """
796
+ config = MambaMiaVideoAudioCompressorConfig(
797
+ input_size=input_size,
798
+ output_size=output_size,
799
+ chunk_size=chunk_size,
800
+ num_hidden_layers=num_hidden_layers,
801
+ hidden_size=hidden_size,
802
+ )
803
+ return MambaMiaVideoAudioCompressor(config)
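Editorial note (not part of the commit): a minimal usage sketch of the compressor, assuming 1280-d Whisper-style features at 25 Hz, a 2048-d LLM embedding space, and an environment where the Mamba2 kernels used by the mixer are available:

import torch

compressor = create_mambamia_compressor(input_size=1280, output_size=2048, chunk_size=25)
audio_feats = torch.randn(2, 500, 1280)   # 2 clips, 500 frames (20 s at 25 Hz)
compressed = compressor(audio_feats)      # one query token per 25-frame chunk
print(compressed.shape)                   # expected: torch.Size([2, 20, 2048])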
model-00001-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f0c21a9149c81295ffd42490cfb2daf7c9e6dcc39f11a4e4c4b4fe4be8a9e2a
3
+ size 4707522584
model-00002-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91399f86327e37a1b75ca14154cdc42d9994f39c08c34e8156503e89f10cc800
3
+ size 3454903840
model-00003-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f779b660d0a569700efef179dda58a3496decb3049b8c8269860f83b32bb647f
3
+ size 4999679056
model-00004-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73022915ccce7a97bb31c0cc60c7b7b52fe7cd4a6859949a1e8f3d60c51e2fb8
3
+ size 4832042296
model-00005-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36449388c5903053f1e6cbac60140feb7b321a006e8493ed0f10480bca540d46
3
+ size 4832042328
model-00006-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3696ca372b23a411837f961c00f05a0eaf9a3035438f37525793753bd08823fa
3
+ size 4999848088
model-00007-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5481aa36ff6148bb35812b4de5bf2af12daf4e6d07a3106fd72d082088816cab
3
+ size 4832042352
model-00008-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:772db476e45a055871d22c837a70fbd2d4960dd075031f3d2b6b5a6ea0888db2
3
+ size 4832042352
model-00009-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb63af1e386f7ccf472c4d3108c134d863869ab13c4f9b945586a2994f3258d6
3
+ size 1744948136
model-00010-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c5a2c76f5b7960665d75ed473ae43eb76a5d86e27d8820ede0b9b16bb40b68c
3
+ size 3731831368
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_hyperclovax.py ADDED
@@ -0,0 +1,1866 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import math
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
30
+ from transformers.generation import GenerationMixin
31
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
32
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutputWithPast,
35
+ CausalLMOutputWithPast,
36
+ QuestionAnsweringModelOutput,
37
+ SequenceClassifierOutputWithPast,
38
+ TokenClassifierOutput,
39
+ )
40
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
41
+ from transformers.modeling_utils import PreTrainedModel
42
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
43
+ from transformers.utils import (
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_flash_attn_greater_or_equal_2_10,
47
+ is_torchdynamo_compiling,
48
+ logging,
49
+ replace_return_docstrings,
50
+ )
51
+
52
+ from .configuration_hyperclovax import HyperCLOVAXConfig
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+ _CONFIG_FOR_DOC = "HyperCLOVAXConfig"
57
+
58
+
59
+ def _prepare_4d_causal_attention_mask_with_cache_position(
60
+ attention_mask: torch.Tensor,
61
+ sequence_length: int,
62
+ target_length: int,
63
+ dtype: torch.dtype,
64
+ device: torch.device,
65
+ min_dtype: float,
66
+ cache_position: torch.Tensor,
67
+ batch_size: int,
68
+ ):
69
+ """
70
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
71
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
72
+
73
+ Args:
74
+ attention_mask (`torch.Tensor`):
75
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
76
+ sequence_length (`int`):
77
+ The sequence length being processed.
78
+ target_length (`int`):
79
+ The target length: when generating with a static cache, the mask should be as long as the static cache to account for the zero padding (the part of the cache that is not yet filled).
80
+ dtype (`torch.dtype`):
81
+ The dtype to use for the 4D attention mask.
82
+ device (`torch.device`):
83
+ The device to place the 4D attention mask on.
84
+ min_dtype (`float`):
85
+ The minimum value representable with the dtype `dtype`.
86
+ cache_position (`torch.Tensor`):
87
+ Indices depicting the position of the input sequence tokens in the sequence.
88
+ batch_size (`torch.Tensor`):
89
+ Batch size.
90
+ """
91
+ if attention_mask is not None and attention_mask.dim() == 4:
92
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
93
+ causal_mask = attention_mask
94
+ else:
95
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
96
+ if sequence_length != 1:
97
+ causal_mask = torch.triu(causal_mask, diagonal=1)
98
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
99
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
100
+ if attention_mask is not None:
101
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
102
+ mask_length = attention_mask.shape[-1]
103
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
104
+ padding_mask = padding_mask == 0
105
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
106
+
107
+ return causal_mask
108
+
109
+
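Editorial note (not part of the commit): a small sketch of calling the helper above with toy sizes and no padding:

import torch

causal = _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask=torch.ones(2, 4),
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    device=torch.device("cpu"),
    min_dtype=torch.finfo(torch.float32).min,
    cache_position=torch.arange(4),
    batch_size=2,
)
print(causal.shape)   # torch.Size([2, 1, 4, 4]); entries above the diagonal hold min_dtype, the rest are 0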
110
+ class HyperCLOVAXRMSNorm(nn.Module):
111
+ def __init__(self, hidden_size, eps=1e-6):
112
+ """
113
+ HyperCLOVAXRMSNorm is equivalent to T5LayerNorm
114
+ """
115
+ super().__init__()
116
+ self.weight = nn.Parameter(torch.ones(hidden_size))
117
+ self.variance_epsilon = eps
118
+
119
+ def forward(self, hidden_states):
120
+ input_dtype = hidden_states.dtype
121
+ hidden_states = hidden_states.to(torch.float32)
122
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
123
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
124
+ return self.weight * hidden_states.to(input_dtype)
125
+
126
+ def extra_repr(self):
127
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
128
+
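Editorial note (not part of the commit): `HyperCLOVAXRMSNorm` follows the usual RMSNorm formula, x * weight / sqrt(mean(x**2, dim=-1) + eps), computed in float32. A quick check against a manual implementation:

import torch

norm = HyperCLOVAXRMSNorm(8, eps=1e-6)           # weight starts at ones
x = torch.randn(2, 3, 8)
ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), ref, atol=1e-6)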
129
+
130
+ ALL_LAYERNORM_LAYERS.append(HyperCLOVAXRMSNorm)
131
+
132
+
133
+ class HyperCLOVAXRotaryEmbedding(nn.Module):
134
+ def __init__(
135
+ self,
136
+ dim=None,
137
+ max_position_embeddings=2048,
138
+ base=10000,
139
+ device=None,
140
+ scaling_factor=1.0,
141
+ rope_type="default",
142
+ config: Optional[HyperCLOVAXConfig] = None,
143
+ ):
144
+ super().__init__()
145
+ # TODO (joao): remove the `if` below, only used for BC
146
+ self.rope_kwargs = {}
147
+ if config is None:
148
+ logger.warning_once(
149
+ "`HyperCLOVAXRotaryEmbedding` can now be fully parameterized by passing the model config through the "
150
+ "`config` argument. All other arguments will be removed in v4.46"
151
+ )
152
+ self.rope_kwargs = {
153
+ "rope_type": rope_type,
154
+ "factor": scaling_factor,
155
+ "dim": dim,
156
+ "base": base,
157
+ "max_position_embeddings": max_position_embeddings,
158
+ }
159
+ self.rope_type = rope_type
160
+ self.max_seq_len_cached = max_position_embeddings
161
+ self.original_max_seq_len = max_position_embeddings
162
+ else:
163
+ # BC: "rope_type" was originally "type"
164
+ if config.rope_scaling is not None:
165
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
166
+ else:
167
+ self.rope_type = "default"
168
+ self.max_seq_len_cached = config.max_position_embeddings
169
+ self.original_max_seq_len = config.max_position_embeddings
170
+
171
+ self.config = config
172
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
173
+
174
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
175
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
176
+ self.original_inv_freq = self.inv_freq
177
+
178
+ def _dynamic_frequency_update(self, position_ids, device):
179
+ """
180
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
181
+ 1 - growing beyond the cached sequence length (allow scaling)
182
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
183
+ """
184
+ seq_len = torch.max(position_ids) + 1
185
+ if seq_len > self.max_seq_len_cached: # growth
186
+ inv_freq, self.attention_scaling = self.rope_init_fn(
187
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
188
+ )
189
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
190
+ self.max_seq_len_cached = seq_len
191
+
192
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
193
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
194
+ self.max_seq_len_cached = self.original_max_seq_len
195
+
196
+ @torch.no_grad()
197
+ def forward(self, x, position_ids):
198
+ if "dynamic" in self.rope_type:
199
+ self._dynamic_frequency_update(position_ids, device=x.device)
200
+
201
+ # Core RoPE block
202
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
203
+ position_ids_expanded = position_ids[:, None, :].float()
204
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
205
+ device_type = x.device.type
206
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
207
+ with torch.autocast(device_type=device_type, enabled=False):
208
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
209
+ emb = torch.cat((freqs, freqs), dim=-1)
210
+ cos = emb.cos()
211
+ sin = emb.sin()
212
+
213
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
214
+ cos = cos * self.attention_scaling
215
+ sin = sin * self.attention_scaling
216
+
217
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
218
+
219
+
220
+ class HyperCLOVAXLinearScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
221
+ """HyperCLOVAXRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
222
+
223
+ def __init__(self, *args, **kwargs):
224
+ logger.warning_once(
225
+ "`HyperCLOVAXLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use "
226
+ "`HyperCLOVAXRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
227
+ )
228
+ kwargs["rope_type"] = "linear"
229
+ super().__init__(*args, **kwargs)
230
+
231
+
232
+ class HyperCLOVAXDynamicNTKScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
233
+ """HyperCLOVAXRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
234
+
235
+ def __init__(self, *args, **kwargs):
236
+ logger.warning_once(
237
+ "`HyperCLOVAXDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use "
238
+ "`HyperCLOVAXRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
239
+ "__init__)."
240
+ )
241
+ kwargs["rope_type"] = "dynamic"
242
+ super().__init__(*args, **kwargs)
243
+
244
+
245
+ def rotate_half(x):
246
+ """Rotates half the hidden dims of the input."""
247
+ x1 = x[..., : x.shape[-1] // 2]
248
+ x2 = x[..., x.shape[-1] // 2 :]
249
+ return torch.cat((-x2, x1), dim=-1)
250
+
251
+
252
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
253
+ """Applies Rotary Position Embedding to the query and key tensors.
254
+
255
+ Args:
256
+ q (`torch.Tensor`): The query tensor.
257
+ k (`torch.Tensor`): The key tensor.
258
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
259
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
260
+ position_ids (`torch.Tensor`, *optional*):
261
+ Deprecated and unused.
262
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
263
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
264
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
265
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
266
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
267
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
268
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
269
+ Returns:
270
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
271
+ """
272
+ cos = cos.unsqueeze(unsqueeze_dim)
273
+ sin = sin.unsqueeze(unsqueeze_dim)
274
+ q_embed = (q * cos) + (rotate_half(q) * sin)
275
+ k_embed = (k * cos) + (rotate_half(k) * sin)
276
+ return q_embed, k_embed
277
+
278
+
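Editorial note (not part of the commit): a tiny sanity check of `apply_rotary_pos_emb`. With cos = 1 and sin = 0 the rotation is the identity; the expected layout is (batch, num_heads, seq_len, head_dim) for q/k and (batch, seq_len, head_dim) for cos/sin:

import torch

q = torch.randn(1, 4, 16, 64)
k = torch.randn(1, 4, 16, 64)
cos, sin = torch.ones(1, 16, 64), torch.zeros(1, 16, 64)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)   # unsqueeze_dim=1 broadcasts over heads
assert torch.equal(q_rot, q) and torch.equal(k_rot, k)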
279
+ class HyperCLOVAXMLP(nn.Module):
280
+ def __init__(self, config):
281
+ super().__init__()
282
+ self.config = config
283
+ self.hidden_size = config.hidden_size
284
+ self.intermediate_size = config.intermediate_size
285
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
286
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
287
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
288
+ self.act_fn = ACT2FN[config.hidden_act]
289
+
290
+ def forward(self, x):
291
+ if self.config.pretraining_tp > 1:
292
+ slice = self.intermediate_size // self.config.pretraining_tp
293
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
294
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
295
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
296
+
297
+ gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
298
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
299
+
300
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
301
+ down_proj = [
302
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
303
+ ]
304
+ down_proj = sum(down_proj)
305
+ else:
306
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
307
+
308
+ return down_proj
309
+
310
+
311
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
312
+ """
313
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
314
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
315
+ """
316
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
317
+ if n_rep == 1:
318
+ return hidden_states
319
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
320
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
321
+
322
+
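Editorial note (not part of the commit): a quick check that `repeat_kv` matches `torch.repeat_interleave` on the key/value head dimension, as the docstring states:

import torch

x = torch.randn(2, 4, 7, 16)   # (batch, num_key_value_heads, seq_len, head_dim)
assert torch.equal(repeat_kv(x, 3), torch.repeat_interleave(x, repeats=3, dim=1))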
323
+ class HyperCLOVAXAttention(nn.Module):
324
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
325
+
326
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: Optional[int] = None):
327
+ super().__init__()
328
+ self.config = config
329
+ self.layer_idx = layer_idx
330
+ if layer_idx is None:
331
+ logger.warning_once(
332
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
333
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
334
+ "when creating this class."
335
+ )
336
+
337
+ self.attention_dropout = config.attention_dropout
338
+ self.hidden_size = config.hidden_size
339
+ self.num_heads = config.num_attention_heads
340
+ self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
341
+ self.num_key_value_heads = config.num_key_value_heads
342
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
343
+ self.max_position_embeddings = config.max_position_embeddings
344
+ self.rope_theta = config.rope_theta
345
+ self.is_causal = True
346
+
347
+ self.scaling = config.attention_multiplier
348
+
349
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
350
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
351
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
352
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
353
+
354
+ # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
355
+ self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=self.config)
356
+
357
+ def forward(
358
+ self,
359
+ hidden_states: torch.Tensor,
360
+ attention_mask: Optional[torch.Tensor] = None,
361
+ position_ids: Optional[torch.LongTensor] = None,
362
+ past_key_value: Optional[Cache] = None,
363
+ output_attentions: bool = False,
364
+ use_cache: bool = False,
365
+ cache_position: Optional[torch.LongTensor] = None,
366
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
367
+ **kwargs,
368
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
369
+ bsz, q_len, _ = hidden_states.size()
370
+
371
+ if self.config.pretraining_tp > 1:
372
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
373
+ query_slices = self.q_proj.weight.split(
374
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
375
+ )
376
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
377
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
378
+
379
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
380
+ query_states = torch.cat(query_states, dim=-1)
381
+
382
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
383
+ key_states = torch.cat(key_states, dim=-1)
384
+
385
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
386
+ value_states = torch.cat(value_states, dim=-1)
387
+
388
+ else:
389
+ query_states = self.q_proj(hidden_states)
390
+ key_states = self.k_proj(hidden_states)
391
+ value_states = self.v_proj(hidden_states)
392
+
393
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
394
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
395
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
396
+
397
+ if position_embeddings is None:
398
+ logger.warning_once(
399
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
400
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
401
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
402
+ "removed and `position_embeddings` will be mandatory."
403
+ )
404
+ cos, sin = self.rotary_emb(value_states, position_ids)
405
+ else:
406
+ cos, sin = position_embeddings
407
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
408
+
409
+ if past_key_value is not None:
410
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
411
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
412
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
413
+
414
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
415
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
416
+ # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling / math.sqrt(self.head_dim)
417
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
418
+
419
+ if attention_mask is not None: # no matter the length, we just slice it
420
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
421
+ attn_weights = attn_weights + causal_mask
422
+
423
+ # upcast attention to fp32
424
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
425
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
426
+ attn_output = torch.matmul(attn_weights, value_states)
427
+
428
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
429
+ raise ValueError(
430
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
431
+ f" {attn_output.size()}"
432
+ )
433
+
434
+ attn_output = attn_output.transpose(1, 2).contiguous()
435
+
436
+ attn_output = attn_output.reshape(bsz, q_len, -1)
437
+
438
+ if self.config.pretraining_tp > 1:
439
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
440
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
441
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
442
+ else:
443
+ attn_output = self.o_proj(attn_output)
444
+
445
+ if not output_attentions:
446
+ attn_weights = None
447
+
448
+ return attn_output, attn_weights, past_key_value
449
+
450
+
451
+ class HyperCLOVAXFlashAttention2(HyperCLOVAXAttention):
452
+ """
453
+ HyperCLOVAX flash attention module. This module inherits from `HyperCLOVAXAttention` as the weights of the module stay
454
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
455
+ flash attention and deal with padding tokens in case the input contains any of them.
456
+ """
457
+
458
+ def __init__(self, *args, **kwargs):
459
+ super().__init__(*args, **kwargs)
460
+
461
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
462
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
463
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
464
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
465
+
466
+ def forward(
467
+ self,
468
+ hidden_states: torch.Tensor,
469
+ attention_mask: Optional[torch.LongTensor] = None,
470
+ position_ids: Optional[torch.LongTensor] = None,
471
+ past_key_value: Optional[Cache] = None,
472
+ output_attentions: bool = False,
473
+ use_cache: bool = False,
474
+ cache_position: Optional[torch.LongTensor] = None,
475
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
476
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
477
+ if isinstance(past_key_value, StaticCache):
478
+ raise ValueError(
479
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
480
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
481
+ )
482
+
483
+ output_attentions = False
484
+
485
+ bsz, q_len, _ = hidden_states.size()
486
+
487
+ query_states = self.q_proj(hidden_states)
488
+ key_states = self.k_proj(hidden_states)
489
+ value_states = self.v_proj(hidden_states)
490
+
491
+ # Flash attention requires the input to have the shape
492
+ # batch_size x seq_length x num_heads x head_dim
493
+ # therefore we just need to keep the original shape
494
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
495
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
496
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
497
+
498
+ if position_embeddings is None:
499
+ logger.warning_once(
500
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
501
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
502
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
503
+ "removed and `position_embeddings` will be mandatory."
504
+ )
505
+ cos, sin = self.rotary_emb(value_states, position_ids)
506
+ else:
507
+ cos, sin = position_embeddings
508
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
509
+
510
+ if past_key_value is not None:
511
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
512
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
513
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
514
+
515
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
516
+ # to be able to avoid many of these transpose/reshape/view.
517
+ query_states = query_states.transpose(1, 2)
518
+ key_states = key_states.transpose(1, 2)
519
+ value_states = value_states.transpose(1, 2)
520
+
521
+ dropout_rate = self.attention_dropout if self.training else 0.0
522
+
523
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
524
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
525
+ # cast them back in the correct dtype just to be sure everything works as expected.
526
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
527
+ # in fp32. (HyperCLOVAXRMSNorm handles it correctly)
528
+
529
+ input_dtype = query_states.dtype
530
+ if input_dtype == torch.float32:
531
+ if torch.is_autocast_enabled():
532
+ target_dtype = torch.get_autocast_gpu_dtype()
533
+ # Handle the case where the model is quantized
534
+ elif hasattr(self.config, "_pre_quantization_dtype"):
535
+ target_dtype = self.config._pre_quantization_dtype
536
+ else:
537
+ target_dtype = self.q_proj.weight.dtype
538
+
539
+ logger.warning_once(
540
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
541
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
542
+ f" {target_dtype}."
543
+ )
544
+
545
+ query_states = query_states.to(target_dtype)
546
+ key_states = key_states.to(target_dtype)
547
+ value_states = value_states.to(target_dtype)
548
+
549
+ attn_output = _flash_attention_forward(
550
+ query_states,
551
+ key_states,
552
+ value_states,
553
+ attention_mask,
554
+ q_len,
555
+ position_ids=position_ids,
556
+ dropout=dropout_rate,
557
+ softmax_scale=self.scaling, # mup
558
+ sliding_window=getattr(self, "sliding_window", None),
559
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
560
+ is_causal=self.is_causal,
561
+ )
562
+
563
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
564
+ attn_output = self.o_proj(attn_output)
565
+
566
+ if not output_attentions:
567
+ attn_weights = None
568
+
569
+ return attn_output, attn_weights, past_key_value
570
+
571
+
572
+ class HyperCLOVAXSdpaAttention(HyperCLOVAXAttention):
573
+ """
574
+ HyperCLOVAX attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
575
+ `HyperCLOVAXAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
576
+ SDPA API.
577
+ """
578
+
579
+ # Adapted from HyperCLOVAXAttention.forward
580
+ def forward(
581
+ self,
582
+ hidden_states: torch.Tensor,
583
+ attention_mask: Optional[torch.Tensor] = None,
584
+ position_ids: Optional[torch.LongTensor] = None,
585
+ past_key_value: Optional[Cache] = None,
586
+ output_attentions: bool = False,
587
+ use_cache: bool = False,
588
+ cache_position: Optional[torch.LongTensor] = None,
589
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
590
+ **kwargs,
591
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
592
+ if output_attentions:
593
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
594
+ logger.warning_once(
595
+ "HyperCLOVAXModel is using HyperCLOVAXSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
596
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
597
+ )
598
+ return super().forward(
599
+ hidden_states=hidden_states,
600
+ attention_mask=attention_mask,
601
+ position_ids=position_ids,
602
+ past_key_value=past_key_value,
603
+ output_attentions=output_attentions,
604
+ use_cache=use_cache,
605
+ cache_position=cache_position,
606
+ position_embeddings=position_embeddings,
607
+ )
608
+
609
+ bsz, q_len, _ = hidden_states.size()
610
+
611
+ query_states = self.q_proj(hidden_states)
612
+ key_states = self.k_proj(hidden_states)
613
+ value_states = self.v_proj(hidden_states)
614
+
615
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
616
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
617
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
618
+
619
+ if position_embeddings is None:
620
+ logger.warning_once(
621
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
622
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
623
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
624
+ "removed and `position_embeddings` will be mandatory."
625
+ )
626
+ cos, sin = self.rotary_emb(value_states, position_ids)
627
+ else:
628
+ cos, sin = position_embeddings
629
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
630
+
631
+ if past_key_value is not None:
632
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
633
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
634
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
635
+
636
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
637
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
638
+
639
+ causal_mask = attention_mask
640
+ if attention_mask is not None:
641
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
642
+
643
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
644
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
645
+ if query_states.device.type == "cuda" and causal_mask is not None:
646
+ query_states = query_states.contiguous()
647
+ key_states = key_states.contiguous()
648
+ value_states = value_states.contiguous()
649
+
650
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
651
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
652
+ is_causal = True if causal_mask is None and q_len > 1 else False
653
+
654
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
655
+ query_states,
656
+ key_states,
657
+ value_states,
658
+ attn_mask=causal_mask,
659
+ dropout_p=self.attention_dropout if self.training else 0.0,
660
+ is_causal=is_causal,
661
+ scale=self.scaling, # mup
662
+ )
663
+
664
+ attn_output = attn_output.transpose(1, 2).contiguous()
665
+ attn_output = attn_output.view(bsz, q_len, -1)
666
+
667
+ attn_output = self.o_proj(attn_output)
668
+
669
+ return attn_output, None, past_key_value
670
+
671
+
672
+ HyperCLOVAX_ATTENTION_CLASSES = {
673
+ "eager": HyperCLOVAXAttention,
674
+ "flash_attention_2": HyperCLOVAXFlashAttention2,
675
+ "sdpa": HyperCLOVAXSdpaAttention,
676
+ }
677
+
678
+
679
+ class HyperCLOVAXDecoderLayer(nn.Module):
680
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
681
+ super().__init__()
682
+ self.hidden_size = config.hidden_size
683
+
684
+ self.self_attn = HyperCLOVAX_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
685
+
686
+ self.mlp = HyperCLOVAXMLP(config)
687
+ self.input_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
688
+ self.post_attention_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
689
+
690
+ # post-norm (dual-norm)
691
+ self.use_post_norm = config.use_post_norm
692
+ if self.use_post_norm:
693
+ self.post_norm1 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
694
+ self.post_norm2 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
695
+
696
+ self.residual_multiplier = config.residual_multiplier # mup
697
+
698
+ def forward(
699
+ self,
700
+ hidden_states: torch.Tensor,
701
+ attention_mask: Optional[torch.Tensor] = None,
702
+ position_ids: Optional[torch.LongTensor] = None,
703
+ past_key_value: Optional[Cache] = None,
704
+ output_attentions: Optional[bool] = False,
705
+ use_cache: Optional[bool] = False,
706
+ cache_position: Optional[torch.LongTensor] = None,
707
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
708
+ **kwargs,
709
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
710
+ """
711
+ Args:
712
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
713
+ attention_mask (`torch.FloatTensor`, *optional*):
714
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
715
+ query_sequence_length, key_sequence_length)` if default attention is used.
716
+ output_attentions (`bool`, *optional*):
717
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
718
+ returned tensors for more detail.
719
+ use_cache (`bool`, *optional*):
720
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
721
+ (see `past_key_values`).
722
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
723
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
724
+ Indices depicting the position of the input sequence tokens in the sequence
725
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
726
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
727
+ with `head_dim` being the embedding dimension of each attention head.
728
+ kwargs (`dict`, *optional*):
729
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
730
+ into the model
731
+ """
732
+ residual = hidden_states
733
+
734
+ hidden_states = self.input_layernorm(hidden_states)
735
+
736
+ # Self Attention
737
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
738
+ hidden_states=hidden_states,
739
+ attention_mask=attention_mask,
740
+ position_ids=position_ids,
741
+ past_key_value=past_key_value,
742
+ output_attentions=output_attentions,
743
+ use_cache=use_cache,
744
+ cache_position=cache_position,
745
+ position_embeddings=position_embeddings,
746
+ **kwargs,
747
+ )
748
+
749
+ if self.use_post_norm:
750
+ hidden_states = self.post_norm1(hidden_states)
751
+
752
+ hidden_states = residual + hidden_states * self.residual_multiplier # mup
753
+
754
+ # Fully Connected
755
+ residual = hidden_states
756
+ hidden_states = self.post_attention_layernorm(hidden_states)
757
+ hidden_states = self.mlp(hidden_states)
758
+
759
+ if self.use_post_norm:
760
+ hidden_states = self.post_norm2(hidden_states)
761
+
762
+ hidden_states = residual + hidden_states * self.residual_multiplier # mup
763
+
764
+ outputs = (hidden_states,)
765
+
766
+ if output_attentions:
767
+ outputs += (self_attn_weights,)
768
+
769
+ if use_cache:
770
+ outputs += (present_key_value,)
771
+
772
+ return outputs
773
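# Schematic of the residual pattern implemented by HyperCLOVAXDecoderLayer above, written out as
# a plain function (illustrative only; `attn` and `mlp` stand in for the real sub-modules). With
# `use_post_norm=True` each sub-block is normalised both before and after ("dual-norm"), and the
# residual branch is scaled by `residual_multiplier` (muP).
def decoder_block(x, attn, mlp, pre_norm1, pre_norm2, post_norm1, post_norm2, residual_multiplier):
    h = post_norm1(attn(pre_norm1(x)))   # attention sub-block: pre-norm -> attention -> post-norm
    x = x + h * residual_multiplier      # scaled residual add (muP)
    h = post_norm2(mlp(pre_norm2(x)))    # MLP sub-block: pre-norm -> MLP -> post-norm
    return x + h * residual_multiplier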
+
774
+
775
+ HyperCLOVAX_START_DOCSTRING = r"""
776
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
777
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
778
+ etc.)
779
+
780
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
781
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
782
+ and behavior.
783
+
784
+ Parameters:
785
+ config ([`HyperCLOVAXConfig`]):
786
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
787
+ load the weights associated with the model, only the configuration. Check out the
788
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
789
+ """
790
+
791
+
792
+ @add_start_docstrings(
793
+ "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
794
+ HyperCLOVAX_START_DOCSTRING,
795
+ )
796
+ class HyperCLOVAXPreTrainedModel(PreTrainedModel):
797
+ config_class = HyperCLOVAXConfig
798
+ base_model_prefix = "model"
799
+ supports_gradient_checkpointing = True
800
+ _no_split_modules = ["HyperCLOVAXDecoderLayer"]
801
+ _skip_keys_device_placement = ["past_key_values"]
802
+ _supports_flash_attn_2 = True
803
+ _supports_sdpa = True
804
+ _supports_cache_class = True
805
+ _supports_quantized_cache = True
806
+ _supports_static_cache = True
807
+
808
+ def _init_weights(self, module):
809
+ std = self.config.initializer_range
810
+ if isinstance(module, nn.Linear):
811
+ module.weight.data.normal_(mean=0.0, std=std)
812
+ if module.bias is not None:
813
+ module.bias.data.zero_()
814
+ elif isinstance(module, nn.Embedding):
815
+ module.weight.data.normal_(mean=0.0, std=std)
816
+ if module.padding_idx is not None:
817
+ module.weight.data[module.padding_idx].zero_()
818
+
819
+
820
+ HyperCLOVAX_INPUTS_DOCSTRING = r"""
821
+ Args:
822
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
823
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
824
+ it.
825
+
826
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
827
+ [`PreTrainedTokenizer.__call__`] for details.
828
+
829
+ [What are input IDs?](../glossary#input-ids)
830
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
831
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
832
+
833
+ - 1 for tokens that are **not masked**,
834
+ - 0 for tokens that are **masked**.
835
+
836
+ [What are attention masks?](../glossary#attention-mask)
837
+
838
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
839
+ [`PreTrainedTokenizer.__call__`] for details.
840
+
841
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
842
+ `past_key_values`).
843
+
844
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
845
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
846
+ information on the default strategy.
847
+
848
+ - 1 indicates the head is **not masked**,
849
+ - 0 indicates the head is **masked**.
850
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
851
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
852
+ config.n_positions - 1]`.
853
+
854
+ [What are position IDs?](../glossary#position-ids)
855
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
856
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
857
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
858
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
859
+
860
+ Two formats are allowed:
861
+ - a [`~cache_utils.Cache`] instance, see our
862
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
863
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
864
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
865
+ cache format.
866
+
867
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
868
+ legacy cache format will be returned.
869
+
870
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
871
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
872
+ of shape `(batch_size, sequence_length)`.
873
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
874
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
875
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
876
+ model's internal embedding lookup matrix.
877
+ use_cache (`bool`, *optional*):
878
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
879
+ `past_key_values`).
880
+ output_attentions (`bool`, *optional*):
881
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
882
+ tensors for more detail.
883
+ output_hidden_states (`bool`, *optional*):
884
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
885
+ more detail.
886
+ return_dict (`bool`, *optional*):
887
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
888
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
889
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
890
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
891
+ the complete sequence length.
892
+ """
893
+
894
+
895
+ @add_start_docstrings(
896
+ "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
897
+ HyperCLOVAX_START_DOCSTRING,
898
+ )
899
+ class HyperCLOVAXModel(HyperCLOVAXPreTrainedModel):
900
+ """
901
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HyperCLOVAXDecoderLayer`]
902
+
903
+ Args:
904
+ config: HyperCLOVAXConfig
905
+ """
906
+
907
+ def __init__(self, config: HyperCLOVAXConfig):
908
+ super().__init__(config)
909
+ self.padding_idx = config.pad_token_id
910
+ self.vocab_size = config.vocab_size
911
+
912
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
913
+ self.layers = nn.ModuleList(
914
+ [HyperCLOVAXDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
915
+ )
916
+ self.norm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
917
+ self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=config)
918
+ self.gradient_checkpointing = False
919
+
920
+ # Initialize weights and apply final processing
921
+ self.post_init()
922
+
923
+ # mup
924
+ self.embedding_multiplier = config.embedding_multiplier
925
+
926
+ def get_input_embeddings(self):
927
+ return self.embed_tokens
928
+
929
+ def set_input_embeddings(self, value):
930
+ self.embed_tokens = value
931
+
932
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
933
+ def forward(
934
+ self,
935
+ input_ids: torch.LongTensor = None,
936
+ attention_mask: Optional[torch.Tensor] = None,
937
+ position_ids: Optional[torch.LongTensor] = None,
938
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
939
+ inputs_embeds: Optional[torch.FloatTensor] = None,
940
+ use_cache: Optional[bool] = None,
941
+ output_attentions: Optional[bool] = None,
942
+ output_hidden_states: Optional[bool] = None,
943
+ return_dict: Optional[bool] = None,
944
+ cache_position: Optional[torch.LongTensor] = None,
945
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
946
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
947
+ output_hidden_states = (
948
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
949
+ )
950
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
951
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
952
+
953
+ if (input_ids is None) ^ (inputs_embeds is not None):
954
+ raise ValueError(
955
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
956
+ )
957
+
958
+ if self.gradient_checkpointing and self.training and use_cache:
959
+ logger.warning_once(
960
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
961
+ )
962
+ use_cache = False
963
+
964
+ if inputs_embeds is None:
965
+ inputs_embeds = self.embed_tokens(input_ids)
966
+
967
+ inputs_embeds = inputs_embeds * self.embedding_multiplier # mup
968
+
969
+ # kept for BC (non `Cache` `past_key_values` inputs)
970
+ return_legacy_cache = False
971
+ if use_cache and not isinstance(past_key_values, Cache):
972
+ return_legacy_cache = True
973
+ if past_key_values is None:
974
+ past_key_values = DynamicCache()
975
+ else:
976
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
977
+ logger.warning_once(
978
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
979
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
980
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
981
+ )
982
+
983
+ if cache_position is None:
984
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
985
+ cache_position = torch.arange(
986
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
987
+ )
988
+ if position_ids is None:
989
+ position_ids = cache_position.unsqueeze(0)
990
+
991
+ causal_mask = self._update_causal_mask(
992
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
993
+ )
994
+ hidden_states = inputs_embeds
995
+
996
+ # create position embeddings to be shared across the decoder layers
997
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
998
+
999
+ # decoder layers
1000
+ all_hidden_states = () if output_hidden_states else None
1001
+ all_self_attns = () if output_attentions else None
1002
+ next_decoder_cache = None
1003
+
1004
+ for decoder_layer in self.layers:
1005
+ if output_hidden_states:
1006
+ all_hidden_states += (hidden_states,)
1007
+
1008
+ if self.gradient_checkpointing and self.training:
1009
+ layer_outputs = self._gradient_checkpointing_func(
1010
+ decoder_layer.__call__,
1011
+ hidden_states,
1012
+ causal_mask,
1013
+ position_ids,
1014
+ past_key_values,
1015
+ output_attentions,
1016
+ use_cache,
1017
+ cache_position,
1018
+ position_embeddings,
1019
+ )
1020
+ else:
1021
+ layer_outputs = decoder_layer(
1022
+ hidden_states,
1023
+ attention_mask=causal_mask,
1024
+ position_ids=position_ids,
1025
+ past_key_value=past_key_values,
1026
+ output_attentions=output_attentions,
1027
+ use_cache=use_cache,
1028
+ cache_position=cache_position,
1029
+ position_embeddings=position_embeddings,
1030
+ )
1031
+
1032
+ hidden_states = layer_outputs[0]
1033
+
1034
+ if use_cache:
1035
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1036
+
1037
+ if output_attentions:
1038
+ all_self_attns += (layer_outputs[1],)
1039
+
1040
+ hidden_states = self.norm(hidden_states)
1041
+
1042
+ # add hidden states from the last decoder layer
1043
+ if output_hidden_states:
1044
+ all_hidden_states += (hidden_states,)
1045
+
1046
+ next_cache = next_decoder_cache if use_cache else None
1047
+ if return_legacy_cache:
1048
+ next_cache = next_cache.to_legacy_cache()
1049
+
1050
+ if not return_dict:
1051
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1052
+ return BaseModelOutputWithPast(
1053
+ last_hidden_state=hidden_states,
1054
+ past_key_values=next_cache,
1055
+ hidden_states=all_hidden_states,
1056
+ attentions=all_self_attns,
1057
+ )
1058
+
1059
+ def _update_causal_mask(
1060
+ self,
1061
+ attention_mask: torch.Tensor,
1062
+ input_tensor: torch.Tensor,
1063
+ cache_position: torch.Tensor,
1064
+ past_key_values: Cache,
1065
+ output_attentions: bool,
1066
+ ):
1067
+ if self.config._attn_implementation == "flash_attention_2":
1068
+ if attention_mask is not None and 0.0 in attention_mask:
1069
+ return attention_mask
1070
+ return None
1071
+
1072
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1073
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1074
+ # to infer the attention mask.
1075
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1076
+ using_static_cache = isinstance(past_key_values, StaticCache)
1077
+
1078
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1079
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
1080
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1081
+ attention_mask,
1082
+ inputs_embeds=input_tensor,
1083
+ past_key_values_length=past_seen_tokens,
1084
+ is_training=self.training,
1085
+ ):
1086
+ return None
1087
+
1088
+ dtype, device = input_tensor.dtype, input_tensor.device
1089
+ min_dtype = torch.finfo(dtype).min
1090
+ sequence_length = input_tensor.shape[1]
1091
+ if using_static_cache:
1092
+ target_length = past_key_values.get_max_length()
1093
+ else:
1094
+ target_length = (
1095
+ attention_mask.shape[-1]
1096
+ if isinstance(attention_mask, torch.Tensor)
1097
+ else past_seen_tokens + sequence_length + 1
1098
+ )
1099
+
1100
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1101
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1102
+ attention_mask,
1103
+ sequence_length=sequence_length,
1104
+ target_length=target_length,
1105
+ dtype=dtype,
1106
+ device=device,
1107
+ min_dtype=min_dtype,
1108
+ cache_position=cache_position,
1109
+ batch_size=input_tensor.shape[0],
1110
+ )
1111
+
1112
+ if (
1113
+ self.config._attn_implementation == "sdpa"
1114
+ and attention_mask is not None
1115
+ and attention_mask.device.type == "cuda"
1116
+ and not output_attentions
1117
+ ):
1118
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1119
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1120
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1121
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1122
+
1123
+ return causal_mask
1124
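# Sketch (illustrative, not part of the file) of the cache handling in HyperCLOVAXModel.forward
# above: a legacy tuple-of-tuples cache is converted to a `DynamicCache` on entry and, because
# `return_legacy_cache` is set, converted back on exit so callers keep their original format.
from transformers.cache_utils import DynamicCache

legacy_cache = None  # or a tuple of per-layer (key, value) tuples in the legacy format
cache = DynamicCache() if legacy_cache is None else DynamicCache.from_legacy_cache(legacy_cache)
past_seen_tokens = cache.get_seq_length()  # 0 for an empty cache
# ... the decoder layers update `cache` in place when `use_cache=True` ...
legacy_again = cache.to_legacy_cache()     # returned when a legacy cache was passed in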
+
1125
+
1126
+ class HyperCLOVAXForCausalLM(HyperCLOVAXPreTrainedModel, GenerationMixin):
1127
+ _tied_weights_keys = ["lm_head.weight"]
1128
+
1129
+ def __init__(self, config):
1130
+ super().__init__(config)
1131
+ self.model = HyperCLOVAXModel(config)
1132
+ self.vocab_size = config.vocab_size
1133
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1134
+
1135
+ # Initialize weights and apply final processing
1136
+ self.post_init()
1137
+
1138
+ def _get_apply_liger_kernel_converter(self):
1139
+ return _apply_liger_kernel_to_instance
1140
+
1141
+ def get_input_embeddings(self):
1142
+ return self.model.embed_tokens
1143
+
1144
+ def set_input_embeddings(self, value):
1145
+ self.model.embed_tokens = value
1146
+
1147
+ def get_output_embeddings(self):
1148
+ return self.lm_head
1149
+
1150
+ def set_output_embeddings(self, new_embeddings):
1151
+ self.lm_head = new_embeddings
1152
+
1153
+ def set_decoder(self, decoder):
1154
+ self.model = decoder
1155
+
1156
+ def get_decoder(self):
1157
+ return self.model
1158
+
1159
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1160
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1161
+ def forward(
1162
+ self,
1163
+ input_ids: torch.LongTensor = None,
1164
+ attention_mask: Optional[torch.Tensor] = None,
1165
+ position_ids: Optional[torch.LongTensor] = None,
1166
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1167
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1168
+ labels: Optional[torch.LongTensor] = None,
1169
+ use_cache: Optional[bool] = None,
1170
+ output_attentions: Optional[bool] = None,
1171
+ output_hidden_states: Optional[bool] = None,
1172
+ return_dict: Optional[bool] = None,
1173
+ cache_position: Optional[torch.LongTensor] = None,
1174
+ num_logits_to_keep: int = 0,
1175
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1176
+ r"""
1177
+ Args:
1178
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1179
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1180
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1181
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1182
+
1183
+ num_logits_to_keep (`int`, *optional*):
1184
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1185
+ `input_ids` (special case). Only the last token's logits are needed for generation, and computing them
1186
+ only for that token saves memory, which becomes significant for long sequences or large vocabularies.
1187
+
1188
+ Returns:
1189
+
1190
+ Example:
1191
+
1192
+ ```python
1193
+ >>> from transformers import AutoTokenizer, HyperCLOVAXForCausalLM
1194
+
1195
+ >>> model = HyperCLOVAXForCausalLM.from_pretrained(YOUR_DIR)
1196
+ >>> tokenizer = AutoTokenizer.from_pretrained(YOUR_DIR)
1197
+
1198
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1199
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1200
+
1201
+ >>> # Generate
1202
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1203
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1204
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1205
+ ```"""
1206
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1207
+ output_hidden_states = (
1208
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1209
+ )
1210
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1211
+
1212
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1213
+ outputs = self.model(
1214
+ input_ids=input_ids,
1215
+ attention_mask=attention_mask,
1216
+ position_ids=position_ids,
1217
+ past_key_values=past_key_values,
1218
+ inputs_embeds=inputs_embeds,
1219
+ use_cache=use_cache,
1220
+ output_attentions=output_attentions,
1221
+ output_hidden_states=output_hidden_states,
1222
+ return_dict=return_dict,
1223
+ cache_position=cache_position,
1224
+ )
1225
+
1226
+ hidden_states = outputs[0]
1227
+ if self.config.pretraining_tp > 1:
1228
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1229
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1230
+ logits = torch.cat(logits, dim=-1)
1231
+ else:
1232
+ if labels is None and not is_torchdynamo_compiling():
1233
+ logger.warning_once(
1234
+ "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
1235
+ )
1236
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1237
+ # TODO: remove the float() operation in v4.46
1238
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
1239
+
1240
+ logits = logits * self.config.logits_scaling # mup
1241
+
1242
+ loss = None
1243
+ if labels is not None:
1244
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
1245
+ logits = logits.float()
1246
+ # Shift so that tokens < n predict n
1247
+ shift_logits = logits[..., :-1, :].contiguous()
1248
+ shift_labels = labels[..., 1:].contiguous()
1249
+ # Flatten the tokens
1250
+ loss_fct = CrossEntropyLoss()
1251
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1252
+ shift_labels = shift_labels.view(-1)
1253
+ # Enable model parallelism
1254
+ shift_labels = shift_labels.to(shift_logits.device)
1255
+ loss = loss_fct(shift_logits, shift_labels)
1256
+
1257
+ if not return_dict:
1258
+ output = (logits,) + outputs[1:]
1259
+ return (loss,) + output if loss is not None else output
1260
+
1261
+ return CausalLMOutputWithPast(
1262
+ loss=loss,
1263
+ logits=logits,
1264
+ past_key_values=outputs.past_key_values,
1265
+ hidden_states=outputs.hidden_states,
1266
+ attentions=outputs.attentions,
1267
+ )
1268
+
1269
+ def prepare_inputs_for_generation(
1270
+ self,
1271
+ input_ids,
1272
+ past_key_values=None,
1273
+ attention_mask=None,
1274
+ inputs_embeds=None,
1275
+ cache_position=None,
1276
+ position_ids=None,
1277
+ use_cache=True,
1278
+ num_logits_to_keep=None,
1279
+ **kwargs,
1280
+ ):
1281
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1282
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1283
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1284
+ if past_key_values is not None:
1285
+ if inputs_embeds is not None: # Exception 1
1286
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1287
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1288
+ input_ids = input_ids[:, cache_position]
1289
+
1290
+ if attention_mask is not None and position_ids is None:
1291
+ # create position_ids on the fly for batch generation
1292
+ position_ids = attention_mask.long().cumsum(-1) - 1
1293
+ position_ids.masked_fill_(attention_mask == 0, 1)
1294
+ if past_key_values:
1295
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1296
+
1297
+ # This `clone` call is needed to avoid recapturing CUDA graphs with `torch.compile`'s `mode="reduce-overhead"`, as otherwise the input `position_ids` would have a varying stride during decoding. Here, simply using `.contiguous()` is not sufficient: in the batch size = 1 case, `position_ids` is already contiguous but with varying stride, which retriggers a capture.
1298
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1299
+
1300
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1301
+ if inputs_embeds is not None and cache_position[0] == 0:
1302
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1303
+ else:
1304
+ # The clone here is for the same reason as for `position_ids`.
1305
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1306
+
1307
+ if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
1308
+ if model_inputs["inputs_embeds"] is not None:
1309
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1310
+ device = model_inputs["inputs_embeds"].device
1311
+ else:
1312
+ batch_size, sequence_length = model_inputs["input_ids"].shape
1313
+ device = model_inputs["input_ids"].device
1314
+
1315
+ dtype = self.lm_head.weight.dtype
1316
+ min_dtype = torch.finfo(dtype).min
1317
+
1318
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1319
+ attention_mask,
1320
+ sequence_length=sequence_length,
1321
+ target_length=past_key_values.get_max_length(),
1322
+ dtype=dtype,
1323
+ device=device,
1324
+ min_dtype=min_dtype,
1325
+ cache_position=cache_position,
1326
+ batch_size=batch_size,
1327
+ )
1328
+
1329
+ if num_logits_to_keep is not None:
1330
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
1331
+
1332
+ model_inputs.update(
1333
+ {
1334
+ "position_ids": position_ids,
1335
+ "cache_position": cache_position,
1336
+ "past_key_values": past_key_values,
1337
+ "use_cache": use_cache,
1338
+ "attention_mask": attention_mask,
1339
+ }
1340
+ )
1341
+ return model_inputs
1342
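# Illustration (sizes are made up) of the `num_logits_to_keep` behaviour documented in the forward
# above: with the default 0 the slice `[:, -0:, :]` is the whole sequence, so the lm_head projects
# every position, while 1 projects only the final hidden state, which is all that decoding needs.
import torch

hidden_states = torch.randn(1, 128, 4096)              # (batch, seq_len, hidden_size)
lm_head = torch.nn.Linear(4096, 110_000, bias=False)   # vocab size here is illustrative
all_logits = lm_head(hidden_states[:, -0:, :])          # -0: == 0: -> logits for all 128 positions
last_logits = lm_head(hidden_states[:, -1:, :])         # logits for the last position only
assert all_logits.shape[1] == 128 and last_logits.shape[1] == 1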
+
1343
+
1344
+ @add_start_docstrings(
1345
+ """
1346
+ The HyperCLOVAX Model transformer with a sequence classification head on top (linear layer).
1347
+
1348
+ [`HyperCLOVAXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1349
+ (e.g. GPT-2) do.
1350
+
1351
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1352
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1353
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1354
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1355
+ each row of the batch).
1356
+ """,
1357
+ HyperCLOVAX_START_DOCSTRING,
1358
+ )
1359
+ class HyperCLOVAXForSequenceClassification(HyperCLOVAXPreTrainedModel):
1360
+ def __init__(self, config):
1361
+ super().__init__(config)
1362
+ self.num_labels = config.num_labels
1363
+ self.model = HyperCLOVAXModel(config)
1364
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1365
+
1366
+ # Initialize weights and apply final processing
1367
+ self.post_init()
1368
+
1369
+ def get_input_embeddings(self):
1370
+ return self.model.embed_tokens
1371
+
1372
+ def set_input_embeddings(self, value):
1373
+ self.model.embed_tokens = value
1374
+
1375
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1376
+ def forward(
1377
+ self,
1378
+ input_ids: Optional[torch.LongTensor] = None,
1379
+ attention_mask: Optional[torch.Tensor] = None,
1380
+ position_ids: Optional[torch.LongTensor] = None,
1381
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1382
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1383
+ labels: Optional[torch.LongTensor] = None,
1384
+ use_cache: Optional[bool] = None,
1385
+ output_attentions: Optional[bool] = None,
1386
+ output_hidden_states: Optional[bool] = None,
1387
+ return_dict: Optional[bool] = None,
1388
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1389
+ r"""
1390
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1391
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1392
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1393
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1394
+ """
1395
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1396
+
1397
+ transformer_outputs = self.model(
1398
+ input_ids,
1399
+ attention_mask=attention_mask,
1400
+ position_ids=position_ids,
1401
+ past_key_values=past_key_values,
1402
+ inputs_embeds=inputs_embeds,
1403
+ use_cache=use_cache,
1404
+ output_attentions=output_attentions,
1405
+ output_hidden_states=output_hidden_states,
1406
+ return_dict=return_dict,
1407
+ )
1408
+ hidden_states = transformer_outputs[0]
1409
+ logits = self.score(hidden_states)
1410
+
1411
+ if input_ids is not None:
1412
+ batch_size = input_ids.shape[0]
1413
+ else:
1414
+ batch_size = inputs_embeds.shape[0]
1415
+
1416
+ if self.config.pad_token_id is None and batch_size != 1:
1417
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1418
+ if self.config.pad_token_id is None:
1419
+ sequence_lengths = -1
1420
+ else:
1421
+ if input_ids is not None:
1422
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1423
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1424
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1425
+ sequence_lengths = sequence_lengths.to(logits.device)
1426
+ else:
1427
+ sequence_lengths = -1
1428
+
1429
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1430
+
1431
+ loss = None
1432
+ if labels is not None:
1433
+ labels = labels.to(logits.device)
1434
+ if self.config.problem_type is None:
1435
+ if self.num_labels == 1:
1436
+ self.config.problem_type = "regression"
1437
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1438
+ self.config.problem_type = "single_label_classification"
1439
+ else:
1440
+ self.config.problem_type = "multi_label_classification"
1441
+
1442
+ if self.config.problem_type == "regression":
1443
+ loss_fct = MSELoss()
1444
+ if self.num_labels == 1:
1445
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1446
+ else:
1447
+ loss = loss_fct(pooled_logits, labels)
1448
+ elif self.config.problem_type == "single_label_classification":
1449
+ loss_fct = CrossEntropyLoss()
1450
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1451
+ elif self.config.problem_type == "multi_label_classification":
1452
+ loss_fct = BCEWithLogitsLoss()
1453
+ loss = loss_fct(pooled_logits, labels)
1454
+ if not return_dict:
1455
+ output = (pooled_logits,) + transformer_outputs[1:]
1456
+ return ((loss,) + output) if loss is not None else output
1457
+
1458
+ return SequenceClassifierOutputWithPast(
1459
+ loss=loss,
1460
+ logits=pooled_logits,
1461
+ past_key_values=transformer_outputs.past_key_values,
1462
+ hidden_states=transformer_outputs.hidden_states,
1463
+ attentions=transformer_outputs.attentions,
1464
+ )
1465
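# Worked example (toy values) of the last-non-padding-token pooling described in the class
# docstring above, assuming right padding with pad_token_id = 0. The argmax-over-equality plus
# modulo formulation matches the ONNX-friendly expression used in the forward.
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [8, 9, 0, 0, 0]])
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 1]) -> index of the last real token in each row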
+
1466
+
1467
+ @add_start_docstrings(
1468
+ """
1469
+ The HyperCLOVAX Model transformer with a span classification head on top for extractive question-answering tasks like
1470
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1471
+ """,
1472
+ HyperCLOVAX_START_DOCSTRING,
1473
+ )
1474
+ class HyperCLOVAXForQuestionAnswering(HyperCLOVAXPreTrainedModel):
1475
+ base_model_prefix = "transformer"
1476
+
1477
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->HyperCLOVAX
1478
+ def __init__(self, config):
1479
+ super().__init__(config)
1480
+ self.transformer = HyperCLOVAXModel(config)
1481
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1482
+
1483
+ # Initialize weights and apply final processing
1484
+ self.post_init()
1485
+
1486
+ def get_input_embeddings(self):
1487
+ return self.transformer.embed_tokens
1488
+
1489
+ def set_input_embeddings(self, value):
1490
+ self.transformer.embed_tokens = value
1491
+
1492
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1493
+ def forward(
1494
+ self,
1495
+ input_ids: Optional[torch.LongTensor] = None,
1496
+ attention_mask: Optional[torch.FloatTensor] = None,
1497
+ position_ids: Optional[torch.LongTensor] = None,
1498
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1499
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1500
+ start_positions: Optional[torch.LongTensor] = None,
1501
+ end_positions: Optional[torch.LongTensor] = None,
1502
+ output_attentions: Optional[bool] = None,
1503
+ output_hidden_states: Optional[bool] = None,
1504
+ return_dict: Optional[bool] = None,
1505
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1506
+ r"""
1507
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1508
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1509
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1510
+ are not taken into account for computing the loss.
1511
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1512
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1513
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1514
+ are not taken into account for computing the loss.
1515
+ """
1516
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1517
+
1518
+ outputs = self.transformer(
1519
+ input_ids,
1520
+ attention_mask=attention_mask,
1521
+ position_ids=position_ids,
1522
+ past_key_values=past_key_values,
1523
+ inputs_embeds=inputs_embeds,
1524
+ output_attentions=output_attentions,
1525
+ output_hidden_states=output_hidden_states,
1526
+ return_dict=return_dict,
1527
+ )
1528
+
1529
+ sequence_output = outputs[0]
1530
+
1531
+ logits = self.qa_outputs(sequence_output)
1532
+ start_logits, end_logits = logits.split(1, dim=-1)
1533
+ start_logits = start_logits.squeeze(-1).contiguous()
1534
+ end_logits = end_logits.squeeze(-1).contiguous()
1535
+
1536
+ total_loss = None
1537
+ if start_positions is not None and end_positions is not None:
1538
+ # If we are on multi-GPU, split add a dimension
1539
+ if len(start_positions.size()) > 1:
1540
+ start_positions = start_positions.squeeze(-1).to(start_logits.device)
1541
+ if len(end_positions.size()) > 1:
1542
+ end_positions = end_positions.squeeze(-1).to(end_logits.device)
1543
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1544
+ ignored_index = start_logits.size(1)
1545
+ start_positions = start_positions.clamp(0, ignored_index)
1546
+ end_positions = end_positions.clamp(0, ignored_index)
1547
+
1548
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1549
+ start_loss = loss_fct(start_logits, start_positions)
1550
+ end_loss = loss_fct(end_logits, end_positions)
1551
+ total_loss = (start_loss + end_loss) / 2
1552
+
1553
+ if not return_dict:
1554
+ output = (start_logits, end_logits) + outputs[2:]
1555
+ return ((total_loss,) + output) if total_loss is not None else output
1556
+
1557
+ return QuestionAnsweringModelOutput(
1558
+ loss=total_loss,
1559
+ start_logits=start_logits,
1560
+ end_logits=end_logits,
1561
+ hidden_states=outputs.hidden_states,
1562
+ attentions=outputs.attentions,
1563
+ )
1564
+
1565
+
1566
+ @add_start_docstrings(
1567
+ """
1568
+ The HyperCLOVAX Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1569
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1570
+ """,
1571
+ HyperCLOVAX_START_DOCSTRING,
1572
+ )
1573
+ class HyperCLOVAXForTokenClassification(HyperCLOVAXPreTrainedModel):
1574
+ def __init__(self, config):
1575
+ super().__init__(config)
1576
+ self.num_labels = config.num_labels
1577
+ self.model = HyperCLOVAXModel(config)
1578
+ if getattr(config, "classifier_dropout", None) is not None:
1579
+ classifier_dropout = config.classifier_dropout
1580
+ elif getattr(config, "hidden_dropout", None) is not None:
1581
+ classifier_dropout = config.hidden_dropout
1582
+ else:
1583
+ classifier_dropout = 0.1
1584
+ self.dropout = nn.Dropout(classifier_dropout)
1585
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1586
+
1587
+ # Initialize weights and apply final processing
1588
+ self.post_init()
1589
+
1590
+ def get_input_embeddings(self):
1591
+ return self.model.embed_tokens
1592
+
1593
+ def set_input_embeddings(self, value):
1594
+ self.model.embed_tokens = value
1595
+
1596
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1597
+ def forward(
1598
+ self,
1599
+ input_ids: Optional[torch.LongTensor] = None,
1600
+ attention_mask: Optional[torch.Tensor] = None,
1601
+ position_ids: Optional[torch.LongTensor] = None,
1602
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1603
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1604
+ labels: Optional[torch.LongTensor] = None,
1605
+ use_cache: Optional[bool] = None,
1606
+ output_attentions: Optional[bool] = None,
1607
+ output_hidden_states: Optional[bool] = None,
1608
+ return_dict: Optional[bool] = None,
1609
+ ) -> Union[Tuple, TokenClassifierOutput]:
1610
+ r"""
1611
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1612
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1613
+ config.num_labels - 1]`. A classification loss (Cross-Entropy) is computed over the
1614
+ labelled tokens.
1615
+ """
1616
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1617
+
1618
+ outputs = self.model(
1619
+ input_ids,
1620
+ attention_mask=attention_mask,
1621
+ position_ids=position_ids,
1622
+ past_key_values=past_key_values,
1623
+ inputs_embeds=inputs_embeds,
1624
+ use_cache=use_cache,
1625
+ output_attentions=output_attentions,
1626
+ output_hidden_states=output_hidden_states,
1627
+ return_dict=return_dict,
1628
+ )
1629
+ sequence_output = outputs[0]
1630
+ sequence_output = self.dropout(sequence_output)
1631
+ logits = self.score(sequence_output)
1632
+
1633
+ loss = None
1634
+ if labels is not None:
1635
+ loss_fct = CrossEntropyLoss()
1636
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1637
+
1638
+ if not return_dict:
1639
+ output = (logits,) + outputs[2:]
1640
+ return ((loss,) + output) if loss is not None else output
1641
+
1642
+ return TokenClassifierOutput(
1643
+ loss=loss,
1644
+ logits=logits,
1645
+ hidden_states=outputs.hidden_states,
1646
+ attentions=outputs.attentions,
1647
+ )
1648
+
1649
+
1650
+ ################################################################################################
1651
+ ################################################################################################
1652
+ """
1653
+ liger kernel monkey patching
1654
+ https://github.com/linkedin/Liger-Kernel/blob/v0.5.2/src/liger_kernel/transformers/monkey_patch.py
1655
+ """
1656
+
1657
+ import inspect
1658
+ import logging
1659
+ from functools import partial
1660
+ from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
1661
+
1662
+ import torch
1663
+ import torch.nn.functional as F
1664
+ import transformers
1665
+ from packaging import version
1666
+ from torch.nn import CrossEntropyLoss
1667
+ from transformers import PreTrainedModel
1668
+
1669
+ if TYPE_CHECKING:
1670
+ from transformers.cache_utils import Cache
1671
+
1672
+ import sys
1673
+
1674
+ from packaging.version import parse
1675
+
1676
+ if sys.version_info < (3, 8):
1677
+ import importlib_metadata
1678
+ else:
1679
+ import importlib.metadata as importlib_metadata
1680
+
1681
+ try:
1682
+ from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
1683
+ from liger_kernel.transformers.functional import liger_cross_entropy
1684
+ from liger_kernel.transformers.fused_linear_cross_entropy import (
1685
+ LigerFusedLinearCrossEntropyLoss,
1686
+ )
1687
+ from liger_kernel.transformers.rms_norm import LigerRMSNorm
1688
+ from liger_kernel.transformers.rope import liger_rotary_pos_emb
1689
+ from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
1690
+
1691
+ _is_liger_kernel_available = True
1692
+
1693
+ LIGER_KERNEL_MATCHING_VERSION = parse("0.5.2")
1694
+ liger_kernel_version = parse(importlib_metadata.version("liger_kernel"))
1695
+ _is_liger_kernel_version_matching = (
1696
+ liger_kernel_version.major,
1697
+ liger_kernel_version.minor,
1698
+ liger_kernel_version.release[-1],
1699
+ ) == (
1700
+ LIGER_KERNEL_MATCHING_VERSION.major,
1701
+ LIGER_KERNEL_MATCHING_VERSION.minor,
1702
+ LIGER_KERNEL_MATCHING_VERSION.release[-1],
1703
+ )
1704
+ except Exception:
1705
+ _is_liger_kernel_available = False
1706
+ _is_liger_kernel_version_matching = False
1707
+
1708
+
1709
+ def lce_forward_deprecated(
1710
+ self,
1711
+ input_ids: torch.LongTensor = None,
1712
+ attention_mask: Optional[torch.Tensor] = None,
1713
+ position_ids: Optional[torch.LongTensor] = None,
1714
+ past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
1715
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1716
+ labels: Optional[torch.LongTensor] = None,
1717
+ use_cache: Optional[bool] = None,
1718
+ output_attentions: Optional[bool] = None,
1719
+ output_hidden_states: Optional[bool] = None,
1720
+ return_dict: Optional[bool] = None,
1721
+ cache_position: Optional[torch.LongTensor] = None,
1722
+ num_logits_to_keep: int = 0,
1723
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1724
+
1725
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1726
+ output_hidden_states = (
1727
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1728
+ )
1729
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1730
+
1731
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1732
+ outputs = self.model(
1733
+ input_ids=input_ids,
1734
+ attention_mask=attention_mask,
1735
+ position_ids=position_ids,
1736
+ past_key_values=past_key_values,
1737
+ inputs_embeds=inputs_embeds,
1738
+ use_cache=use_cache,
1739
+ output_attentions=output_attentions,
1740
+ output_hidden_states=output_hidden_states,
1741
+ return_dict=return_dict,
1742
+ cache_position=cache_position,
1743
+ )
1744
+ hidden_states = outputs[0]
1745
+
1746
+ loss = None
1747
+ logits = None
1748
+
1749
+ if self.training and (labels is not None):
1750
+ if num_logits_to_keep != 0:
1751
+ hidden_states = hidden_states[:, -num_logits_to_keep:, :]  # NOTE: this slicing has not been verified for correctness
1752
+ hidden_states = hidden_states * self.config.logits_scaling ## muP
1753
+
1754
+ shift_hidden_states = hidden_states[..., :-1, :].contiguous()
1755
+ shift_labels = labels[..., 1:].contiguous()
1756
+
1757
+ # flatten tokens
1758
+ shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
1759
+ shift_labels = shift_labels.view(-1)
1760
+
1761
+ lce = LigerFusedLinearCrossEntropyLoss()
1762
+ loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
1763
+
1764
+ else:
1765
+ assert self.config.pretraining_tp == 1, "not supported"
1766
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
1767
+ logits = logits * self.config.logits_scaling ## muP
1768
+
1769
+ if labels is not None:
1770
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
1771
+ logits = logits.float()
1772
+ # Shift so that tokens < n predict n
1773
+ shift_logits = logits[..., :-1, :].contiguous()
1774
+ shift_labels = labels[..., 1:].contiguous()
1775
+ # Flatten the tokens
1776
+ loss_fct = CrossEntropyLoss()
1777
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1778
+ shift_labels = shift_labels.view(-1)
1779
+ # Enable model parallelism
1780
+ shift_labels = shift_labels.to(shift_logits.device)
1781
+ loss = loss_fct(shift_logits, shift_labels)
1782
+
1783
+ if not return_dict:
1784
+ output = (logits,) + outputs[1:]
1785
+ return (loss,) + output if loss is not None else output
1786
+
1787
+ return CausalLMOutputWithPast(
1788
+ loss=loss,
1789
+ logits=logits,
1790
+ past_key_values=outputs.past_key_values,
1791
+ hidden_states=outputs.hidden_states,
1792
+ attentions=outputs.attentions,
1793
+ )
1794
+
1795
+
1796
+ def _bind_method_to_module(module, method_name: str, new_method: Callable):
1797
+ # Binds a new method to a module instance so that self is passed as the first argument
1798
+ module.__dict__[method_name] = new_method.__get__(module, module.__class__)
1799
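# Small illustration (not part of the file) of the descriptor trick `_bind_method_to_module`
# relies on: `function.__get__(instance, cls)` produces a bound method, so assigning it into the
# instance `__dict__` overrides the method for that single instance without touching the class.
class Greeter:
    def forward(self):
        return "original"

def patched_forward(self):
    return "patched"

g = Greeter()
g.__dict__["forward"] = patched_forward.__get__(g, Greeter)
print(g.forward())          # "patched"  -- only this instance is affected
print(Greeter().forward())  # "original"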
+
1800
+
1801
+ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True):
1802
+ module.offset = offset
1803
+ module.casting_mode = casting_mode
1804
+ module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
1805
+ module.in_place = in_place
1806
+ _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
1807
+ _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
1808
+
1809
+
1810
+ def apply_liger_kernel_to_hyperclovax(
1811
+ rope: bool = True,
1812
+ cross_entropy: bool = False,
1813
+ fused_linear_cross_entropy: bool = True,
1814
+ rms_norm: bool = True,
1815
+ swiglu: bool = True,
1816
+ model: PreTrainedModel = None,
1817
+ ) -> None:
1818
+
1819
+ assert not cross_entropy, "not supported"
1820
+ if rope:
1821
+ apply_rotary_pos_emb = liger_rotary_pos_emb
1822
+ if rms_norm:
1823
+ HyperCLOVAXRMSNorm = LigerRMSNorm
1824
+ if swiglu:
1825
+ HyperCLOVAXMLP = LigerSwiGLUMLP
1826
+ # to use VLM forward in VLM repo
1827
+ # if fused_linear_cross_entropy:
1828
+ # HyperCLOVAXForCausalLM.forward = lce_forward_deprecated
1829
+
1830
+ if model is not None:
1831
+ # The model instance already exists, so we need to additionally patch the
1832
+ # instance variables that reference already-instantiated modules (e.g. HyperCLOVAXRMSNorm or HyperCLOVAXMLP)
1833
+
1834
+ # get the base model from the model instance
1835
+ base_model: HyperCLOVAXModel = getattr(model, model.base_model_prefix, model)
1836
+
1837
+ if rms_norm:
1838
+ _patch_rms_norm_module(base_model.norm)
1839
+
1840
+ for decoder_layer in base_model.layers:
1841
+ if swiglu:
1842
+ _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
1843
+ if rms_norm:
1844
+ _patch_rms_norm_module(decoder_layer.input_layernorm)
1845
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
1846
+ if decoder_layer.use_post_norm:
1847
+ _patch_rms_norm_module(decoder_layer.post_norm1)
1848
+ _patch_rms_norm_module(decoder_layer.post_norm2)
1849
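# Hedged usage sketch for the instance-level patching above (the repo id is a placeholder and
# liger_kernel must be installed): the helper rebinds forward methods on RMSNorm/SwiGLU modules
# that already exist on the model instance; note that `cross_entropy=True` is rejected by the
# assert at the top of `apply_liger_kernel_to_hyperclovax`.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/or/repo-id", trust_remote_code=True)  # placeholder
apply_liger_kernel_to_hyperclovax(rope=True, rms_norm=True, swiglu=True, model=model)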
+
1850
+
1851
+ def _apply_liger_kernel_to_instance(model: PreTrainedModel, **kwargs) -> None:
1852
+ model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None)
1853
+ assert model_type == "hyperclovax"
1854
+ apply_fn = apply_liger_kernel_to_hyperclovax
1855
+ apply_fn_signature = inspect.signature(apply_fn)
1856
+
1857
+ # Filter out the keyword arguments that are not supported by the apply function
1858
+ applicable_kwargs = {key: value for key, value in kwargs.items() if key in apply_fn_signature.parameters}
1859
+ logger.info(
1860
+ f"Applying Liger kernels to model instance with model type: {model_type} with kwargs: {applicable_kwargs}"
1861
+ )
1862
+ apply_fn(model=model, **applicable_kwargs)
1863
+
1864
+
1865
+ ################################################################################################
1866
+ ################################################################################################
modeling_vlm.py ADDED
The diff for this file is too large to render. See raw diff
 
patch_vuvlm.py ADDED
@@ -0,0 +1,1085 @@
1
+ import contextlib
2
+ import gc
3
+ import inspect
4
+ import json
5
+ import os
6
+ import time
7
+ from functools import partial
8
+ from pathlib import Path
9
+ from typing import List, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import torch.distributed as dist
13
+ import torch.nn as nn
14
+ from liger_kernel.transformers import (
15
+ LigerCrossEntropyLoss,
16
+ LigerFusedLinearCrossEntropyLoss,
17
+ )
18
+ from torch.nn import CrossEntropyLoss
19
+ from transformers import AutoTokenizer
20
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
21
+ from transformers.modeling_outputs import CausalLMOutputWithPast
22
+ from transformers.modeling_utils import is_fsdp_enabled, is_local_dist_rank_0
23
+
24
+ from hcxvlm.models.ulysses.sp_utils import (
25
+ gather_outputs_and_unpad,
26
+ get_ulysses_sequence_parallel_group,
27
+ get_ulysses_sequence_parallel_rank,
28
+ get_ulysses_sequence_parallel_world_size,
29
+ slice_input_tensor,
30
+ )
31
+
32
+ from .configuration_vlm import HCXVisionConfig
33
+ from .modeling_vlm import HCXVisionForCausalLM, get_rank
34
+
35
+ extra_special_tokens = {
36
+ "image_token": "<|IMAGE_PAD|>",
37
+ "discrete_image_token": "<|DISCRETE_IMAGE_PAD|>",
38
+ "discrete_image_unit_0_id": "<|vision00000|>",
39
+ "video_token": "<|VIDEO_PAD|>",
40
+ "video_audio_token": "<|VIDEO_AUDIO_PAD|>",
41
+ "audio_token": "<|AUDIO_PAD|>",
42
+ "discrete_audio_token": "<|DISCRETE_AUDIO_PAD|>",
43
+ "discrete_audio_unit_0_id": "<|audio0000|>",
44
+ }
45
+
46
+
47
+ def load_state_dict_into_model(model_to_load, state_dict, strict=True, start_prefix=""):
48
+ old_keys = []
49
+ new_keys = []
50
+ for key in state_dict.keys():
51
+ new_key = None
52
+ if "gamma" in key:
53
+ new_key = key.replace("gamma", "weight")
54
+ if "beta" in key:
55
+ new_key = key.replace("beta", "bias")
56
+ if new_key:
57
+ old_keys.append(key)
58
+ new_keys.append(new_key)
59
+ for old_key, new_key in zip(old_keys, new_keys):
60
+ state_dict[new_key] = state_dict.pop(old_key)
61
+
62
+ metadata = getattr(state_dict, "_metadata", None)
63
+ state_dict = state_dict.copy()
64
+ if metadata is not None:
65
+ state_dict._metadata = metadata
66
+
67
+ error_msgs = []
68
+
69
+ def load(module: nn.Module, state_dict, prefix=""):
70
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
71
+ args = (state_dict, prefix, local_metadata, strict, [], [], error_msgs)
72
+ if len([key for key in state_dict if key.startswith(prefix)]) > 0:
73
+ if is_deepspeed_zero3_enabled():
74
+ import deepspeed
75
+
76
+ named_parameters = dict(
77
+ module.named_parameters(prefix=prefix[:-1], recurse=False)
78
+ )
79
+ params_to_gather = [
80
+ named_parameters[k]
81
+ for k in state_dict.keys()
82
+ if k in named_parameters
83
+ ]
84
+ if len(params_to_gather) > 0:
85
+ with deepspeed.zero.GatheredParameters(
86
+ params_to_gather, modifier_rank=0
87
+ ):
88
+ if torch.distributed.get_rank() == 0:
89
+ module._load_from_state_dict(*args)
90
+ else:
91
+ module._load_from_state_dict(*args)
92
+
93
+ for name, child in module._modules.items():
94
+ if child is not None:
95
+ load(child, state_dict, prefix + name + ".")
96
+
97
+ load(model_to_load, state_dict, prefix=start_prefix)
98
+ del state_dict
99
+
100
+ return error_msgs
101
+
102
+
103
+ def load_sharded_checkpoint(
104
+ model,
105
+ folder,
106
+ pick_prefix="",
107
+ replace_prefix_list=[],
108
+ replace_prefix_dict={},
109
+ print_info=True,
110
+ ):
111
+ if folder is None:
112
+ return {}
113
+
114
+ files = os.listdir(folder)
115
+
116
+ pytorch_bin_files = [
117
+ file
118
+ for file in files
119
+ if file.startswith("pytorch_model") and file.endswith(".bin")
120
+ ]
121
+ safetensor_files = [file for file in files if file.endswith(".safetensors")]
122
+ shard_index_file = [file for file in files if file.endswith(".index.json")]
123
+
124
+ index_present = len(shard_index_file) > 0
125
+ index_file = os.path.join(folder, shard_index_file[0]) if index_present else []
126
+
127
+ is_safetensor = len(safetensor_files) > 0
128
+
129
+ model_keys = model.state_dict().keys()
130
+
131
+ if is_safetensor:
132
+ from safetensors.torch import load_file
133
+
134
+ load_function = load_file
135
+ shard_files = safetensor_files
136
+ else:
137
+ load_function = partial(torch.load, map_location="cpu")
138
+ shard_files = pytorch_bin_files
139
+
140
+ if index_present:
141
+ with open(index_file, "r", encoding="utf-8") as f:
142
+ index = json.load(f)
143
+ loaded_keys = index["weight_map"].keys()
144
+ if pick_prefix:
145
+ loaded_keys = [
146
+ k[len(pick_prefix) :] for k in loaded_keys if k.startswith(pick_prefix)
147
+ ]
148
+ if replace_prefix_list:
149
+ for rep_prefix in replace_prefix_list:
150
+ loaded_keys = [
151
+ k[len(rep_prefix) :] if k.startswith(rep_prefix) else k
152
+ for k in loaded_keys
153
+ ]
154
+ if replace_prefix_dict:
155
+ for rep_prefix in replace_prefix_dict:
156
+ loaded_keys = [
157
+ (
158
+ k.replace(rep_prefix, replace_prefix_dict[rep_prefix])
159
+ if k.startswith(rep_prefix)
160
+ else k
161
+ )
162
+ for k in loaded_keys
163
+ ]
164
+
165
+ for i, shard_file in enumerate(shard_files):
166
+ state_dict = load_function(os.path.join(folder, shard_file))
167
+
168
+ if pick_prefix:
169
+ state_dict = {
170
+ k[len(pick_prefix) :]: v
171
+ for k, v in state_dict.items()
172
+ if k.startswith(pick_prefix)
173
+ }
174
+
175
+ for rep_prefix in replace_prefix_list:
176
+ state_dict = {
177
+ k[len(rep_prefix) :] if k.startswith(rep_prefix) else k: v
178
+ for k, v in state_dict.items()
179
+ }
180
+
181
+ for rep_prefix in replace_prefix_dict:
182
+ state_dict = {
183
+ (
184
+ k.replace(rep_prefix, replace_prefix_dict[rep_prefix])
185
+ if k.startswith(rep_prefix)
186
+ else k
187
+ ): v
188
+ for k, v in state_dict.items()
189
+ }
190
+
191
+ if is_deepspeed_zero3_enabled():
192
+ rank = torch.distributed.get_rank()
193
+ print(f"# [info] ZeRo3 - load sharded no {i}, rank {rank}")
194
+ load_state_dict_into_model(model, state_dict, strict=False)
195
+ elif is_fsdp_enabled():
196
+ if is_local_dist_rank_0():
197
+ model.load_state_dict(state_dict, strict=False)
198
+ else:
199
+ model.load_state_dict(state_dict, strict=False)
200
+
201
+ if not index_present:
202
+ loaded_keys = state_dict.keys()
203
+
204
+ del state_dict
205
+ gc.collect()
206
+
207
+ missing_keys = [key for key in model_keys if key not in loaded_keys]
208
+ unexpected_keys = [key for key in loaded_keys if key not in model_keys]
209
+
210
+ if get_rank() == 0 and print_info:
211
+ print(f"[info] missing_keys: {missing_keys}")
212
+ print(f"[info] unexpected_keys: {unexpected_keys}")
213
+
214
+ return {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys}
215
+
216
+
217
+ class HCXVisionForCausalLM_VU(HCXVisionForCausalLM):
218
+ def __init__(self, config, **kwargs):
219
+ self.use_liger = kwargs.pop("use_liger", True)
220
+ self.use_fused_ce = kwargs.pop("use_fused_ce", True)
221
+ self.use_meansum_loss = kwargs.pop("use_meansum_loss", True)
222
+ self.use_turnmeansum_loss = kwargs.pop("use_turnmeansum_loss", False)
223
+ self.use_sqrtsum_loss = kwargs.pop("use_sqrtsum_loss", False)
224
+ use_sum_loss = bool(kwargs.pop("use_sum_loss", False))
225
+
226
+ self.sequence_parallel_size = kwargs.pop("sequence_parallel_size", 1)
227
+ self.sp_manager = kwargs.pop("sp_manager", None)
228
+ self.train_video = kwargs.pop("train_video", False)
229
+
230
+ assert (
231
+ int(self.use_meansum_loss)
232
+ + int(self.use_turnmeansum_loss)
233
+ + int(self.use_sqrtsum_loss)
234
+ ) <= 1, "use_meansum_loss, use_turnmeansum_loss, use_sqrtsum_loss 중 둘 이상을 동시에 True로 설정할 수 없습니다."
235
+
236
+ if self.use_meansum_loss or self.use_turnmeansum_loss or self.use_sqrtsum_loss:
237
+ self.reduction = "none"
238
+ elif use_sum_loss:
239
+ self.reduction = "sum"
240
+ else:
241
+ self.reduction = "mean"
242
+
243
+ super().__init__(config, **kwargs)
244
+ if config.text_config.model_type == "hyperclovax" and self.use_liger:
245
+ self.language_model._get_apply_liger_kernel_converter()(
246
+ model=self.language_model
247
+ )
248
+ print("[info] use liger kernel for hcx 24b")
249
+ if config.freeze_encoder:
250
+ for param in self.vision_model.parameters():
251
+ param.requires_grad = False
252
+ assert not any(
253
+ param.requires_grad
254
+ for param in self.vision_model.parameters()
255
+ )
256
+
257
+ @classmethod
258
+ def from_pretrained(
259
+ cls,
260
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
261
+ text_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
262
+ vision_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
263
+ discrete_vision_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
264
+ audio_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
265
+ discrete_audio_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
266
+ q_former_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
267
+ without_llm: bool = False,
268
+ *model_args,
269
+ **kwargs,
270
+ ):
271
+ """
272
+ :param pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] : pre-trained path for the LLM (text_model_name_or_path), e.g. /path/to/model/
273
+ :param vision_model_name_or_path: Optional[Union[str, os.PathLike]] : pre-trained path for VisionModule(HyperClova-VisionModule) e.g. /path/to/vision/module/
274
+ :param q_former_model_name_or_path: Optional[Union[str, os.PathLike]] : pre-trained path for VLM e.g. /path/to/vlm/checkpoint/
275
+ :param without_llm: bool : False - init/load the LLM weights from the pre-trained checkpoint; True - init the LLM from a dummy (zero-layer) config
276
+ :param model_args:
277
+ :param kwargs:
278
+ :return:
279
+ """
280
+ assert pretrained_model_name_or_path is not None or (
281
+ text_model_name_or_path is not None
282
+ and vision_model_name_or_path is not None
283
+ )
284
+
285
+ cache_dirpath = kwargs.pop("cache_dirpath", None)
286
+ if cache_dirpath is None:
287
+ cache_dirpath = "~/.cache"
288
+
289
+ runtime_only_keys = {
290
+ "use_liger",
291
+ "use_fused_ce",
292
+ "use_meansum_loss",
293
+ "use_turnmeansum_loss",
294
+ "use_sqrtsum_loss",
295
+ "use_sum_loss",
296
+ "sequence_parallel_size",
297
+ "sp_manager",
298
+ "train_video",
299
+ }
300
+ runtime_kwargs = {}
301
+ for k in list(runtime_only_keys):
302
+ if k in kwargs:
303
+ runtime_kwargs[k] = kwargs.pop(k)
304
+
305
+ kwargs["vision_model_name_or_path"] = vision_model_name_or_path
306
+ kwargs["discrete_vision_model_name_or_path"] = (
307
+ discrete_vision_model_name_or_path
308
+ )
309
+ kwargs["audio_model_name_or_path"] = audio_model_name_or_path
310
+ kwargs["discrete_audio_model_name_or_path"] = discrete_audio_model_name_or_path
311
+
312
+ save_only_vision = (
313
+ kwargs.pop("save_only_vision") if "save_only_vision" in kwargs else False
314
+ )
315
+ save_only_qformer = (
316
+ kwargs.pop("save_only_qformer") if "save_only_qformer" in kwargs else False
317
+ )
318
+ save_shard_size = (
319
+ kwargs.pop("save_shard_size") if "save_shard_size" in kwargs else "5GB"
320
+ )
321
+
322
+ def _purge_runtime_from_config(cfg):
323
+ for rk in runtime_only_keys:
324
+ if hasattr(cfg, rk):
325
+ delattr(cfg, rk)
326
+
327
+ template_path = "hcxvlm/dataset/chat_template.jinja"
328
+ with open(template_path, "r", encoding="utf-8") as f:
329
+ chat_template_str = f.read()
330
+ if without_llm:
331
+ assert pretrained_model_name_or_path is not None and os.path.exists(
332
+ pretrained_model_name_or_path
333
+ )
334
+
335
+ dummy_config = HCXVisionConfig.from_pretrained(
336
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
337
+ *model_args,
338
+ **kwargs,
339
+ )
340
+ _purge_runtime_from_config(dummy_config)
341
+ dummy_config.text_config.num_hidden_layers = 0
342
+ dummy_config.text_config.num_attention_heads = 1
343
+
344
+ if isinstance(
345
+ dummy_config.vision_model_name_or_path, str
346
+ ) and os.path.exists(dummy_config.vision_model_name_or_path):
347
+ vision_model_name_or_path = dummy_config.vision_model_name_or_path
348
+ assert isinstance(vision_model_name_or_path, str) and os.path.exists(
349
+ vision_model_name_or_path
350
+ ), f"# [error] invalid vision_model_name_or_path: {vision_model_name_or_path}"
351
+ dummy_config.vision_model_name_or_path = vision_model_name_or_path
352
+ dummy_config.vision_config._name_or_path = vision_model_name_or_path
353
+ dummy_config.vision_config.vison_pretrained_name_or_path = (
354
+ vision_model_name_or_path
355
+ )
356
+
357
+ model = super().from_pretrained(
358
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
359
+ without_llm=True,
360
+ config=dummy_config,
361
+ *model_args,
362
+ **{**kwargs, **runtime_kwargs},
363
+ )
364
+ model.tokenizer = AutoTokenizer.from_pretrained(
365
+ pretrained_model_name_or_path
366
+ )
367
+ model.tokenizer.chat_template = chat_template_str
368
+ model.transformer = None
369
+ else:
370
+ if pretrained_model_name_or_path is not None and (
371
+ audio_model_name_or_path is not None
372
+ or discrete_audio_model_name_or_path is not None
373
+ or discrete_vision_model_name_or_path is not None
374
+ ):
375
+ assert (
376
+ audio_model_name_or_path is not None
377
+ and discrete_audio_model_name_or_path is not None
378
+ and discrete_vision_model_name_or_path is not None
379
+ )
380
+ print(f"[DEBUG] image stage2 끝난 시점에서 audio 를 stage3 로 붙일때.")
381
+ pt_config = HCXVisionConfig.from_pretrained(
382
+ pretrained_model_name_or_path
383
+ )
384
+ _purge_runtime_from_config(pt_config)
385
+ config_dict = pt_config.to_dict()
386
+ config_dict["audio_model_name_or_path"] = audio_model_name_or_path
387
+ config_dict["discrete_audio_model_name_or_path"] = (
388
+ discrete_audio_model_name_or_path
389
+ )
390
+ config_dict["discrete_vision_model_name_or_path"] = (
391
+ discrete_vision_model_name_or_path
392
+ )
393
+ config = HCXVisionConfig.from_dict(config_dict)
394
+ print(f"config: {config}")
395
+ model = super().from_pretrained(
396
+ pretrained_model_name_or_path,
397
+ without_llm=False,
398
+ config=config,
399
+ _fast_init=False,
400
+ *model_args,
401
+ **kwargs,
402
+ )
403
+ model.tokenizer = AutoTokenizer.from_pretrained(
404
+ pretrained_model_name_or_path
405
+ )
406
+ model.tokenizer.chat_template = chat_template_str
407
+ elif isinstance(q_former_model_name_or_path, str):
408
+ config = HCXVisionConfig.from_dict(
409
+ {"text_model_name_or_path": text_model_name_or_path, **kwargs}
410
+ )
411
+ _purge_runtime_from_config(config)
412
+ model = super().from_pretrained(
413
+ q_former_model_name_or_path,
414
+ without_llm=False,
415
+ config=config,
416
+ _fast_init=False,
417
+ *model_args,
418
+ **{**kwargs, **runtime_kwargs},
419
+ )
420
+ model.tokenizer = AutoTokenizer.from_pretrained(
421
+ q_former_model_name_or_path
422
+ )
423
+ model.tokenizer.chat_template = chat_template_str
424
+ elif pretrained_model_name_or_path is not None:
425
+ config = HCXVisionConfig.from_pretrained(
426
+ pretrained_model_name_or_path, *model_args, **kwargs
427
+ )
428
+ _purge_runtime_from_config(config)
429
+ model = super().from_pretrained(
430
+ pretrained_model_name_or_path,
431
+ *model_args,
432
+ config=config,
433
+ **runtime_kwargs,
434
+ )
435
+ model.tokenizer = AutoTokenizer.from_pretrained(
436
+ pretrained_model_name_or_path
437
+ )
438
+ model.tokenizer.chat_template = chat_template_str
439
+ else:
440
+ config = HCXVisionConfig.from_dict(
441
+ {"text_model_name_or_path": text_model_name_or_path, **kwargs}
442
+ )
443
+ _purge_runtime_from_config(config)
444
+ model = HCXVisionForCausalLM_VU(
445
+ config, *model_args, **{**kwargs, **runtime_kwargs}
446
+ )
447
+ model.tokenizer = AutoTokenizer.from_pretrained(text_model_name_or_path)
448
+ model.tokenizer.chat_template = chat_template_str
449
+ model.mm_projector.apply(model._init_weights)
450
+
451
+ img_start_id = model.tokenizer.encode(
452
+ extra_special_tokens["image_token"], add_special_tokens=False
453
+ )
454
+ assert (
455
+ len(img_start_id) == 1
456
+ ), f'{extra_special_tokens["image_token"]} was not encoded into a single special token. Encoding result: {img_start_id}'
457
+ model.config.img_start_id = img_start_id[0]
458
+ model.config.image_token_id = img_start_id[0]
459
+
460
+ video_start_id = model.tokenizer.encode(
461
+ extra_special_tokens["video_token"], add_special_tokens=False
462
+ )
463
+ assert (
464
+ len(video_start_id) == 1
465
+ ), f"video_token was not encoded into a single special token. Encoding result: {video_start_id}"
466
+ model.config.video_start_id = video_start_id[0]
467
+ model.config.video_token_id = video_start_id[0]
468
+
469
+ video_audio_start_id = model.tokenizer.encode(
470
+ extra_special_tokens["video_audio_token"], add_special_tokens=False
471
+ )
472
+ assert (
473
+ len(video_audio_start_id) == 1
474
+ ), f"video_audio_token was not encoded into a single special token. Encoding result: {video_audio_start_id}"
475
+ model.config.video_audio_start_id = video_audio_start_id[0]
476
+ model.config.video_audio_token_id = video_audio_start_id[0]
477
+
478
+ if (
479
+ audio_model_name_or_path is not None
480
+ or discrete_audio_model_name_or_path is not None
481
+ or discrete_vision_model_name_or_path is not None
482
+ ):
483
+ audio_start_id = model.tokenizer.encode(
484
+ extra_special_tokens["audio_token"], add_special_tokens=False
485
+ )
486
+ assert (
487
+ len(audio_start_id) == 1
488
+ ), f"audio_token was not encoded into a single special token. Encoding result: {audio_start_id}"
489
+ model.config.audio_start_id = audio_start_id[0]
490
+ model.config.audio_token_id = audio_start_id[0]
491
+
492
+ discrete_audio_start_id = model.tokenizer.encode(
493
+ extra_special_tokens["discrete_audio_token"], add_special_tokens=False
494
+ )
495
+ assert (
496
+ len(discrete_audio_start_id) == 1
497
+ ), f"discrete_audio_token was not encoded into a single special token. Encoding result: {discrete_audio_start_id}"
498
+ model.config.discrete_audio_start_id = discrete_audio_start_id[0]
499
+ model.config.discrete_audio_token_id = discrete_audio_start_id[0]
500
+ discrete_audio_unit_0_id = model.tokenizer.encode(
501
+ extra_special_tokens["discrete_audio_unit_0_id"],
502
+ add_special_tokens=False,
503
+ )
504
+ assert (
505
+ len(discrete_audio_unit_0_id) == 1
506
+ ), f'{extra_special_tokens["discrete_audio_unit_0_id"]} was not encoded into a single special token. Encoding result: {discrete_audio_unit_0_id}'
507
+ model.config.discrete_audio_unit_0_id = discrete_audio_unit_0_id[0]
508
+
509
+ discrete_image_start_id = model.tokenizer.encode(
510
+ extra_special_tokens["discrete_image_token"], add_special_tokens=False
511
+ )
512
+ assert (
513
+ len(discrete_image_start_id) == 1
514
+ ), f'{extra_special_tokens["discrete_image_token"]} was not encoded into a single special token. Encoding result: {discrete_image_start_id}'
515
+ model.config.discrete_image_start_id = discrete_image_start_id[0]
516
+ model.config.discrete_image_token_id = discrete_image_start_id[0]
517
+ discrete_image_unit_0_id = model.tokenizer.encode(
518
+ extra_special_tokens["discrete_image_unit_0_id"],
519
+ add_special_tokens=False,
520
+ )
521
+ assert (
522
+ len(discrete_image_unit_0_id) == 1
523
+ ), f'{extra_special_tokens["discrete_image_unit_0_id"]} was not encoded into a single special token. Encoding result: {discrete_image_unit_0_id}'
524
+ model.config.discrete_image_unit_0_id = discrete_image_unit_0_id[0]
525
+
526
+ model.save_only_vision = save_only_vision
527
+ model.save_only_qformer = save_only_qformer
528
+ model.save_shard_size = save_shard_size
529
+
530
+ if pretrained_model_name_or_path is None or (
531
+ pretrained_model_name_or_path is not None
532
+ and audio_model_name_or_path is not None
533
+ ):
534
+ vision_model_name_or_path = kwargs.get("vision_model_name_or_path", None)
535
+ if vision_model_name_or_path is not None:
536
+ load_sharded_checkpoint(model.vision_model, vision_model_name_or_path)
537
+ if get_rank() == 0:
538
+ print("[info] vision model loading complete")
539
+
540
+ discrete_vision_model_name_or_path = kwargs.get(
541
+ "discrete_vision_model_name_or_path", None
542
+ )
543
+ if discrete_vision_model_name_or_path is not None:
544
+
545
+ model.discrete_vision_model.load_state_dict(
546
+ torch.load(
547
+ discrete_vision_model_name_or_path,
548
+ map_location=model.device,
549
+ weights_only=False,
550
+ )["model"]["sd"],
551
+ strict=True,
552
+ )
553
+ if get_rank() == 0:
554
+ print("[info] discrete vision model loading complete")
555
+
556
+ audio_model_name_or_path = kwargs.get("audio_model_name_or_path", None)
557
+ if audio_model_name_or_path is not None:
558
+ load_sharded_checkpoint(model.audio_model, audio_model_name_or_path)
559
+ if get_rank() == 0:
560
+ print("[info] audio model loading complete")
561
+
562
+ discrete_audio_model_name_or_path = kwargs.get(
563
+ "discrete_audio_model_name_or_path", None
564
+ )
565
+ if discrete_audio_model_name_or_path is not None:
566
+
567
+ model.discrete_audio_model.load_state_dict(
568
+ torch.load(
569
+ discrete_audio_model_name_or_path,
570
+ map_location=model.device,
571
+ weights_only=False,
572
+ ),
573
+ strict=True,
574
+ )
575
+ if get_rank() == 0:
576
+ print("[info] discrete audio model loading complete")
577
+
578
+ if text_model_name_or_path is not None:
579
+ load_sharded_checkpoint(model.language_model, text_model_name_or_path)
580
+ if get_rank() == 0:
581
+ print("[info] text model loading complete")
582
+
583
+ if isinstance(q_former_model_name_or_path, str):
584
+ assert Path(
585
+ q_former_model_name_or_path
586
+ ).exists(), f"# [error] given q_former_name_or_path not exist: {q_former_model_name_or_path}"
587
+
588
+ load_result = load_sharded_checkpoint(
589
+ model,
590
+ q_former_model_name_or_path,
591
+ replace_prefix_dict={
592
+ "vision_model.image_encoder.model.vision_tower": "vision_model",
593
+ "model": "language_model.model",
594
+ "lm_head.weight": "language_model.lm_head.weight",
595
+ },
596
+ print_info=False,
597
+ )
598
+
599
+ if get_rank() == 0:
600
+ missing_keys_summary = dict()
601
+ for key in load_result["missing_keys"]:
602
+ if key.split(".")[0] in missing_keys_summary:
603
+ missing_keys_summary[key.split(".")[0]] += 1
604
+ else:
605
+ missing_keys_summary[key.split(".")[0]] = 1
606
+ print(f"[info] missing_keys summary : {missing_keys_summary}")
607
+ print("[info] q_former model loading complete")
608
+
609
+ config: HCXVisionConfig = model.config
610
+ if config.model_type != "vlm":
611
+ model.config.model_type = "vlm"
612
+
613
+ return model
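+
+ # Editor's note: illustrative call patterns only; every path below is a
+ # hypothetical placeholder.
+ # >>> # stage-3 style init: attach audio / discrete encoders on top of an
+ # >>> # image-stage checkpoint
+ # >>> model = HCXVisionForCausalLM_VU.from_pretrained(
+ # ...     pretrained_model_name_or_path="/path/to/stage2/checkpoint",
+ # ...     audio_model_name_or_path="/path/to/audio/encoder",
+ # ...     discrete_audio_model_name_or_path="/path/to/discrete_audio.pt",
+ # ...     discrete_vision_model_name_or_path="/path/to/discrete_vision.pt",
+ # ... )
+ # >>> # plain load from a full pre-trained checkpoint
+ # >>> model = HCXVisionForCausalLM_VU.from_pretrained("/path/to/full/checkpoint")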
614
+
615
+ def _pad_sequence_for_sp(
616
+ self,
617
+ inputs_embeds: torch.Tensor,
618
+ labels: Optional[torch.Tensor],
619
+ sp_world_size: int,
620
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
621
+ """
622
+ Ensure sequence length is divisible by the SP group size by padding on the sequence dimension.
623
+ Returns the possibly padded (inputs_embeds, labels).
624
+ """
625
+ batch_size, seqlen, hidden_size = inputs_embeds.shape
626
+ remainder = seqlen % sp_world_size
627
+ if remainder != 0:
628
+ print(
629
+ f"[info] Padding sequence dimension to make it divisible by {sp_world_size}"
630
+ )
631
+ pad_len = sp_world_size - remainder
632
+ pad_embeds = torch.zeros(
633
+ (batch_size, pad_len, hidden_size),
634
+ dtype=inputs_embeds.dtype,
635
+ device=inputs_embeds.device,
636
+ )
637
+ inputs_embeds = torch.cat([inputs_embeds, pad_embeds], dim=1)
638
+
639
+ if labels is not None:
640
+ ignore_index = getattr(self.config, "ignore_index", -100)
641
+ pad_labels = torch.full(
642
+ (batch_size, pad_len),
643
+ fill_value=ignore_index,
644
+ dtype=labels.dtype,
645
+ device=labels.device,
646
+ )
647
+ labels = torch.cat([labels, pad_labels], dim=1)
648
+
649
+ return inputs_embeds, labels
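+
+ # Editor's note (illustrative arithmetic): with seqlen = 1026 and a sequence-parallel
+ # group of 4, remainder = 1026 % 4 = 2, so pad_len = 4 - 2 = 2 zero embeddings (and
+ # two ignore_index labels) are appended, giving 1028 = 4 * 257 positions that split
+ # evenly across the SP ranks.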
650
+
651
+ def forward(
652
+ self,
653
+ input_ids: Optional[torch.LongTensor] = None,
654
+ pixel_values: Optional[List[List[torch.FloatTensor]]] = None,
655
+ discrete_pixel_values: Optional[List[List[torch.FloatTensor]]] = None,
656
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
657
+ attention_mask: Optional[torch.FloatTensor] = None,
658
+ position_ids: Optional[torch.LongTensor] = None,
659
+ inputs_embeds: Optional[torch.FloatTensor] = None,
660
+ labels: Optional[torch.LongTensor] = None,
661
+ use_cache: Optional[bool] = None,
662
+ output_attentions: Optional[bool] = None,
663
+ output_hidden_states: Optional[bool] = None,
664
+ return_dict: Optional[bool] = None,
665
+ image_sizes: Optional[List[List[List[int]]]] = None,
666
+ mm_query_lengths: Optional[List[List[int]]] = None,
667
+ non_mm_query_lengths: Optional[List[List[int]]] = None,
668
+ img_start_ids_list: Optional[List[List[int]]] = None,
669
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
670
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
671
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
672
+ is_videos: Optional[List[List[bool]]] = None,
673
+ image_grid_thw: Optional[torch.LongTensor] = None,
674
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
675
+ video_grid_thw: Optional[torch.LongTensor] = None,
676
+ video_audio_values: Optional[torch.FloatTensor] = None,
677
+ video_audio_masks: Optional[torch.FloatTensor] = None,
678
+ audio_values: Optional[torch.FloatTensor] = None,
679
+ discrete_audio_values: Optional[torch.FloatTensor] = None,
680
+ discrete_audio_value_num_per_sample: Optional[torch.LongTensor] = None,
681
+ audio_masks: Optional[torch.LongTensor] = None,
682
+ **kwargs,
683
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
684
+ """
685
+ :param input_ids: torch.int64 : torch.Size([batchsize, variable]) : SystemPrompt and question text token indices from the tokenizer.
686
+ In positions where images are inputted, the value is replaced by config.img_start_id, which is a vocabulary index used to indicate the start of image data.
687
+ :param pixel_values: List of List of 4D tensor (torch.float32)
688
+ Each outer list corresponds to a batch and contains inner lists, each holding tensors for images in a sample. The structure accounts for samples with multiple images.
689
+ :param past_key_values: None
690
+ :param inputs_embeds: None
691
+ :param labels: Optional[torch.int64] : [batchsize, variable (input_ids.size(1) + num visual tokens)]; all visual token positions are set to IGNORE_INDEX
692
+ :param use_cache: None
693
+ :param output_attentions: Optional[bool] : get attention weights of each layer of the transformer network (True: included in the output, False: not included)
694
+ :param output_hidden_states: Optional[bool] : get hidden states of each layer of the transformer network (True: included in the output, False: not included)
695
+ :param return_dict: Optional[bool] : True - return a dict, False - return a tuple
696
+ :param image_sizes: Stacked as a List of List, representing image sizes (width, height).
697
+ In cases where a sample contains no images, a single dummy image is included.
698
+ :param mm_query_lengths: A List of List that stores the lengths when each image is converted into visual tokens for LLM input.
699
+ In cases where a sample does not contain any images, an empty list is included.
700
+ :param non_mm_query_lengths: contains the lengths of text tokens (excluding visual tokens) for each sample in a batch.
701
+ :param img_start_ids_list: contains the indices of the img_start_id tokens for each sample.
702
+ :param num_queries_vis_abstractors: A List of List that contains the number of visual tokens for each image grid.
703
+ :param num_queries_vis_abstractors_slow: A List of List that contains the number of visual tokens for the slow part when applying the slowfast algorithm to video frames. If the slowfast algorithm is not applied, it will have a value of None.
704
+ :param first_last_frames_slows: A List of List of booleans indicating whether slow mode is applied only to the first and last frames for each sample in a batch.
705
+ :param is_videos: A List of List that contains the boolean value indicating whether each sample in a batch is a video.
706
+ :param image_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
707
+ :param pixel_values_videos: A 2D tensor (torch.float32) for qwen2.5-vl visual encoder.
708
+ :param video_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
709
+ :return:
710
+ """
711
+
712
+ if self.sp_manager is not None and self.train_video:
713
+ sp_group = get_ulysses_sequence_parallel_group()
714
+ if sp_group is not None:
715
+ sp_rank = get_ulysses_sequence_parallel_rank(sp_group)
716
+ sp_world_size = get_ulysses_sequence_parallel_world_size(sp_group)
717
+ if sp_rank == 0:
718
+ payload = {
719
+ "input_ids": input_ids,
720
+ "labels": labels,
721
+ "pixel_values": pixel_values,
722
+ "image_grid_thw": image_grid_thw,
723
+ "pixel_values_videos": pixel_values_videos,
724
+ "video_grid_thw": video_grid_thw,
725
+ "video_audio_values": video_audio_values,
726
+ "video_audio_masks": video_audio_masks,
727
+ }
728
+ else:
729
+ payload = {
730
+ "input_ids": None,
731
+ "labels": None,
732
+ "pixel_values": None,
733
+ "image_grid_thw": None,
734
+ "pixel_values_videos": None,
735
+ "video_grid_thw": None,
736
+ "video_audio_values": None,
737
+ "video_audio_masks": None,
738
+ }
739
+
740
+ obj_list = [payload]
741
+ src_global_rank = dist.get_global_rank(sp_group, 0)
742
+ dist.broadcast_object_list(
743
+ obj_list, src=src_global_rank, group=sp_group
744
+ )
745
+ payload = obj_list[0]
746
+
747
+ if sp_rank != 0:
748
+ device = input_ids.device
749
+
750
+ input_ids = payload["input_ids"]
751
+ if isinstance(input_ids, torch.Tensor):
752
+ input_ids = input_ids.to(device)
753
+
754
+ labels = payload["labels"]
755
+ if isinstance(labels, torch.Tensor):
756
+ labels = labels.to(device)
757
+
758
+ image_grid_thw = payload["image_grid_thw"]
759
+ if isinstance(image_grid_thw, torch.Tensor):
760
+ image_grid_thw = image_grid_thw.to(device)
761
+
762
+ pixel_values_videos = payload["pixel_values_videos"]
763
+ if isinstance(pixel_values_videos, torch.Tensor):
764
+ pixel_values_videos = pixel_values_videos.to(device)
765
+
766
+ video_grid_thw = payload["video_grid_thw"]
767
+ if isinstance(video_grid_thw, torch.Tensor):
768
+ video_grid_thw = video_grid_thw.to(device)
769
+
770
+ video_audio_values = payload["video_audio_values"]
771
+ if isinstance(video_audio_values, torch.Tensor):
772
+ video_audio_values = video_audio_values.to(device)
773
+
774
+ video_audio_masks = payload["video_audio_masks"]
775
+ if isinstance(video_audio_masks, torch.Tensor):
776
+ video_audio_masks = video_audio_masks.to(device)
777
+
778
+ pixel_values = payload["pixel_values"]
779
+ if isinstance(pixel_values, torch.Tensor):
780
+ pixel_values = pixel_values.to(device)
781
+
782
+ attention_mask = None
783
+ output_attentions = (
784
+ output_attentions
785
+ if output_attentions is not None
786
+ else self.config.vision_config.output_attentions
787
+ )
788
+ output_hidden_states = (
789
+ output_hidden_states
790
+ if output_hidden_states is not None
791
+ else self.config.vision_config.output_hidden_states
792
+ )
793
+ return_dict = (
794
+ return_dict if return_dict is not None else self.config.use_return_dict
795
+ )
796
+
797
+ if inputs_embeds is None and past_key_values is None:
798
+ inputs_embeds, labels = self.model.extract_inputs_embeds(
799
+ input_ids=input_ids,
800
+ labels=labels,
801
+ pixel_values=pixel_values,
802
+ discrete_pixel_values=discrete_pixel_values,
803
+ past_key_values=past_key_values,
804
+ image_sizes=image_sizes,
805
+ mm_query_lengths=mm_query_lengths,
806
+ non_mm_query_lengths=non_mm_query_lengths,
807
+ img_start_ids_list=img_start_ids_list,
808
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
809
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
810
+ first_last_frames_slows=first_last_frames_slows,
811
+ is_videos=is_videos,
812
+ image_grid_thw=image_grid_thw,
813
+ pixel_values_videos=pixel_values_videos,
814
+ video_grid_thw=video_grid_thw,
815
+ video_audio_values=video_audio_values,
816
+ video_audio_masks=video_audio_masks,
817
+ audio_values=audio_values,
818
+ discrete_audio_values=discrete_audio_values,
819
+ discrete_audio_value_num_per_sample=discrete_audio_value_num_per_sample,
820
+ audio_masks=audio_masks,
821
+ )
822
+
823
+ if labels is not None and labels.size(1) > 32768:
824
+ print(
825
+ f"[RANK {rank} debug] ❌ labels.size(1) > 32768. labels.size(): {labels.size()}"
826
+ )
827
+
828
+ if inputs_embeds is not None:
829
+ input_ids = None
830
+
831
+ import os
832
+
833
+ rank = int(os.environ.get("RANK", -1))
834
+
835
+ if inputs_embeds is not None:
836
+ expected_hidden_size = self.config.text_config.hidden_size
837
+ if inputs_embeds.shape[-1] != expected_hidden_size:
838
+ print(f"[RANK {rank}] ❌ inputs_embeds dimension mismatch!")
839
+ print(
840
+ f" Expected: {expected_hidden_size}, Got: {inputs_embeds.shape[-1]}"
841
+ )
842
+
843
+ if labels is not None:
844
+ vocab_size = self.get_input_embeddings().num_embeddings
845
+ valid_labels = labels[labels != -100]
846
+ if len(valid_labels) > 0:
847
+ if (valid_labels >= vocab_size).any() or (valid_labels < 0).any():
848
+ print(f"[RANK {rank}] ❌ CRITICAL: labels out of vocab range!")
849
+ print(
850
+ f" labels min/max: {valid_labels.min().item()}/{valid_labels.max().item()}"
851
+ )
852
+ print(f" vocab_size: {vocab_size}")
853
+ print(
854
+ f" Out-of-range count: {(valid_labels >= vocab_size).sum().item()}"
855
+ )
856
+
857
+ if attention_mask is not None and inputs_embeds is not None:
858
+ if attention_mask.shape[1] != inputs_embeds.shape[1]:
859
+ print(f"[RANK {rank}] ❌ attention_mask shape mismatch!")
860
+ print(
861
+ f" attention_mask: {attention_mask.shape}, inputs_embeds: {inputs_embeds.shape}"
862
+ )
863
+
864
+ if position_ids is not None:
865
+ max_position = position_ids.max().item()
866
+ if hasattr(self.language_model.config, "max_position_embeddings"):
867
+ max_allowed = self.language_model.config.max_position_embeddings
868
+ if max_position >= max_allowed:
869
+ print(f"[RANK {rank}] ❌ position_ids out of range!")
870
+ print(f" max_position: {max_position}, max_allowed: {max_allowed}")
871
+
872
+ if self.sp_manager is not None:
873
+
874
+ batch_size, seqlen, hidden_size = inputs_embeds.shape
875
+
876
+ sp_group = get_ulysses_sequence_parallel_group()
877
+ sp_world_size = get_ulysses_sequence_parallel_world_size(sp_group)
878
+
879
+ inputs_embeds, labels = self._pad_sequence_for_sp(
880
+ inputs_embeds, labels, sp_world_size
881
+ )
882
+
883
+ if position_ids is None:
884
+ position_ids = torch.arange(
885
+ seqlen, device=inputs_embeds.device, dtype=torch.long
886
+ )
887
+ position_ids = (
888
+ position_ids.unsqueeze(0).expand(batch_size, -1).contiguous()
889
+ )
890
+
891
+ inputs_embeds = slice_input_tensor(
892
+ inputs_embeds, 1, padding=False, group=sp_group
893
+ )
894
+ labels = slice_input_tensor(labels, 1, padding=False, group=sp_group)
895
+ use_cache = False
896
+
897
+ outputs = self.language_model.base_model(
898
+ input_ids=input_ids,
899
+ inputs_embeds=inputs_embeds,
900
+ attention_mask=attention_mask,
901
+ position_ids=position_ids,
902
+ past_key_values=past_key_values,
903
+ use_cache=use_cache,
904
+ output_attentions=output_attentions,
905
+ output_hidden_states=output_hidden_states,
906
+ return_dict=return_dict,
907
+ )
908
+
909
+ hidden_states = outputs[0]
910
+ hidden_states = hidden_states * self.config.text_config.logits_scaling
911
+
912
+ loss = None
913
+ logits = None
914
+
915
+ if labels is not None:
916
+ if self.use_liger and self.use_fused_ce:
917
+ shift_labels = labels[..., 1:].contiguous()
918
+ shift_labels = shift_labels.view(-1)
919
+
920
+ hidden_states = hidden_states[..., :-1, :].contiguous()
921
+ hidden_states = hidden_states.view(
922
+ -1, self.language_model.config.hidden_size
923
+ ).to(self.language_model.lm_head.weight.dtype)
924
+
925
+ import os
926
+
927
+ rank = int(os.environ.get("RANK", -1))
928
+
929
+ vocab_size = self.language_model.lm_head.weight.shape[0]
930
+ valid_labels = shift_labels[shift_labels != -100]
931
+ if len(valid_labels) > 0 and (
932
+ (valid_labels >= vocab_size).any() or (valid_labels < 0).any()
933
+ ):
934
+ print(
935
+ f"[RANK {rank}] ❌ CRITICAL: shift_labels out of vocab range!"
936
+ )
937
+ print(
938
+ f" min/max: {valid_labels.min().item()}/{valid_labels.max().item()}, vocab: {vocab_size}"
939
+ )
940
+ print(
941
+ f" Out-of-range count: {(valid_labels >= vocab_size).sum().item()}"
942
+ )
943
+
944
+ lce = LigerFusedLinearCrossEntropyLoss(reduction=self.reduction)
945
+ try:
946
+ loss = lce(
947
+ self.language_model.lm_head.weight, hidden_states, shift_labels
948
+ )
949
+ except RuntimeError as e:
950
+ print(
951
+ f"[RANK {rank}] ❌ FATAL: LigerFusedLinearCrossEntropyLoss failed!"
952
+ )
953
+ print(f" Error: {e}")
954
+ print(
955
+ f" hidden_states: shape={hidden_states.shape}, dtype={hidden_states.dtype}"
956
+ )
957
+ print(
958
+ f" shift_labels: shape={shift_labels.shape}, unique_values={torch.unique(shift_labels).tolist()[:20]}"
959
+ )
960
+ print(
961
+ f" lm_head.weight: shape={self.language_model.lm_head.weight.shape}"
962
+ )
963
+ raise
964
+ elif self.use_liger:
965
+ logits = self.language_model.lm_head(hidden_states)
966
+
967
+ shift_logits = logits[..., :-1, :].contiguous()
968
+ shift_labels = labels[..., 1:].contiguous()
969
+
970
+ loss_fct = LigerCrossEntropyLoss(reduction=self.reduction)
971
+ shift_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
972
+ shift_labels = shift_labels.view(-1)
973
+ shift_labels = shift_labels.to(shift_logits.device)
974
+ loss = loss_fct(shift_logits, shift_labels)
975
+ else:
976
+ logits = self.language_model.lm_head(hidden_states)
977
+
978
+ shift_logits = logits[..., :-1, :].contiguous()
979
+ shift_labels = labels[..., 1:].contiguous()
980
+
981
+ loss_fct = CrossEntropyLoss(reduction=self.reduction)
982
+ shift_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
983
+ shift_labels = shift_labels.view(-1)
984
+ shift_labels = shift_labels.to(shift_logits.device)
985
+ loss = loss_fct(shift_logits, shift_labels)
986
+
987
+ if self.sp_manager is not None:
988
+ loss = gather_outputs_and_unpad(
989
+ loss, gather_dim=0, unpad_dim=0, padding_size=0, group=sp_group
990
+ )
991
+
992
+ if self.use_meansum_loss:
993
+ loss = loss.view(labels.size(0), -1).mean(dim=1).sum()
994
+
995
+ elif self.use_sqrtsum_loss:
996
+ per_token = loss.view(labels.size(0), -1)
997
+ per_sample_mean = per_token.mean(dim=1)
998
+
999
+ with torch.no_grad():
1000
+ labels_2d = labels.view(labels.size(0), -1)
1001
+ ignore_index = getattr(self.config, "ignore_index", -100)
1002
+ valid_mask = labels_2d.ne(ignore_index)
1003
+ valid_count = valid_mask.sum(dim=1).clamp(min=1).float()
1004
+ raw_w = valid_count.sqrt()
1005
+ w_mean = raw_w.mean().clamp(min=1e-6)
1006
+ norm_w = raw_w / w_mean
1007
+
1008
+ loss = (per_sample_mean * norm_w).sum()
1009
+
1010
+ elif self.use_turnmeansum_loss:
1011
+ with torch.no_grad():
1012
+ mask = shift_labels.view(labels.size(0), -1).ne(
1013
+ self.config.ignore_index
1014
+ )
1015
+ prev_mask = mask.roll(shifts=1, dims=1)
1016
+ prev_mask[:, 0] = False
1017
+
1018
+ turn_starts = mask & (~prev_mask)
1019
+
1020
+ turn_count = turn_starts.sum(dim=1).clamp(min=1).float()
1021
+
1022
+ loss = (loss.view(labels.size(0), -1).mean(dim=1) * turn_count).sum()
1023
+
1024
+ if self.sp_manager is not None:
1025
+ loss = loss / self.sp_manager.device_mesh.shape[1]
1026
+ if not return_dict:
1027
+ output = (logits,) + outputs[1:]
1028
+ return (loss,) + output if loss is not None else output
1029
+
1030
+ return CausalLMOutputWithPast(
1031
+ loss=loss,
1032
+ logits=logits,
1033
+ past_key_values=outputs.past_key_values,
1034
+ hidden_states=outputs.hidden_states,
1035
+ attentions=outputs.attentions,
1036
+ )
1037
+
1038
+ def save_pretrained(
1039
+ self,
1040
+ save_directory: Union[str, os.PathLike],
1041
+ *args,
1042
+ **kwargs,
1043
+ ):
1044
+
1045
+ state_dict = (
1046
+ kwargs["state_dict"]
1047
+ if kwargs.get("state_dict", None)
1048
+ else self.state_dict()
1049
+ )
1050
+ partial_state_dict = self.get_pretrained_state_dict(
1051
+ state_dict,
1052
+ )
1053
+ kwargs["state_dict"] = partial_state_dict
1054
+ kwargs["safe_serialization"] = self.is_safetensor_save
1055
+ kwargs.setdefault("max_shard_size", self.save_shard_size)
1056
+ super().save_pretrained(save_directory, *args, **kwargs)
1057
+ if self.is_qwen_visual:
1058
+ self.config.architectures = ["HCXVisionV2ForCausalLM"]
1059
+ else:
1060
+ self.config.architectures = ["HCXVisionForCausalLM"]
1061
+ self.config.auto_map["AutoModelForCausalLM"] = (
1062
+ "modeling_vlm.HCXVisionForCausalLM"
1063
+ )
1064
+ self.config.auto_map["AutoModelForSequenceClassification"] = (
1065
+ "modeling_vlm.HCXVisionForSequenceClassification"
1066
+ )
1067
+ self.config.save_pretrained(save_directory)
1068
+
1069
+ def get_pretrained_state_dict(self, state_dict):
1070
+ vision_key = "vision_model."
1071
+ llm_keys = ["language_model."]
1072
+ head_key = "lm_head."
1073
+
1074
+ for key in list(state_dict.keys()):
1075
+ if self.save_only_vision:
1076
+ for llm_key in llm_keys:
1077
+ if llm_key in key:
1078
+ state_dict.pop(key)
1079
+ if key.startswith(head_key):
1080
+ state_dict.pop(key)
1081
+ elif self.save_only_qformer:
1082
+ if f"{vision_key}" in key:
1083
+ state_dict.pop(key)
1084
+
1085
+ return state_dict
preprocessor.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
4
+ },
5
+ "do_convert_rgb": true,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "image_processor_type": "Qwen2VLImageProcessor",
15
+ "image_std": [
16
+ 0.26862954,
17
+ 0.26130258,
18
+ 0.27577711
19
+ ],
20
+ "max_pixels": 2073600,
21
+ "merge_size": 2,
22
+ "min_pixels": 3136,
23
+ "patch_size": 14,
24
+ "processor_class": "HCXVisionV2Processor",
25
+ "resample": 3,
26
+ "rescale_factor": 0.00392156862745098,
27
+ "size": {
28
+ "longest_edge": 12845056,
29
+ "shortest_edge": 3136
30
+ },
31
+ "temporal_patch_size": 2
32
+ }
processing_vlm.py ADDED
@@ -0,0 +1,963 @@
1
+ import copy
2
+ import math
3
+ import os
4
+ from typing import Dict, List, Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+ from transformers import Qwen2_5_VLProcessor
10
+ from transformers.image_processing_utils import (
11
+ BaseImageProcessor,
12
+ BatchFeature,
13
+ get_size_dict,
14
+ )
15
+ from transformers.image_transforms import (
16
+ convert_to_rgb,
17
+ get_resize_output_image_size,
18
+ resize,
19
+ to_channel_dimension_format,
20
+ )
21
+ from transformers.image_utils import (
22
+ OPENAI_CLIP_MEAN,
23
+ OPENAI_CLIP_STD,
24
+ ChannelDimension,
25
+ ImageInput,
26
+ PILImageResampling,
27
+ get_image_size,
28
+ infer_channel_dimension_format,
29
+ is_scaled_image,
30
+ make_list_of_images,
31
+ to_numpy_array,
32
+ valid_images,
33
+ )
34
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
35
+ Qwen2_5_VLProcessorKwargs,
36
+ )
37
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
38
+ from transformers.utils import TensorType, logging
39
+ from transformers.video_utils import VideoInput
40
+ from typing_extensions import Unpack
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+
45
+ def determine_possible_resolutions(
46
+ anyres: bool, max_num_grids: int, grid_size: int, use_1x1_grid: bool = False
47
+ ):
48
+ """총 max_num_grids 이하의 possible resolution 조합을 찾아 반환합니다.
49
+ For example, when max_num_grids is 4, the possible grid layouts are [1x1, 1x2, 1x3, 1x4, 2x1, 2x2, 3x1, 4x1], so the result is computed as follows.
50
+ >>> possible_resolutions = determine_possible_resolutions(anyres=True, max_num_grids=4, grid_size=336)
51
+ >>> print(possible_resolutions)
52
+ [[336, 336], [336, 672], [336, 1008], [336, 1344], [672, 336], [672, 672], [1008, 336], [1344, 336]]
53
+ """
54
+ possible_resolutions = []
55
+ if anyres:
56
+ assert max_num_grids > 0
57
+ for i in range(1, max_num_grids + 1):
58
+ for j in range(1, max_num_grids + 1):
59
+ if i == 1 and j == 1 and not use_1x1_grid:
60
+ continue
61
+ if i * j <= max_num_grids:
62
+ possible_resolutions.append([i, j])
63
+
64
+ possible_resolutions = [
65
+ [ys * grid_size, xs * grid_size] for ys, xs in possible_resolutions
66
+ ]
67
+
68
+ return possible_resolutions
69
+
70
+
71
+ def divide_to_grids(
72
+ image: np.array, grid_size: int, input_data_format=None
73
+ ) -> List[np.array]:
74
+ """local image 를 (grid_size x grid_size) grid 로 divide"""
75
+ grids = []
76
+ height, width = get_image_size(image, channel_dim=input_data_format)
77
+ for i in range(0, height, grid_size):
78
+ for j in range(0, width, grid_size):
79
+ if input_data_format == ChannelDimension.LAST:
80
+ grid = image[i : i + grid_size, j : j + grid_size]
81
+ else:
82
+ grid = image[:, i : i + grid_size, j : j + grid_size]
83
+ grids.append(grid)
84
+
85
+ return grids
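+
+
+ # Editor's note: an illustrative sketch only. A channel-last 336x672 image is split
+ # row by row into two 336x336 grids.
+ def _example_divide_to_grids():
+     image = np.zeros((336, 672, 3), dtype=np.uint8)
+     grids = divide_to_grids(image, grid_size=336, input_data_format=ChannelDimension.LAST)
+     assert len(grids) == 2 and grids[0].shape == (336, 336, 3)
+     return grids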
86
+
87
+
88
+ def pad(
89
+ image: np.array,
90
+ target_size: tuple,
91
+ background_color=(127, 127, 127),
92
+ input_data_format=None,
93
+ ) -> np.array:
94
+ """image 양옆, 좌우에 padding 을 하여 target_height, target_width 만큼 키움"""
95
+ target_height, target_width = target_size
96
+ height, width = get_image_size(image, channel_dim=input_data_format)
97
+
98
+ result = np.empty((target_height, target_width, image.shape[2]), dtype=image.dtype)
99
+ for i in range(image.shape[2]):
100
+ result[..., i].fill(background_color[i])
101
+
102
+ paste_x = (target_width - width) // 2
103
+ paste_y = (target_height - height) // 2
104
+
105
+ result[paste_y : paste_y + height, paste_x : paste_x + width, :] = image
106
+
107
+ return result
108
+
109
+
110
+ def expand2square(
111
+ image: np.array,
112
+ bboxes_dict=None,
113
+ background_color=(127, 127, 127),
114
+ input_data_format=None,
115
+ ) -> np.array:
116
+ """
117
+ Make the image square by creating a new canvas and pasting the image onto it.
118
+ Note that the image is pasted at the center, so the padding is added on the left/right or on the top/bottom.
119
+ Args:
120
+ image: numpy array
121
+ bboxes_dict: dict, {"ocr": NDArray shape (N, 4, 2), "html": NDArray shape (N, 4, 2), ... }
122
+ boxes are unified to the `[[xtl, ytl], [xtr, ytr], [xbr, ybr], [xbl, ybl]]` format, so various box types such as OCR and HTML can be handled at once
123
+ background_color: tuple, RGB
124
+ # >>> _img = np.ones((80, 100), dtype=np.uint8) * 100
125
+ # >>> _bboxes_dict = {"words": np.array([[[10, 10], [20, 10], [20, 20], [10, 20]],
126
+ # ... [[30, 30], [40, 30], [40, 40], [30, 40]]])}
127
+ # >>> _img, _bboxes_dict = expand2square(_img, _bboxes_dict, (255, 255, 255))
128
+ # >>> _img.shape
129
+ # (100, 100)
130
+ # >>> guessed_ocr_bboxes = np.array([[[20, 10], [30, 10], [30, 20], [20, 20]],
131
+ # ... [[40, 30], [50, 30], [50, 40], [40, 40]]])
132
+ # >>> np.testing.assert_array_almost_equal(_bboxes_dict["words"], guessed_ocr_bboxes) is None
133
+ # True
134
+ """
135
+ height, width = get_image_size(image, channel_dim=input_data_format)
136
+ if width == height:
137
+ return image, bboxes_dict
138
+ elif width > height:
139
+ result = np.empty((width, width, image.shape[2]), dtype=image.dtype)
140
+ for i in range(image.shape[2]):
141
+ result[..., i].fill(background_color[i])
142
+
143
+ result[(width - height) // 2 : (width - height) // 2 + height, :] = image
144
+ if bboxes_dict is not None:
145
+ for key in bboxes_dict:
146
+ bboxes_dict[key][:, :, 1] += (width - height) // 2
147
+ return result, bboxes_dict
148
+ else:
149
+ result = np.empty((height, height, image.shape[2]), dtype=image.dtype)
150
+ for i in range(image.shape[2]):
151
+ result[..., i].fill(background_color[i])
152
+
153
+ result[:, (height - width) // 2 : (height - width) // 2 + width] = image
154
+ if bboxes_dict is not None:
155
+ for key in bboxes_dict:
156
+ bboxes_dict[key][:, :, 0] += (height - width) // 2
157
+ return result, bboxes_dict
158
+
159
+
160
+ def resize_longside(
161
+ image: np.array,
162
+ size: int,
163
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
164
+ data_format: Optional[Union[str, ChannelDimension]] = None,
165
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
166
+ ):
167
+ """
168
+ Resize the image so that its longer side matches `size`.
169
+ """
170
+ height, width = get_image_size(image, channel_dim=input_data_format)
171
+
172
+ if width == height:
173
+ target_height, target_width = size, size
174
+ elif width > height:
175
+ target_width = size
176
+ target_height = math.ceil(height / width * size)
177
+ else:
178
+ target_width = math.ceil(width / height * size)
179
+ target_height = size
180
+
181
+ return resize(
182
+ image,
183
+ size=(target_height, target_width),
184
+ resample=resample,
185
+ data_format=data_format,
186
+ input_data_format=input_data_format,
187
+ )
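+
+
+ # Editor's note: an illustrative sketch only. A 600x800 (H x W) image resized with
+ # size=336 keeps its aspect ratio: the long side becomes 336 and the short side
+ # becomes ceil(600 / 800 * 336) = 252.
+ def _example_resize_longside():
+     image = np.zeros((600, 800, 3), dtype=np.uint8)
+     resized = resize_longside(image, size=336, input_data_format=ChannelDimension.LAST)
+     assert resized.shape[:2] == (252, 336)
+     return resized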
188
+
189
+
190
+ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
191
+ """From LLaVA-Next (https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llava_next/image_processing_llava_next.py)
192
+ Selects the best resolution from a list of possible resolutions based on the original size.
193
+ This is done by calculating the effective and wasted resolution for each possible resolution.
194
+ The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
195
+
196
+ Args:
197
+ original_size (tuple):
198
+ The original size of the image in the format (height, width).
199
+ possible_resolutions (list):
200
+ A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
201
+
202
+ Returns:
203
+ tuple: The best fit resolution in the format (height, width).
204
+ """
205
+ original_height, original_width = original_size
206
+ best_fit = None
207
+ max_effective_resolution = 0
208
+ min_wasted_resolution = float("inf")
209
+
210
+ for height, width in possible_resolutions:
211
+ scale = min(width / original_width, height / original_height)
212
+ downscaled_width, downscaled_height = int(original_width * scale), int(
213
+ original_height * scale
214
+ )
215
+ effective_resolution = min(
216
+ downscaled_width * downscaled_height, original_width * original_height
217
+ )
218
+ wasted_resolution = (width * height) - effective_resolution
219
+
220
+ if effective_resolution > max_effective_resolution or (
221
+ effective_resolution == max_effective_resolution
222
+ and wasted_resolution < min_wasted_resolution
223
+ ):
224
+ max_effective_resolution = effective_resolution
225
+ min_wasted_resolution = wasted_resolution
226
+ best_fit = (height, width)
227
+
228
+ return best_fit
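+
+
+ # Editor's note: an illustrative sketch only, using the resolutions produced by
+ # determine_possible_resolutions(anyres=True, max_num_grids=4, grid_size=336).
+ # For a 500x1000 (H x W) image the 1x2-grid canvas (336, 672) is returned: its 2:1
+ # aspect ratio matches the image, so the effective resolution is maximal and no
+ # canvas area is wasted.
+ def _example_select_best_resolution():
+     possible_resolutions = determine_possible_resolutions(
+         anyres=True, max_num_grids=4, grid_size=336
+     )
+     return select_best_resolution((500, 1000), possible_resolutions)  # -> (336, 672)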
229
+
230
+
231
+ def _get_local_grids_output_size(
232
+ image: np.array, target_resolution: tuple, input_data_format=None
233
+ ):
234
+ original_height, original_width = get_image_size(
235
+ image, channel_dim=input_data_format
236
+ )
237
+ target_height, target_width = target_resolution
238
+
239
+ scale_w = target_width / original_width
240
+ scale_h = target_height / original_height
241
+
242
+ if scale_w < scale_h:
243
+ new_width = target_width
244
+ new_height = min(math.ceil(original_height * scale_w), target_height)
245
+ else:
246
+ new_height = target_height
247
+ new_width = min(math.ceil(original_width * scale_h), target_width)
248
+
249
+ return new_height, new_width
250
+
251
+
252
+ def determine_anyres_num_vision_patches(
253
+ num_grids,
254
+ image_size,
255
+ grid_size,
256
+ patch_size,
257
+ possible_resolutions,
258
+ anyres=False,
259
+ unpad=True,
260
+ num_queries_vis_abstractor=0,
261
+ num_queries_vis_abstractor_slow=0,
262
+ video=False,
263
+ first_last_frames_slow=False,
264
+ is_first_or_last_frames=False,
265
+ ):
266
+ """visual tokens 수를 계산해주는 함수"""
267
+ if not anyres:
268
+ return (
269
+ num_queries_vis_abstractor
270
+ if num_queries_vis_abstractor > 0
271
+ else (grid_size // patch_size) ** 2
272
+ )
273
+
274
+ if num_queries_vis_abstractor > 0:
275
+ num_patch_per_grid = int(num_queries_vis_abstractor**0.5)
276
+ else:
277
+ num_patch_per_grid = grid_size // patch_size
278
+
279
+ num_global_per_grid = num_patch_per_grid
280
+
281
+ height, width = select_best_resolution(image_size, possible_resolutions)
282
+
283
+ num_patch_height = (height // grid_size) * num_patch_per_grid
284
+ num_patch_width = (width // grid_size) * num_patch_per_grid
285
+
286
+ if unpad:
287
+ original_height, original_width = image_size
288
+
289
+ original_aspect_ratio = original_width / original_height
290
+ current_aspect_ratio = num_patch_width / num_patch_height
291
+
292
+ if original_aspect_ratio > current_aspect_ratio:
293
+ scale_factor = num_patch_width / original_width
294
+ new_height = int(original_height * scale_factor)
295
+ padding = (num_patch_height - new_height) // 2
296
+ num_patch_height = num_patch_height - padding * 2
297
+ else:
298
+ scale_factor = num_patch_height / original_height
299
+ new_width = int(original_width * scale_factor)
300
+ padding = (num_patch_width - new_width) // 2
301
+ num_patch_width = num_patch_width - padding * 2
302
+
303
+ num_patches = num_patch_width * num_patch_height + num_patch_height
304
+ else:
305
+ num_patches = num_patch_width * num_patch_height
306
+
307
+ if num_queries_vis_abstractor_slow > 0:
308
+ if first_last_frames_slow:
309
+ if is_first_or_last_frames:
310
+ num_patches += (
311
+ num_queries_vis_abstractor_slow - num_queries_vis_abstractor
312
+ )
313
+ else:
314
+ num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
315
+ assert unpad is False
316
+
317
+ if not video:
318
+ num_patches += num_global_per_grid**2
319
+
320
+ return num_patches
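+
+
+ # Editor's note: an illustrative sketch only. For a 500x1000 (H x W) image with
+ # 336-pixel grids and 14-pixel ViT patches, the best-fit canvas is 336x672 (a 1x2
+ # grid), giving 24x48 local patches plus one extra position per patch row (24 in
+ # total, as in LLaVA-NeXT unpadding) and a 24x24 global grid:
+ # 48 * 24 + 24 + 576 = 1752 visual tokens.
+ def _example_count_vision_patches():
+     possible_resolutions = determine_possible_resolutions(
+         anyres=True, max_num_grids=4, grid_size=336
+     )
+     return determine_anyres_num_vision_patches(
+         num_grids=None,  # unused by the current implementation
+         image_size=(500, 1000),
+         grid_size=336,
+         patch_size=14,
+         possible_resolutions=possible_resolutions,
+         anyres=True,
+         unpad=True,
+     )  # -> 1752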
321
+
322
+
323
+ class HCXVisionImageProcessor(BaseImageProcessor):
324
+ r"""
325
+ Constructs a VLM image processor. Based on [`CLIPImageProcessor`], incorporating additional techniques for processing high-resolution images.
326
+
327
+ Args:
328
+ anyres: (bool) whether to use the anyres feature
329
+ unpad: (bool) when anyres is used, whether to use the unpad feature (visual tokens that correspond to pure padding regions are removed from the LLM input)
330
+ num_queries_vis_abstractor: (int) number of visual queries when a resampler is used for each grid
331
+ possible_resolutions: (List) possible resolution combinations when anyres is used, e.g. [[336, 336], [336, 672], [672, 336]]
332
+ patch_size: (int) ViT patch size
333
+ pad_to_square: (bool) whether to pad the image to a square. If False, the image is not square, so it goes through a center crop before being fed to the ViT
334
+ """
335
+
336
+ model_input_names = ["pixel_values"]
337
+
338
+ def __init__(
339
+ self,
340
+ do_resize: bool = True,
341
+ size: Dict[str, int] = None,
342
+ anyres: bool = False,
343
+ unpad: bool = False,
344
+ num_queries_vis_abstractor: int = 0,
345
+ possible_resolutions: List = [],
346
+ patch_size: int = 14,
347
+ pad_to_square: bool = True,
348
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
349
+ do_center_crop: bool = True,
350
+ crop_size: Dict[str, int] = None,
351
+ do_rescale: bool = True,
352
+ rescale_factor: Union[int, float] = 1 / 255,
353
+ do_normalize: bool = True,
354
+ image_mean: Optional[Union[float, List[float]]] = None,
355
+ image_std: Optional[Union[float, List[float]]] = None,
356
+ do_convert_rgb: bool = True,
357
+ **kwargs,
358
+ ) -> None:
359
+ super().__init__(**kwargs)
360
+ size = size if size is not None else {"shortest_edge": 336}
361
+ size = get_size_dict(size, default_to_square=False)
362
+ crop_size = (
363
+ crop_size if crop_size is not None else {"height": 336, "width": 336}
364
+ )
365
+ crop_size = get_size_dict(
366
+ crop_size, default_to_square=True, param_name="crop_size"
367
+ )
368
+
369
+ self.do_resize = do_resize
370
+ self.size = size
371
+ self.anyres = anyres
372
+ self.unpad = unpad
373
+ self.num_queries_vis_abstractor = num_queries_vis_abstractor
374
+ self.possible_resolutions = [
375
+ _resolution for _resolution in possible_resolutions
376
+ ]
377
+ self.patch_size = patch_size
378
+ self.pad_to_square = pad_to_square
379
+ self.resample = resample
380
+ self.do_center_crop = do_center_crop
381
+ self.crop_size = crop_size
382
+ self.do_rescale = do_rescale
383
+ self.rescale_factor = rescale_factor
384
+ self.do_normalize = do_normalize
385
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
386
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
387
+ self.do_convert_rgb = do_convert_rgb
388
+
389
+ def resize(
390
+ self,
391
+ image: np.ndarray,
392
+ size: Dict[str, int],
393
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
394
+ data_format: Optional[Union[str, ChannelDimension]] = None,
395
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
396
+ **kwargs,
397
+ ) -> np.ndarray:
398
+ default_to_square = True
399
+ if "shortest_edge" in size:
400
+ size = size["shortest_edge"]
401
+ default_to_square = False
402
+ elif "height" in size and "width" in size:
403
+ size = (size["height"], size["width"])
404
+ else:
405
+ raise ValueError(
406
+ "Size must contain either 'shortest_edge' or 'height' and 'width'."
407
+ )
408
+
409
+ output_size = get_resize_output_image_size(
410
+ image,
411
+ size=size,
412
+ default_to_square=default_to_square,
413
+ input_data_format=input_data_format,
414
+ )
415
+
416
+ return resize(
417
+ image,
418
+ size=output_size,
419
+ resample=resample,
420
+ data_format=data_format,
421
+ input_data_format=input_data_format,
422
+ **kwargs,
423
+ )
424
+
425
+ def _preprocess(
426
+ self,
427
+ images: ImageInput,
428
+ do_resize: bool = None,
429
+ size: Dict[str, int] = None,
430
+ resample: PILImageResampling = None,
431
+ do_center_crop: bool = None,
432
+ crop_size: int = None,
433
+ do_rescale: bool = None,
434
+ rescale_factor: float = None,
435
+ do_normalize: bool = None,
436
+ image_mean: Optional[Union[float, List[float]]] = None,
437
+ image_std: Optional[Union[float, List[float]]] = None,
438
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
439
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
440
+ ) -> Image.Image:
441
+ images = make_list_of_images(images)
442
+
443
+ if do_resize:
444
+ images = [
445
+ self.resize(
446
+ image=image,
447
+ size=size,
448
+ resample=resample,
449
+ input_data_format=input_data_format,
450
+ )
451
+ for image in images
452
+ ]
453
+
454
+ if do_center_crop:
455
+ images = [
456
+ self.center_crop(
457
+ image=image, size=crop_size, input_data_format=input_data_format
458
+ )
459
+ for image in images
460
+ ]
461
+
462
+ if do_rescale:
463
+ images = [
464
+ self.rescale(
465
+ image=image,
466
+ scale=rescale_factor,
467
+ input_data_format=input_data_format,
468
+ )
469
+ for image in images
470
+ ]
471
+
472
+ if do_normalize:
473
+ images = [
474
+ self.normalize(
475
+ image=image,
476
+ mean=image_mean,
477
+ std=image_std,
478
+ input_data_format=input_data_format,
479
+ )
480
+ for image in images
481
+ ]
482
+
483
+ images = [
484
+ to_channel_dimension_format(
485
+ image, data_format, input_channel_dim=input_data_format
486
+ )
487
+ for image in images
488
+ ]
489
+
490
+ return images
491
+
492
+ def _resize_for_local_grids(
493
+ self,
494
+ image: np.array,
495
+ target_resolution: tuple,
496
+ resample,
497
+ input_data_format: ChannelDimension,
498
+ ) -> np.array:
499
+ new_height, new_width = _get_local_grids_output_size(
500
+ image, target_resolution, input_data_format
501
+ )
502
+
503
+ resized_image = resize(
504
+ image,
505
+ (new_height, new_width),
506
+ resample=resample,
507
+ input_data_format=input_data_format,
508
+ )
509
+
510
+ return resized_image
511
+
512
+ def _pad_for_patching(
513
+ self,
514
+ image: np.array,
515
+ target_resolution: tuple,
516
+ input_data_format: ChannelDimension,
517
+ ) -> np.array:
518
+ """
519
+ Pad an image (already resized with its aspect ratio preserved) up to the target resolution, filling the margins with the mean-pixel background color.
520
+ """
521
+ target_height, target_width = target_resolution
522
+
523
+ background_color = tuple(int(x * 255) for x in self.image_mean)
524
+ padded_image = pad(
525
+ image,
526
+ target_size=(target_height, target_width),
527
+ background_color=background_color,
528
+ input_data_format=input_data_format,
529
+ )
530
+
531
+ return padded_image
532
+
533
+ def get_image_grids(
534
+ self,
535
+ image: np.array,
536
+ possible_resolutions,
537
+ grid_size: int,
538
+ resample: PILImageResampling,
539
+ data_format: ChannelDimension,
540
+ input_data_format: ChannelDimension,
541
+ ) -> List[np.array]:
542
+ if not isinstance(possible_resolutions, list):
543
+ raise ValueError(
544
+ "possible_resolutions must be a list of possible resolutions."
545
+ )
546
+
547
+ image_size = get_image_size(image, channel_dim=input_data_format)
548
+ best_resolution = select_best_resolution(image_size, possible_resolutions)
549
+ resized_image = self._resize_for_local_grids(
550
+ image,
551
+ best_resolution,
552
+ resample=resample,
553
+ input_data_format=input_data_format,
554
+ )
555
+ padded_image = self._pad_for_patching(
556
+ resized_image, best_resolution, input_data_format=input_data_format
557
+ )
558
+ local_grids = divide_to_grids(
559
+ padded_image, grid_size=grid_size, input_data_format=input_data_format
560
+ )
561
+
562
+ local_grids = [
563
+ to_channel_dimension_format(
564
+ grid, channel_dim=data_format, input_channel_dim=input_data_format
565
+ )
566
+ for grid in local_grids
567
+ ]
568
+
569
+ return local_grids
570
+
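For orientation, here is a minimal, self-contained sketch of the grid-selection arithmetic that `get_image_grids` relies on. `pick_best_resolution` is a simplified stand-in for the `select_best_resolution` helper defined elsewhere in this file, and the crop size and candidate resolutions below are made-up values, not the repo's defaults.

```python
def pick_best_resolution(image_size, possible_resolutions):
    """Pick the candidate (h, w) that keeps the most usable pixels after an
    aspect-preserving resize (simplified stand-in for select_best_resolution)."""
    orig_h, orig_w = image_size
    best, best_key = None, None
    for h, w in possible_resolutions:
        scale = min(w / orig_w, h / orig_h)
        effective = int(orig_w * scale) * int(orig_h * scale)  # pixels kept
        wasted = h * w - effective                             # padding pixels
        key = (effective, -wasted)
        if best_key is None or key > best_key:
            best_key, best = key, (h, w)
    return best

crop = 336                                              # stands in for crop_size["height"]
candidates = [(336, 672), (672, 336), (672, 672), (336, 1008)]
best = pick_best_resolution((480, 960), candidates)     # raw image is 480 x 960 (H x W)
print(best, (best[0] // crop) * (best[1] // crop))      # (336, 672) -> 1 x 2 = 2 local grids
```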
571
+ def preprocess(
572
+ self,
573
+ images: ImageInput,
574
+ do_resize: bool = None,
575
+ size: Dict[str, int] = None,
576
+ anyres: bool = None,
577
+ unpad: bool = None,
578
+ video: bool = None,
579
+ num_queries_vis_abstractor: int = None,
580
+ possible_resolutions: List = None,
581
+ patch_size: int = None,
582
+ pad_to_square: bool = None,
583
+ resample: PILImageResampling = None,
584
+ do_center_crop: bool = None,
585
+ crop_size: int = None,
586
+ do_rescale: bool = None,
587
+ rescale_factor: float = None,
588
+ do_normalize: bool = None,
589
+ image_mean: Optional[Union[float, List[float]]] = None,
590
+ image_std: Optional[Union[float, List[float]]] = None,
591
+ do_convert_rgb: bool = None,
592
+ return_tensors: Optional[Union[str, TensorType]] = None,
593
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
594
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
595
+ return_dummy_image: bool = False,
596
+ num_queries_vis_abstractor_slow: int = 0,
597
+ first_last_frames_slow: bool = False,
598
+ is_first_or_last_frames: bool = False,
599
+ ):
600
+ """
601
+ HCXVisionImageProcessor 로 image tensor, original image size (width, height), visual tokens
602
+
603
+ :return pixel_values: List of 4D tensor 로 image tensor
604
+ :return image_sizes: List of Dict 로 image width, height [{"width": image 1 의 width, "height": image 1 의 height}, {"width": image 2 의 width, "height": image 2 의 height}, ...]
605
+ :return vision_query_lengths: List of int 로 각 image 가 LLM 입력으로 전달될때 변환되는 visual token 수
606
+ """
607
+ do_resize = do_resize if do_resize is not None else self.do_resize
608
+ size = size if size is not None else self.size
609
+ size = get_size_dict(size, param_name="size", default_to_square=False)
610
+ anyres = anyres if anyres is not None else self.anyres
611
+ unpad = unpad if unpad is not None else self.unpad
612
+ if video:
613
+ unpad = False
614
+ num_queries_vis_abstractor = (
615
+ num_queries_vis_abstractor
616
+ if num_queries_vis_abstractor is not None
617
+ else self.num_queries_vis_abstractor
618
+ )
619
+ possible_resolutions = (
620
+ possible_resolutions
621
+ if possible_resolutions is not None
622
+ else self.possible_resolutions
623
+ )
624
+ patch_size = patch_size if patch_size is not None else self.patch_size
625
+ pad_to_square = (
626
+ pad_to_square if pad_to_square is not None else self.pad_to_square
627
+ )
628
+ resample = resample if resample is not None else self.resample
629
+ do_center_crop = (
630
+ do_center_crop if do_center_crop is not None else self.do_center_crop
631
+ )
632
+ crop_size = crop_size if crop_size is not None else self.crop_size
633
+ crop_size = get_size_dict(
634
+ crop_size, param_name="crop_size", default_to_square=True
635
+ )
636
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
637
+ rescale_factor = (
638
+ rescale_factor if rescale_factor is not None else self.rescale_factor
639
+ )
640
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
641
+ image_mean = image_mean if image_mean is not None else self.image_mean
642
+ image_std = image_std if image_std is not None else self.image_std
643
+ do_convert_rgb = (
644
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
645
+ )
646
+
647
+ if return_dummy_image:
648
+ images = Image.new("RGB", (224, 224), (0, 0, 0))
649
+
650
+ images = make_list_of_images(images)
651
+
652
+ if not valid_images(images):
653
+ raise ValueError(
654
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
655
+ "torch.Tensor, tf.Tensor or jax.ndarray."
656
+ )
657
+
658
+ if do_convert_rgb:
659
+ images = [convert_to_rgb(image) for image in images]
660
+
661
+ images = [to_numpy_array(image) for image in images]
662
+
663
+ if is_scaled_image(images[0]) and do_rescale:
664
+ logger.warning_once(
665
+ "It looks like you are trying to rescale already rescaled images. If the input"
666
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
667
+ )
668
+
669
+ if input_data_format is None:
670
+ input_data_format = infer_channel_dimension_format(images[0])
671
+
672
+ new_images = []
673
+ image_sizes = [
674
+ get_image_size(image, channel_dim=input_data_format) for image in images
675
+ ]
676
+ vision_query_lengths = []
677
+
678
+ assert crop_size["height"] == crop_size["width"]
679
+
680
+ if anyres:
681
+ anyres_global_images = copy.deepcopy(images)
682
+ if pad_to_square:
683
+ background_color = tuple(int(x * 255) for x in self.image_mean)
684
+ anyres_global_images = [
685
+ resize_longside(
686
+ copy.deepcopy(image),
687
+ size["shortest_edge"],
688
+ resample,
689
+ input_data_format,
690
+ )
691
+ for image in anyres_global_images
692
+ ]
693
+ anyres_global_images = [
694
+ expand2square(
695
+ image,
696
+ background_color=background_color,
697
+ input_data_format=input_data_format,
698
+ )[0]
699
+ for image in anyres_global_images
700
+ ]
701
+ else:
702
+ anyres_global_images = [
703
+ self.resize(
704
+ image=image,
705
+ size={
706
+ "height": size["shortest_edge"],
707
+ "width": size["shortest_edge"],
708
+ },
709
+ resample=resample,
710
+ input_data_format=input_data_format,
711
+ )
712
+ for image in anyres_global_images
713
+ ]
714
+ else:
715
+ anyres_global_images = [None for _ in range(len(images))]
716
+ if pad_to_square:
717
+ background_color = tuple(int(x * 255) for x in self.image_mean)
718
+ images = [
719
+ resize_longside(
720
+ image, size["shortest_edge"], resample, input_data_format
721
+ )
722
+ for image in images
723
+ ]
724
+ images = [
725
+ expand2square(
726
+ image,
727
+ background_color=background_color,
728
+ input_data_format=input_data_format,
729
+ )[0]
730
+ for image in images
731
+ ]
732
+
733
+ for image, anyres_global_image, image_size in zip(
734
+ images, anyres_global_images, image_sizes
735
+ ):
736
+ if anyres:
737
+ image_grids = self.get_image_grids(
738
+ image,
739
+ possible_resolutions,
740
+ grid_size=crop_size["height"],
741
+ resample=resample,
742
+ data_format=input_data_format,
743
+ input_data_format=input_data_format,
744
+ )
745
+ if not video:
746
+ image_grids = [anyres_global_image] + image_grids
747
+ else:
748
+ image_grids = [image]
749
+
750
+ pixel_values = self._preprocess(
751
+ image_grids,
752
+ do_resize=do_resize,
753
+ size=size,
754
+ resample=resample,
755
+ do_center_crop=do_center_crop,
756
+ crop_size=crop_size,
757
+ do_rescale=do_rescale,
758
+ rescale_factor=rescale_factor,
759
+ do_normalize=do_normalize,
760
+ image_mean=image_mean,
761
+ image_std=image_std,
762
+ data_format=data_format,
763
+ input_data_format=input_data_format,
764
+ )
765
+
766
+ pixel_values = np.array(pixel_values)
767
+ new_images.append(pixel_values)
768
+
769
+ num_grids = pixel_values.shape[0]
770
+
771
+ vision_query_length = determine_anyres_num_vision_patches(
772
+ num_grids=num_grids,
773
+ image_size=image_size,
774
+ grid_size=crop_size["height"],
775
+ patch_size=patch_size,
776
+ possible_resolutions=possible_resolutions,
777
+ anyres=anyres,
778
+ unpad=unpad,
779
+ num_queries_vis_abstractor=num_queries_vis_abstractor,
780
+ num_queries_vis_abstractor_slow=num_queries_vis_abstractor_slow,
781
+ video=video,
782
+ first_last_frames_slow=first_last_frames_slow,
783
+ is_first_or_last_frames=is_first_or_last_frames,
784
+ )
785
+
786
+ vision_query_lengths.append(vision_query_length)
787
+
788
+ if return_dummy_image:
789
+ vision_query_lengths = []
790
+
791
+ data = {
792
+ "pixel_values": [torch.tensor(new_image) for new_image in new_images],
793
+ "image_sizes": [
794
+ {"width": image_size[1], "height": image_size[0]}
795
+ for image_size in image_sizes
796
+ ],
797
+ "vision_query_lengths": vision_query_lengths,
798
+ }
799
+
800
+ return BatchFeature(data=data)
801
+
802
+ def save_pretrained(
803
+ self,
804
+ save_directory: Union[str, os.PathLike],
805
+ *args,
806
+ **kwargs,
807
+ ):
808
+ self.register_for_auto_class()
809
+ super().save_pretrained(save_directory, *args, **kwargs)
810
+
811
+
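A hedged usage sketch of the image processor above, assuming the repo's preprocessor_config.json maps `AutoImageProcessor` to this class; `"path/to/this/repo"` is a placeholder for a local checkout or the hub id, and the exact tensor shapes depend on the configured crop size and `possible_resolutions`.

```python
from PIL import Image
from transformers import AutoImageProcessor

# trust_remote_code is needed because the class lives in this repo, not in transformers.
proc = AutoImageProcessor.from_pretrained("path/to/this/repo", trust_remote_code=True)
out = proc.preprocess(Image.new("RGB", (640, 480)))

print(list(out.keys()))               # ['pixel_values', 'image_sizes', 'vision_query_lengths']
print(out["pixel_values"][0].shape)   # roughly (num_grids, 3, crop_h, crop_w) for the first image
print(out["image_sizes"])             # [{'width': 640, 'height': 480}]
print(out["vision_query_lengths"])    # visual-token budget per image for the LLM
```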
812
+ class HCXVisionV2Processor(Qwen2_5_VLProcessor):
813
+ attributes = ["image_processor", "tokenizer", "video_processor"]
814
+ image_processor_class = "AutoImageProcessor"
815
+ video_processor_class = "AutoVideoProcessor"
816
+ tokenizer_class = (
817
+ "GPT2Tokenizer",
818
+ "GPT2TokenizerFast",
819
+ "PreTrainedTokenizer",
820
+ "PreTrainedTokenizerFast",
821
+ )
822
+
823
+ def __init__(
824
+ self,
825
+ image_processor=None,
826
+ tokenizer=None,
827
+ video_processor=None,
828
+ chat_template=None,
829
+ **kwargs,
830
+ ):
831
+ self.tokenizer = tokenizer
832
+ super().__init__(
833
+ image_processor,
834
+ tokenizer,
835
+ video_processor,
836
+ chat_template=self.tokenizer.chat_template,
837
+ )
838
+
839
+ def save_pretrained(
840
+ self,
841
+ save_directory: Union[str, os.PathLike],
842
+ *args,
843
+ **kwargs,
844
+ ):
845
+ self.register_for_auto_class()
846
+ super().save_pretrained(save_directory, *args, **kwargs)
847
+
848
+ def __call__(
849
+ self,
850
+ images: ImageInput = None,
851
+ text: Union[
852
+ TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
853
+ ] = None,
854
+ videos: VideoInput = None,
855
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
856
+ ) -> BatchFeature:
857
+ """
858
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
859
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
860
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
861
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
862
+
863
+ Args:
864
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
865
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
866
+ tensor. Both channels-first and channels-last formats are supported.
867
+ text (`str`, `list[str]`, `list[list[str]]`):
868
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
869
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
870
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
871
+ videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
872
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
873
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
874
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
875
+ If set, will return tensors of a particular framework. Acceptable values are:
876
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
877
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
878
+ - `'np'`: Return NumPy `np.ndarray` objects.
879
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
880
+
881
+ Returns:
882
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
883
+
884
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
885
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
886
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
887
+ `None`).
888
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
889
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
890
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
891
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
892
+ """
893
+ output_kwargs = self._merge_kwargs(
894
+ Qwen2_5_VLProcessorKwargs,
895
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
896
+ **kwargs,
897
+ )
898
+
899
+ image_inputs = videos_inputs = {}
900
+ if images is not None:
901
+ image_inputs = self.image_processor(
902
+ images=images, **output_kwargs["images_kwargs"]
903
+ )
904
+ image_grid_thw = image_inputs["image_grid_thw"]
905
+
906
+ if videos is not None:
907
+ videos_inputs = self.video_processor(
908
+ videos=videos, **output_kwargs["videos_kwargs"]
909
+ )
910
+ video_grid_thw = videos_inputs["video_grid_thw"]
911
+
912
+ if not isinstance(text, list):
913
+ text = [text]
914
+
915
+ text = text.copy()
916
+
917
+ if images is not None:
918
+ merge_length = self.image_processor.merge_size**2
919
+ index = 0
920
+ for i in range(len(text)):
921
+ while self.image_token in text[i]:
922
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
923
+ text[i] = text[i].replace(
924
+ self.image_token, "<|placeholder|>" * num_image_tokens, 1
925
+ )
926
+ text[i] = text[i].replace(
927
+ '{"resolution": [w, h]}',
928
+ '{"resolution": ' + str(list(images[i].size)) + "}",
929
+ )
930
+ index += 1
931
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
932
+
933
+ if videos is not None:
934
+ merge_length = self.video_processor.merge_size**2
935
+ index = 0
936
+ for i in range(len(text)):
937
+ while self.video_token in text[i]:
938
+ num_video_tokens = video_grid_thw[index].prod() // merge_length
939
+ text[i] = text[i].replace(
940
+ self.video_token, "<|placeholder|>" * num_video_tokens, 1
941
+ )
942
+ index += 1
943
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
944
+
945
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
946
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop(
947
+ "return_mm_token_type_ids", False
948
+ )
949
+ text_inputs = self.tokenizer(
950
+ text, **output_kwargs["text_kwargs"], return_tensors=None
951
+ )
952
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
953
+
954
+ if return_mm_token_type_ids:
955
+ array_ids = np.array(text_inputs["input_ids"])
956
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
957
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
958
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
959
+
960
+ return BatchFeature(
961
+ data={**text_inputs, **image_inputs, **videos_inputs},
962
+ tensor_type=return_tensors,
963
+ )
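The placeholder expansion in `__call__` reduces to simple arithmetic: each image contributes `grid_t * grid_h * grid_w` vision patches, and `merge_size**2` patches collapse into one image token in the text. The grid values below are made up, not output from the real processor.

```python
merge_size = 2
grid_t, grid_h, grid_w = 1, 54, 74                      # one image_grid_thw entry
num_image_tokens = (grid_t * grid_h * grid_w) // merge_size**2
print(num_image_tokens)                                 # 3996 // 4 = 999 placeholder tokens
```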
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
4
+ },
5
+ "processor_class": "HCXVisionV2Processor"
6
+ }
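A hedged sketch of what this `auto_map` entry enables: with `trust_remote_code=True`, `AutoProcessor` resolves `HCXVisionV2Processor` from processing_vlm.py in the repo. `"path/to/this/repo"` is a placeholder for a local checkout or the hub id.

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/this/repo", trust_remote_code=True)
print(type(processor).__name__)   # HCXVisionV2Processor
```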
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "eos_token": {
3
+ "content": "<|im_end|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "sep_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
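A hedged check of the mapping above via the tokenizer: the chat end marker doubles as both EOS and PAD, while `<|endoftext|>` serves as SEP/UNK. The path is a placeholder and the repo's tokenizer files must be present.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)
print(tok.eos_token, tok.pad_token)   # <|im_end|> <|im_end|>
print(tok.sep_token, tok.unk_token)   # <|endoftext|> <|endoftext|>
```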
ta_tok.py ADDED
@@ -0,0 +1,379 @@
1
+ import copy
2
+ import inspect
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from torchvision.transforms import Resize
9
+ from transformers import AutoConfig, AutoModel, Siglip2VisionConfig, Siglip2VisionModel
10
+
11
+
12
+ def models_make(model_spec, args=None, load_sd=False) -> torch.nn.Module:
13
+ if args is not None:
14
+ model_args = copy.deepcopy(model_spec["args"])
15
+ model_args.update(args)
16
+ else:
17
+ model_args = model_spec["args"]
18
+ model_params = inspect.signature(models[model_spec["name"]]).parameters
19
+ if "kwargs" not in model_params:
20
+ model_args = {k: v for k, v in model_args.items() if k in model_params}
21
+ model = models[model_spec["name"]](**model_args)
22
+ if load_sd:
23
+ if (
24
+ ("abs_pe" in model_spec["sd"])
25
+ and hasattr(model, "abs_pe")
26
+ and model_spec["sd"]["abs_pe"].shape != model.abs_pe.shape
27
+ ):
28
+ del model_spec["sd"]["abs_pe"]
29
+ msg = model.load_state_dict(model_spec["sd"], strict=False)
30
+ print(msg)
31
+ return model
32
+
33
+
34
+ class Bottleneck(nn.Module):
35
+ def __init__(
36
+ self,
37
+ bottleneck_dim: int,
38
+ input_dim: int,
39
+ output_dim: int,
40
+ token_nums: int,
41
+ regularizer=None,
42
+ **kwargs,
43
+ ):
44
+ super().__init__()
45
+ self.token_nums = token_nums
46
+ self.input_dim = input_dim
47
+ self.output_dim = output_dim
48
+ if bottleneck_dim > 0:
49
+ self.bottleneck_dim = bottleneck_dim
50
+ else:
51
+ assert (
52
+ self.input_dim == self.output_dim
53
+ ), "input_dim and output_dim must be the same when bottleneck_dim is not specified"
54
+ self.bottleneck_dim = self.input_dim
55
+
56
+ self.project_dim = self.bottleneck_dim
57
+
58
+ if self.bottleneck_dim > 0:
59
+ self.in_linear = nn.Linear(self.input_dim, self.project_dim)
60
+ self.out_linear = nn.Linear(self.bottleneck_dim, self.output_dim)
61
+ else:
62
+ self.in_linear = self.out_linear = lambda x: x
63
+
64
+ regularizer["args"]["dim"] = self.bottleneck_dim
65
+ regularizer["args"]["token_nums"] = self.token_nums
66
+ self.regularizer = models_make(regularizer)
67
+
68
+ def project_in(self, x):
69
+ assert len(x.shape) == 3, "Input shape must be (batch, n_tokens, e_dim)"
70
+ z = self.in_linear(x)
71
+ return z
72
+
73
+ def project_out(self, z_cat):
74
+ z = self.out_linear(z_cat)
75
+ return z
76
+
77
+ def decode(self, bottleneck_rep):
78
+ regularized_z = self.regularizer.decode(bottleneck_rep)
79
+ return self.project_out(regularized_z)
80
+
81
+ def forward(self, x):
82
+ z = self.project_in(x)
83
+ projected_z = z
84
+ regularized_output = self.regularizer(z)
85
+ x_hat = self.project_out(regularized_output["regularized_z"])
86
+ bottleneck_rep = regularized_output.pop("bottleneck_rep")
87
+ return {
88
+ "output": x_hat,
89
+ "bottleneck_rep": bottleneck_rep,
90
+ "projected_z": projected_z,
91
+ **regularized_output,
92
+ }
93
+
94
+
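A hedged sketch of how `Bottleneck` composes with the registry defined further down in this file: `models_make` looks the regularizer up by name, so a SimVQ codebook sits between `project_in` and `project_out`. It assumes ta_tok.py is importable; the sizes are illustrative.

```python
import torch

bn = Bottleneck(
    bottleneck_dim=16, input_dim=32, output_dim=32, token_nums=256,
    regularizer={"name": "simvq", "args": {"codebook_size": 64, "l2_normalized": True}},
)
out = bn(torch.randn(2, 256, 32))       # (batch, n_tokens, input_dim)
print(out["output"].shape)              # torch.Size([2, 256, 32]) reconstructed features
print(out["bottleneck_rep"].shape)      # torch.Size([2, 256]) discrete code indices
```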
95
+ class SimVectorQuantizer(nn.Module):
96
+ def __init__(
97
+ self,
98
+ dim,
99
+ codebook_size,
100
+ l2_normalized=False,
101
+ same_index_shape=True,
102
+ stochastic=False,
103
+ stochastic_temperature=1.0,
104
+ **kwargs,
105
+ ):
106
+ super().__init__()
107
+ self.codebook_size = codebook_size
108
+ self.dim = dim
109
+ assert isinstance(l2_normalized, bool)
110
+ self.l2_normalized = l2_normalized
111
+ self.stochastic = stochastic
112
+ self.eval_deterministic = False
113
+ self.default_stochastic_temperature = stochastic_temperature
114
+
115
+ if self.stochastic:
116
+ if stochastic_temperature > 0:
117
+ self.stochastic_temperature_inv = 1 / stochastic_temperature
118
+ else:
119
+ self.stochastic_temperature_inv = nn.Parameter(torch.tensor(10.0))
120
+
121
+ self.embedding = nn.Embedding(self.codebook_size, self.dim)
122
+ self.embedding_proj = nn.Linear(self.dim, self.dim)
123
+
124
+ self.same_index_shape = same_index_shape
125
+
126
+ def set_eval_deterministic(self, deterministic=True):
127
+ self.eval_deterministic = deterministic
128
+
129
+ def set_stochastic_temperature(self, temperature):
130
+ self.stochastic_temperature_inv = 1 / temperature
131
+
132
+ @torch.autocast(device_type="cuda", enabled=False)
133
+ def get_emb(self):
134
+ emb = self.embedding_proj(self.embedding.weight)
135
+ if self.l2_normalized:
136
+ emb = F.normalize(emb, p=2, dim=-1)
137
+ return emb
138
+
139
+ @torch.autocast(device_type="cuda", enabled=False)
140
+ def forward(self, z):
141
+ emb = self.get_emb()
142
+ z = z.to(emb)
143
+ assert len(z.shape) == 3, "Input shape must be (batch, n_tokens, e_dim)"
144
+ if self.l2_normalized:
145
+ z = F.normalize(z, p=2, dim=-1)
146
+
147
+ z_flattened = rearrange(z, "b n d -> (b n) d")
148
+
149
+ if self.stochastic:
150
+ assert self.l2_normalized, "Stochastic sampling requires l2 normalization"
151
+ cos_sim = torch.einsum("bd,nd->bn", z_flattened, emb)
152
+ probs = F.softmax(cos_sim * self.stochastic_temperature_inv, dim=-1)
153
+ if self.eval_deterministic and not self.training:
154
+ q_indices = torch.argmax(probs, dim=-1)
155
+ else:
156
+ q_indices = torch.multinomial(probs, 1).squeeze(-1)
157
+ else:
158
+ d = (
159
+ torch.sum(z_flattened**2, dim=1, keepdim=True)
160
+ + torch.sum(emb**2, dim=1)
161
+ - 2
162
+ * torch.einsum("bd,dn->bn", z_flattened, rearrange(emb, "n d -> d n"))
163
+ )
164
+ q_indices = torch.argmin(d, dim=1)
165
+
166
+ quantized = F.embedding(
167
+ q_indices,
168
+ emb,
169
+ self.embedding.padding_idx,
170
+ self.embedding.max_norm,
171
+ self.embedding.norm_type,
172
+ self.embedding.scale_grad_by_freq,
173
+ self.embedding.sparse,
174
+ ).view(z.shape)
175
+
176
+ quantized = z + (quantized - z).detach()
177
+
178
+ if self.same_index_shape:
179
+ q_indices = q_indices.reshape(quantized.shape[0], quantized.shape[1])
180
+
181
+ return_dict = {
182
+ "unregularized_z": z,
183
+ "emb": emb,
184
+ "regularized_z": quantized,
185
+ "bottleneck_rep": q_indices,
186
+ }
187
+ return return_dict
188
+
189
+ def get_codebook_entry(self, indices, shape=None):
190
+ indices_shape = indices.shape
191
+ indices_flatten = rearrange(indices, "... -> (...)")
192
+
193
+ emb = self.get_emb()
194
+ z_q = F.embedding(indices_flatten, emb)
195
+ if self.l2_normalized:
196
+ z_q = F.normalize(z_q, p=2, dim=-1)
197
+
198
+ if shape is not None:
199
+ z_q = z_q.reshape(shape)
200
+ else:
201
+ z_q = z_q.reshape([*indices_shape, self.dim])
202
+ return z_q
203
+
204
+ def decode(self, indices):
205
+ return self.get_codebook_entry(indices)
206
+
207
+
208
+ models = {"simvq": SimVectorQuantizer, "bottleneck": Bottleneck}
209
+
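The `models` dict above is the lookup table `models_make` uses. The core trick inside `SimVectorQuantizer.forward` is the straight-through line `quantized = z + (quantized - z).detach()`; the standalone snippet below illustrates just that behaviour with a toy codebook.

```python
import torch

z = torch.randn(4, 8, requires_grad=True)     # continuous token features
emb = torch.randn(16, 8)                      # a small codebook
d = (z**2).sum(1, keepdim=True) + (emb**2).sum(1) - 2 * z @ emb.T
idx = d.argmin(dim=1)                         # nearest code per token
quantized = emb[idx]                          # hard selection (non-differentiable)
st = z + (quantized - z).detach()             # forward value equals the code...
st.sum().backward()                           # ...but the gradient reaches z
print(torch.allclose(st, quantized))          # True
print(z.grad.abs().sum() > 0)                 # tensor(True)
```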
210
+
211
+ class ScalingLayer(nn.Module):
212
+ def __init__(self, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]):
213
+ super().__init__()
214
+ self.register_buffer("shift", torch.Tensor(mean)[None, :, None, None])
215
+ self.register_buffer("scale", torch.Tensor(std)[None, :, None, None])
216
+
217
+ def forward(self, inp):
218
+ return (inp - self.shift) / self.scale
219
+
220
+ def inv(self, inp):
221
+ return inp * self.scale + self.shift
222
+
223
+
224
+ class TextAlignedTokenizer(nn.Module):
225
+ def __init__(
226
+ self,
227
+ bottleneck,
228
+ bottleneck_token_num=256,
229
+ input_size=384,
230
+ teacher="google/siglip2-so400m-patch14-384",
231
+ input_type="quant",
232
+ pool_scale=1,
233
+ decoder_depth=3,
234
+ select_layer_id=-2,
235
+ *args,
236
+ **kwargs,
237
+ ):
238
+ super().__init__()
239
+ self.input_size = input_size
240
+ self.bottleneck_token_num = bottleneck_token_num
241
+ self.teacher = teacher
242
+ self.input_type = input_type
243
+ self.pool_scale = pool_scale
244
+ self.decoder_depth = decoder_depth
245
+ self.select_layer_id = select_layer_id
246
+
247
+ self.bottleneck_dim = bottleneck["args"]["bottleneck_dim"]
248
+
249
+ self.encoder_config = AutoConfig.from_pretrained(teacher)
250
+ self.encoder = AutoModel.from_config(self.encoder_config).vision_model
251
+
252
+ self.encoder_hidden_dim = self.encoder.config.hidden_size
253
+
254
+ self.decoder_config = Siglip2VisionConfig()
255
+ self.decoder_config.update(
256
+ {
257
+ "patch_size": 1,
258
+ "num_hidden_layers": self.decoder_depth,
259
+ "num_channels": self.bottleneck_dim,
260
+ "hidden_size": self.encoder_hidden_dim,
261
+ }
262
+ )
263
+ self.decoder = Siglip2VisionModel(self.decoder_config)
264
+
265
+ self.encode_task_layer = nn.Sequential(
266
+ nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim), nn.Tanh()
267
+ )
268
+ self.decode_task_layer = nn.Sequential(
269
+ nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim),
270
+ nn.Tanh(),
271
+ nn.Linear(self.encoder_hidden_dim, self.encoder_hidden_dim),
272
+ )
273
+
274
+ bottleneck_args = {
275
+ "token_nums": self.bottleneck_token_num,
276
+ "input_dim": self.encoder_hidden_dim,
277
+ "output_dim": self.bottleneck_dim,
278
+ }
279
+ self.bottleneck = models_make(bottleneck, args=bottleneck_args)
280
+
281
+ self.scale_layer = ScalingLayer(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
282
+ self.image_resize = Resize((self.input_size, self.input_size))
283
+
284
+ def set_vq_eval_deterministic(self, deterministic=True):
285
+ self.bottleneck.regularizer.set_eval_deterministic(deterministic)
286
+
287
+ @property
288
+ def device(self):
289
+ return next(self.parameters()).device
290
+
291
+ @property
292
+ def dtype(self):
293
+ return next(self.parameters()).dtype
294
+
295
+ @classmethod
296
+ def from_checkpoint(cls, ckpt, load_teacher=True, **kwargs):
297
+ ckpt = torch.load(ckpt, map_location="cpu", weights_only=False)
298
+ ckpt_kwargs = ckpt["model"]["args"]
299
+ print(ckpt_kwargs)
300
+ model = cls(**kwargs, **ckpt_kwargs)
301
+ sd = ckpt["model"]["sd"]
302
+ if not load_teacher:
303
+ sd = {k: v for k, v in sd.items() if not k.startswith("teacher")}
304
+ model.load_state_dict(sd, strict=True)
305
+ return model
306
+
307
+ def encode(self, x, **kwargs):
308
+ if x.ndim == 5:
309
+ x = rearrange(x, "b c t h w -> (b t) c h w")
310
+ x = self.scale_layer(x)
311
+ if tuple(x.shape[-2:]) != (self.input_size, self.input_size):
312
+ x = self.image_resize(x)
313
+ vq_feats = self.encoder(x, output_hidden_states=True).hidden_states[
314
+ self.select_layer_id
315
+ ]
316
+
317
+ pool_scale = self.pool_scale
318
+ pool_scale = kwargs.get("pool_scale", pool_scale)
319
+ if pool_scale != 1:
320
+ vq_feats = self.avg_pool(vq_feats, pool_scale)
321
+ vq_feats = self.encode_task_layer(vq_feats.to(x))
322
+
323
+ bottleneck_out = self.bottleneck(vq_feats)
324
+ z = bottleneck_out.pop("output")
325
+
326
+ return {
327
+ "encoded": z,
328
+ "pool_scale": pool_scale,
329
+ "vq_feats": vq_feats,
330
+ **bottleneck_out,
331
+ }
332
+
333
+ def avg_pool(self, z, pool_scale=1):
334
+ if z.ndim == 3:
335
+ b, n, c = z.shape
336
+ p = int(n**0.5)
337
+ z = rearrange(z, "b (p1 p2) c -> b c p1 p2", p1=p, p2=p)
338
+ else:
339
+ b, c, p, _ = z.shape
340
+ p_s = int(p // pool_scale)
341
+ z = F.avg_pool2d(
342
+ z, kernel_size=(pool_scale, pool_scale), stride=(pool_scale, pool_scale)
343
+ ).contiguous()
344
+ z = rearrange(z, "b c p1 p2 -> b (p1 p2) c")
345
+ return z
346
+
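Standalone arithmetic mirroring `avg_pool` above: a square token grid shrinks by `pool_scale**2`. The token count and channel width are illustrative.

```python
import math
import torch

z = torch.randn(1, 729, 16)                             # 27 x 27 = 729 tokens
p = math.isqrt(z.shape[1])
grid = z.transpose(1, 2).reshape(1, 16, p, p)           # (b, c, 27, 27)
pooled = torch.nn.functional.avg_pool2d(grid, 3, 3)     # pool_scale = 3
print(pooled.flatten(2).transpose(1, 2).shape)          # torch.Size([1, 81, 16])
```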
347
+ def decode(self, z):
348
+ if z.ndim == 4:
349
+ z = rearrange(z, "b c p1 p2 -> b (p1 p2) c")
350
+ attention_mask = torch.ones(z.shape[:2], dtype=torch.int, device=z.device)
351
+ p = int(z.shape[1] ** 0.5)
352
+ spatial_shape = torch.tensor([[p, p]] * z.shape[0], device=self.device)
353
+ z = self.decoder(
354
+ z, attention_mask, spatial_shape, output_hidden_states=True
355
+ ).last_hidden_state
356
+ z = self.decode_task_layer(z)
357
+ return z
358
+
359
+ def decode_from_bottleneck(self, bottleneck_rep):
360
+ z = self.bottleneck.decode(bottleneck_rep)
361
+ p = int(z.shape[1] ** 0.5)
362
+ z = rearrange(z, "b (p1 p2) c -> b c p1 p2", p1=p, p2=p)
363
+ return self.decode(z)
364
+
365
+ def forward(self, data, **kwargs):
366
+ encode_output = self.encode(data, **kwargs)
367
+ vq_feats = encode_output["encoded"]
368
+ p = int(vq_feats.shape[1] ** 0.5)
369
+ vq_feats = rearrange(vq_feats, "b (h w) c -> b c h w", h=p, w=p)
370
+ pred_feats = self.decode(vq_feats)
371
+
372
+ if self.input_type == "quant":
373
+ z = encode_output["regularized_z"]
374
+ elif self.input_type == "indices":
375
+ z = encode_output["bottleneck_rep"]
376
+ elif self.input_type == "rec":
377
+ z = pred_feats
378
+ encode_output["encoded"] = z
379
+ return encode_output
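A hedged end-to-end sketch of `TextAlignedTokenizer`: the checkpoint path is a placeholder, `from_checkpoint` takes its constructor kwargs from that file, and building the encoder pulls the SigLIP2 teacher config from the Hub, so this is only meant to show the intended flow from pixels to discrete tokens.

```python
import torch

tok = TextAlignedTokenizer.from_checkpoint("path/to/ta_tok.pth", load_teacher=False)
tok.eval()
tok.set_vq_eval_deterministic(True)                 # argmax codes instead of sampling
with torch.no_grad():
    out = tok.encode(torch.rand(1, 3, 384, 384))    # pixels in [0, 1]
print(out["bottleneck_rep"].shape)                  # discrete token ids, roughly (1, n_tokens)
```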
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:666f303c324b9b2e2e8f13950cd44a18896a6fc1a70aae70583a77663d0ebe31
3
+ size 23621510
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7bac0a38c3af8f44d4b3b23d536111c1493fea74d3e7e2a71d804f63dada55
3
+ size 13220225
video_preprocessor_config.json ADDED
@@ -0,0 +1,89 @@
1
+ {
2
+ "_valid_kwargs_names": [
3
+ "do_convert_rgb",
4
+ "do_resize",
5
+ "size",
6
+ "size_divisor",
7
+ "default_to_square",
8
+ "resample",
9
+ "do_rescale",
10
+ "rescale_factor",
11
+ "do_normalize",
12
+ "image_mean",
13
+ "image_std",
14
+ "do_pad",
15
+ "do_center_crop",
16
+ "crop_size",
17
+ "data_format",
18
+ "input_data_format",
19
+ "device",
20
+ "min_pixels",
21
+ "max_pixels",
22
+ "patch_size",
23
+ "temporal_patch_size",
24
+ "merge_size"
25
+ ],
26
+ "auto_map": {
27
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
28
+ },
29
+ "crop_size": null,
30
+ "data_format": "channels_first",
31
+ "default_to_square": true,
32
+ "device": null,
33
+ "do_center_crop": null,
34
+ "do_convert_rgb": true,
35
+ "do_normalize": true,
36
+ "do_pad": null,
37
+ "do_rescale": true,
38
+ "do_resize": true,
39
+ "image_mean": [
40
+ 0.48145466,
41
+ 0.4578275,
42
+ 0.40821073
43
+ ],
44
+ "image_processor_type": "Qwen2VLImageProcessor",
45
+ "image_std": [
46
+ 0.26862954,
47
+ 0.26130258,
48
+ 0.27577711
49
+ ],
50
+ "input_data_format": null,
51
+ "max_pixels": 12845056,
52
+ "merge_size": 2,
53
+ "min_pixels": 3136,
54
+ "model_valid_processing_keys": [
55
+ "do_convert_rgb",
56
+ "do_resize",
57
+ "size",
58
+ "size_divisor",
59
+ "default_to_square",
60
+ "resample",
61
+ "do_rescale",
62
+ "rescale_factor",
63
+ "do_normalize",
64
+ "image_mean",
65
+ "image_std",
66
+ "do_pad",
67
+ "do_center_crop",
68
+ "crop_size",
69
+ "data_format",
70
+ "input_data_format",
71
+ "device",
72
+ "min_pixels",
73
+ "max_pixels",
74
+ "patch_size",
75
+ "temporal_patch_size",
76
+ "merge_size"
77
+ ],
78
+ "patch_size": 14,
79
+ "processor_class": "HCXVisionV2Processor",
80
+ "resample": 3,
81
+ "rescale_factor": 0.00392156862745098,
82
+ "size": {
83
+ "longest_edge": 12845056,
84
+ "shortest_edge": 3136
85
+ },
86
+ "size_divisor": null,
87
+ "temporal_patch_size": 2,
88
+ "video_processor_type": "Qwen2VLVideoProcessor"
89
+ }
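Illustrative arithmetic connecting this config to the video-token expansion in processing_vlm.py: frames are grouped in pairs (`temporal_patch_size`), split into 14x14 patches, and merged 2x2 into LLM tokens, with `min_pixels`/`max_pixels` bounding the resized frame area. The frame count and resolution below are made up.

```python
patch, merge, temporal = 14, 2, 2                # patch_size, merge_size, temporal_patch_size
frames, h, w = 16, 448, 728                      # an illustrative resized clip
grid_t, grid_h, grid_w = frames // temporal, h // patch, w // patch
num_video_tokens = (grid_t * grid_h * grid_w) // merge**2
print(grid_t, grid_h, grid_w, num_video_tokens)  # 8 32 52 3328
```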