Upload folder using huggingface_hub
- asr_gradio/rwkv7-g1a-0.4b-20250905-ctx4096.pth +3 -0
- asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/config.json +50 -0
- asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/model.safetensors +3 -0
- asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/model_converted.pth +3 -0
- asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/projector1.pt +3 -0
- asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/projector2.pt +3 -0
- asr_gradio/utils/__pycache__/asr_inference_with_chatrwkv.cpython-311.pyc +0 -0
- asr_gradio/utils/asr_inference_with_chatrwkv.py +93 -27
asr_gradio/rwkv7-g1a-0.4b-20250905-ctx4096.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d852e99ef6c95726109660c64e7c51a8df30c53b0832a68645bfcd15253b3109
+size 901776757
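The payload above is a Git LFS pointer, not the checkpoint itself: the repository stores only the object's SHA-256 and byte size, while the ~900 MB weights live in LFS storage. A minimal sketch of pulling the resolved file with `huggingface_hub` (the repo id below is a placeholder; substitute the repository this commit belongs to):

```python
from huggingface_hub import hf_hub_download

# Hypothetical repo id -- replace with the actual repository.
local_path = hf_hub_download(
    repo_id="user/asr_gradio_demo",
    filename="asr_gradio/rwkv7-g1a-0.4b-20250905-ctx4096.pth",
)
print(local_path)  # local cache path of the resolved 901,776,757-byte checkpoint
```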
asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/config.json
ADDED
@@ -0,0 +1,50 @@
+{
+    "a_low_rank_dim": 64,
+    "architectures": [
+        "RWKV7ModelForLatentInputs"
+    ],
+    "attn": null,
+    "attn_mode": "chunk",
+    "bos_token_id": 0,
+    "decay_low_rank_dim": 64,
+    "eos_token_id": 0,
+    "fuse_cross_entropy": true,
+    "fuse_linear_cross_entropy": false,
+    "fuse_norm": false,
+    "gate_low_rank_dim": 128,
+    "head_dim": 64,
+    "hidden_act": "sqrelu",
+    "hidden_ratio": 4.0,
+    "hidden_size": 768,
+    "initializer_range": 0.006,
+    "intermediate_size": 3072,
+    "max_position_embeddings": 2048,
+    "model_type": "rwkv7",
+    "norm_bias": true,
+    "norm_eps": 1e-05,
+    "norm_first": true,
+    "num_heads": 32,
+    "num_hidden_layers": 12,
+    "tie_word_embeddings": false,
+    "torch_dtype": "float32",
+    "train_time_state": false,
+    "transformers_version": "4.52.4",
+    "use_cache": true,
+    "use_l2warp": true,
+    "v_low_rank_dim": 32,
+    "value_dim": [
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768,
+        768
+    ],
+    "vocab_size": 10
+}
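A quick sanity check on the shapes declared above, using only the standard library (this assumes the file has been downloaded locally at the path shown):

```python
import json

with open("asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/config.json") as f:
    cfg = json.load(f)

# One 768-dim value projection per layer: 12 entries for 12 hidden layers.
assert len(cfg["value_dim"]) == cfg["num_hidden_layers"] == 12
print(cfg["model_type"], cfg["hidden_size"], cfg["vocab_size"])
# A vocab_size of 10 is consistent with a model fed projected audio latents
# (per the RWKV7ModelForLatentInputs architecture name) rather than text tokens.
```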
asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ecf58ad41179876c07d341991b5bbfb2796d2669ebcb76657d7befe7b139934
+size 361523904
asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/model_converted.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dc29a8ee50e423522d8f0710f4942f7934a574889fd5b84b104b1bc2eb3139d
+size 361668745
asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/projector1.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d102133e65bb3da053d33c70d05083a43221b9e41ac7e37d759ddc98f909aa
+size 3937149
asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/projector2.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65febcb6484e4a5464d0df62fe088d4533e8dac2724e1c2fcabe59d60ac0b1b2
+size 3151741
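Each pointer also records the SHA-256 of the real object, so a downloaded file can be verified against it. A small sketch using the `projector2.pt` hash from the pointer above (the local path is a placeholder):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in 1 MiB chunks so large checkpoints never sit in memory whole."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "65febcb6484e4a5464d0df62fe088d4533e8dac2724e1c2fcabe59d60ac0b1b2"
assert sha256_of("asr_gradio/rwkv7_0.1b_audio_lm_latents_280k/projector2.pt") == expected
```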
asr_gradio/utils/__pycache__/asr_inference_with_chatrwkv.cpython-311.pyc
CHANGED
Binary files a/asr_gradio/utils/__pycache__/asr_inference_with_chatrwkv.cpython-311.pyc and b/asr_gradio/utils/__pycache__/asr_inference_with_chatrwkv.cpython-311.pyc differ
asr_gradio/utils/asr_inference_with_chatrwkv.py
CHANGED
@@ -15,6 +15,9 @@ import numpy as np
 import click
 import time
 import copy
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
 @dataclass
 class AsrModels:
     audio_llm: RWKV
@@ -24,6 +27,7 @@ class AsrModels:
     project2_linear: torch.nn.Linear
     llm: RWKV
     tokenizer: TRIE_TOKENIZER
+    thread_pool: ThreadPoolExecutor


 def forward_one_with_embeds(model :RWKV,embeds:torch.Tensor,state:List[torch.Tensor]):
@@ -107,6 +111,9 @@ def load_asr_models(audio_lm_path, llm_path,whisper_path,tokenizer_path,device,d
     project2_linear = torch.nn.Linear(project2['weight'].shape[1], project2['weight'].shape[0])
     project2_linear.load_state_dict(project2)
     tokenizer = TRIE_TOKENIZER(tokenizer_path)
+    # Create a persistent thread pool for inference tasks (max_workers left at the default)
+    thread_pool = ThreadPoolExecutor(thread_name_prefix="ASR-Inference")
+
     return AsrModels(
         audio_llm=audio_llm,
         whisper_feature_extractor=whisper_feature_extractor,
@@ -115,6 +122,7 @@ def load_asr_models(audio_lm_path, llm_path,whisper_path,tokenizer_path,device,d
         project2_linear=project2_linear.to(device=device,dtype=dtype),
         llm=llm,
         tokenizer=tokenizer,
+        thread_pool=thread_pool,
     )

 def calculate_perplexity(models, generated_tokens, dtype, device):
@@ -159,6 +167,46 @@ def calculate_perplexity(models, generated_tokens, dtype, device):

     return perplexity

+def single_inference_task(initial_logits, init_state, models, dtype, device, task_id):
+    """
+    One inference task, meant to be executed concurrently.
+
+    Args:
+        initial_logits: logits from the prefill pass
+        init_state: RWKV state from the prefill pass
+        models: the AsrModels bundle
+        dtype: data type
+        device: device
+        task_id: task id
+
+    Returns:
+        tuple: (results, perplexity)
+    """
+    start_time = time.time()
+    print(f"Task {task_id} started")
+
+    # Sample a candidate token sequence
+    next_token = sample_logits(initial_logits, top_k=10, top_p=0.6, temperature=0.6)
+    results = []
+    results.append(next_token)
+    state = copy.deepcopy(init_state)
+
+    while len(results) < 1024:
+        logits, state = models.llm.forward([next_token], state)
+        next_token = sample_logits(logits, top_k=10, top_p=0.6, temperature=0.6)
+        results.append(next_token)
+        if next_token == 0:
+            break
+
+    # Score the generated sequence by perplexity
+    print(f"Task {task_id} computing perplexity, sequence length: {len(results)}")
+    perplexity = calculate_perplexity(models, results, dtype, device)
+    print(f"Task {task_id} perplexity: {perplexity:.4f}")
+
+    end_time = time.time()
+    print(f"Task {task_id} elapsed: {end_time - start_time}")
+    return results, perplexity
+
 def sample_logits(logits, temperature=1.0, top_p=0.85, top_k=0):
     if temperature == 0:
         temperature = 1.0
@@ -231,7 +279,7 @@ def extract_audio_latents(models, audio_file_path,dtype):
     return projected_latents,audio_valid_length

 @torch.inference_mode()
-def inference_asr(models, audio_path, language,dtype,device,resample_count = 1):
+def inference_asr(models, audio_path, language, dtype, device, resample_count=1):
     if language == 'chinese':
         print(f'language: {language}')
         instruction = "User: 请将以下语音转写为中文。\n"
@@ -244,7 +292,7 @@ def inference_asr(models, audio_path, language,dtype,device,resample_count = 1):
     print(f'load audio from {audio_path}')
     audio_path = audio_path
     time_start = time.time()
-    audio_latents,audio_valid_length = extract_audio_latents(models, audio_path,dtype)
+    audio_latents, audio_valid_length = extract_audio_latents(models, audio_path, dtype)
     time_end = time.time()
     print(f'whisper time: {time_end - time_start}')
     time_start = time.time()
@@ -261,33 +309,47 @@ def inference_asr(models, audio_path, language,dtype,device,resample_count = 1):
     with torch.no_grad():
         audio_latents = F.layer_norm(audio_latents, (models.llm.n_embd,), weight=models.llm.z['blocks.0.ln0.weight'], bias=models.llm.z['blocks.0.ln0.bias'])  # do the first layer norm for embeddings input
         whole_input_embeds = torch.cat([instruction_input_embeds, audio_latents, hints_input_embeds], dim=0)
-    hidden_states,init_state = forward_seq_with_embeds(models.llm, whole_input_embeds, dtype, device, None, False)
+    hidden_states, init_state = forward_seq_with_embeds(models.llm, whole_input_embeds, dtype, device, None, False)
    time_end = time.time()
     print(f'prefill time: {time_end - time_start}')
     with torch.no_grad():
         initial_logits = hidden_states @ models.llm.z['head.weight']
+
+    # Run the resamples concurrently on the persistent ThreadPoolExecutor held by models
+    print(f"Starting concurrent inference")
     scored_results = []
-    [old lines 270-285 removed: the previous sequential resampling loop; contents not shown in this view]
-        scored_results.append((results, perplexity))
+
+    # Submit every resample task to the persistent thread pool
+    future_to_task = {
+        models.thread_pool.submit(single_inference_task, initial_logits, init_state, models, dtype, device, i): i
+        for i in range(resample_count)
+    }
+
+    # Collect all results
+    for future in future_to_task:
+        try:
+            results, perplexity = future.result()
+            scored_results.append((results, perplexity))
+        except Exception as exc:
+            task_id = future_to_task[future]
+            print(f'Task {task_id} raised an exception: {exc}')
+
     print(f'scored_results: {scored_results}')
     results, perplexity = min(scored_results, key=lambda x: x[1])
     return results[:-1], perplexity

+def cleanup_models(models):
+    """
+    Release model resources by shutting down the thread pool.
+
+    Args:
+        models: an AsrModels instance
+    """
+    if hasattr(models, 'thread_pool') and models.thread_pool:
+        print("Shutting down thread pool...")
+        models.thread_pool.shutdown(wait=True)
+        print("Thread pool shut down")
+
 @click.command()
 @click.option('--audio-lm-path', default="/home/yueyulin/models/rwkv7_0.1b_audio_lm_latents_1.5b_44k",
               help='Path to the audio language model')
@@ -325,13 +387,17 @@ def main(audio_lm_path, llm_path, whisper_path, audio_path, tokenizer_path, lang
     print(f'project1: {models.project1_linear}')
     print(f'project2: {models.project2_linear}')
     start_time = time.time()
-    [old lines 328-334 removed; contents not shown in this view]
+    try:
+        results, perplexity = inference_asr(models, audio_path, language, dtype, device, resample_count=3)
+        print(f'results: {results}')
+        print(f'decode results: {models.tokenizer.decode(results)}')
+        print(f'perplexity: {perplexity:.4f}')
+        end_time = time.time()
+        print(f'time: {end_time - start_time}')
+        return results, perplexity
+    finally:
+        # Make sure the thread pool is cleaned up on exit
+        cleanup_models(models)

 if __name__ == "__main__":
     main()
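The net effect of this change is a resample-and-rank decoding scheme: `resample_count` candidate transcriptions are sampled concurrently from the same prefilled state, each candidate is scored with `calculate_perplexity`, and the lowest-perplexity one is returned. A minimal self-contained sketch of the same pattern (the `generate` and `score` functions are toy placeholders, and `as_completed` is used where the diff iterates the futures dict in submission order):

```python
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

def generate(task_id: int) -> list:
    # Toy stand-in for sampling one candidate token sequence (0 = end token).
    return [random.randint(1, 9) for _ in range(10)] + [0]

def score(tokens: list) -> float:
    # Toy stand-in for perplexity; lower is better.
    return sum(tokens) / len(tokens)

with ThreadPoolExecutor(thread_name_prefix="ASR-Inference") as pool:
    futures = {pool.submit(generate, i): i for i in range(3)}
    scored = []
    for future in as_completed(futures):  # yields each future as it finishes
        tokens = future.result()
        scored.append((tokens, score(tokens)))

best_tokens, best_ppl = min(scored, key=lambda x: x[1])
print(best_tokens, best_ppl)
```

One design point worth noting: each task deep-copies the prefill state and runs a full RWKV forward, so real wall-clock speedup from threads depends on the backend releasing the GIL during those forwards, which GPU-bound PyTorch ops generally do.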
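For completeness, a hedged sketch of driving the updated module programmatically instead of through the click CLI. All paths are placeholders, and the exact positional order of `load_asr_models` should be checked against the file, since its signature is truncated in this view:

```python
import torch
from asr_gradio.utils.asr_inference_with_chatrwkv import (
    load_asr_models, inference_asr, cleanup_models,
)

# Placeholder paths -- substitute real checkpoint locations.
models = load_asr_models(
    "asr_gradio/rwkv7_0.1b_audio_lm_latents_280k",      # audio_lm_path
    "asr_gradio/rwkv7-g1a-0.4b-20250905-ctx4096.pth",   # llm_path
    "/path/to/whisper",                                 # whisper_path
    "/path/to/rwkv_vocab.txt",                          # tokenizer_path
    "cuda",                                             # device
    torch.bfloat16,                                     # dtype (assumed)
)
try:
    results, ppl = inference_asr(models, "sample.wav", "chinese",
                                 torch.bfloat16, "cuda", resample_count=3)
    print(models.tokenizer.decode(results), f"perplexity={ppl:.4f}")
finally:
    cleanup_models(models)  # shuts down the persistent thread pool
```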