"""
Qwen3-Omni GGUF usage examples

This script demonstrates how to use the GGUF-format Qwen3-Omni model for
various tasks, including through the Ollama API and by calling
llama-cpp-python directly.
"""

import time

import requests
from pathlib import Path

try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("⚠️ llama-cpp-python not installed. Install with: pip install llama-cpp-python")


class QwenGGUFRunner:
    """Runner for GGUF-format Qwen models."""

    def __init__(self, model_path: str = "qwen3_omni_quantized.gguf"):
        self.model_path = model_path
        self.llm = None

    def load_with_llama_cpp(self, **kwargs):
        """Load the model with llama-cpp-python."""
        if not LLAMA_CPP_AVAILABLE:
            raise ImportError("llama-cpp-python not available")

        default_params = {
            'n_gpu_layers': 35,   # layers to offload to the GPU
            'n_ctx': 4096,        # context window size
            'n_batch': 512,       # prompt-processing batch size
            'verbose': False,
            'n_threads': 8,
        }
        default_params.update(kwargs)

        print(f"🚀 Loading GGUF model: {self.model_path}")
        start_time = time.time()

        self.llm = Llama(model_path=self.model_path, **default_params)

        load_time = time.time() - start_time
        print(f"✅ Model loaded in {load_time:.2f}s")
        return self.llm

    def generate_with_llama_cpp(self, prompt: str, **kwargs) -> str:
        """Generate text with llama-cpp-python."""
        if not self.llm:
            raise ValueError("Model not loaded. Call load_with_llama_cpp() first.")

        default_params = {
            'max_tokens': 256,
            'temperature': 0.7,
            'top_p': 0.8,
            'top_k': 50,
            'repeat_penalty': 1.1,
            'stop': ["</s>", "<|endoftext|>"]
        }
        default_params.update(kwargs)

        print("💭 Generating response...")
        start_time = time.time()

        response = self.llm(prompt, **default_params)

        gen_time = time.time() - start_time
        # Whitespace split gives a rough token count, not the tokenizer's count.
        tokens = len(response['choices'][0]['text'].split())
        speed = tokens / gen_time if gen_time > 0 else 0

        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return response['choices'][0]['text']
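
    # A chat-style variant is sketched below. llama-cpp-python also exposes
    # create_chat_completion(), which applies the chat template stored in the
    # GGUF metadata. This helper is illustrative and not part of the original
    # script; it assumes the quantized GGUF ships with a usable chat template.
    def generate_chat(self, user_message: str, **kwargs) -> str:
        """Sketch: chat-style generation via create_chat_completion()."""
        if not self.llm:
            raise ValueError("Model not loaded. Call load_with_llama_cpp() first.")

        result = self.llm.create_chat_completion(
            messages=[{"role": "user", "content": user_message}],
            max_tokens=kwargs.get("max_tokens", 256),
            temperature=kwargs.get("temperature", 0.7),
        )
        return result["choices"][0]["message"]["content"]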


class OllamaAPI:
    """Ollama API client."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url
        self.model_name = "qwen3-omni-quantized"

    def check_connection(self) -> bool:
        """Check the connection to the Ollama server."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def is_model_available(self) -> bool:
        """Check whether the model is available in Ollama."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            models = response.json().get("models", [])
            # Ollama reports names as "name:tag", so compare the base name.
            return any(model["name"].split(":")[0] == self.model_name for model in models)
        except requests.RequestException:
            return False

    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text through the Ollama API."""
        if not self.check_connection():
            raise ConnectionError("Cannot connect to Ollama API")

        if not self.is_model_available():
            raise ValueError(f"Model {self.model_name} not found in Ollama")

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": kwargs.get("temperature", 0.7),
                "top_p": kwargs.get("top_p", 0.8),
                "top_k": kwargs.get("top_k", 50),
                "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
                "num_predict": kwargs.get("max_tokens", 256),
            }
        }

        print("💭 Sending request to Ollama...")
        start_time = time.time()

        response = requests.post(
            f"{self.base_url}/api/generate",
            json=payload,
            timeout=60
        )

        if response.status_code != 200:
            raise RuntimeError(f"Ollama API error: {response.text}")

        result = response.json()
        gen_time = time.time() - start_time

        output_text = result["response"]
        # Whitespace split gives a rough token count, not the tokenizer's count.
        tokens = len(output_text.split())
        speed = tokens / gen_time if gen_time > 0 else 0

        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return output_text
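
    # A streaming variant is sketched below. With "stream": True the Ollama
    # generate endpoint returns newline-delimited JSON chunks, each carrying a
    # partial "response" field. This helper is illustrative and not part of
    # the original script.
    def generate_stream(self, prompt: str, **kwargs) -> str:
        """Sketch: stream tokens from Ollama, printing them as they arrive."""
        import json  # local import; only this sketch needs it

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": True,
            "options": {"num_predict": kwargs.get("max_tokens", 256)},
        }
        chunks = []
        with requests.post(f"{self.base_url}/api/generate",
                           json=payload, stream=True, timeout=60) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if not line:
                    continue
                data = json.loads(line)
                chunk = data.get("response", "")
                print(chunk, end="", flush=True)
                chunks.append(chunk)
                if data.get("done"):
                    break
        print()
        return "".join(chunks)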


def run_examples():
    """Run the example prompts."""

    examples = [
        {
            "name": "🌟 Creative writing",
            "prompt": "Write a short story about AI and humans working together to explore the universe, with a sci-fi feel and philosophical reflection.",
            "params": {"temperature": 0.8, "max_tokens": 400}
        },
        {
            "name": "💻 Code generation",
            "prompt": "Write a quicksort algorithm in Python, with detailed comments and a time-complexity analysis.",
            "params": {"temperature": 0.3, "max_tokens": 500}
        },
        {
            "name": "🧮 Mathematical reasoning",
            "prompt": "A circle has a radius of 5 cm. Calculate its area and circumference, and explain the steps.",
            "params": {"temperature": 0.2, "max_tokens": 300}
        },
        {
            "name": "🌐 Multilingual translation",
            "prompt": "Please translate this English text to Chinese: 'Artificial Intelligence is revolutionizing the way we interact with technology, making it more intuitive and human-friendly.'",
            "params": {"temperature": 0.3, "max_tokens": 200}
        },
        {
            "name": "🤔 Logical reasoning",
            "prompt": "If all A are B, all B are C, and a certain X is an A, what is X? Explain the reasoning step by step.",
            "params": {"temperature": 0.1, "max_tokens": 250}
        }
    ]

    ollama = OllamaAPI()
    ollama_available = ollama.check_connection() and ollama.is_model_available()

    gguf_available = LLAMA_CPP_AVAILABLE and Path("qwen3_omni_quantized.gguf").exists()

    print("=" * 80)
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("=" * 80)
    print(f"💾 Ollama API available: {'✅' if ollama_available else '❌'}")
    print(f"📁 GGUF file available: {'✅' if gguf_available else '❌'}")
    print()

    if not ollama_available and not gguf_available:
        print("⚠️ Please set up Ollama or download the GGUF file first:")
        print()
        # A minimal Modelfile sketch appears after this function.
        print("🚀 Ollama setup:")
        print("  1. ollama create qwen3-omni-quantized -f Qwen3OmniQuantized.modelfile")
        print("  2. ollama serve")
        print()
        print("📁 GGUF file download:")
        print("  huggingface-cli download vito95311/Qwen3-Omni-30B-A3B-Thinking-GGUF-INT8FP16 qwen3_omni_quantized.gguf")
        return

    if ollama_available:
        print("🎯 Running inference through the Ollama API")
        runner_type = "ollama"
        api = ollama
    else:
        print("🎯 Running inference through llama-cpp-python")
        runner_type = "llama_cpp"
        runner = QwenGGUFRunner()
        runner.load_with_llama_cpp()

    print("=" * 80)

    for i, example in enumerate(examples, 1):
        print(f"\n📝 Example {i}: {example['name']}")
        print(f"💬 Prompt: {example['prompt'][:100]}...")
        print("-" * 40)

        try:
            if runner_type == "ollama":
                response = api.generate(example['prompt'], **example['params'])
            else:
                response = runner.generate_with_llama_cpp(example['prompt'], **example['params'])

            print(f"🤖 Response: {response.strip()}")

        except Exception as e:
            print(f"❌ Error: {str(e)}")

        print("-" * 40)
        time.sleep(1)  # brief pause between examples
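

# The Ollama setup step above references Qwen3OmniQuantized.modelfile. The
# actual file ships with the repo, but as a rough, hypothetical illustration a
# minimal Modelfile for a local GGUF could look like this (FROM and PARAMETER
# are standard Modelfile directives):
#
#   FROM ./qwen3_omni_quantized.gguf
#   PARAMETER temperature 0.7
#   PARAMETER top_p 0.8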


def benchmark_performance():
    """Run a simple performance benchmark."""

    print("\n🏆 Performance benchmark")
    print("=" * 50)

    test_prompts = [
        "Explain what machine learning is",
        "Write a Python function to compute the Fibonacci sequence",
        "Describe the basic principles of quantum computing",
        "What are the benefits of renewable energy?",
        "How can you optimize the performance of a deep learning model?"
    ]

    ollama = OllamaAPI()

    if ollama.check_connection() and ollama.is_model_available():
        print("📊 Benchmarking Ollama API performance...")

        total_time = 0
        total_tokens = 0

        for i, prompt in enumerate(test_prompts, 1):
            print(f"  Test {i}/{len(test_prompts)}: ", end="", flush=True)

            start_time = time.time()
            response = ollama.generate(prompt, max_tokens=100, temperature=0.7)
            end_time = time.time()

            test_time = end_time - start_time
            tokens = len(response.split())
            speed = tokens / test_time if test_time > 0 else 0

            total_time += test_time
            total_tokens += tokens

            print(f"{speed:.1f} tok/s")

        avg_speed = total_tokens / total_time if total_time > 0 else 0
        print(f"\n📈 Average performance: {avg_speed:.1f} tokens/s")
        print(f"⏱️ Total time: {total_time:.2f}s")
        print(f"📝 Total tokens: {total_tokens}")

    else:
        print("⚠️ Ollama unavailable; skipping the performance benchmark")


def main():
    """Main entry point."""
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("This script demonstrates how to use the GGUF-format model for various AI tasks")

    run_examples()

    user_input = input("\n🤔 Run the performance benchmark? (y/n): ")
    if user_input.lower() in ['y', 'yes']:
        benchmark_performance()

    print("\n✨ All examples finished!")
    print("💡 See README.md for more usage options")


if __name__ == "__main__":
    main()