"""Merge a LoRA adapter into the Qwen2.5-Coder-0.5B-Instruct base model on CPU."""
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

model_id = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
lora_path = "./Qwen2.5-Coder-0.5B-lora"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Qwen defines no dedicated pad token

# 🔴 CPU ONLY — NO CUDA, NO device_map
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # full precision on CPU
    low_cpu_mem_usage=True,
)

# Attach the trained LoRA adapter weights on top of the frozen base model.
model = PeftModel.from_pretrained(base_model, lora_path)

print("🔄 Merging LoRA (this will take time on CPU)...")
# Fold the adapter deltas into the base weights and strip the PEFT wrappers,
# leaving a plain transformers model that no longer depends on peft.
merged_model = model.merge_and_unload()

merged_model.save_pretrained("./Qwen2.5-Coder-0.5B-lora-merged")
tokenizer.save_pretrained("./Qwen2.5-Coder-0.5B-lora-merged")
print("✅ Merge complete")
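
# --- Optional sanity check (a minimal sketch, not part of the original
# script): reload the merged checkpoint from disk and run one short
# generation to confirm it works standalone. The prompt below is a
# hypothetical example chosen for illustration. ---
merged_dir = "./Qwen2.5-Coder-0.5B-lora-merged"
check_tok = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True)
check_model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # still CPU-only
)

messages = [{"role": "user", "content": "Write a Python one-liner that reverses a string."}]
input_ids = check_tok.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
with torch.no_grad():
    output = check_model.generate(
        input_ids,
        max_new_tokens=64,
        pad_token_id=check_tok.eos_token_id,
    )
# Decode only the newly generated tokens, skipping the prompt.
print(check_tok.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))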