import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
from datasets import load_dataset


class HuggingFaceHelper:
    def __init__(self, model_path="./merged_model", dataset_path=None):
        self.model_path = model_path
        self.dataset_path = dataset_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

        # Causal LMs often ship without a pad token; tokenize_function pads, so fall back to EOS.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def check_model_integrity(self):
        print("🔍 Checking model integrity...")
        for name, tensor in self.model.state_dict().items():
            print(f"{name}: {tensor.size()}")
        print("✅ Model integrity check completed.")

    def test_pipeline(self):
        try:
            pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
            output = pipe("What is the future of AI?", max_length=100)
            print("✅ Model successfully generates text:", output)
        except Exception as e:
            print(f"❌ Pipeline Error: {e}")

    def load_dataset(self):
        if self.dataset_path:
            dataset = load_dataset("json", data_files=self.dataset_path, split="train")
            return dataset.map(self.tokenize_function, batched=True)
        else:
            raise ValueError("Dataset path not provided.")

    def tokenize_function(self, examples):
        # Assumes each record's "messages" field is a plain string; chat-style
        # role/content lists would need flattening first (see the sketch below).
        return self.tokenizer(examples["messages"], truncation=True, padding="max_length", max_length=512)

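    # A minimal sketch of the record shape tokenize_function assumes: one JSON
    # object per line, with "messages" holding a single string, e.g.
    #   {"messages": "User: Hi\nAssistant: Hello!"}
    # The helper below is hypothetical and only needed if your JSONL instead
    # stores chat-style [{"role": ..., "content": ...}] lists; apply it with
    # dataset.map(HuggingFaceHelper.flatten_messages) before tokenizing.
    @staticmethod
    def flatten_messages(example):
        example["messages"] = "\n".join(
            f"{m['role']}: {m['content']}" for m in example["messages"]
        )
        return example
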
    def fine_tune(self, output_dir="./fine_tuned_model", epochs=3, batch_size=4):
        dataset = self.load_dataset()
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="no",  # no eval_dataset is passed below
            save_strategy="epoch",
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            push_to_hub=False,
        )
        # mlm=False makes the collator copy input_ids into labels, so the
        # Trainer can compute a causal-LM loss.
        data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )
        trainer.train()
        self.save_model(output_dir)

    def save_model(self, output_dir):
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"✅ Model saved to {output_dir}")

    def generate_response(self, prompt, max_length=200):
        # max_length caps prompt + generated tokens; send inputs to the device
        # the (possibly sharded) model actually sits on.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output = self.model.generate(**inputs, max_length=max_length)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
if __name__ == "__main__":
    helper = HuggingFaceHelper(model_path="./merged_model", dataset_path="codette_training_data_finetune_fixed.jsonl")
    helper.check_model_integrity()
    helper.test_pipeline()
    helper.fine_tune(output_dir="./codette_finetuned", epochs=3, batch_size=4)
    print(helper.generate_response("How will AI impact cybersecurity?"))
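    # To reuse the fine-tuned weights later, point the helper at the saved
    # directory (a sketch; the path assumes the fine_tune run above):
    #   helper = HuggingFaceHelper(model_path="./codette_finetuned")
    #   print(helper.generate_response("Summarize zero-trust architecture."))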