from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset
def train_model():
    # Fine-tune CodeT5-small to generate a docstring ("doc") from a code snippet ("code").
    model_name = "Salesforce/codet5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    dataset = load_dataset("json", data_files={"train": "data_examples/sample_dataset.jsonl"})
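    # Each line of sample_dataset.jsonl is assumed to be a JSON object with
    # "code" and "doc" fields, e.g. (hypothetical record):
    # {"code": "def add(a, b):\n    return a + b", "doc": "Return the sum of a and b."}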
    def preprocess(batch):
        inputs = tokenizer(batch["code"], truncation=True, padding="max_length", max_length=128)
        labels = tokenizer(batch["doc"], truncation=True, padding="max_length", max_length=128)
        # Mask padding positions with -100 so they are ignored by the cross-entropy loss;
        # otherwise the model is also trained to predict pad tokens.
        inputs["labels"] = [
            [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
            for seq in labels["input_ids"]
        ]
        return inputs
    # Drop the raw text columns so only model inputs (input_ids, attention_mask, labels) remain.
    tokenized = dataset["train"].map(preprocess, batched=True, remove_columns=["code", "doc"])
    args = TrainingArguments(
        output_dir="results",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        save_strategy="epoch",
        logging_dir="logs",
    )
    # All sequences are pre-padded to a fixed length above, so the default
    # Trainer/data collator suffices here.
    trainer = Trainer(model=model, args=args, train_dataset=tokenized)
    trainer.train()
    model.save_pretrained("trained_model")
    tokenizer.save_pretrained("trained_model")
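

# A minimal usage sketch, assuming training above completed and wrote "trained_model".
# generate_doc is a hypothetical helper, not part of the original script: it loads the
# fine-tuned checkpoint and generates a docstring for an unseen snippet.
def generate_doc(code: str, model_dir: str = "trained_model", max_length: int = 128) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=max_length)
    # Beam search is one reasonable decoding choice for short summaries.
    output_ids = model.generate(**inputs, max_length=max_length, num_beams=4)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
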
if __name__ == "__main__":
    train_model()