Vriti29 committed on
Commit 393894b · 1 Parent(s): 47fbcd8

Create t5-base-qg-hl finetuned

Files changed (1)
  1. valhalla/t5-base-qg-hl finetuned +109 -0
valhalla/t5-base-qg-hl finetuned ADDED
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
"""Untitled6.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11megvyfcr49Oy4FGK7kteQ2iMdxZYp4L
"""
# Install dependencies (run as a shell command in Colab):
# !pip install transformers datasets sentence-transformers evaluate scikit-learn

from google.colab import files

# Upload train.csv and eval.csv; a single call accepts multiple files.
uploaded = files.upload()
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch
import numpy as np

def load_csv_datasets(train_path, eval_path):
    train_df = pd.read_csv(train_path)
    eval_df = pd.read_csv(eval_path)

    dataset = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'eval': Dataset.from_pandas(eval_df)
    })
    return dataset
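
# Note (illustrative, not from the original notebook): load_csv_datasets() assumes
# each CSV exposes an 'input' and a 'target' column, matching the keys that
# preprocess() reads below. For the valhalla/t5-base-qg-hl base model, inputs
# conventionally mark the answer span with <hl> tokens (see the upstream model
# card for the exact prompt format). A purely hypothetical row could look like:
#
# example_row = {
#     "input": "generate question: <hl> 42 <hl> is the answer to life, the universe and everything.",
#     "target": "What is the answer to life, the universe and everything?",
# }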

def preprocess(example):
    # map() is called with batched=True below, so these are lists of strings.
    input_text = example['input']
    target_text = example['target']
    model_inputs = tokenizer(input_text, max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(target_text, max_length=64, padding='max_length', truncation=True)
    # Mask padding in the labels with -100 so it is ignored by the loss.
    model_inputs["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

dataset = load_csv_datasets("train.csv", "eval.csv")
tokenized_dataset = dataset.map(preprocess, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./qg_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="cosine",
    greater_is_better=True
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # The plain Trainer evaluates with teacher forcing, so predictions arrive as
    # per-token logits (possibly bundled in a tuple with other tensors).
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if predictions.ndim == 3:
        # Convert logits to predicted token IDs.
        predictions = np.argmax(predictions, axis=-1)

    # Map any ID outside the tokenizer vocabulary to the pad token before decoding.
    predictions = np.where(predictions < tokenizer.vocab_size, predictions, tokenizer.pad_token_id)

    # Replace -100 in labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Score predictions against references with sentence-embedding cosine similarity.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings_pred = embedder.encode(decoded_preds, convert_to_tensor=True)
    embeddings_label = embedder.encode(decoded_labels, convert_to_tensor=True)

    cosine_scores = util.cos_sim(embeddings_pred, embeddings_label).diagonal()
    avg_cosine = cosine_scores.mean().item()

    return {"cosine": avg_cosine}
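
# Note (hedged alternative, not in the original notebook): because the plain Trainer
# passes teacher-forced logits to compute_metrics, the decoded "predictions" are not
# free-running generations. transformers also provides Seq2SeqTrainer together with
# Seq2SeqTrainingArguments(predict_with_generate=True), which hands real generated
# token IDs to compute_metrics. A minimal sketch, reusing the objects defined above:
#
#   from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
#   gen_args = Seq2SeqTrainingArguments(output_dir="./qg_finetuned",
#                                       eval_strategy="epoch", save_strategy="epoch",
#                                       predict_with_generate=True)
#   trainer = Seq2SeqTrainer(model=model, args=gen_args,
#                            train_dataset=tokenized_dataset["train"],
#                            eval_dataset=tokenized_dataset["eval"],
#                            data_collator=data_collator,
#                            compute_metrics=compute_metrics)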

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)
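
A minimal inference sketch (not part of this commit), assuming the fine-tuned weights were saved with trainer.save_model("./qg_finetuned") — otherwise point at one of the checkpoint-* subfolders — and using a made-up highlighted-answer input:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned checkpoint written by the Trainer above.
qg_tokenizer = T5Tokenizer.from_pretrained("./qg_finetuned")
qg_model = T5ForConditionalGeneration.from_pretrained("./qg_finetuned")

# Hypothetical context with the answer span wrapped in <hl> tokens.
text = "generate question: Python was created by <hl> Guido van Rossum <hl> and first released in 1991."
inputs = qg_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
output_ids = qg_model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
print(qg_tokenizer.decode(output_ids[0], skip_special_tokens=True))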