openai zero-shot results
Files changed:
- .env.example  +2 -2
- data/openai_results.csv  +0 -0
- llm_toolkit/eval_openai.py  +91 -0
- llm_toolkit/logical_reasoning_utils.py  +54 -3
- notebooks/04_Few-shot_Prompting_OpenAI.ipynb  +0 -0
.env.example CHANGED

@@ -5,7 +5,7 @@ HF_TOKEN=
 WANDB_API_KEY=
 
 LOAD_IN_4BIT=false
-NUM_TRAIN_EPOCHS=
+NUM_TRAIN_EPOCHS=2
 
 LOGICAL_REASONING_DATA_PATH=datasets/mgtv
-LOGICAL_REASONING_RESULTS_PATH=results/mgtv-
+LOGICAL_REASONING_RESULTS_PATH=results/mgtv-results.csv
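These variables are read at runtime via python-dotenv by the scripts in this commit. A minimal sketch of how the two updated values would be consumed (the parsing below is illustrative, not code from the commit itself):

import os

from dotenv import find_dotenv, load_dotenv

# Load .env, falling back to .env.example, as the eval script in this commit does.
load_dotenv(find_dotenv(".env") or find_dotenv(".env.example"), override=False)

num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS", 2))    # new default: 2
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")  # results/mgtv-results.csv

The eval script added below uses the same find_dotenv fallback to .env.example.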
data/openai_results.csv ADDED

The diff for this file is too large to render.
llm_toolkit/eval_openai.py ADDED

@@ -0,0 +1,91 @@
import os
import sys
from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

from llm_toolkit.llm_utils import *
from llm_toolkit.logical_reasoning_utils import *

model_name = os.getenv("MODEL_NAME")
data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))

print(
    model_name,
    data_path,
    results_path,
    max_new_tokens,
)


def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
    save_results(
        model_name,
        results_path,
        dataset,
        predictions,
    )

    metrics = calc_metrics(dataset["label"], predictions, debug=True)
    print(f"{model_name} metrics: {metrics}")


def evaluate_model_with_num_shots(
    model_name,
    datasets,
    results_path=None,
    range_num_shots=[0],
    max_new_tokens=2048,
    result_column_name=None,
):
    print(f"Evaluating model: {model_name}")

    eval_dataset = datasets["test"].to_pandas()
    print_row_details(eval_dataset)

    for num_shots in range_num_shots:
        print(f"*** Evaluating with num_shots: {num_shots}")

        predictions = eval_openai(eval_dataset, model=model_name, max_new_tokens=max_new_tokens)
        model_name_with_shorts = (
            result_column_name
            if result_column_name
            else f"{model_name}/shots-{num_shots:02d}"
        )

        try:
            on_num_shots_step_completed(
                model_name_with_shorts, eval_dataset, predictions, results_path
            )
        except Exception as e:
            print(e)


if __name__ == "__main__":
    datasets = load_logical_reasoning_dataset(
        data_path,
    )

    if len(sys.argv) > 1:
        num = int(sys.argv[1])
        if num > 0:
            print(f"--- evaluating {num} entries")
            datasets["test"] = datasets["test"].select(range(num))

    evaluate_model_with_num_shots(
        model_name,
        datasets,
        results_path=results_path,
        max_new_tokens=max_new_tokens,
    )
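A quick way to exercise this script end to end, as a sketch under assumptions: an OPENAI_API_KEY is exported, MODEL_NAME names an OpenAI chat model (gpt-4o-mini here is just an example id, not something this commit prescribes), and the dataset/results paths from .env.example are in effect. The numeric argument mirrors the sys.argv handling above and limits the run to the first N test entries:

import os
import subprocess

# Assumption: any OpenAI chat model id works for MODEL_NAME; gpt-4o-mini is only an example.
env = {**os.environ, "MODEL_NAME": "gpt-4o-mini"}

# Evaluate only the first 10 test entries; drop the "10" argument to run the full test split.
subprocess.run(["python", "llm_toolkit/eval_openai.py", "10"], env=env, check=True)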
llm_toolkit/logical_reasoning_utils.py CHANGED

@@ -1,5 +1,7 @@
 import os
 import re
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -8,6 +10,7 @@ from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+from tqdm import tqdm
 
 print(f"loading {__file__}")
 
@@ -86,6 +89,7 @@ Please strictly follow these rules when answering the participant's questions.
 **Participant's question:** {}
 """
 
+system_prompt = "You are an expert in logical reasoning."
 
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
@@ -177,8 +181,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
         # Create all directories in the path (if they don't exist)
         os.makedirs(dir_path, exist_ok=True)
-
-
+
+        if isinstance(dataset, pd.DataFrame):
+            df = dataset
+        else:
+            df = dataset.to_pandas()
+        df.drop(columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore")
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
 
@@ -215,7 +223,7 @@ def load_logical_reasoning_dataset(
     messages = [
         {
             "role": "system",
-            "content":
+            "content": system_prompt,
         },
         None,
     ]
@@ -419,3 +427,46 @@ def plot_metrics(perf_df, model_name):
     plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
 
     plt.show()
+
+
+def reasoning_with_openai(
+    row, user_prompt, max_tokens=None, model="gpt-4o-mini", base_url=None
+):
+    llm = ChatOpenAI(
+        model=model,
+        temperature=0,
+        max_tokens=max_tokens,
+        timeout=None,
+        max_retries=2,
+        base_url=base_url,
+    )
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                system_prompt,
+            ),
+            (
+                "human",
+                user_prompt.format(row["puzzle"], row["truth"], row["text"]),
+            ),
+        ]
+    )
+
+    chain = prompt | llm
+    response = chain.invoke(input={})
+
+    return response.content
+
+
+def eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300):
+    user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
+    total = len(eval_dataset)
+    predictions = []
+
+    for i in tqdm(range(total)):
+        output = reasoning_with_openai(eval_dataset.iloc[i], user_prompt, model=model, max_tokens=max_new_tokens)
+        predictions.append(output)
+
+    return predictions
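For a standalone sanity check of the two new helpers, a sketch like the following should work. The single-row DataFrame and its values are placeholders; only the column names puzzle/truth/text come from the code above, and an OPENAI_API_KEY must be set in the environment:

import pandas as pd

from llm_toolkit.logical_reasoning_utils import eval_openai  # assumes the repo root is on sys.path

# Toy eval set with the columns reasoning_with_openai() indexes; the values are placeholders.
eval_dataset = pd.DataFrame(
    {
        "puzzle": ["<puzzle statement>"],
        "truth": ["<hidden truth>"],
        "text": ["<participant question>"],
    }
)

predictions = eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300)
print(predictions)  # one model answer per row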
notebooks/04_Few-shot_Prompting_OpenAI.ipynb ADDED

The diff for this file is too large to render.