openai zero-shot results
Files changed:
- .env.example  +2 -2
- data/openai_results.csv  +0 -0
- llm_toolkit/eval_openai.py  +91 -0
- llm_toolkit/logical_reasoning_utils.py  +54 -3
- notebooks/04_Few-shot_Prompting_OpenAI.ipynb  +0 -0
.env.example CHANGED

@@ -5,7 +5,7 @@ HF_TOKEN=
 WANDB_API_KEY=
 
 LOAD_IN_4BIT=false
-NUM_TRAIN_EPOCHS=
+NUM_TRAIN_EPOCHS=2
 
 LOGICAL_REASONING_DATA_PATH=datasets/mgtv
-LOGICAL_REASONING_RESULTS_PATH=results/mgtv-
+LOGICAL_REASONING_RESULTS_PATH=results/mgtv-results.csv
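These variables are read at runtime via python-dotenv by the scripts in this commit. A minimal sketch of how the two updated values would be consumed (the parsing below is illustrative, not code from the commit itself):

import os

from dotenv import find_dotenv, load_dotenv

# Load .env, falling back to .env.example, as the eval script in this commit does.
load_dotenv(find_dotenv(".env") or find_dotenv(".env.example"), override=False)

num_train_epochs = int(os.getenv("NUM_TRAIN_EPOCHS", 2))    # new default: 2
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")  # results/mgtv-results.csv

The eval script added below uses the same find_dotenv fallback to .env.example.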
data/openai_results.csv ADDED

The diff for this file is too large to render.
llm_toolkit/eval_openai.py ADDED

@@ -0,0 +1,91 @@
import os
import sys
from dotenv import find_dotenv, load_dotenv

found_dotenv = find_dotenv(".env")

if len(found_dotenv) == 0:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

from llm_toolkit.llm_utils import *
from llm_toolkit.logical_reasoning_utils import *

model_name = os.getenv("MODEL_NAME")
data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))

print(
    model_name,
    data_path,
    results_path,
    max_new_tokens,
)


def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
    save_results(
        model_name,
        results_path,
        dataset,
        predictions,
    )

    metrics = calc_metrics(dataset["label"], predictions, debug=True)
    print(f"{model_name} metrics: {metrics}")


def evaluate_model_with_num_shots(
    model_name,
    datasets,
    results_path=None,
    range_num_shots=[0],
    max_new_tokens=2048,
    result_column_name=None,
):
    print(f"Evaluating model: {model_name}")

    eval_dataset = datasets["test"].to_pandas()
    print_row_details(eval_dataset)

    for num_shots in range_num_shots:
        print(f"*** Evaluating with num_shots: {num_shots}")

        predictions = eval_openai(eval_dataset, model=model_name, max_new_tokens=max_new_tokens)
        model_name_with_shorts = (
            result_column_name
            if result_column_name
            else f"{model_name}/shots-{num_shots:02d}"
        )

        try:
            on_num_shots_step_completed(
                model_name_with_shorts, eval_dataset, predictions, results_path
            )
        except Exception as e:
            print(e)


if __name__ == "__main__":
    datasets = load_logical_reasoning_dataset(
        data_path,
    )

    if len(sys.argv) > 1:
        num = int(sys.argv[1])
        if num > 0:
            print(f"--- evaluating {num} entries")
            datasets["test"] = datasets["test"].select(range(num))

    evaluate_model_with_num_shots(
        model_name,
        datasets,
        results_path=results_path,
        max_new_tokens=max_new_tokens,
    )
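A quick way to exercise this script end to end, as a sketch under assumptions: an OPENAI_API_KEY is exported, MODEL_NAME names an OpenAI chat model (gpt-4o-mini here is just an example id, not something this commit prescribes), and the dataset/results paths from .env.example are in effect. The numeric argument mirrors the sys.argv handling above and limits the run to the first N test entries:

import os
import subprocess

# Assumption: any OpenAI chat model id works for MODEL_NAME; gpt-4o-mini is only an example.
env = {**os.environ, "MODEL_NAME": "gpt-4o-mini"}

# Evaluate only the first 10 test entries; drop the "10" argument to run the full test split.
subprocess.run(["python", "llm_toolkit/eval_openai.py", "10"], env=env, check=True)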
llm_toolkit/logical_reasoning_utils.py CHANGED

@@ -1,5 +1,7 @@
 import os
 import re
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -8,6 +10,7 @@ from matplotlib.ticker import MultipleLocator
 from datasets import load_dataset
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+from tqdm import tqdm
 
 print(f"loading {__file__}")
 
@@ -86,6 +89,7 @@ Please strictly follow these rules when answering the participant's questions.
 **Participant's question:** {}
 """
 
+system_prompt = "You are an expert in logical reasoning."
 
 def get_prompt_template(using_p1=True, chinese_prompt=True):
     if using_p1:
@@ -177,8 +181,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
         # Create all directories in the path (if they don't exist)
         os.makedirs(dir_path, exist_ok=True)
-
-
+
+        if isinstance(dataset, pd.DataFrame):
+            df = dataset
+        else:
+            df = dataset.to_pandas()
+        df.drop(columns=["answer", "prompt", "train_text"], inplace=True, errors="ignore")
     else:
         df = pd.read_csv(results_path, on_bad_lines="warn")
 
@@ -215,7 +223,7 @@ def load_logical_reasoning_dataset(
     messages = [
         {
             "role": "system",
-            "content":
+            "content": system_prompt,
         },
         None,
     ]
@@ -419,3 +427,46 @@ def plot_metrics(perf_df, model_name):
     plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
 
     plt.show()
+
+
+def reasoning_with_openai(
+    row, user_prompt, max_tokens=None, model="gpt-4o-mini", base_url=None
+):
+    llm = ChatOpenAI(
+        model=model,
+        temperature=0,
+        max_tokens=max_tokens,
+        timeout=None,
+        max_retries=2,
+        base_url=base_url,
+    )
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                system_prompt,
+            ),
+            (
+                "human",
+                user_prompt.format(row["puzzle"], row["truth"], row["text"]),
+            ),
+        ]
+    )
+
+    chain = prompt | llm
+    response = chain.invoke(input={})
+
+    return response.content
+
+
+def eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300):
+    user_prompt = get_prompt_template(using_p1=False, chinese_prompt=True)
+    total = len(eval_dataset)
+    predictions = []
+
+    for i in tqdm(range(total)):
+        output = reasoning_with_openai(eval_dataset.iloc[i], user_prompt, model=model, max_tokens=max_new_tokens)
+        predictions.append(output)
+
+    return predictions
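For a standalone sanity check of the two new helpers, a sketch like the following should work. The single-row DataFrame and its values are placeholders; only the column names puzzle/truth/text come from the code above, and an OPENAI_API_KEY must be set in the environment:

import pandas as pd

from llm_toolkit.logical_reasoning_utils import eval_openai  # assumes the repo root is on sys.path

# Toy eval set with the columns reasoning_with_openai() indexes; the values are placeholders.
eval_dataset = pd.DataFrame(
    {
        "puzzle": ["<puzzle statement>"],
        "truth": ["<hidden truth>"],
        "text": ["<participant question>"],
    }
)

predictions = eval_openai(eval_dataset, model="gpt-4o-mini", max_new_tokens=300)
print(predictions)  # one model answer per row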
notebooks/04_Few-shot_Prompting_OpenAI.ipynb ADDED

The diff for this file is too large to render.