Simplified

app.py CHANGED
@@ -10,7 +10,7 @@ from dotenv import load_dotenv
 load_dotenv()
 
 import gradio as gr
-from gen_api_answer import get_model_response, parse_model_response
+from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
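Review note: the only change in this hunk is the new import of `get_random_human_ai_pair`, consumed by the `populate_random_example` handler added further down. Its implementation lives in `gen_api_answer.py` and is not part of this diff; a minimal sketch of the assumed contract:

    # Hypothetical sketch of the helper imported above; the real
    # implementation in gen_api_answer.py is outside this diff.
    import random

    _EXAMPLE_PAIRS = [
        ("What is the capital of France?", "The capital of France is Paris."),
        ("Summarize photosynthesis in one line.",
         "Plants turn light, water, and CO2 into glucose and oxygen."),
    ]

    def get_random_human_ai_pair():
        """Return one (human_message, ai_response) pair at random."""
        return random.choice(_EXAMPLE_PAIRS)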
@@ -195,11 +195,12 @@ def vote(
 
     # Return updates for UI components
     return [
-        gr.update(visible=False),  # action_buttons_row
+        gr.update(visible=False),  # vote_a
+        gr.update(visible=False),  # vote_b
+        gr.update(visible=False),  # tie_button_row
         gr.update(value=f"*Model: {model_a}*"),  # model_name_a
         gr.update(value=f"*Model: {model_b}*"),  # model_name_b
-        gr.update(interactive=True),  # send_btn
-        gr.update(visible=True, interactive=True),  # regenerate_button
+        gr.update(interactive=True, value="Run the evaluators", variant="primary"),  # send_btn
     ]
 
 
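Review note: Gradio maps the list returned by `vote()` positionally onto the handler's `outputs=` list, so the three new `visible=False` updates must stay aligned with `vote_a`, `vote_b`, and `tie_button_row` in the click-handler hunks below. A self-contained sketch of that contract (toy wiring, not code from this commit):

    import gradio as gr

    def hide_both():
        # First update goes to the first output, second to the second.
        return [gr.update(visible=False), gr.update(visible=False)]

    with gr.Blocks() as demo:
        btn_a = gr.Button("A")
        btn_b = gr.Button("B")
        hide = gr.Button("Hide A and B")
        hide.click(fn=hide_both, inputs=[], outputs=[btn_a, btn_b])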
@@ -283,41 +284,6 @@ def get_leaderboard(show_preliminary=True):
     return leaderboard
 
 
-def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
-    variables = parse_variables(eval_prompt)
-    variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
-    final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
-
-    # Get available models excluding the previous ones
-    available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]
-
-    # If we have enough models for new pairs
-    if len(available_models) >= 2:
-        model1, model2 = random.sample(available_models, 2)
-    else:
-        # Fallback to allowing previous models if necessary
-        model1, model2 = random.sample(list(model_data.keys()), 2)
-
-    response_a = get_model_response(model1, model_data.get(model1), final_prompt)
-    response_b = get_model_response(model2, model_data.get(model2), final_prompt)
-
-    # Parse the responses
-    score_a, critique_a = parse_model_response(response_a)
-    score_b, critique_b = parse_model_response(response_b)
-
-    return (
-        score_a,  # score_a textbox
-        critique_a,  # critique_a textbox
-        score_b,  # score_b textbox
-        critique_b,  # critique_b textbox
-        gr.update(visible=True),  # action_buttons_row
-        gr.update(value="*Model: Hidden*"),  # model_name_a
-        gr.update(value="*Model: Hidden*"),  # model_name_b
-        model1,  # model_a_state
-        model2,  # model_b_state
-    )
-
-
 def calculate_elo_change(rating_a, rating_b, winner):
     """Calculate ELO rating changes for both players."""
     expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
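Review note: `regenerate_prompt` is removed here; its model-resampling behavior is folded into the send button, which the hunk at -759 below re-labels to "Regenerate with different models". The trailing context introduces `calculate_elo_change`, whose first line is the standard Elo expected score. For reference, a complete update consistent with that line (a sketch assuming the conventional K-factor form; the rest of the real function body is outside this diff):

    # Standard Elo update; only the expected_a line is confirmed by the diff.
    K = 32  # assumed K-factor

    def elo_change(rating_a, rating_b, winner, k=K):
        expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
        # winner: "a", "b", or anything else for a tie
        score_a = 1.0 if winner == "a" else 0.0 if winner == "b" else 0.5
        delta = k * (score_a - expected_a)
        return delta, -delta  # zero-sum: B's change mirrors A's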
@@ -413,138 +379,120 @@ def get_leaderboard_stats():
     """
 
 
-def set_example_metric(metric_name):
-    if metric_name == "Custom":
-        variables = parse_variables(DEFAULT_EVAL_PROMPT)
-        variable_values = []
-        for var in variables:
-            if var == "input":
-                variable_values.append(DEFAULT_INPUT)
-            elif var == "response":
-                variable_values.append(DEFAULT_RESPONSE)
-            else:
-                variable_values.append("")  # Default empty value
+#def set_example_metric(metric_name):
+#    if metric_name == "Custom":
+#        variables = parse_variables(DEFAULT_EVAL_PROMPT)
+#        variable_values = []
+#        for var in variables:
+#            if var == "input":
+#                variable_values.append(DEFAULT_INPUT)
+#            elif var == "response":
+#                variable_values.append(DEFAULT_RESPONSE)
+#            else:
+#                variable_values.append("")  # Default empty value
 # Pad variable_values to match the length of variable_rows
-        while len(variable_values) < len(variable_rows):
-            variable_values.append("")
-        return [DEFAULT_EVAL_PROMPT] + variable_values
-
-    metric_data = EXAMPLE_METRICS[metric_name]
-    variables = parse_variables(metric_data["prompt"])
-    variable_values = []
-    for var in variables:
-        value = metric_data.get(var, "")  # Default to empty string if not found
-        variable_values.append(value)
+#        while len(variable_values) < len(variable_rows):
+#            variable_values.append("")
+#        return [DEFAULT_EVAL_PROMPT] + variable_values
+#
+#    metric_data = EXAMPLE_METRICS[metric_name]
+#    variables = parse_variables(metric_data["prompt"])
+#    variable_values = []
+#    for var in variables:
+#        value = metric_data.get(var, "")  # Default to empty string if not found
+#        variable_values.append(value)
 # Pad variable_values to match the length of variable_rows
-        while len(variable_values) < len(variable_rows):
-            variable_values.append("")
-        return [metric_data["prompt"]] + variable_values
+#        while len(variable_values) < len(variable_rows):
+#            variable_values.append("")
+#    return [metric_data["prompt"]] + variable_values
 
 
 # Select random metric at startup
-def get_random_metric():
-    metrics = list(EXAMPLE_METRICS.keys())
-    return set_example_metric(random.choice(metrics))
+# def get_random_metric():
+#     metrics = list(EXAMPLE_METRICS.keys())
+#     return set_example_metric(random.choice(metrics))
+
+
+def populate_random_example(request: gr.Request):
+    """Generate a random human-AI conversation example."""
+    human_msg, ai_msg = get_random_human_ai_pair()
+    return [
+        gr.update(value=human_msg),
+        gr.update(value=ai_msg)
+    ]
 
 
 with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     gr.Markdown(MAIN_TITLE)
     gr.Markdown(HOW_IT_WORKS)
+
+    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
+    eval_prompt = gr.Textbox(
+        value=DEFAULT_EVAL_PROMPT,
+        visible=False
+    )
 
     with gr.Tabs():
         with gr.TabItem("Judge Arena"):
-
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown(BATTLE_RULES)
-
-                    # Add Example Metrics Section
-                    with gr.Accordion("Evaluator Prompt Templates", open=False):
-                        with gr.Row():
-                            custom_btn = gr.Button("Custom", variant="secondary")
-                            hallucination_btn = gr.Button("Hallucination")
-                            precision_btn = gr.Button("Precision")
-                            recall_btn = gr.Button("Recall")
-                            coherence_btn = gr.Button("Logical coherence")
-                            faithfulness_btn = gr.Button("Faithfulness")
-
-            # Eval Prompt and Variables side by side
             with gr.Row():
-                # Left
+                # Left side - Input section
                 with gr.Column(scale=1):
-                    gr.
    [old lines 476-493 truncated in the source view]
-                            "input" if i == 0
-                            else "ground_truth" if i == 1
-                            else "response" if i == 2
-                            else ""
-                        )
-                        default_value = (
-                            EXAMPLE_METRICS["Hallucination"]["input"] if i == 0
-                            else EXAMPLE_METRICS["Hallucination"]["ground_truth"] if i == 1
-                            else EXAMPLE_METRICS["Hallucination"]["response"] if i == 2
-                            else ""
+                    random_btn = gr.Button("🎲", scale=0)
+                    with gr.Group():
+                        human_input = gr.TextArea(
+                            label="👩 Human Input",
+                            lines=8,
+                            placeholder="Enter the human message here..."
+                        )
+
+                        ai_response = gr.TextArea(
+                            label="🤖 AI Response",
+                            lines=8,
+                            placeholder="Enter the AI response here..."
+                        )
+
+                    with gr.Row(elem_classes="send-button-row"):
+                        send_btn = gr.Button(
+                            value="Run the evaluators",
+                            variant="primary",
+                            size="lg"
                         )
-                        var_input = gr.Textbox(
-                            container=True,
-                            label=default_label,
-                            value=default_value
-                        )
-                        variable_rows.append((var_row, var_input))
-
-                    # Send button
-                    with gr.Row(elem_classes="send-button-row"):
-                        send_btn = gr.Button(
-                            value="Test the evaluators", variant="primary", size="lg", scale=1
-                        )
-
-                    # Add divider heading for model outputs
-                    gr.Markdown(VOTING_HEADER)
 
-
-
-
-            gr.Markdown("###
-
-
+                # Right side - Model outputs
+                with gr.Column(scale=1):
+                    gr.Markdown("<br>")
+                    gr.Markdown("\n### 👩‍⚖️ Judge A")
+                    with gr.Group():
+                        with gr.Row():
+                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
+                                score_a = gr.Textbox(label="Score", interactive=False)
+                                vote_a = gr.Button("Vote A", variant="primary", visible=False)
+                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
+                                critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
                     model_name_a = gr.Markdown("*Model: Hidden*")
-
-
-
-
+
+                    # Add spacing between judges
+                    gr.Markdown("<br>")
+
+                    # Center the Tie button between judges
+                    with gr.Row(visible=False) as tie_button_row:
+                        with gr.Column():
+                            vote_tie = gr.Button("Tie", variant="secondary")
+                    gr.Markdown("<br>")
+
+                    gr.Markdown("### 👩‍⚖️ Judge B")
+                    with gr.Group():
+                        with gr.Row():
+                            with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
+                                score_b = gr.Textbox(label="Score", interactive=False)
+                                vote_b = gr.Button("Vote B", variant="primary", visible=False)
+                            with gr.Column(scale=9, min_width=400):  # Wider width for critique
+                                critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
                     model_name_b = gr.Markdown("*Model: Hidden*")
-
-
-            with gr.Row(visible=False) as action_buttons_row:
-                vote_a = gr.Button("Choose A", variant="primary")
-                vote_tie = gr.Button("Tie", variant="secondary")
-                vote_b = gr.Button("Choose B", variant="primary")
-            regenerate_button = gr.Button(
-                "Regenerate with different models", variant="secondary", visible=False
-            )
-
+                    # Place Vote B button directly under Judge B
+
             gr.Markdown("<br>")
 
-            # Add evaluation tips
-            gr.Markdown(EVAL_DESCRIPTION)
-
             # Add spacing and acknowledgements at the bottom
             gr.Markdown(ACKNOWLEDGEMENTS)
 
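Review note: this hunk swaps the template editor (accordion of metric buttons plus generated variable textboxes) for two fixed inputs, a hidden `eval_prompt` preset to `DEFAULT_EVAL_PROMPT`, and per-judge score/critique panels with their own hidden vote buttons. The dice-button flow can be exercised in isolation; a condensed, self-contained sketch (component names mirror the diff, styling omitted, example pairs are stand-ins):

    import random
    import gradio as gr

    _PAIRS = [("Hi there!", "Hello! How can I help?"),
              ("What is 2 + 2?", "2 + 2 = 4.")]  # stand-in examples

    def populate_random_example():
        human_msg, ai_msg = random.choice(_PAIRS)
        return gr.update(value=human_msg), gr.update(value=ai_msg)

    with gr.Blocks() as demo:
        random_btn = gr.Button("🎲", scale=0)
        human_input = gr.TextArea(label="👩 Human Input", lines=8)
        ai_response = gr.TextArea(label="🤖 AI Response", lines=8)
        random_btn.click(fn=populate_random_example, inputs=[],
                         outputs=[human_input, ai_response])

Note that the committed `populate_random_example(request: gr.Request)` declares a `request` parameter it never reads; Gradio injects it because of the type annotation, so it is harmless but could be dropped.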
@@ -634,29 +582,30 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         )
         return updates
 
-    eval_prompt.change(
-        fn=update_variables,
-        inputs=eval_prompt,
-        outputs=[item for sublist in variable_rows for item in sublist],
-    )
+    #eval_prompt.change(
+    #    fn=update_variables,
+    #    inputs=eval_prompt,
+    #    outputs=[item for sublist in variable_rows for item in sublist],
+    #)
 
     # Regenerate button functionality
-    regenerate_button.click(
-        fn=regenerate_prompt,
-        inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, var_input in variable_rows],
-        outputs=[
-            score_a,
-            critique_a,
-            score_b,
-            critique_b,
-            action_buttons_row,
-            model_name_a,
-            model_name_b,
-            model_a_state,
-            model_b_state,
-        ],
-    )
-
+    #regenerate_button.click(
+    #    fn=regenerate_prompt,
+    #    inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
+    #    outputs=[
+    #        score_a,
+    #        critique_a,
+    #        score_b,
+    #        critique_b,
+    #        vote_a,
+    #        vote_b,
+    #        tie_button_row,
+    #        model_name_a,
+    #        model_name_b,
+    #        model_a_state,
+    #        model_b_state,
+    #    ],
+    #)
 
     # Update model names after responses are generated
     def update_model_names(model_a, model_b):
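Review note: both wire-ups are retired by commenting them out rather than deleting them. The commented `regenerate_button.click(...)` block can no longer be re-enabled verbatim: this same commit removes the `regenerate_button` component and the `regenerate_prompt` function, so uncommenting it would fail at startup, roughly:

    # What re-enabling the block as-is would produce (illustrative):
    # NameError: name 'regenerate_button' is not defined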
@@ -681,11 +630,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
         ],
     )
 
@@ -702,11 +652,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
         ],
     )
 
@@ -723,11 +674,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
         ],
         outputs=[
-            action_buttons_row,
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            regenerate_button,
        ],
    )
 
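Review note: the three vote handlers (hunks -681, -702, -723) now share an identical outputs signature. A possible follow-up (not in the commit) is to define the list once so the handlers cannot drift apart:

    # Sketch of a hypothetical refactor: one shared outputs list for the
    # vote buttons, mirroring the updates returned by vote().
    import gradio as gr

    with gr.Blocks() as demo:
        vote_a = gr.Button("Vote A", visible=False)
        vote_b = gr.Button("Vote B", visible=False)
        tie_button_row = gr.Row(visible=False)
        model_name_a = gr.Markdown("*Model: Hidden*")
        model_name_b = gr.Markdown("*Model: Hidden*")
        send_btn = gr.Button("Run the evaluators")

        vote_outputs = [vote_a, vote_b, tie_button_row,
                        model_name_a, model_name_b, send_btn]

        def after_vote():
            # Hide the voting controls, then re-arm the send button.
            return [gr.update(visible=False)] * 3 + [
                gr.update(), gr.update(), gr.update(interactive=True),
            ]

        for btn in (vote_a, vote_b):
            btn.click(fn=after_vote, inputs=[], outputs=vote_outputs)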
@@ -759,32 +711,39 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_a,
             score_b,
             critique_b,
-
-            gr.update(
-                visible=True, interactive=True
-            ),  # Show and enable regenerate button
+            gr.update(visible=True),  # vote_a
+            gr.update(visible=True),  # vote_b
+            gr.update(visible=True),  # tie_button_row
             model_a,
             model_b,
             final_prompt,  # Add final_prompt to state
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
+            # Change the button to "Regenerate" mode after evaluation
+            gr.update(
+                value="Regenerate with different models",
+                variant="secondary",
+                interactive=True
+            ),
         )
 
     send_btn.click(
         fn=submit_and_store,
-        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+        inputs=[eval_prompt, human_input, ai_response],
         outputs=[
             score_a,
             critique_a,
             score_b,
             critique_b,
-            regenerate_button,
-
+            vote_a,
+            vote_b,
+            tie_button_row,
             model_a_state,
             model_b_state,
-            final_prompt_state,
+            final_prompt_state,
             model_name_a,
             model_name_b,
+            send_btn,
         ],
     )
 
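Review note: `submit_and_store`'s return tuple and the `send_btn.click` outputs list are edited in lockstep; the positional contract after this change, read directly from the hunk:

    # return value                                    -> outputs component
    # score_a                                         -> score_a
    # critique_a                                      -> critique_a
    # score_b                                         -> score_b
    # critique_b                                      -> critique_b
    # gr.update(visible=True)                         -> vote_a
    # gr.update(visible=True)                         -> vote_b
    # gr.update(visible=True)                         -> tie_button_row
    # model_a                                         -> model_a_state
    # model_b                                         -> model_b_state
    # final_prompt                                    -> final_prompt_state
    # gr.update(value="*Model: Hidden*")              -> model_name_a
    # gr.update(value="*Model: Hidden*")              -> model_name_b
    # gr.update(value="Regenerate with different models", ...) -> send_btn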
@@ -802,44 +761,68 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     ]
 
     # Update the change handlers for prompt and variables
-    eval_prompt.change(
-        fn=handle_input_changes,
-        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-        outputs=[send_btn, regenerate_button],
-    )
-
-    for _, var_input in variable_rows:
-        var_input.change(
-            fn=handle_input_changes,
-            inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-            outputs=[send_btn, regenerate_button],
-        )
+    #eval_prompt.change(
+    #    fn=handle_input_changes,
+    #    inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    #    outputs=[send_btn, regenerate_button],
+    #)
+
+    # for _, var_input in variable_rows:
+    #     var_input.change(
+    #         fn=handle_input_changes,
+    #         inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    #         outputs=[send_btn, regenerate_button],
+    #     )
 
     # Add click handlers for metric buttons
-    outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
+    #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
 
-    custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
+    #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
 
-    hallucination_btn.click(
-        fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
-    )
+    #hallucination_btn.click(
+    #    fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
+    #)
 
-    precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
+    #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
 
-    recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
+    #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
 
-    coherence_btn.click(
-        fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
-    )
+    #coherence_btn.click(
+    #    fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
+    #)
 
-    faithfulness_btn.click(
-        fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
-    )
+    #faithfulness_btn.click(
+    #    fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
+    #)
 
     # Set default metric at startup
     demo.load(
-        fn=lambda: set_example_metric("Hallucination"),
-        outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+        #fn=lambda: set_example_metric("Hallucination"),
+        #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
+    )
+
+    # Add random button handler
+    random_btn.click(
+        fn=populate_random_example,
+        inputs=[],
+        outputs=[human_input, ai_response]
+    )
+
+    # Add new input change handlers
+    def handle_input_change():
+        return gr.update(value="Run the evaluators", variant="primary")
+
+    # Update the change handlers for inputs
+    human_input.change(
+        fn=handle_input_change,
+        inputs=[],
+        outputs=[send_btn]
+    )
+
+    ai_response.change(
+        fn=handle_input_change,
+        inputs=[],
+        outputs=[send_btn]
     )
 
 if __name__ == "__main__":
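Review note: together with the hunk at -759, these handlers give the send button a two-state life cycle: "Run the evaluators" (primary) until a run completes, then "Regenerate with different models" (secondary) until either input is edited again. A condensed, runnable sketch of just that state machine (assumed semantics; the judge calls are stubbed out):

    import gradio as gr

    def run_evaluators(human, ai):
        # ... the real app calls the two judge models here ...
        return gr.update(value="Regenerate with different models",
                         variant="secondary", interactive=True)

    def handle_input_change():
        # Any edit to either input re-arms the primary action.
        return gr.update(value="Run the evaluators", variant="primary")

    with gr.Blocks() as demo:
        human_input = gr.TextArea(label="👩 Human Input")
        ai_response = gr.TextArea(label="🤖 AI Response")
        send_btn = gr.Button("Run the evaluators", variant="primary")

        send_btn.click(fn=run_evaluators,
                       inputs=[human_input, ai_response], outputs=[send_btn])
        human_input.change(fn=handle_input_change, inputs=[], outputs=[send_btn])
        ai_response.change(fn=handle_input_change, inputs=[], outputs=[send_btn])

One small flag: `demo.load(` is left wrapping only commented-out arguments, so it now executes as a bare `demo.load()`; whether that registers a harmless no-op depends on the Gradio version in use.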