Spaces:
Running
Running
Update app.py
Browse files
Newer models get featured in battles more to catch up on votes
app.py
CHANGED
|
@@ -13,7 +13,8 @@ import gradio as gr
|
|
| 13 |
from gen_api_answer import (
|
| 14 |
get_model_response,
|
| 15 |
parse_model_response,
|
| 16 |
-
prometheus_parse_model_response
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
from random_sample_generation import (
|
|
@@ -113,40 +114,6 @@ def get_final_prompt(eval_prompt, variable_values):
|
|
| 113 |
return eval_prompt
|
| 114 |
|
| 115 |
|
| 116 |
-
def submit_prompt(eval_prompt, *variable_values):
|
| 117 |
-
try:
|
| 118 |
-
variables = parse_variables(eval_prompt)
|
| 119 |
-
variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
|
| 120 |
-
final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
|
| 121 |
-
|
| 122 |
-
models = list(model_data.keys())
|
| 123 |
-
model1, model2 = random.sample(models, 2)
|
| 124 |
-
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
| 125 |
-
|
| 126 |
-
response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
|
| 127 |
-
response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
|
| 128 |
-
|
| 129 |
-
return (
|
| 130 |
-
response_a,
|
| 131 |
-
response_b,
|
| 132 |
-
gr.update(visible=True),
|
| 133 |
-
gr.update(visible=True),
|
| 134 |
-
model_a,
|
| 135 |
-
model_b,
|
| 136 |
-
final_prompt,
|
| 137 |
-
)
|
| 138 |
-
except Exception as e:
|
| 139 |
-
print(f"Error in submit_prompt: {str(e)}")
|
| 140 |
-
return (
|
| 141 |
-
"Error generating response",
|
| 142 |
-
"Error generating response",
|
| 143 |
-
gr.update(visible=False),
|
| 144 |
-
gr.update(visible=False),
|
| 145 |
-
None,
|
| 146 |
-
None,
|
| 147 |
-
None,
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
|
| 151 |
def get_ip(request: gr.Request) -> str:
|
| 152 |
"""Get and hash the IP address from the request."""
|
|
@@ -492,7 +459,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 492 |
show_preliminary = gr.Checkbox(
|
| 493 |
label="Reveal preliminary results",
|
| 494 |
value=True, # Checked by default
|
| 495 |
-
info="Show all models, including models with less human ratings (<
|
| 496 |
interactive=True
|
| 497 |
)
|
| 498 |
stats_display = gr.Markdown()
|
|
@@ -714,6 +681,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 714 |
score3_description,
|
| 715 |
score4_description,
|
| 716 |
score5_description,
|
|
|
|
| 717 |
):
|
| 718 |
# Build prompt data dictionary
|
| 719 |
prompt_data = {
|
|
@@ -728,9 +696,40 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 728 |
'score5_desc': score5_description,
|
| 729 |
}
|
| 730 |
|
| 731 |
-
models
|
| 732 |
-
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
# Get responses from models
|
| 736 |
response_a = get_model_response(
|
|
@@ -746,13 +745,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 746 |
use_reference=use_reference
|
| 747 |
)
|
| 748 |
|
| 749 |
-
# Parse the responses based on model, using
|
| 750 |
is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
|
| 751 |
is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
|
|
|
|
|
|
|
| 752 |
|
| 753 |
if is_prometheus_a:
|
| 754 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
| 755 |
score_a_val = f"{score_a_val} / 5"
|
|
|
|
|
|
|
|
|
|
| 756 |
else:
|
| 757 |
score_a_val, critique_a_val = parse_model_response(response_a)
|
| 758 |
score_a_val = f"{score_a_val} / 5"
|
|
@@ -760,6 +764,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 760 |
if is_prometheus_b:
|
| 761 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
| 762 |
score_b_val = f"{score_b_val} / 5"
|
|
|
|
|
|
|
|
|
|
| 763 |
else:
|
| 764 |
score_b_val, critique_b_val = parse_model_response(response_b)
|
| 765 |
score_b_val = f"{score_b_val} / 5"
|
|
@@ -781,9 +788,21 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 781 |
gr.update(value="🎲"), # random_btn
|
| 782 |
)
|
| 783 |
|
| 784 |
-
# Update the click handler to use
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
send_btn.click(
|
| 786 |
-
fn=submit_and_store,
|
| 787 |
inputs=[
|
| 788 |
use_reference_toggle,
|
| 789 |
eval_criteria_text,
|
|
|
|
| 13 |
from gen_api_answer import (
|
| 14 |
get_model_response,
|
| 15 |
parse_model_response,
|
| 16 |
+
prometheus_parse_model_response,
|
| 17 |
+
atla_parse_model_response
|
| 18 |
)
|
| 19 |
|
| 20 |
from random_sample_generation import (
|
|
|
|
| 114 |
return eval_prompt
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
def get_ip(request: gr.Request) -> str:
|
| 119 |
"""Get and hash the IP address from the request."""
|
|
|
|
| 459 |
show_preliminary = gr.Checkbox(
|
| 460 |
label="Reveal preliminary results",
|
| 461 |
value=True, # Checked by default
|
| 462 |
+
info="Show all models, including models with less human ratings (< 300 votes)",
|
| 463 |
interactive=True
|
| 464 |
)
|
| 465 |
stats_display = gr.Markdown()
|
|
|
|
| 681 |
score3_description,
|
| 682 |
score4_description,
|
| 683 |
score5_description,
|
| 684 |
+
is_first_game=False
|
| 685 |
):
|
| 686 |
# Build prompt data dictionary
|
| 687 |
prompt_data = {
|
|
|
|
| 696 |
'score5_desc': score5_description,
|
| 697 |
}
|
| 698 |
|
| 699 |
+
# Get list of active models only for matches
|
| 700 |
+
active_models = [name for name, info in model_data.items()
|
| 701 |
+
if info.get("active", True)] # Default to True for backward compatibility
|
| 702 |
+
|
| 703 |
+
# Modified model selection logic
|
| 704 |
+
atla_model = "Atla-8B-preview-2024-01-08"
|
| 705 |
+
|
| 706 |
+
if is_first_game:
|
| 707 |
+
# For the first game, ensure Atla is one of the models
|
| 708 |
+
other_models = [m for m in active_models if m != atla_model]
|
| 709 |
+
other_model = random.choice(other_models)
|
| 710 |
+
|
| 711 |
+
# Randomly assign Atla to either position A or B
|
| 712 |
+
if random.random() < 0.5:
|
| 713 |
+
model_a, model_b = atla_model, other_model
|
| 714 |
+
else:
|
| 715 |
+
model_a, model_b = other_model, atla_model
|
| 716 |
+
else:
|
| 717 |
+
# For subsequent games, Atla appears 30% of the time
|
| 718 |
+
if random.random() < 0.3:
|
| 719 |
+
# Include Atla in this battle
|
| 720 |
+
other_models = [m for m in active_models if m != atla_model]
|
| 721 |
+
other_model = random.choice(other_models)
|
| 722 |
+
|
| 723 |
+
# Randomly assign Atla to either position A or B
|
| 724 |
+
if random.random() < 0.5:
|
| 725 |
+
model_a, model_b = atla_model, other_model
|
| 726 |
+
else:
|
| 727 |
+
model_a, model_b = other_model, atla_model
|
| 728 |
+
else:
|
| 729 |
+
# Battle between two non-Atla models
|
| 730 |
+
non_atla_models = [m for m in active_models if m != atla_model]
|
| 731 |
+
model1, model2 = random.sample(non_atla_models, 2)
|
| 732 |
+
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
| 733 |
|
| 734 |
# Get responses from models
|
| 735 |
response_a = get_model_response(
|
|
|
|
| 745 |
use_reference=use_reference
|
| 746 |
)
|
| 747 |
|
| 748 |
+
# Parse the responses based on model, using appropriate parsing for different models
|
| 749 |
is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
|
| 750 |
is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
|
| 751 |
+
is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
|
| 752 |
+
is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
|
| 753 |
|
| 754 |
if is_prometheus_a:
|
| 755 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
| 756 |
score_a_val = f"{score_a_val} / 5"
|
| 757 |
+
elif is_atla_a:
|
| 758 |
+
score_a_val, critique_a_val = atla_parse_model_response(response_a)
|
| 759 |
+
score_a_val = f"{score_a_val} / 5"
|
| 760 |
else:
|
| 761 |
score_a_val, critique_a_val = parse_model_response(response_a)
|
| 762 |
score_a_val = f"{score_a_val} / 5"
|
|
|
|
| 764 |
if is_prometheus_b:
|
| 765 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
| 766 |
score_b_val = f"{score_b_val} / 5"
|
| 767 |
+
elif is_atla_b:
|
| 768 |
+
score_b_val, critique_b_val = atla_parse_model_response(response_b)
|
| 769 |
+
score_b_val = f"{score_b_val} / 5"
|
| 770 |
else:
|
| 771 |
score_b_val, critique_b_val = parse_model_response(response_b)
|
| 772 |
score_b_val = f"{score_b_val} / 5"
|
|
|
|
| 788 |
gr.update(value="🎲"), # random_btn
|
| 789 |
)
|
| 790 |
|
| 791 |
+
# Update the click handler to use False for is_first_game after first submission
|
| 792 |
+
def create_submit_handler():
|
| 793 |
+
first_game = True
|
| 794 |
+
|
| 795 |
+
def handler(*args):
|
| 796 |
+
nonlocal first_game
|
| 797 |
+
result = submit_and_store(*args, first_game)
|
| 798 |
+
first_game = False # Set to False after first submission
|
| 799 |
+
return result
|
| 800 |
+
|
| 801 |
+
return handler
|
| 802 |
+
|
| 803 |
+
# Update the send_btn click handler
|
| 804 |
send_btn.click(
|
| 805 |
+
fn=create_submit_handler(),
|
| 806 |
inputs=[
|
| 807 |
use_reference_toggle,
|
| 808 |
eval_criteria_text,
|