fix-memory-requirements-for-cpu (#36)
feat(cpu): fix the displaying of memory requirements for intel cpu (1fbcd42f775a19529dee237d0b6ee72e6c5ab8b2)
- app.py +4 -7
- hardware.yaml +1 -1
- src/llm_perf.py +27 -7
app.py
CHANGED

```diff
@@ -67,13 +67,10 @@ with demo:
             search_bar, columns_checkboxes, leaderboard_table = (
                 create_leaderboard_table(open_llm_perf_df)
             )
-
-
-
-
-            lat_score_mem_plot = create_lat_score_mem_plot(
-                open_llm_perf_df
-            )
+        with gr.TabItem("Find Your Best Model 🧭", id=1):
+            lat_score_mem_plot = create_lat_score_mem_plot(
+                open_llm_perf_df
+            )
         ###################### ATTENTIONS SPEEDUP TAB #######################
         # with gr.TabItem("Attention 📈", id=2):
         #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
```
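For orientation, a minimal sketch of the tab layout this hunk produces, assuming the app's surrounding `gr.Blocks()`/`gr.Tabs()` context; the first tab's label and the placeholder components are illustrative stand-ins for `create_leaderboard_table` and `create_lat_score_mem_plot`:

```python
import gradio as gr

# Sketch only: placeholder components, not the app's real builder functions.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Leaderboard 🏅", id=0):  # assumed label for the first tab
            gr.Dataframe()  # search bar, column checkboxes, leaderboard table
        with gr.TabItem("Find Your Best Model 🧭", id=1):
            gr.Plot()  # latency / Open LLM score / memory plot

# demo.launch()  # uncomment to serve the UI
```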
hardware.yaml
CHANGED

```diff
@@ -39,7 +39,7 @@
 - machine: 32vCPU-C7i
   description: Intel-Xeon-SPR-385W 🖥️
   detail: |
-    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
+    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark. The memory requirement is the max RAM consumption during the decode phase.
   hardware_provider: intel
   hardware_type: cpu
   subsets:
```
src/llm_perf.py
CHANGED

```diff
@@ -15,7 +15,6 @@ COLUMNS_MAPPING = {
     "report.per_token.latency.p50": "Per Token (s)",
     "report.decode.throughput.value": "Decode (tokens/s)",
     "report.decode.efficiency.value": "Energy (tokens/kWh)",
-    "report.decode.memory.max_allocated": "Memory (MB)",
     # deployment settings
     "config.backend.name": "Backend 🏭",
     "config.backend.torch_dtype": "Precision 📥",
@@ -28,6 +27,15 @@ COLUMNS_MAPPING = {
     "Average ⬆️": "Open LLM Score (%)",
     "#Params (B)": "Params (B)",
 }
+
+CUDA_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_allocated": "Memory (MB)",
+}
+
+INTEL_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_ram": "Memory (MB)",
+}
+
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
 SORTING_ASCENDING = [False, True, False]
 
```
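The two new mappings rely on PEP 584 dict union (`|`, Python 3.9+), which merges the shared columns with one hardware-specific memory key each. A self-contained illustration of the pattern, with a shortened base dict standing in for the full mapping:

```python
# Illustrative base dict; the real COLUMNS_MAPPING has many more entries.
COLUMNS_MAPPING = {"report.decode.throughput.value": "Decode (tokens/s)"}

# `|` returns a new dict; on key collisions the right-hand operand wins.
CUDA_COLUMNS_MAPPING = COLUMNS_MAPPING | {
    "report.decode.memory.max_allocated": "Memory (MB)",  # GPU VRAM metric
}
INTEL_COLUMNS_MAPPING = COLUMNS_MAPPING | {
    "report.decode.memory.max_ram": "Memory (MB)",  # host RAM metric
}

assert "report.decode.memory.max_allocated" not in INTEL_COLUMNS_MAPPING
assert set(CUDA_COLUMNS_MAPPING) >= set(COLUMNS_MAPPING)
```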
```diff
@@ -39,9 +47,10 @@ def get_raw_llm_perf_df(
     for subset in subsets:
         for backend in backends:
             try:
+                url = f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                 dfs.append(
                     pd.read_csv(
-                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
+                        url
                     )
                 )
             except Exception:
```
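`pd.read_csv` resolves `hf://` URLs through the `huggingface_hub` fsspec integration, so `huggingface_hub` must be installed alongside pandas. A standalone read with assumed values for the path components (the exact subset and backend names are not shown in this diff):

```python
import pandas as pd  # needs huggingface_hub installed for hf:// support

# Illustrative values; any combination published in the dataset works.
backend, hardware_type, subset, machine = "pytorch", "cpu", "unquantized", "32vCPU-C7i"
url = (
    "hf://datasets/optimum-benchmark/llm-perf-leaderboard/"
    f"perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
)
df = pd.read_csv(url)
print(df.shape)
```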
```diff
@@ -70,7 +79,7 @@ def get_raw_llm_perf_df(
     return llm_perf_df
 
 
-def processed_llm_perf_df(llm_perf_df):
+def processed_llm_perf_df(llm_perf_df, hardware_type: str):
     # some assertions
     assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
     assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
```
```diff
@@ -105,15 +114,23 @@ def processed_llm_perf_df(llm_perf_df):
             "report.decode.throughput.value": 3,
             "report.decode.efficiency.value": 3,
             "report.decode.memory.max_allocated": 3,
+            "report.decode.memory.max_ram": 3,
             "Average ⬆️": 3,
             "prefill+decode": 3,
             "#Params (B)": 3,
         }
     )
+
     # filter columns
-    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
-    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
-
+    if hardware_type == "cuda":
+        llm_perf_df = llm_perf_df[list(CUDA_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=CUDA_COLUMNS_MAPPING, inplace=True)
+    elif hardware_type == "cpu":
+        llm_perf_df = llm_perf_df[list(INTEL_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=INTEL_COLUMNS_MAPPING, inplace=True)
+    else:
+        raise ValueError(f"Hardware type {hardware_type} not supported")
+
     # sort by metric
     llm_perf_df.sort_values(
         by=SORTING_COLUMNS,
```
```diff
@@ -121,6 +138,9 @@ def processed_llm_perf_df(llm_perf_df):
         inplace=True,
     )
 
+    assert llm_perf_df["Memory (MB)"].notna().any(), "The dataset should contain at least one memory value, otherwise this implies that all the benchmarks have failed (contains only a traceback)"
+    assert llm_perf_df.columns.is_unique, "All columns should be unique"
+
     return llm_perf_df
 
 
```
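The `columns.is_unique` assertion above also documents why filtering precedes renaming: both raw memory columns map to the same "Memory (MB)" label, so renaming an unfiltered frame would create duplicate column names. A minimal sketch of the failure mode being guarded against:

```python
import pandas as pd

# A raw frame can carry both memory columns (one of them typically all-NaN).
raw = pd.DataFrame({
    "report.decode.memory.max_allocated": [1024.0],  # GPU metric
    "report.decode.memory.max_ram": [2048.0],        # host RAM metric
})

# Selecting the hardware-specific keys first keeps the renamed frame unique.
mapping = {"report.decode.memory.max_ram": "Memory (MB)"}
cpu_df = raw[list(mapping.keys())].rename(columns=mapping)
assert cpu_df.columns.is_unique
assert cpu_df["Memory (MB)"].notna().any()
```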
```diff
@@ -137,7 +157,7 @@ def get_llm_perf_df(
     else:
         print(f"Dataset machine {machine} not found, downloading...")
         llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
-        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df, hardware_type)
         llm_perf_df.to_csv(
             f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
         )
```
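Putting the pieces together, a hypothetical end-to-end call, assuming `get_llm_perf_df` mirrors the parameters of `get_raw_llm_perf_df` and that the module is importable as `src.llm_perf`; the subset and backend names are placeholders:

```python
from src.llm_perf import get_llm_perf_df  # assumed import path

# hardware_type now routes column selection: "cpu" -> INTEL_COLUMNS_MAPPING
# (max_ram), "cuda" -> CUDA_COLUMNS_MAPPING (max_allocated).
df = get_llm_perf_df(
    machine="32vCPU-C7i",
    subsets=["unquantized"],  # placeholder subset name
    backends=["pytorch"],     # placeholder backend name
    hardware_type="cpu",
)
print(df[["Decode (tokens/s)", "Memory (MB)"]].head())
```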