""" |
|
|
Helion-2.5-Rnd Benchmark Runner |
|
|
Comprehensive benchmarking suite for performance testing |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import logging |
|
|
import statistics |
|
|
import time |
|
|
from collections import defaultdict |
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional |
|
|
|
|
|
import numpy as np |
|
|
|
|
|
from inference.client import HelionClient |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
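# NOTE: the calls below assume that `inference.client.HelionClient` exposes a
# `complete(prompt, max_tokens, temperature, stream=...)` method that returns
# the generated text as a plain string; if the client API differs, adjust the
# `self.client.complete(...)` calls accordingly.
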
class BenchmarkRunner:
    """Run comprehensive benchmarks on the Helion model"""

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        output_dir: str = "./benchmark_results"
    ):
        """
        Initialize benchmark runner

        Args:
            base_url: API base URL
            output_dir: Directory for results
        """
        self.client = HelionClient(base_url=base_url)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.results = {
            'timestamp': datetime.now().isoformat(),
            'base_url': base_url,
            'tests': {}
        }

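    # Minimal usage sketch (the URL and directory are just the constructor
    # defaults shown above):
    #
    #   runner = BenchmarkRunner(base_url="http://localhost:8000",
    #                            output_dir="./benchmark_results")
    #   runner.run_all_benchmarks(quick_mode=True)
    #   runner.save_results()
    #   runner.print_summary()
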
    def benchmark_latency(
        self,
        num_requests: int = 100,
        prompt_lengths: List[int] = [128, 512, 2048],
        max_tokens: int = 256
    ) -> Dict:
        """
        Benchmark inference latency

        Args:
            num_requests: Number of requests per test
            prompt_lengths: Different prompt lengths to test
            max_tokens: Maximum tokens to generate

        Returns:
            Latency benchmark results
        """
        logger.info("Running latency benchmark...")

        results = {}

        for prompt_len in prompt_lengths:
            logger.info(f"Testing prompt length: {prompt_len}")

            # Build a test prompt of roughly prompt_len characters
            # ("Hello world. " is 13 characters per repetition).
            test_prompt = "Hello world. " * (prompt_len // 13)

            latencies = []

            for i in range(num_requests):
                try:
                    start_time = time.time()

                    self.client.complete(
                        prompt=test_prompt,
                        max_tokens=max_tokens,
                        temperature=0.7,
                        stream=False
                    )

                    end_time = time.time()
                    latency = (end_time - start_time) * 1000

                    latencies.append(latency)

                    if i % 10 == 0:
                        logger.info(f"  Progress: {i+1}/{num_requests}")

                except Exception as e:
                    logger.error(f"Request failed: {e}")

            if latencies:
                results[f"prompt_{prompt_len}"] = {
                    'num_samples': len(latencies),
                    'mean_ms': statistics.mean(latencies),
                    'median_ms': statistics.median(latencies),
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0,
                    'min_ms': min(latencies),
                    'max_ms': max(latencies),
                    'p50_ms': np.percentile(latencies, 50),
                    'p90_ms': np.percentile(latencies, 90),
                    'p95_ms': np.percentile(latencies, 95),
                    'p99_ms': np.percentile(latencies, 99)
                }

        return results

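    # A single entry produced by benchmark_latency() has this shape
    # (numbers are purely illustrative):
    #
    #   "prompt_512": {
    #       "num_samples": 100, "mean_ms": 182.4, "median_ms": 170.1,
    #       "std_dev_ms": 35.2, "min_ms": 121.0, "max_ms": 310.7,
    #       "p50_ms": 170.1, "p90_ms": 231.9, "p95_ms": 258.3, "p99_ms": 300.5
    #   }
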
    def benchmark_throughput(
        self,
        duration_seconds: int = 60,
        concurrent_requests: int = 10,
        prompt_length: int = 512,
        max_tokens: int = 128
    ) -> Dict:
        """
        Benchmark throughput with concurrent requests

        Args:
            duration_seconds: How long to run the test
            concurrent_requests: Number of concurrent requests
            prompt_length: Prompt length for testing
            max_tokens: Maximum tokens to generate

        Returns:
            Throughput benchmark results
        """
        logger.info(f"Running throughput benchmark for {duration_seconds}s...")

        # 45-character sentence repeated to roughly prompt_length characters.
        test_prompt = "The quick brown fox jumps over the lazy dog. " * (prompt_length // 45)

        start_time = time.time()
        end_time = start_time + duration_seconds

        completed_requests = 0
        failed_requests = 0
        total_tokens = 0
        latencies = []

        def make_request():
            try:
                req_start = time.time()
                response = self.client.complete(
                    prompt=test_prompt,
                    max_tokens=max_tokens,
                    temperature=0.7
                )
                req_end = time.time()

                return {
                    'success': True,
                    'latency': req_end - req_start,
                    # Whitespace word count as a rough proxy for token count.
                    'tokens': len(response.split())
                }
            except Exception as e:
                return {'success': False, 'error': str(e)}

        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            while time.time() < end_time:
                futures = [executor.submit(make_request) for _ in range(concurrent_requests)]

                for future in as_completed(futures):
                    result = future.result()

                    if result['success']:
                        completed_requests += 1
                        latencies.append(result['latency'] * 1000)
                        total_tokens += result.get('tokens', 0)
                    else:
                        failed_requests += 1

        actual_duration = time.time() - start_time

        return {
            'duration_seconds': actual_duration,
            'concurrent_requests': concurrent_requests,
            'completed_requests': completed_requests,
            'failed_requests': failed_requests,
            'requests_per_second': completed_requests / actual_duration,
            'total_tokens': total_tokens,
            'tokens_per_second': total_tokens / actual_duration,
            'avg_latency_ms': statistics.mean(latencies) if latencies else 0,
            'p95_latency_ms': np.percentile(latencies, 95) if latencies else 0
        }

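    # benchmark_throughput() submits requests in waves of concurrent_requests
    # until the deadline passes, so the final wave may finish slightly after
    # duration_seconds; actual_duration accounts for that. Note that
    # 'total_tokens' and 'tokens_per_second' are based on whitespace word
    # counts, which only approximate real token counts.
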
    def benchmark_context_length(
        self,
        context_lengths: List[int] = [1024, 4096, 16384, 65536],
        num_samples: int = 10
    ) -> Dict:
        """
        Benchmark performance across different context lengths

        Args:
            context_lengths: List of context lengths to test
            num_samples: Number of samples per length

        Returns:
            Context length benchmark results
        """
        logger.info("Running context length benchmark...")

        results = {}

        for ctx_len in context_lengths:
            logger.info(f"Testing context length: {ctx_len}")

            # Build a filler prompt of roughly ctx_len characters (not tokens)
            # and append a summarization instruction.
            base_text = "This is a test sentence for context length benchmarking. "
            long_prompt = base_text * (ctx_len // len(base_text))
            long_prompt = long_prompt[:ctx_len] + "\n\nSummarize the above text:"

            latencies = []

            for i in range(num_samples):
                try:
                    start_time = time.time()

                    self.client.complete(
                        prompt=long_prompt,
                        max_tokens=256,
                        temperature=0.5
                    )

                    end_time = time.time()
                    latencies.append((end_time - start_time) * 1000)

                except Exception as e:
                    logger.error(f"Context length {ctx_len} failed: {e}")

            if latencies:
                results[f"context_{ctx_len}"] = {
                    'mean_latency_ms': statistics.mean(latencies),
                    'median_latency_ms': statistics.median(latencies),
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0
                }

        return results

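    # benchmark_context_length() keys its results by length, e.g.
    #   "context_4096": {"mean_latency_ms": ..., "median_latency_ms": ..., "std_dev_ms": ...}
    # Lengths whose requests all fail produce no entry and are simply omitted.
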
    def benchmark_generation_quality(
        self,
        test_prompts: Optional[List[str]] = None,
        num_samples: int = 5
    ) -> Dict:
        """
        Benchmark generation quality with diverse prompts

        Args:
            test_prompts: Custom test prompts
            num_samples: Number of samples per prompt type

        Returns:
            Quality benchmark results
        """
        logger.info("Running generation quality benchmark...")

        if test_prompts is None:
            test_prompts = [
                "Explain quantum computing in simple terms:",
                "Write a Python function to calculate fibonacci numbers:",
                "Translate 'Hello, how are you?' to Spanish, French, and German:",
                "Solve: If x + 5 = 12, what is x?",
                "Write a haiku about artificial intelligence:"
            ]

        results = {}

        for i, prompt in enumerate(test_prompts):
            logger.info(f"Testing prompt {i+1}/{len(test_prompts)}")

            responses = []

            for _ in range(num_samples):
                try:
                    response = self.client.complete(
                        prompt=prompt,
                        max_tokens=512,
                        temperature=0.7
                    )
                    responses.append(response)
                except Exception as e:
                    logger.error(f"Generation failed: {e}")

            if responses:
                results[f"prompt_{i+1}"] = {
                    'prompt': prompt[:50] + "...",
                    'num_responses': len(responses),
                    'avg_length': statistics.mean([len(r) for r in responses]),
                    'sample_response': responses[0][:200] + "..."
                }

        return results

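    # benchmark_generation_quality() records only response counts, average
    # character lengths, and a truncated sample per prompt; it does not
    # attempt to score the responses for correctness.
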
    def run_all_benchmarks(self, quick_mode: bool = False) -> Dict:
        """
        Run all benchmark suites

        Args:
            quick_mode: Run faster with fewer samples

        Returns:
            Complete benchmark results
        """
        logger.info("Starting comprehensive benchmark suite...")

        if quick_mode:
            logger.info("Running in quick mode (fewer samples)")

        logger.info("\n=== Latency Benchmark ===")
        self.results['tests']['latency'] = self.benchmark_latency(
            num_requests=20 if quick_mode else 100,
            prompt_lengths=[128, 512] if quick_mode else [128, 512, 2048]
        )

        logger.info("\n=== Throughput Benchmark ===")
        self.results['tests']['throughput'] = self.benchmark_throughput(
            duration_seconds=30 if quick_mode else 60,
            concurrent_requests=5 if quick_mode else 10
        )

        logger.info("\n=== Context Length Benchmark ===")
        self.results['tests']['context_length'] = self.benchmark_context_length(
            context_lengths=[1024, 4096] if quick_mode else [1024, 4096, 16384],
            num_samples=5 if quick_mode else 10
        )

        logger.info("\n=== Generation Quality Benchmark ===")
        self.results['tests']['generation_quality'] = self.benchmark_generation_quality(
            num_samples=2 if quick_mode else 5
        )

        return self.results

    def save_results(self, filename: Optional[str] = None):
        """
        Save benchmark results to file

        Args:
            filename: Output filename
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"benchmark_{timestamp}.json"

        output_path = self.output_dir / filename

        with open(output_path, 'w') as f:
            json.dump(self.results, f, indent=2)

        logger.info(f"Results saved to {output_path}")

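    # The output lands at, e.g., ./benchmark_results/benchmark_20240101_123000.json
    # (the directory is the constructor's output_dir; the timestamp is illustrative).
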
    def print_summary(self):
        """Print benchmark summary"""
        logger.info("\n" + "=" * 60)
        logger.info("BENCHMARK SUMMARY")
        logger.info("=" * 60)

        if 'latency' in self.results['tests']:
            logger.info("\nLatency Results:")
            for prompt_type, metrics in self.results['tests']['latency'].items():
                logger.info(f"  {prompt_type}:")
                logger.info(f"    Mean: {metrics['mean_ms']:.2f}ms")
                logger.info(f"    P95: {metrics['p95_ms']:.2f}ms")
                logger.info(f"    P99: {metrics['p99_ms']:.2f}ms")

        if 'throughput' in self.results['tests']:
            logger.info("\nThroughput Results:")
            metrics = self.results['tests']['throughput']
            logger.info(f"  Requests/sec: {metrics['requests_per_second']:.2f}")
            logger.info(f"  Tokens/sec: {metrics['tokens_per_second']:.2f}")
            logger.info(f"  Avg Latency: {metrics['avg_latency_ms']:.2f}ms")

        logger.info("\n" + "=" * 60)


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Helion Benchmark Runner")
    parser.add_argument("--base-url", type=str, default="http://localhost:8000")
    parser.add_argument("--output-dir", type=str, default="./benchmark_results")
    parser.add_argument("--quick", action="store_true", help="Run quick benchmark")
    parser.add_argument("--test", type=str,
                        choices=['latency', 'throughput', 'context', 'quality', 'all'],
                        default='all', help="Specific test to run")

    args = parser.parse_args()

    runner = BenchmarkRunner(
        base_url=args.base_url,
        output_dir=args.output_dir
    )

    # Store single-test results on the runner so save_results() and
    # print_summary() can see them; run_all_benchmarks() already does this.
    if args.test == 'all':
        runner.run_all_benchmarks(quick_mode=args.quick)
    elif args.test == 'latency':
        runner.results['tests']['latency'] = runner.benchmark_latency(
            num_requests=20 if args.quick else 100
        )
    elif args.test == 'throughput':
        runner.results['tests']['throughput'] = runner.benchmark_throughput(
            duration_seconds=30 if args.quick else 60
        )
    elif args.test == 'context':
        runner.results['tests']['context_length'] = runner.benchmark_context_length(
            num_samples=5 if args.quick else 10
        )
    elif args.test == 'quality':
        runner.results['tests']['generation_quality'] = runner.benchmark_generation_quality(
            num_samples=2 if args.quick else 5
        )

    runner.save_results()
    runner.print_summary()


if __name__ == "__main__":
    main()
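
# Example invocations (the script filename here is illustrative):
#
#   python benchmark_runner.py --quick
#   python benchmark_runner.py --test latency --base-url http://localhost:8000
#   python benchmark_runner.py --test throughput --output-dir ./benchmark_results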