#!/usr/bin/env python3 """ Helion-2.5-Rnd Benchmark Runner Comprehensive benchmarking suite for performance testing """ import argparse import json import logging import statistics import time from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from pathlib import Path from typing import Dict, List, Optional import numpy as np from inference.client import HelionClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class BenchmarkRunner: """Run comprehensive benchmarks on Helion model""" def __init__( self, base_url: str = "http://localhost:8000", output_dir: str = "./benchmark_results" ): """ Initialize benchmark runner Args: base_url: API base URL output_dir: Directory for results """ self.client = HelionClient(base_url=base_url) self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) self.results = { 'timestamp': datetime.now().isoformat(), 'base_url': base_url, 'tests': {} } def benchmark_latency( self, num_requests: int = 100, prompt_lengths: List[int] = [128, 512, 2048], max_tokens: int = 256 ) -> Dict: """ Benchmark inference latency Args: num_requests: Number of requests per test prompt_lengths: Different prompt lengths to test max_tokens: Maximum tokens to generate Returns: Latency benchmark results """ logger.info("Running latency benchmark...") results = {} for prompt_len in prompt_lengths: logger.info(f"Testing prompt length: {prompt_len}") # Generate test prompt test_prompt = "Hello world. " * (prompt_len // 13) latencies = [] first_token_latencies = [] for i in range(num_requests): try: start_time = time.time() response = self.client.complete( prompt=test_prompt, max_tokens=max_tokens, temperature=0.7, stream=False ) end_time = time.time() latency = (end_time - start_time) * 1000 # Convert to ms latencies.append(latency) if i % 10 == 0: logger.info(f" Progress: {i+1}/{num_requests}") except Exception as e: logger.error(f"Request failed: {e}") if latencies: results[f"prompt_{prompt_len}"] = { 'num_samples': len(latencies), 'mean_ms': statistics.mean(latencies), 'median_ms': statistics.median(latencies), 'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0, 'min_ms': min(latencies), 'max_ms': max(latencies), 'p50_ms': np.percentile(latencies, 50), 'p90_ms': np.percentile(latencies, 90), 'p95_ms': np.percentile(latencies, 95), 'p99_ms': np.percentile(latencies, 99) } return results def benchmark_throughput( self, duration_seconds: int = 60, concurrent_requests: int = 10, prompt_length: int = 512, max_tokens: int = 128 ) -> Dict: """ Benchmark throughput with concurrent requests Args: duration_seconds: How long to run test concurrent_requests: Number of concurrent requests prompt_length: Prompt length for testing max_tokens: Maximum tokens to generate Returns: Throughput benchmark results """ logger.info(f"Running throughput benchmark for {duration_seconds}s...") test_prompt = "The quick brown fox jumps over the lazy dog. " * (prompt_length // 45) start_time = time.time() end_time = start_time + duration_seconds completed_requests = 0 failed_requests = 0 total_tokens = 0 latencies = [] def make_request(): try: req_start = time.time() response = self.client.complete( prompt=test_prompt, max_tokens=max_tokens, temperature=0.7 ) req_end = time.time() return { 'success': True, 'latency': req_end - req_start, 'tokens': len(response.split()) # Approximate } except Exception as e: return {'success': False, 'error': str(e)} with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: while time.time() < end_time: futures = [executor.submit(make_request) for _ in range(concurrent_requests)] for future in as_completed(futures): result = future.result() if result['success']: completed_requests += 1 latencies.append(result['latency'] * 1000) total_tokens += result.get('tokens', 0) else: failed_requests += 1 actual_duration = time.time() - start_time return { 'duration_seconds': actual_duration, 'concurrent_requests': concurrent_requests, 'completed_requests': completed_requests, 'failed_requests': failed_requests, 'requests_per_second': completed_requests / actual_duration, 'total_tokens': total_tokens, 'tokens_per_second': total_tokens / actual_duration, 'avg_latency_ms': statistics.mean(latencies) if latencies else 0, 'p95_latency_ms': np.percentile(latencies, 95) if latencies else 0 } def benchmark_context_length( self, context_lengths: List[int] = [1024, 4096, 16384, 65536], num_samples: int = 10 ) -> Dict: """ Benchmark performance across different context lengths Args: context_lengths: List of context lengths to test num_samples: Number of samples per length Returns: Context length benchmark results """ logger.info("Running context length benchmark...") results = {} for ctx_len in context_lengths: logger.info(f"Testing context length: {ctx_len}") # Generate long context base_text = "This is a test sentence for context length benchmarking. " long_prompt = base_text * (ctx_len // len(base_text)) long_prompt = long_prompt[:ctx_len] + "\n\nSummarize the above text:" latencies = [] for i in range(num_samples): try: start_time = time.time() response = self.client.complete( prompt=long_prompt, max_tokens=256, temperature=0.5 ) end_time = time.time() latencies.append((end_time - start_time) * 1000) except Exception as e: logger.error(f"Context length {ctx_len} failed: {e}") if latencies: results[f"context_{ctx_len}"] = { 'mean_latency_ms': statistics.mean(latencies), 'median_latency_ms': statistics.median(latencies), 'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0 } return results def benchmark_generation_quality( self, test_prompts: Optional[List[str]] = None, num_samples: int = 5 ) -> Dict: """ Benchmark generation quality with diverse prompts Args: test_prompts: Custom test prompts num_samples: Number of samples per prompt type Returns: Quality benchmark results """ logger.info("Running generation quality benchmark...") if test_prompts is None: test_prompts = [ "Explain quantum computing in simple terms:", "Write a Python function to calculate fibonacci numbers:", "Translate 'Hello, how are you?' to Spanish, French, and German:", "Solve: If x + 5 = 12, what is x?", "Write a haiku about artificial intelligence:" ] results = {} for i, prompt in enumerate(test_prompts): logger.info(f"Testing prompt {i+1}/{len(test_prompts)}") responses = [] for _ in range(num_samples): try: response = self.client.complete( prompt=prompt, max_tokens=512, temperature=0.7 ) responses.append(response) except Exception as e: logger.error(f"Generation failed: {e}") if responses: results[f"prompt_{i+1}"] = { 'prompt': prompt[:50] + "...", 'num_responses': len(responses), 'avg_length': statistics.mean([len(r) for r in responses]), 'sample_response': responses[0][:200] + "..." } return results def run_all_benchmarks(self, quick_mode: bool = False) -> Dict: """ Run all benchmark suites Args: quick_mode: Run faster with fewer samples Returns: Complete benchmark results """ logger.info("Starting comprehensive benchmark suite...") if quick_mode: logger.info("Running in quick mode (fewer samples)") # Latency benchmark logger.info("\n=== Latency Benchmark ===") self.results['tests']['latency'] = self.benchmark_latency( num_requests=20 if quick_mode else 100, prompt_lengths=[128, 512] if quick_mode else [128, 512, 2048] ) # Throughput benchmark logger.info("\n=== Throughput Benchmark ===") self.results['tests']['throughput'] = self.benchmark_throughput( duration_seconds=30 if quick_mode else 60, concurrent_requests=5 if quick_mode else 10 ) # Context length benchmark logger.info("\n=== Context Length Benchmark ===") self.results['tests']['context_length'] = self.benchmark_context_length( context_lengths=[1024, 4096] if quick_mode else [1024, 4096, 16384], num_samples=5 if quick_mode else 10 ) # Generation quality logger.info("\n=== Generation Quality Benchmark ===") self.results['tests']['generation_quality'] = self.benchmark_generation_quality( num_samples=2 if quick_mode else 5 ) return self.results def save_results(self, filename: Optional[str] = None): """ Save benchmark results to file Args: filename: Output filename """ if filename is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"benchmark_{timestamp}.json" output_path = self.output_dir / filename with open(output_path, 'w') as f: json.dump(self.results, f, indent=2) logger.info(f"Results saved to {output_path}") def print_summary(self): """Print benchmark summary""" logger.info("\n" + "="*60) logger.info("BENCHMARK SUMMARY") logger.info("="*60) if 'latency' in self.results['tests']: logger.info("\nLatency Results:") for prompt_type, metrics in self.results['tests']['latency'].items(): logger.info(f" {prompt_type}:") logger.info(f" Mean: {metrics['mean_ms']:.2f}ms") logger.info(f" P95: {metrics['p95_ms']:.2f}ms") logger.info(f" P99: {metrics['p99_ms']:.2f}ms") if 'throughput' in self.results['tests']: logger.info("\nThroughput Results:") metrics = self.results['tests']['throughput'] logger.info(f" Requests/sec: {metrics['requests_per_second']:.2f}") logger.info(f" Tokens/sec: {metrics['tokens_per_second']:.2f}") logger.info(f" Avg Latency: {metrics['avg_latency_ms']:.2f}ms") logger.info("\n" + "="*60) def main(): """Main entry point""" parser = argparse.ArgumentParser(description="Helion Benchmark Runner") parser.add_argument("--base-url", type=str, default="http://localhost:8000") parser.add_argument("--output-dir", type=str, default="./benchmark_results") parser.add_argument("--quick", action="store_true", help="Run quick benchmark") parser.add_argument("--test", type=str, choices=['latency', 'throughput', 'context', 'quality', 'all'], default='all', help="Specific test to run") args = parser.parse_args() runner = BenchmarkRunner( base_url=args.base_url, output_dir=args.output_dir ) if args.test == 'all': results = runner.run_all_benchmarks(quick_mode=args.quick) elif args.test == 'latency': results = runner.benchmark_latency(num_requests=20 if args.quick else 100) elif args.test == 'throughput': results = runner.benchmark_throughput(duration_seconds=30 if args.quick else 60) elif args.test == 'context': results = runner.benchmark_context_length(num_samples=5 if args.quick else 10) elif args.test == 'quality': results = runner.benchmark_generation_quality(num_samples=2 if args.quick else 5) runner.save_results() runner.print_summary() if __name__ == "__main__": main()