# File size: 15,052 Bytes
#
# 0574c09

#!/usr/bin/env python3
"""
Helion-2.5-Rnd Benchmark Runner
Comprehensive benchmarking suite for performance testing
"""

import argparse
import json
import logging
import statistics
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np

from inference.client import HelionClient

# Configure the root logger at import time so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by BenchmarkRunner and main() for all progress/summary output.
logger = logging.getLogger(__name__)


class BenchmarkRunner:
    """Run latency, throughput, context-length and quality benchmarks on a Helion model.

    Every request is issued through ``self.client`` (a ``HelionClient`` bound to
    ``base_url``). Each ``benchmark_*`` method returns its own result dict;
    ``run_all_benchmarks`` additionally stores those dicts under
    ``self.results['tests']``, which is what ``save_results`` and
    ``print_summary`` read.

    NOTE(review): the code treats the value returned by ``client.complete()`` as
    a ``str`` (it calls ``.split()``, ``len()`` and slices it) -- confirm against
    ``HelionClient``.
    """

    # Default sweeps are immutable tuples; methods copy them into fresh lists so
    # no mutable object is shared between calls.
    DEFAULT_PROMPT_LENGTHS = (128, 512, 2048)
    DEFAULT_CONTEXT_LENGTHS = (1024, 4096, 16384, 65536)

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        output_dir: str = "./benchmark_results"
    ):
        """
        Initialize benchmark runner

        Creates the client, makes sure the output directory exists and starts
        an empty result record stamped with the current local time.

        Args:
            base_url: API base URL
            output_dir: Directory for results (created if missing)
        """
        self.client = HelionClient(base_url=base_url)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.results = {
            'timestamp': datetime.now().isoformat(),
            'base_url': base_url,
            'tests': {}
        }

    @staticmethod
    def _repeat_text(unit: str, target_chars: int) -> str:
        """Repeat ``unit`` a whole number of times up to ``target_chars`` characters.

        At least one repetition is always returned so that a small
        ``target_chars`` never yields an empty prompt.
        """
        return unit * max(1, target_chars // len(unit))

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Cut ``text`` to ``limit`` characters, appending "..." only when something was cut."""
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    def benchmark_latency(
        self,
        num_requests: int = 100,
        prompt_lengths: Optional[List[int]] = None,
        max_tokens: int = 256
    ) -> Dict:
        """
        Benchmark inference latency

        Requests are sent one at a time; each is timed end to end on the
        client side. Failed requests are logged and excluded from the
        statistics.

        Args:
            num_requests: Number of requests per test
            prompt_lengths: Approximate prompt sizes, in characters, to test
                (defaults to 128, 512 and 2048)
            max_tokens: Maximum tokens to generate

        Returns:
            Dict keyed ``"prompt_<len>"`` with sample count, mean, median,
            standard deviation, min, max and p50/p90/p95/p99 latencies in
            milliseconds. Lengths for which every request failed are omitted.
        """
        logger.info("Running latency benchmark...")

        if prompt_lengths is None:
            prompt_lengths = list(self.DEFAULT_PROMPT_LENGTHS)

        results = {}

        for prompt_len in prompt_lengths:
            logger.info(f"Testing prompt length: {prompt_len}")

            # Filler prompt made of whole 13-character sentences.
            test_prompt = self._repeat_text("Hello world. ", prompt_len)

            latencies = []

            for i in range(num_requests):
                try:
                    # perf_counter is monotonic, so the interval cannot be
                    # skewed by wall-clock adjustments during the request.
                    start_time = time.perf_counter()
                    self.client.complete(
                        prompt=test_prompt,
                        max_tokens=max_tokens,
                        temperature=0.7,
                        stream=False
                    )
                    latencies.append((time.perf_counter() - start_time) * 1000)  # ms
                except Exception as e:
                    # Best effort: one bad request must not abort the sweep.
                    logger.error(f"Request failed: {e}")
                else:
                    if i % 10 == 0:
                        logger.info(f"  Progress: {i+1}/{num_requests}")

            if latencies:
                results[f"prompt_{prompt_len}"] = {
                    'num_samples': len(latencies),
                    'mean_ms': statistics.mean(latencies),
                    'median_ms': statistics.median(latencies),
                    # stdev needs at least two samples
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0,
                    'min_ms': min(latencies),
                    'max_ms': max(latencies),
                    # float() keeps the result dict free of numpy scalar types
                    'p50_ms': float(np.percentile(latencies, 50)),
                    'p90_ms': float(np.percentile(latencies, 90)),
                    'p95_ms': float(np.percentile(latencies, 95)),
                    'p99_ms': float(np.percentile(latencies, 99))
                }

        return results

    def benchmark_throughput(
        self,
        duration_seconds: int = 60,
        concurrent_requests: int = 10,
        prompt_length: int = 512,
        max_tokens: int = 128
    ) -> Dict:
        """
        Benchmark throughput with concurrent requests

        Requests are issued in waves of ``concurrent_requests`` on a thread
        pool. The next wave starts only after every request of the current
        wave has finished, and the last wave is allowed to finish past the
        deadline; the reported duration is the time actually elapsed.

        Args:
            duration_seconds: How long to keep starting new waves
            concurrent_requests: Number of concurrent requests per wave
            prompt_length: Approximate prompt size in characters
            max_tokens: Maximum tokens to generate

        Returns:
            Dict with elapsed duration, completed/failed request counts,
            requests and tokens per second, and mean / p95 latency in
            milliseconds. Token counts are approximated by whitespace
            splitting of the response.
        """
        logger.info(f"Running throughput benchmark for {duration_seconds}s...")

        test_prompt = self._repeat_text(
            "The quick brown fox jumps over the lazy dog. ", prompt_length
        )

        start_time = time.perf_counter()
        deadline = start_time + duration_seconds

        completed_requests = 0
        failed_requests = 0
        total_tokens = 0
        latencies = []
        first_error = None

        def make_request():
            # Runs on a worker thread; never raises so future.result() is safe.
            try:
                req_start = time.perf_counter()
                response = self.client.complete(
                    prompt=test_prompt,
                    max_tokens=max_tokens,
                    temperature=0.7
                )
                req_end = time.perf_counter()

                return {
                    'success': True,
                    'latency': req_end - req_start,
                    'tokens': len(response.split())  # Approximate
                }
            except Exception as e:
                return {'success': False, 'error': str(e)}

        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            while time.perf_counter() < deadline:
                futures = [executor.submit(make_request) for _ in range(concurrent_requests)]

                # Counters are only touched here, on the calling thread.
                for future in as_completed(futures):
                    result = future.result()

                    if result['success']:
                        completed_requests += 1
                        latencies.append(result['latency'] * 1000)
                        total_tokens += result.get('tokens', 0)
                    else:
                        failed_requests += 1
                        if first_error is None:
                            first_error = result.get('error')

        actual_duration = time.perf_counter() - start_time

        if failed_requests:
            # Summarise once instead of logging every failure, which could
            # flood the log when the server is unreachable.
            logger.error(f"{failed_requests} requests failed (first error: {first_error})")

        # Guard the rates against a zero elapsed time.
        has_duration = actual_duration > 0

        return {
            'duration_seconds': actual_duration,
            'concurrent_requests': concurrent_requests,
            'completed_requests': completed_requests,
            'failed_requests': failed_requests,
            'requests_per_second': completed_requests / actual_duration if has_duration else 0.0,
            'total_tokens': total_tokens,
            'tokens_per_second': total_tokens / actual_duration if has_duration else 0.0,
            'avg_latency_ms': statistics.mean(latencies) if latencies else 0,
            'p95_latency_ms': float(np.percentile(latencies, 95)) if latencies else 0
        }

    def benchmark_context_length(
        self,
        context_lengths: Optional[List[int]] = None,
        num_samples: int = 10
    ) -> Dict:
        """
        Benchmark performance across different context lengths

        For each length a filler text of exactly that many characters is
        built, followed by a fixed summarisation instruction, and sent
        ``num_samples`` times. Failed requests are logged and skipped.

        Args:
            context_lengths: Context sizes, in characters, to test
                (defaults to 1024, 4096, 16384 and 65536)
            num_samples: Number of samples per length

        Returns:
            Dict keyed ``"context_<len>"`` with mean, median and standard
            deviation of the latency in milliseconds. Lengths for which every
            request failed are omitted.
        """
        logger.info("Running context length benchmark...")

        if context_lengths is None:
            context_lengths = list(self.DEFAULT_CONTEXT_LENGTHS)

        results = {}
        base_text = "This is a test sentence for context length benchmarking. "

        for ctx_len in context_lengths:
            logger.info(f"Testing context length: {ctx_len}")

            # Repeat one time more than fits, then cut, so the context is
            # exactly ctx_len characters rather than rounded down to a whole
            # number of sentences.
            repeats = ctx_len // len(base_text) + 1
            long_prompt = (base_text * repeats)[:ctx_len] + "\n\nSummarize the above text:"

            latencies = []

            for _ in range(num_samples):
                try:
                    start_time = time.perf_counter()
                    self.client.complete(
                        prompt=long_prompt,
                        max_tokens=256,
                        temperature=0.5
                    )
                    latencies.append((time.perf_counter() - start_time) * 1000)
                except Exception as e:
                    logger.error(f"Context length {ctx_len} failed: {e}")

            if latencies:
                results[f"context_{ctx_len}"] = {
                    'mean_latency_ms': statistics.mean(latencies),
                    'median_latency_ms': statistics.median(latencies),
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0
                }

        return results

    def benchmark_generation_quality(
        self,
        test_prompts: Optional[List[str]] = None,
        num_samples: int = 5
    ) -> Dict:
        """
        Benchmark generation quality with diverse prompts

        Only collects responses and simple length statistics; no automatic
        scoring of the generated text is performed.

        Args:
            test_prompts: Custom test prompts (a built-in set of five is used
                when omitted)
            num_samples: Number of samples per prompt type

        Returns:
            Dict keyed ``"prompt_<n>"`` (1-based) with the prompt (cut to 50
            characters), number of responses, average response length in
            characters and the first response (cut to 200 characters).
            Prompts for which every request failed are omitted.
        """
        logger.info("Running generation quality benchmark...")

        if test_prompts is None:
            test_prompts = [
                "Explain quantum computing in simple terms:",
                "Write a Python function to calculate fibonacci numbers:",
                "Translate 'Hello, how are you?' to Spanish, French, and German:",
                "Solve: If x + 5 = 12, what is x?",
                "Write a haiku about artificial intelligence:"
            ]

        results = {}

        for i, prompt in enumerate(test_prompts):
            logger.info(f"Testing prompt {i+1}/{len(test_prompts)}")

            responses = []

            for _ in range(num_samples):
                try:
                    responses.append(
                        self.client.complete(
                            prompt=prompt,
                            max_tokens=512,
                            temperature=0.7
                        )
                    )
                except Exception as e:
                    logger.error(f"Generation failed: {e}")

            if responses:
                results[f"prompt_{i+1}"] = {
                    # Ellipsis is added only when the text was really shortened.
                    'prompt': self._truncate(prompt, 50),
                    'num_responses': len(responses),
                    'avg_length': statistics.mean([len(r) for r in responses]),
                    'sample_response': self._truncate(responses[0], 200)
                }

        return results

    def run_all_benchmarks(self, quick_mode: bool = False) -> Dict:
        """
        Run all benchmark suites

        Runs latency, throughput, context-length and generation-quality
        benchmarks in that order and stores each result under
        ``self.results['tests']``.

        Args:
            quick_mode: Run faster with fewer samples and smaller sweeps

        Returns:
            Complete benchmark results (the same object as ``self.results``)
        """
        logger.info("Starting comprehensive benchmark suite...")

        if quick_mode:
            logger.info("Running in quick mode (fewer samples)")

        # Latency benchmark
        logger.info("\n=== Latency Benchmark ===")
        self.results['tests']['latency'] = self.benchmark_latency(
            num_requests=20 if quick_mode else 100,
            prompt_lengths=[128, 512] if quick_mode else [128, 512, 2048]
        )

        # Throughput benchmark
        logger.info("\n=== Throughput Benchmark ===")
        self.results['tests']['throughput'] = self.benchmark_throughput(
            duration_seconds=30 if quick_mode else 60,
            concurrent_requests=5 if quick_mode else 10
        )

        # Context length benchmark
        logger.info("\n=== Context Length Benchmark ===")
        self.results['tests']['context_length'] = self.benchmark_context_length(
            context_lengths=[1024, 4096] if quick_mode else [1024, 4096, 16384],
            num_samples=5 if quick_mode else 10
        )

        # Generation quality
        logger.info("\n=== Generation Quality Benchmark ===")
        self.results['tests']['generation_quality'] = self.benchmark_generation_quality(
            num_samples=2 if quick_mode else 5
        )

        return self.results

    def save_results(self, filename: Optional[str] = None) -> None:
        """
        Save benchmark results to file

        Writes ``self.results`` as indented JSON inside ``self.output_dir``.

        Args:
            filename: Output filename; defaults to
                ``benchmark_<YYYYmmdd_HHMMSS>.json`` using the current local time
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"benchmark_{timestamp}.json"

        output_path = self.output_dir / filename

        # Explicit encoding so the file does not depend on the platform default.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)

        logger.info(f"Results saved to {output_path}")

    def print_summary(self) -> None:
        """Log a short summary of the latency and throughput results, if present."""
        logger.info("\n" + "="*60)
        logger.info("BENCHMARK SUMMARY")
        logger.info("="*60)

        tests = self.results['tests']

        if 'latency' in tests:
            logger.info("\nLatency Results:")
            for prompt_type, metrics in tests['latency'].items():
                logger.info(f"  {prompt_type}:")
                logger.info(f"    Mean: {metrics['mean_ms']:.2f}ms")
                logger.info(f"    P95: {metrics['p95_ms']:.2f}ms")
                logger.info(f"    P99: {metrics['p99_ms']:.2f}ms")

        if 'throughput' in tests:
            logger.info("\nThroughput Results:")
            metrics = tests['throughput']
            logger.info(f"  Requests/sec: {metrics['requests_per_second']:.2f}")
            logger.info(f"  Tokens/sec: {metrics['tokens_per_second']:.2f}")
            logger.info(f"  Avg Latency: {metrics['avg_latency_ms']:.2f}ms")

        logger.info("\n" + "="*60)


def main():
    """Main entry point

    Parses the command line, runs either the full suite or one selected
    benchmark, then saves the results to a JSON file and logs a summary.

    A single selected benchmark is stored under the same
    ``results['tests']`` key that ``run_all_benchmarks`` uses, so that
    ``save_results`` and ``print_summary`` see it.
    """
    parser = argparse.ArgumentParser(description="Helion Benchmark Runner")
    parser.add_argument("--base-url", type=str, default="http://localhost:8000")
    parser.add_argument("--output-dir", type=str, default="./benchmark_results")
    parser.add_argument("--quick", action="store_true", help="Run quick benchmark")
    parser.add_argument("--test", type=str, choices=['latency', 'throughput', 'context', 'quality', 'all'],
                       default='all', help="Specific test to run")

    args = parser.parse_args()

    runner = BenchmarkRunner(
        base_url=args.base_url,
        output_dir=args.output_dir
    )

    if args.test == 'all':
        # run_all_benchmarks records every result on the runner itself.
        runner.run_all_benchmarks(quick_mode=args.quick)
    else:
        # CLI choice -> (key under results['tests'], deferred benchmark call).
        single_tests = {
            'latency': (
                'latency',
                lambda: runner.benchmark_latency(num_requests=20 if args.quick else 100),
            ),
            'throughput': (
                'throughput',
                lambda: runner.benchmark_throughput(duration_seconds=30 if args.quick else 60),
            ),
            'context': (
                'context_length',
                lambda: runner.benchmark_context_length(num_samples=5 if args.quick else 10),
            ),
            'quality': (
                'generation_quality',
                lambda: runner.benchmark_generation_quality(num_samples=2 if args.quick else 5),
            ),
        }
        result_key, run_benchmark = single_tests[args.test]
        # Without this assignment the saved file and the summary would be empty.
        runner.results['tests'][result_key] = run_benchmark()

    runner.save_results()
    runner.print_summary()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()