{
  "performance_profiles": {
    "max_throughput": {
      "description": "Optimized for maximum throughput with batching",
      "use_case": "High-volume production serving",
      "settings": {
        "batch_size": 32,
        "max_batch_total_tokens": 8192,
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.95,
        "max_num_seqs": 256,
        "max_num_batched_tokens": 8192,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true,
        "max_prefill_tokens": 4096
      },
      "expected_performance": {
        "throughput_tokens_per_second": "80-120",
        "latency_p50_ms": "200-400",
        "latency_p95_ms": "400-800",
        "concurrent_requests": "64-128"
      }
    },
    "low_latency": {
      "description": "Optimized for lowest latency with small batches",
      "use_case": "Interactive applications, real-time responses",
      "settings": {
        "batch_size": 1,
        "max_batch_total_tokens": 4096,
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.90,
        "max_num_seqs": 32,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": false,
        "enable_chunked_prefill": false,
        "use_flash_attention": true
      },
      "expected_performance": {
        "throughput_tokens_per_second": "30-50",
        "latency_p50_ms": "80-150",
        "latency_p95_ms": "150-300",
        "concurrent_requests": "8-16"
      }
    },
    "balanced": {
      "description": "Balanced configuration for general use",
      "use_case": "General purpose inference",
      "settings": {
        "batch_size": 8,
        "max_batch_total_tokens": 4096,
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.90,
        "max_num_seqs": 64,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true
      },
      "expected_performance": {
        "throughput_tokens_per_second": "50-80",
        "latency_p50_ms": "150-250",
        "latency_p95_ms": "250-500",
        "concurrent_requests": "32-64"
      }
    },
    "memory_efficient": {
      "description": "Optimized for lower memory usage",
      "use_case": "Limited GPU memory, smaller deployments",
      "settings": {
        "batch_size": 4,
        "max_batch_total_tokens": 2048,
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.80,
        "max_num_seqs": 32,
        "max_num_batched_tokens": 2048,
        "enable_prefix_caching": false,
        "enable_chunked_prefill": false,
        "swap_space": 8
      },
      "expected_performance": {
        "throughput_tokens_per_second": "20-40",
        "latency_p50_ms": "200-350",
        "latency_p95_ms": "350-600",
        "concurrent_requests": "16-32"
      }
    }
  },
  "hardware_optimizations": {
    "nvidia_a100": {
      "recommended_profile": "max_throughput",
      "gpu_count": 2,
      "optimizations": [
        "Enable Flash Attention 2",
        "Use tensor parallelism",
        "Enable prefix caching",
        "Optimize batch sizes"
      ],
      "settings": {
        "tensor_parallel_size": 2,
        "gpu_memory_utilization": 0.95,
        "enable_cuda_graph": true
      }
    },
    "nvidia_h100": {
      "recommended_profile": "max_throughput",
      "gpu_count": 4,
      "optimizations": [
        "Enable FP8 computation",
        "Use larger batch sizes",
        "Enable advanced caching",
        "Utilize higher memory bandwidth"
      ],
      "settings": {
        "tensor_parallel_size": 4,
        "gpu_memory_utilization": 0.95,
        "enable_cuda_graph": true,
        "max_batch_size": 64
      }
    },
    "nvidia_v100": {
      "recommended_profile": "memory_efficient",
      "gpu_count": 4,
      "optimizations": [
        "Reduce batch sizes",
        "Enable memory swapping",
        "Use gradient checkpointing",
        "Optimize tensor parallelism"
      ],
      "settings": {
        "tensor_parallel_size": 4,
        "gpu_memory_utilization": 0.85,
        "swap_space": 16,
        "max_batch_size": 8
      }
    }
  },
  "context_length_optimizations": {
    "short_context": {
      "description": "Optimized for contexts under 4K tokens",
      "max_tokens": 4096,
      "settings": {
        "max_model_len": 4096,
        "block_size": 16,
        "enable_prefix_caching": false
      },
      "throughput_multiplier": 2.0
    },
    "medium_context": {
      "description": "Optimized for contexts 4K-32K tokens",
      "max_tokens": 32768,
      "settings": {
        "max_model_len": 32768,
        "block_size": 32,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true
      },
      "throughput_multiplier": 1.0
    },
    "long_context": {
      "description": "Optimized for contexts 32K-131K tokens",
      "max_tokens": 131072,
      "settings": {
        "max_model_len": 131072,
        "block_size": 64,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true,
        "max_num_batched_tokens": 4096
      },
      "throughput_multiplier": 0.5
    }
  },
  "workload_patterns": {
    "batch_processing": {
      "description": "Offline batch processing workloads",
      "characteristics": {
        "latency_sensitive": false,
        "throughput_priority": "high",
        "batch_sizes": "large"
      },
      "recommended_settings": {
        "profile": "max_throughput",
        "batch_size": 32,
        "concurrent_requests": 128,
        "enable_async": true
      }
    },
    "interactive": {
      "description": "Real-time interactive applications",
      "characteristics": {
        "latency_sensitive": true,
        "throughput_priority": "medium",
        "batch_sizes": "small"
      },
      "recommended_settings": {
        "profile": "low_latency",
        "batch_size": 1,
        "concurrent_requests": 16,
        "enable_streaming": true
      }
    },
    "api_serving": {
      "description": "Production API serving",
      "characteristics": {
        "latency_sensitive": true,
        "throughput_priority": "high",
        "batch_sizes": "medium"
      },
      "recommended_settings": {
        "profile": "balanced",
        "batch_size": 8,
        "concurrent_requests": 64,
        "enable_auto_scaling": true
      }
    }
  },
  "monitoring_metrics": {
    "critical": [
      "requests_per_second",
      "tokens_per_second",
      "p95_latency_ms",
      "error_rate",
      "gpu_memory_utilization"
    ],
    "important": [
      "p50_latency_ms",
      "p99_latency_ms",
      "queue_depth",
      "cache_hit_rate",
      "active_requests"
    ],
    "optional": [
      "gpu_temperature",
      "power_usage",
      "batch_size_distribution",
      "context_length_distribution"
    ]
  },
  "auto_tuning": {
    "enabled": false,
    "parameters": [
      "batch_size",
      "tensor_parallel_size",
      "gpu_memory_utilization"
    ],
    "optimization_goal": "maximize_throughput",
    "constraints": {
      "max_latency_ms": 1000,
      "min_throughput_tps": 30
    },
    "tuning_duration_minutes": 30
  },
  "troubleshooting": {
    "high_latency": {
      "possible_causes": [
        "Large batch sizes",
        "Long context lengths",
        "Insufficient GPU memory",
        "Network bottlenecks"
      ],
      "solutions": [
        "Reduce batch size",
        "Enable prefix caching",
        "Increase tensor parallelism",
        "Optimize network configuration"
      ]
    },
    "low_throughput": {
      "possible_causes": [
        "Small batch sizes",
        "Underutilized GPUs",
        "Disabled optimizations",
        "Suboptimal parallelism"
      ],
      "solutions": [
        "Increase batch size",
        "Enable chunked prefill",
        "Adjust tensor parallelism",
        "Enable prefix caching"
      ]
    },
    "out_of_memory": {
      "possible_causes": [
        "Batch size too large",
        "Context length too long",
        "GPU memory fragmentation",
        "Insufficient tensor parallelism"
      ],
      "solutions": [
        "Reduce batch size",
        "Increase tensor parallelism",
        "Reduce max_model_len",
        "Enable memory swapping"
      ]
    }
  }
}