Helion-V2.5-Rnd / performance_config.json
{
  "performance_profiles": {
    "max_throughput": {
      "description": "Optimized for maximum throughput with batching",
      "use_case": "High-volume production serving",
      "settings": {
        "batch_size": 32,
        "max_batch_total_tokens": 8192,
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.95,
        "max_num_seqs": 256,
        "max_num_batched_tokens": 8192,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true,
        "max_prefill_tokens": 4096
      },
      "expected_performance": {
        "throughput_tokens_per_second": "80-120",
        "latency_p50_ms": "200-400",
        "latency_p95_ms": "400-800",
        "concurrent_requests": "64-128"
      }
    },
    "low_latency": {
      "description": "Optimized for lowest latency with small batches",
      "use_case": "Interactive applications, real-time responses",
      "settings": {
        "batch_size": 1,
        "max_batch_total_tokens": 4096,
        "tensor_parallel_size": 4,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.90,
        "max_num_seqs": 32,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": false,
        "enable_chunked_prefill": false,
        "use_flash_attention": true
      },
      "expected_performance": {
        "throughput_tokens_per_second": "30-50",
        "latency_p50_ms": "80-150",
        "latency_p95_ms": "150-300",
        "concurrent_requests": "8-16"
      }
    },
    "balanced": {
      "description": "Balanced configuration for general use",
      "use_case": "General-purpose inference",
      "settings": {
        "batch_size": 8,
        "max_batch_total_tokens": 4096,
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.90,
        "max_num_seqs": 64,
        "max_num_batched_tokens": 4096,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true
      },
      "expected_performance": {
        "throughput_tokens_per_second": "50-80",
        "latency_p50_ms": "150-250",
        "latency_p95_ms": "250-500",
        "concurrent_requests": "32-64"
      }
    },
    "memory_efficient": {
      "description": "Optimized for lower memory usage",
      "use_case": "Limited GPU memory, smaller deployments",
      "settings": {
        "batch_size": 4,
        "max_batch_total_tokens": 2048,
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 1,
        "gpu_memory_utilization": 0.80,
        "max_num_seqs": 32,
        "max_num_batched_tokens": 2048,
        "enable_prefix_caching": false,
        "enable_chunked_prefill": false,
        "swap_space": 8
      },
      "expected_performance": {
        "throughput_tokens_per_second": "20-40",
        "latency_p50_ms": "200-350",
        "latency_p95_ms": "350-600",
        "concurrent_requests": "16-32"
      }
    }
  },
  "hardware_optimizations": {
    "nvidia_a100": {
      "recommended_profile": "max_throughput",
      "gpu_count": 2,
      "optimizations": [
        "Enable Flash Attention 2",
        "Use tensor parallelism",
        "Enable prefix caching",
        "Optimize batch sizes"
      ],
      "settings": {
        "tensor_parallel_size": 2,
        "gpu_memory_utilization": 0.95,
        "enable_cuda_graph": true
      }
    },
    "nvidia_h100": {
      "recommended_profile": "max_throughput",
      "gpu_count": 4,
      "optimizations": [
        "Enable FP8 computation",
        "Use larger batch sizes",
        "Enable advanced caching",
        "Utilize higher memory bandwidth"
      ],
      "settings": {
        "tensor_parallel_size": 4,
        "gpu_memory_utilization": 0.95,
        "enable_cuda_graph": true,
        "max_batch_size": 64
      }
    },
    "nvidia_v100": {
      "recommended_profile": "memory_efficient",
      "gpu_count": 4,
      "optimizations": [
        "Reduce batch sizes",
        "Enable memory swapping",
        "Use gradient checkpointing",
        "Optimize tensor parallelism"
      ],
      "settings": {
        "tensor_parallel_size": 4,
        "gpu_memory_utilization": 0.85,
        "swap_space": 16,
        "max_batch_size": 8
      }
    }
  },
  "context_length_optimizations": {
    "short_context": {
      "description": "Optimized for contexts under 4K tokens",
      "max_tokens": 4096,
      "settings": {
        "max_model_len": 4096,
        "block_size": 16,
        "enable_prefix_caching": false
      },
      "throughput_multiplier": 2.0
    },
    "medium_context": {
      "description": "Optimized for contexts of 4K-32K tokens",
      "max_tokens": 32768,
      "settings": {
        "max_model_len": 32768,
        "block_size": 32,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true
      },
      "throughput_multiplier": 1.0
    },
    "long_context": {
      "description": "Optimized for contexts of 32K-131K tokens",
      "max_tokens": 131072,
      "settings": {
        "max_model_len": 131072,
        "block_size": 64,
        "enable_prefix_caching": true,
        "enable_chunked_prefill": true,
        "max_num_batched_tokens": 4096
      },
      "throughput_multiplier": 0.5
    }
  },
  "workload_patterns": {
    "batch_processing": {
      "description": "Offline batch processing workloads",
      "characteristics": {
        "latency_sensitive": false,
        "throughput_priority": "high",
        "batch_sizes": "large"
      },
      "recommended_settings": {
        "profile": "max_throughput",
        "batch_size": 32,
        "concurrent_requests": 128,
        "enable_async": true
      }
    },
    "interactive": {
      "description": "Real-time interactive applications",
      "characteristics": {
        "latency_sensitive": true,
        "throughput_priority": "medium",
        "batch_sizes": "small"
      },
      "recommended_settings": {
        "profile": "low_latency",
        "batch_size": 1,
        "concurrent_requests": 16,
        "enable_streaming": true
      }
    },
    "api_serving": {
      "description": "Production API serving",
      "characteristics": {
        "latency_sensitive": true,
        "throughput_priority": "high",
        "batch_sizes": "medium"
      },
      "recommended_settings": {
        "profile": "balanced",
        "batch_size": 8,
        "concurrent_requests": 64,
        "enable_auto_scaling": true
      }
    }
  },
  "monitoring_metrics": {
    "critical": [
      "requests_per_second",
      "tokens_per_second",
      "p95_latency_ms",
      "error_rate",
      "gpu_memory_utilization"
    ],
    "important": [
      "p50_latency_ms",
      "p99_latency_ms",
      "queue_depth",
      "cache_hit_rate",
      "active_requests"
    ],
    "optional": [
      "gpu_temperature",
      "power_usage",
      "batch_size_distribution",
      "context_length_distribution"
    ]
  },
  "auto_tuning": {
    "enabled": false,
    "parameters": [
      "batch_size",
      "tensor_parallel_size",
      "gpu_memory_utilization"
    ],
    "optimization_goal": "maximize_throughput",
    "constraints": {
      "max_latency_ms": 1000,
      "min_throughput_tps": 30
    },
    "tuning_duration_minutes": 30
  },
  "troubleshooting": {
    "high_latency": {
      "possible_causes": [
        "Large batch sizes",
        "Long context lengths",
        "Insufficient GPU memory",
        "Network bottlenecks"
      ],
      "solutions": [
        "Reduce batch size",
        "Enable prefix caching",
        "Increase tensor parallelism",
        "Optimize network configuration"
      ]
    },
    "low_throughput": {
      "possible_causes": [
        "Small batch sizes",
        "Underutilized GPUs",
        "Disabled optimizations",
        "Suboptimal parallelism"
      ],
      "solutions": [
        "Increase batch size",
        "Enable chunked prefill",
        "Adjust tensor parallelism",
        "Enable prefix caching"
      ]
    },
    "out_of_memory": {
      "possible_causes": [
        "Batch size too large",
        "Context length too long",
        "GPU memory fragmentation",
        "Insufficient tensor parallelism"
      ],
      "solutions": [
        "Reduce batch size",
        "Increase tensor parallelism",
        "Reduce max_model_len",
        "Enable memory swapping"
      ]
    }
  }
}
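
Most of the setting names above (`tensor_parallel_size`, `gpu_memory_utilization`, `max_num_seqs`, `max_num_batched_tokens`, `enable_prefix_caching`, `enable_chunked_prefill`, `swap_space`, `max_model_len`, `block_size`) match vLLM engine arguments, so a serving script can load a profile from this file and forward the matching keys. Below is a minimal sketch under that assumption; the key whitelist is ours, not part of the config, and fields such as `batch_size`, `max_batch_total_tokens`, `max_prefill_tokens`, and `use_flash_attention` are treated as server-level hints with no direct vLLM equivalent and are filtered out. The model id is a placeholder.

```python
import json

from vllm import LLM

# Assumed 1:1 mapping between config fields and vLLM engine arguments.
# Any key not listed here (e.g. "batch_size", "use_flash_attention") is a
# scheduler/server hint in this config and is intentionally skipped.
VLLM_ENGINE_KEYS = {
    "tensor_parallel_size",
    "pipeline_parallel_size",
    "gpu_memory_utilization",
    "max_num_seqs",
    "max_num_batched_tokens",
    "enable_prefix_caching",
    "enable_chunked_prefill",
    "swap_space",
}

with open("performance_config.json") as f:
    config = json.load(f)

# Pick a profile by name, e.g. "balanced", "low_latency", "max_throughput".
profile = config["performance_profiles"]["balanced"]["settings"]
engine_kwargs = {k: v for k, v in profile.items() if k in VLLM_ENGINE_KEYS}

# Placeholder model id; substitute the actual repository or local path.
llm = LLM(model="Trouter-Library/Helion-V2.5-Rnd", **engine_kwargs)
outputs = llm.generate(["Hello, world"])
```

The same pattern extends to the other sections: merge `hardware_optimizations[...]["settings"]` or `context_length_optimizations[...]["settings"]` over the profile dict before filtering, so hardware- and context-specific overrides (e.g. the A100's `tensor_parallel_size: 2`) take precedence.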