Trouter-Library committed on
Commit
b7062e4
·
verified ·
1 Parent(s): e9b17f1

Create monitoring_config.json

Browse files
Files changed (1) hide show
  1. monitoring_config.json +193 -0
monitoring_config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "monitoring": {
3
+ "enabled": true,
4
+ "interval_seconds": 15,
5
+ "retention_days": 30
6
+ },
7
+ "metrics": {
8
+ "system": {
9
+ "enabled": true,
10
+ "collect": [
11
+ "cpu_usage",
12
+ "memory_usage",
13
+ "disk_usage",
14
+ "network_io"
15
+ ]
16
+ },
17
+ "gpu": {
18
+ "enabled": true,
19
+ "collect": [
20
+ "gpu_utilization",
21
+ "gpu_memory_used",
22
+ "gpu_memory_total",
23
+ "gpu_temperature",
24
+ "gpu_power_usage"
25
+ ],
26
+ "alert_thresholds": {
27
+ "temperature_celsius": 85,
28
+ "memory_utilization_percent": 95,
29
+ "power_watts": 400
30
+ }
31
+ },
32
+ "model": {
33
+ "enabled": true,
34
+ "collect": [
35
+ "requests_per_second",
36
+ "tokens_per_second",
37
+ "average_latency_ms",
38
+ "p50_latency_ms",
39
+ "p95_latency_ms",
40
+ "p99_latency_ms",
41
+ "error_rate",
42
+ "active_connections",
43
+ "queue_depth"
44
+ ]
45
+ },
46
+ "inference": {
47
+ "enabled": true,
48
+ "collect": [
49
+ "prompt_tokens",
50
+ "completion_tokens",
51
+ "total_tokens",
52
+ "generation_time_ms",
53
+ "preprocessing_time_ms",
54
+ "postprocessing_time_ms"
55
+ ]
56
+ }
57
+ },
58
+ "alerts": {
59
+ "enabled": true,
60
+ "channels": [
61
+ "email",
62
+ "slack",
63
+ "pagerduty"
64
+ ],
65
+ "rules": [
66
+ {
67
+ "name": "high_error_rate",
68
+ "condition": "error_rate > 0.05",
69
+ "duration_seconds": 300,
70
+ "severity": "critical",
71
+ "message": "Error rate exceeded 5% for 5 minutes"
72
+ },
73
+ {
74
+ "name": "high_latency",
75
+ "condition": "p95_latency_ms > 5000",
76
+ "duration_seconds": 180,
77
+ "severity": "warning",
78
+ "message": "P95 latency exceeded 5 seconds"
79
+ },
80
+ {
81
+ "name": "gpu_temperature_high",
82
+ "condition": "gpu_temperature > 85",
83
+ "duration_seconds": 60,
84
+ "severity": "critical",
85
+ "message": "GPU temperature critically high"
86
+ },
87
+ {
88
+ "name": "memory_pressure",
89
+ "condition": "gpu_memory_used / gpu_memory_total > 0.95",
90
+ "duration_seconds": 300,
91
+ "severity": "warning",
92
+ "message": "GPU memory utilization above 95%"
93
+ },
94
+ {
95
+ "name": "low_throughput",
96
+ "condition": "tokens_per_second < 10",
97
+ "duration_seconds": 600,
98
+ "severity": "warning",
99
+ "message": "Throughput below 10 tokens/second"
100
+ }
101
+ ]
102
+ },
103
+ "logging": {
104
+ "level": "INFO",
105
+ "format": "json",
106
+ "outputs": [
107
+ {
108
+ "type": "file",
109
+ "path": "./logs/monitoring.log",
110
+ "rotation": "daily",
111
+ "retention_days": 30
112
+ },
113
+ {
114
+ "type": "stdout",
115
+ "enabled": true
116
+ },
117
+ {
118
+ "type": "elasticsearch",
119
+ "enabled": false,
120
+ "host": "localhost:9200",
121
+ "index": "helion-metrics"
122
+ }
123
+ ]
124
+ },
125
+ "prometheus": {
126
+ "enabled": true,
127
+ "port": 8001,
128
+ "path": "/metrics",
129
+ "namespace": "helion",
130
+ "subsystem": "inference",
131
+ "labels": {
132
+ "model": "Helion-2.5-Rnd",
133
+ "version": "2.5.0-rnd",
134
+ "environment": "production"
135
+ }
136
+ },
137
+ "grafana": {
138
+ "enabled": true,
139
+ "dashboards": [
140
+ {
141
+ "name": "Helion Overview",
142
+ "file": "./monitoring/dashboards/overview.json",
143
+ "refresh": "30s"
144
+ },
145
+ {
146
+ "name": "GPU Metrics",
147
+ "file": "./monitoring/dashboards/gpu.json",
148
+ "refresh": "15s"
149
+ },
150
+ {
151
+ "name": "Inference Performance",
152
+ "file": "./monitoring/dashboards/inference.json",
153
+ "refresh": "30s"
154
+ }
155
+ ]
156
+ },
157
+ "health_checks": {
158
+ "enabled": true,
159
+ "endpoint": "/health",
160
+ "interval_seconds": 30,
161
+ "timeout_seconds": 10,
162
+ "checks": [
163
+ {
164
+ "name": "model_loaded",
165
+ "type": "internal",
166
+ "critical": true
167
+ },
168
+ {
169
+ "name": "gpu_available",
170
+ "type": "internal",
171
+ "critical": true
172
+ },
173
+ {
174
+ "name": "inference_responsive",
175
+ "type": "endpoint",
176
+ "url": "http://localhost:8000/v1/models",
177
+ "critical": false
178
+ }
179
+ ]
180
+ },
181
+ "tracing": {
182
+ "enabled": true,
183
+ "sample_rate": 0.1,
184
+ "exporter": "jaeger",
185
+ "endpoint": "http://localhost:14268/api/traces"
186
+ },
187
+ "profiling": {
188
+ "enabled": false,
189
+ "interval_seconds": 3600,
190
+ "duration_seconds": 300,
191
+ "output_dir": "./profiling"
192
+ }
193
+ }