# Listing metadata (from the export this file was taken from): 124 lines, 4.1 KiB, YAML.
---
# Prometheus alerting rules for the Calejo Control Adapter.
#
# Each rule fires once `expr` has returned at least one sample on every
# evaluation for the whole `for` duration; a rule without `for` fires on the
# first evaluation that returns a sample. `{{ $value }}` in an annotation is
# replaced by the value of the sample that triggered the alert.
groups:
  - name: calejo_control_adapter
    rules:
      # Application health alerts

      # The scrape target of job "calejo-control-adapter" is unreachable.
      - alert: CalejoApplicationDown
        expr: up{job="calejo-control-adapter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Calejo Control Adapter is down"
          description: "The Calejo Control Adapter application has been down for more than 1 minute."

      # The application-reported health status has been 0 for 2 minutes.
      - alert: CalejoHealthCheckFailing
        expr: calejo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Calejo health check failing"
          description: "One or more health checks have been failing for 2 minutes."

      # Database alerts

      # More than 8 active connections sustained for 5 minutes.
      # NOTE(review): the threshold of 8 presumably relates to the configured
      # connection-pool size - confirm against the application settings.
      - alert: DatabaseConnectionHigh
        expr: calejo_db_connections_active > 8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database connections are consistently high ({{ $value }} active connections)."

      # Mean query duration over the last 5 minutes (rate of the duration sum
      # divided by rate of the query count) exceeds 1 second. With no queries
      # in the window the division yields NaN, which never satisfies `> 1`.
      - alert: DatabaseQuerySlow
        expr: rate(calejo_db_query_duration_seconds_sum[5m]) / rate(calejo_db_query_duration_seconds_count[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries"
          description: "Average database query time is above 1 second."

      # Safety alerts

      # The safety-violation counter grew within the last 5 minutes. There is
      # no `for` clause, so the alert fires on the first matching evaluation.
      # increase() extrapolates, so {{ $value }} may be a non-integer.
      - alert: SafetyViolationDetected
        expr: increase(calejo_safety_violations_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Safety violation detected"
          description: "{{ $value }} safety violations detected in the last 5 minutes."

      # At least one emergency stop has been active for 1 minute.
      - alert: EmergencyStopActive
        expr: calejo_emergency_stops_active > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Emergency stop active"
          description: "Emergency stop is active for {{ $value }} pump(s)."

      # Performance alerts

      # Request rate above 100 requests/second (5-minute average) for 2 minutes.
      # NOTE(review): rate() is evaluated per time series; if the counter
      # carries labels (e.g. per endpoint), the threshold applies to each
      # series separately rather than to the total - confirm this is intended.
      - alert: HighAPIRequestRate
        expr: rate(calejo_rest_api_requests_total[5m]) > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High API request rate"
          description: "API request rate is high ({{ $value }} requests/second)."

      # The OPC UA connection gauge has read 0 for 3 minutes.
      - alert: OPCUAConnectionDrop
        expr: calejo_opcua_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No OPC UA connections"
          description: "No active OPC UA connections for 3 minutes."

      # The Modbus connection gauge has read 0 for 3 minutes.
      - alert: ModbusConnectionDrop
        expr: calejo_modbus_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No Modbus connections"
          description: "No active Modbus connections for 3 minutes."

      # Resource alerts

      # Resident memory above 1.5e9 bytes (1.5 GB) for 5 minutes.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="calejo-control-adapter"} > 1.5e9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Application memory usage is high ({{ $value }} bytes)."

      # CPU seconds consumed per second, scaled to a percentage of one core.
      # The value can exceed 100 when the process uses more than one core.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total{job="calejo-control-adapter"}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Application CPU usage is high ({{ $value }}%)."

      # Optimization alerts

      # The optimization-run counter has not increased. The 10m range combined
      # with `for: 15m` means the alert fires only after roughly 25 minutes
      # without a completed run; the rule detects missing runs, not failed ones.
      # NOTE(review): returns no samples (and so never fires) if the counter
      # series does not exist - confirm the metric is always exported.
      - alert: OptimizationRunFailed
        expr: increase(calejo_optimization_runs_total[10m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No optimization runs"
          description: "No optimization runs completed in the last 15 minutes."

      # Reported optimization duration above 300 seconds for 2 minutes.
      # NOTE(review): assumes calejo_optimization_duration_seconds is a gauge;
      # a histogram or summary exposes only _sum/_count/_bucket series and this
      # selector would match nothing - confirm the metric type.
      - alert: LongOptimizationDuration
        expr: calejo_optimization_duration_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Long optimization duration"
          description: "Optimization runs are taking longer than 5 minutes."