# CalejoControl/monitoring/alert_rules.yml

# Prometheus alerting rules for the Calejo Control Adapter.
#
# One rule group, organised by concern: application health, database,
# safety, performance/connectivity, process resources and optimization.
# Every rule carries a `severity` label (critical or warning) plus
# `summary` / `description` annotations. Where a `for:` duration is given,
# the expression must hold continuously for that long before the alert fires.
groups:
  - name: calejo_control_adapter
    rules:
      # Application health alerts

      # The scrape of job "calejo-control-adapter" has been failing for 1m.
      # NOTE(review): `up == 0` does not match when the target is missing
      # from service discovery altogether; consider adding an `absent(...)`
      # rule if that can happen in this deployment - confirm.
      - alert: CalejoApplicationDown
        expr: up{job="calejo-control-adapter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Calejo Control Adapter is down"
          description: "The Calejo Control Adapter application has been down for more than 1 minute."

      # Any calejo_health_check_status series has been at 0 for 2m.
      - alert: CalejoHealthCheckFailing
        expr: calejo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Calejo health check failing"
          description: "One or more health checks have been failing for 2 minutes."

      # Database alerts

      # More than 8 active connections, sustained for 5m.
      # NOTE(review): the threshold presumably relates to the configured
      # pool size - verify against the adapter's database settings.
      - alert: DatabaseConnectionHigh
        expr: calejo_db_connections_active > 8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database connections are consistently high ({{ $value }} active connections)."

      # Mean query duration over the last 5m (rate of _sum divided by rate
      # of _count) above 1 second, sustained for 2m. The expression is
      # folded over two lines only to keep the line length down; the
      # resulting string is unchanged.
      - alert: DatabaseQuerySlow
        expr: >-
          rate(calejo_db_query_duration_seconds_sum[5m])
          / rate(calejo_db_query_duration_seconds_count[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries"
          description: "Average database query time is above 1 second."

      # Safety alerts

      # The safety-violation counter increased within the last 5m.
      # There is deliberately no `for:` here, so the alert fires on the
      # first evaluation in which the expression matches.
      - alert: SafetyViolationDetected
        expr: increase(calejo_safety_violations_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Safety violation detected"
          description: "{{ $value }} safety violations detected in the last 5 minutes."

      # At least one emergency stop has been active for 1m.
      - alert: EmergencyStopActive
        expr: calejo_emergency_stops_active > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Emergency stop active"
          description: "Emergency stop is active for {{ $value }} pump(s)."

      # Performance alerts

      # REST API request rate above 100 requests/second for 2m.
      # NOTE(review): the comparison is applied per time series; if the
      # counter carries labels (method, path, ...), an overall rate would
      # need `sum(...)` around the `rate(...)` - confirm intent.
      - alert: HighAPIRequestRate
        expr: rate(calejo_rest_api_requests_total[5m]) > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High API request rate"
          description: "API request rate is high ({{ $value }} requests/second)."

      # calejo_opcua_connections has been 0 for 3m.
      - alert: OPCUAConnectionDrop
        expr: calejo_opcua_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No OPC UA connections"
          description: "No active OPC UA connections for 3 minutes."

      # calejo_modbus_connections has been 0 for 3m.
      - alert: ModbusConnectionDrop
        expr: calejo_modbus_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No Modbus connections"
          description: "No active Modbus connections for 3 minutes."

      # Resource alerts

      # Resident memory of the adapter process above 1.5e9 bytes for 5m.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="calejo-control-adapter"} > 1.5e9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Application memory usage is high ({{ $value }} bytes)."

      # CPU seconds consumed per second, times 100, above 80 for 5m.
      # The value is a percentage of a single core, so a multi-threaded
      # process can legitimately exceed 100.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total{job="calejo-control-adapter"}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Application CPU usage is high ({{ $value }}%)."

      # Optimization alerts

      # The optimization-run counter has not increased in any 10m window
      # for 15m. Despite the alert name, this detects an absence of runs,
      # not a run that reported failure.
      - alert: OptimizationRunFailed
        expr: increase(calejo_optimization_runs_total[10m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No optimization runs"
          description: "No optimization runs completed in the last 15 minutes."

      # Reported optimization duration above 300 seconds for 2m.
      # NOTE(review): this assumes calejo_optimization_duration_seconds is
      # exposed as a plain gauge; a histogram or summary would only expose
      # _sum/_count/_bucket series and this rule would never match - confirm.
      - alert: LongOptimizationDuration
        expr: calejo_optimization_duration_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Long optimization duration"
          description: "Optimization runs are taking longer than 5 minutes."