---
# Prometheus alerting rules for the Calejo Control Adapter.
#
# A single rule group; every rule carries a `severity` label of either
# `warning` or `critical`. Rules with a `for:` clause only fire after the
# expression has been continuously true for that long.
groups:
  - name: calejo_control_adapter
    rules:
      # Application health alerts

      # `up` is 0 when the most recent scrape of the target failed.
      # NOTE(review): this cannot fire if the target disappears from service
      # discovery altogether (no `up` series exists then); consider a
      # companion rule using absent() if that case matters.
      - alert: CalejoApplicationDown
        expr: up{job="calejo-control-adapter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Calejo Control Adapter is down"
          description: "The Calejo Control Adapter application has been down for more than 1 minute."

      # Fires per series whose health-check status has been 0 for 2 minutes.
      - alert: CalejoHealthCheckFailing
        expr: calejo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Calejo health check failing"
          description: "One or more health checks have been failing for 2 minutes."

      # Database alerts

      # More than 8 active connections for 5 minutes.
      # NOTE(review): the threshold presumably relates to the configured pool
      # size - confirm against the application's database settings.
      - alert: DatabaseConnectionHigh
        expr: calejo_db_connections_active > 8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database connections are consistently high ({{ $value }} active connections)."

      # Mean query duration over the last 5 minutes: the rate of the summed
      # durations divided by the rate of the query count, compared to 1 second.
      # The folded scalar below parses to a single-line expression.
      - alert: DatabaseQuerySlow
        expr: >-
          rate(calejo_db_query_duration_seconds_sum[5m])
          / rate(calejo_db_query_duration_seconds_count[5m])
          > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries"
          description: "Average database query time is above 1 second."

      # Safety alerts

      # No `for:` clause: fires on the first evaluation in which the counter
      # increased within the trailing 5-minute window. increase() extrapolates,
      # so the reported value may be fractional.
      - alert: SafetyViolationDetected
        expr: increase(calejo_safety_violations_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Safety violation detected"
          description: "{{ $value }} safety violations detected in the last 5 minutes."

      # At least one emergency stop has been active for a full minute.
      - alert: EmergencyStopActive
        expr: calejo_emergency_stops_active > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Emergency stop active"
          description: "Emergency stop is active for {{ $value }} pump(s)."

      # Performance alerts

      # Compared per series: if the counter carries labels, each label
      # combination is checked against 100 requests/second on its own.
      # NOTE(review): wrap in sum() if an overall request rate is intended.
      - alert: HighAPIRequestRate
        expr: rate(calejo_rest_api_requests_total[5m]) > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High API request rate"
          description: "API request rate is high ({{ $value }} requests/second)."

      # Connection gauge has read exactly 0 for 3 minutes.
      - alert: OPCUAConnectionDrop
        expr: calejo_opcua_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No OPC UA connections"
          description: "No active OPC UA connections for 3 minutes."

      # Connection gauge has read exactly 0 for 3 minutes.
      - alert: ModbusConnectionDrop
        expr: calejo_modbus_connections == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "No Modbus connections"
          description: "No active Modbus connections for 3 minutes."

      # Resource alerts

      # Resident memory above 1.5e9 bytes (1.5 GB) for 5 minutes.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="calejo-control-adapter"} > 1.5e9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Application memory usage is high ({{ $value }} bytes)."

      # CPU seconds consumed per second, scaled to a percentage of one core;
      # a multi-threaded process can therefore exceed 100.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total{job="calejo-control-adapter"}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Application CPU usage is high ({{ $value }}%)."

      # Optimization alerts

      # Despite the alert name, this detects a stalled run counter rather than
      # failed runs: the counter showed no increase over a 10-minute window on
      # every evaluation for 15 minutes. It does not fire while the series is
      # absent (increase() then returns no samples).
      - alert: OptimizationRunFailed
        expr: increase(calejo_optimization_runs_total[10m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No optimization runs"
          description: "No optimization runs completed in the last 15 minutes."

      # Reported duration above 300 seconds for 2 minutes.
      # NOTE(review): presumably a gauge holding the latest run's duration -
      # confirm the metric type in the exporter.
      - alert: LongOptimizationDuration
        expr: calejo_optimization_duration_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Long optimization duration"
          description: "Optimization runs are taking longer than 5 minutes."