"""
Database Watchdog for Calejo Control Adapter.

Monitors database updates and triggers failsafe mode when updates stop,
preventing stale optimization plans from controlling pumps indefinitely.
"""

import asyncio
import structlog
from datetime import datetime, timedelta
from typing import Dict, Optional, Any

from src.database.flexible_client import FlexibleDatabaseClient

logger = structlog.get_logger()


class DatabaseWatchdog:
    """
    Monitors database updates and triggers failsafe mode when updates stop.

    Safety Feature: If optimization system stops updating plans for more than
    20 minutes, automatically revert to default safe setpoints to prevent
    pumps from running on stale optimization plans.

    State is tracked per pump, keyed by ``(station_id, pump_id)``:

    * ``last_update_times`` - timestamp of the most recent plan seen.
    * ``failsafe_active`` - whether failsafe is currently flagged.
    """

    # Setpoint used when a pump configuration has no usable
    # 'default_setpoint_hz' value (missing key or None).
    FALLBACK_SETPOINT_HZ = 30.0

    def __init__(self, db_client: FlexibleDatabaseClient, alert_manager: Any,
                 timeout_seconds: int = 1200):  # 20 minutes default
        """
        Create a watchdog; monitoring does not begin until ``start()``.

        Args:
            db_client: Database client used to read plans / pump config and
                to record failsafe events.
            alert_manager: Kept on the instance as ``self.alert_manager``;
                this class does not call it.
            timeout_seconds: Age (in seconds) beyond which a plan is stale.
        """
        self.db_client = db_client
        self.alert_manager = alert_manager
        self.timeout_seconds = timeout_seconds
        # (station_id, pump_id) -> last_update
        self.last_update_times: Dict[tuple, datetime] = {}
        self.failsafe_active: Dict[tuple, bool] = {}
        self.running = False
        self.check_interval_seconds = 60  # Check every minute
        # Strong reference to the background task: the event loop only keeps
        # weak references, so an unreferenced task may be garbage-collected.
        self._monitor_task: Optional[asyncio.Task] = None

    @staticmethod
    def _seconds_since(moment: datetime) -> float:
        """
        Return the seconds elapsed between ``moment`` and now.

        "Now" is taken with ``moment``'s own tzinfo, so naive timestamps are
        compared against naive local time and aware timestamps against aware
        time. Subtracting a naive from an aware datetime raises TypeError,
        which previously disabled the staleness check.
        """
        return (datetime.now(moment.tzinfo) - moment).total_seconds()

    async def start(self):
        """Start the watchdog monitoring (no-op if it is already running)."""
        if self._monitor_task is not None and not self._monitor_task.done():
            logger.warning("database_watchdog_already_running")
            return

        self.running = True
        logger.info("database_watchdog_started", timeout_seconds=self.timeout_seconds)

        # Initial check
        await self._check_updates()

        # Start periodic monitoring
        self._monitor_task = asyncio.create_task(self._monitor_loop())

    async def stop(self):
        """Stop the watchdog monitoring and wait for the loop task to end."""
        self.running = False

        task, self._monitor_task = self._monitor_task, None
        if task is not None and not task.done() and task is not asyncio.current_task():
            # Cancel so we do not wait out the remaining sleep interval.
            task.cancel()
            # return_exceptions=True collects the task's CancelledError as a
            # result while still letting a cancellation of stop() propagate.
            await asyncio.gather(task, return_exceptions=True)

        logger.info("database_watchdog_stopped")

    async def _monitor_loop(self):
        """Main monitoring loop: sleep one interval, then run a check."""
        while self.running:
            try:
                await asyncio.sleep(self.check_interval_seconds)
                await self._check_updates()
            except Exception as e:
                # CancelledError is a BaseException and is deliberately not
                # caught here, so stop() can cancel the loop.
                logger.error("watchdog_monitor_loop_error", error=str(e))

    async def _check_updates(self):
        """
        Check for recent updates and trigger failsafe if needed.

        Phase 1 refreshes ``last_update_times`` from the latest plans and
        clears failsafe for pumps whose plan is fresh again. Phase 2 evaluates
        staleness for every known pump. Phase 2 runs even if phase 1 failed,
        because a failed database read is itself a case of "updates stopped".
        """
        try:
            # Get latest pump plans to check for recent updates
            latest_plans = self.db_client.get_latest_pump_plans() or []
            for plan in latest_plans:
                await self._ingest_plan(plan)
        except Exception as e:
            logger.error("watchdog_check_updates_failed", error=str(e))

        # Check for stale updates. Iterate over a snapshot since the loop
        # body awaits, and isolate each pump so one bad entry cannot block
        # the check for the others.
        for key, last_update in list(self.last_update_times.items()):
            try:
                await self._evaluate_staleness(key, last_update)
            except Exception as e:
                logger.error(
                    "watchdog_check_updates_failed",
                    station_id=key[0],
                    pump_id=key[1],
                    error=str(e)
                )

    async def _ingest_plan(self, plan: Dict[str, Any]):
        """Record one plan's timestamp and clear failsafe if it is fresh."""
        key = (plan['station_id'], plan['pump_id'])
        plan_updated_at = plan.get('plan_updated_at') or plan.get('plan_created_at')

        if not plan_updated_at:
            # No update time available - treat as no recent update
            self.last_update_times[key] = (
                datetime.now() - timedelta(seconds=self.timeout_seconds + 1)
            )
            return

        # Update last known update time
        self.last_update_times[key] = plan_updated_at

        # Deactivate failsafe only when the plan is actually recent. Clearing
        # it for any timestamped plan made a stale pump flap between
        # deactivated and activated on every check cycle.
        if (self.failsafe_active.get(key, False)
                and self._seconds_since(plan_updated_at) <= self.timeout_seconds):
            await self._deactivate_failsafe(plan['station_id'], plan['pump_id'])

    async def _evaluate_staleness(self, key: tuple, last_update: datetime):
        """Activate failsafe and/or warn for one pump based on its plan age."""
        station_id, pump_id = key
        time_since_update = self._seconds_since(last_update)

        if time_since_update > self.timeout_seconds and not self.failsafe_active.get(key, False):
            # Trigger failsafe mode
            await self._activate_failsafe(station_id, pump_id, time_since_update)

        # Log status for monitoring
        if time_since_update > self.timeout_seconds * 0.8:  # 80% of timeout
            logger.warning(
                "watchdog_update_stale",
                station_id=station_id,
                pump_id=pump_id,
                seconds_since_update=time_since_update,
                timeout_seconds=self.timeout_seconds
            )

    async def _activate_failsafe(self, station_id: str, pump_id: str, time_since_update: float):
        """
        Activate failsafe mode for a pump.

        The failsafe flag is set before the pump configuration is read, so it
        stays set even if the lookup fails or returns nothing. Errors are
        logged, not raised.
        """
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = True

            # Get default setpoint from pump configuration
            pump_config = self.db_client.get_pump(station_id, pump_id)
            if pump_config:
                default_setpoint = pump_config.get('default_setpoint_hz')
                if default_setpoint is None:
                    # Covers both a missing key and an explicit null value.
                    default_setpoint = self.FALLBACK_SETPOINT_HZ

                # Log failsafe activation
                logger.critical(
                    "failsafe_mode_activated",
                    station_id=station_id,
                    pump_id=pump_id,
                    time_since_update_seconds=time_since_update,
                    default_setpoint_hz=default_setpoint
                )

                # Record failsafe event in database
                self._record_failsafe_event(station_id, pump_id, default_setpoint)

                # TODO: In Phase 3, this will trigger the SetpointManager to use default setpoints
                # For now, we just log the event
            else:
                logger.error(
                    "failsafe_activation_failed_no_pump_config",
                    station_id=station_id,
                    pump_id=pump_id
                )
        except Exception as e:
            logger.error(
                "failsafe_activation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    async def _deactivate_failsafe(self, station_id: str, pump_id: str):
        """Deactivate failsafe mode for a pump; errors are logged, not raised."""
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = False

            logger.info(
                "failsafe_mode_deactivated",
                station_id=station_id,
                pump_id=pump_id
            )

            # Record failsafe deactivation in database
            self._record_failsafe_deactivation(station_id, pump_id)
        except Exception as e:
            logger.error(
                "failsafe_deactivation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    def _record_failsafe_event(self, station_id: str, pump_id: str, default_setpoint: float):
        """Record failsafe activation in database (best effort)."""
        try:
            # NOTE(review): unlike the deactivation insert, this row sets no
            # event_type - confirm the schema default before relying on it.
            query = (
                " INSERT INTO failsafe_events"
                " (station_id, pump_id, default_setpoint, timestamp)"
                " VALUES (:station_id, :pump_id, :default_setpoint, :timestamp) "
            )
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'default_setpoint': default_setpoint,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_event", error=str(e))

    def _record_failsafe_deactivation(self, station_id: str, pump_id: str):
        """Record failsafe deactivation in database (best effort)."""
        try:
            query = (
                " INSERT INTO failsafe_events"
                " (station_id, pump_id, event_type, timestamp)"
                " VALUES (:station_id, :pump_id, 'DEACTIVATED', :timestamp) "
            )
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_deactivation", error=str(e))

    def is_failsafe_active(self, station_id: str, pump_id: str) -> bool:
        """Check if failsafe mode is active for a pump (False if unknown)."""
        return self.failsafe_active.get((station_id, pump_id), False)

    def get_last_update_time(self, station_id: str, pump_id: str) -> Optional[datetime]:
        """Get the last known update time for a pump, or None if unknown."""
        return self.last_update_times.get((station_id, pump_id))

    async def activate_failsafe_mode(self, station_id: str, pump_id: str, reason: str):
        """
        Manually activate failsafe mode for testing purposes.

        This method is intended for testing scenarios where failsafe mode
        needs to be triggered manually, rather than waiting for automatic
        detection of stale data.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation",
            station_id=station_id,
            pump_id=pump_id,
            reason=reason
        )
        # Use a large time_since_update to trigger failsafe
        await self._activate_failsafe(station_id, pump_id, self.timeout_seconds + 1)

    async def activate_failsafe_mode_station(self, station_id: str, reason: str):
        """
        Manually activate failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be triggered manually.

        Args:
            station_id: Station identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation_station",
            station_id=station_id,
            reason=reason
        )
        # Get all pumps in the station (tolerate a None result)
        pumps = self.db_client.get_pumps(station_id) or []
        for pump in pumps:
            await self.activate_failsafe_mode(station_id, pump['pump_id'], reason)

    async def clear_failsafe_mode(self, station_id: str, pump_id: str):
        """
        Manually clear failsafe mode for a pump.

        This method is intended for testing scenarios where failsafe mode
        needs to be cleared manually.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
        """
        logger.info(
            "manual_failsafe_clear",
            station_id=station_id,
            pump_id=pump_id
        )
        await self._deactivate_failsafe(station_id, pump_id)

    async def clear_failsafe_mode_station(self, station_id: str):
        """
        Manually clear failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be cleared manually.

        Args:
            station_id: Station identifier
        """
        logger.info(
            "manual_failsafe_clear_station",
            station_id=station_id
        )
        # Get all pumps in the station (tolerate a None result)
        pumps = self.db_client.get_pumps(station_id) or []
        for pump in pumps:
            await self.clear_failsafe_mode(station_id, pump['pump_id'])

    def get_status(self) -> Dict[str, Any]:
        """
        Get watchdog status information.

        Returns:
            A dict with the watchdog settings, the number of monitored pumps,
            the number of pumps with failsafe active, and a per-pump entry
            under 'pump_status' keyed by "<station_id>_<pump_id>".
        """
        status_info = {
            'running': self.running,
            'timeout_seconds': self.timeout_seconds,
            'check_interval_seconds': self.check_interval_seconds,
            'monitored_pumps': len(self.last_update_times),
            'failsafe_active_pumps': sum(self.failsafe_active.values()),
            'pump_status': {}
        }

        for key, last_update in self.last_update_times.items():
            station_id, pump_id = key
            time_since_update = self._seconds_since(last_update)

            if self.timeout_seconds > 0:
                timeout_percentage = min(100, (time_since_update / self.timeout_seconds) * 100)
            else:
                # A non-positive timeout cannot be divided by; report the
                # ceiling instead of raising ZeroDivisionError.
                timeout_percentage = 100

            status_info['pump_status'][f"{station_id}_{pump_id}"] = {
                'last_update': last_update.isoformat(),
                'seconds_since_update': time_since_update,
                'failsafe_active': self.failsafe_active.get(key, False),
                'timeout_percentage': timeout_percentage
            }

        return status_info