# Source: CalejoControl/src/monitoring/watchdog.py
# (repository web-view metadata: 317 lines, 12 KiB, Python)

"""
Database Watchdog for Calejo Control Adapter.
Monitors database updates and triggers failsafe mode when updates stop,
preventing stale optimization plans from controlling pumps indefinitely.
"""
import asyncio
import structlog
from datetime import datetime, timedelta
from typing import Dict, Optional, Any
from src.database.flexible_client import FlexibleDatabaseClient
# Module-level structlog logger used by DatabaseWatchdog for its structured event logs.
logger = structlog.get_logger()
class DatabaseWatchdog:
    """
    Monitors database updates and triggers failsafe mode when updates stop.

    Safety Feature: If the optimization system stops updating plans for more
    than ``timeout_seconds`` (20 minutes by default), the affected pump is
    flagged as being in failsafe mode so that it does not keep running on a
    stale optimization plan.

    Activation currently flags the pump, logs a critical event and records a
    row in ``failsafe_events``; applying the default setpoint itself is still
    a TODO (see ``_activate_failsafe``).

    Attributes:
        db_client: Database client used to read plans/pumps and record events.
        alert_manager: Alerting collaborator handed in by the caller (stored,
            not yet used by this class).
        timeout_seconds: Maximum plan age before failsafe is triggered.
        last_update_times: ``(station_id, pump_id)`` -> last plan timestamp.
        failsafe_active: ``(station_id, pump_id)`` -> failsafe flag.
        running: True while the periodic monitor loop should keep going.
        check_interval_seconds: Delay between periodic checks.
    """

    def __init__(self, db_client: "FlexibleDatabaseClient", alert_manager: Any, timeout_seconds: int = 1200):  # 20 minutes default
        """
        Create a watchdog; monitoring does not begin until ``start()`` is awaited.

        Args:
            db_client: Database client providing ``get_latest_pump_plans``,
                ``get_pump``, ``get_pumps`` and ``execute``.
            alert_manager: Alerting collaborator; kept on the instance.
            timeout_seconds: Plan age (seconds) after which failsafe triggers.
        """
        self.db_client = db_client
        # The argument used to be accepted and silently dropped; keep it so
        # alerting hooks can reach it.
        self.alert_manager = alert_manager
        self.timeout_seconds = timeout_seconds
        self.last_update_times: Dict[tuple, datetime] = {}  # (station_id, pump_id) -> last_update
        self.failsafe_active: Dict[tuple, bool] = {}
        self.running = False
        self.check_interval_seconds = 60  # Check every minute
        # Strong reference to the background task: the event loop itself only
        # holds weak references, and stop() needs the handle to cancel it.
        self._monitor_task: Optional[asyncio.Task] = None

    @staticmethod
    def _seconds_since(moment: datetime) -> float:
        """
        Return the age of *moment* in seconds, for naive or aware datetimes.

        Aware timestamps are compared against an aware "now" in the same
        timezone; naive timestamps are compared against naive local time.
        NOTE(review): naive database timestamps are assumed to be local time,
        as in the original comparison against ``datetime.now()`` - confirm.
        """
        if moment.utcoffset() is None:
            now = datetime.now()
        else:
            now = datetime.now(moment.tzinfo)
        return (now - moment).total_seconds()

    async def start(self):
        """Start the watchdog: run one immediate check, then monitor periodically."""
        if self._monitor_task is not None and not self._monitor_task.done():
            # A second start() must not spawn a second monitor loop.
            logger.warning("database_watchdog_already_running")
            return

        self.running = True
        logger.info("database_watchdog_started", timeout_seconds=self.timeout_seconds)

        # Initial check
        await self._check_updates()

        # Start periodic monitoring (keep the handle, see __init__)
        self._monitor_task = asyncio.create_task(self._monitor_loop())

    async def stop(self):
        """Stop the watchdog and wait for the monitor loop to finish."""
        self.running = False

        task, self._monitor_task = self._monitor_task, None
        if task is not None and not task.done():
            # Without cancelling, the loop would linger in asyncio.sleep for up
            # to check_interval_seconds after stop() returned.
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        logger.info("database_watchdog_stopped")

    async def _monitor_loop(self):
        """Main monitoring loop: sleep for the check interval, then check updates."""
        while self.running:
            try:
                await asyncio.sleep(self.check_interval_seconds)
                await self._check_updates()
            except asyncio.CancelledError:
                # Let stop() cancel the loop promptly.
                raise
            except Exception as e:
                logger.error("watchdog_monitor_loop_error", error=str(e))

    async def _check_updates(self):
        """
        Check for recent updates and trigger failsafe if needed.

        Refreshes ``last_update_times`` from the latest pump plans, clears
        failsafe for pumps whose plan is fresh again, and activates failsafe
        for every tracked pump whose last update is older than the timeout.
        Errors are logged, never raised.
        """
        try:
            # A failing query must not skip the staleness evaluation below: a
            # database outage is precisely the case this watchdog guards against.
            try:
                latest_plans = self.db_client.get_latest_pump_plans() or []
            except Exception as e:
                logger.error("watchdog_check_updates_failed", error=str(e))
                latest_plans = []

            for plan in latest_plans:
                key = (plan['station_id'], plan['pump_id'])
                plan_updated_at = plan.get('plan_updated_at') or plan.get('plan_created_at')

                if plan_updated_at:
                    # Update last known update time
                    self.last_update_times[key] = plan_updated_at

                    # Only a *fresh* plan may clear failsafe. Clearing on any
                    # timestamp made a stale pump flap off and on every cycle.
                    is_fresh = self._seconds_since(plan_updated_at) <= self.timeout_seconds
                    if is_fresh and self.failsafe_active.get(key, False):
                        await self._deactivate_failsafe(plan['station_id'], plan['pump_id'])
                else:
                    # No update time available - treat as no recent update
                    self.last_update_times[key] = datetime.now() - timedelta(seconds=self.timeout_seconds + 1)

            # Check for stale updates
            for key, last_update in self.last_update_times.items():
                station_id, pump_id = key
                time_since_update = self._seconds_since(last_update)

                if time_since_update > self.timeout_seconds and not self.failsafe_active.get(key, False):
                    # Trigger failsafe mode
                    await self._activate_failsafe(station_id, pump_id, time_since_update)

                # Log status for monitoring
                if time_since_update > self.timeout_seconds * 0.8:  # 80% of timeout
                    logger.warning(
                        "watchdog_update_stale",
                        station_id=station_id,
                        pump_id=pump_id,
                        seconds_since_update=time_since_update,
                        timeout_seconds=self.timeout_seconds
                    )
        except Exception as e:
            logger.error("watchdog_check_updates_failed", error=str(e))

    async def _activate_failsafe(self, station_id: str, pump_id: str, time_since_update: float):
        """
        Activate failsafe mode for a pump.

        The flag is set before the pump configuration is looked up, so the pump
        stays flagged even if the lookup or the event recording fails.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
            time_since_update: Seconds since the last plan update (for logging)
        """
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = True

            # Get default setpoint from pump configuration
            pump_config = self.db_client.get_pump(station_id, pump_id)
            if pump_config:
                # Falls back to 30.0 Hz when the configuration has no such key.
                default_setpoint = pump_config.get('default_setpoint_hz', 30.0)

                # Log failsafe activation
                logger.critical(
                    "failsafe_mode_activated",
                    station_id=station_id,
                    pump_id=pump_id,
                    time_since_update_seconds=time_since_update,
                    default_setpoint_hz=default_setpoint
                )

                # Record failsafe event in database
                self._record_failsafe_event(station_id, pump_id, default_setpoint)

                # TODO: In Phase 3, this will trigger the SetpointManager to use default setpoints
                # For now, we just log the event
            else:
                logger.error(
                    "failsafe_activation_failed_no_pump_config",
                    station_id=station_id,
                    pump_id=pump_id
                )
        except Exception as e:
            logger.error(
                "failsafe_activation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    async def _deactivate_failsafe(self, station_id: str, pump_id: str):
        """
        Deactivate failsafe mode for a pump and record the deactivation.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
        """
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = False

            logger.info(
                "failsafe_mode_deactivated",
                station_id=station_id,
                pump_id=pump_id
            )

            # Record failsafe deactivation in database
            self._record_failsafe_deactivation(station_id, pump_id)
        except Exception as e:
            logger.error(
                "failsafe_deactivation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    def _record_failsafe_event(self, station_id: str, pump_id: str, default_setpoint: float):
        """
        Record failsafe activation in database (best effort; failures are logged).

        NOTE(review): this insert does not set ``event_type`` although the
        deactivation insert does - presumably the column has a default for
        activations; confirm against the ``failsafe_events`` schema.
        """
        try:
            query = """
                INSERT INTO failsafe_events
                (station_id, pump_id, default_setpoint, timestamp)
                VALUES (:station_id, :pump_id, :default_setpoint, :timestamp)
            """
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'default_setpoint': default_setpoint,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_event", error=str(e))

    def _record_failsafe_deactivation(self, station_id: str, pump_id: str):
        """Record failsafe deactivation in database (best effort; failures are logged)."""
        try:
            query = """
                INSERT INTO failsafe_events
                (station_id, pump_id, event_type, timestamp)
                VALUES (:station_id, :pump_id, 'DEACTIVATED', :timestamp)
            """
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_deactivation", error=str(e))

    def is_failsafe_active(self, station_id: str, pump_id: str) -> bool:
        """Check if failsafe mode is active for a pump (False for unknown pumps)."""
        key = (station_id, pump_id)
        return self.failsafe_active.get(key, False)

    def get_last_update_time(self, station_id: str, pump_id: str) -> Optional[datetime]:
        """Get the last known update time for a pump, or None if it is not tracked."""
        key = (station_id, pump_id)
        return self.last_update_times.get(key)

    async def activate_failsafe_mode(self, station_id: str, pump_id: str, reason: str):
        """
        Manually activate failsafe mode for testing purposes.

        This method is intended for testing scenarios where failsafe mode
        needs to be triggered manually, rather than waiting for automatic
        detection of stale data.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation",
            station_id=station_id,
            pump_id=pump_id,
            reason=reason
        )
        # Report an age just past the timeout, as an automatic trigger would
        await self._activate_failsafe(station_id, pump_id, self.timeout_seconds + 1)

    async def activate_failsafe_mode_station(self, station_id: str, reason: str):
        """
        Manually activate failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be triggered manually.

        Args:
            station_id: Station identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation_station",
            station_id=station_id,
            reason=reason
        )
        # Get all pumps in the station (tolerate a client that returns None)
        pumps = self.db_client.get_pumps(station_id) or []
        for pump in pumps:
            await self.activate_failsafe_mode(station_id, pump['pump_id'], reason)

    async def clear_failsafe_mode(self, station_id: str, pump_id: str):
        """
        Manually clear failsafe mode for a pump.

        This method is intended for testing scenarios where failsafe mode
        needs to be cleared manually.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
        """
        logger.info(
            "manual_failsafe_clear",
            station_id=station_id,
            pump_id=pump_id
        )
        await self._deactivate_failsafe(station_id, pump_id)

    async def clear_failsafe_mode_station(self, station_id: str):
        """
        Manually clear failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be cleared manually.

        Args:
            station_id: Station identifier
        """
        logger.info(
            "manual_failsafe_clear_station",
            station_id=station_id
        )
        # Get all pumps in the station (tolerate a client that returns None)
        pumps = self.db_client.get_pumps(station_id) or []
        for pump in pumps:
            await self.clear_failsafe_mode(station_id, pump['pump_id'])

    def get_status(self) -> Dict[str, Any]:
        """
        Get watchdog status information.

        Returns:
            A dict with the overall settings and counters plus a
            ``pump_status`` entry per tracked pump, keyed
            ``"<station_id>_<pump_id>"``.
        """
        status_info = {
            'running': self.running,
            'timeout_seconds': self.timeout_seconds,
            'check_interval_seconds': self.check_interval_seconds,
            'monitored_pumps': len(self.last_update_times),
            'failsafe_active_pumps': sum(self.failsafe_active.values()),
            'pump_status': {}
        }

        for key, last_update in self.last_update_times.items():
            station_id, pump_id = key
            time_since_update = self._seconds_since(last_update)

            if self.timeout_seconds > 0:
                timeout_percentage = min(100, (time_since_update / self.timeout_seconds) * 100)
            else:
                # A non-positive timeout means every pump is already past it.
                timeout_percentage = 100

            status_info['pump_status'][f"{station_id}_{pump_id}"] = {
                'last_update': last_update.isoformat(),
                'seconds_since_update': time_since_update,
                'failsafe_active': self.failsafe_active.get(key, False),
                'timeout_percentage': timeout_percentage
            }

        return status_info