2025-10-27 07:32:01 +00:00
|
|
|
"""
Database Watchdog for Calejo Control Adapter.

Monitors database updates and triggers failsafe mode when updates stop,
preventing stale optimization plans from controlling pumps indefinitely.
"""
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
import structlog
|
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
from typing import Dict, Optional, Any
|
|
|
|
|
|
2025-10-27 13:11:17 +00:00
|
|
|
from src.database.flexible_client import FlexibleDatabaseClient
|
2025-10-27 07:32:01 +00:00
|
|
|
|
|
|
|
|
# Module-level structlog logger used by every DatabaseWatchdog method below.
logger = structlog.get_logger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DatabaseWatchdog:
    """
    Monitors database updates and triggers failsafe mode when updates stop.

    Safety Feature: If optimization system stops updating plans for more than
    ``timeout_seconds`` (20 minutes by default), automatically revert to
    default safe setpoints to prevent pumps from running on stale
    optimization plans.

    State is tracked per pump, keyed by ``(station_id, pump_id)``:

    * ``last_update_times`` - timestamp of the newest plan seen for the pump.
    * ``failsafe_active``   - whether failsafe is currently flagged for it.
    """

    def __init__(
        self,
        db_client: "FlexibleDatabaseClient",
        alert_manager: Any,
        timeout_seconds: int = 1200,  # 20 minutes default
    ):
        """
        Create a watchdog; no monitoring happens until ``start()`` is awaited.

        Args:
            db_client: Client used to read pump plans / pump config and to
                write failsafe events.
            alert_manager: Kept on the instance for callers and future use;
                this class does not invoke it.
            timeout_seconds: Maximum plan age before failsafe is triggered.
        """
        self.db_client = db_client
        # Previously accepted but silently dropped; keep it reachable.
        self.alert_manager = alert_manager
        self.timeout_seconds = timeout_seconds
        self.last_update_times: Dict[tuple, datetime] = {}  # (station_id, pump_id) -> last_update
        self.failsafe_active: Dict[tuple, bool] = {}
        self.running = False
        self.check_interval_seconds = 60  # Check every minute
        # Handle of the background loop. asyncio only keeps weak references to
        # tasks, so the handle must be held here to keep the loop alive and to
        # let stop() cancel it.
        self._monitor_task: Optional[asyncio.Task] = None

    @staticmethod
    def _seconds_since(timestamp: datetime, now: datetime) -> float:
        """
        Return the seconds elapsed between ``timestamp`` and ``now``.

        ``now`` is converted into the timestamp's timezone when the timestamp
        is timezone-aware, so aware database values can be compared with a
        naive local ``datetime.now()`` without raising ``TypeError``.
        Two naive values are subtracted as-is.
        """
        if timestamp.utcoffset() is not None:
            # astimezone() treats a naive ``now`` as system-local time.
            now = now.astimezone(timestamp.tzinfo)
        return (now - timestamp).total_seconds()

    async def start(self):
        """
        Start the watchdog monitoring.

        Runs one check immediately, then schedules the periodic loop.
        Calling it again while the loop is still running is a no-op, so a
        second background loop is never spawned.
        """
        if self._monitor_task is not None and not self._monitor_task.done():
            return

        self.running = True
        logger.info("database_watchdog_started", timeout_seconds=self.timeout_seconds)

        # Initial check
        await self._check_updates()

        # Start periodic monitoring (keep the handle, see __init__)
        self._monitor_task = asyncio.create_task(self._monitor_loop())

    async def stop(self):
        """
        Stop the watchdog monitoring.

        Cancels the background loop and waits for it to finish instead of
        leaving it asleep for up to ``check_interval_seconds``.
        """
        self.running = False

        task, self._monitor_task = self._monitor_task, None
        if task is not None and not task.done() and task is not asyncio.current_task():
            task.cancel()
            # return_exceptions=True absorbs the loop's CancelledError while
            # still letting a cancellation of stop() itself propagate.
            await asyncio.gather(task, return_exceptions=True)

        logger.info("database_watchdog_stopped")

    async def _monitor_loop(self):
        """Main monitoring loop: sleep, check, repeat until ``running`` is cleared."""
        while self.running:
            try:
                await asyncio.sleep(self.check_interval_seconds)
                await self._check_updates()
            except Exception as e:
                # Keep the loop alive on unexpected errors.
                logger.error("watchdog_monitor_loop_error", error=str(e))

    async def _check_updates(self):
        """
        Check for recent updates and trigger failsafe if needed.

        Pass 1 records the newest timestamp of every plan returned by the
        database and clears failsafe for pumps whose plan is fresh again.
        Pass 2 walks every known pump and activates failsafe for those whose
        last update is older than ``timeout_seconds``.

        NOTE(review): naive timestamps are compared against local
        ``datetime.now()`` - confirm the database does not return naive UTC.
        """
        try:
            # Get latest pump plans to check for recent updates
            latest_plans = self.db_client.get_latest_pump_plans()

            current_time = datetime.now()

            for plan in latest_plans:
                key = (plan['station_id'], plan['pump_id'])
                plan_updated_at = plan.get('plan_updated_at') or plan.get('plan_created_at')

                if plan_updated_at:
                    # Update last known update time
                    self.last_update_times[key] = plan_updated_at

                    # Only a *fresh* plan may clear failsafe. Clearing it for
                    # any timestamp made a stale pump deactivate here and
                    # re-activate in the stale pass below on every check.
                    is_fresh = (
                        self._seconds_since(plan_updated_at, current_time)
                        <= self.timeout_seconds
                    )
                    if is_fresh and self.failsafe_active.get(key, False):
                        # Recent update detected - deactivate failsafe
                        await self._deactivate_failsafe(plan['station_id'], plan['pump_id'])
                else:
                    # No update time available - treat as no recent update
                    self.last_update_times[key] = current_time - timedelta(seconds=self.timeout_seconds + 1)

            # Check for stale updates
            for key, last_update in self.last_update_times.items():
                station_id, pump_id = key
                time_since_update = self._seconds_since(last_update, current_time)

                if time_since_update > self.timeout_seconds and not self.failsafe_active.get(key, False):
                    # Trigger failsafe mode
                    await self._activate_failsafe(station_id, pump_id, time_since_update)

                # Log status for monitoring
                if time_since_update > self.timeout_seconds * 0.8:  # 80% of timeout
                    logger.warning(
                        "watchdog_update_stale",
                        station_id=station_id,
                        pump_id=pump_id,
                        seconds_since_update=time_since_update,
                        timeout_seconds=self.timeout_seconds
                    )

        except Exception as e:
            logger.error("watchdog_check_updates_failed", error=str(e))

    async def _activate_failsafe(self, station_id: str, pump_id: str, time_since_update: float):
        """
        Activate failsafe mode for a pump.

        Flags the pump first, then looks up its configuration to log and
        record the default setpoint. The flag stays set even when no pump
        configuration is found. Errors are logged, never raised.
        """
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = True

            # Get default setpoint from pump configuration
            pump_config = self.db_client.get_pump(station_id, pump_id)
            if pump_config:
                default_setpoint = pump_config.get('default_setpoint_hz', 30.0)

                # Log failsafe activation
                logger.critical(
                    "failsafe_mode_activated",
                    station_id=station_id,
                    pump_id=pump_id,
                    time_since_update_seconds=time_since_update,
                    default_setpoint_hz=default_setpoint
                )

                # Record failsafe event in database
                self._record_failsafe_event(station_id, pump_id, default_setpoint)

                # TODO: In Phase 3, this will trigger the SetpointManager to use default setpoints
                # For now, we just log the event

            else:
                logger.error(
                    "failsafe_activation_failed_no_pump_config",
                    station_id=station_id,
                    pump_id=pump_id
                )

        except Exception as e:
            logger.error(
                "failsafe_activation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    async def _deactivate_failsafe(self, station_id: str, pump_id: str):
        """
        Deactivate failsafe mode for a pump.

        Clears the flag, logs the change and records it in the database.
        Errors are logged, never raised.
        """
        try:
            key = (station_id, pump_id)
            self.failsafe_active[key] = False

            logger.info(
                "failsafe_mode_deactivated",
                station_id=station_id,
                pump_id=pump_id
            )

            # Record failsafe deactivation in database
            self._record_failsafe_deactivation(station_id, pump_id)

        except Exception as e:
            logger.error(
                "failsafe_deactivation_failed",
                station_id=station_id,
                pump_id=pump_id,
                error=str(e)
            )

    def _record_failsafe_event(self, station_id: str, pump_id: str, default_setpoint: float):
        """
        Record failsafe activation in database.

        Best effort: a failed insert is logged and otherwise ignored.
        NOTE(review): this insert sets no ``event_type`` while the
        deactivation insert sets no ``default_setpoint`` - confirm the
        ``failsafe_events`` schema allows both.
        """
        try:
            query = """
                INSERT INTO failsafe_events
                (station_id, pump_id, default_setpoint, timestamp)
                VALUES (:station_id, :pump_id, :default_setpoint, :timestamp)
            """
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'default_setpoint': default_setpoint,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_event", error=str(e))

    def _record_failsafe_deactivation(self, station_id: str, pump_id: str):
        """
        Record failsafe deactivation in database.

        Best effort: a failed insert is logged and otherwise ignored.
        """
        try:
            query = """
                INSERT INTO failsafe_events
                (station_id, pump_id, event_type, timestamp)
                VALUES (:station_id, :pump_id, 'DEACTIVATED', :timestamp)
            """
            self.db_client.execute(query, {
                'station_id': station_id,
                'pump_id': pump_id,
                'timestamp': datetime.now()
            })
        except Exception as e:
            logger.error("failed_to_record_failsafe_deactivation", error=str(e))

    def is_failsafe_active(self, station_id: str, pump_id: str) -> bool:
        """Check if failsafe mode is active for a pump (False for unknown pumps)."""
        key = (station_id, pump_id)
        return self.failsafe_active.get(key, False)

    def get_last_update_time(self, station_id: str, pump_id: str) -> Optional[datetime]:
        """Get the last known update time for a pump, or None if never seen."""
        key = (station_id, pump_id)
        return self.last_update_times.get(key)

    async def activate_failsafe_mode(self, station_id: str, pump_id: str, reason: str):
        """
        Manually activate failsafe mode for testing purposes.

        This method is intended for testing scenarios where failsafe mode
        needs to be triggered manually, rather than waiting for automatic
        detection of stale data. The next periodic check clears the flag
        again if the pump's plan is still fresh.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation",
            station_id=station_id,
            pump_id=pump_id,
            reason=reason
        )
        # Report an age just past the timeout, as automatic detection would
        await self._activate_failsafe(station_id, pump_id, self.timeout_seconds + 1)

    async def activate_failsafe_mode_station(self, station_id: str, reason: str):
        """
        Manually activate failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be triggered manually.

        Args:
            station_id: Station identifier
            reason: Reason for manual activation (for logging)
        """
        logger.info(
            "manual_failsafe_activation_station",
            station_id=station_id,
            reason=reason
        )
        # Get all pumps in the station
        pumps = self.db_client.get_pumps(station_id)
        for pump in pumps:
            await self.activate_failsafe_mode(station_id, pump['pump_id'], reason)

    async def clear_failsafe_mode(self, station_id: str, pump_id: str):
        """
        Manually clear failsafe mode for a pump.

        This method is intended for testing scenarios where failsafe mode
        needs to be cleared manually.

        Args:
            station_id: Station identifier
            pump_id: Pump identifier
        """
        logger.info(
            "manual_failsafe_clear",
            station_id=station_id,
            pump_id=pump_id
        )
        await self._deactivate_failsafe(station_id, pump_id)

    async def clear_failsafe_mode_station(self, station_id: str):
        """
        Manually clear failsafe mode for all pumps in a station.

        This method is intended for testing scenarios where station-wide
        failsafe mode needs to be cleared manually.

        Args:
            station_id: Station identifier
        """
        logger.info(
            "manual_failsafe_clear_station",
            station_id=station_id
        )
        # Get all pumps in the station
        pumps = self.db_client.get_pumps(station_id)
        for pump in pumps:
            await self.clear_failsafe_mode(station_id, pump['pump_id'])

    def get_status(self) -> Dict[str, Any]:
        """
        Get watchdog status information.

        Returns:
            A dict with the watchdog settings, the number of monitored pumps,
            the number of pumps in failsafe, and a ``pump_status`` mapping
            keyed by ``"<station_id>_<pump_id>"``.
        """
        current_time = datetime.now()

        status_info = {
            'running': self.running,
            'timeout_seconds': self.timeout_seconds,
            'check_interval_seconds': self.check_interval_seconds,
            'monitored_pumps': len(self.last_update_times),
            'failsafe_active_pumps': sum(self.failsafe_active.values()),
            'pump_status': {}
        }

        for key, last_update in self.last_update_times.items():
            station_id, pump_id = key
            time_since_update = self._seconds_since(last_update, current_time)

            # A non-positive timeout means every pump is already past it;
            # report 100% rather than dividing by zero.
            if self.timeout_seconds > 0:
                timeout_percentage = min(100, (time_since_update / self.timeout_seconds) * 100)
            else:
                timeout_percentage = 100

            status_info['pump_status'][f"{station_id}_{pump_id}"] = {
                'last_update': last_update.isoformat(),
                'seconds_since_update': time_since_update,
                'failsafe_active': self.failsafe_active.get(key, False),
                'timeout_percentage': timeout_percentage
            }

        return status_info
|