- status_monitor: add availability_managed set; _monitor_loop skips devices in this set so the LWT/availability topic is the sole online/offline source - device_manager: register device with status_monitor.set_availability_managed() so the monitor actually skips them (previously the monitor had no knowledge of DeviceManager.availability_managed) - mqtt_bridge: remove blanket 'reset all devices to offline' on bridge restart; this was causing a race condition where the cron reset state AFTER the bridge had already sent device_online events via retained MQTT messages; stale running session cleanup is kept (still needed)
340 lines
12 KiB
Python
340 lines
12 KiB
Python
"""
|
||
Device Status Monitor - Tracks device online/offline status based on MQTT activity.
|
||
|
||
Monitors last_seen timestamps and emits device_online/device_offline events to Odoo.
|
||
"""
|
||
|
||
import json
|
||
import threading
|
||
import time
|
||
import uuid
|
||
from collections.abc import Callable
|
||
from dataclasses import asdict, dataclass
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
logger = structlog.get_logger()
|
||
|
||
|
||
@dataclass
class DeviceStatus:
    """Record of what the monitor currently knows about one device.

    The field order below is the constructor's positional order, and the
    field names are the keys used when a record is rebuilt from persisted
    JSON via ``DeviceStatus(**entry)``.
    """

    # Identifier of the device this record belongs to.
    device_id: str
    # Unix timestamp of the most recent activity recorded for the device.
    last_seen: float
    # Whether the device is currently considered online.
    is_online: bool
    # Unix timestamp of the most recent online/offline transition.
    last_state_change: float
|
||
|
||
|
||
class DeviceStatusMonitor:
|
||
"""
|
||
Monitors device activity and tracks online/offline status.
|
||
|
||
- Tracks last_seen timestamp per device on every MQTT message
|
||
- Runs background thread that checks timeouts
|
||
- Emits device_online/device_offline events
|
||
- Persists status to /data/device_status.json for restart recovery
|
||
"""
|
||
|
||
def __init__(
    self,
    timeout_seconds: int = 30,
    check_interval_seconds: int = 5,
    persistence_path: str = "/data/device_status.json",
    event_callback: Callable | None = None,
):
    """
    Set up the monitor and restore any previously persisted device state.

    Args:
        timeout_seconds: Silence (in seconds) after which an online device
            is flagged offline by the background loop.
        check_interval_seconds: Pause between two timeout sweeps.
        persistence_path: JSON file used to persist device status.
        event_callback: Optional callable receiving one event dict per
            device_online / device_offline event.
    """
    # Synchronisation primitives and the (not yet started) worker thread.
    self.lock = threading.Lock()
    self.stop_flag = threading.Event()
    self.monitor_thread: threading.Thread | None = None

    # Configuration as supplied by the caller.
    self.event_callback = event_callback
    self.persistence_path = Path(persistence_path)
    self.check_interval_seconds = check_interval_seconds
    self.timeout_seconds = timeout_seconds

    # device_id -> DeviceStatus
    self.devices: dict[str, DeviceStatus] = {}

    # IDs whose online/offline state comes from the LWT/availability topic;
    # the timeout sweep leaves these alone.
    self.availability_managed: set[str] = set()

    # Restore state from disk (needs self.devices and self.persistence_path).
    self._load_status()

    logger.info(
        "device_status_monitor_initialized",
        timeout_s=timeout_seconds,
        check_interval_s=check_interval_seconds,
        persistence_path=str(self.persistence_path),
    )
|
||
|
||
def _load_status(self):
    """Load persisted device status from disk into ``self.devices``.

    Does nothing (beyond an info log) when the persistence file is absent.
    A file that cannot be read or parsed, or whose top level is not a JSON
    object, is logged as ``failed_to_load_device_status`` and ignored.

    Entries are restored one by one: an entry that cannot be turned into a
    ``DeviceStatus`` (wrong type, missing or unexpected keys) is logged and
    skipped instead of aborting the whole load, so one bad record no longer
    discards every record that follows it in the file.
    """
    if not self.persistence_path.exists():
        logger.info("no_persisted_status_found", path=str(self.persistence_path))
        return

    try:
        # json.dump writes ASCII-only output by default, so UTF-8 decoding
        # reads files produced by _save_status regardless of locale.
        with open(self.persistence_path, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        logger.error("failed_to_load_device_status", error=str(e))
        return

    if not isinstance(data, dict):
        logger.error(
            "failed_to_load_device_status",
            error=f"expected a JSON object, got {type(data).__name__}",
        )
        return

    for device_id, status_dict in data.items():
        try:
            self.devices[device_id] = DeviceStatus(**status_dict)
        except TypeError as e:
            # Raised for non-mapping entries and for missing/unknown fields.
            logger.warning(
                "skipping_invalid_device_status", device_id=device_id, error=str(e)
            )

    logger.info(
        "device_status_loaded",
        device_count=len(self.devices),
        path=str(self.persistence_path),
    )
|
||
|
||
def _save_status(self):
    """Serialize ``self.devices`` to ``self.persistence_path`` as JSON.

    The JSON is written to a sibling file with a ``.tmp`` suffix, which then
    replaces the real file. Any exception is logged as
    ``failed_to_save_device_status`` and swallowed.
    """
    try:
        target = self.persistence_path

        # The parent directory may not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)

        # One plain dict per device, keyed by device id.
        snapshot = {}
        for device_id, status in self.devices.items():
            snapshot[device_id] = asdict(status)

        # Write to a scratch file first, then swap it into place.
        scratch = target.with_suffix(".tmp")
        with open(scratch, "w") as handle:
            json.dump(snapshot, handle, indent=2)
        scratch.replace(target)

        logger.debug("device_status_saved", device_count=len(self.devices))
    except Exception as e:
        logger.error("failed_to_save_device_status", error=str(e))
|
||
|
||
def update_last_seen(self, device_id: str):
    """
    Record activity for a device and bring it online if necessary.

    Intended to be called for each MQTT message received from the device.
    An unknown device is registered as online; a known offline device is
    flipped to online. Both transitions emit a device_online event and
    persist the status table. For a device that is already online only
    ``last_seen`` is refreshed.

    Args:
        device_id: Device identifier
    """
    now = time.time()

    with self.lock:
        device = self.devices.get(device_id)

        if device is None:
            # First time we hear from this device: register it as online.
            self.devices[device_id] = DeviceStatus(
                device_id=device_id, last_seen=now, is_online=True, last_state_change=now
            )
            logger.info("device_registered", device_id=device_id)
            self._emit_device_online(device_id, now)
            self._save_status()
            return

        # Known device: always refresh the activity timestamp.
        device.last_seen = now
        if device.is_online:
            return

        # Offline -> online transition.
        device.is_online = True
        device.last_state_change = now
        logger.info("device_came_online", device_id=device_id)
        self._emit_device_online(device_id, now)
        self._save_status()
|
||
|
||
def set_availability_managed(self, device_id: str):
    """Flag a device as LWT/availability-managed.

    The id is added to ``self.availability_managed``; the timeout loop skips
    every device found in that set.

    Args:
        device_id: Device identifier
    """
    managed_ids = self.availability_managed
    managed_ids.add(device_id)
|
||
|
||
def mark_online_silent(self, device_id: str):
    """Set a device online and refresh last_seen, emitting no event.

    Per the original notes this is meant for availability messages, where
    the caller already enqueues the device_online event itself and a second
    emission from here would be a duplicate.

    An unknown device is registered as online. The status table is
    persisted on every call.

    Args:
        device_id: Device identifier
    """
    now = time.time()

    with self.lock:
        device = self.devices.get(device_id)

        if device is not None:
            device.last_seen = now
            # Only a real offline -> online flip moves last_state_change.
            if not device.is_online:
                device.is_online = True
                device.last_state_change = now
        else:
            self.devices[device_id] = DeviceStatus(
                device_id=device_id, last_seen=now, is_online=True, last_state_change=now
            )

        self._save_status()
|
||
|
||
def _emit_device_online(self, device_id: str, timestamp: float):
    """Build a device_online event and pass it to ``self.event_callback``.

    Does nothing when no callback is configured. An exception raised by the
    callback is logged and not propagated.

    Args:
        device_id: Device identifier
        timestamp: Unix timestamp used for both the event time and the
            payload's ``last_seen`` (rendered as UTC ISO-8601).
    """
    callback = self.event_callback
    if not callback:
        return

    # Event time and last_seen are the same instant for an online event.
    seen_at = datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()

    event = {
        "event_uid": str(uuid.uuid4()),
        "event_type": "device_online",
        "device_id": device_id,
        "timestamp": seen_at,
        "payload": {
            "last_seen": seen_at,
            "reason": "activity_detected",
        },
    }

    try:
        callback(event)
    except Exception as e:
        logger.error("failed_to_emit_device_online", device_id=device_id, error=str(e))
|
||
|
||
def _emit_device_offline(self, device_id: str, timestamp: float, reason: str = "timeout"):
    """Build a device_offline event and pass it to ``self.event_callback``.

    Does nothing when no callback is configured or when the device is not
    present in ``self.devices``. An exception raised by the callback is
    logged and not propagated.

    Args:
        device_id: Device identifier
        timestamp: Unix timestamp of the offline transition.
        reason: Free-form reason copied into the event payload.
    """
    callback = self.event_callback
    if not callback:
        return

    device = self.devices.get(device_id)
    if not device:
        return

    went_offline_at = datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()
    last_seen_at = datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat()
    # Whole seconds between the last recorded activity and the transition.
    silent_for = int(timestamp - device.last_seen)

    event = {
        "event_uid": str(uuid.uuid4()),
        "event_type": "device_offline",
        "device_id": device_id,
        "timestamp": went_offline_at,
        "payload": {
            "last_seen": last_seen_at,
            "reason": reason,
            "offline_duration_s": silent_for,
        },
    }

    try:
        callback(event)
    except Exception as e:
        logger.error("failed_to_emit_device_offline", device_id=device_id, error=str(e))
|
||
|
||
def _monitor_loop(self):
    """Thread body: periodically flag silent devices as offline.

    Runs until ``self.stop_flag`` is set. Each sweep holds ``self.lock`` and
    looks at every device that is online and not availability-managed; when
    its silence exceeds ``self.timeout_seconds`` the device is marked
    offline, a device_offline event is emitted and the status table is
    persisted. Any exception in a sweep is logged and followed by a one
    second pause before the loop continues.
    """
    logger.info("device_status_monitor_started")

    while not self.stop_flag.is_set():
        try:
            now = time.time()

            with self.lock:
                for device_id, device in self.devices.items():
                    # LWT/availability-managed devices are not ours to time out.
                    if device_id in self.availability_managed:
                        continue
                    # Already offline: nothing to detect.
                    if not device.is_online:
                        continue

                    elapsed = now - device.last_seen
                    if elapsed > self.timeout_seconds:
                        device.is_online = False
                        device.last_state_change = now
                        logger.warning(
                            "device_went_offline",
                            device_id=device_id,
                            elapsed_s=int(elapsed),
                            timeout_s=self.timeout_seconds,
                        )
                        self._emit_device_offline(device_id, now, reason="timeout")
                        self._save_status()

            # Event.wait doubles as a sleep that stop() can cut short.
            self.stop_flag.wait(self.check_interval_seconds)

        except Exception as e:
            logger.error("monitor_loop_error", error=str(e))
            self.stop_flag.wait(1)

    logger.info("device_status_monitor_stopped")
|
||
|
||
def start(self):
    """Launch the timeout-checking loop in a daemon thread.

    If a monitor thread already exists and is alive, a warning is logged
    and no second thread is created.
    """
    current = self.monitor_thread
    if current and current.is_alive():
        logger.warning("monitor_already_running")
        return

    # Reset the flag so a monitor that was stopped earlier can run again.
    self.stop_flag.clear()
    worker = threading.Thread(target=self._monitor_loop, daemon=True)
    self.monitor_thread = worker
    worker.start()
    logger.info("device_status_monitor_thread_started")
|
||
|
||
def stop(self):
    """Stop the background monitoring thread and persist the final state.

    Returns immediately when no monitor thread was ever created. Otherwise
    the stop flag is set, the thread is joined for at most
    ``check_interval_seconds + 5`` seconds, and the status table is saved.
    """
    if not self.monitor_thread:
        return

    logger.info("stopping_device_status_monitor")
    self.stop_flag.set()
    self.monitor_thread.join(timeout=self.check_interval_seconds + 5)

    # Save final state while holding the lock: _save_status iterates
    # self.devices, and other threads (e.g. update_last_seen) may still be
    # inserting devices. Without the lock that iteration can raise, the
    # error is swallowed inside _save_status and the final state is lost.
    with self.lock:
        self._save_status()

    logger.info("device_status_monitor_stopped")
|
||
|
||
def get_status(self, device_id: str) -> dict[str, Any] | None:
|
||
"""
|
||
Get status for a specific device.
|
||
|
||
Args:
|
||
device_id: Device identifier to query.
|
||
|
||
Returns:
|
||
Dict with device status or None if not found
|
||
"""
|
||
with self.lock:
|
||
device = self.devices.get(device_id)
|
||
if not device:
|
||
return None
|
||
|
||
return {
|
||
"device_id": device.device_id,
|
||
"is_online": device.is_online,
|
||
"last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
|
||
"last_state_change": datetime.fromtimestamp(
|
||
device.last_state_change, tz=timezone.utc
|
||
).isoformat(),
|
||
"seconds_since_seen": int(time.time() - device.last_seen),
|
||
}
|
||
|
||
def get_all_status(self) -> dict[str, dict[str, Any]]:
    """Return a status snapshot for every tracked device.

    Returns:
        Mapping of device id to a dict with ``device_id``, ``is_online``,
        ``last_seen`` and ``last_state_change`` (UTC ISO-8601 strings) and
        ``seconds_since_seen`` (whole seconds, measured against a single
        clock reading taken for the whole snapshot).
    """
    with self.lock:
        now = time.time()
        snapshot = {}

        for device_id, device in self.devices.items():
            last_seen_iso = datetime.fromtimestamp(
                device.last_seen, tz=timezone.utc
            ).isoformat()
            state_change_iso = datetime.fromtimestamp(
                device.last_state_change, tz=timezone.utc
            ).isoformat()

            snapshot[device_id] = {
                "device_id": device.device_id,
                "is_online": device.is_online,
                "last_seen": last_seen_iso,
                "last_state_change": state_change_iso,
                "seconds_since_seen": int(now - device.last_seen),
            }

        return snapshot
|