odoo_mqtt/iot_bridge/utils/status_monitor.py
matthias.lotz 0bce1e1bed fix(device-availability): fix timeout monitor and bridge-restart race conditions
- status_monitor: add availability_managed set; _monitor_loop skips devices
  in this set so the LWT/availability topic is the sole online/offline source
- device_manager: register device with status_monitor.set_availability_managed()
  so the monitor actually skips them (previously the monitor had no knowledge
  of DeviceManager.availability_managed)
- mqtt_bridge: remove blanket 'reset all devices to offline' on bridge restart;
  this was causing a race condition where the cron reset state AFTER the bridge
  had already sent device_online events via retained MQTT messages;
  stale running session cleanup is kept (still needed)
2026-03-19 19:11:03 +01:00

340 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Device Status Monitor - Tracks device online/offline status based on MQTT activity.
Monitors last_seen timestamps and emits device_online/device_offline events to Odoo.
"""
import json
import threading
import time
import uuid
from collections.abc import Callable
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import structlog
logger = structlog.get_logger()
@dataclass
class DeviceStatus:
    """Snapshot of one device's liveness as tracked by the status monitor.

    All time values are Unix timestamps (seconds since the epoch, as floats).
    """

    # Identifier of the device this record belongs to.
    device_id: str
    # Unix timestamp of the most recent activity seen from the device.
    last_seen: float
    # Whether the device is currently considered online.
    is_online: bool
    # Unix timestamp of the last online/offline transition.
    last_state_change: float
class DeviceStatusMonitor:
"""
Monitors device activity and tracks online/offline status.
- Tracks last_seen timestamp per device on every MQTT message
- Runs background thread that checks timeouts
- Emits device_online/device_offline events
- Persists status to /data/device_status.json for restart recovery
"""
def __init__(
self,
timeout_seconds: int = 30,
check_interval_seconds: int = 5,
persistence_path: str = "/data/device_status.json",
event_callback: Callable | None = None,
):
"""
Initialize Device Status Monitor.
Args:
timeout_seconds: Seconds without message before device is marked offline
check_interval_seconds: How often to check for timeouts
persistence_path: Path to persist device status
event_callback: Callback(event_dict) for device_online/offline events
"""
self.timeout_seconds = timeout_seconds
self.check_interval_seconds = check_interval_seconds
self.persistence_path = Path(persistence_path)
self.event_callback = event_callback
# Device status tracking: device_id -> DeviceStatus
self.devices: dict[str, DeviceStatus] = {}
self.lock = threading.Lock()
# Devices managed by LWT/availability topic - timeout monitor skips them
self.availability_managed: set[str] = set()
# Background thread
self.monitor_thread: threading.Thread | None = None
self.stop_flag = threading.Event()
# Load persisted status
self._load_status()
logger.info(
"device_status_monitor_initialized",
timeout_s=timeout_seconds,
check_interval_s=check_interval_seconds,
persistence_path=str(self.persistence_path),
)
def _load_status(self):
"""Load persisted device status from disk."""
if not self.persistence_path.exists():
logger.info("no_persisted_status_found", path=str(self.persistence_path))
return
try:
with open(self.persistence_path) as f:
data = json.load(f)
for device_id, status_dict in data.items():
self.devices[device_id] = DeviceStatus(**status_dict)
logger.info(
"device_status_loaded",
device_count=len(self.devices),
path=str(self.persistence_path),
)
except Exception as e:
logger.error("failed_to_load_device_status", error=str(e))
def _save_status(self):
"""Persist device status to disk."""
try:
# Ensure directory exists
self.persistence_path.parent.mkdir(parents=True, exist_ok=True)
# Convert to dict
data = {device_id: asdict(status) for device_id, status in self.devices.items()}
# Write atomically (write to temp, then rename)
temp_path = self.persistence_path.with_suffix(".tmp")
with open(temp_path, "w") as f:
json.dump(data, f, indent=2)
temp_path.replace(self.persistence_path)
logger.debug("device_status_saved", device_count=len(self.devices))
except Exception as e:
logger.error("failed_to_save_device_status", error=str(e))
def update_last_seen(self, device_id: str):
"""
Update last_seen timestamp for a device.
Called on every MQTT message. If device was offline, emit device_online event.
Args:
device_id: Device identifier
"""
now = time.time()
with self.lock:
if device_id not in self.devices:
# New device - mark as online
self.devices[device_id] = DeviceStatus(
device_id=device_id, last_seen=now, is_online=True, last_state_change=now
)
logger.info("device_registered", device_id=device_id)
self._emit_device_online(device_id, now)
self._save_status()
else:
# Existing device - update last_seen
device = self.devices[device_id]
device.last_seen = now
# If was offline, mark online and emit event
if not device.is_online:
device.is_online = True
device.last_state_change = now
logger.info("device_came_online", device_id=device_id)
self._emit_device_online(device_id, now)
self._save_status()
def set_availability_managed(self, device_id: str):
"""Register a device as LWT-managed so the timeout monitor skips it."""
self.availability_managed.add(device_id)
def mark_online_silent(self, device_id: str):
"""Mark device as online and update last_seen WITHOUT emitting an event.
Used by route_message when the availability message itself already
enqueues the device_online event avoids double emission.
Args:
device_id: Device identifier
"""
now = time.time()
with self.lock:
if device_id not in self.devices:
self.devices[device_id] = DeviceStatus(
device_id=device_id, last_seen=now, is_online=True, last_state_change=now
)
else:
device = self.devices[device_id]
device.last_seen = now
if not device.is_online:
device.is_online = True
device.last_state_change = now
self._save_status()
def _emit_device_online(self, device_id: str, timestamp: float):
"""Emit device_online event to Odoo."""
if not self.event_callback:
return
event = {
"event_uid": str(uuid.uuid4()),
"event_type": "device_online",
"device_id": device_id,
"timestamp": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
"payload": {
"last_seen": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
"reason": "activity_detected",
},
}
try:
self.event_callback(event)
except Exception as e:
logger.error("failed_to_emit_device_online", device_id=device_id, error=str(e))
def _emit_device_offline(self, device_id: str, timestamp: float, reason: str = "timeout"):
"""Emit device_offline event to Odoo."""
if not self.event_callback:
return
device = self.devices.get(device_id)
if not device:
return
event = {
"event_uid": str(uuid.uuid4()),
"event_type": "device_offline",
"device_id": device_id,
"timestamp": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
"payload": {
"last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
"reason": reason,
"offline_duration_s": int(timestamp - device.last_seen),
},
}
try:
self.event_callback(event)
except Exception as e:
logger.error("failed_to_emit_device_offline", device_id=device_id, error=str(e))
def _monitor_loop(self):
"""Background thread that checks for device timeouts."""
logger.info("device_status_monitor_started")
while not self.stop_flag.is_set():
try:
now = time.time()
with self.lock:
for device_id, device in self.devices.items():
# Skip devices whose online/offline is managed by LWT
if device_id in self.availability_managed:
continue
# Check if device timed out
if device.is_online:
elapsed = now - device.last_seen
if elapsed > self.timeout_seconds:
# Mark offline
device.is_online = False
device.last_state_change = now
logger.warning(
"device_went_offline",
device_id=device_id,
elapsed_s=int(elapsed),
timeout_s=self.timeout_seconds,
)
self._emit_device_offline(device_id, now, reason="timeout")
self._save_status()
# Sleep with interruptible wait
self.stop_flag.wait(self.check_interval_seconds)
except Exception as e:
logger.error("monitor_loop_error", error=str(e))
self.stop_flag.wait(1)
logger.info("device_status_monitor_stopped")
def start(self):
"""Start the background monitoring thread."""
if self.monitor_thread and self.monitor_thread.is_alive():
logger.warning("monitor_already_running")
return
self.stop_flag.clear()
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self.monitor_thread.start()
logger.info("device_status_monitor_thread_started")
def stop(self):
"""Stop the background monitoring thread."""
if not self.monitor_thread:
return
logger.info("stopping_device_status_monitor")
self.stop_flag.set()
self.monitor_thread.join(timeout=self.check_interval_seconds + 5)
# Save final state
self._save_status()
logger.info("device_status_monitor_stopped")
def get_status(self, device_id: str) -> dict[str, Any] | None:
"""
Get status for a specific device.
Args:
device_id: Device identifier to query.
Returns:
Dict with device status or None if not found
"""
with self.lock:
device = self.devices.get(device_id)
if not device:
return None
return {
"device_id": device.device_id,
"is_online": device.is_online,
"last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
"last_state_change": datetime.fromtimestamp(
device.last_state_change, tz=timezone.utc
).isoformat(),
"seconds_since_seen": int(time.time() - device.last_seen),
}
def get_all_status(self) -> dict[str, dict[str, Any]]:
"""Get status information for all tracked devices.
Returns:
Mapping of device IDs to current status dictionaries.
"""
with self.lock:
now = time.time()
return {
device_id: {
"device_id": device.device_id,
"is_online": device.is_online,
"last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
"last_state_change": datetime.fromtimestamp(
device.last_state_change, tz=timezone.utc
).isoformat(),
"seconds_since_seen": int(now - device.last_seen),
}
for device_id, device in self.devices.items()
}