From 0bce1e1bed2ca77c406550f07f2ec49a5d74838f Mon Sep 17 00:00:00 2001 From: "matthias.lotz" Date: Thu, 19 Mar 2026 19:11:03 +0100 Subject: [PATCH] fix(device-availability): fix timeout monitor and bridge-restart race conditions - status_monitor: add availability_managed set; _monitor_loop skips devices in this set so the LWT/availability topic is the sole online/offline source - device_manager: register device with status_monitor.set_availability_managed() so the monitor actually skips them (previously the monitor had no knowledge of DeviceManager.availability_managed) - mqtt_bridge: remove blanket 'reset all devices to offline' on bridge restart; this was causing a race condition where the cron reset state AFTER the bridge had already sent device_online events via retained MQTT messages; stale running session cleanup is kept (still needed) --- .../open_workshop_mqtt/models/mqtt_bridge.py | 16 ++++++++-------- iot_bridge/core/device_manager.py | 2 ++ iot_bridge/utils/status_monitor.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/extra-addons/open_workshop/open_workshop_mqtt/models/mqtt_bridge.py b/extra-addons/open_workshop/open_workshop_mqtt/models/mqtt_bridge.py index 29ac598..03964dd 100644 --- a/extra-addons/open_workshop/open_workshop_mqtt/models/mqtt_bridge.py +++ b/extra-addons/open_workshop/open_workshop_mqtt/models/mqtt_bridge.py @@ -191,17 +191,17 @@ class MqttBridge(models.Model): ), }) - # Bridge came back online → push fresh config and reset device states. - # This ensures Odoo and Bridge are in sync after a restart. + # Bridge came back online → push fresh config and close stale sessions. + # Device states are NOT reset here - instead we rely on the bridge + # sending device_online/device_offline events after reconnect. + # (Resetting here would race with the device_online events that + # the bridge sends immediately on startup via retained MQTT messages.) if was_offline: _logger.info( - f"Bridge {bridge.name} came back online – pushing config and resetting device states" + f"Bridge {bridge.name} came back online – pushing config and closing stale sessions" ) - # Reset stale device states to 'offline' so UI is consistent - # until the bridge reports real events. - devices = self.env['ows.mqtt.device'].sudo().search([('active', '=', True)]) - devices.write({'state': 'offline'}) - # Also close any stale 'running' sessions + # Close any stale 'running' sessions - they couldn't have ended + # cleanly while the bridge was offline stale_sessions = self.env['ows.mqtt.session'].sudo().search([ ('status', '=', 'running') ]) diff --git a/iot_bridge/core/device_manager.py b/iot_bridge/core/device_manager.py index 8ead701..7593ee5 100644 --- a/iot_bridge/core/device_manager.py +++ b/iot_bridge/core/device_manager.py @@ -151,6 +151,8 @@ class DeviceManager: self.device_map[avail_topic] = device_id self.mqtt_client.subscribe(avail_topic) self.availability_managed.add(device_id) + if self.status_monitor: + self.status_monitor.set_availability_managed(device_id) logger.info("availability_topic_subscribed", device_id=device_id, topic=avail_topic) status_topic = pcfg.get("status_topic", "") diff --git a/iot_bridge/utils/status_monitor.py b/iot_bridge/utils/status_monitor.py index 0b109bb..0792774 100644 --- a/iot_bridge/utils/status_monitor.py +++ b/iot_bridge/utils/status_monitor.py @@ -64,6 +64,9 @@ class DeviceStatusMonitor: self.devices: dict[str, DeviceStatus] = {} self.lock = threading.Lock() + # Devices managed by LWT/availability topic - timeout monitor skips them + self.availability_managed: set[str] = set() + # Background thread self.monitor_thread: threading.Thread | None = None self.stop_flag = threading.Event() @@ -152,6 +155,10 @@ class DeviceStatusMonitor: self._emit_device_online(device_id, now) self._save_status() + def set_availability_managed(self, device_id: str): + """Register a device as LWT-managed so the timeout monitor skips it.""" + self.availability_managed.add(device_id) + def mark_online_silent(self, device_id: str): """Mark device as online and update last_seen WITHOUT emitting an event. @@ -232,6 +239,9 @@ class DeviceStatusMonitor: with self.lock: for device_id, device in self.devices.items(): + # Skip devices whose online/offline is managed by LWT + if device_id in self.availability_managed: + continue # Check if device timed out if device.is_online: elapsed = now - device.last_seen