fix(device-availability): fix timeout monitor and bridge-restart race conditions

- status_monitor: add availability_managed set; _monitor_loop skips devices
  in this set so the LWT/availability topic is the sole online/offline source
- device_manager: register each availability-managed device via
  status_monitor.set_availability_managed() so the monitor actually skips it
  (previously the monitor had no knowledge of
  DeviceManager.availability_managed)
- mqtt_bridge: remove blanket 'reset all devices to offline' on bridge restart;
  this was causing a race condition where the cron reset state AFTER the bridge
  had already sent device_online events via retained MQTT messages;
  stale running session cleanup is kept (still needed)
This commit is contained in:
Matthias Lotz 2026-03-19 19:11:03 +01:00
parent 2fb45a6582
commit 0bce1e1bed
3 changed files with 20 additions and 8 deletions

View File

@ -191,17 +191,17 @@ class MqttBridge(models.Model):
),
})
# Bridge came back online → push fresh config and reset device states.
# This ensures Odoo and Bridge are in sync after a restart.
# Bridge came back online → push fresh config and close stale sessions.
# Device states are NOT reset here - instead we rely on the bridge
# sending device_online/device_offline events after reconnect.
# (Resetting here would race with the device_online events that
# the bridge sends immediately on startup via retained MQTT messages.)
if was_offline:
_logger.info(
f"Bridge {bridge.name} came back online pushing config and resetting device states"
f"Bridge {bridge.name} came back online pushing config and closing stale sessions"
)
# Reset stale device states to 'offline' so UI is consistent
# until the bridge reports real events.
devices = self.env['ows.mqtt.device'].sudo().search([('active', '=', True)])
devices.write({'state': 'offline'})
# Also close any stale 'running' sessions
# Close any stale 'running' sessions - they couldn't have ended
# cleanly while the bridge was offline
stale_sessions = self.env['ows.mqtt.session'].sudo().search([
('status', '=', 'running')
])

View File

@ -151,6 +151,8 @@ class DeviceManager:
self.device_map[avail_topic] = device_id
self.mqtt_client.subscribe(avail_topic)
self.availability_managed.add(device_id)
if self.status_monitor:
self.status_monitor.set_availability_managed(device_id)
logger.info("availability_topic_subscribed", device_id=device_id, topic=avail_topic)
status_topic = pcfg.get("status_topic", "")

View File

@ -64,6 +64,9 @@ class DeviceStatusMonitor:
self.devices: dict[str, DeviceStatus] = {}
self.lock = threading.Lock()
# Devices managed by LWT/availability topic - timeout monitor skips them
self.availability_managed: set[str] = set()
# Background thread
self.monitor_thread: threading.Thread | None = None
self.stop_flag = threading.Event()
@ -152,6 +155,10 @@ class DeviceStatusMonitor:
self._emit_device_online(device_id, now)
self._save_status()
def set_availability_managed(self, device_id: str):
    """Flag ``device_id`` as availability-managed (LWT).

    The id is added to ``self.availability_managed``. Adding an id that is
    already present leaves the set unchanged.
    """
    managed_ids = self.availability_managed
    managed_ids.add(device_id)
def mark_online_silent(self, device_id: str):
"""Mark device as online and update last_seen WITHOUT emitting an event.
@ -232,6 +239,9 @@ class DeviceStatusMonitor:
with self.lock:
for device_id, device in self.devices.items():
# Skip devices whose online/offline is managed by LWT
if device_id in self.availability_managed:
continue
# Check if device timed out
if device.is_online:
elapsed = now - device.last_seen