# odoo_mqtt/iot_bridge/utils/status_monitor.py

"""
Device Status Monitor - Tracks device online/offline status based on MQTT activity.

Monitors last_seen timestamps and emits device_online/device_offline events to Odoo.
"""

import json
import threading
import time
import uuid
from collections.abc import Callable
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import structlog

# Module-level structlog logger; the monitor below logs structured events
# (event name plus keyword fields) through it.
logger = structlog.get_logger()


@dataclass
class DeviceStatus:
    """Status information for a device.

    The monitor in this module serialises instances with
    ``dataclasses.asdict`` and rebuilds them with ``DeviceStatus(**record)``,
    so the field names double as the keys of each persisted JSON record.
    """

    device_id: str  # Device identifier (also the key in the monitor's device map)
    last_seen: float  # Unix timestamp
    is_online: bool  # Current online/offline flag
    last_state_change: float  # Unix timestamp of last online/offline transition


class DeviceStatusMonitor:
    """
    Monitors device activity and tracks online/offline status.

    - Tracks last_seen timestamp per device on every MQTT message
    - Runs background thread that checks timeouts
    - Emits device_online/device_offline events
    - Persists status to /data/device_status.json for restart recovery
    """

    def __init__(
        self,
        timeout_seconds: int = 30,
        check_interval_seconds: int = 5,
        persistence_path: str = "/data/device_status.json",
        event_callback: Callable | None = None,
    ):
        """
        Initialize Device Status Monitor.

        Args:
            timeout_seconds: Seconds without message before device is marked offline
            check_interval_seconds: How often to check for timeouts
            persistence_path: Path to persist device status
            event_callback: Callback(event_dict) for device_online/offline events
        """
        self.timeout_seconds = timeout_seconds
        self.check_interval_seconds = check_interval_seconds
        self.persistence_path = Path(persistence_path)
        self.event_callback = event_callback

        # Device status tracking: device_id -> DeviceStatus
        self.devices: dict[str, DeviceStatus] = {}
        self.lock = threading.Lock()

        # Devices managed by LWT/availability topic - timeout monitor skips them
        self.availability_managed: set[str] = set()

        # Background thread
        self.monitor_thread: threading.Thread | None = None
        self.stop_flag = threading.Event()

        # Load persisted status
        self._load_status()

        logger.info(
            "device_status_monitor_initialized",
            timeout_s=timeout_seconds,
            check_interval_s=check_interval_seconds,
            persistence_path=str(self.persistence_path),
        )

    def _load_status(self):
        """Load persisted device status from disk."""
        if not self.persistence_path.exists():
            logger.info("no_persisted_status_found", path=str(self.persistence_path))
            return

        try:
            with open(self.persistence_path) as f:
                data = json.load(f)

            for device_id, status_dict in data.items():
                self.devices[device_id] = DeviceStatus(**status_dict)

            logger.info(
                "device_status_loaded",
                device_count=len(self.devices),
                path=str(self.persistence_path),
            )
        except Exception as e:
            logger.error("failed_to_load_device_status", error=str(e))

    def _save_status(self):
        """Persist device status to disk."""
        try:
            # Ensure directory exists
            self.persistence_path.parent.mkdir(parents=True, exist_ok=True)

            # Convert to dict
            data = {device_id: asdict(status) for device_id, status in self.devices.items()}

            # Write atomically (write to temp, then rename)
            temp_path = self.persistence_path.with_suffix(".tmp")
            with open(temp_path, "w") as f:
                json.dump(data, f, indent=2)

            temp_path.replace(self.persistence_path)

            logger.debug("device_status_saved", device_count=len(self.devices))
        except Exception as e:
            logger.error("failed_to_save_device_status", error=str(e))

    def update_last_seen(self, device_id: str):
        """
        Update last_seen timestamp for a device.

        Called on every MQTT message. If device was offline, emit device_online event.

        Args:
            device_id: Device identifier
        """
        now = time.time()

        with self.lock:
            if device_id not in self.devices:
                # New device - mark as online
                self.devices[device_id] = DeviceStatus(
                    device_id=device_id, last_seen=now, is_online=True, last_state_change=now
                )
                logger.info("device_registered", device_id=device_id)
                self._emit_device_online(device_id, now)
                self._save_status()
            else:
                # Existing device - update last_seen
                device = self.devices[device_id]
                device.last_seen = now

                # If was offline, mark online and emit event
                if not device.is_online:
                    device.is_online = True
                    device.last_state_change = now
                    logger.info("device_came_online", device_id=device_id)
                    self._emit_device_online(device_id, now)
                    self._save_status()

    def set_availability_managed(self, device_id: str):
        """Register a device as LWT-managed so the timeout monitor skips it."""
        self.availability_managed.add(device_id)

    def mark_online_silent(self, device_id: str):
        """Mark device as online and update last_seen WITHOUT emitting an event.

        Used by route_message when the availability message itself already
        enqueues the device_online event – avoids double emission.

        Args:
            device_id: Device identifier
        """
        now = time.time()
        with self.lock:
            if device_id not in self.devices:
                self.devices[device_id] = DeviceStatus(
                    device_id=device_id, last_seen=now, is_online=True, last_state_change=now
                )
            else:
                device = self.devices[device_id]
                device.last_seen = now
                if not device.is_online:
                    device.is_online = True
                    device.last_state_change = now
            self._save_status()

    def _emit_device_online(self, device_id: str, timestamp: float):
        """Emit device_online event to Odoo."""
        if not self.event_callback:
            return

        event = {
            "event_uid": str(uuid.uuid4()),
            "event_type": "device_online",
            "device_id": device_id,
            "timestamp": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
            "payload": {
                "last_seen": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
                "reason": "activity_detected",
            },
        }

        try:
            self.event_callback(event)
        except Exception as e:
            logger.error("failed_to_emit_device_online", device_id=device_id, error=str(e))

    def _emit_device_offline(self, device_id: str, timestamp: float, reason: str = "timeout"):
        """Emit device_offline event to Odoo."""
        if not self.event_callback:
            return

        device = self.devices.get(device_id)
        if not device:
            return

        event = {
            "event_uid": str(uuid.uuid4()),
            "event_type": "device_offline",
            "device_id": device_id,
            "timestamp": datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat(),
            "payload": {
                "last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
                "reason": reason,
                "offline_duration_s": int(timestamp - device.last_seen),
            },
        }

        try:
            self.event_callback(event)
        except Exception as e:
            logger.error("failed_to_emit_device_offline", device_id=device_id, error=str(e))

    def _monitor_loop(self):
        """Background thread that checks for device timeouts."""
        logger.info("device_status_monitor_started")

        while not self.stop_flag.is_set():
            try:
                now = time.time()

                with self.lock:
                    for device_id, device in self.devices.items():
                        # Skip devices whose online/offline is managed by LWT
                        if device_id in self.availability_managed:
                            continue
                        # Check if device timed out
                        if device.is_online:
                            elapsed = now - device.last_seen
                            if elapsed > self.timeout_seconds:
                                # Mark offline
                                device.is_online = False
                                device.last_state_change = now
                                logger.warning(
                                    "device_went_offline",
                                    device_id=device_id,
                                    elapsed_s=int(elapsed),
                                    timeout_s=self.timeout_seconds,
                                )
                                self._emit_device_offline(device_id, now, reason="timeout")
                                self._save_status()

                # Sleep with interruptible wait
                self.stop_flag.wait(self.check_interval_seconds)

            except Exception as e:
                logger.error("monitor_loop_error", error=str(e))
                self.stop_flag.wait(1)

        logger.info("device_status_monitor_stopped")

    def start(self):
        """Start the background monitoring thread."""
        if self.monitor_thread and self.monitor_thread.is_alive():
            logger.warning("monitor_already_running")
            return

        self.stop_flag.clear()
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info("device_status_monitor_thread_started")

    def stop(self):
        """Stop the background monitoring thread."""
        if not self.monitor_thread:
            return

        logger.info("stopping_device_status_monitor")
        self.stop_flag.set()
        self.monitor_thread.join(timeout=self.check_interval_seconds + 5)

        # Save final state
        self._save_status()

        logger.info("device_status_monitor_stopped")

    def get_status(self, device_id: str) -> dict[str, Any] | None:
        """
        Get status for a specific device.

        Args:
            device_id: Device identifier to query.

        Returns:
            Dict with device status or None if not found
        """
        with self.lock:
            device = self.devices.get(device_id)
            if not device:
                return None

            return {
                "device_id": device.device_id,
                "is_online": device.is_online,
                "last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
                "last_state_change": datetime.fromtimestamp(
                    device.last_state_change, tz=timezone.utc
                ).isoformat(),
                "seconds_since_seen": int(time.time() - device.last_seen),
            }

    def get_all_status(self) -> dict[str, dict[str, Any]]:
        """Get status information for all tracked devices.

        Returns:
            Mapping of device IDs to current status dictionaries.
        """
        with self.lock:
            now = time.time()
            return {
                device_id: {
                    "device_id": device.device_id,
                    "is_online": device.is_online,
                    "last_seen": datetime.fromtimestamp(device.last_seen, tz=timezone.utc).isoformat(),
                    "last_state_change": datetime.fromtimestamp(
                        device.last_state_change, tz=timezone.utc
                    ).isoformat(),
                    "seconds_since_seen": int(now - device.last_seen),
                }
                for device_id, device in self.devices.items()
            }