La-Fabrik/backend/hand_tracker.py

from __future__ import annotations

import base64
import math
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import cv2
import mediapipe as mp
import numpy as np
from mediapipe.tasks import python
from mediapipe.tasks.python import vision


@dataclass(frozen=True)
class HandData:
    x: float
    y: float
    z: float
    landmarks: list[dict[str, float]]
    handedness: str
    is_pinch: bool
    pinch_distance: float
    score: float

    def to_payload(self) -> dict[str, float | str | bool | list[dict[str, float]]]:
        return {
            "x": self.x,
            "y": self.y,
            "z": self.z,
            "landmarks": self.landmarks,
            "handedness": self.handedness,
            "isPinch": self.is_pinch,
            "pinchDistance": self.pinch_distance,
            "score": self.score,
        }


class HandTracker:
    def __init__(self, max_hands: int = 2) -> None:
        model_path = Path(__file__).with_name("hand_landmarker.task")
        if not model_path.exists():
            raise FileNotFoundError(
                "Missing hand_landmarker.task. Run `python backend/download_model.py`.",
            )

        base_options = python.BaseOptions(model_asset_path=str(model_path))
        options = vision.HandLandmarkerOptions(
            base_options=base_options,
            running_mode=vision.RunningMode.IMAGE,
            num_hands=max_hands,
        )
        self._detector = vision.HandLandmarker.create_from_options(options)

    def detect_from_base64_jpeg(self, image_base64: str) -> list[HandData]:
        image_data = base64.b64decode(image_base64, validate=True)
        image_buffer = np.frombuffer(image_data, dtype=np.uint8)
        frame = cv2.imdecode(image_buffer, cv2.IMREAD_COLOR)
        if frame is None:
            raise ValueError("Invalid JPEG frame")

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
        result = self._detector.detect(mp_image)
        return self._to_hands(result)

    def close(self) -> None:
        self._detector.close()

    def _to_hands(self, result: vision.HandLandmarkerResult) -> list[HandData]:
        hands: list[HandData] = []
        if not result.hand_landmarks or not result.handedness:
            return hands

        for landmarks, handedness_categories in zip(
            result.hand_landmarks,
            result.handedness,
        ):
            index_tip = landmarks[8]
            thumb_tip = landmarks[4]
            pinch_distance = self._calculate_distance(index_tip, thumb_tip)
            handedness = handedness_categories[0]

            hands.append(
                HandData(
                    x=index_tip.x,
                    y=index_tip.y,
                    z=index_tip.z,
                    landmarks=[
                        {"x": point.x, "y": point.y, "z": point.z}
                        for point in landmarks
                    ],
                    handedness=handedness.category_name,
                    is_pinch=pinch_distance < 0.07,
                    pinch_distance=pinch_distance,
                    score=handedness.score,
                ),
            )

        return hands

    def _calculate_distance(self, point_a: Any, point_b: Any) -> float:
        return math.sqrt(
            (point_a.x - point_b.x) ** 2
            + (point_a.y - point_b.y) ** 2
            + (point_a.z - point_b.z) ** 2,
        )


def now_ms() -> int:
    return time.monotonic_ns() // 1_000_000