Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions docs/HandGestureDetector.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# HandGestureDetector

Este módulo detecta mãos e classifica gestos utilizando um modelo `.onnx` executado com OpenCV DNN. A classe `HandGestureDetector` segue o mesmo padrão de `YoloDetector` e `FaceDetector`, permitindo fácil integração ao pipeline existente.

## Inserindo o modelo
1. Coloque o arquivo do modelo em `models/hand_gesture.onnx` ou ajuste o caminho em `config.properties` pela chave `handgesture.model.path`.
2. Opcionalmente defina um arquivo com os nomes dos gestos (`handgesture.names`).

## Parâmetros
- `handgesture.enabled` – ativa ou desativa o módulo.
- `handgesture.confThreshold` – confiança mínima para aceitar um gesto.
- `handgesture.nmsThreshold` – limiar para Non‑Max Suppression.
- `handgesture.inputSize` – tamanho do tensor de entrada (padrão 224x224).
- `processing.skipFramesHandGesture` – número de frames a ignorar entre inferências.

## Saída para o front‑end
Cada gesto gera um evento `hand_gesture` via WebSocket com a estrutura:
```json
{
"type": "hand_gesture",
"payload": {
"gestureName": "thumbs_up",
"conf": 0.92,
"personTrackId": 3,
"bbox": [x, y, w, h]
},
"confidence": 0.92,
"priority": "MEDIUM",
"timestamp": 1699999999999
}
```
A sobreposição no vídeo fica ativada por padrão e pode ser desativada com `output.drawHandGestures=false`.

## Desempenho
Para manter latência abaixo de 200 ms, recomenda‑se:
- Utilizar o recorte de pessoas detectadas pelo YOLO (já implementado).
- Ajustar `processing.skipFramesHandGesture` conforme a capacidade da CPU.
- Testar com vídeos ao vivo e imagens estáticas para validar o modelo.

6 changes: 6 additions & 0 deletions src/main/java/dev/bot/zeno/app/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import dev.bot.zeno.dnn.ArcFaceRecognizer;
import dev.bot.zeno.dnn.FaceDetector;
import dev.bot.zeno.dnn.YoloDetector;
import dev.bot.zeno.dnn.HandGestureDetector;
import dev.bot.zeno.domain.Event;
import dev.bot.zeno.domain.Event.Priority;
import dev.bot.zeno.domain.ports.*;
Expand Down Expand Up @@ -212,6 +213,11 @@ private static void startDetectors(Config cfg,
? new ArcFaceRecognizer(cfg) : null;
startDaemonThread("face-detector",
new FaceDetector(cfg, latestFrame, detections, events, recognizer));

if (cfg.getBool("handgesture.enabled", false)) {
startDaemonThread("hand-gesture-detector",
new HandGestureDetector(cfg, latestFrame, detections, events));
}
}

/** Periodically overlays detections on frames and calculates FPS for the preview. */
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/dev/bot/zeno/debug/DebugServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.sun.management.OperatingSystemMXBean;
import dev.bot.zeno.app.EventBus;
import dev.bot.zeno.overlay.DetectionResult;
import dev.bot.zeno.dnn.HandGestureResult;
import dev.bot.zeno.domain.Event;
import io.javalin.Javalin;
import io.javalin.plugin.bundled.CorsPluginConfig;
Expand Down Expand Up @@ -249,8 +250,26 @@ private void broadcastDetections() {
}
}

List<Map<String, Object>> gestures = new ArrayList<>();
synchronized (dr.handGestures) {
for (HandGestureResult g : dr.handGestures) {
Map<String, Object> hg = new LinkedHashMap<>();
hg.put("trackId", g.personTrackId);
hg.put("gestureName", g.gestureName);
hg.put("confidence", (double) g.confidence);
hg.put("bbox", Map.of(
"x", g.bbox.x(),
"y", g.bbox.y(),
"w", g.bbox.width(),
"h", g.bbox.height()
));
gestures.add(hg);
}
}

root.put("objects", objects);
root.put("faces", faces);
root.put("gestures", gestures);

String json = mapper.writeValueAsString(root);
detectionSessions.forEach(s -> {
Expand Down
217 changes: 217 additions & 0 deletions src/main/java/dev/bot/zeno/dnn/HandGestureDetector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package dev.bot.zeno.dnn;

import dev.bot.zeno.overlay.DetectionResult;
import dev.bot.zeno.domain.Event;
import dev.bot.zeno.domain.Event.Priority;
import dev.bot.zeno.util.Config;
import dev.bot.zeno.util.QueueUtils;

import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.opencv.opencv_core.Mat;
import org.bytedeco.opencv.opencv_core.Rect;
import org.bytedeco.opencv.opencv_core.Scalar;
import org.bytedeco.opencv.opencv_core.Size;
import org.bytedeco.opencv.opencv_dnn.Net;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicReference;

import static org.bytedeco.opencv.global.opencv_core.CV_32F;
import static org.bytedeco.opencv.global.opencv_dnn.*;
import static org.bytedeco.opencv.global.opencv_imgproc.*;

/**
 * Hand-gesture detector backed by an ONNX model executed through OpenCV DNN.
 * <p>
 * Inference runs only inside regions where a person was detected by
 * {@link YoloDetector}, which keeps the per-frame cost low. Every accepted
 * gesture is published as an {@link Event} of type {@code "hand_gesture"}
 * and mirrored into {@link DetectionResult} for the video overlay.
 * <p>
 * Intended to run on its own daemon thread; the loop exits when the thread
 * is interrupted.
 */
public class HandGestureDetector implements Runnable {

    private final Config cfg;
    /** Most recent camera frame; shared with (and owned by) other threads. */
    private final AtomicReference<Mat> latestFrame;
    /** Shared detection state: person boxes are read, hand gestures written. */
    private final AtomicReference<DetectionResult> latestDetections;
    private final BlockingQueue<Event> eventQueue;

    private final Net net;
    /** Gesture labels indexed by model class id; may be empty (fallback labels are synthesized). */
    private final List<String> gestureNames;
    private final Size inputSize;
    private final float confThreshold;
    private final float nmsThreshold;
    /** Frames to skip between inferences; 0 processes every frame. */
    private final int skip;
    private int frameCount = 0;

    public HandGestureDetector(Config cfg,
                               AtomicReference<Mat> latestFrame,
                               AtomicReference<DetectionResult> detections,
                               BlockingQueue<Event> eq) {
        this.cfg = cfg;
        this.latestFrame = latestFrame;
        this.latestDetections = detections;
        this.eventQueue = eq;

        String modelPath = cfg.get("handgesture.model.path", "models/hand_gesture.onnx");
        this.net = readNetFromONNX(modelPath);
        net.setPreferableBackend(DNN_BACKEND_OPENCV);
        net.setPreferableTarget(DNN_TARGET_CPU);

        this.confThreshold = (float) cfg.getDouble("handgesture.confThreshold", 0.5);
        this.nmsThreshold = (float) cfg.getDouble("handgesture.nmsThreshold", 0.45);
        int inp = cfg.getInt("handgesture.inputSize", 224);
        this.inputSize = new Size(inp, inp);
        this.skip = Math.max(0, cfg.getInt("processing.skipFramesHandGesture", 1));

        // Load gesture names when the file exists; otherwise fall back to an
        // empty list and synthesize "gesture_<id>" labels at detection time.
        List<String> names;
        try {
            String namesPath = cfg.get("handgesture.names", "models/hand_gesture.names");
            names = Files.exists(Paths.get(namesPath)) ? Files.readAllLines(Paths.get(namesPath)) : List.of();
        } catch (Exception e) {
            names = List.of();
        }
        this.gestureNames = List.copyOf(names);
    }

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            Mat src = latestFrame.get();
            if (src == null || src.empty()) {
                try { Thread.sleep(2); } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
                continue;
            }

            // Honour the frame-skip setting BEFORE paying for a clone.
            frameCount++;
            if (skip > 0 && (frameCount % (skip + 1)) != 0) {
                try { Thread.sleep(2); } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
                continue;
            }

            // Work on a private copy. The Mat held by latestFrame is shared
            // with the other detector threads, so it must NOT be released
            // here: the previous code called src.release(), invalidating the
            // frame for every consumer (use-after-free on native memory).
            Mat frame = src.clone();

            DetectionResult dr = latestDetections.get();
            if (dr == null) {
                frame.release();
                continue;
            }

            // Snapshot the person boxes produced by the YOLO detector.
            List<DetectionResult.Box> persons = new ArrayList<>();
            synchronized (dr.yoloBoxes) {
                for (DetectionResult.Box b : dr.yoloBoxes) {
                    if ("person".equalsIgnoreCase(b.label)) {
                        persons.add(b);
                    }
                }
            }

            List<HandGestureResult> newGestures = new ArrayList<>();
            for (DetectionResult.Box person : persons) {
                // Clamp the person box to the frame. The previous clamping
                // kept raw x/y, so partially off-screen boxes could produce
                // a ROI with negative or oversized dimensions.
                Rect r = person.rect;
                int x1 = Math.max(0, r.x());
                int y1 = Math.max(0, r.y());
                int x2 = Math.min(frame.cols(), r.x() + r.width());
                int y2 = Math.min(frame.rows(), r.y() + r.height());
                if (x2 <= x1 || y2 <= y1) {
                    continue; // box lies entirely outside the frame
                }
                detectInRoi(frame, new Rect(x1, y1, x2 - x1, y2 - y1), person, newGestures);
            }

            synchronized (dr.handGestures) {
                dr.clearHandGestures();
                dr.handGestures.addAll(newGestures);
            }

            frame.release();
        }
    }

    /** Runs the model on one person ROI and appends any accepted gestures to {@code results}. */
    private void detectInRoi(Mat frame, Rect roiRect, DetectionResult.Box person,
                             List<HandGestureResult> results) {
        Mat roi = new Mat(frame, roiRect);
        Mat blob = blobFromImage(roi, 1.0 / 255.0, inputSize, new Scalar(0.0), true, false, CV_32F);
        Mat out = null;
        try {
            net.setInput(blob);
            out = net.forward();
            parseOutput(out, roi.cols(), roi.rows(), roiRect, person, results);
        } finally {
            // Release native memory even when inference or parsing throws.
            if (out != null) out.release();
            blob.release();
            roi.release();
        }
    }

    /**
     * Decodes the raw network output into gesture results and publishes one
     * event per accepted detection.
     * <p>
     * Assumes an SSD-style tensor laid out as [1,1,N,7] with rows of
     * (imageId, classId, conf, x1, y1, x2, y2), coordinates normalized to the
     * ROI — TODO confirm against the deployed model.
     */
    private void parseOutput(Mat out, int w, int h, Rect roiRect,
                             DetectionResult.Box person, List<HandGestureResult> results) {
        // Cast pointer over memory owned by 'out'; must NOT be deallocated
        // here — the Mat frees it on release().
        FloatPointer fp = new FloatPointer(out.data());
        int detections = out.size(2);
        List<Rect> boxes = new ArrayList<>();
        List<Float> scores = new ArrayList<>();
        List<Integer> classIds = new ArrayList<>();

        for (int i = 0; i < detections; i++) {
            int base = i * 7;
            float conf = fp.get(base + 2);
            if (conf < confThreshold) continue;
            int classId = Math.round(fp.get(base + 1));
            int x1 = Math.max(0, Math.round(fp.get(base + 3) * w));
            int y1 = Math.max(0, Math.round(fp.get(base + 4) * h));
            int x2 = Math.min(w - 1, Math.round(fp.get(base + 5) * w));
            int y2 = Math.min(h - 1, Math.round(fp.get(base + 6) * h));
            boxes.add(new Rect(x1, y1, Math.max(0, x2 - x1), Math.max(0, y2 - y1)));
            scores.add(conf);
            classIds.add(classId);
        }
        if (boxes.isEmpty()) return;

        for (int idx : runNms(boxes, scores)) {
            Rect b = boxes.get(idx);
            // Translate from ROI-local back to full-frame coordinates.
            Rect abs = new Rect(roiRect.x() + b.x(), roiRect.y() + b.y(), b.width(), b.height());
            HandGestureResult result = new HandGestureResult(
                    abs, gestureNameFor(classIds.get(idx)), scores.get(idx),
                    person.id, Instant.now().toEpochMilli());
            results.add(result);
            publish(result);
        }
    }

    /** Applies Non-Max Suppression and returns the indices of the kept boxes. */
    private int[] runNms(List<Rect> boxes, List<Float> scores) {
        int size = boxes.size();
        org.bytedeco.opencv.opencv_core.Rect2dVector boxesVec = new org.bytedeco.opencv.opencv_core.Rect2dVector();
        FloatPointer confVec = new FloatPointer(size);
        for (int i = 0; i < size; i++) {
            Rect b = boxes.get(i);
            boxesVec.push_back(new org.bytedeco.opencv.opencv_core.Rect2d(b.x(), b.y(), b.width(), b.height()));
            confVec.put(i, scores.get(i));
        }
        org.bytedeco.javacpp.IntPointer indices = new org.bytedeco.javacpp.IntPointer(size);
        NMSBoxes(boxesVec, confVec, confThreshold, nmsThreshold, indices, 1.f, 0);

        int kept = (int) indices.limit();
        int[] keptIdx = new int[kept];
        for (int i = 0; i < kept; i++) keptIdx[i] = indices.get(i);

        boxesVec.deallocate();
        confVec.deallocate();
        indices.deallocate();
        return keptIdx;
    }

    /** Maps a model class id to a label, falling back to "gesture_&lt;id&gt;". */
    private String gestureNameFor(int classId) {
        return classId >= 0 && classId < gestureNames.size()
                ? gestureNames.get(classId)
                : "gesture_" + classId;
    }

    /** Emits a "hand_gesture" event on the shared event queue. */
    private void publish(HandGestureResult g) {
        Map<String, Object> payload = Map.of(
                "gestureName", g.gestureName,
                "conf", (double) g.confidence,
                "personTrackId", g.personTrackId,
                "bbox", List.of(g.bbox.x(), g.bbox.y(), g.bbox.width(), g.bbox.height())
        );
        Event ev = new Event.Builder()
                .type("hand_gesture")
                .payload(payload)
                .confidence(g.confidence)
                .priority(Priority.MEDIUM)
                .build();
        QueueUtils.offerLatest(eventQueue, ev);
    }
}

44 changes: 44 additions & 0 deletions src/main/java/dev/bot/zeno/dnn/HandGestureResult.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package dev.bot.zeno.dnn;

import org.bytedeco.opencv.opencv_core.Rect;

import java.time.Instant;

/**
 * Immutable value object describing a single detected hand gesture.
 * <p>
 * Carries the hand's bounding box in full-frame coordinates, the recognized
 * gesture label, the model confidence, the track id of the person the hand
 * belongs to and the detection timestamp. Because instances are immutable,
 * they can be shared freely between the detector and consumer threads.
 */
public class HandGestureResult {

    /** Bounding box of the hand, in coordinates of the original frame. */
    public final Rect bbox;

    /** Recognized gesture label (e.g. "thumbs_up"). */
    public final String gestureName;

    /** Model confidence for this gesture, in the range 0..1. */
    public final float confidence;

    /** Track id of the owning person, inherited from the YOLO person detection. */
    public final int personTrackId;

    /** Detection time, in epoch milliseconds. */
    public final long timestamp;

    /** Full constructor; every field is supplied by the caller. */
    public HandGestureResult(Rect bbox, String gestureName, float confidence,
                             int personTrackId, long timestamp) {
        this.bbox = bbox;
        this.gestureName = gestureName;
        this.confidence = confidence;
        this.personTrackId = personTrackId;
        this.timestamp = timestamp;
    }

    /** Convenience constructor that stamps the result with the current time. */
    public HandGestureResult(Rect bbox, String gestureName, float confidence, int personTrackId) {
        this(bbox, gestureName, confidence, personTrackId, Instant.now().toEpochMilli());
    }
}

13 changes: 13 additions & 0 deletions src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package dev.bot.zeno.overlay;

import dev.bot.zeno.dnn.HandGestureResult;
import dev.bot.zeno.util.Config;
import org.bytedeco.opencv.opencv_core.*;

Expand Down Expand Up @@ -45,5 +46,17 @@ public static void drawAll(Mat img, DetectionResult res, Config cfg) {
}
}
}
if (cfg.getBool("output.drawHandGestures", true)) {
synchronized (res.handGestures) {
for (HandGestureResult g : res.handGestures) {
Scalar color = new Scalar(255, 0, 255, 0); // magenta
rectangle(img, g.bbox, color, 2, LINE_8, 0);
String text = g.gestureName + " #" + g.personTrackId + String.format(" %.2f", g.confidence);
putText(img, text,
new Point(g.bbox.x(), Math.max(0, g.bbox.y() - 5)),
FONT_HERSHEY_SIMPLEX, 0.6, color, 2, LINE_AA, false);
}
}
}
}
}
Loading