diff --git a/docs/HandGestureDetector.md b/docs/HandGestureDetector.md new file mode 100644 index 0000000..6933844 --- /dev/null +++ b/docs/HandGestureDetector.md @@ -0,0 +1,39 @@ +# HandGestureDetector + +Este módulo detecta mãos e classifica gestos utilizando um modelo `.onnx` executado com OpenCV DNN. A classe `HandGestureDetector` segue o mesmo padrão de `YoloDetector` e `FaceDetector`, permitindo fácil integração ao pipeline existente. + +## Inserindo o modelo +1. Coloque o arquivo do modelo em `models/hand_gesture.onnx` ou ajuste o caminho em `config.properties` pela chave `handgesture.model.path`. +2. Opcionalmente defina um arquivo com os nomes dos gestos (`handgesture.names`). + +## Parâmetros +- `handgesture.enabled` – ativa ou desativa o módulo. +- `handgesture.confThreshold` – confiança mínima para aceitar um gesto. +- `handgesture.nmsThreshold` – limiar para Non‑Max Suppression. +- `handgesture.inputSize` – lado, em pixels, da entrada quadrada do modelo (valor único; padrão 224, ou seja, 224x224). +- `processing.skipFramesHandGesture` – número de frames a ignorar entre inferências. + +## Saída para o front‑end +Cada gesto gera um evento `hand_gesture` via WebSocket com a estrutura: +```json +{ + "type": "hand_gesture", + "payload": { + "gestureName": "thumbs_up", + "conf": 0.92, + "personTrackId": 3, + "bbox": [x, y, w, h] + }, + "confidence": 0.92, + "priority": "MEDIUM", + "timestamp": 1699999999999 +} +``` +A sobreposição no vídeo pode ser ativada com `output.drawHandGestures=true`. + +## Desempenho +Para manter latência abaixo de 200 ms, recomenda‑se: +- Utilizar o recorte de pessoas detectadas pelo YOLO (já implementado). +- Ajustar `processing.skipFramesHandGesture` conforme a capacidade da CPU. +- Testar com vídeos ao vivo e imagens estáticas para validar o modelo. 
+ diff --git a/src/main/java/dev/bot/zeno/app/Main.java b/src/main/java/dev/bot/zeno/app/Main.java index 9fb8e0a..d4c124c 100644 --- a/src/main/java/dev/bot/zeno/app/Main.java +++ b/src/main/java/dev/bot/zeno/app/Main.java @@ -7,6 +7,7 @@ import dev.bot.zeno.dnn.ArcFaceRecognizer; import dev.bot.zeno.dnn.FaceDetector; import dev.bot.zeno.dnn.YoloDetector; +import dev.bot.zeno.dnn.HandGestureDetector; import dev.bot.zeno.domain.Event; import dev.bot.zeno.domain.Event.Priority; import dev.bot.zeno.domain.ports.*; @@ -212,6 +213,11 @@ private static void startDetectors(Config cfg, ? new ArcFaceRecognizer(cfg) : null; startDaemonThread("face-detector", new FaceDetector(cfg, latestFrame, detections, events, recognizer)); + + if (cfg.getBool("handgesture.enabled", false)) { + startDaemonThread("hand-gesture-detector", + new HandGestureDetector(cfg, latestFrame, detections, events)); + } } /** Periodically overlays detections on frames and calculates FPS for the preview. */ diff --git a/src/main/java/dev/bot/zeno/debug/DebugServer.java b/src/main/java/dev/bot/zeno/debug/DebugServer.java index 7cc0d3a..f5408a0 100644 --- a/src/main/java/dev/bot/zeno/debug/DebugServer.java +++ b/src/main/java/dev/bot/zeno/debug/DebugServer.java @@ -4,6 +4,7 @@ import com.sun.management.OperatingSystemMXBean; import dev.bot.zeno.app.EventBus; import dev.bot.zeno.overlay.DetectionResult; +import dev.bot.zeno.dnn.HandGestureResult; import dev.bot.zeno.domain.Event; import io.javalin.Javalin; import io.javalin.plugin.bundled.CorsPluginConfig; @@ -249,8 +250,26 @@ private void broadcastDetections() { } } + List> gestures = new ArrayList<>(); + synchronized (dr.handGestures) { + for (HandGestureResult g : dr.handGestures) { + Map hg = new LinkedHashMap<>(); + hg.put("trackId", g.personTrackId); + hg.put("gestureName", g.gestureName); + hg.put("confidence", (double) g.confidence); + hg.put("bbox", Map.of( + "x", g.bbox.x(), + "y", g.bbox.y(), + "w", g.bbox.width(), + "h", g.bbox.height() + )); + 
gestures.add(hg); + } + } + root.put("objects", objects); root.put("faces", faces); + root.put("gestures", gestures); String json = mapper.writeValueAsString(root); detectionSessions.forEach(s -> { diff --git a/src/main/java/dev/bot/zeno/dnn/HandGestureDetector.java b/src/main/java/dev/bot/zeno/dnn/HandGestureDetector.java new file mode 100644 index 0000000..3283908 --- /dev/null +++ b/src/main/java/dev/bot/zeno/dnn/HandGestureDetector.java @@ -0,0 +1,217 @@ +package dev.bot.zeno.dnn; + +import dev.bot.zeno.overlay.DetectionResult; +import dev.bot.zeno.domain.Event; +import dev.bot.zeno.domain.Event.Priority; +import dev.bot.zeno.util.Config; +import dev.bot.zeno.util.QueueUtils; + +import org.bytedeco.javacpp.FloatPointer; +import org.bytedeco.opencv.opencv_core.Mat; +import org.bytedeco.opencv.opencv_core.Rect; +import org.bytedeco.opencv.opencv_core.Scalar; +import org.bytedeco.opencv.opencv_core.Size; +import org.bytedeco.opencv.opencv_dnn.Net; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.atomic.AtomicReference; + +import static org.bytedeco.opencv.global.opencv_core.CV_32F; +import static org.bytedeco.opencv.global.opencv_dnn.*; +import static org.bytedeco.opencv.global.opencv_imgproc.*; + +/** + * Detector de gestos de mão utilizando modelos ONNX e OpenCV DNN. + *

+ * O detector executa apenas dentro das regiões onde uma pessoa foi + * detectada pelo {@link YoloDetector}, reduzindo o custo de inferência. + * Cada gesto identificado gera um {@link Event} com tipo "hand_gesture". + */ +public class HandGestureDetector implements Runnable { + + private final Config cfg; + private final AtomicReference latestFrame; + private final AtomicReference latestDetections; + private final BlockingQueue eventQueue; + + private final Net net; + private final List gestureNames; + private final Size inputSize; + private final float confThreshold; + private final float nmsThreshold; + private final int skip; + private int frameCount = 0; + + public HandGestureDetector(Config cfg, + AtomicReference latestFrame, + AtomicReference detections, + BlockingQueue eq) { + this.cfg = cfg; + this.latestFrame = latestFrame; + this.latestDetections = detections; + this.eventQueue = eq; + + String modelPath = cfg.get("handgesture.model.path", "models/hand_gesture.onnx"); + this.net = readNetFromONNX(modelPath); + net.setPreferableBackend(DNN_BACKEND_OPENCV); + net.setPreferableTarget(DNN_TARGET_CPU); + + this.confThreshold = (float) cfg.getDouble("handgesture.confThreshold", 0.5); + this.nmsThreshold = (float) cfg.getDouble("handgesture.nmsThreshold", 0.45); + int inp = cfg.getInt("handgesture.inputSize", 224); + this.inputSize = new Size(inp, inp); + this.skip = Math.max(0, cfg.getInt("processing.skipFramesHandGesture", 1)); + + // Carrega nomes de gestos se arquivo for fornecido; caso contrário, + // assume lista vazia, utilizando índices como fallback. + List names; + try { + String namesPath = cfg.get("handgesture.names", "models/hand_gesture.names"); + names = Files.exists(Paths.get(namesPath)) ? 
Files.readAllLines(Paths.get(namesPath)) : List.of(); + } catch (Exception e) { + names = List.of(); + } + this.gestureNames = names; + } + + @Override + public void run() { + while (!Thread.currentThread().isInterrupted()) { + Mat src = latestFrame.get(); + if (src == null || src.empty()) { + try { Thread.sleep(2); } catch (InterruptedException e) { break; } + continue; + } + + Mat frame = src.clone(); + src.release(); + + frameCount++; + if (skip > 0 && (frameCount % (skip + 1)) != 0) { + frame.release(); + continue; + } + + // Copia boxes de pessoas detectadas pelo YOLO. + List persons = new ArrayList<>(); + DetectionResult dr = latestDetections.get(); + synchronized (dr.yoloBoxes) { + for (DetectionResult.Box b : dr.yoloBoxes) { + if ("person".equalsIgnoreCase(b.label)) { + persons.add(b); + } + } + } + + List newGestures = new ArrayList<>(); + for (DetectionResult.Box person : persons) { + Rect r = person.rect; + // Garante que ROI esteja dentro dos limites da imagem. + Rect roiRect = new Rect( + Math.max(0, r.x()), + Math.max(0, r.y()), + Math.min(r.width(), frame.cols() - r.x()), + Math.min(r.height(), frame.rows() - r.y()) + ); + Mat roi = new Mat(frame, roiRect); + + Mat blob = blobFromImage(roi, 1.0 / 255.0, inputSize, new Scalar(0.0), true, false, CV_32F); + net.setInput(blob); + Mat out = net.forward(); + + FloatPointer fp = new FloatPointer(out.data()); + int detections = out.size(2); // assume formato [1,1,N,7] + List boxes = new ArrayList<>(); + List scores = new ArrayList<>(); + List classIds = new ArrayList<>(); + + int w = roi.cols(); + int h = roi.rows(); + for (int i = 0; i < detections; i++) { + int base = i * 7; + float conf = fp.get(base + 2); + if (conf < confThreshold) continue; + int classId = Math.round(fp.get(base + 1)); + int x1 = Math.max(0, Math.round(fp.get(base + 3) * w)); + int y1 = Math.max(0, Math.round(fp.get(base + 4) * h)); + int x2 = Math.min(w - 1, Math.round(fp.get(base + 5) * w)); + int y2 = Math.min(h - 1, 
Math.round(fp.get(base + 6) * h)); + boxes.add(new Rect(x1, y1, Math.max(0, x2 - x1), Math.max(0, y2 - y1))); + scores.add(conf); + classIds.add(classId); + } + + // NMS para remover boxes redundantes dentro do ROI. + if (!boxes.isEmpty()) { + int size = boxes.size(); + org.bytedeco.opencv.opencv_core.Rect2dVector boxesVec = new org.bytedeco.opencv.opencv_core.Rect2dVector(); + org.bytedeco.javacpp.FloatPointer confVec = new org.bytedeco.javacpp.FloatPointer(size); + for (int i = 0; i < size; i++) { + org.bytedeco.opencv.opencv_core.Rect b = boxes.get(i); + boxesVec.push_back(new org.bytedeco.opencv.opencv_core.Rect2d(b.x(), b.y(), b.width(), b.height())); + confVec.put(i, scores.get(i)); + } + org.bytedeco.javacpp.IntPointer indices = new org.bytedeco.javacpp.IntPointer(size); + NMSBoxes(boxesVec, confVec, confThreshold, nmsThreshold, indices, 1.f, 0); + + for (int i = 0; i < indices.limit(); i++) { + int idx = indices.get(i); + Rect b = boxes.get(idx); + // Converte coordenadas do ROI para imagem completa. + Rect abs = new Rect( + roiRect.x() + b.x(), + roiRect.y() + b.y(), + b.width(), + b.height() + ); + float score = scores.get(idx); + int classId = classIds.get(idx); + String name = classId >= 0 && classId < gestureNames.size() + ? 
gestureNames.get(classId) + : "gesture_" + classId; + + HandGestureResult result = new HandGestureResult(abs, name, score, person.id, Instant.now().toEpochMilli()); + newGestures.add(result); + + Map payload = Map.of( + "gestureName", name, + "conf", (double) score, + "personTrackId", person.id, + "bbox", List.of(abs.x(), abs.y(), abs.width(), abs.height()) + ); + Event ev = new Event.Builder() + .type("hand_gesture") + .payload(payload) + .confidence(score) + .priority(Priority.MEDIUM) + .build(); + QueueUtils.offerLatest(eventQueue, ev); + } + + boxesVec.deallocate(); + confVec.deallocate(); + indices.deallocate(); + } + + fp.deallocate(); + out.release(); + blob.release(); + roi.release(); + } + + synchronized (dr.handGestures) { + dr.clearHandGestures(); + dr.handGestures.addAll(newGestures); + } + + frame.release(); + } + } +} + diff --git a/src/main/java/dev/bot/zeno/dnn/HandGestureResult.java b/src/main/java/dev/bot/zeno/dnn/HandGestureResult.java new file mode 100644 index 0000000..5710301 --- /dev/null +++ b/src/main/java/dev/bot/zeno/dnn/HandGestureResult.java @@ -0,0 +1,44 @@ +package dev.bot.zeno.dnn; + +import org.bytedeco.opencv.opencv_core.Rect; + +import java.time.Instant; + +/** + * Resultado individual da detecção de gestos de mão. + *

+ * Cada instância representa a posição da mão, o gesto reconhecido e + * metadados auxiliares como confiança, ID da pessoa e timestamp da + * detecção. A estrutura é simples e imutável para facilitar o + * compartilhamento entre threads. + */ +public class HandGestureResult { + + /** Caixa delimitadora da mão no frame original. */ + public final Rect bbox; + + /** Nome amigável do gesto reconhecido (por exemplo, "thumbs_up"). */ + public final String gestureName; + + /** Confiança do modelo para o gesto detectado (0..1). */ + public final float confidence; + + /** TrackId da pessoa, herdado da detecção de pessoas do YOLO. */ + public final int personTrackId; + + /** Momento da detecção em epoch millis. */ + public final long timestamp; + + public HandGestureResult(Rect bbox, String gestureName, float confidence, int personTrackId) { + this(bbox, gestureName, confidence, personTrackId, Instant.now().toEpochMilli()); + } + + public HandGestureResult(Rect bbox, String gestureName, float confidence, int personTrackId, long timestamp) { + this.bbox = bbox; + this.gestureName = gestureName; + this.confidence = confidence; + this.personTrackId = personTrackId; + this.timestamp = timestamp; + } +} + diff --git a/src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java b/src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java index 1890aa8..dc079cc 100644 --- a/src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java +++ b/src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java @@ -1,5 +1,6 @@ package dev.bot.zeno.overlay; +import dev.bot.zeno.dnn.HandGestureResult; import dev.bot.zeno.util.Config; import org.bytedeco.opencv.opencv_core.*; @@ -45,5 +46,17 @@ public static void drawAll(Mat img, DetectionResult res, Config cfg) { } } } + if (cfg.getBool("output.drawHandGestures", true)) { + synchronized (res.handGestures) { + for (HandGestureResult g : res.handGestures) { + Scalar color = new Scalar(255, 0, 255, 0); // magenta + rectangle(img, g.bbox, color, 2, LINE_8, 0); 
+ String text = g.gestureName + " #" + g.personTrackId + String.format(" %.2f", g.confidence); + putText(img, text, + new Point(g.bbox.x(), Math.max(0, g.bbox.y() - 5)), + FONT_HERSHEY_SIMPLEX, 0.6, color, 2, LINE_AA, false); + } + } + } } } diff --git a/src/main/java/dev/bot/zeno/overlay/DetectionResult.java b/src/main/java/dev/bot/zeno/overlay/DetectionResult.java index f37f1bc..d533058 100644 --- a/src/main/java/dev/bot/zeno/overlay/DetectionResult.java +++ b/src/main/java/dev/bot/zeno/overlay/DetectionResult.java @@ -1,6 +1,8 @@ package dev.bot.zeno.overlay; import org.bytedeco.opencv.opencv_core.Rect; + +import dev.bot.zeno.dnn.HandGestureResult; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -31,8 +33,10 @@ public Box(int id, Rect r, String label, float confidence) { public final List yoloBoxes = Collections.synchronizedList(new ArrayList<>()); public final List faceBoxes = Collections.synchronizedList(new ArrayList<>()); + public final List handGestures = Collections.synchronizedList(new ArrayList<>()); public Box largestFace = null; public void clearYolo() { yoloBoxes.clear(); } public void clearFaces() { faceBoxes.clear(); largestFace = null; } + public void clearHandGestures() { handGestures.clear(); } } diff --git a/src/main/resources/config.properties b/src/main/resources/config.properties index 5711903..4a8ffc5 100644 --- a/src/main/resources/config.properties +++ b/src/main/resources/config.properties @@ -26,13 +26,23 @@ arcface.onnx=models/arcface.onnx arcface.similarityThreshold=0.42 arcface.minFaceSize=60 +# === Hand Gesture Detection === +handgesture.enabled=false +handgesture.model.path=models/hand_gesture.onnx +handgesture.confThreshold=0.5 +handgesture.nmsThreshold=0.45 +handgesture.inputSize=224 +handgesture.names=models/hand_gesture.names + # === Performance === processing.skipFramesYolo=3 processing.skipFramesFace=3 +processing.skipFramesHandGesture=3 # === Output === output.drawYolo=true 
output.drawFaces=true +output.drawHandGestures=true # === Vision === vision.overlay.enabled=false diff --git a/src/main/resources/public/dashboard/index.html b/src/main/resources/public/dashboard/index.html index 1f054f4..48b4c3c 100644 --- a/src/main/resources/public/dashboard/index.html +++ b/src/main/resources/public/dashboard/index.html @@ -62,6 +62,7 @@

Visão • vídeo + overlay

Objetos (YOLO) Faces (ArcFace) + Gestos (Hands)
@@ -84,6 +85,7 @@

Status • sistema

Tracks: Objetos: Faces: + Gestos: WS:desconectado

Entidades ativas

@@ -98,7 +100,7 @@

Entidades ativas

const img = $('#mjpeg'), cvs = $('#overlay'), ctx = cvs.getContext('2d'); const frameInfo = $('#frameInfo'); const wsStatus = $('#wsStatus'), wsVal = $('#wsVal'); - const fpsVal = $('#fpsVal'), tracksVal = $('#tracksVal'), objsVal = $('#objsVal'), facesVal = $('#facesVal'); + const fpsVal = $('#fpsVal'), tracksVal = $('#tracksVal'), objsVal = $('#objsVal'), facesVal = $('#facesVal'), gestsVal = $('#gestsVal'); const entities = $('#entities'); const links = { metrics: $('#metricsLink'), events: $('#eventsLink'), snap: $('#snapshotLink') }; const health = $('#health'); @@ -157,12 +159,23 @@

Entidades ativas

ctx.fillStyle = 'rgba(8, 58, 45, .8)'; ctx.fillRect(x*sx, y*sy-16, tw, 16); ctx.fillStyle = '#d1fae5'; ctx.fillText(label, x*sx+4, y*sy-4); }); + // gestures + (data.gestures||[]).forEach(g=>{ + const {x,y,w,h} = g.bbox; + ctx.strokeStyle = '#d946ef'; + ctx.strokeRect(x*sx, y*sy, w*sx, h*sy); + const label = `${g.gestureName ?? 'gesture'} #${g.trackId ?? ''} ${g.confidence?.toFixed(2) ?? ''}`.trim(); + const tw = ctx.measureText(label).width + 8; + ctx.fillStyle = 'rgba(71, 0, 58, .8)'; ctx.fillRect(x*sx, y*sy-16, tw, 16); + ctx.fillStyle = '#f5d0fe'; ctx.fillText(label, x*sx+4, y*sy-4); + }); } function renderEntities(data){ entities.innerHTML=''; const items = []; (data.objects||[]).forEach(o=>items.push({k:`obj:${o.className}`, v:`#${o.trackId} • conf ${o.confidence?.toFixed(2)}`})); (data.faces||[]).forEach(f=>items.push({k:`face:${f.name}`, v:`#${f.trackId} • conf ${f.confidence?.toFixed(2)}`})); + (data.gestures||[]).forEach(g=>items.push({k:`gesture:${g.gestureName}`, v:`#${g.trackId} • conf ${g.confidence?.toFixed(2)}`})); if(!items.length){ entities.innerHTML = '
— sem entidades ativas —
'; return; } for(const it of items){ const div = document.createElement('div'); div.className='item'; @@ -207,6 +220,7 @@

Entidades ativas

frameInfo.textContent = `frame ${data.frameId} • ${new Date(data.ts).toLocaleTimeString()}`; objsVal.textContent = data.objects?.length ?? 0; facesVal.textContent = data.faces?.length ?? 0; + gestsVal.textContent = data.gestures?.length ?? 0; drawOverlay(data, lastImgSize); renderEntities(data); });