Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions docs/HandGestureDetector.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# HandGestureDetector

Este módulo detecta mãos e classifica gestos utilizando um modelo `.onnx` executado com OpenCV DNN. A classe `HandGestureDetector` segue o mesmo padrão de `YoloDetector` e `FaceDetector`, permitindo fácil integração ao pipeline existente.

## Inserindo o modelo
1. Coloque o arquivo do modelo em `models/hand_gesture.onnx` ou ajuste o caminho em `config.properties` pela chave `handgesture.model.path`.
2. Opcionalmente defina um arquivo com os nomes dos gestos (`handgesture.names`).

## Parâmetros
- `handgesture.enabled` – ativa ou desativa o módulo.
- `handgesture.confThreshold` – confiança mínima para aceitar um gesto.
- `handgesture.nmsThreshold` – limiar para Non‑Max Suppression.
- `handgesture.inputSize` – tamanho do tensor de entrada (padrão 224x224).
- `processing.skipFramesHandGesture` – número de frames a ignorar entre inferências.

## Saída para o front‑end
Cada gesto gera um evento `hand_gesture` via WebSocket com a estrutura:
```json
{
"type": "hand_gesture",
"payload": {
"gestureName": "thumbs_up",
"conf": 0.92,
"personTrackId": 3,
"bbox": [x, y, w, h]
},
"confidence": 0.92,
"priority": "MEDIUM",
"timestamp": 1699999999999
}
```
A sobreposição no vídeo fica ativada por padrão e pode ser desativada com `output.drawHandGestures=false`.

## Desempenho
Para manter latência abaixo de 200 ms, recomenda‑se:
- Utilizar o recorte de pessoas detectadas pelo YOLO (já implementado).
- Ajustar `processing.skipFramesHandGesture` conforme a capacidade da CPU.
- Testar com vídeos ao vivo e imagens estáticas para validar o modelo.

6 changes: 6 additions & 0 deletions src/main/java/dev/bot/zeno/app/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import dev.bot.zeno.dnn.ArcFaceRecognizer;
import dev.bot.zeno.dnn.FaceDetector;
import dev.bot.zeno.dnn.YoloDetector;
import dev.bot.zeno.dnn.HandGestureDetector;
import dev.bot.zeno.domain.Event;
import dev.bot.zeno.domain.Event.Priority;
import dev.bot.zeno.domain.ports.*;
Expand Down Expand Up @@ -212,6 +213,11 @@ private static void startDetectors(Config cfg,
? new ArcFaceRecognizer(cfg) : null;
startDaemonThread("face-detector",
new FaceDetector(cfg, latestFrame, detections, events, recognizer));

if (cfg.getBool("handgesture.enabled", false)) {
startDaemonThread("hand-gesture-detector",
new HandGestureDetector(cfg, latestFrame, detections, events));
}
}

/** Periodically overlays detections on frames and calculates FPS for the preview. */
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/dev/bot/zeno/debug/DebugServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.sun.management.OperatingSystemMXBean;
import dev.bot.zeno.app.EventBus;
import dev.bot.zeno.overlay.DetectionResult;
import dev.bot.zeno.dnn.HandGestureResult;
import dev.bot.zeno.domain.Event;
import io.javalin.Javalin;
import io.javalin.plugin.bundled.CorsPluginConfig;
Expand Down Expand Up @@ -249,8 +250,26 @@ private void broadcastDetections() {
}
}

List<Map<String, Object>> gestures = new ArrayList<>();
synchronized (dr.handGestures) {
for (HandGestureResult g : dr.handGestures) {
Map<String, Object> hg = new LinkedHashMap<>();
hg.put("trackId", g.personTrackId);
hg.put("gestureName", g.gestureName);
hg.put("confidence", (double) g.confidence);
hg.put("bbox", Map.of(
"x", g.bbox.x(),
"y", g.bbox.y(),
"w", g.bbox.width(),
"h", g.bbox.height()
));
gestures.add(hg);
}
}

root.put("objects", objects);
root.put("faces", faces);
root.put("gestures", gestures);

String json = mapper.writeValueAsString(root);
detectionSessions.forEach(s -> {
Expand Down
217 changes: 217 additions & 0 deletions src/main/java/dev/bot/zeno/dnn/HandGestureDetector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package dev.bot.zeno.dnn;

import dev.bot.zeno.overlay.DetectionResult;
import dev.bot.zeno.domain.Event;
import dev.bot.zeno.domain.Event.Priority;
import dev.bot.zeno.util.Config;
import dev.bot.zeno.util.QueueUtils;

import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.opencv.opencv_core.Mat;
import org.bytedeco.opencv.opencv_core.Rect;
import org.bytedeco.opencv.opencv_core.Scalar;
import org.bytedeco.opencv.opencv_core.Size;
import org.bytedeco.opencv.opencv_dnn.Net;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicReference;

import static org.bytedeco.opencv.global.opencv_core.CV_32F;
import static org.bytedeco.opencv.global.opencv_dnn.*;
import static org.bytedeco.opencv.global.opencv_imgproc.*;

/**
 * Hand-gesture detector backed by an ONNX model executed through OpenCV DNN.
 * <p>
 * Inference runs only inside regions where a person was detected by
 * {@link YoloDetector}, which keeps the per-frame cost low. Every accepted
 * gesture is published as an {@link Event} of type {@code "hand_gesture"}
 * and mirrored into {@link DetectionResult} for the video overlay.
 * <p>
 * Intended to run on its own daemon thread; the loop exits when the thread
 * is interrupted.
 */
public class HandGestureDetector implements Runnable {

    private final Config cfg;
    /** Most recent camera frame; shared with (and owned by) other threads. */
    private final AtomicReference<Mat> latestFrame;
    /** Shared detection state: person boxes are read, hand gestures written. */
    private final AtomicReference<DetectionResult> latestDetections;
    private final BlockingQueue<Event> eventQueue;

    private final Net net;
    /** Gesture labels indexed by model class id; may be empty (fallback labels are synthesized). */
    private final List<String> gestureNames;
    private final Size inputSize;
    private final float confThreshold;
    private final float nmsThreshold;
    /** Frames to skip between inferences; 0 processes every frame. */
    private final int skip;
    private int frameCount = 0;

    public HandGestureDetector(Config cfg,
                               AtomicReference<Mat> latestFrame,
                               AtomicReference<DetectionResult> detections,
                               BlockingQueue<Event> eq) {
        this.cfg = cfg;
        this.latestFrame = latestFrame;
        this.latestDetections = detections;
        this.eventQueue = eq;

        String modelPath = cfg.get("handgesture.model.path", "models/hand_gesture.onnx");
        this.net = readNetFromONNX(modelPath);
        net.setPreferableBackend(DNN_BACKEND_OPENCV);
        net.setPreferableTarget(DNN_TARGET_CPU);

        this.confThreshold = (float) cfg.getDouble("handgesture.confThreshold", 0.5);
        this.nmsThreshold = (float) cfg.getDouble("handgesture.nmsThreshold", 0.45);
        int inp = cfg.getInt("handgesture.inputSize", 224);
        this.inputSize = new Size(inp, inp);
        this.skip = Math.max(0, cfg.getInt("processing.skipFramesHandGesture", 1));

        // Load gesture names when the file exists; otherwise fall back to an
        // empty list and synthesize "gesture_<id>" labels at detection time.
        List<String> names;
        try {
            String namesPath = cfg.get("handgesture.names", "models/hand_gesture.names");
            names = Files.exists(Paths.get(namesPath)) ? Files.readAllLines(Paths.get(namesPath)) : List.of();
        } catch (Exception e) {
            names = List.of();
        }
        this.gestureNames = List.copyOf(names);
    }

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            Mat src = latestFrame.get();
            if (src == null || src.empty()) {
                try { Thread.sleep(2); } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
                continue;
            }

            // Honour the frame-skip setting BEFORE paying for a clone.
            frameCount++;
            if (skip > 0 && (frameCount % (skip + 1)) != 0) {
                try { Thread.sleep(2); } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
                continue;
            }

            // Work on a private copy. The Mat held by latestFrame is shared
            // with the other detector threads, so it must NOT be released
            // here: the previous code called src.release(), invalidating the
            // frame for every consumer (use-after-free on native memory).
            Mat frame = src.clone();

            DetectionResult dr = latestDetections.get();
            if (dr == null) {
                frame.release();
                continue;
            }

            // Snapshot the person boxes produced by the YOLO detector.
            List<DetectionResult.Box> persons = new ArrayList<>();
            synchronized (dr.yoloBoxes) {
                for (DetectionResult.Box b : dr.yoloBoxes) {
                    if ("person".equalsIgnoreCase(b.label)) {
                        persons.add(b);
                    }
                }
            }

            List<HandGestureResult> newGestures = new ArrayList<>();
            for (DetectionResult.Box person : persons) {
                // Clamp the person box to the frame. The previous clamping
                // kept raw x/y, so partially off-screen boxes could produce
                // a ROI with negative or oversized dimensions.
                Rect r = person.rect;
                int x1 = Math.max(0, r.x());
                int y1 = Math.max(0, r.y());
                int x2 = Math.min(frame.cols(), r.x() + r.width());
                int y2 = Math.min(frame.rows(), r.y() + r.height());
                if (x2 <= x1 || y2 <= y1) {
                    continue; // box lies entirely outside the frame
                }
                detectInRoi(frame, new Rect(x1, y1, x2 - x1, y2 - y1), person, newGestures);
            }

            synchronized (dr.handGestures) {
                dr.clearHandGestures();
                dr.handGestures.addAll(newGestures);
            }

            frame.release();
        }
    }

    /** Runs the model on one person ROI and appends any accepted gestures to {@code results}. */
    private void detectInRoi(Mat frame, Rect roiRect, DetectionResult.Box person,
                             List<HandGestureResult> results) {
        Mat roi = new Mat(frame, roiRect);
        Mat blob = blobFromImage(roi, 1.0 / 255.0, inputSize, new Scalar(0.0), true, false, CV_32F);
        Mat out = null;
        try {
            net.setInput(blob);
            out = net.forward();
            parseOutput(out, roi.cols(), roi.rows(), roiRect, person, results);
        } finally {
            // Release native memory even when inference or parsing throws.
            if (out != null) out.release();
            blob.release();
            roi.release();
        }
    }

    /**
     * Decodes the raw network output into gesture results and publishes one
     * event per accepted detection.
     * <p>
     * Assumes an SSD-style tensor laid out as [1,1,N,7] with rows of
     * (imageId, classId, conf, x1, y1, x2, y2), coordinates normalized to the
     * ROI — TODO confirm against the deployed model.
     */
    private void parseOutput(Mat out, int w, int h, Rect roiRect,
                             DetectionResult.Box person, List<HandGestureResult> results) {
        // Cast pointer over memory owned by 'out'; must NOT be deallocated
        // here — the Mat frees it on release().
        FloatPointer fp = new FloatPointer(out.data());
        int detections = out.size(2);
        List<Rect> boxes = new ArrayList<>();
        List<Float> scores = new ArrayList<>();
        List<Integer> classIds = new ArrayList<>();

        for (int i = 0; i < detections; i++) {
            int base = i * 7;
            float conf = fp.get(base + 2);
            if (conf < confThreshold) continue;
            int classId = Math.round(fp.get(base + 1));
            int x1 = Math.max(0, Math.round(fp.get(base + 3) * w));
            int y1 = Math.max(0, Math.round(fp.get(base + 4) * h));
            int x2 = Math.min(w - 1, Math.round(fp.get(base + 5) * w));
            int y2 = Math.min(h - 1, Math.round(fp.get(base + 6) * h));
            boxes.add(new Rect(x1, y1, Math.max(0, x2 - x1), Math.max(0, y2 - y1)));
            scores.add(conf);
            classIds.add(classId);
        }
        if (boxes.isEmpty()) return;

        for (int idx : runNms(boxes, scores)) {
            Rect b = boxes.get(idx);
            // Translate from ROI-local back to full-frame coordinates.
            Rect abs = new Rect(roiRect.x() + b.x(), roiRect.y() + b.y(), b.width(), b.height());
            HandGestureResult result = new HandGestureResult(
                    abs, gestureNameFor(classIds.get(idx)), scores.get(idx),
                    person.id, Instant.now().toEpochMilli());
            results.add(result);
            publish(result);
        }
    }

    /** Applies Non-Max Suppression and returns the indices of the kept boxes. */
    private int[] runNms(List<Rect> boxes, List<Float> scores) {
        int size = boxes.size();
        org.bytedeco.opencv.opencv_core.Rect2dVector boxesVec = new org.bytedeco.opencv.opencv_core.Rect2dVector();
        FloatPointer confVec = new FloatPointer(size);
        for (int i = 0; i < size; i++) {
            Rect b = boxes.get(i);
            boxesVec.push_back(new org.bytedeco.opencv.opencv_core.Rect2d(b.x(), b.y(), b.width(), b.height()));
            confVec.put(i, scores.get(i));
        }
        org.bytedeco.javacpp.IntPointer indices = new org.bytedeco.javacpp.IntPointer(size);
        NMSBoxes(boxesVec, confVec, confThreshold, nmsThreshold, indices, 1.f, 0);

        int kept = (int) indices.limit();
        int[] keptIdx = new int[kept];
        for (int i = 0; i < kept; i++) keptIdx[i] = indices.get(i);

        boxesVec.deallocate();
        confVec.deallocate();
        indices.deallocate();
        return keptIdx;
    }

    /** Maps a model class id to a label, falling back to "gesture_&lt;id&gt;". */
    private String gestureNameFor(int classId) {
        return classId >= 0 && classId < gestureNames.size()
                ? gestureNames.get(classId)
                : "gesture_" + classId;
    }

    /** Emits a "hand_gesture" event on the shared event queue. */
    private void publish(HandGestureResult g) {
        Map<String, Object> payload = Map.of(
                "gestureName", g.gestureName,
                "conf", (double) g.confidence,
                "personTrackId", g.personTrackId,
                "bbox", List.of(g.bbox.x(), g.bbox.y(), g.bbox.width(), g.bbox.height())
        );
        Event ev = new Event.Builder()
                .type("hand_gesture")
                .payload(payload)
                .confidence(g.confidence)
                .priority(Priority.MEDIUM)
                .build();
        QueueUtils.offerLatest(eventQueue, ev);
    }
}

44 changes: 44 additions & 0 deletions src/main/java/dev/bot/zeno/dnn/HandGestureResult.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package dev.bot.zeno.dnn;

import org.bytedeco.opencv.opencv_core.Rect;

import java.time.Instant;

/**
 * Immutable value object describing a single detected hand gesture.
 * <p>
 * Carries the hand's bounding box in full-frame coordinates, the recognized
 * gesture label, the model confidence, the track id of the person the hand
 * belongs to and the detection timestamp. Because instances are immutable,
 * they can be shared freely between the detector and consumer threads.
 */
public class HandGestureResult {

    /** Bounding box of the hand, in coordinates of the original frame. */
    public final Rect bbox;

    /** Recognized gesture label (e.g. "thumbs_up"). */
    public final String gestureName;

    /** Model confidence for this gesture, in the range 0..1. */
    public final float confidence;

    /** Track id of the owning person, inherited from the YOLO person detection. */
    public final int personTrackId;

    /** Detection time, in epoch milliseconds. */
    public final long timestamp;

    /** Full constructor; every field is supplied by the caller. */
    public HandGestureResult(Rect bbox, String gestureName, float confidence,
                             int personTrackId, long timestamp) {
        this.bbox = bbox;
        this.gestureName = gestureName;
        this.confidence = confidence;
        this.personTrackId = personTrackId;
        this.timestamp = timestamp;
    }

    /** Convenience constructor that stamps the result with the current time. */
    public HandGestureResult(Rect bbox, String gestureName, float confidence, int personTrackId) {
        this(bbox, gestureName, confidence, personTrackId, Instant.now().toEpochMilli());
    }
}

13 changes: 13 additions & 0 deletions src/main/java/dev/bot/zeno/overlay/DetectionOverlay.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package dev.bot.zeno.overlay;

import dev.bot.zeno.dnn.HandGestureResult;
import dev.bot.zeno.util.Config;
import org.bytedeco.opencv.opencv_core.*;

Expand Down Expand Up @@ -45,5 +46,17 @@ public static void drawAll(Mat img, DetectionResult res, Config cfg) {
}
}
}
if (cfg.getBool("output.drawHandGestures", true)) {
synchronized (res.handGestures) {
for (HandGestureResult g : res.handGestures) {
Scalar color = new Scalar(255, 0, 255, 0); // magenta
rectangle(img, g.bbox, color, 2, LINE_8, 0);
String text = g.gestureName + " #" + g.personTrackId + String.format(" %.2f", g.confidence);
putText(img, text,
new Point(g.bbox.x(), Math.max(0, g.bbox.y() - 5)),
FONT_HERSHEY_SIMPLEX, 0.6, color, 2, LINE_AA, false);
}
}
}
}
}
Loading