diff --git a/TRACKING.md b/TRACKING.md new file mode 100644 index 0000000..aa0ffed --- /dev/null +++ b/TRACKING.md @@ -0,0 +1,30 @@ +# Tracking Module + +This project uses a lightweight yet robust tracking pipeline combining a +constant‑velocity Kalman filter with appearance embeddings and Hungarian +assignment. The goal is to provide stable identifiers for objects and faces +across frames so that higher PSI layers can reason about persistent entities. + +## Parameters +Parameters are defined in `config.properties` and loaded into +`TrackingParams`: + +| Key | Description | Default | +|-----|-------------|---------| +| `tracking.lambda` | Weight between IoU (1.0) and appearance similarity (0.0). | `0.6` | +| `tracking.iou.min` | Minimum IoU for gating. Pairs with lower IoU *and* low appearance are ignored. | `0.10` | +| `tracking.appearance.minCosine` | Minimum cosine similarity for gating. | `0.50` | +| `tracking.maxAge` | Frames without matches before a track is marked lost. | `30` | +| `tracking.nInit` | Consecutive hits required to confirm a track. | `3` | +| `tracking.smooth.alpha` | Exponential smoothing factor for boxes and embeddings. | `0.2` | + +Tuning these values allows balancing responsiveness and stability. For example, +increasing `lambda` favours geometric overlap while reducing it prioritises +appearance features. + +## Usage +Both `YoloDetector` and `FaceDetector` instantiate a `TrackManager` and publish +`object.track`/`face.track` events for confirmed tracks. Each event includes the +track identifier, smoothed bounding box, velocity estimate and current state. +These events can be consumed by PSI layers to create symbolic representations +and reason about interactions. 
diff --git a/data/object_memory.json b/data/object_memory.json deleted file mode 100644 index 9e26dfe..0000000 --- a/data/object_memory.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/src/main/java/dev/bot/zeno/app/Main.java b/src/main/java/dev/bot/zeno/app/Main.java index 9fb8e0a..9d5d8d6 100644 --- a/src/main/java/dev/bot/zeno/app/Main.java +++ b/src/main/java/dev/bot/zeno/app/Main.java @@ -14,6 +14,7 @@ import dev.bot.zeno.overlay.DetectionResult; import dev.bot.zeno.overlay.DetectionOverlay; import dev.bot.zeno.util.Config; +import dev.bot.zeno.tracking.TrackingParams; import io.javalin.Javalin; import io.javalin.plugin.bundled.CorsPluginConfig; import io.javalin.websocket.WsConfig; @@ -206,12 +207,21 @@ private static void startDetectors(Config cfg, AtomicReference latestFrame, AtomicReference detections, BlockingQueue events) { - startDaemonThread("yolo-detector", new YoloDetector(cfg, latestFrame, detections, events, null)); + TrackingParams params = new TrackingParams( + cfg.getDouble("tracking.lambda", 0.6), + cfg.getDouble("tracking.iou.min", 0.10), + cfg.getDouble("tracking.appearance.minCosine", 0.50), + cfg.getInt("tracking.maxAge", 30), + cfg.getInt("tracking.nInit", 3), + cfg.getDouble("tracking.smooth.alpha", 0.2) + ); + + startDaemonThread("yolo-detector", new YoloDetector(cfg, params, latestFrame, detections, events, null)); ArcFaceRecognizer recognizer = cfg.getBool("arcface.enabled", true) ? new ArcFaceRecognizer(cfg) : null; startDaemonThread("face-detector", - new FaceDetector(cfg, latestFrame, detections, events, recognizer)); + new FaceDetector(cfg, params, latestFrame, detections, events, recognizer)); } /** Periodically overlays detections on frames and calculates FPS for the preview. 
*/ diff --git a/src/main/java/dev/bot/zeno/dnn/AppearanceEncoder.java b/src/main/java/dev/bot/zeno/dnn/AppearanceEncoder.java deleted file mode 100644 index ba42e73..0000000 --- a/src/main/java/dev/bot/zeno/dnn/AppearanceEncoder.java +++ /dev/null @@ -1,62 +0,0 @@ -package dev.bot.zeno.dnn; - -import org.bytedeco.javacpp.FloatPointer; -import org.bytedeco.opencv.opencv_core.*; - -import static org.bytedeco.opencv.global.opencv_core.*; -import static org.bytedeco.opencv.global.opencv_imgproc.*; - -/** - * Extrai um embedding de aparência leve baseado em histograma de cores HSV. - *

- * O vetor possui 64 dimensões (4 bins por canal) e é normalizado em L2. - * Essa abordagem é intencionalmente simples para manter o consumo de memória - * e processamento abaixo de 1 GB, conforme requisito do projeto. - */ -public class AppearanceEncoder { - private static final int BINS_PER_CHANNEL = 4; // 4 x 4 x 4 = 64 dimensões - - /** - * Gera um embedding normalizado para a região especificada. - * - * @param frame frame completo (BGR) - * @param rect região de interesse - * @return vetor de 64 floats normalizado (L2) - */ - public float[] encode(Mat frame, Rect rect) { - // Saneia a região requisitada para evitar exceções ao acessar áreas - // fora dos limites do frame. Também garante pelo menos 1x1 pixel. - int x = Math.min(Math.max(rect.x(), 0), frame.cols() - 1); - int y = Math.min(Math.max(rect.y(), 0), frame.rows() - 1); - int w = Math.max(1, Math.min(rect.width(), frame.cols() - x)); - int h = Math.max(1, Math.min(rect.height(), frame.rows() - y)); - Rect bounded = new Rect(x, y, w, h); - - Mat roi = new Mat(frame, bounded); - Mat hsv = new Mat(); - cvtColor(roi, hsv, COLOR_BGR2HSV); - - int[] channels = {0, 1, 2}; - int[] histSize = {BINS_PER_CHANNEL, BINS_PER_CHANNEL, BINS_PER_CHANNEL}; - float[] ranges = {0f, 180f, 0f, 256f, 0f, 256f}; // H, S, V ranges - Mat hist = new Mat(); - calcHist(hsv, 1, channels, new Mat(), hist, 3, histSize, ranges, true, false); - - // Normaliza histograma em L2 para robustez - normalize(hist, hist, 1.0, 0.0, NORM_L2, -1, new Mat()); - - int len = BINS_PER_CHANNEL * BINS_PER_CHANNEL * BINS_PER_CHANNEL; - float[] vec = new float[len]; - FloatPointer fp = new FloatPointer(hist.data()); - for (int i = 0; i < len; i++) { - vec[i] = fp.get(i); - } - - // Libera recursos temporários. O FloatPointer não deve ser - // desalocado manualmente, pois a memória pertence ao `Mat`. 
- hist.release(); - hsv.release(); - roi.release(); - return vec; - } -} diff --git a/src/main/java/dev/bot/zeno/dnn/FaceDetector.java b/src/main/java/dev/bot/zeno/dnn/FaceDetector.java index cb25162..55bebe9 100644 --- a/src/main/java/dev/bot/zeno/dnn/FaceDetector.java +++ b/src/main/java/dev/bot/zeno/dnn/FaceDetector.java @@ -4,21 +4,25 @@ import dev.bot.zeno.domain.Event; import dev.bot.zeno.domain.Event.Priority; import dev.bot.zeno.util.Config; -import dev.bot.zeno.dnn.ArcFaceRecognizer; +import dev.bot.zeno.tracking.*; import org.bytedeco.javacpp.FloatPointer; - import org.bytedeco.opencv.opencv_core.*; import org.bytedeco.opencv.opencv_dnn.Net; import javax.swing.*; import java.util.*; +import java.util.Arrays; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicReference; + import dev.bot.zeno.util.QueueUtils; import static org.bytedeco.opencv.global.opencv_core.CV_32F; -import static org.bytedeco.opencv.global.opencv_dnn.*; +import static org.bytedeco.opencv.global.opencv_dnn.DNN_BACKEND_OPENCV; +import static org.bytedeco.opencv.global.opencv_dnn.DNN_TARGET_CPU; +import static org.bytedeco.opencv.global.opencv_dnn.blobFromImage; +import static org.bytedeco.opencv.global.opencv_dnn.readNetFromCaffe; import static org.bytedeco.opencv.global.opencv_imgproc.*; public class FaceDetector implements Runnable { @@ -27,25 +31,37 @@ public class FaceDetector implements Runnable { private final AtomicReference latestDetections; private final BlockingQueue eventQueue; private final ArcFaceRecognizer recognizer; + private final Net net; private final double confTh; private final int skip; private int frameCount = 0; - private int nextId = 1; - public FaceDetector(Config cfg, AtomicReference latestFrame, AtomicReference latestDetections, - BlockingQueue eq, ArcFaceRecognizer recognizer) { + private final TrackManager tracker; + private final AppearanceEncoder encoder = new AppearanceEncoder(); + + public FaceDetector(Config cfg, 
TrackingParams params, + AtomicReference latestFrame, + AtomicReference latestDetections, + BlockingQueue eq, + ArcFaceRecognizer recognizer) { this.cfg = cfg; this.latestFrame = latestFrame; this.latestDetections = latestDetections; this.eventQueue = eq; this.recognizer = recognizer; - this.net = readNetFromCaffe(cfg.get("face.prototxt", "models/deploy.prototxt"), - cfg.get("face.caffemodel", "models/res10_300x300_ssd_iter_140000.caffemodel")); + + this.tracker = new TrackManager(params); + + this.net = readNetFromCaffe( + cfg.get("face.prototxt", "models/deploy.prototxt"), + cfg.get("face.caffemodel", "models/res10_300x300_ssd_iter_140000.caffemodel") + ); net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(DNN_TARGET_CPU); + this.confTh = cfg.getDouble("face.confThreshold", 0.5); - this.skip = Math.max(0, cfg.getInt("processing.skipFramesFace", 1)); + this.skip = Math.max(0, cfg.getInt("processing.skipFramesFace", 0)); // recomenda-se 0 } @Override @@ -57,87 +73,155 @@ public void run() { continue; } - // Clona o frame para evitar que ele seja liberado enquanto está em uso. + // Clona para uso seguro neste loop Mat frame = src.clone(); - // Libera o frame original imediatamente para evitar retenção - // de memória nativa até que o GC rode. + // Libera referência do último frame (não a memória nativa do clone) src.release(); - frameCount++; - if (skip > 0 && (frameCount % (skip + 1)) != 0) { - // Se este frame for pulado, liberamos o clone manualmente. 
- frame.release(); - continue; - } + try { + frameCount++; + if (skip > 0 && (frameCount % (skip + 1)) != 0) { + continue; + } - int width = frame.cols(), height = frame.rows(); - Mat blob = blobFromImage(frame, 1.0, new Size(300,300), new Scalar(104,117,123,0), false, false, CV_32F); - net.setInput(blob); - Mat out = net.forward(); // [1,1,N,7] - - FloatPointer data = new FloatPointer(out.data()); - int detections = out.size(2); - List newFaces = new ArrayList<>(); - DetectionResult.Box largest = null; - for (int i = 0; i < detections; i++) { - int base = i * 7; - float confidence = data.get(base + 2); - if (confidence < confTh) continue; - int x1 = Math.max(0, Math.round(data.get(base + 3) * width)); - int y1 = Math.max(0, Math.round(data.get(base + 4) * height)); - int x2 = Math.min(width - 1, Math.round(data.get(base + 5) * width)); - int y2 = Math.min(height - 1, Math.round(data.get(base + 6) * height)); - Rect rect = new Rect(x1, y1, Math.max(0, x2 - x1), Math.max(0, y2 - y1)); - - // Por padrão, rotulamos como "unknown" até que o reconhecimento prove o contrário. 
- String label = "unknown"; - if (recognizer != null && rect.width() >= cfg.getInt("arcface.minFaceSize", 60) - && rect.height() >= cfg.getInt("arcface.minFaceSize", 60)) { - String who = recognizer.recognize(frame, rect); - if (who != null) { - label = who; // Face conhecida + final int width = frame.cols(); + final int height = frame.rows(); + + // Res10 SSD: blob 300x300, mean (104,117,123), BGR + Mat blob = blobFromImage(frame, 1.0, new Size(300, 300), + new Scalar(104, 117, 123, 0), + false, false, CV_32F); + net.setInput(blob); + Mat out = net.forward(); // shape: [1, 1, N, 7] + + try { + FloatPointer data = new FloatPointer(out.data()); + int detections = out.size(2); + + List detBoxes = new ArrayList<>(); + List feats = new ArrayList<>(); + List labels = new ArrayList<>(); + List confs = new ArrayList<>(); + + for (int i = 0; i < detections; i++) { + int base = i * 7; + float confidence = data.get(base + 2); + if (confidence < confTh) continue; + + // coords normalizadas [0..1] -> pixels do frame original + float x1n = data.get(base + 3); + float y1n = data.get(base + 4); + float x2n = data.get(base + 5); + float y2n = data.get(base + 6); + Rect2d box = ssdBoxToRect2d(frame, x1n, y1n, x2n, y2n); + + // Rect para recorte/ArcFace + Rect rect = new Rect( + (int) box.x(), (int) box.y(), + (int) box.width(), (int) box.height() + ); + + String label = "unknown"; + int minFace = cfg.getInt("arcface.minFaceSize", 60); + if (recognizer != null && rect.width() >= minFace && rect.height() >= minFace) { + String who = recognizer.recognize(frame, rect); + if (who != null) label = who; + } + + detBoxes.add(box); + feats.add(encoder.encode(frame, rect)); + labels.add(label); + confs.add(confidence); } - } - DetectionResult.Box box = new DetectionResult.Box(nextId++, rect, label, confidence); - newFaces.add(box); - if (largest == null || rect.area() > largest.rect.area()) { - largest = box; - } - Map payload = Map.of( - "symbol", "face:" + label, - "class", label, - 
"conf", (double) confidence, - "trackId", box.id, - "bbox", List.of(rect.x(), rect.y(), rect.width(), rect.height()) - ); - Event ev = new Event.Builder() - .type("perception") - .payload(payload) - .confidence(confidence) - .priority(Priority.MEDIUM) - .build(); - QueueUtils.offerLatest(eventQueue, ev); - } - DetectionResult res = latestDetections.get(); - synchronized (res.faceBoxes) { - res.clearFaces(); - res.faceBoxes.addAll(newFaces); - res.largestFace = largest; - } - System.out.printf("[FaceDetector] %d faces detectadas.%n", newFaces.size()); + List updated = tracker.update(detBoxes, feats, labels, confs, "face", frame); + + // Monta overlay/resultados e eventos + List newFaces = new ArrayList<>(); + DetectionResult.Box largest = null; + + for (int i = 0; i < updated.size(); i++) { + Track t = updated.get(i); + if (t == null) continue; // pode ocorrer por gating + + Rect2d r = t.lastRect; + Rect draw = new Rect((int) r.x(), (int) r.y(), (int) r.width(), (int) r.height()); + float conf = (i < confs.size()) ? 
confs.get(i) : 1.0f; + + DetectionResult.Box box = new DetectionResult.Box(t.id, draw, t.label, conf); + newFaces.add(box); + if (largest == null || draw.area() > largest.rect.area()) { + largest = box; + } + + if (t.state == Track.State.Confirmed) { + FloatPointer s = new FloatPointer(t.kf.statePost().data()); + float vx = s.get(4); + float vy = s.get(5); + + Map payload = Map.of( + "trackId", t.id, + "label", t.label, + "conf", (double) conf, + "bbox", List.of(draw.x(), draw.y(), draw.width(), draw.height()), + "velocity", List.of(vx, vy), + "state", t.state.toString(), + "source", "face", + "ts", System.currentTimeMillis(), + "appearanceHash", Integer.toHexString(Arrays.hashCode(t.feat)) + ); + Event ev = new Event.Builder() + .type("face.track") + .payload(payload) + .confidence(conf) + .priority(Priority.MEDIUM) + .build(); + QueueUtils.offerLatest(eventQueue, ev); + } + } - blob.release(); out.release(); - frame.release(); + // Atualiza estrutura compartilhada do overlay + DetectionResult res = latestDetections.get(); + synchronized (res.faceBoxes) { + res.clearFaces(); + res.faceBoxes.addAll(newFaces); + res.largestFace = largest; + } + System.out.printf("[FaceDetector] %d faces detectadas.%n", newFaces.size()); + } finally { + // Libera tensores temporários do DNN + out.release(); + blob.release(); + } + } finally { + // Libera o clone do frame + frame.release(); + } } } + /** + * Converte a bbox normalizada do SSD (x1n,y1n,x2n,y2n em [0..1]) para pixels do frame base, + * com clamp e tamanho mínimo de 2px por segurança. 
+ */ + private static Rect2d ssdBoxToRect2d(Mat frameBgr, float x1n, float y1n, float x2n, float y2n) { + int W = frameBgr.cols(), H = frameBgr.rows(); + double x1 = Math.max(0, Math.min(x1n * W, W - 1)); + double y1 = Math.max(0, Math.min(y1n * H, H - 1)); + double x2 = Math.max(0, Math.min(x2n * W, W - 1)); + double y2 = Math.max(0, Math.min(y2n * H, H - 1)); + double w = Math.max(2, x2 - x1); + double h = Math.max(2, y2 - y1); + return new Rect2d(x1, y1, w, h); + } + public static void enrollCurrentFace(Mat frame, DetectionResult res, ArcFaceRecognizer recognizer) { if (recognizer == null) { JOptionPane.showMessageDialog(null, "ArcFace está desativado.", "Aviso", JOptionPane.WARNING_MESSAGE); return; } - DetectionResult.Box face = res.largestFace; // captura referência para evitar corrida de dados + // captura referência para evitar corrida de dados + DetectionResult.Box face = res.largestFace; if (face == null) { JOptionPane.showMessageDialog(null, "Nenhuma face detectada para cadastro.", "Aviso", JOptionPane.WARNING_MESSAGE); return; @@ -145,7 +229,6 @@ public static void enrollCurrentFace(Mat frame, DetectionResult res, ArcFaceReco String name = JOptionPane.showInputDialog(null, "Nome para esta face:", "Cadastrar Face", JOptionPane.QUESTION_MESSAGE); if (name == null || name.trim().isEmpty()) return; - // Compute embedding and save float[] vec = recognizer.embed(frame, face.rect); recognizer.addToDb(name.trim(), vec); System.out.printf("[FaceDetector] Face cadastrada: %s%n", name.trim()); diff --git a/src/main/java/dev/bot/zeno/dnn/YoloDetector.java b/src/main/java/dev/bot/zeno/dnn/YoloDetector.java index 17dfa90..c789373 100644 --- a/src/main/java/dev/bot/zeno/dnn/YoloDetector.java +++ b/src/main/java/dev/bot/zeno/dnn/YoloDetector.java @@ -6,6 +6,7 @@ import dev.bot.zeno.util.Config; import dev.bot.zeno.memory.ObjectMemory; import dev.bot.zeno.util.QueueUtils; +import dev.bot.zeno.tracking.*; import org.bytedeco.javacpp.FloatPointer; import 
org.bytedeco.javacpp.IntPointer; @@ -38,20 +39,10 @@ public class YoloDetector implements Runnable { private final int skip; private int frameCount = 0; private final ObjectMemory memory; + private final TrackManager tracker; + private final AppearanceEncoder encoder = new AppearanceEncoder(); - /** - * Conjunto de classes COCO que desejamos desenhar na tela. - * Apenas objetos pertencentes a esta lista serão reportados - * ao {@link DetectionResult} e renderizados posteriormente. - */ - private static final Set ALLOWED_LABELS = - Set.of("person", "cell phone", "knife", "book"); - - // Rastreamento simples de objetos para atribuir IDs consistentes entre frames. - private final Map tracks = new HashMap<>(); - private int nextId = 1; - - public YoloDetector(Config cfg, AtomicReference latestFrame, + public YoloDetector(Config cfg, TrackingParams params, AtomicReference latestFrame, AtomicReference latestDetections, BlockingQueue eq, ObjectMemory memory) { this.cfg = cfg; @@ -59,6 +50,7 @@ public YoloDetector(Config cfg, AtomicReference latestFrame, this.latestDetections = latestDetections; this.eventQueue = eq; this.memory = memory; + this.tracker = new TrackManager(params); try { String namesPath = cfg.get("yolo.names", "src/main/resources/coco.names"); classNames = Files.readAllLines(Paths.get(namesPath)); @@ -156,63 +148,64 @@ public void run() { confVec.deallocate(); DetectionResult res = latestDetections.get(); - List newBoxes = new ArrayList<>(); - Map updatedTracks = new HashMap<>(); + List detBoxes = new ArrayList<>(); + List feats = new ArrayList<>(); + List labels = new ArrayList<>(); + List confs = new ArrayList<>(); + for (int i = 0; i < indices.limit(); i++) { int idx = indices.get(i); Rect2d b = boxes.get(idx); - - // Procura por um track existente com maior IoU. 
- int id = -1; - double bestIou = 0.0; - for (Map.Entry t : tracks.entrySet()) { - double iouVal = iou(b, t.getValue()); - if (iouVal > bestIou) { - bestIou = iouVal; - id = t.getKey(); - } - } - if (bestIou < 0.3) { // novo objeto - id = nextId++; - } - updatedTracks.put(id, b); - org.bytedeco.opencv.opencv_core.Rect rect = new org.bytedeco.opencv.opencv_core.Rect( - (int) b.x(), (int) b.y(), (int) b.width(), (int) b.height() - ); + (int) b.x(), (int) b.y(), (int) b.width(), (int) b.height()); String baseLabel = classIds.get(idx) >= 0 && classIds.get(idx) < classNames.size() ? classNames.get(classIds.get(idx)) : "obj"; - - // Desconsidera classes fora do conjunto solicitado, evitando - // processamento e sobrecarga visual desnecessários. -// if (!ALLOWED_LABELS.contains(baseLabel)) { -// continue; -// } - - // Se houver memória de objetos ativada, o rótulo poderá ser - // refinado com base em embeddings prévios. String label = baseLabel; if (memory != null) { label = memory.classify(frame, rect, baseLabel); } - float conf = confidences.get(idx); - newBoxes.add(new DetectionResult.Box(id, rect, label, conf)); - Map payload = Map.of( - "symbol", "object:" + label, - "class", label, - "conf", (double) conf, - "trackId", id, - "bbox", List.of(rect.x(), rect.y(), rect.width(), rect.height()) - ); - Event ev = new Event.Builder() - .type("perception") - .payload(payload) - .confidence(conf) - .priority(Priority.MEDIUM) - .build(); - QueueUtils.offerLatest(eventQueue, ev); + detBoxes.add(b); + feats.add(encoder.encode(frame, rect)); + labels.add(label); + confs.add(confidences.get(idx)); + } + + List updated = tracker.update(detBoxes, feats, labels, confs, "yolo", frame); + List newBoxes = new ArrayList<>(); + + for (int i = 0; i < updated.size(); i++) { + Track t = updated.get(i); + Rect2d r = t.lastRect; + org.bytedeco.opencv.opencv_core.Rect rect = new org.bytedeco.opencv.opencv_core.Rect( + (int) r.x(), (int) r.y(), (int) r.width(), (int) r.height()); + float conf = 
confs.get(i); + newBoxes.add(new DetectionResult.Box(t.id, rect, t.label, conf)); + + if (t.state == Track.State.Confirmed) { + FloatPointer s = new FloatPointer(t.kf.statePost().data()); + float vx = s.get(4); + float vy = s.get(5); + Map payload = Map.of( + "trackId", t.id, + "label", t.label, + "conf", (double) conf, + "bbox", List.of(rect.x(), rect.y(), rect.width(), rect.height()), + "velocity", List.of(vx, vy), + "state", t.state.toString(), + "source", "yolo", + "ts", System.currentTimeMillis(), + "appearanceHash", Integer.toHexString(java.util.Arrays.hashCode(t.feat)) + ); + Event ev = new Event.Builder() + .type("object.track") + .payload(payload) + .confidence(conf) + .priority(Priority.MEDIUM) + .build(); + QueueUtils.offerLatest(eventQueue, ev); + } } synchronized (res.yoloBoxes) { @@ -221,9 +214,6 @@ public void run() { } System.out.printf("[YOLO] %d objetos detectados.%n", newBoxes.size()); - tracks.clear(); - tracks.putAll(updatedTracks); - blob.release(); outs.deallocate(); outNames.deallocate(); @@ -231,15 +221,4 @@ public void run() { frame.release(); } } - - /** Calcula o Intersection-over-Union entre duas caixas. */ - private static double iou(Rect2d a, Rect2d b) { - double x1 = Math.max(a.x(), b.x()); - double y1 = Math.max(a.y(), b.y()); - double x2 = Math.min(a.x() + a.width(), b.x() + b.width()); - double y2 = Math.min(a.y() + a.height(), b.y() + b.height()); - double inter = Math.max(0, x2 - x1) * Math.max(0, y2 - y1); - double union = a.width() * a.height() + b.width() * b.height() - inter; - return union <= 0 ? 
0.0 : inter / union; - } } diff --git a/src/main/java/dev/bot/zeno/memory/ObjectMemory.java b/src/main/java/dev/bot/zeno/memory/ObjectMemory.java index 7fb7ec9..ccca23c 100644 --- a/src/main/java/dev/bot/zeno/memory/ObjectMemory.java +++ b/src/main/java/dev/bot/zeno/memory/ObjectMemory.java @@ -1,6 +1,6 @@ package dev.bot.zeno.memory; -import dev.bot.zeno.dnn.AppearanceEncoder; +import dev.bot.zeno.tracking.AppearanceEncoder; import dev.bot.zeno.util.Config; import org.bytedeco.opencv.opencv_core.Mat; import org.bytedeco.opencv.opencv_core.Rect; diff --git a/src/main/java/dev/bot/zeno/tracking/AppearanceEncoder.java b/src/main/java/dev/bot/zeno/tracking/AppearanceEncoder.java new file mode 100644 index 0000000..3360eac --- /dev/null +++ b/src/main/java/dev/bot/zeno/tracking/AppearanceEncoder.java @@ -0,0 +1,89 @@ +package dev.bot.zeno.tracking; + +import org.bytedeco.javacpp.FloatPointer; +import org.bytedeco.javacpp.IntPointer; +import org.bytedeco.opencv.opencv_core.Mat; +import org.bytedeco.opencv.opencv_core.MatVector; +import org.bytedeco.opencv.opencv_core.Rect; + +import static org.bytedeco.opencv.global.opencv_core.NORM_L2; +import static org.bytedeco.opencv.global.opencv_core.normalize; +import static org.bytedeco.opencv.global.opencv_imgproc.COLOR_BGR2HSV; +import static org.bytedeco.opencv.global.opencv_imgproc.calcHist; +import static org.bytedeco.opencv.global.opencv_imgproc.cvtColor; + +/** + * Extrai um embedding de aparência leve baseado em histograma de cores HSV. + * Vetor de 64 dimensões (4 bins por canal) normalizado em L2. + * Implementação defensiva para evitar exceções do OpenCV quando o ROI é inválido. + */ +public class AppearanceEncoder { + + private static final int BINS_PER_CHANNEL = 4; // 4 x 4 x 4 = 64 dimensões + private static final int VEC_LEN = BINS_PER_CHANNEL * BINS_PER_CHANNEL * BINS_PER_CHANNEL; + + /** + * Gera um embedding normalizado (L2) para a região especificada. 
+ * + * @param frame frame completo (BGR) + * @param rect região de interesse (pode ser parcialmente fora dos limites) + * @return vetor de 64 floats normalizado; zeros se ROI inválido/pequeno + */ + public float[] encode(Mat frame, Rect rect) { + // Fallback: frame inválido + if (frame == null || frame.empty()) { + return new float[VEC_LEN]; + } + + // 1) Clamp seguro do ROI + int fx = Math.max(0, Math.min(rect.x(), Math.max(0, frame.cols() - 1))); + int fy = Math.max(0, Math.min(rect.y(), Math.max(0, frame.rows() - 1))); + int fw = Math.max(1, Math.min(rect.width(), frame.cols() - fx)); + int fh = Math.max(1, Math.min(rect.height(), frame.rows() - fy)); + + // Evita ROI "pelado" (muito pequeno → histograma ruidoso/instável) + if (fw < 2 || fh < 2) { + return new float[VEC_LEN]; + } + + Mat roi = new Mat(frame, new Rect(fx, fy, fw, fh)); + Mat hsv = new Mat(); + Mat hist = new Mat(); + try { + // 2) BGR -> HSV (OpenCV usa H em [0,180]) + cvtColor(roi, hsv, COLOR_BGR2HSV); + if (hsv.empty()) { + return new float[VEC_LEN]; + } + + // 3) Histograma 3D (H,S,V) com API explícita (evita ranges inválidos) + // H: [0,180], S: [0,256], V: [0,256] + MatVector images = new MatVector(hsv); + IntPointer channels = new IntPointer(new int[]{0, 1, 2}); + IntPointer histSize = new IntPointer(new int[]{BINS_PER_CHANNEL, BINS_PER_CHANNEL, BINS_PER_CHANNEL}); + FloatPointer ranges = new FloatPointer(new float[]{0f, 180f, 0f, 256f, 0f, 256f}); + + calcHist(images, channels, new Mat(), hist, histSize, ranges, false); + + if (hist.empty()) { + return new float[VEC_LEN]; + } + + // 4) Normaliza em L2 para robustez + normalize(hist, hist, 1.0, 0.0, NORM_L2, -1, new Mat()); + + // 5) Copia para vetor Java + float[] vec = new float[VEC_LEN]; + FloatPointer fp = new FloatPointer(hist.ptr()); + for (int i = 0; i < VEC_LEN; i++) { + vec[i] = fp.get(i); + } + return vec; + } finally { + // Libera temporários + hist.release(); + hsv.release(); + roi.release(); + } + } +} diff --git 
a/src/main/java/dev/bot/zeno/tracking/HungarianMatcher.java b/src/main/java/dev/bot/zeno/tracking/HungarianMatcher.java new file mode 100644 index 0000000..b915b0a --- /dev/null +++ b/src/main/java/dev/bot/zeno/tracking/HungarianMatcher.java @@ -0,0 +1,157 @@ +package dev.bot.zeno.tracking; + +import java.util.Arrays; + +/** + * Minimal implementation of the Hungarian algorithm for rectangular cost + * matrices. The algorithm runs in O(n^3) time and is sufficient for the small + * matrices produced by typical object detectors (tens of tracks at most). + */ +public class HungarianMatcher { + + /** + * Solves the assignment problem for the given cost matrix. + * + * @param cost matrix of size m x n + * @return array of size m where index is row and value is assigned column + * or -1 if the row is unmatched + */ + public int[] match(double[][] cost) { + int nRows = cost.length; + int nCols = cost.length == 0 ? 0 : cost[0].length; + int dim = Math.max(nRows, nCols); + double[][] mat = new double[dim][dim]; + for (int i = 0; i < dim; i++) { + if (i < nRows) { + System.arraycopy(cost[i], 0, mat[i], 0, nCols); + for (int j = nCols; j < dim; j++) mat[i][j] = 0.0; + } else { + Arrays.fill(mat[i], 0.0); + } + } + + // Step 1: subtract row minima + for (int i = 0; i < dim; i++) { + double min = Double.POSITIVE_INFINITY; + for (int j = 0; j < dim; j++) if (mat[i][j] < min) min = mat[i][j]; + for (int j = 0; j < dim; j++) mat[i][j] -= min; + } + // Step 2: subtract column minima + for (int j = 0; j < dim; j++) { + double min = Double.POSITIVE_INFINITY; + for (int i = 0; i < dim; i++) if (mat[i][j] < min) min = mat[i][j]; + for (int i = 0; i < dim; i++) mat[i][j] -= min; + } + + int[] starRowOfCol = new int[dim]; + int[] starColOfRow = new int[dim]; + int[] primeColOfRow = new int[dim]; + Arrays.fill(starRowOfCol, -1); + Arrays.fill(starColOfRow, -1); + Arrays.fill(primeColOfRow, -1); + boolean[] rowCover = new boolean[dim]; + boolean[] colCover = new boolean[dim]; + + // Step 
3: star zeros + for (int i = 0; i < dim; i++) { + for (int j = 0; j < dim; j++) { + if (mat[i][j] == 0 && starRowOfCol[j] == -1 && starColOfRow[i] == -1) { + starColOfRow[i] = j; + starRowOfCol[j] = i; + } + } + } + // Cover columns containing a star + for (int j = 0; j < dim; j++) { + if (starRowOfCol[j] != -1) colCover[j] = true; + } + + while (countTrue(colCover) < dim) { + int[] zero = findZero(mat, rowCover, colCover); + while (zero == null) { + adjustMatrix(mat, rowCover, colCover); + zero = findZero(mat, rowCover, colCover); + } + int r = zero[0]; + int c = zero[1]; + primeColOfRow[r] = c; + if (starColOfRow[r] != -1) { + rowCover[r] = true; + colCover[starColOfRow[r]] = false; + } else { + augmentPath(r, c, starColOfRow, starRowOfCol, primeColOfRow); + Arrays.fill(rowCover, false); + Arrays.fill(colCover, false); + Arrays.fill(primeColOfRow, -1); + for (int j = 0; j < dim; j++) { + if (starRowOfCol[j] != -1) colCover[j] = true; + } + } + } + + int[] result = new int[nRows]; + Arrays.fill(result, -1); + for (int i = 0; i < nRows; i++) { + if (starColOfRow[i] < nCols) result[i] = starColOfRow[i]; + } + return result; + } + + private static int countTrue(boolean[] arr) { + int c = 0; for (boolean b : arr) if (b) c++; return c; + } + + private static int[] findZero(double[][] mat, boolean[] rowCover, boolean[] colCover) { + for (int i = 0; i < mat.length; i++) { + if (rowCover[i]) continue; + for (int j = 0; j < mat.length; j++) { + if (!colCover[j] && mat[i][j] == 0) return new int[]{i, j}; + } + } + return null; + } + + private static void adjustMatrix(double[][] mat, boolean[] rowCover, boolean[] colCover) { + double min = Double.POSITIVE_INFINITY; + for (int i = 0; i < mat.length; i++) { + if (rowCover[i]) continue; + for (int j = 0; j < mat.length; j++) { + if (!colCover[j] && mat[i][j] < min) min = mat[i][j]; + } + } + for (int i = 0; i < mat.length; i++) { + for (int j = 0; j < mat.length; j++) { + if (rowCover[i]) mat[i][j] += min; + if (!colCover[j]) 
mat[i][j] -= min; + } + } + } + + private static void augmentPath(int r, int c, int[] starColOfRow, int[] starRowOfCol, int[] primeColOfRow) { + int[][] path = new int[starColOfRow.length * 2][2]; + int idx = 0; + path[idx][0] = r; path[idx][1] = c; + boolean done = false; + while (!done) { + int row = starRowOfCol[path[idx][1]]; + if (row != -1) { + idx++; path[idx][0] = row; path[idx][1] = path[idx-1][1]; + int col = primeColOfRow[row]; + idx++; path[idx][0] = row; path[idx][1] = col; + } else { + done = true; + } + } + for (int i = 0; i <= idx; i++) { + int pr = path[i][0]; + int pc = path[i][1]; + if (starColOfRow[pr] == pc) { + starColOfRow[pr] = -1; + starRowOfCol[pc] = -1; + } else { + starColOfRow[pr] = pc; + starRowOfCol[pc] = pr; + } + } + } +} diff --git a/src/main/java/dev/bot/zeno/tracking/Track.java b/src/main/java/dev/bot/zeno/tracking/Track.java new file mode 100644 index 0000000..f804faa --- /dev/null +++ b/src/main/java/dev/bot/zeno/tracking/Track.java @@ -0,0 +1,38 @@ +package dev.bot.zeno.tracking; + +import org.bytedeco.opencv.opencv_video.KalmanFilter; +import org.bytedeco.opencv.opencv_core.Rect2d; + +/** + * Represents a single tracked object/person. + *

+ * Each track maintains its own Kalman filter and appearance embedding to + * provide stable identifiers across frames. The structure intentionally keeps + * fields public to minimise allocations in high-frequency tracking loops. + */ +public final class Track { + public final int id; + public final String source; // "yolo" | "face" + public String label; // classe base ou nome reconhecido + public KalmanFilter kf; // estado [cx, cy, w, h, vx, vy] + public Rect2d predicted, lastRect; + public float[] feat; // média exponencial do embedding + public int hits, age, timeSinceUpdate; + public enum State { Tentative, Confirmed, Lost } + public State state; + + public Track(int id, String source, String label, + KalmanFilter kf, Rect2d initialRect, float[] feat) { + this.id = id; + this.source = source; + this.label = label; + this.kf = kf; + this.predicted = initialRect; + this.lastRect = initialRect; + this.feat = feat; + this.hits = 1; + this.age = 1; + this.timeSinceUpdate = 0; + this.state = State.Tentative; + } +} diff --git a/src/main/java/dev/bot/zeno/tracking/TrackManager.java b/src/main/java/dev/bot/zeno/tracking/TrackManager.java new file mode 100644 index 0000000..a1d0a8a --- /dev/null +++ b/src/main/java/dev/bot/zeno/tracking/TrackManager.java @@ -0,0 +1,247 @@ +package dev.bot.zeno.tracking; + +import org.bytedeco.javacpp.FloatPointer; +import org.bytedeco.opencv.opencv_core.Mat; +import org.bytedeco.opencv.opencv_core.Rect2d; +import org.bytedeco.opencv.opencv_core.Scalar; +import org.bytedeco.opencv.opencv_video.KalmanFilter; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.bytedeco.opencv.global.opencv_core.CV_32F; +import static org.bytedeco.opencv.global.opencv_core.setIdentity; + +/** + * Manages the lifecycle of {@link Track} instances performing prediction, + * association and termination according to the specified {@link TrackingParams}. 
+ */ +public final class TrackManager { + private final TrackingParams p; + private final List tracks = new ArrayList<>(); + private final HungarianMatcher matcher = new HungarianMatcher(); + private int nextId = 1; + + public TrackManager(TrackingParams p) { this.p = p; } + + /** + * Updates the current tracks with new detections. + * + * @return list of tracks matched to the provided detections in the same order + */ + public List update(List boxes, + List feats, + List baseLabels, + List confs, + String sourceName, + Mat frameForDebug) { + final int N = tracks.size(); + final int M = boxes.size(); + final int frameW = (frameForDebug != null) ? frameForDebug.cols() : Integer.MAX_VALUE; + final int frameH = (frameForDebug != null) ? frameForDebug.rows() : Integer.MAX_VALUE; + + // 1) Predict all tracks (usaremos a predição só para gating/associação) + for (Track t : tracks) { + Mat prediction = t.kf.predict(); + t.predicted = rectFromState(prediction); + t.timeSinceUpdate++; + t.age++; + } + + // 2) Custo (IoU + Aparência) com gating + double[][] cost = new double[N][M]; + for (int i = 0; i < N; i++) { + Track t = tracks.get(i); + for (int j = 0; j < M; j++) { + double iouVal = iou(t.predicted, boxes.get(j)); + double cos = cosine(t.feat, feats.get(j)); + if (iouVal < p.minIoU() && cos < p.minCosine()) { + cost[i][j] = 1e6; // gating + continue; + } + double c = p.lambda() * (1 - iouVal) + (1 - p.lambda()) * (1 - cos); + if (t.label != null && t.label.equals(baseLabels.get(j))) { + c = Math.max(0, c - 0.15); + } + cost[i][j] = c; + } + } + + int[] assignment = (N > 0 && M > 0) ? matcher.match(cost) : new int[N]; + boolean[] matchedDet = new boolean[M]; + java.util.Arrays.fill(matchedDet, false); + List output = new ArrayList<>(Collections.nCopies(M, null)); + + // 3) Update matched tracks (usa statePost do KF para saída) + for (int i = 0; i < N; i++) { + int j = assignment.length > i ? 
assignment[i] : -1; + Track t = tracks.get(i); + if (j >= 0 && j < M && cost[i][j] < 1e5) { + matchedDet[j] = true; + Rect2d meas = boxes.get(j); + updateTrack(t, meas, feats.get(j), baseLabels.get(j), frameW, frameH); + output.set(j, t); + } + } + + // 4) Create tracks for unmatched detections + for (int j = 0; j < M; j++) { + if (!matchedDet[j]) { + Track t = createTrack(boxes.get(j), feats.get(j), baseLabels.get(j), sourceName, frameW, frameH); + tracks.add(t); + output.set(j, t); + } + } + + // 5) Cleanup + tracks.removeIf(t -> { + if (t.timeSinceUpdate > p.maxAge()) { + t.state = Track.State.Lost; + return true; + } + return false; + }); + + return output; + } + + private Track createTrack(Rect2d box, float[] feat, String label, String source, + int frameW, int frameH) { + // Estado: [cx, cy, w, h, vx, vy] + KalmanFilter kf = new KalmanFilter(6, 4, 0, CV_32F); + + // F = dt=1 => posição recebe velocidade + // [1 0 0 0 1 0 + // 0 1 0 0 0 1 + // 0 0 1 0 0 0 + // 0 0 0 1 0 0 + // 0 0 0 0 1 0 + // 0 0 0 0 0 1] + kf.transitionMatrix(new Mat(6, 6, CV_32F, new FloatPointer( + 1, 0, 0, 0, 1, 0, + 0, 1, 0, 0, 0, 1, + 0, 0, 1, 0, 0, 0, + 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 1 + ))); + setIdentity(kf.measurementMatrix()); // 4x6, mede [cx,cy,w,h] + setIdentity(kf.processNoiseCov(), new Scalar(1e-2)); + setIdentity(kf.measurementNoiseCov(), new Scalar(1e-1)); + setIdentity(kf.errorCovPost(), new Scalar(1)); + + float cx = (float) (box.x() + box.width() / 2.0); + float cy = (float) (box.y() + box.height() / 2.0); + float w = (float) box.width(); + float h = (float) box.height(); + + // Estado inicial + FloatPointer sp = new FloatPointer(kf.statePost().data()); + sp.put(0, cx); sp.put(1, cy); sp.put(2, w); sp.put(3, h); sp.put(4, 0); sp.put(5, 0); + + float[] normFeat = normalize(feat); + Rect2d clamped = clamp(box, frameW, frameH); + Track t = new Track(nextId++, source, label, kf, clamped, normFeat); + t.lastRect = clamped; + return t; + } + + private 
void updateTrack(Track t, Rect2d meas, float[] feat, String label, + int frameW, int frameH) { + // Medição em centro + float cx = (float) (meas.x() + meas.width() / 2.0); + float cy = (float) (meas.y() + meas.height() / 2.0); + float w = (float) meas.width(); + float h = (float) meas.height(); + + // z = [cx, cy, w, h] + Mat measurement = new Mat(4, 1, CV_32F); + FloatPointer mp = new FloatPointer(measurement.data()); + mp.put(0, cx); mp.put(1, cy); mp.put(2, w); mp.put(3, h); + + // Corrige o estado e usa o statePost como caixa de saída + Mat corrected = t.kf.correct(measurement); + Rect2d kfRect = rectFromState(t.kf.statePost()); + + // Suavização opcional SOMENTE no tamanho (evita “inflar”/lag) + double a = p.smoothAlpha(); // 0.0 = sem suavização + double sw = a * w + (1 - a) * kfRect.width(); + double sh = a * h + (1 - a) * kfRect.height(); + + // mantém centro do KF, aplica tamanho suavizado + double outX = (kfRect.x() + kfRect.width() / 2.0) - sw / 2.0; + double outY = (kfRect.y() + kfRect.height() / 2.0) - sh / 2.0; + Rect2d out = new Rect2d(outX, outY, sw, sh); + + // Clamp ao frame + t.lastRect = clamp(out, frameW, frameH); + t.predicted = kfRect; // mantém para debug/telemetria + + // Atualiza aparência e ciclo de vida + t.feat = blendFeatures(t.feat, feat); + t.hits++; + t.timeSinceUpdate = 0; + t.label = label; + if (t.state == Track.State.Tentative && t.hits >= p.nInit()) { + t.state = Track.State.Confirmed; + } + } + + private static Rect2d clamp(Rect2d r, int W, int H) { + if (W == Integer.MAX_VALUE || H == Integer.MAX_VALUE) return r; + double x = Math.max(0, Math.min(r.x(), W - 2)); + double y = Math.max(0, Math.min(r.y(), H - 2)); + double w = Math.max(2, Math.min(r.width(), W - x)); + double h = Math.max(2, Math.min(r.height(), H - y)); + return new Rect2d(x, y, w, h); + } + + private Rect2d rectFromState(Mat state) { + FloatPointer fp = new FloatPointer(state.data()); + float cx = fp.get(0); + float cy = fp.get(1); + float w = fp.get(2); + 
float h = fp.get(3); + return new Rect2d(cx - w / 2.0, cy - h / 2.0, w, h); + } + + private static double iou(Rect2d a, Rect2d b) { + double x1 = Math.max(a.x(), b.x()); + double y1 = Math.max(a.y(), b.y()); + double x2 = Math.min(a.x() + a.width(), b.x() + b.width()); + double y2 = Math.min(a.y() + a.height(), b.y() + b.height()); + double inter = Math.max(0, x2 - x1) * Math.max(0, y2 - y1); + double union = a.width() * a.height() + b.width() * b.height() - inter; + return union <= 0 ? 0 : inter / union; + } + + private static double cosine(float[] a, float[] b) { + int n = Math.min(a.length, b.length); + double dot = 0, na = 0, nb = 0; + for (int i = 0; i < n; i++) { + dot += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + return dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-12); + } + + private float[] blendFeatures(float[] old, float[] cur) { + float[] nf = normalize(cur); + double a = p.smoothAlpha(); // reaproveitamos como EMA da aparência + for (int i = 0; i < old.length && i < nf.length; i++) { + old[i] = (float) (a * nf[i] + (1 - a) * old[i]); + } + return normalize(old); + } + + private static float[] normalize(float[] v) { + double norm = 0; + for (float f : v) norm += f * f; + norm = Math.sqrt(norm) + 1e-12; + float[] out = java.util.Arrays.copyOf(v, v.length); + for (int i = 0; i < out.length; i++) out[i] /= norm; + return out; + } +} diff --git a/src/main/java/dev/bot/zeno/tracking/TrackingParams.java b/src/main/java/dev/bot/zeno/tracking/TrackingParams.java new file mode 100644 index 0000000..5336c39 --- /dev/null +++ b/src/main/java/dev/bot/zeno/tracking/TrackingParams.java @@ -0,0 +1,17 @@ +package dev.bot.zeno.tracking; + +/** + * Configuration parameters controlling the behaviour of {@link TrackManager}. + *

+ * All values are loaded from {@code config.properties} allowing easy tuning
+ * without recompilation.
+ *
+ * @param lambda      weight between IoU (1.0) and appearance similarity (0.0)
+ * @param minIoU      minimum IoU used for association gating
+ * @param minCosine   minimum cosine similarity used for association gating
+ * @param maxAge      frames without a match before a track is dropped
+ * @param nInit       consecutive hits required to confirm a track
+ * @param smoothAlpha EMA factor for box size and appearance embeddings; 0.0
+ *                    disables smoothing. NOTE(review): the shipped config sets
+ *                    0.0 while the fallback default in Main is 0.2 -- confirm
+ *                    which is intended.
+ */
+public record TrackingParams(
+    double lambda,
+    double minIoU,
+    double minCosine,
+    int maxAge,
+    int nInit,
+    double smoothAlpha
+) {
+}
diff --git a/src/main/resources/config.properties b/src/main/resources/config.properties
index 5711903..33c2a22 100644
--- a/src/main/resources/config.properties
+++ b/src/main/resources/config.properties
@@ -27,8 +27,8 @@
 arcface.similarityThreshold=0.42
 arcface.minFaceSize=60
 
 # === Performance ===
-processing.skipFramesYolo=3
-processing.skipFramesFace=3
+processing.skipFramesYolo=0
+processing.skipFramesFace=0
 
 # === Output ===
 output.drawYolo=true
@@ -42,3 +42,11 @@
 data.faceDb=data/face_db.json
 memory.kPerLabel=20
 memory.similarityThreshold=0.85
 data.objectMemory=data/object_memory.json
+
+# === Tracking ===
+tracking.lambda=0.6
+tracking.iou.min=0.10
+tracking.appearance.minCosine=0.50
+tracking.maxAge=30
+tracking.nInit=3
+tracking.smooth.alpha=0.0