From 185dd508c28f5b7afe9766d630ee4b383038926c Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:55:30 -0700
Subject: [PATCH 1/5] Add configurable queue optimization and inference
 interval for low latency

- Add LOW_LATENCY and MEDIUM_LATENCY queue optimization modes
- Add configurable INFERENCE_INTERVAL (default=3, use 1 for every frame)
- Add separate inference options for face detection vs object detection
- Add support for INT8 model paths for NPU compatibility
---
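A hypothetical invocation exercising the new knobs (the entry point and
the INT8 model path below are illustrative, not taken from this commit):

    # Aggressive low-latency run: 3-buffer queues, drop stale frames,
    # run inference on every frame.
    export LOW_LATENCY=1 DROP_OLD_FRAMES=1 INFERENCE_INTERVAL=1
    ./src/pipelines/obj_detection_age_prediction.sh

    # Point the face pipeline at an INT8 model for NPU runs; adjust the
    # path to wherever the converted model actually lives.
    export FACE_DETECTION_MODEL=/home/pipeline-server/models/face_detection/INT8/face-detection-retail-0004.xml
    ./src/pipelines/obj_detection_age_prediction.sh
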
 src/pipelines/obj_detection_age_prediction.sh | 78 ++++++++++++++-----
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/src/pipelines/obj_detection_age_prediction.sh b/src/pipelines/obj_detection_age_prediction.sh
index 1bf91442..cc91839b 100755
--- a/src/pipelines/obj_detection_age_prediction.sh
+++ b/src/pipelines/obj_detection_age_prediction.sh
@@ -19,7 +19,48 @@ OBJECT_DETECTION_DEVICE="${OBJECT_DETECTION_DEVICE:=$DEVICE}"
 OBJECT_CLASSIFICATION_DEVICE="${OBJECT_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
 FACE_DETECTION_DEVICE="${FACE_DETECTION_DEVICE:=$DEVICE}"
 AGE_CLASSIFICATION_DEVICE="${AGE_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
+# Support INT8 models for NPU compatibility
+FACE_DETECTION_MODEL="${FACE_DETECTION_MODEL:=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml}"
+AGE_PREDICTION_MODEL="${AGE_PREDICTION_MODEL:=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml}"
 PRE_PROCESS="${PRE_PROCESS:=""}"
+# Separate inference options for object detection and face detection pipelines
+# Use default only if variable is unset (not if it's empty string)
+if [ -z "${FACE_DETECTION_OPTIONS+x}" ]; then
+    FACE_DETECTION_OPTIONS="$DETECTION_OPTIONS"
+fi
+if [ -z "${AGE_CLASSIFICATION_OPTIONS+x}" ]; then
+    AGE_CLASSIFICATION_OPTIONS="$CLASSIFICATION_OPTIONS"
+fi
+
+# Queue optimization for low latency
+# Set LOW_LATENCY=1 to reduce queue sizes and minimize end-to-end latency (aggressive)
+# Set MEDIUM_LATENCY=1 for production-realistic settings (balanced latency vs robustness)
+# Set DROP_OLD_FRAMES=1 to always process most recent frames (drops old frames when queue is full)
+if [ "$LOW_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000 leaky=downstream"
+        echo "LOW-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=3, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000"
+        echo "LOW-LATENCY MODE: Queue sizes optimized (max-size-buffers=3, max-size-time=0.1s)"
+    fi
+elif [ "$MEDIUM_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000 leaky=downstream"
+        echo "MEDIUM-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=10, max-size-time=0.5s, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000"
+        echo "MEDIUM-LATENCY MODE: Production-realistic queue sizes (max-size-buffers=10, max-size-time=0.5s)"
+    fi
+else
+    QUEUE_PARAMS=""
+    echo "STANDARD MODE: Using default queue sizes"
+fi
+
+# Inference interval optimization
+# Set INFERENCE_INTERVAL to control frame processing (default=3, 1=every frame)
+INFERENCE_INTERVAL="${INFERENCE_INTERVAL:-3}"
+echo "INFERENCE INTERVAL: Processing every ${INFERENCE_INTERVAL} frame(s)"
 
 if [ "$RENDER_MODE" == "1" ]; then
     OUTPUT="gvawatermark ! videoconvert ! fpsdisplaysink video-sink=autovideosink text-overlay=false signal-fps-measurements=true name=obj_fps_sink"
@@ -35,23 +76,23 @@ fi
 
 echo "Running object detection pipeline on $DEVICE with detection batch size = $BATCH_SIZE_DETECT and classification batch size = $BATCH_SIZE_CLASSIFY"
 echo "Running age prediction pipeline on $AGE_PREDICTION_VIDEO"
-gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline)' gst-launch-1.0 --verbose \
+gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline+element)' gst-launch-1.0 --verbose \
 $inputsrc_oc1 ! $DECODE \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvadetect batch-size=$BATCH_SIZE_DETECT \
     model-instance-id=odmodel \
     name=object_detection \
     model=/home/pipeline-server/models/object_detection/yolo11n/INT8/yolo11n.xml \
     threshold=0.5 \
-    inference-interval=3 \
+    inference-interval=$INFERENCE_INTERVAL \
     scale-method=fast \
     device=$OBJECT_DETECTION_DEVICE \
     $PRE_PROCESS $DETECTION_OPTIONS \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvatrack \
     name=object_tracking \
     tracking-type=zero-term-imageless \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
     model-instance-id=classifier \
     labels=/home/pipeline-server/models/object_classification/efficientnet-b0/INT8/imagenet_2012.txt \
@@ -62,45 +103,46 @@ gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipel
     inference-region=1 \
     object-class=object \
     reclassify-interval=1 \
-
+    $CLASSIFICATION_PRE_PROCESS $CLASSIFICATION_OPTIONS \
     ! gvametaconvert \
     ! tee name=t_obj \
-    t_obj. ! queue ! $OUTPUT \
-    t_obj. ! queue ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
+    t_obj. ! queue $QUEUE_PARAMS ! $OUTPUT \
+    t_obj. ! queue $QUEUE_PARAMS ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
     \
 $inputsrc_ap1 ! $DECODE \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvadetect batch-size=$BATCH_SIZE_DETECT \
     model-instance-id=facemodel \
     name=face_detection \
-    model=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml \
+    model=$FACE_DETECTION_MODEL \
     model-proc=/home/pipeline-server/models/face_detection/face-detection-retail-0004.json \
-    inference-interval=3 \
+    inference-interval=$INFERENCE_INTERVAL \
     scale-method=fast \
     inference-region=full-frame \
     threshold=0.5 \
     device=$FACE_DETECTION_DEVICE \
-    $PRE_PROCESS $DETECTION_OPTIONS \
-    ! queue \
+    $PRE_PROCESS $FACE_DETECTION_OPTIONS \
+    ! queue $QUEUE_PARAMS \
     ! gvatrack \
     name=face_tracking \
     tracking-type=zero-term-imageless \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
     model-instance-id=age_classifier \
-    model=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml \
+    model=$AGE_PREDICTION_MODEL \
     model-proc=/home/pipeline-server/models/age_prediction/age-gender-recognition-retail-0013.json \
     device=$AGE_CLASSIFICATION_DEVICE \
     name=age_classification \
     inference-region=roi-list \
     object-class=face \
     reclassify-interval=1 \
-    ! queue \
+    $AGE_CLASSIFICATION_OPTIONS \
+    ! queue $QUEUE_PARAMS \
     ! gvametaconvert \
     ! tee name=t \
-    t. ! queue ! $AGE_OUTPUT \
-    t. ! queue ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
+    t. ! queue $QUEUE_PARAMS ! $AGE_OUTPUT \
+    t. ! queue $QUEUE_PARAMS ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
     2>&1 | tee /tmp/results/gst-launch_\$cid.log \
     | (stdbuf -oL awk '
     BEGIN {

From 3064ef38e74af84de19cc9c8940dc80298c5c0fc Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:56:46 -0700
Subject: [PATCH 2/5] Add environment variable passthrough for latency
 configuration

Allow shell environment variables to override .env file defaults for:
- LOW_LATENCY, MEDIUM_LATENCY
- INFERENCE_INTERVAL
- BATCH_SIZE_DETECT, BATCH_SIZE_CLASSIFY

Also add sample-media volume mount for benchmarking.
---
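Because docker compose substitutes ${VAR:-default} from the calling
shell, exported values now take precedence over the .env file. A
hypothetical one-off override (assuming compose is run from src/):

    cd src
    LOW_LATENCY=1 INFERENCE_INTERVAL=1 BATCH_SIZE_DETECT=1 \
        docker compose up -d
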
 src/docker-compose.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index 849219af..f431a432 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -26,7 +26,7 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: dlstreamer:dev
+    image: pipeline-runner-asc:latest
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}
@@ -46,8 +46,15 @@
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +62,4 @@
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure
\ No newline at end of file
+    restart: on-failure

From e48d52cadfdb2d16e6c5ef18148295114865babc Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:57:23 -0700
Subject: [PATCH 3/5] Add environment variable passthrough for latency
 configuration (registry version)

Allow shell environment variables to override .env file defaults for:
- LOW_LATENCY, MEDIUM_LATENCY
- INFERENCE_INTERVAL
- BATCH_SIZE_DETECT, BATCH_SIZE_CLASSIFY

Also add sample-media volume mount and update image name.
---
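Same mechanism as the previous commit; a hypothetical invocation only
swaps in the registry compose file:

    cd src
    MEDIUM_LATENCY=1 docker compose -f docker-compose-reg.yml up -d
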
 src/docker-compose-reg.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/docker-compose-reg.yml b/src/docker-compose-reg.yml
index 9aa834e1..809f85ed 100644
--- a/src/docker-compose-reg.yml
+++ b/src/docker-compose-reg.yml
@@ -26,7 +26,7 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: iotgdevcloud/dlstreamer:latest
+    image: intel/pipeline-runner-asc:latest
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}
@@ -46,8 +46,15 @@
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +62,4 @@
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure
+    restart: on-failure

From fe356bd05eaae83b5c21cd266f968931e11de9b2 Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:57:41 -0700
Subject: [PATCH 4/5] Add NPU+GPU hybrid device configuration

New device configuration for Lunar Lake that runs:
- YOLO11n object detection on NPU
- EfficientNet classification on GPU with VA surface sharing
- Face detection and age classification on GPU

This configuration achieves sub-second latency while maximizing stream
density.
---
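One way to consume this env file is to export it into the shell before
compose performs variable substitution; a sketch, assuming the
repository root as the working directory:

    set -a                           # auto-export everything sourced next
    source src/res/npu-gpu-flip.env  # NPU detection, GPU classification
    set +a
    docker compose -f src/docker-compose.yml up -d
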
 src/res/npu-gpu-flip.env | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 src/res/npu-gpu-flip.env

diff --git a/src/res/npu-gpu-flip.env b/src/res/npu-gpu-flip.env
new file mode 100644
index 00000000..0f97b3cb
--- /dev/null
+++ b/src/res/npu-gpu-flip.env
@@ -0,0 +1,20 @@
+DECODE='h264parse ! vah264dec ! vapostproc ! "video/x-raw(memory:VAMemory)"'
+OCR_DEVICE=GPU
+PRE_PROCESS=pre-process-backend=va-surface-sharing
+# Object detection pipeline on NPU (testing heavy workload on NPU)
+DEVICE=NPU
+OBJECT_DETECTION_DEVICE=NPU
+OBJECT_CLASSIFICATION_DEVICE=GPU
+# Age prediction pipeline on GPU (lighter models back to GPU)
+FACE_DETECTION_DEVICE=GPU
+AGE_CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_PRE_PROCESS=pre-process-backend=va-surface-sharing
+BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
+# NPU doesn't support GPU-specific options - leave empty for object detection
+DETECTION_OPTIONS=""
+# GPU classification options for object classification and age pipeline
+CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"
+FACE_DETECTION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2"
+AGE_CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"

From 9ab8ee127fb2faa5457cfabc921f0d7af4aeb354 Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 12:34:06 -0700
Subject: [PATCH 5/5] Revert to original image name, add comment about
 tested configuration

Latency benchmarks were run using a locally-built image
(pipeline-runner-asc) based on DLStreamer 2025.0.1 with Intel NPU
drivers for Lunar Lake.
---
 src/docker-compose.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index f431a432..f6abdd42 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -26,7 +26,10 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: pipeline-runner-asc:latest
+    # Note: Latency benchmarks were run using a locally-built image (pipeline-runner-asc:latest)
+    # based on DLStreamer 2025.0.1-ubuntu24 with Intel NPU drivers for Lunar Lake.
+    # Build with: make build (uses Dockerfile in this repo)
+    image: dlstreamer:dev
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}