Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/docker-compose-reg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ services:
- ./start_stream.sh:/home/pipeline-server/src/start_stream.sh

ClientGst:
image: iotgdevcloud/dlstreamer:latest
image: intel/pipeline-runner-asc:latest
deploy:
mode: replicated
replicas: ${PIPELINE_COUNT:-1}
Expand All @@ -46,13 +46,20 @@ services:
- RTSP_SERVER=${RTSP_SERVER}
- RTSP_PATH=${RTSP_PATH}
- RENDER_MODE=${RENDER_MODE}
# Latency and inference configuration - allow shell overrides
- LOW_LATENCY=${LOW_LATENCY:-0}
- MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
- INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
- BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
- BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
volumes:
- ${RESULTS_DIR:-../results}:/tmp/results
- ../performance-tools/sample-media:/home/pipeline-server/sample-media
- ~/.Xauthority:/home/dlstreamer/.Xauthority
- /tmp/.X11-unix:/tmp/.X11-unix
- ~/.cl-cache:/home/pipeline-server/.cl-cache
- ./res/:/home/pipeline-server/envs
- ./pipelines:/home/pipeline-server/pipelines
- ./extensions:/home/pipeline-server/extensions
- ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
restart: on-failure
restart: on-failure
12 changes: 11 additions & 1 deletion src/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ services:
- ./start_stream.sh:/home/pipeline-server/src/start_stream.sh

ClientGst:
# Note: Latency benchmarks were run using a locally-built image (pipeline-runner-asc:latest)
# based on DLStreamer 2025.0.1-ubuntu24 with Intel NPU drivers for Lunar Lake.
# Build with: make build (uses Dockerfile in this repo)
image: dlstreamer:dev
deploy:
mode: replicated
Expand All @@ -46,13 +49,20 @@ services:
- RTSP_SERVER=${RTSP_SERVER}
- RTSP_PATH=${RTSP_PATH}
- RENDER_MODE=${RENDER_MODE}
# Latency and inference configuration - allow shell overrides
- LOW_LATENCY=${LOW_LATENCY:-0}
- MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
- INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
- BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
- BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
volumes:
- ${RESULTS_DIR:-../results}:/tmp/results
- ../performance-tools/sample-media:/home/pipeline-server/sample-media
- ~/.Xauthority:/home/dlstreamer/.Xauthority
- /tmp/.X11-unix:/tmp/.X11-unix
- ~/.cl-cache:/home/pipeline-server/.cl-cache
- ./res/:/home/pipeline-server/envs
- ./pipelines:/home/pipeline-server/pipelines
- ./extensions:/home/pipeline-server/extensions
- ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
restart: on-failure
restart: on-failure
78 changes: 60 additions & 18 deletions src/pipelines/obj_detection_age_prediction.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,48 @@ OBJECT_DETECTION_DEVICE="${OBJECT_DETECTION_DEVICE:=$DEVICE}"
OBJECT_CLASSIFICATION_DEVICE="${OBJECT_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
FACE_DETECTION_DEVICE="${FACE_DETECTION_DEVICE:=$DEVICE}"
AGE_CLASSIFICATION_DEVICE="${AGE_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
# Support INT8 models for NPU compatibility
FACE_DETECTION_MODEL="${FACE_DETECTION_MODEL:=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml}"
AGE_PREDICTION_MODEL="${AGE_PREDICTION_MODEL:=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml}"
PRE_PROCESS="${PRE_PROCESS:=""}"
# Separate inference options for object detection and face detection pipelines
# Use default only if variable is unset (not if it's empty string)
if [ -z "${FACE_DETECTION_OPTIONS+x}" ]; then
FACE_DETECTION_OPTIONS="$DETECTION_OPTIONS"
fi
if [ -z "${AGE_CLASSIFICATION_OPTIONS+x}" ]; then
AGE_CLASSIFICATION_OPTIONS="$CLASSIFICATION_OPTIONS"
fi

# Queue optimization for low latency
# Set LOW_LATENCY=1 to reduce queue sizes and minimize end-to-end latency (aggressive)
# Set MEDIUM_LATENCY=1 for production-realistic settings (balanced latency vs robustness)
# Set DROP_OLD_FRAMES=1 to always process most recent frames (drops old frames when queue is full)
if [ "$LOW_LATENCY" == "1" ]; then
if [ "$DROP_OLD_FRAMES" == "1" ]; then
QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000 leaky=downstream"
echo "LOW-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=3, leaky=downstream)"
else
QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000"
echo "LOW-LATENCY MODE: Queue sizes optimized (max-size-buffers=3, max-size-time=0.1s)"
fi
elif [ "$MEDIUM_LATENCY" == "1" ]; then
if [ "$DROP_OLD_FRAMES" == "1" ]; then
QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000 leaky=downstream"
echo "MEDIUM-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=10, max-size-time=0.5s, leaky=downstream)"
else
QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000"
echo "MEDIUM-LATENCY MODE: Production-realistic queue sizes (max-size-buffers=10, max-size-time=0.5s)"
fi
else
QUEUE_PARAMS=""
echo "STANDARD MODE: Using default queue sizes"
fi

# Inference interval optimization
# Set INFERENCE_INTERVAL to control frame processing (default=3, 1=every frame)
INFERENCE_INTERVAL="${INFERENCE_INTERVAL:-3}"
echo "INFERENCE INTERVAL: Processing every ${INFERENCE_INTERVAL} frame(s)"

if [ "$RENDER_MODE" == "1" ]; then
OUTPUT="gvawatermark ! videoconvert ! fpsdisplaysink video-sink=autovideosink text-overlay=false signal-fps-measurements=true name=obj_fps_sink"
Expand All @@ -35,23 +76,23 @@ fi
echo "Running object detection pipeline on $DEVICE with detection batch size = $BATCH_SIZE_DETECT and classification batch size = $BATCH_SIZE_CLASSIFY"
echo "Running age prediction pipeline on $AGE_PREDICTION_VIDEO"

gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline)' gst-launch-1.0 --verbose \
gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline+element)' gst-launch-1.0 --verbose \
$inputsrc_oc1 ! $DECODE \
! queue \
! queue $QUEUE_PARAMS \
! gvadetect batch-size=$BATCH_SIZE_DETECT \
model-instance-id=odmodel \
name=object_detection \
model=/home/pipeline-server/models/object_detection/yolo11n/INT8/yolo11n.xml \
threshold=0.5 \
inference-interval=3 \
inference-interval=$INFERENCE_INTERVAL \
scale-method=fast \
device=$OBJECT_DETECTION_DEVICE \
$PRE_PROCESS $DETECTION_OPTIONS \
! queue \
! queue $QUEUE_PARAMS \
! gvatrack \
name=object_tracking \
tracking-type=zero-term-imageless \
! queue \
! queue $QUEUE_PARAMS \
! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
model-instance-id=classifier \
labels=/home/pipeline-server/models/object_classification/efficientnet-b0/INT8/imagenet_2012.txt \
Expand All @@ -62,45 +103,46 @@ gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipel
inference-region=1 \
object-class=object \
reclassify-interval=1 \

$CLASSIFICATION_PRE_PROCESS $CLASSIFICATION_OPTIONS \
! gvametaconvert \
! tee name=t_obj \
t_obj. ! queue ! $OUTPUT \
t_obj. ! queue ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
t_obj. ! queue $QUEUE_PARAMS ! $OUTPUT \
t_obj. ! queue $QUEUE_PARAMS ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
\
$inputsrc_ap1 ! $DECODE \
! queue \
! queue $QUEUE_PARAMS \
! gvadetect batch-size=$BATCH_SIZE_DETECT \
model-instance-id=facemodel \
name=face_detection \
model=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml \
model=$FACE_DETECTION_MODEL \
model-proc=/home/pipeline-server/models/face_detection/face-detection-retail-0004.json \
inference-interval=3 \
inference-interval=$INFERENCE_INTERVAL \
scale-method=fast \
inference-region=full-frame \
threshold=0.5 \
device=$FACE_DETECTION_DEVICE \
$PRE_PROCESS $DETECTION_OPTIONS \
! queue \
$PRE_PROCESS $FACE_DETECTION_OPTIONS \
! queue $QUEUE_PARAMS \
! gvatrack \
name=face_tracking \
tracking-type=zero-term-imageless \
! queue \
! queue $QUEUE_PARAMS \
! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
model-instance-id=age_classifier \
model=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml \
model=$AGE_PREDICTION_MODEL \
model-proc=/home/pipeline-server/models/age_prediction/age-gender-recognition-retail-0013.json \
device=$AGE_CLASSIFICATION_DEVICE \
name=age_classification \
inference-region=roi-list \
object-class=face \
reclassify-interval=1 \
! queue \
$AGE_CLASSIFICATION_OPTIONS \
! queue $QUEUE_PARAMS \
! gvametaconvert \
! tee name=t \
t. ! queue ! $AGE_OUTPUT \
t. ! queue ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
t. ! queue $QUEUE_PARAMS ! $AGE_OUTPUT \
t. ! queue $QUEUE_PARAMS ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
2>&1 | tee /tmp/results/gst-launch_\$cid.log \
| (stdbuf -oL awk '
BEGIN {
Expand Down
20 changes: 20 additions & 0 deletions src/res/npu-gpu-flip.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Environment profile: NPU/GPU split for the retail pipeline.
# Routes the heavy object-detection model to the NPU and keeps the lighter
# classification / age-prediction models on the GPU.
# NOTE(review): the ${VAR:-default} expansions below (BATCH_SIZE_*) only work
# if this file is sourced by a shell; a docker-compose `env_file:` parser would
# pass the literal text through unexpanded — confirm how the consumer loads it.
# VA-API hardware decode chain; double quotes inside single quotes keep the
# caps string "video/x-raw(memory:VAMemory)" intact when shell-sourced.
DECODE='h264parse ! vah264dec ! vapostproc ! "video/x-raw(memory:VAMemory)"'
OCR_DEVICE=GPU
# Zero-copy VA surface sharing between decode and inference (GPU path).
PRE_PROCESS=pre-process-backend=va-surface-sharing
# Object detection pipeline on NPU (testing heavy workload on NPU)
DEVICE=NPU
OBJECT_DETECTION_DEVICE=NPU
OBJECT_CLASSIFICATION_DEVICE=GPU
# Age prediction pipeline on GPU (lighter models back to GPU)
FACE_DETECTION_DEVICE=GPU
AGE_CLASSIFICATION_DEVICE=GPU
CLASSIFICATION_DEVICE=GPU
CLASSIFICATION_PRE_PROCESS=pre-process-backend=va-surface-sharing
# Batch sizes default to 1 but honour values already exported in the
# environment (requires shell sourcing — see note above).
BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
# NPU doesn't support GPU-specific options - leave empty for object detection
DETECTION_OPTIONS=""
# GPU classification options for object classification and age pipeline
# (presumably consumed as gvaclassify/gvadetect element properties — verify
# against the pipeline scripts that expand these variables).
CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"
FACE_DETECTION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2"
AGE_CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"
Loading