From 185dd508c28f5b7afe9766d630ee4b383038926c Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:55:30 -0700
Subject: [PATCH 1/5] Add configurable queue optimization and inference
 interval for low latency

- Add LOW_LATENCY and MEDIUM_LATENCY queue optimization modes
- Add configurable INFERENCE_INTERVAL (default=3, use 1 for every frame)
- Add separate inference options for face detection vs object detection
- Add support for INT8 model paths for NPU compatibility
---
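A hypothetical invocation exercising the new knobs (the entry point and
the INT8 model path below are illustrative, not taken from this commit):

    # Aggressive low-latency run: 3-buffer queues, drop stale frames,
    # run inference on every frame.
    export LOW_LATENCY=1 DROP_OLD_FRAMES=1 INFERENCE_INTERVAL=1
    ./src/pipelines/obj_detection_age_prediction.sh

    # Point the face pipeline at an INT8 model for NPU runs; adjust the
    # path to wherever the converted model actually lives.
    export FACE_DETECTION_MODEL=/home/pipeline-server/models/face_detection/INT8/face-detection-retail-0004.xml
    ./src/pipelines/obj_detection_age_prediction.sh
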
 src/pipelines/obj_detection_age_prediction.sh | 78 ++++++++++++++-----
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/src/pipelines/obj_detection_age_prediction.sh b/src/pipelines/obj_detection_age_prediction.sh
index 1bf91442..cc91839b 100755
--- a/src/pipelines/obj_detection_age_prediction.sh
+++ b/src/pipelines/obj_detection_age_prediction.sh
@@ -19,7 +19,48 @@ OBJECT_DETECTION_DEVICE="${OBJECT_DETECTION_DEVICE:=$DEVICE}"
 OBJECT_CLASSIFICATION_DEVICE="${OBJECT_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
 FACE_DETECTION_DEVICE="${FACE_DETECTION_DEVICE:=$DEVICE}"
 AGE_CLASSIFICATION_DEVICE="${AGE_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
+# Support INT8 models for NPU compatibility
+FACE_DETECTION_MODEL="${FACE_DETECTION_MODEL:=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml}"
+AGE_PREDICTION_MODEL="${AGE_PREDICTION_MODEL:=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml}"
 PRE_PROCESS="${PRE_PROCESS:=""}"
+# Separate inference options for object detection and face detection pipelines
+# Use default only if variable is unset (not if it's empty string)
+if [ -z "${FACE_DETECTION_OPTIONS+x}" ]; then
+    FACE_DETECTION_OPTIONS="$DETECTION_OPTIONS"
+fi
+if [ -z "${AGE_CLASSIFICATION_OPTIONS+x}" ]; then
+    AGE_CLASSIFICATION_OPTIONS="$CLASSIFICATION_OPTIONS"
+fi
+
+# Queue optimization for low latency
+# Set LOW_LATENCY=1 to reduce queue sizes and minimize end-to-end latency (aggressive)
+# Set MEDIUM_LATENCY=1 for production-realistic settings (balanced latency vs robustness)
+# Set DROP_OLD_FRAMES=1 to always process most recent frames (drops old frames when queue is full)
+if [ "$LOW_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000 leaky=downstream"
+        echo "LOW-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=3, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000"
+        echo "LOW-LATENCY MODE: Queue sizes optimized (max-size-buffers=3, max-size-time=0.1s)"
+    fi
+elif [ "$MEDIUM_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000 leaky=downstream"
+        echo "MEDIUM-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=10, max-size-time=0.5s, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000"
+        echo "MEDIUM-LATENCY MODE: Production-realistic queue sizes (max-size-buffers=10, max-size-time=0.5s)"
+    fi
+else
+    QUEUE_PARAMS=""
+    echo "STANDARD MODE: Using default queue sizes"
+fi
+
+# Inference interval optimization
+# Set INFERENCE_INTERVAL to control frame processing (default=3, 1=every frame)
+INFERENCE_INTERVAL="${INFERENCE_INTERVAL:-3}"
+echo "INFERENCE INTERVAL: Processing every ${INFERENCE_INTERVAL} frame(s)"
 
 if [ "$RENDER_MODE" == "1" ]; then
     OUTPUT="gvawatermark ! videoconvert ! fpsdisplaysink video-sink=autovideosink text-overlay=false signal-fps-measurements=true name=obj_fps_sink"
@@ -35,23 +76,23 @@ fi
 
 echo "Running object detection pipeline on $DEVICE with detection batch size = $BATCH_SIZE_DETECT and classification batch size = $BATCH_SIZE_CLASSIFY"
 echo "Running age prediction pipeline on $AGE_PREDICTION_VIDEO"
-gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline)' gst-launch-1.0 --verbose \
+gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline+element)' gst-launch-1.0 --verbose \
 $inputsrc_oc1 ! $DECODE \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvadetect batch-size=$BATCH_SIZE_DETECT \
     model-instance-id=odmodel \
     name=object_detection \
     model=/home/pipeline-server/models/object_detection/yolo11n/INT8/yolo11n.xml \
     threshold=0.5 \
-    inference-interval=3 \
+    inference-interval=$INFERENCE_INTERVAL \
     scale-method=fast \
     device=$OBJECT_DETECTION_DEVICE \
     $PRE_PROCESS $DETECTION_OPTIONS \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvatrack \
     name=object_tracking \
     tracking-type=zero-term-imageless \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
     model-instance-id=classifier \
     labels=/home/pipeline-server/models/object_classification/efficientnet-b0/INT8/imagenet_2012.txt \
@@ -62,45 +103,46 @@ gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipel
     inference-region=1 \
     object-class=object \
     reclassify-interval=1 \
-
+    $CLASSIFICATION_PRE_PROCESS $CLASSIFICATION_OPTIONS \
     ! gvametaconvert \
     ! tee name=t_obj \
-    t_obj. ! queue ! $OUTPUT \
-    t_obj. ! queue ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
+    t_obj. ! queue $QUEUE_PARAMS ! $OUTPUT \
+    t_obj. ! queue $QUEUE_PARAMS ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
     \
 $inputsrc_ap1 ! $DECODE \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvadetect batch-size=$BATCH_SIZE_DETECT \
     model-instance-id=facemodel \
     name=face_detection \
-    model=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml \
+    model=$FACE_DETECTION_MODEL \
     model-proc=/home/pipeline-server/models/face_detection/face-detection-retail-0004.json \
-    inference-interval=3 \
+    inference-interval=$INFERENCE_INTERVAL \
     scale-method=fast \
     inference-region=full-frame \
     threshold=0.5 \
     device=$FACE_DETECTION_DEVICE \
-    $PRE_PROCESS $DETECTION_OPTIONS \
-    ! queue \
+    $PRE_PROCESS $FACE_DETECTION_OPTIONS \
+    ! queue $QUEUE_PARAMS \
     ! gvatrack \
     name=face_tracking \
     tracking-type=zero-term-imageless \
-    ! queue \
+    ! queue $QUEUE_PARAMS \
     ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
     model-instance-id=age_classifier \
-    model=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml \
+    model=$AGE_PREDICTION_MODEL \
     model-proc=/home/pipeline-server/models/age_prediction/age-gender-recognition-retail-0013.json \
     device=$AGE_CLASSIFICATION_DEVICE \
     name=age_classification \
     inference-region=roi-list \
     object-class=face \
     reclassify-interval=1 \
-    ! queue \
+    $AGE_CLASSIFICATION_OPTIONS \
+    ! queue $QUEUE_PARAMS \
     ! gvametaconvert \
     ! tee name=t \
-    t. ! queue ! $AGE_OUTPUT \
-    t. ! queue ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
+    t. ! queue $QUEUE_PARAMS ! $AGE_OUTPUT \
+    t. ! queue $QUEUE_PARAMS ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
     2>&1 | tee /tmp/results/gst-launch_\$cid.log \
     | (stdbuf -oL awk '
     BEGIN {

From 3064ef38e74af84de19cc9c8940dc80298c5c0fc Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:56:46 -0700
Subject: [PATCH 2/5] Add environment variable passthrough for latency
 configuration

Allow shell environment variables to override .env file defaults for:
- LOW_LATENCY, MEDIUM_LATENCY
- INFERENCE_INTERVAL
- BATCH_SIZE_DETECT, BATCH_SIZE_CLASSIFY

Also add sample-media volume mount for benchmarking.
---
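Because docker compose substitutes ${VAR:-default} from the calling
shell, exported values now take precedence over the .env file. A
hypothetical one-off override (assuming compose is run from src/):

    cd src
    LOW_LATENCY=1 INFERENCE_INTERVAL=1 BATCH_SIZE_DETECT=1 \
        docker compose up -d
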
 src/docker-compose.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index 849219af..f431a432 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -26,7 +26,7 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: dlstreamer:dev
+    image: pipeline-runner-asc:latest
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}
@@ -46,8 +46,15 @@
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +62,4 @@
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure
\ No newline at end of file
+    restart: on-failure

From e48d52cadfdb2d16e6c5ef18148295114865babc Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:57:23 -0700
Subject: [PATCH 3/5] Add environment variable passthrough for latency
 configuration (registry version)

Allow shell environment variables to override .env file defaults for:
- LOW_LATENCY, MEDIUM_LATENCY
- INFERENCE_INTERVAL
- BATCH_SIZE_DETECT, BATCH_SIZE_CLASSIFY

Also add sample-media volume mount and update image name.
---
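Same mechanism as the previous commit; a hypothetical invocation only
swaps in the registry compose file:

    cd src
    MEDIUM_LATENCY=1 docker compose -f docker-compose-reg.yml up -d
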
 src/docker-compose-reg.yml | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/docker-compose-reg.yml b/src/docker-compose-reg.yml
index 9aa834e1..809f85ed 100644
--- a/src/docker-compose-reg.yml
+++ b/src/docker-compose-reg.yml
@@ -26,7 +26,7 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: iotgdevcloud/dlstreamer:latest
+    image: intel/pipeline-runner-asc:latest
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}
@@ -46,8 +46,15 @@
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +62,4 @@
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure
+    restart: on-failure

From fe356bd05eaae83b5c21cd266f968931e11de9b2 Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 11:57:41 -0700
Subject: [PATCH 4/5] Add NPU+GPU hybrid device configuration

New device configuration for Lunar Lake that runs:
- YOLO11n object detection on NPU
- EfficientNet classification on GPU with VA surface sharing
- Face detection and age classification on GPU

This configuration achieves sub-second latency while maximizing stream
density.
---
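One way to consume this env file is to export it into the shell before
compose performs variable substitution; a sketch, assuming the
repository root as the working directory:

    set -a                           # auto-export everything sourced next
    source src/res/npu-gpu-flip.env  # NPU detection, GPU classification
    set +a
    docker compose -f src/docker-compose.yml up -d
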
 src/res/npu-gpu-flip.env | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 src/res/npu-gpu-flip.env

diff --git a/src/res/npu-gpu-flip.env b/src/res/npu-gpu-flip.env
new file mode 100644
index 00000000..0f97b3cb
--- /dev/null
+++ b/src/res/npu-gpu-flip.env
@@ -0,0 +1,20 @@
+DECODE='h264parse ! vah264dec ! vapostproc ! "video/x-raw(memory:VAMemory)"'
+OCR_DEVICE=GPU
+PRE_PROCESS=pre-process-backend=va-surface-sharing
+# Object detection pipeline on NPU (testing heavy workload on NPU)
+DEVICE=NPU
+OBJECT_DETECTION_DEVICE=NPU
+OBJECT_CLASSIFICATION_DEVICE=GPU
+# Age prediction pipeline on GPU (lighter models back to GPU)
+FACE_DETECTION_DEVICE=GPU
+AGE_CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_PRE_PROCESS=pre-process-backend=va-surface-sharing
+BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
+# NPU doesn't support GPU-specific options - leave empty for object detection
+DETECTION_OPTIONS=""
+# GPU classification options for object classification and age pipeline
+CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"
+FACE_DETECTION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2"
+AGE_CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"

From 9ab8ee127fb2faa5457cfabc921f0d7af4aeb354 Mon Sep 17 00:00:00 2001
From: Joshua Cork
Date: Wed, 26 Nov 2025 12:34:06 -0700
Subject: [PATCH 5/5] Revert to original image name, add comment about
 tested configuration

Latency benchmarks were run using a locally-built image
(pipeline-runner-asc) based on DLStreamer 2025.0.1 with Intel NPU
drivers for Lunar Lake.
---
 src/docker-compose.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index f431a432..f6abdd42 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -26,7 +26,10 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: pipeline-runner-asc:latest
+    # Note: Latency benchmarks were run using a locally-built image (pipeline-runner-asc:latest)
+    # based on DLStreamer 2025.0.1-ubuntu24 with Intel NPU drivers for Lunar Lake.
+    # Build with: make build (uses Dockerfile in this repo)
+    image: dlstreamer:dev
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}