diff --git a/src/docker-compose-reg.yml b/src/docker-compose-reg.yml
index 9aa834e1..809f85ed 100644
--- a/src/docker-compose-reg.yml
+++ b/src/docker-compose-reg.yml
@@ -26,7 +26,7 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
-    image: iotgdevcloud/dlstreamer:latest
+    image: intel/pipeline-runner-asc:latest
     deploy:
       mode: replicated
       replicas: ${PIPELINE_COUNT:-1}
@@ -46,8 +46,15 @@ services:
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +62,4 @@ services:
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure 
+    restart: on-failure
diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index 849219af..f6abdd42 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -26,6 +26,9 @@ services:
       - ./start_stream.sh:/home/pipeline-server/src/start_stream.sh
 
   ClientGst:
+    # Note: Latency benchmarks were run using a locally-built image (pipeline-runner-asc:latest)
+    # based on DLStreamer 2025.0.1-ubuntu24 with Intel NPU drivers for Lunar Lake.
+    # Build with: make build (uses Dockerfile in this repo)
     image: dlstreamer:dev
     deploy:
       mode: replicated
@@ -46,8 +49,15 @@ services:
       - RTSP_SERVER=${RTSP_SERVER}
       - RTSP_PATH=${RTSP_PATH}
       - RENDER_MODE=${RENDER_MODE}
+      # Latency and inference configuration - allow shell overrides
+      - LOW_LATENCY=${LOW_LATENCY:-0}
+      - MEDIUM_LATENCY=${MEDIUM_LATENCY:-0}
+      - INFERENCE_INTERVAL=${INFERENCE_INTERVAL:-1}
+      - BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+      - BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
     volumes:
       - ${RESULTS_DIR:-../results}:/tmp/results
+      - ../performance-tools/sample-media:/home/pipeline-server/sample-media
       - ~/.Xauthority:/home/dlstreamer/.Xauthority
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ~/.cl-cache:/home/pipeline-server/.cl-cache
@@ -55,4 +65,4 @@ services:
       - ./pipelines:/home/pipeline-server/pipelines
       - ./extensions:/home/pipeline-server/extensions
       - ${RETAIL_USE_CASE_ROOT:-}/models:/home/pipeline-server/models
-    restart: on-failure
\ No newline at end of file
+    restart: on-failure
diff --git a/src/pipelines/obj_detection_age_prediction.sh b/src/pipelines/obj_detection_age_prediction.sh
index 1bf91442..cc91839b 100755
--- a/src/pipelines/obj_detection_age_prediction.sh
+++ b/src/pipelines/obj_detection_age_prediction.sh
@@ -19,7 +19,48 @@ OBJECT_DETECTION_DEVICE="${OBJECT_DETECTION_DEVICE:=$DEVICE}"
 OBJECT_CLASSIFICATION_DEVICE="${OBJECT_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
 FACE_DETECTION_DEVICE="${FACE_DETECTION_DEVICE:=$DEVICE}"
 AGE_CLASSIFICATION_DEVICE="${AGE_CLASSIFICATION_DEVICE:=$CLASSIFICATION_DEVICE}"
+# Support INT8 models for NPU compatibility
+FACE_DETECTION_MODEL="${FACE_DETECTION_MODEL:=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml}"
+AGE_PREDICTION_MODEL="${AGE_PREDICTION_MODEL:=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml}"
 PRE_PROCESS="${PRE_PROCESS:=""}"
+# Separate inference options for object detection and face detection pipelines
+# Use default only if variable is unset (not if it's empty string)
+if [ -z "${FACE_DETECTION_OPTIONS+x}" ]; then
+    FACE_DETECTION_OPTIONS="$DETECTION_OPTIONS"
+fi
+if [ -z "${AGE_CLASSIFICATION_OPTIONS+x}" ]; then
+    AGE_CLASSIFICATION_OPTIONS="$CLASSIFICATION_OPTIONS"
+fi
+
+# Queue optimization for low latency
+# Set LOW_LATENCY=1 to reduce queue sizes and minimize end-to-end latency (aggressive)
+# Set MEDIUM_LATENCY=1 for production-realistic settings (balanced latency vs robustness)
+# Set DROP_OLD_FRAMES=1 to always process most recent frames (drops old frames when queue is full)
+if [ "$LOW_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000 leaky=downstream"
+        echo "LOW-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=3, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=3 max-size-time=100000000"
+        echo "LOW-LATENCY MODE: Queue sizes optimized (max-size-buffers=3, max-size-time=0.1s)"
+    fi
+elif [ "$MEDIUM_LATENCY" == "1" ]; then
+    if [ "$DROP_OLD_FRAMES" == "1" ]; then
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000 leaky=downstream"
+        echo "MEDIUM-LATENCY MODE + DROP OLD FRAMES: Always processing most recent frames (max-size-buffers=10, max-size-time=0.5s, leaky=downstream)"
+    else
+        QUEUE_PARAMS="max-size-buffers=10 max-size-time=500000000"
+        echo "MEDIUM-LATENCY MODE: Production-realistic queue sizes (max-size-buffers=10, max-size-time=0.5s)"
+    fi
+else
+    QUEUE_PARAMS=""
+    echo "STANDARD MODE: Using default queue sizes"
+fi
+
+# Inference interval optimization
+# Set INFERENCE_INTERVAL to control frame processing (default=3, 1=every frame)
+INFERENCE_INTERVAL="${INFERENCE_INTERVAL:-3}"
+echo "INFERENCE INTERVAL: Processing every ${INFERENCE_INTERVAL} frame(s)"
 
 if [ "$RENDER_MODE" == "1" ]; then
     OUTPUT="gvawatermark ! videoconvert ! fpsdisplaysink video-sink=autovideosink text-overlay=false signal-fps-measurements=true name=obj_fps_sink"
@@ -35,23 +76,23 @@ fi
 
 echo "Running object detection pipeline on $DEVICE with detection batch size = $BATCH_SIZE_DETECT and classification batch size = $BATCH_SIZE_CLASSIFY"
 echo "Running age prediction pipeline on $AGE_PREDICTION_VIDEO"
-gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline)' gst-launch-1.0 --verbose \
+gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipeline+element)' gst-launch-1.0 --verbose \
 $inputsrc_oc1 ! $DECODE \
-! queue \
+! queue $QUEUE_PARAMS \
 ! gvadetect batch-size=$BATCH_SIZE_DETECT \
 model-instance-id=odmodel \
 name=object_detection \
 model=/home/pipeline-server/models/object_detection/yolo11n/INT8/yolo11n.xml \
 threshold=0.5 \
-inference-interval=3 \
+inference-interval=$INFERENCE_INTERVAL \
 scale-method=fast \
 device=$OBJECT_DETECTION_DEVICE \
 $PRE_PROCESS $DETECTION_OPTIONS \
-! queue \
+! queue $QUEUE_PARAMS \
 ! gvatrack \
 name=object_tracking \
 tracking-type=zero-term-imageless \
-! queue \
+! queue $QUEUE_PARAMS \
 ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
 model-instance-id=classifier \
 labels=/home/pipeline-server/models/object_classification/efficientnet-b0/INT8/imagenet_2012.txt \
@@ -62,45 +103,46 @@ gstLaunchCmd="GST_DEBUG=\"GST_TRACER:7\" GST_TRACERS='latency_tracer(flags=pipel
 inference-region=1 \
 object-class=object \
 reclassify-interval=1 \
-
+$CLASSIFICATION_PRE_PROCESS $CLASSIFICATION_OPTIONS \
 ! gvametaconvert \
 ! tee name=t_obj \
-t_obj. ! queue ! $OUTPUT \
-t_obj. ! queue ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
+t_obj. ! queue $QUEUE_PARAMS ! $OUTPUT \
+t_obj. ! queue $QUEUE_PARAMS ! gvametapublish name=obj_destination file-format=json-lines file-path=/tmp/results/rs_obj\$cid.jsonl ! fakesink sync=false async=false \
 \
 $inputsrc_ap1 ! $DECODE \
-! queue \
+! queue $QUEUE_PARAMS \
 ! gvadetect batch-size=$BATCH_SIZE_DETECT \
 model-instance-id=facemodel \
 name=face_detection \
-model=/home/pipeline-server/models/face_detection/FP16/face-detection-retail-0004.xml \
+model=$FACE_DETECTION_MODEL \
 model-proc=/home/pipeline-server/models/face_detection/face-detection-retail-0004.json \
-inference-interval=3 \
+inference-interval=$INFERENCE_INTERVAL \
 scale-method=fast \
 inference-region=full-frame \
 threshold=0.5 \
 device=$FACE_DETECTION_DEVICE \
-$PRE_PROCESS $DETECTION_OPTIONS \
-! queue \
+$PRE_PROCESS $FACE_DETECTION_OPTIONS \
+! queue $QUEUE_PARAMS \
 ! gvatrack \
 name=face_tracking \
 tracking-type=zero-term-imageless \
-! queue \
+! queue $QUEUE_PARAMS \
 ! gvaclassify batch-size=$BATCH_SIZE_CLASSIFY \
 model-instance-id=age_classifier \
-model=/home/pipeline-server/models/age_prediction/FP16/age-gender-recognition-retail-0013.xml \
+model=$AGE_PREDICTION_MODEL \
 model-proc=/home/pipeline-server/models/age_prediction/age-gender-recognition-retail-0013.json \
 device=$AGE_CLASSIFICATION_DEVICE \
 name=age_classification \
 inference-region=roi-list \
 object-class=face \
 reclassify-interval=1 \
-! queue \
+$AGE_CLASSIFICATION_OPTIONS \
+! queue $QUEUE_PARAMS \
 ! gvametaconvert \
 ! tee name=t \
-t. ! queue ! $AGE_OUTPUT \
-t. ! queue ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
+t. ! queue $QUEUE_PARAMS ! $AGE_OUTPUT \
+t. ! queue $QUEUE_PARAMS ! gvametapublish name=destination file-format=json-lines file-path=/tmp/results/rs_age\$cid.jsonl ! fakesink sync=false async=false \
 2>&1 | tee /tmp/results/gst-launch_\$cid.log \
 | (stdbuf -oL awk '
 BEGIN {
 
diff --git a/src/res/npu-gpu-flip.env b/src/res/npu-gpu-flip.env
new file mode 100644
index 00000000..0f97b3cb
--- /dev/null
+++ b/src/res/npu-gpu-flip.env
@@ -0,0 +1,20 @@
+DECODE='h264parse ! vah264dec ! vapostproc ! "video/x-raw(memory:VAMemory)"'
+OCR_DEVICE=GPU
+PRE_PROCESS=pre-process-backend=va-surface-sharing
+# Object detection pipeline on NPU (testing heavy workload on NPU)
+DEVICE=NPU
+OBJECT_DETECTION_DEVICE=NPU
+OBJECT_CLASSIFICATION_DEVICE=GPU
+# Age prediction pipeline on GPU (lighter models back to GPU)
+FACE_DETECTION_DEVICE=GPU
+AGE_CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_DEVICE=GPU
+CLASSIFICATION_PRE_PROCESS=pre-process-backend=va-surface-sharing
+BATCH_SIZE_DETECT=${BATCH_SIZE_DETECT:-1}
+BATCH_SIZE_CLASSIFY=${BATCH_SIZE_CLASSIFY:-1}
+# NPU doesn't support GPU-specific options - leave empty for object detection
+DETECTION_OPTIONS=""
+# GPU classification options for object classification and age pipeline
+CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"
+FACE_DETECTION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2"
+AGE_CLASSIFICATION_OPTIONS="ie-config=GPU_THROUGHPUT_STREAMS=2 nireq=2 reclassify-interval=1"