diff --git a/.cargo/config.toml b/.cargo/config.toml index 1dd4bae..d8a47e3 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -7,3 +7,10 @@ PKG_CONFIG_PATH = "/opt/homebrew/opt/ffmpeg@7/lib/pkgconfig" CPATH = "/opt/homebrew/opt/ffmpeg@7/include" LIBRARY_PATH = "/opt/homebrew/opt/ffmpeg@7/lib" BINDGEN_EXTRA_CLANG_ARGS = "-I/opt/homebrew/opt/ffmpeg@7/include" + +# +# Linux: Configure RPATH to find shared libraries in the executable's directory +# This allows the binary to correspond to libonnxruntime*.so in the same folder +# without needing LD_LIBRARY_PATH set manually. +[target.x86_64-unknown-linux-gnu] +rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"] diff --git a/Cargo.lock b/Cargo.lock index e900895..ff32fcd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -51,6 +51,12 @@ dependencies = [ "equator", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.21" @@ -570,6 +576,12 @@ dependencies = [ "syn", ] +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.14" @@ -686,6 +698,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foreign-types" version = "0.3.2" @@ -948,6 +966,17 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + [[package]] name = "heck" version = "0.5.0" @@ -1322,6 +1351,15 @@ dependencies = [ "imgref", ] +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + [[package]] name = "lzma-rust2" version = "0.15.4" @@ -2594,7 +2632,7 @@ checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "ultralytics-inference" -version = "0.0.7" +version = "0.0.8" dependencies = [ "ab_glyph", "bytemuck", @@ -2608,6 +2646,7 @@ dependencies = [ "image", "imageproc", "jpeg-decoder", + "lru", "minifb", "ndarray 0.16.1", "ndarray 0.17.1", @@ -2617,6 +2656,7 @@ dependencies = [ "tempfile", "ureq", "video-rs", + "wide", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6d6765c..f8713b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "ultralytics-inference" -version = "0.0.7" +version = "0.0.8" edition = "2024" authors = [ "Glenn Jocher ", @@ -50,6 +50,12 @@ image = "^0.25" jpeg-decoder = "^0.3" fast_image_resize = { version = "^5.5", features = ["image", "rayon"] } +# SIMD for fast preprocessing +wide = "0.7" + +# LRU cache for preprocessing LUT +lru = "0.12" + # Numerical computing (must match ort's ndarray version 0.17) ndarray = { version = "^0.17", features = ["rayon"] } @@ -74,6 +80,7 @@ bytemuck = { version = "^1.21", features = ["derive"] } clap = { version = "4.5.54", features = ["derive"] } colored = "3.0.0" 
+# Optional - Visualization and Video support minifb = { version = "^0.28.0", optional = true } video-rs = { version = "^0.10.5", features = ["ndarray"], optional = true } @@ -151,7 +158,7 @@ opt-level = 3 [profile.release] opt-level = 3 -lto = true +lto = "fat" codegen-units = 1 panic = "abort" strip = true diff --git a/README.md b/README.md index 3bf4622..9c42f79 100644 --- a/README.md +++ b/README.md @@ -77,20 +77,26 @@ cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --show -- # Save individual frames for video input cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --save-frames + +# Rectangular inference +cargo run --release -- predict --model yolo11n.onnx --source image.jpg --rect ``` ### Example Output ``` +# ultralytics-inference predict + +WARNING ⚠️ 'model' argument is missing. Using default 'model=yolo11n.onnx'. WARNING ⚠️ 'source' argument is missing. Using default images: https://ultralytics.com/images/bus.jpg, https://ultralytics.com/images/zidane.jpg -Ultralytics 0.0.7 🚀 Rust ONNX FP32 CPU +Ultralytics 0.0.8 🚀 Rust ONNX FP32 CPU Using ONNX Runtime CPUExecutionProvider YOLO11n summary: 80 classes, imgsz=(640, 640) -image 1/2 bus.jpg: 640x640 3 persons, 1 bus, 57.3ms -image 2/2 zidane.jpg: 640x640 2 persons, 1 tie, 52.9ms -Speed: 75.8ms preprocess, 55.1ms inference, 19.9ms postprocess per image at shape (1, 3, 640, 640) -Results saved to runs/detect/predict53 +image 1/2 /home/ultralytics/inference/bus.jpg: 640x480 640x480 4 persons, 1 bus, 36.4ms +image 2/2 /home/ultralytics/inference/zidane.jpg: 384x640 2 persons, 1 tie, 28.6ms +Speed: 1.5ms preprocess, 32.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640) +Results saved to runs/detect/predict1 💡 Learn more at https://docs.ultralytics.com/modes/predict ``` @@ -117,8 +123,11 @@ cargo run --release -- predict --model --source | `--source` | `-s` | Input source (image, video, webcam index, or URL) | `Task dependent Ultralytics URL assets` | | `--device` | | Device to use (cpu, cuda:0, mps, coreml, etc.) | `cpu` | | `--conf` | | Confidence threshold | `0.25` | -| `--iou` | | IoU threshold for NMS | `0.45` | +| `--iou` | | IoU threshold for NMS | `0.7` | +| `--max-det` | | Maximum number of detections | `300` | | `--imgsz` | | Inference image size | `Model metadata` | +| `--rect` | | Enable rectangular inference (minimal padding) | `true` | +| `--batch` | | Batch size for inference | `1` | | `--half` | | Use FP16 half-precision inference | `false` | | `--save` | | Save annotated results to runs//predict | `true` | | `--save-frames` | | Save individual frames for video | `false` | @@ -183,7 +192,7 @@ fn main() -> Result<(), Box> { let config = InferenceConfig::new() .with_confidence(0.5) .with_iou(0.45) - .with_max_det(100); + .with_max_det(300); let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?; let results = model.predict("image.jpg")?; @@ -236,16 +245,25 @@ inference/ │ ├── main.rs # CLI application │ ├── model.rs # YOLOModel - ONNX session and inference │ ├── results.rs # Results, Boxes, Masks, Keypoints, Probs, Obb -│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize) -│ ├── postprocessing.rs # Detection post-processing (NMS, decode) +│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize, SIMD) +│ ├── postprocessing.rs # Detection post-processing (NMS, decode, SIMD) │ ├── metadata.rs # ONNX model metadata parsing -│ ├── source.rs # Input source handling -│ ├── task.rs # Task enum (Detect, Segment, Pose, etc.) 
+│ ├── source.rs # Input source handling (images, video, webcam) +│ ├── task.rs # Task enum (Detect, Segment, Pose, Classify, Obb) │ ├── inference.rs # InferenceConfig +│ ├── batch.rs # Batch processing pipeline +│ ├── device.rs # Device enum (CPU, CUDA, MPS, CoreML, etc.) │ ├── download.rs # Model and asset downloading -│ ├── visualizer/ # Visualization tools (Viewer) +│ ├── annotate.rs # Image annotation (bounding boxes, masks, keypoints) +│ ├── io.rs # Result saving (images, videos) +│ ├── logging.rs # Logging macros │ ├── error.rs # Error types -│ └── utils.rs # Utility functions (NMS, IoU) +│ ├── utils.rs # Utility functions (NMS, IoU) +│ ├── cli/ # CLI module +│ │ ├── mod.rs # CLI module exports +│ │ ├── args.rs # CLI argument parsing +│ │ └── predict.rs # Predict command implementation +│ └── visualizer/ # Real-time visualization (minifb) ├── tests/ │ └── integration_test.rs # Integration tests ├── assets/ # Test images @@ -300,13 +318,16 @@ One of the key benefits of this library is **minimal dependencies** - no PyTorch ### Core Dependencies (always included) -| Crate | Purpose | -| ------------------- | ----------------------- | -| `ort` | ONNX Runtime bindings | -| `ndarray` | N-dimensional arrays | -| `image` | Image loading/decoding | -| `fast_image_resize` | SIMD-optimized resizing | -| `half` | FP16 support | +| Crate | Purpose | +| ------------------- | ------------------------------- | +| `ort` | ONNX Runtime bindings | +| `ndarray` | N-dimensional arrays | +| `image` | Image loading/decoding | +| `jpeg-decoder` | JPEG decoding | +| `fast_image_resize` | SIMD-optimized resizing | +| `half` | FP16 support | +| `lru` | LRU cache for preprocessing LUT | +| `wide` | SIMD for fast preprocessing | ### Optional Dependencies (for `--save` feature) @@ -372,16 +393,18 @@ ONNX Runtime threading is set to auto (`num_threads: 0`) which lets ORT choose o - [x] Detection, Segmentation, Pose, Classification, OBB inference - [x] ONNX model metadata parsing (auto-detect classes, task, imgsz) +- [x] Hardware acceleration support (CUDA, TensorRT, CoreML, OpenVINO, XNNPACK) - [x] Ultralytics-compatible Results API (`Boxes`, `Masks`, `Keypoints`, `Probs`, `Obb`) - [x] Multiple input sources (images, directories, globs, URLs) - [x] Video file support and webcam/RTSP streaming - [x] Image annotation and visualization - [x] FP16 half-precision inference +- [x] Batch inference support +- [x] Rectangular inference support and optimization ### In Progress - [ ] Python bindings (PyO3) -- [ ] Batch inference optimization - [ ] WebAssembly (WASM) support for browser inference ## 💡 Contributing diff --git a/src/cli/args.rs b/src/cli/args.rs index 945d050..1d9a261 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -11,23 +11,27 @@ use clap::{Args, Parser, Subcommand}; --model, -m Path to ONNX model file [default: yolo11n.onnx] --source, -s Input source (image, directory, glob, video, webcam, or URL) --conf Confidence threshold [default: 0.25] - --iou IoU threshold for NMS [default: 0.45] + --iou IoU threshold for NMS [default: 0.7] + --max-det Maximum number of detections [default: 300] --imgsz Inference image size + --rect Enable rectangular inference (minimal padding) [default: true] + --batch Batch size for inference [default: 1] --half Use FP16 half-precision inference - --save Save annotated images to runs//predict + --save Save annotated images to runs//predict [default: true] --save-frames Save individual frames for video input (instead of video file) --show Display results in a window - --device 
Device (cpu, cuda:0, mps, coreml, directml:0, openvino, xnnpack) - --verbose Show verbose output + --device Device (cpu, cuda:0, mps, coreml, directml:0, openvino, tensorrt:0, xnnpack) + --verbose Show verbose output [default: true] Examples: + ultralytics-inference predict ultralytics-inference predict --model yolo11n.onnx --source image.jpg - ultralytics-inference predict --model yolo11n.onnx --source video.mp4 - ultralytics-inference predict --model yolo11n.onnx --source video.mp4 --save-frames - ultralytics-inference predict --model yolo11n.onnx --source 0 --conf 0.5 - ultralytics-inference predict -m yolo11n.onnx -s assets/ --save --half - ultralytics-inference predict -m yolo11n.onnx -s video.mp4 --imgsz 1280 --show - ultralytics-inference predict --model yolo11n.onnx --source image.jpg --device mps"#)] + ultralytics-inference predict --source video.mp4 --rect + ultralytics-inference predict --source video.mp4 --save-frames + ultralytics-inference predict --source 0 --conf 0.5 --show + ultralytics-inference predict --source assets/ --save --half + ultralytics-inference predict --source image.jpg --device cuda:0 + ultralytics-inference predict --source image.jpg --device mps"#)] pub struct Cli { #[command(subcommand)] /// Subcommand to execute. @@ -58,7 +62,7 @@ pub struct PredictArgs { pub conf: f32, /// `IoU` threshold for NMS - #[arg(long, default_value_t = 0.45)] + #[arg(long, default_value_t = 0.7)] pub iou: f32, /// Maximum number of detections @@ -69,6 +73,10 @@ pub struct PredictArgs { #[arg(long)] pub imgsz: Option, + /// Enable minimal padding (rectangular inference) + #[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)] + pub rect: bool, + /// Batch size for inference #[arg(long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))] pub batch: u32, @@ -78,7 +86,7 @@ pub struct PredictArgs { pub half: bool, /// Save annotated images to runs//predict - #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + #[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)] pub save: bool, /// Save individual frames for video input (instead of video file) @@ -115,7 +123,8 @@ mod tests { Commands::Predict(predict_args) => { assert_eq!(predict_args.model, "yolo11n.onnx"); assert!((predict_args.conf - 0.25).abs() < f32::EPSILON); - assert!((predict_args.iou - 0.45).abs() < f32::EPSILON); + assert!((predict_args.iou - 0.7).abs() < f32::EPSILON); + assert!(predict_args.rect); assert_eq!(predict_args.max_det, 300); assert!(!predict_args.half); assert!(predict_args.verbose); diff --git a/src/cli/predict.rs b/src/cli/predict.rs index eb43c80..6559211 100644 --- a/src/cli/predict.rs +++ b/src/cli/predict.rs @@ -14,6 +14,8 @@ use crate::annotate::{annotate_image, find_next_run_dir}; #[cfg(feature = "visualize")] use crate::visualizer::Viewer; +#[cfg(feature = "visualize")] +use image::GenericImageView; use crate::utils::pluralize; use crate::{InferenceConfig, Results, VERSION, YOLOModel}; @@ -29,7 +31,8 @@ use crate::{error, verbose, warn}; clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss, - clippy::missing_panics_doc + clippy::missing_panics_doc, + clippy::redundant_clone )] pub fn run_prediction(args: &PredictArgs) { // Parse arguments @@ -49,9 +52,14 @@ pub fn run_prediction(args: &PredictArgs) { .map(|d| d.parse().expect("Invalid device")); #[cfg(feature = "visualize")] let show = args.show; - // 
Use defaults with warnings if not specified - // Clap handles default model path, so model_path is always set. + // Warn if using default model (like Python does) + if model_path == crate::download::DEFAULT_MODEL && verbose { + warn!( + "'model' argument is missing. Using default '--model={}'.", + crate::download::DEFAULT_MODEL + ); + } // Load model first so we can determine appropriate default source based on task let mut config = InferenceConfig::new() @@ -59,8 +67,9 @@ pub fn run_prediction(args: &PredictArgs) { .with_iou(iou_threshold) .with_half(half) .with_batch(batch_size) - .with_max_det(args.max_det) - .with_save_frames(save_frames); + .with_save_frames(save_frames) + .with_rect(args.rect) + .with_max_det(args.max_det); // Apply imgsz if specified if let Some(sz) = imgsz { @@ -181,14 +190,6 @@ pub fn run_prediction(args: &PredictArgs) { process::exit(1); } - let iter = match crate::source::SourceIterator::new(source) { - Ok(iter) => iter, - Err(e) => { - error!("Error initializing source: {e}"); - process::exit(1); - } - }; - // Process each image/frame let mut all_results: Vec<(String, Results)> = Vec::new(); let mut total_preprocess = 0.0; @@ -207,6 +208,29 @@ pub fn run_prediction(args: &PredictArgs) { #[cfg(not(feature = "annotate"))] let mut result_saver: Option = None; + // Create a bounded channel for pipelined processing + // Buffer size 2x batch size ensures we can decode the next batch while processing current one + let channel_capacity = batch_size * 2; + let (sender, receiver) = std::sync::mpsc::sync_channel(channel_capacity); + + // Spawn producer thread for frame decoding + let source_clone = source.clone(); + std::thread::spawn(move || { + let iter = match crate::source::SourceIterator::new(source_clone) { + Ok(iter) => iter, + Err(e) => { + error!("Error initializing source in thread: {e}"); + return; + } + }; + + for item in iter { + if sender.send(item).is_err() { + break; // Receiver dropped, stop decoding + } + } + }); + // Use BatchProcessor for centralized batch management { let mut batch_processor = BatchProcessor::new( @@ -240,8 +264,8 @@ pub fn run_prediction(args: &PredictArgs) { meta.frame_idx + 1, total_frames_str, image_path, - inference_shape.1, inference_shape.0, + inference_shape.1, detection_summary, result.speed.inference.unwrap_or(0.0) ); @@ -251,8 +275,8 @@ pub fn run_prediction(args: &PredictArgs) { meta.frame_idx + 1, total_frames_str, image_path, - inference_shape.1, inference_shape.0, + inference_shape.1, detection_summary, result.speed.inference.unwrap_or(0.0) ); @@ -272,8 +296,9 @@ pub fn run_prediction(args: &PredictArgs) { #[cfg(feature = "visualize")] if show { - let view_width = inference_shape.1 as usize; - let view_height = inference_shape.0 as usize; + let (orig_w, orig_h) = img.dimensions(); + let view_width = orig_w as usize; + let view_height = orig_h as usize; if let Some(ref v) = viewer && (v.width != view_width || v.height != view_height) @@ -290,14 +315,13 @@ pub fn run_prediction(args: &PredictArgs) { if let Some(ref mut v) = viewer { let annotated = annotate_image(img, &result, None); - let resized = annotated.resize_exact( - view_width as u32, - view_height as u32, - image::imageops::FilterType::Triangle, - ); - if v.update(&resized).is_ok() && !is_video { - let _ = v.wait(Duration::from_millis(200)); + if v.update(&annotated).is_ok() { + // Main thread is blocking on channel, so visualizer wait is less critical + // but we keep a small wait to allow window events processing + if !is_video { + let _ = 
v.wait(Duration::from_millis(200)); + } } } } @@ -305,14 +329,14 @@ pub fn run_prediction(args: &PredictArgs) { total_preprocess += result.speed.preprocess.unwrap_or(0.0); total_inference += result.speed.inference.unwrap_or(0.0); total_postprocess += result.speed.postprocess.unwrap_or(0.0); - all_results.push((image_path.clone(), result)); } } }, ); - for item in iter { + // Main thread: consume frames from channel and run inference + for item in receiver { let (img, meta) = match item { Ok(val) => val, Err(e) => { @@ -335,10 +359,11 @@ pub fn run_prediction(args: &PredictArgs) { // Print speed summary with inference tensor shape (after letterboxing) let num_results = all_results.len().max(1) as f64; verbose!( - "Speed: {:.1}ms preprocess, {:.1}ms inference, {:.1}ms postprocess per image at shape (1, 3, {}, {})", + "Speed: {:.1}ms preprocess, {:.1}ms inference, {:.1}ms postprocess per image at shape ({}, 3, {}, {})", total_preprocess / num_results, total_inference / num_results, total_postprocess / num_results, + batch_size, last_inference_shape.0, last_inference_shape.1 ); diff --git a/src/download.rs b/src/download.rs index f0db10d..0f7654b 100644 --- a/src/download.rs +++ b/src/download.rs @@ -382,17 +382,31 @@ pub fn download_image(url: &str) -> Result { let filename = url.rsplit('/').next().unwrap_or("image.jpg"); let dest_path = Path::new(filename); + // Get absolute path for display consistency with Python + let abs_path = dest_path + .canonicalize() + .or_else(|_| std::env::current_dir().map(|p| p.join(filename))) + .map_or_else( + |_| filename.to_string(), + |p| p.to_string_lossy().to_string(), + ); + // Skip download if file already exists if dest_path.exists() { - eprintln!("Image already exists: {filename}"); - return Ok(filename.to_string()); + return Ok(abs_path); } eprintln!("Downloading {url}..."); download_file(url, dest_path)?; - Ok(filename.to_string()) + // Get absolute path after download + let abs_path = dest_path.canonicalize().map_or_else( + |_| filename.to_string(), + |p| p.to_string_lossy().to_string(), + ); + + Ok(abs_path) } /// Download multiple images from URLs to the current directory. diff --git a/src/inference.rs b/src/inference.rs index 0db5033..142ebb9 100644 --- a/src/inference.rs +++ b/src/inference.rs @@ -20,7 +20,7 @@ /// let config = InferenceConfig::new() /// .with_confidence(0.5) /// .with_iou(0.45) -/// .with_max_det(100) +/// .with_max_det(300) /// .with_imgsz(640, 640); /// ``` /// @@ -33,6 +33,7 @@ /// .with_device(Device::Cuda(0)); /// ``` #[derive(Debug, Clone)] +#[allow(clippy::struct_excessive_bools)] pub struct InferenceConfig { /// Confidence threshold for detections (0.0 to 1.0). /// Detections with confidence scores lower than this value will be discarded. @@ -65,6 +66,9 @@ pub struct InferenceConfig { /// Whether to save individual frames instead of a video file when input is video. /// Defaults to `false` (save as video). pub save_frames: bool, + /// Whether to use minimal padding (rectangular inference). + /// Defaults to `true` to match Ultralytics Python. + pub rect: bool, } impl Default for InferenceConfig { @@ -80,6 +84,7 @@ impl Default for InferenceConfig { device: None, save: true, save_frames: false, + rect: true, } } } @@ -267,6 +272,21 @@ impl InferenceConfig { self.save_frames = save_frames; self } + + /// Set whether to use minimal padding (rectangular inference). + /// + /// # Arguments + /// + /// * `rect` - `true` to enable, `false` to disable. + /// + /// # Returns + /// + /// * The modified `InferenceConfig`. 
+ #[must_use] + pub const fn with_rect(mut self, rect: bool) -> Self { + self.rect = rect; + self + } } #[cfg(test)] @@ -286,13 +306,13 @@ mod tests { let config = InferenceConfig::new() .with_confidence(0.5) .with_iou(0.6) - .with_max_det(100) + .with_max_det(300) .with_imgsz(640, 640) .with_threads(8); assert!((config.confidence_threshold - 0.5).abs() < f32::EPSILON); assert!((config.iou_threshold - 0.6).abs() < f32::EPSILON); - assert_eq!(config.max_det, 100); + assert_eq!(config.max_det, 300); assert_eq!(config.imgsz, Some((640, 640))); assert_eq!(config.num_threads, 8); } diff --git a/src/lib.rs b/src/lib.rs index 8210a16..62fdcd2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -172,7 +172,7 @@ //! let config = InferenceConfig::new() //! .with_confidence(0.5) // Confidence threshold //! .with_iou(0.45) // NMS IoU threshold -//! .with_max_det(100) // Max detections per image +//! .with_max_det(300) // Max detections per image //! .with_imgsz(640, 640); // Input image size //! ``` //! diff --git a/src/main.rs b/src/main.rs index 6d9aed9..0ce9b0d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,9 +25,14 @@ use ultralytics_inference::cli::predict::run_prediction; use ultralytics_inference::logging::set_verbose; /// Entry point for the Ultralytics YOLO Inference CLI. -fn main() { +#[allow(clippy::unnecessary_wraps)] +fn main() -> Result<(), Box> { ultralytics_inference::io::init_logging(); + // Initialize ONNX Runtime with verbose logging to debug execution provider issues + #[cfg(debug_assertions)] + let _ = ort::init().commit(); + let cli = Cli::parse(); match &cli.command { @@ -36,4 +41,5 @@ fn main() { run_prediction(args); } } + Ok(()) } diff --git a/src/metadata.rs b/src/metadata.rs index cd0a2c8..9926b3b 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -91,7 +91,6 @@ impl ModelMetadata { /// /// Returns an error if the YAML is malformed or missing required fields. pub fn from_yaml_str(yaml_str: &str) -> Result { - // Parse YAML manually to avoid serde_yaml dependency complexity let mut metadata = Self::default(); for line in yaml_str.lines() { diff --git a/src/model.rs b/src/model.rs index 598935b..1751165 100644 --- a/src/model.rs +++ b/src/model.rs @@ -10,7 +10,7 @@ use std::path::Path; use std::time::Instant; use half::f16; -use image::DynamicImage; +use image::{DynamicImage, GenericImageView}; use ndarray::Array3; use ort::session::Session; use ort::tensor::TensorElementType; @@ -23,7 +23,8 @@ use crate::inference::InferenceConfig; use crate::metadata::ModelMetadata; use crate::postprocessing::postprocess; use crate::preprocessing::{ - image_to_array, preprocess_image_center_crop, preprocess_image_with_precision, + calculate_rect_size, image_to_array, preprocess_image_center_crop, + preprocess_image_with_precision, }; use crate::results::{Results, Speed}; use crate::task::Task; @@ -60,6 +61,8 @@ pub struct YOLOModel { fp16_input: bool, /// Execution provider used for inference execution_provider: String, + /// Whether the model accepts dynamic input shapes. + is_dynamic: bool, } #[allow( @@ -67,7 +70,12 @@ pub struct YOLOModel { clippy::needless_pass_by_value, clippy::missing_errors_doc, clippy::missing_panics_doc, - clippy::cast_possible_truncation + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::if_not_else, + clippy::manual_is_multiple_of, + clippy::cast_sign_loss, + clippy::cast_precision_loss )] impl YOLOModel { /// Load a YOLO model from an ONNX file. 
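For context, a minimal usage sketch of the new `with_rect` toggle, composed with the builder methods from the README's library example; the import path, error type, and file names mirror that example, and the snippet is illustrative only.

```rust
use ultralytics_inference::{InferenceConfig, YOLOModel};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Rectangular inference defaults to true; shown explicitly for clarity.
    let config = InferenceConfig::new()
        .with_confidence(0.5)
        .with_iou(0.7)
        .with_max_det(300)
        .with_rect(true);

    let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?;
    let _results = model.predict("image.jpg")?;
    Ok(())
}
```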
@@ -266,10 +274,16 @@ impl YOLOModel { } if !eps.is_empty() { + crate::info!( + "Registering {} execution providers (primary: {})", + eps.len(), + provider_name + ); session_builder = session_builder.with_execution_providers(eps).map_err(|e| { InferenceError::ModelLoadError(format!("Failed to set execution providers: {e}")) })?; } + // CPU is the default - no warning needed when no accelerators are registered let session = session_builder .with_optimization_level(ort::session::builder::GraphOptimizationLevel::Level3) @@ -381,6 +395,7 @@ impl YOLOModel { warmed_up: false, fp16_input, execution_provider: provider_name.to_string(), + is_dynamic, }; // Warmup inference to trigger JIT compilation and memory allocation @@ -589,18 +604,60 @@ impl YOLOModel { .or(self.metadata.imgsz) .unwrap_or((640, 640)); + // Check if target_size is divisible by stride (one-time warning logic per batch call) + // We only warn if the configured size itself is not divisible. + // If rect adjusts it, that's expected. + let stride = self.metadata.stride as usize; + if target_size.0 % stride != 0 || target_size.1 % stride != 0 { + warn!( + "WARNING ⚠️ imgsz=[{:?}] must be multiple of max stride {}, updating to [{}, {}]", + target_size, + stride, + (target_size.0 as f32 / stride as f32).ceil() as usize * stride, + (target_size.1 as f32 / stride as f32).ceil() as usize * stride + ); + } + // Preprocess all images let start_preprocess = Instant::now(); let mut preprocessed_results = Vec::with_capacity(images.len()); + // Check if we can use rect inference + // 1. Enabled in config + // 2. Model supports dynamic shapes + // 3. Batch is homogeneous (all images have same dimensions) or batch size is 1 + let use_rect = self.config.rect && self.is_dynamic; + let uniform_shape = if images.len() > 1 { + let first_dims = images[0].dimensions(); + images.iter().all(|img| img.dimensions() == first_dims) + } else { + true + }; + let actual_rect = use_rect && uniform_shape; + + // Warn if rect requested but disabled due to mixed batch + if self.config.rect && !uniform_shape { + warn!( + "Batch contains images of different sizes. Rectangular inference disabled for this batch (falling back to square padding)." 
+ ); + } + // We will stack tensors later for image in images { + // Determine target size for this image + let current_target_size = if actual_rect { + let (w, h) = image.dimensions(); + calculate_rect_size(w, h, target_size, self.metadata.stride) + } else { + target_size + }; + let res = if self.metadata.task == Task::Classify { - preprocess_image_center_crop(image, target_size, self.fp16_input) + preprocess_image_center_crop(image, current_target_size, self.fp16_input) } else { preprocess_image_with_precision( image, - target_size, + current_target_size, self.metadata.stride, self.fp16_input, ) @@ -623,6 +680,7 @@ impl YOLOModel { .view(), ); } + // Concatenate along batch dimension (axis 0) let batch_tensor = ndarray::concatenate(ndarray::Axis(0), &arrays).map_err(|e| { InferenceError::InferenceError(format!("Failed to concatenate FP16 tensors: {e}")) @@ -658,52 +716,48 @@ impl YOLOModel { #[allow(clippy::cast_precision_loss)] let inference_time = start_inference.elapsed().as_secs_f64() * 1000.0 / images.len() as f64; + // Process each image's output + let mut image_arrays = Vec::with_capacity(images.len()); + for image in images { + image_arrays.push(image_to_array(image)); + } + // Post-process let start_postprocess = Instant::now(); let mut batch_results = Vec::with_capacity(images.len()); // Process each image's output - for (i, image) in images.iter().enumerate() { + for (i, (orig_img, preprocess_res)) in image_arrays + .into_iter() + .zip(preprocessed_results.into_iter()) + .enumerate() + { + let path = paths.get(i).cloned().unwrap_or_default(); + let speed = Speed::new(preprocess_time, inference_time, 0.0); + // Construct outputs for this single image let mut img_outputs = Vec::new(); for (data, shape) in &outputs { - // Calculate size of one image's output let batch_size = shape[0]; let actual_batch_size = if batch_size > 0 { batch_size } else { 1 }; - let total_elements = data.len(); let elements_per_img = total_elements / actual_batch_size; - let start = i * elements_per_img; let end = start + elements_per_img; - - if start >= total_elements || end > total_elements { - return Err(InferenceError::InferenceError(format!( - "Index out of bounds slicing output data: range {start}..{end} with length {total_elements}" - ))); - } - let img_data = data[start..end].to_vec(); - - // Adjust shape for single image: [1, ...] + let img_data = &data[start..end]; let mut img_shape = shape.clone(); img_shape[0] = 1; - img_outputs.push((img_data, img_shape)); } - let orig_img = image_to_array(image); - let path = paths.get(i).cloned().unwrap_or_default(); - - let speed = Speed::new(preprocess_time, inference_time, 0.0); - - let tensor_shape = preprocessed_results[i].tensor.shape(); + let tensor_shape = preprocess_res.tensor.shape(); let inference_shape = (tensor_shape[2] as u32, tensor_shape[3] as u32); let result = postprocess( img_outputs, self.metadata.task, - &preprocessed_results[i], + &preprocess_res, &self.config, &self.metadata.names, orig_img, @@ -711,6 +765,7 @@ impl YOLOModel { speed, inference_shape, ); + batch_results.push(vec![result]); } diff --git a/src/postprocessing.rs b/src/postprocessing.rs index 1b6d816..479e14b 100644 --- a/src/postprocessing.rs +++ b/src/postprocessing.rs @@ -5,11 +5,23 @@ //! This module handles task-specific post-processing of raw model outputs, //! including NMS, coordinate transformation, and result construction. 
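As a reference for the SIMD NMS introduced further down in this file, a scalar sketch of the per-pair test each `f32x8` lane performs; the helper name is hypothetical, and the formula mirrors the scalar tail of the new suppression loop.

```rust
/// Scalar IoU between two xyxy boxes; a box whose IoU with an already-kept,
/// same-class box exceeds the threshold is suppressed, as in the vectorized loop.
fn iou_xyxy(a: [f32; 4], b: [f32; 4]) -> f32 {
    let iw = (a[2].min(b[2]) - a[0].max(b[0])).max(0.0);
    let ih = (a[3].min(b[3]) - a[1].max(b[1])).max(0.0);
    let inter = iw * ih;
    let area_a = (a[2] - a[0]) * (a[3] - a[1]);
    let area_b = (b[2] - b[0]) * (b[3] - b[1]);
    inter / (area_a + area_b - inter)
}
```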
+#![allow( + unsafe_code, + clippy::doc_markdown, + clippy::too_many_lines, + clippy::if_not_else, + clippy::ptr_as_ptr, + clippy::cast_possible_truncation, + clippy::cast_sign_loss +)] + use std::collections::HashMap; +use wide::{CmpGt, f32x8}; + use fast_image_resize::images::Image; use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer}; -use ndarray::{Array2, Array3, ArrayView2, Zip, s}; +use ndarray::{Array2, Array3, ArrayView1, ArrayViewMut2, Zip, s}; use crate::inference::InferenceConfig; use crate::preprocessing::{PreprocessResult, clip_coords, scale_coords}; @@ -41,7 +53,7 @@ use crate::utils::{nms_per_class, nms_rotated_per_class}; clippy::implicit_hasher )] pub fn postprocess( - outputs: Vec<(Vec, Vec)>, + outputs: Vec<(&[f32], Vec)>, task: Task, preprocess: &PreprocessResult, config: &InferenceConfig, @@ -113,23 +125,7 @@ pub fn postprocess( /// Post-process detection model output. /// -/// Converts raw YOLO model output into a list of bounding boxes with class scores. -/// -/// # Arguments -/// -/// * `output` - Flat vector of model output values. -/// * `output_shape` - Shape of the output tensor. -/// * `preprocess` - Preprocessing metadata (scaling, padding). -/// * `config` - Inference configuration (thresholds). -/// * `names` - Class ID to name mapping. -/// * `orig_img` - Original image data. -/// * `path` - Source image path. -/// * `speed` - Timing metrics. -/// * `inference_shape` - Input shape used for inference. -/// -/// # Returns -/// -/// `Results` struct containing detected bounding boxes. +/// Zero-copy implementation using stride-based indexing to avoid memory allocations. #[allow( clippy::too_many_arguments, clippy::similar_names, @@ -156,24 +152,15 @@ fn postprocess_detect( return results; } - // Convert flat output to 2D array - let output_2d = if is_transposed { - // Shape is [1, num_preds, num_features] - already in correct format - Array2::from_shape_vec((num_predictions, 4 + num_classes), output.to_vec()) - .unwrap_or_else(|_| Array2::zeros((0, 0))) - } else { - // Shape is [1, num_features, num_preds] - need to transpose - let arr = Array2::from_shape_vec((4 + num_classes, num_predictions), output.to_vec()) - .unwrap_or_else(|_| Array2::zeros((0, 0))); - arr.t().to_owned() - }; - - if output_2d.is_empty() { - return results; - } - - // Extract boxes and scores - let boxes_data = extract_detect_boxes(output_2d.view(), num_classes, preprocess, config); + // Zero-copy extraction with stride-based indexing + let boxes_data = extract_detect_boxes( + output, + num_classes, + num_predictions, + is_transposed, + preprocess, + config, + ); if !boxes_data.is_empty() { results.boxes = Some(Boxes::new(boxes_data, preprocess.orig_shape)); @@ -241,93 +228,261 @@ fn parse_detect_shape(shape: &[usize], expected_classes: usize) -> (usize, usize } } -/// Extract detection boxes from model output. +/// Ultra-fast detection extraction - single-threaded tight loop. +/// +/// Key optimizations: +/// - No parallelization overhead (Rayon adds ~0.5ms for small workloads) +/// - Pre-sized allocations +/// - Minimal branching in hot loops +/// - Direct unsafe indexing +#[allow(clippy::cast_precision_loss, clippy::too_many_arguments)] +#[derive(Clone, Copy)] +struct Candidate { + bbox: [f32; 4], + score: f32, + class: usize, +} + +/// Optimized detection extraction with SIMD acceleration. /// -/// Filters predictions by confidence threshold and converts coordinates to original image space. 
-#[allow(clippy::cast_precision_loss, clippy::needless_pass_by_value)] +/// Key optimizations: +/// - SIMD-accelerated candidate extraction (f32x8) +/// - Parallel Bitmask NMS (IoU 1 vs 8) +/// - Struct-of-Arrays (SoA) layout for NMS cache locality +/// - Direct unsafe indexing for performance +#[allow(clippy::cast_precision_loss, clippy::too_many_arguments)] fn extract_detect_boxes( - output: ArrayView2, - _num_classes: usize, + output: &[f32], + num_classes: usize, + num_predictions: usize, + is_transposed: bool, preprocess: &PreprocessResult, config: &InferenceConfig, ) -> Array2 { - let _num_predictions = output.nrows(); - let mut candidates = Vec::new(); - - // Iterate over rows efficiently - // output shape is (num_predictions, 4 + num_classes) - // We can iterate over raw elements if we are careful, but using outer_iter() is safer and still fast - - // Pre-calculate scaling factors to avoid repeated struct access + let feat_count = 4 + num_classes; let (scale_y, scale_x) = preprocess.scale; let (pad_top, pad_left) = preprocess.padding; let orig_shape = preprocess.orig_shape; let (max_w, max_h) = (orig_shape.1 as f32, orig_shape.0 as f32); + let conf_thresh = config.confidence_threshold; + let max_det = config.max_det; + let iou_thresh = config.iou_threshold; + let conf_v = f32x8::splat(conf_thresh); + + let mut candidates: Vec = Vec::with_capacity(256); + + // Candidate Extraction + if !is_transposed { + // Layout [feat, pred] - Cache-friendly linear scan + let mut max_scores = vec![conf_thresh; num_predictions]; + let mut max_classes = vec![0usize; num_predictions]; + + for c in 0..num_classes { + let offset = (4 + c) * num_predictions; + let class_scores = &output[offset..offset + num_predictions]; + for (idx, &score) in class_scores.iter().enumerate() { + if score > max_scores[idx] { + max_scores[idx] = score; + max_classes[idx] = c; + } + } + } - for row in output.outer_iter() { - // Row is [cx, cy, w, h, class_scores...] - - // Efficiently find the best class score without allocating a new slice or iterator chain. - // We skip low-confidence detections early to avoid expensive coordinate scaling and NMS operations later. 
- let scores = row.slice(s![4..]); - - // Find best class manually to avoid iterator overhead - let (best_class, best_score) = - scores - .iter() - .enumerate() - .fold((0, 0.0f32), |(best_idx, best_val), (idx, &val)| { - if val > best_val { - (idx, val) - } else { - (best_idx, best_val) - } + for (idx, &score) in max_scores.iter().enumerate() { + if score > conf_thresh { + let cx = unsafe { *output.get_unchecked(idx) }; + let cy = unsafe { *output.get_unchecked(num_predictions + idx) }; + let w = unsafe { *output.get_unchecked(2 * num_predictions + idx) }; + let h = unsafe { *output.get_unchecked(3 * num_predictions + idx) }; + + let x1 = (cx - w * 0.5 - pad_left) / scale_x; + let y1 = (cy - h * 0.5 - pad_top) / scale_y; + let x2 = (cx + w * 0.5 - pad_left) / scale_x; + let y2 = (cy + h * 0.5 - pad_top) / scale_y; + + candidates.push(Candidate { + bbox: [x1, y1, x2, y2], + score, + class: max_classes[idx], }); - - // Filter by confidence threshold early to reduce computation - if best_score < config.confidence_threshold { - continue; + } } + } else { + // Layout [pred, feat] - Process 8 classes at once + for idx in 0..num_predictions { + let base = idx * feat_count; + let row_ptr = unsafe { output.as_ptr().add(base + 4) }; + let mut best_score = conf_thresh; + let mut best_class = 0; + let mut found = false; + + for c_idx in (0..num_classes).step_by(8) { + if num_classes - c_idx >= 8 { + let scores: f32x8 = + unsafe { (row_ptr.add(c_idx) as *const f32x8).read_unaligned() }; + if scores.cmp_gt(conf_v).any() { + for i in 0..8 { + let s = unsafe { *row_ptr.add(c_idx + i) }; + if s > best_score { + best_score = s; + best_class = c_idx + i; + found = true; + } + } + } + } else { + for i in c_idx..num_classes { + let s = unsafe { *row_ptr.add(i) }; + if s > best_score { + best_score = s; + best_class = i; + found = true; + } + } + } + } - // Extract coordinates only for candidate detections - let cx = row[0]; - let cy = row[1]; - let w = row[2]; - let h = row[3]; - - let x1 = (cx - w / 2.0 - pad_left) / scale_x; - let y1 = (cy - h / 2.0 - pad_top) / scale_y; - let x2 = (cx + w / 2.0 - pad_left) / scale_x; - let y2 = (cy + h / 2.0 - pad_top) / scale_y; - - // Clip (clamp) - let x1 = x1.clamp(0.0, max_w); - let y1 = y1.clamp(0.0, max_h); - let x2 = x2.clamp(0.0, max_w); - let y2 = y2.clamp(0.0, max_h); - - candidates.push(([x1, y1, x2, y2], best_score, best_class)); + if found { + let cx = unsafe { *output.get_unchecked(base) }; + let cy = unsafe { *output.get_unchecked(base + 1) }; + let w = unsafe { *output.get_unchecked(base + 2) }; + let h = unsafe { *output.get_unchecked(base + 3) }; + + let x1 = (cx - w * 0.5 - pad_left) / scale_x; + let y1 = (cy - h * 0.5 - pad_top) / scale_y; + let x2 = (cx + w * 0.5 - pad_left) / scale_x; + let y2 = (cy + h * 0.5 - pad_top) / scale_y; + + candidates.push(Candidate { + bbox: [x1, y1, x2, y2], + score: best_score, + class: best_class, + }); + } + } } if candidates.is_empty() { return Array2::zeros((0, 6)); } - // Apply per-class NMS (only suppress boxes within the same class) - let keep_indices = nms_per_class(&candidates, config.iou_threshold); + // Top-K Selection & Sort + let nms_limit = (max_det * 10).min(candidates.len()); + if candidates.len() > nms_limit { + candidates.select_nth_unstable_by(nms_limit, |a, b| b.score.partial_cmp(&a.score).unwrap()); + candidates.truncate(nms_limit); + } + candidates.sort_unstable_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); + + // Population of SoA for NMS (small copy, very fast) + let n = candidates.len(); + 
let mut x1 = Vec::with_capacity(n); + let mut y1 = Vec::with_capacity(n); + let mut x2 = Vec::with_capacity(n); + let mut y2 = Vec::with_capacity(n); + let mut areas = Vec::with_capacity(n); + + for c in &candidates { + x1.push(c.bbox[0]); + y1.push(c.bbox[1]); + x2.push(c.bbox[2]); + y2.push(c.bbox[3]); + areas.push((c.bbox[2] - c.bbox[0]) * (c.bbox[3] - c.bbox[1])); + } + let mut suppressed = vec![false; n]; + let mut keep = Vec::with_capacity(max_det); + let iou_v = f32x8::splat(iou_thresh); // Build output array with kept detections - let num_kept = keep_indices.len().min(config.max_det); - let mut result = Array2::zeros((num_kept, 6)); + // let num_kept = keep_indices.len().min(config.max_det); + // let mut result = Array2::zeros((num_kept, 6)); - for (out_idx, &keep_idx) in keep_indices.iter().take(num_kept).enumerate() { - let (bbox, score, class) = &candidates[keep_idx]; - result[[out_idx, 0]] = bbox[0]; - result[[out_idx, 1]] = bbox[1]; - result[[out_idx, 2]] = bbox[2]; - result[[out_idx, 3]] = bbox[3]; - result[[out_idx, 4]] = *score; - result[[out_idx, 5]] = *class as f32; + for i in 0..n { + if suppressed[i] { + continue; + } + keep.push(i); + if keep.len() >= max_det { + break; + } + + let ax1 = f32x8::splat(x1[i]); + let ay1 = f32x8::splat(y1[i]); + let ax2 = f32x8::splat(x2[i]); + let ay2 = f32x8::splat(y2[i]); + let aa = f32x8::splat(areas[i]); + let ac = candidates[i].class; + + let mut j = i + 1; + while j < n { + if n - j >= 8 { + // Inline fast class and suppression check + let mut chunk_needs_processing = false; + for k in 0..8 { + if candidates[j + k].class == ac && !suppressed[j + k] { + chunk_needs_processing = true; + break; + } + } + + if chunk_needs_processing { + let bx1 = unsafe { (x1.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let by1 = unsafe { (y1.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let bx2 = unsafe { (x2.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let by2 = unsafe { (y2.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let ba = unsafe { (areas.as_ptr().add(j) as *const f32x8).read_unaligned() }; + + let ix1 = ax1.max(bx1); + let iy1 = ay1.max(by1); + let ix2 = ax2.min(bx2); + let iy2 = ay2.min(by2); + + let iw = (ix2 - ix1).max(f32x8::ZERO); + let ih = (iy2 - iy1).max(f32x8::ZERO); + let ia = iw * ih; + let iou = ia / (aa + ba - ia); + + let mask = iou.cmp_gt(iou_v).move_mask() as u8; + if mask != 0 { + for k in 0..8 { + if (mask & (1 << k)) != 0 && candidates[j + k].class == ac { + suppressed[j + k] = true; + } + } + } + } + j += 8; + } else { + for k in j..n { + if !suppressed[k] && candidates[k].class == ac { + let ix1 = x1[i].max(x1[k]); + let iy1 = y1[i].max(y1[k]); + let ix2 = x2[i].min(x2[k]); + let iy2 = y2[i].min(y2[k]); + let iw = (ix2 - ix1).max(0.0); + let ih = (iy2 - iy1).max(0.0); + let ia = iw * ih; + let iou = ia / (areas[i] + areas[k] - ia); + if iou > iou_thresh { + suppressed[k] = true; + } + } + } + break; + } + } + } + // Result Construction + let num_kept = keep.len(); + let mut result = Array2::zeros((num_kept, 6)); + for (out_idx, &idx) in keep.iter().enumerate() { + let c = &candidates[idx]; + result[[out_idx, 0]] = c.bbox[0].clamp(0.0, max_w); + result[[out_idx, 1]] = c.bbox[1].clamp(0.0, max_h); + result[[out_idx, 2]] = c.bbox[2].clamp(0.0, max_w); + result[[out_idx, 3]] = c.bbox[3].clamp(0.0, max_h); + result[[out_idx, 4]] = c.score; + result[[out_idx, 5]] = c.class as f32; } result @@ -361,7 +516,7 @@ fn extract_detect_boxes( clippy::cast_possible_truncation )] fn 
postprocess_segment( - outputs: Vec<(Vec, Vec)>, + outputs: Vec<(&[f32], Vec)>, preprocess: &PreprocessResult, config: &InferenceConfig, names: &HashMap, @@ -413,10 +568,10 @@ fn postprocess_segment( // Convert to 2D [preds, features] let output_2d = if is_transposed { - Array2::from_shape_vec((num_preds, expected_features), output0.clone()) + Array2::from_shape_vec((num_preds, expected_features), output0.to_vec()) .unwrap_or_else(|_| Array2::zeros((0, 0))) } else { - let arr = Array2::from_shape_vec((expected_features, num_preds), output0.clone()) + let arr = Array2::from_shape_vec((expected_features, num_preds), output0.to_vec()) .unwrap_or_else(|_| Array2::zeros((0, 0))); arr.t().to_owned() }; @@ -429,7 +584,7 @@ fn postprocess_segment( let (best_class, best_score) = scores .iter() .enumerate() - .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .max_by(|&(_, a), &(_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) .map_or((0, 0.0), |(idx, &score)| (idx, score)); if best_score < config.confidence_threshold { @@ -514,7 +669,7 @@ fn postprocess_segment( ); } - let protos = match Array2::from_shape_vec((num_masks, mh * mw), output1.clone()) { + let protos = match Array2::from_shape_vec((num_masks, mh * mw), output1.to_vec()) { Ok(arr) => arr, Err(e) => { eprintln!("WARNING ⚠️ Failed to create protos array: {e}. Skipping mask generation."); @@ -551,77 +706,81 @@ fn postprocess_segment( Zip::from(masks_data.outer_iter_mut()) .and(masks_flat.outer_iter()) .and(boxes_data.outer_iter()) - .par_for_each(|mut mask_out, mask_flat, box_data| { - // Create a local resizer for each task (Resizer is not Sync) - let mut resizer = Resizer::new(); - let resize_alg = ResizeAlg::Convolution(FilterType::Bilinear); - - // Sigmoid into a Vec - let f32_data: Vec = mask_flat - .iter() - .map(|&val| 1.0 / (1.0 + (-val).exp())) - .collect(); - - // Use bytemuck for efficient f32->bytes conversion - let src_bytes: &[u8] = bytemuck::cast_slice(&f32_data); - - // Create source image (160x160) - let src_image = match Image::from_vec_u8( - mw as u32, - mh as u32, - src_bytes.to_vec(), - PixelType::F32, - ) { - Ok(img) => img, - Err(_) => return, // Skip if creation fails - }; - - // Create dest image (orig_w x orig_h) - let mut dst_image = Image::new(ow, oh, PixelType::F32); - - // Configure resize with crop - let safe_crop_x = f64::from(crop_x.max(0.0)); - let safe_crop_y = f64::from(crop_y.max(0.0)); - let safe_crop_w = f64::from(crop_w.max(1.0).min(mw as f32)); - let safe_crop_h = f64::from(crop_h.max(1.0).min(mh as f32)); - - let options = ResizeOptions::new().resize_alg(resize_alg).crop( - safe_crop_x, - safe_crop_y, - safe_crop_w, - safe_crop_h, - ); - - // Handle resize errors gracefully - if resizer - .resize(&src_image, &mut dst_image, &options) - .is_err() - { - return; - } + .par_for_each( + |mut mask_out: ArrayViewMut2, + mask_flat: ArrayView1, + box_data: ArrayView1| { + // Create a local resizer for each task (Resizer is not Sync) + let mut resizer = Resizer::new(); + let resize_alg = ResizeAlg::Convolution(FilterType::Bilinear); + + // Sigmoid into a Vec + let f32_data: Vec = mask_flat + .iter() + .map(|&val| 1.0 / (1.0 + (-val).exp())) + .collect(); + + // Use bytemuck for efficient f32->bytes conversion + let src_bytes: &[u8] = bytemuck::cast_slice(&f32_data); + + // Create source image (160x160) + let src_image = match Image::from_vec_u8( + mw as u32, + mh as u32, + src_bytes.to_vec(), + PixelType::F32, + ) { + Ok(img) => img, + Err(_) => return, // Skip if creation fails + }; + + // 
Create dest image (orig_w x orig_h) + let mut dst_image = Image::new(ow, oh, PixelType::F32); + + // Configure resize with crop + let safe_crop_x = f64::from(crop_x.max(0.0)); + let safe_crop_y = f64::from(crop_y.max(0.0)); + let safe_crop_w = f64::from(crop_w.max(1.0).min(mw as f32)); + let safe_crop_h = f64::from(crop_h.max(1.0).min(mh as f32)); + + let options = ResizeOptions::new().resize_alg(resize_alg).crop( + safe_crop_x, + safe_crop_y, + safe_crop_w, + safe_crop_h, + ); + + // Handle resize errors gracefully + if resizer + .resize(&src_image, &mut dst_image, &options) + .is_err() + { + return; + } - // Get resized data as f32 slice - let dst_bytes = dst_image.buffer(); - let dst_slice: &[f32] = bytemuck::cast_slice(dst_bytes); - - // Apply bbox cropping and store directly to output array - let x1 = box_data[0].max(0.0).min(ow as f32); - let y1 = box_data[1].max(0.0).min(oh as f32); - let x2 = box_data[2].max(0.0).min(ow as f32); - let y2 = box_data[3].max(0.0).min(oh as f32); - - for y in 0..oh as usize { - for x in 0..ow as usize { - let val = dst_slice[y * ow as usize + x]; - let x_f = x as f32; - let y_f = y as f32; - // Apply bounding box mask: invalid pixels outside the box are zeroed. - if x_f >= x1 && x_f <= x2 && y_f >= y1 && y_f <= y2 { - mask_out[[y, x]] = val; + // Get resized data as f32 slice + let dst_bytes = dst_image.buffer(); + let dst_slice: &[f32] = bytemuck::cast_slice(dst_bytes); + + // Apply bbox cropping and store directly to output array + let x1 = box_data[0].max(0.0).min(ow as f32); + let y1 = box_data[1].max(0.0).min(oh as f32); + let x2 = box_data[2].max(0.0).min(ow as f32); + let y2 = box_data[3].max(0.0).min(oh as f32); + + for y in 0..oh as usize { + for x in 0..ow as usize { + let val = dst_slice[y * ow as usize + x]; + let x_f = x as f32; + let y_f = y as f32; + // Apply bounding box mask: invalid pixels outside the box are zeroed. + if x_f >= x1 && x_f <= x2 && y_f >= y1 && y_f <= y2 { + mask_out[[y, x]] = val; + } } } - } - }); + }, + ); results.masks = Some(Masks::new(masks_data, preprocess.orig_shape)); @@ -869,11 +1028,8 @@ fn postprocess_classify( return results; } - // Filter out NaN values and ensure valid probabilities - let mut probs_vec: Vec = output - .iter() - .map(|&v| if v.is_nan() { 0.0 } else { v }) - .collect(); + // Probs::new expects an Array1, which we can create from the slice + let mut probs_vec = output.to_vec(); // Check if softmax is already applied (sum ≈ 1.0) let sum: f32 = probs_vec.iter().sum(); diff --git a/src/preprocessing.rs b/src/preprocessing.rs index 91541cf..68fcd1a 100644 --- a/src/preprocessing.rs +++ b/src/preprocessing.rs @@ -5,13 +5,71 @@ //! This module handles all image preprocessing operations needed before //! running YOLO model inference, including resizing, padding, and normalization. 
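Before the preprocessing changes below, a standalone scalar sketch of the fixed-point bilinear blend they rely on (SCALE_BITS = 11, so weights live in 0..=2048); the helper name is illustrative, while the arithmetic mirrors the fused loop.

```rust
const SCALE_BITS: i32 = 11;
const SCALE_INT: i32 = 1 << SCALE_BITS; // 2048

/// Blend four neighboring samples of one channel; fx and fy are fractional
/// offsets pre-scaled by SCALE_INT, as in the fused preprocessing loop.
fn bilinear_fixed(p00: u8, p10: u8, p01: u8, p11: u8, fx: i32, fy: i32) -> u8 {
    let (fx_inv, fy_inv) = (SCALE_INT - fx, SCALE_INT - fy);
    let w00 = (fx_inv * fy_inv) >> SCALE_BITS;
    let w10 = (fx * fy_inv) >> SCALE_BITS;
    let w01 = (fx_inv * fy) >> SCALE_BITS;
    let w11 = (fx * fy) >> SCALE_BITS;
    ((i32::from(p00) * w00 + i32::from(p10) * w10 + i32::from(p01) * w01 + i32::from(p11) * w11)
        >> SCALE_BITS) as u8
}
```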
+#![allow( + unsafe_code, + clippy::similar_names, + clippy::cast_precision_loss, + clippy::cast_possible_wrap, + clippy::cast_sign_loss, + clippy::cast_possible_truncation, + clippy::too_many_arguments, + clippy::too_many_lines, + clippy::wildcard_imports, + clippy::ptr_as_ptr, + clippy::cast_lossless, + clippy::single_match_else, + clippy::suboptimal_flops, + clippy::manual_div_ceil +)] + +use std::cell::RefCell; +use std::num::NonZeroUsize; + use half::f16; -use image::{DynamicImage, GenericImageView, ImageBuffer, Rgb, RgbImage}; +use image::{DynamicImage, GenericImageView, RgbImage}; +use lru::LruCache; use ndarray::{Array3, Array4}; +// ================================================================================================ +// Constants +// ================================================================================================ + /// Default letterbox padding color (gray). pub const LETTERBOX_COLOR: [u8; 3] = [114, 114, 114]; +/// Fixed-point scale factor for integer bilinear interpolation (2^11 = 2048). +const SCALE_BITS: i32 = 11; +const SCALE_INT: i32 = 1 << SCALE_BITS; + +/// Normalized letterbox padding color (114/255 ≈ 0.447). +const LETTERBOX_NORM: f32 = 114.0 / 255.0; + +/// Reciprocal of 255 for normalization. +const INV_255: f32 = 1.0 / 255.0; + +/// Maximum LRU cache size for X coordinate LUTs. +const LUT_CACHE_SIZE: usize = 8; + +// ================================================================================================ +// Type Aliases +// ================================================================================================ + +type XLutEntry = (usize, usize, i32, i32); +type XLutKey = (u32, u32); + +// ================================================================================================ +// Thread-Local State +// ================================================================================================ + +thread_local! { + static X_LUT_CACHE: RefCell>> = + RefCell::new(LruCache::new(NonZeroUsize::new(LUT_CACHE_SIZE).unwrap())); +} + +// ================================================================================================ +// Types +// ================================================================================================ + /// Tensor data that can be either FP32 or FP16. 
#[derive(Debug, Clone)] pub enum TensorData { @@ -96,13 +154,37 @@ pub fn preprocess_image_with_precision( let (new_width, new_height, pad_left, pad_top, scale) = calculate_letterbox_params(orig_width, orig_height, target_size, stride); - // Perform letterbox resize - let letterboxed = letterbox_image(image, new_width, new_height, pad_left, pad_top, target_size); - - let tensor = image_to_tensor(&letterboxed); + // Zero-copy path: avoid to_rgb8() allocation when possible + let tensor = match image { + // Fast path: already RGB8, use bytes directly without copy + DynamicImage::ImageRgb8(rgb) => fused_zerocopy_preprocess( + rgb.as_raw(), + orig_width, + orig_height, + target_size, + pad_top, + pad_left, + new_width, + new_height, + ), + // Fallback: convert to RGB8 (allocates) + _ => { + let src_rgb = image.to_rgb8(); + fused_zerocopy_preprocess( + src_rgb.as_raw(), + orig_width, + orig_height, + target_size, + pad_top, + pad_left, + new_width, + new_height, + ) + } + }; let tensor_f16 = if half { - Some(image_to_tensor_f16(&letterboxed)) + Some(tensor_f32_to_f16(&tensor)) } else { None }; @@ -117,6 +199,279 @@ pub fn preprocess_image_with_precision( } } +// ================================================================================================ +// Public API Functions +// ================================================================================================ + +/// Get or compute the X coordinate LUT for bilinear interpolation. +fn get_or_compute_x_lut(src_w: u32, dst_w: u32) -> Vec { + let key = (src_w, dst_w); + + X_LUT_CACHE.with(|cache| { + let mut cache = cache.borrow_mut(); + + if let Some(lut) = cache.get(&key) { + return lut.clone(); + } + + let scale_x = src_w as f32 / dst_w as f32; + let src_w_max = (src_w - 1) as i32; + + let lut: Vec = (0..dst_w) + .map(|dx| { + let sx = ((dx as f32 + 0.5) * scale_x - 0.5).max(0.0); + let x0 = sx.floor() as i32; + let fx = ((sx - x0 as f32) * SCALE_INT as f32) as i32; + let x0c = x0.clamp(0, src_w_max) as usize * 3; + let x1c = (x0 + 1).clamp(0, src_w_max) as usize * 3; + (x0c, x1c, SCALE_INT - fx, fx) + }) + .collect(); + + cache.put(key, lut.clone()); + lut + }) +} + +/// Zero-copy fused preprocessing for maximum performance. +/// +/// Combines bilinear resize, letterbox padding, and NCHW normalization +/// in a single memory pass with parallel row processing. 
+fn fused_zerocopy_preprocess( + src_raw: &[u8], + src_w: u32, + src_h: u32, + target_size: (usize, usize), + pad_top: u32, + pad_left: u32, + new_width: u32, + new_height: u32, +) -> Array4 { + use rayon::prelude::*; + use std::mem::MaybeUninit; + use std::sync::atomic::{AtomicPtr, Ordering}; + use wide::f32x4; + + let (dst_h, dst_w) = target_size; + let channel_size = dst_h * dst_w; + let src_stride = (src_w * 3) as usize; + + // ALLOCATE UNINITIALIZED: Saves ~0.2ms by not zeroing memory + let mut tensor: Array4> = Array4::uninit((1, 3, dst_h, dst_w)); + let out_ptr = tensor.as_mut_ptr() as *mut f32; + + // Use AtomicPtr for thread-safe pointer sharing (each thread writes to disjoint rows) + let atomic_ptr = AtomicPtr::new(out_ptr); + + let x_lut = get_or_compute_x_lut(src_w, new_width); + let scale_y = src_h as f32 / new_height as f32; + let src_h_max = (src_h - 1) as i32; + let inv_255_vec = f32x4::splat(INV_255); + + let pad_top_usize = pad_top as usize; + let pad_left_usize = pad_left as usize; + let new_height_usize = new_height as usize; + let new_width_usize = new_width as usize; + + // Parallel row processing with raw pointers (no bounds checks) + (0..dst_h).into_par_iter().for_each(|dy| { + let data_ptr = atomic_ptr.load(Ordering::Relaxed); + unsafe { + // Calculate row pointers for R, G, B channels + + let r_row = data_ptr.add(dy * dst_w); + let g_row = data_ptr.add(channel_size + dy * dst_w); + let b_row = data_ptr.add(2 * channel_size + dy * dst_w); + + // Vertical padding (top/bottom rows) + if dy < pad_top_usize || dy >= pad_top_usize + new_height_usize { + for dx in 0..dst_w { + *r_row.add(dx) = LETTERBOX_NORM; + *g_row.add(dx) = LETTERBOX_NORM; + *b_row.add(dx) = LETTERBOX_NORM; + } + return; + } + + // Image row calculations + let img_dy = dy - pad_top_usize; + let sy = ((img_dy as f32 + 0.5) * scale_y - 0.5).max(0.0); + let y0 = sy.floor() as i32; + let fy = ((sy - y0 as f32) * SCALE_INT as f32) as i32; + let fy_inv = SCALE_INT - fy; + + let y0c = y0.clamp(0, src_h_max) as usize; + let y1c = (y0 + 1).clamp(0, src_h_max) as usize; + let row0_off = y0c * src_stride; + let row1_off = y1c * src_stride; + + // Left padding + for dx in 0..pad_left_usize { + *r_row.add(dx) = LETTERBOX_NORM; + *g_row.add(dx) = LETTERBOX_NORM; + *b_row.add(dx) = LETTERBOX_NORM; + } + + // Inner image - SIMD loop (4 pixels at a time) + let mut img_dx = 0usize; + let src_ptr = src_raw.as_ptr(); + + while img_dx + 4 <= new_width_usize { + let mut r_vals = [0.0f32; 4]; + let mut g_vals = [0.0f32; 4]; + let mut b_vals = [0.0f32; 4]; + + for i in 0..4 { + let (x0_off, x1_off, fx_inv, fx) = *x_lut.get_unchecked(img_dx + i); + let w00 = (fx_inv * fy_inv) >> SCALE_BITS; + let w10 = (fx * fy_inv) >> SCALE_BITS; + let w01 = (fx_inv * fy) >> SCALE_BITS; + let w11 = (fx * fy) >> SCALE_BITS; + + let p00 = src_ptr.add(row0_off + x0_off); + let p10 = src_ptr.add(row0_off + x1_off); + let p01 = src_ptr.add(row1_off + x0_off); + let p11 = src_ptr.add(row1_off + x1_off); + + r_vals[i] = ((*p00 as i32 * w00 + + *p10 as i32 * w10 + + *p01 as i32 * w01 + + *p11 as i32 * w11) + >> SCALE_BITS) as f32; + g_vals[i] = ((*p00.add(1) as i32 * w00 + + *p10.add(1) as i32 * w10 + + *p01.add(1) as i32 * w01 + + *p11.add(1) as i32 * w11) + >> SCALE_BITS) as f32; + b_vals[i] = ((*p00.add(2) as i32 * w00 + + *p10.add(2) as i32 * w10 + + *p01.add(2) as i32 * w01 + + *p11.add(2) as i32 * w11) + >> SCALE_BITS) as f32; + } + + // SIMD normalize + let r_simd = f32x4::new(r_vals) * inv_255_vec; + let g_simd = f32x4::new(g_vals) * 
+                let b_simd = f32x4::new(b_vals) * inv_255_vec;
+
+                let out_x = pad_left_usize + img_dx;
+                let r_arr: [f32; 4] = r_simd.into();
+                let g_arr: [f32; 4] = g_simd.into();
+                let b_arr: [f32; 4] = b_simd.into();
+
+                // Direct raw pointer writes (no bounds checks)
+                std::ptr::copy_nonoverlapping(r_arr.as_ptr(), r_row.add(out_x), 4);
+                std::ptr::copy_nonoverlapping(g_arr.as_ptr(), g_row.add(out_x), 4);
+                std::ptr::copy_nonoverlapping(b_arr.as_ptr(), b_row.add(out_x), 4);
+
+                img_dx += 4;
+            }
+
+            // Scalar tail
+            while img_dx < new_width_usize {
+                let (x0_off, x1_off, fx_inv, fx) = *x_lut.get_unchecked(img_dx);
+                let w00 = (fx_inv * fy_inv) >> SCALE_BITS;
+                let w10 = (fx * fy_inv) >> SCALE_BITS;
+                let w01 = (fx_inv * fy) >> SCALE_BITS;
+                let w11 = (fx * fy) >> SCALE_BITS;
+
+                let p00 = src_ptr.add(row0_off + x0_off);
+                let p10 = src_ptr.add(row0_off + x1_off);
+                let p01 = src_ptr.add(row1_off + x0_off);
+                let p11 = src_ptr.add(row1_off + x1_off);
+
+                let out_x = pad_left_usize + img_dx;
+                *r_row.add(out_x) = ((*p00 as i32 * w00
+                    + *p10 as i32 * w10
+                    + *p01 as i32 * w01
+                    + *p11 as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+                *g_row.add(out_x) = ((*p00.add(1) as i32 * w00
+                    + *p10.add(1) as i32 * w10
+                    + *p01.add(1) as i32 * w01
+                    + *p11.add(1) as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+                *b_row.add(out_x) = ((*p00.add(2) as i32 * w00
+                    + *p10.add(2) as i32 * w10
+                    + *p01.add(2) as i32 * w01
+                    + *p11.add(2) as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+
+                img_dx += 1;
+            }
+
+            // Right padding
+            for dx in (pad_left_usize + new_width_usize)..dst_w {
+                *r_row.add(dx) = LETTERBOX_NORM;
+                *g_row.add(dx) = LETTERBOX_NORM;
+                *b_row.add(dx) = LETTERBOX_NORM;
+            }
+        }
+    });
+
+    // SAFETY: All elements have been initialized
+    unsafe { tensor.assume_init() }
+}
+
+/// Convert f32 tensor to f16 tensor.
+fn tensor_f32_to_f16(tensor: &Array4<f32>) -> Array4<half::f16> {
+    tensor.mapv(half::f16::from_f32)
+}
+
+/// Calculate target size for rectangular inference mode.
+///
+/// Adjusts `target_size` such that the image's aspect ratio is preserved,
+/// and both dimensions are multiples of `stride`.
+///
+/// # Arguments
+///
+/// * `orig_width` - Original image width.
+/// * `orig_height` - Original image height.
+/// * `target_size` - Base target size (e.g. 640x640).
+/// * `stride` - Model stride for alignment.
+///
+/// # Returns
+///
+/// Adjusted target size as (height, width).
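+///
+/// For example, a 1280x720 image with a (640, 640) base target and stride 32
+/// scales by `min(640/720, 640/1280) = 0.5` to 640x360, and rounding each side
+/// up to the next multiple of 32 gives `(384, 640)`.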
+#[must_use]
+pub fn calculate_rect_size(
+    orig_width: u32,
+    orig_height: u32,
+    target_size: (usize, usize),
+    stride: u32,
+) -> (usize, usize) {
+    let (target_h, target_w) = target_size;
+
+    #[allow(clippy::cast_precision_loss)]
+    let orig_h = orig_height as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let orig_w = orig_width as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let target_h_f = target_h as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let target_w_f = target_w as f32;
+
+    // Calculate scale to fit within target while maintaining aspect ratio
+    let scale = (target_h_f / orig_h).min(target_w_f / orig_w);
+
+    // New dimensions after scaling
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    let new_h = (orig_h * scale).round() as usize;
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    let new_w = (orig_w * scale).round() as usize;
+
+    // Round up to nearest multiple of stride
+    let stride = stride as usize;
+    let rect_h = ((new_h + stride - 1) / stride) * stride;
+    let rect_w = ((new_w + stride - 1) / stride) * stride;
+
+    (rect_h, rect_w)
+}
+
 /// Calculate letterbox parameters for resizing.
 ///
 /// Computes new dimensions and padding to fit the image within the target size while maintaining aspect ratio.
@@ -173,72 +528,6 @@ fn calculate_letterbox_params(
     (new_w, new_h, pad_left, pad_top, (scale_y, scale_x))
 }
 
-/// Apply letterbox transformation to an image.
-///
-/// Resizes the image maintaining aspect ratio and adds padding usually to center it.
-/// Uses SIMD-accelerated resizing via `fast_image_resize`.
-///
-/// # Arguments
-///
-/// * `image` - Source dynamic image.
-/// * `new_width` - Target width after scaling (before padding).
-/// * `new_height` - Target height after scaling (before padding).
-/// * `pad_left` - Padding to add on the left.
-/// * `pad_top` - Padding to add on the top.
-/// * `target_size` - Final output dimensions (height, width).
-///
-/// # Returns
-///
-/// `RgbImage` padded and resized to `target_size`.
-fn letterbox_image(
-    image: &DynamicImage,
-    new_width: u32,
-    new_height: u32,
-    pad_left: u32,
-    pad_top: u32,
-    target_size: (usize, usize),
-) -> RgbImage {
-    use fast_image_resize::{PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image};
-
-    let src_rgb = image.to_rgb8();
-    let (src_w, src_h) = src_rgb.dimensions();
-
-    let src_image = Image::from_vec_u8(src_w, src_h, src_rgb.into_raw(), PixelType::U8x3)
-        .expect("Failed to create source image");
-
-    let mut dst_image = Image::new(new_width, new_height, PixelType::U8x3);
-
-    let mut resizer = Resizer::new();
-    let options = ResizeOptions::new().resize_alg(ResizeAlg::Convolution(
-        // Use Lanczos3 for high-quality resizing. This is critical for OBB tasks where
-        // preserving small features (like harbors in DOTA8) is essential for detection.
-        // It matches the default behavior of Ultralytics Python preprocessing.
-        fast_image_resize::FilterType::Lanczos3,
-    ));
-    resizer
-        .resize(&src_image, &mut dst_image, Some(&options))
-        .expect("Failed to resize image");
-
-    // Create output image with letterbox color
-    #[allow(clippy::cast_possible_truncation)]
-    let mut output: RgbImage = ImageBuffer::from_pixel(
-        target_size.1 as u32,
-        target_size.0 as u32,
-        Rgb(LETTERBOX_COLOR),
-    );
-
-    let resized_rgb: RgbImage = ImageBuffer::from_raw(new_width, new_height, dst_image.into_vec())
-        .expect("Failed to create resized image buffer");
-
-    image::imageops::overlay(
-        &mut output,
-        &resized_rgb,
-        i64::from(pad_left),
-        i64::from(pad_top),
-    );
-
-    output
-}
 
 /// Convert an RGB image to a normalized NCHW tensor (FP32).
 ///
diff --git a/src/source.rs b/src/source.rs
index 03d63b5..c8d10ca 100644
--- a/src/source.rs
+++ b/src/source.rs
@@ -740,7 +740,6 @@ impl Iterator for SourceIterator {
 }
 
 #[cfg(feature = "video")]
-/// Convert a `video_rs` Frame (ndarray 0.16) to `DynamicImage`.
 fn video_frame_to_image(arr: &video_rs::Frame) -> Result {
     let shape = arr.shape();
     let height = u32::try_from(shape[0])
@@ -748,14 +747,14 @@ fn video_frame_to_image(arr: &video_rs::Frame) -> Result {
     let width = u32::try_from(shape[1])
         .map_err(|_| InferenceError::ImageError("Image width exceeds u32::MAX".to_string()))?;
 
-    let mut rgb_data = Vec::with_capacity((height * width * 3) as usize);
-    for y in 0..height as usize {
-        for x in 0..width as usize {
-            rgb_data.push(arr[[y, x, 0]]);
-            rgb_data.push(arr[[y, x, 1]]);
-            rgb_data.push(arr[[y, x, 2]]);
-        }
-    }
+    // video_rs::Frame is an ndarray::Array3 with shape (H, W, 3) and standard layout (C-contiguous).
+    // We can directly copy the raw data.
+    let rgb_data = arr
+        .as_slice()
+        .ok_or_else(|| {
+            InferenceError::ImageError("Failed to get raw slice from video frame".to_string())
+        })?
+        .to_vec();
 
     let img_buffer = image::RgbImage::from_raw(width, height, rgb_data).ok_or_else(|| {
         InferenceError::ImageError("Failed to create image from video frame".to_string())
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
index dc645bc..1c0717c 100644
--- a/tests/integration_test.rs
+++ b/tests/integration_test.rs
@@ -28,6 +28,7 @@ fn test_run_prediction_e2e() {
         show: false,
         device: None,
         verbose: true,
+        rect: false,
     };
 
     // This should run successfully (download model/images and predict)
@@ -47,11 +48,11 @@ fn test_inference_config_builder() {
     let config = InferenceConfig::new()
         .with_confidence(0.5)
         .with_iou(0.7)
-        .with_max_det(100);
+        .with_max_det(300);
 
     assert_eq!(config.confidence_threshold, 0.5);
     assert_eq!(config.iou_threshold, 0.7);
-    assert_eq!(config.max_det, 100);
+    assert_eq!(config.max_det, 300);
 }
 
 #[test]
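
// Illustrative sketch (not part of the diff above): one way the new rect-mode sizing
// helper could be exercised in a test. The import path is an assumption; adjust it to
// wherever the crate actually exposes `calculate_rect_size`. Expected values follow
// directly from the implementation shown earlier in src/preprocess.rs.
#[cfg(test)]
mod rect_size_sketch {
    use ultralytics_inference::calculate_rect_size; // assumed re-export path

    #[test]
    fn portrait_input_is_stride_aligned() {
        // 720x1280 portrait source, (640, 640) base target, stride 32:
        // scale = min(640/1280, 640/720) = 0.5, so new_h = 640 and new_w = 360;
        // rounding up to multiples of 32 yields (height, width) = (640, 384).
        assert_eq!(calculate_rect_size(720, 1280, (640, 640), 32), (640, 384));
    }
}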