Changes from all commits (15 commits)
7 changes: 7 additions & 0 deletions .cargo/config.toml
@@ -7,3 +7,10 @@
PKG_CONFIG_PATH = "/opt/homebrew/opt/ffmpeg@7/lib/pkgconfig"
CPATH = "/opt/homebrew/opt/ffmpeg@7/include"
LIBRARY_PATH = "/opt/homebrew/opt/ffmpeg@7/lib"
BINDGEN_EXTRA_CLANG_ARGS = "-I/opt/homebrew/opt/ffmpeg@7/include"

#
# Linux: Configure RPATH to find shared libraries in the executable's directory
# This allows the binary to locate libonnxruntime*.so in the same folder
# without needing LD_LIBRARY_PATH set manually.
[target.x86_64-unknown-linux-gnu]
rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]
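
For reference, the same effect can be achieved from a build script instead of `.cargo/config.toml`. This is a hypothetical alternative sketch, not part of this PR:

```rust
// build.rs (hypothetical alternative to the .cargo/config.toml entry above):
// emit the $ORIGIN rpath flag only when compiling for Linux, so the binary
// searches its own directory for libonnxruntime*.so at load time.
fn main() {
    if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux") {
        println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
    }
}
```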
42 changes: 41 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default.

11 changes: 9 additions & 2 deletions Cargo.toml
@@ -2,7 +2,7 @@

[package]
name = "ultralytics-inference"
version = "0.0.7"
version = "0.0.8"
edition = "2024"
authors = [
"Glenn Jocher <[email protected]>",
@@ -50,6 +50,12 @@
image = "^0.25"
jpeg-decoder = "^0.3"
fast_image_resize = { version = "^5.5", features = ["image", "rayon"] }

# SIMD for fast preprocessing
wide = "0.7"

# LRU cache for preprocessing LUT
lru = "0.12"

# Numerical computing (must match ort's ndarray version 0.17)
ndarray = { version = "^0.17", features = ["rayon"] }

@@ -74,6 +80,7 @@
bytemuck = { version = "^1.21", features = ["derive"] }
clap = { version = "4.5.54", features = ["derive"] }
colored = "3.0.0"

# Optional - Visualization and Video support
minifb = { version = "^0.28.0", optional = true }
video-rs = { version = "^0.10.5", features = ["ndarray"], optional = true }

@@ -151,7 +158,7 @@
opt-level = 3

[profile.release]
opt-level = 3
lto = true
lto = "fat"
codegen-units = 1
panic = "abort"
strip = true
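
The two new crates back the preprocessing changes: `wide` provides portable SIMD lanes and `lru` caches computed lookup tables across frames. A minimal sketch of how they can combine, with illustrative names only (the real code lives in `src/preprocessing.rs`):

```rust
use std::num::NonZeroUsize;

use lru::LruCache;
use wide::f32x8;

/// Normalize u8 pixels to f32 in [0, 1], eight lanes at a time.
/// Illustrative sketch only, not the crate's actual implementation.
fn normalize_simd(pixels: &[u8]) -> Vec<f32> {
    let scale = f32x8::splat(1.0 / 255.0);
    let mut out = Vec::with_capacity(pixels.len());
    let chunks = pixels.chunks_exact(8);
    let tail = chunks.remainder();
    for c in chunks {
        let lanes = f32x8::from([
            c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32,
            c[4] as f32, c[5] as f32, c[6] as f32, c[7] as f32,
        ]);
        out.extend_from_slice(&(lanes * scale).to_array());
    }
    out.extend(tail.iter().map(|&p| p as f32 / 255.0)); // scalar tail
    out
}

fn main() {
    // Hypothetical cache keyed by input shape, so repeated video frames of
    // the same size reuse their preprocessing result instead of rebuilding it.
    let mut luts: LruCache<(u32, u32), Vec<f32>> =
        LruCache::new(NonZeroUsize::new(8).unwrap());
    luts.put((640, 480), normalize_simd(&[0u8, 128, 255]));
    assert!(luts.get(&(640, 480)).is_some());
}
```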
65 changes: 44 additions & 21 deletions README.md
@@ -77,20 +77,26 @@
cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --show --

# Save individual frames for video input
cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --save-frames

# Rectangular inference
cargo run --release -- predict --model yolo11n.onnx --source image.jpg --rect
```
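
Rectangular inference keeps the image's aspect ratio and pads each dimension only up to the next stride multiple rather than to a full square, which is where the smaller inference shapes in the output below come from. A sketch of the shape computation, assuming a stride-32 model (illustrative only; `src/preprocessing.rs` is authoritative):

```rust
/// Compute the minimal-padding ("rect") inference shape.
/// Sketch only: scale the longer side to `imgsz`, then round each side up
/// to the nearest multiple of the model stride.
fn rect_shape(w: u32, h: u32, imgsz: u32, stride: u32) -> (u32, u32) {
    let r = imgsz as f32 / w.max(h) as f32; // scale ratio
    let (nw, nh) = ((w as f32 * r).round() as u32, (h as f32 * r).round() as u32);
    let pad = |x: u32| x.div_ceil(stride) * stride; // round up to stride
    (pad(nw), pad(nh))
}

fn main() {
    // A 1280x720 frame at imgsz=640 runs at 640x384 instead of 640x640.
    assert_eq!(rect_shape(1280, 720, 640, 32), (640, 384));
}
```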

### Example Output

```
# ultralytics-inference predict

WARNING ⚠️ 'model' argument is missing. Using default 'model=yolo11n.onnx'.
WARNING ⚠️ 'source' argument is missing. Using default images: https://ultralytics.com/images/bus.jpg, https://ultralytics.com/images/zidane.jpg
Ultralytics 0.0.7 🚀 Rust ONNX FP32 CPU
Ultralytics 0.0.8 🚀 Rust ONNX FP32 CPU
Using ONNX Runtime CPUExecutionProvider
YOLO11n summary: 80 classes, imgsz=(640, 640)

image 1/2 bus.jpg: 640x640 3 persons, 1 bus, 57.3ms
image 2/2 zidane.jpg: 640x640 2 persons, 1 tie, 52.9ms
Speed: 75.8ms preprocess, 55.1ms inference, 19.9ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs/detect/predict53
image 1/2 /home/ultralytics/inference/bus.jpg: 640x480 4 persons, 1 bus, 36.4ms
image 2/2 /home/ultralytics/inference/zidane.jpg: 384x640 2 persons, 1 tie, 28.6ms
Speed: 1.5ms preprocess, 32.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
Results saved to runs/detect/predict1
💡 Learn more at https://docs.ultralytics.com/modes/predict
```

@@ -117,8 +123,11 @@
cargo run --release -- predict --model <model.onnx> --source <source>
| `--source` | `-s` | Input source (image, video, webcam index, or URL) | `Task dependent Ultralytics URL assets` |
| `--device` | | Device to use (cpu, cuda:0, mps, coreml, etc.) | `cpu` |
| `--conf` | | Confidence threshold | `0.25` |
| `--iou` | | IoU threshold for NMS | `0.45` |
| `--iou` | | IoU threshold for NMS | `0.7` |
| `--max-det` | | Maximum number of detections | `300` |
| `--imgsz` | | Inference image size | `Model metadata` |
| `--rect` | | Enable rectangular inference (minimal padding) | `true` |
| `--batch` | | Batch size for inference | `1` |
| `--half` | | Use FP16 half-precision inference | `false` |
| `--save` | | Save annotated results to runs/<task>/predict | `true` |
| `--save-frames` | | Save individual frames for video | `false` |
@@ -183,7 +192,7 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
let config = InferenceConfig::new()
.with_confidence(0.5)
.with_iou(0.45)
.with_max_det(100);
.with_max_det(300);

let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?;
let results = model.predict("image.jpg")?;
@@ -236,16 +245,25 @@
inference/
│ ├── main.rs # CLI application
│ ├── model.rs # YOLOModel - ONNX session and inference
│ ├── results.rs # Results, Boxes, Masks, Keypoints, Probs, Obb
│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize)
│ ├── postprocessing.rs # Detection post-processing (NMS, decode)
│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize, SIMD)
│ ├── postprocessing.rs # Detection post-processing (NMS, decode, SIMD)
│ ├── metadata.rs # ONNX model metadata parsing
│ ├── source.rs # Input source handling
│ ├── task.rs # Task enum (Detect, Segment, Pose, etc.)
│ ├── source.rs # Input source handling (images, video, webcam)
│ ├── task.rs # Task enum (Detect, Segment, Pose, Classify, Obb)
│ ├── inference.rs # InferenceConfig
│ ├── batch.rs # Batch processing pipeline
│ ├── device.rs # Device enum (CPU, CUDA, MPS, CoreML, etc.)
│ ├── download.rs # Model and asset downloading
│ ├── visualizer/ # Visualization tools (Viewer)
│ ├── annotate.rs # Image annotation (bounding boxes, masks, keypoints)
│ ├── io.rs # Result saving (images, videos)
│ ├── logging.rs # Logging macros
│ ├── error.rs # Error types
│ └── utils.rs # Utility functions (NMS, IoU)
│ ├── utils.rs # Utility functions (NMS, IoU)
│ ├── cli/ # CLI module
│ │ ├── mod.rs # CLI module exports
│ │ ├── args.rs # CLI argument parsing
│ │ └── predict.rs # Predict command implementation
│ └── visualizer/ # Real-time visualization (minifb)
├── tests/
│ └── integration_test.rs # Integration tests
├── assets/ # Test images
@@ -300,13 +318,16 @@
One of the key benefits of this library is **minimal dependencies** - no PyTorch

### Core Dependencies (always included)

| Crate | Purpose |
| ------------------- | ----------------------- |
| `ort` | ONNX Runtime bindings |
| `ndarray` | N-dimensional arrays |
| `image` | Image loading/decoding |
| `fast_image_resize` | SIMD-optimized resizing |
| `half` | FP16 support |
| Crate | Purpose |
| ------------------- | ------------------------------- |
| `ort` | ONNX Runtime bindings |
| `ndarray` | N-dimensional arrays |
| `image` | Image loading/decoding |
| `jpeg-decoder` | JPEG decoding |
| `fast_image_resize` | SIMD-optimized resizing |
| `half` | FP16 support |
| `lru` | LRU cache for preprocessing LUT |
| `wide` | SIMD for fast preprocessing |
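
For example, the `half` crate is what backs the `--half` flag: a plausible data path converts the FP32 input tensor to `f16` before feeding an FP16 ONNX graph. A minimal sketch (an assumption about the flow, not necessarily how this crate implements it):

```rust
use half::f16;

fn main() {
    // Hypothetical --half path: convert the FP32 input tensor to FP16.
    let fp32: Vec<f32> = vec![0.0, 0.5, 1.0];
    let fp16: Vec<f16> = fp32.iter().map(|&v| f16::from_f32(v)).collect();
    assert_eq!(fp16[1].to_f32(), 0.5); // 0.5 is exactly representable
}
```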

### Optional Dependencies (for `--save` feature)

@@ -372,16 +393,18 @@
ONNX Runtime threading is set to auto (`num_threads: 0`), which lets ORT choose optimal thread counts.
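
A sketch of what that configuration looks like with the `ort` 2.x builder API (assuming `ort` 2.x; passing 0 defers the choice to ONNX Runtime):

```rust
use ort::session::Session;

fn main() -> ort::Result<()> {
    // 0 intra-op threads = let ONNX Runtime pick based on available cores.
    let _session = Session::builder()?
        .with_intra_threads(0)?
        .commit_from_file("yolo11n.onnx")?;
    Ok(())
}
```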

- [x] Detection, Segmentation, Pose, Classification, OBB inference
- [x] ONNX model metadata parsing (auto-detect classes, task, imgsz)
- [x] Hardware acceleration support (CUDA, TensorRT, CoreML, OpenVINO, XNNPACK)
- [x] Ultralytics-compatible Results API (`Boxes`, `Masks`, `Keypoints`, `Probs`, `Obb`)
- [x] Multiple input sources (images, directories, globs, URLs)
- [x] Video file support and webcam/RTSP streaming
- [x] Image annotation and visualization
- [x] FP16 half-precision inference
- [x] Batch inference support
- [x] Rectangular inference support and optimization

### In Progress

- [ ] Python bindings (PyO3)
- [ ] Batch inference optimization
- [ ] WebAssembly (WASM) support for browser inference

## 💡 Contributing
35 changes: 22 additions & 13 deletions src/cli/args.rs
@@ -11,23 +11,27 @@
use clap::{Args, Parser, Subcommand};
--model, -m <MODEL> Path to ONNX model file [default: yolo11n.onnx]
--source, -s <SOURCE> Input source (image, directory, glob, video, webcam, or URL)
--conf <CONF> Confidence threshold [default: 0.25]
--iou <IOU> IoU threshold for NMS [default: 0.45]
--iou <IOU> IoU threshold for NMS [default: 0.7]
--max-det <MAX_DET> Maximum number of detections [default: 300]
--imgsz <IMGSZ> Inference image size
--rect Enable rectangular inference (minimal padding) [default: true]
--batch <BATCH> Batch size for inference [default: 1]
--half Use FP16 half-precision inference
--save Save annotated images to runs/<task>/predict
--save Save annotated images to runs/<task>/predict [default: true]
--save-frames Save individual frames for video input (instead of video file)
--show Display results in a window
--device <DEVICE> Device (cpu, cuda:0, mps, coreml, directml:0, openvino, xnnpack)
--verbose Show verbose output
--device <DEVICE> Device (cpu, cuda:0, mps, coreml, directml:0, openvino, tensorrt:0, xnnpack)
--verbose Show verbose output [default: true]

Examples:
ultralytics-inference predict
ultralytics-inference predict --model yolo11n.onnx --source image.jpg
ultralytics-inference predict --model yolo11n.onnx --source video.mp4
ultralytics-inference predict --model yolo11n.onnx --source video.mp4 --save-frames
ultralytics-inference predict --model yolo11n.onnx --source 0 --conf 0.5
ultralytics-inference predict -m yolo11n.onnx -s assets/ --save --half
ultralytics-inference predict -m yolo11n.onnx -s video.mp4 --imgsz 1280 --show
ultralytics-inference predict --model yolo11n.onnx --source image.jpg --device mps"#)]
ultralytics-inference predict --source video.mp4 --rect
ultralytics-inference predict --source video.mp4 --save-frames
ultralytics-inference predict --source 0 --conf 0.5 --show
ultralytics-inference predict --source assets/ --save --half
ultralytics-inference predict --source image.jpg --device cuda:0
ultralytics-inference predict --source image.jpg --device mps"#)]
pub struct Cli {
#[command(subcommand)]
/// Subcommand to execute.
@@ -58,7 +62,7 @@
pub struct PredictArgs {
pub conf: f32,

/// `IoU` threshold for NMS
#[arg(long, default_value_t = 0.45)]
#[arg(long, default_value_t = 0.7)]
pub iou: f32,

/// Maximum number of detections
@@ -69,6 +73,10 @@
#[arg(long)]
pub imgsz: Option<usize>,

/// Enable minimal padding (rectangular inference)
#[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)]
pub rect: bool,

/// Batch size for inference
#[arg(long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))]
pub batch: u32,
Expand All @@ -78,7 +86,7 @@ pub struct PredictArgs {
pub half: bool,

/// Save annotated images to runs/<task>/predict
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
#[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)]
pub save: bool,

/// Save individual frames for video input (instead of video file)
@@ -115,7 +123,8 @@
mod tests {
Commands::Predict(predict_args) => {
assert_eq!(predict_args.model, "yolo11n.onnx");
assert!((predict_args.conf - 0.25).abs() < f32::EPSILON);
assert!((predict_args.iou - 0.45).abs() < f32::EPSILON);
assert!((predict_args.iou - 0.7).abs() < f32::EPSILON);
assert!(predict_args.rect);
assert_eq!(predict_args.max_det, 300);
assert!(!predict_args.half);
assert!(predict_args.verbose);
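
The `--rect` and `--save` flags use a clap pattern worth noting: `ArgAction::Set` with `num_args = 0..=1` and a `default_missing_value` gives a boolean that defaults to true, accepts a bare `--rect`, and can be disabled with `--rect false`. A self-contained sketch of the same pattern:

```rust
use clap::Parser;

#[derive(Parser)]
struct Demo {
    /// Same shape as PredictArgs::rect: defaults to true, a bare flag keeps
    /// it true, and an explicit value can turn it off.
    #[arg(long, default_value_t = true, num_args = 0..=1,
          default_missing_value = "true", action = clap::ArgAction::Set)]
    rect: bool,
}

fn main() {
    assert!(Demo::parse_from(["demo"]).rect);                     // default
    assert!(Demo::parse_from(["demo", "--rect"]).rect);           // bare flag
    assert!(!Demo::parse_from(["demo", "--rect", "false"]).rect); // disabled
}
```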