diff --git a/.cargo/config.toml b/.cargo/config.toml index 1dd4bae..d8a47e3 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -7,3 +7,10 @@ PKG_CONFIG_PATH = "/opt/homebrew/opt/ffmpeg@7/lib/pkgconfig" CPATH = "/opt/homebrew/opt/ffmpeg@7/include" LIBRARY_PATH = "/opt/homebrew/opt/ffmpeg@7/lib" BINDGEN_EXTRA_CLANG_ARGS = "-I/opt/homebrew/opt/ffmpeg@7/include" + +# +# Linux: Configure RPATH to find shared libraries in the executable's directory +# This allows the binary to correspond to libonnxruntime*.so in the same folder +# without needing LD_LIBRARY_PATH set manually. +[target.x86_64-unknown-linux-gnu] +rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"] diff --git a/Cargo.lock b/Cargo.lock index e900895..ff32fcd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -51,6 +51,12 @@ dependencies = [ "equator", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.21" @@ -570,6 +576,12 @@ dependencies = [ "syn", ] +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.14" @@ -686,6 +698,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foreign-types" version = "0.3.2" @@ -948,6 +966,17 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + [[package]] name = "heck" version = "0.5.0" @@ -1322,6 +1351,15 @@ dependencies = [ "imgref", ] +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + [[package]] name = "lzma-rust2" version = "0.15.4" @@ -2594,7 +2632,7 @@ checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "ultralytics-inference" -version = "0.0.7" +version = "0.0.8" dependencies = [ "ab_glyph", "bytemuck", @@ -2608,6 +2646,7 @@ dependencies = [ "image", "imageproc", "jpeg-decoder", + "lru", "minifb", "ndarray 0.16.1", "ndarray 0.17.1", @@ -2617,6 +2656,7 @@ dependencies = [ "tempfile", "ureq", "video-rs", + "wide", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6d6765c..f8713b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "ultralytics-inference" -version = "0.0.7" +version = "0.0.8" edition = "2024" authors = [ "Glenn Jocher ", @@ -50,6 +50,12 @@ image = "^0.25" jpeg-decoder = "^0.3" fast_image_resize = { version = "^5.5", features = ["image", "rayon"] } +# SIMD for fast preprocessing +wide = "0.7" + +# LRU cache for preprocessing LUT +lru = "0.12" + # Numerical computing (must match ort's ndarray version 0.17) ndarray = { version = "^0.17", features = ["rayon"] } @@ -74,6 +80,7 @@ bytemuck = { version = "^1.21", features = ["derive"] } clap = { version = "4.5.54", features = ["derive"] } colored = "3.0.0" 
+# Optional - Visualization and Video support minifb = { version = "^0.28.0", optional = true } video-rs = { version = "^0.10.5", features = ["ndarray"], optional = true } @@ -151,7 +158,7 @@ opt-level = 3 [profile.release] opt-level = 3 -lto = true +lto = "fat" codegen-units = 1 panic = "abort" strip = true diff --git a/README.md b/README.md index 3bf4622..9c42f79 100644 --- a/README.md +++ b/README.md @@ -77,20 +77,26 @@ cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --show -- # Save individual frames for video input cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --save-frames + +# Rectangular inference +cargo run --release -- predict --model yolo11n.onnx --source image.jpg --rect ``` ### Example Output ``` +# ultralytics-inference predict + +WARNING ⚠️ 'model' argument is missing. Using default 'model=yolo11n.onnx'. WARNING ⚠️ 'source' argument is missing. Using default images: https://ultralytics.com/images/bus.jpg, https://ultralytics.com/images/zidane.jpg -Ultralytics 0.0.7 🚀 Rust ONNX FP32 CPU +Ultralytics 0.0.8 🚀 Rust ONNX FP32 CPU Using ONNX Runtime CPUExecutionProvider YOLO11n summary: 80 classes, imgsz=(640, 640) -image 1/2 bus.jpg: 640x640 3 persons, 1 bus, 57.3ms -image 2/2 zidane.jpg: 640x640 2 persons, 1 tie, 52.9ms -Speed: 75.8ms preprocess, 55.1ms inference, 19.9ms postprocess per image at shape (1, 3, 640, 640) -Results saved to runs/detect/predict53 +image 1/2 /home/ultralytics/inference/bus.jpg: 640x480 640x480 4 persons, 1 bus, 36.4ms +image 2/2 /home/ultralytics/inference/zidane.jpg: 384x640 2 persons, 1 tie, 28.6ms +Speed: 1.5ms preprocess, 32.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640) +Results saved to runs/detect/predict1 💡 Learn more at https://docs.ultralytics.com/modes/predict ``` @@ -117,8 +123,11 @@ cargo run --release -- predict --model --source | `--source` | `-s` | Input source (image, video, webcam index, or URL) | `Task dependent Ultralytics URL assets` | | `--device` | | Device to use (cpu, cuda:0, mps, coreml, etc.) | `cpu` | | `--conf` | | Confidence threshold | `0.25` | -| `--iou` | | IoU threshold for NMS | `0.45` | +| `--iou` | | IoU threshold for NMS | `0.7` | +| `--max-det` | | Maximum number of detections | `300` | | `--imgsz` | | Inference image size | `Model metadata` | +| `--rect` | | Enable rectangular inference (minimal padding) | `true` | +| `--batch` | | Batch size for inference | `1` | | `--half` | | Use FP16 half-precision inference | `false` | | `--save` | | Save annotated results to runs//predict | `true` | | `--save-frames` | | Save individual frames for video | `false` | @@ -183,7 +192,7 @@ fn main() -> Result<(), Box> { let config = InferenceConfig::new() .with_confidence(0.5) .with_iou(0.45) - .with_max_det(100); + .with_max_det(300); let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?; let results = model.predict("image.jpg")?; @@ -236,16 +245,25 @@ inference/ │ ├── main.rs # CLI application │ ├── model.rs # YOLOModel - ONNX session and inference │ ├── results.rs # Results, Boxes, Masks, Keypoints, Probs, Obb -│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize) -│ ├── postprocessing.rs # Detection post-processing (NMS, decode) +│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize, SIMD) +│ ├── postprocessing.rs # Detection post-processing (NMS, decode, SIMD) │ ├── metadata.rs # ONNX model metadata parsing -│ ├── source.rs # Input source handling -│ ├── task.rs # Task enum (Detect, Segment, Pose, etc.) 
+│ ├── source.rs # Input source handling (images, video, webcam) +│ ├── task.rs # Task enum (Detect, Segment, Pose, Classify, Obb) │ ├── inference.rs # InferenceConfig +│ ├── batch.rs # Batch processing pipeline +│ ├── device.rs # Device enum (CPU, CUDA, MPS, CoreML, etc.) │ ├── download.rs # Model and asset downloading -│ ├── visualizer/ # Visualization tools (Viewer) +│ ├── annotate.rs # Image annotation (bounding boxes, masks, keypoints) +│ ├── io.rs # Result saving (images, videos) +│ ├── logging.rs # Logging macros │ ├── error.rs # Error types -│ └── utils.rs # Utility functions (NMS, IoU) +│ ├── utils.rs # Utility functions (NMS, IoU) +│ ├── cli/ # CLI module +│ │ ├── mod.rs # CLI module exports +│ │ ├── args.rs # CLI argument parsing +│ │ └── predict.rs # Predict command implementation +│ └── visualizer/ # Real-time visualization (minifb) ├── tests/ │ └── integration_test.rs # Integration tests ├── assets/ # Test images @@ -300,13 +318,16 @@ One of the key benefits of this library is **minimal dependencies** - no PyTorch ### Core Dependencies (always included) -| Crate | Purpose | -| ------------------- | ----------------------- | -| `ort` | ONNX Runtime bindings | -| `ndarray` | N-dimensional arrays | -| `image` | Image loading/decoding | -| `fast_image_resize` | SIMD-optimized resizing | -| `half` | FP16 support | +| Crate | Purpose | +| ------------------- | ------------------------------- | +| `ort` | ONNX Runtime bindings | +| `ndarray` | N-dimensional arrays | +| `image` | Image loading/decoding | +| `jpeg-decoder` | JPEG decoding | +| `fast_image_resize` | SIMD-optimized resizing | +| `half` | FP16 support | +| `lru` | LRU cache for preprocessing LUT | +| `wide` | SIMD for fast preprocessing | ### Optional Dependencies (for `--save` feature) @@ -372,16 +393,18 @@ ONNX Runtime threading is set to auto (`num_threads: 0`) which lets ORT choose o - [x] Detection, Segmentation, Pose, Classification, OBB inference - [x] ONNX model metadata parsing (auto-detect classes, task, imgsz) +- [x] Hardware acceleration support (CUDA, TensorRT, CoreML, OpenVINO, XNNPACK) - [x] Ultralytics-compatible Results API (`Boxes`, `Masks`, `Keypoints`, `Probs`, `Obb`) - [x] Multiple input sources (images, directories, globs, URLs) - [x] Video file support and webcam/RTSP streaming - [x] Image annotation and visualization - [x] FP16 half-precision inference +- [x] Batch inference support +- [x] Rectangular inference support and optimization ### In Progress - [ ] Python bindings (PyO3) -- [ ] Batch inference optimization - [ ] WebAssembly (WASM) support for browser inference ## 💡 Contributing diff --git a/src/cli/args.rs b/src/cli/args.rs index 945d050..1d9a261 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -11,23 +11,27 @@ use clap::{Args, Parser, Subcommand}; --model, -m Path to ONNX model file [default: yolo11n.onnx] --source, -s Input source (image, directory, glob, video, webcam, or URL) --conf Confidence threshold [default: 0.25] - --iou IoU threshold for NMS [default: 0.45] + --iou IoU threshold for NMS [default: 0.7] + --max-det Maximum number of detections [default: 300] --imgsz Inference image size + --rect Enable rectangular inference (minimal padding) [default: true] + --batch Batch size for inference [default: 1] --half Use FP16 half-precision inference - --save Save annotated images to runs//predict + --save Save annotated images to runs//predict [default: true] --save-frames Save individual frames for video input (instead of video file) --show Display results in a window - --device 
Device (cpu, cuda:0, mps, coreml, directml:0, openvino, xnnpack) - --verbose Show verbose output + --device Device (cpu, cuda:0, mps, coreml, directml:0, openvino, tensorrt:0, xnnpack) + --verbose Show verbose output [default: true] Examples: + ultralytics-inference predict ultralytics-inference predict --model yolo11n.onnx --source image.jpg - ultralytics-inference predict --model yolo11n.onnx --source video.mp4 - ultralytics-inference predict --model yolo11n.onnx --source video.mp4 --save-frames - ultralytics-inference predict --model yolo11n.onnx --source 0 --conf 0.5 - ultralytics-inference predict -m yolo11n.onnx -s assets/ --save --half - ultralytics-inference predict -m yolo11n.onnx -s video.mp4 --imgsz 1280 --show - ultralytics-inference predict --model yolo11n.onnx --source image.jpg --device mps"#)] + ultralytics-inference predict --source video.mp4 --rect + ultralytics-inference predict --source video.mp4 --save-frames + ultralytics-inference predict --source 0 --conf 0.5 --show + ultralytics-inference predict --source assets/ --save --half + ultralytics-inference predict --source image.jpg --device cuda:0 + ultralytics-inference predict --source image.jpg --device mps"#)] pub struct Cli { #[command(subcommand)] /// Subcommand to execute. @@ -58,7 +62,7 @@ pub struct PredictArgs { pub conf: f32, /// `IoU` threshold for NMS - #[arg(long, default_value_t = 0.45)] + #[arg(long, default_value_t = 0.7)] pub iou: f32, /// Maximum number of detections @@ -69,6 +73,10 @@ pub struct PredictArgs { #[arg(long)] pub imgsz: Option, + /// Enable minimal padding (rectangular inference) + #[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)] + pub rect: bool, + /// Batch size for inference #[arg(long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))] pub batch: u32, @@ -78,7 +86,7 @@ pub struct PredictArgs { pub half: bool, /// Save annotated images to runs//predict - #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + #[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)] pub save: bool, /// Save individual frames for video input (instead of video file) @@ -115,7 +123,8 @@ mod tests { Commands::Predict(predict_args) => { assert_eq!(predict_args.model, "yolo11n.onnx"); assert!((predict_args.conf - 0.25).abs() < f32::EPSILON); - assert!((predict_args.iou - 0.45).abs() < f32::EPSILON); + assert!((predict_args.iou - 0.7).abs() < f32::EPSILON); + assert!(predict_args.rect); assert_eq!(predict_args.max_det, 300); assert!(!predict_args.half); assert!(predict_args.verbose); diff --git a/src/cli/predict.rs b/src/cli/predict.rs index eb43c80..6559211 100644 --- a/src/cli/predict.rs +++ b/src/cli/predict.rs @@ -14,6 +14,8 @@ use crate::annotate::{annotate_image, find_next_run_dir}; #[cfg(feature = "visualize")] use crate::visualizer::Viewer; +#[cfg(feature = "visualize")] +use image::GenericImageView; use crate::utils::pluralize; use crate::{InferenceConfig, Results, VERSION, YOLOModel}; @@ -29,7 +31,8 @@ use crate::{error, verbose, warn}; clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss, - clippy::missing_panics_doc + clippy::missing_panics_doc, + clippy::redundant_clone )] pub fn run_prediction(args: &PredictArgs) { // Parse arguments @@ -49,9 +52,14 @@ pub fn run_prediction(args: &PredictArgs) { .map(|d| d.parse().expect("Invalid device")); #[cfg(feature = "visualize")] let show = args.show; - // 
Use defaults with warnings if not specified - // Clap handles default model path, so model_path is always set. + // Warn if using default model (like Python does) + if model_path == crate::download::DEFAULT_MODEL && verbose { + warn!( + "'model' argument is missing. Using default '--model={}'.", + crate::download::DEFAULT_MODEL + ); + } // Load model first so we can determine appropriate default source based on task let mut config = InferenceConfig::new() @@ -59,8 +67,9 @@ pub fn run_prediction(args: &PredictArgs) { .with_iou(iou_threshold) .with_half(half) .with_batch(batch_size) - .with_max_det(args.max_det) - .with_save_frames(save_frames); + .with_save_frames(save_frames) + .with_rect(args.rect) + .with_max_det(args.max_det); // Apply imgsz if specified if let Some(sz) = imgsz { @@ -181,14 +190,6 @@ pub fn run_prediction(args: &PredictArgs) { process::exit(1); } - let iter = match crate::source::SourceIterator::new(source) { - Ok(iter) => iter, - Err(e) => { - error!("Error initializing source: {e}"); - process::exit(1); - } - }; - // Process each image/frame let mut all_results: Vec<(String, Results)> = Vec::new(); let mut total_preprocess = 0.0; @@ -207,6 +208,29 @@ pub fn run_prediction(args: &PredictArgs) { #[cfg(not(feature = "annotate"))] let mut result_saver: Option = None; + // Create a bounded channel for pipelined processing + // Buffer size 2x batch size ensures we can decode the next batch while processing current one + let channel_capacity = batch_size * 2; + let (sender, receiver) = std::sync::mpsc::sync_channel(channel_capacity); + + // Spawn producer thread for frame decoding + let source_clone = source.clone(); + std::thread::spawn(move || { + let iter = match crate::source::SourceIterator::new(source_clone) { + Ok(iter) => iter, + Err(e) => { + error!("Error initializing source in thread: {e}"); + return; + } + }; + + for item in iter { + if sender.send(item).is_err() { + break; // Receiver dropped, stop decoding + } + } + }); + // Use BatchProcessor for centralized batch management { let mut batch_processor = BatchProcessor::new( @@ -240,8 +264,8 @@ pub fn run_prediction(args: &PredictArgs) { meta.frame_idx + 1, total_frames_str, image_path, - inference_shape.1, inference_shape.0, + inference_shape.1, detection_summary, result.speed.inference.unwrap_or(0.0) ); @@ -251,8 +275,8 @@ pub fn run_prediction(args: &PredictArgs) { meta.frame_idx + 1, total_frames_str, image_path, - inference_shape.1, inference_shape.0, + inference_shape.1, detection_summary, result.speed.inference.unwrap_or(0.0) ); @@ -272,8 +296,9 @@ pub fn run_prediction(args: &PredictArgs) { #[cfg(feature = "visualize")] if show { - let view_width = inference_shape.1 as usize; - let view_height = inference_shape.0 as usize; + let (orig_w, orig_h) = img.dimensions(); + let view_width = orig_w as usize; + let view_height = orig_h as usize; if let Some(ref v) = viewer && (v.width != view_width || v.height != view_height) @@ -290,14 +315,13 @@ pub fn run_prediction(args: &PredictArgs) { if let Some(ref mut v) = viewer { let annotated = annotate_image(img, &result, None); - let resized = annotated.resize_exact( - view_width as u32, - view_height as u32, - image::imageops::FilterType::Triangle, - ); - if v.update(&resized).is_ok() && !is_video { - let _ = v.wait(Duration::from_millis(200)); + if v.update(&annotated).is_ok() { + // Main thread is blocking on channel, so visualizer wait is less critical + // but we keep a small wait to allow window events processing + if !is_video { + let _ = 
v.wait(Duration::from_millis(200)); + } } } } @@ -305,14 +329,14 @@ pub fn run_prediction(args: &PredictArgs) { total_preprocess += result.speed.preprocess.unwrap_or(0.0); total_inference += result.speed.inference.unwrap_or(0.0); total_postprocess += result.speed.postprocess.unwrap_or(0.0); - all_results.push((image_path.clone(), result)); } } }, ); - for item in iter { + // Main thread: consume frames from channel and run inference + for item in receiver { let (img, meta) = match item { Ok(val) => val, Err(e) => { @@ -335,10 +359,11 @@ pub fn run_prediction(args: &PredictArgs) { // Print speed summary with inference tensor shape (after letterboxing) let num_results = all_results.len().max(1) as f64; verbose!( - "Speed: {:.1}ms preprocess, {:.1}ms inference, {:.1}ms postprocess per image at shape (1, 3, {}, {})", + "Speed: {:.1}ms preprocess, {:.1}ms inference, {:.1}ms postprocess per image at shape ({}, 3, {}, {})", total_preprocess / num_results, total_inference / num_results, total_postprocess / num_results, + batch_size, last_inference_shape.0, last_inference_shape.1 ); diff --git a/src/download.rs b/src/download.rs index f0db10d..0f7654b 100644 --- a/src/download.rs +++ b/src/download.rs @@ -382,17 +382,31 @@ pub fn download_image(url: &str) -> Result { let filename = url.rsplit('/').next().unwrap_or("image.jpg"); let dest_path = Path::new(filename); + // Get absolute path for display consistency with Python + let abs_path = dest_path + .canonicalize() + .or_else(|_| std::env::current_dir().map(|p| p.join(filename))) + .map_or_else( + |_| filename.to_string(), + |p| p.to_string_lossy().to_string(), + ); + // Skip download if file already exists if dest_path.exists() { - eprintln!("Image already exists: {filename}"); - return Ok(filename.to_string()); + return Ok(abs_path); } eprintln!("Downloading {url}..."); download_file(url, dest_path)?; - Ok(filename.to_string()) + // Get absolute path after download + let abs_path = dest_path.canonicalize().map_or_else( + |_| filename.to_string(), + |p| p.to_string_lossy().to_string(), + ); + + Ok(abs_path) } /// Download multiple images from URLs to the current directory. diff --git a/src/inference.rs b/src/inference.rs index 0db5033..142ebb9 100644 --- a/src/inference.rs +++ b/src/inference.rs @@ -20,7 +20,7 @@ /// let config = InferenceConfig::new() /// .with_confidence(0.5) /// .with_iou(0.45) -/// .with_max_det(100) +/// .with_max_det(300) /// .with_imgsz(640, 640); /// ``` /// @@ -33,6 +33,7 @@ /// .with_device(Device::Cuda(0)); /// ``` #[derive(Debug, Clone)] +#[allow(clippy::struct_excessive_bools)] pub struct InferenceConfig { /// Confidence threshold for detections (0.0 to 1.0). /// Detections with confidence scores lower than this value will be discarded. @@ -65,6 +66,9 @@ pub struct InferenceConfig { /// Whether to save individual frames instead of a video file when input is video. /// Defaults to `false` (save as video). pub save_frames: bool, + /// Whether to use minimal padding (rectangular inference). + /// Defaults to `true` to match Ultralytics Python. + pub rect: bool, } impl Default for InferenceConfig { @@ -80,6 +84,7 @@ impl Default for InferenceConfig { device: None, save: true, save_frames: false, + rect: true, } } } @@ -267,6 +272,21 @@ impl InferenceConfig { self.save_frames = save_frames; self } + + /// Set whether to use minimal padding (rectangular inference). + /// + /// # Arguments + /// + /// * `rect` - `true` to enable, `false` to disable. + /// + /// # Returns + /// + /// * The modified `InferenceConfig`. 
+ #[must_use] + pub const fn with_rect(mut self, rect: bool) -> Self { + self.rect = rect; + self + } } #[cfg(test)] @@ -286,13 +306,13 @@ mod tests { let config = InferenceConfig::new() .with_confidence(0.5) .with_iou(0.6) - .with_max_det(100) + .with_max_det(300) .with_imgsz(640, 640) .with_threads(8); assert!((config.confidence_threshold - 0.5).abs() < f32::EPSILON); assert!((config.iou_threshold - 0.6).abs() < f32::EPSILON); - assert_eq!(config.max_det, 100); + assert_eq!(config.max_det, 300); assert_eq!(config.imgsz, Some((640, 640))); assert_eq!(config.num_threads, 8); } diff --git a/src/lib.rs b/src/lib.rs index 8210a16..62fdcd2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -172,7 +172,7 @@ //! let config = InferenceConfig::new() //! .with_confidence(0.5) // Confidence threshold //! .with_iou(0.45) // NMS IoU threshold -//! .with_max_det(100) // Max detections per image +//! .with_max_det(300) // Max detections per image //! .with_imgsz(640, 640); // Input image size //! ``` //! diff --git a/src/main.rs b/src/main.rs index 6d9aed9..0ce9b0d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,9 +25,14 @@ use ultralytics_inference::cli::predict::run_prediction; use ultralytics_inference::logging::set_verbose; /// Entry point for the Ultralytics YOLO Inference CLI. -fn main() { +#[allow(clippy::unnecessary_wraps)] +fn main() -> Result<(), Box> { ultralytics_inference::io::init_logging(); + // Initialize ONNX Runtime with verbose logging to debug execution provider issues + #[cfg(debug_assertions)] + let _ = ort::init().commit(); + let cli = Cli::parse(); match &cli.command { @@ -36,4 +41,5 @@ fn main() { run_prediction(args); } } + Ok(()) } diff --git a/src/metadata.rs b/src/metadata.rs index cd0a2c8..9926b3b 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -91,7 +91,6 @@ impl ModelMetadata { /// /// Returns an error if the YAML is malformed or missing required fields. pub fn from_yaml_str(yaml_str: &str) -> Result { - // Parse YAML manually to avoid serde_yaml dependency complexity let mut metadata = Self::default(); for line in yaml_str.lines() { diff --git a/src/model.rs b/src/model.rs index 598935b..1751165 100644 --- a/src/model.rs +++ b/src/model.rs @@ -10,7 +10,7 @@ use std::path::Path; use std::time::Instant; use half::f16; -use image::DynamicImage; +use image::{DynamicImage, GenericImageView}; use ndarray::Array3; use ort::session::Session; use ort::tensor::TensorElementType; @@ -23,7 +23,8 @@ use crate::inference::InferenceConfig; use crate::metadata::ModelMetadata; use crate::postprocessing::postprocess; use crate::preprocessing::{ - image_to_array, preprocess_image_center_crop, preprocess_image_with_precision, + calculate_rect_size, image_to_array, preprocess_image_center_crop, + preprocess_image_with_precision, }; use crate::results::{Results, Speed}; use crate::task::Task; @@ -60,6 +61,8 @@ pub struct YOLOModel { fp16_input: bool, /// Execution provider used for inference execution_provider: String, + /// Whether the model accepts dynamic input shapes. + is_dynamic: bool, } #[allow( @@ -67,7 +70,12 @@ pub struct YOLOModel { clippy::needless_pass_by_value, clippy::missing_errors_doc, clippy::missing_panics_doc, - clippy::cast_possible_truncation + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::if_not_else, + clippy::manual_is_multiple_of, + clippy::cast_sign_loss, + clippy::cast_precision_loss )] impl YOLOModel { /// Load a YOLO model from an ONNX file. 
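For context, a minimal usage sketch of the new `with_rect` toggle, composed with the builder methods from the README's library example; the import path, error type, and file names mirror that example, and the snippet is illustrative only.

```rust
use ultralytics_inference::{InferenceConfig, YOLOModel};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Rectangular inference defaults to true; shown explicitly for clarity.
    let config = InferenceConfig::new()
        .with_confidence(0.5)
        .with_iou(0.7)
        .with_max_det(300)
        .with_rect(true);

    let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?;
    let _results = model.predict("image.jpg")?;
    Ok(())
}
```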
@@ -266,10 +274,16 @@ impl YOLOModel { } if !eps.is_empty() { + crate::info!( + "Registering {} execution providers (primary: {})", + eps.len(), + provider_name + ); session_builder = session_builder.with_execution_providers(eps).map_err(|e| { InferenceError::ModelLoadError(format!("Failed to set execution providers: {e}")) })?; } + // CPU is the default - no warning needed when no accelerators are registered let session = session_builder .with_optimization_level(ort::session::builder::GraphOptimizationLevel::Level3) @@ -381,6 +395,7 @@ impl YOLOModel { warmed_up: false, fp16_input, execution_provider: provider_name.to_string(), + is_dynamic, }; // Warmup inference to trigger JIT compilation and memory allocation @@ -589,18 +604,60 @@ impl YOLOModel { .or(self.metadata.imgsz) .unwrap_or((640, 640)); + // Check if target_size is divisible by stride (one-time warning logic per batch call) + // We only warn if the configured size itself is not divisible. + // If rect adjusts it, that's expected. + let stride = self.metadata.stride as usize; + if target_size.0 % stride != 0 || target_size.1 % stride != 0 { + warn!( + "WARNING ⚠️ imgsz=[{:?}] must be multiple of max stride {}, updating to [{}, {}]", + target_size, + stride, + (target_size.0 as f32 / stride as f32).ceil() as usize * stride, + (target_size.1 as f32 / stride as f32).ceil() as usize * stride + ); + } + // Preprocess all images let start_preprocess = Instant::now(); let mut preprocessed_results = Vec::with_capacity(images.len()); + // Check if we can use rect inference + // 1. Enabled in config + // 2. Model supports dynamic shapes + // 3. Batch is homogeneous (all images have same dimensions) or batch size is 1 + let use_rect = self.config.rect && self.is_dynamic; + let uniform_shape = if images.len() > 1 { + let first_dims = images[0].dimensions(); + images.iter().all(|img| img.dimensions() == first_dims) + } else { + true + }; + let actual_rect = use_rect && uniform_shape; + + // Warn if rect requested but disabled due to mixed batch + if self.config.rect && !uniform_shape { + warn!( + "Batch contains images of different sizes. Rectangular inference disabled for this batch (falling back to square padding)." 
+ ); + } + // We will stack tensors later for image in images { + // Determine target size for this image + let current_target_size = if actual_rect { + let (w, h) = image.dimensions(); + calculate_rect_size(w, h, target_size, self.metadata.stride) + } else { + target_size + }; + let res = if self.metadata.task == Task::Classify { - preprocess_image_center_crop(image, target_size, self.fp16_input) + preprocess_image_center_crop(image, current_target_size, self.fp16_input) } else { preprocess_image_with_precision( image, - target_size, + current_target_size, self.metadata.stride, self.fp16_input, ) @@ -623,6 +680,7 @@ impl YOLOModel { .view(), ); } + // Concatenate along batch dimension (axis 0) let batch_tensor = ndarray::concatenate(ndarray::Axis(0), &arrays).map_err(|e| { InferenceError::InferenceError(format!("Failed to concatenate FP16 tensors: {e}")) @@ -658,52 +716,48 @@ impl YOLOModel { #[allow(clippy::cast_precision_loss)] let inference_time = start_inference.elapsed().as_secs_f64() * 1000.0 / images.len() as f64; + // Process each image's output + let mut image_arrays = Vec::with_capacity(images.len()); + for image in images { + image_arrays.push(image_to_array(image)); + } + // Post-process let start_postprocess = Instant::now(); let mut batch_results = Vec::with_capacity(images.len()); // Process each image's output - for (i, image) in images.iter().enumerate() { + for (i, (orig_img, preprocess_res)) in image_arrays + .into_iter() + .zip(preprocessed_results.into_iter()) + .enumerate() + { + let path = paths.get(i).cloned().unwrap_or_default(); + let speed = Speed::new(preprocess_time, inference_time, 0.0); + // Construct outputs for this single image let mut img_outputs = Vec::new(); for (data, shape) in &outputs { - // Calculate size of one image's output let batch_size = shape[0]; let actual_batch_size = if batch_size > 0 { batch_size } else { 1 }; - let total_elements = data.len(); let elements_per_img = total_elements / actual_batch_size; - let start = i * elements_per_img; let end = start + elements_per_img; - - if start >= total_elements || end > total_elements { - return Err(InferenceError::InferenceError(format!( - "Index out of bounds slicing output data: range {start}..{end} with length {total_elements}" - ))); - } - let img_data = data[start..end].to_vec(); - - // Adjust shape for single image: [1, ...] + let img_data = &data[start..end]; let mut img_shape = shape.clone(); img_shape[0] = 1; - img_outputs.push((img_data, img_shape)); } - let orig_img = image_to_array(image); - let path = paths.get(i).cloned().unwrap_or_default(); - - let speed = Speed::new(preprocess_time, inference_time, 0.0); - - let tensor_shape = preprocessed_results[i].tensor.shape(); + let tensor_shape = preprocess_res.tensor.shape(); let inference_shape = (tensor_shape[2] as u32, tensor_shape[3] as u32); let result = postprocess( img_outputs, self.metadata.task, - &preprocessed_results[i], + &preprocess_res, &self.config, &self.metadata.names, orig_img, @@ -711,6 +765,7 @@ impl YOLOModel { speed, inference_shape, ); + batch_results.push(vec![result]); } diff --git a/src/postprocessing.rs b/src/postprocessing.rs index 1b6d816..479e14b 100644 --- a/src/postprocessing.rs +++ b/src/postprocessing.rs @@ -5,11 +5,23 @@ //! This module handles task-specific post-processing of raw model outputs, //! including NMS, coordinate transformation, and result construction. 
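As a reference for the SIMD NMS introduced further down in this file, a scalar sketch of the per-pair test each `f32x8` lane performs; the helper name is hypothetical, and the formula mirrors the scalar tail of the new suppression loop.

```rust
/// Scalar IoU between two xyxy boxes; a box whose IoU with an already-kept,
/// same-class box exceeds the threshold is suppressed, as in the vectorized loop.
fn iou_xyxy(a: [f32; 4], b: [f32; 4]) -> f32 {
    let iw = (a[2].min(b[2]) - a[0].max(b[0])).max(0.0);
    let ih = (a[3].min(b[3]) - a[1].max(b[1])).max(0.0);
    let inter = iw * ih;
    let area_a = (a[2] - a[0]) * (a[3] - a[1]);
    let area_b = (b[2] - b[0]) * (b[3] - b[1]);
    inter / (area_a + area_b - inter)
}
```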
+#![allow( + unsafe_code, + clippy::doc_markdown, + clippy::too_many_lines, + clippy::if_not_else, + clippy::ptr_as_ptr, + clippy::cast_possible_truncation, + clippy::cast_sign_loss +)] + use std::collections::HashMap; +use wide::{CmpGt, f32x8}; + use fast_image_resize::images::Image; use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer}; -use ndarray::{Array2, Array3, ArrayView2, Zip, s}; +use ndarray::{Array2, Array3, ArrayView1, ArrayViewMut2, Zip, s}; use crate::inference::InferenceConfig; use crate::preprocessing::{PreprocessResult, clip_coords, scale_coords}; @@ -41,7 +53,7 @@ use crate::utils::{nms_per_class, nms_rotated_per_class}; clippy::implicit_hasher )] pub fn postprocess( - outputs: Vec<(Vec, Vec)>, + outputs: Vec<(&[f32], Vec)>, task: Task, preprocess: &PreprocessResult, config: &InferenceConfig, @@ -113,23 +125,7 @@ pub fn postprocess( /// Post-process detection model output. /// -/// Converts raw YOLO model output into a list of bounding boxes with class scores. -/// -/// # Arguments -/// -/// * `output` - Flat vector of model output values. -/// * `output_shape` - Shape of the output tensor. -/// * `preprocess` - Preprocessing metadata (scaling, padding). -/// * `config` - Inference configuration (thresholds). -/// * `names` - Class ID to name mapping. -/// * `orig_img` - Original image data. -/// * `path` - Source image path. -/// * `speed` - Timing metrics. -/// * `inference_shape` - Input shape used for inference. -/// -/// # Returns -/// -/// `Results` struct containing detected bounding boxes. +/// Zero-copy implementation using stride-based indexing to avoid memory allocations. #[allow( clippy::too_many_arguments, clippy::similar_names, @@ -156,24 +152,15 @@ fn postprocess_detect( return results; } - // Convert flat output to 2D array - let output_2d = if is_transposed { - // Shape is [1, num_preds, num_features] - already in correct format - Array2::from_shape_vec((num_predictions, 4 + num_classes), output.to_vec()) - .unwrap_or_else(|_| Array2::zeros((0, 0))) - } else { - // Shape is [1, num_features, num_preds] - need to transpose - let arr = Array2::from_shape_vec((4 + num_classes, num_predictions), output.to_vec()) - .unwrap_or_else(|_| Array2::zeros((0, 0))); - arr.t().to_owned() - }; - - if output_2d.is_empty() { - return results; - } - - // Extract boxes and scores - let boxes_data = extract_detect_boxes(output_2d.view(), num_classes, preprocess, config); + // Zero-copy extraction with stride-based indexing + let boxes_data = extract_detect_boxes( + output, + num_classes, + num_predictions, + is_transposed, + preprocess, + config, + ); if !boxes_data.is_empty() { results.boxes = Some(Boxes::new(boxes_data, preprocess.orig_shape)); @@ -241,93 +228,261 @@ fn parse_detect_shape(shape: &[usize], expected_classes: usize) -> (usize, usize } } -/// Extract detection boxes from model output. +/// Ultra-fast detection extraction - single-threaded tight loop. +/// +/// Key optimizations: +/// - No parallelization overhead (Rayon adds ~0.5ms for small workloads) +/// - Pre-sized allocations +/// - Minimal branching in hot loops +/// - Direct unsafe indexing +#[allow(clippy::cast_precision_loss, clippy::too_many_arguments)] +#[derive(Clone, Copy)] +struct Candidate { + bbox: [f32; 4], + score: f32, + class: usize, +} + +/// Optimized detection extraction with SIMD acceleration. /// -/// Filters predictions by confidence threshold and converts coordinates to original image space. 
-#[allow(clippy::cast_precision_loss, clippy::needless_pass_by_value)] +/// Key optimizations: +/// - SIMD-accelerated candidate extraction (f32x8) +/// - Parallel Bitmask NMS (IoU 1 vs 8) +/// - Struct-of-Arrays (SoA) layout for NMS cache locality +/// - Direct unsafe indexing for performance +#[allow(clippy::cast_precision_loss, clippy::too_many_arguments)] fn extract_detect_boxes( - output: ArrayView2, - _num_classes: usize, + output: &[f32], + num_classes: usize, + num_predictions: usize, + is_transposed: bool, preprocess: &PreprocessResult, config: &InferenceConfig, ) -> Array2 { - let _num_predictions = output.nrows(); - let mut candidates = Vec::new(); - - // Iterate over rows efficiently - // output shape is (num_predictions, 4 + num_classes) - // We can iterate over raw elements if we are careful, but using outer_iter() is safer and still fast - - // Pre-calculate scaling factors to avoid repeated struct access + let feat_count = 4 + num_classes; let (scale_y, scale_x) = preprocess.scale; let (pad_top, pad_left) = preprocess.padding; let orig_shape = preprocess.orig_shape; let (max_w, max_h) = (orig_shape.1 as f32, orig_shape.0 as f32); + let conf_thresh = config.confidence_threshold; + let max_det = config.max_det; + let iou_thresh = config.iou_threshold; + let conf_v = f32x8::splat(conf_thresh); + + let mut candidates: Vec = Vec::with_capacity(256); + + // Candidate Extraction + if !is_transposed { + // Layout [feat, pred] - Cache-friendly linear scan + let mut max_scores = vec![conf_thresh; num_predictions]; + let mut max_classes = vec![0usize; num_predictions]; + + for c in 0..num_classes { + let offset = (4 + c) * num_predictions; + let class_scores = &output[offset..offset + num_predictions]; + for (idx, &score) in class_scores.iter().enumerate() { + if score > max_scores[idx] { + max_scores[idx] = score; + max_classes[idx] = c; + } + } + } - for row in output.outer_iter() { - // Row is [cx, cy, w, h, class_scores...] - - // Efficiently find the best class score without allocating a new slice or iterator chain. - // We skip low-confidence detections early to avoid expensive coordinate scaling and NMS operations later. 
- let scores = row.slice(s![4..]); - - // Find best class manually to avoid iterator overhead - let (best_class, best_score) = - scores - .iter() - .enumerate() - .fold((0, 0.0f32), |(best_idx, best_val), (idx, &val)| { - if val > best_val { - (idx, val) - } else { - (best_idx, best_val) - } + for (idx, &score) in max_scores.iter().enumerate() { + if score > conf_thresh { + let cx = unsafe { *output.get_unchecked(idx) }; + let cy = unsafe { *output.get_unchecked(num_predictions + idx) }; + let w = unsafe { *output.get_unchecked(2 * num_predictions + idx) }; + let h = unsafe { *output.get_unchecked(3 * num_predictions + idx) }; + + let x1 = (cx - w * 0.5 - pad_left) / scale_x; + let y1 = (cy - h * 0.5 - pad_top) / scale_y; + let x2 = (cx + w * 0.5 - pad_left) / scale_x; + let y2 = (cy + h * 0.5 - pad_top) / scale_y; + + candidates.push(Candidate { + bbox: [x1, y1, x2, y2], + score, + class: max_classes[idx], }); - - // Filter by confidence threshold early to reduce computation - if best_score < config.confidence_threshold { - continue; + } } + } else { + // Layout [pred, feat] - Process 8 classes at once + for idx in 0..num_predictions { + let base = idx * feat_count; + let row_ptr = unsafe { output.as_ptr().add(base + 4) }; + let mut best_score = conf_thresh; + let mut best_class = 0; + let mut found = false; + + for c_idx in (0..num_classes).step_by(8) { + if num_classes - c_idx >= 8 { + let scores: f32x8 = + unsafe { (row_ptr.add(c_idx) as *const f32x8).read_unaligned() }; + if scores.cmp_gt(conf_v).any() { + for i in 0..8 { + let s = unsafe { *row_ptr.add(c_idx + i) }; + if s > best_score { + best_score = s; + best_class = c_idx + i; + found = true; + } + } + } + } else { + for i in c_idx..num_classes { + let s = unsafe { *row_ptr.add(i) }; + if s > best_score { + best_score = s; + best_class = i; + found = true; + } + } + } + } - // Extract coordinates only for candidate detections - let cx = row[0]; - let cy = row[1]; - let w = row[2]; - let h = row[3]; - - let x1 = (cx - w / 2.0 - pad_left) / scale_x; - let y1 = (cy - h / 2.0 - pad_top) / scale_y; - let x2 = (cx + w / 2.0 - pad_left) / scale_x; - let y2 = (cy + h / 2.0 - pad_top) / scale_y; - - // Clip (clamp) - let x1 = x1.clamp(0.0, max_w); - let y1 = y1.clamp(0.0, max_h); - let x2 = x2.clamp(0.0, max_w); - let y2 = y2.clamp(0.0, max_h); - - candidates.push(([x1, y1, x2, y2], best_score, best_class)); + if found { + let cx = unsafe { *output.get_unchecked(base) }; + let cy = unsafe { *output.get_unchecked(base + 1) }; + let w = unsafe { *output.get_unchecked(base + 2) }; + let h = unsafe { *output.get_unchecked(base + 3) }; + + let x1 = (cx - w * 0.5 - pad_left) / scale_x; + let y1 = (cy - h * 0.5 - pad_top) / scale_y; + let x2 = (cx + w * 0.5 - pad_left) / scale_x; + let y2 = (cy + h * 0.5 - pad_top) / scale_y; + + candidates.push(Candidate { + bbox: [x1, y1, x2, y2], + score: best_score, + class: best_class, + }); + } + } } if candidates.is_empty() { return Array2::zeros((0, 6)); } - // Apply per-class NMS (only suppress boxes within the same class) - let keep_indices = nms_per_class(&candidates, config.iou_threshold); + // Top-K Selection & Sort + let nms_limit = (max_det * 10).min(candidates.len()); + if candidates.len() > nms_limit { + candidates.select_nth_unstable_by(nms_limit, |a, b| b.score.partial_cmp(&a.score).unwrap()); + candidates.truncate(nms_limit); + } + candidates.sort_unstable_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); + + // Population of SoA for NMS (small copy, very fast) + let n = candidates.len(); + 
let mut x1 = Vec::with_capacity(n); + let mut y1 = Vec::with_capacity(n); + let mut x2 = Vec::with_capacity(n); + let mut y2 = Vec::with_capacity(n); + let mut areas = Vec::with_capacity(n); + + for c in &candidates { + x1.push(c.bbox[0]); + y1.push(c.bbox[1]); + x2.push(c.bbox[2]); + y2.push(c.bbox[3]); + areas.push((c.bbox[2] - c.bbox[0]) * (c.bbox[3] - c.bbox[1])); + } + let mut suppressed = vec![false; n]; + let mut keep = Vec::with_capacity(max_det); + let iou_v = f32x8::splat(iou_thresh); // Build output array with kept detections - let num_kept = keep_indices.len().min(config.max_det); - let mut result = Array2::zeros((num_kept, 6)); + // let num_kept = keep_indices.len().min(config.max_det); + // let mut result = Array2::zeros((num_kept, 6)); - for (out_idx, &keep_idx) in keep_indices.iter().take(num_kept).enumerate() { - let (bbox, score, class) = &candidates[keep_idx]; - result[[out_idx, 0]] = bbox[0]; - result[[out_idx, 1]] = bbox[1]; - result[[out_idx, 2]] = bbox[2]; - result[[out_idx, 3]] = bbox[3]; - result[[out_idx, 4]] = *score; - result[[out_idx, 5]] = *class as f32; + for i in 0..n { + if suppressed[i] { + continue; + } + keep.push(i); + if keep.len() >= max_det { + break; + } + + let ax1 = f32x8::splat(x1[i]); + let ay1 = f32x8::splat(y1[i]); + let ax2 = f32x8::splat(x2[i]); + let ay2 = f32x8::splat(y2[i]); + let aa = f32x8::splat(areas[i]); + let ac = candidates[i].class; + + let mut j = i + 1; + while j < n { + if n - j >= 8 { + // Inline fast class and suppression check + let mut chunk_needs_processing = false; + for k in 0..8 { + if candidates[j + k].class == ac && !suppressed[j + k] { + chunk_needs_processing = true; + break; + } + } + + if chunk_needs_processing { + let bx1 = unsafe { (x1.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let by1 = unsafe { (y1.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let bx2 = unsafe { (x2.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let by2 = unsafe { (y2.as_ptr().add(j) as *const f32x8).read_unaligned() }; + let ba = unsafe { (areas.as_ptr().add(j) as *const f32x8).read_unaligned() }; + + let ix1 = ax1.max(bx1); + let iy1 = ay1.max(by1); + let ix2 = ax2.min(bx2); + let iy2 = ay2.min(by2); + + let iw = (ix2 - ix1).max(f32x8::ZERO); + let ih = (iy2 - iy1).max(f32x8::ZERO); + let ia = iw * ih; + let iou = ia / (aa + ba - ia); + + let mask = iou.cmp_gt(iou_v).move_mask() as u8; + if mask != 0 { + for k in 0..8 { + if (mask & (1 << k)) != 0 && candidates[j + k].class == ac { + suppressed[j + k] = true; + } + } + } + } + j += 8; + } else { + for k in j..n { + if !suppressed[k] && candidates[k].class == ac { + let ix1 = x1[i].max(x1[k]); + let iy1 = y1[i].max(y1[k]); + let ix2 = x2[i].min(x2[k]); + let iy2 = y2[i].min(y2[k]); + let iw = (ix2 - ix1).max(0.0); + let ih = (iy2 - iy1).max(0.0); + let ia = iw * ih; + let iou = ia / (areas[i] + areas[k] - ia); + if iou > iou_thresh { + suppressed[k] = true; + } + } + } + break; + } + } + } + // Result Construction + let num_kept = keep.len(); + let mut result = Array2::zeros((num_kept, 6)); + for (out_idx, &idx) in keep.iter().enumerate() { + let c = &candidates[idx]; + result[[out_idx, 0]] = c.bbox[0].clamp(0.0, max_w); + result[[out_idx, 1]] = c.bbox[1].clamp(0.0, max_h); + result[[out_idx, 2]] = c.bbox[2].clamp(0.0, max_w); + result[[out_idx, 3]] = c.bbox[3].clamp(0.0, max_h); + result[[out_idx, 4]] = c.score; + result[[out_idx, 5]] = c.class as f32; } result @@ -361,7 +516,7 @@ fn extract_detect_boxes( clippy::cast_possible_truncation )] fn 
postprocess_segment( - outputs: Vec<(Vec, Vec)>, + outputs: Vec<(&[f32], Vec)>, preprocess: &PreprocessResult, config: &InferenceConfig, names: &HashMap, @@ -413,10 +568,10 @@ fn postprocess_segment( // Convert to 2D [preds, features] let output_2d = if is_transposed { - Array2::from_shape_vec((num_preds, expected_features), output0.clone()) + Array2::from_shape_vec((num_preds, expected_features), output0.to_vec()) .unwrap_or_else(|_| Array2::zeros((0, 0))) } else { - let arr = Array2::from_shape_vec((expected_features, num_preds), output0.clone()) + let arr = Array2::from_shape_vec((expected_features, num_preds), output0.to_vec()) .unwrap_or_else(|_| Array2::zeros((0, 0))); arr.t().to_owned() }; @@ -429,7 +584,7 @@ fn postprocess_segment( let (best_class, best_score) = scores .iter() .enumerate() - .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .max_by(|&(_, a), &(_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) .map_or((0, 0.0), |(idx, &score)| (idx, score)); if best_score < config.confidence_threshold { @@ -514,7 +669,7 @@ fn postprocess_segment( ); } - let protos = match Array2::from_shape_vec((num_masks, mh * mw), output1.clone()) { + let protos = match Array2::from_shape_vec((num_masks, mh * mw), output1.to_vec()) { Ok(arr) => arr, Err(e) => { eprintln!("WARNING ⚠️ Failed to create protos array: {e}. Skipping mask generation."); @@ -551,77 +706,81 @@ fn postprocess_segment( Zip::from(masks_data.outer_iter_mut()) .and(masks_flat.outer_iter()) .and(boxes_data.outer_iter()) - .par_for_each(|mut mask_out, mask_flat, box_data| { - // Create a local resizer for each task (Resizer is not Sync) - let mut resizer = Resizer::new(); - let resize_alg = ResizeAlg::Convolution(FilterType::Bilinear); - - // Sigmoid into a Vec - let f32_data: Vec = mask_flat - .iter() - .map(|&val| 1.0 / (1.0 + (-val).exp())) - .collect(); - - // Use bytemuck for efficient f32->bytes conversion - let src_bytes: &[u8] = bytemuck::cast_slice(&f32_data); - - // Create source image (160x160) - let src_image = match Image::from_vec_u8( - mw as u32, - mh as u32, - src_bytes.to_vec(), - PixelType::F32, - ) { - Ok(img) => img, - Err(_) => return, // Skip if creation fails - }; - - // Create dest image (orig_w x orig_h) - let mut dst_image = Image::new(ow, oh, PixelType::F32); - - // Configure resize with crop - let safe_crop_x = f64::from(crop_x.max(0.0)); - let safe_crop_y = f64::from(crop_y.max(0.0)); - let safe_crop_w = f64::from(crop_w.max(1.0).min(mw as f32)); - let safe_crop_h = f64::from(crop_h.max(1.0).min(mh as f32)); - - let options = ResizeOptions::new().resize_alg(resize_alg).crop( - safe_crop_x, - safe_crop_y, - safe_crop_w, - safe_crop_h, - ); - - // Handle resize errors gracefully - if resizer - .resize(&src_image, &mut dst_image, &options) - .is_err() - { - return; - } + .par_for_each( + |mut mask_out: ArrayViewMut2, + mask_flat: ArrayView1, + box_data: ArrayView1| { + // Create a local resizer for each task (Resizer is not Sync) + let mut resizer = Resizer::new(); + let resize_alg = ResizeAlg::Convolution(FilterType::Bilinear); + + // Sigmoid into a Vec + let f32_data: Vec = mask_flat + .iter() + .map(|&val| 1.0 / (1.0 + (-val).exp())) + .collect(); + + // Use bytemuck for efficient f32->bytes conversion + let src_bytes: &[u8] = bytemuck::cast_slice(&f32_data); + + // Create source image (160x160) + let src_image = match Image::from_vec_u8( + mw as u32, + mh as u32, + src_bytes.to_vec(), + PixelType::F32, + ) { + Ok(img) => img, + Err(_) => return, // Skip if creation fails + }; + + // 
Create dest image (orig_w x orig_h) + let mut dst_image = Image::new(ow, oh, PixelType::F32); + + // Configure resize with crop + let safe_crop_x = f64::from(crop_x.max(0.0)); + let safe_crop_y = f64::from(crop_y.max(0.0)); + let safe_crop_w = f64::from(crop_w.max(1.0).min(mw as f32)); + let safe_crop_h = f64::from(crop_h.max(1.0).min(mh as f32)); + + let options = ResizeOptions::new().resize_alg(resize_alg).crop( + safe_crop_x, + safe_crop_y, + safe_crop_w, + safe_crop_h, + ); + + // Handle resize errors gracefully + if resizer + .resize(&src_image, &mut dst_image, &options) + .is_err() + { + return; + } - // Get resized data as f32 slice - let dst_bytes = dst_image.buffer(); - let dst_slice: &[f32] = bytemuck::cast_slice(dst_bytes); - - // Apply bbox cropping and store directly to output array - let x1 = box_data[0].max(0.0).min(ow as f32); - let y1 = box_data[1].max(0.0).min(oh as f32); - let x2 = box_data[2].max(0.0).min(ow as f32); - let y2 = box_data[3].max(0.0).min(oh as f32); - - for y in 0..oh as usize { - for x in 0..ow as usize { - let val = dst_slice[y * ow as usize + x]; - let x_f = x as f32; - let y_f = y as f32; - // Apply bounding box mask: invalid pixels outside the box are zeroed. - if x_f >= x1 && x_f <= x2 && y_f >= y1 && y_f <= y2 { - mask_out[[y, x]] = val; + // Get resized data as f32 slice + let dst_bytes = dst_image.buffer(); + let dst_slice: &[f32] = bytemuck::cast_slice(dst_bytes); + + // Apply bbox cropping and store directly to output array + let x1 = box_data[0].max(0.0).min(ow as f32); + let y1 = box_data[1].max(0.0).min(oh as f32); + let x2 = box_data[2].max(0.0).min(ow as f32); + let y2 = box_data[3].max(0.0).min(oh as f32); + + for y in 0..oh as usize { + for x in 0..ow as usize { + let val = dst_slice[y * ow as usize + x]; + let x_f = x as f32; + let y_f = y as f32; + // Apply bounding box mask: invalid pixels outside the box are zeroed. + if x_f >= x1 && x_f <= x2 && y_f >= y1 && y_f <= y2 { + mask_out[[y, x]] = val; + } } } - } - }); + }, + ); results.masks = Some(Masks::new(masks_data, preprocess.orig_shape)); @@ -869,11 +1028,8 @@ fn postprocess_classify( return results; } - // Filter out NaN values and ensure valid probabilities - let mut probs_vec: Vec = output - .iter() - .map(|&v| if v.is_nan() { 0.0 } else { v }) - .collect(); + // Probs::new expects an Array1, which we can create from the slice + let mut probs_vec = output.to_vec(); // Check if softmax is already applied (sum ≈ 1.0) let sum: f32 = probs_vec.iter().sum(); diff --git a/src/preprocessing.rs b/src/preprocessing.rs index 91541cf..68fcd1a 100644 --- a/src/preprocessing.rs +++ b/src/preprocessing.rs @@ -5,13 +5,71 @@ //! This module handles all image preprocessing operations needed before //! running YOLO model inference, including resizing, padding, and normalization. 
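Before the preprocessing changes below, a standalone scalar sketch of the fixed-point bilinear blend they rely on (SCALE_BITS = 11, so weights live in 0..=2048); the helper name is illustrative, while the arithmetic mirrors the fused loop.

```rust
const SCALE_BITS: i32 = 11;
const SCALE_INT: i32 = 1 << SCALE_BITS; // 2048

/// Blend four neighboring samples of one channel; fx and fy are fractional
/// offsets pre-scaled by SCALE_INT, as in the fused preprocessing loop.
fn bilinear_fixed(p00: u8, p10: u8, p01: u8, p11: u8, fx: i32, fy: i32) -> u8 {
    let (fx_inv, fy_inv) = (SCALE_INT - fx, SCALE_INT - fy);
    let w00 = (fx_inv * fy_inv) >> SCALE_BITS;
    let w10 = (fx * fy_inv) >> SCALE_BITS;
    let w01 = (fx_inv * fy) >> SCALE_BITS;
    let w11 = (fx * fy) >> SCALE_BITS;
    ((i32::from(p00) * w00 + i32::from(p10) * w10 + i32::from(p01) * w01 + i32::from(p11) * w11)
        >> SCALE_BITS) as u8
}
```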
+#![allow( + unsafe_code, + clippy::similar_names, + clippy::cast_precision_loss, + clippy::cast_possible_wrap, + clippy::cast_sign_loss, + clippy::cast_possible_truncation, + clippy::too_many_arguments, + clippy::too_many_lines, + clippy::wildcard_imports, + clippy::ptr_as_ptr, + clippy::cast_lossless, + clippy::single_match_else, + clippy::suboptimal_flops, + clippy::manual_div_ceil +)] + +use std::cell::RefCell; +use std::num::NonZeroUsize; + use half::f16; -use image::{DynamicImage, GenericImageView, ImageBuffer, Rgb, RgbImage}; +use image::{DynamicImage, GenericImageView, RgbImage}; +use lru::LruCache; use ndarray::{Array3, Array4}; +// ================================================================================================ +// Constants +// ================================================================================================ + /// Default letterbox padding color (gray). pub const LETTERBOX_COLOR: [u8; 3] = [114, 114, 114]; +/// Fixed-point scale factor for integer bilinear interpolation (2^11 = 2048). +const SCALE_BITS: i32 = 11; +const SCALE_INT: i32 = 1 << SCALE_BITS; + +/// Normalized letterbox padding color (114/255 ≈ 0.447). +const LETTERBOX_NORM: f32 = 114.0 / 255.0; + +/// Reciprocal of 255 for normalization. +const INV_255: f32 = 1.0 / 255.0; + +/// Maximum LRU cache size for X coordinate LUTs. +const LUT_CACHE_SIZE: usize = 8; + +// ================================================================================================ +// Type Aliases +// ================================================================================================ + +type XLutEntry = (usize, usize, i32, i32); +type XLutKey = (u32, u32); + +// ================================================================================================ +// Thread-Local State +// ================================================================================================ + +thread_local! { + static X_LUT_CACHE: RefCell>> = + RefCell::new(LruCache::new(NonZeroUsize::new(LUT_CACHE_SIZE).unwrap())); +} + +// ================================================================================================ +// Types +// ================================================================================================ + /// Tensor data that can be either FP32 or FP16. 
#[derive(Debug, Clone)] pub enum TensorData { @@ -96,13 +154,37 @@ pub fn preprocess_image_with_precision( let (new_width, new_height, pad_left, pad_top, scale) = calculate_letterbox_params(orig_width, orig_height, target_size, stride); - // Perform letterbox resize - let letterboxed = letterbox_image(image, new_width, new_height, pad_left, pad_top, target_size); - - let tensor = image_to_tensor(&letterboxed); + // Zero-copy path: avoid to_rgb8() allocation when possible + let tensor = match image { + // Fast path: already RGB8, use bytes directly without copy + DynamicImage::ImageRgb8(rgb) => fused_zerocopy_preprocess( + rgb.as_raw(), + orig_width, + orig_height, + target_size, + pad_top, + pad_left, + new_width, + new_height, + ), + // Fallback: convert to RGB8 (allocates) + _ => { + let src_rgb = image.to_rgb8(); + fused_zerocopy_preprocess( + src_rgb.as_raw(), + orig_width, + orig_height, + target_size, + pad_top, + pad_left, + new_width, + new_height, + ) + } + }; let tensor_f16 = if half { - Some(image_to_tensor_f16(&letterboxed)) + Some(tensor_f32_to_f16(&tensor)) } else { None }; @@ -117,6 +199,279 @@ pub fn preprocess_image_with_precision( } } +// ================================================================================================ +// Public API Functions +// ================================================================================================ + +/// Get or compute the X coordinate LUT for bilinear interpolation. +fn get_or_compute_x_lut(src_w: u32, dst_w: u32) -> Vec { + let key = (src_w, dst_w); + + X_LUT_CACHE.with(|cache| { + let mut cache = cache.borrow_mut(); + + if let Some(lut) = cache.get(&key) { + return lut.clone(); + } + + let scale_x = src_w as f32 / dst_w as f32; + let src_w_max = (src_w - 1) as i32; + + let lut: Vec = (0..dst_w) + .map(|dx| { + let sx = ((dx as f32 + 0.5) * scale_x - 0.5).max(0.0); + let x0 = sx.floor() as i32; + let fx = ((sx - x0 as f32) * SCALE_INT as f32) as i32; + let x0c = x0.clamp(0, src_w_max) as usize * 3; + let x1c = (x0 + 1).clamp(0, src_w_max) as usize * 3; + (x0c, x1c, SCALE_INT - fx, fx) + }) + .collect(); + + cache.put(key, lut.clone()); + lut + }) +} + +/// Zero-copy fused preprocessing for maximum performance. +/// +/// Combines bilinear resize, letterbox padding, and NCHW normalization +/// in a single memory pass with parallel row processing. 
+fn fused_zerocopy_preprocess( + src_raw: &[u8], + src_w: u32, + src_h: u32, + target_size: (usize, usize), + pad_top: u32, + pad_left: u32, + new_width: u32, + new_height: u32, +) -> Array4 { + use rayon::prelude::*; + use std::mem::MaybeUninit; + use std::sync::atomic::{AtomicPtr, Ordering}; + use wide::f32x4; + + let (dst_h, dst_w) = target_size; + let channel_size = dst_h * dst_w; + let src_stride = (src_w * 3) as usize; + + // ALLOCATE UNINITIALIZED: Saves ~0.2ms by not zeroing memory + let mut tensor: Array4> = Array4::uninit((1, 3, dst_h, dst_w)); + let out_ptr = tensor.as_mut_ptr() as *mut f32; + + // Use AtomicPtr for thread-safe pointer sharing (each thread writes to disjoint rows) + let atomic_ptr = AtomicPtr::new(out_ptr); + + let x_lut = get_or_compute_x_lut(src_w, new_width); + let scale_y = src_h as f32 / new_height as f32; + let src_h_max = (src_h - 1) as i32; + let inv_255_vec = f32x4::splat(INV_255); + + let pad_top_usize = pad_top as usize; + let pad_left_usize = pad_left as usize; + let new_height_usize = new_height as usize; + let new_width_usize = new_width as usize; + + // Parallel row processing with raw pointers (no bounds checks) + (0..dst_h).into_par_iter().for_each(|dy| { + let data_ptr = atomic_ptr.load(Ordering::Relaxed); + unsafe { + // Calculate row pointers for R, G, B channels + + let r_row = data_ptr.add(dy * dst_w); + let g_row = data_ptr.add(channel_size + dy * dst_w); + let b_row = data_ptr.add(2 * channel_size + dy * dst_w); + + // Vertical padding (top/bottom rows) + if dy < pad_top_usize || dy >= pad_top_usize + new_height_usize { + for dx in 0..dst_w { + *r_row.add(dx) = LETTERBOX_NORM; + *g_row.add(dx) = LETTERBOX_NORM; + *b_row.add(dx) = LETTERBOX_NORM; + } + return; + } + + // Image row calculations + let img_dy = dy - pad_top_usize; + let sy = ((img_dy as f32 + 0.5) * scale_y - 0.5).max(0.0); + let y0 = sy.floor() as i32; + let fy = ((sy - y0 as f32) * SCALE_INT as f32) as i32; + let fy_inv = SCALE_INT - fy; + + let y0c = y0.clamp(0, src_h_max) as usize; + let y1c = (y0 + 1).clamp(0, src_h_max) as usize; + let row0_off = y0c * src_stride; + let row1_off = y1c * src_stride; + + // Left padding + for dx in 0..pad_left_usize { + *r_row.add(dx) = LETTERBOX_NORM; + *g_row.add(dx) = LETTERBOX_NORM; + *b_row.add(dx) = LETTERBOX_NORM; + } + + // Inner image - SIMD loop (4 pixels at a time) + let mut img_dx = 0usize; + let src_ptr = src_raw.as_ptr(); + + while img_dx + 4 <= new_width_usize { + let mut r_vals = [0.0f32; 4]; + let mut g_vals = [0.0f32; 4]; + let mut b_vals = [0.0f32; 4]; + + for i in 0..4 { + let (x0_off, x1_off, fx_inv, fx) = *x_lut.get_unchecked(img_dx + i); + let w00 = (fx_inv * fy_inv) >> SCALE_BITS; + let w10 = (fx * fy_inv) >> SCALE_BITS; + let w01 = (fx_inv * fy) >> SCALE_BITS; + let w11 = (fx * fy) >> SCALE_BITS; + + let p00 = src_ptr.add(row0_off + x0_off); + let p10 = src_ptr.add(row0_off + x1_off); + let p01 = src_ptr.add(row1_off + x0_off); + let p11 = src_ptr.add(row1_off + x1_off); + + r_vals[i] = ((*p00 as i32 * w00 + + *p10 as i32 * w10 + + *p01 as i32 * w01 + + *p11 as i32 * w11) + >> SCALE_BITS) as f32; + g_vals[i] = ((*p00.add(1) as i32 * w00 + + *p10.add(1) as i32 * w10 + + *p01.add(1) as i32 * w01 + + *p11.add(1) as i32 * w11) + >> SCALE_BITS) as f32; + b_vals[i] = ((*p00.add(2) as i32 * w00 + + *p10.add(2) as i32 * w10 + + *p01.add(2) as i32 * w01 + + *p11.add(2) as i32 * w11) + >> SCALE_BITS) as f32; + } + + // SIMD normalize + let r_simd = f32x4::new(r_vals) * inv_255_vec; + let g_simd = f32x4::new(g_vals) * 
+                let b_simd = f32x4::new(b_vals) * inv_255_vec;
+
+                let out_x = pad_left_usize + img_dx;
+                let r_arr: [f32; 4] = r_simd.into();
+                let g_arr: [f32; 4] = g_simd.into();
+                let b_arr: [f32; 4] = b_simd.into();
+
+                // Direct raw pointer writes (no bounds checks)
+                std::ptr::copy_nonoverlapping(r_arr.as_ptr(), r_row.add(out_x), 4);
+                std::ptr::copy_nonoverlapping(g_arr.as_ptr(), g_row.add(out_x), 4);
+                std::ptr::copy_nonoverlapping(b_arr.as_ptr(), b_row.add(out_x), 4);
+
+                img_dx += 4;
+            }
+
+            // Scalar tail
+            while img_dx < new_width_usize {
+                let (x0_off, x1_off, fx_inv, fx) = *x_lut.get_unchecked(img_dx);
+                let w00 = (fx_inv * fy_inv) >> SCALE_BITS;
+                let w10 = (fx * fy_inv) >> SCALE_BITS;
+                let w01 = (fx_inv * fy) >> SCALE_BITS;
+                let w11 = (fx * fy) >> SCALE_BITS;
+
+                let p00 = src_ptr.add(row0_off + x0_off);
+                let p10 = src_ptr.add(row0_off + x1_off);
+                let p01 = src_ptr.add(row1_off + x0_off);
+                let p11 = src_ptr.add(row1_off + x1_off);
+
+                let out_x = pad_left_usize + img_dx;
+                *r_row.add(out_x) = ((*p00 as i32 * w00
+                    + *p10 as i32 * w10
+                    + *p01 as i32 * w01
+                    + *p11 as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+                *g_row.add(out_x) = ((*p00.add(1) as i32 * w00
+                    + *p10.add(1) as i32 * w10
+                    + *p01.add(1) as i32 * w01
+                    + *p11.add(1) as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+                *b_row.add(out_x) = ((*p00.add(2) as i32 * w00
+                    + *p10.add(2) as i32 * w10
+                    + *p01.add(2) as i32 * w01
+                    + *p11.add(2) as i32 * w11)
+                    >> SCALE_BITS) as f32
+                    * INV_255;
+
+                img_dx += 1;
+            }
+
+            // Right padding
+            for dx in (pad_left_usize + new_width_usize)..dst_w {
+                *r_row.add(dx) = LETTERBOX_NORM;
+                *g_row.add(dx) = LETTERBOX_NORM;
+                *b_row.add(dx) = LETTERBOX_NORM;
+            }
+        }
+    });
+
+    // SAFETY: All elements have been initialized
+    unsafe { tensor.assume_init() }
+}
+
+/// Convert f32 tensor to f16 tensor.
+fn tensor_f32_to_f16(tensor: &Array4<f32>) -> Array4<half::f16> {
+    tensor.mapv(half::f16::from_f32)
+}
+
+/// Calculate target size for rectangular inference mode.
+///
+/// Adjusts `target_size` such that the image's aspect ratio is preserved,
+/// and both dimensions are multiples of `stride`.
+///
+/// # Arguments
+///
+/// * `orig_width` - Original image width.
+/// * `orig_height` - Original image height.
+/// * `target_size` - Base target size (e.g. 640x640).
+/// * `stride` - Model stride for alignment.
+///
+/// # Returns
+///
+/// Adjusted target size as (height, width).
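+///
+/// For example, a 1280x720 image with a (640, 640) base target and stride 32
+/// scales by `min(640/720, 640/1280) = 0.5` to 640x360, and rounding each side
+/// up to the next multiple of 32 gives `(384, 640)`.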
+#[must_use]
+pub fn calculate_rect_size(
+    orig_width: u32,
+    orig_height: u32,
+    target_size: (usize, usize),
+    stride: u32,
+) -> (usize, usize) {
+    let (target_h, target_w) = target_size;
+
+    #[allow(clippy::cast_precision_loss)]
+    let orig_h = orig_height as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let orig_w = orig_width as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let target_h_f = target_h as f32;
+    #[allow(clippy::cast_precision_loss)]
+    let target_w_f = target_w as f32;
+
+    // Calculate scale to fit within target while maintaining aspect ratio
+    let scale = (target_h_f / orig_h).min(target_w_f / orig_w);
+
+    // New dimensions after scaling
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    let new_h = (orig_h * scale).round() as usize;
+    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+    let new_w = (orig_w * scale).round() as usize;
+
+    // Round up to nearest multiple of stride
+    let stride = stride as usize;
+    let rect_h = ((new_h + stride - 1) / stride) * stride;
+    let rect_w = ((new_w + stride - 1) / stride) * stride;
+
+    (rect_h, rect_w)
+}
+
 /// Calculate letterbox parameters for resizing.
 ///
 /// Computes new dimensions and padding to fit the image within the target size while maintaining aspect ratio.
@@ -173,72 +528,6 @@ fn calculate_letterbox_params(
     (new_w, new_h, pad_left, pad_top, (scale_y, scale_x))
 }
 
-/// Apply letterbox transformation to an image.
-///
-/// Resizes the image maintaining aspect ratio and adds padding usually to center it.
-/// Uses SIMD-accelerated resizing via `fast_image_resize`.
-///
-/// # Arguments
-///
-/// * `image` - Source dynamic image.
-/// * `new_width` - Target width after scaling (before padding).
-/// * `new_height` - Target height after scaling (before padding).
-/// * `pad_left` - Padding to add on the left.
-/// * `pad_top` - Padding to add on the top.
-/// * `target_size` - Final output dimensions (height, width).
-///
-/// # Returns
-///
-/// `RgbImage` padded and resized to `target_size`.
-fn letterbox_image(
-    image: &DynamicImage,
-    new_width: u32,
-    new_height: u32,
-    pad_left: u32,
-    pad_top: u32,
-    target_size: (usize, usize),
-) -> RgbImage {
-    use fast_image_resize::{PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image};
-
-    let src_rgb = image.to_rgb8();
-    let (src_w, src_h) = src_rgb.dimensions();
-
-    let src_image = Image::from_vec_u8(src_w, src_h, src_rgb.into_raw(), PixelType::U8x3)
-        .expect("Failed to create source image");
-
-    let mut dst_image = Image::new(new_width, new_height, PixelType::U8x3);
-
-    let mut resizer = Resizer::new();
-    let options = ResizeOptions::new().resize_alg(ResizeAlg::Convolution(
-        // Use Lanczos3 for high-quality resizing. This is critical for OBB tasks where
-        // preserving small features (like harbors in DOTA8) is essential for detection.
-        // It matches the default behavior of Ultralytics Python preprocessing.
-        fast_image_resize::FilterType::Lanczos3,
-    ));
-    resizer
-        .resize(&src_image, &mut dst_image, Some(&options))
-        .expect("Failed to resize image");
-
-    // Create output image with letterbox color
-    #[allow(clippy::cast_possible_truncation)]
-    let mut output: RgbImage = ImageBuffer::from_pixel(
-        target_size.1 as u32,
-        target_size.0 as u32,
-        Rgb(LETTERBOX_COLOR),
-    );
-
-    let resized_rgb: RgbImage = ImageBuffer::from_raw(new_width, new_height, dst_image.into_vec())
-        .expect("Failed to create resized image buffer");
-
-    image::imageops::overlay(
-        &mut output,
-        &resized_rgb,
-        i64::from(pad_left),
-        i64::from(pad_top),
-    );
-
-    output
-}
 
 /// Convert an RGB image to a normalized NCHW tensor (FP32).
 ///
diff --git a/src/source.rs b/src/source.rs
index 03d63b5..c8d10ca 100644
--- a/src/source.rs
+++ b/src/source.rs
@@ -740,7 +740,6 @@ impl Iterator for SourceIterator {
 }
 
 #[cfg(feature = "video")]
-/// Convert a `video_rs` Frame (ndarray 0.16) to `DynamicImage`.
 fn video_frame_to_image(arr: &video_rs::Frame) -> Result {
     let shape = arr.shape();
     let height = u32::try_from(shape[0])
@@ -748,14 +747,14 @@ fn video_frame_to_image(arr: &video_rs::Frame) -> Result {
     let width = u32::try_from(shape[1])
         .map_err(|_| InferenceError::ImageError("Image width exceeds u32::MAX".to_string()))?;
 
-    let mut rgb_data = Vec::with_capacity((height * width * 3) as usize);
-    for y in 0..height as usize {
-        for x in 0..width as usize {
-            rgb_data.push(arr[[y, x, 0]]);
-            rgb_data.push(arr[[y, x, 1]]);
-            rgb_data.push(arr[[y, x, 2]]);
-        }
-    }
+    // video_rs::Frame is an ndarray::Array3 with shape (H, W, 3) and standard layout (C-contiguous).
+    // We can directly copy the raw data.
+    let rgb_data = arr
+        .as_slice()
+        .ok_or_else(|| {
+            InferenceError::ImageError("Failed to get raw slice from video frame".to_string())
+        })?
+        .to_vec();
 
     let img_buffer = image::RgbImage::from_raw(width, height, rgb_data).ok_or_else(|| {
         InferenceError::ImageError("Failed to create image from video frame".to_string())
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
index dc645bc..1c0717c 100644
--- a/tests/integration_test.rs
+++ b/tests/integration_test.rs
@@ -28,6 +28,7 @@ fn test_run_prediction_e2e() {
         show: false,
         device: None,
         verbose: true,
+        rect: false,
     };
 
     // This should run successfully (download model/images and predict)
@@ -47,11 +48,11 @@ fn test_inference_config_builder() {
     let config = InferenceConfig::new()
         .with_confidence(0.5)
         .with_iou(0.7)
-        .with_max_det(100);
+        .with_max_det(300);
 
     assert_eq!(config.confidence_threshold, 0.5);
     assert_eq!(config.iou_threshold, 0.7);
-    assert_eq!(config.max_det, 100);
+    assert_eq!(config.max_det, 300);
 }
 
 #[test]
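
// Illustrative sketch (not part of the diff above): one way the new rect-mode sizing
// helper could be exercised in a test. The import path is an assumption; adjust it to
// wherever the crate actually exposes `calculate_rect_size`. Expected values follow
// directly from the implementation shown earlier in src/preprocess.rs.
#[cfg(test)]
mod rect_size_sketch {
    use ultralytics_inference::calculate_rect_size; // assumed re-export path

    #[test]
    fn portrait_input_is_stride_aligned() {
        // 720x1280 portrait source, (640, 640) base target, stride 32:
        // scale = min(640/1280, 640/720) = 0.5, so new_h = 640 and new_w = 360;
        // rounding up to multiples of 32 yields (height, width) = (640, 384).
        assert_eq!(calculate_rect_size(720, 1280, (640, 640), 32), (640, 384));
    }
}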