Changes from all commits (15 commits)
7 changes: 7 additions & 0 deletions .cargo/config.toml
@@ -7,3 +7,10 @@
PKG_CONFIG_PATH = "/opt/homebrew/opt/ffmpeg@7/lib/pkgconfig"
CPATH = "/opt/homebrew/opt/ffmpeg@7/include"
LIBRARY_PATH = "/opt/homebrew/opt/ffmpeg@7/lib"
BINDGEN_EXTRA_CLANG_ARGS = "-I/opt/homebrew/opt/ffmpeg@7/include"

#
# Linux: Configure RPATH to find shared libraries in the executable's directory
# This allows the binary to locate libonnxruntime*.so in the same folder
# without needing LD_LIBRARY_PATH set manually.
[target.x86_64-unknown-linux-gnu]
rustflags = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"]
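
For reference, the same effect can be achieved from a build script instead of `.cargo/config.toml`. This is a hypothetical alternative sketch, not part of this PR:

```rust
// build.rs (hypothetical alternative to the .cargo/config.toml entry above):
// emit the $ORIGIN rpath flag only when compiling for Linux, so the binary
// searches its own directory for libonnxruntime*.so at load time.
fn main() {
    if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux") {
        println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
    }
}
```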
42 changes: 41 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default.

11 changes: 9 additions & 2 deletions Cargo.toml
@@ -2,7 +2,7 @@

[package]
name = "ultralytics-inference"
version = "0.0.7"
version = "0.0.8"
edition = "2024"
authors = [
"Glenn Jocher <[email protected]>",
@@ -50,6 +50,12 @@
image = "^0.25"
jpeg-decoder = "^0.3"
fast_image_resize = { version = "^5.5", features = ["image", "rayon"] }

# SIMD for fast preprocessing
wide = "0.7"

# LRU cache for preprocessing LUT
lru = "0.12"

# Numerical computing (must match ort's ndarray version 0.17)
ndarray = { version = "^0.17", features = ["rayon"] }

@@ -74,6 +80,7 @@
bytemuck = { version = "^1.21", features = ["derive"] }
clap = { version = "4.5.54", features = ["derive"] }
colored = "3.0.0"

# Optional - Visualization and Video support
minifb = { version = "^0.28.0", optional = true }
video-rs = { version = "^0.10.5", features = ["ndarray"], optional = true }

@@ -151,7 +158,7 @@
opt-level = 3

[profile.release]
opt-level = 3
lto = true
lto = "fat"
codegen-units = 1
panic = "abort"
strip = true
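
The two new crates back the preprocessing changes: `wide` provides portable SIMD lanes and `lru` caches computed lookup tables across frames. A minimal sketch of how they can combine, with illustrative names only (the real code lives in `src/preprocessing.rs`):

```rust
use std::num::NonZeroUsize;

use lru::LruCache;
use wide::f32x8;

/// Normalize u8 pixels to f32 in [0, 1], eight lanes at a time.
/// Illustrative sketch only, not the crate's actual implementation.
fn normalize_simd(pixels: &[u8]) -> Vec<f32> {
    let scale = f32x8::splat(1.0 / 255.0);
    let mut out = Vec::with_capacity(pixels.len());
    let chunks = pixels.chunks_exact(8);
    let tail = chunks.remainder();
    for c in chunks {
        let lanes = f32x8::from([
            c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32,
            c[4] as f32, c[5] as f32, c[6] as f32, c[7] as f32,
        ]);
        out.extend_from_slice(&(lanes * scale).to_array());
    }
    out.extend(tail.iter().map(|&p| p as f32 / 255.0)); // scalar tail
    out
}

fn main() {
    // Hypothetical cache keyed by input shape, so repeated video frames of
    // the same size reuse their preprocessing result instead of rebuilding it.
    let mut luts: LruCache<(u32, u32), Vec<f32>> =
        LruCache::new(NonZeroUsize::new(8).unwrap());
    luts.put((640, 480), normalize_simd(&[0u8, 128, 255]));
    assert!(luts.get(&(640, 480)).is_some());
}
```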
65 changes: 44 additions & 21 deletions README.md
@@ -77,20 +77,26 @@
cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --show --

# Save individual frames for video input
cargo run --release -- predict --model yolo11n.onnx --source video.mp4 --save-frames

# Rectangular inference
cargo run --release -- predict --model yolo11n.onnx --source image.jpg --rect
```
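
Rectangular inference keeps the image's aspect ratio and pads each dimension only up to the next stride multiple rather than to a full square, which is where the smaller inference shapes in the output below come from. A sketch of the shape computation, assuming a stride-32 model (illustrative only; `src/preprocessing.rs` is authoritative):

```rust
/// Compute the minimal-padding ("rect") inference shape.
/// Sketch only: scale the longer side to `imgsz`, then round each side up
/// to the nearest multiple of the model stride.
fn rect_shape(w: u32, h: u32, imgsz: u32, stride: u32) -> (u32, u32) {
    let r = imgsz as f32 / w.max(h) as f32; // scale ratio
    let (nw, nh) = ((w as f32 * r).round() as u32, (h as f32 * r).round() as u32);
    let pad = |x: u32| x.div_ceil(stride) * stride; // round up to stride
    (pad(nw), pad(nh))
}

fn main() {
    // A 1280x720 frame at imgsz=640 runs at 640x384 instead of 640x640.
    assert_eq!(rect_shape(1280, 720, 640, 32), (640, 384));
}
```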

### Example Output

```
# ultralytics-inference predict

WARNING ⚠️ 'model' argument is missing. Using default 'model=yolo11n.onnx'.
WARNING ⚠️ 'source' argument is missing. Using default images: https://ultralytics.com/images/bus.jpg, https://ultralytics.com/images/zidane.jpg
Ultralytics 0.0.7 🚀 Rust ONNX FP32 CPU
Ultralytics 0.0.8 🚀 Rust ONNX FP32 CPU
Using ONNX Runtime CPUExecutionProvider
YOLO11n summary: 80 classes, imgsz=(640, 640)

image 1/2 bus.jpg: 640x640 3 persons, 1 bus, 57.3ms
image 2/2 zidane.jpg: 640x640 2 persons, 1 tie, 52.9ms
Speed: 75.8ms preprocess, 55.1ms inference, 19.9ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs/detect/predict53
image 1/2 /home/ultralytics/inference/bus.jpg: 640x480 4 persons, 1 bus, 36.4ms
image 2/2 /home/ultralytics/inference/zidane.jpg: 384x640 2 persons, 1 tie, 28.6ms
Speed: 1.5ms preprocess, 32.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
Results saved to runs/detect/predict1
💡 Learn more at https://docs.ultralytics.com/modes/predict
```

@@ -117,8 +123,11 @@
cargo run --release -- predict --model <model.onnx> --source <source>
| `--source` | `-s` | Input source (image, video, webcam index, or URL) | `Task dependent Ultralytics URL assets` |
| `--device` | | Device to use (cpu, cuda:0, mps, coreml, etc.) | `cpu` |
| `--conf` | | Confidence threshold | `0.25` |
| `--iou` | | IoU threshold for NMS | `0.45` |
| `--iou` | | IoU threshold for NMS | `0.7` |
| `--max-det` | | Maximum number of detections | `300` |
| `--imgsz` | | Inference image size | `Model metadata` |
| `--rect` | | Enable rectangular inference (minimal padding) | `true` |
| `--batch` | | Batch size for inference | `1` |
| `--half` | | Use FP16 half-precision inference | `false` |
| `--save` | | Save annotated results to runs/<task>/predict | `true` |
| `--save-frames` | | Save individual frames for video | `false` |
@@ -183,7 +192,7 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
let config = InferenceConfig::new()
.with_confidence(0.5)
.with_iou(0.45)
.with_max_det(100);
.with_max_det(300);

let mut model = YOLOModel::load_with_config("yolo11n.onnx", config)?;
let results = model.predict("image.jpg")?;
@@ -236,16 +245,25 @@
inference/
│ ├── main.rs # CLI application
│ ├── model.rs # YOLOModel - ONNX session and inference
│ ├── results.rs # Results, Boxes, Masks, Keypoints, Probs, Obb
│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize)
│ ├── postprocessing.rs # Detection post-processing (NMS, decode)
│ ├── preprocessing.rs # Image preprocessing (letterbox, normalize, SIMD)
│ ├── postprocessing.rs # Detection post-processing (NMS, decode, SIMD)
│ ├── metadata.rs # ONNX model metadata parsing
│ ├── source.rs # Input source handling
│ ├── task.rs # Task enum (Detect, Segment, Pose, etc.)
│ ├── source.rs # Input source handling (images, video, webcam)
│ ├── task.rs # Task enum (Detect, Segment, Pose, Classify, Obb)
│ ├── inference.rs # InferenceConfig
│ ├── batch.rs # Batch processing pipeline
│ ├── device.rs # Device enum (CPU, CUDA, MPS, CoreML, etc.)
│ ├── download.rs # Model and asset downloading
│ ├── visualizer/ # Visualization tools (Viewer)
│ ├── annotate.rs # Image annotation (bounding boxes, masks, keypoints)
│ ├── io.rs # Result saving (images, videos)
│ ├── logging.rs # Logging macros
│ ├── error.rs # Error types
│ └── utils.rs # Utility functions (NMS, IoU)
│ ├── utils.rs # Utility functions (NMS, IoU)
│ ├── cli/ # CLI module
│ │ ├── mod.rs # CLI module exports
│ │ ├── args.rs # CLI argument parsing
│ │ └── predict.rs # Predict command implementation
│ └── visualizer/ # Real-time visualization (minifb)
├── tests/
│ └── integration_test.rs # Integration tests
├── assets/ # Test images
@@ -300,13 +318,16 @@
One of the key benefits of this library is **minimal dependencies** - no PyTorch

### Core Dependencies (always included)

| Crate | Purpose |
| ------------------- | ----------------------- |
| `ort` | ONNX Runtime bindings |
| `ndarray` | N-dimensional arrays |
| `image` | Image loading/decoding |
| `fast_image_resize` | SIMD-optimized resizing |
| `half` | FP16 support |
| Crate | Purpose |
| ------------------- | ------------------------------- |
| `ort` | ONNX Runtime bindings |
| `ndarray` | N-dimensional arrays |
| `image` | Image loading/decoding |
| `jpeg-decoder` | JPEG decoding |
| `fast_image_resize` | SIMD-optimized resizing |
| `half` | FP16 support |
| `lru` | LRU cache for preprocessing LUT |
| `wide` | SIMD for fast preprocessing |
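
For example, the `half` crate is what backs the `--half` flag: a plausible data path converts the FP32 input tensor to `f16` before feeding an FP16 ONNX graph. A minimal sketch (an assumption about the flow, not necessarily how this crate implements it):

```rust
use half::f16;

fn main() {
    // Hypothetical --half path: convert the FP32 input tensor to FP16.
    let fp32: Vec<f32> = vec![0.0, 0.5, 1.0];
    let fp16: Vec<f16> = fp32.iter().map(|&v| f16::from_f32(v)).collect();
    assert_eq!(fp16[1].to_f32(), 0.5); // 0.5 is exactly representable
}
```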

### Optional Dependencies (for `--save` feature)

@@ -372,16 +393,18 @@
ONNX Runtime threading is set to auto (`num_threads: 0`), which lets ORT choose optimal thread counts.
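
A sketch of what that configuration looks like with the `ort` 2.x builder API (assuming `ort` 2.x; passing 0 defers the choice to ONNX Runtime):

```rust
use ort::session::Session;

fn main() -> ort::Result<()> {
    // 0 intra-op threads = let ONNX Runtime pick based on available cores.
    let _session = Session::builder()?
        .with_intra_threads(0)?
        .commit_from_file("yolo11n.onnx")?;
    Ok(())
}
```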

- [x] Detection, Segmentation, Pose, Classification, OBB inference
- [x] ONNX model metadata parsing (auto-detect classes, task, imgsz)
- [x] Hardware acceleration support (CUDA, TensorRT, CoreML, OpenVINO, XNNPACK)
- [x] Ultralytics-compatible Results API (`Boxes`, `Masks`, `Keypoints`, `Probs`, `Obb`)
- [x] Multiple input sources (images, directories, globs, URLs)
- [x] Video file support and webcam/RTSP streaming
- [x] Image annotation and visualization
- [x] FP16 half-precision inference
- [x] Batch inference support
- [x] Rectangular inference support and optimization

### In Progress

- [ ] Python bindings (PyO3)
- [ ] Batch inference optimization
- [ ] WebAssembly (WASM) support for browser inference

## 💡 Contributing
35 changes: 22 additions & 13 deletions src/cli/args.rs
@@ -11,23 +11,27 @@
use clap::{Args, Parser, Subcommand};
--model, -m <MODEL> Path to ONNX model file [default: yolo11n.onnx]
--source, -s <SOURCE> Input source (image, directory, glob, video, webcam, or URL)
--conf <CONF> Confidence threshold [default: 0.25]
--iou <IOU> IoU threshold for NMS [default: 0.45]
--iou <IOU> IoU threshold for NMS [default: 0.7]
--max-det <MAX_DET> Maximum number of detections [default: 300]
--imgsz <IMGSZ> Inference image size
--rect Enable rectangular inference (minimal padding) [default: true]
--batch <BATCH> Batch size for inference [default: 1]
--half Use FP16 half-precision inference
--save Save annotated images to runs/<task>/predict
--save Save annotated images to runs/<task>/predict [default: true]
--save-frames Save individual frames for video input (instead of video file)
--show Display results in a window
--device <DEVICE> Device (cpu, cuda:0, mps, coreml, directml:0, openvino, xnnpack)
--verbose Show verbose output
--device <DEVICE> Device (cpu, cuda:0, mps, coreml, directml:0, openvino, tensorrt:0, xnnpack)
--verbose Show verbose output [default: true]

Examples:
ultralytics-inference predict
ultralytics-inference predict --model yolo11n.onnx --source image.jpg
ultralytics-inference predict --model yolo11n.onnx --source video.mp4
ultralytics-inference predict --model yolo11n.onnx --source video.mp4 --save-frames
ultralytics-inference predict --model yolo11n.onnx --source 0 --conf 0.5
ultralytics-inference predict -m yolo11n.onnx -s assets/ --save --half
ultralytics-inference predict -m yolo11n.onnx -s video.mp4 --imgsz 1280 --show
ultralytics-inference predict --model yolo11n.onnx --source image.jpg --device mps"#)]
ultralytics-inference predict --source video.mp4 --rect
ultralytics-inference predict --source video.mp4 --save-frames
ultralytics-inference predict --source 0 --conf 0.5 --show
ultralytics-inference predict --source assets/ --save --half
ultralytics-inference predict --source image.jpg --device cuda:0
ultralytics-inference predict --source image.jpg --device mps"#)]
pub struct Cli {
#[command(subcommand)]
/// Subcommand to execute.
@@ -58,7 +62,7 @@
pub struct PredictArgs {
pub conf: f32,

/// `IoU` threshold for NMS
#[arg(long, default_value_t = 0.45)]
#[arg(long, default_value_t = 0.7)]
pub iou: f32,

/// Maximum number of detections
@@ -69,6 +73,10 @@
#[arg(long)]
pub imgsz: Option<usize>,

/// Enable minimal padding (rectangular inference)
#[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)]
pub rect: bool,

/// Batch size for inference
#[arg(long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))]
pub batch: u32,
Expand All @@ -78,7 +86,7 @@ pub struct PredictArgs {
pub half: bool,

/// Save annotated images to runs/<task>/predict
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
#[arg(long, default_value_t = true, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)]
pub save: bool,

/// Save individual frames for video input (instead of video file)
@@ -115,7 +123,8 @@
mod tests {
Commands::Predict(predict_args) => {
assert_eq!(predict_args.model, "yolo11n.onnx");
assert!((predict_args.conf - 0.25).abs() < f32::EPSILON);
assert!((predict_args.iou - 0.45).abs() < f32::EPSILON);
assert!((predict_args.iou - 0.7).abs() < f32::EPSILON);
assert!(predict_args.rect);
assert_eq!(predict_args.max_det, 300);
assert!(!predict_args.half);
assert!(predict_args.verbose);
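
The `--rect` and `--save` flags use a clap pattern worth noting: `ArgAction::Set` with `num_args = 0..=1` and a `default_missing_value` gives a boolean that defaults to true, accepts a bare `--rect`, and can be disabled with `--rect false`. A self-contained sketch of the same pattern:

```rust
use clap::Parser;

#[derive(Parser)]
struct Demo {
    /// Same shape as PredictArgs::rect: defaults to true, a bare flag keeps
    /// it true, and an explicit value can turn it off.
    #[arg(long, default_value_t = true, num_args = 0..=1,
          default_missing_value = "true", action = clap::ArgAction::Set)]
    rect: bool,
}

fn main() {
    assert!(Demo::parse_from(["demo"]).rect);                     // default
    assert!(Demo::parse_from(["demo", "--rect"]).rect);           // bare flag
    assert!(!Demo::parse_from(["demo", "--rect", "false"]).rect); // disabled
}
```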