diff --git a/src/images/blogs/multilora.png b/src/images/blogs/multilora.png new file mode 100644 index 0000000000000..b0998fe069401 Binary files /dev/null and b/src/images/blogs/multilora.png differ diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte index fc3a9f47a016a..e9c86fc18bdeb 100644 --- a/src/routes/blogs/+page.svelte +++ b/src/routes/blogs/+page.svelte @@ -22,6 +22,7 @@ import GoodnotesThumbnail from '../../images/blogs/goodnotes-scribble-to-erase/Thumbnail.png'; import OliveCli from '../../images/blogs/olive-flow.png'; import QuantizeFinetune from '../../images/blogs/Quantize-finetune.jpg'; + import MultiLoraThumbnail from '../../images/blogs/multilora.png'; onMount(() => { anime({ targets: '.border-primary', @@ -49,6 +50,16 @@ dispatch('switchTab', tab); } let featuredblog = [ + { + title: + 'Announcing MultiLoRA with ONNX Runtime: Revolutionizing AI Customization', + date: '20th November, 2024', + blurb: + 'MultiLoRA with ONNX Runtime brings flexible, efficient AI customization by enabling easy integration of LoRA adapters for dynamic, personalized models with minimal resource demands.', + link: 'blogs/multilora', + image: MultiLoraThumbnail, + imgalt: 'Serving LoRA models separately vs with MultiLoRA' + }, { title: 'Is it better to quantize before or after finetuning?', @@ -69,6 +80,8 @@ image: GoodnotesThumbnail, imgalt: 'Scribble to Erase feature on Goodnotes for Windows, Web, and Android' }, + ]; + let blogs = [ { title: 'Democratizing AI Model optimization with the new Olive CLI', date: 'November 11th, 2024', @@ -77,9 +90,7 @@ link: 'blogs/olive-cli', image: OliveCli, imgalt: 'Olive Flow' - } - ]; - let blogs = [ + }, { title: 'Enhancing team collaboration during AI model optimization with the Olive Shared Cache', diff --git a/src/routes/blogs/multilora/+page.svx b/src/routes/blogs/multilora/+page.svx new file mode 100644 index 0000000000000..c04f4e1b4a5a8 --- /dev/null +++ b/src/routes/blogs/multilora/+page.svx @@ -0,0 
+1,123 @@ +--- +title: 'Announcing MultiLoRA with ONNX Runtime: Revolutionizing AI Customization' +date: '20th November, 2024' +description: 'MultiLoRA with ONNX Runtime brings flexible, efficient AI customization by enabling easy integration of LoRA adapters for dynamic, personalized models with minimal resource demands.' +keywords: 'MultiLoRA, LoRA, ONNX Runtime, AI customization, LoRA adapters, Olive toolchain, large language models, model personalization, model optimization, Hugging Face, AI deployment, AI performance' +authors: ['Dmitri Smirnov', 'Jambay Kinley', 'Natalie Kershaw', 'Parinita Rahi', 'Pranav Sharma', 'Devang Patel', 'Samuel Kemp'] +authorsLink: ['https://www.linkedin.com/in/yuslepukhinlinkedin/', 'https://www.linkedin.com/in/jambayk/', 'https://www.linkedin.com/in/natkershaw/', 'https://www.linkedin.com/in/parinitaparinita/', 'https://www.linkedin.com/in/pranavli/', 'https://www.linkedin.com/in/devangpatel/', 'https://www.linkedin.com/in/samuel-kemp-a9253724/' ] +image: 'https://iili.io/25Mlea9.png' +imageSquare: 'https://iili.io/25Mlea9.png' +url: 'https://onnxruntime.ai/blogs/multilora' +--- + + + +## A New Era of AI Customization + +Today's AI services must cater to a vast range of users—each with unique requirements and preferences. Customizing large language models for individual customers or speakers has traditionally been a resource-intensive and time-consuming task. + +LoRA adapters have proven transformative in customizing large AI models for specific tasks without requiring resource-intensive fine-tuning. MultiLoRA changes the game by enabling seamless integration of lightweight adapters, allowing models to adapt dynamically to different contexts and customers. With ONNX Runtime as its foundation, MultiLoRA offers unparalleled performance, ensuring efficient memory usage. + + +## Streamlined Integration with Olive + +MultiLoRA relies on the existing Olive toolchain to generate adapted ONNX models. 
+ +This ensures: +- A unified pipeline for creating LoRA-adapted models. +- Consistent handling of versioning and metadata across models and adapters. + +By standardizing around Olive, MultiLoRA simplifies workflows and eliminates compatibility concerns with third-party sources. + + +## ONNX Runtime Adaptations + +### Simplified Adapter Activation and Loading +- **Dynamic Activation API:** A single API, `SetActiveAdapters(string[] adapters)`, allows activating or deactivating adapters at runtime. + - **Empty Input:** Resets the model to its base state, running without any adapters. + - **Multi-Adapter Support:** Simultaneously activate multiple adapters to meet complex customer requirements. + +- **Generative Loop Support:** + - Active adapters remain loaded as long as the `GeneratorParams` instance persists, ensuring efficient memory use. + - References are automatically released when the instance is destroyed, avoiding resource leaks. + +### Adapter Management Without Generative Loops +For models not tied to user prompts or generative processes, a new `Run()` API is introduced: + + + +- **RunOptions Class:** Facilitates seamless execution of base models or adapter-enhanced variants. +- **Shared Adapter Loading:** Adapters are stored within the model instance, allowing efficient reuse across multiple sessions. + +### Language Bindings Expansion +The current MultiLoRA implementation offers bindings for Python, C, C++, C#, and Java. + +### Memory Management +Our implementation memory-maps LoRA parameters from disk, which improves memory management. 
+ +## How MultiLoRA Works + +### Generate the ONNX Models and Adapters +If you have an existing base model and adapter in Hugging Face PEFT format, you can automatically create optimized ONNX models that will run efficiently on ONNX Runtime using the MultiLoRA paradigm by leveraging the following command: + + + +You can then add additional adapters that exist on Hugging Face (or local disk) for the same base model by converting them into the ONNX adapter format using: + + + +Alternatively, you can fine-tune your own adapter using: + + + +### Run the ONNX Models and Switch Adapters + +1. **Load Adapters:** Dynamically load adapters for the base model: + + + +2. **Set Active Adapter:** Switch adapters on the fly based on customer requests: + + + + +## Looking Ahead + +**In Development:** +- **Batching Support:** Enhancing ONNX Runtime kernels for adapter-aware batching. +- **Expanded Bindings:** Introducing additional language bindings for broader adoption. +- **Memory Features:** Additional memory management improvements. + +## Your Feedback Matters +As MultiLoRA evolves, we invite developers to test the feature, provide insights, and shape its roadmap. By working together, we aim to create a flexible, powerful foundation for AI adaptation. + + +## Conclusion +MultiLoRA is more than an enhancement to ONNX Runtime—it's a step forward in making AI systems modular, adaptable, and accessible. By addressing technical challenges like memory management, batching, and data format inefficiencies, MultiLoRA lays the groundwork for a new era of AI deployment. + +Let's build the future of adaptable AI together. Join us in exploring MultiLoRA with ONNX Runtime! 
+ + +## Resources +- **ONNX Runtime Tutorial:** [Run with LoRA adapters](https://onnxruntime.ai/docs/genai/tutorials/finetune.html) +- **Python API docs:** [Python API](https://onnxruntime.ai/docs/genai/api/python.html#adapter-class) +- **Olive Example:** [Finetune and Deploy](https://github.com/microsoft/Olive/blob/main/examples/llama2/llama2_multilora.ipynb)