onnxruntime-genai #767

Merged · 9 commits · Jan 10, 2025
5 changes: 3 additions & 2 deletions packages/ml/onnxruntime/config.py
@@ -37,9 +37,10 @@ def onnxruntime(version, branch=None, requires=None, default=False):

 package = [
     onnxruntime('1.21', requires=['>=36', '>=cu124'], default=False, branch='main'),
-    onnxruntime('1.20', requires=['>=36', '>=cu124'], default=True),
+    onnxruntime('1.20.1', requires=['>=36', '>=cu124'], default=True),
+    onnxruntime('1.20', requires=['>=36', '>=cu124'], default=False),
     onnxruntime('1.19.2', requires=['>=36', '>=cu124'], default=False),
     onnxruntime('1.17', requires=['>=36', '<=cu122'], default=True),
     onnxruntime('1.16.3', requires='==35.*', default=True),
     onnxruntime('1.11', requires='==32.*', default=True),
 ]
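
For context, the `requires` strings pin each entry to a platform: `'>=36'` is the L4T major release (JetPack 6 and up) and `'>=cu124'` means CUDA 12.4 or newer, while `default=True` marks the version that the bare `onnxruntime` alias resolves to on a matching system. A rough sketch of the dict one entry expands into, inferred from the analogous `onnxruntime_genai()` helper added later in this PR (the `onnxruntime()` helper body itself is not shown in this diff):

```python
# Illustrative only: the approximate package dict produced by
# onnxruntime('1.20.1', requires=['>=36', '>=cu124'], default=True)
pkg = {
    'name': 'onnxruntime:1.20.1',
    'requires': ['>=36', '>=cu124'],   # L4T >= r36 (JetPack 6), CUDA >= 12.4
    'build_args': {'ONNXRUNTIME_VERSION': '1.20.1'},  # hypothetical build-arg name
    'alias': 'onnxruntime',            # only set because default=True
}
```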
18 changes: 18 additions & 0 deletions packages/ml/onnxruntime_genai/Dockerfile
@@ -0,0 +1,18 @@
#---
# name: onnxruntime_genai
# group: ml
# config: config.py
# depends: [torch, torchvision, torchaudio, transformers, onnx, onnxruntime]
# test: test.py
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG ONNXRUNTIME_GENAI_VERSION \
    ONNXRUNTIME_GENAI_BRANCH \
    CUDA_VERSION \
    FORCE_BUILD=off

COPY install.sh build.sh /tmp/onnxruntime_genai/

RUN /tmp/onnxruntime_genai/install.sh || /tmp/onnxruntime_genai/build.sh
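
The final `RUN` implements an install-or-build fallback: `install.sh` first tries a prebuilt wheel, and only when it fails (or deliberately exits because `FORCE_BUILD=on`) does the much slower `build.sh` source build run. A hypothetical way to exercise both paths with the jetson-containers CLI, assuming it is set up on the device:

```bash
# Prefer a prebuilt onnxruntime_genai wheel if one is available:
jetson-containers build onnxruntime_genai

# Force a from-source build via the '-builder' variant defined in config.py below:
jetson-containers build onnxruntime_genai:builder
```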
50 changes: 50 additions & 0 deletions packages/ml/onnxruntime_genai/build.sh
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -ex

# Extract ONNX Runtime version using Python
ONNXRUNTIME_VERSION=$(python3 -c "import onnxruntime as ort; print(ort.__version__)")

echo "CUDA Version: ${CUDA_VERSION}"
echo "Detected ONNX Runtime version: ${ONNXRUNTIME_VERSION}"
echo "Building onnxruntime_genai ${ONNXRUNTIME_GENAI_VERSION} (branch=${ONNXRUNTIME_GENAI_BRANCH})"

git clone --branch=rel-${ONNXRUNTIME_GENAI_VERSION} --depth=1 --recursive https://github.com/microsoft/onnxruntime-genai /opt/onnxruntime_genai || \
    git clone --recursive https://github.com/microsoft/onnxruntime-genai /opt/onnxruntime_genai

mkdir -p /opt/onnxruntime_genai/ort/lib/
mkdir -p /opt/onnxruntime_genai/ort/include/

cp /usr/local/lib/python3.10/dist-packages/onnxruntime/capi/libonnxruntime*.so* /opt/onnxruntime_genai/ort/lib/

cd /opt/onnxruntime_genai/ort/include/
# Use the dynamically detected version for downloading ONNX Runtime headers
wget https://raw.githubusercontent.com/microsoft/onnxruntime/rel-${ONNXRUNTIME_VERSION}/include/onnxruntime/core/session/onnxruntime_c_api.h
wget https://raw.githubusercontent.com/microsoft/onnxruntime/rel-${ONNXRUNTIME_VERSION}/include/onnxruntime/core/session/onnxruntime_float16.h

# Use the dynamically detected version for symbolic linking
ln -s /opt/onnxruntime_genai/ort/lib/libonnxruntime.so.${ONNXRUNTIME_VERSION} /opt/onnxruntime_genai/ort/lib/libonnxruntime.so

cd /opt/onnxruntime_genai

install_dir="/opt/onnxruntime_genai/install"

./build.sh --use_cuda --config Release --update --parallel --build \
    --skip_tests ${ONNXRUNTIME_FLAGS} \
    --cmake_extra_defines CMAKE_CXX_FLAGS="-Wno-unused-variable -I/usr/local/cuda/include" \
    --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}" \
    --cmake_extra_defines CMAKE_INSTALL_PREFIX=${install_dir} \
    --cuda_home /usr/local/cuda --ort_home ./ort

cd build/Linux/Release
make install

ls -ll wheel
cp wheel/onnxruntime_genai*.whl /opt
cd /

pip3 install --no-cache-dir --verbose /opt/onnxruntime_genai*.whl
python3 -c 'import onnxruntime_genai; print(onnxruntime_genai.__version__);'

twine upload --verbose /opt/onnxruntime_genai*.whl || echo "failed to upload wheel to ${TWINE_REPOSITORY_URL}"
tarpack upload onnxruntime_genai-${ONNXRUNTIME_GENAI_VERSION} ${install_dir} || echo "failed to upload tarball"
# rm -rf /tmp/onnxruntime_genai
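
Note that `build.sh` never downloads a standalone ONNX Runtime: it compiles against whatever version the base image's Python wheel provides, by copying its `libonnxruntime*.so*` into `./ort/lib` and fetching the matching C API headers for that exact version. A minimal post-build sanity check along those lines (a sketch, assuming the base onnxruntime wheel was built with the CUDA execution provider):

```python
import onnxruntime as ort
import onnxruntime_genai as og

# Both packages should import, and the versions should match the build log above
print('onnxruntime:', ort.__version__, '| onnxruntime_genai:', og.__version__)

# On Jetson the whole point is GPU inference, so the CUDA provider should be listed
assert 'CUDAExecutionProvider' in ort.get_available_providers()
```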
37 changes: 37 additions & 0 deletions packages/ml/onnxruntime_genai/config.py
@@ -0,0 +1,37 @@
from jetson_containers import L4T_VERSION, CUDA_VERSION, update_dependencies
from packaging.version import Version

def onnxruntime_genai(version, branch=None, requires=None, default=False):
    ort = package.copy()

    ort['name'] = f'onnxruntime_genai:{version}'

    if requires:
        ort['requires'] = requires

    if len(version.split('.')) < 3:
        version = version + '.0'

    if not branch:
        branch = 'v' + version

    ort['build_args'] = {
        'ONNXRUNTIME_GENAI_VERSION': version,
        'ONNXRUNTIME_GENAI_BRANCH': branch,
        'CUDA_VERSION': CUDA_VERSION,
    }

    builder = ort.copy()
    builder['name'] = builder['name'] + '-builder'
    builder['build_args'] = {**builder['build_args'], 'FORCE_BUILD': 'on'}

    if default:
        ort['alias'] = 'onnxruntime_genai'
        builder['alias'] = 'onnxruntime_genai:builder'

    return ort, builder


package = [
    onnxruntime_genai('0.6.0', requires=['>=36', '>=cu126'], default=True, branch='main')
]
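
The version/branch normalization above is easy to misread in a diff, so here is a standalone re-implementation of just that logic with the two interesting cases spelled out (illustrative, not the packaging code itself):

```python
def normalize(version, branch=None):
    # pad 'major.minor' out to 'major.minor.patch', e.g. '0.6' -> '0.6.0'
    if len(version.split('.')) < 3:
        version = version + '.0'
    # default to the upstream release-tag naming scheme, e.g. 'v0.6.0'
    if not branch:
        branch = 'v' + version
    return version, branch

assert normalize('0.6') == ('0.6.0', 'v0.6.0')
assert normalize('0.6.0', branch='main') == ('0.6.0', 'main')  # an explicit branch wins
```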
11 changes: 11 additions & 0 deletions packages/ml/onnxruntime_genai/install.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -ex
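
# note: when FORCE_BUILD=on this script exits non-zero on purpose, so the
# Dockerfile's 'RUN install.sh || build.sh' falls through to the source build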

if [ "$FORCE_BUILD" == "on" ]; then
echo "Forcing build of onnxruntime_genai ${ONNXRUNTIME_GENAI_VERSION} (branch=${ONNXRUNTIME_GENAI_BRANCH}"
exit 1
fi

pip3 install --no-cache-dir --verbose onnxruntime_genai==${ONNXRUNTIME_GENAI_VERSION}

python3 -c 'import onnxruntime_genai; print(onnxruntime_genai.__version__);'
72 changes: 72 additions & 0 deletions packages/ml/onnxruntime_genai/test.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python3

import os
import time
import shutil
import pprint
import argparse
import requests
import numpy as np

from packaging.version import Version
import onnxruntime as ort

print('onnxruntime version: ' + str(ort.__version__))

ort_version = Version(ort.__version__)

if ort_version > Version('1.10'):
    print(ort.get_build_info())

# Verify execution providers
providers = ort.get_available_providers()
print(f'Execution providers: {providers}')

print('Testing onnxruntime_genai...')
import onnxruntime_genai as og
print('onnxruntime_genai version: ' + str(og.__version__))

# Execute Hugging Face CLI
os.system("huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir .")

# Initialize model and tokenizer
model = og.Model('cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4')
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

# Set default search options
search_options = {
    'max_length': 2048,
    'batch_size': 1
}

chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'

text = "Input: Hello, I'm in jetson containers"
if not text:
    print("Error, input cannot be empty")
    exit()

prompt = f'{chat_template.format(input=text)}'
input_tokens = tokenizer.encode(prompt)

# Initialize generator
params = og.GeneratorParams(model)
params.set_search_options(**search_options)
generator = og.Generator(model, params)

print("Output: ", end='', flush=True)

try:
    generator.append_tokens(input_tokens)
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        print(tokenizer_stream.decode(new_token), end='', flush=True)
except KeyboardInterrupt:
    print(" --Control+C pressed, aborting generation--")

print()
del generator

print("\n onnxruntime_genai OK\n")