From 67e1d3ce0b78299dc535f503240c243865a918e4 Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Thu, 29 Aug 2024 10:54:00 -0600 Subject: [PATCH 1/7] Add basic example of NIM with Run.ai inference --- README.md | 2 +- run.ai/README.md | 67 +++++++++++++++++++ run.ai/examples/basic-llama/.helmignore | 23 +++++++ run.ai/examples/basic-llama/Chart.yaml | 24 +++++++ .../templates/inferenceworkload.yaml | 27 ++++++++ .../basic-llama/templates/ngc-secret.yaml | 8 +++ run.ai/examples/basic-llama/values.yaml | 5 ++ run.ai/examples/query-llama.sh | 28 ++++++++ 8 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 run.ai/README.md create mode 100644 run.ai/examples/basic-llama/.helmignore create mode 100644 run.ai/examples/basic-llama/Chart.yaml create mode 100644 run.ai/examples/basic-llama/templates/inferenceworkload.yaml create mode 100644 run.ai/examples/basic-llama/templates/ngc-secret.yaml create mode 100644 run.ai/examples/basic-llama/values.yaml create mode 100755 run.ai/examples/query-llama.sh diff --git a/README.md b/README.md index 55488fd..fa08e15 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ This repo showcases different ways NVIDIA NIMs can be deployed. This repo contai | | **Open Source Platforms** | | | | | [KServe](https://github.com/NVIDIA/nim-deploy/tree/main/kserve) | | | | **Independent Software Vendors** | | -| | | Run.ai (coming soon) | | +| | | [Run.ai on DGX Cloud](./run.ai/README.md) | | | **Cloud Service Provider Deployments** | **Azure** | | | | | [AKS Managed Kubernetes](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/aks) | | | | | [Azure ML](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/azureml) | | diff --git a/run.ai/README.md b/run.ai/README.md new file mode 100644 index 0000000..4b151d9 --- /dev/null +++ b/run.ai/README.md @@ -0,0 +1,67 @@ +# NIMs on Run.ai + +Run.ai provides a fast and efficient platform for running AI workloads. It sits on top of a group of Kubernetes clusters and provides UI, GPU-aware scheduling, container orchestration, node pooling, organizational resource quota management, and more. It gives customers the tools to manage resources across multiple Kubernetes clusters and subdivide them across project and departments, and automates Kubernetes primitives with its own AI optimized resources. + +## InferenceWorkload + +Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages Knative to automate the underlying service and routing of traffic. + +It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or administrator to determine which clusters support InferenceWorkload. + +### Example + +At the core, running NIMs with InferenceWorkload is quite simple. However, many customizations are possible, such as adding variables, PVCs to cache models, health checks, and other special configurations that will pass through to the pods backing the services. The `examples` directory can evolve over time with more complex deployment examples. The following example is a bare minimum configuration. + +This example can also be deployed through [UI](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) - including creating the secret and InferenceWorkload. + +**Prerequisites**: +* A Runai Project (and corresponding Kubernetes namespace, which is the project name prefixed with `runai-`). 
You should be set up to run "kubectl" commands to the target cluster and namespace. +* An NGC API Key +* A Docker registry secret for `nvcr.io` needs to exist in your Run.ai project. This can only be created through the UI, via "credentials" section. Add a new docker-registry credential, choose the scope to be your project, set username to `$oauthtoken` and password to your NGC API key. Set the registry url to `ngcr.io`. This only has to be done once per scope, and Run.ai will detect and use it when it is needed. + +1. Deploy InferenceWorkload to your current Kubernetes context via Helm, with working directory being the same as this README, setting the neccessary environment variables + +``` +% export NAMESPACE=[namespace] +% export NGC_KEY=[ngc key] +% helm install --set namespace=$NAMESPACE --set ngcKey=$NGC_KEY my-llama-1 examples/basic-llama +``` + +Now, wait for the InferenceWorkload to become ready. + +``` +% kubectl get ksvc basic-llama -o wide --watch +NAME URL LATESTCREATED LATESTREADY READY REASON +basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com basic-llama-00001 Unknown RevisionMissing +basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com basic-llama-00001 basic-llama-00001 Unknown RevisionMissing +basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com basic-llama-00001 basic-llama-00001 Unknown IngressNotConfigured +basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com basic-llama-00001 basic-llama-00001 Unknown Uninitialized +basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com basic-llama-00001 basic-llama-00001 True +``` + +2. Query your new inference service + +As seen above, you will get a new service with host-based routing at a DNS name of [workloadname].[namespace].inference.[cluster-suffix]. Use this to pass to the test script by setting an environment variable `LHOST` + +``` +% export LHOST="basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com" +% ./examples/query-llama.sh +Here's a song about pizza: + +**Verse 1** +I'm walkin' down the street, smellin' something sweet +Followin' the aroma to my favorite treat +A slice of heaven in a box, or so I've been told +Gimme that pizza love, and my heart will be gold +``` + +3. Remove inference service + +``` +% helm uninstall my-llama-1 +release "my-llama-1" uninstalled +``` + +#### Troubleshooting + +Users can troubleshoot workloads by looking at the underlying resources that are created. There should be deployments, pods, ksvcs to describe or view logs from. diff --git a/run.ai/examples/basic-llama/.helmignore b/run.ai/examples/basic-llama/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/run.ai/examples/basic-llama/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/run.ai/examples/basic-llama/Chart.yaml b/run.ai/examples/basic-llama/Chart.yaml new file mode 100644 index 0000000..37c63bd --- /dev/null +++ b/run.ai/examples/basic-llama/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: basic-llama +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/run.ai/examples/basic-llama/templates/inferenceworkload.yaml b/run.ai/examples/basic-llama/templates/inferenceworkload.yaml new file mode 100644 index 0000000..6d92149 --- /dev/null +++ b/run.ai/examples/basic-llama/templates/inferenceworkload.yaml @@ -0,0 +1,27 @@ +apiVersion: run.ai/v2alpha1 +kind: InferenceWorkload +metadata: + name: basic-llama + namespace: {{ .Values.namespace }} +spec: + name: + value: basic-llama + environment: + items: + NGC_API_KEY: + value: SECRET:ngc-secret,NGC_API_KEY + gpu: + value: "1" + image: + value: "nvcr.io/nim/meta/llama3-8b-instruct" + minScale: + value: 1 + maxScale: + value: 2 + ports: + items: + serving-port: + value: + container: 8000 + protocol: http + serviceType: ServingPort diff --git a/run.ai/examples/basic-llama/templates/ngc-secret.yaml b/run.ai/examples/basic-llama/templates/ngc-secret.yaml new file mode 100644 index 0000000..78fb174 --- /dev/null +++ b/run.ai/examples/basic-llama/templates/ngc-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +type: Opaque +metadata: + name: ngc-secret + namespace: {{ .Values.namespace}} +data: + NGC_API_KEY: {{ .Values.ngcKey | b64enc }} diff --git a/run.ai/examples/basic-llama/values.yaml b/run.ai/examples/basic-llama/values.yaml new file mode 100644 index 0000000..e09c86e --- /dev/null +++ b/run.ai/examples/basic-llama/values.yaml @@ -0,0 +1,5 @@ + +# These can be edited here locally, but should be overridden like so: +# helm install --set namespace=$NAMESPACE --set ngcKey=$NGC_KEY +namespace: override-with-flag +ngcKey: override-with-flag diff --git a/run.ai/examples/query-llama.sh b/run.ai/examples/query-llama.sh new file mode 100755 index 0000000..c38d314 --- /dev/null +++ b/run.ai/examples/query-llama.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [[ -z $LHOST ]]; then + echo "please provide an LHOST env var" + exit 1 +fi + +Q="Write a song about pizza" + +curl "http://${LHOST}/v1/chat/completions" \ + -H "Accept: application/json" 
\ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "content": "'"${Q}"'", + "role": "user" + } + ], + "model": "meta/llama3-8b-instruct", + "max_tokens": 500, + "top_p": 0.8, + "temperature": 0.9, + "seed": '$RANDOM', + "stream": false, + "stop": ["hello\n"], + "frequency_penalty": 1.0 +}' | jq -r '.choices[0]|.message.content' From 85c3a97ef07e2bf34530d651eded28c147799a4c Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Thu, 29 Aug 2024 11:02:43 -0600 Subject: [PATCH 2/7] Update READMEs --- README.md | 2 +- run.ai/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa08e15..232e3ce 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ This repo showcases different ways NVIDIA NIMs can be deployed. This repo contai | | **Open Source Platforms** | | | | | [KServe](https://github.com/NVIDIA/nim-deploy/tree/main/kserve) | | | | **Independent Software Vendors** | | -| | | [Run.ai on DGX Cloud](./run.ai/README.md) | | +| | | [Run.ai](./run.ai/README.md) | | | **Cloud Service Provider Deployments** | **Azure** | | | | | [AKS Managed Kubernetes](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/aks) | | | | | [Azure ML](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/azureml) | | diff --git a/run.ai/README.md b/run.ai/README.md index 4b151d9..b79b10b 100644 --- a/run.ai/README.md +++ b/run.ai/README.md @@ -6,7 +6,7 @@ Run.ai provides a fast and efficient platform for running AI workloads. It sits Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages Knative to automate the underlying service and routing of traffic. -It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or administrator to determine which clusters support InferenceWorkload. +It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or cluster administrator to determine which clusters support InferenceWorkload. ### Example From b219df90c6163d7289a10e60bff5a9535e3c8c1f Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Thu, 29 Aug 2024 13:01:58 -0600 Subject: [PATCH 3/7] Minor doc fixes and optimizations --- run.ai/README.md | 5 +++-- run.ai/examples/basic-llama/templates/inferenceworkload.yaml | 2 +- run.ai/examples/query-llama.sh | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/run.ai/README.md b/run.ai/README.md index b79b10b..39879a0 100644 --- a/run.ai/README.md +++ b/run.ai/README.md @@ -17,6 +17,7 @@ This example can also be deployed through [UI](https://docs.run.ai/latest/Resear **Prerequisites**: * A Runai Project (and corresponding Kubernetes namespace, which is the project name prefixed with `runai-`). You should be set up to run "kubectl" commands to the target cluster and namespace. * An NGC API Key +* `curl` and `jq` for the test script * A Docker registry secret for `nvcr.io` needs to exist in your Run.ai project. This can only be created through the UI, via "credentials" section. Add a new docker-registry credential, choose the scope to be your project, set username to `$oauthtoken` and password to your NGC API key. Set the registry url to `ngcr.io`. This only has to be done once per scope, and Run.ai will detect and use it when it is needed. 1. 
Deploy InferenceWorkload to your current Kubernetes context via Helm, with working directory being the same as this README, setting the neccessary environment variables @@ -27,7 +28,7 @@ This example can also be deployed through [UI](https://docs.run.ai/latest/Resear % helm install --set namespace=$NAMESPACE --set ngcKey=$NGC_KEY my-llama-1 examples/basic-llama ``` -Now, wait for the InferenceWorkload to become ready. +Now, wait for the InferenceWorkload's ksvc to become ready. ``` % kubectl get ksvc basic-llama -o wide --watch @@ -41,7 +42,7 @@ basic-llama http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvi 2. Query your new inference service -As seen above, you will get a new service with host-based routing at a DNS name of [workloadname].[namespace].inference.[cluster-suffix]. Use this to pass to the test script by setting an environment variable `LHOST` +As seen above, you will get a new knative service accessible via hostname-based routing. Use the hostname from this URL to pass to the test script by setting an environment variable `LHOST`. ``` % export LHOST="basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com" diff --git a/run.ai/examples/basic-llama/templates/inferenceworkload.yaml b/run.ai/examples/basic-llama/templates/inferenceworkload.yaml index 6d92149..acb65cd 100644 --- a/run.ai/examples/basic-llama/templates/inferenceworkload.yaml +++ b/run.ai/examples/basic-llama/templates/inferenceworkload.yaml @@ -13,7 +13,7 @@ spec: gpu: value: "1" image: - value: "nvcr.io/nim/meta/llama3-8b-instruct" + value: "nvcr.io/nim/meta/llama-3.1-8b-instruct" minScale: value: 1 maxScale: diff --git a/run.ai/examples/query-llama.sh b/run.ai/examples/query-llama.sh index c38d314..09012c0 100755 --- a/run.ai/examples/query-llama.sh +++ b/run.ai/examples/query-llama.sh @@ -6,8 +6,9 @@ if [[ -z $LHOST ]]; then fi Q="Write a song about pizza" +MODEL=$(curl -s "http://${LHOST}/v1/models" | jq -r '.data[0]|.id') -curl "http://${LHOST}/v1/chat/completions" \ +curl -s "http://${LHOST}/v1/chat/completions" \ -H "Accept: application/json" \ -H "Content-Type: application/json" \ -d '{ @@ -17,7 +18,7 @@ curl "http://${LHOST}/v1/chat/completions" \ "role": "user" } ], - "model": "meta/llama3-8b-instruct", + "model": "'"${MODEL}"'", "max_tokens": 500, "top_p": 0.8, "temperature": 0.9, From b2bb104a050e87b027ec56a1cf17f8b117999ac8 Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Thu, 29 Aug 2024 14:52:53 -0600 Subject: [PATCH 4/7] Add PVC based example to Run.ai --- run.ai/README.md | 13 ++++++- run.ai/examples/basic-llama-pvc/.helmignore | 23 +++++++++++ run.ai/examples/basic-llama-pvc/Chart.yaml | 24 ++++++++++++ .../templates/inferenceworkload.yaml | 39 +++++++++++++++++++ .../basic-llama-pvc/templates/ngc-secret.yaml | 8 ++++ .../basic-llama-pvc/templates/pvc.yaml | 14 +++++++ run.ai/examples/basic-llama-pvc/values.yaml | 8 ++++ 7 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 run.ai/examples/basic-llama-pvc/.helmignore create mode 100644 run.ai/examples/basic-llama-pvc/Chart.yaml create mode 100644 run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml create mode 100644 run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml create mode 100644 run.ai/examples/basic-llama-pvc/templates/pvc.yaml create mode 100644 run.ai/examples/basic-llama-pvc/values.yaml diff --git a/run.ai/README.md b/run.ai/README.md index 39879a0..9f1026e 100644 --- a/run.ai/README.md +++ b/run.ai/README.md @@ -8,7 +8,7 @@ Run.ai provides an 
[InferenceWorkload](https://docs.run.ai/latest/Researcher/wor It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or cluster administrator to determine which clusters support InferenceWorkload. -### Example +### Basic Example At the core, running NIMs with InferenceWorkload is quite simple. However, many customizations are possible, such as adding variables, PVCs to cache models, health checks, and other special configurations that will pass through to the pods backing the services. The `examples` directory can evolve over time with more complex deployment examples. The following example is a bare minimum configuration. @@ -62,7 +62,16 @@ Gimme that pizza love, and my heart will be gold % helm uninstall my-llama-1 release "my-llama-1" uninstalled ``` +### PVC Example -#### Troubleshooting +The PVC example runs in much the same way. It adds a mounted PVC to the example NIM container in a place where it can be used as a cache - `/opt/nim/.cache`, and configured to be retained between helm uninstall and install, so that the model data need only be downloaded on first use. + +``` +% helm install --set namespace=$NAMESPACE --set ngcKey=$NGC_KEY my-llama-pvc examples/basic-llama-pvc + +% kubectl get ksvc basic-llama-pvc --watch +``` + +### Troubleshooting Users can troubleshoot workloads by looking at the underlying resources that are created. There should be deployments, pods, ksvcs to describe or view logs from. diff --git a/run.ai/examples/basic-llama-pvc/.helmignore b/run.ai/examples/basic-llama-pvc/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/run.ai/examples/basic-llama-pvc/Chart.yaml b/run.ai/examples/basic-llama-pvc/Chart.yaml new file mode 100644 index 0000000..2602149 --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: basic-llama-pvc +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.0.0" diff --git a/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml b/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml new file mode 100644 index 0000000..4acc3f1 --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml @@ -0,0 +1,39 @@ +apiVersion: run.ai/v2alpha1 +kind: InferenceWorkload +metadata: + name: basic-llama-pvc + namespace: {{ .Values.namespace }} +spec: + name: + value: basic-llama-pvc + environment: + items: + NGC_API_KEY: + value: SECRET:ngc-secret-pvc,NGC_API_KEY + gpu: + value: "1" + image: + value: "nvcr.io/nim/meta/llama-3.1-8b-instruct" + minScale: + value: 1 + maxScale: + value: 2 + runAsUid: + value: 1000 + runAsGid: + value: 1000 + ports: + items: + serving-port: + value: + container: 8000 + protocol: http + serviceType: ServingPort + pvcs: + items: + pvc: + value: + claimName: nim-cache + existingPvc: true + path: /opt/nim/.cache + readOnly: false diff --git a/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml b/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml new file mode 100644 index 0000000..53772fb --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +type: Opaque +metadata: + name: ngc-secret-pvc + namespace: {{ .Values.namespace}} +data: + NGC_API_KEY: {{ .Values.ngcKey | b64enc }} diff --git a/run.ai/examples/basic-llama-pvc/templates/pvc.yaml b/run.ai/examples/basic-llama-pvc/templates/pvc.yaml new file mode 100644 index 0000000..07ef52c --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/templates/pvc.yaml @@ -0,0 +1,14 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: nim-cache + namespace: {{ .Values.namespace }} + annotations: + helm.sh/resource-policy: "keep" +spec: + storageClassName: {{ .Values.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: 32Gi diff --git a/run.ai/examples/basic-llama-pvc/values.yaml b/run.ai/examples/basic-llama-pvc/values.yaml new file mode 100644 index 0000000..69b36db --- /dev/null +++ b/run.ai/examples/basic-llama-pvc/values.yaml @@ -0,0 +1,8 @@ + +# These can be edited here locally, but should be overridden like so: +# helm install --set namespace=$NAMESPACE --set ngcKey=$NGC_KEY +namespace: override-with-flag +ngcKey: override-with-flag + +## optional to override +storageClassName: standard-rwx From 6e8c6f14aa4733bf0c3061ab05ee9a1e014a7644 Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Fri, 30 Aug 2024 09:09:25 -0600 Subject: [PATCH 5/7] Fixes --- run.ai/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run.ai/README.md b/run.ai/README.md index 9f1026e..2ecb3f5 100644 --- a/run.ai/README.md +++ b/run.ai/README.md @@ -4,7 +4,7 @@ Run.ai provides a fast and efficient platform for running AI workloads. It sits ## InferenceWorkload -Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages Knative to automate the underlying service and routing of traffic. +Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages Knative to automate the underlying service and routing of traffic. YAML examples can be found [here](https://docs.run.ai/latest/developer/cluster-api/submit-yaml/#inference-workload-example). 
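Since InferenceWorkload fronts the NIM with a Knative Service, the generated URL can be read directly from the ksvc once it is ready. The following is a minimal sketch assuming the `basic-llama` example from this chart and the standard Knative `status.url` field; `LHOST` is the same variable consumed by `examples/query-llama.sh`.

```
% kubectl get ksvc basic-llama -o jsonpath='{.status.url}'
http://basic-llama.runai-myproject.inference.12345678.dgxc.ngc.nvidia.com
% export LHOST=$(kubectl get ksvc basic-llama -o jsonpath='{.status.url}' | sed 's|^http://||')
% curl -s "http://${LHOST}/v1/models" | jq -r '.data[0].id'   # confirm the NIM is serving before running query-llama.sh
```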
It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or cluster administrator to determine which clusters support InferenceWorkload. @@ -18,9 +18,9 @@ This example can also be deployed through [UI](https://docs.run.ai/latest/Resear * A Runai Project (and corresponding Kubernetes namespace, which is the project name prefixed with `runai-`). You should be set up to run "kubectl" commands to the target cluster and namespace. * An NGC API Key * `curl` and `jq` for the test script -* A Docker registry secret for `nvcr.io` needs to exist in your Run.ai project. This can only be created through the UI, via "credentials" section. Add a new docker-registry credential, choose the scope to be your project, set username to `$oauthtoken` and password to your NGC API key. Set the registry url to `ngcr.io`. This only has to be done once per scope, and Run.ai will detect and use it when it is needed. +* A Docker registry secret for `nvcr.io` needs to exist in your Run.ai project. This can only be created through the UI, via "credentials" section. Add a new docker-registry credential, choose the scope to be your project, set username to `$oauthtoken` and password to your NGC API key. Set the registry url to `nvcr.io`. This only has to be done once per scope, and Run.ai will detect and use it when it is needed. -1. Deploy InferenceWorkload to your current Kubernetes context via Helm, with working directory being the same as this README, setting the neccessary environment variables +1. Deploy InferenceWorkload to your current Kubernetes context via Helm, with working directory being the same as this README, setting the necessary environment variables ``` % export NAMESPACE=[namespace] From c996ee82c776748b954d8389ab8903d28ab21fe9 Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Fri, 30 Aug 2024 10:11:53 -0600 Subject: [PATCH 6/7] Move run.ai to docs, address comments --- {run.ai => docs/run.ai}/README.md | 19 ++++++++++++++++--- .../examples/basic-llama-pvc/.helmignore | 0 .../examples/basic-llama-pvc/Chart.yaml | 0 .../templates/inferenceworkload.yaml | 0 .../basic-llama-pvc/templates/ngc-secret.yaml | 0 .../basic-llama-pvc/templates/pvc.yaml | 0 .../examples/basic-llama-pvc/values.yaml | 0 .../run.ai}/examples/basic-llama/.helmignore | 0 .../run.ai}/examples/basic-llama/Chart.yaml | 0 .../templates/inferenceworkload.yaml | 0 .../basic-llama/templates/ngc-secret.yaml | 0 .../run.ai}/examples/basic-llama/values.yaml | 0 .../run.ai}/examples/query-llama.sh | 0 13 files changed, 16 insertions(+), 3 deletions(-) rename {run.ai => docs/run.ai}/README.md (62%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/.helmignore (100%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/Chart.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/templates/inferenceworkload.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/templates/ngc-secret.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/templates/pvc.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama-pvc/values.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama/.helmignore (100%) rename {run.ai => docs/run.ai}/examples/basic-llama/Chart.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama/templates/inferenceworkload.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama/templates/ngc-secret.yaml (100%) rename {run.ai => docs/run.ai}/examples/basic-llama/values.yaml (100%) rename {run.ai => 
docs/run.ai}/examples/query-llama.sh (100%) diff --git a/run.ai/README.md b/docs/run.ai/README.md similarity index 62% rename from run.ai/README.md rename to docs/run.ai/README.md index 2ecb3f5..daec17d 100644 --- a/run.ai/README.md +++ b/docs/run.ai/README.md @@ -1,10 +1,23 @@ # NIMs on Run.ai -Run.ai provides a fast and efficient platform for running AI workloads. It sits on top of a group of Kubernetes clusters and provides UI, GPU-aware scheduling, container orchestration, node pooling, organizational resource quota management, and more. It gives customers the tools to manage resources across multiple Kubernetes clusters and subdivide them across project and departments, and automates Kubernetes primitives with its own AI optimized resources. +[Run.ai](https://www.run.ai/) provides a platform for accelerating AI development delivering life cycle support spanning from concept to deployment of AI workloads. It layers on top of Kubernetes starting with a single cluster but extending to centralized multi-cluster management. It provides UI, GPU-aware scheduling, container orchestration, node pooling, organizational resource quota management, and more. And it offers administrators, researchers, and developers tools to manage resources across multiple Kubernetes clusters and subdivide them across project and departments, and automates Kubernetes primitives with its own AI optimized resources. + +## Run.ai Deployment Options + +The Run:ai Control Plane is available as a [hosted service](https://docs.run.ai/latest/home/components/#runai-control-plane-on-the-cloud) or alternatively as a [self-hosted](https://docs.run.ai/latest/home/components/#self-hosted-control-plane) option (including in disconnected "air-gapped" environments). In either case, the control plane can manage Run:ai "cluster engine" equipped clusters whether local or remotely cloud hosted. + +## Prerequisites + +1. A conformant Kubernetes cluster ([RunAI K8s version requirements](https://docs.run.ai/latest/admin/overview-administrator/)) +2. RunAI Control Plane and cluster(s) [installed](https://docs.run.ai/latest/admin/runai-setup/cluster-setup/cluster-install/) and operational +3. [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) installed +4. General NIM requirements: [NIM Prerequisites](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html#prerequisites) +5. An NVIDIA AI Enterprise (NVAIE) License: [Sign up for NVAIE license](https://build.nvidia.com/meta/llama-3-8b-instruct?snippet_tab=Docker&signin=true&integrate_nim=true&self_hosted_api=true) or [Request a Free 90-Day NVAIE License](https://enterpriseproductregistration.nvidia.com/?LicType=EVAL&ProductFamily=NVAIEnterprise) through the NVIDIA Developer Program. +6. An NVIDIA NGC API Key: please follow the guidance in the [NVIDIA NIM Getting Started](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html#option-2-from-ngc) documentation to generate a properly scoped API key if you haven't already. ## InferenceWorkload -Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages Knative to automate the underlying service and routing of traffic. YAML examples can be found [here](https://docs.run.ai/latest/developer/cluster-api/submit-yaml/#inference-workload-example). 
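For readers who prefer raw manifests over Helm, the resource rendered by the `basic-llama` chart in this repo reduces to the sketch below. It assumes the `ngc-secret` Secret created by the chart already exists in the target namespace, and `runai-myproject` is the placeholder namespace used elsewhere in this README.

```
% kubectl apply -n runai-myproject -f - <<'EOF'
apiVersion: run.ai/v2alpha1
kind: InferenceWorkload
metadata:
  name: basic-llama
spec:
  name:
    value: basic-llama
  environment:
    items:
      NGC_API_KEY:
        value: SECRET:ngc-secret,NGC_API_KEY
  gpu:
    value: "1"
  image:
    value: "nvcr.io/nim/meta/llama-3.1-8b-instruct"
  minScale:
    value: 1
  maxScale:
    value: 2
  ports:
    items:
      serving-port:
        value:
          container: 8000
          protocol: http
          serviceType: ServingPort
EOF
```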
+Run.ai provides an [InferenceWorkload](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) resource to help automate inference services like NIMs. It leverages [Knative](https://github.com/knative) to automate the underlying service and routing of traffic. YAML examples can be found [here](https://docs.run.ai/latest/developer/cluster-api/submit-yaml/#inference-workload-example). It should be noted that InferenceWorkload is an optional add-on for Run.ai. Consult your Run.ai UI portal or cluster administrator to determine which clusters support InferenceWorkload. @@ -14,7 +27,7 @@ At the core, running NIMs with InferenceWorkload is quite simple. However, many This example can also be deployed through [UI](https://docs.run.ai/latest/Researcher/workloads/inference-overview/) - including creating the secret and InferenceWorkload. -**Prerequisites**: +**Preparation**: * A Runai Project (and corresponding Kubernetes namespace, which is the project name prefixed with `runai-`). You should be set up to run "kubectl" commands to the target cluster and namespace. * An NGC API Key * `curl` and `jq` for the test script diff --git a/run.ai/examples/basic-llama-pvc/.helmignore b/docs/run.ai/examples/basic-llama-pvc/.helmignore similarity index 100% rename from run.ai/examples/basic-llama-pvc/.helmignore rename to docs/run.ai/examples/basic-llama-pvc/.helmignore diff --git a/run.ai/examples/basic-llama-pvc/Chart.yaml b/docs/run.ai/examples/basic-llama-pvc/Chart.yaml similarity index 100% rename from run.ai/examples/basic-llama-pvc/Chart.yaml rename to docs/run.ai/examples/basic-llama-pvc/Chart.yaml diff --git a/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml b/docs/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml similarity index 100% rename from run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml rename to docs/run.ai/examples/basic-llama-pvc/templates/inferenceworkload.yaml diff --git a/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml b/docs/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml similarity index 100% rename from run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml rename to docs/run.ai/examples/basic-llama-pvc/templates/ngc-secret.yaml diff --git a/run.ai/examples/basic-llama-pvc/templates/pvc.yaml b/docs/run.ai/examples/basic-llama-pvc/templates/pvc.yaml similarity index 100% rename from run.ai/examples/basic-llama-pvc/templates/pvc.yaml rename to docs/run.ai/examples/basic-llama-pvc/templates/pvc.yaml diff --git a/run.ai/examples/basic-llama-pvc/values.yaml b/docs/run.ai/examples/basic-llama-pvc/values.yaml similarity index 100% rename from run.ai/examples/basic-llama-pvc/values.yaml rename to docs/run.ai/examples/basic-llama-pvc/values.yaml diff --git a/run.ai/examples/basic-llama/.helmignore b/docs/run.ai/examples/basic-llama/.helmignore similarity index 100% rename from run.ai/examples/basic-llama/.helmignore rename to docs/run.ai/examples/basic-llama/.helmignore diff --git a/run.ai/examples/basic-llama/Chart.yaml b/docs/run.ai/examples/basic-llama/Chart.yaml similarity index 100% rename from run.ai/examples/basic-llama/Chart.yaml rename to docs/run.ai/examples/basic-llama/Chart.yaml diff --git a/run.ai/examples/basic-llama/templates/inferenceworkload.yaml b/docs/run.ai/examples/basic-llama/templates/inferenceworkload.yaml similarity index 100% rename from run.ai/examples/basic-llama/templates/inferenceworkload.yaml rename to docs/run.ai/examples/basic-llama/templates/inferenceworkload.yaml diff --git 
a/run.ai/examples/basic-llama/templates/ngc-secret.yaml b/docs/run.ai/examples/basic-llama/templates/ngc-secret.yaml similarity index 100% rename from run.ai/examples/basic-llama/templates/ngc-secret.yaml rename to docs/run.ai/examples/basic-llama/templates/ngc-secret.yaml diff --git a/run.ai/examples/basic-llama/values.yaml b/docs/run.ai/examples/basic-llama/values.yaml similarity index 100% rename from run.ai/examples/basic-llama/values.yaml rename to docs/run.ai/examples/basic-llama/values.yaml diff --git a/run.ai/examples/query-llama.sh b/docs/run.ai/examples/query-llama.sh similarity index 100% rename from run.ai/examples/query-llama.sh rename to docs/run.ai/examples/query-llama.sh From 48644bb6fbf806403e5f0cc2bdb151790464afeb Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Fri, 30 Aug 2024 10:48:32 -0600 Subject: [PATCH 7/7] Add air-gapped text --- README.md | 2 +- docs/run.ai/README.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 232e3ce..aa72d59 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ This repo showcases different ways NVIDIA NIMs can be deployed. This repo contai | | **Open Source Platforms** | | | | | [KServe](https://github.com/NVIDIA/nim-deploy/tree/main/kserve) | | | | **Independent Software Vendors** | | -| | | [Run.ai](./run.ai/README.md) | | +| | | [Run.ai](./docs/run.ai/README.md) | | | **Cloud Service Provider Deployments** | **Azure** | | | | | [AKS Managed Kubernetes](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/aks) | | | | | [Azure ML](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/azureml) | | diff --git a/docs/run.ai/README.md b/docs/run.ai/README.md index daec17d..b188ace 100644 --- a/docs/run.ai/README.md +++ b/docs/run.ai/README.md @@ -88,3 +88,7 @@ The PVC example runs in much the same way. It adds a mounted PVC to the example ### Troubleshooting Users can troubleshoot workloads by looking at the underlying resources that are created. There should be deployments, pods, ksvcs to describe or view logs from. + +## Air-gapped operations + +For scenarios in which Run:ai clusters are operating in air-gapped (disconnected) environments, please see NVIDIA NIM documentation for [serving models from local assets](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html#serving-models-from-local-assets).
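
As a companion to the Troubleshooting section above, the commands below show one way to walk the resources that InferenceWorkload creates. This is a sketch assuming the `basic-llama` example and the `runai-myproject` placeholder namespace used elsewhere in this README; it relies on Knative's default pod label `serving.knative.dev/service` and default container name `user-container`.

```
% kubectl get inferenceworkloads,ksvc,deployments,pods -n runai-myproject      # everything backing the workload
% kubectl describe ksvc basic-llama -n runai-myproject                         # Knative revision and routing status
% kubectl logs -n runai-myproject -l serving.knative.dev/service=basic-llama -c user-container --tail=100
```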