diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5d1d536704..2253e6271d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -13,6 +13,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -24,7 +26,30 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" +ENV HISTFILE="/home/coder/.cache/._bash_history" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" -ENV HISTFILE="/home/coder/.cache/._bash_history" +ENV SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true +ENV SCCACHE_IDLE_TIMEOUT=0 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json similarity index 86% rename from .devcontainer/cuda11.8-conda/devcontainer.json rename to .devcontainer/cuda12.9-conda/devcontainer.json index 184d664799..c5effd0db8 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -3,24 +3,26 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "11.8", + "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:26.02-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.9-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -29,7 +31,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.9-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json similarity index 85% rename from .devcontainer/cuda11.8-pip/devcontainer.json rename to .devcontainer/cuda12.9-pip/devcontainer.json index 702422dbd4..49702d7f5d 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -3,24 +3,26 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "11.8", + "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:26.02-cpp-cuda12.9" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -28,7 +30,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json similarity index 86% rename from .devcontainer/cuda12.8-conda/devcontainer.json rename to .devcontainer/cuda13.0-conda/devcontainer.json index ee9b1ddad7..b1dec3d161 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -3,24 +3,26 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.8", + "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.06-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:26.02-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda13.0-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -29,7 +31,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json similarity index 85% rename from .devcontainer/cuda12.8-pip/devcontainer.json rename to .devcontainer/cuda13.0-pip/devcontainer.json index 84454b0886..df77a93038 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -3,24 +3,26 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.8", + "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda12.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:26.02-cpp-cuda13.0" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda13.0-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -28,7 +30,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 01f556d94c..0000000000 --- a/.flake8 +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -[flake8] -filename = *.py, *.pyx, *.pxd, *.pxi -force-check = True -max-line-length = 88 -exclude = - .eggs, - *.egg, - build, - docs, - .git, - _skbuild, -ignore = - # line break before binary operator - W503, - # whitespace before : - E203 -per-file-ignores = - # Ignore black/flake8-pyi conflicts - *.pyi:E301 E302 E704 - # Rules ignored only in Cython: - # E211: whitespace before '(' (used in multi-line imports) - # E225: Missing whitespace around operators (breaks cython casting syntax like ) - # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) - # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) - # E275: Missing whitespace after keyword (Doesn't work with Cython except?) 
- # E402: invalid syntax (works for Python, not Cython) - # E999: invalid syntax (works for Python, not Cython) - # W503: line break before binary operator (breaks lines that start with a pointer) - # W504: line break after binary operator (breaks lines that end with a pointer) - *.pyx: E211, E225, E226, E227, E275, E402, E999, W503, W504 - *.pxd: E211, E225, E226, E227, E275, E402, E999, W503, W504 - *.pxi: E211, E225, E226, E227, E275, E402, E999, W503, W504 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f5fe2c9d4c..d94819012e 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -14,6 +14,7 @@ python/setup.py @rapidsai/kvikio-cmake-codeowners @rapidsai/kvikio-python-cod #CI code owners /.github/ @rapidsai/ci-codeowners /ci/ @rapidsai/ci-codeowners +/.shellcheckrc @rapidsai/ci-codeowners #packaging code owners /.pre-commit-config.yaml @rapidsai/packaging-codeowners diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..2c9a85805c --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,27 @@ +# GitHub Auto-Generated Release Notes Configuration for RAPIDS +# This file configures how GitHub automatically generates release notes + +changelog: + exclude: + labels: + - ignore-for-release + - dependencies + authors: + - rapids-bot[bot] + - dependabot[bot] + categories: + - title: 🚨 Breaking Changes + labels: + - breaking + - title: 🐛 Bug Fixes + labels: + - bug + - title: 📖 Documentation + labels: + - doc + - title: 🚀 New Features + labels: + - feature request + - title: 🛠️ Improvements + labels: + - improvement diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f88bb15618..489ca20f76 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -3,21 +3,27 @@ name: build on: push: branches: - - "branch-*" + - "main" tags: - v[0-9][0-9].[0-9][0-9].[0-9][0-9] workflow_dispatch: inputs: branch: + description: | + branch: git branch the workflow run targets. + Required even when 'sha' is provided because it is also used for organizing artifacts. required: true type: string date: + description: "date: Date (YYYY-MM-DD) this run is for. 
Used to organize artifacts produced by nightly builds" required: true type: string sha: + description: "sha: full git commit SHA to check out" required: true type: string build_type: + description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly @@ -28,27 +34,30 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} + node_type: cpu8 date: ${{ inputs.date }} script: ci/build_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} script: ci/build_python.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -58,44 +67,48 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:26.02-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_cpp.sh package-name: libkvikio package-type: cpp + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_python.sh package-name: kvikio + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN package-type: python wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -106,7 +119,7 @@ jobs: wheel-publish-python: needs: 
wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 97273209c6..3f1bf94819 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,15 +27,12 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} check-nightly-ci: - # Switch to ubuntu-latest once it defaults to a version of Ubuntu that - # provides at least Python 3.11 (see - # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: telemetry-setup env: RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -46,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | test_cpp: @@ -89,97 +86,114 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request + node_type: cpu8 script: ci/build_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request script: ci/test_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/test_java.sh" conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_python.sh + sccache-dist-token-secret-name: 
GIST_REPO_READ_ORG_GITHUB_TOKEN docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/build_docs.sh" devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main with: - arch: '["amd64"]' - cuda: '["12.8"]' + arch: '["amd64", "arm64"]' + cuda: '["13.0"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all --verbose; + sccache --zero-stats; + build-all --verbose 2>&1 | tee telemetry-artifacts/build.log; python -c "import kvikio; print(kvikio.__version__)"; - sccache -s; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request + node_type: cpu8 script: ci/build_wheel_cpp.sh package-name: libkvikio package-type: cpp + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request + node_type: cpu8 script: ci/build_wheel_python.sh package-name: kvikio package-type: python + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN telemetry-summarize: # This job must use a self-hosted runner to record telemetry traces. runs-on: linux-amd64-cpu4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3191e26114..34bb32d67f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,40 +4,48 @@ on: workflow_dispatch: inputs: branch: + description: | + branch: git branch the workflow run targets. + Required even when 'sha' is provided because it is also used for organizing artifacts. required: true type: string date: + description: "date: Date (YYYY-MM-DD) this run is for. 
Used to organize artifacts produced by nightly builds" required: true type: string sha: + description: "sha: full git commit SHA to check out" required: true type: string build_type: + description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} script: ci/test_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} script: ci/test_python.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -45,5 +53,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/test_java.sh" diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 475380cb68..c471e2a151 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db0629d88f..2065300064 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. 
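The input descriptions added above spell out what the nightly automation passes when it triggers these workflows. A sketch of an equivalent manual dispatch with the GitHub CLI (the branch, date, and sha values below are placeholders, not parameters from a real run):

# branch/date/sha are required inputs; build_type defaults to nightly.
gh workflow run test.yaml \
  -f branch=main \
  -f date=2026-01-15 \
  -f sha=0123456789abcdef0123456789abcdef01234567 \
  -f build_type=nightly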
+# SPDX-License-Identifier: Apache-2.0 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -12,22 +13,13 @@ repos: - id: isort args: ["--config-root=python/", "--resolve-all-configs"] files: python/.* - types_or: [python, cython, pyi] - - repo: https://github.com/psf/black - rev: 23.3.0 + types: [cython] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.1 hooks: - - id: black - files: python/.* - args: ["--config", "python/kvikio/pyproject.toml"] - - repo: https://github.com/PyCQA/flake8 - rev: 7.1.1 - hooks: - - id: flake8 - args: ["--config=.flake8"] - files: python/.*$ - types: [file] - types_or: [python, cython] - additional_dependencies: ["flake8-force"] + - id: ruff-check + args: ["--fix"] + - id: ruff-format - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.15.0 hooks: @@ -82,29 +74,33 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.6.0 + rev: v1.1.0 hooks: - id: verify-copyright + args: [--fix, --spdx] files: | (?x) - [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$| - ^[.]pre-commit-config[.]yaml$| - CMakeLists[.]txt$| - meta[.]yaml$ + [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|pyi|java|bat)$| + ^[.]pre-commit-config[.]yaml$| + CMakeLists[.]txt$| + meta[.]yaml$| + pyproject[.]toml$| + recipe[.]yaml$| + dependencies[.]yaml$| + Makefile$| + pom[.]xml$ - id: verify-alpha-spec - id: verify-codeowners args: [--fix, --project-prefix=kvikio] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.17.0 + rev: v1.20.0 hooks: - id: rapids-dependency-file-generator - args: ["--clean"] + args: ["--clean", "--warn-all", "--strict"] - repo: https://github.com/shellcheck-py/shellcheck-py rev: v0.10.0.1 hooks: - id: shellcheck - args: ["--severity=warning"] - files: ^ci/ default_language_version: diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000000..b57b9d1962 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,2 @@ +# Disable file checks (otherwise every use of `gha-tools` will get flagged) +disable=SC1091 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6911639b10..1e6789b685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,104 @@ +# kvikio 25.10.00 (8 Oct 2025) + +## 🚨 Breaking Changes + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Keep string alive until nvtxDomainResourceCreate ([#832](https://github.com/rapidsai/kvikio/pull/832)) [@vyasr](https://github.com/vyasr) +- Fix a bug in public S3 inference ([#831](https://github.com/rapidsai/kvikio/pull/831)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix CUDA 13 handling of libcufile on aarch64 ([#827](https://github.com/rapidsai/kvikio/pull/827)) [@bdice](https://github.com/bdice) +- Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system ([#814](https://github.com/rapidsai/kvikio/pull/814)) [@ahoyle-nvidia](https://github.com/ahoyle-nvidia) +- Fix an S3 parsing bug in the open function. 
Improve regex usage ([#810](https://github.com/rapidsai/kvikio/pull/810)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove remaining nvcomp references ([#801](https://github.com/rapidsai/kvikio/pull/801)) [@vyasr](https://github.com/vyasr) +- Revert "Set compiler versions in context ([#755](https://github.com/rapidsai/kvikio/pull/755))" ([#784](https://github.com/rapidsai/kvikio/pull/784)) [@vyasr](https://github.com/vyasr) +- Relax mmap read requirement. Improve error message. ([#781](https://github.com/rapidsai/kvikio/pull/781)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🚀 New Features + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Upgrade to nvCOMP 5.0.0.6 ([#800](https://github.com/rapidsai/kvikio/pull/800)) [@vuule](https://github.com/vuule) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) +- Support WebHDFS (2/2): Python binding ([#791](https://github.com/rapidsai/kvikio/pull/791)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file size query for S3 presigned URL ([#789](https://github.com/rapidsai/kvikio/pull/789)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support WebHDFS (1/2): C++ implementation ([#788](https://github.com/rapidsai/kvikio/pull/788)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (2/n): host/device read Python binding ([#742](https://github.com/rapidsai/kvikio/pull/742)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Empty commit to trigger a build ([#840](https://github.com/rapidsai/kvikio/pull/840)) [@msarahan](https://github.com/msarahan) +- Configure repo for automatic release notes generation ([#825](https://github.com/rapidsai/kvikio/pull/825)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use C++20 for KvikIO main library ([#819](https://github.com/rapidsai/kvikio/pull/819)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Explicitly manage TLS/SSL CA paths for remote I/O ([#817](https://github.com/rapidsai/kvikio/pull/817)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Added KVIKIO_REMOTE_VERBOSE option ([#815](https://github.com/rapidsai/kvikio/pull/815)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use branch-25.10 again ([#812](https://github.com/rapidsai/kvikio/pull/812)) [@jameslamb](https://github.com/jameslamb) +- Update rapids-dependency-file-generator ([#809](https://github.com/rapidsai/kvikio/pull/809)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding ([#808](https://github.com/rapidsai/kvikio/pull/808)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove more nvcomp packaging for conda ([#804](https://github.com/rapidsai/kvikio/pull/804)) [@bdice](https://github.com/bdice) +- Build and test with CUDA 13.0.0 ([#803](https://github.com/rapidsai/kvikio/pull/803)) [@jameslamb](https://github.com/jameslamb) +- Optionally require zarr>=3.0.0 ([#802](https://github.com/rapidsai/kvikio/pull/802)) 
[@TomAugspurger](https://github.com/TomAugspurger) +- Use build cluster in devcontainers ([#797](https://github.com/rapidsai/kvikio/pull/797)) [@trxcllnt](https://github.com/trxcllnt) +- Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary ([#796](https://github.com/rapidsai/kvikio/pull/796)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids_config to handle user defined branch name ([#794](https://github.com/rapidsai/kvikio/pull/794)) [@robertmaynard](https://github.com/robertmaynard) +- Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation ([#793](https://github.com/rapidsai/kvikio/pull/793)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids-build-backend to 0.4.0 ([#790](https://github.com/rapidsai/kvikio/pull/790)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix C++20 warning in the mmap test ([#785](https://github.com/rapidsai/kvikio/pull/785)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Allow latest OS in devcontainers ([#780](https://github.com/rapidsai/kvikio/pull/780)) [@bdice](https://github.com/bdice) +- Update build infra to support new branching strategy ([#776](https://github.com/rapidsai/kvikio/pull/776)) [@robertmaynard](https://github.com/robertmaynard) +- Use GCC 14 in conda builds. ([#756](https://github.com/rapidsai/kvikio/pull/756)) [@vyasr](https://github.com/vyasr) +- Use C++20 standard ([#749](https://github.com/rapidsai/kvikio/pull/749)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +# kvikio 25.08.00 (6 Aug 2025) + +## 🚨 Breaking Changes + +- Remove CUDA 11 from dependencies.yaml ([#745](https://github.com/rapidsai/kvikio/pull/745)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- stop uploading packages to downloads.rapids.ai ([#734](https://github.com/rapidsai/kvikio/pull/734)) [@jameslamb](https://github.com/jameslamb) + +## 🐛 Bug Fixes + +- Fix remote I/O file size overflow in Python binding ([#766](https://github.com/rapidsai/kvikio/pull/766)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix a case sensitivity bug in S3 remote I/O ([#765](https://github.com/rapidsai/kvikio/pull/765)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Use RAPIDS' CUDA major-minor version in Conda's `cuda_version` ([#758](https://github.com/rapidsai/kvikio/pull/758)) [@vyasr](https://github.com/vyasr) +- Fix unit test UB resulting from incorrect initialization ([#751](https://github.com/rapidsai/kvikio/pull/751)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Add an overload getenv_or that supports env var alias. Add new env var KVIKIO_NUM_THREADS. Fix UB. 
([#735](https://github.com/rapidsai/kvikio/pull/735)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 📖 Documentation + +- Remove CUDA 11 mentions from docs ([#769](https://github.com/rapidsai/kvikio/pull/769)) [@pentschev](https://github.com/pentschev) +- add docs on CI workflow inputs ([#764](https://github.com/rapidsai/kvikio/pull/764)) [@jameslamb](https://github.com/jameslamb) + +## 🚀 New Features + +- Add the utility function to clear page cache ([#741](https://github.com/rapidsai/kvikio/pull/741)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (1/n): host/device read C++ implementation ([#740](https://github.com/rapidsai/kvikio/pull/740)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Use libnvcomp conda package ([#774](https://github.com/rapidsai/kvikio/pull/774)) [@bdice](https://github.com/bdice) +- fix(docker): use versioned `-latest` tag for all `rapidsai` images ([#771](https://github.com/rapidsai/kvikio/pull/771)) [@gforsyth](https://github.com/gforsyth) +- Remove CUDA 11 workarounds ([#770](https://github.com/rapidsai/kvikio/pull/770)) [@pentschev](https://github.com/pentschev) +- Use CUDA 12.9 in Conda, Devcontainers, Spark, GHA, etc. ([#762](https://github.com/rapidsai/kvikio/pull/762)) [@jakirkham](https://github.com/jakirkham) +- refactor(shellcheck): enable for all files and fix remaining warnings ([#760](https://github.com/rapidsai/kvikio/pull/760)) [@gforsyth](https://github.com/gforsyth) +- Remove nvidia and dask channels ([#759](https://github.com/rapidsai/kvikio/pull/759)) [@vyasr](https://github.com/vyasr) +- Set compiler versions in context ([#755](https://github.com/rapidsai/kvikio/pull/755)) [@jakirkham](https://github.com/jakirkham) +- Use variants to produce separate builds with and without cufile support ([#754](https://github.com/rapidsai/kvikio/pull/754)) [@vyasr](https://github.com/vyasr) +- Only use cufile on x86 ([#753](https://github.com/rapidsai/kvikio/pull/753)) [@vyasr](https://github.com/vyasr) +- refactor(rattler): remove explicit `libnuma` pin now that upstream feedstock is fixed ([#752](https://github.com/rapidsai/kvikio/pull/752)) [@gforsyth](https://github.com/gforsyth) +- Remove CUDA 11 from dependencies.yaml ([#745](https://github.com/rapidsai/kvikio/pull/745)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add the conda dependency libnuma necessary for GDS I/O ([#744](https://github.com/rapidsai/kvikio/pull/744)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove CUDA 11 devcontainers and update CI scripts ([#738](https://github.com/rapidsai/kvikio/pull/738)) [@bdice](https://github.com/bdice) +- refactor(rattler): remove cuda11 options and general cleanup ([#737](https://github.com/rapidsai/kvikio/pull/737)) [@gforsyth](https://github.com/gforsyth) +- stop uploading packages to downloads.rapids.ai ([#734](https://github.com/rapidsai/kvikio/pull/734)) [@jameslamb](https://github.com/jameslamb) +- Forward-merge branch-25.06 into branch-25.08 ([#722](https://github.com/rapidsai/kvikio/pull/722)) [@gforsyth](https://github.com/gforsyth) +- resolve forward-merge from branch-25.06 to branch-25.08 ([#710](https://github.com/rapidsai/kvikio/pull/710)) [@gforsyth](https://github.com/gforsyth) + # kvikio 25.06.00 (5 Jun 2025) ## 🚨 Breaking Changes diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH new file mode 100644 index 0000000000..ba2906d066 --- /dev/null +++ b/RAPIDS_BRANCH @@ -0,0 +1 @@ +main diff --git a/README.md b/README.md 
index ac2ee2a4bd..549d8bcfc4 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ KvikIO also works efficiently when GDS isn't available and can read/write both h * Concurrent reads and writes using an internal thread pool. * Non-blocking API. * Transparently handles reads and writes to/from memory on both host and device. -* (Deprecated) Provides Python bindings to [nvCOMP](https://docs.nvidia.com/cuda/nvcomp/py_api.html). ### Documentation diff --git a/VERSION b/VERSION index cc83d7ab43..5c33046aca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.06.00 +26.02.00 diff --git a/build.sh b/build.sh index c02d0c876a..785753b2d4 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 # kvikio build script @@ -16,13 +17,14 @@ ARGS=$* # NOTE: ensure all dir changes are relative to the location of this # script, and that this script resides in the repo dir! -REPODIR=$(cd $(dirname $0); pwd) +REPODIR=$(cd "$(dirname "$0")"; pwd) -VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop -h" -HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"<args>\"] [-h] +VALIDARGS="clean libkvikio kvikio benchmarks -v -g -n --pydevelop -h" +HELP="$0 [clean] [libkvikio] [kvikio] [benchmarks] [-v] [-g] [-n] [--cmake-args=\"<args>\"] [-h] clean - remove all existing build artifacts and configuration (start over) libkvikio - build and install the libkvikio C++ code kvikio - build and install the kvikio Python package (requires libkvikio) + benchmarks - build benchmarks -v - verbose build mode -g - build for debug -n - no install step @@ -39,8 +41,9 @@ BUILD_DIRS="${LIBKVIKIO_BUILD_DIR} ${KVIKIO_BUILD_DIR}" VERBOSE_FLAG="" BUILD_TYPE=Release INSTALL_TARGET=install +BUILD_BENCHMARKS=OFF RAN_CMAKE=0 -PYTHON_ARGS_FOR_INSTALL="-v --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true" +PYTHON_ARGS_FOR_INSTALL=("-v" "--no-build-isolation" "--no-deps" "--config-settings" "rapidsai.disable-cuda=true") # Set defaults for vars that may not have been defined externally @@ -55,24 +58,25 @@ function hasArg { function cmakeArgs { # Check for multiple cmake args options - if [[ $(echo $ARGS | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then + if [[ $(echo "$ARGS" | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then echo "Multiple --cmake-args options were provided, please provide only one: ${ARGS}" exit 1 fi # Check for cmake args option - if [[ -n $(echo $ARGS | { grep -E "\-\-cmake\-args" || true; } ) ]]; then + if [[ -n $(echo "$ARGS" | { grep -E "\-\-cmake\-args" || true; } ) ]]; then # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back # on the invalid option error - EXTRA_CMAKE_ARGS=$(echo $ARGS | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) + EXTRA_CMAKE_ARGS=$(echo "$ARGS" | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) if [[ -n ${EXTRA_CMAKE_ARGS} ]]; then # Remove the full EXTRA_CMAKE_ARGS argument from list of args so that it passes validArgs function ARGS=${ARGS//$EXTRA_CMAKE_ARGS/} # Filter the full argument down to just the extra string that will be added to cmake call - EXTRA_CMAKE_ARGS=$(echo $EXTRA_CMAKE_ARGS | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') + EXTRA_CMAKE_ARGS=$(echo "$EXTRA_CMAKE_ARGS" | grep -Eo "\".+\"" | sed 
-e 's/^"//' -e 's/"$//') fi fi + read -ra EXTRA_CMAKE_ARGS <<< "$EXTRA_CMAKE_ARGS" } @@ -80,13 +84,14 @@ function cmakeArgs { # LIBKVIKIO_BUILD_DIR function ensureCMakeRan { mkdir -p "${LIBKVIKIO_BUILD_DIR}" - cd ${REPODIR}/cpp + cd "${REPODIR}"/cpp if (( RAN_CMAKE == 0 )); then echo "Executing cmake for libkvikio..." cmake -B "${LIBKVIKIO_BUILD_DIR}" -S . \ -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - ${EXTRA_CMAKE_ARGS} + -DKvikIO_BUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ + "${EXTRA_CMAKE_ARGS[@]}" RAN_CMAKE=1 fi } @@ -97,7 +102,7 @@ if hasArg -h || hasArg --help; then fi # Check for valid usage -if (( ${NUMARGS} != 0 )); then +if (( NUMARGS != 0 )); then # Check for cmake args cmakeArgs for a in ${ARGS}; do @@ -123,8 +128,11 @@ fi if hasArg -n; then INSTALL_TARGET="" fi +if hasArg benchmarks; then + BUILD_BENCHMARKS=ON +fi if hasArg --pydevelop; then - PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" + PYTHON_ARGS_FOR_INSTALL+=("-e") fi # If clean given, run it prior to any other steps @@ -146,7 +154,7 @@ fi if (( NUMARGS == 0 )) || hasArg libkvikio; then ensureCMakeRan echo "building libkvikio..." - cmake --build "${LIBKVIKIO_BUILD_DIR}" -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} + cmake --build "${LIBKVIKIO_BUILD_DIR}" -j"${PARALLEL_LEVEL}" ${VERBOSE_FLAG} if [[ ${INSTALL_TARGET} != "" ]]; then echo "installing libkvikio..." cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install ${VERBOSE_FLAG} @@ -156,7 +164,8 @@ fi # Build and install the kvikio Python package if (( NUMARGS == 0 )) || hasArg kvikio; then echo "building kvikio..." - cd ${REPODIR}/python/kvikio - SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBKVIKIO_BUILD_DIR};${EXTRA_CMAKE_ARGS}" \ - python -m pip install ${PYTHON_ARGS_FOR_INSTALL} . + cd "${REPODIR}"/python/kvikio + _EXTRA_CMAKE_ARGS=$(IFS=';'; echo "${EXTRA_CMAKE_ARGS[*]}") + SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBKVIKIO_BUILD_DIR};$_EXTRA_CMAKE_ARGS" \ + python -m pip install "${PYTHON_ARGS_FOR_INSTALL[@]}" . fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 8cf60a3d13..a787fb7510 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -13,25 +13,60 @@ rapids-print-env rapids-logger "Begin cpp build" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION +RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} +mkdir -p "${RAPIDS_ARTIFACTS_DIR}" +export RAPIDS_ARTIFACTS_DIR + # populates `RATTLER_CHANNELS` array and `RATTLER_ARGS` array source rapids-rattler-channel-string +# Construct the extra variants according to the architecture +if [[ "$(arch)" == "x86_64" ]]; then + cat > variants.yaml << EOF + c_compiler_version: + - 14 + + cxx_compiler_version: + - 14 + + cuda_version: + - ${RAPIDS_CUDA_VERSION%.*} +EOF +else + cat > variants.yaml << EOF + zip_keys: + - [c_compiler_version, cxx_compiler_version, cuda_version] + + c_compiler_version: + - 12 + - 14 + + cxx_compiler_version: + - 12 + - 14 + + cuda_version: + - 12.1 # The last version to not support cufile + - ${RAPIDS_CUDA_VERSION%.*} +EOF +fi + # --no-build-id allows for caching with `sccache` # more info is available at # https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build rattler-build build --recipe conda/recipes/libkvikio \ + --variant-config variants.yaml \ "${RATTLER_ARGS[@]}" \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache - -rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 982e9a0d63..6813851dd6 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/build_python.sh b/ci/build_python.sh index 8bf17ea2a0..8b4bb2b948 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -25,7 +25,7 @@ rapids-logger "Prepending channel ${CPP_CHANNEL} to RATTLER_CHANNELS" RATTLER_CHANNELS=("--channel" "${CPP_CHANNEL}" "${RATTLER_CHANNELS[@]}") -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building kvikio" @@ -37,9 +37,8 @@ rattler-build build --recipe conda/recipes/kvikio \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache - -rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index c22e809ae5..04a3055972 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
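Both conda build scripts above trade the old sccache --zero-stats for a stop/restart pattern around the build. A sketch of the server lifecycle these scripts rely on:

# Stop any stale server first (|| true since none may be running) so the
# next compile autostarts a fresh server that picks up the current
# environment, including the sccache-dist settings exported by CI.
sccache --stop-server 2>/dev/null || true
# ... run the build; the first compiler invocation starts the server ...
sccache --show-adv-stats                        # stats cover this build only
sccache --stop-server >/dev/null 2>&1 || true   # flush and shut down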
+# SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -10,11 +11,14 @@ source rapids-configure-sccache source rapids-date-string source rapids-init-pip +export SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX="${package_name}/${RAPIDS_CONDA_ARCH}/cuda${RAPIDS_CUDA_VERSION%%.*}/wheel/preprocessor-cache" +export SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true + rapids-generate-version > ./VERSION cd "${package_dir}" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building '${package_name}' wheel" rapids-pip-retry wheel \ @@ -25,3 +29,4 @@ rapids-pip-retry wheel \ . sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 1bdf439a29..83c8280215 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -27,15 +28,10 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" - python -m auditwheel repair \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* ./ci/validate_wheel.sh ${package_dir} "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index f00609cc17..e515139105 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -17,15 +18,11 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" LIBKVIKIO_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_*.whl)" >> "${PIP_CONSTRAINT}" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.4 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* ./ci/validate_wheel.sh ${package_dir} "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" diff --git a/ci/check_style.sh b/ci/check_style.sh index 675f251098..cbd17858d2 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. 
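ci/build_wheel.sh above scopes sccache's preprocessor cache by package, architecture, and CUDA major version. A sketch of how that key prefix expands; the three variable values are illustrative stand-ins for what CI exports:

package_name=libkvikio
RAPIDS_CONDA_ARCH=linux64
RAPIDS_CUDA_VERSION=13.0.0
# ${RAPIDS_CUDA_VERSION%%.*} drops everything from the first dot, leaving "13"
echo "${package_name}/${RAPIDS_CONDA_ARCH}/cuda${RAPIDS_CUDA_VERSION%%.*}/wheel/preprocessor-cache"
# -> libkvikio/linux64/cuda13/wheel/preprocessor-cache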
+# SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -14,10 +15,10 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_BRANCH="$(cat "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../RAPIDS_BRANCH)" -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json -export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/${RAPIDS_BRANCH}/cmake-format-rapids-cmake.json +export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-format-rapids-cmake.json mkdir -p "$(dirname ${RAPIDS_CMAKE_FORMAT_FILE})" wget -O ${RAPIDS_CMAKE_FORMAT_FILE} "${FORMAT_FILE_URL}" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 75029bc0c4..0bc6d899ec 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,14 +1,65 @@ #!/bin/bash -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 ########################## # KvikIO Version Updater # ########################## ## Usage -# bash update-version.sh <new_version> +# NOTE: This script must be run from the repository root, not from the ci/release/ directory +# Primary interface: bash ci/release/update-version.sh <new_version> [--run-context=main|release] +# Fallback interface: [RAPIDS_RUN_CONTEXT=main|release] bash ci/release/update-version.sh <new_version> +# CLI arguments take precedence over environment variables +# Defaults to main when no run-context is specified + +# Parse command line arguments +CLI_RUN_CONTEXT="" +VERSION_ARG="" + +for arg in "$@"; do + case $arg in + --run-context=*) + CLI_RUN_CONTEXT="${arg#*=}" + shift + ;; + *) + if [[ -z "$VERSION_ARG" ]]; then + VERSION_ARG="$arg" + fi + ;; + esac +done # Format is YY.MM.PP - no leading 'v' or trailing 'a' -NEXT_FULL_TAG=$1 +NEXT_FULL_TAG="$VERSION_ARG" + +# Determine RUN_CONTEXT with CLI precedence over environment variable, defaulting to main +if [[ -n "$CLI_RUN_CONTEXT" ]]; then + RUN_CONTEXT="$CLI_RUN_CONTEXT" + echo "Using run-context from CLI: $RUN_CONTEXT" +elif [[ -n "${RAPIDS_RUN_CONTEXT}" ]]; then + RUN_CONTEXT="$RAPIDS_RUN_CONTEXT" + echo "Using run-context from environment: $RUN_CONTEXT" +else + RUN_CONTEXT="main" + echo "No run-context provided, defaulting to: $RUN_CONTEXT" +fi + +# Validate RUN_CONTEXT value +if [[ "${RUN_CONTEXT}" != "main" && "${RUN_CONTEXT}" != "release" ]]; then + echo "Error: Invalid run-context value '${RUN_CONTEXT}'" + echo "Valid values: main, release" + exit 1 +fi + +# Validate version argument +if [[ -z "$NEXT_FULL_TAG" ]]; then + echo "Error: Version argument is required" + echo "Usage: $0 <new_version> [--run-context=<main|release>]" + echo " or: [RAPIDS_RUN_CONTEXT=<main|release>] $0 <new_version>" + echo "Note: Defaults to main when run-context is not specified" + exit 1 +fi # Get current version CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v') @@ -23,7 +74,14 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))") -echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" +# Set branch references based on RUN_CONTEXT +if [[ 
"${RUN_CONTEXT}" == "main" ]]; then + RAPIDS_BRANCH_NAME="main" + echo "Preparing development branch update $CURRENT_TAG => $NEXT_FULL_TAG (targeting main branch)" +elif [[ "${RUN_CONTEXT}" == "release" ]]; then + RAPIDS_BRANCH_NAME="release/${NEXT_SHORT_TAG}" + echo "Preparing release branch update $CURRENT_TAG => $NEXT_FULL_TAG (targeting release/${NEXT_SHORT_TAG} branch)" +fi # Inplace sed replace; workaround for Linux and Mac function sed_runner() { @@ -32,6 +90,7 @@ function sed_runner() { # Centralized version file update echo "${NEXT_FULL_TAG}" > VERSION +echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH DEPENDENCIES=( kvikio @@ -50,7 +109,8 @@ done # CI files for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s|@.*|@${RAPIDS_BRANCH_NAME}|g" "${FILE}" + sed_runner "s|:[0-9]*\\.[0-9]*-|:${NEXT_SHORT_TAG}-|g" "${FILE}" done # .devcontainer files @@ -60,7 +120,16 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done -# The example of a downstream project +# Update downstream example GIT_TAG based on context +if [[ "${RUN_CONTEXT}" == "main" ]]; then + # In main context, convert any release/X.Y references to main + sed_runner "s|GIT_TAG release/[^[:space:]]*|GIT_TAG main|g" "cpp/examples/downstream/cmake/get_kvikio.cmake" +elif [[ "${RUN_CONTEXT}" == "release" ]]; then + # In release context, convert main to release/X.Y + sed_runner "s|GIT_TAG main|GIT_TAG release/${NEXT_SHORT_TAG}|g" "cpp/examples/downstream/cmake/get_kvikio.cmake" +fi + +# The example of a downstream project - update version number sed_runner "s/find_and_configure_kvikio(.*)/find_and_configure_kvikio(\"${NEXT_SHORT_TAG}\")/g" "cpp/examples/downstream/cmake/get_kvikio.cmake" # Java files diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index dcb938cbdd..08c015fa1b 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh index 1a7edb5be5..7cd835af5d 100755 --- a/ci/run_pytests.sh +++ b/ci/run_pytests.sh @@ -1,14 +1,10 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail # Support invoking run_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/kvikio -# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since -# cuFile didn't support arm until 12.4 -[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ - && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() - -pytest --cache-clear --verbose "${PYTEST_MARK[@]}" "$@" tests +pytest --cache-clear --verbose "$@" tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 05fc954b4f..770f26a675 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/test_java.sh b/ci/test_java.sh index a715b9422d..27ef5e8905 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/test_python.sh b/ci/test_python.sh index 2527094f69..1ca634edfe 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 66c7cfcb71..5513083815 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -eou pipefail @@ -15,9 +16,4 @@ rapids-pip-retry install -v \ "$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${KVIKIO_WHEELHOUSE}"/kvikio_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" -# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since -# cuFile didn't support arm until 12.4 -[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ - && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() - -python -m pytest --cache-clear --verbose "${PYTEST_MARK[@]}" ./python/kvikio/tests +python -m pytest --cache-clear --verbose ./python/kvikio/tests diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index 40bb27feeb..5c5facf13f 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 set -euo pipefail diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index d112951d3c..5bbc8fcc3a 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # # This is the preferred entry point for projects using rapids-cmake @@ -18,9 +11,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]]) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( - FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR." 
+ FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) endif() @@ -33,7 +26,7 @@ endif() # Allow users to control which branch is fetched if(NOT rapids-cmake-branch) # Define a default branch if the user doesn't set one - set(rapids-cmake-branch "branch-${rapids-cmake-version}") + set(rapids-cmake-branch "release/${rapids-cmake-version}") endif() # Allow users to control the exact URL passed to FetchContent diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index abe468dce8..9cda1f26e4 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2018-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= file(READ "${CMAKE_CURRENT_LIST_DIR}/../VERSION" _rapids_version) if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]]) @@ -26,5 +19,22 @@ else() ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +# Use STRINGS to trim whitespace/newlines +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch) +if(NOT _rapids_branch) + message( + FATAL_ERROR + "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing." 
+ ) +endif() + +if(NOT rapids-cmake-version) + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") + +# Don't use sccache-dist for CMake's compiler tests +set(ENV{SCCACHE_NO_DIST_COMPILE} "1") diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml similarity index 67% rename from conda/environments/all_cuda-128_arch-aarch64.yaml rename to conda/environments/all_cuda-129_arch-aarch64.yaml index 52355f3994..067837ea36 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -4,41 +4,39 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 -- cuda-version=12.8 -- cupy>=12.0.0 +- cuda-python>=12.9.2,<13.0a0 +- cuda-version=12.9 +- cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.2.0.11 - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-128_arch-aarch64 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 +name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml similarity index 67% rename from conda/environments/all_cuda-128_arch-x86_64.yaml rename to conda/environments/all_cuda-129_arch-x86_64.yaml index c520023a30..9270b47f6e 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -4,41 +4,39 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 -- cuda-version=12.8 -- cupy>=12.0.0 +- cuda-python>=12.9.2,<13.0a0 +- cuda-version=12.9 +- cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.2.0.11 - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-128_arch-x86_64 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 +name: all_cuda-129_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml similarity index 63% rename from conda/environments/all_cuda-118_arch-aarch64.yaml rename to 
conda/environments/all_cuda-130_arch-aarch64.yaml index 9f5bc53ac4..e5c40f372a 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -4,41 +4,39 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler - cmake>=3.30.4 -- cuda-python>=11.8.5,<12.0a0 -- cuda-version=11.8 -- cudatoolkit -- cupy>=12.0.0 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 -- gcc_linux-aarch64=11.* +- gcc_linux-aarch64=14.* +- libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcc_linux-aarch64=11.8 -- nvcomp==4.2.0.11 - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-118_arch-aarch64 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml similarity index 60% rename from conda/environments/all_cuda-118_arch-x86_64.yaml rename to conda/environments/all_cuda-130_arch-x86_64.yaml index 03764f2a38..3c38fd8b2b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -4,43 +4,39 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler - cmake>=3.30.4 -- cuda-python>=11.8.5,<12.0a0 -- cuda-version=11.8 -- cudatoolkit -- cupy>=12.0.0 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 -- gcc_linux-64=11.* -- libcufile-dev=1.4.0.31 -- libcufile=1.4.0.31 +- gcc_linux-64=14.* +- libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcc_linux-64=11.8 -- nvcomp==4.2.0.11 - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-118_arch-x86_64 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index a746372833..f5f37a39a2 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -1,17 +1,14 @@ c_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 14 cxx_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 14 cmake_version: - ">=3.30.4" cuda_compiler: - - 
cuda-nvcc # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - nvcc # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - cuda-nvcc c_stdlib: - sysroot @@ -19,18 +16,5 @@ c_stdlib: c_stdlib_version: - "2.28" -# The CTK libraries below are missing from the conda-forge::cudatoolkit package -# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages -# and the "*_run_*" version specifiers correspond to `11.x` packages. - -cuda11_libcufile_host_version: - - "1.4.0.31" - -cuda11_libcufile_run_version: - - ">=1.0.0.82,<=1.4.0.31" - libcurl_version: - "==8.5.0" - -nvcomp_version: - - "=4.2.0.11" diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 245773d6ee..cc31c8ea80 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -1,4 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 schema_version: 1 context: @@ -40,17 +41,32 @@ build: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} SCCACHE_S3_KEY_PREFIX: kvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: kvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", default="sccache=debug") }} requirements: build: @@ -63,53 +79,28 @@ requirements: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - cython >=3.0.0 + - 
cython >=3.0.0,<3.2.0a0 - libcurl ${{ libcurl_version }} - libkvikio =${{ version }} - - nvcomp ${{ nvcomp_version }} - pip - python =${{ py_version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: libcufile-dev =${{ cuda11_libcufile_host_version }} - else: - - cuda-cudart-dev - - if: linux - then: libcufile-dev + - cuda-cudart-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - cupy >=12.0.0 + - cupy >=13.6.0 - libkvikio =${{ version }} - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - numpy >=1.23,<3.0a0 - - nvcomp ${{ nvcomp_version }} - packaging - python - - zarr >=2.0.0,<4.0.0a0 - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: libcufile ${{ cuda11_libcufile_run_version }} - else: - - cuda-cudart - - if: linux - then: libcufile + - cuda-cudart ignore_run_exports: by_name: - cuda-cudart - cuda-version - - libcufile - libcurl - libkvikio - - nvcomp - - if: cuda_major == "11" - then: cudatoolkit tests: - python: diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index ba8163c7ff..b67ab5d118 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -1,17 +1,8 @@ -c_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - -cxx_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - cmake_version: - ">=3.30.4" cuda_compiler: - - cuda-nvcc # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - nvcc # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - cuda-nvcc c_stdlib: - sysroot @@ -19,15 +10,5 @@ c_stdlib: c_stdlib_version: - "2.28" -# The CTK libraries below are missing from the conda-forge::cudatoolkit package -# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages -# and the "*_run_*" version specifiers correspond to `11.x` packages. - -cuda11_libcufile_host_version: - - "1.4.0.31" - -cuda11_libcufile_run_version: - - ">=1.0.0.82,<=1.4.0.31" - libcurl_version: - - "==8.5.0" + - "8.5.0" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 8ee2448217..30942b7215 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -1,14 +1,22 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 schema_version: 1 context: version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} minor_version: ${{ (version | split("."))[:2] | join(".") }} - cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} - cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + # We need to support three cases: + # 1. Linux x86_64, which always uses libcufile + # 2. Linux aarch64 with CUDA >= 12.2, which uses libcufile + # 3. 
Linux aarch64 with CUDA < 12.2, which does not use libcufile + # Each case has different cuda-version constraints as expressed below + should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} + # When reverting, instances of cuda_key_string can be replaced with cuda_major + cuda_key_string: ${{ cuda_version | replace(".", "_") }} + #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + #cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' head_rev: '${{ git.head_rev(".")[:8] }}' - linux64: ${{ linux and x86_64 }} recipe: name: libkvikio-split @@ -29,24 +37,37 @@ cache: export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') set +x - ./build.sh -v -n libkvikio + ./build.sh -v -n libkvikio benchmarks secrets: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} - RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} - SCCACHE_S3_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} + SCCACHE_S3_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_key_string }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", default="sccache=debug") }} requirements: build: - ${{ compiler("c") }} @@ -58,15 +79,11 @@ cache: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version 
}} - - libcurl ${{ libcurl_version }} - - if: cuda_major == "11" + - libcurl ==${{ libcurl_version }} + - if: should_use_cufile then: - - cudatoolkit - - if: linux64 - then: - - libcufile =${{ cuda11_libcufile_host_version }} - - libcufile-dev =${{ cuda11_libcufile_host_version }} - else: libcufile-dev + - libcufile-dev + - libnuma outputs: - package: @@ -76,7 +93,7 @@ outputs: script: content: | cmake --install cpp/build - string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + string: cuda${{ cuda_key_string }}_${{ date_string }}_${{ head_rev }} dynamic_linking: overlinking_behavior: "error" prefix_detection: @@ -88,27 +105,26 @@ outputs: - ${{ compiler("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} run: - - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: cuda_major == "11" + - if: x86_64 or (aarch64 and cuda_version >= "13.0") then: - - cudatoolkit - - if: linux64 - then: - - libcufile ${{ cuda11_libcufile_run_version }} - - libcufile-dev ${{ cuda11_libcufile_run_version }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} else: - - if: linux - then: libcufile-dev + - if: aarch64 and cuda_version >= "12.2" + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="12.2.0a0") }} + else: + - ${{ pin_compatible("cuda-version", upper_bound="12.2.0a0", lower_bound="12.0") }} + - if: should_use_cufile + then: + - libcufile-dev ignore_run_exports: by_name: - cuda-version - - libcufile - - libcurl - - if: cuda_major == "11" + - if: should_use_cufile then: - - cudatoolkit + - libcufile tests: - script: - test -f $PREFIX/include/kvikio/file_handle.hpp @@ -121,7 +137,7 @@ outputs: name: libkvikio-tests version: ${{ version }} build: - string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + string: cuda${{ cuda_key_string }}_${{ date_string }}_${{ head_rev }} dynamic_linking: overlinking_behavior: "error" script: @@ -134,37 +150,30 @@ outputs: host: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - - if: cuda_major == "11" + - cuda-cudart-dev + - libcurl ==${{ libcurl_version }} + - if: should_use_cufile then: - - cudatoolkit - - if: linux64 - then: - - libcufile-dev =${{ cuda11_libcufile_host_version }} - else: - - cuda-cudart-dev - - if: linux - then: libcufile-dev + - libcufile-dev run: - - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: cuda_major == "11" + - if: x86_64 then: - - cudatoolkit - - if: linux64 - then: - - libcufile =${{ cuda11_libcufile_host_version }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} else: - - cuda-cudart - - if: linux - then: libcufile + - if: aarch64 and cuda_version >= "12.2" + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="12.2.0a0") }} + else: + - ${{ pin_compatible("cuda-version", upper_bound="12.2.0a0", lower_bound="12.0") }} + - cuda-cudart ignore_run_exports: by_name: - cuda-cudart - cuda-version - - libcufile - - libcurl - - if: cuda_major == "11" + - libnuma + - if: should_use_cufile then: - - cudatoolkit + - libcufile about: homepage: ${{ load_from_file("python/libkvikio/pyproject.toml").project.urls.Homepage }} license: ${{ load_from_file("python/libkvikio/pyproject.toml").project.license.text }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d61d4a4e1..71cbc258de 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,15 +1,8 @@ # 
=============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
@@ -29,6 +22,11 @@ project(
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")
 
+# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the
+# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with
+# gcc>=14. We can remove this once we upgrade to a newer sccache version.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
 # Write the version header
 rapids_cmake_write_version_file(include/kvikio/version_config.hpp)
 
@@ -39,12 +37,10 @@ rapids_cmake_build_type(Release)
 # * build options ----------------------------------------------------------------------------------
 
 option(BUILD_SHARED_LIBS "Build KvikIO shared library" ON)
-option(KvikIO_BUILD_BENCHMARKS "Configure CMake to build benchmarks" ON)
+option(KvikIO_BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF)
 option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON)
 option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON)
 option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON)
-option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON)
-option(KvikIO_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON)
 
 # ##################################################################################################
 # * conda environment ------------------------------------------------------------------------------
@@ -68,66 +64,65 @@ if(KvikIO_REMOTE_SUPPORT)
   endif()
 endif()
 
-set(cuFile_FOUND 0)
-if(KvikIO_CUDA_SUPPORT)
-  rapids_find_package(
-    CUDAToolkit REQUIRED
-    BUILD_EXPORT_SET kvikio-exports
-    INSTALL_EXPORT_SET kvikio-exports
+# CUDA is now required
+rapids_find_package(
+  CUDAToolkit REQUIRED
+  BUILD_EXPORT_SET kvikio-exports
+  INSTALL_EXPORT_SET kvikio-exports
+)
+include(cmake/thirdparty/get_nvtx.cmake)
+
+if(NOT TARGET CUDA::cuFile)
+  set(cuFile_FOUND 0)
+  message(
+    WARNING "Cannot find cuFile - KvikIO will still work but won't use GPUDirect Storage (GDS)"
   )
-  include(cmake/thirdparty/get_nvtx.cmake)
-
-  if(NOT TARGET CUDA::cuFile)
-    message(
-      WARNING "Cannot find cuFile - KvikIO will still work but won't use GPUDirect Storage (GDS)"
-    )
-  else()
-    set(cuFile_FOUND 1)
-
-    # Check API support
-    try_compile(
-      cuFile_BATCH_API_FOUND SOURCE_FROM_CONTENT
-      batch.cpp
-      [[#include <cufile.h>
-      int main() {
-        cuFileBatchIOSetUp(nullptr, 0);
-        return 0;
-      }
-      ]]
-      LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
-      OUTPUT_VARIABLE batch_output
-    )
-    message(STATUS "Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}")
-    try_compile(
-      cuFile_STREAM_API_FOUND SOURCE_FROM_CONTENT
-      stream.cpp
-      [[#include <cufile.h>
-      int main() {
-        CUfileHandle_t fh;
-        CUstream stream;
-        cuFileReadAsync(fh, nullptr, nullptr, nullptr, nullptr, nullptr, stream);
-        return 0;
-      }
-      ]]
-      LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
-      OUTPUT_VARIABLE stream_output
-    )
-    message(STATUS "Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}")
-    try_compile(
-      cuFile_VERSION_API_FOUND SOURCE_FROM_CONTENT
-      version.cpp
-      [[#include <cufile.h>
-      int main() {
-        int version;
-        cuFileGetVersion(&version);
-        return 0;
-      }
-      ]]
-      LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
-      OUTPUT_VARIABLE version_output
-    )
-    message(STATUS "Found cuFile Version API: ${cuFile_VERSION_API_FOUND}")
-  endif()
+else()
+  set(cuFile_FOUND 1)
+
+  # Check API support
+  try_compile(
+    cuFile_BATCH_API_FOUND SOURCE_FROM_CONTENT
+    batch.cpp
+    [[#include <cufile.h>
+    int main() {
+      cuFileBatchIOSetUp(nullptr, 0);
+      return 0;
+    }
+    ]]
+    LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
+    OUTPUT_VARIABLE batch_output
+  )
+  message(STATUS "Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}")
+  try_compile(
+    cuFile_STREAM_API_FOUND SOURCE_FROM_CONTENT
+    stream.cpp
+    [[#include <cufile.h>
+    int main() {
+      CUfileHandle_t fh;
+      CUstream stream;
+      cuFileReadAsync(fh, nullptr, nullptr, nullptr, nullptr, nullptr, stream);
+      return 0;
+    }
+    ]]
+    LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
+    OUTPUT_VARIABLE stream_output
+  )
+  message(STATUS "Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}")
+  try_compile(
+    cuFile_VERSION_API_FOUND SOURCE_FROM_CONTENT
+    version.cpp
+    [[#include <cufile.h>
+    int main() {
+      int version;
+      cuFileGetVersion(&version);
+      return 0;
+    }
+    ]]
+    LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS}
+    OUTPUT_VARIABLE version_output
+  )
+  message(STATUS "Found cuFile Version API: ${cuFile_VERSION_API_FOUND}")
 endif()
 
 include(cmake/thirdparty/get_thread_pool.cmake)
@@ -140,6 +135,7 @@ set(SOURCES
   "src/bounce_buffer.cpp"
   "src/buffer.cpp"
   "src/compat_mode.cpp"
+  "src/compat_mode_manager.cpp"
   "src/http_status_codes.cpp"
   "src/cufile/config.cpp"
   "src/cufile/driver.cpp"
@@ -147,8 +143,10 @@ set(SOURCES
   "src/error.cpp"
   "src/file_handle.cpp"
   "src/file_utils.cpp"
-  "src/nvtx.cpp"
-  "src/posix_io.cpp"
+  "src/mmap.cpp"
+  "src/detail/env.cpp"
+  "src/detail/nvtx.cpp"
+  "src/detail/posix_io.cpp"
   "src/shim/cuda.cpp"
   "src/shim/cufile.cpp"
   "src/shim/utils.cpp"
@@ -157,7 +155,9 @@ set(SOURCES
 )
 
 if(KvikIO_REMOTE_SUPPORT)
-  list(APPEND SOURCES "src/remote_handle.cpp" "src/shim/libcurl.cpp")
+  list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp"
+       "src/detail/tls.cpp" "src/detail/url.cpp" "src/shim/libcurl.cpp"
+  )
 endif()
 
 add_library(kvikio ${SOURCES})
@@ -173,7 +173,7 @@ add_library(kvikio::kvikio ALIAS kvikio)
 target_include_directories(
   kvikio
   PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-         "$<BUILD_INTERFACE:$<$<BOOL:${KvikIO_CUDA_SUPPORT}>:${CUDAToolkit_INCLUDE_DIRS}>>"
+         "$<BUILD_INTERFACE:${CUDAToolkit_INCLUDE_DIRS}>"
   INTERFACE "$<INSTALL_INTERFACE:include>"
 )
@@ -187,7 +186,6 @@ target_link_libraries(
 target_compile_definitions(
   kvikio
   PUBLIC $<$<BOOL:${KvikIO_REMOTE_SUPPORT}>:KVIKIO_LIBCURL_FOUND>
-         $<$<BOOL:${KvikIO_CUDA_SUPPORT}>:KVIKIO_CUDA_FOUND>
         $<$<BOOL:${cuFile_FOUND}>:KVIKIO_CUFILE_FOUND>
         $<$<BOOL:${cuFile_BATCH_API_FOUND}>:KVIKIO_CUFILE_BATCH_API_FOUND>
         $<$<BOOL:${cuFile_STREAM_API_FOUND}>:KVIKIO_CUFILE_STREAM_API_FOUND>
@@ -198,7 +197,7 @@ set_target_properties(
   kvikio
   PROPERTIES BUILD_RPATH "\$ORIGIN"
             INSTALL_RPATH "\$ORIGIN"
-            CXX_STANDARD 17
+            CXX_STANDARD 20
             CXX_STANDARD_REQUIRED ON
             POSITION_INDEPENDENT_CODE ON
             INTERFACE_POSITION_INDEPENDENT_CODE ON
@@ -223,10 +222,7 @@ if(KvikIO_BUILD_EXAMPLES)
   add_subdirectory(examples)
 endif()
 
-if(CUDAToolkit_FOUND
-   AND KvikIO_BUILD_TESTS
-   AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME
-)
+if(KvikIO_BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   include(cmake/thirdparty/get_gtest.cmake)
 
   # include CTest module -- automatically calls enable_testing()
@@ -263,7 +259,6 @@ Provide targets for KvikIO.
 
 set(final_code_string
     "
-set(KvikIO_CUDA_SUPPORT [=[${KvikIO_CUDA_SUPPORT}]=])
 set(KvikIO_CUFILE_SUPPORT [=[${cuFile_FOUND}]=])
 set(KvikIO_REMOTE_SUPPORT [=[${KvikIO_REMOTE_SUPPORT}]=])
 "
 )
 string(
   APPEND
   final_code_string
   [=[
-if(KvikIO_CUDA_SUPPORT)
-  find_package(CUDAToolkit REQUIRED QUIET)
-  target_include_directories(kvikio::kvikio INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
+find_package(CUDAToolkit REQUIRED QUIET)
+target_include_directories(kvikio::kvikio INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
 
-  if(KvikIO_CUFILE_SUPPORT AND NOT TARGET CUDA::cuFile)
-    message(FATAL_ERROR "Compiled with cuFile support but cuFile not found")
-  endif()
+if(KvikIO_CUFILE_SUPPORT AND NOT TARGET CUDA::cuFile)
+  message(FATAL_ERROR "Compiled with cuFile support but cuFile not found")
 endif()
 ]=]
 )
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 665d45edfb..3d9538a5bd 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 #[=======================================================================[.rst:
@@ -42,8 +35,16 @@ function(kvikio_add_benchmark)
   endif()
 
   add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES})
-  set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib")
+  set_target_properties(
+    ${_KVIKIO_NAME}
+    PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
+               CXX_STANDARD 20
+               CXX_STANDARD_REQUIRED ON
+               CUDA_STANDARD 20
+               CUDA_STANDARD_REQUIRED ON
+  )
+
+  target_include_directories(${_KVIKIO_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
   target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio)
 
   if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@@ -59,4 +60,6 @@
   )
 endfunction()
 
-kvikio_add_benchmark(NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp")
+kvikio_add_benchmark(
+  NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp" "utils/utils.cpp"
+)
diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
index 1c90579d22..23dcd96ffc 100644
--- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
+++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 // This benchmark assesses the scalability of the thread pool.
@@ -25,9 +14,13 @@
 #include <cstddef>
 #include <cstdint>
+#include <thread>
+#include <vector>
 
 #include <benchmark/benchmark.h>
 #include <kvikio/defaults.hpp>
+#include <kvikio/threadpool_simple.hpp>
+#include <utils/utils.hpp>
 
 namespace kvikio {
 
 enum class ScalingType : uint8_t {
@@ -35,6 +28,12 @@
   WEAK_SCALING,
 };
 
+namespace constant {
+std::size_t constexpr ntasks_strong_scaling{10'000};
+std::size_t constexpr ntasks_weak_scaling{1'000};
+std::size_t constexpr num_compute_iterations{1'000};
+}  // namespace constant
+
 void task_compute(std::size_t num_compute_iterations)
 {
   [[maybe_unused]] double res{0.0};
@@ -45,49 +44,142 @@ void task_compute(std::size_t num_compute_iterations)
 }
 
 template <ScalingType scaling_type>
-void BM_threadpool_compute(benchmark::State& state)
+void BM_BS_threadpool_compute(benchmark::State& state)
 {
   auto const num_threads = state.range(0);
 
-  std::size_t const num_compute_tasks =
-    (scaling_type == ScalingType::STRONG_SCALING) ? 10'000 : (1'000 * num_threads);
+  std::size_t const num_compute_tasks = (scaling_type == ScalingType::STRONG_SCALING)
+                                          ? constant::ntasks_strong_scaling
+                                          : (constant::ntasks_weak_scaling * num_threads);
 
-  std::size_t constexpr num_compute_iterations{1'000};
   kvikio::defaults::set_thread_pool_nthreads(num_threads);
 
   for (auto _ : state) {
     // Submit a total of "num_compute_tasks" tasks to the thread pool.
     for (auto i = std::size_t{0}; i < num_compute_tasks; ++i) {
-      [[maybe_unused]] auto fut =
-        kvikio::defaults::thread_pool().submit_task([] { task_compute(num_compute_iterations); });
+      [[maybe_unused]] auto fut = kvikio::defaults::thread_pool().submit_task(
+        [] { task_compute(constant::num_compute_iterations); });
     }
     kvikio::defaults::thread_pool().wait();
   }
 
   state.counters["threads"] = num_threads;
 }
+
+template <ScalingType scaling_type>
+void BM_simple_threadpool_compute(benchmark::State& state)
+{
+  auto const num_threads = state.range(0);
+
+  std::size_t const num_compute_tasks = (scaling_type == ScalingType::STRONG_SCALING)
+                                          ? constant::ntasks_strong_scaling
+                                          : (constant::ntasks_weak_scaling * num_threads);
+
+  kvikio::ThreadPoolSimple thread_pool(num_threads);
+
+  for (auto _ : state) {
+    // Submit a total of "num_compute_tasks" tasks to the thread pool.
+    for (auto i = std::size_t{0}; i < num_compute_tasks; ++i) {
+      [[maybe_unused]] auto fut =
+        thread_pool.submit_task([] { task_compute(constant::num_compute_iterations); });
+    }
+    thread_pool.wait();
+  }
+
+  state.counters["threads"] = num_threads;
+}
+
+template <ScalingType scaling_type>
+void BM_static_task_compute(benchmark::State& state)
+{
+  auto const num_threads = state.range(0);
+
+  for (auto _ : state) {
+    std::vector<std::thread> threads(num_threads);
+    for (auto&& thread : threads) {
+      thread = std::thread([=] {
+        std::size_t num_tasks_this_thread{};
+        if constexpr (scaling_type == ScalingType::STRONG_SCALING) {
+          auto const p = constant::ntasks_strong_scaling / num_threads;
+          auto const q = constant::ntasks_strong_scaling % num_threads;
+          num_tasks_this_thread =
+            (static_cast<std::size_t>(state.thread_index()) < q) ? (p + 1) : p;
+        } else {
+          num_tasks_this_thread = constant::ntasks_weak_scaling;
+        }
+
+        for (std::size_t i = 0; i < num_tasks_this_thread; ++i) {
+          task_compute(constant::num_compute_iterations);
+        }
+      });
+    }
+
+    for (auto&& thread : threads) {
+      thread.join();
+    }
+  }
+
+  state.counters["threads"] = num_threads;
+}
 }  // namespace kvikio
 
 int main(int argc, char** argv)
 {
   benchmark::Initialize(&argc, argv);
 
-  benchmark::RegisterBenchmark("BM_threadpool_compute:strong_scaling",
-                               kvikio::BM_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
+  benchmark::RegisterBenchmark(
+    "BS_threadpool_compute:strong_scaling",
+    kvikio::BM_BS_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
     ->RangeMultiplier(2)
     ->Range(1, 64)  // Increase from 1 to 64 (inclusive of both endpoints) with x2 stepping.
     ->UseRealTime()  // Use the wall clock to determine the number of benchmark iterations.
     ->Unit(benchmark::kMillisecond)
     ->MinTime(2);  // Minimum of 2 seconds.
 
-  benchmark::RegisterBenchmark("BM_threadpool_compute:weak_scaling",
-                               kvikio::BM_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
+  benchmark::RegisterBenchmark("BS_threadpool_compute:weak_scaling",
+                               kvikio::BM_BS_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);
+
+  benchmark::RegisterBenchmark(
+    "simple_threadpool_compute:strong_scaling",
+    kvikio::BM_simple_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);
+
+  benchmark::RegisterBenchmark(
+    "simple_threadpool_compute:weak_scaling",
+    kvikio::BM_simple_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);
+
+  benchmark::RegisterBenchmark("static_task_compute:strong_scaling",
+                               kvikio::BM_static_task_compute<kvikio::ScalingType::STRONG_SCALING>)
     ->RangeMultiplier(2)
     ->Range(1, 64)
     ->UseRealTime()
     ->Unit(benchmark::kMillisecond)
     ->MinTime(2);
 
+  benchmark::RegisterBenchmark("static_task_compute:weak_scaling",
+                               kvikio::BM_static_task_compute<kvikio::ScalingType::WEAK_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);
+
+  kvikio::utils::explain_default_metrics();
+
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
 }
diff --git a/cpp/benchmarks/utils/utils.cpp b/cpp/benchmarks/utils/utils.cpp
new file mode 100644
index 0000000000..ae71a2c2bc
--- /dev/null
+++ b/cpp/benchmarks/utils/utils.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <utils/utils.hpp>
+
+namespace kvikio::utils {
+void explain_default_metrics()
+{
+  benchmark::AddCustomContext(
+    "Time",
+    "The average real time (i.e. wall-clock time) of the entire process per benchmark iteration.");
+  benchmark::AddCustomContext(
+    "CPU",
+    "The average CPU time of the main thread per benchmark iteration. The timer is accumulated "
+    "only when the main thread is being executed.");
+}
+}  // namespace kvikio::utils
diff --git a/cpp/benchmarks/utils/utils.hpp b/cpp/benchmarks/utils/utils.hpp
new file mode 100644
index 0000000000..dd0090f27b
--- /dev/null
+++ b/cpp/benchmarks/utils/utils.hpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace kvikio::utils {
+void explain_default_metrics();
+}
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index 10e6b026d9..698c61eb37 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 # This function finds gtest and sets any additional necessary environment variables.
diff --git a/cpp/cmake/thirdparty/get_libcurl.cmake b/cpp/cmake/thirdparty/get_libcurl.cmake
index e25c5fff0a..98c7089554 100644
--- a/cpp/cmake/thirdparty/get_libcurl.cmake
+++ b/cpp/cmake/thirdparty/get_libcurl.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 # This function finds libcurl and sets any additional necessary environment variables.
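With `KvikIO_BUILD_BENCHMARKS` now defaulting to `OFF`, the thread-pool benchmarks above have to be enabled explicitly. A sketch of configuring, building, and running them, assuming a checkout with CUDA and the other build prerequisites available; the binary path may differ depending on the generator and layout:

```sh
cmake -S cpp -B cpp/build -DKvikIO_BUILD_BENCHMARKS=ON
cmake --build cpp/build --parallel
# Run only the new simple-threadpool variants; the filter flag is standard Google Benchmark.
./cpp/build/benchmarks/THREADPOOL_BENCHMARK --benchmark_filter='simple_threadpool_compute'
```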
diff --git a/cpp/cmake/thirdparty/get_nvtx.cmake b/cpp/cmake/thirdparty/get_nvtx.cmake index ee4fc02f69..744a0e3ed1 100644 --- a/cpp/cmake/thirdparty/get_nvtx.cmake +++ b/cpp/cmake/thirdparty/get_nvtx.cmake @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # Need to call rapids_cpm_nvtx3 to get support for an installed version of nvtx3 and to support diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake index 3faf21366a..bb5d98433c 100644 --- a/cpp/cmake/thirdparty/get_thread_pool.cmake +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 7dbcf8edea..a74bad75d6 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -30,13 +30,16 @@ For convenience we release Conda packages that makes it easy to include KvikIO i We strongly recommend using [mamba](https://github.com/mamba-org/mamba) in place of conda, which we will do throughout the documentation. 
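For working on KvikIO itself, as opposed to the per-CUDA install commands below, the repository ships complete development environment files; the file name here comes from the renames earlier in this diff, while the environment name is illustrative:

```sh
mamba env create -n kvikio-dev -f conda/environments/all_cuda-129_arch-x86_64.yaml
mamba activate kvikio-dev
```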
 Install the **stable release** from the ``rapidsai`` channel with the following:
+
 ```sh
 # Install in existing environment
 mamba install -c rapidsai -c conda-forge libkvikio
+
+# Create new environment (CUDA 13)
+mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=13.0 libkvikio
+
 # Create new environment (CUDA 12)
-mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.8 libkvikio
-# Create new environment (CUDA 11)
-mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=11.8 libkvikio
+mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio
 ```
 
 Install the **nightly release** from the ``rapidsai-nightly`` channel with the following:
 
@@ -44,10 +47,12 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f
 ```sh
 # Install in existing environment
 mamba install -c rapidsai-nightly -c conda-forge libkvikio
+
+# Create new environment (CUDA 13)
+mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 libkvikio
+
 # Create new environment (CUDA 12)
-mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.8 libkvikio
-# Create new environment (CUDA 11)
-mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 libkvikio
+mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio
 ```
 
 ---
 
@@ -132,6 +137,13 @@ Note that if you're reading a large file that has been split into multiple reads
 These settings can also be controlled by `defaults::http_max_attempts()`, `defaults::http_max_attempts_reset()`, `defaults::http_status_codes()`, and `defaults::http_status_codes_reset()`.
 
+#### Remote Verbose (KVIKIO_REMOTE_VERBOSE)
+For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication, including headers, request/response bodies, connection details, and SSL handshake information.
+
+Set the environment variable `KVIKIO_REMOTE_VERBOSE` to `true`, `on`, `yes`, or `1` (case-insensitive) to enable verbose output; it is disabled by default.
+
+**Warning**: this may expose sensitive contents from headers and data.
+
 ## Example
 
 ```cpp
diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt
index b7aa73dd2d..b401c50d4e 100644
--- a/cpp/examples/CMakeLists.txt
+++ b/cpp/examples/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
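A sketch of the `KVIKIO_REMOTE_VERBOSE` switch documented above in action; `my_remote_app` is a placeholder for any binary built with KvikIO's remote-IO support:

```sh
KVIKIO_REMOTE_VERBOSE=1 ./my_remote_app    # dumps headers, connection and TLS handshake details
KVIKIO_REMOTE_VERBOSE=off ./my_remote_app  # anything other than true/on/yes/1 leaves it disabled
```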
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= set(TEST_INSTALL_PATH bin/tests/libkvikio) diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp index 9fed0cee6a..4cc050ffd9 100644 --- a/cpp/examples/basic_io.cpp +++ b/cpp/examples/basic_io.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/examples/basic_no_cuda.cpp b/cpp/examples/basic_no_cuda.cpp index 42ecb7142d..a5dfac0435 100644 --- a/cpp/examples/basic_no_cuda.cpp +++ b/cpp/examples/basic_no_cuda.cpp @@ -1,21 +1,11 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include #include +#include #include #include diff --git a/cpp/examples/downstream/CMakeLists.txt b/cpp/examples/downstream/CMakeLists.txt index a21d5699d4..d99711b7c9 100644 --- a/cpp/examples/downstream/CMakeLists.txt +++ b/cpp/examples/downstream/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) diff --git a/cpp/examples/downstream/cmake/get_cpm.cmake b/cpp/examples/downstream/cmake/get_cpm.cmake index 2050ab4f88..ab9480bb71 100644 --- a/cpp/examples/downstream/cmake/get_cpm.cmake +++ b/cpp/examples/downstream/cmake/get_cpm.cmake @@ -1,3 +1,8 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + set(CPM_DOWNLOAD_VERSION 0.35.5) if(CPM_SOURCE_CACHE) diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index efd3b1435f..fa61ef3b2d 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # Use CPM to fetch KvikIO, which makes `kvikio::kvikio` available for `target_link_libraries` @@ -20,7 +13,7 @@ function(find_and_configure_kvikio MIN_VERSION) VERSION ${MIN_VERSION} GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - GIT_TAG branch-${MIN_VERSION} + GIT_TAG main GIT_SHALLOW TRUE SOURCE_SUBDIR @@ -30,4 +23,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.06") +find_and_configure_kvikio("26.02") diff --git a/cpp/examples/downstream/downstream_example.cpp b/cpp/examples/downstream/downstream_example.cpp index 87603908a1..f340835091 100644 --- a/cpp/examples/downstream/downstream_example.cpp +++ b/cpp/examples/downstream/downstream_example.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/include/kvikio/batch.hpp b/cpp/include/kvikio/batch.hpp index 40168ffbc3..f85c87657c 100644 --- a/cpp/include/kvikio/batch.hpp +++ b/cpp/include/kvikio/batch.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index 5a7623a6a4..8b7b45c2e9 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -22,48 +11,153 @@ namespace kvikio { /** - * @brief Singleton class to retain host memory allocations + * @brief Allocator for page-aligned host memory + * + * Uses std::aligned_alloc to allocate host memory aligned to page boundaries (typically 4096 + * bytes). This allocator is suitable for Direct I/O operations that require page-aligned buffers + * but do not need CUDA context (i.e., host-to-host transfers only). + */ +class PageAlignedAllocator { + public: + /** + * @brief Allocate page-aligned host memory + * + * @param size Requested size in bytes (will be rounded up to page boundary) + * @return Pointer to allocated memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Allocator for CUDA pinned host memory + * + * Uses cudaMemHostAlloc to allocate pinned (page-locked) host memory that can be efficiently + * transferred to/from GPU device memory. The allocation is only guaranteed to be aligned to "at + * least 256 bytes". It is NOT guaranteed to be page aligned. 
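The allocator-policy split described above keeps the pool generic over how memory is obtained. As a rough illustration, a `PageAlignedAllocator` along the lines documented here could look like the following sketch (the `sysconf` page-size query and the rounding expression are assumptions, not the verbatim KvikIO implementation):

```cpp
#include <cstddef>   // std::size_t
#include <cstdlib>   // std::aligned_alloc, std::free
#include <unistd.h>  // ::sysconf

void* PageAlignedAllocator::allocate(std::size_t size)
{
  // Assumed page-size query; KvikIO's own get_page_size() helper presumably wraps this.
  auto const page_size = static_cast<std::size_t>(::sysconf(_SC_PAGESIZE));
  // Round the request up to a whole number of pages, as the class docs require.
  auto const rounded = (size + page_size - 1) / page_size * page_size;
  return std::aligned_alloc(page_size, rounded);
}

void PageAlignedAllocator::deallocate(void* buffer, std::size_t /*size*/)
{
  std::free(buffer);  // memory from std::aligned_alloc is released with std::free
}
```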
+ * + * @note Do NOT use with Direct I/O - lacks page alignment guarantee + */ +class CudaPinnedAllocator { + public: + /** + * @brief Allocate CUDA pinned host memory + * + * @param size Requested size in bytes + * @return Pointer to allocated pinned memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Allocator for page-aligned AND CUDA-registered pinned host memory + * + * Combines the benefits of both page alignment (for Direct I/O) and CUDA registration + * (for efficient host-device transfers). Uses std::aligned_alloc followed by + * cudaMemHostRegister to achieve both properties. + * + * @note This is the required allocator for Direct I/O with device memory. Requires a valid CUDA + * context when allocating. + */ +class CudaPageAlignedPinnedAllocator { + public: + /** + * @brief Allocate page-aligned CUDA-registered pinned host memory + * + * @param size Requested size in bytes (will be rounded up to page boundary) + * @return Pointer to allocated memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Thread-safe singleton pool for reusable bounce buffers + * + * Manages a pool of host memory buffers used for staging data during I/O operations. + * Buffers are retained and reused across calls to minimize allocation overhead. + * The pool uses a LIFO (stack) allocation strategy optimized for cache locality. + * + * All buffers in the pool have the same size, controlled by `defaults::bounce_buffer_size()`. If + * the buffer size changes, all cached buffers are cleared and reallocated at the new size. * - * Call `AllocRetain::get` to get an allocation that will be retained when it + * Call `BounceBufferPool::get` to get an allocation that will be retained when it * goes out of scope (RAII). The size of all retained allocations are the same. + * + * @tparam Allocator The allocator policy that determines buffer properties: + * - PageAlignedAllocator: For host-only Direct I/O + * - CudaPinnedAllocator: For device I/O without Direct I/O + * - CudaPageAlignedPinnedAllocator: For device I/O with Direct I/O + * + * @note The destructor intentionally leaks allocations to avoid CUDA cleanup issues when static + * destructors run after CUDA context destruction */ -class AllocRetain { +template +class BounceBufferPool { private: std::mutex _mutex{}; - // Stack of free allocations - std::stack _free_allocs{}; - // The size of each allocation in `_free_allocs` - std::size_t _size{defaults::bounce_buffer_size()}; + // Stack of free allocations (LIFO for cache locality) + std::stack _free_buffers{}; + // The size of each allocation in `_free_buffers` + std::size_t _buffer_size{defaults::bounce_buffer_size()}; + Allocator _allocator{}; public: /** - * @brief An host memory allocation + * @brief RAII wrapper for a host bounce buffer allocation + * + * Automatically returns the buffer to the pool when destroyed (RAII pattern). Provides access to + * the underlying memory and its size. 
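The RAII hand-back described here plausibly reduces to a destructor that returns the pointer to its owning pool; a hedged sketch, not the verbatim implementation:

```cpp
template <typename Allocator>
BounceBufferPool<Allocator>::Buffer::~Buffer() noexcept
{
  // Hand the allocation back for reuse; put() is documented below to deallocate
  // instead if the pool's configured buffer size has changed in the meantime.
  _pool->put(_buffer, _size);
}
```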
+ * + * @note Non-copyable and non-movable to ensure single ownership */ - class Alloc { + class Buffer { private: - AllocRetain* _manager; - void* _alloc; + BounceBufferPool* _pool; + void* _buffer; std::size_t const _size; public: - Alloc(AllocRetain* manager, void* alloc, std::size_t size); - Alloc(Alloc const&) = delete; - Alloc& operator=(Alloc const&) = delete; - Alloc(Alloc&& o) = delete; - Alloc& operator=(Alloc&& o) = delete; - ~Alloc() noexcept; + Buffer(BounceBufferPool* pool, void* buffer, std::size_t size); + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; + Buffer(Buffer&& o) = delete; + Buffer& operator=(Buffer&& o) = delete; + ~Buffer() noexcept; void* get() noexcept; void* get(std::ptrdiff_t offset) noexcept; std::size_t size() noexcept; }; - AllocRetain() = default; + BounceBufferPool() = default; // Notice, we do not clear the allocations at destruction thus the allocations leaks - // at exit. We do this because `AllocRetain::instance()` stores the allocations in a + // at exit. We do this because `BounceBufferPool::instance()` stores the allocations in a // static stack that are destructed below main, which is not allowed in CUDA: // - ~AllocRetain() noexcept = default; + ~BounceBufferPool() noexcept = default; private: /** @@ -78,28 +172,81 @@ class AllocRetain { /** * @brief Ensure the sizes of the retained allocations match `defaults::bounce_buffer_size()` * + * If the configured bounce buffer size has changed, clears all cached buffers so new allocations + * will use the updated size. + * * NB: `_mutex` must be taken prior to calling this function. */ - void _ensure_alloc_size(); + void _ensure_buffer_size(); public: - [[nodiscard]] Alloc get(); + /** + * @brief Acquire a bounce buffer from the pool + * + * Returns a cached buffer if available, otherwise allocates a new one. The returned Buffer object + * will automatically return the buffer to the pool when it goes out of scope. + * + * @return RAII Buffer object wrapping the allocated memory + * @exception CudaError if allocation fails (e.g., invalid CUDA context for pinned allocators) + */ + [[nodiscard]] Buffer get(); - void put(void* alloc, std::size_t size); + /** + * @brief Return a buffer to the pool for reuse + * + * Typically called automatically by Buffer's destructor. Only adds the buffer to the pool if its + * size matches the current pool buffer size; otherwise the buffer is deallocated immediately. + * + * @param buffer Pointer to memory to return + * @param size Size of the buffer in bytes + */ + void put(void* buffer, std::size_t size); /** - * @brief Free all retained allocations + * @brief Free all retained allocations in the pool + * + * Clears the pool and deallocates all cached buffers. Useful for reclaiming memory when bounce + * buffers are no longer needed. * * @return The number of bytes cleared */ std::size_t clear(); - KVIKIO_EXPORT static AllocRetain& instance(); + /** + * @brief Get the singleton instance of the pool + * + * Each template instantiation (different Allocator) has its own singleton instance. 
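Typical usage then looks like the following (illustrative only; `src` and `nbytes` are hypothetical, and `PageAlignedBounceBufferPool` is one of the pool aliases introduced just below):

```cpp
#include <algorithm>
#include <cstring>

void stage_through_pool(void const* src, std::size_t nbytes)
{
  auto buffer = kvikio::PageAlignedBounceBufferPool::instance().get();
  std::memcpy(buffer.get(), src, std::min(nbytes, buffer.size()));
  // ... hand buffer.get() to a pread()/pwrite() call ...
}  // `buffer` goes out of scope here and its memory returns to the pool
```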
+ *
+ * @return Reference to the singleton pool instance
+ */
+ KVIKIO_EXPORT static BounceBufferPool& instance();

- AllocRetain(AllocRetain const&) = delete;
- AllocRetain& operator=(AllocRetain const&) = delete;
- AllocRetain(AllocRetain&& o) = delete;
- AllocRetain& operator=(AllocRetain&& o) = delete;
+ BounceBufferPool(BounceBufferPool const&) = delete;
+ BounceBufferPool& operator=(BounceBufferPool const&) = delete;
+ BounceBufferPool(BounceBufferPool&& o) = delete;
+ BounceBufferPool& operator=(BounceBufferPool&& o) = delete;
};

+/**
+ * @brief Bounce buffer pool using page-aligned host memory
+ *
+ * Use for: Host-only Direct I/O operations (no CUDA context involvement)
+ */
+using PageAlignedBounceBufferPool = BounceBufferPool<PageAlignedAllocator>;
+
+/**
+ * @brief Bounce buffer pool using CUDA pinned memory
+ *
+ * Use for: Device I/O operations without Direct I/O
+ * Note: Not page-aligned - cannot be used with Direct I/O
+ */
+using CudaPinnedBounceBufferPool = BounceBufferPool<CudaPinnedAllocator>;
+
+/**
+ * @brief Bounce buffer pool using page-aligned CUDA-registered pinned memory
+ *
+ * Use for: Device I/O operations with Direct I/O enabled
+ * Provides both page alignment (for Direct I/O) and CUDA registration (for efficient transfers)
+ */
+using CudaPageAlignedPinnedBounceBufferPool = BounceBufferPool<CudaPageAlignedPinnedAllocator>;

} // namespace kvikio
diff --git a/cpp/include/kvikio/buffer.hpp b/cpp/include/kvikio/buffer.hpp
index 3897bb3371..00bff4c4c2 100644
--- a/cpp/include/kvikio/buffer.hpp
+++ b/cpp/include/kvikio/buffer.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#pragma once
diff --git a/cpp/include/kvikio/compat_mode.hpp b/cpp/include/kvikio/compat_mode.hpp
index 03ed443fe0..a2f1171dba 100644
--- a/cpp/include/kvikio/compat_mode.hpp
+++ b/cpp/include/kvikio/compat_mode.hpp
@@ -1,25 +1,12 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include
-#include
-
-#include
+#include

namespace kvikio {
/**
@@ -49,102 +36,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str);
 } // namespace detail

-// Forward declaration.
-class FileHandle;
-
-/**
- * @brief Store and manage the compatibility mode data associated with a FileHandle.
- */ -class CompatModeManager { - private: - CompatMode _compat_mode_requested{CompatMode::AUTO}; - bool _is_compat_mode_preferred{true}; - bool _is_compat_mode_preferred_for_async{true}; - - public: - /** - * @brief Construct an empty compatibility mode manager. - */ - CompatModeManager() noexcept = default; - - /** - * @brief Construct a compatibility mode manager associated with a FileHandle. - * - * According to the file path, requested compatibility mode, and the system configuration, the - * compatibility manager: - * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, - * respectively. - * - Initializes the file wrappers and cuFile handle associated with a FileHandle. - * - * @param file_path Refer to - * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). - * @param flags Same as above. - * @param mode Same as above. - * @param compat_mode_requested Same as above. - * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. - */ - CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested, - FileHandle* file_handle); - - ~CompatModeManager() noexcept = default; - CompatModeManager(const CompatModeManager&) = default; - CompatModeManager& operator=(const CompatModeManager&) = default; - CompatModeManager(CompatModeManager&&) noexcept = default; - CompatModeManager& operator=(CompatModeManager&&) noexcept = default; - - /** - * @brief Functionally identical to defaults::infer_compat_mode_if_auto(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return If the given compatibility mode is CompatMode::AUTO, infer the final compatibility - * mode. - */ - CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) noexcept; - - /** - * @brief Functionally identical to defaults::is_compat_mode_preferred(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return Boolean answer. - */ - bool is_compat_mode_preferred(CompatMode compat_mode) noexcept; - - /** - * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred() const noexcept; - - /** - * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred_for_async() const noexcept; - - /** - * @brief Retrieve the original compatibility mode requested. - * - * @return The original compatibility mode requested. - */ - CompatMode compat_mode_requested() const noexcept; - - /** - * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) - * according to the existing compatibility mode data in the manager. - * - * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is - * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but - * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or - * cuFile configuration file). 
- */ - void validate_compat_mode_for_async() const; -}; - } // namespace kvikio diff --git a/cpp/include/kvikio/compat_mode_manager.hpp b/cpp/include/kvikio/compat_mode_manager.hpp new file mode 100644 index 0000000000..0dc55f165d --- /dev/null +++ b/cpp/include/kvikio/compat_mode_manager.hpp @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include + +namespace kvikio { + +// Forward declaration. +class FileHandle; + +/** + * @brief Store and manage the compatibility mode data associated with a FileHandle. + */ +class CompatModeManager { + private: + CompatMode _compat_mode_requested{CompatMode::AUTO}; + bool _is_compat_mode_preferred{true}; + bool _is_compat_mode_preferred_for_async{true}; + + public: + /** + * @brief Construct an empty compatibility mode manager. + */ + CompatModeManager() noexcept = default; + + /** + * @brief Construct a compatibility mode manager associated with a FileHandle. + * + * According to the file path, requested compatibility mode, and the system configuration, the + * compatibility manager: + * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, + * respectively. + * - Initializes the file wrappers and cuFile handle associated with a FileHandle. + * + * @param file_path Refer to + * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). + * @param flags Same as above. + * @param mode Same as above. + * @param compat_mode_requested Same as above. + * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. + */ + CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested, + FileHandle* file_handle); + + ~CompatModeManager() noexcept = default; + CompatModeManager(const CompatModeManager&) = default; + CompatModeManager& operator=(const CompatModeManager&) = default; + CompatModeManager(CompatModeManager&&) noexcept = default; + CompatModeManager& operator=(CompatModeManager&&) noexcept = default; + + /** + * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred() const noexcept; + + /** + * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred_for_async() const noexcept; + + /** + * @brief Retrieve the original compatibility mode requested. + * + * @return The original compatibility mode requested. + */ + CompatMode compat_mode_requested() const noexcept; + + /** + * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) + * according to the existing compatibility mode data in the manager. + * + * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is + * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but + * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or + * cuFile configuration file). 
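Given the contract just described, the check plausibly boils down to rejecting the one inconsistent combination of flags; a hedged sketch (error type and wording are assumptions):

```cpp
#include <stdexcept>

void CompatModeManager::validate_compat_mode_for_async() const
{
  // The problematic case: the user insisted on CompatMode::OFF and the sync path
  // can honor it, but the async path would still need compatibility mode.
  if (_compat_mode_requested == CompatMode::OFF && !_is_compat_mode_preferred &&
      _is_compat_mode_preferred_for_async) {
    throw std::runtime_error(
      "Requested compatibility mode is OFF, but asynchronous I/O requires it "
      "(cuFile stream API or cuFile configuration file is missing).");
  }
}
```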
+ */ + void validate_compat_mode_for_async() const; +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp index fd721e5fe4..c6b878a3cd 100644 --- a/cpp/include/kvikio/cufile/config.hpp +++ b/cpp/include/kvikio/cufile/config.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp index 56a6e8159b..0f23dee6e7 100644 --- a/cpp/include/kvikio/cufile/driver.hpp +++ b/cpp/include/kvikio/cufile/driver.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index d1e17f0b85..722986c362 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -1,28 +1,20 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include
#include
+#include
#include
#include
#include
+#include
#include
+#include
#include
#include
#include

@@ -41,9 +33,13 @@ T getenv_or(std::string_view env_var_name, T default_val)
   std::stringstream sstream(env_val);
   T converted_val;
   sstream >> converted_val;
-  KVIKIO_EXPECT(!sstream.fail(),
-                "unknown config value " + std::string{env_var_name} + "=" + std::string{env_val},
-                std::invalid_argument);
+
+  if constexpr (!std::is_same_v<T, std::string>) {
+    KVIKIO_EXPECT(!sstream.fail(),
+                  "unknown config value " + std::string{env_var_name} + "=" + std::string{env_val},
+                  std::invalid_argument);
+  }
+
   return converted_val;
 }

@@ -56,6 +52,59 @@ CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val);
 template <>
 std::vector<int> getenv_or(std::string_view env_var_name, std::vector<int> default_val);

+/**
+ * @brief Get the environment variable value from a candidate list
+ *
+ * @tparam T Type of the environment variable value
+ * @param env_var_names Candidate list containing the names of the environment variable
+ * @param default_val Default value of the environment variable, if none of the candidates has been
+ * found
+ * @return A tuple of (`env_var_name`, `result`, `has_found`), where:
+ *   - If the environment variable is not set by any of the candidates, `has_found` will be false,
+ *     `result` will be `default_val`, and `env_var_name` will be empty.
+ *   - If the environment variable is set by `env_var_name`, then `has_found` will be true, and
+ *     `result` will be the set value. If more than one candidate has been set with the same value,
+ *     `env_var_name` will be assigned the last candidate.
+ *
+ * @exception std::invalid_argument if:
+ *   - `env_var_names` is empty.
+ *   - The environment variable is not defined to be string type and is assigned an empty value (in
+ *     other words, string-type environment variables are allowed to hold an empty value).
+ *   - More than one candidate has been set with different values.
+ *   - An invalid value is given, e.g. a value that cannot be converted to type T.
+ */
+template <typename T>
+std::tuple<std::string_view, T, bool> getenv_or(
+  std::initializer_list<std::string_view> env_var_names, T default_val)
+{
+  KVIKIO_EXPECT(env_var_names.size() > 0,
+                "`env_var_names` must contain at least one environment variable name.",
+                std::invalid_argument);
+  std::string_view env_name_target;
+  std::string_view env_val_target;
+
+  for (auto const& env_var_name : env_var_names) {
+    auto const* env_val = std::getenv(env_var_name.data());
+    if (env_val == nullptr) { continue; }
+
+    if (!env_name_target.empty() && env_val_target != env_val) {
+      std::stringstream ss;
+      ss << "Environment variable " << env_var_name << " (" << env_val
+         << ") has already been set by its alias " << env_name_target << " (" << env_val_target
+         << ") with a different value.";
+      KVIKIO_FAIL(ss.str(), std::invalid_argument);
+    }
+
+    env_name_target = env_var_name;
+    env_val_target = env_val;
+  }
+
+  if (env_name_target.empty()) { return {env_name_target, default_val, false}; }
+
+  auto res = getenv_or(env_name_target, default_val);
+  return {env_name_target, res, true};
+}
+
 /**
  * @brief Singleton class of default values used throughout KvikIO.
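For illustration, the multi-candidate `getenv_or` overload above can honor one setting under either of two spellings; the variable names here are examples consistent with the `num_threads` alias added elsewhere in this diff, not a statement of KvikIO's actual spelling:

```cpp
#include <iostream>

void report_thread_config()
{
  auto const [env_name, nthreads, found] =
    kvikio::getenv_or({"KVIKIO_NTHREADS", "KVIKIO_NUM_THREADS"}, 1u);
  if (found) {
    std::cout << "using " << nthreads << " threads (from " << env_name << ")\n";
  }
  // Throws std::invalid_argument if both variables are set to different values.
}
```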
* @@ -70,6 +119,8 @@ class defaults { std::size_t _http_max_attempts; long _http_timeout; std::vector _http_status_codes; + bool _auto_direct_io_read; + bool _auto_direct_io_write; static unsigned int get_num_threads_from_env(); @@ -183,6 +234,20 @@ class defaults { */ static void set_thread_pool_nthreads(unsigned int nthreads); + /** + * @brief Alias of `thread_pool_nthreads` + * + * @return The number of threads + */ + [[nodiscard]] static unsigned int num_threads(); + + /** + * @brief Alias of `set_thread_pool_nthreads` + * + * @param nthreads The number of threads to use + */ + static void set_num_threads(unsigned int nthreads); + /** * @brief Get the default task size used for parallel IO operations. * @@ -293,6 +358,42 @@ class defaults { * @param status_codes The HTTP status codes to retry. */ static void set_http_status_codes(std::vector status_codes); + + /** + * @brief Check if Direct I/O is enabled for POSIX reads + * + * Returns true if KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX read operations. + * + * @return Boolean answer + */ + static bool auto_direct_io_read(); + + /** + * @brief Enable or disable Direct I/O for POSIX reads + * + * Controls whether KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX read operations. + * + * @param flag true to enable opportunistic Direct I/O reads, false to disable + */ + static void set_auto_direct_io_read(bool flag); + + /** + * @brief Check if Direct I/O is enabled for POSIX writes + * + * Returns true if KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX write operations. + * + * @return Boolean answer + */ + static bool auto_direct_io_write(); + + /** + * @brief Enable or disable Direct I/O for POSIX writes + * + * Controls whether KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX write operations. + * + * @param flag true to enable opportunistic Direct I/O writes, false to disable + */ + static void set_auto_direct_io_write(bool flag); }; } // namespace kvikio diff --git a/cpp/include/kvikio/detail/env.hpp b/cpp/include/kvikio/detail/env.hpp new file mode 100644 index 0000000000..6832eb9ad1 --- /dev/null +++ b/cpp/include/kvikio/detail/env.hpp @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace kvikio::detail { +/** + * @brief Unwrap an optional parameter, obtaining a fallback from the environment. + * + * Resolution order: + * - If `value` has a value, return it + * - If environment variable `env_var` is set, return its value (even if empty) + * - Return std::nullopt if err_msg is std::nullopt; throw an exception otherwise + * + * @param value The value to unwrap. + * @param env_var The name of the environment variable to check if `value` isn't set. + * @param err_msg Optional error message that controls whether to throw an exception if neither + * source provides a value. + * @return The resolved value, or std::nullopt if neither source provides a value. + */ +std::optional unwrap_or_env(std::optional value, + std::string const& env_var, + std::optional const& err_msg = std::nullopt); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/detail/nvtx.hpp similarity index 86% rename from cpp/include/kvikio/nvtx.hpp rename to cpp/include/kvikio/detail/nvtx.hpp index 7fff8e4d47..ef41681c55 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/detail/nvtx.hpp @@ -1,32 +1,18 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include -#ifdef KVIKIO_CUDA_FOUND #include -#endif #include #include namespace kvikio { -#ifdef KVIKIO_CUDA_FOUND /** * @brief Tag type for libkvikio's NVTX domain. */ @@ -100,13 +86,7 @@ using nvtx_registered_string_type = nvtx3::registered_string_in(nvtx3::event_attributes{ \ KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) -#endif - -#ifdef KVIKIO_CUDA_FOUND using nvtx_color_type = nvtx3::color; -#else -using nvtx_color_type = int; -#endif /** * @brief Utility singleton class for NVTX annotation. @@ -181,13 +161,7 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_FUNC_RANGE(...) KVIKIO_NVTX_FUNC_RANGE_IMPL(__VA_ARGS__) -#else -#define KVIKIO_NVTX_FUNC_RANGE(...) \ - do { \ - } while (0) -#endif /** * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to @@ -206,13 +180,7 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) -#else -#define KVIKIO_NVTX_SCOPED_RANGE(message, payload, ...) \ - do { \ - } while (0) -#endif /** * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a @@ -232,12 +200,6 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) -#else -#define KVIKIO_NVTX_MARKER(message, payload) \ - do { \ - } while (0) -#endif } // namespace kvikio diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/detail/parallel_operation.hpp similarity index 91% rename from cpp/include/kvikio/parallel_operation.hpp rename to cpp/include/kvikio/detail/parallel_operation.hpp index 5026240557..a4489da8e5 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/detail/parallel_operation.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -26,8 +15,8 @@ #include #include +#include #include -#include #include namespace kvikio { diff --git a/cpp/include/kvikio/detail/posix_io.hpp b/cpp/include/kvikio/detail/posix_io.hpp new file mode 100644 index 0000000000..2997337906 --- /dev/null +++ b/cpp/include/kvikio/detail/posix_io.hpp @@ -0,0 +1,367 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio::detail { + +/** + * @brief Type of the IO operation. + */ +enum class IOOperationType : uint8_t { + READ, ///< POSIX read. + WRITE, ///< POSIX write. +}; + +/** + * @brief Specifies whether all requested bytes are to be processed or not. + */ +enum class PartialIO : uint8_t { + YES, ///< POSIX read/write is called only once, which may not process all bytes requested. + NO, ///< POSIX read/write is called repeatedly until all requested bytes are processed. +}; + +/** + * @brief Singleton class to retrieve a CUDA stream for device-host copying + * + * Call `StreamsByThread::get` to get the CUDA stream assigned to the current + * CUDA context and thread. + */ +class StreamsByThread { + private: + std::map, CUstream> _streams; + + public: + StreamsByThread() = default; + + // Here we intentionally do not destroy in the destructor the CUDA resources + // (e.g. CUstream) with static storage duration, but instead let them leak + // on program termination. This is to prevent undefined behavior in CUDA. See + // + // This also prevents crash (segmentation fault) if clients call + // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination. + ~StreamsByThread() = default; + + KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id); + + static CUstream get(); + + StreamsByThread(StreamsByThread const&) = delete; + StreamsByThread& operator=(StreamsByThread const&) = delete; + StreamsByThread(StreamsByThread&& o) = delete; + StreamsByThread& operator=(StreamsByThread&& o) = delete; +}; + +/** + * @brief Read or write host memory to or from disk using POSIX with opportunistic Direct I/O + * + * This function attempts to use Direct I/O (O_DIRECT) when alignment requirements are satisfied, + * and automatically falls back to buffered I/O when they cannot be met. Direct I/O requires: + * - File offset aligned to page boundary + * - Buffer address aligned to page boundary + * - Transfer size as a multiple of page size + * + * The implementation handles partial alignment by breaking the I/O into segments: + * - Unaligned prefix (if offset not page-aligned): uses buffered I/O to reach page boundary + * - Aligned middle section: uses Direct I/O with bounce buffer if needed + * - Unaligned suffix (if remaining bytes < page size): uses buffered I/O + * + * @tparam Operation Whether the operation is a read or a write + * @tparam PartialIOStatus If PartialIO::YES, returns after first successful I/O. 
If PartialIO::NO,
+ * loops until all `count` bytes are processed
+ * @tparam BounceBufferPoolType Pool type for acquiring page-aligned bounce buffers when the user
+ * buffer is not page-aligned (defaults to PageAlignedBounceBufferPool)
+ * @param fd_direct_off File descriptor opened without O_DIRECT (always valid)
+ * @param buf Buffer to read into or write from
+ * @param count Number of bytes to transfer
+ * @param offset File offset in bytes
+ * @param fd_direct_on File descriptor opened with O_DIRECT, or -1 to disable Direct I/O attempts
+ * @return Number of bytes read or written (always greater than zero)
+ */
+template <IOOperationType Operation,
+          PartialIO PartialIOStatus,
+          typename BounceBufferPoolType = PageAlignedBounceBufferPool>
+ssize_t posix_host_io(
+  int fd_direct_off, void const* buf, size_t count, off_t offset, int fd_direct_on = -1)
+{
+  auto pread_or_write = [](int fd, void* buf, size_t count, off_t offset) -> ssize_t {
+    ssize_t nbytes{};
+    if constexpr (Operation == IOOperationType::READ) {
+      nbytes = ::pread(fd, buf, count, offset);
+    } else {
+      nbytes = ::pwrite(fd, buf, count, offset);
+    }
+    return nbytes;
+  };
+
+  off_t cur_offset = offset;
+  size_t bytes_remaining = count;
+  char* buffer = const_cast<char*>(static_cast<char const*>(buf));
+  auto const page_size = get_page_size();
+
+  // Process all bytes in a loop (unless PartialIO::YES returns early)
+  while (bytes_remaining > 0) {
+    ssize_t nbytes_processed{};
+
+    if (fd_direct_on == -1) {
+      // Direct I/O disabled: use buffered I/O for entire transfer
+      nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_remaining, cur_offset);
+    } else {
+      // Direct I/O enabled: attempt to use it when alignment allows
+      auto const is_cur_offset_aligned = detail::is_aligned(cur_offset, page_size);
+
+      if (!is_cur_offset_aligned) {
+        // Handle unaligned prefix: use buffered I/O to reach next page boundary
+        // This ensures subsequent iterations will have page-aligned offsets
+        auto const aligned_cur_offset = detail::align_up(cur_offset, page_size);
+        auto const bytes_requested =
+          std::min<std::size_t>(aligned_cur_offset - cur_offset, bytes_remaining);
+        nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_requested, cur_offset);
+      } else {
+        if (bytes_remaining < page_size) {
+          // Handle unaligned suffix: remaining bytes are less than a page, use buffered I/O
+          nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_remaining, cur_offset);
+        } else {
+          // Offset is page-aligned.
Now make transfer size page-aligned too by rounding down + auto aligned_bytes_remaining = detail::align_down(bytes_remaining, page_size); + auto const is_buf_aligned = detail::is_aligned(buffer, page_size); + auto bytes_requested = aligned_bytes_remaining; + + if (!is_buf_aligned) { + // Buffer not page-aligned: use bounce buffer for Direct I/O + auto bounce_buffer = BounceBufferPoolType::instance().get(); + auto* aligned_buf = bounce_buffer.get(); + // Limit transfer size to bounce buffer capacity + bytes_requested = std::min(bytes_requested, bounce_buffer.size()); + + if constexpr (Operation == IOOperationType::WRITE) { + // Copy user data to aligned bounce buffer before Direct I/O write + std::memcpy(aligned_buf, buffer, bytes_requested); + } + + // Perform Direct I/O using the bounce buffer + nbytes_processed = + pread_or_write(fd_direct_on, aligned_buf, bytes_requested, cur_offset); + + if constexpr (Operation == IOOperationType::READ) { + // Copy data from bounce buffer to user buffer after Direct I/O read + std::memcpy(buffer, aligned_buf, nbytes_processed); + } + } else { + // Buffer is page-aligned: perform Direct I/O directly with user buffer + nbytes_processed = pread_or_write(fd_direct_on, buffer, bytes_requested, cur_offset); + } + } + } + } + + // Error handling + if (nbytes_processed == -1) { + std::string const name = (Operation == IOOperationType::READ) ? "pread" : "pwrite"; + KVIKIO_EXPECT(errno != EBADF, "POSIX error: Operation not permitted"); + KVIKIO_FAIL("POSIX error on " + name + ": " + strerror(errno)); + } + if constexpr (Operation == IOOperationType::READ) { + KVIKIO_EXPECT(nbytes_processed != 0, "POSIX error on pread: EOF"); + } + + // Return early if partial I/O is allowed + if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes_processed; } + + // Advance to next segment + buffer += nbytes_processed; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + cur_offset += nbytes_processed; + bytes_remaining -= nbytes_processed; + } + + return convert_size2ssize(count); +} + +/** + * @brief Read or write device memory to or from disk using POSIX with opportunistic Direct I/O + * + * This function transfers data between GPU device memory and files by staging through a host bounce + * buffer. Since without GDS Direct I/O cannot be performed directly with device memory, the + * operation is split into stages: + * - For reads: File --> Host bounce buffer (with Direct I/O if aligned) --> Device memory + * - For writes: Device memory --> Host bounce buffer --> File (with Direct I/O if aligned) + * + * The underlying file I/O uses `posix_host_io` which opportunistically attempts Direct I/O when + * alignment requirements are satisfied. 
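To make the prefix/middle/suffix decomposition performed by `posix_host_io` above concrete, here is the arithmetic for one assumed scenario (page size 4096, file offset 100, request of 10000 bytes):

```cpp
std::size_t const page   = 4096;   // assumed page size
std::size_t const offset = 100;    // file offset (not page-aligned)
std::size_t const count  = 10000;  // total bytes requested

std::size_t const prefix = (offset + page - 1) / page * page - offset;  // 3996 B, buffered
std::size_t const middle = (count - prefix) / page * page;              // 4096 B, Direct I/O eligible
std::size_t const suffix = count - prefix - middle;                     // 1908 B, buffered
```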
+ *
+ * @tparam Operation Whether the operation is a read or a write
+ * @tparam BounceBufferPoolType Pool type for acquiring CUDA-registered bounce buffers (defaults to
+ * CudaPinnedBounceBufferPool)
+ * @param fd_direct_off File descriptor opened without O_DIRECT (always valid)
+ * @param devPtr_base Base device pointer for the transfer
+ * @param size Total number of bytes to transfer
+ * @param file_offset Byte offset from the start of the file
+ * @param devPtr_offset Byte offset from devPtr_base (allows working with sub-regions)
+ * @param fd_direct_on File descriptor opened with O_DIRECT, or -1 to disable Direct I/O attempts
+ * @return Total number of bytes read or written
+ */
+template <IOOperationType Operation, typename BounceBufferPoolType = CudaPinnedBounceBufferPool>
+std::size_t posix_device_io(int fd_direct_off,
+                            void const* devPtr_base,
+                            std::size_t size,
+                            std::size_t file_offset,
+                            std::size_t devPtr_offset,
+                            int fd_direct_on = -1)
+{
+  // Direct I/O requires page-aligned bounce buffers. CudaPinnedBounceBufferPool uses
+  // cudaMemHostAlloc which does not guarantee page alignment.
+  if (std::is_same_v<BounceBufferPoolType, CudaPinnedBounceBufferPool>) {
+    KVIKIO_EXPECT(
+      fd_direct_on == -1,
+      "Direct I/O requires page-aligned bounce buffers. CudaPinnedBounceBufferPool does not "
+      "guarantee page alignment. Use CudaPageAlignedPinnedBounceBufferPool instead.");
+  }
+
+  auto bounce_buffer = BounceBufferPoolType::instance().get();
+  CUdeviceptr devPtr = convert_void2deviceptr(devPtr_base) + devPtr_offset;
+  off_t cur_file_offset = convert_size2off(file_offset);
+  off_t bytes_remaining = convert_size2off(size);
+  off_t const chunk_size2 = convert_size2off(bounce_buffer.size());
+
+  // Get a stream for the current CUDA context and thread
+  CUstream stream = StreamsByThread::get();
+
+  while (bytes_remaining > 0) {
+    off_t const nbytes_requested = std::min(chunk_size2, bytes_remaining);
+    ssize_t nbytes_got = nbytes_requested;
+    if constexpr (Operation == IOOperationType::READ) {
+      nbytes_got = posix_host_io<IOOperationType::READ, PartialIO::NO>(
+        fd_direct_off, bounce_buffer.get(), nbytes_requested, cur_file_offset, fd_direct_on);
+      CUDA_DRIVER_TRY(
+        cudaAPI::instance().MemcpyHtoDAsync(devPtr, bounce_buffer.get(), nbytes_got, stream));
+      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));
+    } else {  // Is a write operation
+      CUDA_DRIVER_TRY(
+        cudaAPI::instance().MemcpyDtoHAsync(bounce_buffer.get(), devPtr, nbytes_requested, stream));
+      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));
+      posix_host_io<IOOperationType::WRITE, PartialIO::NO>(
+        fd_direct_off, bounce_buffer.get(), nbytes_requested, cur_file_offset, fd_direct_on);
+    }
+    cur_file_offset += nbytes_got;
+    devPtr += nbytes_got;
+    bytes_remaining -= nbytes_got;
+  }
+  return size;
+}
+
+/**
+ * @brief Read from disk to host memory using POSIX
+ *
+ * Direct I/O via `fd_direct_on` is attempted opportunistically: portions of the transfer whose
+ * offset, size, or buffer address are not page-aligned fall back to buffered I/O on
+ * `fd_direct_off`.
+ *
+ * @tparam PartialIOStatus Whether all requested data are processed or not. If `PartialIO::NO`,
+ * all of `count` bytes are read.
+ * @param fd_direct_off File descriptor without Direct I/O.
+ * @param buf Base address of buffer in host memory.
+ * @param size Size in bytes to read.
+ * @param file_offset Offset in the file to read from.
+ * @param fd_direct_on Optional file descriptor with Direct I/O.
+ * @return Number of bytes that were successfully read.
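An illustrative call of `posix_host_read` declared just below; `fd_plain` and `fd_odirect` stand for hypothetical descriptors on the same file, opened without and with `O_DIRECT` respectively:

```cpp
std::vector<char> buffer(1 << 20);
// PartialIO::NO: loop until the full 1 MiB is read. Direct I/O is attempted via
// fd_odirect only when defaults::auto_direct_io_read() is enabled.
auto const nread = kvikio::detail::posix_host_read<kvikio::detail::PartialIO::NO>(
  fd_plain, buffer.data(), buffer.size(), /*file_offset=*/0, fd_odirect);
```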
+ */
+template <PartialIO PartialIOStatus>
+std::size_t posix_host_read(
+  int fd_direct_off, void* buf, std::size_t size, std::size_t file_offset, int fd_direct_on = -1)
+{
+  KVIKIO_NVTX_FUNC_RANGE(size);
+
+  auto cur_fd_direct_on{-1};
+  if (fd_direct_on != -1 && defaults::auto_direct_io_read()) { cur_fd_direct_on = fd_direct_on; }
+
+  return detail::posix_host_io<IOOperationType::READ, PartialIOStatus>(
+    fd_direct_off, buf, size, convert_size2off(file_offset), cur_fd_direct_on);
+}
+
+/**
+ * @brief Write host memory to disk using POSIX
+ *
+ * Direct I/O via `fd_direct_on` is attempted opportunistically: portions of the transfer whose
+ * offset, size, or buffer address are not page-aligned fall back to buffered I/O on
+ * `fd_direct_off`.
+ *
+ * @tparam PartialIOStatus Whether all requested data are processed or not. If `PartialIO::NO`,
+ * all of `count` bytes are written.
+ * @param fd_direct_off File descriptor without Direct I/O.
+ * @param buf Base address of buffer in host memory.
+ * @param size Size in bytes to write.
+ * @param file_offset Offset in the file to write to.
+ * @param fd_direct_on Optional file descriptor with Direct I/O.
+ * @return Number of bytes that were successfully written.
+ */
+template <PartialIO PartialIOStatus>
+std::size_t posix_host_write(int fd_direct_off,
+                             void const* buf,
+                             std::size_t size,
+                             std::size_t file_offset,
+                             int fd_direct_on = -1)
+{
+  KVIKIO_NVTX_FUNC_RANGE(size);
+
+  auto cur_fd_direct_on{-1};
+  if (fd_direct_on != -1 && defaults::auto_direct_io_write()) { cur_fd_direct_on = fd_direct_on; }
+
+  return detail::posix_host_io<IOOperationType::WRITE, PartialIOStatus>(
+    fd_direct_off, buf, size, convert_size2off(file_offset), cur_fd_direct_on);
+}
+
+/**
+ * @brief Read from disk to device memory using POSIX
+ *
+ * Direct I/O via `fd_direct_on` is attempted opportunistically; unaligned portions of the
+ * transfer fall back to buffered I/O on `fd_direct_off`.
+ *
+ * @param fd_direct_off File descriptor without Direct I/O.
+ * @param devPtr_base Base address of buffer in device memory.
+ * @param size Size in bytes to read.
+ * @param file_offset Offset in the file to read from.
+ * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into.
+ * @param fd_direct_on Optional file descriptor with Direct I/O.
+ * @return Number of bytes that were successfully read.
+ */
+std::size_t posix_device_read(int fd_direct_off,
+                              void const* devPtr_base,
+                              std::size_t size,
+                              std::size_t file_offset,
+                              std::size_t devPtr_offset,
+                              int fd_direct_on = -1);
+
+/**
+ * @brief Write device memory to disk using POSIX
+ *
+ * Direct I/O via `fd_direct_on` is attempted opportunistically; unaligned portions of the
+ * transfer fall back to buffered I/O on `fd_direct_off`.
+ *
+ * @param fd_direct_off File descriptor without Direct I/O.
+ * @param devPtr_base Base address of buffer in device memory.
+ * @param size Size in bytes to write.
+ * @param file_offset Offset in the file to write to.
+ * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write from.
+ * @param fd_direct_on Optional file descriptor with Direct I/O.
+ * @return Number of bytes that were successfully written.
+ */
+std::size_t posix_device_write(int fd_direct_off,
+                               void const* devPtr_base,
+                               std::size_t size,
+                               std::size_t file_offset,
+                               std::size_t devPtr_offset,
+                               int fd_direct_on = -1);
+
+} // namespace kvikio::detail
diff --git a/cpp/include/kvikio/detail/remote_handle.hpp b/cpp/include/kvikio/detail/remote_handle.hpp
new file mode 100644
index 0000000000..2e6613aeef
--- /dev/null
+++ b/cpp/include/kvikio/detail/remote_handle.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +namespace kvikio::detail { +/** + * @brief Callback for `CURLOPT_WRITEFUNCTION` that copies received data into a `std::string`. + * + * @param data Received data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes Number of bytes received + * @param userdata Must be cast from `std::string*` + * @return The number of bytes consumed by the callback + */ +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/tls.hpp b/cpp/include/kvikio/detail/tls.hpp new file mode 100644 index 0000000000..52d5d01202 --- /dev/null +++ b/cpp/include/kvikio/detail/tls.hpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief Search for the CA bundle file and directory paths + * + * This function searches for the Certificate Authority (CA) paths required for TLS/SSL verification + * in libcurl. The search is performed in the following priority order, returning as soon as either + * a bundle file or a directory is found: + * - CA bundle file: Check env vars CURL_CA_BUNDLE, SSL_CERT_FILE + * - CA directory: Check env var SSL_CERT_DIR + * - CA bundle file: Search distribution-specific locations for accessible bundle + * - CA directory: Search distribution-specific locations for accessible directory + * - CA bundle file: Check if curl's compile-time default bundle path is accessible + * - CA directory: Check if curl's compile-time default directory path is accessible + * + * @return Result containing CA bundle file and CA certificate directory + * + * @exception std::runtime_error if neither CA bundle nor directory is found + * + * @note Environment Variables: + * - CURL_CA_BUNDLE: Override CA bundle file location (curl-specific) + * - SSL_CERT_FILE: Override CA bundle file location (OpenSSL-compatible) + * - SSL_CERT_DIR: Override CA directory location (OpenSSL-compatible) + */ +std::pair, std::optional> get_ca_paths(); + +/** + * @brief Configure curl handle with discovered CA certificate paths + * + * As a performance optimization, the discovered CA certificate paths are cached to avoid repeated + * searching. + * + * @param curl Curl handle to configure with CA certificate paths + */ +void set_up_ca_paths(CurlHandle& curl); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp new file mode 100644 index 0000000000..2c18b8ac29 --- /dev/null +++ b/cpp/include/kvikio/detail/url.hpp @@ -0,0 +1,432 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief RAII wrapper for libcurl's URL handle (CURLU) + * + * This class provides automatic resource management for libcurl URL handles, + * ensuring proper cleanup when the handle goes out of scope. The class is + * move-only to prevent accidental sharing of the underlying resource. 
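A hedged sketch of how this RAII wrapper plausibly maps onto libcurl's URL API (`curl_url()` and `curl_url_cleanup()` are the real libcurl entry points; everything else is an assumption based on the declaration below):

```cpp
#include <curl/curl.h>
#include <stdexcept>
#include <utility>

CurlUrlHandle::CurlUrlHandle() : _handle{curl_url()}
{
  if (_handle == nullptr) { throw std::runtime_error("curl_url() failed to allocate a handle"); }
}

CurlUrlHandle::~CurlUrlHandle() noexcept
{
  if (_handle != nullptr) { curl_url_cleanup(_handle); }
}

CurlUrlHandle::CurlUrlHandle(CurlUrlHandle&& other) noexcept
  : _handle{std::exchange(other._handle, nullptr)}  // leave the moved-from handle empty
{
}

CurlUrlHandle& CurlUrlHandle::operator=(CurlUrlHandle&& other) noexcept
{
  if (this != &other) {
    if (_handle != nullptr) { curl_url_cleanup(_handle); }
    _handle = std::exchange(other._handle, nullptr);
  }
  return *this;
}
```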
+ */ +class CurlUrlHandle { + private: + CURLU* _handle{nullptr}; + + public: + /** + * @brief Create a new libcurl URL handle + * + * @exception std::runtime_error if libcurl cannot allocate the handle (usually due to out of + * memory) + */ + CurlUrlHandle(); + + /** + * @brief Clean up the underlying URL handle + */ + ~CurlUrlHandle() noexcept; + + CurlUrlHandle(CurlUrlHandle const&) = delete; + CurlUrlHandle& operator=(CurlUrlHandle const&) = delete; + + CurlUrlHandle(CurlUrlHandle&& other) noexcept; + CurlUrlHandle& operator=(CurlUrlHandle&& other) noexcept; + + /** + * @brief Get the underlying libcurl URL handle + * + * @return Pointer to the underlying libcurl URL handle + * @note The returned pointer should not be freed manually as it is managed by this class + */ + CURLU* get() const; +}; + +/** + * @brief URL parsing utility using libcurl's URL API + * + * This class provides static methods for parsing URLs into their constituent + * components (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * auto components = UrlParser::parse("https://example.com:8080/path?query=1#frag"); + * if (components.scheme.has_value()) { + * std::cout << "Scheme: " << components.scheme.value() << std::endl; + * } + * if (components.host.has_value()) { + * std::cout << "Host: " << components.host.value() << std::endl; + * } + * @endcode + */ +class UrlParser { + public: + /** + * @brief Container for parsed URL components + */ + struct UrlComponents { + /** + * @brief The URL scheme (e.g., "http", "https", "ftp"). May be empty for scheme-relative URLs + * or paths. + */ + std::optional scheme; + + /** + * @brief The hostname or IP address. May be empty for URLs without an authority component + * (e.g., "file:///path"). + */ + std::optional host; + + /** + * @brief The port number as a string. Will be empty if no explicit port is specified in the + * URL. + * @note Default ports (e.g., 80 for HTTP, 443 for HTTPS) are not automatically filled in. + */ + std::optional port; + + /** + * @brief The path component of the URL. Libcurl ensures that the path component is always + * present, even if empty (will be "/" for URLs like "http://example.com"). + */ + std::optional path; + + /** + * @brief The query string (without the leading "?"). Empty if no query parameters are present. + */ + std::optional query; + + /** + * @brief The fragment identifier (without the leading "#"). Empty if no fragment is present. + */ + std::optional fragment; + }; + + /** + * @brief Parses the given URL according to RFC 3986 plus and extracts its components. + * + * @param url The URL string to parse + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @param bitmask_component_flags Optional flags for component extraction. 
Common flags include: + * - CURLU_URLDECODE: URL decode the component + * - CURLU_PUNYCODE: Return host as punycode + * + * @return UrlComponents structure containing the parsed URL components + * + * @exception std::runtime_error if the URL cannot be parsed or if component extraction fails + * + * Example: + * @code{.cpp} + * // Basic parsing + * auto components = UrlParser::parse("https://api.example.com/v1/users?page=1"); + * + * // Parsing with URL decoding + * auto decoded = UrlParser::parse( + * "https://example.com/hello%20world", + * std::nullopt, + * CURLU_URLDECODE + * ); + * + * // Allow non-standard schemes, i.e. schemes not registered with Internet Assigned Numbers + * // Authority (IANA), such as AWS S3 + * auto custom = UrlParser::parse( + * "s3://my-bucket/my-object.bin", + * CURLU_NON_SUPPORT_SCHEME + * ); + * @endcode + */ + static UrlComponents parse(std::string const& url, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt); + + /** + * @brief Extract a specific component from a CurlUrlHandle + * + * @param handle The CurlUrlHandle containing the parsed URL + * @param part The URL part to extract (e.g., CURLUPART_SCHEME) + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME) + * @return The extracted component as a string, or std::nullopt if not present + * @exception std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); + + /** + * @brief Extract a specific component from a URL string + * + * @param url The URL string from which to extract a component + * @param part The URL part to extract + * @param bitmask_url_flags Optional flags for URL parsing. + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid + * @return The extracted component as a string, or std::nullopt if not present + * @exception std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); +}; + +/** + * @brief URL builder utility using libcurl's URL API + * + * This class provides methods for constructing and modifying URLs by setting individual components + * (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * // Build from scratch + * auto url = UrlBuilder() + * .set_scheme("https") + * .set_host("witcher4.com") + * .set_path("/ciri") + * .set_query("occupation", "witcher") + * .build(); + * + * // Modify existing URL + * auto modified = UrlBuilder("https://witcher4.com/old/path/to/bestiary") + * .set_path("/new/path/to/bestiary") + * .set_port("8080") + * .build(); + * @endcode + */ +class UrlBuilder { + private: + CurlUrlHandle _handle; + + /** + * @brief Internal helper to set a URL component + * + * @param part The URL part to set + * @param value The value to set. 
Use `nullptr` to clear + * @param flags Optional flags for the operation + * @return Reference to this builder for chaining + * @exception std::runtime_error if the operation fails + */ + UrlBuilder& set_component(CURLUPart part, + char const* value, + std::optional flags = std::nullopt); + + public: + /** + * @brief Construct an empty URL builder + * @exception std::runtime_error if initialization fails + */ + explicit UrlBuilder(); + + /** + * @brief Construct a URL builder from an existing URL string + * + * @param url The URL string to start with + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @exception std::runtime_error if the URL cannot be parsed + */ + explicit UrlBuilder(std::string const& url, + std::optional bitmask_url_flags = std::nullopt); + + /** + * @brief Construct a URL builder from parsed URL components + * + * @param components The parsed URL components to start with + * @param bitmask_url_flags Optional flags for URL handling + * @exception std::runtime_error if the components cannot be set + */ + explicit UrlBuilder(UrlParser::UrlComponents const& components, + std::optional bitmask_url_flags = std::nullopt); + + /** + * @brief Set the URL scheme (e.g., "http", "https", "ftp") + * + * @param scheme The scheme to set. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the scheme is invalid + * + * Example: + * @code{.cpp} + * builder.set_scheme("https"); + * @endcode + */ + UrlBuilder& set_scheme(std::optional const& scheme); + + /** + * @brief Set the hostname or IP address + * + * @param host The host to set. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the host is invalid + * + * Example: + * @code{.cpp} + * builder.set_host("api.example.com"); + * @endcode + */ + UrlBuilder& set_host(std::optional const& host); + + /** + * @brief Set the port number + * + * @param port The port to set as string. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the port is invalid + * + * Example: + * @code{.cpp} + * builder.set_port("8080"); + * @endcode + */ + UrlBuilder& set_port(std::optional const& port); + + /** + * @brief Set the path component + * + * @param path The path to set (should start with "/" for absolute paths). Use `std::nullopt` to + * clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the path is invalid + * + * Example: + * @code{.cpp} + * builder.set_path("/api/v1/users"); + * @endcode + */ + UrlBuilder& set_path(std::optional const& path); + + /** + * @brief Set the entire query string + * + * @param query The query string (without leading "?"). Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the query is invalid + * + * Example: + * @code{.cpp} + * builder.set_query("page=1&limit=10"); + * @endcode + */ + UrlBuilder& set_query(std::optional const& query); + + /** + * @brief Set the fragment identifier + * + * @param fragment The fragment (without leading "#"). 
Use `std::nullopt` to clear
+ * @return Reference to this builder for chaining
+ * @exception std::runtime_error if the fragment is invalid
+ *
+ * Example:
+ * @code{.cpp}
+ * builder.set_fragment("section-2");
+ * @endcode
+ */
+ UrlBuilder& set_fragment(std::optional const& fragment);
+
+ /**
+ * @brief Build the final URL string
+ *
+ * @param bitmask_component_flags Optional flags for URL formatting. Common flags:
+ * - CURLU_PUNYCODE: Convert host to punycode if needed
+ * - CURLU_NO_DEFAULT_PORT: Include port even if it's the default for the scheme
+ * @return The complete URL string
+ * @exception std::runtime_error if the URL cannot be built
+ *
+ * Example:
+ * @code{.cpp}
+ * std::string url = builder.build();
+ * @endcode
+ */
+ std::string build(std::optional bitmask_component_flags = std::nullopt) const;
+
+ static std::string build_manually(UrlParser::UrlComponents const& components);
+};
+
+/**
+ * @brief Provides URL encoding functionality
+ *
+ * The AWS object naming documentation
+ * (https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html) lists several
+ * types of special characters. In practice, handling them using libcurl is complex and described
+ * below.
+ *
+ * - Special characters that are safe for use in key names: "!-_.*'()". KvikIO includes !*'() in
+ * `aws_special_chars`, because for private buckets they cause AWS authentication by libcurl to
+ * fail.
+ *
+ * - Characters that might require special handling: "&$@=;/:+ ,? and 0-31, 127 ASCII
+ * characters". For /, KvikIO does not include it in `aws_special_chars`, because it can be legally
+ * used as a path separator. For the space character and ?, although KvikIO has them in
+ * `aws_special_chars`, users must manually percent-encode them to %20 and %3F, respectively.
+ * Otherwise, the space character will be considered malformed by libcurl, and ? causes ambiguity
+ * with the query string. For the control characters, KvikIO includes them all in
+ * `aws_special_chars`.
+ *
+ * - Characters to avoid: "\{^}%`]">[~<#| and 128-255 non-ASCII characters". KvikIO recommends
+ * that users avoid these characters in the URL. They are not included in `aws_special_chars`.
+ */
+class UrlEncoder {
+ public:
+ /**
+ * @brief Default set of special characters requiring encoding in AWS URLs
+ */
+ static constexpr char aws_special_chars[] = {
+ '!', '*', '\'', '(', ')', '&', '$', '@', '=', ';', ':', '+',
+ ' ', ',', '?', '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08',
+ '\x09', '\x0A', '\x0B', '\x0C', '\x0D', '\x0E', '\x0F', '\x10', '\x11', '\x12', '\x13', '\x14',
+ '\x15', '\x16', '\x17', '\x18', '\x19', '\x1A', '\x1B', '\x1C', '\x1D', '\x1E', '\x1F', '\x7F'};
+
+ /**
+ * @brief Percent-encodes specified characters in a URL path
+ *
+ * Performs percent-encoding (RFC 3986) on a given path string, encoding only the characters
+ * specified in the chars_to_encode parameter. Each encoded character is replaced with its
+ * percent-encoded equivalent (%XX where XX is the hexadecimal representation of the character).
+ *
+ * Only ASCII characters (0-127) are supported for encoding. Non-ASCII characters in
+ * chars_to_encode will be encoded to an empty string. Characters not in chars_to_encode are
+ * passed through unchanged.
+ * + * @param path The path string to encode + * @param chars_to_encode Set of characters that should be encoded (defaults to aws_special_chars) + * + * @return A new string with specified characters percent-encoded + * + * @code{.cpp} + * // Example usage with default AWS special characters + * std::string encoded = UrlEncoder::encode_path("/path/ with spaces"); + * // Result: "/path/%20with%20spaces" + * + * // Example with custom character set + * std::string encoded = UrlEncoder::encode_path("hello/world", "/"); + * // Result: "hello%2Fworld" + * @endcode + */ + static std::string encode_path(std::string_view path, + std::string_view chars_to_encode = std::string_view{ + aws_special_chars, sizeof(aws_special_chars)}); +}; + +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/utils.hpp b/cpp/include/kvikio/detail/utils.hpp new file mode 100644 index 0000000000..ae81c03dc6 --- /dev/null +++ b/cpp/include/kvikio/detail/utils.hpp @@ -0,0 +1,65 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +namespace kvikio::detail { + +/** + * @brief Round up `value` to multiples of `alignment` + * + * @param value Value to be rounded up + * @param alignment Must be a power of 2 + * @return Rounded result + */ +[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment); + +/** + * @brief Align the address `addr` up to the boundary of `alignment` + * + * @param addr Address to be aligned up + * @param alignment Must be a power of 2 + * @return Aligned address + */ +[[nodiscard]] void* align_up(void* addr, std::size_t alignment); + +/** + * @brief Round down `value` to multiples of `alignment` + * + * @param value Value to be rounded down + * @param alignment Must be a power of 2 + * @return Rounded result + */ +[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment); + +/** + * @brief Align the address `addr` down to the boundary of `alignment` + * + * @param addr Address to be aligned down + * @param alignment Must be a power of 2 + * @return Aligned address + */ +[[nodiscard]] void* align_down(void* addr, std::size_t alignment); + +/** + * @brief Whether `value` is a multiple of `alignment` + * + * @param value Value to be checked + * @param alignment Must be a power of 2 + * @return Boolean answer + */ +bool is_aligned(std::size_t value, std::size_t alignment); + +/** + * @brief Whether the address `addr` is a multiple of `alignment` + * + * @param addr Address to be checked + * @param alignment Must be a power of 2 + * @return Boolean answer + */ +bool is_aligned(void* addr, std::size_t alignment); + +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp index 29538f5b6a..604388476c 100644 --- a/cpp/include/kvikio/error.hpp +++ b/cpp/include/kvikio/error.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */
 #pragma once
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index 50e1c679c3..e74b8e3e20 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */
 #pragma once
@@ -20,18 +9,14 @@
 #include
 #include
-#include
-#include
-#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-#include
-#include
 #include
 #include
 #include
@@ -445,6 +430,16 @@ class FileHandle {
 * @return The associated compatibility mode manager.
 */
 const CompatModeManager& get_compat_mode_manager() const noexcept;
+
+ /**
+ * @brief Whether Direct I/O is supported on this file handle. This is determined by two factors:
+ * - Direct I/O support from the operating system and the file system
+ * - KvikIO global settings `auto_direct_io_read` and `auto_direct_io_write`. If both values are
+ * false, Direct I/O will not be supported on this file handle.
+ *
+ * @return Boolean answer.
+ */
+ bool is_direct_io_supported() const noexcept;
 };
 } // namespace kvikio
diff --git a/cpp/include/kvikio/file_utils.hpp b/cpp/include/kvikio/file_utils.hpp
index 778608a03b..e666b2b471 100644
--- a/cpp/include/kvikio/file_utils.hpp
+++ b/cpp/include/kvikio/file_utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */
 #pragma once
@@ -132,8 +121,8 @@ class CUFileHandleWrapper {
 * @param o_direct Append O_DIRECT to the open flags
 * @return oflags
 *
- * @throw std::invalid_argument if the specified flags are not supported.
- * @throw std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported.
+ * @exception std::invalid_argument if the specified flags are not supported.
+ * @exception std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported.
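+ *
+ * Example (illustrative; the flag string follows the `fopen(3)` convention):
+ * @code{.cpp}
+ * int oflags = open_fd_parse_flags("r", true);  // O_RDONLY with O_DIRECT appended
+ * @endcode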
 */
 int open_fd_parse_flags(std::string const& flags, bool o_direct);
@@ -160,6 +149,14 @@ int open_fd(std::string const& file_path, std::string const& flags, bool o_direc
 * @param file_descriptor Open file descriptor
 * @return The number of bytes
 */
 [[nodiscard]] std::size_t get_file_size(int file_descriptor);
+
+/**
+ * @brief Get file size given the file path
+ *
+ * @param file_path Path to a file
+ * @return The number of bytes
+ */
+[[nodiscard]] std::size_t get_file_size(std::string const& file_path);
@@ -180,4 +177,27 @@ std::pair get_page_cache_info(std::string const& file_
 * @sa `get_page_cache_info(std::string const&)` overload.
 */
 std::pair get_page_cache_info(int fd);
+
+/**
+ * @brief Clear the page cache
+ *
+ * @param reclaim_dentries_and_inodes Whether to free reclaimable slab objects which include
+ * dentries and inodes.
+ * - If `true`, equivalent to executing `/sbin/sysctl vm.drop_caches=3`;
+ * - If `false`, equivalent to executing `/sbin/sysctl vm.drop_caches=1`.
+ * @param clear_dirty_pages Whether to trigger the writeback process to clear the dirty pages. If
+ * `true`, `sync` will be called prior to cache clearing.
+ * @return Whether the page cache has been successfully cleared
+ *
+ * @note This function creates a child process and executes the cache clearing shell command in the
+ * following order:
+ * - Execute the command without `sudo` prefix. This is for the superuser and also for specially
+ * configured systems where unprivileged users cannot execute `/usr/bin/sudo` but can execute
+ * `/sbin/sysctl`. If this step succeeds, the function returns `true` immediately.
+ * - Execute the command with `sudo` prefix. This is for the general case where select
+ * unprivileged users have permission to run `/sbin/sysctl` with `sudo` prefix.
+ *
+ * @exception kvikio::GenericSystemError if the child process could not be created.
+ */
+bool clear_page_cache(bool reclaim_dentries_and_inodes = true, bool clear_dirty_pages = true);
 } // namespace kvikio
diff --git a/cpp/include/kvikio/function_wrapper.hpp b/cpp/include/kvikio/function_wrapper.hpp
new file mode 100644
index 0000000000..02a3260b13
--- /dev/null
+++ b/cpp/include/kvikio/function_wrapper.hpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace kvikio {
+/**
+ * @brief Type-erased function wrapper that can hold either a copyable or move-only callable. This
+ * class avoids the limitation of std::function, whose target must be copyable.
+ *
+ * @todo Use small object optimization to avoid heap allocation.
+ * @note This class may be deprecated in the future once C++23, which offers
+ * std::move_only_function, is adopted.
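+ *
+ * A minimal usage sketch (illustrative only; the move-only lambda below could not be stored in a
+ * std::function):
+ * @code{.cpp}
+ * kvikio::FunctionWrapper task{[p = std::make_unique<int>(42)] { std::printf("%d\n", *p); }};
+ * if (task) { task(); }  // invokes the stored callable
+ * @endcode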
+ */
+class SimpleFunctionWrapper {
+ private:
+ struct inner_base {
+ virtual void operator()() = 0;
+
+ virtual ~inner_base() = default;
+ };
+
+ template <typename F>
+ struct inner : inner_base {
+ using F_decay = std::decay_t<F>;
+ static_assert(std::is_invocable_r_v<void, F_decay>);
+
+ inner(F&& f) : _f(std::forward<F>(f)) {}
+
+ void operator()() override { std::invoke(_f); }
+
+ ~inner() override = default;
+
+ F_decay _f;
+ };
+
+ std::unique_ptr<inner_base> _callable;
+
+ public:
+ /**
+ * @brief Constructor. Create a function wrapper that can hold either a copyable or move-only
+ * callable.
+ *
+ * @tparam F Callable type.
+ * @param f Callable.
+ */
+ template <typename F>
+ SimpleFunctionWrapper(F&& f) : _callable(std::make_unique<inner<F>>(std::forward<F>(f)))
+ {
+ using F_decay = std::decay_t<F>;
+ static_assert(std::is_invocable_r_v<void, F_decay>);
+ }
+
+ SimpleFunctionWrapper() = default;
+
+ SimpleFunctionWrapper(SimpleFunctionWrapper&&) = default;
+ SimpleFunctionWrapper& operator=(SimpleFunctionWrapper&&) = default;
+
+ SimpleFunctionWrapper(const SimpleFunctionWrapper&) = delete;
+ SimpleFunctionWrapper& operator=(const SimpleFunctionWrapper&) = delete;
+
+ void operator()() { return _callable->operator()(); }
+
+ /**
+ * @brief Conversion function that tells whether the wrapper has a target (true) or is empty
+ * (false).
+ *
+ * @return Boolean answer.
+ */
+ operator bool() { return _callable != nullptr; }
+};
+
+using FunctionWrapper = SimpleFunctionWrapper;
+} // namespace kvikio
diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp
new file mode 100644
index 0000000000..8b6aeb12a3
--- /dev/null
+++ b/cpp/include/kvikio/hdfs.hpp
@@ -0,0 +1,72 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include
+
+#include
+
+namespace kvikio {
+
+/**
+ * @brief A remote endpoint for Apache Hadoop WebHDFS.
+ *
+ * This endpoint is for accessing HDFS files via the WebHDFS REST API over HTTP/HTTPS. If KvikIO is
+ * run within Docker, pass `--network host` to the `docker run` command to ensure proper name node
+ * connectivity.
+ */
+class WebHdfsEndpoint : public RemoteEndpoint {
+ private:
+ std::string _url;
+ std::optional<std::string> _username;
+
+ public:
+ /**
+ * @brief Create a WebHDFS endpoint from a URL.
+ *
+ * @param url The WebHDFS HTTP/HTTPS URL to the remote file.
+ * @param username Optional user name.
+ *
+ * @note The optional username for authentication is determined in the following descending
+ * priority order:
+ * - Function parameter `username`
+ * - Query string in URL (?user.name=xxx)
+ * - Environment variable `KVIKIO_WEBHDFS_USERNAME`
+ */
+ explicit WebHdfsEndpoint(std::string url, std::optional<std::string> username = std::nullopt);
+
+ /**
+ * @brief Create a WebHDFS endpoint from the host, port, file path, and optionally the username.
+ *
+ * @param host Host
+ * @param port Port
+ * @param remote_file_path Remote file path
+ * @param username Optional user name.
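+ *
+ * Example (illustrative; host, port, path, and username below are placeholders):
+ * @code{.cpp}
+ * kvikio::WebHdfsEndpoint endpoint{"namenode.example.com", "9870", "/data/file.bin", "alice"};
+ * @endcode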
+ * + * @note The optional username for authentication is determined in the following descending + * priority order: + * - Function parameter `username` + * - Environment variable `KVIKIO_WEBHDFS_USERNAME` + */ + explicit WebHdfsEndpoint(std::string host, + std::string port, + std::string remote_file_path, + std::optional username = std::nullopt); + + ~WebHdfsEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for the WebHDFS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; +} // namespace kvikio diff --git a/cpp/include/kvikio/http_status_codes.hpp b/cpp/include/kvikio/http_status_codes.hpp index 98ffb52324..aa6a0c06e2 100644 --- a/cpp/include/kvikio/http_status_codes.hpp +++ b/cpp/include/kvikio/http_status_codes.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp new file mode 100644 index 0000000000..fe8b71cbf4 --- /dev/null +++ b/cpp/include/kvikio/mmap.hpp @@ -0,0 +1,180 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#include +#include +#include + +namespace kvikio { + +/** + * @brief Handle of a memory-mapped file + * + * This utility class facilitates the use of file-backed memory by providing a performant method + * `pread()` to read a range of data into user-provided memory residing on the host or device. + * + * File-backed memory can be considered when a large number of nonadjacent file ranges (specified by + * the `offset` and `size` pair) are to be frequently accessed. It can potentially reduce memory + * usage due to demand paging (compared to reading the entire file with `read(2)`), and may improve + * I/O performance compared to frequent calls to `read(2)`. + */ +class MmapHandle { + private: + void* _buf{}; + std::size_t _initial_map_size{}; + std::size_t _initial_map_offset{}; + std::size_t _file_size{}; + std::size_t _map_offset{}; + std::size_t _map_size{}; + void* _map_addr{}; + bool _initialized{}; + int _map_protection{}; + int _map_flags{}; + FileWrapper _file_wrapper{}; + + /** + * @brief Validate and adjust the read arguments. + * + * @param size Size in bytes to read. 
If not specified, set it to the bytes from `offset` to + * the end of file + * @param offset File offset + * @return Adjusted size in bytes to read + * + * @exception std::out_of_range if the read region specified by `offset` and `size` is + * outside the initial region specified when the mapping handle was constructed + * @exception std::runtime_error if the mapping handle is closed + */ + std::size_t validate_and_adjust_read_args(std::optional const& size, + std::size_t offset); + + public: + /** + * @brief Construct an empty memory-mapped file + * + */ + MmapHandle() noexcept = default; + + /** + * @brief Construct a new memory-mapped file + * + * @param file_path File path + * @param flags Open flags (see also `fopen(3)`): + * - "r": "open for reading (default)" + * - "w": "open for writing, truncating the file first" + * - "a": "open for writing, appending to the end of file if it exists" + * - "+": "open for updating (reading and writing)" + * @param initial_map_size Size in bytes of the mapped region. Must be greater than 0. If not + * specified, map the region starting from `initial_map_offset` to the end of file + * @param initial_map_offset File offset of the mapped region + * @param mode Access mode + * @param map_flags Flags to be passed to the system call `mmap`. See `mmap(2)` for details + * @exception std::out_of_range if `initial_map_offset` (left bound of the mapped region) is equal + * to or greater than the file size + * @exception std::out_of_range if the sum of `initial_map_offset` and `initial_map_size` (right + * bound of the mapped region) is greater than the file size + * @exception std::invalid_argument if `initial_map_size` is given but is 0 + */ + MmapHandle(std::string const& file_path, + std::string const& flags = "r", + std::optional initial_map_size = std::nullopt, + std::size_t initial_map_offset = 0, + mode_t mode = FileHandle::m644, + std::optional map_flags = std::nullopt); + + MmapHandle(MmapHandle const&) = delete; + MmapHandle& operator=(MmapHandle const&) = delete; + MmapHandle(MmapHandle&& o) noexcept; + MmapHandle& operator=(MmapHandle&& o) noexcept; + ~MmapHandle() noexcept; + + /** + * @brief Size in bytes of the mapped region when the mapping handle was constructed + * + * @return Initial size of the mapped region + */ + [[nodiscard]] std::size_t initial_map_size() const noexcept; + + /** + * @brief File offset of the mapped region when the mapping handle was constructed + * + * @return Initial file offset of the mapped region + */ + [[nodiscard]] std::size_t initial_map_offset() const noexcept; + + /** + * @brief Get the file size if the file is open. Returns 0 if the file is closed. + * + * The behavior of this method is consistent with `FileHandle::nbytes`. + * + * @return The file size in bytes + */ + [[nodiscard]] std::size_t file_size() const; + + /** + * @brief Alias of `file_size` + * + * @return The file size in bytes + */ + [[nodiscard]] std::size_t nbytes() const; + + /** + * @brief Whether the mapping handle is closed + * + * @return Boolean answer + */ + [[nodiscard]] bool closed() const noexcept; + + /** + * @brief Close the mapping handle if it is open; do nothing otherwise + */ + void close() noexcept; + + /** + * @brief Sequential read `size` bytes from the file (with the offset `offset`) to the + * destination buffer `buf` + * + * @param buf Address of the host or device memory (destination buffer) + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. 
If not + * specified, read starts from `offset` to the end of file + * @param offset File offset + * @return Number of bytes that have been read + * + * @exception std::out_of_range if the read region specified by `offset` and `size` is + * outside the initial region specified when the mapping handle was constructed + * @exception std::runtime_error if the mapping handle is closed + */ + std::size_t read(void* buf, + std::optional size = std::nullopt, + std::size_t offset = 0); + + /** + * @brief Parallel read `size` bytes from the file (with the offset `offset`) to the + * destination buffer `buf` + * + * @param buf Address of the host or device memory (destination buffer) + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file + * @param offset File offset + * @param task_size Size of each task in bytes + * @return Future that on completion returns the size of bytes that were successfully read. + * + * @exception std::out_of_range if the read region specified by `offset` and `size` is + * outside the initial region specified when the mapping handle was constructed + * @exception std::runtime_error if the mapping handle is closed + * + * @note The `std::future` object's `wait()` or `get()` should not be called after the lifetime of + * the MmapHandle object ends. Otherwise, the behavior is undefined. + */ + std::future pread(void* buf, + std::optional size = std::nullopt, + std::size_t offset = 0, + std::size_t task_size = defaults::task_size()); +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp deleted file mode 100644 index a139cae718..0000000000 --- a/cpp/include/kvikio/posix_io.hpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace kvikio::detail { - -/** - * @brief Type of the IO operation. - */ -enum class IOOperationType : uint8_t { - READ, ///< POSIX read. - WRITE, ///< POSIX write. -}; - -/** - * @brief Specifies whether all requested bytes are to be processed or not. - */ -enum class PartialIO : uint8_t { - YES, ///< POSIX read/write is called only once, which may not process all bytes requested. - NO, ///< POSIX read/write is called repeatedly until all requested bytes are processed. -}; - -/** - * @brief Singleton class to retrieve a CUDA stream for device-host copying - * - * Call `StreamsByThread::get` to get the CUDA stream assigned to the current - * CUDA context and thread. - */ -class StreamsByThread { - private: - std::map, CUstream> _streams; - - public: - StreamsByThread() = default; - - // Here we intentionally do not destroy in the destructor the CUDA resources - // (e.g. CUstream) with static storage duration, but instead let them leak - // on program termination. 
This is to prevent undefined behavior in CUDA. See - // - // This also prevents crash (segmentation fault) if clients call - // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination. - ~StreamsByThread() = default; - - KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id); - - static CUstream get(); - - StreamsByThread(StreamsByThread const&) = delete; - StreamsByThread& operator=(StreamsByThread const&) = delete; - StreamsByThread(StreamsByThread&& o) = delete; - StreamsByThread& operator=(StreamsByThread&& o) = delete; -}; - -/** - * @brief Read or write host memory to or from disk using POSIX - * - * @tparam Operation Whether the operation is a read or a write. - * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of - * `count` bytes are read or written. - * @param fd File descriptor - * @param buf Buffer to write - * @param count Number of bytes to write - * @param offset File offset - * @return The number of bytes read or written (always gather than zero) - */ -template -ssize_t posix_host_io(int fd, void const* buf, size_t count, off_t offset) -{ - off_t cur_offset = offset; - size_t byte_remaining = count; - char* buffer = const_cast(static_cast(buf)); - while (byte_remaining > 0) { - ssize_t nbytes = 0; - if constexpr (Operation == IOOperationType::READ) { - nbytes = ::pread(fd, buffer, byte_remaining, cur_offset); - } else { - nbytes = ::pwrite(fd, buffer, byte_remaining, cur_offset); - } - if (nbytes == -1) { - std::string const name = (Operation == IOOperationType::READ) ? "pread" : "pwrite"; - KVIKIO_EXPECT(errno != EBADF, "POSIX error: Operation not permitted"); - KVIKIO_FAIL("POSIX error on " + name + ": " + strerror(errno)); - } - if constexpr (Operation == IOOperationType::READ) { - KVIKIO_EXPECT(nbytes != 0, "POSIX error on pread: EOF"); - } - if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes; } - buffer += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - cur_offset += nbytes; - byte_remaining -= nbytes; - } - return convert_size2ssize(count); -} - -/** - * @brief Read or write device memory to or from disk using POSIX - * - * @tparam Operation Whether the operation is a read or a write. - * @param fd File descriptor - * @param devPtr_base Device pointer to read or write to. - * @param size Number of bytes to read or write. - * @param file_offset Byte offset to the start of the file. - * @param devPtr_offset Byte offset to the start of the device pointer. - * @return Number of bytes read or written. 
- */ -template -std::size_t posix_device_io(int fd, - void const* devPtr_base, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset) -{ - auto alloc = AllocRetain::instance().get(); - CUdeviceptr devPtr = convert_void2deviceptr(devPtr_base) + devPtr_offset; - off_t cur_file_offset = convert_size2off(file_offset); - off_t byte_remaining = convert_size2off(size); - off_t const chunk_size2 = convert_size2off(alloc.size()); - - // Get a stream for the current CUDA context and thread - CUstream stream = StreamsByThread::get(); - - while (byte_remaining > 0) { - off_t const nbytes_requested = std::min(chunk_size2, byte_remaining); - ssize_t nbytes_got = nbytes_requested; - if constexpr (Operation == IOOperationType::READ) { - nbytes_got = posix_host_io( - fd, alloc.get(), nbytes_requested, cur_file_offset); - CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - } else { // Is a write operation - CUDA_DRIVER_TRY( - cudaAPI::instance().MemcpyDtoHAsync(alloc.get(), devPtr, nbytes_requested, stream)); - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - posix_host_io( - fd, alloc.get(), nbytes_requested, cur_file_offset); - } - cur_file_offset += nbytes_got; - devPtr += nbytes_got; - byte_remaining -= nbytes_got; - } - return size; -} - -/** - * @brief Read from disk to host memory using POSIX - * - * If `size` or `file_offset` isn't aligned with `page_size` then - * `fd` cannot have been opened with the `O_DIRECT` flag. - * - * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of - * `count` bytes are read. - * @param fd File descriptor - * @param buf Base address of buffer in host memory. - * @param size Size in bytes to read. - * @param file_offset Offset in the file to read from. - * @return Size of bytes that were successfully read. - */ -template -std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t file_offset) -{ - KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_host_io( - fd, buf, size, convert_size2off(file_offset)); -} - -/** - * @brief Write host memory to disk using POSIX - * - * If `size` or `file_offset` isn't aligned with `page_size` then - * `fd` cannot have been opened with the `O_DIRECT` flag. - * - * @tparam ioDataCompletionLevel Whether all requested data are processed or not. If `FULL`, all of - * `count` bytes are written. - * @param fd File descriptor - * @param buf Base address of buffer in host memory. - * @param size Size in bytes to write. - * @param file_offset Offset in the file to write to. - * @return Size of bytes that were successfully read. - */ -template -std::size_t posix_host_write(int fd, void const* buf, std::size_t size, std::size_t file_offset) -{ - KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_host_io( - fd, buf, size, convert_size2off(file_offset)); -} - -/** - * @brief Read from disk to device memory using POSIX - * - * If `size` or `file_offset` isn't aligned with `page_size` then - * `fd` cannot have been opened with the `O_DIRECT` flag. - * - * @param fd File descriptor - * @param devPtr_base Base address of buffer in device memory. - * @param size Size in bytes to read. - * @param file_offset Offset in the file to read from. - * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into. - * @return Size of bytes that were successfully read. 
- */ -std::size_t posix_device_read(int fd, - void const* devPtr_base, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset); - -/** - * @brief Write device memory to disk using POSIX - * - * If `size` or `file_offset` isn't aligned with `page_size` then - * `fd` cannot have been opened with the `O_DIRECT` flag. - * - * @param fd File descriptor - * @param devPtr_base Base address of buffer in device memory. - * @param size Size in bytes to write. - * @param file_offset Offset in the file to write to. - * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write into. - * @return Size of bytes that were successfully written. - */ -std::size_t posix_device_write(int fd, - void const* devPtr_base, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset); - -} // namespace kvikio::detail diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b67ae5560e..416e374291 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -1,35 +1,18 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include #include #include -#include #include #include -#include -#include -#include #include #include #include -#include -#include #include struct curl_slist; @@ -38,6 +21,29 @@ namespace kvikio { class CurlHandle; // Prototype +/** + * @brief Types of remote file endpoints supported by KvikIO. + * + * This enum defines the different protocols and services that can be used to access remote files. + * It is used to specify or detect the type of remote endpoint when opening files. + */ +enum class RemoteEndpointType : uint8_t { + AUTO, ///< Automatically detect the endpoint type from the URL. KvikIO will attempt to infer the + ///< appropriate protocol based on the URL format. + S3, ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment + ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be + ///< set. + S3_PUBLIC, ///< AWS S3 endpoint for publicly accessible objects. No credentials required as the + ///< objects have public read permissions enabled. Used for open datasets and public + ///< buckets. + S3_PRESIGNED_URL, ///< AWS S3 endpoint using a presigned URL. No credentials required as + ///< authentication is embedded in the URL with time-limited access. + WEBHDFS, ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for + ///< accessing files stored in HDFS over HTTP/HTTPS. + HTTP, ///< Generic HTTP or HTTPS endpoint for accessing files from web servers. This is used for + ///< standard web resources that do not fit the other specific categories. +}; + /** * @brief Abstract base class for remote endpoints. 
* @@ -47,7 +53,13 @@ class CurlHandle; // Prototype * its own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { + protected: + RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO}; + RemoteEndpoint(RemoteEndpointType remote_endpoint_type); + public: + virtual ~RemoteEndpoint() = default; + /** * @brief Set needed connection options on a curl handle. * @@ -64,11 +76,32 @@ class RemoteEndpoint { */ virtual std::string str() const = 0; - virtual ~RemoteEndpoint() = default; + /** + * @brief Get the size of the remote file. + * + * @return The file size + */ + virtual std::size_t get_file_size() = 0; + + /** + * @brief Set up the range request in order to read part of a file given the file offset and read + * size. + */ + virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; + + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; }; /** - * @brief A remote endpoint using http. + * @brief A remote endpoint for HTTP/HTTPS resources + * + * This endpoint is for accessing files via standard HTTP/HTTPS protocols without any specialized + * authentication. */ class HttpEndpoint : public RemoteEndpoint { private: @@ -81,13 +114,27 @@ class HttpEndpoint : public RemoteEndpoint { * @param url The full http url to the remote file. */ HttpEndpoint(std::string url); + + ~HttpEndpoint() override = default; void setopt(CurlHandle& curl) override; std::string str() const override; - ~HttpEndpoint() override = default; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for HTTP/HTTPS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** - * @brief A remote endpoint using AWS's S3 protocol. + * @brief A remote endpoint for AWS S3 storage requiring credentials + * + * This endpoint is for accessing private S3 objects using AWS credentials (access key, secret key, + * region and optional session token). */ class S3Endpoint : public RemoteEndpoint { private: @@ -96,28 +143,11 @@ class S3Endpoint : public RemoteEndpoint { std::string _aws_userpwd; curl_slist* _curl_header_list{}; - /** - * @brief Unwrap an optional parameter, obtaining a default from the environment. - * - * If not nullopt, the optional's value is returned. Otherwise, the environment - * variable `env_var` is used. If that also doesn't have a value: - * - if `err_msg` is empty, the empty string is returned. - * - if `err_msg` is not empty, `std::invalid_argument(`err_msg`)` is thrown. - * - * @param value The value to unwrap. - * @param env_var The name of the environment variable to check if `value` isn't set. - * @param err_msg The error message to throw on error or the empty string. - * @return The parsed AWS argument or the empty string. - */ - static std::string unwrap_or_default(std::optional aws_arg, - std::string const& env_var, - std::string const& err_msg = ""); - public: /** * @brief Get url from a AWS S3 bucket and object name. * - * @throws std::invalid_argument if no region is specified and no default region is + * @exception std::invalid_argument if no region is specified and no default region is * specified in the environment. * * @param bucket_name The name of the S3 bucket. 
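Each concrete endpoint above exposes a static `is_url_valid` check, which is likely the building block for the AUTO endpoint detection described further below. A brief sketch of using it directly (illustrative only; the URL is a placeholder):

```cpp
// HttpEndpoint::is_url_valid and the HttpEndpoint(std::string) constructor
// are both declared in this header.
if (kvikio::HttpEndpoint::is_url_valid("https://example.com/data.bin")) {
  kvikio::HttpEndpoint endpoint{"https://example.com/data.bin"};
}
```
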
@@ -137,7 +167,7 @@ class S3Endpoint : public RemoteEndpoint { /** * @brief Given an url like "s3:///", return the name of the bucket and object. * - * @throws std::invalid_argument if url is ill-formed or is missing the bucket or object name. + * @exception std::invalid_argument if url is ill-formed or is missing the bucket or object name. * * @param s3_url S3 url. * @return Pair of strings: [bucket-name, object-name]. @@ -189,9 +219,75 @@ class S3Endpoint : public RemoteEndpoint { std::optional aws_endpoint_url = std::nullopt, std::optional aws_session_token = std::nullopt); + ~S3Endpoint() override; void setopt(CurlHandle& curl) override; std::string str() const override; - ~S3Endpoint() override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints (excluding presigned URL). + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for publicly accessible S3 objects without authentication + * + * This endpoint is for accessing S3 objects configured with public read permissions, + * requiring no authentication. Supports AWS S3 services with anonymous access enabled. + */ +class S3PublicEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3PublicEndpoint(std::string url); + + ~S3PublicEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 public endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for AWS S3 storage using presigned URLs. + * + * This endpoint is for accessing S3 objects via presigned URLs, which provide time-limited access + * without requiring AWS credentials on the client side. + */ +class S3EndpointWithPresignedUrl : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3EndpointWithPresignedUrl(std::string presigned_url); + + ~S3EndpointWithPresignedUrl() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints with presigned URL. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -203,6 +299,88 @@ class RemoteHandle { std::size_t _nbytes; public: + /** + * @brief Create a remote file handle from a URL. + * + * This function creates a RemoteHandle for reading data from various remote endpoints + * including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, and WebHDFS. + * The endpoint type can be automatically detected from the URL or explicitly specified. + * + * @param url The URL of the remote file. Supported formats include: + * - S3 with credentials + * - S3 presigned URL + * - WebHDFS + * - HTTP/HTTPS + * @param remote_endpoint_type The type of remote endpoint. Default is RemoteEndpointType::AUTO + * which automatically detects the endpoint type from the URL. 
Can be explicitly set to
+ * RemoteEndpointType::S3, RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, or
+ * RemoteEndpointType::HTTP to force a specific endpoint type.
+ * @param allow_list Optional list of allowed endpoint types. If provided:
+ * - If remote_endpoint_type is RemoteEndpointType::AUTO, types are tried in the exact order
+ * specified until a match is found.
+ * - If an explicit type is specified, it must be in this list; otherwise an exception is
+ * thrown.
+ *
+ * If not provided, defaults to all supported types in this order: RemoteEndpointType::S3,
+ * RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, and
+ * RemoteEndpointType::HTTP.
+ * @param nbytes Optional file size in bytes. If not provided, the function sends an additional
+ * request to the server to query the file size.
+ * @return A RemoteHandle object that can be used to read data from the remote file.
+ * @exception std::runtime_error If:
+ * - The URL is malformed or missing required components.
+ * - RemoteEndpointType::AUTO mode is used and the URL doesn't match any supported endpoint
+ * type.
+ * - The specified endpoint type is not in the `allow_list`.
+ * - The URL is invalid for the specified endpoint type.
+ * - Unable to connect to the remote server or determine file size (when nbytes not provided).
+ *
+ * Example:
+ * - Auto-detect endpoint type from URL
+ * @code{.cpp}
+ * auto handle = kvikio::RemoteHandle::open(
+ *   "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256"
+ *   "&X-Amz-Credential=...&X-Amz-Signature=..."
+ * );
+ * @endcode
+ *
+ * - Open S3 file with explicit endpoint type
+ * @code{.cpp}
+ * auto handle = kvikio::RemoteHandle::open(
+ *   "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin",
+ *   kvikio::RemoteEndpointType::S3
+ * );
+ * @endcode
+ *
+ * - Restrict endpoint type candidates
+ * @code{.cpp}
+ * std::vector<kvikio::RemoteEndpointType> allow_list = {
+ *   kvikio::RemoteEndpointType::HTTP,
+ *   kvikio::RemoteEndpointType::S3_PRESIGNED_URL
+ * };
+ * auto handle = kvikio::RemoteHandle::open(
+ *   user_provided_url,
+ *   kvikio::RemoteEndpointType::AUTO,
+ *   allow_list
+ * );
+ * @endcode
+ *
+ * - Provide known file size to skip HEAD request
+ * @code{.cpp}
+ * auto handle = kvikio::RemoteHandle::open(
+ *   "https://example.com/large-file.bin",
+ *   kvikio::RemoteEndpointType::HTTP,
+ *   std::nullopt,
+ *   1024 * 1024 * 100 // 100 MB
+ * );
+ * @endcode
+ */
+ static RemoteHandle open(std::string url,
+ RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO,
+ std::optional<std::vector<RemoteEndpointType>> allow_list = std::nullopt,
+ std::optional<std::size_t> nbytes = std::nullopt);
+
 /**
 * @brief Create a new remote handle from an endpoint and a file size.
 *
@@ -226,10 +404,18 @@ RemoteHandle(RemoteHandle const&) = delete;
 RemoteHandle& operator=(RemoteHandle const&) = delete;
+
+ /**
+ * @brief Get the type of the remote endpoint.
+ *
+ * @return The type of the remote endpoint.
+ */
+ [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept;
+
 /**
 * @brief Get the file size.
 *
- * Note, this is very fast, no communication needed.
+ * Note, the file size is retrieved at construction, so this method is very fast; no communication
+ * is needed.
 *
 * @return The number of bytes.
 */
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index 9aaac08827..22c8276817 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -1,25 +1,73 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */
 #pragma once
-#include
+#include
+#include
+
+#include
 #include
+#include
 namespace kvikio {
+namespace detail {
+/**
+ * @brief Non-templated class to hold any callable that returns CUresult
+ */
+class AnyCallable {
+ private:
+ std::any _callable;
+
+ public:
+ /**
+ * @brief Assign a callable to the object
+ *
+ * @tparam Callable A callable that must return CUresult
+ * @param c The callable object
+ */
+ template <typename Callable>
+ void set(Callable&& c)
+ {
+ _callable = std::function(c);
+ }
+
+ /**
+ * @brief Destroy the contained callable
+ */
+ void reset() { _callable.reset(); }
+
+ /**
+ * @brief Invoke the contained callable
+ *
+ * @tparam Args Types of the arguments. Must exactly match the parameter types of the contained
+ * callable. For example, if the parameter is `std::size_t*`, an argument of `nullptr` must be
+ * explicitly cast to `std::size_t*`.
+ * @param args Arguments to be passed
+ * @return CUDA driver API error code
+ * @exception std::bad_any_cast if any argument type does not exactly match the parameter type of
+ * the contained callable.
+ */
+ template <typename... Args>
+ CUresult operator()(Args...
args)
+  {
+    using T = std::function<CUresult(Args...)>;
+    if (!_callable.has_value()) {
+      throw std::runtime_error("No callable has been assigned to the wrapper yet.");
+    }
+    return std::any_cast<T>(_callable)(args...);
+  }
+
+  /**
+   * @brief Check if the object holds a callable
+   */
+  operator bool() const { return _callable.has_value(); }
+};
+
+}  // namespace detail
+
 /**
  * @brief Shim layer of the cuda C-API
  *
@@ -29,25 +77,36 @@ namespace kvikio {
  */
 class cudaAPI {
  public:
+  int driver_version{0};
+
   decltype(cuInit)* Init{nullptr};
   decltype(cuMemHostAlloc)* MemHostAlloc{nullptr};
   decltype(cuMemFreeHost)* MemFreeHost{nullptr};
+  decltype(cuMemHostRegister)* MemHostRegister{nullptr};
+  decltype(cuMemHostUnregister)* MemHostUnregister{nullptr};
   decltype(cuMemcpyHtoDAsync)* MemcpyHtoDAsync{nullptr};
   decltype(cuMemcpyDtoHAsync)* MemcpyDtoHAsync{nullptr};
+
+  detail::AnyCallable MemcpyBatchAsync{};
+
   decltype(cuPointerGetAttribute)* PointerGetAttribute{nullptr};
   decltype(cuPointerGetAttributes)* PointerGetAttributes{nullptr};
   decltype(cuCtxPushCurrent)* CtxPushCurrent{nullptr};
   decltype(cuCtxPopCurrent)* CtxPopCurrent{nullptr};
   decltype(cuCtxGetCurrent)* CtxGetCurrent{nullptr};
+  decltype(cuCtxGetDevice)* CtxGetDevice{nullptr};
   decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
   decltype(cuGetErrorName)* GetErrorName{nullptr};
   decltype(cuGetErrorString)* GetErrorString{nullptr};
   decltype(cuDeviceGet)* DeviceGet{nullptr};
+  decltype(cuDeviceGetCount)* DeviceGetCount{nullptr};
+  decltype(cuDeviceGetAttribute)* DeviceGetAttribute{nullptr};
   decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
   decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};
   decltype(cuStreamSynchronize)* StreamSynchronize{nullptr};
   decltype(cuStreamCreate)* StreamCreate{nullptr};
   decltype(cuStreamDestroy)* StreamDestroy{nullptr};
+  decltype(cuDriverGetVersion)* DriverGetVersion{nullptr};

 private:
  cudaAPI();
@@ -66,10 +125,6 @@ class cudaAPI {
 *
 * @return The boolean answer
 */
-#ifdef KVIKIO_CUDA_FOUND
bool is_cuda_available();
-#else
-constexpr bool is_cuda_available() { return false; }
-#endif

} // namespace kvikio
diff --git a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp
deleted file mode 100644
index ee3c855efe..0000000000
--- a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-/**
- * In order to support compilation when `cuda.h` isn't available, we
- * wrap all use of cuda in a `#ifdef KVIKIO_CUDA_FOUND` guard.
- *
- * The motivation here is to make KvikIO work in all circumstances so
- * that libraries doesn't have to implement there own fallback solutions.
- */
-#ifdef KVIKIO_CUDA_FOUND
-#include <cuda.h>
-#else
-
-// If CUDA isn't defined, we define some of the data types here.
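To make the exact-argument-type requirement on AnyCallable::operator() concrete, a small standalone sketch; fake_entry is a stand-in for a dlsym'd driver symbol and is not part of the patch:

```cpp
#include <cstddef>

#include <cuda.h>
#include <kvikio/shim/cuda.hpp>

// Stand-in for a driver entry point obtained via dlsym().
CUresult fake_entry(void* dst, std::size_t* n) { return CUDA_SUCCESS; }

void demo()
{
  kvikio::detail::AnyCallable fn;
  fn.set(&fake_entry);  // stored as std::function<CUresult(void*, std::size_t*)>

  // Argument types must match the stored parameter types exactly, so nullptr
  // needs explicit casts; fn(nullptr, nullptr) would throw std::bad_any_cast.
  fn(static_cast<void*>(nullptr), static_cast<std::size_t*>(nullptr));
}
```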
-// Notice, the functions and constant values don't need to match the CUDA -// definitions, but the types *do*, since downstream libraries dlsym()-ing -// the symbols at runtime rely on accurate type definitions. If we mismatch -// here, then those libraries will get "mismatched type alias redefinition" -// errors when they include our headers. - -#if defined(_WIN64) || defined(__LP64__) -// Don't use uint64_t, we want to match the driver headers exactly -using CUdeviceptr = unsigned long long; -#else -using CUdeviceptr = unsigned int; -#endif -static_assert(sizeof(CUdeviceptr) == sizeof(void*)); - -using CUresult = int; -using CUdevice = int; -using CUcontext = struct CUctx_st*; -using CUstream = struct CUstream_st*; - -#define CUDA_ERROR_STUB_LIBRARY 0 -#define CUDA_SUCCESS 0 -#define CUDA_ERROR_INVALID_VALUE 0 -#define CU_POINTER_ATTRIBUTE_CONTEXT 0 -#define CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 0 -#define CU_POINTER_ATTRIBUTE_DEVICE_POINTER 0 -#define CU_MEMHOSTALLOC_PORTABLE 0 -#define CU_STREAM_DEFAULT 0 - -CUresult cuInit(...); -CUresult cuMemHostAlloc(...); -CUresult cuMemFreeHost(...); -CUresult cuMemcpyHtoDAsync(...); -CUresult cuMemcpyDtoHAsync(...); -CUresult cuPointerGetAttribute(...); -CUresult cuPointerGetAttributes(...); -CUresult cuCtxPushCurrent(...); -CUresult cuCtxPopCurrent(...); -CUresult cuCtxGetCurrent(...); -CUresult cuMemGetAddressRange(...); -CUresult cuGetErrorName(...); -CUresult cuGetErrorString(...); -CUresult cuDeviceGet(...); -CUresult cuDevicePrimaryCtxRetain(...); -CUresult cuDevicePrimaryCtxRelease(...); -CUresult cuStreamCreate(...); -CUresult cuStreamDestroy(...); -CUresult cuStreamSynchronize(...); - -#endif diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp index 4823fd42f5..721b2f4e17 100644 --- a/cpp/include/kvikio/shim/cufile.hpp +++ b/cpp/include/kvikio/shim/cufile.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -63,15 +52,6 @@ class cuFileAPI { private: cuFileAPI(); -#ifdef KVIKIO_CUFILE_FOUND - // Notice, we have to close the driver at program exit (if we opened it) even though we are - // not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the - // driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but - // this seems to be the lesser of two evils. - // [1] - ~cuFileAPI(); -#endif - public: cuFileAPI(cuFileAPI const&) = delete; void operator=(cuFileAPI const&) = delete; @@ -128,11 +108,7 @@ bool is_cufile_available() noexcept; * * @return The version (1000*major + 10*minor) or zero if older than 1080. */ -#ifdef KVIKIO_CUFILE_FOUND int cufile_version() noexcept; -#else -constexpr int cufile_version() noexcept { return 0; } -#endif /** * @brief Check if cuFile's batch API is available. 
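Since is_cufile_available() and cufile_version() are now declared unconditionally rather than compiled out behind KVIKIO_CUFILE_FOUND, a caller-side guard looks the same with or without cuFile installed; a minimal sketch:

```cpp
#include <kvikio/shim/cufile.hpp>

// Returns true when the cuFile driver can be used; both queries now resolve
// at runtime instead of being decided at compile time.
bool can_use_cufile()
{
  return kvikio::is_cufile_available() && kvikio::cufile_version() > 0;
}
```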
diff --git a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp index 66f5adbaf3..9aff7248c4 100644 --- a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp +++ b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp @@ -1,23 +1,12 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include -#include +#include /** * In order to support compilation when `cufile.h` isn't available, we diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index 0a98cecb77..150ac69e6e 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp index 03c9773e9f..7c6a4e6d35 100644 --- a/cpp/include/kvikio/shim/utils.hpp +++ b/cpp/include/kvikio/shim/utils.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -48,15 +37,6 @@ namespace kvikio { */ void* load_library(std::string const& name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE); -/** - * @brief Load shared library - * - * @param names Vector of names to try when loading shared library. - * @return The library handle. 
- */
-void* load_library(std::vector<std::string> const& names,
-                   int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE);
-
 /**
  * @brief Get symbol using `dlsym`
  *
diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp
index f48f9955f7..12bef04342 100644
--- a/cpp/include/kvikio/stream.hpp
+++ b/cpp/include/kvikio/stream.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#pragma once
diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp
new file mode 100644
index 0000000000..0bed02ed54
--- /dev/null
+++ b/cpp/include/kvikio/threadpool_simple.hpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/**
+ * @file
+ * @brief A simple, header-only thread pool that uses per-thread task queues. Synchronization only
+ * exists between the main thread and each worker thread, not among the worker threads
+ * themselves. Inspired by the BS threadpool that KvikIO has been using.
+ */
+
+namespace kvikio {
+/**
+ * @brief Utility class for the calling thread.
+ */
+class ThisThread {
+ public:
+  /**
+   * @brief Check if the calling thread is from ThreadPoolSimple.
+   *
+   * @return Boolean answer.
+   */
+  static bool is_from_pool() { return get_thread_idx().has_value(); }
+
+  /**
+   * @brief Get the index of the calling thread.
+   *
+   * If the calling thread is not from ThreadPoolSimple, return std::nullopt. Otherwise, return the
+   * thread index ranging from 0 to (N-1) where N is the thread count.
+   *
+   * @return Index of the calling thread.
+   */
+  static std::optional<std::size_t> get_thread_idx() { return this_thread_idx; }
+
+ private:
+  friend class ThreadPoolSimple;
+
+  /**
+   * @brief Set the index of the calling thread.
+   *
+   * @param thread_idx Index of the calling thread.
+   */
+  static void set_thread_idx(std::size_t thread_idx) { this_thread_idx = thread_idx; }
+
+  inline static thread_local std::optional<std::size_t> this_thread_idx{std::nullopt};
+};
+
+/**
+ * @brief Struct to hold per-thread data.
+ */
+struct Worker {
+  std::thread thread;
+  std::condition_variable task_available_cv;
+  std::condition_variable task_done_cv;
+  std::mutex task_mutex;
+  std::queue<FunctionWrapper> task_queue;
+  bool should_stop{false};
+};
+
+/**
+ * @brief A simple thread pool that uses per-thread task queues.
+ *
+ * Each worker thread has its own task queue, mutex and condition variable. The per-thread
+ * synchronization primitives (mutex and condition variable) are shared with the main thread. Tasks
+ * are submitted to the worker threads in a round-robin fashion, unless the target thread index is
+ * specified by the user.
+ *
+ * Example:
+ * ```cpp
+ * // Create a thread pool with 4 threads, and pass an optional callable with which to initialize
+ * // each worker thread.
+ * kvikio::ThreadPoolSimple thread_pool{4, [] {
+ *   // Initialize worker thread
+ * }};
+ *
+ * // Submit the task to the thread pool. The worker thread is selected automatically in a
+ * // round-robin fashion.
+ * auto fut = thread_pool.submit_task([] {
+ *   // Task logic
+ * });
+ *
+ * // Submit the task to a specific thread (here thread 0).
+ * auto fut2 = thread_pool.submit_task_to_thread([] {
+ *   // Task logic
+ * }, 0);
+ *
+ * // Wait until the result is ready.
+ * auto result = fut.get();
+ * ```
+ */
+class ThreadPoolSimple {
+ public:
+  /**
+   * @brief Constructor. Create a thread pool.
+   *
+   * @tparam F Type of the user-defined worker thread initialization.
+   * @param num_threads Number of threads.
+   * @param worker_thread_init_func User-defined worker thread initialization.
+   */
+  template <typename F>
+  ThreadPoolSimple(unsigned int num_threads, F&& worker_thread_init_func)
+    : _num_threads{num_threads},
+      _worker_thread_init_func{std::forward<F>(worker_thread_init_func)}
+  {
+    create_threads();
+  }
+
+  /**
+   * @brief Constructor, without user-defined worker thread initialization.
+   *
+   * @param num_threads Number of threads.
+   */
+  ThreadPoolSimple(unsigned int num_threads) : ThreadPoolSimple(num_threads, FunctionWrapper{}) {}
+
+  /**
+   * @brief Destructor. Wait until all worker threads complete their tasks, then join the threads.
+   */
+  ~ThreadPoolSimple()
+  {
+    wait();
+    destroy_threads();
+  }
+
+  /**
+   * @brief Wait until all worker threads complete their tasks. Then join the threads, and
+   * reinitialize the thread pool with new threads.
+   *
+   * @tparam F Type of the user-defined worker thread initialization.
+   * @param num_threads Number of threads.
+   * @param worker_thread_init_func User-defined worker thread initialization.
+   */
+  template <typename F>
+  void reset(unsigned int num_threads, F&& worker_thread_init_func)
+  {
+    wait();
+    destroy_threads();
+
+    _num_threads             = num_threads;
+    _worker_thread_init_func = std::forward<F>(worker_thread_init_func);
+    create_threads();
+  }
+
+  /**
+   * @brief Overload of reset(), without user-defined worker thread initialization.
+   *
+   * @param num_threads Number of threads.
+   */
+  void reset(unsigned int num_threads) { reset(num_threads, FunctionWrapper{}); }
+
+  /**
+   * @brief Block the calling thread until all worker threads complete their tasks.
+   */
+  void wait()
+  {
+    for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) {
+      auto& task_done_cv = _workers[thread_idx].task_done_cv;
+      auto& mut          = _workers[thread_idx].task_mutex;
+      auto& task_queue   = _workers[thread_idx].task_queue;
+
+      std::unique_lock lock(mut);
+      task_done_cv.wait(lock, [&] { return task_queue.empty(); });
+    }
+  }
+
+  /**
+   * @brief Get the number of threads from the thread pool.
+   *
+   * @return Thread count.
+   */
+  unsigned int num_threads() const { return _num_threads; }
+
+  /**
+   * @brief Submit the task to the thread pool for execution. The worker thread is selected
+   * automatically in a round-robin fashion.
+   *
+   * @tparam F Type of the task callable.
+   * @tparam R Return type of the task callable.
+   * @param task Task callable. The task can either be copyable or move-only.
+   * @return An std::future object. R can be void or other types.
+   */
+  template <typename F, typename R = std::invoke_result_t<std::decay_t<F>>>
+  [[nodiscard]] std::future<R> submit_task(F&& task)
+  {
+    // The call index is atomically incremented on each submit_task call, and will wrap around once
+    // it reaches the maximum value the integer type `std::size_t` can hold (this overflow
+    // behavior is well-defined in C++).
+    auto tid =
+      std::atomic_fetch_add_explicit(&_task_submission_counter, 1, std::memory_order_relaxed);
+    tid %= _num_threads;
+
+    return submit_task_to_thread(std::forward<F>(task), tid);
+  }
+
+  /**
+   * @brief Submit the task to a specific thread for execution.
+   *
+   * @tparam F Type of the task callable.
+   * @tparam R Return type of the task callable.
+   * @param task Task callable. The task can either be copyable or move-only.
+   * @param thread_idx Index of the thread to which the task is submitted.
+   * @return An std::future object. R can be void or other types.
+   */
+  template <typename F, typename R = std::invoke_result_t<std::decay_t<F>>>
+  [[nodiscard]] std::future<R> submit_task_to_thread(F&& task, std::size_t thread_idx)
+  {
+    auto& task_available_cv = _workers[thread_idx].task_available_cv;
+    auto& mut               = _workers[thread_idx].task_mutex;
+    auto& task_queue        = _workers[thread_idx].task_queue;
+
+    std::promise<R> p;
+    auto fut = p.get_future();
+
+    {
+      std::lock_guard lock(mut);
+
+      task_queue.emplace([task = std::forward<F>(task), p = std::move(p), thread_idx]() mutable {
+        try {
+          if constexpr (std::is_same_v<R, void>) {
+            task();
+            p.set_value();
+          } else {
+            p.set_value(task());
+          }
+        } catch (...) {
+          p.set_exception(std::current_exception());
+        }
+      });
+    }
+
+    task_available_cv.notify_one();
+    return fut;
+  }
+
+ private:
+  /**
+   * @brief Worker thread loop.
+   *
+   * @param thread_idx Worker thread index.
+   */
+  void run_worker(std::size_t thread_idx)
+  {
+    ThisThread::set_thread_idx(thread_idx);
+
+    auto& task_available_cv = _workers[thread_idx].task_available_cv;
+    auto& task_done_cv      = _workers[thread_idx].task_done_cv;
+    auto& mut               = _workers[thread_idx].task_mutex;
+    auto& task_queue        = _workers[thread_idx].task_queue;
+    auto& should_stop       = _workers[thread_idx].should_stop;
+
+    if (_worker_thread_init_func) { std::invoke(_worker_thread_init_func); }
+
+    while (true) {
+      std::unique_lock lock(mut);
+
+      if (task_queue.empty()) { task_done_cv.notify_all(); }
+
+      task_available_cv.wait(lock, [&] { return !task_queue.empty() || should_stop; });
+
+      if (should_stop) { break; }
+
+      auto task = std::move(task_queue.front());
+      task_queue.pop();
+      lock.unlock();
+
+      task();
+    }
+  }
+
+  /**
+   * @brief Create worker threads.
+   */
+  void create_threads()
+  {
+    _workers = std::make_unique<Worker[]>(_num_threads);
+    for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) {
+      _workers[thread_idx].thread = std::thread([this, thread_idx] { run_worker(thread_idx); });
+    }
+  }
+
+  /**
+   * @brief Notify each worker thread of the intention to stop and join the threads. Pre-condition:
+   * each worker thread has finished all the tasks in its task queue.
+   */
+  void destroy_threads()
+  {
+    for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) {
+      auto& task_available_cv = _workers[thread_idx].task_available_cv;
+      auto& mut               = _workers[thread_idx].task_mutex;
+
+      {
+        std::lock_guard lock(mut);
+        _workers[thread_idx].should_stop = true;
+      }
+
+      task_available_cv.notify_one();
+
+      _workers[thread_idx].thread.join();
+    }
+  }
+
+  unsigned int _num_threads{};
+  FunctionWrapper _worker_thread_init_func;
+  std::unique_ptr<Worker[]> _workers;
+  std::atomic_size_t _task_submission_counter{0};
+};
+
+}  // namespace kvikio
diff --git a/cpp/include/kvikio/threadpool_wrapper.hpp b/cpp/include/kvikio/threadpool_wrapper.hpp
index 6eb2f158b2..0644b8c9ca 100644
--- a/cpp/include/kvikio/threadpool_wrapper.hpp
+++ b/cpp/include/kvikio/threadpool_wrapper.hpp
@@ -1,62 +1,30 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

-#include <functional>
-
 #include <BS_thread_pool.hpp>

-#include <kvikio/nvtx.hpp>
-
namespace kvikio {

template <typename pool_type>
class thread_pool_wrapper : public pool_type {
 public:
  /**
-   * @brief Construct a new thread pool wrapper, and invoke a pre-defined initialization function in
-   * each worker thread.
+   * @brief Construct a new thread pool wrapper.
   *
   * @param nthreads The number of threads to use.
   */
-  thread_pool_wrapper(unsigned int nthreads) : pool_type{nthreads, worker_thread_init_func}
-  {
-    KVIKIO_NVTX_FUNC_RANGE();
-  }
+  thread_pool_wrapper(unsigned int nthreads) : pool_type{nthreads} {}

  /**
-   * @brief Reset the number of threads in the thread pool, and invoke a pre-defined initialization
-   * function in each worker thread.
+   * @brief Reset the number of threads in the thread pool.
   *
   * @param nthreads The number of threads to use.
   */
-  void reset(unsigned int nthreads)
-  {
-    KVIKIO_NVTX_FUNC_RANGE();
-    pool_type::reset(nthreads, worker_thread_init_func);
-  }
-
- private:
-  inline static std::function<void()> worker_thread_init_func{[] {
-    KVIKIO_NVTX_FUNC_RANGE();
-    // Rename the worker thread in the thread pool to improve clarity from nsys-ui.
-    // Note: This NVTX feature is currently not supported by nsys-ui.
-    NvtxManager::rename_current_thread("thread pool");
-  }};
+  void reset(unsigned int nthreads) { pool_type::reset(nthreads); }
};

using BS_thread_pool = thread_pool_wrapper<BS::thread_pool>;
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 7724fe92a6..b2b695b578 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
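A usage sketch of ThreadPoolSimple as added above, showing round-robin submission plus pinning to a specific worker; all names come from this patch:

```cpp
#include <cstdio>
#include <future>
#include <vector>

#include <kvikio/threadpool_simple.hpp>

int main()
{
  kvikio::ThreadPoolSimple pool{4};  // no per-thread init callable

  std::vector<std::future<int>> futures;
  for (int i = 0; i < 8; ++i) {
    // Workers are chosen round-robin via the atomic submission counter.
    futures.push_back(pool.submit_task([i] { return i * i; }));
  }
  for (auto& fut : futures) {
    std::printf("%d\n", fut.get());
  }

  // Pin a task to worker 0, e.g. to serialize it with earlier work on that thread.
  auto pinned = pool.submit_task_to_thread([] { return 42; }, 0);
  std::printf("%d\n", pinned.get());
  return 0;
}
```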
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -73,11 +62,7 @@ template >* = nullptr> * @param ptr Memory pointer to query * @return The boolean answer */ -#ifdef KVIKIO_CUDA_FOUND bool is_host_memory(void const* ptr); -#else -constexpr bool is_host_memory(void const* ptr) { return true; } -#endif /** * @brief Return the device owning the pointer diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index 963fc7359b..659577dd99 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 # This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for @@ -37,7 +38,7 @@ if ! [ ${status} -eq 0 ]; then echo "This script must be run inside the kvikio repository, or the KVIKIO_ROOT environment variable must be set." else echo "Script failed with unknown error attempting to determine project root:" - echo ${KVIKIO_BUILD_DIR} + echo "${KVIKIO_BUILD_DIR}" fi exit 1 fi @@ -48,8 +49,8 @@ DEFAULT_FORMAT_FILE_LOCATIONS=( ) if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then - for file_path in ${DEFAULT_FORMAT_FILE_LOCATIONS[@]}; do - if [ -f ${file_path} ]; then + for file_path in "${DEFAULT_FORMAT_FILE_LOCATIONS[@]}"; do + if [ -f "${file_path}" ]; then RAPIDS_CMAKE_FORMAT_FILE=${file_path} break fi @@ -68,12 +69,12 @@ else fi if [[ $1 == "cmake-format" ]]; then - cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2} + cmake-format -i --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}" elif [[ $1 == "cmake-lint" ]]; then # Since the pre-commit hook is verbose, we have to be careful to only # present cmake-lint's output (which is quite verbose) if we actually # observe a failure. - OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}) + OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}") status=$? if ! [ ${status} -eq 0 ]; then diff --git a/cpp/src/batch.cpp b/cpp/src/batch.cpp index 8ee2c73290..e1a0f6214d 100644 --- a/cpp/src/batch.cpp +++ b/cpp/src/batch.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#include
diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp
index 02281b2adc..d2f2c92b9c 100644
--- a/cpp/src/bounce_buffer.cpp
+++ b/cpp/src/bounce_buffer.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#include
@@ -19,107 +8,169 @@
 #include
 #include
+#include
+#include
 #include
-#include
 #include

namespace kvikio {

-AllocRetain::Alloc::Alloc(AllocRetain* manager, void* alloc, std::size_t size)
-  : _manager(manager), _alloc{alloc}, _size{size}
+void* PageAlignedAllocator::allocate(std::size_t size)
+{
+  void* buffer{};
+  auto const page_size    = get_page_size();
+  auto const aligned_size = detail::align_up(size, page_size);
+  buffer                  = std::aligned_alloc(page_size, aligned_size);
+  return buffer;
+}
+
+void PageAlignedAllocator::deallocate(void* buffer, std::size_t /*size*/) { std::free(buffer); }
+
+void* CudaPinnedAllocator::allocate(std::size_t size)
+{
+  void* buffer{};
+
+  // Allocate page-locked host memory.
+  // Under unified addressing, host memory allocated this way is automatically portable and
+  // mapped.
+  CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE));
+
+  return buffer;
+}
+void CudaPinnedAllocator::deallocate(void* buffer, std::size_t /*size*/)
+{
+  CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(buffer));
+}
+
+void* CudaPageAlignedPinnedAllocator::allocate(std::size_t size)
+{
+  void* buffer{};
+  auto const page_size    = get_page_size();
+  auto const aligned_size = detail::align_up(size, page_size);
+  buffer                  = std::aligned_alloc(page_size, aligned_size);
+  KVIKIO_EXPECT(buffer != nullptr, "Aligned allocation failed");
+  CUDA_DRIVER_TRY(
+    cudaAPI::instance().MemHostRegister(buffer, aligned_size, CU_MEMHOSTALLOC_PORTABLE));
+  return buffer;
+}
+
+void CudaPageAlignedPinnedAllocator::deallocate(void* buffer, std::size_t /*size*/)
+{
+  CUDA_DRIVER_TRY(cudaAPI::instance().MemHostUnregister(buffer));
+  std::free(buffer);
+}
+
+template <typename Allocator>
+BounceBufferPool<Allocator>::Buffer::Buffer(BounceBufferPool<Allocator>* pool,
+                                            void* buffer,
+                                            std::size_t size)
+  : _pool(pool), _buffer{buffer}, _size{size}
{
-  KVIKIO_NVTX_FUNC_RANGE();
}

-AllocRetain::Alloc::~Alloc() noexcept
+template <typename Allocator>
+BounceBufferPool<Allocator>::Buffer::~Buffer() noexcept
{
  KVIKIO_NVTX_FUNC_RANGE();
-  _manager->put(_alloc, _size);
+  _pool->put(_buffer, _size);
}

-void* AllocRetain::Alloc::get() noexcept
+template <typename Allocator>
+void* BounceBufferPool<Allocator>::Buffer::get() noexcept
{
  KVIKIO_NVTX_FUNC_RANGE();
-  return _alloc;
+  return _buffer;
}

-void* AllocRetain::Alloc::get(std::ptrdiff_t offset) noexcept
+template <typename Allocator>
+void* BounceBufferPool<Allocator>::Buffer::get(std::ptrdiff_t offset) noexcept
{
  KVIKIO_NVTX_FUNC_RANGE();
-  return static_cast<char*>(_alloc) + offset;
+  return static_cast<char*>(_buffer) + offset;
}

-std::size_t AllocRetain::Alloc::size() noexcept { return _size; }
+template <typename Allocator>
+std::size_t BounceBufferPool<Allocator>::Buffer::size() noexcept
+{
+  return _size;
+}

-std::size_t AllocRetain::_clear()
+template <typename Allocator>
+std::size_t BounceBufferPool<Allocator>::_clear()
{
  KVIKIO_NVTX_FUNC_RANGE();
-  std::size_t ret = _free_allocs.size() * _size;
-  while (!_free_allocs.empty()) {
-    CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top()));
-    _free_allocs.pop();
+  std::size_t ret = _free_buffers.size() * _buffer_size;
+  while (!_free_buffers.empty()) {
+    _allocator.deallocate(_free_buffers.top(), _buffer_size);
+    _free_buffers.pop();
  }
  return ret;
}

-void AllocRetain::_ensure_alloc_size()
+template <typename Allocator>
+void BounceBufferPool<Allocator>::_ensure_buffer_size()
{
  KVIKIO_NVTX_FUNC_RANGE();
  auto const bounce_buffer_size = defaults::bounce_buffer_size();
-  if (_size != bounce_buffer_size) {
+  if (_buffer_size != bounce_buffer_size) {
    _clear();
-    _size = bounce_buffer_size;
+    _buffer_size = bounce_buffer_size;
  }
}

-AllocRetain::Alloc AllocRetain::get()
+template <typename Allocator>
+typename BounceBufferPool<Allocator>::Buffer BounceBufferPool<Allocator>::get()
{
  KVIKIO_NVTX_FUNC_RANGE();
  std::lock_guard const lock(_mutex);
-  _ensure_alloc_size();
+  _ensure_buffer_size();

  // Check if we have an allocation available
-  if (!_free_allocs.empty()) {
-    void* ret = _free_allocs.top();
-    _free_allocs.pop();
-    return Alloc(this, ret, _size);
+  if (!_free_buffers.empty()) {
+    void* ret = _free_buffers.top();
+    _free_buffers.pop();
+    return Buffer(this, ret, _buffer_size);
  }

-  // If no available allocation, allocate and register a new one
-  void* alloc{};
-  // Allocate page-locked host memory
-  // Under unified addressing, host memory allocated this way is automatically portable and mapped.
-  CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTALLOC_PORTABLE));
-  return Alloc(this, alloc, _size);
+  auto* buffer = _allocator.allocate(_buffer_size);
+  return Buffer(this, buffer, _buffer_size);
}

-void AllocRetain::put(void* alloc, std::size_t size)
+template <typename Allocator>
+void BounceBufferPool<Allocator>::put(void* buffer, std::size_t size)
{
  KVIKIO_NVTX_FUNC_RANGE();
  std::lock_guard const lock(_mutex);
-  _ensure_alloc_size();
+  _ensure_buffer_size();

-  // If the size of `alloc` matches the sizes of the retained allocations,
+  // If the size of `buffer` matches the sizes of the retained allocations,
  // it is added to the set of free allocation otherwise it is freed.
-  if (size == _size) {
-    _free_allocs.push(alloc);
+  if (size == _buffer_size) {
+    _free_buffers.push(buffer);
  } else {
-    CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc));
+    _allocator.deallocate(buffer, size);
  }
}

-std::size_t AllocRetain::clear()
+template <typename Allocator>
+std::size_t BounceBufferPool<Allocator>::clear()
{
  KVIKIO_NVTX_FUNC_RANGE();
  std::lock_guard const lock(_mutex);
  return _clear();
}

-AllocRetain& AllocRetain::instance()
+template <typename Allocator>
+BounceBufferPool<Allocator>& BounceBufferPool<Allocator>::instance()
{
  KVIKIO_NVTX_FUNC_RANGE();
-  static AllocRetain _instance;
+  static BounceBufferPool _instance;
  return _instance;
}

+// Explicit instantiations
+template class BounceBufferPool<PageAlignedAllocator>;
+template class BounceBufferPool<CudaPinnedAllocator>;
+template class BounceBufferPool<CudaPageAlignedPinnedAllocator>;
} // namespace kvikio
diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp
index 744a004b1d..b964f63d8f 100644
--- a/cpp/src/buffer.cpp
+++ b/cpp/src/buffer.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

#include
@@ -20,8 +9,8 @@
 #include
 #include
+#include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp
index 78a96c66be..a8c7e69fac 100644
--- a/cpp/src/compat_mode.cpp
+++ b/cpp/src/compat_mode.cpp
@@ -1,29 +1,14 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
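Because BounceBufferPool is parameterized on an allocator policy, the required interface is just the two functions the pool calls; a sketch of a conforming, purely hypothetical policy:

```cpp
#include <cstdlib>

// Hypothetical allocator satisfying the implicit policy interface used by
// BounceBufferPool<Allocator>: allocate(size) and deallocate(buffer, size).
struct MallocAllocator {
  void* allocate(std::size_t size) { return std::malloc(size); }
  void deallocate(void* buffer, std::size_t /*size*/) { std::free(buffer); }
};

// A new policy would also need its own explicit instantiation, mirroring the
// three instantiations above:
// template class kvikio::BounceBufferPool<MallocAllocator>;
```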
+ * SPDX-License-Identifier: Apache-2.0 */ #include -#include #include #include -#include +#include #include -#include -#include -#include namespace kvikio { @@ -50,99 +35,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str) } // namespace detail -CompatMode CompatModeManager::infer_compat_mode_if_auto(CompatMode compat_mode) noexcept -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (compat_mode == CompatMode::AUTO) { - return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; - } - return compat_mode; -} - -bool CompatModeManager::is_compat_mode_preferred(CompatMode compat_mode) noexcept -{ - return compat_mode == CompatMode::ON || - (compat_mode == CompatMode::AUTO && - infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); -} - -bool CompatModeManager::is_compat_mode_preferred() const noexcept -{ - return _is_compat_mode_preferred; -} - -bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept -{ - return _is_compat_mode_preferred_for_async; -} - -CompatMode CompatModeManager::compat_mode_requested() const noexcept -{ - return _compat_mode_requested; -} - -CompatModeManager::CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested_v, - FileHandle* file_handle) -{ - KVIKIO_NVTX_FUNC_RANGE(); - KVIKIO_EXPECT(file_handle != nullptr, - "The compatibility mode manager does not have a proper owning file handle.", - std::invalid_argument); - - file_handle->_file_direct_off.open(file_path, flags, false, mode); - _is_compat_mode_preferred = is_compat_mode_preferred(compat_mode_requested_v); - - // Nothing to do in compatibility mode - if (_is_compat_mode_preferred) { return; } - - try { - file_handle->_file_direct_on.open(file_path, flags, true, mode); - } catch (...) { - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - if (compat_mode_requested_v == CompatMode::AUTO) { - _is_compat_mode_preferred = true; - } else { // CompatMode::OFF - throw; - } - } - - if (_is_compat_mode_preferred) { return; } - - auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); - assert(error_code.has_value()); - - // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility - // mode. - if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { - _is_compat_mode_preferred = true; - } else { - CUFILE_TRY(error_code.value()); - } - - // Check cuFile async API - static bool const is_extra_symbol_available = is_stream_api_available(); - static bool const is_config_path_empty = config_path().empty(); - _is_compat_mode_preferred_for_async = - _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; -} - -void CompatModeManager::validate_compat_mode_for_async() const -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && - _compat_mode_requested == CompatMode::OFF) { - std::string err_msg; - if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } - - // When checking for availability, we also check if cuFile's config file exists. This is - // because even when the stream API is available, it doesn't work if no config file exists. 
- if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } - - KVIKIO_FAIL(err_msg, std::runtime_error); - } -} - } // namespace kvikio diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp new file mode 100644 index 0000000000..eb601af375 --- /dev/null +++ b/cpp/src/compat_mode_manager.cpp @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +bool CompatModeManager::is_compat_mode_preferred() const noexcept +{ + return _is_compat_mode_preferred; +} + +bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept +{ + return _is_compat_mode_preferred_for_async; +} + +CompatMode CompatModeManager::compat_mode_requested() const noexcept +{ + return _compat_mode_requested; +} + +CompatModeManager::CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested_v, + FileHandle* file_handle) +{ + KVIKIO_NVTX_FUNC_RANGE(); + KVIKIO_EXPECT(file_handle != nullptr, + "The compatibility mode manager does not have a proper owning file handle.", + std::invalid_argument); + + _compat_mode_requested = compat_mode_requested_v; + file_handle->_file_direct_off.open(file_path, flags, false, mode); + _is_compat_mode_preferred = defaults::is_compat_mode_preferred(compat_mode_requested_v); + + if (_is_compat_mode_preferred && !defaults::auto_direct_io_read() && + !defaults::auto_direct_io_write()) { + return; + } + + // Try to open the file with the O_DIRECT flag. + try { + file_handle->_file_direct_on.open(file_path, flags, true, mode); + } catch (...) { + // Handle different compatibility mode requests when Direct I/O is not supported + if (compat_mode_requested_v == CompatMode::ON) { + // _file_direct_on.fd() == -1, so Direct I/O will never be used on this file handle + return; + } else if (compat_mode_requested_v == CompatMode::AUTO) { + // Fall back to compatibility mode + _is_compat_mode_preferred = true; + } else { // CompatMode::OFF + throw; + } + } + + if (_is_compat_mode_preferred) { return; } + + auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); + assert(error_code.has_value()); + + // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility + // mode. + if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { + _is_compat_mode_preferred = true; + } else { + CUFILE_TRY(error_code.value()); + } + + // Check cuFile async API + static bool const is_extra_symbol_available = is_stream_api_available(); + static bool const is_config_path_empty = config_path().empty(); + _is_compat_mode_preferred_for_async = + _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; +} + +void CompatModeManager::validate_compat_mode_for_async() const +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (_is_compat_mode_preferred_for_async && _compat_mode_requested == CompatMode::OFF) { + std::string err_msg; + if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } + + // When checking for availability, we also check if cuFile's config file exists. This is + // because even when the stream API is available, it doesn't work if no config file exists. 
+ if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } + + KVIKIO_FAIL(err_msg, std::runtime_error); + } +} + +} // namespace kvikio diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp index d1d36adc04..3c33e101df 100644 --- a/cpp/src/cufile/config.cpp +++ b/cpp/src/cufile/config.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp index 46da3151f7..98070afc84 100644 --- a/cpp/src/cufile/driver.cpp +++ b/cpp/src/cufile/driver.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp index f492d39a23..f827ef6cf5 100644 --- a/cpp/src/defaults.cpp +++ b/cpp/src/defaults.cpp @@ -1,22 +1,10 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include #include -#include #include #include #include @@ -25,9 +13,11 @@ #include #include +#include #include #include #include +#include namespace kvikio { template <> @@ -88,9 +78,13 @@ std::vector getenv_or(std::string_view env_var_name, std::vector defau unsigned int defaults::get_num_threads_from_env() { KVIKIO_NVTX_FUNC_RANGE(); - int const ret = getenv_or("KVIKIO_NTHREADS", 1); - KVIKIO_EXPECT(ret > 0, "KVIKIO_NTHREADS has to be a positive integer", std::invalid_argument); - return ret; + + auto const [env_var_name, num_threads, _] = + getenv_or({"KVIKIO_NTHREADS", "KVIKIO_NUM_THREADS"}, 1); + KVIKIO_EXPECT(num_threads > 0, + std::string{env_var_name} + " has to be a positive integer", + std::invalid_argument); + return num_threads; } defaults::defaults() @@ -141,6 +135,12 @@ defaults::defaults() _http_status_codes = getenv_or("KVIKIO_HTTP_STATUS_CODES", std::vector{429, 500, 502, 503, 504}); } + + // Determine the default value of `auto_direct_io_read` and `auto_direct_io_write` + { + _auto_direct_io_read = getenv_or("KVIKIO_AUTO_DIRECT_IO_READ", false); + _auto_direct_io_write = getenv_or("KVIKIO_AUTO_DIRECT_IO_WRITE", true); + } } defaults* defaults::instance() @@ -183,6 +183,10 @@ void defaults::set_thread_pool_nthreads(unsigned int nthreads) thread_pool().reset(nthreads); } +unsigned int defaults::num_threads() { return thread_pool_nthreads(); } + +void defaults::set_num_threads(unsigned int nthreads) { set_thread_pool_nthreads(nthreads); } + std::size_t defaults::task_size() { return instance()->_task_size; } void defaults::set_task_size(std::size_t nbytes) @@ -227,4 +231,11 @@ void defaults::set_http_timeout(long timeout_seconds) instance()->_http_timeout = timeout_seconds; } +bool defaults::auto_direct_io_read() { return instance()->_auto_direct_io_read; } + +void defaults::set_auto_direct_io_read(bool flag) { instance()->_auto_direct_io_read = flag; } + +bool defaults::auto_direct_io_write() { return instance()->_auto_direct_io_write; } + +void defaults::set_auto_direct_io_write(bool flag) { instance()->_auto_direct_io_write = flag; } } // namespace kvikio diff --git a/cpp/src/detail/env.cpp b/cpp/src/detail/env.cpp new file mode 100644 index 0000000000..5d31644501 --- /dev/null +++ b/cpp/src/detail/env.cpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +namespace kvikio::detail { +std::optional unwrap_or_env(std::optional value, + std::string const& env_var, + std::optional const& err_msg) +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (value.has_value()) { return value; } + char const* env = std::getenv(env_var.c_str()); + if (env != nullptr) { return std::string(env); } + if (!err_msg.has_value()) { return std::nullopt; } + KVIKIO_FAIL(*err_msg, std::invalid_argument); + return std::nullopt; +} +} // namespace kvikio::detail diff --git a/cpp/src/nvtx.cpp b/cpp/src/detail/nvtx.cpp similarity index 75% rename from cpp/src/nvtx.cpp rename to cpp/src/detail/nvtx.cpp index b9d7c3e146..45809ebab8 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/detail/nvtx.cpp @@ -1,28 +1,15 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
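The new knobs in defaults.cpp are settable programmatically as well as via environment variables (KVIKIO_NTHREADS or its new alias KVIKIO_NUM_THREADS, KVIKIO_AUTO_DIRECT_IO_READ, KVIKIO_AUTO_DIRECT_IO_WRITE); a sketch using only names that appear in this patch:

```cpp
#include <kvikio/defaults.hpp>

int main()
{
  // num_threads()/set_num_threads() alias thread_pool_nthreads()/set_thread_pool_nthreads().
  kvikio::defaults::set_num_threads(8);
  unsigned int n = kvikio::defaults::num_threads();  // 8

  // Direct I/O toggles; the env defaults are KVIKIO_AUTO_DIRECT_IO_READ=false and
  // KVIKIO_AUTO_DIRECT_IO_WRITE=true per the constructor above.
  kvikio::defaults::set_auto_direct_io_read(true);
  bool write_direct = kvikio::defaults::auto_direct_io_write();

  (void)n;
  (void)write_direct;
  return 0;
}
```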
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include #include #include -#ifdef KVIKIO_CUDA_FOUND #include -#endif -#include +#include namespace kvikio { @@ -34,18 +21,12 @@ NvtxManager& NvtxManager::instance() noexcept const nvtx_color_type& NvtxManager::default_color() noexcept { -#ifdef KVIKIO_CUDA_FOUND static nvtx_color_type default_color{nvtx3::argb{0, 255, 255, 255}}; return default_color; -#else - static nvtx_color_type dummy{}; - return dummy; -#endif } const nvtx_color_type& NvtxManager::get_color_by_index(std::uint64_t idx) noexcept { -#ifdef KVIKIO_CUDA_FOUND constexpr std::size_t num_color{16}; static_assert((num_color & (num_color - 1)) == 0); // Is power of 2 static std::array color_palette = {nvtx3::rgb{106, 192, 67}, @@ -66,15 +47,10 @@ const nvtx_color_type& NvtxManager::get_color_by_index(std::uint64_t idx) noexce nvtx3::rgb{122, 50, 49}}; auto safe_idx = idx & (num_color - 1); // idx % num_color return color_palette[safe_idx]; -#else - static nvtx_color_type dummy{}; - return dummy; -#endif } void NvtxManager::rename_current_thread(std::string_view new_name) noexcept { -#ifdef KVIKIO_CUDA_FOUND auto tid = syscall(SYS_gettid); std::stringstream ss; ss << new_name << " (" << tid << ")"; @@ -85,10 +61,10 @@ void NvtxManager::rename_current_thread(std::string_view new_name) noexcept attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE; attribs.identifier.ullValue = tid; attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; - attribs.message.ascii = ss.str().c_str(); + auto st = ss.str(); + attribs.message.ascii = st.c_str(); nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs); -#endif } } // namespace kvikio diff --git a/cpp/src/posix_io.cpp b/cpp/src/detail/posix_io.cpp similarity index 52% rename from cpp/src/posix_io.cpp rename to cpp/src/detail/posix_io.cpp index 512576943b..1ddafa4d3e 100644 --- a/cpp/src/posix_io.cpp +++ b/cpp/src/detail/posix_io.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include @@ -21,9 +10,9 @@ #include #include +#include +#include #include -#include -#include #include #include @@ -57,26 +46,40 @@ CUstream StreamsByThread::get() return get(ctx, std::this_thread::get_id()); } -std::size_t posix_device_read(int fd, +std::size_t posix_device_read(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + int fd_direct_on) { KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); + // If Direct I/O is supported and requested + if (fd_direct_on != -1 && defaults::auto_direct_io_read()) { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset, fd_direct_on); + } else { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + } } -std::size_t posix_device_write(int fd, +std::size_t posix_device_write(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + int fd_direct_on) { KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); + // If Direct I/O is supported and requested + if (fd_direct_on != -1 && defaults::auto_direct_io_write()) { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset, fd_direct_on); + } else { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + } } } // namespace kvikio::detail diff --git a/cpp/src/detail/remote_handle.cpp b/cpp/src/detail/remote_handle.cpp new file mode 100644 index 0000000000..87d1ed5ab5 --- /dev/null +++ b/cpp/src/detail/remote_handle.cpp @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +namespace kvikio::detail { +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* response = reinterpret_cast(userdata); + response->append(data, new_data_size); + return new_data_size; +} +} // namespace kvikio::detail diff --git a/cpp/src/detail/tls.cpp b/cpp/src/detail/tls.cpp new file mode 100644 index 0000000000..89773db22a --- /dev/null +++ b/cpp/src/detail/tls.cpp @@ -0,0 +1,139 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
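callback_get_string_response follows the standard libcurl write-callback contract, so wiring it up looks like the following sketch; raw libcurl is shown for brevity, error handling is elided, and the header path for the new callback is an assumption:

```cpp
#include <string>

#include <curl/curl.h>
#include <kvikio/detail/remote_handle.hpp>  // assumed header for the new callback

std::string fetch(std::string const& url)
{
  std::string response;
  CURL* curl = curl_easy_init();
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  // Accumulate the response body into `response` via the callback added above.
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, kvikio::detail::callback_get_string_response);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
  curl_easy_perform(curl);  // return code check elided in this sketch
  curl_easy_cleanup(curl);
  return response;
}
```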
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace kvikio::detail {
+
+namespace {
+/**
+ * @brief Search for a CA certificate path using environment variables
+ *
+ * @param env_vars Environment variable names to check in order
+ * @return Path string if found in any environment variable, std::nullopt otherwise
+ */
+std::optional<std::string> find_ca_path_from_env_var(std::vector<std::string> const& env_vars)
+{
+  for (auto const& env_var : env_vars) {
+    auto const* path = std::getenv(env_var.data());
+    if (path != nullptr) { return path; }
+  }
+
+  return std::nullopt;
+}
+
+/**
+ * @brief Search for a CA certificate path in standard system locations
+ *
+ * @param system_paths File system paths to check in order
+ * @return First accessible path if found, std::nullopt otherwise
+ */
+std::optional<std::string> find_ca_path_in_system_locations(
+  std::vector<std::string> const& system_paths)
+{
+  for (auto const& path : system_paths) {
+    // Check whether the file/directory exists, and whether it grants read permission to the calling
+    // process's real UID and GID. If the path is a symbolic link, it is dereferenced.
+    auto const result = access(path.data(), R_OK);
+
+    if (result != -1) { return path; }
+  }
+
+  return std::nullopt;
+}
+
+/**
+ * @brief Get CA certificate path from curl's compile-time defaults
+ *
+ * @param default_path Path provided by curl_version_info (may be nullptr)
+ * @return Path string if accessible, std::nullopt otherwise
+ */
+std::optional<std::string> get_ca_path_from_curl_defaults(char const* default_path)
+{
+  if (default_path != nullptr && access(default_path, R_OK) != -1) { return default_path; }
+
+  return std::nullopt;
+}
+}  // namespace
+
+std::pair<std::optional<std::string>, std::optional<std::string>> get_ca_paths()
+{
+  auto* version_info = curl_version_info(::CURLVERSION_NOW);
+  KVIKIO_EXPECT(version_info != nullptr, "Failed to get curl version info", std::runtime_error);
+
+  std::optional<std::string> ca_bundle_file;
+  std::optional<std::string> ca_directory;
+
+  // Priority 1: CA bundle file from environment variables
+  ca_bundle_file = find_ca_path_from_env_var({
+    "CURL_CA_BUNDLE",  // curl program
+    "SSL_CERT_FILE"    // OpenSSL
+  });
+  if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // Priority 2: CA directory from environment variables
+  ca_directory = find_ca_path_from_env_var({
+    "SSL_CERT_DIR"  // OpenSSL
+  });
+  if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // Priority 3: CA bundle file from system locations
+  ca_bundle_file = find_ca_path_in_system_locations(
+    {"/etc/ssl/certs/ca-certificates.crt",                 // Debian/Ubuntu, Arch, Alpine, Gentoo
+     "/etc/pki/tls/certs/ca-bundle.crt",                   // RHEL/CentOS/Rocky/AlmaLinux, Fedora
+     "/etc/ssl/ca-bundle.pem",                             // OpenSUSE/SLES
+     "/etc/pki/tls/cert.pem",                              // RHEL-based (symlink to ca-bundle.crt)
+     "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem",  // Fedora 28+, RHEL 8+
+
+     // Additional locations mentioned by libcurl:
+     // https://github.com/curl/curl/blob/master/CMakeLists.txt
+     "/usr/share/ssl/certs/ca-bundle.crt",
+     "/usr/local/share/certs/ca-root-nss.crt",
+     "/etc/ssl/cert.pem"});
+  if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // Priority 4: CA directory from system locations
+  ca_directory = find_ca_path_in_system_locations({
+    "/etc/ssl/certs",     // Debian/Ubuntu, Arch, Alpine, OpenSUSE, Gentoo
+    "/etc/pki/tls/certs"  // RHEL/CentOS/Rocky/AlmaLinux, Fedora
+  });
+  if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // Priority 5: CA bundle file from curl compile-time defaults
+  ca_bundle_file = get_ca_path_from_curl_defaults(version_info->cainfo);
+  if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // Priority 6: CA directory from curl compile-time defaults
+  ca_directory = get_ca_path_from_curl_defaults(version_info->capath);
+  if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; }
+
+  // At least one path must exist
+  KVIKIO_EXPECT(ca_bundle_file.has_value() || ca_directory.has_value(),
+                "Failed to find accessible CA certificates.",
+                std::runtime_error);
+  return {ca_bundle_file, ca_directory};
+}
+
+void set_up_ca_paths(CurlHandle& curl)
+{
+  static auto const [ca_bundle_file, ca_directory] = get_ca_paths();
+
+  if (ca_bundle_file.has_value()) {
+    curl.setopt(CURLOPT_CAINFO, ca_bundle_file->c_str());
+    curl.setopt(CURLOPT_CAPATH, nullptr);
+  } else if (ca_directory.has_value()) {
+    curl.setopt(CURLOPT_CAINFO, nullptr);
+    curl.setopt(CURLOPT_CAPATH, ca_directory->c_str());
+  }
+}
+}  // namespace kvikio::detail
diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp
new file mode 100644
index 0000000000..3e76643082
--- /dev/null
+++ b/cpp/src/detail/url.cpp
@@ -0,0 +1,284 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#define CHECK_CURL_URL_ERR(err_code) check_curl_url_err(err_code, __LINE__, __FILE__)
+
+namespace kvikio::detail {
+namespace {
+void check_curl_url_err(CURLUcode err_code, int line_number, char const* filename)
+{
+  if (err_code == CURLUcode::CURLUE_OK) { return; }
+
+  std::stringstream ss;
+  ss << "KvikIO detects a URL error at: " << filename << ":" << line_number << ": ";
+  char const* msg = curl_url_strerror(err_code);
+  if (msg == nullptr) {
+    ss << "(no message)";
+  } else {
+    ss << msg;
+  }
+  throw std::runtime_error(ss.str());
+}
+}  // namespace
+
+CurlUrlHandle::CurlUrlHandle() : _handle(curl_url())
+{
+  KVIKIO_EXPECT(_handle != nullptr,
+                "Libcurl is unable to allocate a URL handle (likely out of memory).");
+}
+
+CurlUrlHandle::~CurlUrlHandle() noexcept
+{
+  if (_handle) { curl_url_cleanup(_handle); }
+}
+
+CurlUrlHandle::CurlUrlHandle(CurlUrlHandle&& other) noexcept
+  : _handle{std::exchange(other._handle, nullptr)}
+{
+}
+
+CurlUrlHandle& CurlUrlHandle::operator=(CurlUrlHandle&& other) noexcept
+{
+  if (this != &other) {
+    if (_handle) { curl_url_cleanup(_handle); }
+    _handle = std::exchange(other._handle, nullptr);
+  }
+
+  return *this;
+}
+
+CURLU* CurlUrlHandle::get() const { return _handle; }
+
+std::optional<std::string> UrlParser::extract_component(
+  CurlUrlHandle const& handle,
+  CURLUPart part,
+  std::optional<unsigned int> bitmask_component_flags,
+  std::optional<CURLUcode> allowed_err_code)
+{
+  if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; }
+
+  char* value{};
+  auto err_code = curl_url_get(handle.get(), part, &value, bitmask_component_flags.value());
+
+  if (err_code == CURLUcode::CURLUE_OK && value != nullptr) {
+    std::string result{value};
+    curl_free(value);
+    return result;
+  }
+
+  if (allowed_err_code.has_value() && allowed_err_code.value() == err_code) { return std::nullopt; }
+
+  // Throws an exception and explains the reason.
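+  // (Reaching this point means the error code was not explicitly allowed via `allowed_err_code`.)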
+  CHECK_CURL_URL_ERR(err_code);
+  return std::nullopt;
+}
+
+std::optional<std::string> UrlParser::extract_component(
+  std::string const& url,
+  CURLUPart part,
+  std::optional<unsigned int> bitmask_url_flags,
+  std::optional<unsigned int> bitmask_component_flags,
+  std::optional<CURLUcode> allowed_err_code)
+{
+  if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; }
+  if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; }
+
+  CurlUrlHandle handle;
+  CHECK_CURL_URL_ERR(
+    curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value()));
+
+  return extract_component(handle, part, bitmask_component_flags, allowed_err_code);
+}
+
+UrlParser::UrlComponents UrlParser::parse(std::string const& url,
+                                          std::optional<unsigned int> bitmask_url_flags,
+                                          std::optional<unsigned int> bitmask_component_flags)
+{
+  if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; }
+  if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; }
+
+  CurlUrlHandle handle;
+  CHECK_CURL_URL_ERR(
+    curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value()));
+
+  UrlComponents components;
+  CURLUcode err_code{};
+
+  components.scheme = extract_component(
+    handle, CURLUPART_SCHEME, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_SCHEME);
+  components.host = extract_component(
+    handle, CURLUPART_HOST, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_HOST);
+  components.port = extract_component(
+    handle, CURLUPART_PORT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_PORT);
+  components.path = extract_component(handle, CURLUPART_PATH, bitmask_component_flags.value());
+  components.query = extract_component(
+    handle, CURLUPART_QUERY, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_QUERY);
+  components.fragment = extract_component(
+    handle, CURLUPART_FRAGMENT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_FRAGMENT);
+
+  return components;
+}
+
+UrlBuilder::UrlBuilder() {}
+
+UrlBuilder::UrlBuilder(std::string const& url, std::optional<unsigned int> bitmask_url_flags)
+{
+  if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; }
+
+  CHECK_CURL_URL_ERR(
+    curl_url_set(_handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value()));
+}
+
+UrlBuilder::UrlBuilder(UrlParser::UrlComponents const& components,
+                       std::optional<unsigned int> bitmask_url_flags)
+{
+  // Start with an empty builder
+  // Set each component if present
+  if (components.scheme.has_value()) { set_scheme(components.scheme); }
+  if (components.host.has_value()) { set_host(components.host); }
+  if (components.port.has_value()) { set_port(components.port); }
+  if (components.path.has_value()) { set_path(components.path); }
+  if (components.query.has_value()) { set_query(components.query); }
+  if (components.fragment.has_value()) { set_fragment(components.fragment); }
+}
+
+UrlBuilder& UrlBuilder::set_component(CURLUPart part,
+                                      char const* value,
+                                      std::optional<unsigned int> flags)
+{
+  if (!flags.has_value()) { flags = 0U; }
+
+  CHECK_CURL_URL_ERR(curl_url_set(_handle.get(), part, value, flags.value()));
+  return *this;
+}
+
+UrlBuilder& UrlBuilder::set_scheme(std::optional<std::string> const& scheme)
+{
+  auto const* value = scheme.has_value() ? scheme.value().c_str() : nullptr;
+  return set_component(CURLUPART_SCHEME, value);
+}
+
+UrlBuilder& UrlBuilder::set_host(std::optional<std::string> const& host)
+{
+  auto const* value = host.has_value() ? host.value().c_str() : nullptr;
+  return set_component(CURLUPART_HOST, value);
+}
+
+UrlBuilder& UrlBuilder::set_port(std::optional<std::string> const& port)
+{
+  auto const* value = port.has_value() ? port.value().c_str() : nullptr;
+  return set_component(CURLUPART_PORT, value);
+}
+
+UrlBuilder& UrlBuilder::set_path(std::optional<std::string> const& path)
+{
+  auto const* value = path.has_value() ? path.value().c_str() : nullptr;
+  return set_component(CURLUPART_PATH, value);
+}
+
+UrlBuilder& UrlBuilder::set_query(std::optional<std::string> const& query)
+{
+  auto const* value = query.has_value() ? query.value().c_str() : nullptr;
+  return set_component(CURLUPART_QUERY, value);
+}
+
+UrlBuilder& UrlBuilder::set_fragment(std::optional<std::string> const& fragment)
+{
+  auto const* value = fragment.has_value() ? fragment.value().c_str() : nullptr;
+  return set_component(CURLUPART_FRAGMENT, value);
+}
+
+std::string UrlBuilder::build(std::optional<unsigned int> bitmask_component_flags) const
+{
+  if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; }
+
+  char* url = nullptr;
+  CHECK_CURL_URL_ERR(
+    curl_url_get(_handle.get(), CURLUPART_URL, &url, bitmask_component_flags.value()));
+
+  KVIKIO_EXPECT(
+    url != nullptr, "Failed to build URL: curl_url_get returned nullptr", std::runtime_error);
+
+  std::string result(url);
+  curl_free(url);
+  return result;
+}
+
+std::string UrlBuilder::build_manually(UrlParser::UrlComponents const& components)
+{
+  std::string url;
+  if (components.scheme) { url += components.scheme.value() + "://"; }
+  if (components.host) { url += components.host.value(); }
+  if (components.port) { url += ":" + components.port.value(); }
+  if (components.path) { url += components.path.value(); }
+  if (components.query) { url += "?" + components.query.value(); }
+  if (components.fragment) { url += "#" + components.fragment.value(); }
+  return url;
+}
+
+namespace {
+/**
+ * @brief Compile-time encoding lookup table
+ *
+ * ASCII characters will be percent-encoded. For example, = has a hexadecimal value of 3D, and the
+ * encoding result is %3D. Characters outside the ASCII region are encoded to NUL and map to an
+ * empty std::string.
+ */
+struct EncodingTable {
+  std::array<char[4], 256> table;
+  constexpr EncodingTable() : table{}
+  {
+    char const num_to_chars[] = "0123456789ABCDEF";
+    for (uint16_t idx = 0U; idx < table.size(); ++idx) {
+      if (idx < 128) {
+        table[idx][0] = '%';
+        table[idx][1] = num_to_chars[idx >> 4];
+        table[idx][2] = num_to_chars[idx & 0x0F];
+        table[idx][3] = '\0';
+      } else {
+        table[idx][0] = '\0';
+      }
+    }
+  }
+};
+}  // namespace
+
+std::string UrlEncoder::encode_path(std::string_view path, std::string_view chars_to_encode)
+{
+  constexpr EncodingTable encoding_table{};
+
+  std::array<bool, 256> should_encode{};
+  for (auto const c : chars_to_encode) {
+    std::size_t idx = static_cast<unsigned char>(c);
+    should_encode[idx] = true;
+  }
+
+  std::string result;
+  for (auto const c : path) {
+    std::size_t idx = static_cast<unsigned char>(c);
+    if (should_encode[idx]) {
+      // If the character is within chars_to_encode, encode it
+      result += std::string{reinterpret_cast<char const*>(encoding_table.table[idx])};
+    } else {
+      // Otherwise, pass it through
+      result += c;
+    }
+  }
+
+  return result;
+}
+
+}  // namespace kvikio::detail
diff --git a/cpp/src/error.cpp b/cpp/src/error.cpp
index 5d479b36e4..d0179c2c80 100644
--- a/cpp/src/error.cpp
+++ b/cpp/src/error.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index 11698bf30f..30f1cf335a 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include @@ -20,15 +9,16 @@ #include #include #include -#include #include #include #include +#include +#include +#include #include #include #include -#include namespace kvikio { @@ -115,7 +105,7 @@ std::size_t FileHandle::read(void* devPtr_base, KVIKIO_NVTX_FUNC_RANGE(size); if (get_compat_mode_manager().is_compat_mode_preferred()) { return detail::posix_device_read( - _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); + _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset, _file_direct_on.fd()); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -139,7 +129,7 @@ std::size_t FileHandle::write(void const* devPtr_base, if (get_compat_mode_manager().is_compat_mode_preferred()) { return detail::posix_device_write( - _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); + _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset, _file_direct_on.fd()); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -169,7 +159,7 @@ std::future FileHandle::pread(void* buf, std::size_t hostPtr_offset) -> std::size_t { char* buf = static_cast(hostPtr_base) + hostPtr_offset; return detail::posix_host_read( - _file_direct_off.fd(), buf, size, file_offset); + _file_direct_off.fd(), buf, size, file_offset, _file_direct_on.fd()); }; return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); @@ -180,7 +170,8 @@ std::future FileHandle::pread(void* buf, // Shortcut that circumvent the threadpool and use the POSIX backend directly. if (size < gds_threshold) { PushAndPopContext c(ctx); - auto bytes_read = detail::posix_device_read(_file_direct_off.fd(), buf, size, file_offset, 0); + auto bytes_read = detail::posix_device_read( + _file_direct_off.fd(), buf, size, file_offset, 0, _file_direct_on.fd()); // Maintain API consistency while making this trivial case synchronous. // The result in the future is immediately available after the call. 
return make_ready_future(bytes_read); @@ -221,7 +212,7 @@ std::future FileHandle::pwrite(void const* buf, std::size_t hostPtr_offset) -> std::size_t { char const* buf = static_cast(hostPtr_base) + hostPtr_offset; return detail::posix_host_write( - _file_direct_off.fd(), buf, size, file_offset); + _file_direct_off.fd(), buf, size, file_offset, _file_direct_on.fd()); }; return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); @@ -232,7 +223,8 @@ std::future FileHandle::pwrite(void const* buf, // Shortcut that circumvent the threadpool and use the POSIX backend directly. if (size < gds_threshold) { PushAndPopContext c(ctx); - auto bytes_write = detail::posix_device_write(_file_direct_off.fd(), buf, size, file_offset, 0); + auto bytes_write = detail::posix_device_write( + _file_direct_off.fd(), buf, size, file_offset, 0, _file_direct_on.fd()); // Maintain API consistency while making this trivial case synchronous. // The result in the future is immediately available after the call. return make_ready_future(bytes_write); @@ -332,4 +324,6 @@ const CompatModeManager& FileHandle::get_compat_mode_manager() const noexcept return _compat_mode_manager; } +bool FileHandle::is_direct_io_supported() const noexcept { return _file_direct_on.fd() != -1; } + } // namespace kvikio diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp index 3c7951effe..33c122b652 100644 --- a/cpp/src/file_utils.cpp +++ b/cpp/src/file_utils.cpp @@ -1,32 +1,25 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 #include #include #include #include + +#include #include +#include +#include #include #include #include +#include #include #include #include -#include #include
 
 namespace kvikio {
@@ -127,11 +120,11 @@ int open_fd_parse_flags(std::string const& flags, bool o_direct)
   switch (flags[0]) {
     case 'r':
       file_flags = O_RDONLY;
-      if (flags[1] == '+') { file_flags = O_RDWR; }
+      if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; }
       break;
     case 'w':
       file_flags = O_WRONLY;
-      if (flags[1] == '+') { file_flags = O_RDWR; }
+      if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; }
       file_flags |= O_CREAT | O_TRUNC;
       break;
     case 'a': KVIKIO_FAIL("Open flag 'a' isn't supported", std::invalid_argument);
@@ -165,6 +158,18 @@ int open_fd(std::string const& file_path, std::string const& flags, bool o_direc
   return ret;
 }
 
+[[nodiscard]] std::size_t get_file_size(std::string const& file_path)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  std::string const flags{"r"};
+  bool const o_direct{false};
+  mode_t const mode{FileHandle::m644};
+  auto fd = open_fd(file_path, flags, o_direct, mode);
+  auto result = get_file_size(fd);
+  SYSCALL_CHECK(close(fd));
+  return result;
+}
+
 [[nodiscard]] std::size_t get_file_size(int file_descriptor)
 {
   KVIKIO_NVTX_FUNC_RANGE();
@@ -209,4 +214,43 @@ std::pair<std::size_t, std::size_t> get_page_cache_info(int fd)
   SYSCALL_CHECK(munmap(addr, file_size));
   return {num_pages_in_page_cache, num_pages};
 }
+
+bool clear_page_cache(bool reclaim_dentries_and_inodes, bool clear_dirty_pages)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  if (clear_dirty_pages) { sync(); }
+  std::string param = reclaim_dentries_and_inodes ? "3" : "1";
+
+  auto exec_cmd = [](std::string_view cmd) -> bool {
+    // Prevent the output from the command from mixing with the original process' output.
+    fflush(nullptr);
+    // popen only handles stdout. Switch stderr and stdout to only capture stderr.
+    auto const redirected_cmd =
+      std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
+    std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
+    KVIKIO_EXPECT(pipe != nullptr, "popen() failed", GenericSystemError);
+
+    std::array<char, 128> buffer;
+    std::string error_out;
+    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+      error_out += buffer.data();
+    }
+    return error_out.empty();
+  };
+
+  std::array cmds{
+    // Special case:
+    // - Unprivileged users who cannot execute `/usr/bin/sudo` but can execute `/sbin/sysctl`, and
+    // - Superuser
+    std::string{"/sbin/sysctl vm.drop_caches=" + param},
+    // General case:
+    // - Unprivileged users who can execute `sudo`, and
+    // - Superuser
+    std::string{"sudo /sbin/sysctl vm.drop_caches=" + param}};
+
+  for (auto const& cmd : cmds) {
+    if (exec_cmd(cmd)) { return true; }
+  }
+  return false;
+}
 } // namespace kvikio
diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp
new file mode 100644
index 0000000000..938ab396c8
--- /dev/null
+++ b/cpp/src/hdfs.cpp
@@ -0,0 +1,140 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace kvikio {
+
+WebHdfsEndpoint::WebHdfsEndpoint(std::string url, std::optional<std::string> username)
+  : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username(std::move(username))
+{
+  // Extract two parts from the URL: components before (not including) the query and the query
+  // itself
+  std::regex static const url_pattern{R"(^([^?]+)\?([^#]*))"};
+  // Regex meaning:
+  // ^: From the start of the line
+  // [^?]+: Matches non-question-mark characters one or more times. The question mark ushers in the
+  // URL query component.
+  // \?: Matches the question mark, which needs to be escaped.
+  // [^#]*: Matches the non-pound characters zero or more times. The pound sign ushers in the URL
+  // fragment component. It is very likely that this part does not exist.
+  std::smatch url_match_results;
+  bool found_query = std::regex_search(url, url_match_results, url_pattern);
+  if (!found_query) {
+    _url = url;
+  } else {
+    // URL components before (not including) the query
+    _url = url_match_results[1].str();
+
+    auto query = url_match_results[2].str();
+
+    // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in
+    // the query
+    std::regex static const username_pattern{R"(user.name=([^&]+))"};
+    // Regex meaning:
+    // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits
+    // different parameters.
+    std::smatch username_match_results;
+    bool found_username = std::regex_search(query, username_match_results, username_pattern);
+    if (found_username) { _username = username_match_results[1].str(); }
+  }
+
+  // If the username is not specified by function parameter `username` or by the query string, check
+  // the environment variable
+  if (!_username.has_value()) {
+    auto const* env_val = std::getenv("KVIKIO_WEBHDFS_USERNAME");
+    if (env_val != nullptr) { _username = env_val; }
+  }
+}
+
+WebHdfsEndpoint::WebHdfsEndpoint(std::string host,
+                                 std::string port,
+                                 std::string file_path,
+                                 std::optional<std::string> username)
+  : RemoteEndpoint{RemoteEndpointType::WEBHDFS}
+{
+  std::stringstream ss;
+  ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path;
+  _url = ss.str();
+  _username = detail::unwrap_or_env(std::move(username), "KVIKIO_WEBHDFS_USERNAME");
+}
+
+std::string WebHdfsEndpoint::str() const { return _url; }
+
+void WebHdfsEndpoint::setopt(CurlHandle& curl)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  curl.setopt(CURLOPT_URL, _url.c_str());
+  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
+}
+
+std::size_t WebHdfsEndpoint::get_file_size()
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+
+  std::stringstream ss;
+  ss << _url << "?";
+  if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; }
+  ss << "op=GETFILESTATUS";
+
+  auto curl = create_curl_handle();
+  curl.setopt(CURLOPT_URL, ss.str().c_str());
+  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
+
+  std::string response;
+  curl.setopt(CURLOPT_WRITEDATA, static_cast<void*>(&response));
+  curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_get_string_response);
+
+  curl.perform();
+
+  long http_status_code{};
+  curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code);
+  KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful.");
+
+  // The response is in JSON format. The file size is given by `"length":`.
+  std::regex static const pattern{R"("length"\s*:\s*(\d+)[^\d])"};
+  // Regex meaning:
+  // \s*: Matches the space character zero or more times.
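+  //      (applied on both sides of the colon, so e.g. `"length" : 24930` also matches)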
+ // \d+: Matches the digit one or more times. + // [^\d]: Matches a non-digit character. + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT( + found, "Regular expression search failed. Cannot extract file length from the JSON response."); + return std::stoull(match_results[1].str()); +} + +void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + // WebHDFS does not support CURLOPT_RANGE. The range is specified as query parameters in the URL. + KVIKIO_NVTX_FUNC_RANGE(); + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=OPEN&offset=" << file_offset << "&length=" << size; + curl.setopt(CURLOPT_URL, ss.str().c_str()); +} + +bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + std::regex static const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", + std::regex_constants::icase); + return std::regex_match(url, pattern); + } catch (...) { + return false; + } +} +} // namespace kvikio diff --git a/cpp/src/http_status_codes.cpp b/cpp/src/http_status_codes.cpp index 9b9cd3d793..94d9d52659 100644 --- a/cpp/src/http_status_codes.cpp +++ b/cpp/src/http_status_codes.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include @@ -31,13 +20,13 @@ std::vector parse_http_status_codes(std::string_view env_var_name, std::string const& status_codes) { // Ensure `status_codes` consists only of 3-digit integers separated by commas, allowing spaces. - std::regex const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); + std::regex static const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); KVIKIO_EXPECT(std::regex_match(status_codes, check_pattern), std::string{env_var_name} + ": invalid format, expected comma-separated integers.", std::invalid_argument); // Match every integer in `status_codes`. - std::regex const number_pattern(R"(\d+)"); + std::regex static const number_pattern(R"(\d+)"); // For each match, we push_back `std::stoi(match.str())` into `ret`. std::vector ret; diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp new file mode 100644 index 0000000000..a720fa8929 --- /dev/null +++ b/cpp/src/mmap.cpp @@ -0,0 +1,482 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +namespace detail { +/** + * @brief Prevent the compiler from optimizing away the read of a byte from a given address + * + * @param addr The address to read from + */ +void disable_read_optimization(void* addr) +{ + auto addr_byte = static_cast(addr); + std::byte tmp{}; + asm volatile("" : "+r,m"(tmp = *addr_byte) : : "memory"); +} + +/** + * @brief Change an address `p` by a signed difference of `v` + * + * @tparam Integer Signed integer type + * @param p An address + * @param v Change of `p` in bytes + * @return A new address as a result of applying `v` on `p` + * + * @note Technically, if the initial pointer is non-null, or does not point to an element of an + * array object, (p + v) is undefined behavior (https://eel.is/c++draft/expr.add#4). However, + * (p + v) on dynamic allocation is generally acceptable in practice, as long as users guarantee + * that the resulting pointer points to a valid region. + */ +template +void* pointer_add(void* p, Integer v) +{ + static_assert(std::is_integral_v); + return static_cast(p) + v; +} + +/** + * @brief The distance in bytes between pointer `p1` and `p2` + * + * @param p1 The first pointer + * @param p2 The second pointer + * @return Signed result of (`p1` - `p2`). Both pointers are cast to std::byte* before subtraction. + * + * @note Technically, if two pointers do not point to elements from the same array, (p1 - p2) is + * undefined behavior (https://eel.is/c++draft/expr.add#5). However, (p1 - p2) on dynamic allocation + * is generally acceptable in practice, as long as users guarantee that both pointers are within the + * valid region. + */ +std::ptrdiff_t pointer_diff(void* p1, void* p2) +{ + return static_cast(p1) - static_cast(p2); +} + +/** + * @brief Whether the current device supports address translation service (ATS), whereby the CPU and + * GPU share a single page table. + * + * @return Boolean answer + */ +bool is_ats_available() +{ + // Memoize the ATS availability record of all devices + static auto const ats_availability = []() -> auto { + std::unordered_map result; + int num_devices{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGetCount(&num_devices)); + for (int device_ordinal = 0; device_ordinal < num_devices; ++device_ordinal) { + CUdevice device_handle{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&device_handle, device_ordinal)); + int attr{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGetAttribute( + &attr, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + device_handle)); + result[device_handle] = attr; + } + return result; + }(); + + // Get current device + CUdevice device_handle{}; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetDevice(&device_handle)); + + // Look up the record + return ats_availability.at(device_handle); +} + +/** + * @brief For the specified memory range, touch the first byte of each page to cause page fault. + * + * For the first page, if the starting address is not aligned to the page boundary, the byte at + * that address is touched. 
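+ *
+ * A minimal usage sketch (illustrative only; `mapped_addr` stands for an address previously
+ * returned by mmap):
+ * @code
+ * // Touch every page of a 1 MiB mapped region (256 pages with 4 KiB pages)
+ * std::size_t touched = detail::perform_prefault(mapped_addr, std::size_t{1} << 20);
+ * @endcode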
+ * + * @param buf The starting memory address + * @param size The size in bytes of the memory range + * @return The number of bytes touched + */ +std::size_t perform_prefault(void* buf, std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + auto const page_size = get_page_size(); + auto aligned_addr = detail::align_up(buf, page_size); + + std::size_t touched_bytes{0}; + + // If buf is not aligned, read the byte at buf. + auto num_bytes = detail::pointer_diff(aligned_addr, buf); + if (num_bytes > 0) { + detail::disable_read_optimization(buf); + touched_bytes += num_bytes; + if (size >= num_bytes) { size -= num_bytes; } + } + + if (num_bytes >= size) { return touched_bytes; } + + while (size > 0) { + detail::disable_read_optimization(aligned_addr); + if (size >= page_size) { + aligned_addr = detail::pointer_add(aligned_addr, page_size); + size -= page_size; + touched_bytes += page_size; + } else { + touched_bytes += size; + break; + } + } + return touched_bytes; +} + +/** + * @brief Implementation of read + * + * Copy data from the source buffer `src_mapped_buf + buf_offset` to the destination buffer + * `dst_buf + buf_offset`. + * + * @param dst_buf Address of the host or device memory (destination buffer) + * @param src_mapped_buf Address of the host memory (source buffer) + * @param size Size in bytes to read + * @param buf_offset Offset for both `dst_buf` and `src_mapped_buf` + * @param is_dst_buf_host_mem Whether the destination buffer is host memory or not + * @param ctx CUDA context when the destination buffer is not host memory + */ +void read_impl(void* dst_buf, + void* src_mapped_buf, + std::size_t size, + std::size_t buf_offset, + bool is_dst_buf_host_mem, + CUcontext ctx) +{ + KVIKIO_NVTX_FUNC_RANGE(); + auto const src = detail::pointer_add(src_mapped_buf, buf_offset); + auto const dst = detail::pointer_add(dst_buf, buf_offset); + + if (is_dst_buf_host_mem) { + // std::memcpy implicitly performs prefault for the mapped memory. + std::memcpy(dst, src, size); + return; + } + + // Empirically, take the following steps to achieve good performance: + // - On C2C: + // - Explicitly prefault + // - Copy from the mapped memory (pageable) to the device buffer + // - On PCIe: + // - std::memcpy from the mapped memory to the pinned bounce buffer (which implicitly + // prefaults) + // - Copy from the bounce buffer to the device buffer + + PushAndPopContext c(ctx); + CUstream stream = detail::StreamsByThread::get(); + + auto h2d_batch_cpy_sync = + [](CUdeviceptr dst_devptr, CUdeviceptr src_devptr, std::size_t size, CUstream stream) { +#if CUDA_VERSION >= 12080 + if (cudaAPI::instance().MemcpyBatchAsync) { + CUmemcpyAttributes attrs{}; + std::size_t attrs_idxs[] = {0}; + attrs.srcAccessOrder = CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM; + CUDA_DRIVER_TRY( + cudaAPI::instance().MemcpyBatchAsync(&dst_devptr, + &src_devptr, + &size, + static_cast(1) /* count */, + &attrs, + attrs_idxs, + static_cast(1) /* num_attrs */, +#if CUDA_VERSION < 13000 + static_cast(nullptr), +#endif + stream)); + } else { + // Fall back to the conventional H2D copy if the batch copy API is not available. 
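+        // (The driver symbol is resolved dynamically at runtime, so MemcpyBatchAsync can be
+        // null on older drivers even when KvikIO was compiled against CUDA 12.8 or newer.)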
+ CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( + dst_devptr, reinterpret_cast(src_devptr), size, stream)); + } +#else + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( + dst_devptr, reinterpret_cast(src_devptr), size, stream)); +#endif + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + }; + + auto dst_devptr = convert_void2deviceptr(dst); + CUdeviceptr src_devptr{}; + if (detail::is_ats_available()) { + perform_prefault(src, size); + src_devptr = convert_void2deviceptr(src); + h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); + } else { + auto bounce_buffer = CudaPinnedBounceBufferPool::instance().get(); + std::memcpy(bounce_buffer.get(), src, size); + src_devptr = convert_void2deviceptr(bounce_buffer.get()); + h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); + } +} + +} // namespace detail + +// |--> file start |<--page_size-->| +// | +// (0) |...............|...............|...............|...............|............ +// +// (1) |<---_initial_map_offset-->|<---------------_initial_map_size--------------->| +// |--> _buf +// +// (2) |<-_map_offset->|<----------------------_map_size----------------------->| +// |--> _map_addr +// +// (3) |<--------------------------offset--------------------->|<--size-->| +// |--> _buf +// +// (0): Layout of the file-backed memory mapping if the whole file were mapped +// (1): At mapping handle construction time, the member `_initial_map_offset` and +// `_initial_map_size` determine the mapped region +// (2): `_map_addr` is the page aligned address returned by `mmap`. `_map_offset` is the adjusted +// offset. +// (3): At read time, the argument `offset` and `size` determine the region to be read. +// This region must be a subset of the one defined at mapping handle construction time. +MmapHandle::MmapHandle(std::string const& file_path, + std::string const& flags, + std::optional initial_map_size, + std::size_t initial_map_offset, + mode_t mode, + std::optional map_flags) + : _initial_map_offset(initial_map_offset), _initialized{true} +{ + KVIKIO_NVTX_FUNC_RANGE(); + + switch (flags[0]) { + case 'r': { + _map_protection = PROT_READ; + break; + } + case 'w': { + KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); + } + default: { + KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); + } + } + + _file_wrapper = FileWrapper(file_path, flags, false /* o_direct */, mode); + _file_size = get_file_size(_file_wrapper.fd()); + if (_file_size == 0) { return; } + + { + std::stringstream ss; + ss << "Offset must be less than the file size. initial_map_offset: " << _initial_map_offset + << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT(_initial_map_offset < _file_size, ss.str(), std::out_of_range); + } + + // An initial size of std::nullopt is a shorthand for "starting from _initial_map_offset to the + // end of file". + _initial_map_size = + initial_map_size.has_value() ? initial_map_size.value() : (_file_size - _initial_map_offset); + + KVIKIO_EXPECT( + _initial_map_size > 0, "Mapped region should not be zero byte", std::invalid_argument); + + { + std::stringstream ss; + ss << "Mapped region is past the end of file. 
initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT( + _initial_map_offset + _initial_map_size <= _file_size, ss.str(), std::out_of_range); + } + + auto const page_size = get_page_size(); + _map_offset = detail::align_down(_initial_map_offset, page_size); + auto const offset_delta = _initial_map_offset - _map_offset; + _map_size = _initial_map_size + offset_delta; + _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; + _map_addr = + mmap(nullptr, _map_size, _map_protection, _map_flags, _file_wrapper.fd(), _map_offset); + SYSCALL_CHECK(_map_addr, "Cannot create memory mapping", MAP_FAILED); + _buf = detail::pointer_add(_map_addr, offset_delta); +} + +MmapHandle::MmapHandle(MmapHandle&& o) noexcept + : _buf{std::exchange(o._buf, {})}, + _initial_map_size{std::exchange(o._initial_map_size, {})}, + _initial_map_offset{std::exchange(o._initial_map_offset, {})}, + _file_size{std::exchange(o._file_size, {})}, + _map_offset{std::exchange(o._map_offset, {})}, + _map_size{std::exchange(o._map_size, {})}, + _map_addr{std::exchange(o._map_addr, {})}, + _initialized{std::exchange(o._initialized, {})}, + _map_protection{std::exchange(o._map_protection, {})}, + _map_flags{std::exchange(o._map_flags, {})}, + _file_wrapper{std::exchange(o._file_wrapper, {})} +{ +} + +MmapHandle& MmapHandle::operator=(MmapHandle&& o) noexcept +{ + close(); + _buf = std::exchange(o._buf, {}); + _initial_map_size = std::exchange(o._initial_map_size, {}); + _initial_map_offset = std::exchange(o._initial_map_offset, {}); + _file_size = std::exchange(o._file_size, {}); + _map_offset = std::exchange(o._map_offset, {}); + _map_size = std::exchange(o._map_size, {}); + _map_addr = std::exchange(o._map_addr, {}); + _initialized = std::exchange(o._initialized, {}); + _map_protection = std::exchange(o._map_protection, {}); + _map_flags = std::exchange(o._map_flags, {}); + _file_wrapper = std::exchange(o._file_wrapper, {}); + return *this; +} + +MmapHandle::~MmapHandle() noexcept +{ + KVIKIO_NVTX_FUNC_RANGE(); + close(); +} + +bool MmapHandle::closed() const noexcept { return !_initialized; } + +void MmapHandle::close() noexcept +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (closed() || _map_addr == nullptr) { return; } + try { + auto ret = munmap(_map_addr, _map_size); + SYSCALL_CHECK(ret); + } catch (...) 
{ + } + _buf = {}; + _initial_map_size = {}; + _initial_map_offset = {}; + _file_size = {}; + _map_offset = {}; + _map_size = {}; + _map_addr = {}; + _initialized = {}; + _map_protection = {}; + _map_flags = {}; + _file_wrapper = {}; +} + +std::size_t MmapHandle::initial_map_size() const noexcept { return _initial_map_size; } + +std::size_t MmapHandle::initial_map_offset() const noexcept { return _initial_map_offset; } + +std::size_t MmapHandle::file_size() const +{ + if (closed()) { return 0; } + return get_file_size(_file_wrapper.fd()); +} + +std::size_t MmapHandle::nbytes() const { return file_size(); } + +std::size_t MmapHandle::read(void* buf, std::optional size, std::size_t offset) +{ + KVIKIO_NVTX_FUNC_RANGE(); + + auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return actual_size; } + + auto const is_dst_buf_host_mem = is_host_memory(buf); + CUcontext ctx{}; + if (!is_dst_buf_host_mem) { ctx = get_context_from_pointer(buf); } + + // Copy `actual_size` bytes from `src_mapped_buf` (src) to `buf` (dst) + auto const src_mapped_buf = detail::pointer_add(_buf, offset - _initial_map_offset); + detail::read_impl(buf, src_mapped_buf, actual_size, 0, is_dst_buf_host_mem, ctx); + return actual_size; +} + +std::future MmapHandle::pread(void* buf, + std::optional size, + std::size_t offset, + std::size_t task_size) +{ + KVIKIO_EXPECT(task_size <= defaults::bounce_buffer_size(), + "bounce buffer size cannot be less than task size."); + auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return make_ready_future(actual_size); } + + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_FUNC_RANGE(actual_size, nvtx_color); + + auto const is_dst_buf_host_mem = is_host_memory(buf); + CUcontext ctx{}; + if (!is_dst_buf_host_mem) { ctx = get_context_from_pointer(buf); } + + // Copy `actual_size` bytes from `src_mapped_buf` (src) to `buf` (dst) + auto const src_mapped_buf = detail::pointer_add(_buf, offset - _initial_map_offset); + auto op = + [this, src_mapped_buf = src_mapped_buf, is_dst_buf_host_mem = is_dst_buf_host_mem, ctx = ctx]( + void* dst_buf, + std::size_t size, + std::size_t, // offset will be taken into account by dst_buf, hence no longer used here + std::size_t buf_offset // buf_offset will be incremented for each individual task + ) -> std::size_t { + detail::read_impl(dst_buf, src_mapped_buf, size, buf_offset, is_dst_buf_host_mem, ctx); + return size; + }; + + return parallel_io(op, + buf, + actual_size, + offset, + task_size, + 0, // dst buffer offset initial value + call_idx, + nvtx_color); +} + +std::size_t MmapHandle::validate_and_adjust_read_args(std::optional const& size, + std::size_t offset) +{ + { + std::stringstream ss; + KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); + + ss << "Offset is past the end of file. offset: " << offset << ", file size: " << _file_size + << "\n"; + KVIKIO_EXPECT(offset <= _file_size, ss.str(), std::out_of_range); + } + + auto actual_size = size.has_value() ? size.value() : _file_size - offset; + + { + std::stringstream ss; + ss << "Read is out of bound. 
offset: " << offset << ", actual size to read: " << actual_size + << ", initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << "\n"; + KVIKIO_EXPECT(offset >= _initial_map_offset && + offset + actual_size <= _initial_map_offset + _initial_map_size, + ss.str(), + std::out_of_range); + } + return actual_size; +} + +} // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 485e0739ac..6004515b76 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -1,33 +1,29 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include #include #include #include +#include #include #include #include +#include #include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include +#include #include #include #include @@ -42,11 +38,11 @@ namespace { * @note Is not thread-safe. */ class BounceBufferH2D { - CUstream _stream; // The CUDA stream to use. - CUdeviceptr _dev; // The output device buffer. - AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. - std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. - std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). + CUstream _stream; // The CUDA stream to use. + CUdeviceptr _dev; // The output device buffer. + CudaPinnedBounceBufferPool::Buffer _host_buffer; // The host buffer to bounce data on. + std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. + std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). public: /** @@ -58,7 +54,7 @@ class BounceBufferH2D { BounceBufferH2D(CUstream stream, void* device_buffer) : _stream{stream}, _dev{convert_void2deviceptr(device_buffer)}, - _host_buffer{AllocRetain::instance().get()} + _host_buffer{CudaPinnedBounceBufferPool::instance().get()} { KVIKIO_NVTX_FUNC_RANGE(); } @@ -133,40 +129,156 @@ class BounceBufferH2D { } }; +/** + * @brief Get the file size, if using `HEAD` request to obtain the content-length header is + * permitted. + * + * This function works for the `HttpEndpoint` and `S3Endpoint`, but not for + * `S3EndpointWithPresignedUrl`, which does not allow `HEAD` request. 
+ *
+ * @param endpoint The remote endpoint
+ * @param url The URL of the remote file
+ * @return The file size
+ */
+std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string const& url)
+{
+  auto curl = create_curl_handle();
+
+  endpoint.setopt(curl);
+  curl.setopt(CURLOPT_NOBODY, 1L);
+  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
+  curl.perform();
+  curl_off_t cl;
+  curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl);
+  KVIKIO_EXPECT(
+    cl >= 0,
+    "cannot get size of " + endpoint.str() + ", content-length not provided by the server",
+    std::runtime_error);
+  return static_cast<std::size_t>(cl);
+}
+
+/**
+ * @brief Set up the range request for libcurl. Use this method when the HTTP range request is
+ * supported.
+ *
+ * @param curl A curl handle
+ * @param file_offset File offset
+ * @param size Read size
+ */
+void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::size_t size)
+{
+  std::string const byte_range =
+    std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1);
+  curl.setopt(CURLOPT_RANGE, byte_range.c_str());
+}
+
+/**
+ * @brief Whether the given URL is compatible with the S3 endpoint (including the credential-based
+ * access and presigned URL) which uses HTTP/HTTPS.
+ *
+ * @param url A URL.
+ * @return Boolean answer.
+ */
+bool url_has_aws_s3_http_format(std::string const& url)
+{
+  // Currently KvikIO supports the following AWS S3 HTTP URL formats:
+  static std::array const s3_patterns = {
+    // Virtual host style: https://<bucket>.s3.<region>.amazonaws.com/<key>
+    std::regex(R"(https?://[^/]+\.s3\.[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase),
+
+    // Path style (deprecated but still popular):
+    // https://s3.<region>.amazonaws.com/<bucket>/<key>
+    std::regex(R"(https?://s3\.[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase),
+
+    // Legacy global endpoint: no region code
+    std::regex(R"(https?://[^/]+\.s3\.amazonaws\.com/.+$)", std::regex_constants::icase),
+    std::regex(R"(https?://s3\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase),
+
+    // Legacy regional endpoint: s3 and region code are delimited by - instead of .
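+    //   e.g. https://my-bucket.s3-us-west-2.amazonaws.com/my-key (bucket and key are placeholders)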
+ std::regex(R"(https?://[^/]+\.s3-[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3-[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase)}; + + return std::any_of(s3_patterns.begin(), s3_patterns.end(), [&url = url](auto const& pattern) { + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + }); +} + +char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_type) +{ + switch (remote_endpoint_type) { + case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PUBLIC: return "S3 public"; + case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; + case RemoteEndpointType::WEBHDFS: return "WebHDFS"; + case RemoteEndpointType::HTTP: return "HTTP"; + case RemoteEndpointType::AUTO: return "AUTO"; + default: + // Unreachable + KVIKIO_FAIL("Unknown RemoteEndpointType: " + + std::to_string(static_cast(remote_endpoint_type))); + return "UNKNOWN"; + } +} + +std::string encode_special_chars_in_path(std::string const& url) +{ + auto components = detail::UrlParser::parse(url); + components.path = detail::UrlEncoder::encode_path(components.path.value()); + return detail::UrlBuilder::build_manually(components); +} } // namespace -HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} +RemoteEndpoint::RemoteEndpoint(RemoteEndpointType remote_endpoint_type) + : _remote_endpoint_type{remote_endpoint_type} +{ +} + +RemoteEndpointType RemoteEndpoint::remote_endpoint_type() const noexcept +{ + return _remote_endpoint_type; +} + +HttpEndpoint::HttpEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::HTTP}, _url{std::move(url)} +{ +} std::string HttpEndpoint::str() const { return _url; } -void HttpEndpoint::setopt(CurlHandle& curl) +std::size_t HttpEndpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); - curl.setopt(CURLOPT_URL, _url.c_str()); + return get_file_size_using_head_impl(*this, _url); } -void S3Endpoint::setopt(CurlHandle& curl) +void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) { - KVIKIO_NVTX_FUNC_RANGE(); - curl.setopt(CURLOPT_URL, _url.c_str()); - curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); - curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); - if (_curl_header_list) { curl.setopt(CURLOPT_HTTPHEADER, _curl_header_list); } + setup_range_request_impl(curl, file_offset, size); } -std::string S3Endpoint::unwrap_or_default(std::optional aws_arg, - std::string const& env_var, - std::string const& err_msg) +bool HttpEndpoint::is_url_valid(std::string const& url) noexcept { - KVIKIO_NVTX_FUNC_RANGE(); - if (aws_arg.has_value()) { return std::move(*aws_arg); } + try { + auto parsed_url = detail::UrlParser::parse(url); + if ((parsed_url.scheme != "http") && (parsed_url.scheme != "https")) { return false; }; - char const* env = std::getenv(env_var.c_str()); - if (env == nullptr) { - if (err_msg.empty()) { return std::string(); } - KVIKIO_FAIL(err_msg, std::invalid_argument); + // Check whether the file path exists, excluding the leading "/" + return parsed_url.path->length() > 1; + } catch (...) 
{ + return false; } - return std::string(env); +} + +void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + +void S3Endpoint::setopt(CurlHandle& curl) +{ + auto new_url = encode_special_chars_in_path(_url); + curl.setopt(CURLOPT_URL, new_url.c_str()); + + curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); + curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); + if (_curl_header_list) { curl.setopt(CURLOPT_HTTPHEADER, _curl_header_list); } } std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, @@ -175,17 +287,19 @@ std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, std::optional aws_endpoint_url) { KVIKIO_NVTX_FUNC_RANGE(); - auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); + auto const endpoint_url = detail::unwrap_or_env(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); std::stringstream ss; - if (endpoint_url.empty()) { + if (!endpoint_url.has_value()) { auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - // We default to the official AWS url scheme. - ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; + detail::unwrap_or_env(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + // "s3" is a non-standard URI scheme used by AWS CLI and AWS SDK, and cannot be identified by + // libcurl. A valid HTTP/HTTPS URL needs to be constructed for use in libcurl. Here the AWS + // virtual host style is used. + ss << "https://" << bucket_name << ".s3." << region.value() << ".amazonaws.com/" << object_name; } else { - ss << endpoint_url << "/" << bucket_name << "/" << object_name; + ss << endpoint_url.value() << "/" << bucket_name << "/" << object_name; } return ss.str(); } @@ -194,7 +308,7 @@ std::pair S3Endpoint::parse_s3_url(std::string const& { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match s3:/// - std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::regex static const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; std::smatch matches; if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } KVIKIO_FAIL("Input string does not match the expected S3 URL format.", std::invalid_argument); @@ -206,26 +320,26 @@ S3Endpoint::S3Endpoint(std::string url, std::optional aws_access_key, std::optional aws_secret_access_key, std::optional aws_session_token) - : _url{std::move(url)} + : RemoteEndpoint{RemoteEndpointType::S3}, _url{std::move(url)} { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// - std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + std::regex static const pattern{R"(^https?://.*)", std::regex_constants::icase}; KVIKIO_EXPECT(std::regex_search(_url, pattern), "url must start with http:// or https://", std::invalid_argument); auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + detail::unwrap_or_env(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); auto const access_key = - unwrap_or_default(std::move(aws_access_key), - "AWS_ACCESS_KEY_ID", - "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); + detail::unwrap_or_env(std::move(aws_access_key), + 
"AWS_ACCESS_KEY_ID", + "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); - auto const secret_access_key = unwrap_or_default( + auto const secret_access_key = detail::unwrap_or_env( std::move(aws_secret_access_key), "AWS_SECRET_ACCESS_KEY", "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set."); @@ -233,7 +347,7 @@ S3Endpoint::S3Endpoint(std::string url, // Create the CURLOPT_AWS_SIGV4 option { std::stringstream ss; - ss << "aws:amz:" << region << ":s3"; + ss << "aws:amz:" << region.value() << ":s3"; _aws_sigv4 = ss.str(); } // Create the CURLOPT_USERPWD option @@ -242,21 +356,21 @@ S3Endpoint::S3Endpoint(std::string url, // { std::stringstream ss; - ss << access_key << ":" << secret_access_key; + ss << access_key.value() << ":" << secret_access_key.value(); _aws_userpwd = ss.str(); } // Access key IDs beginning with ASIA are temporary credentials that are created using AWS STS // operations. They need a session token to work. - if (access_key.compare(0, 4, std::string("ASIA")) == 0) { + if (access_key->compare(0, 4, std::string("ASIA")) == 0) { // Create a Custom Curl header for the session token. // The _curl_header_list created by curl_slist_append must be manually freed // (see https://curl.se/libcurl/c/CURLOPT_HTTPHEADER.html) auto session_token = - unwrap_or_default(std::move(aws_session_token), - "AWS_SESSION_TOKEN", - "When using temporary credentials, AWS_SESSION_TOKEN must be set."); + detail::unwrap_or_env(std::move(aws_session_token), + "AWS_SESSION_TOKEN", + "When using temporary credentials, AWS_SESSION_TOKEN must be set."); std::stringstream ss; - ss << "x-amz-security-token: " << session_token; + ss << "x-amz-security-token: " << session_token.value(); _curl_header_list = curl_slist_append(NULL, ss.str().c_str()); KVIKIO_EXPECT(_curl_header_list != nullptr, "Failed to create curl header for AWS token", @@ -286,31 +400,294 @@ S3Endpoint::~S3Endpoint() { curl_slist_free_all(_curl_header_list); } std::string S3Endpoint::str() const { return _url; } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) - : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +std::size_t S3Endpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) { KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3Endpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url, CURLU_NON_SUPPORT_SCHEME); + + if (parsed_url.scheme == "s3") { + if (!parsed_url.host.has_value()) { return false; } + if (!parsed_url.path.has_value()) { return false; } + + // Check whether the S3 object key exists + std::regex static const pattern(R"(^/.+$)"); + return std::regex_search(parsed_url.path.value(), pattern); + } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { + return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); + } + } catch (...) 
{ + } + return false; +} + +S3PublicEndpoint::S3PublicEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::S3_PUBLIC}, _url{std::move(url)} +{ +} + +void S3PublicEndpoint::setopt(CurlHandle& curl) +{ + auto new_url = encode_special_chars_in_path(_url); + curl.setopt(CURLOPT_URL, new_url.c_str()); +} + +std::string S3PublicEndpoint::str() const { return _url; } + +std::size_t S3PublicEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); +} + +void S3PublicEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3PublicEndpoint::is_url_valid(std::string const& url) noexcept +{ + return S3Endpoint::is_url_valid(url); +} + +S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) + : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)} +{ +} + +void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl) +{ + curl.setopt(CURLOPT_URL, _url.c_str()); +} + +std::string S3EndpointWithPresignedUrl::str() const { return _url; } + +namespace { +/** + * @brief Callback for the `CURLOPT_HEADERFUNCTION` parameter in libcurl + * + * The header callback is called once for each header and only complete header lines are passed on + * to the callback. The provided header line is not null-terminated. + * + * @param data Transfer buffer where new data is received + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes The size of new data received + * @param userdata User-defined data + * @return The number of bytes consumed by the callback + * @exception std::invalid_argument if the server does not know the file size, thereby using "*" as + * the filler text in the content-range header of the HTTP message. + */ +std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* file_size = reinterpret_cast(userdata); + + // The header line is not null-terminated. This constructor overload ensures header_line.data() is + // null-terminated. + std::string const header_line{data, new_data_size}; + + // The content-range header has the format + // Content-Range: / + // Content-Range: /* + // Content-Range: */ + std::regex static const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::smatch match_result; + bool found = std::regex_search(header_line, match_result, pattern); + if (found) { + // If the file size is unknown (represented by "*" in the content-range header), string-to-long + // conversion will throw an `std::invalid_argument` exception. The exception message from + // `std::stol` is usually too concise to be useful (being simply a string of "stol"), so a + // custom exception is used instead. + try { + *file_size = std::stol(match_result[1].str()); + } catch (...) { + KVIKIO_FAIL("File size information missing on the server side.", std::invalid_argument); + } + } + return new_data_size; +} +} // namespace + +std::size_t S3EndpointWithPresignedUrl::get_file_size() +{ + // Usually the `HEAD` request is used to obtain the content-length (file size). However, AWS S3 + // does not allow it for presigned URL. The workaround here is to send the `GET` request with + // 1-byte range, so that we can still obtain the header information at a negligible cost. 
+
+std::size_t S3EndpointWithPresignedUrl::get_file_size()
+{
+  // Usually the `HEAD` request is used to obtain the content-length (file size). However, AWS S3
+  // does not allow it for presigned URLs. The workaround here is to send a `GET` request with a
+  // 1-byte range, so that we can still obtain the header information at a negligible cost. Since
+  // the content-length header is now at a fixed value of 1, we instead extract the file size
+  // from the content-range header.
+
+  KVIKIO_NVTX_FUNC_RANGE();
+  auto curl = create_curl_handle();
+  curl.setopt(CURLOPT_URL, _url.c_str());
+
+  // 1-byte range, specified in the format "<start>-<end>"
+  std::string my_range{"0-0"};
+  curl.setopt(CURLOPT_RANGE, my_range.c_str());
+
+  long file_size{};
+  curl.setopt(CURLOPT_HEADERDATA, static_cast<void*>(&file_size));
+  curl.setopt(CURLOPT_HEADERFUNCTION, callback_header);
-  endpoint->setopt(curl);
-  curl.setopt(CURLOPT_NOBODY, 1L);
-  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
   curl.perform();
-  curl_off_t cl;
-  curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl);
-  KVIKIO_EXPECT(
-    cl >= 0,
-    "cannot get size of " + endpoint->str() + ", content-length not provided by the server",
-    std::runtime_error);
-  _nbytes = cl;
+  return file_size;
+}
+
+void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl,
+                                                     std::size_t file_offset,
+                                                     std::size_t size)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  setup_range_request_impl(curl, file_offset, size);
+}
+
+bool S3EndpointWithPresignedUrl::is_url_valid(std::string const& url) noexcept
+{
+  try {
+    if (!url_has_aws_s3_http_format(url)) { return false; }
+
+    auto parsed_url = detail::UrlParser::parse(url);
+    if (!parsed_url.query.has_value()) { return false; }
+
+    // Reference: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html
+    return parsed_url.query->find("X-Amz-Algorithm") != std::string::npos &&
+           parsed_url.query->find("X-Amz-Signature") != std::string::npos;
+  } catch (...) {
+    return false;
+  }
+}
+
+RemoteHandle RemoteHandle::open(std::string url,
+                                RemoteEndpointType remote_endpoint_type,
+                                std::optional<std::vector<RemoteEndpointType>> allow_list,
+                                std::optional<std::size_t> nbytes)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  if (!allow_list.has_value()) {
+    allow_list = {RemoteEndpointType::S3,
+                  RemoteEndpointType::S3_PUBLIC,
+                  RemoteEndpointType::S3_PRESIGNED_URL,
+                  RemoteEndpointType::WEBHDFS,
+                  RemoteEndpointType::HTTP};
+  }
+
+  auto const scheme =
+    detail::UrlParser::extract_component(url, CURLUPART_SCHEME, CURLU_NON_SUPPORT_SCHEME);
+  KVIKIO_EXPECT(scheme.has_value(), "Missing scheme in URL.");
+
+  // Helper to create an endpoint based on its type
+  auto create_endpoint =
+    [&url = url, &scheme = scheme](RemoteEndpointType type) -> std::unique_ptr<RemoteEndpoint> {
+    switch (type) {
+      case RemoteEndpointType::S3:
+        if (!S3Endpoint::is_url_valid(url)) { return nullptr; }
+        if (scheme.value() == "s3") {
+          auto const [bucket, object] = S3Endpoint::parse_s3_url(url);
+          return std::make_unique<S3Endpoint>(std::pair{bucket, object});
+        }
+        return std::make_unique<S3Endpoint>(url);
+
+      case RemoteEndpointType::S3_PUBLIC:
+        if (!S3PublicEndpoint::is_url_valid(url)) { return nullptr; }
+        return std::make_unique<S3PublicEndpoint>(url);
+
+      case RemoteEndpointType::S3_PRESIGNED_URL:
+        if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; }
+        return std::make_unique<S3EndpointWithPresignedUrl>(url);
+
+      case RemoteEndpointType::WEBHDFS:
+        if (!WebHdfsEndpoint::is_url_valid(url)) { return nullptr; }
+        return std::make_unique<WebHdfsEndpoint>(url);
+
+      case RemoteEndpointType::HTTP:
+        if (!HttpEndpoint::is_url_valid(url)) { return nullptr; }
+        return std::make_unique<HttpEndpoint>(url);
+
+      default: return nullptr;
+    }
+  };
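A minimal usage sketch of the factory defined above (the include path is assumed and the URLs are placeholders):

#include <optional>
#include <vector>

#include <kvikio/remote_handle.hpp>  // assumed include path

void example()
{
  // Let KvikIO probe the URL against the default allowlist:
  auto h1 = kvikio::RemoteHandle::open(
    "https://bucket-name.s3.region-code.amazonaws.com/object-key-name",
    kvikio::RemoteEndpointType::AUTO,
    std::nullopt,   // use the default allowlist
    std::nullopt);  // query the file size from the server

  // Or pin the endpoint type and restrict the allowlist, failing fast on a mismatch:
  auto h2 = kvikio::RemoteHandle::open("https://example.com/data.bin",
                                       kvikio::RemoteEndpointType::HTTP,
                                       std::vector{kvikio::RemoteEndpointType::HTTP},
                                       std::nullopt);
}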
+
+  std::unique_ptr<RemoteEndpoint> endpoint;
+
+  if (remote_endpoint_type == RemoteEndpointType::AUTO) {
+    // Try each allowed type in the order of the allowlist
+    for (auto const& type : allow_list.value()) {
+      try {
+        endpoint = create_endpoint(type);
+        if (endpoint == nullptr) { continue; }
+        if (type == RemoteEndpointType::S3) {
+          // Check connectivity for the credential-based S3 endpoint, and throw an exception if it
+          // fails
+          endpoint->get_file_size();
+        }
+      } catch (...) {
+        // If the credential-based S3 endpoint cannot be used to access the URL, try the S3 public
+        // endpoint instead if it is in the allowlist
+        if (type == RemoteEndpointType::S3 &&
+            std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC) !=
+              allow_list->end()) {
+          endpoint = std::make_unique<S3PublicEndpoint>(url);
+        } else {
+          throw;
+        }
+      }
+
+      // At this point, a matching endpoint has been found
+      break;
+    }
+    KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error);
+  } else {
+    // Validate that the requested type is in the allowlist
+    KVIKIO_EXPECT(
+      std::find(allow_list->begin(), allow_list->end(), remote_endpoint_type) != allow_list->end(),
+      std::string{get_remote_endpoint_type_name(remote_endpoint_type)} +
+        " is not in the allowlist.",
+      std::runtime_error);
+
+    // Create the specific type
+    endpoint = create_endpoint(remote_endpoint_type);
+    KVIKIO_EXPECT(endpoint.get() != nullptr,
+                  std::string{"Invalid URL for "} +
+                    get_remote_endpoint_type_name(remote_endpoint_type) + " endpoint",
+                  std::runtime_error);
+  }
+
+  return nbytes.has_value() ? RemoteHandle(std::move(endpoint), nbytes.value())
+                            : RemoteHandle(std::move(endpoint));
+}
+
+RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes)
+  : _endpoint{std::move(endpoint)}, _nbytes{nbytes}
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+}
+
+RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  _nbytes   = endpoint->get_file_size();
   _endpoint = std::move(endpoint);
 }

+RemoteEndpointType RemoteHandle::remote_endpoint_type() const noexcept
+{
+  return _endpoint->remote_endpoint_type();
+}
+
 std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; }

 RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; }

@@ -397,10 +774,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_off
   bool const is_host_mem = is_host_memory(buf);
   auto curl = create_curl_handle();
   _endpoint->setopt(curl);
-
-  std::string const byte_range =
-    std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1);
-  curl.setopt(CURLOPT_RANGE, byte_range.c_str());
+  _endpoint->setup_range_request(curl, file_offset, size);

   if (is_host_mem) {
     curl.setopt(CURLOPT_WRITEFUNCTION, callback_host_memory);
diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp
index 9e5c05bc05..693dd1bd2a 100644
--- a/cpp/src/shim/cuda.cpp
+++ b/cpp/src/shim/cuda.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
@@ -21,7 +10,6 @@

 namespace kvikio {

-#ifdef KVIKIO_CUDA_FOUND
 cudaAPI::cudaAPI()
 {
   void* lib = load_library("libcuda.so.1");
@@ -31,6 +19,8 @@ cudaAPI::cudaAPI()
   // the name of the symbol through cuda.h.
   get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc));
   get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost));
+  get_symbol(MemHostRegister, lib, KVIKIO_STRINGIFY(cuMemHostRegister));
+  get_symbol(MemHostUnregister, lib, KVIKIO_STRINGIFY(cuMemHostUnregister));
   get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync));
   get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync));
   get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute));
@@ -38,19 +28,38 @@ cudaAPI::cudaAPI()
   get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent));
   get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent));
   get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent));
+  get_symbol(CtxGetDevice, lib, KVIKIO_STRINGIFY(cuCtxGetDevice));
   get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
   get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
   get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
   get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
+  get_symbol(DeviceGetCount, lib, KVIKIO_STRINGIFY(cuDeviceGetCount));
+  get_symbol(DeviceGetAttribute, lib, KVIKIO_STRINGIFY(cuDeviceGetAttribute));
   get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
   get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
   get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize));
   get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate));
   get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy));
-}
-#else
-cudaAPI::cudaAPI() { KVIKIO_FAIL("KvikIO not compiled with CUDA support", std::runtime_error); }
+  get_symbol(DriverGetVersion, lib, KVIKIO_STRINGIFY(cuDriverGetVersion));
+
+  CUDA_DRIVER_TRY(DriverGetVersion(&driver_version));
+
+#if CUDA_VERSION >= 12080
+  // cuMemcpyBatchAsync was introduced in CUDA 12.8, and its parameters were changed in CUDA 13.0.
+  try {
+    decltype(cuMemcpyBatchAsync)* fp;
+    get_symbol(fp, lib, KVIKIO_STRINGIFY(cuMemcpyBatchAsync));
+    MemcpyBatchAsync.set(fp);
+  } catch (std::runtime_error const&) {
+    // Rethrow the exception if the CUDA driver version at runtime is satisfied but
+    // cuMemcpyBatchAsync is not found.
+    if (driver_version >= 12080) { throw; }
+    // If the CUDA driver version at runtime is not satisfied, reset the wrapper. At the call site,
+    // use the conventional cuMemcpyXtoXAsync API as the fallback.
+    MemcpyBatchAsync.reset();
+  }
 #endif
+}

 cudaAPI& cudaAPI::instance()
 {
@@ -58,7 +67,6 @@ cudaAPI& cudaAPI::instance()
   return _instance;
 }

-#ifdef KVIKIO_CUDA_FOUND
 bool is_cuda_available()
 {
   try {
@@ -68,6 +76,5 @@
   }
   return true;
 }
-#endif

 }  // namespace kvikio
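The constructor above resolves every driver entry point at runtime; the helpers it relies on follow the usual dlopen/dlsym pattern. A generic sketch of that pattern (illustrative stand-ins, not KvikIO's actual helpers):

#include <dlfcn.h>

#include <stdexcept>
#include <string>

void* load_library_sketch(std::string const& name)
{
  // RTLD_NODELETE keeps the library mapped for the lifetime of the process.
  void* lib = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE);
  if (lib == nullptr) { throw std::runtime_error("cannot open " + name); }
  return lib;
}

template <typename T>
void get_symbol_sketch(T& handle, void* lib, std::string const& name)
{
  // T is a function-pointer type; dlsym returns nullptr when the symbol is absent.
  handle = reinterpret_cast<T>(dlsym(lib, name.c_str()));
  if (handle == nullptr) { throw std::runtime_error("cannot find symbol " + name); }
}

An optional entry point such as cuMemcpyBatchAsync is then probed inside a try/catch, and the wrapper is cleared when the runtime driver predates the symbol, exactly as the hunk above does.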
diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp
index 1f849263e8..0098a10deb 100644
--- a/cpp/src/shim/cufile.cpp
+++ b/cpp/src/shim/cufile.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
@@ -27,17 +16,7 @@ namespace kvikio {

 #ifdef KVIKIO_CUFILE_FOUND
 cuFileAPI::cuFileAPI()
 {
-  // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is
-  // a workaround that adds support for all prior versions of libcufile.
-  void* lib = load_library({"libcufile.so.0",
-                            "libcufile.so.1.3.0" /* 11.7.0 */,
-                            "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */,
-                            "libcufile.so.1.2.0" /* 11.6.0 */,
-                            "libcufile.so.1.1.1" /* 11.5.1 */,
-                            "libcufile.so.1.1.0" /* 11.5.0 */,
-                            "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */,
-                            "libcufile.so.1.0.1" /* 11.4.1 */,
-                            "libcufile.so.1.0.0" /* 11.4.0 */});
+  void* lib = load_library("libcufile.so.0");
   get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister));
   get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister));
   get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead));
@@ -79,21 +58,6 @@ cuFileAPI::cuFileAPI()
     get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister));
     get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister));
   }
-
-  // cuFile is supposed to open and close the driver automatically but
-  // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults:
-  // .
-  if (version < 1050) { driver_open(); }
-}
-
-// Notice, we have to close the driver at program exit (if we opened it) even though we are
-// not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the
-// driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but
-// this seems to be the lesser of two evils.
-// [1]
-cuFileAPI::~cuFileAPI()
-{
-  if (version < 1050) { driver_close(); }
 }
 #else
 cuFileAPI::cuFileAPI() { KVIKIO_FAIL("KvikIO not compiled with cuFile.h", std::runtime_error); }
@@ -147,6 +111,8 @@ int cufile_version() noexcept
     return 0;
   }
 }
+#else
+int cufile_version() noexcept { return 0; }
 #endif

 bool is_batch_api_available() noexcept { return cufile_version() >= 1060; }
diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp
index 613dad32f8..bfee920d0d 100644
--- a/cpp/src/shim/libcurl.cpp
+++ b/cpp/src/shim/libcurl.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
@@ -29,9 +18,10 @@

 #include
 #include
+#include
+#include
+#include
 #include
-#include
-#include
 #include
 #include

@@ -112,6 +102,12 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle,

   // Make requests time out after `value` seconds.
   setopt(CURLOPT_TIMEOUT, kvikio::defaults::http_timeout());
+
+  // Optionally enable verbose output if it's configured.
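+  // For example, setting the environment variable KVIKIO_REMOTE_VERBOSE to a truthy value turns
+  // on CURLOPT_VERBOSE below, making libcurl print per-transfer diagnostics to stderr, which is
+  // handy when debugging remote I/O.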
+ auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); + if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } + + detail::set_up_ca_paths(*this); } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp index ab418c3a0f..232d3a413f 100644 --- a/cpp/src/shim/utils.cpp +++ b/cpp/src/shim/utils.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include @@ -33,20 +22,6 @@ void* load_library(std::string const& name, int mode) return ret; } -void* load_library(std::vector const& names, int mode) -{ - std::stringstream ss; - for (auto const& name : names) { - ss << name << " "; - try { - return load_library(name, mode); - } catch (std::runtime_error const&) { - } - } - KVIKIO_FAIL("cannot open shared object file, tried: " + ss.str(), std::runtime_error); - return {}; -} - bool is_running_in_wsl() noexcept { try { diff --git a/cpp/src/stream.cpp b/cpp/src/stream.cpp index 4b5f454a3f..71b7f544f4 100644 --- a/cpp/src/stream.cpp +++ b/cpp/src/stream.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include @@ -21,8 +10,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index cb4a51b890..076cec5968 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -1,28 +1,17 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include +#include #include #include #include #include -#include #include -#include +#include #include #include #include @@ -55,7 +44,6 @@ CUdeviceptr convert_void2deviceptr(void const* devPtr) return reinterpret_cast(devPtr); } -#ifdef KVIKIO_CUDA_FOUND bool is_host_memory(void const* ptr) { CUpointer_attribute attrs[1] = { @@ -75,7 +63,6 @@ bool is_host_memory(void const* ptr) // does it to support `cudaMemoryTypeUnregistered`. return memtype == 0 || memtype == CU_MEMORYTYPE_HOST; } -#endif int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr) { @@ -180,4 +167,51 @@ std::tuple get_alloc_info(void const* devPtr, C return std::make_tuple(reinterpret_cast(base_ptr), base_size, offset); } +namespace detail { + +std::size_t align_up(std::size_t value, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return (value + alignment - 1) & ~(alignment - 1); +} + +void* align_up(void* addr, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + auto res = (reinterpret_cast(addr) + alignment - 1) & ~(alignment - 1); + return reinterpret_cast(res); +} + +std::size_t align_down(std::size_t value, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return value & ~(alignment - 1); +} + +void* align_down(void* addr, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + auto res = reinterpret_cast(addr) & ~(alignment - 1); + return reinterpret_cast(res); +} + +bool is_aligned(std::size_t value, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return (value & (alignment - 1)) == 0; +} + +bool is_aligned(void* addr, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return (reinterpret_cast(addr) & (alignment - 1)) == 0; +} + +} // namespace detail } // namespace kvikio diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4b4ad1049a..2effd6a559 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= enable_testing() @@ -50,16 +43,14 @@ function(kvikio_add_test) set_target_properties( ${_KVIKIO_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - # For std:: support of __int128_t. Can be removed once using cuda::std - CXX_EXTENSIONS ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON ) target_link_libraries( ${_KVIKIO_NAME} PRIVATE kvikio::kvikio GTest::gmock GTest::gmock_main GTest::gtest - GTest::gtest_main CUDA::cudart + GTest::gtest_main CUDA::cudart $ ) rapids_test_add( @@ -70,12 +61,19 @@ function(kvikio_add_test) ) endfunction() -kvikio_add_test(NAME BASIC_IO_TEST SOURCES test_basic_io.cpp) +kvikio_add_test(NAME BASIC_IO_TEST SOURCES test_basic_io.cpp utils/env.cpp) -kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp) +kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp utils/env.cpp) kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) -kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) +kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) + +if(KvikIO_REMOTE_SUPPORT) + kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) + kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME TLS_TEST SOURCES test_tls.cpp utils/env.cpp) + kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) +endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index d72ba8841f..aeda7051d0 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -1,21 +1,21 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ +#include +#include +#include +#include + #include +#include +#include #include +#include +#include + +#include "utils/env.hpp" #include "utils/utils.hpp" using namespace kvikio::test; @@ -27,15 +27,16 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(100)); - _dev_b = std::move(DevBuffer::zero_like(_dev_a)); + _dev_a = std::move(DevBuffer::arange(100)); + _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } void TearDown() override {} std::filesystem::path _filepath; - DevBuffer _dev_a; - DevBuffer _dev_b; + using value_type = std::int64_t; + DevBuffer _dev_a; + DevBuffer _dev_b; }; TEST_F(BasicIOTest, write_read) @@ -96,3 +97,130 @@ TEST_F(BasicIOTest, write_read_async) CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); } + +class DirectIOTest : public testing::Test { + public: + using value_type = std::int64_t; + + protected: + void SetUp() override + { + TempDir tmp_dir{false}; + _filepath = tmp_dir.path() / "test"; + + // Skip the fixture if Direct I/O is not supported + try { + [[maybe_unused]] auto fd = + kvikio::open_fd(_filepath.c_str(), "w", true /* o_direct */, kvikio::FileHandle::m644); + } catch (...) { + GTEST_SKIP() << "Direct I/O is not supported for the test file: " << _filepath; + } + + // Create a sequence of numbers as a ground truth + _num_elements = 1ULL * 1024ULL * 1024ULL + 1234ULL; + _total_bytes = _num_elements * sizeof(value_type); + _ground_truth.resize(_num_elements); + std::iota(_ground_truth.begin(), _ground_truth.end(), 0); + } + + void TearDown() override {} + + std::filesystem::path _filepath; + std::size_t _num_elements{}; + std::vector _ground_truth; + std::size_t _total_bytes{}; + + public: + static std::size_t constexpr page_size{4096}; + using AlignedAllocator = kvikio::test::CustomHostAllocator; + using UnalignedAllocator = kvikio::test::CustomHostAllocator; +}; + +TEST_F(DirectIOTest, pwrite) +{ + // Create host buffers (page-aligned and unaligned) and device buffer for testing + std::vector aligned_host_buf(_num_elements); + std::vector unaligned_host_buf(_num_elements); + DevBuffer dev_buf(_num_elements); + + std::array buffers{aligned_host_buf.data(), unaligned_host_buf.data(), dev_buf.ptr}; + std::array auto_direct_io_statuses{true, false}; + + for (const auto& flag : auto_direct_io_statuses) { + std::string flag_str = flag ? 
"true" : "false"; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_AUTO_DIRECT_IO_WRITE", flag_str}}; + for (const auto buf : buffers) { + // Fill up the buffer and write data to file (using KvikIO's pwrite) + { + if (kvikio::is_host_memory(buf)) { + std::memcpy(buf, _ground_truth.data(), _total_bytes); + } else { + KVIKIO_CHECK_CUDA( + cudaMemcpy(buf, _ground_truth.data(), _total_bytes, cudaMemcpyKind::cudaMemcpyDefault)); + } + + kvikio::FileHandle f(_filepath, "w"); + auto fut = f.pwrite(buf, _total_bytes); + auto num_bytes_written = fut.get(); + EXPECT_EQ(num_bytes_written, _total_bytes); + } + + // Read data from file (using Linux syscall) and check correctness + { + auto fd = open(_filepath.c_str(), O_RDONLY); + SYSCALL_CHECK(fd, "File cannot be opened"); + + std::vector result(_ground_truth.size(), 0); + SYSCALL_CHECK(read(fd, result.data(), _total_bytes)); + EXPECT_EQ(result, _ground_truth); + + SYSCALL_CHECK(close(fd)); + } + } + } +} + +TEST_F(DirectIOTest, pread) +{ + // Write ground truth data to file (using Linux syscall) + { + auto fd = open(_filepath.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kvikio::FileHandle::m644); + SYSCALL_CHECK(fd, "File cannot be opened"); + SYSCALL_CHECK(write(fd, _ground_truth.data(), _total_bytes)); + SYSCALL_CHECK(close(fd)); + } + + // Create host buffers (page-aligned and unaligned) and device buffer for testing + std::vector aligned_host_buf(_num_elements); + std::vector unaligned_host_buf(_num_elements); + DevBuffer dev_buf(_num_elements); + + std::array buffers{aligned_host_buf.data(), unaligned_host_buf.data(), dev_buf.ptr}; + std::array auto_direct_io_statuses{true, false}; + + for (const auto& flag : auto_direct_io_statuses) { + std::string flag_str = flag ? "true" : "false"; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_AUTO_DIRECT_IO_READ", flag_str}}; + for (const auto buf : buffers) { + // Read data from file (using KvikIO's pread) and check correctness + { + kvikio::FileHandle f(_filepath, "r"); + auto fut = f.pread(buf, _total_bytes); + auto num_bytes_read = fut.get(); + EXPECT_EQ(num_bytes_read, _total_bytes); + + if (kvikio::is_host_memory(buf)) { + auto* buf_helper = reinterpret_cast(buf); + for (std::size_t i = 0; i < _num_elements; ++i) { + EXPECT_EQ(buf_helper[i], _ground_truth[i]); + } + } else { + std::vector result(_num_elements); + KVIKIO_CHECK_CUDA( + cudaMemcpy(result.data(), buf, _total_bytes, cudaMemcpyKind::cudaMemcpyDefault)); + EXPECT_EQ(result, _ground_truth); + } + } + } + } +} diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index f4f3a92dc1..4209fafd6b 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -1,24 +1,21 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include +#include +#include #include +#include #include +#include "utils/env.hpp" + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + TEST(DefaultsTest, parse_compat_mode_str) { { @@ -72,3 +69,162 @@ TEST(DefaultsTest, parse_http_status_codes) } } } + +TEST(DefaultsTest, alias_for_getenv_or) +{ + // Passed initializer list is empty + { + EXPECT_THAT([=] { kvikio::getenv_or({}, 123); }, + ThrowsMessage(HasSubstr( + "`env_var_names` must contain at least one environment variable name"))); + } + + // Non-string env var has an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", ""}}; + EXPECT_THAT( + [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, + ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS="))); + } + + // Non-string env var and alias have an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", ""}, + {"KVIKIO_TEST_ALIAS_2", ""}}; + EXPECT_THAT( + [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, + ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS_2="))); + } + + // String env var has an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", ""}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::string{"abc"}); + EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS"); + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_found); + } + + // String env var and alias have an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", ""}, + {"KVIKIO_TEST_ALIAS_2", ""}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, std::string{"abc"}); + EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS_2"); + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_found); + } + + // Env var has already been set by its alias with the same value + { + kvikio::test::EnvVarContext env_var_ctx{ + {"KVIKIO_TEST_ALIAS_1", "10"}, {"KVIKIO_TEST_ALIAS_2", "10"}, {"KVIKIO_TEST_ALIAS_3", "10"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2", "KVIKIO_TEST_ALIAS_3"}, 123); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_3"}); + EXPECT_EQ(result, 10); + EXPECT_TRUE(has_found); + } + + // Env var has already been set by its alias with a different value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", "10"}, + {"KVIKIO_TEST_ALIAS_2", "20"}}; + EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, + ThrowsMessage(HasSubstr( + "Environment variable KVIKIO_TEST_ALIAS_2 (20) has already been set by its alias " + "KVIKIO_TEST_ALIAS_1 (10) with a different value"))); + } + + // Env var has invalid value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "abc"}}; + EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, + ThrowsMessage( + HasSubstr("unknown config value KVIKIO_TEST_ALIAS=abc"))); + } + + // 1st alias has a set value + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", "654.321"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_1"}); + EXPECT_EQ(result, 654.321); + EXPECT_TRUE(has_found); + } + + // 2nd alias has a set value + { + kvikio::test::EnvVarContext 
env_var_ctx{{"KVIKIO_TEST_ALIAS_2", "654.321"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_2"}); + EXPECT_EQ(result, 654.321); + EXPECT_TRUE(has_found); + } + + // Neither alias has a set value + { + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_TRUE(env_var_name.empty()); + EXPECT_EQ(result, 123.456); + EXPECT_FALSE(has_found); + } + + // Special type: bool + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "yes"}}; + auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_TRUE(result); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "OFF"}}; + auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_FALSE(result); + EXPECT_TRUE(has_found); + } + + // Special type: CompatMode + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "yes"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::ON); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "FALSE"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::OFF); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "aUtO"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::ON); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::AUTO); + EXPECT_TRUE(has_found); + } + + // Special type: std::vector + { + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "109, 108, 107"}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::vector{111, 112, 113}); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + std::vector expected{109, 108, 107}; + EXPECT_EQ(result, expected); + EXPECT_TRUE(has_found); + } +} diff --git a/cpp/tests/test_error.cpp b/cpp/tests/test_error.cpp index 27713b7830..587022389c 100644 --- a/cpp/tests/test_error.cpp +++ b/cpp/tests/test_error.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp new file mode 100644 index 0000000000..c0c49cffb2 --- /dev/null +++ b/cpp/tests/test_hdfs.cpp @@ -0,0 +1,170 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include +#include +#include + +#include "utils/hdfs_helper.hpp" +#include "utils/utils.hpp" + +// This test makes the following assumptions: +// - This KvikIO unit test is run on the "name node" of a WebHDFS server. +// - Port 9870 (default for WebHDFS) is being used to listen to the requests. +// - The environment variable `KVIKIO_WEBHDFS_USERNAME` is specified prior to the test. It contains +// a valid user +// name that has been granted access to the HDFS. +// - The user has the proper permission to create a file under the `/tmp` directory on the HDFS. +// - If the unit test is run within a Docker. The following arguments are passed to the `docker run` +// command: +// - `--network host` +// - `--env KVIKIO_WEBHDFS_USERNAME=` +// +// If any of these assumptions is not satisfied, this unit test is expected to be skipped +// gracefully. + +using value_type = double; + +namespace kvikio::test { +struct Config { + std::size_t num_elements{1024ull * 1024ull}; + std::vector host_buf; + kvikio::test::DevBuffer dev_buf; + std::string host; + std::string port; + std::string _username; + std::string remote_file_path; + bool file_created{false}; +}; +} // namespace kvikio::test + +class WebHdfsTest : public testing::Test { + protected: + static void SetUpTestSuite() + { + config.num_elements = 1024ull * 1024ull; + config.host_buf.resize(config.num_elements); + std::iota(config.host_buf.begin(), config.host_buf.end(), 0); + + config.dev_buf = kvikio::test::DevBuffer{config.host_buf}; + + config.host = "localhost"; + config.port = "9870"; + + config.remote_file_path = "/tmp/kvikio-test-webhdfs.bin"; + + auto res = std::getenv("KVIKIO_WEBHDFS_USERNAME"); + if (res) { + config._username = res; + } else { + GTEST_SKIP() << "Environment variable KVIKIO_WEBHDFS_USERNAME is not set for this test."; + } + + webhdfs_helper = + std::make_unique(config.host, config.port, config._username); + + if (!webhdfs_helper->can_connect()) { + GTEST_SKIP() << "Cannot connect to WebHDFS. Skipping all tests for this fixture."; + } + + std::span buffer{reinterpret_cast(config.host_buf.data()), + config.host_buf.size() * sizeof(value_type)}; + if (!webhdfs_helper->upload_data(buffer, config.remote_file_path)) { + GTEST_SKIP() + << "Failed to upload test data using WebHDFS. 
Skipping all tests for this fixture."; + }; + + config.file_created = true; + } + + static void TearDownTestSuite() + { + if (config.file_created) { webhdfs_helper->delete_data(config.remote_file_path); } + } + + static kvikio::test::Config config; + static std::unique_ptr webhdfs_helper; +}; + +kvikio::test::Config WebHdfsTest::config{}; +std::unique_ptr WebHdfsTest::webhdfs_helper{}; + +TEST_F(WebHdfsTest, constructor) +{ + auto do_test = [&](kvikio::RemoteHandle& remote_handle) { + kvikio::test::DevBuffer out_device_buf(config.num_elements); + auto read_size = remote_handle.read(out_device_buf.ptr, remote_handle.nbytes()); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < config.num_elements; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, remote_handle.nbytes()); + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector remote_handles; + + remote_handles.emplace_back(std::make_unique(ss.str())); + remote_handles.emplace_back(std::make_unique( + config.host, config.port, config.remote_file_path, config._username)); + + for (auto& remote_handle : remote_handles) { + do_test(remote_handle); + } +} + +TEST_F(WebHdfsTest, read_parallel) +{ + auto do_test = [&](std::string const& url, + std::size_t num_elements_to_skip, + std::size_t num_elements_to_read, + std::size_t task_size) { + kvikio::RemoteHandle remote_handle{std::make_unique(url)}; + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = remote_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto fut = remote_handle.pread(out_device_buf.ptr, expected_read_size, offset, task_size); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(ss.str(), num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp new file mode 100644 index 0000000000..9e355f4789 --- /dev/null +++ b/cpp/tests/test_mmap.cpp @@ -0,0 +1,362 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "utils/utils.hpp" + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class MmapTest : public testing::Test { + protected: + void SetUp() override + { + kvikio::test::TempDir tmp_dir{false}; + _filepath = tmp_dir.path() / "test.bin"; + std::size_t num_elements = 1024ull * 1024ull; + _host_buf = CreateTempFile(_filepath, num_elements); + _dev_buf = kvikio::test::DevBuffer{_host_buf}; + _page_size = kvikio::get_page_size(); + } + + void TearDown() override {} + + template + std::vector CreateTempFile(std::string const& filepath, std::size_t num_elements) + { + std::vector v(num_elements); + std::iota(v.begin(), v.end(), 0); + kvikio::FileHandle f(filepath, "w"); + auto fut = f.pwrite(v.data(), v.size() * sizeof(T)); + fut.get(); + _file_size = f.nbytes(); + return v; + } + + std::filesystem::path _filepath; + std::size_t _file_size; + std::size_t _page_size; + std::vector _host_buf; + using value_type = decltype(_host_buf)::value_type; + kvikio::test::DevBuffer _dev_buf; +}; + +TEST_F(MmapTest, invalid_file_open_flag) +{ + // Empty file open flag + EXPECT_THAT( + [&] { + { + kvikio::MmapHandle(_filepath, ""); + } + }, + ThrowsMessage(HasSubstr("Unknown file open flag"))); + + // Invalid file open flag + EXPECT_THAT( + [&] { + { + kvikio::MmapHandle(_filepath, "z"); + } + }, + ThrowsMessage(HasSubstr("Unknown file open flag"))); +} + +TEST_F(MmapTest, invalid_mmap_flag) +{ + EXPECT_THAT( + [&] { + { + int invalid_flag{-1}; + kvikio::MmapHandle(_filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, invalid_flag); + } + }, + ThrowsMessage(HasSubstr("Invalid argument"))); +} + +TEST_F(MmapTest, constructor_invalid_range) +{ + // init_size is too large (by 1 char) + EXPECT_THAT([&] { kvikio::MmapHandle(_filepath, "r", _file_size + 1); }, + ThrowsMessage(HasSubstr("Mapped region is past the end of file"))); + + // init_file_offset is too large (by 1 char) + EXPECT_THAT( + [&] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + ThrowsMessage(HasSubstr("Offset must be less than the file size"))); + + // init_size is 0 + EXPECT_THAT( + [&] { kvikio::MmapHandle(_filepath, "r", 0); }, + ThrowsMessage(HasSubstr("Mapped region should not be zero byte"))); +} + +TEST_F(MmapTest, constructor_valid_range) +{ + // init_size is exactly equal to file size + EXPECT_NO_THROW({ kvikio::MmapHandle(_filepath, "r", _file_size); }); + + // init_file_offset is exactly on the last char + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", std::nullopt, _file_size - 1); + EXPECT_EQ(mmap_handle.initial_map_size(), 1); + }); +} + +TEST_F(MmapTest, read_invalid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // Right bound is too large + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, _file_size); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); + + // Left bound is too large + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size + 1); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); + + EXPECT_THAT( + [&] { + 
kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size + 1); + }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // Left bound is too small + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); + + // size is too large + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size + 128, initial_file_offset); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); +} + +TEST_F(MmapTest, read_valid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // size is 0 + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size); + }); + + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size); + }); +} + +TEST_F(MmapTest, read_seq) +{ + auto do_test = [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read) { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto const read_size = mmap_handle.read(out_host_buf.data(), expected_read_size, offset); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto const read_size = mmap_handle.read(out_device_buf.ptr, expected_read_size, offset); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(num_elements_to_skip, num_elements_to_read); + } + } +} + +TEST_F(MmapTest, read_parallel) +{ + auto do_test = + [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read, std::size_t task_size) { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = mmap_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto fut = mmap_handle.pread(out_device_buf.ptr, expected_read_size, offset); + auto const read_size = fut.get(); + auto out_host_buf = 
out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} + +TEST_F(MmapTest, read_with_default_arguments) +{ + std::size_t num_elements = _file_size / sizeof(value_type); + kvikio::MmapHandle mmap_handle(_filepath, "r"); + + // host + { + std::vector out_host_buf(num_elements, {}); + + { + auto const read_size = mmap_handle.read(out_host_buf.data()); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + + { + auto fut = mmap_handle.pread(out_host_buf.data()); + auto const read_size = fut.get(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements); + + { + auto const read_size = mmap_handle.read(out_device_buf.ptr); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + + { + auto fut = mmap_handle.pread(out_device_buf.ptr); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + } +} + +TEST_F(MmapTest, closed_handle) +{ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.close(); + + EXPECT_TRUE(mmap_handle.closed()); + EXPECT_EQ(mmap_handle.file_size(), 0); + + std::size_t num_elements = _file_size / sizeof(value_type); + std::vector out_host_buf(num_elements, {}); + + EXPECT_THAT([&] { mmap_handle.read(out_host_buf.data()); }, + ThrowsMessage(HasSubstr("Cannot read from a closed MmapHandle"))); + + EXPECT_THAT([&] { mmap_handle.pread(out_host_buf.data()); }, + ThrowsMessage(HasSubstr("Cannot read from a closed MmapHandle"))); +} + +TEST_F(MmapTest, cpp_move) +{ + auto do_test = [&](kvikio::MmapHandle& mmap_handle) { + std::size_t num_elements = _file_size / sizeof(value_type); + std::vector out_host_buf(num_elements, {}); + + EXPECT_NO_THROW({ mmap_handle.read(out_host_buf.data()); }); + auto fut = mmap_handle.pread(out_host_buf.data()); + auto const read_size = fut.get(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + }; + + { + kvikio::MmapHandle mmap_handle{}; + EXPECT_TRUE(mmap_handle.closed()); + mmap_handle = kvikio::MmapHandle(_filepath, "r"); + EXPECT_FALSE(mmap_handle.closed()); + do_test(mmap_handle); + } + + { + kvikio::MmapHandle mmap_handle_1(_filepath, "r"); + kvikio::MmapHandle mmap_handle_2{std::move(mmap_handle_1)}; + EXPECT_TRUE(mmap_handle_1.closed()); + EXPECT_FALSE(mmap_handle_2.closed()); + do_test(mmap_handle_2); + } +} diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index 650f1500f3..41d975fd00 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -1,31 +1,116 @@ /* 
- * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ +#include +#include +#include +#include +#include + +#include #include +#include #include -#include #include "utils/env.hpp" -TEST(RemoteHandleTest, s3_endpoint_constructor) +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class RemoteHandleTest : public testing::Test { + protected: + void SetUp() override + { + _sample_urls = { + // Endpoint type: S3 + {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3_PUBLIC}, + {"s3://bucket-name/object-key-name-dir/object-key-name-file", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + + // Endpoint type: S3 presigned URL + {"https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=cred&" + "X-Amz-SignedHeaders=host", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + + // Endpoint type: WebHDFS + {"https://host:1234/webhdfs/v1/data.bin", kvikio::RemoteEndpointType::WEBHDFS}, + }; + } + + void TearDown() override {} + + void test_helper(kvikio::RemoteEndpointType expected_endpoint_type, + std::function url_validity_checker) + { + for (auto const& [url, endpoint_type] : _sample_urls) { + if (endpoint_type == expected_endpoint_type) { + // Given that the URL is the expected endpoint type + + // Test URL validity checker + EXPECT_TRUE(url_validity_checker(url)); + + // Test unified interface + { + // Here we pass the 1-byte argument to RemoteHandle::open. For all endpoints except + // kvikio::RemoteEndpointType::S3, this prevents the endpoint constructor from querying + // the file size and sending requests to the server, thus allowing us to use dummy URLs + // for testing purpose. + // For kvikio::RemoteEndpointType::S3, RemoteHandle::open sends HEAD request as a + // connectivity check and will fail on the syntactically valid dummy URL. The + // kvikio::RemoteEndpointType::S3_PUBLIC will then be used as the endpoint. 
+ auto remote_handle = + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); + EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type); + } + + // Test explicit endpoint type specification + { + EXPECT_NO_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } else { + // Given that the URL is NOT the expected endpoint type + + // Test URL validity checker + EXPECT_FALSE(url_validity_checker(url)); + + // Test explicit endpoint type specification + { + EXPECT_ANY_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } + } + } + + std::vector> _sample_urls; +}; + +TEST_F(RemoteHandleTest, s3_endpoint_constructor) { - kvikio::test::EnvVarContext env_var_ctx{{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, - {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, - {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}, - {"AWS_ENDPOINT_URL", "https://my_aws_endpoint_url"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}, + {"AWS_ENDPOINT_URL", "https://my_aws_endpoint_url"}}; std::string url = "https://my_aws_endpoint_url/bucket_name/object_name"; std::string aws_region = "my_aws_region"; // Use the overload where the full url and the optional aws_region are specified. @@ -38,3 +123,158 @@ TEST(RemoteHandleTest, s3_endpoint_constructor) EXPECT_EQ(s1.str(), s2.str()); } + +TEST_F(RemoteHandleTest, test_http_url) +{ + // Invalid URLs + { + std::vector const invalid_urls{// Incorrect scheme + "s3://example.com", + "hdfs://example.com", + // Missing file path + "http://example.com"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::HttpEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url) +{ + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}}; + + { + test_helper(kvikio::RemoteEndpointType::S3_PUBLIC, kvikio::S3Endpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Lack object-name + "s3://bucket-name", + "https://bucket-name.s3.region-code.amazonaws.com", + // Presigned URL + "https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=" + "cred&" + "X-Amz-SignedHeaders=host"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url)); + } + } + + // S3_PUBLIC is not in the allowlist. So when the connectivity check fails on the dummy URL, + // KvikIO cannot fall back to S3_PUBLIC. 
+  {
+    EXPECT_ANY_THROW({
+      kvikio::RemoteHandle::open(
+        "s3://bucket-name/object-key-name",
+        kvikio::RemoteEndpointType::AUTO,
+        std::vector{kvikio::RemoteEndpointType::S3, kvikio::RemoteEndpointType::HTTP},
+        1);
+    });
+  }
+}
+
+TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url)
+{
+  {
+    test_helper(kvikio::RemoteEndpointType::S3_PRESIGNED_URL,
+                kvikio::S3EndpointWithPresignedUrl::is_url_valid);
+  }
+
+  // Invalid URLs
+  {
+    std::vector<std::string> const invalid_urls{
+      // Presigned URL should not use S3 scheme
+      "s3://bucket-name/object-key-name",
+
+      // Completely missing query
+      "https://bucket-name.s3.region-code.amazonaws.com/object-key-name",
+
+      // Missing key parameters ("X-Amz-...") in query
+      "https://bucket-name.s3.region-code.amazonaws.com/object-key-name?k0=v0&k1=v2"};
+    for (auto const& invalid_url : invalid_urls) {
+      EXPECT_FALSE(kvikio::S3EndpointWithPresignedUrl::is_url_valid(invalid_url));
+    }
+  }
+}
+
+TEST_F(RemoteHandleTest, test_webhdfs_url)
+{
+  {
+    test_helper(kvikio::RemoteEndpointType::WEBHDFS, kvikio::WebHdfsEndpoint::is_url_valid);
+  }
+
+  // Invalid URLs
+  {
+    std::vector<std::string> const invalid_urls{// Missing file
+                                                "https://host:1234/webhdfs/v1",
+                                                "https://host:1234/webhdfs/v1/",
+
+                                                // Missing WebHDFS identifier
+                                                "https://host:1234/data.bin",
+
+                                                // Missing port number
+                                                "https://host/webhdfs/v1/data.bin"};
+    for (auto const& invalid_url : invalid_urls) {
+      EXPECT_FALSE(kvikio::WebHdfsEndpoint::is_url_valid(invalid_url));
+    }
+  }
+}
+
+TEST_F(RemoteHandleTest, test_open)
+{
+  // Missing scheme
+  {
+    std::vector<std::string> const urls{
+      "example.com/path", "example.com:8080/path", "//example.com/path", "://example.com/path"};
+    for (auto const& url : urls) {
+      EXPECT_THROW(
+        { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); },
+        std::runtime_error);
+    }
+  }
+
+  // Unsupported type
+  {
+    std::string const url{"unsupported://example.com/path"};
+    EXPECT_THAT(
+      [&] { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); },
+      ThrowsMessage<std::runtime_error>(HasSubstr("Unsupported endpoint URL")));
+  }
+
+  // Specified URL not in the allowlist
+  {
+    std::string const url{"https://host:1234/webhdfs/v1/data.bin"};
+    std::vector<std::vector<kvikio::RemoteEndpointType>> const wrong_allowlists{
+      {},
+      {kvikio::RemoteEndpointType::S3},
+    };
+    for (auto const& wrong_allowlist : wrong_allowlists) {
+      EXPECT_THAT(
+        [&] {
+          kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::WEBHDFS, wrong_allowlist, 1);
+        },
+        ThrowsMessage<std::runtime_error>(HasSubstr("is not in the allowlist")));
+    }
+  }
+
+  // Invalid URLs
+  {
+    std::vector<std::pair<std::string, kvikio::RemoteEndpointType>> const invalid_urls{
+      {"s3://bucket-name", kvikio::RemoteEndpointType::S3},
+      {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name",
+       kvikio::RemoteEndpointType::S3_PRESIGNED_URL},
+      {"https://host:1234/webhdfs/v1", kvikio::RemoteEndpointType::WEBHDFS},
+      {"http://example.com", kvikio::RemoteEndpointType::HTTP},
+    };
+    for (auto const& [invalid_url, endpoint_type] : invalid_urls) {
+      EXPECT_THAT([&] { kvikio::RemoteHandle::open(invalid_url, endpoint_type, std::nullopt, 1); },
+                  ThrowsMessage<std::runtime_error>(HasSubstr("Invalid URL")));
+    }
+  }
+}
diff --git a/cpp/tests/test_tls.cpp b/cpp/tests/test_tls.cpp
new file mode 100644
index 0000000000..63f6735e4d
--- /dev/null
+++ b/cpp/tests/test_tls.cpp
@@ -0,0 +1,37 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <optional>
+#include <string>
+
+#include <gtest/gtest.h>
+#include <kvikio/detail/tls.hpp>
+
+#include "utils/env.hpp"
+
+TEST(TlsTest, get_ca_paths)
+{
+  std::string const expected_ca_bundle_path{"ca_bundle_path"};
+  std::string const expected_ca_directory{"ca_directory"};
+  {
+    // The env var CURL_CA_BUNDLE has the highest priority. Both SSL_CERT_FILE and SSL_CERT_DIR
+    // shall be skipped.
+    kvikio::test::EnvVarContext env_var_ctx{{"CURL_CA_BUNDLE", expected_ca_bundle_path},
+                                            {"SSL_CERT_FILE", "another_ca_bundle_path"},
+                                            {"SSL_CERT_DIR", expected_ca_directory}};
+    auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths();
+
+    EXPECT_EQ(ca_bundle_file, expected_ca_bundle_path);
+    EXPECT_EQ(ca_directory, std::nullopt);
+  }
+
+  {
+    // The env vars CURL_CA_BUNDLE and SSL_CERT_FILE are not specified, so SSL_CERT_DIR shall be
+    // used.
+    kvikio::test::EnvVarContext env_var_ctx{{"SSL_CERT_DIR", expected_ca_directory}};
+    auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths();
+
+    EXPECT_EQ(ca_bundle_file, std::nullopt);
+    EXPECT_EQ(ca_directory, expected_ca_directory);
+  }
+}
diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp
new file mode 100644
index 0000000000..84d5e6bc01
--- /dev/null
+++ b/cpp/tests/test_url.cpp
@@ -0,0 +1,272 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <kvikio/detail/url.hpp>
+#include <kvikio/shim/libcurl.hpp>
+
+using ::testing::HasSubstr;
+using ::testing::ThrowsMessage;
+
+TEST(UrlTest, parse_scheme)
+{
+  {
+    std::vector<std::string> invalid_scheme_urls{
+      "invalid_scheme://host",
+      // The S3 scheme is not supported by libcurl. Without the CURLU_NON_SUPPORT_SCHEME flag, an
+      // exception is expected.
+      "s3://host"};
+
+    for (auto const& invalid_scheme_url : invalid_scheme_urls) {
+      EXPECT_THAT([&] { kvikio::detail::UrlParser::parse(invalid_scheme_url); },
+                  ThrowsMessage<std::runtime_error>(HasSubstr("KvikIO detects an URL error")));
+    }
+  }
+
+  // With the CURLU_NON_SUPPORT_SCHEME flag, the S3 scheme is now accepted.
+  {
+    std::vector<std::string> schemes{"s3", "S3"};
+    for (auto const& scheme : schemes) {
+      auto parsed_url =
+        kvikio::detail::UrlParser::parse(scheme + "://host", CURLU_NON_SUPPORT_SCHEME);
+      EXPECT_EQ(parsed_url.scheme.value(), "s3");  // Lowercase due to CURL's normalization
+    }
+  }
+}
+
+TEST(UrlTest, parse_host)
+{
+  std::vector<std::string> invalid_host_urls{"http://host with spaces.com",
+                                             "http://host[brackets].com",
+                                             "http://host{braces}.com",
+                                             "http://host<angle-brackets>.com",
+                                             R"(http://host\backslash.com)",
+                                             "http://host^caret.com",
+                                             "http://host`backtick.com"};
+  for (auto const& invalid_host_url : invalid_host_urls) {
+    EXPECT_THROW({ kvikio::detail::UrlParser::parse(invalid_host_url); }, std::runtime_error);
+  }
+}
+
+TEST(UrlTest, build_url)
+{
+  // Build a URL from scratch
+  {
+    std::string scheme{"https"};
+    std::string host{"api.example.com"};
+    std::string port{"8080"};
+    std::string path{"/v1/users"};
+    std::string query{"page=1&limit=10"};
+    std::string fragment{"results"};
+    std::stringstream ss;
+    ss << scheme << "://" << host << ":" << port << path << "?"
+       << query << "#" << fragment;
+
+    {
+      auto url = kvikio::detail::UrlBuilder()
+                   .set_scheme("https")
+                   .set_host("api.example.com")
+                   .set_port("8080")
+                   .set_path("/v1/users")
+                   .set_query("page=1&limit=10")
+                   .set_fragment("results")
+                   .build();
+
+      EXPECT_EQ(url, ss.str());
+    }
+
+    // The components do not have to be specified in their correct order
+    {
+      auto url = kvikio::detail::UrlBuilder()
+                   .set_fragment("results")
+                   .set_scheme("https")
+                   .set_path("/v1/users")
+                   .set_host("api.example.com")
+                   .set_query("page=1&limit=10")
+                   .set_port("8080")
+                   .build();
+
+      EXPECT_EQ(url, ss.str());
+    }
+  }
+
+  // Modify an existing URL
+  {
+    std::string scheme_host{"https://api.example.com"};
+    std::string query{"page=1&limit=10"};
+
+    std::string old_path{"/old/path/file.txt"};
+    std::string new_path{"/new/path/document.pdf"};
+
+    // Modify the path
+    {
+      std::string old_url          = scheme_host + old_path + "?" + query;
+      std::string expected_new_url = scheme_host + new_path + "?" + query;
+
+      auto actual_new_url = kvikio::detail::UrlBuilder(old_url).set_path(new_path).build();
+      EXPECT_EQ(actual_new_url, expected_new_url);
+    }
+
+    // Modify the path, and add the port and the query
+    std::string port{"8080"};
+    std::string old_url          = scheme_host + old_path;
+    std::string expected_new_url = scheme_host + ":" + port + new_path + "?" + query;
+
+    auto actual_new_url = kvikio::detail::UrlBuilder(old_url)
+                            .set_port(port)
+                            .set_path(new_path)
+                            .set_query(query)
+                            .build();
+    EXPECT_EQ(actual_new_url, expected_new_url);
+  }
+
+  // Build from parsed components
+  {
+    std::string scheme{"https"};
+    std::string host{"api.example.com"};
+    std::string path{"/v1/users"};
+    std::string query{"page=1&limit=10"};
+    std::stringstream ss;
+    ss << scheme << "://" << host << path << "?" << query;
+
+    // First parse an existing URL
+    auto components = kvikio::detail::UrlParser::parse(ss.str());
+
+    // Modify components
+    components.path = "/v2/api";
+    components.port = "443";
+
+    // Build new URL from modified components
+    auto actual_new_url = kvikio::detail::UrlBuilder(components).build();
+
+    // Expected URL
+    ss.str("");
+    ss << scheme << "://" << host << ":" << components.port.value() << components.path.value()
+       << "?"
+       << query;
+
+    EXPECT_EQ(actual_new_url, ss.str());
+  }
+
+  // AWS S3-like URL
+  {
+    std::string path = "/my-bucket/&$@=;:+,.txt";
+    auto url = kvikio::detail::UrlBuilder("https://s3.region.amazonaws.com").set_path(path).build();
+    std::string encoded_path = kvikio::detail::UrlEncoder::encode_path(path);
+
+    auto actual_encoded_url = kvikio::detail::UrlBuilder(url).set_path(encoded_path).build();
+    std::string expected_encoded_url{
+      "https://s3.region.amazonaws.com/my-bucket/%26%24%40%3D%3B%3A%2B%2C.txt"};
+
+    std::transform(actual_encoded_url.begin(),
+                   actual_encoded_url.end(),
+                   actual_encoded_url.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+
+    std::transform(expected_encoded_url.begin(),
+                   expected_encoded_url.end(),
+                   expected_encoded_url.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+
+    EXPECT_EQ(actual_encoded_url, expected_encoded_url);
+  }
+}
+
+TEST(UrlTest, encoding_table)
+{
+  // Look up the reserved characters (RFC 3986 section 2.2) in the encoding table
+  {
+    std::string special_chars{"!#$&\'()*+,/:;=?@[]"};
+    std::string expected_result{"%21%23%24%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"};
+    // First parameter: string containing special characters
+    // Second parameter: a sequence of special characters to be encoded
+    std::string actual_result =
+      kvikio::detail::UrlEncoder::encode_path(special_chars, special_chars);
+    EXPECT_EQ(actual_result, expected_result);
+  }
+
+  // Check a few samples from the encoding table. Out-of-bound characters (beyond the ASCII table)
+  // are expected to be encoded to empty strings.
+  {
+    std::array<unsigned char, 6> input{0,  // First ASCII char NUL
+                                       '\x3D',
+                                       127,  // Last ASCII char DEL
+                                       128,  // Out-of-bound chars
+                                       200,
+                                       255};
+    std::array<std::string, 6> expected_results{"%00",
+                                                "%3D",
+                                                "%7F",
+                                                "",
+                                                "",
+                                                ""};
+    for (std::size_t i = 0; i < input.size(); ++i) {
+      std::string s{static_cast<char>(input[i])};
+      std::string actual_result = kvikio::detail::UrlEncoder::encode_path(s, s);
+      EXPECT_EQ(actual_result, expected_results[i]);
+    }
+  }
+
+  // Check control characters
+  {
+    std::map<char, std::string> mapping{
+      {'\x00', "%00"}, {'\x1A', "%1A"}, {'\x1F', "%1F"}, {'\x7F', "%7F"}};
+
+    for (auto const [question, answer] : mapping) {
+      // Construct a string for the character, specifying the size explicitly to take account
+      // of NUL
+      std::string sv{&question, 1};
+      std::string result = kvikio::detail::UrlEncoder::encode_path(sv, sv);
+      EXPECT_EQ(result, answer);
+    }
+  }
+
+  // Check out-of-bound characters
+  {
+    unsigned char out_of_bound_chars[] = {128, 200, 255};
+    std::string_view sv{reinterpret_cast<char const*>(out_of_bound_chars),
+                        sizeof(out_of_bound_chars)};
+    std::string result = kvikio::detail::UrlEncoder::encode_path(sv, sv);
+    EXPECT_EQ(result, "");
+  }
+}
+
+TEST(UrlTest, encode_url)
+{
+  // Path does not contain characters that require special handling, so no character is encoded
+  {
+    std::string original{"abc123/-_..bin"};
+    auto encoded = kvikio::detail::UrlEncoder::encode_path(original);
+    EXPECT_EQ(original, encoded);
+  }
+
+  // chars_to_encode is empty, so no character is encoded
+  {
+    std::string original{"abc123/!-_.*'()/&$@=;:+ ,?.bin"};
+    auto encoded = kvikio::detail::UrlEncoder::encode_path(original, {});
+    EXPECT_EQ(original, encoded);
+  }
+
+  // Test all characters mentioned by the AWS documentation that require special handling
+  {
+    std::string const& input{kvikio::detail::UrlEncoder::aws_special_chars};
+    auto encoded = kvikio::detail::UrlEncoder::encode_path(input);
+
+    // Encoding is performed, so the string is expected to be changed
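+    // Cross-check against libcurl below: curl_easy_escape percent-encodes every byte outside the
+    // unreserved set [A-Za-z0-9._~-], and all AWS special characters fall outside that set, so
+    // the two encoders are expected to agree on this input.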
+    EXPECT_NE(input, encoded);
+
+    auto* curl     = curl_easy_init();
+    auto* expected = curl_easy_escape(curl, input.data(), input.size());
+    EXPECT_NE(expected, nullptr);
+    EXPECT_EQ(encoded, std::string{expected});
+
+    curl_free(expected);
+    curl_easy_cleanup(curl);
+
+    // aws_special_chars does not contain %, so double encoding is expected to not alter anything
+    auto double_encoded = kvikio::detail::UrlEncoder::encode_path(encoded);
+    EXPECT_EQ(encoded, double_encoded);
+  }
+}
diff --git a/cpp/tests/utils/env.cpp b/cpp/tests/utils/env.cpp
index 5e713dca04..262794a48d 100644
--- a/cpp/tests/utils/env.cpp
+++ b/cpp/tests/utils/env.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include "env.hpp"
@@ -23,21 +12,22 @@ namespace kvikio::test {

 EnvVarContext::EnvVarContext(
-  std::initializer_list<std::pair<std::string, std::string>> env_var_entries)
+  std::initializer_list<std::pair<std::string_view, std::string_view>> env_var_entries)
 {
   for (auto const& [key, current_value] : env_var_entries) {
     EnvVarState env_var_state;
-    if (auto const res = std::getenv(key.c_str()); res != nullptr) {
+    if (auto const res = std::getenv(key.data()); res != nullptr) {
       env_var_state.existed_before = true;
       env_var_state.previous_value = res;
     }
-    SYSCALL_CHECK(setenv(key.c_str(), current_value.c_str(), 1 /* allow overwrite */));
-    if (_env_var_map.find(key) != _env_var_map.end()) {
+    SYSCALL_CHECK(setenv(key.data(), current_value.data(), 1 /* allow overwrite */));
+    std::string key_str{key};
+    if (_env_var_map.find(key_str) != _env_var_map.end()) {
       std::stringstream ss;
       ss << "Environment variable " << key << " has already been set in this context.";
       KVIKIO_FAIL(ss.str());
     }
-    _env_var_map.insert({key, std::move(env_var_state)});
+    _env_var_map.insert({std::move(key_str), std::move(env_var_state)});
   }
 }
diff --git a/cpp/tests/utils/env.hpp b/cpp/tests/utils/env.hpp
index 8ec51ad27b..3d924f8829 100644
--- a/cpp/tests/utils/env.hpp
+++ b/cpp/tests/utils/env.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
@@ -45,7 +34,8 @@ class EnvVarContext {
   * @param env_var_entries User-specified environment variables. Each entry includes the variable
   *                        name and value.
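+   *
+   * Example (mirroring the usage in the tests):
+   * @code
+   * kvikio::test::EnvVarContext env_var_ctx{{"MY_VAR", "my_value"}};
+   * // MY_VAR is set to "my_value" here; its previous value (or absence) is
+   * // restored when env_var_ctx goes out of scope.
+   * @endcode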
   */
-  EnvVarContext(std::initializer_list<std::pair<std::string, std::string>> env_var_entries);
+  EnvVarContext(
+    std::initializer_list<std::pair<std::string_view, std::string_view>> env_var_entries);

  /**
   * @brief Restore the environment variables to previous values
diff --git a/cpp/tests/utils/hdfs_helper.cpp b/cpp/tests/utils/hdfs_helper.cpp
new file mode 100644
index 0000000000..e884b93eca
--- /dev/null
+++ b/cpp/tests/utils/hdfs_helper.cpp
@@ -0,0 +1,186 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "hdfs_helper.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <regex>
+#include <sstream>
+
+#include <kvikio/error.hpp>
+#include <kvikio/shim/libcurl.hpp>
+
+namespace kvikio::test {
+
+namespace {
+
+/**
+ * @brief Helper struct that wraps a buffer view and tracks how much data has been processed via an
+ * offset value.
+ */
+struct tracked_buffer_t {
+  std::span<char const> buffer;
+  std::size_t offset;
+};
+
+/**
+ * @brief Callback for `CURLOPT_READFUNCTION` to upload data.
+ *
+ * @param data Transfer buffer provided by libcurl
+ * @param size Curl internal implementation always sets this parameter to 1
+ * @param num_bytes_max The maximum number of bytes that can be uploaded
+ * @param userdata Must be cast from `tracked_buffer_t*`
+ * @return The number of bytes that have been copied to the transfer buffer.
+ */
+std::size_t callback_upload(char* data, std::size_t size, std::size_t num_bytes_max, void* userdata)
+{
+  auto new_data_size_max = size * num_bytes_max;
+  auto* tracked_buffer   = reinterpret_cast<tracked_buffer_t*>(userdata);
+
+  // All data has been uploaded. Nothing more to do.
+  if (tracked_buffer->offset >= tracked_buffer->buffer.size()) { return 0; }
+
+  auto copy_size =
+    std::min(new_data_size_max, tracked_buffer->buffer.size() - tracked_buffer->offset);
+  std::memcpy(data, tracked_buffer->buffer.data() + tracked_buffer->offset, copy_size);
+  tracked_buffer->offset += copy_size;
+
+  return copy_size;
+}
+}  // namespace
+
+WebHdfsTestHelper::WebHdfsTestHelper(std::string const& host,
+                                     std::string const& port,
+                                     std::string const& username)
+  : _host{host}, _port{port}, _username{username}
+{
+  std::stringstream ss;
+  ss << "http://" << host << ":" << port << "/webhdfs/v1";
+  _url_before_path = ss.str();
+}
+
+bool WebHdfsTestHelper::can_connect() noexcept
+{
+  try {
+    auto curl = create_curl_handle();
+
+    std::stringstream ss;
+    ss << _url_before_path << "/?user.name=" << _username << "&op=GETHOMEDIRECTORY";
+
+    curl.setopt(CURLOPT_URL, ss.str().c_str());
+
+    std::string response{};
+    curl.setopt(CURLOPT_WRITEDATA, &response);
+    curl.setopt(CURLOPT_WRITEFUNCTION, kvikio::detail::callback_get_string_response);
+    curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
+    curl.perform();
+    return true;
+  } catch (std::exception const& e) {
+    std::cout << e.what() << "\n";
+    return false;
+  }
+}
+
+bool WebHdfsTestHelper::upload_data(std::span<char const> buffer,
+                                    std::string const& remote_file_path) noexcept
+{
+  try {
+    // Official reference on how to create and write to a file:
+    // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Create_and_Write_to_a_File
+    std::string redirect_url;
+
+    {
+      // Step 1: Submit an HTTP PUT request without automatically following redirects and without
+      // sending the file data.
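+      // (WebHDFS creates a file in two phases: the name node answers this first PUT with a
+      // 307 redirect whose Location header names a data node, and Step 2 below re-issues the
+      // PUT with the file content against that data node.)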
+      auto curl = create_curl_handle();
+
+      std::stringstream ss;
+      ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=CREATE";
+      std::string redirect_data_node_location{};
+
+      curl.setopt(CURLOPT_URL, ss.str().c_str());
+      curl.setopt(CURLOPT_FOLLOWLOCATION, 0L);
+      curl.setopt(CURLOPT_CUSTOMREQUEST, "PUT");
+
+      std::string response{};
+      curl.setopt(CURLOPT_HEADERDATA, &response);
+      curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response);
+
+      curl.perform();
+
+      long http_status_code{};
+      curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code);
+      KVIKIO_EXPECT(http_status_code == 307, "Redirection from name node to data node failed.");
+
+      std::regex const pattern{R"(Location:\s*(.*)\s*)"};
+      std::smatch match_results;
+      bool found = std::regex_search(response, match_results, pattern);
+      KVIKIO_EXPECT(found,
+                    "Regular expression search failed. Cannot extract redirect location from the "
+                    "response headers.");
+      redirect_url = match_results[1].str();
+    }
+
+    {
+      // Step 2: Submit another HTTP PUT request using the URL in the Location header with the file
+      // data to be written.
+      auto curl = create_curl_handle();
+      curl.setopt(CURLOPT_URL, redirect_url.c_str());
+      curl.setopt(CURLOPT_UPLOAD, 1L);
+
+      tracked_buffer_t tracked_buffer{.buffer = buffer, .offset = 0};
+      curl.setopt(CURLOPT_READDATA, &tracked_buffer);
+      curl.setopt(CURLOPT_READFUNCTION, callback_upload);
+      curl.setopt(CURLOPT_INFILESIZE_LARGE, static_cast<curl_off_t>(buffer.size()));
+
+      curl.perform();
+
+      long http_status_code{};
+      curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code);
+      KVIKIO_EXPECT(http_status_code == 201, "File creation failed.");
+    }
+
+    return true;
+  } catch (std::exception const& e) {
+    std::cout << e.what() << "\n";
+    return false;
+  }
+}
+
+bool WebHdfsTestHelper::delete_data(std::string const& remote_file_path) noexcept
+{
+  try {
+    // Official reference on how to delete a file:
+    // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Delete_a_File.2FDirectory
+    auto curl = create_curl_handle();
+
+    std::stringstream ss;
+    ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=DELETE";
+    std::string const url = ss.str();
+    std::string redirect_data_node_location{};
+
+    curl.setopt(CURLOPT_URL, url.c_str());
+    curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
+    curl.setopt(CURLOPT_CUSTOMREQUEST, "DELETE");
+
+    std::string response{};
+    curl.setopt(CURLOPT_HEADERDATA, &response);
+    curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response);
+
+    curl.perform();
+
+    long http_status_code{};
+    curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code);
+    KVIKIO_EXPECT(http_status_code == 200, "File deletion failed.");
+
+    return true;
+  } catch (std::exception const& e) {
+    std::cout << e.what() << "\n";
+    return false;
+  }
+}
+}  // namespace kvikio::test
diff --git a/cpp/tests/utils/hdfs_helper.hpp b/cpp/tests/utils/hdfs_helper.hpp
new file mode 100644
index 0000000000..47bd4022ca
--- /dev/null
+++ b/cpp/tests/utils/hdfs_helper.hpp
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <span>
+#include <string>
+
+namespace kvikio::test {
+
+/**
+ * @brief Helper class to create and upload a file on WebHDFS so as to enable read testing.
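+ *
+ * Example (illustrative only; host, port, user name and path are placeholders):
+ * @code
+ * kvikio::test::WebHdfsTestHelper helper{"localhost", "9870", "hadoop"};
+ * std::vector<char> data(1024, 'x');
+ * if (helper.can_connect() && helper.upload_data(data, "/tmp/kvikio-test.bin")) {
+ *   // ... exercise KvikIO reads against the uploaded file ...
+ *   helper.delete_data("/tmp/kvikio-test.bin");
+ * }
+ * @endcode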
+ */
+class WebHdfsTestHelper {
+ private:
+  std::string _host;
+  std::string _port;
+  std::string _username;
+  std::string _url_before_path;
+
+ public:
+  WebHdfsTestHelper(std::string const& host, std::string const& port, std::string const& username);
+
+  /**
+   * @brief Whether KvikIO can connect to the WebHDFS server.
+   *
+   * @return A boolean answer.
+   */
+  bool can_connect() noexcept;
+
+  /**
+   * @brief Copy the data from a host buffer to a remote file on the WebHDFS server.
+   *
+   * @param buffer View of the host buffer whose data will be copied to the WebHDFS server
+   * @param remote_file_path Remote file path
+   * @return True if the file has been successfully uploaded; false otherwise.
+   */
+  bool upload_data(std::span<char const> buffer, std::string const& remote_file_path) noexcept;
+
+  /**
+   * @brief Delete a remote file on the WebHDFS server.
+   *
+   * @param remote_file_path Remote file path
+   * @return True if the file has been successfully deleted; false otherwise.
+   */
+  bool delete_data(std::string const& remote_file_path) noexcept;
+};
+
+}  // namespace kvikio::test
diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp
index 5722d3db25..92e6296e13 100644
--- a/cpp/tests/utils/utils.hpp
+++ b/cpp/tests/utils/utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
@@ -108,6 +97,7 @@ class TempDir {

 /**
  * @brief Help class for creating and comparing buffers.
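+ *
+ * Example:
+ * @code
+ * auto buf  = DevBuffer<std::int64_t>::arange(100);  // device buffer holding 0, 1, ..., 99
+ * auto host = buf.to_vector();                       // copy the contents back to the host
+ * @endcode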
  */
+template <typename T>
 class DevBuffer {
  public:
   std::size_t nelem;
@@ -116,11 +106,12 @@ class DevBuffer {

   DevBuffer() : nelem{0}, nbytes{0} {};

-  DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)}
+  DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(T)}
   {
     KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes));
+    KVIKIO_CHECK_CUDA(cudaMemset(ptr, 0, nbytes));
   }

-  DevBuffer(std::vector<std::int64_t> const& host_buffer) : DevBuffer{host_buffer.size()}
+  DevBuffer(std::vector<T> const& host_buffer) : DevBuffer{host_buffer.size()}
   {
     KVIKIO_CHECK_CUDA(cudaMemcpy(ptr, host_buffer.data(), nbytes, cudaMemcpyHostToDevice));
   }
@@ -142,9 +133,9 @@ class DevBuffer {

   ~DevBuffer() noexcept { cudaFree(ptr); }

-  [[nodiscard]] static DevBuffer arange(std::size_t nelem, std::int64_t start = 0)
+  [[nodiscard]] static DevBuffer arange(std::size_t nelem, T start = 0)
   {
-    std::vector<std::int64_t> host_buffer(nelem);
+    std::vector<T> host_buffer(nelem);
     std::iota(host_buffer.begin(), host_buffer.end(), start);
     return DevBuffer{host_buffer};
   }
@@ -156,9 +147,9 @@ class DevBuffer {
     return ret;
   }

-  [[nodiscard]] std::vector<std::int64_t> to_vector() const
+  [[nodiscard]] std::vector<T> to_vector() const
   {
-    std::vector<std::int64_t> ret(nelem);
+    std::vector<T> ret(nelem);
     KVIKIO_CHECK_CUDA(cudaMemcpy(ret.data(), this->ptr, nbytes, cudaMemcpyDeviceToHost));
     return ret;
   }
@@ -176,7 +167,8 @@ class DevBuffer {
 /**
  * @brief Check that two buffers are equal
  */
-inline void expect_equal(DevBuffer const& a, DevBuffer const& b)
+template <typename T>
+inline void expect_equal(DevBuffer<T> const& a, DevBuffer<T> const& b)
 {
   EXPECT_EQ(a.nbytes, b.nbytes);
   auto a_vec = a.to_vector();
@@ -186,4 +178,66 @@ inline void expect_equal(DevBuffer const& a, DevBuffer const& b)
   }
 }

+/**
+ * @brief Custom allocator with alignment and element offset support, suitable for use with
+ * standard containers like std::vector.
+ *
+ * @tparam T The type of elements to allocate
+ * @tparam ali Alignment requirement in bytes (must be a power of 2)
+ * @tparam element_offset Number of elements to offset the returned pointer (default: 0)
+ *
+ * Example usage:
+ * @code
+ * // Allocator with 4096-byte alignment, no offset
+ * std::vector<std::int64_t, CustomHostAllocator<std::int64_t, 4096>> vec;
+ *
+ * // Allocator with 64-byte alignment and 10-element offset (i.e. 80-byte offset)
+ * std::vector<std::int64_t, CustomHostAllocator<std::int64_t, 64, 10>> offset_vec;
+ * @endcode
+ */
+template <typename T, std::size_t ali, std::size_t element_offset = 0>
+struct CustomHostAllocator {
+  using value_type      = T;
+  CustomHostAllocator() = default;
+
+  template <typename U>
+  constexpr CustomHostAllocator(const CustomHostAllocator<U, ali, element_offset>&) noexcept
+  {
+  }
+
+  template <typename U>
+  struct rebind {
+    using other = CustomHostAllocator<U, ali, element_offset>;
+  };
+
+  [[nodiscard]] T* allocate(std::size_t num_elements)
+  {
+    if (num_elements > std::numeric_limits<std::size_t>::max() / sizeof(T)) {
+      throw std::bad_array_new_length();
+    }
+
+    auto total_bytes = (num_elements + element_offset) * sizeof(T);
+    total_bytes      = (total_bytes + ali - 1) & ~(ali - 1);
+
+    if (auto* ptr = static_cast<T*>(std::aligned_alloc(ali, total_bytes))) {
+      auto dst_ptr = reinterpret_cast<char*>(ptr) + element_offset * sizeof(T);
+      return reinterpret_cast<T*>(dst_ptr);
+    }
+
+    throw std::bad_alloc();
+  }
+
+  void deallocate(T* ptr, [[maybe_unused]] std::size_t n) noexcept
+  {
+    auto src_ptr = reinterpret_cast<char*>(ptr) - element_offset * sizeof(T);
+    std::free(src_ptr);
+  }
+
+  template <typename U, std::size_t ali_u, std::size_t element_offset_u>
+  bool operator==(const CustomHostAllocator<U, ali_u, element_offset_u>&) const noexcept
+  {
+    return ali == ali_u && element_offset == element_offset_u;
+  }
+};
+
 }  // namespace kvikio::test
diff --git a/dependencies.yaml b/dependencies.yaml
index f5cb1e2556..3805b35326 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -1,9 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
 # Dependency list for https://github.com/rapidsai/dependency-file-generator
 files:
   all:
     output: conda
     matrix:
-      cuda: ["11.8", "12.8"]
+      cuda: ["12.9", "13.0"]
       arch: [aarch64, x86_64]
     includes:
      - build-universal
@@ -13,7 +16,6 @@ files:
      - cuda
      - cuda_version
      - depends_on_cupy
-      - depends_on_nvcomp
      - docs
      - py_version
      - rapids_build_skbuild
@@ -66,10 +68,6 @@ files:
        table: project
    includes:
      - depends_on_cupy
-      # TODO: restore runtime dependency when we no longer vendor nvcomp
-      # (when nvcomp supports Python 3.13)
-      # https://github.com/rapidsai/build-planning/issues/171
-      # - depends_on_nvcomp
      - depends_on_libkvikio
      - run
  py_rapids_build_libkvikio:
@@ -91,6 +89,14 @@ files:
      - build-cpp
      - build-py-wrapper
      - build-use-libkvikio-wheel
+  py_optional_zarr:
+    output: pyproject
+    pyproject_dir: python/kvikio
+    extras:
+      table: project.optional-dependencies
+      key: zarr
+    includes:
+      - zarr
  py_optional_test:
    output: pyproject
    pyproject_dir: python/kvikio
@@ -99,6 +105,7 @@ files:
      key: test
    includes:
      - test_python
+      - zarr
  test_java:
    output: none
    includes:
@@ -112,7 +119,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - conda-forge
- - nvidia
dependencies:
  build-universal:
    common:
@@ -125,6 +131,7 @@ dependencies:
      - output_types: conda
        packages:
          - c-compiler
+          - cuda-nvcc
          - cxx-compiler
          - libcurl>=8.5.0,<9.0a0
    specific:
@@ -132,49 +139,19 @@ dependencies:
        matrices:
          - matrix:
              arch: x86_64
-              cuda: "11.8"
            packages:
-              - gcc_linux-64=11.*
+              - gcc_linux-64=14.*
              - sysroot_linux-64=2.28
          - matrix:
              arch: aarch64
-              cuda: "11.8"
            packages:
-              - gcc_linux-aarch64=11.*
+              - gcc_linux-aarch64=14.*
              - sysroot_linux-aarch64=2.28
-          - matrix:
-              arch: x86_64
-              cuda: "12.*"
-            packages:
-              - gcc_linux-64=13.*
-              - sysroot_linux-64=2.28
-          - matrix:
-              arch: aarch64
-              cuda: "12.*"
-            packages:
-              - gcc_linux-aarch64=13.*
-              - sysroot_linux-aarch64=2.28
-      - output_types: conda
-        matrices:
-          - matrix:
-              arch: x86_64
-              cuda: "11.8"
-            packages:
-              - nvcc_linux-64=11.8
-          - matrix:
-              arch: aarch64
-              cuda: "11.8"
-            packages:
-              - nvcc_linux-aarch64=11.8
-          - matrix:
-              cuda: "12.*"
-            packages:
-              - cuda-nvcc
  build-use-libkvikio-wheel:
    common:
      - output_types: conda
        packages: &libkvikio_packages
-          - libkvikio==25.6.*,>=0.0.0a0
+          - libkvikio==26.2.*,>=0.0.0a0
    specific:
      - output_types: [requirements, pyproject]
        matrices:
@@ -182,18 +159,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - libkvikio-cu12==25.6.*,>=0.0.0a0
+              - libkvikio-cu12==26.2.*,>=0.0.0a0
          - matrix:
-              cuda: "11.*"
+              cuda: "13.*"
              cuda_suffixed: "true"
            packages:
-              - libkvikio-cu11==25.6.*,>=0.0.0a0
+              - libkvikio-cu13==26.2.*,>=0.0.0a0
          - {matrix: null, packages: *libkvikio_packages}
  build-py-wrapper:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - cython>=3.0.0
+          - cython>=3.0.0,<3.2.0a0
  checks:
    common:
      - output_types: [conda, requirements]
        packages:
    specific:
      - output_types: conda
        matrices:
-          - matrix:
-              cuda: "11.2"
-            packages:
-              - cuda-version=11.2
-          - matrix:
-              cuda: "11.4"
-            packages:
-              - cuda-version=11.4
-          - matrix:
-              cuda: "11.5"
-            packages:
-              - cuda-version=11.5
-          - matrix:
-              cuda: "11.8"
-            packages:
-              - cuda-version=11.8
          - matrix:
              cuda: "12.0"
            packages:
              - cuda-version=12.0
@@ -239,105 +200,44 @@ dependencies:
              cuda: "12.9"
            packages:
              - cuda-version=12.9
-  cuda:
-    specific:
-      - output_types: conda
-        matrices:
          - matrix:
-              cuda: "11.*"
-            packages:
-              - cudatoolkit
-          - matrix:
-              cuda: "12.*"
+              cuda: "13.0"
            packages:
-      - output_types: conda
-        matrices:
-          - matrix:
-              cuda: "12.*"
-            packages:
-              - libcufile-dev
-          - matrix:
-              arch: aarch64
-              cuda: "11.*"
-            packages:
-          - matrix:
-              cuda: "11.8"
-              arch: x86_64
-            packages:
-              - libcufile=1.4.0.31
-              - libcufile-dev=1.4.0.31
-          - matrix:
-              cuda: "11.5"
-              arch: x86_64
-            packages:
-              - libcufile>=1.1.0.37,<=1.1.1.25
-              - libcufile-dev>=1.1.0.37,<=1.1.1.25
-          - matrix:
-              cuda: "11.4"
-              arch: x86_64
-            packages:
-              - &libcufile_114 libcufile>=1.0.0.82,<=1.0.2.10
-              - &libcufile_dev114 libcufile-dev>=1.0.0.82,<=1.0.2.10
-          - matrix:
-              cuda: "11.2"
-              arch: x86_64
-            packages:
-              # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs,
-              # so 11.2 uses 11.4 packages (the oldest available).
-              - *libcufile_114
-              - *libcufile_dev114
-  depends_on_cupy:
+              - cuda-version=13.0
+  cuda:
    common:
      - output_types: conda
        packages:
-          - cupy>=12.0.0
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix: {cuda: "12.*"}
-            packages:
-              - cupy-cuda12x>=12.0.0
-          - matrix: {cuda: "11.*"}
-            packages: &cupy_packages_cu11
-              - cupy-cuda11x>=12.0.0
-          - {matrix: null, packages: *cupy_packages_cu11}
-  depends_on_nvcomp:
+          - libcufile-dev
+          - libnuma
+  depends_on_cupy:
    common:
      - output_types: conda
        packages:
-          - nvcomp==4.2.0.11
+          - &cupy_unsuffixed cupy>=13.6.0
    specific:
      - output_types: [requirements, pyproject]
        matrices:
          - matrix:
              cuda: "12.*"
-              use_cuda_wheels: "true"
            packages:
-              - nvidia-nvcomp-cu12==4.2.0.11
+              - cupy-cuda12x>=13.6.0
          - matrix:
-              cuda: "11.*"
-              use_cuda_wheels: "true"
+              cuda: "13.*"
            packages:
-              - nvidia-nvcomp-cu11==4.2.0.11
+              - &cupy_cu13 cupy-cuda13x>=13.6.0
-          # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
-          # (e.g. for DLFW and pip devcontainers)
          - matrix:
-              use_cuda_wheels: "false"
            packages:
-          # if no matching matrix selectors passed, list the unsuffixed packages
-          # (just as a source of documentation, as this populates pyproject.toml in source control)
-          - matrix:
-            packages:
-              - nvidia-nvcomp==4.2.0.11
+              - *cupy_cu13
  depends_on_libkvikio:
    common:
      - output_types: conda
        packages:
-          - &libkvikio_unsuffixed libkvikio==25.6.*,>=0.0.0a0
+          - &libkvikio_unsuffixed libkvikio==26.2.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for libkvikio-cu{11,12}.
+          # This index is needed for libkvikio-cu{12,13}.
          - --extra-index-url=https://pypi.nvidia.com
          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
    specific:
@@ -347,18 +247,19 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - libkvikio-cu12==25.6.*,>=0.0.0a0
+              - libkvikio-cu12==26.2.*,>=0.0.0a0
          - matrix:
-              cuda: "11.*"
+              cuda: "13.*"
              cuda_suffixed: "true"
            packages:
-              - libkvikio-cu11==25.6.*,>=0.0.0a0
+              - libkvikio-cu13==26.2.*,>=0.0.0a0
          - {matrix: null, packages: [*libkvikio_unsuffixed]}
  docs:
    common:
      - output_types: [conda, requirements]
        packages:
          - numpydoc
+          - zarr>=3.0.0,<3.2.0a0,<4.0.0
          - sphinx
          - sphinx-click
          - sphinx_rtd_theme
@@ -392,7 +293,7 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - rapids-build-backend>=0.3.0,<0.4.0.dev0
+          - rapids-build-backend>=0.4.0,<0.5.0.dev0
      - output_types: conda
        packages:
          - scikit-build-core>=0.10.0
@@ -404,22 +305,24 @@ dependencies:
      - output_types: [conda, requirements, pyproject]
        packages:
          - numpy>=1.23,<3.0a0
-          - zarr>=2.0.0,<4.0.0
-          # See https://github.com/zarr-developers/numcodecs/pull/475
-          - numcodecs !=0.12.0
          - packaging
+  zarr:
+    common:
+      - output_types: [requirements, pyproject]
+        packages:
+          - "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'"
  test_libkvikio:
    common:
      - output_types: conda
        packages:
-          - libkvikio==25.6.*,>=0.0.0a0
-          - libkvikio-tests==25.6.*,>=0.0.0a0
+          - libkvikio==26.2.*,>=0.0.0a0
+          - libkvikio-tests==26.2.*,>=0.0.0a0
  test_kvikio:
    common:
      - output_types: conda
        packages:
-          - libkvikio==25.6.*,>=0.0.0a0
-          - kvikio==25.6.*,>=0.0.0a0
+          - libkvikio==26.2.*,>=0.0.0a0
+          - kvikio==26.2.*,>=0.0.0a0
  test_cpp:
    common:
      - output_types: conda
@@ -429,8 +332,8 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - rapids-dask-dependency==25.6.*,>=0.0.0a0
-          - pytest
+          - rapids-dask-dependency==26.2.*,>=0.0.0a0
+          - pytest<9.0.0a0
          - pytest-asyncio
          - pytest-cov
          - pytest-timeout
@@ -443,15 +346,28 @@ dependencies:
        packages:
          - moto>=4.0.8
    specific:
+      - output_types: [conda]
+        matrices:
+          # zarr 3 is not supported on Python 3.10
+          - matrix:
+              py: "3.1[123]"
+            packages:
+              - zarr>=3.0.0,<3.2.0a0,<4.0.0
+          - matrix:
+            packages:
      - output_types: [conda, requirements, pyproject]
        matrices:
          - matrix:
              cuda: "12.*"
            packages:
-              - cuda-python>=12.6.2,<13.0a0
-          - matrix:  # All CUDA 11 versions
+              - cuda-python>=12.9.2,<13.0a0
+          - matrix:
+              cuda: "13.*"
+            packages:
+              - &cuda_python_cu13 cuda-python>=13.0.1,<14.0a0
+          - matrix:
            packages:
-              - cuda-python>=11.8.5,<12.0a0
+              - *cuda_python_cu13
  test_java:
    common:
      - output_types: conda
diff --git a/docs/Makefile b/docs/Makefile
index 7c6066a619..79f529b67d 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
 # Minimal makefile for Sphinx documentation
 #
diff --git a/docs/make.bat b/docs/make.bat
index 6fcf05b4b7..0217175e0b 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -1,3 +1,6 @@
+REM SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+REM SPDX-License-Identifier: Apache-2.0
+
 @ECHO OFF

 pushd %~dp0
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 5cba4fd8d3..e11f4cf55b 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -11,6 +11,10 @@ CuFile
 .. autoclass:: IOFuture
    :members:

+.. autofunction:: get_page_cache_info
+
+.. autofunction:: clear_page_cache
+
 CuFile driver
 -------------
 .. currentmodule:: kvikio.cufile_driver
@@ -29,6 +33,13 @@

 .. autofunction:: initialize

+Mmap
+----
+.. currentmodule:: kvikio.mmap
+
+.. autoclass:: Mmap
+   :members:
+
 Zarr
 ----
 .. currentmodule:: kvikio.zarr
@@ -40,6 +51,8 @@ RemoteFile
 ----------
 .. currentmodule:: kvikio.remote_file

+.. autoclass:: RemoteEndpointType
+
 .. autoclass:: RemoteFile
    :members:

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0ac87ceae7..0b3353e945 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Configuration file for the Sphinx documentation builder.
 #
@@ -31,7 +31,9 @@
 # The short X.Y version.
 version = f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}"
 # The full version, including alpha/beta/rc tags
-release = f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}.{KVIKIO_VERSION.micro:02}"
+release = (
+    f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}.{KVIKIO_VERSION.micro:02}"
+)

 # -- General configuration ---------------------------------------------------
@@ -82,11 +84,11 @@
 pygments_style = None

 autodoc_default_options = {
-    'members': True,
-    'member-order': 'bysource',
-    'special-members': '__init__',
-    'undoc-members': True,
-    'exclude-members': '__weakref__'
+    "members": True,
+    "member-order": "bysource",
+    "special-members": "__init__",
+    "undoc-members": True,
+    "exclude-members": "__weakref__",
 }

 # -- Options for HTML output -------------------------------------------------
diff --git a/docs/source/install.rst b/docs/source/install.rst
index 4b62dd02ca..844477e421 100644
--- a/docs/source/install.rst
+++ b/docs/source/install.rst
@@ -15,10 +15,12 @@ Install the **stable release** from the ``rapidsai`` channel like:

    # Install in existing environment
    mamba install -c rapidsai -c conda-forge kvikio
+
+   # Create new environment (CUDA 13)
+   mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=13.0 kvikio
+
    # Create new environment (CUDA 12)
-   mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.8 kvikio
-   # Create new environment (CUDA 11)
-   mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=11.8 kvikio
+   mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio

 Install the **nightly release** from the ``rapidsai-nightly`` channel like:

@@ -26,11 +28,12 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like:

    # Install in existing environment
    mamba install -c rapidsai-nightly -c conda-forge kvikio
-   # Create new environment (CUDA 12)
-   mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.8 kvikio
-   # Create new environment (CUDA 11)
-   mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 kvikio
+   # Create new environment (CUDA 13)
+   mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 kvikio
+
+   # Create new environment (CUDA 12)
+   mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio

 .. note::

@@ -44,9 +47,11 @@ KvikIO is also available on PyPI. Install the latest release like:

 .. code-block::

-   pip install kvikio-cu11  # for CUDA 11
-   pip install kvikio-cu12  # for CUDA 12
+   pip install kvikio-cu13  # for CUDA 13
+
+.. code-block::
+
+   pip install kvikio-cu12  # for CUDA 12

 Build from source
 -----------------
@@ -55,10 +60,8 @@ In order to setup a development environment, we recommend Conda:

 .. code-block::

-   # CUDA 12
-   mamba env create --name kvikio-dev --file conda/environments/all_cuda-128_arch-x86_64.yaml
-   # CUDA 11
-   mamba env create --name kvikio-dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
+   # CUDA 13
+   mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-$(arch).yaml

 The Python library depends on the C++ library, thus we build and install both:
diff --git a/docs/source/remote_file.rst b/docs/source/remote_file.rst
index ed6fe45b7b..e6d038035e 100644
--- a/docs/source/remote_file.rst
+++ b/docs/source/remote_file.rst
@@ -1,11 +1,31 @@
 Remote File
 ===========

-KvikIO provides direct access to remote files.
-
+KvikIO provides direct access to remote files, including AWS S3, WebHDFS, and generic HTTP/HTTPS.

 Example
 -------

 .. literalinclude:: ../../python/kvikio/examples/http_io.py
    :language: python
+
+AWS S3 object naming requirements
+---------------------------------
+
+KvikIO imposes the following naming requirements derived from the `AWS object naming guidelines <https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html>`_ .
+
+  - ``!``, ``*``, ``'``, ``(``, ``)``, ``&``, ``$``, ``@``, ``=``, ``;``, ``:``, ``+``, ``,``: These special characters are automatically encoded by KvikIO, and are safe for use in key names.
+
+  - ``-``, ``_``, ``.``: These special characters are **not** automatically encoded by KvikIO, but are still safe for use in key names.
+
+  - ``/`` is used as the path separator and must not appear in the object name itself.
+
+  - The space character must be explicitly encoded (``%20``) because it will otherwise render the URL malformed.
+
+  - ``?`` must be explicitly encoded (``%3F``) because it will otherwise cause ambiguity with the query string.
+
+  - Control characters ``0x00`` ~ ``0x1F`` hexadecimal (0~31 decimal) and ``0x7F`` (127) are automatically encoded by KvikIO, and are safe for use in key names.
+
+  - Other printable special characters must be avoided, such as ``\``, ``{``, ``^``, ``}``, ``%``, `````, ``]``, ``"``, ``>``, ``[``, ``~``, ``<``, ``#``, ``|``.
+
+  - Non-ASCII characters ``0x80`` ~ ``0xFF`` (128~255) must be avoided.
diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst
index bb347ba23c..c96b2804a8 100644
--- a/docs/source/runtime_settings.rst
+++ b/docs/source/runtime_settings.rst
@@ -17,39 +17,107 @@ Under ``AUTO``, KvikIO falls back to the compatibility mode:
  * when running in Windows Subsystem for Linux (WSL).
  * when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``.
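+
+For example, a quick runtime check of the effective mode (a minimal sketch; the property key ``compat_mode`` is assumed here):
+
+.. code-block:: python
+
+    import kvikio.defaults
+
+    # Query the compatibility mode currently in effect
+    print(kvikio.defaults.get("compat_mode"))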
-This setting can also be programmatically accessed using :py:func:`kvikio.defaults.compat_mode` (getter) and :py:func:`kvikio.defaults.set` (setter).
+This setting can also be programmatically accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

 Thread Pool ``KVIKIO_NTHREADS``
 -------------------------------
 KvikIO can use multiple threads for IO automatically. Set the environment variable ``KVIKIO_NTHREADS`` to the number of threads in the thread pool. If not set, the default value is 1.

-This setting can also be accessed using :py:func:`kvikio.defaults.num_threads` (getter) and :py:func:`kvikio.defaults.set` (setter).
+This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

 Task Size ``KVIKIO_TASK_SIZE``
 ------------------------------
 KvikIO splits parallel IO operations into multiple tasks. Set the environment variable ``KVIKIO_TASK_SIZE`` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB).

-This setting can also be accessed using :py:func:`kvikio.defaults.task_size` (getter) and :py:func:`kvikio.defaults.set` (setter).
+This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

 GDS Threshold ``KVIKIO_GDS_THRESHOLD``
 --------------------------------------
-In order to improve performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).
+In order to improve the performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvents the threadpool and uses the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 16384 (16 KiB).

-This setting can also be accessed using :py:func:`kvikio.defaults.gds_threshold` (getter) and :py:func:`kvikio.defaults.set` (setter).
+This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

 Size of the Bounce Buffer ``KVIKIO_BOUNCE_BUFFER_SIZE``
 -------------------------------------------------------
 KvikIO might have to use intermediate host buffers (one per thread) when copying between files and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to the size (in bytes) of these "bounce" buffers. If not set, the default value is 16777216 (16 MiB).

-This setting can also be accessed using :py:func:`kvikio.defaults.bounce_buffer_size` (getter) and :py:func:`kvikio.defaults.set` (setter).
+This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

 HTTP Retries ``KVIKIO_HTTP_STATUS_CODES``, ``KVIKIO_HTTP_MAX_ATTEMPTS``
 ------------------------------------------------------------------------
 The behavior when a remote I/O read returns an error can be controlled through the `KVIKIO_HTTP_STATUS_CODES`, `KVIKIO_HTTP_MAX_ATTEMPTS`, and `KVIKIO_HTTP_TIMEOUT` environment variables.

-KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_STATUS_CODES`` is received. The default values are ``429, 500, 502, 503, 504``. This setting can also be accessed using :py:func:`kvikio.defaults.http_status_codes` (getter) and :py:func:`kvikio.defaults.set` (setter).
+KvikIO will retry a request if any of the HTTP status codes in ``KVIKIO_HTTP_STATUS_CODES`` is received. The default values are ``429, 500, 502, 503, 504``. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

-The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.http_max_attempts` (getter) and :py:func:`kvikio.defaults.set` (setter).
+The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).

-The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.http_timoeout` (getter) and :py:func:`kvikio.defaults.set` (setter).
+The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the allowed duration in seconds. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter).
+
+HTTP Verbose ``KVIKIO_REMOTE_VERBOSE``
+--------------------------------------
+
+For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication, including headers, request/response bodies, connection details, and SSL handshake information.
+
+Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default.
+
+.. warning::
+
+   This may show sensitive contents from headers and data.
+
+CA bundle file and CA directory ``CURL_CA_BUNDLE``, ``SSL_CERT_FILE``, ``SSL_CERT_DIR``
+---------------------------------------------------------------------------------------
+
+The Certificate Authority (CA) paths required for TLS/SSL verification in ``libcurl`` can be explicitly specified using the following environment variables, in order of overriding priority:
+
+  * ``CURL_CA_BUNDLE`` (also used by the ``curl`` program) or ``SSL_CERT_FILE`` (also used by OpenSSL): Specifies the CA certificate bundle file location.
+  * ``SSL_CERT_DIR`` (also used by OpenSSL): Specifies the CA certificate directory.
+
+When none of these variables is specified, KvikIO searches several standard system locations for the CA file and directory, and if the search fails it falls back to the libcurl compile-time defaults.
+
+Opportunistic POSIX Direct I/O operations ``KVIKIO_AUTO_DIRECT_IO_READ``, ``KVIKIO_AUTO_DIRECT_IO_WRITE``
+---------------------------------------------------------------------------------------------------------
+
+Overview
+^^^^^^^^
+
+By default, POSIX I/O operations perform buffered I/O using the OS page cache. However, Direct I/O (bypassing the page cache) can significantly improve performance in certain scenarios, such as writes and cold page-cache reads.
+
+Traditional Direct I/O has strict requirements: the buffer address must be page-aligned, the file offset must be page-aligned, and the transfer size must be a multiple of the page size (typically 4096 bytes). :py:class:`kvikio.CuFile` provides the feature of opportunistic Direct I/O, which removes these restrictions by automatically handling alignment. Specifically, KvikIO can split a POSIX I/O operation into unaligned and aligned segments and apply buffered I/O and Direct I/O to them, respectively.
+
+Configuration
+^^^^^^^^^^^^^
+
+Set the environment variable ``KVIKIO_AUTO_DIRECT_IO_READ`` / ``KVIKIO_AUTO_DIRECT_IO_WRITE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable opportunistic Direct I/O.
+
+.. code-block:: bash
+
+   export KVIKIO_AUTO_DIRECT_IO_READ=1
+   export KVIKIO_AUTO_DIRECT_IO_WRITE=1
+
+Set them to ``false``, ``off``, ``no``, or ``0`` to disable this feature and use buffered I/O.
+
+.. code-block:: bash
+
+   export KVIKIO_AUTO_DIRECT_IO_READ=0
+   export KVIKIO_AUTO_DIRECT_IO_WRITE=0
+
+If not set, the default setting is buffered I/O for POSIX reads (``KVIKIO_AUTO_DIRECT_IO_READ=0``) and Direct I/O for POSIX writes (``KVIKIO_AUTO_DIRECT_IO_WRITE=1``).
+
+Programmatic Access
+^^^^^^^^^^^^^^^^^^^
+
+These settings can be queried (:py:func:`kvikio.defaults.get`) and modified (:py:func:`kvikio.defaults.set`) at runtime using the property names ``auto_direct_io_read`` and ``auto_direct_io_write``.
+
+Example:
+
+.. code-block:: python
+
+    import kvikio.defaults
+
+    # Check current settings
+    print(kvikio.defaults.get("auto_direct_io_read"))
+    print(kvikio.defaults.get("auto_direct_io_write"))
+
+    # Enable Direct I/O for reads, and disable it for writes
+    kvikio.defaults.set({"auto_direct_io_read": True, "auto_direct_io_write": False})
diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst
index 019eff2767..baa48fa3e2 100644
--- a/docs/source/zarr.rst
+++ b/docs/source/zarr.rst
@@ -3,18 +3,16 @@ Zarr

 `Zarr <https://zarr.dev>`_ is a binary file format for chunked, compressed, N-Dimensional array. It is used throughout the PyData ecosystem and especially for climate and biological science applications.
-
 `Zarr-Python <https://github.com/zarr-developers/zarr-python>`_ is the official Python package for reading and writing Zarr arrays. Its main feature is a NumPy-like array that translates array operations into file IO seamlessly.
 KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) <https://developer.nvidia.com/gpudirect-storage>`_ seamlessly.

-KvikIO supports either zarr-python 2.x or zarr-python 3.x.
-However, the API provided in :mod:`kvikio.zarr` differs based on which version of zarr you have, following the differences between zarr-python 2.x and zarr-python 3.x.
-
+If the optional zarr-python dependency is installed, then ``kvikio.zarr`` will be available.
+KvikIO supports zarr-python 3.x.

-Zarr Python 3.x
----------------
+Usage
+-----

-Zarr-python includes native support for reading Zarr chunks into device memory if you `configure Zarr <https://zarr.readthedocs.io/en/stable/user-guide/gpu.html>`__ to use GPUs.
+Zarr-Python includes native support for reading Zarr chunks into device memory if you `configure Zarr <https://zarr.readthedocs.io/en/stable/user-guide/gpu.html>`__ to use GPUs.
 You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to efficiently load data directly into GPU memory.

 .. code-block:: python
@@ -28,16 +26,3 @@ You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to e
     ...
     )
     >>> type(z[:10, :10])
     cupy.ndarray
-
-
-
-Zarr Python 2.x
----------------
-
-
-The following uses zarr-python 2.x, and is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array`
-to create a new Zarr array and how to open an existing Zarr array.
-
-
-.. literalinclude:: ../../python/kvikio/examples/zarr_cupy_nvcomp.py
-   :language: python
diff --git a/java/pom.xml b/java/pom.xml
index c6aa3eb8ce..be956c8cb4 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -1,5 +1,8 @@
-<?xml version="1.0" encoding="UTF-8"?>
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
@@ -7,7 +10,7 @@
   <groupId>ai.rapids.kvikio</groupId>
   <artifactId>cufile</artifactId>
-  <version>25.06.0-SNAPSHOT</version>
+  <version>26.02.0-SNAPSHOT</version>
   <name>cuFile</name>
diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java
index 68c6fcbb31..fcc931a6b3 100644
--- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java
+++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 package ai.rapids.kvikio.cufile;
diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java
index 109857e137..b33b866c83 100644
--- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java
+++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 package ai.rapids.kvikio.cufile;
diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java
index 1df4728253..6c70a2997f 100644
--- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java
+++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java index c21399e292..8323d6bca4 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java index 36e8952df5..d7e18a5660 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 1800db8a75..be211cc8e1 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,15 +1,8 @@ # -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) diff --git a/java/src/main/native/src/CuFileJni.cpp b/java/src/main/native/src/CuFileJni.cpp index 829fcbf5f9..47429a3713 100644 --- a/java/src/main/native/src/CuFileJni.cpp +++ b/java/src/main/native/src/CuFileJni.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java b/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java index 832dfcb626..9443933693 100644 --- a/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java +++ b/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/notebooks/medical_dicom_image_loading_example.ipynb b/notebooks/medical_dicom_image_loading_example.ipynb index 3e47693ee5..1cd603e4c5 100644 --- a/notebooks/medical_dicom_image_loading_example.ipynb +++ b/notebooks/medical_dicom_image_loading_example.ipynb @@ -58,21 +58,19 @@ "metadata": {}, "outputs": [], "source": [ - "import kvikio\n", - "import kvikio.defaults\n", - "import cupy as cp\n", + "import datetime\n", + "import os\n", + "import shutil\n", "import tempfile\n", + "from timeit import default_timer as timer\n", + "\n", + "import cupy as cp\n", + "import numpy as np\n", "import pydicom\n", "from pydicom.dataset import Dataset, FileDataset\n", - "import numpy as np\n", - "import os\n", - "import datetime\n", - "import requests\n", - "import tarfile\n", - "import gzip\n", - "import shutil\n", - "import io\n", - "from timeit import default_timer as timer" + "\n", + "import kvikio\n", + "import kvikio.defaults" ] }, { @@ -110,6 +108,7 @@ " c = cp.random.rand(100, 100, 3)\n", " d = cp.mean(c)\n", "\n", + "\n", "warmup_kvikio()" ] }, @@ -172,7 +171,12 @@ } ], "source": [ - "def create_multiframe_dicom(file_path, num_slices=128, pixel_array_shape=(1024, 1024), pixel_value_range=(0, 4095)):\n", + "def create_multiframe_dicom(\n", + " file_path,\n", + " num_slices=128,\n", + " pixel_array_shape=(1024, 1024),\n", + " pixel_value_range=(0, 4095),\n", + "):\n", " # Create a new DICOM dataset\n", " file_meta = pydicom.dataset.FileMetaDataset()\n", " file_meta.MediaStorageSOPClassUID = pydicom.uid.generate_uid()\n", @@ -190,10 +194,10 @@ " ds.StudyInstanceUID = pydicom.uid.generate_uid()\n", " ds.SeriesInstanceUID = pydicom.uid.generate_uid()\n", " ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID\n", - " ds.StudyDate = datetime.date.today().strftime('%Y%m%d')\n", - " ds.ContentDate = datetime.date.today().strftime('%Y%m%d')\n", - " ds.StudyTime = datetime.datetime.now().strftime('%H%M%S')\n", - " ds.ContentTime = datetime.datetime.now().strftime('%H%M%S')\n", + " ds.StudyDate = datetime.date.today().strftime(\"%Y%m%d\")\n", + " ds.ContentDate = datetime.date.today().strftime(\"%Y%m%d\")\n", + " ds.StudyTime = datetime.datetime.now().strftime(\"%H%M%S\")\n", + " ds.ContentTime = datetime.datetime.now().strftime(\"%H%M%S\")\n", "\n", " # Set the pixel data with random integers\n", " pixel_array = np.random.randint(\n", @@ -234,6 +238,7 @@ " ds.save_as(file_path)\n", " print(f\"Multi-frame DICOM file created at: {file_path}\")\n", "\n", + "\n", "# Example usage\n", "example_dcm_path = os.path.join(temp_working_dir, \"example.dcm\")\n", "\n", @@ -277,7 +282,7 @@ " columns = dcm_read_data.Columns\n", " bits_allocated = dcm_read_data.BitsAllocated\n", " samples_per_pixel = dcm_read_data.SamplesPerPixel\n", - " number_of_frames = getattr(dcm_read_data, 'NumberOfFrames', 1)\n", + " number_of_frames = getattr(dcm_read_data, \"NumberOfFrames\", 1)\n", " pixel_representation = dcm_read_data.PixelRepresentation\n", "\n", " if bits_allocated == 8:\n", diff --git a/notebooks/medical_nifti_image_loading_example.ipynb b/notebooks/medical_nifti_image_loading_example.ipynb index bc0ec2f837..7104f7db07 100644 --- a/notebooks/medical_nifti_image_loading_example.ipynb +++ b/notebooks/medical_nifti_image_loading_example.ipynb @@ -60,19 +60,21 @@ "metadata": {}, "outputs": [], "source": [ - "import kvikio\n", - "import kvikio.defaults\n", - "import cupy as cp\n", - "import numpy as np\n", + "import gzip\n", + "import 
io\n", + "import os\n", + "import shutil\n", + "import tarfile\n", "import tempfile\n", + "from timeit import default_timer as timer\n", + "\n", + "import cupy as cp\n", "import nibabel as nib\n", - "import os\n", + "import numpy as np\n", "import requests\n", - "import tarfile\n", - "import gzip\n", - "import shutil\n", - "import io\n", - "from timeit import default_timer as timer" + "\n", + "import kvikio\n", + "import kvikio.defaults" ] }, { @@ -110,6 +112,7 @@ " c = cp.random.rand(100, 100, 3)\n", " d = cp.mean(c)\n", "\n", + "\n", "warmup_kvikio()" ] }, @@ -196,8 +199,10 @@ ], "source": [ "# decompress the nii.gz file\n", - "example_nifti_path = os.path.join(temp_working_dir, \"Task09_Spleen\", \"imagesTr\", \"spleen_53.nii\")\n", - "with gzip.open(example_nifti_path+\".gz\", \"rb\") as f_in:\n", + "example_nifti_path = os.path.join(\n", + " temp_working_dir, \"Task09_Spleen\", \"imagesTr\", \"spleen_53.nii\"\n", + ")\n", + "with gzip.open(example_nifti_path + \".gz\", \"rb\") as f_in:\n", " with open(example_nifti_path, \"wb\") as f_out:\n", " shutil.copyfileobj(f_in, f_out)\n", "print(\"a decompressed nifti file is saved at: \", example_nifti_path)" diff --git a/notebooks/nvcomp_batch_codec.ipynb b/notebooks/nvcomp_batch_codec.ipynb deleted file mode 100644 index f4f4689f3c..0000000000 --- a/notebooks/nvcomp_batch_codec.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b543ae63", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numcodecs\n", - "\n", - "import numpy as np\n", - "\n", - "import zarr\n", - "\n", - "from IPython.display import display\n", - "\n", - "np.set_printoptions(precision=4, suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8e55d5", - "metadata": {}, - "source": [ - "### Basic usage\n", - "\n", - "Get nvCOMP codec from numcodecs registry:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "75524650", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "NvCompBatchCodec(algorithm='lz4', options={})" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "NVCOMP_CODEC_ID = \"nvcomp_batch\"\n", - "\n", - "# Currently supported algorithms.\n", - "LZ4_ALGO = \"LZ4\"\n", - "GDEFLATE_ALGO = \"Gdeflate\"\n", - "SNAPPY_ALGO = \"snappy\"\n", - "ZSTD_ALGO = \"zstd\"\n", - "\n", - "codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algorithm=LZ4_ALGO))\n", - "# To pass algorithm-specific options, use options parameter:\n", - "# codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algo=LZ4_ALGO, options={\"data_type\": 1}))\n", - "\n", - "display(codec)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "46641ccb", - "metadata": {}, - "source": [ - "Create data:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "12a4fffd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1.6243, -0.6118, -0.5282, ..., 0.0436, -0.62 , 0.698 ],\n", - " [-0.4471, 1.2245, 0.4035, ..., 0.4203, 0.811 , 1.0444],\n", - " [-0.4009, 0.824 , -0.5623, ..., 0.7848, -0.9554, 0.5859],\n", - " ...,\n", - " [ 1.3797, 0.1387, 1.2255, ..., 1.8051, 0.3722, 0.1253],\n", - " [ 0.7348, -0.7115, -0.1248, ..., -1.9533, -0.7684, -0.5345],\n", - " [ 0.2183, -0.8654, 0.8886, ..., -1.0141, -0.0627, -1.4379]],\n", - " dtype=float32)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "shape = (100, 100)\n", - "chunks = (10, 10)\n", - "\n", - "np.random.seed(1)\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=codec)\n", - "display(x[:])\n", - "display(x.info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c15cbdff", - "metadata": {}, - "source": [ - "Store and load back the data:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "730cde85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'algorithm': 'lz4', 'id': 'nvcomp_batch', 'options': {}},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorNvCompBatchCodec(algorithm='lz4', options={})Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored41006 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "y = zarr.open_array(zarr_store)\n", - "display(y.info)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5b6cc2ca", - "metadata": {}, - "outputs": [], - "source": [ - "# Test the roundtrip.\n", - "np.testing.assert_equal(y[:], x[:])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8eea79", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU. For example, the data might be created using CPU LZ4 codec and then decompressed using GPU version of LZ4 codec." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "87d25b76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'acceleration': 1, 'id': 'lz4'},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorLZ4(acceleration=1)Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored40973 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : LZ4(acceleration=1)\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40973 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40883 (39.9K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get default (CPU) implementation of LZ4 codec.\n", - "cpu_codec = numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=cpu_codec)\n", - "# Define a simple, dictionary-based store. In real scenarios this can be a filesystem or some other persistent store.\n", - "store = {}\n", - "zarr.save_array(store, x, compressor=cpu_codec)\n", - "\n", - "# Check that the data was written by the expected codec.\n", - "meta = json.loads(store[\".zarray\"])\n", - "display(meta)\n", - "assert meta[\"compressor\"][\"id\"] == \"lz4\"\n", - "\n", - "# Change codec to GPU/nvCOMP-based.\n", - "meta[\"compressor\"] = {\"id\": NVCOMP_CODEC_ID, \"algorithm\": LZ4_ALGO}\n", - "store[\".zarray\"] = json.dumps(meta).encode()\n", - "\n", - "y = zarr.open_array(store, compressor=codec)\n", - "\n", - "display(x.info)\n", - "display(y.info)\n", - "\n", - "np.testing.assert_equal(x[:], y[:])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9294992", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/nvcomp_vs_zarr_lz4.ipynb b/notebooks/nvcomp_vs_zarr_lz4.ipynb deleted file mode 100644 index 3b6d947ac3..0000000000 --- a/notebooks/nvcomp_vs_zarr_lz4.ipynb +++ /dev/null @@ -1,2832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 42, - "id": "f95b6759-533a-470b-8f08-5f91ebcea625", - "metadata": {}, - "outputs": [], - "source": [ - "import cupy as cp\n", - "import numpy as np\n", - "import pandas as pd\n", - "import time\n", - "import zarr\n", - "\n", - "import kvikio.nvcomp\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "d1e60a9b-0bca-4c66-b2f0-829acc3b1ba2", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# conda install -c conda-forge zarr" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "2cedb529-c0fa-4883-a2fd-78b1ad3c1a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935288, 31467644, 15733822, 7866911, 3933455, 1966727, 983363, 491681, 245840, 122920, 61460, 30730, 15365, 7682, 3841]\n" - ] - } - ], - "source": [ - "HOST_LZ4_MAX = 2013929216 # 2113929216\n", - "sizes = list(map(lambda x: HOST_LZ4_MAX//(2**x), np.arange(20)))\n", - "print(sizes)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "39483573-e79b-4dca-aee3-13bf392da3a7", - "metadata": {}, - "outputs": [], - "source": [ - "input_size = []\n", - 
"cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "bitcomp_gpu_size = []\n", - "bitcomp_gpu_temp_size = []\n", - "bitcomp_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ccd9b1e7-b607-4948-8256-73bedf1ec7a8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "821.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-09-29 13:44:43-- http://textfiles.com/etext/NONFICTION/kjv10.txt\n", - "Resolving textfiles.com (textfiles.com)... 208.86.224.90\n", - "Connecting to textfiles.com (textfiles.com)|208.86.224.90|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 4432803 (4.2M) [text/plain]\n", - "Saving to: ‘kjv10.txt.2’\n", - "\n", - "kjv10.txt.2 100%[===================>] 4.23M 316KB/s in 14s \n", - "\n", - "2023-09-29 13:44:58 (304 KB/s) - ‘kjv10.txt.2’ saved [4432803/4432803]\n", - "\n" - ] - } - ], - "source": [ - "!wget http://textfiles.com/etext/NONFICTION/kjv10.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "4c9a5c4c-4c49-4834-8dc2-3e6fc11ea930", - "metadata": {}, - "outputs": [], - "source": [ - "text = open('kjv10.txt').read()\n", - "bib = np.frombuffer(bytes(text, 'utf-8'), dtype=np.int8)\n", - "data_buffer = np.tile(bib, 500)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "74740819-b987-4012-ba6c-ed3d3b9afd60", - "metadata": {}, - "outputs": [], - "source": [ - "# One of the three below keys, this will set the arrangement of test data for a full run of the notebook.\n", - "TARGET = \"Ascending\"\n", - "DTYPE = cp.int32" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0a1307ed-034c-4943-a7e1-36665cba8ad5", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\n", - " \"Ascending\": np.arange(0, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Random\": np.random.randint(0, 100, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Text\": data_buffer\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "68adbb33-ddb7-4603-8863-fdd25b8bdc51", - "metadata": {}, - "outputs": [], - "source": [ - "def get_host_data(offset, dtype):\n", - " exemplar = np.array([1], dtype=dtype)\n", - " print(offset)\n", - " print(exemplar.itemsize)\n", - " print(data[TARGET].itemsize)\n", - " index = offset // data[TARGET].itemsize\n", - " index = index - (index % exemplar.itemsize)\n", - " print(index)\n", - " return data[TARGET][0:index].view(dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f067cdc2-ee14-4258-b89d-0bb4a224c698", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "-----\n", - "Input size: 2013929216\n", - "Cascaded GPU compressor output size: 33434464\n", - "Cascaded GPU decompressor output size: 2013929216\n", - "Cascaded GPU compress/decompress round trip time: 0.1076362133026123\n", - "2021826967\n", - "Lz4 zarr time: 4.681669235229492\n", - "Lz4 compressed size: 2021826967\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "-----\n", - "Input size: 1006964608\n", - "Cascaded GPU compressor output size: 16717276\n", - "Cascaded GPU 
decompressor output size: 1006964608\n", - "Cascaded GPU compress/decompress round trip time: 0.11769247055053711\n", - "1010913478\n", - "Lz4 zarr time: 2.57978892326355\n", - "Lz4 compressed size: 1010913478\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "-----\n", - "Input size: 503482304\n", - "Cascaded GPU compressor output size: 8358716\n", - "Cascaded GPU decompressor output size: 503482304\n", - "Cascaded GPU compress/decompress round trip time: 0.05775332450866699\n", - "505456734\n", - "Lz4 zarr time: 1.2365527153015137\n", - "Lz4 compressed size: 505456734\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "-----\n", - "Input size: 251741152\n", - "Cascaded GPU compressor output size: 4179436\n", - "Cascaded GPU decompressor output size: 251741152\n", - "Cascaded GPU compress/decompress round trip time: 0.0284881591796875\n", - "252728362\n", - "Lz4 zarr time: 0.5986642837524414\n", - "Lz4 compressed size: 252728362\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "-----\n", - "Input size: 125870576\n", - "Cascaded GPU compressor output size: 2089796\n", - "Cascaded GPU decompressor output size: 125870576\n", - "Cascaded GPU compress/decompress round trip time: 0.01472783088684082\n", - "126364175\n", - "Lz4 zarr time: 0.30330395698547363\n", - "Lz4 compressed size: 126364175\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "-----\n", - "Input size: 62935280\n", - "Cascaded GPU compressor output size: 1044976\n", - "Cascaded GPU decompressor output size: 62935280\n", - "Cascaded GPU compress/decompress round trip time: 0.007399559020996094\n", - "63182074\n", - "Lz4 zarr time: 0.1610257625579834\n", - "Lz4 compressed size: 63182074\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "-----\n", - "Input size: 31467632\n", - "Cascaded GPU compressor output size: 522532\n", - "Cascaded GPU decompressor output size: 31467632\n", - "Cascaded GPU compress/decompress round trip time: 0.004503726959228516\n", - "31591024\n", - "Lz4 zarr time: 0.1471562385559082\n", - "Lz4 compressed size: 31591024\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "-----\n", - "Input size: 15733808\n", - "Cascaded GPU compressor output size: 261344\n", - "Cascaded GPU decompressor output size: 15733808\n", - "Cascaded GPU compress/decompress round trip time: 0.0025734901428222656\n", - "15795499\n", - "Lz4 zarr time: 0.03436875343322754\n", - "Lz4 compressed size: 15795499\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "-----\n", - "Input size: 7866896\n", - "Cascaded GPU compressor output size: 130716\n", - "Cascaded GPU decompressor output size: 7866896\n", - "Cascaded GPU compress/decompress round trip time: 0.0018618106842041016\n", - "7897736\n", - "Lz4 zarr time: 0.010539531707763672\n", - "Lz4 compressed size: 7897736\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "-----\n", - "Input size: 3933440\n", - "Cascaded GPU compressor output size: 65436\n", - "Cascaded GPU decompressor output size: 3933440\n", - "Cascaded GPU compress/decompress round trip time: 0.0017323493957519531\n", - "3948855\n", - "Lz4 zarr time: 0.028203964233398438\n", - "Lz4 compressed size: 3948855\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "-----\n", - "Input size: 1966720\n", - "Cascaded GPU compressor output size: 32796\n", - "Cascaded GPU decompressor output size: 1966720\n", - "Cascaded GPU compress/decompress round trip time: 0.0020630359649658203\n", - "1974422\n", - "Lz4 zarr time: 0.002621889114379883\n", - "Lz4 compressed size: 1974422\n", - "983363\n", - 
"4\n", - "4\n", - "245840\n", - "-----\n", - "Input size: 983360\n", - "Cascaded GPU compressor output size: 16476\n", - "Cascaded GPU decompressor output size: 983360\n", - "Cascaded GPU compress/decompress round trip time: 0.0014410018920898438\n", - "987206\n", - "Lz4 zarr time: 0.0007197856903076172\n", - "Lz4 compressed size: 987206\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "-----\n", - "Input size: 491680\n", - "Cascaded GPU compressor output size: 8316\n", - "Cascaded GPU decompressor output size: 491680\n", - "Cascaded GPU compress/decompress round trip time: 0.0011644363403320312\n", - "493597\n", - "Lz4 zarr time: 0.000965118408203125\n", - "Lz4 compressed size: 493597\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "-----\n", - "Input size: 245840\n", - "Cascaded GPU compressor output size: 4236\n", - "Cascaded GPU decompressor output size: 245840\n", - "Cascaded GPU compress/decompress round trip time: 0.0015044212341308594\n", - "246793\n", - "Lz4 zarr time: 0.0004220008850097656\n", - "Lz4 compressed size: 246793\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "-----\n", - "Input size: 122912\n", - "Cascaded GPU compressor output size: 2184\n", - "Cascaded GPU decompressor output size: 122912\n", - "Cascaded GPU compress/decompress round trip time: 0.0011115074157714844\n", - "123383\n", - "Lz4 zarr time: 0.0002646446228027344\n", - "Lz4 compressed size: 123383\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "-----\n", - "Input size: 61456\n", - "Cascaded GPU compressor output size: 1148\n", - "Cascaded GPU decompressor output size: 61456\n", - "Cascaded GPU compress/decompress round trip time: 0.0009233951568603516\n", - "61678\n", - "Lz4 zarr time: 0.00020623207092285156\n", - "Lz4 compressed size: 61678\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "-----\n", - "Input size: 30720\n", - "Cascaded GPU compressor output size: 632\n", - "Cascaded GPU decompressor output size: 30720\n", - "Cascaded GPU compress/decompress round trip time: 0.001186370849609375\n", - "30822\n", - "Lz4 zarr time: 0.00011777877807617188\n", - "Lz4 compressed size: 30822\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "-----\n", - "Input size: 15360\n", - "Cascaded GPU compressor output size: 360\n", - "Cascaded GPU decompressor output size: 15360\n", - "Cascaded GPU compress/decompress round trip time: 0.001523733139038086\n", - "15401\n", - "Lz4 zarr time: 0.0003781318664550781\n", - "Lz4 compressed size: 15401\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "-----\n", - "Input size: 7680\n", - "Cascaded GPU compressor output size: 224\n", - "Cascaded GPU decompressor output size: 7680\n", - "Cascaded GPU compress/decompress round trip time: 0.0012781620025634766\n", - "7699\n", - "Lz4 zarr time: 0.0001780986785888672\n", - "Lz4 compressed size: 7699\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "-----\n", - "Input size: 3840\n", - "Cascaded GPU compressor output size: 156\n", - "Cascaded GPU decompressor output size: 3840\n", - "Cascaded GPU compress/decompress round trip time: 0.001318216323852539\n", - "3852\n", - "Lz4 zarr time: 0.00019931793212890625\n", - "Lz4 compressed size: 3852\n" - ] - } - ], - "source": [ - "input_size = []\n", - "cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = 
cp.array(data_host)\n", - " \"\"\"Cascaded GPU\"\"\"\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.CascadedManager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " input_size.append(data_gpu.size * data_gpu.itemsize)\n", - " cascaded_round_trip_time.append(time.time() - t_gpu)\n", - " cascaded_size.append(output_size)\n", - " print('-----')\n", - " print('Input size: ', data_gpu.size * data_gpu.itemsize)\n", - " print('Cascaded GPU compressor output size: ', output_size)\n", - " print('Cascaded GPU decompressor output size: ', decompressed_size)\n", - " print('Cascaded GPU compress/decompress round trip time: ',time.time() - t_gpu)\n", - " \n", - " del compressor\n", - " \n", - " \"\"\"LZ4 Host\"\"\"\n", - " lz4 = zarr.LZ4()\n", - " t_host = time.time()\n", - " host_compressed = lz4.encode(data_gpu.get())\n", - " del data_gpu\n", - " print(len(host_compressed))\n", - " host_compressed = host_compressed[:2113929216]\n", - " host_decompressed = lz4.decode(host_compressed)\n", - " print('Lz4 zarr time: ', time.time() - t_host)\n", - " print('Lz4 compressed size: ', len(host_compressed))\n", - " lz4_size.append(len(host_compressed))\n", - " lz4_round_trip_time.append(time.time() - t_host)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "c981e8bc-e96a-4af4-9fe1-414aa2ff4c99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "lz4 GPU compressor output size: 2022340697\n", - "lz4 GPU decompressor output size: 2013929216\n", - "lz4 GPU compress/decompress round trip time: 0.7271463871002197\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "lz4 GPU compressor output size: 1011170371\n", - "lz4 GPU decompressor output size: 1006964608\n", - "lz4 GPU compress/decompress round trip time: 0.36713171005249023\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "lz4 GPU compressor output size: 505585200\n", - "lz4 GPU decompressor output size: 503482304\n", - "lz4 GPU compress/decompress round trip time: 0.1900792121887207\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "lz4 GPU compressor output size: 252792621\n", - "lz4 GPU decompressor output size: 251741152\n", - "lz4 GPU compress/decompress round trip time: 0.09049177169799805\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "lz4 GPU compressor output size: 126396327\n", - "lz4 GPU decompressor output size: 125870576\n", - "lz4 GPU compress/decompress round trip time: 0.04643416404724121\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "lz4 GPU compressor output size: 63198181\n", - "lz4 GPU decompressor output size: 62935280\n", - "lz4 GPU compress/decompress round trip time: 0.02284073829650879\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "lz4 GPU compressor output size: 31599109\n", - "lz4 GPU decompressor output size: 31467632\n", - "lz4 GPU compress/decompress round trip time: 0.015845537185668945\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "lz4 GPU compressor output size: 15799573\n", - "lz4 GPU decompressor output size: 15733808\n", - "lz4 GPU compress/decompress round trip time: 0.009501934051513672\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "lz4 GPU compressor output size: 7899801\n", - "lz4 GPU decompressor output size: 
7866896\n", - "lz4 GPU compress/decompress round trip time: 0.011568546295166016\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "lz4 GPU compressor output size: 3949915\n", - "lz4 GPU decompressor output size: 3933440\n", - "lz4 GPU compress/decompress round trip time: 0.00696110725402832\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "lz4 GPU compressor output size: 1974981\n", - "lz4 GPU decompressor output size: 1966720\n", - "lz4 GPU compress/decompress round trip time: 0.012327194213867188\n", - "983363\n", - "4\n", - "4\n", - "245840\n", - "lz4 GPU compressor output size: 987514\n", - "lz4 GPU decompressor output size: 983360\n", - "lz4 GPU compress/decompress round trip time: 0.006538867950439453\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "lz4 GPU compressor output size: 493774\n", - "lz4 GPU decompressor output size: 491680\n", - "lz4 GPU compress/decompress round trip time: 0.012677907943725586\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "lz4 GPU compressor output size: 246904\n", - "lz4 GPU decompressor output size: 245840\n", - "lz4 GPU compress/decompress round trip time: 0.006706953048706055\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "lz4 GPU compressor output size: 123459\n", - "lz4 GPU decompressor output size: 122912\n", - "lz4 GPU compress/decompress round trip time: 0.010996580123901367\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "lz4 GPU compressor output size: 61745\n", - "lz4 GPU decompressor output size: 61456\n", - "lz4 GPU compress/decompress round trip time: 0.006911039352416992\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "lz4 GPU compressor output size: 30907\n", - "lz4 GPU decompressor output size: 30720\n", - "lz4 GPU compress/decompress round trip time: 0.004134178161621094\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "lz4 GPU compressor output size: 15498\n", - "lz4 GPU decompressor output size: 15360\n", - "lz4 GPU compress/decompress round trip time: 0.0048847198486328125\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "lz4 GPU compressor output size: 7787\n", - "lz4 GPU decompressor output size: 7680\n", - "lz4 GPU compress/decompress round trip time: 0.0031135082244873047\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "lz4 GPU compressor output size: 3940\n", - "lz4 GPU decompressor output size: 3840\n", - "lz4 GPU compress/decompress round trip time: 0.0027506351470947266\n" - ] - } - ], - "source": [ - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = cp.array(data_host)\n", - "\n", - " \"\"\"LZ4 GPU\"\"\"\n", - " data_gpu = cp.array(data_host)\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.LZ4Manager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " lz4_gpu_round_trip_time.append(time.time() - t_gpu)\n", - " lz4_gpu_size.append(output_size)\n", - " print('lz4 GPU compressor output size: ', output_size)\n", - " print('lz4 GPU decompressor output size: ', decompressed_size)\n", - " print('lz4 GPU compress/decompress round trip time: ',time.time() - t_gpu)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "0b9e6efb-439b-4d9e-b221-1a728adee7d6", - "metadata": {}, - "outputs": [], - "source": [ - "# zarr lz4 max buffer size is 264241152 
int64s\n", - "# zarr lz4 max buffer size is 2113929216 bytes\n", - "# cascaded max buffer size is 2147483640 bytes\n", - "# cascaded max buffer size is 268435456 int64s" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "9cd69f83-88de-4929-b760-b8ebfb916b8f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935280, 31467632, 15733808, 7866896, 3933440, 1966720, 983360, 491680, 245840, 122912, 61456, 30720, 15360, 7680, 3840]\n", - "[33434464, 16717276, 8358716, 4179436, 2089796, 1044976, 522532, 261344, 130716, 65436, 32796, 16476, 8316, 4236, 2184, 1148, 632, 360, 224, 156]\n", - "[]\n", - "[0.10751104354858398, 0.11756682395935059, 0.05767321586608887, 0.028416156768798828, 0.014620304107666016, 0.007331132888793945, 0.004427194595336914, 0.0025060176849365234, 0.0017902851104736328, 0.0016641616821289062, 0.001974821090698242, 0.0013790130615234375, 0.0011060237884521484, 0.0014438629150390625, 0.0010533332824707031, 0.0008640289306640625, 0.001127481460571289, 0.0014081001281738281, 0.0011692047119140625, 0.0012063980102539062]\n", - "[2022340697, 1011170371, 505585200, 252792621, 126396327, 63198181, 31599109, 15799573, 7899801, 3949915, 1974981, 987514, 493774, 246904, 123459, 61745, 30907, 15498, 7787, 3940]\n", - "[]\n", - "[0.7270452976226807, 0.3670234680175781, 0.18999958038330078, 0.09043264389038086, 0.04634451866149902, 0.022789478302001953, 0.015785932540893555, 0.009443283081054688, 0.011508703231811523, 0.00690460205078125, 0.012271881103515625, 0.00648951530456543, 0.012626171112060547, 0.006663322448730469, 0.010945320129394531, 0.00687098503112793, 0.004094123840332031, 0.004844188690185547, 0.0030717849731445312, 0.0027098655700683594]\n", - "[2021826967, 1010913478, 505456734, 252728362, 126364175, 63182074, 31591024, 15795499, 7897736, 3948855, 1974422, 987206, 493597, 246793, 123383, 61678, 30822, 15401, 7699, 3852]\n", - "[4.681788921356201, 2.579982280731201, 1.2367866039276123, 0.5987403392791748, 0.3033754825592041, 0.16110515594482422, 0.1472797393798828, 0.03442859649658203, 0.010602712631225586, 0.028273344039916992, 0.0026633739471435547, 0.0007534027099609375, 0.0009970664978027344, 0.0004544258117675781, 0.0002968311309814453, 0.0002384185791015625, 0.00015044212341308594, 0.00044274330139160156, 0.00023889541625976562, 0.00026869773864746094]\n" - ] - } - ], - "source": [ - "print(input_size)\n", - "print(cascaded_size)\n", - "print(cascaded_temp_size)\n", - "print(cascaded_round_trip_time)\n", - "print(lz4_gpu_size)\n", - "print(lz4_gpu_temp_size)\n", - "print(lz4_gpu_round_trip_time)\n", - "print(lz4_size)\n", - "print(lz4_round_trip_time)\n", - "df = pd.DataFrame({\n", - " 'Input Size (Bytes)': input_size,\n", - " 'cascaded_size': cascaded_size,\n", - " 'cascaded_round_trip_time': cascaded_round_trip_time,\n", - " 'lz4_gpu_size': lz4_gpu_size,\n", - " 'lz4_gpu_round_trip_time': lz4_gpu_round_trip_time,\n", - " 'lz4_size': lz4_size,\n", - " 'lz4_round_trip_time': lz4_round_trip_time\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "c7a23383-a073-4156-9be6-9da6b8c9026e", - "metadata": {}, - "outputs": [], - "source": [ - "### You'll need the following to display the upcoming plots. 
###\n", - "\n", - "# !conda install -c conda-forge plotly\n", - "# !npm install require" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "8a7d2c60-79d7-4840-a5fb-c7e1eb42f829", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Input Size (Bytes)', 'cascaded_size', 'cascaded_round_trip_time',\n", - " 'lz4_gpu_size', 'lz4_gpu_round_trip_time', 'lz4_size',\n", - " 'lz4_round_trip_time', 'Cascaded Compression Ratio',\n", - " 'Lz4 Gpu Compression Ratio', 'Lz4 Host Compression Ratio',\n", - " 'Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "df['Cascaded Compression Ratio'] = df['Input Size (Bytes)'] / df['cascaded_size']\n", - "df['Lz4 Gpu Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_gpu_size']\n", - "df['Lz4 Host Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_size']\n", - "df['Cascaded Speedup'] = df['lz4_round_trip_time'] / df['cascaded_round_trip_time']\n", - "df['Lz4 Gpu Speedup'] = df['lz4_round_trip_time'] / df['lz4_gpu_round_trip_time']\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "8c6f225a-61e6-42b2-a991-6eeab56aae48", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Cascaded Speedup
<br>Input Size (Bytes)=%{x}<br>
Multiple Faster=%{y}", - "legendgroup": "Cascaded Speedup", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 43.54705123144407, - 21.94481567030547, - 21.444731065444667, - 21.07041934455389, - 20.75028537882, - 21.975478877361866, - 33.267057999892295, - 13.738369327371325, - 5.922359834864829, - 16.989541547277938, - 1.3486659422914402, - 0.5463347164591977, - 0.9014873895236042, - 0.31472919418758255, - 0.2818017202354006, - 0.27593818984547464, - 0.13343201522520617, - 0.3144260074500508, - 0.20432300163132136, - 0.22272727272727272 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Gpu Speedup
<br>Input Size (Bytes)=%{x}<br>
Multiple Faster=%{y}", - "legendgroup": "Lz4 Gpu Speedup", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 6.439473491768513, - 7.029474967000302, - 6.5094175546732655, - 6.620843022182852, - 6.5460919936414195, - 7.069277927730003, - 9.329809246197762, - 3.645829125429206, - 0.9212777858341448, - 4.094854972375691, - 0.21703061858874728, - 0.11609537455453911, - 0.07896823898183467, - 0.06819808215256906, - 0.027119456303912173, - 0.03469933030292515, - 0.036745865362217564, - 0.09139679102273846, - 0.0777708786091276, - 0.09915537568185817 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 
0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - 
"color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Gpu Acceleration over Zarr Lz4 - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Multiple Faster" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Gpu Acceleration over Zarr Lz4 - ' + TARGET + \" \" + str(DTYPE)\n", - "subtitle = 'Includes host->gpu copy time'\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=['Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " labels={'value': 'Multiple Faster'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "e3d57a90-ca86-41da-9747-696151d66184", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Lz4 Gpu Compression Ratio
<br>Input Size (Bytes)=%{x}<br>
Compression Factor=%{y}", - "legendgroup": "Lz4 Gpu Compression Ratio", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9958407201059258, - 0.9958406979470327, - 0.995840669386683, - 0.9958405866601621, - 0.9958404566613712, - 0.9958400543205508, - 0.9958392181247895, - 0.9958375457362044, - 0.9958347052033336, - 0.9958290241688745, - 0.9958171749500375, - 0.9957934773582957, - 0.9957591934771778, - 0.9956906327965525, - 0.9955693793081104, - 0.9953194590655113, - 0.9939495907076067, - 0.9910956252419667, - 0.9862591498651598, - 0.9746192893401016 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Cascaded Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Cascaded Compression Ratio", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 60.23512792069884, - 60.23496938137529, - 60.23440729413465, - 60.23328315112374, - 60.231034990975196, - 60.226531518427215, - 60.2214448110355, - 60.20344067589078, - 60.183114538388566, - 60.11125374411639, - 59.96828881570923, - 59.68438941490653, - 59.12457912457913, - 58.035882908404155, - 56.27838827838828, - 53.53310104529617, - 48.607594936708864, - 42.666666666666664, - 34.285714285714285, - 24.615384615384617 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Host Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Lz4 Host Compression Ratio", - "line": { - "color": "#00cc96", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Host Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9960937552377597, - 0.9960937606571312, - 0.9960937705105339, - 0.9960937902173401, - 0.9960938375136782, - 0.9960939237290628, - 0.9960940803944817, - 0.9960943937257063, - 0.9960950834517639, - 0.996096336786233, - 0.9960991115374525, - 0.9961041565792752, - 0.9961162648881577, - 0.9961384642190013, - 0.9961826183509884, - 0.9964006615000487, - 0.996690675491532, - 0.9973378352055061, - 0.997532147032082, - 0.9968847352024922 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - 
"#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - 
}, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - 
"zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Compression - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Compression Factor" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Compression - ' + TARGET + \" \" + str(DTYPE)\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=[\n", - " 'Lz4 Gpu Compression Ratio',\n", - " 'Cascaded Compression Ratio',\n", - " 'Lz4 Host Compression Ratio'\n", - " ],\n", - " labels={'value': 'Compression Factor'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/zarr.ipynb b/notebooks/zarr.ipynb deleted file mode 100644 index 33a981ebf5..0000000000 --- a/notebooks/zarr.ipynb +++ /dev/null @@ -1,364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 23, - "id": "7a060f7d-9a0c-4763-98df-7dc82409c6ba", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "In this tutorial, we will show how to use KvikIO to read and write GPU memory directly to/from Zarr files.\n", - "\"\"\"\n", - "import json\n", - "import shutil\n", - "import numpy\n", - "import cupy\n", - "import zarr\n", - "import kvikio\n", - "import kvikio.zarr\n", - "from kvikio.nvcomp_codec import NvCompBatchCodec\n", - "from numcodecs import LZ4" - ] - }, - { - "cell_type": "markdown", - "id": "99f4d25b-2006-4026-8629-1accafb338ef", - "metadata": {}, - "source": [ - "We need to set three Zarr arguments: \n", - " - `meta_array`: in order to make Zarr read into GPU memory (instead of CPU memory), we set the `meta_array` argument to an empty CuPy array. \n", - " - `store`: we need to use a GPU compatible Zarr Store, which will be KvikIO’s GDS store in our case. \n", - " - `compressor`: finally, we need to use a GPU compatible compressor (or `None`). KvikIO provides a nvCOMP compressor `kvikio.nvcomp_codec.NvCompBatchCodec` that we will use." 
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "c179c24a-766e-4e09-83c5-349868042576",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(,\n",
-       " NvCompBatchCodec(algorithm='lz4', options={}),\n",
-       " )"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Let's create a new Zarr array using KvikIO's GDS store and LZ4 compression\n",
-    "z = zarr.array(\n",
-    "    cupy.arange(10), \n",
-    "    chunks=2, \n",
-    "    store=kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"), \n",
-    "    meta_array=cupy.empty(()),\n",
-    "    compressor=NvCompBatchCodec(\"lz4\"),\n",
-    "    overwrite=True,\n",
-    ")\n",
-    "z, z.compressor, z.store"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "cupy.ndarray"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# And because we set the `meta_array` argument, reading the Zarr array returns a CuPy array\n",
-    "type(z[:])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "549ded39-1053-4f82-a8a7-5a2ee999a4a1",
-   "metadata": {},
-   "source": [
-    "From this point onwards, `z` can be used just like any other Zarr array."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "8221742d-f15c-450a-9701-dc8c05326126",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1, 2, 3, 4, 5, 6, 7, 8])"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z[1:9]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "f0c451c1-a240-4b26-a5ef-6e70a5bbeb55",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([42, 43, 44, 45, 46, 47, 48, 49, 50, 51])"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z[:] + 42"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7797155f-40f4-4c50-b704-2356ca64cba3",
-   "metadata": {},
-   "source": [
-    "### GPU compression / CPU decompression"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a0029deb-19b9-4dbb-baf0-ce4b199605a5",
-   "metadata": {},
-   "source": [
-    "In order to read a GPU-written Zarr file into a NumPy array, we simply open that file **without** setting the `meta_array` argument:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "399f23f7-4475-496a-a537-a7163a35c888",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(numpy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z = zarr.open_array(kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"))\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8e9f31d5",
-   "metadata": {},
-   "source": [
-    "And we don't need to use `kvikio.zarr.GDSStore` either:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "4b1f46b2",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(numpy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z = zarr.open_array(\"my-zarr-file.zarr\")\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
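Aside (not from the PR): the examples/zarr_cupy_nvcomp.py file deleted further down in this diff bundles these choices into a single convenience call; a sketch of the equivalent shortcut, assuming the same KvikIO version as this notebook:

    import kvikio.zarr

    # GDS store, nvCOMP compression, and CuPy arrays in one call
    z = kvikio.zarr.open_cupy_array(store="my-zarr-file.zarr", mode="r")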
"f10fd704-35f7-46b7-aabe-ea68fb2bf88d", - "metadata": {}, - "source": [ - "However, the above use `NvCompBatchCodec(\"lz4\")` for decompression. In the following, we will show how to read Zarr file written and compressed using a GPU on the CPU.\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU but Zarr will always use the compressor used to write the Zarr file. We are working with the Zarr team to fix this shortcoming but for now, we will use a workaround where we _patch_ the metadata manually." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d980361a-e132-4f29-ab13-cbceec5bbbb5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray, numcodecs.lz4.LZ4, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a CPU implementation of LZ4\n", - "store = zarr.DirectoryStore(\"my-zarr-file.zarr\") # We could also have used kvikio.zarr.GDSStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = LZ4().get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store)\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8ea73705", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Now, let's try the otherway around." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c9b2d56a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " LZ4(acceleration=1),\n", - " )" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numcodecs\n", - "# Let's create a new Zarr array using the default compression.\n", - "z = zarr.array(\n", - " numpy.arange(10), \n", - " chunks=2, \n", - " store=\"my-zarr-file.zarr\", \n", - " overwrite=True,\n", - " # The default (CPU) implementation of LZ4 codec.\n", - " compressor=numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "markdown", - "id": "dedd4623", - "metadata": {}, - "source": [ - "Again, we will use a workaround where we _patch_ the metadata manually." 
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "ac3f30b1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(cupy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Read the Zarr metadata and replace the compressor with a GPU implementation of LZ4\n",
-    "store = kvikio.zarr.GDSStore(\"my-zarr-file.zarr\")  # We could also have used zarr.DirectoryStore\n",
-    "meta = json.loads(store[\".zarray\"])\n",
-    "meta[\"compressor\"] = NvCompBatchCodec(\"lz4\").get_config()\n",
-    "store[\".zarray\"] = json.dumps(meta).encode()  # NB: this changes the Zarr metadata on disk\n",
-    "\n",
-    "# And then open the file as usual\n",
-    "z = zarr.open_array(store, meta_array=cupy.empty(()))\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "80682922-b7b0-4b08-b595-228c2b446a78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Clean up\n",
-    "shutil.rmtree(\"my-zarr-file.zarr\", ignore_errors=True)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": { "name": "ipython", "version": 3 },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..6f9c11cd0b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+[tool.ruff]
+line-length = 88
+target-version = "py310"
+
+[tool.ruff.lint]
+select = [
+  "E",
+  "F",
+  "I",
+  "W",
+]
+ignore = [
+  # whitespace before :
+  "E203",
+  # line-too-long (due to Copyright header)
+  "E501",
+]
+fixable = ["ALL"]
+
+[tool.ruff.lint.per-file-ignores]
+"*.pyx" = [
+  "E211",
+  "E225",
+  "E226",
+  "E227",
+  "E275",
+  "E402",
+]
+"*.pxd" = [
+  "E211",
+  "E225",
+  "E226",
+  "E227",
+  "E275",
+  "E402",
+]
+"*.pxi" = [
+  "E211",
+  "E225",
+  "E226",
+  "E227",
+  "E275",
+  "E402",
+]
+"notebooks/*" = [
+  "F841",
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+order-by-type = true
+known-first-party = [
+  "kvikio",
+]
+default-section = "third-party"
+section-order = [
+  "future",
+  "standard-library",
+  "third-party",
+  "first-party",
+  "local-folder",
+]
+
+[tool.cython-lint]
+ignore = [
+  # line-too-long (due to Copyright header)
+  "E501",
+]
diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt
index 3e5af8c857..1a8ebf99b6 100644
--- a/python/kvikio/CMakeLists.txt
+++ b/python/kvikio/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.
See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) @@ -26,8 +19,6 @@ project( LANGUAGES CXX CUDA ) -option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) - find_package(kvikio REQUIRED "${RAPIDS_VERSION}") find_package(CUDAToolkit REQUIRED) @@ -35,8 +26,6 @@ find_package(CUDAToolkit REQUIRED) include(rapids-cython-core) rapids_cython_init() -add_subdirectory(cmake) - set(cython_lib_dir kvikio) add_subdirectory(kvikio/_lib) diff --git a/python/kvikio/cli/gpu_compressor.py b/python/kvikio/cli/gpu_compressor.py deleted file mode 100755 index ac34b15e8f..0000000000 --- a/python/kvikio/cli/gpu_compressor.py +++ /dev/null @@ -1,136 +0,0 @@ -# NVIDIA 2022 - -import argparse -import os -import sys -import time - -import cupy - -import kvikio -import kvikio.nvcomp as nvcomp - - -def get_parser(): - class NvcompParser(argparse.ArgumentParser): - """ - Handle special case and show help on invalid argument - """ - - def error(self, message): - sys.stderr.write("\nERROR: {}\n\n".format(message)) - self.print_help() - sys.exit(2) - - parser = NvcompParser() - parser.add_argument("-v", "--verbose", action="store_true", help="Verbose Output") - parser.add_argument( - "-o", - "--out_file", - action="store", - dest="out_file", - help="Output filename", - ) - parser.add_argument( - "-c", - choices=["ans", "bitcomp", "cascaded", "gdeflate", "lz4", "snappy"], - action="store", - dest="compression", - help="Which GPU algorithm to use for compression.", - ) - parser.add_argument( - "-d", - action="store_true", - help="Decompress the incoming file", - ) - parser.add_argument(action="store", dest="filename", help="Relative Filename") - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - print("GPU Compression Initialized") if args.verbose else None - - file_size = os.path.getsize(args.filename) - """ test - data = cupy.arange(10000, dtype="uint8") - """ - data = cupy.zeros(file_size, dtype=cupy.int8) - t = time.time() - f = kvikio.CuFile(args.filename, "r") - f.read(data) - f.close() - read_time = time.time() - t - print(f"File read time: {read_time:.3} seconds.") if args.verbose else None - - if args.d: - compressor = nvcomp.ManagedDecompressionManager(data) - elif args.compression == "ans": - compressor = nvcomp.ANSManager() - elif args.compression == "bitcomp": - compressor = nvcomp.BitcompManager() - elif args.compression == "cascaded": - compressor = nvcomp.CascadedManager() - elif args.compression == "gdeflate": - compressor = nvcomp.GdeflateManager() - elif args.compression == "snappy": - compressor = nvcomp.SnappyManager() - else: - compressor = nvcomp.LZ4Manager(chunk_size=1 << 16) - - if args.d is True: - print(f"Decompressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.decompress(data) - decompress_time = time.time() - t - print( - f"Decompression time: {decompress_time:.3} seconds" - ) if args.verbose else None - - if not args.out_file: - raise ValueError("Must specify filename with -o for decompression.") - - t = time.time() - o = kvikio.CuFile(args.out_file, "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write 
time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Decompressed file size {os.path.getsize(args.out_file)}" - ) if args.verbose else None - else: - file_size = os.path.getsize(args.filename) - - print(f"Compressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.compress(data) - compress_time = time.time() - t - print(f"Compression time: {compress_time:.3} seconds") if args.verbose else None - - t = time.time() - if args.out_file: - o = kvikio.CuFile(args.out_file, "w") - else: - o = kvikio.CuFile(args.filename + ".gpc", "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Compressed file size {compressor.get_compressed_output_size(converted)}" - ) if args.verbose else None - - if args.out_file: - end_name = args.out_file - else: - end_name = args.filename + ".gpc" - print(f"Created file {end_name}") if args.verbose else None - - -if __name__ == "__main__": - main() diff --git a/python/kvikio/cmake/CMakeLists.txt b/python/kvikio/cmake/CMakeLists.txt deleted file mode 100644 index d3882b5ab3..0000000000 --- a/python/kvikio/cmake/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -include(thirdparty/get_nvcomp.cmake) diff --git a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index a2c6326e76..0000000000 --- a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. 
-function(find_and_configure_nvcomp)
-
-  include(${rapids-cmake-dir}/cpm/nvcomp.cmake)
-  set(export_args)
-  if(KvikIO_EXPORT_NVCOMP)
-    set(export_args BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports)
-  endif()
-  rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY})
-
-  # Per-thread default stream
-  if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM)
-    target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM)
-  endif()
-endfunction()
-
-find_and_configure_nvcomp()
diff --git a/python/kvikio/examples/hello_world.py b/python/kvikio/examples/hello_world.py
index a5f4cd49a9..33d2191a1f 100644
--- a/python/kvikio/examples/hello_world.py
+++ b/python/kvikio/examples/hello_world.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import cupy
diff --git a/python/kvikio/examples/http_io.py b/python/kvikio/examples/http_io.py
index 26c9af1d44..79d272c531 100644
--- a/python/kvikio/examples/http_io.py
+++ b/python/kvikio/examples/http_io.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import pathlib
 import tempfile
diff --git a/python/kvikio/examples/zarr_cupy_nvcomp.py b/python/kvikio/examples/zarr_cupy_nvcomp.py
deleted file mode 100644
index 9f05f7874a..0000000000
--- a/python/kvikio/examples/zarr_cupy_nvcomp.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-import cupy
-import numpy
-import zarr
-
-import kvikio
-import kvikio.zarr
-
-
-def main(path):
-    a = cupy.arange(20)
-
-    # Let's use KvikIO's convenience function `open_cupy_array()` to create
-    # a new Zarr file on disk. Its semantics are the same as `zarr.open_array()`
-    # but it uses a GDS file store, nvCOMP compression, and CuPy arrays.
-    z = kvikio.zarr.open_cupy_array(store=path, mode="w", shape=(20,), chunks=(5,))
-
-    # `z` is a regular Zarr Array that we can write to as usual
-    z[0:10] = numpy.arange(0, 10)
-    # but it also supports direct reads and writes of CuPy arrays
-    z[10:20] = cupy.arange(10, 20)
-
-    # Reading `z` returns a CuPy array
-    assert isinstance(z[:], cupy.ndarray)
-    assert (a == z[:]).all()
-
-    # Normally, we cannot assume that GPU and CPU compressors are compatible.
-    # E.g., `open_cupy_array()` uses nvCOMP's Snappy GPU compression by default,
-    # which, as far as we know, isn’t compatible with any CPU compressor. Thus,
-    # let's re-write our Zarr array using a CPU and GPU compatible compressor.
-    #
-    # Warning: it isn't possible to use `CompatCompressor` as a compressor argument
-    # in Zarr directly. It is only meant for `open_cupy_array()`. However,
-    # in an example further down, we show how to write using regular Zarr.
-    z = kvikio.zarr.open_cupy_array(
-        store=path,
-        mode="w",
-        shape=(20,),
-        chunks=(5,),
-        compressor=kvikio.zarr.CompatCompressor.lz4(),
-    )
-    z[:] = a
-
-    # Because we are using a CompatCompressor, it is now possible to open the file
-    # using Zarr's built-in LZ4 decompressor that uses the CPU.
-    z = zarr.open_array(path)
-    # `z` is now read as a regular NumPy array
-    assert isinstance(z[:], numpy.ndarray)
-    assert (a.get() == z[:]).all()
-    # and we can write to it as usual
-    z[:] = numpy.arange(20, 40)
-
-    # And we can read the Zarr file back into a CuPy array.
-    z = kvikio.zarr.open_cupy_array(store=path, mode="r")
-    assert isinstance(z[:], cupy.ndarray)
-    assert (cupy.arange(20, 40) == z[:]).all()
-
-    # Similarly, we can also open a file written by regular Zarr.
-    # Let's write the file without any compressor.
-    ary = numpy.arange(10)
-    z = zarr.open(store=path, mode="w", shape=ary.shape, compressor=None)
-    z[:] = ary
-    # This works as before where the file is read as a CuPy array
-    z = kvikio.zarr.open_cupy_array(store=path)
-    assert isinstance(z[:], cupy.ndarray)
-    assert (z[:] == cupy.asarray(ary)).all()
-
-    # Using a compressor is a bit more tricky since not all CPU compressors
-    # are GPU compatible. To make sure we use a compatible compressor, we use
-    # the CPU part of `CompatCompressor.lz4()`.
-    ary = numpy.arange(10)
-    z = zarr.open(
-        store=path,
-        mode="w",
-        shape=ary.shape,
-        compressor=kvikio.zarr.CompatCompressor.lz4().cpu,
-    )
-    z[:] = ary
-    # This works as before where the file is read as a CuPy array
-    z = kvikio.zarr.open_cupy_array(store=path)
-    assert isinstance(z[:], cupy.ndarray)
-    assert (z[:] == cupy.asarray(ary)).all()
-
-
-if __name__ == "__main__":
-    main("/tmp/zarr-cupy-nvcomp")
diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py
index a1f3c483f6..9765101216 100644
--- a/python/kvikio/kvikio/__init__.py
+++ b/python/kvikio/kvikio/__init__.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 # If libkvikio was installed as a wheel, we must request it to load the library symbols.
 # Otherwise, we assume that the library was installed in a system path that ld can find.
@@ -14,14 +14,18 @@
 from kvikio._lib.defaults import CompatMode  # noqa: F401
 from kvikio._version import __git_commit__, __version__
-from kvikio.cufile import CuFile, get_page_cache_info
-from kvikio.remote_file import RemoteFile, is_remote_file_available
+from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info
+from kvikio.mmap import Mmap
+from kvikio.remote_file import RemoteEndpointType, RemoteFile, is_remote_file_available
 
 __all__ = [
     "__git_commit__",
     "__version__",
+    "clear_page_cache",
     "CuFile",
+    "Mmap",
     "get_page_cache_info",
-    "RemoteFile",
     "is_remote_file_available",
+    "RemoteEndpointType",
+    "RemoteFile",
 ]
diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt
index 1ea9b85dff..fe640ecfd7 100644
--- a/python/kvikio/kvikio/_lib/CMakeLists.txt
+++ b/python/kvikio/kvikio/_lib/CMakeLists.txt
@@ -1,20 +1,13 @@
 # =============================================================================
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 # Set the list of Cython files to build, one .so per file
 set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx
-    libnvcomp.pyx libnvcomp_ll.pyx
+    mmap.pyx
 )
 
 if(KvikIO_REMOTE_SUPPORT)
@@ -30,15 +23,5 @@
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_modules}"
See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # Set the list of Cython files to build, one .so per file set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx - libnvcomp.pyx libnvcomp_ll.pyx + mmap.pyx ) if(KvikIO_REMOTE_SUPPORT) @@ -30,15 +23,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio ) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/kvikio/kvikio/_lib/__init__.pxd b/python/kvikio/kvikio/_lib/__init__.pxd index 09549e2c5b..1dcf28d231 100644 --- a/python/kvikio/kvikio/_lib/__init__.pxd +++ b/python/kvikio/kvikio/_lib/__init__.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/__init__.py b/python/kvikio/kvikio/_lib/__init__.py index 3772e2e8e3..57eded9dda 100644 --- a/python/kvikio/kvikio/_lib/__init__.py +++ b/python/kvikio/kvikio/_lib/__init__.py @@ -1,2 +1,2 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/python/kvikio/kvikio/_lib/arr.pxd b/python/kvikio/kvikio/_lib/arr.pxd index 47bad21a3b..a09bb260ff 100644 --- a/python/kvikio/kvikio/_lib/arr.pxd +++ b/python/kvikio/kvikio/_lib/arr.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/arr.pyi b/python/kvikio/kvikio/_lib/arr.pyi index 0d2a8201eb..fb84317fb2 100644 --- a/python/kvikio/kvikio/_lib/arr.pyi +++ b/python/kvikio/kvikio/_lib/arr.pyi @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from typing import Generic, Tuple, TypeVar diff --git a/python/kvikio/kvikio/_lib/arr.pyx b/python/kvikio/kvikio/_lib/arr.pyx index 19818d7cc0..c151535d18 100644 --- a/python/kvikio/kvikio/_lib/arr.pyx +++ b/python/kvikio/kvikio/_lib/arr.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx index 3b90f09816..c86ee7f021 100644 --- a/python/kvikio/kvikio/_lib/buffer.pyx +++ b/python/kvikio/kvikio/_lib/buffer.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 @@ -17,19 +17,33 @@ def memory_register(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_register(arr.ptr) + with nogil: + cpp_memory_register(arr.ptr) def memory_deregister(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_deregister(arr.ptr) + with nogil: + cpp_memory_deregister(arr.ptr) cdef extern from "" nogil: - size_t cpp_alloc_retain_clear "kvikio::AllocRetain::instance().clear"() except + + size_t cpp_page_aligned_bounce_buffer_pool_clear \ + "kvikio::PageAlignedBounceBufferPool::instance().clear"() except + + + size_t cpp_cuda_pinned_bounce_buffer_pool_clear \ + "kvikio::CudaPinnedBounceBufferPool::instance().clear"() except + + + size_t cpp_cuda_page_aligned_pinned_bounce_buffer_pool_clear \ + "kvikio::CudaPageAlignedPinnedBounceBufferPool::instance().clear"() except + def bounce_buffer_free() -> int: - return cpp_alloc_retain_clear() + cdef size_t result + with nogil: + result = cpp_page_aligned_bounce_buffer_pool_clear() + \ + cpp_cuda_pinned_bounce_buffer_pool_clear() + \ + cpp_cuda_page_aligned_pinned_bounce_buffer_pool_clear() + return result diff --git a/python/kvikio/kvikio/_lib/cufile_driver.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx index 0488eb3b20..34d3ba652f 100644 --- a/python/kvikio/kvikio/_lib/cufile_driver.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 @@ -15,15 +15,20 @@ cdef extern from "" nogil: def libcufile_version() -> int: - return cpp_libcufile_version() + cdef int version + with nogil: + version = cpp_libcufile_version() + return version def driver_open(): - cpp_driver_open() + with nogil: + cpp_driver_open() def driver_close(): - cpp_driver_close() + with nogil: + cpp_driver_close() cdef extern from "" nogil: @@ -49,55 +54,90 @@ cdef class DriverProperties: @property def is_gds_available(self) -> bool: + cdef bool result try: - return self._handle.is_gds_available() + with nogil: + result = self._handle.is_gds_available() + return result except RuntimeError: return False @property def major_version(self) -> bool: - return self._handle.get_nvfs_major_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_major_version() + return version @property def minor_version(self) -> bool: - return self._handle.get_nvfs_minor_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_minor_version() + return version @property def allow_compat_mode(self) -> bool: - return self._handle.get_nvfs_allow_compat_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_allow_compat_mode() + return result @property def poll_mode(self) -> bool: - return self._handle.get_nvfs_poll_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_poll_mode() + return result @poll_mode.setter def poll_mode(self, enable: bool) -> None: - self._handle.set_nvfs_poll_mode(enable) + cdef bool cpp_enable = enable + with nogil: + self._handle.set_nvfs_poll_mode(cpp_enable) @property def poll_thresh_size(self) -> int: - return self._handle.get_nvfs_poll_thresh_size() + cdef size_t size + with nogil: + size = self._handle.get_nvfs_poll_thresh_size() + return size @poll_thresh_size.setter def poll_thresh_size(self, size_in_kb: int) -> None: - self._handle.set_nvfs_poll_thresh_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_nvfs_poll_thresh_size(size) @property def max_device_cache_size(self) -> int: - return self._handle.get_max_device_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_max_device_cache_size() + return size @max_device_cache_size.setter def max_device_cache_size(self, size_in_kb: int) -> None: - self._handle.set_max_device_cache_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_device_cache_size(size) @property def per_buffer_cache_size(self) -> int: - return self._handle.get_per_buffer_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_per_buffer_cache_size() + return size @property def max_pinned_memory_size(self) -> int: - return self._handle.get_max_pinned_memory_size() + cdef size_t size + with nogil: + size = self._handle.get_max_pinned_memory_size() + return size @max_pinned_memory_size.setter def max_pinned_memory_size(self, size_in_kb: int) -> None: - self._handle.set_max_pinned_memory_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_pinned_memory_size(size) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index 00f1de4ec1..e5bfbca713 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 @@ -37,75 +37,145 @@ cdef extern from "" namespace "kvikio" nogil: vector[int] cpp_http_status_codes "kvikio::defaults::http_status_codes"() except + void cpp_set_http_status_codes \ "kvikio::defaults::set_http_status_codes"(vector[int] status_codes) except + - long cpp_http_timeout "kvikio::defaults::http_timeout"() except + void cpp_set_http_timeout\ "kvikio::defaults::set_http_timeout"(long timeout_seconds) except + + bool cpp_auto_direct_io_read "kvikio::defaults::auto_direct_io_read"() except + + void cpp_set_auto_direct_io_read \ + "kvikio::defaults::set_auto_direct_io_read"(size_t flag) except + + bool cpp_auto_direct_io_write "kvikio::defaults::auto_direct_io_write"() except + + void cpp_set_auto_direct_io_write \ + "kvikio::defaults::set_auto_direct_io_write"(size_t flag) except + def is_compat_mode_preferred() -> bool: - return cpp_is_compat_mode_preferred() + cdef bool result + with nogil: + result = cpp_is_compat_mode_preferred() + return result def compat_mode() -> CompatMode: - return cpp_compat_mode() + cdef CompatMode result + with nogil: + result = cpp_compat_mode() + return result def set_compat_mode(compat_mode: CompatMode) -> None: - cpp_set_compat_mode(compat_mode) + with nogil: + cpp_set_compat_mode(compat_mode) def thread_pool_nthreads() -> int: - return cpp_thread_pool_nthreads() + cdef unsigned int result + with nogil: + result = cpp_thread_pool_nthreads() + return result def set_thread_pool_nthreads(nthreads: int) -> None: - cpp_set_thread_pool_nthreads(nthreads) + cdef unsigned int cpp_nthreads = nthreads + with nogil: + cpp_set_thread_pool_nthreads(cpp_nthreads) def task_size() -> int: - return cpp_task_size() + cdef size_t result + with nogil: + result = cpp_task_size() + return result def set_task_size(nbytes: int) -> None: - cpp_set_task_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_task_size(cpp_nbytes) def gds_threshold() -> int: - return cpp_gds_threshold() + cdef size_t result + with nogil: + result = cpp_gds_threshold() + return result def set_gds_threshold(nbytes: int) -> None: - cpp_set_gds_threshold(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_gds_threshold(cpp_nbytes) def bounce_buffer_size() -> int: - return cpp_bounce_buffer_size() + cdef size_t result + with nogil: + result = cpp_bounce_buffer_size() + return result def set_bounce_buffer_size(nbytes: int) -> None: - cpp_set_bounce_buffer_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_bounce_buffer_size(cpp_nbytes) def http_max_attempts() -> int: - return cpp_http_max_attempts() + cdef size_t result + with nogil: + result = cpp_http_max_attempts() + return result def set_http_max_attempts(attempts: int) -> None: - cpp_set_http_max_attempts(attempts) + cdef size_t cpp_attempts = attempts + with nogil: + cpp_set_http_max_attempts(cpp_attempts) def http_timeout() -> int: - return cpp_http_timeout() + cdef long result + with nogil: + result = cpp_http_timeout() + return result def set_http_timeout(timeout: int) -> None: - return cpp_set_http_timeout(timeout) + cdef long cpp_timeout = timeout + with nogil: + cpp_set_http_timeout(cpp_timeout) def http_status_codes() -> list[int]: + # Cannot use nogil here because we need the GIL for list creation return cpp_http_status_codes() def set_http_status_codes(status_codes: list[int]) -> None: - return 
cpp_set_http_status_codes(status_codes) + # Cannot use nogil here because we need the GIL for list conversion + cpp_set_http_status_codes(status_codes) + + +def auto_direct_io_read() -> bool: + cdef bool result + with nogil: + result = cpp_auto_direct_io_read() + return result + + +def set_auto_direct_io_read(flag: bool) -> None: + cdef bool cpp_flag = flag + with nogil: + cpp_set_auto_direct_io_read(cpp_flag) + + +def auto_direct_io_write() -> bool: + cdef bool result + with nogil: + result = cpp_auto_direct_io_write() + return result + + +def set_auto_direct_io_write(flag: bool) -> None: + cdef bool cpp_flag = flag + with nogil: + cpp_set_auto_direct_io_write(cpp_flag) diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index 6ac3cc14d8..b2a11fbdc1 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -1,12 +1,11 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 import io import os -import pathlib from typing import Optional, Union from posix cimport fcntl @@ -88,6 +87,7 @@ cdef extern from "" namespace "kvikio" nogil: size_t devPtr_offset, CUstream stream ) except + + bool is_direct_io_supported() cdef class CuFile: @@ -95,88 +95,134 @@ cdef class CuFile: cdef FileHandle _handle def __init__(self, file_path, flags="r"): - self._handle = move( - FileHandle( - str.encode(str(pathlib.Path(file_path))), - str.encode(str(flags)) + cdef string cpp_file_path = os.fsencode(file_path) + cdef string cpp_flags = str(flags).encode() + with nogil: + self._handle = move( + FileHandle( + cpp_file_path, + cpp_flags + ) ) - ) def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def fileno(self) -> int: - return self._handle.fd() + cdef int result + with nogil: + result = self._handle.fd() + return result def open_flags(self) -> int: - return self._handle.fd_open_flags() + cdef int result + with nogil: + result = self._handle.fd_open_flags() + return result def pread(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pread( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pread( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def pwrite(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pwrite( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pwrite( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def read(self, buf, size: Optional[int], 
file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.read( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.read( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def write(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.write( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.write( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def read_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.read_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.read_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) def write_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.write_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.write_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) + + def is_direct_io_supported(self) -> bool: + cdef bool result + with nogil: + result = self._handle.is_direct_io_supported() + return result + cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_str \ @@ -185,20 +231,44 @@ cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_int \ "kvikio::get_page_cache_info"(int fd) except + + bool cpp_clear_page_cache "kvikio::clear_page_cache" \ + (bool reclaim_dentries_and_inodes, bool clear_dirty_pages) \ + except + + def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: + cdef pair[size_t, size_t] result + cdef string path_bytes + cdef int fd + if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object - path_bytes = str(pathlib.Path(file)).encode() - return cpp_get_page_cache_info_str(path_bytes) + path_bytes = os.fsencode(file) + with nogil: + result = cpp_get_page_cache_info_str(path_bytes) + return result elif isinstance(file, int): # file is a file descriptor - return cpp_get_page_cache_info_int(file) + fd = file + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result elif isinstance(file, io.IOBase): # file is a file object # pass its file descriptor to the underlying C++ function - return cpp_get_page_cache_info_int(file.fileno()) + fd = 
file.fileno() + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result else: raise ValueError("The type of `file` must be `os.PathLike`, `str`, `int`, " "or `io.IOBase`") + + +def clear_page_cache(reclaim_dentries_and_inodes: bool, + clear_dirty_pages: bool) -> bool: + cdef bool result + with nogil: + result = cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) + return result diff --git a/python/kvikio/kvikio/_lib/future.pxd b/python/kvikio/kvikio/_lib/future.pxd index 4d564b37a9..daabbe0bdc 100644 --- a/python/kvikio/kvikio/_lib/future.pxd +++ b/python/kvikio/kvikio/_lib/future.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/future.pyx b/python/kvikio/kvikio/_lib/future.pyx index da6ab308dc..1d86f0fc27 100644 --- a/python/kvikio/kvikio/_lib/future.pyx +++ b/python/kvikio/kvikio/_lib/future.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 @@ -15,7 +15,10 @@ cdef extern from "" namespace "kvikio" nogil: cdef class IOFutureStream: """Wrap a C++ StreamFuture in a Python object""" def check_bytes_done(self) -> int: - return self._handle.check_bytes_done() + cdef size_t bytes_done + with nogil: + bytes_done = self._handle.check_bytes_done() + return bytes_done cdef IOFutureStream _wrap_stream_future(cpp_StreamFuture &fut): @@ -34,7 +37,10 @@ cdef class IOFuture: return ret def done(self) -> bool: - return is_future_done(self._handle) + cdef bool result + with nogil: + result = is_future_done(self._handle) + return result cdef IOFuture _wrap_io_future(future[size_t] &fut): diff --git a/python/kvikio/kvikio/_lib/libnvcomp.pyx b/python/kvikio/kvikio/_lib/libnvcomp.pyx deleted file mode 100644 index dc5359e9b3..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp.pyx +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -from enum import Enum - -from libc.stdint cimport uint8_t, uintptr_t -from libcpp cimport nullptr -from libcpp.memory cimport make_shared, shared_ptr -from libcpp.utility cimport move - -from kvikio._lib.arr cimport Array -from kvikio._lib.nvcomp_cxx_api cimport ( - ANSManager, - BitcompManager, - CascadedManager, - CompressionConfig, - DecompressionConfig, - GdeflateManager, - LZ4Manager, - SnappyManager, - create_manager, - nvcompBatchedANSDefaultOpts, - nvcompBatchedANSOpts_t, - nvcompBatchedBitcompFormatOpts, - nvcompBatchedCascadedDefaultOpts, - nvcompBatchedCascadedOpts_t, - nvcompBatchedGdeflateOpts_t, - nvcompBatchedLZ4Opts_t, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyOpts_t, - nvcompManagerBase, - nvcompType_t, -) - - -class pyNvcompType_t(Enum): - pyNVCOMP_TYPE_CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - pyNVCOMP_TYPE_UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - pyNVCOMP_TYPE_SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - pyNVCOMP_TYPE_USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - pyNVCOMP_TYPE_INT = nvcompType_t.NVCOMP_TYPE_INT - pyNVCOMP_TYPE_UINT = nvcompType_t.NVCOMP_TYPE_UINT - pyNVCOMP_TYPE_LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - pyNVCOMP_TYPE_ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - pyNVCOMP_TYPE_BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -cdef class _nvcompManager: - # Temporary storage for factory allocated manager to prevent cleanup - cdef shared_ptr[nvcompManagerBase] _mgr - cdef nvcompManagerBase* _impl - cdef shared_ptr[CompressionConfig] _compression_config - cdef shared_ptr[DecompressionConfig] _decompression_config - - def __dealloc__(self): - # `ManagedManager` uses a temporary object, self._mgr - # to retain a reference count to the Manager created by - # create_manager. If it is present, then the `shared_ptr` - # system will free self._impl. Otherwise, we need to free - # self._iNonempl - if self._mgr == nullptr: - del self._impl - - def configure_compression(self, decomp_buffer_size): - cdef shared_ptr[CompressionConfig] partial = make_shared[ - CompressionConfig]( - self._impl.configure_compression(decomp_buffer_size) - ) - self._compression_config = make_shared[CompressionConfig]( - (move(partial.get()[0])) - ) - cdef const CompressionConfig* compression_config_ptr = \ - self._compression_config.get() - return { - "uncompressed_buffer_size": compression_config_ptr. - uncompressed_buffer_size, - "max_compressed_buffer_size": compression_config_ptr. 
- max_compressed_buffer_size, - "num_chunks": compression_config_ptr.num_chunks - } - - def compress(self, Array decomp_buffer, Array comp_buffer): - cdef uintptr_t comp_buffer_ptr = comp_buffer.ptr - self._impl.compress( - decomp_buffer.ptr, - comp_buffer_ptr, - self._compression_config.get()[0] - ) - size = self._impl.get_compressed_output_size( - comp_buffer_ptr - ) - return size - - def configure_decompression_with_compressed_buffer( - self, - Array comp_buffer - ) -> dict: - cdef shared_ptr[DecompressionConfig] partial = make_shared[ - DecompressionConfig](self._impl.configure_decompression( - comp_buffer.ptr - ) - ) - self._decompression_config = make_shared[DecompressionConfig]( - (move(partial.get()[0])) - ) - cdef const DecompressionConfig* decompression_config_ptr = \ - self._decompression_config.get() - return { - "decomp_data_size": decompression_config_ptr.decomp_data_size, - "num_chunks": decompression_config_ptr.num_chunks - } - - def decompress( - self, - Array decomp_buffer, - Array comp_buffer, - ): - self._impl.decompress( - decomp_buffer.ptr, - comp_buffer.ptr, - self._decompression_config.get()[0] - ) - - def get_compressed_output_size(self, Array comp_buffer): - return self._impl.get_compressed_output_size( - comp_buffer.ptr - ) - - -cdef class _ANSManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - self._impl = new ANSManager( - uncomp_chunk_size, - nvcompBatchedANSDefaultOpts - ) - - -cdef class _BitcompManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - int bitcomp_algo, - ): - cdef opts = nvcompBatchedBitcompFormatOpts(bitcomp_algo, data_type) - self._impl = new BitcompManager( - uncomp_chunk_size, - opts - ) - - -cdef class _CascadedManager(_nvcompManager): - def __cinit__( - self, - _options, - ): - self._impl = new CascadedManager( - _options["chunk_size"], - nvcompBatchedCascadedDefaultOpts - ) - - -cdef class _GdeflateManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - int algo, - ): - cdef opts = nvcompBatchedGdeflateOpts_t(algo) - self._impl = new GdeflateManager( - uncomp_chunk_size, - opts - ) - - -cdef class _LZ4Manager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - # cdef cudaStream_t stream = user_stream - cdef opts = nvcompBatchedLZ4Opts_t(data_type) - self._impl = new LZ4Manager( - uncomp_chunk_size, - opts - ) - - -cdef class _SnappyManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - self._impl = new SnappyManager( - uncomp_chunk_size, - nvcompBatchedSnappyDefaultOpts - ) - - -cdef class _ManagedManager(_nvcompManager): - def __init__(self, compressed_buffer): - cdef shared_ptr[nvcompManagerBase] _mgr = create_manager( - compressed_buffer.ptr - ) - self._mgr = _mgr - self._impl = move(_mgr).get() diff --git a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx b/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx deleted file mode 100644 index 46c7b399a9..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -from __future__ import annotations - -from abc import ABC, abstractmethod -from enum import IntEnum - -from libc.stdint cimport uint32_t, uintptr_t - -from kvikio._lib.nvcomp_ll_cxx_api cimport cudaStream_t, nvcompStatus_t, nvcompType_t - -import cupy - - -class nvCompStatus(IntEnum): - Success = nvcompStatus_t.nvcompSuccess, - ErrorInvalidValue = nvcompStatus_t.nvcompErrorInvalidValue, - ErrorNotSupported = nvcompStatus_t.nvcompErrorNotSupported, - ErrorCannotDecompress = nvcompStatus_t.nvcompErrorCannotDecompress, - ErrorBadChecksum = nvcompStatus_t.nvcompErrorBadChecksum, - ErrorCannotVerifyChecksums = nvcompStatus_t.nvcompErrorCannotVerifyChecksums, - ErrorCudaError = nvcompStatus_t.nvcompErrorCudaError, - ErrorInternal = nvcompStatus_t.nvcompErrorInternal, - - -class nvCompType(IntEnum): - CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - INT = nvcompType_t.NVCOMP_TYPE_INT - UINT = nvcompType_t.NVCOMP_TYPE_UINT - LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -class nvCompBatchAlgorithm(ABC): - """Abstract class that provides interface to nvCOMP batched algorithms.""" - - # TODO(akamenev): it might be possible to have a simpler implementation that - # eilminates the need to have a separate implementation class for each algorithm, - # potentially using fused types in Cython (similar to C++ templates), - # but I could not figure out how to do that (e.g. each algorithm API set has - # a different type for the options and so on). - - def get_compress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get temporary space required for compression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The size in bytes of the required GPU workspace for compression. - """ - err, temp_size = self._get_comp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get compress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return temp_size - - @abstractmethod - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - """Algorithm-specific implementation.""" - ... - - def get_compress_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Get the maximum size any chunk could compress to in the batch. - - Parameters - ---------- - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The maximum compressed size in bytes of the largest chunk. That is, - the minimum amount of output memory required to be given to - the corresponding *CompressAsync function. - """ - err, comp_chunk_size = self._get_comp_chunk_size(max_uncompressed_chunk_bytes) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get output buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return comp_chunk_size - - @abstractmethod - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Algorithm-specific implementation.""" - ... 
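The two size queries above follow the standard nvCOMP low-level workflow: ask for the temp-workspace size and the worst-case compressed chunk size, allocate GPU buffers accordingly, then launch the batched async compression. A minimal sketch of a driver for this (since-removed) interface; `compress_batch` and its argument handling are hypothetical, with cupy buffers laid out as the docstrings above describe:

    import cupy

    def compress_batch(algo, uncomp_chunks, uncomp_chunk_sizes, stream):
        # uncomp_chunks: cp.ndarray[uintp] of device pointers to the chunks;
        # uncomp_chunk_sizes: cp.ndarray[uint64] of their sizes in bytes.
        batch_size = len(uncomp_chunks)
        max_chunk_bytes = int(uncomp_chunk_sizes.max())

        # 1. Query workspace and worst-case output sizes.
        temp_bytes = algo.get_compress_temp_size(batch_size, max_chunk_bytes)
        max_out_bytes = algo.get_compress_chunk_size(max_chunk_bytes)

        # 2. Allocate the temp workspace and one output buffer per chunk.
        temp_buf = cupy.empty(temp_bytes, dtype=cupy.uint8)
        out_bufs = [cupy.empty(max_out_bytes, dtype=cupy.uint8)
                    for _ in range(batch_size)]
        comp_chunks = cupy.array([b.data.ptr for b in out_bufs],
                                 dtype=cupy.uintp)
        comp_chunk_sizes = cupy.empty(batch_size, dtype=cupy.uint64)

        # 3. Launch the batched async compression on `stream`.
        algo.compress(uncomp_chunks, uncomp_chunk_sizes, max_chunk_bytes,
                      batch_size, temp_buf, comp_chunks, comp_chunk_sizes,
                      stream)
        return out_bufs, comp_chunk_sizes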
- - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Perform compression. - - Parameters - ---------- - uncomp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to uncompressed batched items. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each uncompressed batch item on the GPU. - max_uncomp_chunk_bytes: int - The maximum size in bytes of the largest chunk in the batch. - batch_size: int - The number of chunks to compress. - temp_buf: cp.ndarray - The temporary GPU workspace. - comp_chunks: cp.ndarray[uintp] - (output) The list of pointers on the GPU, to the output location for each - compressed batch item. - comp_chunk_sizes: cp.ndarray[uint64] - (output) The compressed size in bytes of each chunk. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Compression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get the amount of temp space required on the GPU for decompression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The size in bytes of the largest chunk when uncompressed. - - Returns - ------- - int - The amount of temporary GPU space in bytes that will be - required to decompress. - """ - err, temp_size = self._get_decomp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - - return temp_size - - @abstractmethod - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Get the amount of space required on the GPU for decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - stream: cp.cuda.Stream - CUDA stream. - - Returns - ------- - cp.ndarray[uint64] - The amount of GPU space in bytes that will be required - to decompress each chunk. - """ - - assert len(comp_chunks) == len(comp_chunk_sizes) - batch_size = len(comp_chunks) - - # nvCOMP requires all buffers to be in GPU memory. - uncomp_chunk_sizes = cupy.empty_like(comp_chunk_sizes) - - err = self._get_decomp_size( - comp_chunks, - comp_chunk_sizes, - batch_size, - uncomp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress buffer size, error: {nvCompStatus(err)!r}." 
- ) - - return uncomp_chunk_sizes - - @abstractmethod - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - """Algorithm-specific implementation.""" - ... - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Perform decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - batch_size: int - The number of chunks to decompress. - temp_buf: cp.ndarray - The temporary GPU workspace. - uncomp_chunks: cp.ndarray[uintp] - (output) The pointers on the GPU, to the output location for each - decompressed batch item. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each decompress chunk location on the GPU. - actual_uncomp_chunk_sizes: cp.ndarray[uint64] - (output) The actual decompressed size in bytes of each chunk on the GPU. - statuses: cp.ndarray - (output) The status for each chunk of whether it was decompressed or not. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Decompression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Algorithm-specific implementation.""" - ... - - -cdef uintptr_t to_ptr(buf): - return buf.data.ptr - - -cdef cudaStream_t to_stream(stream): - return stream.ptr - - -# -# LZ4 algorithm. -# - -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedLZ4CompressAsync, - nvcompBatchedLZ4CompressGetMaxOutputChunkSize, - nvcompBatchedLZ4CompressGetTempSize, - nvcompBatchedLZ4DecompressAsync, - nvcompBatchedLZ4DecompressGetTempSize, - nvcompBatchedLZ4DefaultOpts, - nvcompBatchedLZ4GetDecompressSizeAsync, - nvcompBatchedLZ4Opts_t, -) - - -class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm): - """LZ4 algorithm implementation.""" - - algo_id: str = "lz4" - - options: nvcompBatchedLZ4Opts_t - - HEADER_SIZE_BYTES: size_t = sizeof(uint32_t) - - def __init__(self, data_type: int = None, has_header: bool = True): - """Initialize the codec. - - Parameters - ---------- - data_type: int or None - Source data type. If None, uses nvcomp default options. - has_header: bool - Whether the compressed data has a header. - This enables data compatibility between numcodecs LZ4 codec, - which has the header and nvCOMP LZ4 codec which does not - require the header. - """ - if data_type is None: - self.options = nvcompBatchedLZ4DefaultOpts - else: - self.options = nvcompBatchedLZ4Opts_t(data_type) - - self.has_header = has_header - - # Note on LZ4 header structure: numcodecs LZ4 codec prepends - # a 4-byte (uint32_t) header to each compressed chunk. - # The header stores the size of the original (uncompressed) data: - # https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/lz4.pyx#L89 - # - # The following CUDA kernels read / write chunk header by - # casting the chunk pointer to a pointer to unsigned int. 
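Concretely, the numcodecs convention described in the note above prepends each compressed chunk with a 4-byte unsigned int holding the original (uncompressed) size, with the LZ4 payload starting immediately after it. A host-side illustration in plain Python; `split_numcodecs_lz4_chunk` is a hypothetical helper, and little-endian byte order is assumed, matching numcodecs on common platforms:

    import struct

    HEADER_SIZE_BYTES = 4  # sizeof(uint32), matching the constant above

    def split_numcodecs_lz4_chunk(chunk: bytes) -> tuple[int, bytes]:
        # The header stores the uncompressed size as a little-endian uint32.
        (uncomp_size,) = struct.unpack_from("<I", chunk, 0)
        # The compressed payload follows the 4-byte header.
        return uncomp_size, chunk[HEADER_SIZE_BYTES:]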
- - # CUDA kernel that copies uncompressed chunk size from the chunk header. - self._get_size_from_header_kernel = cupy.ElementwiseKernel( - "uint64 comp_chunk_ptr", - "uint64 uncomp_chunk_size", - "uncomp_chunk_size = *((unsigned int *)comp_chunk_ptr)", - "get_size_from_header", - ) - - # CUDA kernel that copies uncompressed chunk size to the chunk header. - self._set_chunk_size_header_kernel = cupy.ElementwiseKernel( - "uint64 uncomp_chunk_size", - "uint64 comp_chunk_ptr", - "((unsigned int *)comp_chunk_ptr)[0] = (unsigned int)uncomp_chunk_size", - "set_chunk_size_header", - no_return=True, - ) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4CompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - # Add header size, if needed. - if err == nvcompStatus_t.nvcompSuccess and self.has_header: - max_compressed_bytes += self.HEADER_SIZE_BYTES - - return (err, max_compressed_bytes) - - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if self.has_header: - # If there is a header, we need to: - # 1. Copy the uncompressed chunk size to the compressed chunk header. - # 2. Update target pointers in comp_chunks to skip the header portion, - # which is not compressed. - # - self._set_chunk_size_header_kernel(uncomp_chunk_sizes, comp_chunks) - # Update chunk pointer to skip the header. - comp_chunks += self.HEADER_SIZE_BYTES - - super().compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - - if self.has_header: - # Update chunk pointer and size to include the header. - comp_chunks -= self.HEADER_SIZE_BYTES - comp_chunk_sizes += self.HEADER_SIZE_BYTES - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedLZ4CompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4DecompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if not self.has_header: - return super().get_decompress_size( - comp_chunks, - comp_chunk_sizes, - stream, - ) - - return self._get_size_from_header_kernel(comp_chunks) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedLZ4GetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - if self.has_header: - # Update chunk pointer and size to exclude the header. - comp_chunks += self.HEADER_SIZE_BYTES - comp_chunk_sizes -= self.HEADER_SIZE_BYTES - - super().decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedLZ4DecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(data_type={self.options['data_type']})" - - -# -# Gdeflate algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedGdeflateCompressAsync, - nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, - nvcompBatchedGdeflateCompressGetTempSize, - nvcompBatchedGdeflateDecompressAsync, - nvcompBatchedGdeflateDecompressGetTempSize, - nvcompBatchedGdeflateDefaultOpts, - nvcompBatchedGdeflateGetDecompressSizeAsync, - nvcompBatchedGdeflateOpts_t, -) - - -class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm): - """Gdeflate algorithm implementation.""" - - algo_id: str = "gdeflate" - - options: nvcompBatchedGdeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedGdeflateDefaultOpts - else: - self.options = nvcompBatchedGdeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedGdeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedGdeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedGdeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -# -# zstd algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedZstdCompressAsync, - nvcompBatchedZstdCompressGetMaxOutputChunkSize, - nvcompBatchedZstdCompressGetTempSize, - nvcompBatchedZstdDecompressAsync, - nvcompBatchedZstdDecompressGetTempSize, - nvcompBatchedZstdDefaultOpts, - nvcompBatchedZstdGetDecompressSizeAsync, - nvcompBatchedZstdOpts_t, -) - - -class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm): - """zstd algorithm implementation.""" - - algo_id: str = "zstd" - - options: nvcompBatchedZstdOpts_t - - def __init__(self): - self.options = nvcompBatchedZstdDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedZstdCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedZstdGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedZstdDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Snappy algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedSnappyCompressAsync, - nvcompBatchedSnappyCompressGetMaxOutputChunkSize, - nvcompBatchedSnappyCompressGetTempSize, - nvcompBatchedSnappyDecompressAsync, - nvcompBatchedSnappyDecompressGetTempSize, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyGetDecompressSizeAsync, - nvcompBatchedSnappyOpts_t, -) - - -class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm): - """Snappy algorithm implementation.""" - - algo_id: str = "snappy" - - options: nvcompBatchedSnappyOpts_t - - def __init__(self): - self.options = nvcompBatchedSnappyDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedSnappyCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedSnappyGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedSnappyDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Deflate algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedDeflateCompressAsync, - nvcompBatchedDeflateCompressGetMaxOutputChunkSize, - nvcompBatchedDeflateCompressGetTempSize, - nvcompBatchedDeflateDecompressAsync, - nvcompBatchedDeflateDecompressGetTempSize, - nvcompBatchedDeflateDefaultOpts, - nvcompBatchedDeflateGetDecompressSizeAsync, - nvcompBatchedDeflateOpts_t, -) - - -class nvCompBatchAlgorithmDeflate(nvCompBatchAlgorithm): - """Deflate algorithm implementation.""" - - algo_id: str = "deflate" - - options: nvcompBatchedDeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedDeflateDefaultOpts - else: - self.options = nvcompBatchedDeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedDeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedDeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedDeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -SUPPORTED_ALGORITHMS = { - a.algo_id: a for a in [ - nvCompBatchAlgorithmLZ4, - nvCompBatchAlgorithmGdeflate, - nvCompBatchAlgorithmZstd, - nvCompBatchAlgorithmSnappy, - nvCompBatchAlgorithmDeflate, - ] -} diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx new file mode 100644 index 0000000000..29861f0a2c --- /dev/null +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# distutils: language = c++ +# cython: language_level=3 + +import os +from typing import Any, Optional + +from posix cimport fcntl, stat + +from libc.stdint cimport uintptr_t +from libcpp cimport bool +from libcpp.optional cimport nullopt, optional +from libcpp.string cimport string +from libcpp.utility cimport move, pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + +from kvikio._lib import defaults + + +cdef extern from "<kvikio/mmap.hpp>" namespace "kvikio" nogil: + cdef cppclass CppMmapHandle "kvikio::MmapHandle": + CppMmapHandle() noexcept + CppMmapHandle(string file_path, string flags, optional[size_t] initial_map_size, + size_t initial_map_offset, fcntl.mode_t mode, + optional[int] map_flags) except + + size_t initial_map_size() noexcept + size_t initial_map_offset() noexcept + size_t file_size() except + + void close() noexcept + bool closed() noexcept + size_t read(void* buf, optional[size_t] size, size_t offset) except + + future[size_t] pread(void* buf, optional[size_t] size, size_t offset, + size_t task_size) except + + +cdef class InternalMmapHandle: + cdef CppMmapHandle _handle + + def __init__(self, file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None): + if not os.path.exists(file_path): + raise RuntimeError("Unable to open file") + + cdef string cpp_path_bytes = os.fsencode(file_path) + cdef string cpp_flags_bytes = str(flags).encode() + + cdef optional[size_t] cpp_initial_map_size + if initial_map_size is None: + cpp_initial_map_size = nullopt + else: + cpp_initial_map_size = <size_t>(initial_map_size) + + cdef size_t cpp_initial_map_offset = initial_map_offset + cdef fcntl.mode_t cpp_mode = mode + + cdef optional[int] cpp_map_flags + if map_flags is None: + cpp_map_flags = nullopt + else: + cpp_map_flags = <int>(map_flags) + + with nogil: + self._handle = move(CppMmapHandle(cpp_path_bytes, + cpp_flags_bytes, + cpp_initial_map_size, + cpp_initial_map_offset, + cpp_mode, + cpp_map_flags)) + + def initial_map_size(self) -> int: + cdef size_t result + with nogil: + result = self._handle.initial_map_size() + return result + + def initial_map_offset(self) -> int: + cdef size_t result + with nogil: + result = self._handle.initial_map_offset() + return result + + def file_size(self) -> int: + cdef size_t result + with nogil: + result = self._handle.file_size() + return result + + def close(self) -> None: + with nogil: + self._handle.close() + + def 
closed(self) -> bool: + cdef bool result + with nogil: + result = self._handle.closed() + return result + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = <size_t>(size) + cdef size_t cpp_offset = offset + cdef size_t result + with nogil: + result = self._handle.read(info.first, + cpp_size, + cpp_offset) + return result + + def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, + task_size: Optional[int] = None) -> IOFuture: + cdef optional[size_t] cpp_size + cdef size_t cpp_task_size + + if size is None: + cpp_size = nullopt + else: + cpp_size = <size_t>(size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + + cdef size_t cpp_offset = offset + + if task_size is None: + cpp_task_size = defaults.task_size() + else: + cpp_task_size = task_size + + cdef future[size_t] cpp_future + with nogil: + cpp_future = self._handle.pread(info.first, + cpp_size, + cpp_offset, + cpp_task_size) + return _wrap_io_future(cpp_future) diff --git a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd deleted file mode 100644 index b86797a93f..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
- -from libc.stdint cimport uint8_t, uint32_t -from libcpp.memory cimport shared_ptr -from libcpp.vector cimport vector - - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# Manager Factory -cdef extern from "nvcomp/nvcompManagerFactory.hpp" namespace 'nvcomp': - cdef shared_ptr[nvcompManagerBase] create_manager "nvcomp::create_manager"( - const uint8_t* comp_buffer - ) except + - - -# Compression Manager -cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp': - cdef cppclass PinnedPtrPool[T]: - pass - - cdef cppclass CompressionConfig "nvcomp::CompressionConfig": - const size_t uncompressed_buffer_size - const size_t max_compressed_buffer_size - const size_t num_chunks - CompressionConfig( - PinnedPtrPool[nvcompStatus_t]* pool, - size_t uncompressed_buffer_size) except + - nvcompStatus_t* get_status() const - CompressionConfig(CompressionConfig& other) - CompressionConfig& operator=(const CompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # CompressionConfig& operator=(CompressionConfig&& other) except + - - cdef cppclass DecompressionConfig "nvcomp::DecompressionConfig": - size_t decomp_data_size - uint32_t num_chunks - DecompressionConfig(PinnedPtrPool[nvcompStatus_t]& pool) except + - nvcompStatus_t* get_status() const - DecompressionConfig(DecompressionConfig& other) - DecompressionConfig& operator=(const DecompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # DecompressionConfig& operator=(DecompressionConfig&& other) except + - - cdef cppclass nvcompManagerBase "nvcomp::nvcompManagerBase": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - - cdef cppclass PimplManager "nvcomp::PimplManager": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) except + - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& 
comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) except + - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - -# C++ Concrete ANS Manager -cdef extern from "nvcomp/ans.h" nogil: - ctypedef enum nvcompANSType_t: - nvcomp_rANS = 0 - - ctypedef struct nvcompBatchedANSOpts_t: - nvcompANSType_t type - cdef nvcompBatchedANSOpts_t nvcompBatchedANSDefaultOpts - -cdef extern from "nvcomp/ans.hpp": - cdef cppclass ANSManager "nvcomp::ANSManager": - ANSManager( - size_t uncomp_chunk_size, - const nvcompBatchedANSOpts_t& format_opts, - ) except + - -# C++ Concrete Bitcomp Manager -cdef extern from "nvcomp/bitcomp.h" nogil: - ctypedef struct nvcompBatchedBitcompFormatOpts: - int algorithm_type - nvcompType_t data_type - cdef nvcompBatchedBitcompFormatOpts nvcompBatchedBitcompDefaultOpts - -cdef extern from "nvcomp/bitcomp.hpp": - cdef cppclass BitcompManager "nvcomp::BitcompManager": - BitcompManager( - size_t uncomp_chunk_size, - const nvcompBatchedBitcompFormatOpts& format_opts, - ) except + - -# C++ Concrete Cascaded Manager -cdef extern from "nvcomp/cascaded.h" nogil: - ctypedef struct nvcompBatchedCascadedOpts_t: - size_t chunk_size - nvcompType_t type - int num_RLEs - int num_deltas - int use_bp - cdef nvcompBatchedCascadedOpts_t nvcompBatchedCascadedDefaultOpts - -cdef extern from "nvcomp/cascaded.hpp" nogil: - cdef cppclass CascadedManager "nvcomp::CascadedManager": - CascadedManager( - size_t uncomp_chunk_size, - const nvcompBatchedCascadedOpts_t& options, - ) - -# C++ Concrete Gdeflate Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - -cdef extern from "nvcomp/gdeflate.hpp": - cdef cppclass GdeflateManager "nvcomp::GdeflateManager": - GdeflateManager( - int uncomp_chunk_size, - const nvcompBatchedGdeflateOpts_t& format_opts, - ) except + - -# C++ Concrete LZ4 Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - -cdef extern from "nvcomp/lz4.hpp": - cdef cppclass LZ4Manager "nvcomp::LZ4Manager": - LZ4Manager( - size_t uncomp_chunk_size, - const nvcompBatchedLZ4Opts_t& format_opts, - ) except + - -# C++ Concrete Snappy Manager -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - -cdef extern from "nvcomp/snappy.hpp": - cdef cppclass SnappyManager "nvcomp::SnappyManager": - SnappyManager( - size_t uncomp_chunk_size, - const nvcompBatchedSnappyOpts_t& format_opts, - ) except + diff --git a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd deleted file mode 100644 index 6a23eb5cd1..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -# distutils: language = c++ -# cython: language_level=3 - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - - ctypedef enum cudaMemcpyKind: - cudaMemcpyHostToHost = 0, - cudaMemcpyHostToDevice = 1, - cudaMemcpyDeviceToHost = 2, - cudaMemcpyDeviceToDevice = 3, - cudaMemcpyDefault = 4 - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# nvCOMP Low-Level Interface. -# https://github.com/NVIDIA/nvcomp/blob/main/doc/lowlevel_c_quickstart.md - -# -# LZ4 batch compression/decompression API. -# -cdef extern from "nvcomp/lz4.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedLZ4Opts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedLZ4DecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4GetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedLZ4DecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Gdeflate batch compression/decompression API. -# -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - - # Compression API. 
- cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedGdeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedGdeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedGdeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# zstd batch compression/decompression API. -# -cdef extern from "nvcomp/zstd.h" nogil: - ctypedef struct nvcompBatchedZstdOpts_t: - int reserved - - cdef nvcompBatchedZstdOpts_t nvcompBatchedZstdDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedZstdCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedZstdOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedZstdDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedZstdGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedZstdDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Snappy batch compression/decompression API. 
-# -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedSnappyOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedSnappyDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedSnappyGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedSnappyDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - - -# -# Deflate batch compression/decompression API. -# -cdef extern from "nvcomp/deflate.h" nogil: - ctypedef struct nvcompBatchedDeflateOpts_t: - int algo - - cdef nvcompBatchedDeflateOpts_t nvcompBatchedDeflateDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedDeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. 
-    cdef nvcompStatus_t nvcompBatchedDeflateDecompressGetTempSize(
-        size_t num_chunks,
-        size_t max_uncompressed_chunk_bytes,
-        size_t* temp_bytes
-    )
-
-    nvcompStatus_t nvcompBatchedDeflateGetDecompressSizeAsync(
-        const void* const* device_compressed_ptrs,
-        const size_t* device_compressed_bytes,
-        size_t* device_uncompressed_bytes,
-        size_t batch_size,
-        cudaStream_t stream
-    )
-
-    nvcompStatus_t nvcompBatchedDeflateDecompressAsync(
-        const void* const* device_compressed_ptrs,
-        const size_t* device_compressed_bytes,
-        const size_t* device_uncompressed_bytes,
-        size_t* device_actual_uncompressed_bytes,
-        size_t batch_size,
-        void* const device_temp_ptr,
-        size_t temp_bytes,
-        void* const* device_uncompressed_ptrs,
-        nvcompStatus_t* device_statuses,
-        cudaStream_t stream
-    )
diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx
index 5a7ba2c846..2f7031a7c3 100644
--- a/python/kvikio/kvikio/_lib/remote_handle.pyx
+++ b/python/kvikio/kvikio/_lib/remote_handle.pyx
@@ -1,5 +1,5 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

 # distutils: language = c++
 # cython: language_level=3
@@ -7,17 +7,26 @@
 from typing import Optional

 from cython.operator cimport dereference as deref
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport uint8_t, uintptr_t
 from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.optional cimport nullopt, optional
 from libcpp.pair cimport pair
 from libcpp.string cimport string
 from libcpp.utility cimport move, pair
+from libcpp.vector cimport vector

 from kvikio._lib.arr cimport parse_buffer_argument
 from kvikio._lib.future cimport IOFuture, _wrap_io_future, future

-cdef extern from "<kvikio/remote_handle.hpp>" nogil:
+cdef extern from "<kvikio/remote_handle.hpp>" namespace "kvikio" nogil:
+    cpdef enum class RemoteEndpointType(uint8_t):
+        AUTO = 0
+        S3 = 1
+        S3_PUBLIC = 2
+        S3_PRESIGNED_URL = 3
+        WEBHDFS = 4
+        HTTP = 5

     cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint":
         string str() except +
@@ -25,18 +34,39 @@ cdef extern from "<kvikio/remote_handle.hpp>" nogil:
         cpp_HttpEndpoint(string url) except +

     cdef cppclass cpp_S3Endpoint "kvikio::S3Endpoint"(cpp_RemoteEndpoint):
-        cpp_S3Endpoint(string url) except +
-        cpp_S3Endpoint(pair[string, string] bucket_and_object_names) except +
+        cpp_S3Endpoint(
+            string url,
+            optional[string] aws_region,
+            optional[string] aws_access_key,
+            optional[string] aws_secret_access_key,
+            optional[string] aws_session_token
+        ) except +
+        cpp_S3Endpoint(
+            pair[string, string] bucket_and_object_names,
+            optional[string] aws_region,
+            optional[string] aws_access_key,
+            optional[string] aws_secret_access_key,
+            optional[string] aws_endpoint_url,
+            optional[string] aws_session_token
+        ) except +

     pair[string, string] cpp_parse_s3_url \
         "kvikio::S3Endpoint::parse_s3_url"(string url) except +

+    cdef cppclass cpp_S3PublicEndpoint "kvikio::S3PublicEndpoint" (cpp_RemoteEndpoint):
+        cpp_S3PublicEndpoint(string url) except +
+
+    cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \
+        (cpp_RemoteEndpoint):
+        cpp_S3EndpointWithPresignedUrl(string presigned_url) except +
+
     cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle":
         cpp_RemoteHandle(
             unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes
         ) except +
         cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except +
-        int nbytes() except +
+        RemoteEndpointType remote_endpoint_type() noexcept
+        size_t nbytes() noexcept
         const cpp_RemoteEndpoint& endpoint() except +
         size_t read(
             void* buf,
@@ -49,6 +79,17 @@
             size_t file_offset
         ) except +

+        @staticmethod
+        cpp_RemoteHandle cpp_easy_open "open"(
+            string url,
+            RemoteEndpointType remote_endpoint_type,
+            optional[vector[RemoteEndpointType]] allow_list,
+            optional[size_t] nbytes
+        ) except +
+
+cdef extern from "<kvikio/hdfs.hpp>" nogil:
+    cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint):
+        cpp_WebHdfsEndpoint(string url) except +

 cdef string _to_string(str s):
     """Convert Python object to a C++ string (if None, return the empty string)"""
@@ -61,8 +102,18 @@ cdef pair[string, string] _to_string_pair(str s1, str s2):
     """Wrap two Python string objects in a C++ pair"""
     return pair[string, string](_to_string(s1), _to_string(s2))

+cdef optional[string] _to_optional_string(str s):
+    """Convert Python object to a C++ optional string (if None, return nullopt)"""
+    cdef optional[string] result
+    if s is None:
+        result = nullopt
+    else:
+        result = optional[string](_to_string(s))
+    return result
+
+
 # Helper function to cast an endpoint to its base class `RemoteEndpoint`
-cdef extern from *:
+cdef extern from * nogil:
     """
     template <typename T>
     std::unique_ptr<kvikio::RemoteEndpoint> cast_to_remote_endpoint(T endpoint)
     {
        return std::make_unique<T>(std::move(endpoint));
     }
     """
     cdef unique_ptr[cpp_RemoteEndpoint] cast_to_remote_endpoint[T](T handle) except +

+# Helper function for the cpp_RemoteHandle.open method to return
+# unique_ptr[cpp_RemoteHandle] instead of cpp_RemoteHandle. Due to lack of a nullary
+# constructor, cpp_RemoteHandle cannot be created as a stack variable in Cython.
+cdef extern from * nogil:
+    """
+    inline std::unique_ptr<kvikio::RemoteHandle> create_remote_handle_from_open(
+      std::string url,
+      kvikio::RemoteEndpointType remote_endpoint_type,
+      std::optional<std::vector<kvikio::RemoteEndpointType>> allow_list,
+      std::optional<std::size_t> nbytes)
+    {
+      return std::make_unique<kvikio::RemoteHandle>(
+        kvikio::RemoteHandle::open(url, remote_endpoint_type, allow_list, nbytes)
+      );
+    }
+    """
+    cdef unique_ptr[cpp_RemoteHandle] create_remote_handle_from_open(
+        string url,
+        RemoteEndpointType remote_endpoint_type,
+        optional[vector[RemoteEndpointType]] allow_list,
+        optional[size_t] nbytes
+    ) except +

 cdef class RemoteFile:
     cdef unique_ptr[cpp_RemoteHandle] _handle
@@ -82,11 +155,16 @@
         nbytes: Optional[int],
     ):
         cdef RemoteFile ret = RemoteFile()
+
         if nbytes is None:
-            ret._handle = make_unique[cpp_RemoteHandle](move(ep))
+            with nogil:
+                ret._handle = make_unique[cpp_RemoteHandle](move(ep))
             return ret
+
         cdef size_t n = nbytes
-        ret._handle = make_unique[cpp_RemoteHandle](move(ep), n)
+
+        with nogil:
+            ret._handle = make_unique[cpp_RemoteHandle](move(ep), n)
         return ret

     @staticmethod
@@ -94,10 +172,16 @@
         url: str,
         nbytes: Optional[int],
     ):
+        cdef string cpp_url = _to_string(url)
+        cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint
+
+        with nogil:
+            cpp_endpoint = cast_to_remote_endpoint(
+                make_unique[cpp_HttpEndpoint](cpp_url)
+            )
+
         return RemoteFile._from_endpoint(
-            cast_to_remote_endpoint(
-                make_unique[cpp_HttpEndpoint](_to_string(url))
-            ),
+            move(cpp_endpoint),
             nbytes
         )

     @staticmethod
@@ -106,13 +190,44 @@
         bucket_name: str,
         object_name: str,
         nbytes: Optional[int],
+        aws_region_name: Optional[str] = None,
+        aws_access_key_id: Optional[str] = None,
+        aws_secret_access_key: Optional[str] = None,
+        aws_endpoint_url: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
    ):
-        return RemoteFile._from_endpoint(
-            cast_to_remote_endpoint(
+        cdef pair[string, string]
bucket_and_object_names = _to_string_pair( + bucket_name, object_name + ) + cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_endpoint_url = _to_optional_string( + aws_endpoint_url + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + aws_session_token + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( make_unique[cpp_S3Endpoint]( - _to_string_pair(bucket_name, object_name) + bucket_and_object_names, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_endpoint_url, + cpp_aws_session_token ) - ), + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), nbytes ) @@ -120,11 +235,37 @@ cdef class RemoteFile: def open_s3_from_http_url( url: str, nbytes: Optional[int], + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_session_token: Optional[str] = None, ): + cdef string cpp_url = _to_string(url) + cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + aws_session_token + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint]( + cpp_url, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_session_token + ) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -132,36 +273,172 @@ cdef class RemoteFile: def open_s3_from_s3_url( url: str, nbytes: Optional[int], + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + aws_session_token: Optional[str] = None, + ): + cdef string cpp_url = _to_string(url) + cdef pair[string, string] bucket_and_object_names + cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_endpoint_url = _to_optional_string( + aws_endpoint_url + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + aws_session_token + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + bucket_and_object_names = cpp_parse_s3_url(cpp_url) + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint]( + bucket_and_object_names, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_endpoint_url, + cpp_aws_session_token + ) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_s3_public( + url: str, + nbytes: Optional[int], + ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + 
make_unique[cpp_S3PublicEndpoint](cpp_url) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_s3_presigned_url( + presigned_url: str, + nbytes: Optional[int], + ): + cdef string cpp_url = _to_string(presigned_url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3EndpointWithPresignedUrl](cpp_url) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_webhdfs( + url: str, + nbytes: Optional[int], ): - cdef pair[string, string] bucket_and_object = cpp_parse_s3_url(_to_string(url)) return RemoteFile._from_endpoint( cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object) + make_unique[cpp_WebHdfsEndpoint](_to_string(url)) ), nbytes ) + @staticmethod + def open( + url: str, + remote_endpoint_type: RemoteEndpointType, + allow_list: Optional[list], + nbytes: Optional[int] + ): + cdef optional[vector[RemoteEndpointType]] cpp_allow_list + cdef vector[RemoteEndpointType] vec_allow_list + if allow_list is None: + cpp_allow_list = nullopt + else: + for allow_item in allow_list: + vec_allow_list.push_back(allow_item.value) + cpp_allow_list = vec_allow_list + + cdef optional[size_t] cpp_nbytes + if nbytes is None: + cpp_nbytes = nullopt + else: + cpp_nbytes = nbytes + + cdef RemoteFile ret = RemoteFile() + cdef unique_ptr[cpp_RemoteHandle] cpp_handle + cdef string cpp_url = _to_string(url) + with nogil: + cpp_handle = create_remote_handle_from_open( + cpp_url, + remote_endpoint_type, + cpp_allow_list, + cpp_nbytes) + ret._handle = move(cpp_handle) + + return ret + def __str__(self) -> str: - cdef string ep_str = deref(self._handle).endpoint().str() + cdef string ep_str + with nogil: + ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' + def remote_endpoint_type(self) -> RemoteEndpointType: + cdef RemoteEndpointType result + with nogil: + result = deref(self._handle).remote_endpoint_type() + return result + def nbytes(self) -> int: - return deref(self._handle).nbytes() + cdef size_t result + with nogil: + result = deref(self._handle).nbytes() + return result def read(self, buf, size: Optional[int], file_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return deref(self._handle).read( - info.first, - info.second, - file_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t result + + with nogil: + result = deref(self._handle).read( + info.first, + info.second, + cpp_file_offset, + ) + + return result def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - deref(self._handle).pread( + cdef size_t cpp_file_offset = file_offset + cdef future[size_t] fut + + with nogil: + fut = deref(self._handle).pread( info.first, info.second, - file_offset, + cpp_file_offset, ) - ) + + return _wrap_io_future(fut) diff --git a/python/kvikio/kvikio/_nvcomp.py b/python/kvikio/kvikio/_nvcomp.py deleted file mode 100644 index 5606ad5ce5..0000000000 --- a/python/kvikio/kvikio/_nvcomp.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -from enum import Enum - -import cupy as cp -import numpy as np - -import kvikio._lib.libnvcomp as _lib -from kvikio._lib.arr import asarray - -_dtype_map = { - cp.dtype("int8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_CHAR, - cp.dtype("uint8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR, - cp.dtype("int16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_SHORT, - cp.dtype("uint16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_USHORT, - cp.dtype("int32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_INT, - cp.dtype("uint32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UINT, - cp.dtype("int64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_LONGLONG, - cp.dtype("uint64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_ULONGLONG, -} - - -def cp_to_nvcomp_dtype(in_type: cp.dtype) -> Enum: - """Convert np/cp dtypes to nvcomp integral dtypes. - - Parameters - ---------- - in_type - A type argument that can be used to initialize a cupy/numpy dtype. - - Returns - ------- - int - The value of the NVCOMP_TYPE for supported dtype. - """ - cp_type = cp.dtype(in_type) - return _dtype_map[cp_type] - - -class nvCompManager: - """Base class for nvComp Compression Managers. - - Compression managers compress uncompressed data and decompress the result. - - Child types of nvCompManager implement only their constructor, as they each - take different options to build. The rest of their implementation is - in nvCompManager. - - nvCompManager also keeps all of the options for its child types. - """ - - _manager: _lib._nvcompManager = None - config: dict = {} - decompression_config: dict = {} - - # This is a python option: What type was the data when it was passed in? - # This is used only for returning a decompressed view of the original - # datatype. Untested so far. - input_type = cp.int8 - - # Default options exist for every option type for every class that inherits - # from nvCompManager, which takes advantage of the below property-setting - # code. - chunk_size: int = 1 << 16 - data_type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - # Some classes have this defined as type, some as data_type. - type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - - # Bitcomp Defaults - bitcomp_algo: int = 0 - - # Gdeflate defaults - algo: int = 0 - - def __init__(self, kwargs): - """Stores the results of all input arguments as class members. - - This code does type correction, fixing inputs to have an expected - shape before calling one of the nvCompManager methods on a child - class. - - Special case: Convert data_type to a _lib.pyNvcompType_t - """ - # data_type will be passed in as a python object. Convert it to - # a C++ nvcompType_t here. - if kwargs.get("data_type"): - if not isinstance(kwargs["data_type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("data_type") - kwargs["data_type"] = cp_to_nvcomp_dtype( - cp.dtype(kwargs["data_type"]).type - ) - # Special case: Convert type to a _lib.pyNvcompType_t - if kwargs.get("type"): - if not isinstance(kwargs["type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("type") - kwargs["type"] = cp_to_nvcomp_dtype(cp.dtype(kwargs["type"]).type) - for k, v in kwargs.items(): - setattr(self, k, v) - - def compress(self, data: cp.ndarray) -> cp.ndarray: - """Compress a buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to compress. - - Returns - ------- - cp.ndarray - A GPU buffer of compressed bytes. - """ - # TODO: An option: check if incoming data size matches the size of the - # last incoming data, and reuse temp and out buffer if so. 
- data_size = data.size * data.itemsize - self.config = self._manager.configure_compression(data_size) - self.compress_out_buffer = cp.empty( - self.config["max_compressed_buffer_size"], dtype="uint8" - ) - size = self._manager.compress(asarray(data), asarray(self.compress_out_buffer)) - return self.compress_out_buffer[0:size] - - def decompress(self, data: cp.ndarray) -> cp.ndarray: - """Decompress a GPU buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to decompress. - - Returns - ------- - cp.ndarray - An array of `self.dtype` produced after decompressing the input argument. - """ - self.decompression_config = ( - self._manager.configure_decompression_with_compressed_buffer(asarray(data)) - ) - decomp_buffer = cp.empty( - self.decompression_config["decomp_data_size"], dtype="uint8" - ) - self._manager.decompress(asarray(decomp_buffer), asarray(data)) - return decomp_buffer.view(self.input_type) - - def configure_compression(self, data_size: int) -> dict: - """Return the compression configuration object. - - Parameters - ---------- - data_size: int - The size of the buffer that is staged to be compressed. - - Returns - ------- - dict { - "uncompressed_buffer_size": The size of the input data - "max_compressed_buffer_size": The maximum size of the compressed data. The - size of the buffer that must be allocated before calling compress. - "num_chunks": The number of configured chunks to compress the data over - } - """ - return self._manager.configure_compression(data_size) - - def configure_decompression_with_compressed_buffer( - self, data: cp.ndarray - ) -> cp.ndarray: - """Return the decompression configuration object. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of previously compressed data. - - Returns - ------- - dict { - "decomp_data_size": The size of each decompression chunk. - "num_chunks": The number of chunks that the decompressed data is returned - in. - } - """ - return self._manager.configure_decompression_with_compressed_buffer( - asarray(data) - ) - - def get_compressed_output_size(self, comp_buffer: cp.ndarray) -> int: - """Return the actual size of compression result. - - Returns the number of bytes that should be copied out of - `comp_buffer`. - - Parameters - ---------- - comp_buffer: cp.ndarray - A GPU buffer that has been previously compressed. - - Returns - ------- - int - """ - return self._manager.get_compressed_output_size(asarray(comp_buffer)) - - -class ANSManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize an ANSManager object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._ANSManager(self.chunk_size) - - -class BitcompManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU BitcompCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._BitcompManager( - self.chunk_size, - self.data_type.value, - self.bitcomp_algo, - ) - - -class CascadedManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize a CascadedManager for a specific dtype. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. 
- - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096 and can't currently be changed. - dtype: cp.dtype (optional) - The dtype of the input buffer to be compressed. - num_RLEs: int (optional) - Number of Run-Length Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#run-length-encoding-rle) # noqa: E501 - num_deltas: int (optional) - Number of Delta Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#delta-encoding) # noqa: E501 - use_bp: bool (optional) - Enable Bitpacking, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#bitpacking) # noqa: E501 - """ - super().__init__(kwargs) - default_options = { - "chunk_size": 1 << 12, - "type": np.int32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - } - # Replace any options that may have been excluded, they are not optional. - for k, v in default_options.items(): - try: - getattr(self, k) - except Exception: - setattr(self, k, v) - - self.options = { - "chunk_size": self.chunk_size, - "type": self.type, - "num_RLEs": self.num_RLEs, - "num_deltas": self.num_deltas, - "use_bp": self.use_bp, - } - self._manager = _lib._CascadedManager(default_options) - - -class GdeflateManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU GdeflateCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - algo: int (optional) - Integer in the range [0, 1, 2]. Only algorithm #0 is currently - supported. - """ - super().__init__(kwargs) - - self._manager = _lib._GdeflateManager(self.chunk_size, self.algo) - - -class LZ4Manager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU LZ4Compressor object. - - Used to compress and decompress GPU buffers of a specific dtype. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - The size of each chunk of data to decompress indepentently with - LZ4. Must be within the range of [32768, 16777216]. Larger sizes will - result in higher compression, but with decreased parallelism. The - recommended size is 65536. - Defaults to the recommended size. - data_type: pyNVCOMP_TYPE (optional) - The data type returned for decompression. - Defaults to pyNVCOMP_TYPE.UCHAR - """ - super().__init__(kwargs) - self._manager = _lib._LZ4Manager(self.chunk_size, self.data_type.value) - - -class SnappyManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU SnappyCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - """ - super().__init__(kwargs) - self._manager = _lib._SnappyManager(self.chunk_size) - - -class ManagedDecompressionManager(nvCompManager): - def __init__(self, compressed_buffer): - """Create a Managed compressor using the - create_manager factory method. - - This function is used in order to automatically - identify which compression algorithm was used on - an input buffer. - - It returns a ManagedDecompressionManager that can - then be used normally to decompress the unknown - compressed binary data, or compress other data - into the same format. 
- - Parameters - ---------- - compressed_buffer: cp.ndarray - A buffer of compressed bytes of unknown origin. - """ - super().__init__({}) - self._manager = _lib._ManagedManager(asarray(compressed_buffer)) diff --git a/python/kvikio/kvikio/_nvcomp_codec.py b/python/kvikio/kvikio/_nvcomp_codec.py deleted file mode 100644 index dc60d9c7dc..0000000000 --- a/python/kvikio/kvikio/_nvcomp_codec.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from typing import Any, Mapping, Optional, Sequence - -import cupy as cp -import cupy.typing -from numcodecs.compat import ensure_contiguous_ndarray_like - -from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS -from kvikio.numcodecs import BufferLike, CudaCodec - - -class NvCompBatchCodec(CudaCodec): - """Codec that uses batch algorithms from nvCOMP library. - - An algorithm is selected using `algorithm` parameter. - If the algorithm takes additional options, they can be - passed to the algorithm using `options` dictionary. - """ - - # Header stores original uncompressed size. This is required to enable - # data compatibility between existing numcodecs codecs and NvCompBatchCodec. - HEADER_SIZE_BYTES: int = 4 - - codec_id: str = "nvcomp_batch" - algorithm: str - options: Mapping[str, Any] - - def __init__( - self, - algorithm: str, - options: Optional[Mapping[str, Any]] = None, - stream: Optional[cp.cuda.Stream] = None, - ) -> None: - algo_id = algorithm.lower() - algo_t = SUPPORTED_ALGORITHMS.get(algo_id, None) - if algo_t is None: - raise ValueError( - f"{algorithm} is not supported. " - f"Must be one of: {list(SUPPORTED_ALGORITHMS.keys())}" - ) - - self.algorithm = algo_id - self.options = dict(options) if options is not None else {} - - # Create an algorithm. - self._algo = algo_t(**self.options) - # Use default stream, if needed. - self._stream = stream if stream is not None else cp.cuda.Stream.ptds - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - return self.encode_batch([buf])[0] - - def encode_batch(self, bufs: Sequence[Any]) -> Sequence[Any]: - """Encode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Data to be encoded. Each buffer in the list may be any object - supporting the new-style buffer protocol. - - Returns - ------- - List of encoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - buf_sizes = [b.size * b.itemsize for b in bufs] - - max_chunk_size = max(buf_sizes) - - # Get temp and output buffer sizes. - temp_size = self._algo.get_compress_temp_size(num_chunks, max_chunk_size) - comp_chunk_size = self._algo.get_compress_chunk_size(max_chunk_size) - - # Prepare data and size buffers. - # uncomp_chunks is used as a container that stores pointers to actual chunks. - # nvCOMP requires this and sizes buffers to be in GPU memory. - uncomp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - uncomp_chunk_sizes = cp.array(buf_sizes, dtype=cp.uint64) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - comp_chunks = cp.empty((num_chunks, comp_chunk_size), dtype=cp.uint8) - # Array of pointers to each compressed chunk. - comp_chunk_ptrs = cp.array([c.data.ptr for c in comp_chunks], dtype=cp.uintp) - # Resulting compressed chunk sizes. 
- comp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - - self._algo.compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_chunk_size, - num_chunks, - temp_buf, - comp_chunk_ptrs, - comp_chunk_sizes, - self._stream, - ) - - res = [] - # Copy to host to subsequently avoid many smaller D2H copies. - comp_chunks = cp.asnumpy(comp_chunks, self._stream) - comp_chunk_sizes = cp.asnumpy(comp_chunk_sizes, self._stream) - self._stream.synchronize() - - for i in range(num_chunks): - res.append(comp_chunks[i, : comp_chunk_sizes[i]].tobytes()) - return res - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - return self.decode_batch([buf], [out])[0] - - def decode_batch( - self, bufs: Sequence[Any], out: Optional[Sequence[Any]] = None - ) -> Sequence[Any]: - """Decode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Encoded data. Each buffer in the list may be any object - supporting the new-style buffer protocol. - out : - List of writeable buffers to store decoded data. - N.B. if provided, each buffer must be exactly the right size - to store the decoded data. - - Returns - ------- - List of decoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - # TODO(akamenev): check only first buffer, assuming they are all - # of the same kind. - is_host_buffer = not hasattr(bufs[0], "__cuda_array_interface__") - if is_host_buffer: - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - - # Prepare compressed chunks buffers. - comp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - comp_chunk_sizes = cp.array([b.size for b in bufs], dtype=cp.uint64) - - # Get uncompressed chunk sizes. - uncomp_chunk_sizes = self._algo.get_decompress_size( - comp_chunks, - comp_chunk_sizes, - self._stream, - ) - - # Check whether the uncompressed chunks are all the same size. - # cupy.unique returns sorted sizes. - sorted_chunk_sizes = cp.unique(uncomp_chunk_sizes) - max_chunk_size = sorted_chunk_sizes[-1].item() - is_equal_chunks = sorted_chunk_sizes.shape[0] == 1 - - # Get temp buffer size. - temp_size = self._algo.get_decompress_temp_size(num_chunks, max_chunk_size) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - # Prepare uncompressed chunks buffers. - # First, allocate chunks of max_chunk_size and then - # copy the pointers to a pointer array in GPU memory as required by nvCOMP. - # For performance reasons, we use max_chunk_size so we can create - # a rectangular array with the same pointer increments. - uncomp_chunks = cp.empty((num_chunks, max_chunk_size), dtype=cp.uint8) - p_start = uncomp_chunks.data.ptr - uncomp_chunk_ptrs = cp.uint64(p_start) + ( - cp.arange(0, num_chunks * max_chunk_size, max_chunk_size, dtype=cp.uint64) - ) - - # TODO(akamenev): currently we provide the following 2 buffers to decompress() - # but do not check/use them afterwards since some of the algos - # (e.g. LZ4 and Gdeflate) do not require it and run faster - # without those arguments passed, while other algos (e.g. zstd) require - # these buffers to be valid. - actual_uncomp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - statuses = cp.empty(num_chunks, dtype=cp.int32) - - self._algo.decompress( - comp_chunks, - comp_chunk_sizes, - num_chunks, - temp_buf, - uncomp_chunk_ptrs, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - self._stream, - ) - - # If all chunks are the same size, we can just return uncomp_chunks. 
- if is_equal_chunks and out is None: - return cp.asnumpy(uncomp_chunks) if is_host_buffer else uncomp_chunks - - res = [] - uncomp_chunk_sizes = uncomp_chunk_sizes.get() - for i in range(num_chunks): - ret = uncomp_chunks[i, : uncomp_chunk_sizes[i]] - if out is None or out[i] is None: - res.append(cp.asnumpy(ret) if is_host_buffer else ret) - else: - o = ensure_contiguous_ndarray_like(out[i]) - if hasattr(o, "__cuda_array_interface__"): - cp.copyto(o, ret.view(dtype=o.dtype), casting="no") - else: - cp.asnumpy(ret.view(dtype=o.dtype), out=o, stream=self._stream) - res.append(o) - self._stream.synchronize() - - return res - - def __repr__(self): - return ( - f"{self.__class__.__name__}" - f"(algorithm={self.algorithm!r}, options={self.options!r})" - ) diff --git a/python/kvikio/kvikio/_version.py b/python/kvikio/kvikio/_version.py index a5171f19f4..c5a2a0cd83 100644 --- a/python/kvikio/kvikio/_version.py +++ b/python/kvikio/kvikio/_version.py @@ -1,16 +1,5 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 import importlib.resources diff --git a/python/kvikio/kvikio/benchmarks/__init__.py b/python/kvikio/kvikio/benchmarks/__init__.py index 8586c47db2..905216a919 100644 --- a/python/kvikio/kvikio/benchmarks/__init__.py +++ b/python/kvikio/kvikio/benchmarks/__init__.py @@ -1,2 +1,2 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py index af4e44b973..ab72446857 100644 --- a/python/kvikio/kvikio/benchmarks/http_io.py +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/benchmarks/s3_io.py b/python/kvikio/kvikio/benchmarks/s3_io.py index 08bdfc93a0..5b585cd018 100644 --- a/python/kvikio/kvikio/benchmarks/s3_io.py +++ b/python/kvikio/kvikio/benchmarks/s3_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index e3b152cbaf..eb35f2ae7c 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib @@ -8,7 +8,7 @@ import statistics import tempfile from time import perf_counter as clock -from typing import Any, ContextManager, Dict, Union +from typing import ContextManager, Union import cupy from dask.utils import format_bytes, parse_bytes @@ -19,21 +19,6 @@ from kvikio.benchmarks.utils import parse_directory, pprint_sys_info -def get_zarr_compressors() -> Dict[str, Any]: - """Returns a dict of available Zarr compressors""" - try: - import kvikio.zarr - except ImportError: - return {} - try: - compressors = kvikio.zarr.nvcomp_compressors - except AttributeError: - # zarr-python 3.x - return {} - else: - return {c.__name__.lower(): c for c in compressors} - - def create_data(nbytes): """Return a random uint8 cupy array""" return cupy.arange(nbytes, dtype="uint8") @@ -223,10 +208,6 @@ def run_zarr(args): if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - compressor = None - if args.zarr_compressor is not None: - compressor = get_zarr_compressors()[args.zarr_compressor]() - a = create_data(args.nbytes) shutil.rmtree(str(dir_path), ignore_errors=True) @@ -236,7 +217,6 @@ def run_zarr(args): z = zarr.array( a, chunks=False, - compressor=compressor, store=kvikio.zarr.GDSStore(dir_path), meta_array=cupy.empty(()), ) @@ -277,8 +257,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - if args.zarr_compressor is not None: - print(f"Zarr compressor | {args.zarr_compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -354,16 +332,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--zarr-compressor", - metavar="COMPRESSOR", - default=None, - choices=tuple(get_zarr_compressors().keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) args = parser.parse_args() if "all" in args.api: diff --git a/python/kvikio/kvikio/benchmarks/utils.py b/python/kvikio/kvikio/benchmarks/utils.py index 2462b22a79..a6a2b1fb42 100644 --- a/python/kvikio/kvikio/benchmarks/utils.py +++ b/python/kvikio/kvikio/benchmarks/utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index 7882fcad8c..81766bd9f9 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib @@ -13,9 +13,9 @@ from typing import ContextManager, Union import cupy -import numcodecs.blosc import numpy import zarr +import zarr.storage from dask.utils import format_bytes, parse_bytes import kvikio @@ -23,62 +23,47 @@ import kvikio.zarr from kvikio.benchmarks.utils import drop_vm_cache, parse_directory, pprint_sys_info -if not kvikio.zarr.supported: - raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - -compressors = { - "none": (None, None), - "lz4": (numcodecs.blosc.Blosc(cname="lz4"), kvikio.zarr.LZ4()), -} - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) def run_kvikio(args): - dir_path = args.dir / "kvikio" - shutil.rmtree(str(dir_path), ignore_errors=True) - - # Get the GPU compressor - compressor = compressors[args.compressor][1] - - src = create_src_data(args) - - # Write - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - z = zarr.create( - shape=(args.nelem,), - chunks=(args.chunksize,), - dtype=args.dtype, - compressor=compressor, - store=kvikio.zarr.GDSStore(dir_path), - meta_array=cupy.empty(()), - ) - z[:] = src - os.sync() - write_time = clock() - t0 - - # Read - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - res = z[:] - read_time = clock() - t0 - assert res.nbytes == args.nbytes - - return read_time, write_time + with zarr.config.enable_gpu(): + dir_path = args.dir / "kvikio" + shutil.rmtree(str(dir_path), ignore_errors=True) + + src = create_src_data(args) + + # Write + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + z = zarr.create( + shape=(args.nelem,), + chunks=(args.chunksize,), + dtype=args.dtype, + store=kvikio.zarr.GDSStore(dir_path), + ) + z[:] = src + os.sync() + write_time = clock() - t0 + + # Read + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + res = z[:] + read_time = clock() - t0 + assert res.nbytes == args.nbytes + + return read_time, write_time def run_posix(args): dir_path = args.dir / "posix" shutil.rmtree(str(dir_path), ignore_errors=True) - # Get the CPU compressor - compressor = compressors[args.compressor][0] - src = create_src_data(args) # Write @@ -89,9 +74,7 @@ def run_posix(args): shape=(args.nelem,), chunks=(args.chunksize,), dtype=args.dtype, - compressor=compressor, - store=zarr.DirectoryStore(dir_path), - meta_array=numpy.empty(()), + store=zarr.storage.LocalStore(dir_path), ) z[:] = src.get() os.sync() @@ -135,7 +118,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - print(f"compressor | {args.compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -226,16 +208,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--compressor", - metavar="COMPRESSOR", - default="none", - choices=tuple(compressors.keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) parser.add_argument( "--drop-vm-cache", action="store_true", diff --git a/python/kvikio/kvikio/buffer.py b/python/kvikio/kvikio/buffer.py index 62bbc754b4..653d79a9d8 100644 --- a/python/kvikio/kvikio/buffer.py +++ b/python/kvikio/kvikio/buffer.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from kvikio._lib import buffer # type: ignore diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py index e703c703bc..0aa16695c9 100644 --- a/python/kvikio/kvikio/cufile.py +++ b/python/kvikio/kvikio/cufile.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import io import os @@ -433,9 +433,24 @@ def raw_write( """ return self._handle.write(buf, size, file_offset, dev_offset) + def is_direct_io_supported(self) -> bool: + """Whether Direct I/O is supported on this file handle. + + This is determined by two factors: + - Direct I/O support from the operating system and the file system + - KvikIO global setting `auto_direct_io_read` and `auto_direct_io_write`. If + both values are false, Direct I/O will not be supported on this file handle. + + Returns + ------- + bool + Whether Direct I/O is supported + """ + return self._handle.is_direct_io_supported() + def get_page_cache_info( - file: Union[os.PathLike, str, int, io.IOBase] + file: Union[os.PathLike, str, int, io.IOBase], ) -> tuple[int, int]: """Obtain the page cache residency information for a given file @@ -458,3 +473,27 @@ def get_page_cache_info( and the total number of pages. """ return file_handle.get_page_cache_info(file) + + +def clear_page_cache( + reclaim_dentries_and_inodes: bool = True, clear_dirty_pages: bool = True +) -> bool: + """Clear the page cache + + Parameters + ---------- + reclaim_dentries_and_inodes: bool, optional + Whether to free reclaimable slab objects which include dentries and inodes. + + - If `true`, equivalent to executing `/sbin/sysctl vm.drop_caches=3`; + - If `false`, equivalent to executing `/sbin/sysctl vm.drop_caches=1`. + clear_dirty_pages: bool, optional + Whether to trigger the writeback process to clear the dirty pages. If `true`, + `sync` will be called prior to cache dropping. + + Returns + ------- + bool + Whether the page cache has been successfully cleared. + """ + return file_handle.clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) diff --git a/python/kvikio/kvikio/cufile_driver.py b/python/kvikio/kvikio/cufile_driver.py index 8c8804d885..154e8e885b 100644 --- a/python/kvikio/kvikio/cufile_driver.py +++ b/python/kvikio/kvikio/cufile_driver.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import atexit from typing import Any, Tuple, overload @@ -92,13 +92,11 @@ def _property_getter_and_setter( @overload -def set(config: dict[str, Any], /) -> ConfigContextManager: - ... +def set(config: dict[str, Any], /) -> ConfigContextManager: ... @overload -def set(key: str, value: Any, /) -> ConfigContextManager: - ... +def set(key: str, value: Any, /) -> ConfigContextManager: ... def set(*config) -> ConfigContextManager: diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index be57d2739c..14d355db15 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from typing import Any, overload @@ -56,6 +56,8 @@ def _property_getter_and_setter(self) -> tuple[dict[str, Any], dict[str, Any]]: "http_max_attempts", "http_status_codes", "http_timeout", + "auto_direct_io_read", + "auto_direct_io_write", ] property_getters = {} @@ -68,13 +70,11 @@ def _property_getter_and_setter(self) -> tuple[dict[str, Any], dict[str, Any]]: @overload -def set(config: dict[str, Any], /) -> ConfigContextManager: - ... +def set(config: dict[str, Any], /) -> ConfigContextManager: ... @overload -def set(key: str, value: Any, /) -> ConfigContextManager: - ... +def set(key: str, value: Any, /) -> ConfigContextManager: ... def set(*config) -> ConfigContextManager: @@ -124,7 +124,9 @@ def set(*config) -> ConfigContextManager: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` - - ``*http_timeout*`` + - ``"http_timeout"`` + - ``"auto_direct_io_read"`` + - ``"auto_direct_io_write"`` Returns ------- @@ -167,6 +169,7 @@ def get(config_name: str) -> Any: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` + - ``"http_timeout"`` Returns ------- diff --git a/python/kvikio/kvikio/mmap.py b/python/kvikio/kvikio/mmap.py new file mode 100644 index 0000000000..3c216dd236 --- /dev/null +++ b/python/kvikio/kvikio/mmap.py @@ -0,0 +1,171 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import os +import stat +from typing import Any, Optional + +from kvikio._lib.mmap import InternalMmapHandle +from kvikio.cufile import IOFuture + + +class Mmap: + """Handle of a memory-mapped file""" + + def __init__( + self, + file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None, + ): + """Construct a new memory-mapped file handle + + Parameters + ---------- + file_path : os.PathLike + File path. + flags : str, optional + + - ``r``: Open for reading (default) + - ``w``: (Not implemented yet) Open for writing, truncating the file first + - ``a``: (Not implemented yet) Open for writing, appending to the end of + file if it exists + - ``+``: (Not implemented yet) Open for updating (reading and writing) + initial_map_size : int, optional + Size in bytes of the mapped region. If not specified, map the region + starting from ``initial_map_offset`` to the end of file. + initial_map_offset : int, optional + File offset of the mapped region. Default is 0. + mode : int, optional + Access mode (permissions) to use if creating a new file. Default is + 0644 (octal), 420 (decimal). + map_flags : int, optional + Flags to be passed to the system call ``mmap``. See `mmap(2)` for details. + """ + self._handle = InternalMmapHandle( + file_path, flags, initial_map_size, initial_map_offset, mode, map_flags + ) + + def initial_map_size(self) -> int: + """Size in bytes of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial size of the mapped region. + """ + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + """File offset of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial file offset of the mapped region. 
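For reference, a minimal usage sketch of the `Mmap` handle defined above; the file path is a placeholder, and the calls follow the docstrings in this module:

```python
# Minimal sketch of the new kvikio.mmap.Mmap API (file path is a placeholder).
import numpy as np

from kvikio.mmap import Mmap

mm = Mmap("/tmp/data.bin", flags="r")  # read-only mapping of an existing file

# Sequentially read the first 4096 bytes into a host buffer.
buf = np.empty(4096, dtype="uint8")
nbytes_read = mm.read(buf, size=4096, offset=0)

# Parallel read: pread() returns an IOFuture; get() blocks until completion.
future = mm.pread(buf, size=4096, offset=0)
nbytes_read = future.get()

mm.close()
```

The `auto_direct_io_read` and `auto_direct_io_write` entries registered in `kvikio.defaults` above can be toggled like the existing properties, e.g. `kvikio.defaults.set({"auto_direct_io_read": False})`.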
+ """ + return self._handle.initial_map_offset() + + def file_size(self) -> int: + """Get the file size if the file is open + + Returns 0 if the file is closed. + + Returns + ------- + int + The file size in bytes. + """ + return self._handle.file_size() + + def close(self) -> None: + """Close the mapping handle if it is open; do nothing otherwise + + Unmaps the memory region and closes the underlying file descriptor. + """ + self._handle.close() + + def closed(self) -> bool: + """Whether the mapping handle is closed + + Returns + ------- + bool + Boolean answer. + """ + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + """Sequential read ``size`` bytes from the file to the destination buffer + ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + + Returns + ------- + int + Number of bytes that have been read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. + """ + return self._handle.read(buf, size, offset) + + def pread( + self, + buf: Any, + size: Optional[int] = None, + offset: int = 0, + task_size: Optional[int] = None, + ) -> IOFuture: + """Parallel read ``size`` bytes from the file to the destination buffer ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + task_size : int, optional + Size of each task in bytes for parallel execution. If None, uses + the default task size from :func:`kvikio.defaults.task_size`. + + Returns + ------- + IOFuture + Future that on completion returns the size of bytes that were successfully + read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. + + Notes + ----- + The returned IOFuture object's ``get()`` should not be called after the lifetime + of the MmapHandle object ends. Otherwise, the behavior is undefined. + """ + return IOFuture(self._handle.pread(buf, size, offset, task_size)) diff --git a/python/kvikio/kvikio/numcodecs.py b/python/kvikio/kvikio/numcodecs.py index 40f62be1de..059ad296f9 100644 --- a/python/kvikio/kvikio/numcodecs.py +++ b/python/kvikio/kvikio/numcodecs.py @@ -1,5 +1,5 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 """ This module implements CUDA compression and transformation codecs for Numcodecs. diff --git a/python/kvikio/kvikio/numpy.py b/python/kvikio/kvikio/numpy.py index 461a0fae65..c3acab2bd6 100644 --- a/python/kvikio/kvikio/numpy.py +++ b/python/kvikio/kvikio/numpy.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

 import io
 import os
@@ -16,12 +16,10 @@
 class FileLike(Protocol):
     """File like object that represent a OS-level file"""

-    def fileno(self) -> int:
-        ...
+    def fileno(self) -> int: ...

     @property
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...


 class LikeWrapper:
diff --git a/python/kvikio/kvikio/nvcomp.py b/python/kvikio/kvikio/nvcomp.py
deleted file mode 100644
index 3b62e51e8c..0000000000
--- a/python/kvikio/kvikio/nvcomp.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-
-from kvikio._nvcomp import (  # noqa: F401
-    ANSManager,
-    BitcompManager,
-    CascadedManager,
-    GdeflateManager,
-    LZ4Manager,
-    ManagedDecompressionManager,
-    SnappyManager,
-    cp_to_nvcomp_dtype,
-    nvCompManager,
-)
-from kvikio.utils import kvikio_deprecate_module
-
-kvikio_deprecate_module(
-    "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06"
-)
diff --git a/python/kvikio/kvikio/nvcomp_codec.py b/python/kvikio/kvikio/nvcomp_codec.py
deleted file mode 100644
index ded350cdd5..0000000000
--- a/python/kvikio/kvikio/nvcomp_codec.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-from kvikio._nvcomp_codec import NvCompBatchCodec  # noqa: F401
-from kvikio.utils import kvikio_deprecate_module
-
-kvikio_deprecate_module(
-    "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06"
-)
diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py
index 55cce53115..1faf010c58 100644
--- a/python/kvikio/kvikio/remote_file.py
+++ b/python/kvikio/kvikio/remote_file.py
@@ -1,14 +1,65 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

 from __future__ import annotations

+import enum
 import functools
+import urllib.parse
 from typing import Optional

 from kvikio.cufile import IOFuture


+class RemoteEndpointType(enum.Enum):
+    """
+    Types of remote file endpoints supported by KvikIO.
+
+    This enum defines the different protocols and services that can be used
+    to access remote files. It is used to specify or detect the type of
+    remote endpoint when opening files.
+
+    Attributes
+    ----------
+    AUTO : int
+        Automatically detect the endpoint type from the URL. KvikIO will
+        attempt to infer the appropriate protocol based on the URL format.
+    S3 : int
+        AWS S3 endpoint using credentials-based authentication. Requires
+        AWS environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
+        AWS_DEFAULT_REGION) to be set.
+    S3_PUBLIC : int
+        AWS S3 endpoint for publicly accessible objects. No credentials required as the
+        objects have public read permissions enabled. Used for open datasets and public
+        buckets.
+    S3_PRESIGNED_URL : int
+        AWS S3 endpoint using a presigned URL. No credentials required as
+        authentication is embedded in the URL with time-limited access.
+    WEBHDFS : int
+        Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System)
+        endpoint for accessing files stored in HDFS over HTTP/HTTPS.
+    HTTP : int
+        Generic HTTP or HTTPS endpoint for accessing files from web servers.
+        This is used for standard web resources that do not fit the other
+        specific categories.
+
+    See Also
+    --------
+    RemoteFile.open : Factory method that uses this enum to specify endpoint types.
+    """
+
+    AUTO = 0
+    S3 = 1
+    S3_PUBLIC = 2
+    S3_PRESIGNED_URL = 3
+    WEBHDFS = 4
+    HTTP = 5
+
+    @staticmethod
+    def _map_to_internal(remote_endpoint_type: RemoteEndpointType):
+        return _get_remote_module().RemoteEndpointType[remote_endpoint_type.name]
+
+
 @functools.cache
 def is_remote_file_available() -> bool:
     """Check if the remote module is available"""
@@ -56,7 +107,7 @@
         url: str,
         nbytes: Optional[int] = None,
     ) -> RemoteFile:
-        """Open a http file.
+        """Open a HTTP/HTTPS file.

         Parameters
         ----------
@@ -66,7 +117,7 @@
             The size of the file. If None, KvikIO will ask the server
             for the file size.
         """
-        return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes))
+        return cls(_get_remote_module().RemoteFile.open_http(url, nbytes))

     @classmethod
     def open_s3(
@@ -74,16 +125,25 @@
         bucket_name: str,
         object_name: str,
         nbytes: Optional[int] = None,
+        aws_region_name: Optional[str] = None,
+        aws_access_key_id: Optional[str] = None,
+        aws_secret_access_key: Optional[str] = None,
+        aws_endpoint_url: Optional[str] = None,
+        aws_session_token: Optional[str] = None,
     ) -> RemoteFile:
         """Open a AWS S3 file from a bucket name and object name.

-        Please make sure to set the AWS environment variables:
-          - `AWS_DEFAULT_REGION`
-          - `AWS_ACCESS_KEY_ID`
-          - `AWS_SECRET_ACCESS_KEY`
-          - `AWS_SESSION_TOKEN` (when using temporary credentials)
+        AWS credentials can be provided as keyword arguments or through
+        environment variables:

-        Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL`.
+        - ``AWS_DEFAULT_REGION`` (or the aws_region_name parameter)
+        - ``AWS_ACCESS_KEY_ID`` (or the aws_access_key_id parameter)
+        - ``AWS_SECRET_ACCESS_KEY`` (or the aws_secret_access_key parameter)
+        - ``AWS_SESSION_TOKEN`` (or the aws_session_token parameter, when using
+          temporary credentials)
+
+        Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL`
+        (or the aws_endpoint_url parameter). See
+        <https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html>

         Parameters
@@ -95,9 +155,36 @@
         nbytes
             The size of the file. If None, KvikIO will ask the server
             for the file size.
+        aws_region_name
+            The AWS region, such as "us-east-1", to use. If None, the value of the
+            `AWS_DEFAULT_REGION` environment variable is used.
+        aws_access_key_id
+            The AWS access key to use. If None, the value of the
+            `AWS_ACCESS_KEY_ID` environment variable is used.
+        aws_secret_access_key
+            The AWS secret access key to use. If None, the value of the
+            `AWS_SECRET_ACCESS_KEY` environment variable is used.
+        aws_endpoint_url
+            Overwrite the endpoint url (including the protocol part) by using
+            the scheme: "<protocol>://<host>:<port>". If None,
+            the value of the `AWS_ENDPOINT_URL` environment variable is used. If
+            this is also not set, the regular AWS url scheme is used:
+            "https://<bucket>.s3.<region>.amazonaws.com/<object>".
+        aws_session_token
+            The AWS session token to use. If None, the value of the
+            `AWS_SESSION_TOKEN` environment variable is used.
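For reference, a hedged sketch of calling the extended `open_s3` with explicit credentials; the bucket, object, and key values are placeholders, and any argument left as None falls back to the environment variables listed above:

```python
# Sketch: open an S3 object with explicitly supplied credentials.
# Bucket, object, and credential values below are placeholders.
import cupy

from kvikio.remote_file import RemoteFile

f = RemoteFile.open_s3(
    bucket_name="example-bucket",
    object_name="path/to/data.bin",
    aws_region_name="us-east-1",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="...",
)

# Read the whole object into a device buffer.
buf = cupy.empty(f.nbytes(), dtype="uint8")
f.read(buf)
```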
""" - return RemoteFile( - _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) + return cls( + _get_remote_module().RemoteFile.open_s3( + bucket_name, + object_name, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + aws_endpoint_url, + aws_session_token, + ) ) @classmethod @@ -105,6 +192,11 @@ def open_s3_url( cls, url: str, nbytes: Optional[int] = None, + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + aws_session_token: Optional[str] = None, ) -> RemoteFile: """Open a AWS S3 file from an URL. @@ -112,14 +204,17 @@ def open_s3_url( - A full http url such as "http://127.0.0.1/my/file", or - A S3 url such as "s3:///". - Please make sure to set the AWS environment variables: - - `AWS_DEFAULT_REGION` - - `AWS_ACCESS_KEY_ID` - - `AWS_SECRET_ACCESS_KEY` - - `AWS_SESSION_TOKEN` (when using temporary credentials) + AWS credentials can be provided as keyword arguments or through + environment variables: + + - ``AWS_DEFAULT_REGION`` (or region_name parameter) + - ``AWS_ACCESS_KEY_ID`` (or access_key_id parameter) + - ``AWS_SECRET_ACCESS_KEY`` (or secret_access_key parameter) + - ``AWS_SESSION_TOKEN`` (or aws_session_token parameter, when using + temporary credentials) Additionally, if `url` is a S3 url, it is possible to overwrite the AWS endpoint - by setting `AWS_ENDPOINT_URL`. + by setting `AWS_ENDPOINT_URL` (or endpoint_url parameter). See Parameters @@ -129,18 +224,225 @@ def open_s3_url( nbytes The size of the file. If None, KvikIO will ask the server for the file size. + aws_region + The AWS region, such as "us-east-1", to use. If None, the value of the + `AWS_DEFAULT_REGION` environment variable is used. + aws_access_key + The AWS access key to use. If None, the value of the + `AWS_ACCESS_KEY_ID` environment variable is used. + aws_secret_access_key + The AWS secret access key to use. If None, the value of the + `AWS_SECRET_ACCESS_KEY` environment variable is used. + aws_endpoint_url + Overwrite the endpoint url (including the protocol part) by using + the scheme: "//". If None, + the value of the `AWS_ENDPOINT_URL` environment variable is used. If + this is also not set, the regular AWS url scheme is used: + "https://.s3..amazonaws.com/". + aws_session_token + The AWS session token to use. If None, the value of the + `AWS_SESSION_TOKEN` environment variable is used. """ - url = url.lower() - if url.startswith("http://") or url.startswith("https://"): - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) + parsed_result = urllib.parse.urlparse(url.lower()) + if parsed_result.scheme in ("http", "https"): + return cls( + _get_remote_module().RemoteFile.open_s3_from_http_url( + url, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + aws_session_token, + ) ) - if url.startswith("s3://"): - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) + if parsed_result.scheme == "s3": + return cls( + _get_remote_module().RemoteFile.open_s3_from_s3_url( + url, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + aws_endpoint_url, + aws_session_token, + ) ) raise ValueError(f"Unsupported protocol: {url}") + @classmethod + def open_s3_public(cls, url: str, nbytes: Optional[int] = None) -> RemoteFile: + """Open a publicly accessible AWS S3 file. + + Parameters + ---------- + url + URL to the remote file. 
+ nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls(_get_remote_module().RemoteFile.open_s3_public(url, nbytes)) + + @classmethod + def open_s3_presigned_url( + cls, + presigned_url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open an AWS S3 file from a presigned URL. + + Parameters + ---------- + presigned_url + Presigned URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls( + _get_remote_module().RemoteFile.open_s3_presigned_url(presigned_url, nbytes) + ) + + @classmethod + def open_webhdfs( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a file on Apache Hadoop Distributed File System (HDFS) using WebHDFS. + + If KvikIO is run within a Docker container, the argument ``--network host`` needs to be + passed to the ``docker run`` command. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server for the file + size. + """ + return cls(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + + @classmethod + def open( + cls, + url: str, + remote_endpoint_type: RemoteEndpointType = RemoteEndpointType.AUTO, + allow_list: Optional[list] = None, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """ + Create a remote file handle from a URL. + + This function creates a RemoteFile for reading data from various remote + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 for public access, + S3 presigned URLs, and WebHDFS. The endpoint type can be automatically detected + from the URL or explicitly specified. + + Parameters + ---------- + url : str + The URL of the remote file. Supported formats include: + + - S3 with credentials + - S3 for public access + - S3 presigned URL + - WebHDFS + - HTTP/HTTPS + remote_endpoint_type : RemoteEndpointType, optional + The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO`, + which automatically detects the endpoint type from the URL. Can be + explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` + to force a specific endpoint type. + allow_list : list of RemoteEndpointType, optional + List of allowed endpoint types. If provided: + + - If remote_endpoint_type is :class:`RemoteEndpointType.AUTO`, types are + tried in the exact order specified until a match is found. + - If remote_endpoint_type is set explicitly, the specified type must be + in this list; otherwise an exception is raised. + + If not provided, defaults to all supported types in this order: + :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. + nbytes : int, optional + File size in bytes. If not provided, the function sends an additional + request to the server to query the file size. + + Returns + ------- + RemoteFile + A RemoteFile object that can be used to read data from the remote file. + + Raises + ------ + RuntimeError + - If the URL is malformed or missing required components. + - If :class:`RemoteEndpointType.AUTO` mode is used and the URL does not match + any supported endpoint type. + - If the specified endpoint type is not in the `allow_list`. + - If the URL is invalid for the specified endpoint type.
+ - If KvikIO is unable to connect to the remote server or to determine the + file size (when nbytes is not provided). + + Examples + -------- + - Auto-detect endpoint type from URL: + + .. code-block:: python + + handle = RemoteFile.open( + "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + "&X-Amz-Credential=...&X-Amz-Signature=..." + ) + + - Open an S3 file with an explicit endpoint type: + + .. code-block:: python + + handle = RemoteFile.open( + "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + remote_endpoint_type=RemoteEndpointType.S3 + ) + + - Restrict endpoint type candidates: + + .. code-block:: python + + handle = RemoteFile.open( + user_provided_url, + remote_endpoint_type=RemoteEndpointType.AUTO, + allow_list=[ + RemoteEndpointType.HTTP, + RemoteEndpointType.S3_PRESIGNED_URL + ] + ) + + - Provide a known file size to skip the HEAD request: + + .. code-block:: python + + handle = RemoteFile.open( + "https://example.com/large-file.bin", + remote_endpoint_type=RemoteEndpointType.HTTP, + nbytes=1024 * 1024 * 100 # 100 MiB + ) + """ + return cls( + _get_remote_module().RemoteFile.open( + url, + RemoteEndpointType._map_to_internal(remote_endpoint_type), + allow_list, + nbytes, + ) + ) + def close(self) -> None: """Close the file""" pass @@ -154,6 +456,15 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def __str__(self) -> str: return str(self._handle) + def remote_endpoint_type(self) -> RemoteEndpointType: + """Get the type of the remote file. + + Returns + ------- + RemoteEndpointType + The type of the remote file. + """ + return RemoteEndpointType[self._handle.remote_endpoint_type().name] + def nbytes(self) -> int: """Get the file size. diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index 72dabaf064..4f58b36c18 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import functools import multiprocessing diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 7ec22c275a..a3ae659b2e 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -1,10 +1,9 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 -from importlib import metadata as _metadata +try: + from ._zarr_python_3 import GDSStore +except ImportError as e: + raise ImportError("kvikio.zarr requires the optional 'zarr>=3' dependency") from e -from packaging.version import Version as _Version, parse as _parse - -if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): - from ._zarr_python_3 import * # noqa: F401,F403 -else: - from ._zarr_python_2 import * # type: ignore[assignment] # noqa: F401,F403 +__all__ = ["GDSStore"] diff --git a/python/kvikio/kvikio/zarr/_zarr_python_2.py b/python/kvikio/kvikio/zarr/_zarr_python_2.py deleted file mode 100644 index bd1418e799..0000000000 --- a/python/kvikio/kvikio/zarr/_zarr_python_2.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms.
-from __future__ import annotations - -import contextlib -import os -import os.path -from abc import abstractmethod -from typing import Any, Literal, Mapping, Optional, Sequence, Union - -import cupy -import cupy.typing -import numcodecs -import numpy -import numpy as np -import zarr -import zarr.creation -import zarr.errors -import zarr.storage -from numcodecs.abc import Codec -from numcodecs.compat import ensure_contiguous_ndarray_like -from numcodecs.registry import register_codec -from packaging.version import parse - -import kvikio -import kvikio._nvcomp -import kvikio._nvcomp_codec -import kvikio.zarr -from kvikio._nvcomp_codec import NvCompBatchCodec -from kvikio.numcodecs import BufferLike, CudaCodec - -MINIMUM_ZARR_VERSION = "2.15" - -# Is this version of zarr supported? We depend on the `Context` -# argument introduced in https://github.com/zarr-developers/zarr-python/pull/1131 -# in zarr v2.15. -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - - -class GDSStore(zarr.storage.DirectoryStore): # type: ignore[name-defined] - """GPUDirect Storage (GDS) class using directories and files. - - This class works like `zarr.storage.DirectoryStore` but implements - getitems() in order to support direct reading into device memory. - It uses KvikIO for reads and writes, which in turn will use GDS - when applicable. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - compressor_config_overwrite - If not None, use this `Mapping` to specify what is written to the Zarr metadata - file on disk (`.zarray`). Normally, Zarr writes the configuration[1] given by - the `compressor` argument to the `.zarray` file. Use this argument to overwrite - the normal configuration and use the specified `Mapping` instead. - decompressor_config_overwrite - If not None, use this `Mapping` to specify what compressor configuration[1] is - used for decompressing no matter the configuration found in the Zarr metadata - on disk (the `.zarray` file). - - [1] https://github.com/zarr-developers/numcodecs/blob/cb155432/numcodecs/abc.py#L79 - - Notes - ----- - Atomic writes are used, which means that data are first written to a - temporary file, then moved into place when the write is successfully - completed. Files are only held open while they are being read or written and are - closed immediately afterwards, so there is no need to manually close any files. - - Safe to write in multiple threads or processes. - """ - - # The default output array type used by getitems(). 
- default_meta_array = numpy.empty(()) - - def __init__( - self, - path, - normalize_keys=False, - dimension_separator=None, - *, - compressor_config_overwrite: Optional[Mapping] = None, - decompressor_config_overwrite: Optional[Mapping] = None, - ) -> None: - if not kvikio.zarr.supported: - raise RuntimeError( - f"GDSStore requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}" - ) - super().__init__( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator - ) - self.compressor_config_overwrite = compressor_config_overwrite - self.decompressor_config_overwrite = decompressor_config_overwrite - - def __eq__(self, other): - return isinstance(other, GDSStore) and self.path == other.path - - def _tofile(self, a, fn): - with kvikio.CuFile(fn, "w") as f: - written = f.write(a) - assert written == a.nbytes - - def __getitem__(self, key): - ret = super().__getitem__(key) - if self.decompressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(ret) - if meta["compressor"]: - meta["compressor"] = self.decompressor_config_overwrite - ret = self._metadata_class.encode_array_metadata(meta) - return ret - - def __setitem__(self, key, value): - if self.compressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(value) - if meta["compressor"]: - meta["compressor"] = self.compressor_config_overwrite - value = self._metadata_class.encode_array_metadata(meta) - super().__setitem__(key, value) - - def getitems( - self, - keys: Sequence[str], - *, - contexts: Mapping[str, Mapping] = {}, - ) -> Mapping[str, Any]: - """Retrieve data from multiple keys. - - Parameters - ---------- - keys : Iterable[str] - The keys to retrieve - contexts: Mapping[str, Context] - A mapping of keys to their context. Each context is a mapping of store - specific information. If the "meta_array" key exist, GDSStore use its - values as the output array otherwise GDSStore.default_meta_array is used. - - Returns - ------- - Mapping - A collection mapping the input keys to their results. 
- """ - ret = {} - io_results = [] - - with contextlib.ExitStack() as stack: - for key in keys: - filepath = os.path.join(self.path, key) - if not os.path.isfile(filepath): - continue - try: - meta_array = contexts[key]["meta_array"] - except KeyError: - meta_array = self.default_meta_array - - nbytes = os.path.getsize(filepath) - f = stack.enter_context(kvikio.CuFile(filepath, "r")) - ret[key] = numpy.empty_like(meta_array, shape=(nbytes,), dtype="u1") - io_results.append((f.pread(ret[key]), nbytes)) - - for future, nbytes in io_results: - nbytes_read = future.get() - if nbytes_read != nbytes: - raise RuntimeError( - f"Incomplete read ({nbytes_read}) expected {nbytes}" - ) - return ret - - -class NVCompCompressor(CudaCodec): - """Abstract base class for nvCOMP compressors - - The derived classes must set `codec_id` and implement - `get_nvcomp_manager` - """ - - @abstractmethod - def get_nvcomp_manager(self) -> kvikio.nvcomp.nvCompManager: - """Abstract method that should return the nvCOMP compressor manager""" - pass # TODO: cache Manager - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - buf = cupy.asarray(ensure_contiguous_ndarray_like(buf)) - return self.get_nvcomp_manager().compress(buf) - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - buf = ensure_contiguous_ndarray_like(buf) - is_host_buffer = not hasattr(buf, "__cuda_array_interface__") - if is_host_buffer: - buf = cupy.asarray(buf) - - ret = self.get_nvcomp_manager().decompress(buf) - - if is_host_buffer: - ret = cupy.asnumpy(ret) - - if out is not None: - out = ensure_contiguous_ndarray_like(out) - if hasattr(out, "__cuda_array_interface__"): - cupy.copyto(out, ret.view(dtype=out.dtype), casting="no") - else: - np.copyto(out, cupy.asnumpy(ret.view(dtype=out.dtype)), casting="no") - return ret - - -class ANS(NVCompCompressor): - codec_id = "nvcomp_ANS" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.ANSManager() - - -class Bitcomp(NVCompCompressor): - codec_id = "nvcomp_Bitcomp" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.BitcompManager() - - -class Cascaded(NVCompCompressor): - codec_id = "nvcomp_Cascaded" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.CascadedManager() - - -class Gdeflate(NVCompCompressor): - codec_id = "nvcomp_Gdeflate" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.GdeflateManager() - - -class LZ4(NVCompCompressor): - codec_id = "nvcomp_LZ4" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.LZ4Manager() - - -class Snappy(NVCompCompressor): - codec_id = "nvcomp_Snappy" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.SnappyManager() - - -# Expose a list of available nvCOMP compressors and register them as Zarr condecs -nvcomp_compressors = [ANS, Bitcomp, Cascaded, Gdeflate, LZ4, Snappy] -for c in nvcomp_compressors: - register_codec(c) - - -class CompatCompressor: - """A pair of compatible compressors one using the CPU and one using the GPU - - Warning - ------- - `CompatCompressor` is only supported by KvikIO's `open_cupy_array()` and - cannot be used as a compressor argument in Zarr functions like `open()` - and `open_array()` directly. However, it is possible to use its `.cpu` - like: `open(..., compressor=CompatCompressor.lz4().cpu)`. - - Parameters - ---------- - cpu - The CPU compressor. - gpu - The GPU compressor. 
- """ - - def __init__(self, cpu: Codec, gpu: CudaCodec) -> None: - self.cpu = cpu - self.gpu = gpu - - @classmethod - def lz4(cls) -> CompatCompressor: - """A compatible pair of LZ4 compressors""" - return cls(cpu=numcodecs.LZ4(), gpu=NvCompBatchCodec("lz4")) - - -def open_cupy_array( - store: Union[os.PathLike, str], - mode: Literal["r", "r+", "a", "w", "w-"] = "a", - compressor: Codec | CompatCompressor = Snappy(), - meta_array=cupy.empty(()), - **kwargs, -) -> zarr.Array: - """Open an Zarr array as a CuPy-like array using file-mode-like semantics. - - This function is a CUDA friendly version of `zarr.open_array` that reads - and writes to CuPy arrays. Beside the arguments listed below, the arguments - have the same semantics as in `zarr.open_array`. - - Parameters - ---------- - store - Path to directory in file system. As opposed to `zarr.open_array`, - Store and path to zip files isn't supported. - mode - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist); 'a' means read/write (create if doesn't - exist); 'w' means create (overwrite if exists); 'w-' means create - (fail if exists). - compressor - The compressor used when creating a Zarr file or None if no compressor - is to be used. If a `CompatCompressor` is given, `CompatCompressor.gpu` - is used for compression and decompression; and `CompatCompressor.cpu` - is written as the compressor in the Zarr file metadata on disk. - This argument is ignored in "r" and "r+" mode. By default the - Snappy compressor by nvCOMP is used. - meta_array : array-like, optional - An CuPy-like array instance to use for determining arrays to create and - return to users. It must implement `__cuda_array_interface__`. - **kwargs - The rest of the arguments are forwarded to `zarr.open_array` as-is. - - Returns - ------- - Zarr array backed by a GDS file store, nvCOMP compression, and CuPy arrays. - """ - - if not isinstance(store, (str, os.PathLike)): - raise ValueError("store must be a path") - store = str(os.fspath(store)) - if not hasattr(meta_array, "__cuda_array_interface__"): - raise ValueError("meta_array must implement __cuda_array_interface__") - - if mode in ("r", "r+", "a"): - # In order to handle "a", we start by trying to open the file in read mode. - try: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore(path=store), # type: ignore[call-arg] - mode="r+", - meta_array=meta_array, - **kwargs, - ) - except ( - zarr.errors.ContainsGroupError, - zarr.errors.ArrayNotFoundError, # type: ignore[attr-defined] - ): - # If we are reading, this is a genuine error. - if mode in ("r", "r+"): - raise - else: - if ret.compressor is None: - return ret - # If we are reading a LZ4-CPU compressed file, we overwrite the - # metadata on-the-fly to make Zarr use LZ4-GPU for both compression - # and decompression. 
- compat_lz4 = CompatCompressor.lz4() - if ret.compressor == compat_lz4.cpu: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compat_lz4.cpu.get_config(), - decompressor_config_overwrite=compat_lz4.gpu.get_config(), - ), - mode=mode, - meta_array=meta_array, - **kwargs, - ) - elif not isinstance(ret.compressor, CudaCodec): - raise ValueError( - "The Zarr file was written using a non-CUDA compatible " - f"compressor, {ret.compressor}, please use something " - "like kvikio.zarr.CompatCompressor" - ) - return ret - - # At this point, we known that we are writing a new array - if mode not in ("w", "w-", "a"): - raise ValueError(f"Unknown mode: {mode}") - - if isinstance(compressor, CompatCompressor): - compressor_config_overwrite = compressor.cpu.get_config() - decompressor_config_overwrite = compressor.gpu.get_config() - compressor = compressor.gpu - else: - compressor_config_overwrite = None - decompressor_config_overwrite = None - - return zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compressor_config_overwrite, - decompressor_config_overwrite=decompressor_config_overwrite, - ), - mode=mode, - meta_array=meta_array, - compressor=compressor, - **kwargs, - ) diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index 5305cd9b72..5f22d73ae0 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -1,23 +1,34 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import asyncio import functools import os from pathlib import Path -import packaging -import zarr.storage -from zarr.abc.store import ( +import packaging.version +import zarr + +_zarr_version = packaging.version.parse(zarr.__version__) + +if _zarr_version < packaging.version.parse("3.0.0"): + # We include this runtime version check to help users who relied on + # installing kvikio to also install zarr, which is now an optional dependency. + raise ImportError( + f"'zarr>=3' is required, but 'zarr=={_zarr_version}' is installed."
+ ) + +import zarr.storage # noqa: E402 +from zarr.abc.store import ( # noqa: E402 ByteRequest, OffsetByteRequest, RangeByteRequest, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, BufferPrototype -from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.buffer import Buffer, BufferPrototype # noqa: E402 +from zarr.core.buffer.core import default_buffer_prototype # noqa: E402 -import kvikio +import kvikio # noqa: E402 # The GDSStore implementation follows the `LocalStore` implementation # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py @@ -26,7 +37,7 @@ @functools.cache def _is_ge_zarr_3_0_7(): - return packaging.version.parse(zarr.__version__) >= packaging.version.parse("3.0.7") + return _zarr_version >= packaging.version.parse("3.0.7") def _get( @@ -138,10 +149,3 @@ async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None: path = self.root / key await asyncio.to_thread(_put, path, value, start=None, exclusive=exclusive) - - -# Matching the check that zarr.__version__ > 2.15 that's -# part of the public API for our zarr 2.x support -# This module is behind a check that zarr.__version__ > 3 -# so we can just assume it's already checked and supported. -supported = True diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index a61fb39763..619dcbfa90 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -1,10 +1,10 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -19,12 +19,10 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cupy-cuda11x>=12.0.0", - "libkvikio==25.6.*,>=0.0.0a0", - "numcodecs !=0.12.0", + "cupy-cuda13x>=13.6.0", + "libkvikio==26.2.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", - "zarr>=2.0.0,<4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -41,40 +39,23 @@ classifiers = [ [project.optional-dependencies] test = [ "boto3>=1.21.21", - "cuda-python>=11.8.5,<12.0a0", + "cuda-python>=13.0.1,<14.0a0", "moto[server]>=4.0.8", - "pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", + "pytest<9.0.0a0", "rangehttpserver", - "rapids-dask-dependency==25.6.*,>=0.0.0a0", + "rapids-dask-dependency==26.2.*,>=0.0.0a0", + "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +zarr = [ + "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
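+# A sketch of opting in to the optional Zarr support; the published package name
+# may carry a CUDA suffix chosen by rapids-build-backend (e.g. "kvikio-cu13"):
+#
+#   pip install "kvikio[zarr]"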
[project.urls] Homepage = "https://github.com/rapidsai/kvikio" -[tool.black] -line-length = 88 -target-version = ["py310"] -include = '\.py?$' -exclude = ''' -/( - thirdparty | - \.eggs | - \.git | - \.hg | - \.mypy_cache | - \.tox | - \.venv | - _build | - buck-out | - build | - dist | - _skbuild -)/ -''' - [tool.isort] line_length = 88 multi_line_output = 3 @@ -110,15 +91,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -exclude = [ - # we type check against zarr-python 3.x - # and ignore modules using 2.x - "python/kvikio/kvikio/zarr/_zarr_python_2.py", - "python/kvikio/tests/test_nvcomp_codec.py", -] - -[project.entry-points."numcodecs.codecs"] -nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" [tool.rapids-build-backend] build-backend = "scikit_build_core.build" @@ -126,8 +98,8 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", - "cython>=3.0.0", - "libkvikio==25.6.*,>=0.0.0a0", + "cython>=3.0.0,<3.2.0a0", + "libkvikio==26.2.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -160,9 +132,9 @@ filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", "ignore::DeprecationWarning:botocore.*", - "ignore:This module is deprecated since.*Use the official nvCOMP API from 'nvidia.nvcomp' instead.:FutureWarning:.*nvcomp|.*nvcomp_codec", ] markers = [ - "cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`" + "cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`", + "gpu: zarr-python tests requiring a GPU to run." ] asyncio_mode = "auto" diff --git a/python/kvikio/tests/conftest.py b/python/kvikio/tests/conftest.py index 07636095eb..eb6eda9fda 100644 --- a/python/kvikio/tests/conftest.py +++ b/python/kvikio/tests/conftest.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import contextlib import multiprocessing as mp @@ -57,19 +57,6 @@ def run_cmd(cmd: Iterable[str], cwd, verbose=True): p.kill() -@pytest.fixture() -def managers(): - libnvcomp = pytest.importorskip("kvikio.nvcomp") - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - @pytest.fixture( params=[("cupy", False), ("cupy", True), ("numpy", False)], ids=["cupy", "cupy_async", "numpy"], diff --git a/python/kvikio/tests/test_async_io.py b/python/kvikio/tests/test_async_io.py index 2de4aef9c9..388a2019f6 100644 --- a/python/kvikio/tests/test_async_io.py +++ b/python/kvikio/tests/test_async_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os diff --git a/python/kvikio/tests/test_basic_io.py b/python/kvikio/tests/test_basic_io.py index 7263a77154..5be7b5e982 100644 --- a/python/kvikio/tests/test_basic_io.py +++ b/python/kvikio/tests/test_basic_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import io import os diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 8450fdfc25..465aed7bd9 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os import os.path @@ -7,7 +7,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -26,25 +25,12 @@ "cufile-mfma", "cufile-mf", "cufile-ma", - "zarr", ], ) @pytest.mark.timeout(30, method="thread") def test_single_node_io(run_cmd, tmp_path, api): """Test benchmarks/single_node_io.py""" - if "zarr" in api: - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - retcode = run_cmd( cmd=[ sys.executable or "python", @@ -61,44 +47,6 @@ def test_single_node_io(run_cmd, tmp_path, api): assert retcode == 0 -@pytest.mark.parametrize( - "api", - [ - "kvikio", - "posix", - ], -) -@pytest.mark.timeout(30, method="thread") -def test_zarr_io(run_cmd, tmp_path, api): - """Test benchmarks/zarr_io.py""" - - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - - retcode = run_cmd( - cmd=[ - sys.executable or "python", - "zarr_io.py", - "-n", - "1MiB", - "-d", - str(tmp_path), - "--api", - api, - ], - cwd=benchmarks_path, - ) - assert retcode == 0 - - @pytest.mark.parametrize( "api", [ diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index a325272426..78f9f54dae 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -1,11 +1,21 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess import pytest import kvikio.cufile_driver +def has_bar_memory() -> bool: + try: + output = subprocess.check_output(["nvidia-smi"], text=True) + return "Not Supported" not in output + except Exception: + return False + + def test_version(): major, minor = kvikio.cufile_driver.libcufile_version() assert major >= 0 @@ -13,47 +23,65 @@ def test_version(): @pytest.mark.cufile -def test_open_and_close(): - kvikio.cufile_driver.driver_open() - kvikio.cufile_driver.driver_close() +def test_open_and_close(request): + try: + kvikio.cufile_driver.driver_open() + kvikio.cufile_driver.driver_close() + except RuntimeError as e: + if "KvikIO not compiled with cuFile.h" in str(e): + pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") @pytest.mark.cufile def test_property_accessor(): """Test the method `get` and `set`""" - # Attempt to set a nonexistent property - with pytest.raises(KeyError): - kvikio.cufile_driver.set("nonexistent_property", 123) - - # Attempt to get a nonexistent property - with pytest.raises(KeyError): - kvikio.cufile_driver.get("nonexistent_property") - - # Attempt to set a read-only property - with pytest.raises(KeyError, match="read-only"): - kvikio.cufile_driver.set("major_version", 2077) - - # Nested context managers - poll_thresh_size_default = kvikio.cufile_driver.get("poll_thresh_size") - with kvikio.cufile_driver.set("poll_thresh_size", 1024): - assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 - with kvikio.cufile_driver.set("poll_thresh_size", 2048): - assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 - with kvikio.cufile_driver.set("poll_thresh_size", 4096): - assert kvikio.cufile_driver.get("poll_thresh_size") == 4096 - assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 - assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 - assert kvikio.cufile_driver.get("poll_thresh_size") == poll_thresh_size_default - - # Multiple context managers - poll_mode_default = kvikio.cufile_driver.get("poll_mode") - max_device_cache_size_default = kvikio.cufile_driver.get("max_device_cache_size") - with kvikio.cufile_driver.set({"poll_mode": True, "max_device_cache_size": 2048}): - assert kvikio.cufile_driver.get("poll_mode") and ( - kvikio.cufile_driver.get("max_device_cache_size") == 2048 + try: + # Attempt to set a nonexistent property + with pytest.raises(KeyError): + kvikio.cufile_driver.set("nonexistent_property", 123) + + # Attempt to get a nonexistent property + with pytest.raises(KeyError): + kvikio.cufile_driver.get("nonexistent_property") + + # Attempt to set a read-only property + with pytest.raises(KeyError, match="read-only"): + kvikio.cufile_driver.set("major_version", 2077) + + # Nested context managers + poll_thresh_size_default = kvikio.cufile_driver.get("poll_thresh_size") + with kvikio.cufile_driver.set("poll_thresh_size", 1024): + assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 + with kvikio.cufile_driver.set("poll_thresh_size", 2048): + assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 + with kvikio.cufile_driver.set("poll_thresh_size", 4096): + assert kvikio.cufile_driver.get("poll_thresh_size") == 4096 + assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 + assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 + assert kvikio.cufile_driver.get("poll_thresh_size") == poll_thresh_size_default + + # Multiple context managers + poll_mode_default = kvikio.cufile_driver.get("poll_mode") + max_device_cache_size_default = 
kvikio.cufile_driver.get( + "max_device_cache_size" ) - assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( - kvikio.cufile_driver.get("max_device_cache_size") - == max_device_cache_size_default - ) + if has_bar_memory(): + with kvikio.cufile_driver.set( + {"poll_mode": True, "max_device_cache_size": 2048} + ): + assert kvikio.cufile_driver.get("poll_mode") and ( + kvikio.cufile_driver.get("max_device_cache_size") == 2048 + ) + assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( + kvikio.cufile_driver.get("max_device_cache_size") + == max_device_cache_size_default + ) + else: + with kvikio.cufile_driver.set("poll_mode", True): + assert kvikio.cufile_driver.get("poll_mode") + assert kvikio.cufile_driver.get("poll_mode") == poll_mode_default + + except RuntimeError as e: + if "KvikIO not compiled with cuFile.h" in str(e): + pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") diff --git a/python/kvikio/tests/test_defaults.py b/python/kvikio/tests/test_defaults.py index 57a6ffea7e..c0fac1eaec 100644 --- a/python/kvikio/tests/test_defaults.py +++ b/python/kvikio/tests/test_defaults.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import pytest diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index f32485b6c4..3e280b2d67 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -1,12 +1,11 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os from importlib import import_module from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -21,18 +20,6 @@ def test_hello_world(tmp_path, monkeypatch): import_module("hello_world").main(tmp_path / "test-file") -def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): - """Test examples/zarr_cupy_nvcomp.py""" - - # `examples/zarr_cupy_nvcomp.py` requires the Zarr submodule - zarr = pytest.importorskip("zarr") - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip(reason="Requires zarr<3") - - monkeypatch.syspath_prepend(str(examples_path)) - import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") - - def test_http_io(tmp_path, monkeypatch): """Test examples/http_io.py""" diff --git a/python/kvikio/tests/test_hdfs_io.py b/python/kvikio/tests/test_hdfs_io.py new file mode 100644 index 0000000000..709bb04992 --- /dev/null +++ b/python/kvikio/tests/test_hdfs_io.py @@ -0,0 +1,234 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import urllib.parse +from http.server import BaseHTTPRequestHandler, HTTPServer +from multiprocessing import Process, Queue +from typing import Any, Generator + +import cupy as cp +import numpy as np +import numpy.typing as npt +import pytest +import utils + +import kvikio.defaults +from kvikio import remote_file + + +class RemoteFileData: + def __init__(self, file_path: str, num_elements: int, dtype: npt.DTypeLike) -> None: + self.file_path = file_path + self.num_elements = num_elements + self.dtype = dtype + self.buf = np.arange(0, self.num_elements, dtype=self.dtype) + self.file_size = self.buf.nbytes + + +@pytest.fixture(scope="module") +def remote_file_data() -> RemoteFileData: + return RemoteFileData( + file_path="/webhdfs/v1/home/test_user/test_file.bin", + num_elements=1024 * 1024, + dtype=np.float64, + ) + + +def run_mock_server(queue: Queue[int], file_size: int, buf: npt.NDArray[Any]) -> None: + """Run HTTP server in a separate process""" + + class WebHdfsHandler(BaseHTTPRequestHandler): + def do_GET(self) -> None: + parsed_url = urllib.parse.urlparse(self.path) + query_dict = urllib.parse.parse_qs(parsed_url.query) + op = query_dict["op"] + + # Client requests file size + if op == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + response = json.dumps({"length": file_size}) + self.wfile.write(response.encode()) + + # Client requests file content + elif op == ["OPEN"]: + offset = int(query_dict["offset"][0]) + length = int(query_dict["length"][0]) + + # Convert byte offsets to element indices + element_size = buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + length) // element_size + range_data = buf[begin_idx:end_idx].tobytes() + + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(len(range_data))) + self.end_headers() + self.wfile.write(range_data) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format: str, *args: Any) -> None: + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), WebHdfsHandler) + + # Send port back to parent process + queue.put(port) + + server.serve_forever() + + +@pytest.fixture +def mock_webhdfs_server(remote_file_data: RemoteFileData) -> Generator[str, None, None]: + """Start WebHDFS mock server in a separate process""" + queue: Queue[int] = Queue() + server_process = Process( + target=run_mock_server, + args=( + queue, + remote_file_data.file_size, + remote_file_data.buf, + ), + daemon=True, + ) + server_process.start() + + # Get the port the server is running on + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + # Cleanup + server_process.terminate() + server_process.join(timeout=1) + + +class TestWebHdfsOperations: + @pytest.mark.parametrize("url_query", ["", "?op=OPEN"]) + def test_get_file_size( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + url_query: str, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}{url_query}" + handle = remote_file.RemoteFile.open_webhdfs(url) + file_size = handle.nbytes() + assert file_size == remote_file_data.file_size + + def test_parallel_read( + self, mock_webhdfs_server: str, remote_file_data: RemoteFileData, xp: Any + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + handle = 
remote_file.RemoteFile.open_webhdfs(url) + result_buf = xp.arange( + 0, remote_file_data.num_elements, dtype=remote_file_data.dtype + ) + fut = handle.pread(result_buf) + read_size = fut.get() + + assert read_size == remote_file_data.file_size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, remote_file_data.buf) + + @pytest.mark.parametrize("size", [80, 8 * 9999]) + @pytest.mark.parametrize("offset", [0, 800, 8000, 8 * 9999]) + @pytest.mark.parametrize("num_threads", [1, 4]) + @pytest.mark.parametrize("task_size", [1024, 4096]) + def test_parallel_read_partial( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + size: int, + offset: int, + num_threads: int, + task_size: int, + xp: Any, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + element_size = remote_file_data.buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + size) // element_size + expected_buf = remote_file_data.buf[begin_idx:end_idx] + + actual_num_elements = size // np.dtype(remote_file_data.dtype).itemsize + with kvikio.defaults.set({"num_threads": num_threads, "task_size": task_size}): + handle = remote_file.RemoteFile.open_webhdfs(url) + result_buf = xp.zeros(actual_num_elements, dtype=remote_file_data.dtype) + fut = handle.pread(result_buf, size, offset) + read_size = fut.get() + + assert read_size == size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, expected_buf) + + +class TestWebHdfsErrors: + @pytest.fixture + def mock_bad_server( + self, remote_file_data: RemoteFileData + ) -> Generator[str, None, None]: + """Start a bad WebHDFS server that returns invalid JSON""" + + def run_bad_server(queue: Queue[int]) -> None: + class BadHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query) + + if query.get("op") == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + # Missing "length" field + response = json.dumps({}) + self.wfile.write(response.encode()) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format, *args): + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), BadHandler) + queue.put(port) + server.serve_forever() + + queue: Queue[int] = Queue() + server_process = Process(target=run_bad_server, args=(queue,), daemon=True) + server_process.start() + + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + server_process.terminate() + server_process.join(timeout=1) + + def test_missing_file_size( + self, mock_bad_server: str, remote_file_data: RemoteFileData + ) -> None: + url = f"{mock_bad_server}{remote_file_data.file_path}" + + with pytest.raises( + RuntimeError, + match="Regular expression search failed. " + "Cannot extract file length from the JSON response.", + ): + handle = remote_file.RemoteFile.open_webhdfs(url) + handle.nbytes() diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index fac0aca5cc..f152c316c5 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import http @@ -203,8 +203,9 @@ def test_retry_http_503_fails(tmpdir, xp, capfd): a.tofile(tmpdir / "a") b = xp.empty_like(a) - with pytest.raises(RuntimeError) as m, kvikio.defaults.set( - "http_max_attempts", 2 + with ( + pytest.raises(RuntimeError) as m, + kvikio.defaults.set("http_max_attempts", 2), ): with kvikio.RemoteFile.open_http(f"{server.url}/a") as f: f.read(b) diff --git a/python/kvikio/tests/test_mmap.py b/python/kvikio/tests/test_mmap.py new file mode 100644 index 0000000000..4790f06db4 --- /dev/null +++ b/python/kvikio/tests/test_mmap.py @@ -0,0 +1,188 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import contextlib +import os + +import pytest + +import kvikio.defaults + +cupy = pytest.importorskip("cupy") +numpy = pytest.importorskip("numpy") + + +def test_no_file(tmp_path): + nonexistent_file = tmp_path / "nonexistent_file" + with pytest.raises(RuntimeError, match=r".*Unable to open file.*"): + kvikio.Mmap(nonexistent_file) + + +def test_invalid_file_open_flag(tmp_path): + filename = tmp_path / "read-only-test-file" + expected_data = numpy.arange(1024) + expected_data.tofile(filename) + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "") + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "z") + + +def test_constructor_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + with pytest.raises(IndexError, match=r".*Offset must be less than the file size.*"): + kvikio.Mmap(filename, "r", None, test_data.nbytes * 2) + + with pytest.raises(IndexError, match=r".*Mapped region is past the end of file.*"): + kvikio.Mmap(filename, "r", test_data.nbytes * 2) + + with pytest.raises(ValueError, match=r".*Mapped region should not be zero byte.*"): + kvikio.Mmap(filename, "r", 0) + + +def test_read_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with pytest.raises(IndexError, match=r".*Offset is past the end of file.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, test_data.nbytes + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset - 128) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size + 128, initial_file_offset) + + +def test_read_valid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with contextlib.nullcontext(): + 
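+        # contextlib.nullcontext is a no-op counterpart to pytest.raises here:
+        # it documents that the in-bounds read below is expected to complete
+        # without raising, unlike the out-of-bound cases above.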
mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 0, initial_file_offset) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +def test_read_seq(tmp_path, xp, num_elements_to_read, num_elements_to_skip): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + read_size = mmap_handle.read(actual_data, initial_size, initial_file_offset) + + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +@pytest.mark.parametrize("task_size", [1024, 12345]) +def test_read_parallel( + tmp_path, xp, num_elements_to_read, num_elements_to_skip, task_size +): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + with kvikio.defaults.set("task_size", task_size): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + fut = mmap_handle.pread( + actual_data, initial_size, initial_file_offset, task_size + ) + + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_read_with_default_arguments(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + # Workaround for a CI failure where defaults.task_size() is somehow 0 + # instead of 4 MiB when KVIKIO_TASK_SIZE is unset + with kvikio.defaults.set("task_size", 4 * 1024 * 1024): + mmap_handle = kvikio.Mmap(filename, "r") + + read_size = mmap_handle.read(actual_data) + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + fut = mmap_handle.pread(actual_data) + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_closed_handle(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r") + mmap_handle.close() + + assert mmap_handle.closed() + assert mmap_handle.file_size() == 0 + + 
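+    # Once closed, the handle reports a zero file size (asserted above), and
+    # any further read or pread attempt must raise, as verified below.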
diff --git a/python/kvikio/tests/test_numpy.py b/python/kvikio/tests/test_numpy.py
index 607081e649..14ffc1c2e2 100644
--- a/python/kvikio/tests/test_numpy.py
+++ b/python/kvikio/tests/test_numpy.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import pytest
diff --git a/python/kvikio/tests/test_nvcomp.py b/python/kvikio/tests/test_nvcomp.py
deleted file mode 100644
index 356c5e77cd..0000000000
--- a/python/kvikio/tests/test_nvcomp.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-import pytest
-
-np = pytest.importorskip("numpy")
-cupy = pytest.importorskip("cupy")
-kvikio = pytest.importorskip("kvikio")
-libnvcomp = pytest.importorskip("kvikio.nvcomp")
-
-
-# TODO: don't hardcode the following expected values
-LEN = {
-    "ANS": 11144,
-    "Bitcomp": 3208,
-    "Cascaded": 600,
-    "Gdeflate": 760,
-    "LZ4": 393,
-    "Snappy": 3548,
-}
-
-
-def assert_compression_size(actual, desired, rtol=0.1):
-    """Compression ratios might change slightly between library versions
-
-    We mark a failure as "xfail"
-    """
-    try:
-        np.testing.assert_allclose(actual, desired, rtol=rtol)
-    except AssertionError:
-        pytest.xfail("mismatch in compression ratios is acceptable")
-        raise
-
-
-def managers():
-    return [
-        libnvcomp.ANSManager,
-        libnvcomp.BitcompManager,
-        libnvcomp.CascadedManager,
-        libnvcomp.GdeflateManager,
-        libnvcomp.LZ4Manager,
-        libnvcomp.SnappyManager,
-    ]
-
-
-def dtypes():
-    return [
-        "uint8",
-        "uint16",
-        "uint32",
-        "int8",
-        "int16",
-        "int32",
-    ]
-
-
-@pytest.mark.parametrize("manager, dtype", zip(managers(), dtypes()))
-def test_round_trip_dtypes(manager, dtype):
-    length = 10000
-    data = cupy.array(
-        np.arange(
-            0,
-            length // cupy.dtype(dtype).type(0).itemsize,
-            dtype=dtype,
-        )
-    )
-    compressor_instance = manager(data_type=dtype)
-    compressed = compressor_instance.compress(data)
-    decompressed = compressor_instance.decompress(compressed)
-    assert (data == decompressed).all()
-
-
-#
-# ANS Options test
-#
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "chunk_size": 1 << 16,
-        },
-        {
-            "chunk_size": 1 << 16,
-        },
-    ],
-)
-def test_ans_inputs(inputs):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.ANSManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["ANS"])
-
-
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "data_type": np.uint8,
-            "algo": 0,
-        },
-        {"data_type": np.uint8},
-        {
-            "algo": 0,
-        },
-    ],
-)
-def test_bitcomp_inputs(inputs):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.BitcompManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["Bitcomp"])
-
-
-@pytest.mark.parametrize(
-    "inputs, expected",
-    zip(
-        [
-            {"algo": 0},
-            {"algo": 1},
-            {"algo": 2},
-        ],
-        [LEN["Bitcomp"], LEN["Bitcomp"], LEN["Bitcomp"]],
-    ),
-)
-def test_bitcomp_algorithms(inputs, expected):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.BitcompManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), expected)
-
-
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "options": {
-                "chunk_size": 1 << 12,
-                "type": np.uint32,
-                "num_RLEs": 2,
-                "num_deltas": 1,
-                "use_bp": True,
-            },
-        },
-        {
-            "options": {
-                "chunk_size": 1 << 12,
-                "type": np.uint32,
-                "num_RLEs": 2,
-                "num_deltas": 1,
-                "use_bp": True,
-            },
-            "chunk_size": 1 << 16,
-        },
-        {
-            "options": {
-                "chunk_size": 1 << 12,
-                "type": np.uint32,
-                "num_RLEs": 2,
-                "num_deltas": 1,
-                "use_bp": True,
-            },
-            "data_type": np.uint8,
-        },
-        {
-            "options": {
-                "chunk_size": 1 << 12,
-                "type": np.uint32,
-                "num_RLEs": 2,
-                "num_deltas": 1,
-                "use_bp": True,
-            },
-        },
-    ],
-)
-def test_cascaded_inputs(inputs):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.CascadedManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["Cascaded"])
-
-
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "chunk_size": 1 << 16,
-            "algo": 0,
-        },
-        {
-            "chunk_size": 1 << 16,
-        },
-        {
-            "algo": 0,
-        },
-    ],
-)
-def test_gdeflate_inputs(inputs):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.GdeflateManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["Gdeflate"])
-
-
-@pytest.mark.parametrize(
-    "inputs, expected",
-    zip(
-        [
-            {"algo": 0},
-        ],
-        [LEN["Gdeflate"]],
-    ),
-)
-def test_gdeflate_algorithms(inputs, expected):
-    size = 10000
-    dtype = np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.GdeflateManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), expected)
-
-
-@pytest.mark.xfail(raises=ValueError)
-@pytest.mark.parametrize(
-    "inputs, expected",
-    zip([{"algo": 1}, {"algo": 2}], [LEN["Gdeflate"], LEN["Gdeflate"]]),
-)
-def test_gdeflate_algorithms_not_implemented(inputs, expected):
-    size = 10000
-    dtype = np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.GdeflateManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), expected)
-
-
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "chunk_size": 1 << 16,
-            "data_type": np.uint8,
-        },
-        {
-            "chunk_size": 1 << 16,
-        },
-        {
-            "data_type": np.uint8,
-        },
-    ],
-)
-def test_lz4_inputs(inputs):
-    size = 10000
-    dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.LZ4Manager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["LZ4"])
-
-
-@pytest.mark.parametrize(
-    "inputs",
-    [
-        {},
-        {
-            "chunk_size": 1 << 16,
-        },
-        {
-            "chunk_size": 1 << 16,
-        },
-        {},
-    ],
-)
-def test_snappy_inputs(inputs):
-    size = 10000
-    dtype = np.int8
-    data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype))
-    compressor = libnvcomp.SnappyManager(**inputs)
-    final = compressor.compress(data)
-    assert_compression_size(len(final), LEN["Snappy"])
-
-
-@pytest.mark.parametrize(
-    "compressor_size",
-    zip(
-        managers(),
-        [
-            {  # ANS
-                "max_compressed_buffer_size": 89373,
-                "num_chunks": 1,
-                "uncompressed_buffer_size": 10000,
-            },
-            {  # Bitcomp
-                "max_compressed_buffer_size": 16432,
-                "num_chunks": 1,
-                "uncompressed_buffer_size": 10000,
-            },
-            {  # Cascaded
-                "max_compressed_buffer_size": 12460,
-                "num_chunks": 3,
-                "uncompressed_buffer_size": 10000,
-            },
-            {  # Gdeflate
-                "max_compressed_buffer_size": 131160,
-                "num_chunks": 1,
-                "uncompressed_buffer_size": 10000,
-            },
-            {  # LZ4
-                "max_compressed_buffer_size": 65888,
-                "num_chunks": 1,
-                "uncompressed_buffer_size": 10000,
-            },
-            {  # Snappy
-                "max_compressed_buffer_size": 76575,
-                "num_chunks": 1,
-                "uncompressed_buffer_size": 10000,
-            },
-        ],
-    ),
-)
-def test_get_compression_config_with_default_options(compressor_size):
-    compressor = compressor_size[0]
-    expected = compressor_size[1]
-    length = 10000
-    dtype = cupy.uint8
-    data = cupy.array(
-        np.arange(
-            0,
-            length // cupy.dtype(dtype).type(0).itemsize,
-            dtype=dtype,
-        )
-    )
-    compressor_instance = compressor()
-    result = compressor_instance.configure_compression(len(data))
-    assert_compression_size(
-        result.pop("max_compressed_buffer_size"),
-        expected.pop("max_compressed_buffer_size"),
-    )
-    assert result == expected
-
-
-@pytest.mark.parametrize(
-    "manager,expected",
-    zip(
-        managers(),
-        [
-            {  # ANS
-                "num_chunks": 1,
-                "decomp_data_size": 10000,
-            },
-            {  # Bitcomp
-                "num_chunks": 1,
-                "decomp_data_size": 10000,
-            },
-            {  # Cascaded
-                "num_chunks": 3,
-                "decomp_data_size": 10000,
-            },
-            {  # Gdeflate
-                "num_chunks": 1,
-                "decomp_data_size": 10000,
-            },
-            {  # LZ4
-                "num_chunks": 1,
-                "decomp_data_size": 10000,
-            },
-            {  # Snappy
-                "num_chunks": 1,
-                "decomp_data_size": 10000,
-            },
-        ],
-    ),
-)
-def test_get_decompression_config_with_default_options(manager, expected):
-    length = 10000
-    dtype = cupy.uint8
-    data = cupy.array(
-        np.arange(
-            0,
-            length // cupy.dtype(dtype).type(0).itemsize,
-            dtype=dtype,
-        )
-    )
-    compressor_instance = manager()
-    compressed = compressor_instance.compress(data)
-    result = compressor_instance.configure_decompression_with_compressed_buffer(
-        compressed
-    )
-    assert_compression_size(
-        result.pop("decomp_data_size"), expected.pop("decomp_data_size")
-    )
-    assert result == expected
-
-
-@pytest.mark.parametrize(
-    "manager, expected",
-    zip(managers(), list(LEN.values())),
-)
-def test_get_compressed_output_size(manager, expected):
-    length = 10000
-    dtype = cupy.uint8
-    data = cupy.array(
-        np.arange(
-            0,
-            length // cupy.dtype(dtype).type(0).itemsize,
-            dtype=dtype,
-        )
-    )
-    compressor_instance = manager()
-    compressed = compressor_instance.compress(data)
-    buffer_size = compressor_instance.get_compressed_output_size(compressed)
-    assert_compression_size(buffer_size, expected)
-
-
-@pytest.mark.parametrize("manager", managers())
-def test_managed_manager(manager):
-    length = 10000
-    dtype = cupy.uint8
-    data = cupy.array(
-        np.arange(
-            0,
-            length // cupy.dtype(dtype).type(0).itemsize,
-            dtype=dtype,
-        )
-    )
-    compressor_instance = manager()
-    compressed = compressor_instance.compress(data)
-    manager = libnvcomp.ManagedDecompressionManager(compressed)
-    decompressed = manager.decompress(compressed)
-    assert len(decompressed) == 10000
diff --git a/python/kvikio/tests/test_nvcomp_codec.py b/python/kvikio/tests/test_nvcomp_codec.py
deleted file mode 100644
index 29e50ad64b..0000000000
--- a/python/kvikio/tests/test_nvcomp_codec.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-import itertools as it
-import json
-
-import cupy as cp
-import numcodecs
-import numpy as np
-import packaging
-import packaging.version
-import pytest
-import zarr
-from numpy.testing import assert_equal
-
-from kvikio.nvcomp_codec import NvCompBatchCodec
-
-NVCOMP_CODEC_ID = "nvcomp_batch"
-
-LZ4_ALGO = "LZ4"
-GDEFLATE_ALGO = "Gdeflate"
-SNAPPY_ALGO = "snappy"
-ZSTD_ALGO = "zstd"
-DEFLATE_ALGO = "deflate"
-
-SUPPORTED_CODECS = [LZ4_ALGO, GDEFLATE_ALGO, SNAPPY_ALGO, ZSTD_ALGO, DEFLATE_ALGO]
-
-
-def skip_if_zarr_v3():
-    return pytest.mark.skipif(
-        packaging.version.parse(zarr.__version__) >= packaging.version.Version("3.0.0"),
-        reason="zarr 3.x not supported.",
-    )
-
-
-def _get_codec(algo: str, **kwargs):
-    codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo, "options": kwargs}
-    return numcodecs.registry.get_codec(codec_args)
-
-
-@pytest.fixture(params=[(32,), (8, 16), (16, 16)])
-def shape(request):
-    return request.param
-
-
-# Separate fixture for combinations of shapes and chunks, since
-# chunks array must have the same rank as data array.
-@pytest.fixture(
-    params=it.chain(
-        it.product([(64,)], [(64,), (100,)]),
-        it.product([(16, 8), (16, 16)], [(8, 16), (16, 16), (40, 12)]),
-    )
-)
-def shape_chunks(request):
-    return request.param
-
-
-@pytest.mark.parametrize("algo", SUPPORTED_CODECS)
-def test_codec_registry(algo: str):
-    codec = _get_codec(algo)
-    assert isinstance(codec, numcodecs.abc.Codec)
-
-
-@pytest.mark.parametrize("algo", SUPPORTED_CODECS)
-def test_basic(algo: str, shape):
-    codec = NvCompBatchCodec(algo)
-
-    # Create data.
-    dtype = np.float32
-    data = np.ones(shape, dtype=dtype)
-    # Do roundtrip.
-    comp_data = codec.encode(data)
-    # Decompress and cast to original data type/shape.
-    decomp_data = codec.decode(comp_data).view(dtype).reshape(shape)
-
-    assert_equal(decomp_data, data)
-
-
-@pytest.mark.parametrize("algo", SUPPORTED_CODECS)
-@skip_if_zarr_v3()
-def test_basic_zarr(algo: str, shape_chunks):
-    shape, chunks = shape_chunks
-
-    codec = NvCompBatchCodec(algo)
-
-    data = np.ones(shape, dtype=np.float32)
-
-    # This will do the compression.
-    z = zarr.array(data, chunks=chunks, compressor=codec)
-
-    # Test the decompression.
-    assert_equal(z[:], data[:])
-
-
-@pytest.mark.parametrize("algo", SUPPORTED_CODECS)
-@pytest.mark.parametrize("chunk_sizes", [(100, 100), (100, 150)])
-@pytest.mark.parametrize("out", [None, "cpu", "gpu"])
-def test_batch_comp_decomp(algo: str, chunk_sizes, out: str):
-    codec = _get_codec(algo)
-
-    np.random.seed(1)
-
-    dtype = np.float32
-    chunks = [np.random.randn(s).astype(dtype) for s in chunk_sizes]
-    out_buf = None
-    if out == "cpu":
-        out_buf = [np.empty_like(c) for c in chunks]
-    elif out == "gpu":
-        out_buf = [cp.empty_like(c) for c in chunks]
-
-    comp_chunks = codec.encode_batch([c.tobytes() for c in chunks])
-    assert len(comp_chunks) == 2
-
-    decomp_chunks = codec.decode_batch(comp_chunks, out=out_buf)
-    assert len(decomp_chunks) == 2
-
-    for i, dc in enumerate(decomp_chunks):
-        dc = dc.view(dtype=dtype)
-        if isinstance(dc, cp.ndarray):
-            dc = dc.get()
-        assert_equal(dc, chunks[i], f"{i=}")
-
-        if out_buf is not None:
-            ob = out_buf[i]
-            if isinstance(ob, cp.ndarray):
-                ob = ob.get()
-            assert_equal(ob, chunks[i], f"{i=}")
-
-
-@pytest.mark.parametrize("algo", SUPPORTED_CODECS)
-@skip_if_zarr_v3()
-def test_comp_decomp(algo: str, shape_chunks):
-    shape, chunks = shape_chunks
-
-    codec = _get_codec(algo)
-
-    np.random.seed(1)
-
-    data = np.random.randn(*shape).astype(np.float32)
-
-    z1 = zarr.array(data, chunks=chunks, compressor=codec)
-
-    zarr_store = zarr.MemoryStore()
-    zarr.save_array(zarr_store, z1, compressor=codec)
-    # Check the store.
-    meta = json.loads(zarr_store[".zarray"])
-    assert meta["compressor"]["id"] == NVCOMP_CODEC_ID
-    assert meta["compressor"]["algorithm"] == algo.lower()
-
-    # Read back/decompress.
-    z2 = zarr.open_array(zarr_store)
-
-    assert_equal(z1[:], z2[:])
-
-
-@pytest.mark.parametrize(
-    "algo, options",
-    [
-        ("lz4", {"data_type": 4}),  # NVCOMP_TYPE_INT data type.
-        ("gdeflate", {"algo": 1}),  # low-throughput, high compression ratio algo
-    ],
-)
-@skip_if_zarr_v3()
-def test_codec_options(algo, options):
-    codec = NvCompBatchCodec(algo, options)
-
-    shape = (16, 16)
-    chunks = (8, 8)
-
-    data = np.ones(shape, dtype=np.float32)
-
-    z = zarr.array(data, chunks=chunks, compressor=codec)
-
-    assert_equal(z[:], data[:])
-
-
-@skip_if_zarr_v3()
-def test_codec_invalid_options():
-    # There are currently only 3 supported algos in Gdeflate
-    codec = NvCompBatchCodec(GDEFLATE_ALGO, options={"algo": 10})
-
-    data = np.ones((16, 16), dtype=np.float32)
-
-    with pytest.raises(RuntimeError):
-        zarr.array(data, compressor=codec)
-
-
-@pytest.mark.parametrize(
-    "cpu_algo, gpu_algo",
-    [
-        ("lz4", LZ4_ALGO),
-        ("zstd", ZSTD_ALGO),
-    ],
-)
-@skip_if_zarr_v3()
-def test_cpu_comp_gpu_decomp(cpu_algo, gpu_algo):
-    cpu_codec = numcodecs.registry.get_codec({"id": cpu_algo})
-    gpu_codec = _get_codec(gpu_algo)
-
-    shape = (16, 16)
-    chunks = (8, 8)
-
-    data = np.ones(shape, dtype=np.float32)
-
-    z1 = zarr.array(data, chunks=chunks)
-    store = {}
-    zarr.save_array(store, z1, compressor=cpu_codec)
-
-    meta = json.loads(store[".zarray"])
-    assert meta["compressor"]["id"] == cpu_algo
-
-    meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": gpu_algo}
-    store[".zarray"] = json.dumps(meta).encode()
-
-    z2 = zarr.open_array(store, compressor=gpu_codec)
-
-    assert_equal(z1[:], z2[:])
-
-
-@skip_if_zarr_v3()
-def test_lz4_codec_header(shape_chunks):
-    shape, chunks = shape_chunks
-
-    # Test LZ4 nvCOMP codecs with and without the header.
-    codec_h = _get_codec(LZ4_ALGO, has_header=True)
-    codec_no_h = _get_codec(LZ4_ALGO, has_header=False)
-
-    np.random.seed(1)
-
-    data = np.random.randn(*shape).astype(np.float32)
-
-    z_h = zarr.array(data, chunks=chunks, compressor=codec_h)
-    z_no_h = zarr.array(data, chunks=chunks, compressor=codec_no_h)
-
-    # Result must be the same regardless of the header presence.
-    assert_equal(z_h[:], z_no_h[:])
-
-
-def test_empty_batch():
-    codec = _get_codec(LZ4_ALGO)
-
-    assert len(codec.encode_batch([])) == 0
-    assert len(codec.decode_batch([])) == 0
diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py
index 45997b1e71..d8610c73bc 100644
--- a/python/kvikio/tests/test_s3_io.py
+++ b/python/kvikio/tests/test_s3_io.py
@@ -1,12 +1,12 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import multiprocessing as mp
-import socket
 import time
 from contextlib import contextmanager
 
 import pytest
+import utils
 
 import kvikio
 import kvikio.defaults
@@ -26,18 +26,13 @@
 
 
 @pytest.fixture(scope="session")
-def endpoint_ip():
-    return "127.0.0.1"
+def endpoint_ip() -> str:
+    return utils.localhost()
 
 
 @pytest.fixture(scope="session")
 def endpoint_port():
-    # Return a free port per worker session.
-    sock = socket.socket()
-    sock.bind(("127.0.0.1", 0))
-    port = sock.getsockname()[1]
-    sock.close()
-    return port
+    return utils.find_free_port()
 
 
 def start_s3_server(ip_address, port):
@@ -81,7 +76,7 @@ def s3_context(s3_base, bucket, files=None):
 
 def test_read_access(s3_base):
     bucket_name = "bucket"
-    object_name = "data"
+    object_name = "Data"
     data = b"file content"
     with s3_context(
         s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)}
@@ -119,7 +114,7 @@ def test_read_access(s3_base):
 @pytest.mark.parametrize("buffer_size", [101, 1001])
 def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size):
     bucket_name = "test_read"
-    object_name = "a1"
+    object_name = "Aa1"
     a = xp.arange(size)
     with s3_context(
         s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}
@@ -151,7 +146,7 @@ def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size):
 )
 def test_read_with_file_offset(s3_base, xp, start, end):
     bucket_name = "test_read_with_file_offset"
-    object_name = "a1"
+    object_name = "Aa1"
     a = xp.arange(end, dtype=xp.int64)
     with s3_context(
         s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}
@@ -161,3 +156,64 @@ def test_read_with_file_offset(s3_base, xp, start, end):
     b = xp.zeros(shape=(end - start,), dtype=xp.int64)
     assert f.read(b, file_offset=start * a.itemsize) == b.nbytes
     xp.testing.assert_array_equal(a[start:end], b)
+
+
+@pytest.mark.parametrize("scheme", ["S3"])
+@pytest.mark.parametrize(
+    "remote_endpoint_type",
+    [kvikio.RemoteEndpointType.AUTO, kvikio.RemoteEndpointType.S3],
+)
+@pytest.mark.parametrize("allow_list", [None, [kvikio.RemoteEndpointType.S3]])
+@pytest.mark.parametrize("nbytes", [None, 1])
+def test_open_valid(s3_base, scheme, remote_endpoint_type, allow_list, nbytes):
+    bucket_name = "bucket_name"
+    object_name = "object_name"
+    data = b"file content"
+    with s3_context(
+        s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)}
+    ) as server_address:
+        if scheme == "S3":
+            url = f"{scheme}://{bucket_name}/{object_name}"
+        else:
+            url = f"{server_address}/{bucket_name}/{object_name}"
+
+        if nbytes is None:
+            expected_file_size = len(data)
+        else:
+            expected_file_size = nbytes
+
+        with kvikio.RemoteFile.open(url, remote_endpoint_type, allow_list, nbytes) as f:
+            assert f.nbytes() == expected_file_size
+            assert f.remote_endpoint_type() == kvikio.RemoteEndpointType.S3
+
+
+def test_open_invalid(s3_base):
+    bucket_name = "bucket_name"
+    object_name = "object_name"
+    data = b"file content"
+    with s3_context(
+        s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)}
+    ) as server_address:
+        # Missing scheme
+        url = f"://{bucket_name}/{object_name}"
+        with pytest.raises(RuntimeError, match="Bad scheme"):
+            kvikio.RemoteFile.open(url)
+
+        # Unsupported type
+        url = f"unsupported://{bucket_name}/{object_name}"
+        with pytest.raises(RuntimeError, match="Unsupported endpoint URL"):
+            kvikio.RemoteFile.open(url)
+
+        # Specified URL not in the allowlist
+        url = f"{server_address}/{bucket_name}/{object_name}"
+        with pytest.raises(RuntimeError, match="not in the allowlist"):
+            kvikio.RemoteFile.open(
+                url, kvikio.RemoteEndpointType.S3, [kvikio.RemoteEndpointType.WEBHDFS]
+            )
+
+        # Invalid URLs
+        url = f"s3://{bucket_name}"
+        with pytest.raises(RuntimeError, match="Unsupported endpoint URL"):
+            kvikio.RemoteFile.open(url)
+        with pytest.raises(RuntimeError, match="Invalid URL"):
+            kvikio.RemoteFile.open(url, kvikio.RemoteEndpointType.S3)
diff --git a/python/kvikio/tests/test_utils.py b/python/kvikio/tests/test_utils.py
index cd34c50d0e..34279826b7 100644
--- a/python/kvikio/tests/test_utils.py
+++ b/python/kvikio/tests/test_utils.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import pytest
diff --git a/python/kvikio/tests/test_version.py b/python/kvikio/tests/test_version.py
index 840d6624c9..0f8249856f 100644
--- a/python/kvikio/tests/test_version.py
+++ b/python/kvikio/tests/test_version.py
@@ -1,5 +1,5 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import kvikio
diff --git a/python/kvikio/tests/test_zarr.py b/python/kvikio/tests/test_zarr.py
deleted file mode 100644
index a793e2568e..0000000000
--- a/python/kvikio/tests/test_zarr.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-
-import math
-
-import numpy
-import pytest
-from packaging.version import parse
-
-cupy = pytest.importorskip("cupy")
-zarr = pytest.importorskip("zarr")
-kvikio_zarr = pytest.importorskip("kvikio.zarr")
-kvikio_nvcomp_codec = pytest.importorskip("kvikio.nvcomp_codec")
-numcodecs = pytest.importorskip("numcodecs")
-
-if not kvikio_zarr.supported:
-    pytest.skip(
-        f"requires Zarr >={kvikio_zarr.MINIMUM_ZARR_VERSION}",
-        allow_module_level=True,
-    )
-
-if parse(zarr.__version__) >= parse("3.0.0"):
-    pytest.skip(
-        "requires Zarr<3",
-        allow_module_level=True,
-    )
-
-
-@pytest.fixture
-def store(tmp_path):
-    """Fixture that creates a GDS Store"""
-    return kvikio_zarr.GDSStore(tmp_path / "test-file.zarr")
-
-
-def test_direct_store_access(store, xp):
-    """Test accessing the GDS Store directly"""
-
-    a = xp.arange(5, dtype="u1")
-    store["a"] = a
-    b = store["a"]
-
-    # Notice, unless using getitems(), GDSStore always returns bytes
-    assert isinstance(b, bytes)
-    assert (xp.frombuffer(b, dtype="u1") == a).all()
-
-
-@pytest.mark.parametrize("xp_write", ["numpy", "cupy"])
-@pytest.mark.parametrize("xp_read_a", ["numpy", "cupy"])
-@pytest.mark.parametrize("xp_read_b", ["numpy", "cupy"])
-def test_direct_store_access_getitems(store, xp_write, xp_read_a, xp_read_b):
-    """Test accessing the GDS Store directly using getitems()"""
-
-    xp_read_a = pytest.importorskip(xp_read_a)
-    xp_read_b = pytest.importorskip(xp_read_b)
-    xp_write = pytest.importorskip(xp_write)
-    a = xp_write.arange(5, dtype="u1")
-    b = a * 2
-    store["a"] = a
-    store["b"] = b
-
-    res = store.getitems(
-        keys=["a", "b"],
-        contexts={
-            "a": {"meta_array": xp_read_a.empty(())},
-            "b": {"meta_array": xp_read_b.empty(())},
-        },
-    )
-    assert isinstance(res["a"], xp_read_a.ndarray)
-    assert isinstance(res["b"], xp_read_b.ndarray)
-    cupy.testing.assert_array_equal(res["a"], a)
-    cupy.testing.assert_array_equal(res["b"], b)
-
-
-def test_array(store, xp):
-    """Test Zarr array"""
-
-    a = xp.arange(100)
-    z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(()))
-    assert isinstance(z.meta_array, type(a))
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(a, type(z[:]))
-    xp.testing.assert_array_equal(a, z[:])
-
-
-def test_group(store, xp):
-    """Test Zarr group"""
-
-    g = zarr.open_group(store, meta_array=xp.empty(()))
-    g.ones("data", shape=(10, 11), dtype=int, compressor=None)
-    a = g["data"]
-    assert a.shape == (10, 11)
-    assert a.dtype == int
-    assert isinstance(a, zarr.Array)
-    assert isinstance(a.meta_array, xp.ndarray)
-    assert isinstance(a[:], xp.ndarray)
-    assert (a[:] == 1).all()
-
-
-def test_open_array(store, xp):
-    """Test Zarr's open_array()"""
-
-    a = xp.arange(10)
-    z = zarr.open_array(
-        store,
-        shape=a.shape,
-        dtype=a.dtype,
-        chunks=(10,),
-        compressor=None,
-        meta_array=xp.empty(()),
-    )
-    z[:] = a
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(a, type(z[:]))
-    xp.testing.assert_array_equal(a, z[:])
-
-
-@pytest.mark.parametrize("inline_array", [True, False])
-def test_dask_read(store, xp, inline_array):
-    """Test Zarr read in Dask"""
-
-    da = pytest.importorskip("dask.array")
-    a = xp.arange(100)
-    z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(()))
-    d = da.from_zarr(z, inline_array=inline_array)
-    d += 1
-    xp.testing.assert_array_equal(a + 1, d.compute())
-
-
-def test_dask_write(store, xp):
-    """Test Zarr write in Dask"""
-
-    da = pytest.importorskip("dask.array")
-
-    # Write dask array to disk using Zarr
-    a = xp.arange(100)
-    d = da.from_array(a, chunks=10)
-    da.to_zarr(d, store, compressor=None, meta_array=xp.empty(()))
-
-    # Validate the written Zarr array
-    z = zarr.open_array(store)
-    xp.testing.assert_array_equal(a, z[:])
-
-
-@pytest.mark.parametrize("xp_read", ["numpy", "cupy"])
-@pytest.mark.parametrize("xp_write", ["numpy", "cupy"])
-@pytest.mark.parametrize("compressor", kvikio_zarr.nvcomp_compressors)
-def test_compressor(store, xp_write, xp_read, compressor):
-    xp_read = pytest.importorskip(xp_read)
-    xp_write = pytest.importorskip(xp_write)
-
-    shape = (10, 1)
-    chunks = (10, 1)
-    a = xp_write.arange(math.prod(shape)).reshape(shape)
-    z = zarr.creation.create(
-        shape=shape,
-        chunks=chunks,
-        compressor=compressor(),
-        store=store,
-        meta_array=xp_read.empty(()),
-    )
-    z[:] = a
-    b = z[:]
-    assert isinstance(b, xp_read.ndarray)
-    cupy.testing.assert_array_equal(b, a)
-
-
-@pytest.mark.parametrize("algo", ["lz4", "zstd"])
-def test_decompressor_config_overwrite(tmp_path, xp, algo):
-    cpu_codec = numcodecs.registry.get_codec({"id": algo})
-    gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo)
-
-    # Write using Zarr's default file store and the `cpu_codec` compressor
-    z = zarr.open_array(tmp_path, mode="w", shape=(10,), compressor=cpu_codec)
-    z[:] = range(10)
-    assert z.compressor == cpu_codec
-
-    # Open file using GDSStore and use `gpu_codec` as decompressor.
-    z = zarr.open_array(
-        kvikio_zarr.GDSStore(
-            tmp_path,
-            decompressor_config_overwrite=gpu_codec.get_config(),
-        ),
-        mode="r",
-        meta_array=xp.empty(()),
-    )
-    assert z.compressor == gpu_codec
-    assert isinstance(z[:], xp.ndarray)
-    xp.testing.assert_array_equal(z[:], range(10))
-
-
-@pytest.mark.parametrize("algo", ["lz4"])
-def test_compressor_config_overwrite(tmp_path, xp, algo):
-    cpu_codec = numcodecs.registry.get_codec({"id": algo})
-    gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo)
-
-    # Write file using GDSStore and the `gpu_codec` compressor. In order
-    # to make the file compatible with Zarr's builtin CPU decompressor,
-    # we set `cpu_codec` as the compressor in the meta file on disk.
-    z = zarr.open_array(
-        kvikio_zarr.GDSStore(
-            tmp_path,
-            compressor_config_overwrite=cpu_codec.get_config(),
-            decompressor_config_overwrite=gpu_codec.get_config(),
-        ),
-        mode="w",
-        shape=10,
-        compressor=gpu_codec,
-        meta_array=xp.empty(()),
-    )
-    assert z.compressor == gpu_codec
-    z[:] = xp.arange(10)
-
-    # We can now open the file using Zarr's builtin CPU decompressor
-    z = zarr.open_array(tmp_path, mode="r")
-    assert isinstance(z[:], numpy.ndarray)
-    numpy.testing.assert_array_equal(z[:], range(10))
-
-
-@pytest.mark.parametrize("write_mode", ["w", "w-", "a"])
-@pytest.mark.parametrize("read_mode", ["r", "r+", "a"])
-def test_open_cupy_array(tmp_path, write_mode, read_mode):
-    a = cupy.arange(10)
-    z = kvikio_zarr.open_cupy_array(
-        tmp_path,
-        mode=write_mode,
-        shape=a.shape,
-        dtype=a.dtype,
-        chunks=(2,),
-        compressor=kvikio_zarr.CompatCompressor.lz4(),
-    )
-    z[:] = a
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], type(a))
-    assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
-    cupy.testing.assert_array_equal(a, z[:])
-
-    z = kvikio_zarr.open_cupy_array(
-        tmp_path,
-        mode=read_mode,
-    )
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], type(a))
-    assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
-    cupy.testing.assert_array_equal(a, z[:])
-
-    z = zarr.open_array(tmp_path, mode=read_mode)
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], numpy.ndarray)
-    assert z.compressor == kvikio_zarr.CompatCompressor.lz4().cpu
-    numpy.testing.assert_array_equal(a.get(), z[:])
-
-
-@pytest.mark.parametrize("compressor", [None, kvikio_zarr.CompatCompressor.lz4().cpu])
-def test_open_cupy_array_written_by_zarr(tmp_path, compressor):
-    data = numpy.arange(100)
-    z = zarr.open_array(
-        tmp_path,
-        shape=data.shape,
-        mode="w",
-        compressor=compressor,
-    )
-    z[:] = data
-
-    z = kvikio_zarr.open_cupy_array(tmp_path, mode="r")
-    assert isinstance(z[:], cupy.ndarray)
-    cupy.testing.assert_array_equal(z[:], data)
-
-
-@pytest.mark.parametrize("mode", ["r", "r+", "a"])
-def test_open_cupy_array_incompatible_compressor(tmp_path, mode):
-    zarr.create((10,), store=tmp_path, compressor=numcodecs.Blosc())
-
-    with pytest.raises(ValueError, match="non-CUDA compatible compressor"):
-        kvikio_zarr.open_cupy_array(tmp_path, mode=mode)
-
-
-def test_open_cupy_array_unknown_mode(tmp_path):
-    a = cupy.arange(10)
-    with pytest.raises(ValueError, match="Unknown mode: x"):
-        kvikio_zarr.open_cupy_array(
-            tmp_path,
-            mode="x",
-            shape=a.shape,
-            dtype=a.dtype,
-            chunks=(2,),
-        )
diff --git a/python/kvikio/tests/test_zarr_missing.py b/python/kvikio/tests/test_zarr_missing.py
new file mode 100644
index 0000000000..eee25b0e2e
--- /dev/null
+++ b/python/kvikio/tests/test_zarr_missing.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+import pytest
+
+
+def test_zarr_missing_raises(monkeypatch):
+    modules = list(sys.modules)
+    for module in modules:
+        pkg = module.split(".")[0]
+        if pkg == "kvikio":
+            # remove from the import cache
+            monkeypatch.delitem(sys.modules, module, raising=False)
+        elif pkg == "zarr":
+            # force an ImportError
+            monkeypatch.setitem(sys.modules, module, None)
+
+    with pytest.raises(ImportError):
+        import kvikio.zarr  # noqa: F401
+
+
+def test_zarr_2_installed_raises(monkeypatch):
+    modules = list(sys.modules)
+    zarr = pytest.importorskip("zarr")
+    monkeypatch.setattr(zarr, "__version__", "2.0.0")
+
+    for module in modules:
+        pkg = module.split(".")[0]
+        if pkg == "kvikio":
+            # remove from the import cache
+            monkeypatch.delitem(sys.modules, module, raising=False)
+
+    with pytest.raises(ImportError):
+        import kvikio.zarr  # noqa: F401
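The new test_zarr_missing.py relies on a CPython behavior worth calling out: mapping a name to None in sys.modules makes any subsequent import of that name raise ImportError. A standalone illustration with a hypothetical module name, not part of the patch:

    import sys

    sys.modules["made_up_module"] = None  # poison the import
    try:
        import made_up_module  # noqa: F401
    except ImportError:
        print("import blocked, as the test expects")
    finally:
        del sys.modules["made_up_module"]  # undo the poisoning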
diff --git a/python/kvikio/tests/test_zarr_v3.py b/python/kvikio/tests/test_zarr_v3.py
index e44de36d22..4a430b5dbd 100644
--- a/python/kvikio/tests/test_zarr_v3.py
+++ b/python/kvikio/tests/test_zarr_v3.py
@@ -1,21 +1,22 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 import pathlib
 
 import cupy as cp
 import pytest
 
-import kvikio.zarr
-
 pytest.importorskip("zarr", minversion="3.0.0")
 
+# these must follow the pytest.importorskip
 import zarr.core.buffer  # noqa: E402
 import zarr.storage  # noqa: E402
 from zarr.core.buffer.gpu import Buffer  # noqa: E402
 from zarr.testing.store import StoreTests  # noqa: E402
 
+import kvikio.zarr  # noqa: E402
+
 
 @pytest.mark.asyncio
 async def test_basic(tmp_path: pathlib.Path) -> None:
diff --git a/python/kvikio/tests/utils.py b/python/kvikio/tests/utils.py
new file mode 100644
index 0000000000..1b1368a421
--- /dev/null
+++ b/python/kvikio/tests/utils.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import socket
+
+
+def localhost() -> str:
+    return "127.0.0.1"
+
+
+def find_free_port(host: str = localhost()) -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind((host, 0))
+        _, port = s.getsockname()
+        return port
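The new utils.find_free_port() helper works by binding to port 0, which asks the OS to assign an ephemeral free port. There is an inherent race here: another process can grab the port between this probe and the test server binding it, which is acceptable for test setup. A usage sketch mirroring the fixtures in test_s3_io.py (assumes it runs from the tests directory so that `utils` is importable):

    from utils import find_free_port, localhost

    ip, port = localhost(), find_free_port()
    print(f"starting test server on {ip}:{port}")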
diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt
index ecde2dc288..318d1edb49 100644
--- a/python/libkvikio/CMakeLists.txt
+++ b/python/libkvikio/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 
 cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
@@ -39,28 +32,7 @@ unset(kvikio_FOUND)
 set(KvikIO_BUILD_BENCHMARKS OFF)
 set(KvikIO_BUILD_EXAMPLES OFF)
 set(KvikIO_BUILD_TESTS OFF)
-if(USE_NVCOMP_RUNTIME_WHEEL)
-  set(KvikIO_EXPORT_NVCOMP OFF)
-else()
-  # vendor nvcomp but not the entire kvikio-export set because that's huge
-  include(cmake/thirdparty/get_nvcomp.cmake)
-  include(cmake/Modules/WheelHelpers.cmake)
-  install_aliased_imported_targets(
-    TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/
-  )
-endif()
 set(CUDA_STATIC_RUNTIME ON)
 
 add_subdirectory(../../cpp kvikio-cpp)
-
-if(USE_NVCOMP_RUNTIME_WHEEL)
-  set(rpaths "$ORIGIN/../../nvidia/nvcomp")
-  foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
-    set_property(
-      TARGET ${tgt}
-      PROPERTY INSTALL_RPATH ${rpaths}
-      APPEND
-    )
-  endforeach()
-endif()
diff --git a/python/libkvikio/cmake/Modules/WheelHelpers.cmake b/python/libkvikio/cmake/Modules/WheelHelpers.cmake
deleted file mode 100644
index abdde95298..0000000000
--- a/python/libkvikio/cmake/Modules/WheelHelpers.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-include_guard(GLOBAL)
-
-# Making libraries available inside wheels by installing the associated targets.
-function(install_aliased_imported_targets)
-  list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets")
-
-  set(options "")
-  set(one_value "DESTINATION")
-  set(multi_value "TARGETS")
-  cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN})
-
-  message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'")
-
-  foreach(target IN LISTS __TARGETS)
-
-    if(NOT TARGET ${target})
-      message(VERBOSE "No target named ${target}")
-      continue()
-    endif()
-
-    get_target_property(alias_target ${target} ALIASED_TARGET)
-    if(alias_target)
-      set(target ${alias_target})
-    endif()
-
-    get_target_property(is_imported ${target} IMPORTED)
-    if(NOT is_imported)
-      # If the target isn't imported, install it into the wheel
-      install(TARGETS ${target} DESTINATION ${__DESTINATION})
-      message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})")
-    else()
-      # If the target is imported, make sure it's global
-      get_target_property(type ${target} TYPE)
-      if(${type} STREQUAL "UNKNOWN_LIBRARY")
-        install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})
-        message(VERBOSE "install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})")
-      else()
-        install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})
-        message(
-          VERBOSE
-          "install(IMPORTED_RUNTIME_ARTIFACTS $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})"
-        )
-      endif()
-    endif()
-  endforeach()
-endfunction()
diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake
deleted file mode 100644
index 0901c1e349..0000000000
--- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake
+++ /dev/null
@@ -1,35 +0,0 @@
-# =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(KVIKIO_USE_PROPRIETARY_BINARY ON)
-
-# This function finds nvcomp and sets any additional necessary environment variables.
-function(find_and_configure_nvcomp)
-
-  include(${rapids-cmake-dir}/cpm/nvcomp.cmake)
-  set(export_args)
-  if(KvikIO_EXPORT_NVCOMP)
-    # We're vendoring nvcomp and we only want `libnvcomp.so.4`
-    set(export_args BUILD_EXPORT_SET nvcomp)
-  endif()
-
-  rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY})
-
-  # Per-thread default stream
-  if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM)
-    target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM)
-  endif()
-endfunction()
-
-find_and_configure_nvcomp()
diff --git a/python/libkvikio/libkvikio/__init__.py b/python/libkvikio/libkvikio/__init__.py
index a221295d4c..8051956848 100644
--- a/python/libkvikio/libkvikio/__init__.py
+++ b/python/libkvikio/libkvikio/__init__.py
@@ -1,16 +1,5 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 
 from libkvikio._version import __git_commit__, __version__
 from libkvikio.load import load_library
diff --git a/python/libkvikio/libkvikio/_version.py b/python/libkvikio/libkvikio/_version.py
index 8b67326806..dec2546e41 100644
--- a/python/libkvikio/libkvikio/_version.py
+++ b/python/libkvikio/libkvikio/_version.py
@@ -1,16 +1,5 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 
 import importlib.resources
diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py
index c790d2dd00..beb7273da2 100644
--- a/python/libkvikio/libkvikio/load.py
+++ b/python/libkvikio/libkvikio/load.py
@@ -1,16 +1,5 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 #
 
 import ctypes
@@ -44,9 +33,6 @@ def _load_wheel_installation(soname: str):
 
 
 def load_library():
-    # TODO: remove this nvcomp load when `nvcomp` is re-de-vendored
-    # https://github.com/rapidsai/build-planning/issues/171
-    _load_library("libnvcomp.so.4")
     return _load_library("libkvikio.so")
diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml
index bbbd6f2e74..7861779811 100644
--- a/python/libkvikio/pyproject.toml
+++ b/python/libkvikio/pyproject.toml
@@ -1,10 +1,10 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 
 [build-system]
 build-backend = "rapids_build_backend.build"
 requires = [
-    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "rapids-build-backend>=0.4.0,<0.5.0.dev0",
     "scikit-build-core[pyproject]>=0.10.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -67,6 +67,4 @@ select = [
 ]
 
 # PyPI limit is 100 MiB, fail CI before we get too close to that
-# TODO: drop this to 75M after we re-de-vendor nvcomp
-# https://github.com/rapidsai/build-planning/issues/171
-max_allowed_size_compressed = '90M'
+max_allowed_size_compressed = '75M'