Merged
2 changes: 1 addition & 1 deletion docs/.rstcheck.cfg
@@ -1,5 +1,5 @@
[rstcheck]
report_level = warning
ignore_directives = automodule, autosummary, currentmodule, toctree, ifconfig, tab-set, collapse, tabs, dropdown
ignore_roles = ref, cpp:class, cpp:func, py:func, c:macro
ignore_roles = ref, cpp:class, cpp:func, py:func, c:macro, external+data-api:doc, external+scikit_build_core:doc
ignore_languages = cpp, python
2 changes: 1 addition & 1 deletion docs/concepts/abi_overview.md
@@ -15,7 +15,7 @@
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->

# ABI Overview
# ABI Specification

This section provides an overview of the ABI convention of TVM FFI. The ABI
is designed around the following key principles:
4 changes: 3 additions & 1 deletion docs/conf.py
@@ -157,8 +157,10 @@
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None),
"torch-cpp": ("https://docs.pytorch.org/cppdocs/", None),
"torch-cpp": ("https://docs.pytorch.org/cppdocs", None),
"dlpack": ("https://dmlc.github.io/dlpack/latest", None),
"data-api": ("https://data-apis.org/array-api/latest", None),
"scikit_build_core": ("https://scikit-build-core.readthedocs.io/en/stable/", None),
}

autosummary_generate = True # actually create stub pages
File renamed without changes.
117 changes: 67 additions & 50 deletions docs/get_started/quickstart.rst
@@ -83,7 +83,7 @@ The class :cpp:class:`tvm::ffi::TensorView` allows zero-copy interop with tensors from

- NumPy, CuPy,
- PyTorch, JAX, or
- any array type that supports the standard `DLPack protocol <https://data-apis.org/array-api/2024.12/design_topics/data_interchange.html>`_.
- any array type that supports the standard :external+data-api:doc:`DLPack protocol <design_topics/data_interchange>`.

Finally, :cpp:func:`TVMFFIEnvGetStream` can be used in the CUDA code to launch a kernel on the caller's stream.

@@ -127,36 +127,34 @@ TVM-FFI natively integrates with CMake via ``find_package`` as demonstrated below:

.. code-block:: cmake

# Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_DIR`
# Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_ROOT`
find_package(Python COMPONENTS Interpreter REQUIRED)
execute_process(COMMAND "${Python_EXECUTABLE}" -m tvm_ffi.config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE tvm_ffi_ROOT)
find_package(tvm_ffi CONFIG REQUIRED)

# Link C++ target to `tvm_ffi_header` and `tvm_ffi_shared`
add_library(add_one_cpu SHARED compile/add_one_cpu.cc)
target_link_libraries(add_one_cpu PRIVATE tvm_ffi_header)
target_link_libraries(add_one_cpu PRIVATE tvm_ffi_shared)
tvm_ffi_configure_target(add_one_cpu)

.. group-tab:: CUDA

.. code-block:: cmake

enable_language(CUDA)
# Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_DIR`
# Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_ROOT`
find_package(Python COMPONENTS Interpreter REQUIRED)
execute_process(COMMAND "${Python_EXECUTABLE}" -m tvm_ffi.config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE tvm_ffi_ROOT)
find_package(tvm_ffi CONFIG REQUIRED)

# Link CUDA target to `tvm_ffi_header` and `tvm_ffi_shared`
add_library(add_one_cuda SHARED compile/add_one_cuda.cu)
target_link_libraries(add_one_cuda PRIVATE tvm_ffi_header)
target_link_libraries(add_one_cuda PRIVATE tvm_ffi_shared)
tvm_ffi_configure_target(add_one_cuda)
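Both snippets resolve ``tvm_ffi_ROOT`` by shelling out to the ``tvm_ffi.config`` Python module. The same lookup can be sketched directly from Python, e.g. when generating build scripts. This is only a sketch: running the command requires the ``tvm_ffi`` package to be installed, so here the invocation is constructed but not executed.

```python
import sys

# Build the same command the CMake snippets run via execute_process.
# Executing it would print the directory containing tvm_ffi's CMake
# package config files (requires the `tvm_ffi` package to be installed).
cmd = [sys.executable, "-m", "tvm_ffi.config", "--cmakedir"]
# cmakedir = subprocess.run(cmd, capture_output=True, text=True).stdout.strip()
print(" ".join(cmd[1:]))
```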

**Artifact.** The resulting ``add_one_cpu.so`` and ``add_one_cuda.so`` are minimal libraries that are agnostic to:

- Python version/ABI. It is not compiled/linked with Python and depends only on TVM-FFI's stable C ABI;
- Languages, including C++, Python, Rust, or any other language that can interoperate with the C ABI;
- ML frameworks, such as PyTorch, JAX, NumPy, CuPy, or anything with standard `DLPack protocol <https://data-apis.org/array-api/2024.12/design_topics/data_interchange.html>`_.
- ML frameworks, such as PyTorch, JAX, NumPy, CuPy, or anything with standard :external+data-api:doc:`DLPack protocol <design_topics/data_interchange>`.
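The framework-agnosticism rests on the DLPack exchange protocol. As a minimal self-contained illustration (using NumPy as both producer and consumer; with TVM-FFI the consumer would be the loaded module):

```python
import numpy as np

# Any two frameworks implementing the DLPack protocol can hand tensors
# to each other zero-copy. NumPy plays both sides here.
x = np.arange(5, dtype=np.float32)
y = np.from_dlpack(x)   # consumes x.__dlpack__() under the hood, no copy
x[0] = 42.0             # same memory, so the change is visible through y
```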

.. _sec-use-across-framework:

@@ -177,60 +175,66 @@ directly. This process is done zero-copy, without any boilerplate code, under ex

We can then use these functions in the following ways:

.. tab-set::
.. _ship-to-pytorch:

.. tab-item:: PyTorch
PyTorch
~~~~~~~

.. literalinclude:: ../../examples/quickstart/load/load_pytorch.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]
.. literalinclude:: ../../examples/quickstart/load/load_pytorch.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]

.. tab-item:: JAX
.. _ship-to-jax:

Support via `nvidia/jax-tvm-ffi <https://github.com/nvidia/jax-tvm-ffi>`_. This can be installed via
JAX
~~~

Support via `nvidia/jax-tvm-ffi <https://github.com/nvidia/jax-tvm-ffi>`_. This can be installed via

.. code-block:: bash
.. code-block:: bash

pip install jax-tvm-ffi
pip install jax-tvm-ffi

After installation, ``add_one_cuda`` can be registered as a target to JAX's ``ffi_call``.
After installation, ``add_one_cuda`` can be registered as a target to JAX's ``ffi_call``.

.. code-block:: python
.. code-block:: python

# Step 1. Load `build/add_one_cuda.so`
import tvm_ffi
mod = tvm_ffi.load_module("build/add_one_cuda.so")
# Step 1. Load `build/add_one_cuda.so`
import tvm_ffi
mod = tvm_ffi.load_module("build/add_one_cuda.so")

# Step 2. Register `mod.add_one_cuda` into JAX
import jax_tvm_ffi
jax_tvm_ffi.register_ffi_target("add_one", mod.add_one_cuda, platform="gpu")
# Step 2. Register `mod.add_one_cuda` into JAX
import jax_tvm_ffi
jax_tvm_ffi.register_ffi_target("add_one", mod.add_one_cuda, platform="gpu")

# Step 3. Run `mod.add_one_cuda` with JAX
import jax
import jax.numpy as jnp
jax_device, *_ = jax.devices("gpu")
x = jnp.array([1, 2, 3, 4, 5], dtype=jnp.float32, device=jax_device)
y = jax.ffi.ffi_call(
"add_one", # name of the registered function
jax.ShapeDtypeStruct(x.shape, x.dtype), # shape and dtype of the output
vmap_method="broadcast_all",
)(x)
print(y)
# Step 3. Run `mod.add_one_cuda` with JAX
import jax
import jax.numpy as jnp
jax_device, *_ = jax.devices("gpu")
x = jnp.array([1, 2, 3, 4, 5], dtype=jnp.float32, device=jax_device)
y = jax.ffi.ffi_call(
"add_one", # name of the registered function
jax.ShapeDtypeStruct(x.shape, x.dtype), # shape and dtype of the output
vmap_method="broadcast_all",
)(x)
print(y)

.. tab-item:: NumPy
.. _ship-to-numpy:

.. literalinclude:: ../../examples/quickstart/load/load_numpy.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]
NumPy/CuPy
~~~~~~~~~~

.. tab-item:: CuPy
.. literalinclude:: ../../examples/quickstart/load/load_numpy.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]

.. literalinclude:: ../../examples/quickstart/load/load_cupy.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]

.. literalinclude:: ../../examples/quickstart/load/load_cupy.py
:language: python
:start-after: [example.begin]
:end-before: [example.end]


Ship Across Languages
@@ -240,14 +244,16 @@ TVM-FFI's core loading mechanism is ABI stable and works across language boundaries.
A single library can be loaded in every language TVM-FFI supports,
without having to recompile different libraries targeting different ABIs or languages.

.. _ship-to-python:

Python
~~~~~~

As shown in the :ref:`previous section<sec-use-across-framework>`, :py:func:`tvm_ffi.load_module` loads a language-
and framework-independent ``add_one_cpu.so`` or ``add_one_cuda.so`` and can be used to incorporate it into all Python
array frameworks that implement the standard `DLPack protocol <https://data-apis.org/array-api/2024.12/design_topics/data_interchange.html>`_.
array frameworks that implement the standard :external+data-api:doc:`DLPack protocol <design_topics/data_interchange>`.

.. _cpp_load:
.. _ship-to-cpp:

C++
~~~
@@ -301,6 +307,8 @@ Compile and run it with:
return 0;
}

.. _ship-to-rust:

Rust
~~~~

@@ -328,6 +336,15 @@ This procedure is identical to those in C++ and Python:
Troubleshooting
---------------

- ``OSError: cannot open shared object file``: Add an rpath (Linux/macOS) or ensure the DLL is on ``PATH`` (Windows). Example run-path: ``-Wl,-rpath,`tvm-ffi-config --libdir```.
- ``OSError: cannot open shared object file``: Add an rpath (Linux/macOS) or ensure the DLL is on ``PATH`` (Windows). Example run-path: ``-Wl,-rpath,$(tvm-ffi-config --libdir)``.
- ``undefined symbol: __tvm_ffi_add_one_cpu``: Ensure you used :c:macro:`TVM_FFI_DLL_EXPORT_TYPED_FUNC` and compiled with default symbol visibility (``-fvisibility=hidden`` is fine; the macro ensures export).
- ``CUDA error: invalid device function``: Rebuild with the correct ``-arch=sm_XX`` for your GPU, or include multiple ``-gencode`` entries.
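To make the first bullet concrete, here is a small sketch that assembles the run-path flag. The library directory is a hypothetical placeholder standing in for the output of ``tvm-ffi-config --libdir``:

```python
# Hypothetical libdir for illustration; in practice substitute the
# output of `tvm-ffi-config --libdir`.
libdir = "/opt/tvm-ffi/lib"

# Linux/macOS: embed a run-path so the dynamic loader finds the library.
rpath_flag = f"-Wl,-rpath,{libdir}"

# Windows: put the DLL directory on the search path instead, e.g.
# os.add_dll_directory(libdir) before loading the extension.
print(rpath_flag)
```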


Further Reading
---------------

- :doc:`Python Packaging <../packaging/python_packaging>` provides details on ABI-agnostic Python wheel building, as well as
exposing functions, classes, and C symbols from TVM-FFI modules.
- :doc:`Stable C ABI <stable_c_abi>` explains the ABI in depth and how it enables its stability guarantees. Its C examples demonstrate
how to interoperate through the stable C ABI from both callee and caller sides.
12 changes: 6 additions & 6 deletions docs/get_started/stable_c_abi.rst
@@ -94,7 +94,7 @@ The following conventions apply when representing values in :cpp:class:`TVMFFIAny`:

- Heap-allocated objects: the last 64 bits store a pointer to the actual object, for example:

* Managed tensor objects that follow `DLPack <https://data-apis.org/array-api/2024.12/design_topics/data_interchange.html#dlpack-an-in-memory-tensor-structure>`_ (i.e. `DLTensor <https://dmlc.github.io/dlpack/latest/c_api.html#c.DLTensor>`_) layout.
* Managed tensor objects that follow :external+data-api:doc:`DLPack <design_topics/data_interchange>` (i.e. `DLTensor <https://dmlc.github.io/dlpack/latest/c_api.html#c.DLTensor>`_) layout.

- Arbitrary objects: the type index identifies the concrete type, and the last 64 bits store a pointer to a reference-counted object in TVM-FFI's object format, for example:
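The tagged-value convention above can be pictured as a small tagged union: a type index plus a 64-bit payload that holds either an immediate value or a heap pointer. A minimal ctypes sketch follows; the field names are illustrative, not the normative ABI definition.

```python
import ctypes

class AnySlot(ctypes.Structure):
    """Illustrative 16-byte tagged slot: a 32-bit type index plus a
    64-bit payload (immediate value or object pointer)."""
    _fields_ = [
        ("type_index", ctypes.c_int32),  # identifies the concrete type
        ("_padding", ctypes.c_uint32),   # keeps the payload 8-byte aligned
        ("v_int64", ctypes.c_int64),     # immediate value or heap pointer
    ]

slot = AnySlot(type_index=1, v_int64=42)
```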

@@ -126,7 +126,7 @@ Stability and Interoperability

**Cross-language.** TVM-FFI implements this calling convention in multiple languages (C, C++, Python, Rust, ...), enabling code written in one language—or generated by a DSL targeting the ABI—to be called from another language.

**Cross-framework.** TVM-FFI uses standard data structures such as `DLPack tensors <https://data-apis.org/array-api/2024.12/design_topics/data_interchange.html#dlpack-an-in-memory-tensor-structure>`_ to represent arrays, so compiled functions can be used from any array framework that implements the DLPack protocol (NumPy, PyTorch, TensorFlow, CuPy, JAX, and others).
**Cross-framework.** TVM-FFI uses standard data structures such as :external+data-api:doc:`DLPack tensors <design_topics/data_interchange>` to represent arrays, so compiled functions can be used from any array framework that implements the DLPack protocol (NumPy, PyTorch, TensorFlow, CuPy, JAX, and others).


Stable ABI in C Code
@@ -142,7 +142,7 @@ TVM FFI's :ref:`C ABI <tvm_ffi_c_abi>` is designed with DSL and ML compilers in mind.
This section shows how to write C code that follows the stable C ABI. Specifically, we provide two examples:

- Callee side: A CPU ``add_one_cpu`` kernel in C that is equivalent to the :ref:`C++ example <cpp_add_one_kernel>`.
- Caller side: A loader and runner in C that invokes the kernel, a direct C translation of the :ref:`C++ example <cpp_load>`.
- Caller side: A loader and runner in C that invokes the kernel, a direct C translation of the :ref:`C++ example <ship-to-cpp>`.

The C code is minimal and dependency-free, so it can serve as a direct reference for DSL compilers that want to expose or invoke kernels through the ABI.

@@ -200,7 +200,7 @@ Build it with either approach:
Caller: Kernel Loader
~~~~~~~~~~~~~~~~~~~~~

Next, a minimal C loader invokes the ``add_one_cpu`` kernel. It is functionally identical to the :ref:`C++ example <cpp_load>` and performs:
Next, a minimal C loader invokes the ``add_one_cpu`` kernel. It is functionally identical to the :ref:`C++ example <ship-to-cpp>` and performs:

- **Step 1**. Load the shared library ``build/add_one_cpu.so`` that contains the kernel;
- **Step 2**. Get function ``add_one_cpu`` from the library;
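The load/lookup/call sequence these steps describe is the classic dynamic-loading pattern. As a self-contained analogy, the sketch below uses libm's ``cos`` as a stand-in for the kernel library; the real C loader goes through the TVM-FFI C API rather than calling raw symbols directly.

```python
import ctypes
import ctypes.util

# Step 1: load a shared library (libm stands in for build/add_one_cpu.so).
libm = ctypes.CDLL(ctypes.util.find_library("m") or "libm.so.6")

# Step 2: look up an exported symbol and declare its signature.
cos = libm.cos
cos.argtypes = [ctypes.c_double]
cos.restype = ctypes.c_double

# Step 3: call through the function pointer.
result = cos(0.0)
```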
@@ -249,6 +249,6 @@ What's Next

**ABI specification.** See the complete ABI specification in :doc:`../concepts/abi_overview`.

**Convenient compiler target.** The stable C ABI is a simple, portable codegen target for DSL compilers. Emit C that follows this ABI to integrate with TVM-FFI and call the result from multiple languages and frameworks. See :doc:`../guides/compiler_integration`.
**Convenient compiler target.** The stable C ABI is a simple, portable codegen target for DSL compilers. Emit C that follows this ABI to integrate with TVM-FFI and call the result from multiple languages and frameworks. See :doc:`../concepts/abi_overview`.

**Rich and extensible type system.** TVM-FFI supports a rich set of types in the stable C ABI: primitive types (integers, floats), DLPack tensors, strings, built-in reference-counted objects (functions, arrays, maps), and user-defined reference-counted objects. See :doc:`../guides/cpp_guide`.
**Rich and extensible type system.** TVM-FFI supports a rich set of types in the stable C ABI: primitive types (integers, floats), DLPack tensors, strings, built-in reference-counted objects (functions, arrays, maps), and user-defined reference-counted objects. See :doc:`../guides/cpp_lang_guide`.
File renamed without changes.
File renamed without changes.