diff --git a/docs/.rstcheck.cfg b/docs/.rstcheck.cfg index 5d48d421..080a7ccd 100644 --- a/docs/.rstcheck.cfg +++ b/docs/.rstcheck.cfg @@ -1,5 +1,5 @@ [rstcheck] report_level = warning ignore_directives = automodule, autosummary, currentmodule, toctree, ifconfig, tab-set, collapse, tabs, dropdown -ignore_roles = ref, cpp:class, cpp:func, py:func, c:macro +ignore_roles = ref, cpp:class, cpp:func, py:func, c:macro, external+data-api:doc, external+scikit_build_core:doc ignore_languages = cpp, python diff --git a/docs/concepts/abi_overview.md b/docs/concepts/abi_overview.md index c8e0cd56..125de216 100644 --- a/docs/concepts/abi_overview.md +++ b/docs/concepts/abi_overview.md @@ -15,7 +15,7 @@ -# ABI Overview +# ABI Specification This section provides an overview of the ABI convention of TVM FFI. The ABI is designed around the following key principles: diff --git a/docs/conf.py b/docs/conf.py index bb7f1202..a575ee04 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -157,8 +157,10 @@ "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), - "torch-cpp": ("https://docs.pytorch.org/cppdocs/", None), + "torch-cpp": ("https://docs.pytorch.org/cppdocs", None), "dlpack": ("https://dmlc.github.io/dlpack/latest", None), + "data-api": ("https://data-apis.org/array-api/latest", None), + "scikit_build_core": ("https://scikit-build-core.readthedocs.io/en/stable/", None), } autosummary_generate = True # actually create stub pages diff --git a/docs/guides/build_from_source.md b/docs/dev/build_from_source.md similarity index 100% rename from docs/guides/build_from_source.md rename to docs/dev/build_from_source.md diff --git a/docs/get_started/quickstart.rst b/docs/get_started/quickstart.rst index 1ebb5338..c702c7cb 100644 --- a/docs/get_started/quickstart.rst +++ b/docs/get_started/quickstart.rst @@ -83,7 +83,7 @@ The class :cpp:class:`tvm::ffi::TensorView` allows zero-copy interop 
with tensor - NumPy, CuPy, - PyTorch, JAX, or -- any array type that supports the standard `DLPack protocol `_. +- any array type that supports the standard :external+data-api:doc:`DLPack protocol `. Finally, :cpp:func:`TVMFFIEnvGetStream` can be used in the CUDA code to launch a kernel on the caller's stream. @@ -127,36 +127,34 @@ TVM-FFI natively integrates with CMake via ``find_package`` as demonstrated belo .. code-block:: cmake - # Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_DIR` + # Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_ROOT` find_package(Python COMPONENTS Interpreter REQUIRED) execute_process(COMMAND "${Python_EXECUTABLE}" -m tvm_ffi.config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE tvm_ffi_ROOT) find_package(tvm_ffi CONFIG REQUIRED) # Link C++ target to `tvm_ffi_header` and `tvm_ffi_shared` add_library(add_one_cpu SHARED compile/add_one_cpu.cc) - target_link_libraries(add_one_cpu PRIVATE tvm_ffi_header) - target_link_libraries(add_one_cpu PRIVATE tvm_ffi_shared) + tvm_ffi_configure_target(add_one_cpu) .. group-tab:: CUDA .. code-block:: cmake enable_language(CUDA) - # Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_DIR` + # Run `tvm-ffi-config --cmakedir` to set `tvm_ffi_ROOT` find_package(Python COMPONENTS Interpreter REQUIRED) execute_process(COMMAND "${Python_EXECUTABLE}" -m tvm_ffi.config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE tvm_ffi_ROOT) find_package(tvm_ffi CONFIG REQUIRED) # Link CUDA target to `tvm_ffi_header` and `tvm_ffi_shared` add_library(add_one_cuda SHARED compile/add_one_cuda.cu) - target_link_libraries(add_one_cuda PRIVATE tvm_ffi_header) - target_link_libraries(add_one_cuda PRIVATE tvm_ffi_shared) + tvm_ffi_configure_target(add_one_cuda) **Artifact.** The resulting ``add_one_cpu.so`` and ``add_one_cuda.so`` are minimal libraries that are agnostic to: - Python version/ABI. 
It is not compiled/linked with Python and depends only on TVM-FFI's stable C ABI; - Languages, including C++, Python, Rust or any other language that can interop with C ABI; -- ML frameworks, such as PyTorch, JAX, NumPy, CuPy, or anything with standard `DLPack protocol `_. +- ML frameworks, such as PyTorch, JAX, NumPy, CuPy, or anything with standard :external+data-api:doc:`DLPack protocol `. .. _sec-use-across-framework: @@ -177,60 +175,66 @@ directly. This process is done zero-copy, without any boilerplate code, under ex We can then use these functions in the following ways: -.. tab-set:: +.. _ship-to-pytorch: - .. tab-item:: PyTorch +PyTorch +~~~~~~~ - .. literalinclude:: ../../examples/quickstart/load/load_pytorch.py - :language: python - :start-after: [example.begin] - :end-before: [example.end] +.. literalinclude:: ../../examples/quickstart/load/load_pytorch.py + :language: python + :start-after: [example.begin] + :end-before: [example.end] - .. tab-item:: JAX +.. _ship-to-jax: - Support via `nvidia/jax-tvm-ffi `_. This can be installed via +JAX +~~~ + +Support via `nvidia/jax-tvm-ffi `_. This can be installed via - .. code-block:: bash +.. code-block:: bash - pip install jax-tvm-ffi + pip install jax-tvm-ffi - After installation, ``add_one_cuda`` can be registered as a target to JAX's ``ffi_call``. +After installation, ``add_one_cuda`` can be registered as a target to JAX's ``ffi_call``. - .. code-block:: python +.. code-block:: python - # Step 1. Load `build/add_one_cuda.so` - import tvm_ffi - mod = tvm_ffi.load_module("build/add_one_cuda.so") + # Step 1. Load `build/add_one_cuda.so` + import tvm_ffi + mod = tvm_ffi.load_module("build/add_one_cuda.so") - # Step 2. Register `mod.add_one_cuda` into JAX - import jax_tvm_ffi - jax_tvm_ffi.register_ffi_target("add_one", mod.add_one_cuda, platform="gpu") + # Step 2. Register `mod.add_one_cuda` into JAX + import jax_tvm_ffi + jax_tvm_ffi.register_ffi_target("add_one", mod.add_one_cuda, platform="gpu") - # Step 3. 
Run `mod.add_one_cuda` with JAX - import jax - import jax.numpy as jnp - jax_device, *_ = jax.devices("gpu") - x = jnp.array([1, 2, 3, 4, 5], dtype=jnp.float32, device=jax_device) - y = jax.ffi.ffi_call( - "add_one", # name of the registered function - jax.ShapeDtypeStruct(x.shape, x.dtype), # shape and dtype of the output - vmap_method="broadcast_all", - )(x) - print(y) + # Step 3. Run `mod.add_one_cuda` with JAX + import jax + import jax.numpy as jnp + jax_device, *_ = jax.devices("gpu") + x = jnp.array([1, 2, 3, 4, 5], dtype=jnp.float32, device=jax_device) + y = jax.ffi.ffi_call( + "add_one", # name of the registered function + jax.ShapeDtypeStruct(x.shape, x.dtype), # shape and dtype of the output + vmap_method="broadcast_all", + )(x) + print(y) - .. tab-item:: NumPy +.. _ship-to-numpy: - .. literalinclude:: ../../examples/quickstart/load/load_numpy.py - :language: python - :start-after: [example.begin] - :end-before: [example.end] +NumPy/CuPy +~~~~~~~~~~ - .. tab-item:: CuPy +.. literalinclude:: ../../examples/quickstart/load/load_numpy.py + :language: python + :start-after: [example.begin] + :end-before: [example.end] - .. literalinclude:: ../../examples/quickstart/load/load_cupy.py - :language: python - :start-after: [example.begin] - :end-before: [example.end] + +.. literalinclude:: ../../examples/quickstart/load/load_cupy.py + :language: python + :start-after: [example.begin] + :end-before: [example.end] Ship Across Languages @@ -240,14 +244,16 @@ TVM-FFI's core loading mechanism is ABI stable and works across language boundar A single library can be loaded in every language TVM-FFI supports, without having to recompile different libraries targeting different ABIs or languages. +.. 
_ship-to-python: + Python ~~~~~~ As shown in the :ref:`previous section`, :py:func:`tvm_ffi.load_module` loads a language- and framework-independent ``add_one_cpu.so`` or ``add_one_cuda.so`` and can be used to incorporate it into all Python -array frameworks that implement the standard `DLPack protocol `_. +array frameworks that implement the standard :external+data-api:doc:`DLPack protocol `. -.. _cpp_load: +.. _ship-to-cpp: C++ ~~~ @@ -301,6 +307,8 @@ Compile and run it with: return 0; } +.. _ship-to-rust: + Rust ~~~~ @@ -328,6 +336,15 @@ This procedure is identical to those in C++ and Python: Troubleshooting --------------- -- ``OSError: cannot open shared object file``: Add an rpath (Linux/macOS) or ensure the DLL is on ``PATH`` (Windows). Example run-path: ``-Wl,-rpath,`tvm-ffi-config --libdir```. +- ``OSError: cannot open shared object file``: Add an rpath (Linux/macOS) or ensure the DLL is on ``PATH`` (Windows). Example run-path: ``-Wl,-rpath,$(tvm-ffi-config --libdir)``. - ``undefined symbol: __tvm_ffi_add_one_cpu``: Ensure you used :c:macro:`TVM_FFI_DLL_EXPORT_TYPED_FUNC` and compiled with default symbol visibility (``-fvisibility=hidden`` is fine; the macro ensures export). - ``CUDA error: invalid device function``: Rebuild with the correct ``-arch=sm_XX`` for your GPU, or include multiple ``-gencode`` entries. + + +Further Reading +--------------- + +- :doc:`Python Packaging <../packaging/python_packaging>` provides details on ABI-agnostic Python wheel building, as well as + exposing functions, classes and C symbols from TVM-FFI modules. +- :doc:`Stable C ABI ` explains the ABI in depth and how it enables stability guarantee. Its C examples demonstrate + how to interoperate through the stable C ABI from both callee and caller sides. 
diff --git a/docs/get_started/stable_c_abi.rst b/docs/get_started/stable_c_abi.rst index c372a7bb..bcfe4915 100644 --- a/docs/get_started/stable_c_abi.rst +++ b/docs/get_started/stable_c_abi.rst @@ -94,7 +94,7 @@ The following conventions apply when representing values in :cpp:class:`TVMFFIAn - Heap-allocated objects: the last 64 bits store a pointer to the actual object, for example: - * Managed tensor objects that follow `DLPack `_ (i.e. `DLTensor `_) layout. + * Managed tensor objects that follow :external+data-api:doc:`DLPack ` (i.e. `DLTensor `_) layout. - Arbitrary objects: the type index identifies the concrete type, and the last 64 bits store a pointer to a reference-counted object in TVM-FFI's object format, for example: @@ -126,7 +126,7 @@ Stability and Interoperability **Cross-language.** TVM-FFI implements this calling convention in multiple languages (C, C++, Python, Rust, ...), enabling code written in one language—or generated by a DSL targeting the ABI—to be called from another language. -**Cross-framework.** TVM-FFI uses standard data structures such as `DLPack tensors `_ to represent arrays, so compiled functions can be used from any array framework that implements the DLPack protocol (NumPy, PyTorch, TensorFlow, CuPy, JAX, and others). +**Cross-framework.** TVM-FFI uses standard data structures such as :external+data-api:doc:`DLPack tensors ` to represent arrays, so compiled functions can be used from any array framework that implements the DLPack protocol (NumPy, PyTorch, TensorFlow, CuPy, JAX, and others). Stable ABI in C Code @@ -142,7 +142,7 @@ TVM FFI's :ref:`C ABI ` is designed with DSL and ML compilers in This section shows how to write C code that follows the stable C ABI. Specifically, we provide two examples: - Callee side: A CPU ``add_one_cpu`` kernel in C that is equivalent to the :ref:`C++ example `. -- Caller side: A loader and runner in C that invokes the kernel, a direct C translation of the :ref:`C++ example `. 
+- Caller side: A loader and runner in C that invokes the kernel, a direct C translation of the :ref:`C++ example `. The C code is minimal and dependency-free, so it can serve as a direct reference for DSL compilers that want to expose or invoke kernels through the ABI. @@ -200,7 +200,7 @@ Build it with either approach: Caller: Kernel Loader ~~~~~~~~~~~~~~~~~~~~~ -Next, a minimal C loader invokes the ``add_one_cpu`` kernel. It is functionally identical to the :ref:`C++ example ` and performs: +Next, a minimal C loader invokes the ``add_one_cpu`` kernel. It is functionally identical to the :ref:`C++ example ` and performs: - **Step 1**. Load the shared library ``build/add_one_cpu.so`` that contains the kernel; - **Step 2**. Get function ``add_one_cpu`` from the library; @@ -249,6 +249,6 @@ What's Next **ABI specification.** See the complete ABI specification in :doc:`../concepts/abi_overview`. -**Convenient compiler target.** The stable C ABI is a simple, portable codegen target for DSL compilers. Emit C that follows this ABI to integrate with TVM-FFI and call the result from multiple languages and frameworks. See :doc:`../guides/compiler_integration`. +**Convenient compiler target.** The stable C ABI is a simple, portable codegen target for DSL compilers. Emit C that follows this ABI to integrate with TVM-FFI and call the result from multiple languages and frameworks. See :doc:`../concepts/abi_overview`. -**Rich and extensible type system.** TVM-FFI supports a rich set of types in the stable C ABI: primitive types (integers, floats), DLPack tensors, strings, built-in reference-counted objects (functions, arrays, maps), and user-defined reference-counted objects. See :doc:`../guides/cpp_guide`. +**Rich and extensible type system.** TVM-FFI supports a rich set of types in the stable C ABI: primitive types (integers, floats), DLPack tensors, strings, built-in reference-counted objects (functions, arrays, maps), and user-defined reference-counted objects. 
See :doc:`../guides/cpp_lang_guide`. diff --git a/docs/guides/cpp_guide.md b/docs/guides/cpp_lang_guide.md similarity index 100% rename from docs/guides/cpp_guide.md rename to docs/guides/cpp_lang_guide.md diff --git a/docs/guides/python_guide.md b/docs/guides/python_lang_guide.md similarity index 100% rename from docs/guides/python_guide.md rename to docs/guides/python_lang_guide.md diff --git a/docs/guides/python_packaging.md b/docs/guides/python_packaging.md deleted file mode 100644 index 3934b1fb..00000000 --- a/docs/guides/python_packaging.md +++ /dev/null @@ -1,460 +0,0 @@ - - - - - - - - - - - - - - - - -# Python Binding and Packaging - -This guide explains how to leverage tvm-ffi to expose C++ functions into Python and package them into a wheel. -At a high level, packaging with tvm-ffi offers several benefits: - -- **Ship one wheel** that can be used across Python versions, including free-threaded Python. -- **Multi-language access** to functions from Python, C++, Rust and other languages that support the ABI. -- **ML Systems Interop** with ML frameworks, DSLs, and libraries while maintaining minimal dependency. - -## Directly using Exported Library - -If you just need to expose a simple set of functions, -you can declare an exported symbol in C++: - -```c++ -// Compiles to mylib.so -#include - -int add_one(int x) { - return x + 1; -} - -TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one, add_one) -``` - -You then load the exported function in your Python project via {py:func}`tvm_ffi.load_module`. - -```python -# In your __init__.py -import tvm_ffi - -_LIB = tvm_ffi.load_module("/path/to/mlib.so") - -def add_one(x): - """Expose mylib.add_one""" - return _LIB.add_one(x) -``` - -This approach is like using {py:mod}`ctypes` to load and run DLLs, except we have more powerful features: - -- We can pass in `torch.Tensor` (or any other DLPack-compatible arrays). -- We can pass in a richer set of data structures such as strings, tuples, and dicts. 
-- {py:class}`tvm_ffi.Function` enables natural callbacks to Python lambdas or other languages. -- Exceptions are propagated naturally across language boundaries. - -## Pybind11 and Nanobind style Usage - -For advanced use cases where users may wish to register global functions or custom object types, -we also provide a pybind11/nanobind style API to register functions and custom objects. - -```c++ -#include -#include - -namespace my_ffi_extension { - -namespace ffi = tvm::ffi; - -/*! - * \brief Example of a custom object that is exposed to the FFI library - */ -class IntPairObj : public ffi::Object { - public: - int64_t a; - int64_t b; - - IntPairObj() = default; - IntPairObj(int64_t a, int64_t b) : a(a), b(b) {} - - int64_t GetFirst() const { return this->a; } - - // Required: declare type information - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("my_ffi_extension.IntPair", IntPairObj, ffi::Object); -}; - -/*! - * \brief Defines an explicit reference to IntPairObj - * - * A reference wrapper serves as a reference-counted pointer to the object. - * You can use obj->field to access the fields of the object. 
- */ -class IntPair : public tvm::ffi::ObjectRef { - public: - // Constructor - explicit IntPair(int64_t a, int64_t b) { - data_ = tvm::ffi::make_object(a, b); - } - - // Required: define object reference methods - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(IntPair, tvm::ffi::ObjectRef, IntPairObj); -}; - -void RaiseError(ffi::String msg) { TVM_FFI_THROW(RuntimeError) << msg; } - -TVM_FFI_STATIC_INIT_BLOCK() { - namespace refl = tvm::ffi::reflection; - refl::GlobalDef() - .def("my_ffi_extension.raise_error", RaiseError); - // register object definition - refl::ObjectDef() - .def(refl::init()) - // Example static method that returns the second element of the pair - .def_static("static_get_second", [](IntPair pair) -> int64_t { return pair->b; }) - // Example to bind an instance method - .def("get_first", &IntPairObj::GetFirst) - .def_ro("a", &IntPairObj::a) - .def_ro("b", &IntPairObj::b); -} -} // namespace my_ffi_extension -``` - -Then these functions and objects can be accessed from Python as long as the library is loaded. -You can use {py:func}`tvm_ffi.load_module` or simply use {py:class}`ctypes.CDLL`. Then you can access -the function through {py:func}`tvm_ffi.get_global_func` or {py:func}`tvm_ffi.init_ffi_api`. -We also allow direct exposure of object via {py:func}`tvm_ffi.register_object`. 
- -```python -# __init__.py -import tvm_ffi - -def raise_error(msg: str): - """Wrap raise error function.""" - # Usually we reorganize these functions into a _ffi_api.py and load once - func = tvm_ffi.get_global_func("my_ffi_extension.raise_error") - func(msg) - - -@tvm_ffi.register_object("my_ffi_extension.IntPair") -class IntPair(tvm_ffi.Object): - """IntPair object.""" - - def __init__(self, a: int, b: int) -> None: - """Construct the object.""" - # __ffi_init__ call into the refl::init<> registered - # in the static initialization block of the extension library - self.__ffi_init__(a, b) - - -def run_example(): - pair = IntPair(1, 2) - # prints 1 - print(pair.get_first()) - # prints 2 - print(IntPair.static_get_second(pair)) - # Raises a RuntimeError("error happens") - raise_error("error happens") -``` - -### Relations to Existing Solutions - -Most current binding systems focus on creating one-to-one bindings -that take a source language and bind to an existing target language runtime and ABI. -We deliberately take a more decoupled approach here: - -- Build stable, minimal ABI convention that is agnostic to the target language. -- Create bindings to connect the source and target language to the ABI. - -The focus of this project is the ABI itself which we believe can help the overall ecosystem. -We also anticipate there are possibilities for existing binding generators to also target the tvm-ffi ABI. - -**Design philosophy**. We have the following design philosophies focusing on ML systems. - -- FFI and cross-language interop should be first-class citizens in ML systems rather than an add-on. -- Enable multi-environment support in both source and target languages. -- The same ABI should be minimal and targetable by DSL compilers. - -Of course, there is always a tradeoff. It is by design impossible to support arbitrary advanced language features -in the target language, as different programming languages have their own design considerations. 
-We do believe it is possible to build a universal, effective, and minimal ABI for machine learning -system use cases. Based on the above design philosophies, we focus our cross-language -interaction interface through the FFI ABI for machine learning systems. - -So if you are building projects related to machine learning compilers, runtimes, -libraries, frameworks, DSLs, or generally scientific computing, we encourage you -to try it out. The extension mechanism can likely support features in other domains as well -and we welcome you to try it out as well. - -### Mix with Existing Solutions - -Because the global registry mechanism only relies on the code being linked, -you can also partially use tvm-ffi-based registration together with pybind11/nanobind in your project. -Just add the related code, link to `libtvm_ffi` and make sure you `import tvm_ffi` before importing -your module to ensure related symbols are available. -This approach may help to quickly leverage some of the cross-language features we have. -It also provides more powerful interaction with the host Python language, but of course the tradeoff -is that the final library will now also depend on the Python ABI. - -## Example Project Walk Through - -To get hands-on experience with the packaging flow, -you can try out an example project in our folder. -First, obtain a copy of the tvm-ffi source code. - -```bash -git clone https://github.com/apache/tvm-ffi --recursive -cd tvm-ffi -``` - -The examples are now in the examples folder. You can quickly build -and install the example using the following commands. - -```bash -cd examples/packaging -pip install -v . -``` - -Then you can run examples that leverage the built wheel package. 
- -```bash -python run_example.py add_one -``` - -## Setup pyproject.toml - -A typical tvm-ffi-based project has the following structure: - -```text -├── CMakeLists.txt # CMake build configuration -├── pyproject.toml # Python packaging configuration -├── src/ -│ └── extension.cc # C++ source code -├── python/ -│ └── my_ffi_extension/ -│ ├── __init__.py # Python package initialization -│ ├── base.py # Library loading logic -│ └── _ffi_api.py # FFI API registration -└── README.md # Project documentation -``` - -The `pyproject.toml` file configures the build system and project metadata. - -```toml -[project] -name = "my-ffi-extension" -version = "0.1.0" -# ... more project metadata omitted ... - -[build-system] -requires = ["scikit-build-core>=0.10.0", "apache-tvm-ffi"] -build-backend = "scikit_build_core.build" - -[tool.scikit-build] -# ABI-agnostic wheel -wheel.py-api = "py3" -# ... more build configuration omitted ... -``` - -We use scikit-build-core for building the wheel. Make sure you add tvm-ffi as a build-system requirement. -Importantly, we should set `wheel.py-api` to `py3` to indicate it is ABI-generic. - -### Setup CMakeLists.txt - -The CMakeLists.txt handles the build and linking of the project. -There are two ways you can build with tvm-ffi: - -- Link the pre-built `libtvm_ffi` shipped from the pip package -- Build tvm-ffi from source - -For common cases, using the pre-built library and linking tvm_ffi_shared is sufficient. -To build with the pre-built library, you can do: - -```cmake -cmake_minimum_required(VERSION 3.18) -project(my_ffi_extension) - -find_package(Python COMPONENTS Interpreter REQUIRED) -# find the prebuilt package -find_package(tvm_ffi CONFIG REQUIRED) - -# ... more cmake configuration omitted ... - -# linking the library -target_link_libraries(my_ffi_extension tvm_ffi_shared) -``` - -There are cases where one may want to cross-compile or bundle part of tvm_ffi objects directly -into the project. 
In such cases, you should build from source. - -```cmake -execute_process( - COMMAND "${Python_EXECUTABLE}" -m tvm_ffi.config --sourcedir - OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE tvm_ffi_ROOT) -# add the shipped source code as a cmake subdirectory -add_subdirectory(${tvm_ffi_ROOT} tvm_ffi) - -# ... more cmake configuration omitted ... - -# linking the library -target_link_libraries(my_ffi_extension tvm_ffi_shared) -``` - -Note that it is always safe to build from source, and the extra cost of building tvm-ffi is small -because tvm-ffi is a lightweight library. If you are in doubt, -you can always choose to build tvm-ffi from source. -In Python or other cases when we dynamically load libtvm_ffi shipped with the dedicated pip package, -you do not need to ship libtvm_ffi.so in your package even if you build tvm-ffi from source. -The built objects are only used to supply the linking information. - -### Exposing C++ Functions - -The C++ implementation is defined in `src/extension.cc`. -There are two ways one can expose a function in C++ to the FFI library. -First, `TVM_FFI_DLL_EXPORT_TYPED_FUNC` can be used to expose the function directly as a C symbol that follows the tvm-ffi ABI, -which can later be accessed via `tvm_ffi.load_module`. - -Here's a basic example of the function implementation: - -```c++ -void AddOne(ffi::TensorView x, ffi::TensorView y) { - // ... implementation omitted ... -} - -TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one, my_ffi_extension::AddOne); -``` - -We can also register a function into the global function table with a given name: - -```c++ -void RaiseError(ffi::String msg) { - TVM_FFI_THROW(RuntimeError) << msg; -} - -TVM_FFI_STATIC_INIT_BLOCK() { - namespace refl = tvm::ffi::reflection; - refl::GlobalDef() - .def("my_ffi_extension.raise_error", RaiseError); -} -``` - -Make sure to have a unique name across all registered functions when registering a global function. -Always prefix with a package namespace name to avoid name collisions. 
-The function can then be found via `tvm_ffi.get_global_func(name)` -and is expected to stay throughout the lifetime of the program. - -We recommend using `TVM_FFI_DLL_EXPORT_TYPED_FUNC` for functions that are supposed to be dynamically -loaded (such as JIT scenarios) so they won't be exposed to the global function table. - -### Library Loading in Python - -The base module handles loading the compiled extension: - -```python -import tvm_ffi -import os -import sys - -def _load_lib(): - file_dir = os.path.dirname(os.path.realpath(__file__)) - - # Platform-specific library names - if sys.platform.startswith("win32"): - lib_name = "my_ffi_extension.dll" - elif sys.platform.startswith("darwin"): - lib_name = "my_ffi_extension.dylib" - else: - lib_name = "my_ffi_extension.so" - - lib_path = os.path.join(file_dir, lib_name) - return tvm_ffi.load_module(lib_path) - -_LIB = _load_lib() -``` - -Effectively, it leverages the `tvm_ffi.load_module` call to load the library -extension DLL shipped along with the package. The `_ffi_api.py` contains a function -call to `tvm_ffi.init_ffi_api` that registers all global functions prefixed -with `my_ffi_extension` into the module. - -```python -# _ffi_api.py -import tvm_ffi -from .base import _LIB - -# Register all global functions prefixed with 'my_ffi_extension.' -# This makes functions registered via TVM_FFI_STATIC_INIT_BLOCK available -tvm_ffi.init_ffi_api("my_ffi_extension", __name__) -``` - -Then we can redirect the calls to the related functions. - -```python -from .base import _LIB -from . import _ffi_api - -def add_one(x, y): - # ... docstring omitted ... - return _LIB.add_one(x, y) - -def raise_error(msg): - # ... docstring omitted ... - return _ffi_api.raise_error(msg) -``` - -### Build and Use the Package - -First, build the wheel: - -```bash -pip wheel -v -w dist . 
-``` - -Then install the built wheel: - -```bash -pip install dist/*.whl -``` - -Then you can try it out: - -```python -import torch -import my_ffi_extension - -# Create input and output tensors -x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32) -y = torch.empty_like(x) - -# Call the function -my_ffi_extension.add_one(x, y) -print(y) # Output: tensor([2., 3., 4., 5., 6.]) -``` - -You can also run the following command to see how errors are raised and propagated -across language boundaries: - -```bash -python run_example.py raise_error -``` - -When possible, tvm-ffi will try to preserve backtraces across language boundaries. You will see outputs like: - -```text -File "src/extension.cc", line 45, in void my_ffi_extension::RaiseError(tvm::ffi::String) -``` - -## Wheel Auditing - -When using `auditwheel`, exclude `libtvm_ffi` as it will be shipped with the `tvm_ffi` package. - -```bash -auditwheel repair --exclude libtvm_ffi.so dist/*.whl -``` - -As long as you import `tvm_ffi` first before loading the library, the symbols will be available. diff --git a/docs/guides/rust_guide.md b/docs/guides/rust_lang_guide.md similarity index 97% rename from docs/guides/rust_guide.md rename to docs/guides/rust_lang_guide.md index a1b7ac78..f4c19e78 100644 --- a/docs/guides/rust_guide.md +++ b/docs/guides/rust_lang_guide.md @@ -213,5 +213,5 @@ For detailed API documentation, see the [Rust API Reference](../reference/rust/i ## Related Resources - [Quick Start Guide](../get_started/quickstart.rst) - General TVM FFI introduction -- [C++ Guide](cpp_guide.md) - C++ API usage -- [Python Guide](python_guide.md) - Python API usage +- [C++ Guide](./cpp_lang_guide.md) - C++ API usage +- [Python Guide](./python_lang_guide.md) - Python API usage diff --git a/docs/index.rst b/docs/index.rst index 23e67d0b..53f922cc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,18 +25,12 @@ or reading through the guides and concepts sections. 
Installation ------------ -To install via pip, run: +To install TVM-FFI via pip or uv, run: .. code-block:: bash pip install apache-tvm-ffi - -We also recommend installing the optional package below for improved -torch tensor conversion performance. - -.. code-block:: bash - - pip install torch-c-dlpack-ext + pip install torch-c-dlpack-ext # compatibility package for torch <= 2.9 Table of Contents @@ -53,16 +47,12 @@ Table of Contents :maxdepth: 1 :caption: Guides - guides/python_packaging.md - guides/cpp_packaging.md - guides/cpp_guide.md - guides/python_guide.md - guides/rust_guide.md - guides/cubin_launcher.rst - guides/compiler_integration.md - guides/build_from_source.md guides/kernel_library_guide.rst - + guides/compiler_integration.md + guides/cubin_launcher.rst + guides/python_lang_guide.md + guides/cpp_lang_guide.md + guides/rust_lang_guide.md .. toctree:: :maxdepth: 1 @@ -70,6 +60,12 @@ Table of Contents concepts/abi_overview.md +.. toctree:: + :maxdepth: 1 + :caption: Packaging + + packaging/python_packaging.rst + packaging/cpp_packaging.md .. toctree:: :maxdepth: 1 @@ -78,3 +74,9 @@ Table of Contents reference/python/index.rst reference/cpp/index.rst reference/rust/index.rst + +.. toctree:: + :maxdepth: 1 + :caption: Developer Manual + + dev/build_from_source.md diff --git a/docs/guides/cpp_packaging.md b/docs/packaging/cpp_packaging.md similarity index 100% rename from docs/guides/cpp_packaging.md rename to docs/packaging/cpp_packaging.md diff --git a/docs/packaging/python_packaging.rst b/docs/packaging/python_packaging.rst new file mode 100644 index 00000000..c768768d --- /dev/null +++ b/docs/packaging/python_packaging.rst @@ -0,0 +1,506 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Python Packaging +================ + +This guide walks through a small but complete workflow for packaging a TVM-FFI extension +as a Python wheel. The goal is to help you wire up a simple extension, produce a wheel, +and ship user-friendly typing annotations without needing to know every detail of TVM +internals. We will cover three checkpoints: + +- Export C++ to Python; +- Build Python wheel; +- Automatic Python package generation tools. + +Export C++ to Python +-------------------- + +TVM-FFI offers three ways to expose code: + +- C symbols in TVM FFI ABI: Export code as plain C symbols. This is the recommended way for + most use cases as it keeps the boundary thin and works well with compiler codegen; +- Functions: Reflect functions via the global registry; +- Classes: Register C++ classes derived from :cpp:class:`tvm::ffi::Object` as Python dataclasses. + +Metadata is automatically captured and is later turned into type hints for proper LSP help. + +TVM-FFI ABI (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you prefer to export plain C symbols, TVM-FFI provides helpers to make them accessible +to Python. This option keeps the boundary thin and works well with LLVM compilers where +C symbols are easier to call into. + +.. tabs:: + + .. group-tab:: C++ + + Macro :c:macro:`TVM_FFI_DLL_EXPORT_TYPED_FUNC` exports the function ``AddTwo`` as + a C symbol ``__tvm_ffi_add_two`` inside the shared library. 
+ + .. code-block:: cpp + + static int AddTwo(int x) { + return x + 2; + } + + TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_two, AddTwo); + + .. group-tab:: Python (User) + + Symbol ``__tvm_ffi_add_two`` is made available via ``LIB.add_two`` to users. + + .. code-block:: python + + import my_ffi_extension + my_ffi_extension.LIB.add_two(1) # -> 3 + + .. group-tab:: Python (Generated) + + The shared library is loaded by :py:func:`tvm_ffi.libinfo.load_lib_module`. + + .. code-block:: python + + # File: my_ffi_extension/_ffi_api.py + + LIB = tvm_ffi.libinfo.load_lib_module( + package="my-ffi-extension", + target_name="my_ffi_extension", + ) + + +Global Function +~~~~~~~~~~~~~~~ + +This example registers a function into the global registry and then calls it from Python. +The registry handles type translation, error handling, and metadata. + +.. tabs:: + + .. group-tab:: C++ + + C++ function ``AddOne`` is registered with name ``my_ffi_extension.add_one`` + in the global registry using :cpp:class:`tvm::ffi::reflection::GlobalDef`. + + .. code-block:: cpp + + static int AddOne(int x) { + return x + 1; + } + + TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef() + .def("my_ffi_extension.add_one", AddOne); + } + + .. group-tab:: Python (User) + + The global function is accessible after importing the extension, + and the import path matches the registered name, i.e. ``my_ffi_extension.add_one``. + + .. code-block:: python + + import my_ffi_extension + + my_ffi_extension.add_one(3) # -> 4 + + .. group-tab:: Python (Generated) + + Under the hood, the shared library is loaded by :py:func:`tvm_ffi.init_ffi_api` + during package initialization. + + .. code-block:: python + + # File: my_ffi_extension/_ffi_api.py + + tvm_ffi.init_ffi_api( + namespace="my_ffi_extension", + target_module_name=__name__, + ) + + def add_one(x: int) -> int: ... 
+ + +Class +~~~~~ + +Any class derived from :cpp:class:`tvm::ffi::Object` can be registered, exported and +instantiated from Python. The reflection helper :cpp:class:`tvm::ffi::reflection::ObjectDef` +makes it easy to expose: + +- Fields + + * Immutable field via :cpp:func:`ObjectDef::def_ro `; + * Mutable field via :cpp:func:`ObjectDef::def_rw `; + +- Methods + + * Member method via :cpp:func:`ObjectDef::def `. + * Static method via :cpp:func:`ObjectDef::def_static `; + * Constructors via :cpp:class:`tvm::ffi::reflection::init`. + + +.. tabs:: + + .. group-tab:: C++ + + The example below defines a class ``my_ffi_extension.IntPair`` with + + - two integer fields ``a``, ``b``, + - a constructor, and + - a method ``Sum`` that returns the sum of the two fields. + + .. code-block:: cpp + + class IntPairObj : public ffi::Object { + public: + int64_t a; + int64_t b; + IntPairObj(int64_t a, int64_t b) : a(a), b(b) {} + + int64_t Sum() const { + return a + b; + } + + TVM_FFI_DECLARE_OBJECT_INFO_FINAL( + /*type_key=*/"my_ffi_extension.IntPair", + /*class=*/IntPairObj, + /*parent_class=*/ffi::Object + ); + }; + + TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef() + .def(refl::init()) + .def_rw("a", &IntPairObj::a, "the first field") + .def_rw("b", &IntPairObj::b, "the second field") + .def("sum", &IntPairObj::Sum, "IntPairObj::Sum() method"); + } + + .. group-tab:: Python (User) + + The class is available immediately after importing the extension, + with the import path matching the registered name, i.e. ``my_ffi_extension.IntPair``. + + .. code-block:: python + + import my_ffi_extension + + pair = my_ffi_extension.IntPair(1, 2) + pair.sum() # -> 3 + + .. group-tab:: Python (Generated) + + Type hints are generated for both fields and methods. + + .. 
code-block:: python + + # File: my_ffi_extension/_ffi_api.py (auto generated) + + @tvm_ffi.register_object("my_ffi_extension.IntPair") + class IntPair(tvm_ffi.Object): + a: int + b: int + + def __init__(self, a: int, b: int) -> None: ... + def sum(self) -> int: ... + + +Build Python Wheel +------------------ + +Once the C++ side is ready, TVM-FFI provides convenient helpers to build and ship +ABI-agnostic Python extensions using any standard packaging tool. + +The flow below uses :external+scikit_build_core:doc:`scikit-build-core ` +that drives CMake build, but the same ideas translate to setuptools or other :pep:`517` backends. + +CMake Target +~~~~~~~~~~~~ + +Assume the source tree contains ``src/extension.cc``. Create a ``CMakeLists.txt`` that +creates a shared target ``my_ffi_extension`` and configures it against TVM-FFI. + +.. code-block:: cmake + + add_library(my_ffi_extension SHARED src/extension.cc) + tvm_ffi_configure_target(my_ffi_extension STUB_DIR "./python") + install(TARGETS my_ffi_extension DESTINATION .) + tvm_ffi_install(my_ffi_extension DESTINATION .) + +Function ``tvm_ffi_configure_target`` sets up TVM-FFI include paths, links against the TVM-FFI library, +generates stubs under the specified directory, and optionally debug symbols. + +Function ``tvm_ffi_install`` places necessary information, e.g. debug symbols on macOS, next to +the shared library for proper packaging. + +Python Build Backend +~~~~~~~~~~~~~~~~~~~~ + +Define a :pep:`517` build backend in ``pyproject.toml``, with the following steps: + +- Specify ``apache-tvm-ffi`` as a build requirement, so that CMake can find TVM-FFI; +- Configure ``wheel.py-api`` that indicates a Python ABI-agnostic wheel; +- Specify the source directory of the package via ``wheel.packages``, and the installation + destination via ``wheel.install-dir``. + +.. 
code-block:: toml + + [build-system] + requires = ["scikit-build-core>=0.10.0", "apache-tvm-ffi"] + build-backend = "scikit_build_core.build" + + [tool.scikit-build] + # The wheel is Python ABI-agnostic + wheel.py-api = "py3" + # The package contains the Python module at `python/my_ffi_extension` + wheel.packages = ["python/my_ffi_extension"] + # The install dir matches the import name + wheel.install-dir = "my_ffi_extension" + +Once fully specified, scikit-build-core will invoke CMake and drive the extension building process. + + +Wheel Auditing +~~~~~~~~~~~~~~ + +**Build wheels**. The wheel can be built using the standard workflows, e.g.: + +- `pip workflow `_ or `editable install `_ + +.. code-block:: bash + + # editable install + pip install -e . + # standard wheel build + pip wheel -w dist . + +- `uv workflow `_ + +.. code-block:: bash + + uv build --wheel --out-dir dist . + +- `cibuildwheel `_ for multi-platform build + +.. code-block:: bash + + cibuildwheel --output-dir dist + +**Audit wheels**. In practice, an extra step is usually necessary to remove redundant +and error-prone shared library dependencies. In our case, given ``libtvm_ffi.so`` +(or its respective platform variants) is guaranteed to be loaded by importing ``tvm_ffi``, +we can safely exclude this dependency from the final wheel. + +.. code-block:: bash + + # Linux + auditwheel repair --exclude libtvm_ffi.so dist/*.whl + # macOS + delocate-wheel -w dist -v --exclude libtvm_ffi.dylib dist/*.whl + # Windows + delvewheel repair --exclude tvm_ffi.dll -w dist dist\\*.whl + +Stub Generation Tool +-------------------- + +TVM-FFI comes with a command-line tool ``tvm-ffi-stubgen`` that automates +the generation of type stubs for both global functions and classes. +It turns reflection metadata into proper Python type hints, and generates +corresponding Python code **inline** and **statically**. 
+ +Inline Directives +~~~~~~~~~~~~~~~~~ + +Similar to linter tools, ``tvm-ffi-stubgen`` uses special comments +to identify what to generate and where to write generated code. + +**Directive 1 (Global functions)**. Example below shows a directive +``global/${prefix}`` marking a type stub section of global functions. + +.. code-block:: python + + # tvm-ffi-stubgen(begin): global/my_ext.arith + tvm_ffi.init_ffi_api("my_ext.arith", __name__) + if TYPE_CHECKING: + def add_one(_0: int, /) -> int: ... + def add_two(_0: int, /) -> int: ... + def add_three(_0: int, /) -> int: ... + # tvm-ffi-stubgen(end) + +Running ``tvm-ffi-stubgen`` fills in the function stubs between the +``begin`` and ``end`` markers based on the loaded registry, and in this case +introduces all the global functions named ``my_ext.arith.*``. + +**Directive 2 (Classes)**. Example below shows a directive +``object/${type_key}`` marking the fields and methods of a registered class. + +.. code-block:: python + + @tvm_ffi.register_object("my_ffi_extension.IntPair") + class IntPair(_ffi_Object): + # tvm-ffi-stubgen(begin): object/my_ffi_extension.IntPair + a: int + b: int + if TYPE_CHECKING: + def __init__(self, a: int, b: int) -> None: ... + def sum(self) -> int: ... + # tvm-ffi-stubgen(end) + +Directive-based Generation +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After the TVM-FFI extension is built as a shared library, say at +``build/libmy_ffi_extension.so``, stubs can be generated as follows. + +**Command line tool**. The command below generates stubs for +the package located at ``python/my_ffi_extension``, updating +all sections marked by the directives. + +.. code-block:: bash + + tvm-ffi-stubgen \ + python/my_ffi_extension \ + --dlls build/libmy_ffi_extension.so \ + + +**CMake Integration**. CMake function ``tvm_ffi_configure_target`` +is integrated with this command and can be used to keep stubs up to date +every time the target is built. + +.. 
code-block:: cmake + + tvm_ffi_configure_target(my_ffi_extension + STUB_DIR "python" + ) + +Inside the function, CMake manages to find proper ``--dlls`` arguments +via ``$``. + +Scaffold Missing Directives +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Command line tool**. Beyond updating existing directives, ``tvm-ffi-stubgen`` +can be used to scaffold missing directives if they are not yet present in the +package with a few extra flags. + +.. code-block:: bash + + tvm-ffi-stubgen \ + python/my_ffi_extension \ + --dlls build/libmy_ffi_extension.so \ + --init-pypkg my-ffi-extension \ + --init-lib my_ffi_extension \ + --init-prefix "my_ffi_extension." \ + +- ``--init-pypkg ``: Specifies the name of the Python package to initialize, e.g. ``apache-tvm-ffi``, ``my-ffi-extension``; +- ``--init-lib ``: Specifies the name of the CMake target (shared library) to load for reflection metadata; +- ``--init-prefix ``: Specifies the registry prefix to include for stub generation, e.g. ``my_ffi_extension.``. If names of global functions or classes start with this prefix, they will be included in the generated stubs. + +**CMake Integration**. CMake function ``tvm_ffi_configure_target`` +also supports scaffolding missing directives via the extra options +``STUB_INIT``, ``STUB_PKG``, and ``STUB_PREFIX``. + +.. code-block:: cmake + + tvm_ffi_configure_target(my_ffi_extension + STUB_DIR "python" + STUB_INIT ON + ) + +The ``STUB_INIT`` option instructs CMake to scaffold missing directives +based on the target and package information already specified. + +Other Directives +~~~~~~~~~~~~~~~~ + +All the supported directives are documented via: + +.. code-block:: bash + + tvm-ffi-stubgen --help + + +It includes: + +**Directive 3 (Import section)**. It populates all the imported names used by generated stubs. Example: + +.. 
code-block:: python + + # tvm-ffi-stubgen(begin): import-section + from __future__ import annotations + from ..registry import init_ffi_api as _FFI_INIT_FUNC + from typing import TYPE_CHECKING + if TYPE_CHECKING: + from collections.abc import Mapping, Sequence + from tvm_ffi import Device, Object, Tensor, dtype + from tvm_ffi.testing import TestIntPair + from typing import Any, Callable + # tvm-ffi-stubgen(end) + +**Directive 4 (Export)**. It re-exports names defined in `_ffi_api.__all__` into the current file. Usually +used in ``__init__.py`` to aggregate all exported names. Example: + +.. code-block:: python + + # tvm-ffi-stubgen(begin): export/_ffi_api + from ._ffi_api import * # noqa: F403 + from ._ffi_api import __all__ as _ffi_api__all__ + if "__all__" not in globals(): + __all__ = [] + __all__.extend(_ffi_api__all__) + # tvm-ffi-stubgen(end) + +**Directive 5 (__all__)**. It populates the ``__all__`` variable with all generated +classes and functions, as well as ``LIB`` if present. It's usually placed at the end of +``_ffi_api.py``. Example: + +.. code-block:: python + + __all__ = [ + # tvm-ffi-stubgen(begin): __all__ + "LIB", + "IntPair", + "raise_error", + # tvm-ffi-stubgen(end) + ] + +**Directive 6 (ty-map)**. It maps the type key of a class to Python types used in generation. Example: + +.. code-block:: python + + # tvm-ffi-stubgen(ty-map): ffi.reflection.AccessStep -> ffi.access_path.AccessStep + +means the class with type key ``ffi.reflection.AccessStep``, is instead class ``ffi.access_path.AccessStep`` +in Python. + +**Directive 7 (Import object)**. It injects a custom import into generated code, optionally +TYPE_CHECKING-only. Example: + + +.. code-block:: python + + # tvm-ffi-stubgen(import-object): ffi.Object;False;_ffi_Object + +imports ``ffi.Object`` as ``_ffi_Object`` for use in generated code, +where the second field ``False`` indicates the import is not TYPE_CHECKING-only. + +**Directive 8 (Skip file)**. 
It prevents the stub generation tool from modifying the file. +This is useful when the file contains custom code that should not be altered. diff --git a/docs/reference/python/index.rst b/docs/reference/python/index.rst index 99b652f4..93144d90 100644 --- a/docs/reference/python/index.rst +++ b/docs/reference/python/index.rst @@ -103,7 +103,7 @@ Stream Context C++ Extension --------------- +------------- C++ integration helpers for building and loading inline modules. @@ -114,6 +114,7 @@ C++ integration helpers for building and loading inline modules. cpp.build_inline cpp.load cpp.build + libinfo.load_lib_module NVRTC Utilities --------------- diff --git a/include/tvm/ffi/base_details.h b/include/tvm/ffi/base_details.h index f6280200..7224ac11 100644 --- a/include/tvm/ffi/base_details.h +++ b/include/tvm/ffi/base_details.h @@ -92,10 +92,10 @@ /// \cond Doxygen_Suppress #define TVM_FFI_STATIC_INIT_BLOCK_DEF_(FnName) __attribute__((constructor)) static void FnName() /// \endcond -/* +/*! * \brief Macro that defines a block that will be called during static initialization. * - * \code + * \code{.cpp} * TVM_FFI_STATIC_INIT_BLOCK() { * RegisterFunctions(); * } diff --git a/include/tvm/ffi/c_api.h b/include/tvm/ffi/c_api.h index 1610376f..27d6e8cf 100644 --- a/include/tvm/ffi/c_api.h +++ b/include/tvm/ffi/c_api.h @@ -802,7 +802,7 @@ typedef enum { * * The meta-data record comparison method in tree node and DAG node. * - * \code + * \code{.cpp} * x = VarNode() * v0 = AddNode(x, 1) * v1 = AddNode(x, 1) diff --git a/include/tvm/ffi/container/container_details.h b/include/tvm/ffi/container/container_details.h index 397209f3..09f513b1 100644 --- a/include/tvm/ffi/container/container_details.h +++ b/include/tvm/ffi/container/container_details.h @@ -47,7 +47,7 @@ namespace details { * \tparam ElemType The type of objects stored in the array right after * ArrayType. 
* - * \code + * \code{.cpp} * // Example usage of the template to define a simple array wrapper * class ArrayObj : public tvm::ffi::details::InplaceArrayBase { * public: @@ -72,7 +72,6 @@ namespace details { * // Access the 0th element in the array. * assert(ptr->operator[](0) == fields[0]); * } - * * \endcode */ template diff --git a/include/tvm/ffi/container/tensor.h b/include/tvm/ffi/container/tensor.h index 857bd6bd..3675bb5a 100644 --- a/include/tvm/ffi/container/tensor.h +++ b/include/tvm/ffi/container/tensor.h @@ -30,8 +30,6 @@ #include #include -#include -#include #include #include #include @@ -406,7 +404,7 @@ class Tensor : public ObjectRef { * to create Tensors. * * Example usage: - * \code + * \code{.cpp} * // CPU Allocator * struct CPUNDAlloc { * void AllocData(DLTensor* tensor) { tensor->data = malloc(ffi::GetDataSize(*tensor)); } @@ -431,20 +429,20 @@ class Tensor : public ObjectRef { * } * }; * - * // NVSHMEM Allocator - * struct NVSHMEMNDAlloc { - * void AllocData(DLTensor* tensor) { - * size_t size = tvm::ffi::GetDataSize(*tensor); - * tensor->data = nvshmem_malloc(size); - * TVM_FFI_ICHECK_NE(tensor->data, nullptr) << "nvshmem_malloc failed. size: " << size; - * } - * void FreeData(DLTensor* tensor) { nvshmem_free(tensor->data); } - * }; + * // NVSHMEM Allocator + * struct NVSHMEMNDAlloc { + * void AllocData(DLTensor* tensor) { + * size_t size = tvm::ffi::GetDataSize(*tensor); + * tensor->data = nvshmem_malloc(size); + * TVM_FFI_ICHECK_NE(tensor->data, nullptr) << "nvshmem_malloc failed. 
size: " << size; + * } + * void FreeData(DLTensor* tensor) { nvshmem_free(tensor->data); } + * }; * - * // Allocator usage - * ffi::Tensor cpu_tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), ...); - * ffi::Tensor cuda_tensor = ffi::Tensor::FromNDAlloc(CUDANDAlloc(), ...); - * ffi::Tensor nvshmem_tensor = ffi::Tensor::FromNDAlloc(NVSHMEMNDAlloc(), ...); + * // Allocator usage + * ffi::Tensor cpu_tensor = ffi::Tensor::FromNDAlloc(CPUNDAlloc(), ...); + * ffi::Tensor cuda_tensor = ffi::Tensor::FromNDAlloc(CUDANDAlloc(), ...); + * ffi::Tensor nvshmem_tensor = ffi::Tensor::FromNDAlloc(NVSHMEMNDAlloc(), ...); * \endcode * * \param alloc The NDAllocator. @@ -507,12 +505,8 @@ class Tensor : public ObjectRef { * in the extra/c_env_api.h to create a Tensor from the thread-local environment allocator. * We explicitly pass TVMFFIEnvTensorAlloc to maintain explicit dependency on extra/c_env_api.h * - * \code - * - * ffi::Tensor tensor = ffi::Tensor::FromEnvAlloc( - * TVMFFIEnvTensorAlloc, shape, dtype, device - * ); - * + * \code{.cpp} + * ffi::Tensor tensor = ffi::Tensor::FromEnvAlloc(TVMFFIEnvTensorAlloc, shape, dtype, device); * \endcode * * \param env_alloc TVMFFIEnvTensorAlloc function pointer. diff --git a/include/tvm/ffi/error.h b/include/tvm/ffi/error.h index 95602379..91d55c67 100644 --- a/include/tvm/ffi/error.h +++ b/include/tvm/ffi/error.h @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -69,15 +68,13 @@ namespace ffi { * and return a proper code to tell the frontend caller about * this fact. * - * \code - * + * \code{.cpp} * void ExampleLongRunningFunction() { * if (TVMFFIEnvCheckSignals() != 0) { * throw ::tvm::ffi::EnvErrorAlreadySet(); * } * // do work here * } - * * \endcode */ struct EnvErrorAlreadySet : public std::exception {}; @@ -295,12 +292,10 @@ class ErrorBuilder { /*! 
* \brief Helper macro to throw an error with backtrace and message * - * \code - * - * void ThrowError() { - * TVM_FFI_THROW(RuntimeError) << "error message"; - * } - * + * \code{.cpp} + * void ThrowError() { + * TVM_FFI_THROW(RuntimeError) << "error message"; + * } * \endcode */ #define TVM_FFI_THROW(ErrorKind) \ diff --git a/include/tvm/ffi/extra/cuda/device_guard.h b/include/tvm/ffi/extra/cuda/device_guard.h index 083580f7..01586886 100644 --- a/include/tvm/ffi/extra/cuda/device_guard.h +++ b/include/tvm/ffi/extra/cuda/device_guard.h @@ -34,7 +34,7 @@ namespace ffi { * current CUDA device back to original device index. * * Example usage: - * \code + * \code{.cpp} * void kernel(ffi::TensorView x) { * ffi::CUDADeviceGuard guard(x.device().device_id); * ... diff --git a/include/tvm/ffi/extra/module.h b/include/tvm/ffi/extra/module.h index 6af26c25..5c2142ec 100644 --- a/include/tvm/ffi/extra/module.h +++ b/include/tvm/ffi/extra/module.h @@ -87,7 +87,7 @@ class TVM_FFI_EXTRA_CXX_API ModuleObj : public Object { * \param name The name of the function. * \return The metadata as JSON string if available, nullopt otherwise. * - * \code + * \code{.cpp} * Module mod = Module::LoadFromFile("lib.so"); * Optional metadata = mod->GetFunctionMetadata("my_func"); * if (metadata.has_value()) { @@ -208,7 +208,7 @@ class TVM_FFI_EXTRA_CXX_API ModuleObj : public Object { * When invoking a function on a ModuleObj, such as GetFunction, * use operator-> to get the ModuleObj pointer and invoke the member functions. 
* - * \code + * \code{.cpp} * ffi::Module mod = ffi::Module::LoadFromFile("path/to/module.so"); * ffi::Function func = mod->GetFunction(name); * \endcode diff --git a/include/tvm/ffi/function.h b/include/tvm/ffi/function.h index f2cd61fa..d1cc6933 100644 --- a/include/tvm/ffi/function.h +++ b/include/tvm/ffi/function.h @@ -40,7 +40,6 @@ #include #include -#include #include #include #include @@ -55,7 +54,7 @@ namespace ffi { * \brief Marks the beginning of the safe call that catches exception explicitly * \sa TVM_FFI_SAFE_CALL_END * - * \code + * \code{.cpp} * int TVMFFICStyleFunction() { * TVM_FFI_SAFE_CALL_BEGIN(); * // c++ code region here @@ -90,7 +89,7 @@ namespace ffi { * \brief Macro to check a call to TVMFFISafeCallType and raise exception if error happens. * \param func The function to check. * - * \code + * \code{.cpp} * // calls TVMFFIFunctionCall and raises exception if error happens * TVM_FFI_CHECK_SAFE_CALL(TVMFFITypeKeyToIndex(&type_key_arr, &type_index)); * \endcode @@ -545,18 +544,15 @@ class Function : public ObjectRef { * * This function can be useful to turn an existing exported symbol into a typed function. * - * \code - * + * \code{.cpp} * // An extern "C" function, matching TVMFFISafeCallType * extern "C" int __tvm_ffi_add( * void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny*result * ); - * * // redirect an existing symbol into a typed function * inline int add(int a, int b) { * return tvm::ffi::Function::InvokeExternC(nullptr, __tvm_ffi_add, a, b).cast(); * } - * * \endcode * * \tparam Args The types of the arguments to the extern function. @@ -583,13 +579,13 @@ class Function : public ObjectRef { * \param args Arguments to be passed. * \tparam Args arguments to be passed. 
* - * \code - * // Example code on how to call packed function - * void CallFFIFunction(tvm::ffi::Function f) { - * // call like normal functions by pass in arguments - * // return value is automatically converted back - * int rvalue = f(1, 2.0); - * } + * \code{.cpp} + * // Example code on how to call packed function + * void CallFFIFunction(tvm::ffi::Function f) { + * // call like normal functions by pass in arguments + * // return value is automatically converted back + * int rvalue = f(1, 2.0); + * } * \endcode */ template @@ -669,11 +665,9 @@ class TypedFunction; * We can construct a TypedFunction from a lambda function * with the same signature. * - * \code + * \code{.cpp} * // user defined lambda function. - * auto addone = [](int x)->int { - * return x + 1; - * }; + * auto addone = [](int x)->int { return x + 1; }; * // We can directly convert * // lambda function to TypedFunction * TypedFunction ftyped(addone); @@ -703,7 +697,7 @@ class TypedFunction { * \brief construct from a lambda function with the same signature. * * Example usage: - * \code + * \code{.cpp} * auto typed_lambda = [](int x)->int { return x + 1; } * // construct from packed function * TypedFunction ftyped(typed_lambda, "add_one"); @@ -727,7 +721,7 @@ class TypedFunction { * version that takes a name for the lambda. 
* * Example usage: - * \code + * \code{.cpp} * auto typed_lambda = [](int x)->int { return x + 1; } * // construct from packed function * TypedFunction ftyped(typed_lambda); @@ -748,7 +742,7 @@ class TypedFunction { * \brief copy assignment operator from typed lambda * * Example usage: - * \code + * \code{.cpp} * // construct from packed function * TypedFunction ftyped; * ftyped = [](int x) { return x + 1; } @@ -901,15 +895,12 @@ inline int32_t TypeKeyToIndex(std::string_view type_key) { * * \sa ffi::TypedFunction, TVM_FFI_DLL_EXPORT_TYPED_FUNC_DOC * - * \code - * + * \code{.cpp} * int AddOne_(int x) { * return x + 1; * } - * * // Expose the function as "AddOne" * TVM_FFI_DLL_EXPORT_TYPED_FUNC(AddOne, AddOne_); - * * // Expose the function as "SubOne" * TVM_FFI_DLL_EXPORT_TYPED_FUNC(SubOne, [](int x) { * return x - 1; @@ -957,8 +948,7 @@ inline int32_t TypeKeyToIndex(std::string_view type_key) { * * \sa ffi::TypedFunction, TVM_FFI_DLL_EXPORT_TYPED_FUNC * - * \code - * + * \code{.cpp} * int Add(int a, int b) { * return a + b; * } @@ -979,7 +969,6 @@ inline int32_t TypeKeyToIndex(std::string_view type_key) { * ------- * result : int * Sum of a and b)"); - * * \endcode * * \note The exported symbol name is `__tvm_ffi__doc_` (docstring getter function). diff --git a/include/tvm/ffi/reflection/registry.h b/include/tvm/ffi/reflection/registry.h index 3014108c..3224a9fd 100644 --- a/include/tvm/ffi/reflection/registry.h +++ b/include/tvm/ffi/reflection/registry.h @@ -322,9 +322,9 @@ class ReflectionDefBase { /*! * \brief GlobalDef helper to register a global function. 
* - * \code - * namespace refl = tvm::ffi::reflection; - * refl::GlobalDef().def("my_ffi_extension.my_function", MyFunction); + * \code{.cpp} + * namespace refl = tvm::ffi::reflection; + * refl::GlobalDef().def("my_ffi_extension.my_function", MyFunction); * \endcode */ class GlobalDef : public ReflectionDefBase { @@ -415,19 +415,20 @@ class GlobalDef : public ReflectionDefBase { * \tparam Args The argument types for the constructor. * * Example usage: - * \code - * class ExampleObject : public Object { - * public: - * int64_t v_i64; - * int32_t v_i32; * - * ExampleObject(int64_t v_i64, int32_t v_i32) : v_i64(v_i64), v_i32(v_i32) {} - * TVM_FFI_DECLARE_OBJECT_INFO("example.ExampleObject", ExampleObject, Object); - * }; + * \code{.cpp} + * class ExampleObject : public Object { + * public: + * int64_t v_i64; + * int32_t v_i32; * - * // Register the constructor - * refl::ObjectDef() - * .def(refl::init()); + * ExampleObject(int64_t v_i64, int32_t v_i32) : v_i64(v_i64), v_i32(v_i32) {} + * TVM_FFI_DECLARE_OBJECT_INFO("example.ExampleObject", ExampleObject, Object); + * }; + * + * // Register the constructor + * refl::ObjectDef() + * .def(refl::init()); * \endcode * * \note The object type is automatically deduced from the `ObjectDef` context. @@ -460,9 +461,9 @@ struct init { * \brief Helper to register Object's reflection metadata. * \tparam Class The class type. * - * \code - * namespace refl = tvm::ffi::reflection; - * refl::ObjectDef().def_ro("my_field", &MyClass::my_field); + * \code{.cpp} + * namespace refl = tvm::ffi::reflection; + * refl::ObjectDef().def_ro("my_field", &MyClass::my_field); * \endcode */ template @@ -570,9 +571,10 @@ class ObjectDef : public ReflectionDefBase { * \return Reference to this `ObjectDef` for method chaining. 
* * Example: - * \code - * refl::ObjectDef() - * .def(refl::init(), "Constructor docstring"); + * + * \code{.cpp} + * refl::ObjectDef() + * .def(refl::init(), "Constructor docstring"); * \endcode */ template @@ -662,11 +664,10 @@ class ObjectDef : public ReflectionDefBase { * \tparam Class The class type. * \tparam ExtraArgs The extra arguments. * - * \code - * namespace refl = tvm::ffi::reflection; - * refl::TypeAttrDef().def("func_attr", MyFunc); + * \code{.cpp} + * namespace refl = tvm::ffi::reflection; + * refl::TypeAttrDef().def("func_attr", MyFunc); * \endcode - * */ template >> class TypeAttrDef : public ReflectionDefBase { diff --git a/include/tvm/ffi/rvalue_ref.h b/include/tvm/ffi/rvalue_ref.h index aca5840f..c34a12e2 100644 --- a/include/tvm/ffi/rvalue_ref.h +++ b/include/tvm/ffi/rvalue_ref.h @@ -50,8 +50,7 @@ namespace ffi { * This design allows us to still leverage move semantics for parameters that * need copy on write scenarios (and requires an unique copy). * - * \code - * + * \code{.cpp} * void Example() { * auto append = Function::FromTyped([](RValueRef> ref, int val) -> Array { * Array arr = *std::move(ref); @@ -65,7 +64,6 @@ namespace ffi { * a = append(RvalueRef(std::move(a)), 3); * assert(a.size() == 3); * } - * * \endcode */ template >> diff --git a/python/tvm_ffi/cython/tensor.pxi b/python/tvm_ffi/cython/tensor.pxi index 1f4973d9..8b78c809 100644 --- a/python/tvm_ffi/cython/tensor.pxi +++ b/python/tvm_ffi/cython/tensor.pxi @@ -195,10 +195,10 @@ def from_dlpack( Parameters ---------- - ext_tensor : object - An object supporting `__dlpack__ `_ - and `__dlpack_device__ `_. - require_alignment : int, optional + ext_tensor + An object supporting :py:meth:`__dlpack__ ` + and :py:meth:`__dlpack_device__ `. + require_alignment If greater than zero, require the underlying data pointer to be aligned to this many bytes. Misaligned inputs raise :class:`ValueError`. 
@@ -314,7 +314,7 @@ cdef class Tensor(Object): dltensor, _c_str_dltensor_versioned, _c_dlpack_versioned_deleter) def __dlpack_device__(self) -> tuple[int, int]: - """Implement the standard `__dlpack_device__ `_ protocol.""" # noqa: E501 + """Implement the standard :py:meth:`__dlpack_device__ ` protocol.""" cdef int device_type = self.cdltensor.device.device_type cdef int device_id = self.cdltensor.device.device_id return (device_type, device_id) @@ -327,7 +327,7 @@ cdef class Tensor(Object): dl_device: tuple[int, int] | None = None, copy: bool | None = None, ) -> object: - """Implement the standard `__dlpack__ `_ protocol. + """Implement the standard :py:meth:`__dlpack__ ` protocol. Parameters ---------- diff --git a/python/tvm_ffi/stub/cli.py b/python/tvm_ffi/stub/cli.py index 9b9786c6..f2d34a9f 100644 --- a/python/tvm_ffi/stub/cli.py +++ b/python/tvm_ffi/stub/cli.py @@ -141,14 +141,15 @@ def _find_or_insert_file(path: Path) -> FileInfo: } | C.BUILTIN_TYPE_KEYS # Step 0. Generate missing `_ffi_api.py` and `__init__.py` under each prefix. + prefix_filter = init_cfg.prefix.strip() + if prefix_filter and not prefix_filter.endswith("."): + prefix_filter += "." 
+ root_prefix = prefix_filter.rstrip(".") prefixes: dict[str, list[str]] = collect_type_keys() for prefix in global_funcs: prefixes.setdefault(prefix, []) - - root_ffi_api_py = init_path / init_cfg.prefix.rstrip(".") / "_ffi_api.py" for prefix, obj_names in prefixes.items(): - # TODO(@junrushao): control the prefix to generate stubs for - if prefix.startswith("testing") or prefix.startswith("ffi"): + if not (prefix == root_prefix or prefix.startswith(prefix_filter)): continue funcs = sorted( [] if prefix in defined_func_prefixes else global_funcs.get(prefix, []), @@ -172,7 +173,7 @@ def _find_or_insert_file(path: Path) -> FileInfo: prefix, object_infos, init_cfg, - is_root=root_ffi_api_py.samefile(target_path), + is_root=prefix == root_prefix, ) ) target_file.reload() @@ -448,7 +449,7 @@ def _split_list_arg(arg: str | None) -> list[str]: default="", help=( "Python package name to generate stubs for (e.g. apache-tvm-ffi). " - "Required together with --init-lib, --init-path, and --init-prefix." + "Required together with --init-lib and --init-prefix." 
), ) parser.add_argument( diff --git a/tests/python/test_stubgen.py b/tests/python/test_stubgen.py index c0d3dd5d..0c2eb0cf 100644 --- a/tests/python/test_stubgen.py +++ b/tests/python/test_stubgen.py @@ -19,9 +19,10 @@ from pathlib import Path import pytest +import tvm_ffi.stub.cli as stub_cli from tvm_ffi.core import TypeSchema from tvm_ffi.stub import consts as C -from tvm_ffi.stub.cli import _stage_3 +from tvm_ffi.stub.cli import _stage_2, _stage_3 from tvm_ffi.stub.codegen import ( generate_all, generate_export, @@ -604,3 +605,50 @@ def test_generate_ffi_api_with_objects_imports_parents() -> None: f"{C.STUB_IMPORT_OBJECT} {parent_key};False;_{parent_key.replace('.', '_')}" ) assert parent_import_prompt in code + + +def test_stage_2_filters_prefix_and_marks_root( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + prefixes: dict[str, list[FuncInfo]] = {"demo.sub": [], "demo": [], "other": []} + monkeypatch.setattr(stub_cli, "collect_type_keys", lambda: prefixes) + monkeypatch.setattr(stub_cli, "toposort_objects", lambda objs: []) + + global_funcs = { + "demo.sub": [ + FuncInfo.from_schema( + "demo.sub.add_one", + TypeSchema("Callable", (TypeSchema("int"), TypeSchema("int"))), + ) + ], + "demo": [ + FuncInfo.from_schema( + "demo.add_one", + TypeSchema("Callable", (TypeSchema("int"), TypeSchema("int"))), + ) + ], + "other": [ + FuncInfo.from_schema( + "other.add_one", + TypeSchema("Callable", (TypeSchema("int"), TypeSchema("int"))), + ) + ], + } + _stage_2( + files=[], + ty_map=_default_ty_map(), + init_cfg=InitConfig(pkg="demo-pkg", shared_target="demo_shared", prefix="demo."), + init_path=tmp_path, + global_funcs=global_funcs, + ) + + root_api = tmp_path / "demo" / "_ffi_api.py" + sub_api = tmp_path / "demo" / "sub" / "_ffi_api.py" + other_api = tmp_path / "other" / "_ffi_api.py" + assert root_api.exists() + assert sub_api.exists() + assert not other_api.exists() + root_text = root_api.read_text(encoding="utf-8") + sub_text = 
sub_api.read_text(encoding="utf-8") + assert 'LIB = _FFI_LOAD_LIB("demo-pkg", "demo_shared")' in root_text + assert "LIB =" not in sub_text