diff --git a/cmake/Utils/AddGoogleTest.cmake b/cmake/Utils/AddGoogleTest.cmake
index e5a7a849..251d7133 100644
--- a/cmake/Utils/AddGoogleTest.cmake
+++ b/cmake/Utils/AddGoogleTest.cmake
@@ -48,7 +48,7 @@ macro(tvm_ffi_add_googletest target_name)
   target_link_libraries(${target_name} PRIVATE gtest_main)
   gtest_discover_tests(${target_name}
     WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-    TEST_DISCOVERY_TIMEOUT 300
+    TEST_DISCOVERY_TIMEOUT 600
     DISCOVERY_MODE PRE_TEST
     PROPERTIES
       VS_DEBUGGER_WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
diff --git a/docs/get_started/quick_start.md b/docs/get_started/quick_start.md
index 8b127c9e..449c1db4 100644
--- a/docs/get_started/quick_start.md
+++ b/docs/get_started/quick_start.md
@@ -72,6 +72,7 @@ tensor and expose that function as TVM FFI compatible function. The key file str
 examples/quick_start/
 ├── src/
 │   ├── add_one_cpu.cc    # CPU implementation
+│   ├── add_one_c.c       # A low-level C-based implementation
 │   ├── add_one_cuda.cu   # CUDA implementation
 │   └── run_example.cc    # C++ usage example
 ├── run_example.py        # Python usage example
@@ -201,16 +202,81 @@ shows how to run the example exported function in C++.
 #include <tvm/ffi/container/tensor.h>
 #include <tvm/ffi/extra/module.h>
 
-void CallAddOne(DLTensor* x, DLTensor *y) {
-  namespace ffi = tvm::ffi;
+namespace ffi = tvm::ffi;
+
+void CallAddOne(ffi::Tensor x, ffi::Tensor y) {
   ffi::Module mod = ffi::Module::LoadFromFile("build/add_one_cpu.so");
   ffi::Function add_one_cpu = mod->GetFunction("add_one_cpu").value();
   add_one_cpu(x, y);
 }
 ```
 
+## Advanced: Minimal C ABI demonstration
+
+For those who need to understand the low-level C ABI or are implementing
+compiler codegen, we also provide a C-only example:
+
+```c
+#include <tvm/ffi/c_api.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+// Helper to extract DLTensor from TVMFFIAny
+int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor** out) {
+  if (value->type_index == kTVMFFIDLTensorPtr) {
+    *out = (DLTensor*)(value->v_ptr);
+    return 0;
+  }
+  if (value->type_index != kTVMFFITensor) {
+    TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
+    return -1;
+  }
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
+  return 0;
+}
+
+// Raw C FFI function
+int __tvm_ffi_add_one_c(
+    void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result
+) {
+  DLTensor *x, *y;
+
+  // Extract tensor arguments
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
+
+  // Get the current stream for device synchronization (e.g., CUDA);
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
+
+  // Perform the computation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  return 0;  // Success
+}
+```
+To compile this code, add the include directories returned by {py:func}`tvm_ffi.libinfo.find_include_paths` to your include
+path and link against the shared library located by {py:func}`tvm_ffi.libinfo.find_libtvm_ffi`.
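+
+For illustration, the same flags can also be assembled from Python. The snippet
+below is a minimal sketch (not part of the example sources) that mirrors what the
+config helpers print; it assumes `gcc` is available on `PATH`:
+
+```python
+import os
+import subprocess
+
+from tvm_ffi import libinfo
+
+# Include flags for the tvm-ffi and dlpack headers shipped with the package.
+cflags = [
+    f"-I{libinfo.find_include_path()}",
+    f"-I{libinfo.find_dlpack_include_path()}",
+]
+# Link against the shared tvm_ffi library next to the Python package.
+lib_dir = os.path.dirname(libinfo.find_libtvm_ffi())
+subprocess.run(
+    ["gcc", "-shared", "-fPIC", *cflags,
+     "src/add_one_c.c", "-o", "build/add_one_c.so",
+     f"-L{lib_dir}", "-ltvm_ffi"],
+    check=True,
+)
+```
+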
+We also provide the `tvm-ffi-config` command line tool that prints these flags, so you can compile with the following command:
+
+```bash
+gcc -shared -fPIC `tvm-ffi-config --cflags` \
+  src/add_one_c.c -o build/add_one_c.so \
+  `tvm-ffi-config --ldflags` `tvm-ffi-config --libs`
+```
+
+The main takeaway points are:
+- Function symbols follow the naming convention `__tvm_ffi_<name>`
+- The function follows the signature of `TVMFFISafeCallType`
+- Use `TVMFFIAny` to handle dynamic argument types
+- Return `0` for success, `-1` for error (set via `TVMFFIErrorSetRaisedFromCStr`)
+- The result can be compiled with a plain C compiler and loaded in the same way as
+  the other libraries in this example.
+
 ## Summary Key Concepts
 
 - **TVM_FFI_DLL_EXPORT_TYPED_FUNC** exposes a c++ function into tvm-ffi C ABI
-- **DLTensor** is a universal tensor structure that enables zero-copy exchange of array data
+- **ffi::Tensor** is a universal tensor structure that enables zero-copy exchange of array data
 - **Module loading** is provided by tvm ffi APIs in multiple languages.
+- **C ABI** is provided for easy low-level integration.
+
diff --git a/docs/guides/compiler_integration.md b/docs/guides/compiler_integration.md
index 0eaf1ff0..a1355aff 100644
--- a/docs/guides/compiler_integration.md
+++ b/docs/guides/compiler_integration.md
@@ -35,43 +35,49 @@ following options:
    use {c:macro}`TVM_FFI_DLL_EXPORT_TYPED_FUNC` to expose the symbol.
 
 The following code snippet shows C code that corresponds to a
-function performing `add_one` under the ABI. It is reasonably straightforward for
+function performing `add_one_c` under the ABI. It is reasonably straightforward for
 low-level code generators to replicate this C logic.
+You can run this code as part of the [quick start example](https://github.com/apache/tvm-ffi/tree/dev/examples/quick_start).
 ```c
 #include <tvm/ffi/c_api.h>
 #include <tvm/ffi/extra/c_env_api.h>
 
 // Helper function to extract DLTensor from TVMFFIAny (can be inlined into generated code)
-int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor* out) {
+int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor** out) {
   if (value->type_index == kTVMFFIDLTensorPtr) {
-    *out = static_cast<DLTensor*>(value->v_ptr);
+    *out = (DLTensor*)(value->v_ptr);
     return 0;
   }
-  if (value->type_index == kTVMFFITensor) {
+  if (value->type_index != kTVMFFITensor) {
+    // Use TVMFFIErrorSetRaisedFromCStr to set an error that will
+    // be propagated to the caller
     TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
     return -1;
   }
-  *out = reinterpret_cast<DLTensor*>(
-      reinterpret_cast<char*>(value->v_obj) + sizeof(TVMFFIObject));
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
   return 0;
 }
 
 // FFI function implementing add_one operation
-int __tvm_ffi_add_one(
+int __tvm_ffi_add_one_c(
     void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result
 ) {
-  DLTensor *a, *b, *c;
+  DLTensor *x, *y;
 
   // Extract tensor arguments
-  if (ReadDLTensorPtr(&args[0], &a) == -1) return -1;
-  if (ReadDLTensorPtr(&args[1], &b) == -1) return -1;
-  if (ReadDLTensorPtr(&args[2], &c) == -1) return -1;
+  // return -1 on error; the error is set through TVMFFIErrorSetRaisedFromCStr
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
 
   // Get current stream for device synchronization (e.g., CUDA)
-  void* stream = TVMFFIEnvGetStream(a->device.device_type, a->device.device_id);
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
 
-  // Generated computation code would follow here to perform the actual operation
-  // on tensors a, b, c and store result in c
+  // perform the actual operation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  // return 0 on success
   return 0;
 }
 ```
diff --git a/examples/quick_start/CMakeLists.txt b/examples/quick_start/CMakeLists.txt
index 05530988..0f6ea11d 100644
--- a/examples/quick_start/CMakeLists.txt
+++ b/examples/quick_start/CMakeLists.txt
@@ -31,14 +31,21 @@ find_package(tvm_ffi CONFIG REQUIRED)
 
 # use the projects as usual
 add_library(add_one_cpu SHARED src/add_one_cpu.cc)
+add_library(add_one_c SHARED src/add_one_c.c)
 target_link_libraries(add_one_cpu tvm_ffi_header)
 target_link_libraries(add_one_cpu tvm_ffi_shared)
+target_link_libraries(add_one_c tvm_ffi_shared)
 
 # show as add_one_cpu.so
 set_target_properties(
   add_one_cpu PROPERTIES
   PREFIX ""
   SUFFIX ".so"
 )
+set_target_properties(
+  add_one_c PROPERTIES
+  PREFIX ""
+  SUFFIX ".so"
+)
 
 # Check if CUDA is available
 if(NOT WIN32)
diff --git a/examples/quick_start/README.md b/examples/quick_start/README.md
index 002d4375..d4d130e0 100644
--- a/examples/quick_start/README.md
+++ b/examples/quick_start/README.md
@@ -52,7 +52,7 @@ You can also compile the modules directly using flags provided by the
 `tvm-ffi-config` tool.
 
 ```bash
-g++ -shared -fPIC `tvm-ffi-config --cxxflags` \
-  src/add_one_cpu.cc -o build/add_one_cpu.so \
+gcc -shared -fPIC `tvm-ffi-config --cflags` \
+  src/add_one_c.c -o build/add_one_c.so \
   `tvm-ffi-config --ldflags` `tvm-ffi-config --libs`
 ```
diff --git a/examples/quick_start/run_example.py b/examples/quick_start/run_example.py
index c7a2fcbf..e126af14 100644
--- a/examples/quick_start/run_example.py
+++ b/examples/quick_start/run_example.py
@@ -52,6 +52,26 @@ def run_add_one_cpu():
     print(y)
 
 
+def run_add_one_c():
+    """Load the add_one_c module and call the add_one_c function."""
+    mod = tvm_ffi.load_module("build/add_one_c.so")
+
+    x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
+    y = numpy.empty_like(x)
+    mod.add_one_c(x, y)
+    print("numpy.result after add_one_c(x, y)")
+    print(y)
+
+    if torch is None:
+        return
+
+    x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
+    y = torch.empty_like(x)
+    mod.add_one_c(x, y)
+    print("torch.result after add_one_c(x, y)")
+    print(y)
+
+
 def run_add_one_cuda():
     """Load the add_one_cuda module and call the add_one_cuda function."""
     if torch is None or not torch.cuda.is_available():
@@ -76,6 +96,7 @@ def run_add_one_cuda():
 def main():
     """Main function to run the example."""
     run_add_one_cpu()
+    run_add_one_c()
     run_add_one_cuda()
diff --git a/examples/quick_start/src/add_one_c.c b/examples/quick_start/src/add_one_c.c
new file mode 100644
index 00000000..a12987e2
--- /dev/null
+++ b/examples/quick_start/src/add_one_c.c
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/ffi/c_api.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+// This is a raw C variant of the add_one_cpu function.
+// It demonstrates the low-level mechanism used to construct
+// a tvm ffi compatible function.
+//
+// It can also serve as a reference for implementing
+// a compiler codegen that targets tvm ffi.
+//
+// If you are looking for a higher-level way to construct a tvm ffi compatible
+// function, please refer to add_one_cpu.cc instead.
+/*!
+ * \brief Helper to read a DLTensor from a TVMFFIAny; can be inlined into generated code
+ * \param value The TVMFFIAny to read from
+ * \param out The DLTensor to read into
+ * \return 0 on success, -1 on error
+ */
+int ReadDLTensorPtr(const TVMFFIAny* value, DLTensor** out) {
+  if (value->type_index == kTVMFFIDLTensorPtr) {
+    *out = (DLTensor*)(value->v_ptr);
+    return 0;
+  }
+  if (value->type_index != kTVMFFITensor) {
+    // Use TVMFFIErrorSetRaisedFromCStr to set an error that will
+    // be propagated to the caller
+    TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
+    return -1;
+  }
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
+  return 0;
+}
+
+// FFI function implementing the add_one operation
+int __tvm_ffi_add_one_c(  //
+    void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result  //
+) {
+  DLTensor *x, *y;
+  // Extract tensor arguments
+  // return -1 on error; the error is set through TVMFFIErrorSetRaisedFromCStr
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
+
+  // Get current stream for device synchronization (e.g., CUDA)
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
+
+  // perform the actual operation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  // return 0 on success
+  return 0;
+}
diff --git a/include/tvm/ffi/c_api.h b/include/tvm/ffi/c_api.h
index f13f820b..3dcdf4f8 100644
--- a/include/tvm/ffi/c_api.h
+++ b/include/tvm/ffi/c_api.h
@@ -27,21 +27,6 @@
 #include <dlpack/dlpack.h>
 #include <stdint.h>
 
-/*
- * \brief C-style Allocator that allocates memory for a DLPack tensor.
- * \param prototype The prototype DLTensor to offer details about device and shape.
- * \param out The output DLManagedTensorVersioned.
- * \param error_ctx The context to set the error.
- * \param SetError The function to set the error.
- * \return 0 on success, -1 on failure.
- *         call SetError(error_ctx, kind, message) to set the error kind and message.
- * \note Error propagation via SetError.
- */
-typedef int (*DLPackTensorAllocator)(  //
-    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,  //
-    void (*SetError)(void* error_ctx, const char* kind, const char* message)  //
-);
-
 // Macros to do weak linking
 #ifdef _MSC_VER
 #define TVM_FFI_WEAK __declspec(selectany)
@@ -75,12 +60,29 @@ typedef int (*DLPackTensorAllocator)(  //
 extern "C" {
 #endif
 
+// TODO(tqchen): remove this once dlpack.h is updated
+typedef struct DLManagedTensorVersioned DLManagedTensorVersioned;
+
+/*
+ * \brief C-style Allocator that allocates memory for a DLPack tensor.
+ * \param prototype The prototype DLTensor to offer details about device and shape.
+ * \param out The output DLManagedTensorVersioned.
+ * \param error_ctx The context to set the error.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure.
+ *         call SetError(error_ctx, kind, message) to set the error kind and message.
+ * \note Error propagation via SetError.
+ */
+typedef int (*DLPackTensorAllocator)(  //
+    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,  //
+    void (*SetError)(void* error_ctx, const char* kind, const char* message)  //
+);
+
 #ifdef __cplusplus
 enum TVMFFITypeIndex : int32_t {
 #else
 typedef enum {
 #endif
-
   /*
    * \brief The root type of all FFI objects.
   *
@@ -279,7 +281,6 @@ typedef struct {
     DLDataType v_dtype;    // data type
     DLDevice v_device;     // device
     char v_bytes[8];       // small string
-    char32_t v_char32[2];  // small UCS4 string and Unicode
     uint64_t v_uint64;     // uint64 repr mainly used for hashing
   };
 } TVMFFIAny;
diff --git a/pyproject.toml b/pyproject.toml
index bfe7b428..6c0c4908 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@
 [project]
 name = "apache-tvm-ffi"
-version = "0.1.0b1"
+version = "0.1.0b2"
 description = "tvm ffi"
 authors = [{ name = "TVM FFI team" }]
diff --git a/python/tvm_ffi/config.py b/python/tvm_ffi/config.py
index dcd85c24..4e87caaa 100644
--- a/python/tvm_ffi/config.py
+++ b/python/tvm_ffi/config.py
@@ -48,6 +48,7 @@ def __main__():
     parser.add_argument("--libs", action="store_true", help="Libraries to be linked")
     parser.add_argument("--cython-lib-path", action="store_true", help="Print cython path")
    parser.add_argument("--cxxflags", action="store_true", help="Print cxx flags")
+    parser.add_argument("--cflags", action="store_true", help="Print c flags")
     parser.add_argument("--ldflags", action="store_true", help="Print ld flags")
 
     args = parser.parse_args()
@@ -78,12 +79,15 @@ def __main__():
         include_dir = libinfo.find_include_path()
         dlpack_include_dir = libinfo.find_dlpack_include_path()
         print(f"-I{include_dir} -I{dlpack_include_dir} -std=c++17")
+    if args.cflags:
+        include_dir = libinfo.find_include_path()
+        dlpack_include_dir = libinfo.find_dlpack_include_path()
+        print(f"-I{include_dir} -I{dlpack_include_dir}")
     if args.libs:
         if sys.platform.startswith("win32"):
             print(find_windows_implib())
         else:
             print("-ltvm_ffi")
-
     if args.ldflags:
         if not sys.platform.startswith("win32"):
             print(f"-L{os.path.dirname(libinfo.find_libtvm_ffi())}")
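
As a usage illustration of the new `--cflags` option (a minimal sketch, not part of this diff; it assumes `tvm-ffi-config` and `gcc` are on `PATH`), the CLI can also be driven from Python:

```python
import subprocess

def tvm_ffi_config(flag: str) -> list[str]:
    """Return the whitespace-split output of `tvm-ffi-config <flag>`."""
    out = subprocess.run(
        ["tvm-ffi-config", flag], capture_output=True, text=True, check=True
    )
    return out.stdout.split()

# Unlike --cxxflags, the new --cflags output omits -std=c++17,
# so it is suitable for compiling plain C sources such as add_one_c.c.
cmd = (
    ["gcc", "-shared", "-fPIC"]
    + tvm_ffi_config("--cflags")
    + ["src/add_one_c.c", "-o", "build/add_one_c.so"]
    + tvm_ffi_config("--ldflags")
    + tvm_ffi_config("--libs")
)
subprocess.run(cmd, check=True)
```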