diff --git a/cmake/Utils/AddGoogleTest.cmake b/cmake/Utils/AddGoogleTest.cmake
index e5a7a849..251d7133 100644
--- a/cmake/Utils/AddGoogleTest.cmake
+++ b/cmake/Utils/AddGoogleTest.cmake
@@ -48,7 +48,7 @@ macro(tvm_ffi_add_googletest target_name)
   target_link_libraries(${target_name} PRIVATE gtest_main)
   gtest_discover_tests(${target_name}
     WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-    TEST_DISCOVERY_TIMEOUT 300
+    TEST_DISCOVERY_TIMEOUT 600
     DISCOVERY_MODE PRE_TEST
     PROPERTIES
       VS_DEBUGGER_WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
diff --git a/docs/get_started/quick_start.md b/docs/get_started/quick_start.md
index 8b127c9e..449c1db4 100644
--- a/docs/get_started/quick_start.md
+++ b/docs/get_started/quick_start.md
@@ -72,6 +72,7 @@ tensor and expose that function as TVM FFI compatible function. The key file str
 examples/quick_start/
 ├── src/
 │   ├── add_one_cpu.cc    # CPU implementation
+│   ├── add_one_c.c       # A low-level C-based implementation
 │   ├── add_one_cuda.cu   # CUDA implementation
 │   └── run_example.cc    # C++ usage example
 ├── run_example.py        # Python usage example
@@ -201,16 +202,81 @@ shows how to run the example exported function in C++.
 #include <tvm/ffi/container/tensor.h>
 #include <tvm/ffi/extra/module.h>
 
-void CallAddOne(DLTensor* x, DLTensor *y) {
-  namespace ffi = tvm::ffi;
+namespace ffi = tvm::ffi;
+
+void CallAddOne(ffi::Tensor x, ffi::Tensor y) {
   ffi::Module mod = ffi::Module::LoadFromFile("build/add_one_cpu.so");
   ffi::Function add_one_cpu = mod->GetFunction("add_one_cpu").value();
   add_one_cpu(x, y);
 }
 ```
 
+## Advanced: Minimal C ABI demonstration
+
+For those who need to understand the low-level C ABI or are implementing
+compiler codegen, we also provide a C-only example:
+
+```c
+#include <tvm/ffi/c_api.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+// Helper to extract DLTensor from TVMFFIAny
+int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor** out) {
+  if (value->type_index == kTVMFFIDLTensorPtr) {
+    *out = (DLTensor*)(value->v_ptr);
+    return 0;
+  }
+  if (value->type_index != kTVMFFITensor) {
+    TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
+    return -1;
+  }
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
+  return 0;
+}
+
+// Raw C FFI function
+int __tvm_ffi_add_one_c(
+    void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result
+) {
+  DLTensor *x, *y;
+
+  // Extract tensor arguments
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
+
+  // Get the current stream for device synchronization (e.g., CUDA);
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
+
+  // Perform the computation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  return 0;  // Success
+}
+```
+To compile this code, add the include directories returned by {py:func}`tvm_ffi.libinfo.find_include_paths` to your include
+path and link against the shared library located by {py:func}`tvm_ffi.libinfo.find_libtvm_ffi`.
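+
+For illustration, the same flags can also be assembled from Python. The snippet
+below is a minimal sketch (not part of the example sources) that mirrors what the
+config helpers print; it assumes `gcc` is available on `PATH`:
+
+```python
+import os
+import subprocess
+
+from tvm_ffi import libinfo
+
+# Include flags for the tvm-ffi and dlpack headers shipped with the package.
+cflags = [
+    f"-I{libinfo.find_include_path()}",
+    f"-I{libinfo.find_dlpack_include_path()}",
+]
+# Link against the shared tvm_ffi library next to the Python package.
+lib_dir = os.path.dirname(libinfo.find_libtvm_ffi())
+subprocess.run(
+    ["gcc", "-shared", "-fPIC", *cflags,
+     "src/add_one_c.c", "-o", "build/add_one_c.so",
+     f"-L{lib_dir}", "-ltvm_ffi"],
+    check=True,
+)
+```
+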
+We also provide the `tvm-ffi-config` command line tool that prints these flags, so you can compile with the following command:
+
+```bash
+gcc -shared -fPIC `tvm-ffi-config --cflags` \
+  src/add_one_c.c -o build/add_one_c.so \
+  `tvm-ffi-config --ldflags` `tvm-ffi-config --libs`
+```
+
+The main takeaway points are:
+- Function symbols follow the naming convention `__tvm_ffi_<name>`
+- The function follows the signature of `TVMFFISafeCallType`
+- Use `TVMFFIAny` to handle dynamic argument types
+- Return `0` for success, `-1` for error (set via `TVMFFIErrorSetRaisedFromCStr`)
+- The result can be compiled with a plain C compiler and loaded in the same way as
+  the other libraries in this example.
+
 ## Summary Key Concepts
 
 - **TVM_FFI_DLL_EXPORT_TYPED_FUNC** exposes a c++ function into tvm-ffi C ABI
-- **DLTensor** is a universal tensor structure that enables zero-copy exchange of array data
+- **ffi::Tensor** is a universal tensor structure that enables zero-copy exchange of array data
 - **Module loading** is provided by tvm ffi APIs in multiple languages.
+- **C ABI** is provided for easy low-level integration.
+
diff --git a/docs/guides/compiler_integration.md b/docs/guides/compiler_integration.md
index 0eaf1ff0..a1355aff 100644
--- a/docs/guides/compiler_integration.md
+++ b/docs/guides/compiler_integration.md
@@ -35,43 +35,49 @@ following options:
    use {c:macro}`TVM_FFI_DLL_EXPORT_TYPED_FUNC` to expose the symbol.
 
 The following code snippet shows C code that corresponds to a
-function performing `add_one` under the ABI. It is reasonably straightforward for
+function performing `add_one_c` under the ABI. It is reasonably straightforward for
 low-level code generators to replicate this C logic.
+You can run this code as part of the [quick start example](https://github.com/apache/tvm-ffi/tree/dev/examples/quick_start).
 ```c
 #include <tvm/ffi/c_api.h>
 #include <tvm/ffi/extra/c_env_api.h>
 
 // Helper function to extract DLTensor from TVMFFIAny (can be inlined into generated code)
-int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor* out) {
+int ReadDLTensorPtr(const TVMFFIAny *value, DLTensor** out) {
   if (value->type_index == kTVMFFIDLTensorPtr) {
-    *out = static_cast<DLTensor*>(value->v_ptr);
+    *out = (DLTensor*)(value->v_ptr);
     return 0;
   }
-  if (value->type_index == kTVMFFITensor) {
+  if (value->type_index != kTVMFFITensor) {
+    // Use TVMFFIErrorSetRaisedFromCStr to set an error that will
+    // be propagated to the caller
     TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
     return -1;
   }
-  *out = reinterpret_cast<DLTensor*>(
-      reinterpret_cast<char*>(value->v_obj) + sizeof(TVMFFIObject));
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
   return 0;
 }
 
 // FFI function implementing add_one operation
-int __tvm_ffi_add_one(
+int __tvm_ffi_add_one_c(
     void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result
 ) {
-  DLTensor *a, *b, *c;
+  DLTensor *x, *y;
 
   // Extract tensor arguments
-  if (ReadDLTensorPtr(&args[0], &a) == -1) return -1;
-  if (ReadDLTensorPtr(&args[1], &b) == -1) return -1;
-  if (ReadDLTensorPtr(&args[2], &c) == -1) return -1;
+  // return -1 on error; the error is set through TVMFFIErrorSetRaisedFromCStr
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
 
   // Get current stream for device synchronization (e.g., CUDA)
-  void* stream = TVMFFIEnvGetStream(a->device.device_type, a->device.device_id);
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
 
-  // Generated computation code would follow here to perform the actual operation
-  // on tensors a, b, c and store result in c
+  // perform the actual operation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  // return 0 on success
   return 0;
 }
 ```
diff --git a/examples/quick_start/CMakeLists.txt b/examples/quick_start/CMakeLists.txt
index 05530988..0f6ea11d 100644
--- a/examples/quick_start/CMakeLists.txt
+++ b/examples/quick_start/CMakeLists.txt
@@ -31,14 +31,21 @@ find_package(tvm_ffi CONFIG REQUIRED)
 
 # use the projects as usual
 add_library(add_one_cpu SHARED src/add_one_cpu.cc)
+add_library(add_one_c SHARED src/add_one_c.c)
 target_link_libraries(add_one_cpu tvm_ffi_header)
 target_link_libraries(add_one_cpu tvm_ffi_shared)
+target_link_libraries(add_one_c tvm_ffi_shared)
 
 # show as add_one_cpu.so
 set_target_properties(
   add_one_cpu PROPERTIES
   PREFIX ""
   SUFFIX ".so"
 )
+set_target_properties(
+  add_one_c PROPERTIES
+  PREFIX ""
+  SUFFIX ".so"
+)
 
 # Check if CUDA is available
 if(NOT WIN32)
diff --git a/examples/quick_start/README.md b/examples/quick_start/README.md
index 002d4375..d4d130e0 100644
--- a/examples/quick_start/README.md
+++ b/examples/quick_start/README.md
@@ -52,7 +52,7 @@ You can also compile the modules directly using flags provided by the
 `tvm-ffi-config` tool.
 
 ```bash
-g++ -shared -fPIC `tvm-ffi-config --cxxflags` \
-  src/add_one_cpu.cc -o build/add_one_cpu.so \
+gcc -shared -fPIC `tvm-ffi-config --cflags` \
+  src/add_one_c.c -o build/add_one_c.so \
   `tvm-ffi-config --ldflags` `tvm-ffi-config --libs`
 ```
diff --git a/examples/quick_start/run_example.py b/examples/quick_start/run_example.py
index c7a2fcbf..e126af14 100644
--- a/examples/quick_start/run_example.py
+++ b/examples/quick_start/run_example.py
@@ -52,6 +52,26 @@ def run_add_one_cpu():
     print(y)
 
 
+def run_add_one_c():
+    """Load the add_one_c module and call the add_one_c function."""
+    mod = tvm_ffi.load_module("build/add_one_c.so")
+
+    x = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
+    y = numpy.empty_like(x)
+    mod.add_one_c(x, y)
+    print("numpy.result after add_one_c(x, y)")
+    print(y)
+
+    if torch is None:
+        return
+
+    x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
+    y = torch.empty_like(x)
+    mod.add_one_c(x, y)
+    print("torch.result after add_one_c(x, y)")
+    print(y)
+
+
 def run_add_one_cuda():
     """Load the add_one_cuda module and call the add_one_cuda function."""
     if torch is None or not torch.cuda.is_available():
@@ -76,6 +96,7 @@ def run_add_one_cuda():
 def main():
     """Main function to run the example."""
     run_add_one_cpu()
+    run_add_one_c()
     run_add_one_cuda()
diff --git a/examples/quick_start/src/add_one_c.c b/examples/quick_start/src/add_one_c.c
new file mode 100644
index 00000000..a12987e2
--- /dev/null
+++ b/examples/quick_start/src/add_one_c.c
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/ffi/c_api.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+// This is a raw C variant of the add_one_cpu function.
+// It demonstrates the low-level mechanism used to construct
+// a tvm ffi compatible function.
+//
+// It can also serve as a reference for implementing
+// a compiler codegen that targets tvm ffi.
+//
+// If you are looking for a higher-level way to construct a tvm ffi compatible
+// function, please refer to add_one_cpu.cc instead.
+/*!
+ * \brief Helper to read a DLTensor from a TVMFFIAny; can be inlined into generated code
+ * \param value The TVMFFIAny to read from
+ * \param out The DLTensor to read into
+ * \return 0 on success, -1 on error
+ */
+int ReadDLTensorPtr(const TVMFFIAny* value, DLTensor** out) {
+  if (value->type_index == kTVMFFIDLTensorPtr) {
+    *out = (DLTensor*)(value->v_ptr);
+    return 0;
+  }
+  if (value->type_index != kTVMFFITensor) {
+    // Use TVMFFIErrorSetRaisedFromCStr to set an error that will
+    // be propagated to the caller
+    TVMFFIErrorSetRaisedFromCStr("ValueError", "Expects a Tensor input");
+    return -1;
+  }
+  *out = (DLTensor*)((char*)(value->v_obj) + sizeof(TVMFFIObject));
+  return 0;
+}
+
+// FFI function implementing the add_one operation
+int __tvm_ffi_add_one_c(  //
+    void* handle, const TVMFFIAny* args, int32_t num_args, TVMFFIAny* result  //
+) {
+  DLTensor *x, *y;
+  // Extract tensor arguments
+  // return -1 on error; the error is set through TVMFFIErrorSetRaisedFromCStr
+  if (ReadDLTensorPtr(&args[0], &x) == -1) return -1;
+  if (ReadDLTensorPtr(&args[1], &y) == -1) return -1;
+
+  // Get current stream for device synchronization (e.g., CUDA)
+  // not needed on CPU, kept here for demonstration purposes
+  void* stream = TVMFFIEnvGetStream(x->device.device_type, x->device.device_id);
+
+  // perform the actual operation
+  for (int i = 0; i < x->shape[0]; ++i) {
+    ((float*)(y->data))[i] = ((float*)(x->data))[i] + 1;
+  }
+  // return 0 on success
+  return 0;
+}
diff --git a/include/tvm/ffi/c_api.h b/include/tvm/ffi/c_api.h
index f13f820b..3dcdf4f8 100644
--- a/include/tvm/ffi/c_api.h
+++ b/include/tvm/ffi/c_api.h
@@ -27,21 +27,6 @@
 #include <dlpack/dlpack.h>
 #include <stdint.h>
 
-/*
- * \brief C-style Allocator that allocates memory for a DLPack tensor.
- * \param prototype The prototype DLTensor to offer details about device and shape.
- * \param out The output DLManagedTensorVersioned.
- * \param error_ctx The context to set the error.
- * \param SetError The function to set the error.
- * \return 0 on success, -1 on failure.
- *         call SetError(error_ctx, kind, message) to set the error kind and message.
- * \note Error propagation via SetError.
- */
-typedef int (*DLPackTensorAllocator)(  //
-    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,  //
-    void (*SetError)(void* error_ctx, const char* kind, const char* message)  //
-);
-
 // Macros to do weak linking
 #ifdef _MSC_VER
 #define TVM_FFI_WEAK __declspec(selectany)
@@ -75,12 +60,29 @@ typedef int (*DLPackTensorAllocator)(  //
 extern "C" {
 #endif
 
+// TODO(tqchen): remove this once dlpack.h is updated
+typedef struct DLManagedTensorVersioned DLManagedTensorVersioned;
+
+/*
+ * \brief C-style Allocator that allocates memory for a DLPack tensor.
+ * \param prototype The prototype DLTensor to offer details about device and shape.
+ * \param out The output DLManagedTensorVersioned.
+ * \param error_ctx The context to set the error.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure.
+ *         call SetError(error_ctx, kind, message) to set the error kind and message.
+ * \note Error propagation via SetError.
+ */
+typedef int (*DLPackTensorAllocator)(  //
+    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,  //
+    void (*SetError)(void* error_ctx, const char* kind, const char* message)  //
+);
+
 #ifdef __cplusplus
 enum TVMFFITypeIndex : int32_t {
 #else
 typedef enum {
 #endif
-
   /*
    * \brief The root type of all FFI objects.
   *
@@ -279,7 +281,6 @@ typedef struct {
     DLDataType v_dtype;    // data type
     DLDevice v_device;     // device
     char v_bytes[8];       // small string
-    char32_t v_char32[2];  // small UCS4 string and Unicode
     uint64_t v_uint64;     // uint64 repr mainly used for hashing
   };
 } TVMFFIAny;
diff --git a/pyproject.toml b/pyproject.toml
index bfe7b428..6c0c4908 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@
 [project]
 name = "apache-tvm-ffi"
-version = "0.1.0b1"
+version = "0.1.0b2"
 description = "tvm ffi"
 authors = [{ name = "TVM FFI team" }]
diff --git a/python/tvm_ffi/config.py b/python/tvm_ffi/config.py
index dcd85c24..4e87caaa 100644
--- a/python/tvm_ffi/config.py
+++ b/python/tvm_ffi/config.py
@@ -48,6 +48,7 @@ def __main__():
     parser.add_argument("--libs", action="store_true", help="Libraries to be linked")
     parser.add_argument("--cython-lib-path", action="store_true", help="Print cython path")
    parser.add_argument("--cxxflags", action="store_true", help="Print cxx flags")
+    parser.add_argument("--cflags", action="store_true", help="Print c flags")
     parser.add_argument("--ldflags", action="store_true", help="Print ld flags")
 
     args = parser.parse_args()
@@ -78,12 +79,15 @@ def __main__():
         include_dir = libinfo.find_include_path()
         dlpack_include_dir = libinfo.find_dlpack_include_path()
         print(f"-I{include_dir} -I{dlpack_include_dir} -std=c++17")
+    if args.cflags:
+        include_dir = libinfo.find_include_path()
+        dlpack_include_dir = libinfo.find_dlpack_include_path()
+        print(f"-I{include_dir} -I{dlpack_include_dir}")
     if args.libs:
         if sys.platform.startswith("win32"):
             print(find_windows_implib())
         else:
             print("-ltvm_ffi")
-
     if args.ldflags:
         if not sys.platform.startswith("win32"):
             print(f"-L{os.path.dirname(libinfo.find_libtvm_ffi())}")
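
As a usage illustration of the new `--cflags` option (a minimal sketch, not part of this diff; it assumes `tvm-ffi-config` and `gcc` are on `PATH`), the CLI can also be driven from Python:

```python
import subprocess

def tvm_ffi_config(flag: str) -> list[str]:
    """Return the whitespace-split output of `tvm-ffi-config <flag>`."""
    out = subprocess.run(
        ["tvm-ffi-config", flag], capture_output=True, text=True, check=True
    )
    return out.stdout.split()

# Unlike --cxxflags, the new --cflags output omits -std=c++17,
# so it is suitable for compiling plain C sources such as add_one_c.c.
cmd = (
    ["gcc", "-shared", "-fPIC"]
    + tvm_ffi_config("--cflags")
    + ["src/add_one_c.c", "-o", "build/add_one_c.so"]
    + tvm_ffi_config("--ldflags")
    + tvm_ffi_config("--libs")
)
subprocess.run(cmd, check=True)
```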