Skip to content

Commit

Permalink
support tokenizers build only in C API mode (microsoft#783)
Browse files Browse the repository at this point in the history
* support tokenizer build only in C API mode

* fix the python build.

* fix the selectedops build

---------

Co-authored-by: Sayan Shaw <[email protected]>
  • Loading branch information
wenbingl and sayanshaw24 authored Aug 2, 2024
1 parent 7851b51 commit be29e28
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 81 deletions.
7 changes: 7 additions & 0 deletions .pipelines/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,13 @@ stages:
ctest -C RelWithDebInfo --output-on-failure
displayName: Build ort-extensions with API enabled and run tests
- bash: |
set -e -x -u
./build.sh -DOCOS_BUILD_PRESET=token_api_only -DOCOS_BUILD_SHARED_LIB=OFF
cd out/Linux
ctest -C RelWithDebInfo --output-on-failure
displayName: Build ort-extensions with tokenizer API only enabled and run tests
- stage: MacOSBuilds
dependsOn: []
Expand Down
43 changes: 32 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ option(OCOS_ENABLE_STATIC_LIB "Enable generating static library" OFF)
option(OCOS_ENABLE_SELECTED_OPLIST "Enable including the selected_ops tool file" OFF)
option(OCOS_ENABLE_C_API "Enable building the C API" OFF)

option(OCOS_BUILD_SHARED_LIB "Enable building the dynamic library" ON)
option(OCOS_BUILD_PYTHON "Enable building the Python package" OFF)
option(OCOS_BUILD_JAVA "Enable building the Java package" OFF)
option(OCOS_BUILD_ANDROID "Enable building the Android package" OFF)
Expand Down Expand Up @@ -698,7 +699,7 @@ endif()

# If building a shared library we can't throw an internal exception type across the library boundary as the type
# will be unknown. Set a compile definition so the code can adjust to the build type.
if(NOT OCOS_ENABLE_STATIC_LIB)
if(OCOS_BUILD_SHARED_LIB)
list(APPEND OCOS_COMPILE_DEFINITIONS OCOS_SHARED_LIBRARY)
endif()

Expand All @@ -724,15 +725,32 @@ list(APPEND ocos_libraries noexcep_operators)
target_compile_definitions(ocos_operators PRIVATE ${OCOS_COMPILE_DEFINITIONS})
target_link_libraries(ocos_operators PRIVATE ${ocos_libraries})

set (file_patterns "shared/lib/*.cc")
if (OCOS_ENABLE_C_API)
list(APPEND file_patterns "shared/api/*.h*" "shared/api/*.c" "shared/api/*.cc")
# Collect the sources used to build the ortcustomops target.
# The shared/lib sources are always included.
file(GLOB _TARGET_LIB_SRC "shared/lib/*.cc")
if(OCOS_ENABLE_C_API)
# With the C API enabled, the C API utility sources and the runner header are always added.
file(GLOB utils_TARGET_SRC "shared/api/c_api_utils.*" "shared/api/runner.hpp")
list(APPEND _TARGET_LIB_SRC ${utils_TARGET_SRC})
# The feature-specific C API sources below are only added when the matching feature is enabled.
if(_HAS_TOKENIZER)
file(GLOB tok_TARGET_SRC "shared/api/c_api_tokenizer.cc" "shared/api/token*")
list(APPEND _TARGET_LIB_SRC ${tok_TARGET_SRC})
endif()
if(OCOS_ENABLE_AUDIO)
file(GLOB audio_TARGET_SRC "shared/api/c_api_feature_extraction.*" "shared/api/speech_*")
list(APPEND _TARGET_LIB_SRC ${audio_TARGET_SRC})
endif()
if(OCOS_ENABLE_CV2)
file(GLOB cv2_TARGET_SRC "shared/api/c_api_processor.*" "shared/api/image_*.*")
list(APPEND _TARGET_LIB_SRC ${cv2_TARGET_SRC})
endif()
endif()

file(GLOB shared_TARGET_LIB_SRC ${file_patterns})

if(NOT OCOS_ENABLE_STATIC_LIB AND CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
add_executable(ortcustomops ${shared_TARGET_LIB_SRC})
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
if(OCOS_ENABLE_STATIC_LIB)
message(FATAL_ERROR "Emscripten build does not support building a static library.")
endif()
# Emscripten does not support building a shared library with custom ops.
# To stay backward compatible with previous versions, we silently turn off the shared library build.
set(OCOS_BUILD_SHARED_LIB OFF CACHE INTERNAL "" FORCE)
add_executable(ortcustomops ${_TARGET_LIB_SRC})
set_target_properties(ortcustomops PROPERTIES LINK_FLAGS " \
-s WASM=1 \
-s NO_EXIT_RUNTIME=0 \
Expand All @@ -751,13 +769,12 @@ if(NOT OCOS_ENABLE_STATIC_LIB AND CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
set_property(TARGET ortcustomops APPEND_STRING PROPERTY LINK_FLAGS " -s ASSERTIONS=0 -s DEMANGLE_SUPPORT=0")
endif()
else()
add_library(ortcustomops STATIC ${shared_TARGET_LIB_SRC})
add_library(ortcustomops STATIC ${_TARGET_LIB_SRC})
if (HAS_SDL)
target_compile_options(ortcustomops PRIVATE "/sdl")
endif()
add_library(onnxruntime_extensions ALIAS ortcustomops)
standardize_output_folder(ortcustomops)
set(_BUILD_SHARED_LIBRARY TRUE)
endif()
set_target_properties(ortcustomops PROPERTIES FOLDER "operators")

Expand Down Expand Up @@ -832,9 +849,12 @@ target_include_directories(ortcustomops PUBLIC "$<TARGET_PROPERTY:ocos_operators

target_link_libraries(ortcustomops PUBLIC ocos_operators)

if(_BUILD_SHARED_LIBRARY)
if(OCOS_BUILD_SHARED_LIB)
file(GLOB shared_TARGET_SRC "shared/*.cc" "shared/*.h")
if (OCOS_ENABLE_C_API)
if (NOT _HAS_TOKENIZER OR NOT OCOS_ENABLE_CV2 OR NOT OCOS_ENABLE_AUDIO)
message(FATAL_ERROR "Shared library build requires GPT2_TOKENIZER, CV2 and AUDIO to be enabled.")
endif()
list(APPEND shared_TARGET_SRC "shared/extensions_c.def")
else()
list(APPEND shared_TARGET_SRC "shared/ortcustomops.def")
Expand Down Expand Up @@ -885,6 +905,7 @@ endif()

# Python package build: expose the library sources under the shared_TARGET_LIB_SRC name
# before including ext_python (presumably consumed there -- confirm in cmake/ext_python.cmake).
if(OCOS_BUILD_PYTHON)
message(STATUS "Python Build is enabled")
set(shared_TARGET_LIB_SRC ${_TARGET_LIB_SRC}) # these library files are also needed for the Python build
include(ext_python)
endif()

Expand Down
142 changes: 72 additions & 70 deletions cmake/ext_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ add_test_target(TARGET ocos_test
LIBRARIES ortcustomops ${ocos_libraries})
target_compile_definitions(ocos_test PRIVATE ${OCOS_COMPILE_DEFINITIONS})

if (OCOS_ENABLE_C_API)
if (OCOS_ENABLE_C_API AND OCOS_BUILD_SHARED_LIB)
file(GLOB pp_api_TEST_SRC
"${TEST_SRC_DIR}/pp_api_test/*.c"
"${TEST_SRC_DIR}/pp_api_test/*.cc"
Expand Down Expand Up @@ -163,73 +163,75 @@ else()
find_library(ONNXRUNTIME onnxruntime HINTS "${ONNXRUNTIME_LIB_DIR}")
endif()

if("${ONNXRUNTIME}" STREQUAL "ONNXRUNTIME-NOTFOUND")
message(WARNING "The prebuilt onnxruntime library was not found, extensions_test will be skipped.")
else()
block()
if(NOT IOS)
set(use_extensions_shared_library 1)
endif()

set(extensions_target $<IF:$<BOOL:${use_extensions_shared_library}>,extensions_shared,ortcustomops>)

file(GLOB shared_TEST_SRC
"${TEST_SRC_DIR}/shared_test/*.cc"
"${TEST_SRC_DIR}/shared_test/*.hpp")

set(extensions_test_libraries ${extensions_target} ${ONNXRUNTIME})

if(use_extensions_shared_library)
list(APPEND extensions_test_libraries ${ocos_libraries})
endif()

# needs to link with stdc++fs in Linux
if(LINUX)
list(APPEND extensions_test_libraries stdc++fs -pthread)
endif()

add_test_target(TARGET extensions_test
TEST_SOURCES ${shared_TEST_SRC}
LIBRARIES ${extensions_test_libraries}
TEST_DATA_DIRECTORIES ${TEST_SRC_DIR}/data)

target_include_directories(extensions_test PRIVATE ${spm_INCLUDE_DIRS})

target_compile_definitions(extensions_test PUBLIC ${OCOS_COMPILE_DEFINITIONS})
if(use_extensions_shared_library)
target_compile_definitions(extensions_test PUBLIC ORT_EXTENSIONS_UNIT_TEST_USE_EXTENSIONS_SHARED_LIBRARY)
endif()

# FUTURE: This is required to use the ORT C++ API with delayed init which must be done conditionally using
# ifdef OCOS_BUILD_SHARED in RegisterCustomOps and where onnxruntime_cxx_api.h is included .
# ---
# We have to remove the OCOS_BUILD_SHARED when building the test code. It is used to delay population of the
# ORT api pointer until RegisterCustomOps is called, but the test code needs to create an ORT env which requires
# the pointer to exist.
# set(test_compile_definitions ${OCOS_COMPILE_DEFINITIONS})
# remove(test_compile_definitions "OCOS_SHARED_LIBRARY")
# target_compile_definitions(extensions_test PUBLIC ${test_compile_definitions})

# Copy onnxruntime DLL files into the same directory as the test binary.
if(WIN32)
file(TO_CMAKE_PATH "${ONNXRUNTIME_LIB_DIR}/*" ONNXRUNTIME_LIB_FILEPATTERN)
file(GLOB ONNXRUNTIME_LIB_FILES CONFIGURE_DEPENDS "${ONNXRUNTIME_LIB_FILEPATTERN}")
add_custom_command(
TARGET extensions_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ONNXRUNTIME_LIB_FILES} $<TARGET_FILE_DIR:extensions_test>)
endif()

# Copy onnxruntime shared library to known location for easy access, e.g., for adb push to emulator or device.
if(ANDROID)
add_custom_command(
TARGET extensions_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ONNXRUNTIME} ${CMAKE_BINARY_DIR}/lib
)
endif()

if (OCOS_ENABLE_C_API)
# avoid copying the same data directory at the same time.
add_dependencies(extensions_test pp_api_test)
endif()
endblock()
# extensions_test is only configured when the shared library build is enabled
# and a prebuilt onnxruntime library was found; otherwise it is skipped.
if (OCOS_BUILD_SHARED_LIB)
if("${ONNXRUNTIME}" STREQUAL "ONNXRUNTIME-NOTFOUND")
message(WARNING "The prebuilt onnxruntime library was not found, extensions_test will be skipped.")
else()
block()
# Link the test against extensions_shared everywhere except iOS, where ortcustomops is used instead.
if(NOT IOS)
set(use_extensions_shared_library 1)
endif()

set(extensions_target $<IF:$<BOOL:${use_extensions_shared_library}>,extensions_shared,ortcustomops>)

file(GLOB shared_TEST_SRC
"${TEST_SRC_DIR}/shared_test/*.cc"
"${TEST_SRC_DIR}/shared_test/*.hpp")

set(extensions_test_libraries ${extensions_target} ${ONNXRUNTIME})

# When the shared library is used, the test also links the libraries in ocos_libraries directly.
if(use_extensions_shared_library)
list(APPEND extensions_test_libraries ${ocos_libraries})
endif()

# needs to link with stdc++fs in Linux
if(LINUX)
list(APPEND extensions_test_libraries stdc++fs -pthread)
endif()

add_test_target(TARGET extensions_test
TEST_SOURCES ${shared_TEST_SRC}
LIBRARIES ${extensions_test_libraries}
TEST_DATA_DIRECTORIES ${TEST_SRC_DIR}/data)

target_include_directories(extensions_test PRIVATE ${spm_INCLUDE_DIRS})

target_compile_definitions(extensions_test PUBLIC ${OCOS_COMPILE_DEFINITIONS})
if(use_extensions_shared_library)
target_compile_definitions(extensions_test PUBLIC ORT_EXTENSIONS_UNIT_TEST_USE_EXTENSIONS_SHARED_LIBRARY)
endif()

# FUTURE: This is required to use the ORT C++ API with delayed init which must be done conditionally using
# ifdef OCOS_BUILD_SHARED in RegisterCustomOps and where onnxruntime_cxx_api.h is included .
# ---
# We have to remove the OCOS_BUILD_SHARED when building the test code. It is used to delay population of the
# ORT api pointer until RegisterCustomOps is called, but the test code needs to create an ORT env which requires
# the pointer to exist.
# set(test_compile_definitions ${OCOS_COMPILE_DEFINITIONS})
# remove(test_compile_definitions "OCOS_SHARED_LIBRARY")
# target_compile_definitions(extensions_test PUBLIC ${test_compile_definitions})

# Copy onnxruntime DLL files into the same directory as the test binary.
if(WIN32)
file(TO_CMAKE_PATH "${ONNXRUNTIME_LIB_DIR}/*" ONNXRUNTIME_LIB_FILEPATTERN)
file(GLOB ONNXRUNTIME_LIB_FILES CONFIGURE_DEPENDS "${ONNXRUNTIME_LIB_FILEPATTERN}")
add_custom_command(
TARGET extensions_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ONNXRUNTIME_LIB_FILES} $<TARGET_FILE_DIR:extensions_test>)
endif()

# Copy onnxruntime shared library to known location for easy access, e.g., for adb push to emulator or device.
if(ANDROID)
add_custom_command(
TARGET extensions_test POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ONNXRUNTIME} ${CMAKE_BINARY_DIR}/lib
)
endif()

if (OCOS_ENABLE_C_API)
# avoid copying the same data directory at the same time.
add_dependencies(extensions_test pp_api_test)
endif()
endblock()
endif()
endif()
2 changes: 2 additions & 0 deletions docs/c_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ Most APIs accept raw data inputs such as audio, image compressed binary formats,
**Image processing:** `OrtxCreateProcessor` can create an image processor object from a pre-defined workflow in JSON format to process image files into a tensor-like data type. An example code snippet can be found [here](../test/pp_api_test/test_processor.cc#L75).

**Audio feature extraction:** `OrtxCreateSpeechFeatureExtractor` creates a speech feature extractor to obtain log mel spectrum data as input for the Whisper model. An example code snippet can be found [here](../test/pp_api_test/test_feature_extraction.cc#L16).

NB: Building onnxruntime-extensions as a shared library requires the OCOS_ENABLE_AUDIO, OCOS_ENABLE_CV2, OCOS_ENABLE_OPENCV_CODECS and OCOS_ENABLE_GPT2_TOKENIZER build flags to be ON so that the binary is fully functional. Only the onnxruntime-extensions static library can be used for a minimal build with the selected operators; in that case, the shared library build can be switched off with `-DOCOS_BUILD_SHARED_LIB=OFF`.
4 changes: 4 additions & 0 deletions test/static_test/test_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class LocaleBaseTest : public testing::Test {
std::string default_locale_;
};

#if defined(ENABLE_WORDPIECE_TOKENIZER) && defined(ENABLE_BERT_TOKENIZER)

TEST(tokenizer, bert_word_split) {
ustring ind("##");
ustring text("A AAA B BB");
Expand Down Expand Up @@ -261,3 +263,5 @@ TEST(tokenizer, basic_tok_eager) {
tokenizer.Compute(test_case, output);
EXPECT_EQ(output.Data(), expect_result);
}

#endif

0 comments on commit be29e28

Please sign in to comment.