Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Permalink
[torch deploy] add support for Python C extension modules (pytorch#58117
Browse files Browse the repository at this point in the history
)

Summary:
Pull Request resolved: pytorch#58117

Previously it was not possible to load C extension modules with deploy because extension
modules need to link against the Python.h API functions. Since
each libtorchdeploy_interpreter.so had its own copy of these functions, it is not possible
to tell dlopen to resolve symbols in a loaded SO from one of these libraries without exposing
its symbols globally.

This patch adds a custom ELF loader that loads C extension libraries and binds them
to the Python API of the interpreter that loaded them. Simple use of the numpy and regex modules appears to work.

This diff has some limitations:

* 64-bit Linux only. OSX and windows use different formats for shared libraries. 32-bit ELF files are not supported.
* debug info is not immediately available to debuggers. A script for lldb is provided which can be loaded
so that lldb knows about the libraries as they are loaded.
* shared libraries can directly use the Python API, but libraries they depend on
  (via DT_NEEDED entries in their dynamic segment) may not use Python. In the future, we can
  try to detect whether a sub library uses the Python API and load it with our custom loader.
* TLS initialization and library initialization may occur in a different order than what would happen with dlopen,
  potentially leading to some issues running destructors in TLS segments. Use of these C++ features is relatively rare.

Test Plan: Imported from OSS

Reviewed By: suo

Differential Revision: D28435305

Pulled By: zdevito

fbshipit-source-id: 10f046053dd1d250e3c73f2cce8eb945eeba31b6
  • Loading branch information
zdevito authored and facebook-github-bot committed Jul 24, 2021
1 parent e856a45 commit 7c09de8
Show file tree
Hide file tree
Showing 16 changed files with 1,755 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
- name: Ensure no trailing spaces
if: always()
run: |
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)**.diff' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
- name: Ensure no tabs
if: always()
run: |
Expand Down
34 changes: 34 additions & 0 deletions tools/lldb/deploy_debugger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import lldb  # type: ignore[import]
# lldb helper for torch::deploy: mirrors shared libraries mapped by the custom
# ELF loader into lldb's module list so they can be symbolicated.
#
# Load into an lldb instance with:
#   command script import tools/lldb/deploy_debugger.py

target = lldb.debugger.GetSelectedTarget()
# The deploy loader calls __deploy_register_code each time it maps a library;
# the callback below reads the registration info and re-registers it with lldb.
bp = target.BreakpointCreateByRegex("__deploy_register_code")
# The callback body is Python source executed by lldb at each breakpoint hit.
# It reads __deploy_module_info, a struct of four pointer-sized fields:
#   { name, file_data, file_size, load_bias }
# dumps the in-memory ELF image to a temp file (lldb can only add modules from
# files; delete=False so the file outlives the callback), and loads it at the
# recorded bias. Returning False resumes the process without stopping.
bp.SetScriptCallbackBody("""\
process = frame.thread.GetProcess()
target = process.target
symbol_addr = frame.module.FindSymbol("__deploy_module_info").GetStartAddress()
info_addr = symbol_addr.GetLoadAddress(target)
e = lldb.SBError()
ptr_size = 8
str_addr = process.ReadPointerFromMemory(info_addr, e)
file_addr = process.ReadPointerFromMemory(info_addr + ptr_size, e)
file_size = process.ReadPointerFromMemory(info_addr + 2*ptr_size, e)
load_bias = process.ReadPointerFromMemory(info_addr + 3*ptr_size, e)
name = process.ReadCStringFromMemory(str_addr, 512, e)
r = process.ReadMemory(file_addr, file_size, e)
from tempfile import NamedTemporaryFile
from pathlib import Path
stem = Path(name).stem
with NamedTemporaryFile(prefix=stem, suffix='.so', delete=False) as tf:
    tf.write(r)
print("torch_deploy registering debug information for ", tf.name)
cmd1 = f"target modules add {tf.name}"
# print(cmd1)
lldb.debugger.HandleCommand(cmd1)
cmd2 = f"target modules load -f {tf.name} -s {hex(load_bias)}"
# print(cmd2)
lldb.debugger.HandleCommand(cmd2)
return False
""")
4 changes: 4 additions & 0 deletions torch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ if(USE_DEPLOY)

target_compile_options(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS})

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(torch_python_obj PRIVATE -fno-gnu-unique)
endif()

target_include_directories(torch_python_obj PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES})
target_include_directories(torch_python_obj PRIVATE ../third_party/fmt/include)

Expand Down
24 changes: 22 additions & 2 deletions torch/csrc/deploy/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,32 @@ set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}")

add_subdirectory(interpreter)


# we do not want to have torch_deployinterpreter linked against libstdc++ or libc because
# when loading it with RTLD_DEEPBIND it will resolve std::cout/stdout to the copy in libc++/libc instead of the
# ones in the main process (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42679).
# However, we can't just instruct the linker to not link against these libraries because these
# libraries use function versioning. Without linking them, the shared library would not know the right
# symbol versions and instead try to link against the old ones. Our solution is to link the library
# normally then remove the DT_NEEDED entries in the ELF file that instruct the loader to load the sublibraries.
# This gives us the right version numbers but no direct dependency on libstdc++/libc. When loaded, these
# symbols will fall back to resolution through the main executable and get the correct values.
add_executable(remove_dt_needed remove_dt_needed.cpp)
target_link_libraries(remove_dt_needed PRIVATE fmt::fmt-header-only)

add_custom_command(
OUTPUT libtorch_deployinterpreter.o
COMMAND cp $<TARGET_FILE:torch_deployinterpreter> .
# remove the DT_NEEDED entries
COMMAND $<TARGET_FILE:remove_dt_needed> $<TARGET_FILE:torch_deployinterpreter> libtorch_deployinterpreter.so
# package the result into an object we can link into the libdeploy binary.
COMMAND ld -r -b binary -o libtorch_deployinterpreter.o libtorch_deployinterpreter.so
COMMAND rm libtorch_deployinterpreter.so
DEPENDS torch_deployinterpreter
DEPENDS torch_deployinterpreter remove_dt_needed
VERBATIM
)

add_library(torch_deploy libtorch_deployinterpreter.o ${DEPLOY_DIR}/deploy.cpp)
target_link_libraries(torch_deploy PRIVATE crypt pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deploy PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)


Expand All @@ -22,6 +38,10 @@ add_executable(test_deploy ${INTERPRETER_TEST_SOURCES})
target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(test_deploy PUBLIC gtest dl torch_deploy)

add_library(test_deploy_lib SHARED test_deploy_lib.cpp)
add_dependencies(test_deploy_lib cpython)
target_include_directories(test_deploy_lib BEFORE PRIVATE ${PYTHON_INC_DIR})

add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp)
target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(deploy_benchmark PUBLIC torch_deploy)
23 changes: 18 additions & 5 deletions torch/csrc/deploy/deploy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,11 @@ Interpreter::Interpreter(InterpreterManager* manager)

write_tmp_lib(dst, lib_start, lib_end);
fclose(dst);
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
int flags = RTLD_LOCAL | RTLD_LAZY;
#ifndef FBCODE_CAFFE2
flags |= RTLD_DEEPBIND;
#endif
handle_ = dlopen(library_name, flags);
if (!handle_) {
throw std::runtime_error(dlerror());
}
Expand All @@ -212,17 +216,26 @@ Interpreter::Interpreter(InterpreterManager* manager)
// for the debugger to see it.
unlink(library_name_.c_str());

#ifndef FBCODE_CAFFE2
auto deploy_set_self_ptr = (void (*)(void*))dlsym(handle_, "deploy_set_self");
AT_ASSERT(deploy_set_self_ptr);
deploy_set_self_ptr(handle_);
#endif

void* new_interpreter_impl = dlsym(handle_, "new_interpreter_impl");
assert(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(
// NOLINTNEXTLINE(modernize-redundant-void-arg)
((InterpreterImpl * (*)(void)) new_interpreter_impl)());
AT_ASSERT(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(((InterpreterImpl * (*)()) new_interpreter_impl)());
}

Interpreter::~Interpreter() {
  if (handle_) {
    // ensure python uninitialization runs before we dlclose the library
    pImpl_.reset();
#ifndef FBCODE_CAFFE2
    // Unload any C extension modules attached via the custom ELF loader before
    // this interpreter library is unmapped; their teardown may use code that
    // lives in the interpreter SO.
    auto deploy_flush_python_libs =
        (void (*)())dlsym(handle_, "deploy_flush_python_libs");
    // Guard against a missing symbol (e.g. an older interpreter SO) instead of
    // calling through a null function pointer.
    if (deploy_flush_python_libs) {
      deploy_flush_python_libs();
    }
#endif
    dlclose(handle_);
  }
}
Expand Down
24 changes: 13 additions & 11 deletions torch/csrc/deploy/interpreter/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ SET(PYTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../../../")
# Build cpython
SET(PYTHON_INSTALL_DIR "${INTERPRETER_DIR}/cpython")
SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8")
SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8" PARENT_SCOPE)
SET(PYTHON_LIB "${PYTHON_INSTALL_DIR}/lib/libpython3.8.a")
SET(PYTHON_BIN "${PYTHON_INSTALL_DIR}/bin/python3")
ExternalProject_Add(
Expand All @@ -14,11 +15,12 @@ ExternalProject_Add(
GIT_REPOSITORY https://github.com/python/cpython.git
GIT_TAG v3.8.6
UPDATE_COMMAND ""
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cpython_patch.diff
BUILD_IN_SOURCE True
CONFIGURE_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC <SOURCE_DIR>/configure --prefix ${PYTHON_INSTALL_DIR}
CONFIGURE_COMMAND PYTHON_INSTALL_DIR=${PYTHON_INSTALL_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/configure_cpython.sh
BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8
INSTALL_COMMAND make install
BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN}
BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a
LOG_OUTPUT_ON_FAILURE True
)

Expand Down Expand Up @@ -83,25 +85,25 @@ target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR})
add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
# Build the interpreter lib, designed to be standalone and dlopened
# We bake the python and torch_python binding objs into libinterpreter
set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
set(INTERPRETER_LIB_SOURCES
${INTERPRETER_DIR}/interpreter_impl.cpp
${INTERPRETER_DIR}/linker.cpp
${INTERPRETER_DIR}/import_find_sharedfuncptr.cpp
${FROZEN_FILES}
${LINKER_SCRIPT}
)
add_library(torch_deployinterpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET torch_deployinterpreter APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
# need to ensure headers are present before any .cpp in interpreter are compiled,
# but cpp themselves don't clearly depend on cpython so there is a race otherwise
add_dependencies(torch_deployinterpreter cpython)
add_dependencies(torch_python_obj cpython)
target_compile_options(
torch_deployinterpreter PRIVATE
-fvisibility=hidden
)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(torch_deployinterpreter PRIVATE -fno-gnu-unique)
endif()

target_include_directories(torch_deployinterpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(torch_deployinterpreter PUBLIC ${PYTHON_INC_DIR})
target_include_directories(torch_deployinterpreter BEFORE PUBLIC ${PYTHON_INC_DIR})

target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(torch_deployinterpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a)
6 changes: 6 additions & 0 deletions torch/csrc/deploy/interpreter/configure_cpython.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Configure step for the cpython ExternalProject build. Run from the cpython
# source tree with PYTHON_INSTALL_DIR set by CMake (see interpreter/CMakeLists.txt).
# Builds a private copy of OpenSSL into the install prefix so cpython's _ssl
# module can link against it, then configures cpython against that OpenSSL.
set -ex
# NOTE(review): the tarball download is not checksum-verified — confirm this is
# acceptable for the environments that run this build.
wget https://www.openssl.org/source/openssl-1.1.1k.tar.gz
tar xf openssl-1.1.1k.tar.gz
# Build/install OpenSSL into the same prefix cpython installs to.
(cd openssl-1.1.1k && ./config --prefix="$PYTHON_INSTALL_DIR" && make -j32 && make install)
# -fPIC: the resulting libpython3.8.a is later linked into a shared library.
CFLAGS=-fPIC CPPFLAGS=-fPIC ./configure --prefix "$PYTHON_INSTALL_DIR" --with-openssl="$PYTHON_INSTALL_DIR"
14 changes: 14 additions & 0 deletions torch/csrc/deploy/interpreter/cpython_patch.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
diff --git a/Python/dynload_shlib.c b/Python/dynload_shlib.c
index c51f97abd2..83f73e351d 100644
--- a/Python/dynload_shlib.c
+++ b/Python/dynload_shlib.c
@@ -54,8 +54,7 @@ static struct {
} handles[128];
static int nhandles = 0;

-
-dl_funcptr
+dl_funcptr __attribute__((weak))
_PyImport_FindSharedFuncptr(const char *prefix,
const char *shortname,
const char *pathname, FILE *fp)
44 changes: 44 additions & 0 deletions torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <torch/csrc/deploy/interpreter/linker.h>

// Include what we use: the original relied on transitive includes for
// std::stringstream, assert, and FILE.
#include <cassert>
#include <cstdio>
#include <sstream>
#include <vector>

using torch::deploy::CustomLibrary;
using torch::deploy::CustomLibraryPtr;
using torch::deploy::SystemLibrary;

// Libraries loaded through the custom ELF loader; owning them here keeps the
// extension modules mapped until deploy_flush_python_libs() is called.
// NOLINTNEXTLINE
std::vector<CustomLibraryPtr> loaded_files_;
// dlopen handle of the enclosing interpreter SO, set via deploy_set_self()
// from deploy.cpp right after dlopen; used so extension modules resolve the
// Python API against *this* interpreter's copy.
// NOLINTNEXTLINE
static void* deploy_self = nullptr;

extern "C" {

// Record the dlopen handle of this interpreter library (called by deploy.cpp).
__attribute__((visibility("default"))) void deploy_set_self(void* v) {
  deploy_self = v;
}

typedef void (*dl_funcptr)();
// Replaces cpython's _PyImport_FindSharedFuncptr (made weak by
// cpython_patch.diff) so C extension modules are loaded with the custom ELF
// loader instead of dlopen. Returns the module's <prefix>_<shortname> init
// entry point. `fp` is unused; it is part of cpython's signature.
extern "C" dl_funcptr _PyImport_FindSharedFuncptr(
    const char* prefix,
    const char* shortname,
    const char* pathname,
    FILE* fp) {
  const char* args[] = {"deploy"};
  // XXX: we have to manually flush loaded_files_ (see deploy_flush_python_libs)
  // when the manager unloads. Otherwise some libraries can live longer than
  // they are needed, and the process of unloading them might use functionality
  // that itself gets unloaded.
  loaded_files_.emplace_back(CustomLibrary::create(pathname, 1, args));
  CustomLibrary& lib = *loaded_files_.back();
  // Resolve undefined symbols first against the process-wide symbol table,
  // then against this interpreter's private copy of the Python API.
  lib.add_search_library(SystemLibrary::create());
  lib.add_search_library(SystemLibrary::create(deploy_self));
  lib.load();
  std::stringstream ss;
  ss << prefix << "_" << shortname;
  auto r = (dl_funcptr)lib.sym(ss.str().c_str()).value();
  assert(r);
  return r;
}
// Drop all custom-loaded extension libraries; must run before this interpreter
// library is dlclosed (see Interpreter::~Interpreter in deploy.cpp).
__attribute__((visibility("default"))) void deploy_flush_python_libs() {
  loaded_files_.clear();
}
}
6 changes: 4 additions & 2 deletions torch/csrc/deploy/interpreter/interpreter_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];

const char* startup = R"RAW(
import _ssl # must come before _hashlib otherwise ssl's locks will be set to a Python that might no longer exist...
import sys
import importlib.abc
import linecache
Expand Down Expand Up @@ -283,7 +284,8 @@ struct InitLockAcquire {
std::mutex& init_lock_;
};

struct ConcreteInterpreterImpl : public torch::deploy::InterpreterImpl {
struct __attribute__((visibility("hidden"))) ConcreteInterpreterImpl
: public torch::deploy::InterpreterImpl {
ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
Expand Down Expand Up @@ -376,7 +378,7 @@ struct ConcreteInterpreterImpl : public torch::deploy::InterpreterImpl {
std::mutex init_lock_;
};

struct ConcreteInterpreterSessionImpl
struct __attribute__((visibility("hidden"))) ConcreteInterpreterSessionImpl
: public torch::deploy::InterpreterSessionImpl {
ConcreteInterpreterSessionImpl(ConcreteInterpreterImpl* interp)
: interp_(interp) {}
Expand Down
Loading

0 comments on commit 7c09de8

Please sign in to comment.