Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Permalink
[torch deploy] add support for Python C extension modules (pytorch#58117
Browse files Browse the repository at this point in the history
)

Summary:
Pull Request resolved: pytorch#58117

Previously it was not possible to load C extension modules with deploy because extension
modules need to link against the Python.h API functions. Since
each libtorchdeploy_interpreter.so had its own copy of these functions, it is not possible
to tell dlopen to resolve symbols in a loaded SO from one of these libraries without exposing
its symbols globally.

This patch adds a custom ELF loader that loads C extension libraries and binds them
to the Python API of the interpreter that loaded them. Simple use of the numpy and regex modules appears to work.

This diff has some limitations:

* 64-bit Linux only. OSX and windows use different formats for shared libraries. 32-bit ELF files are not supported.
* debug info is not immediately available to debuggers. A script for lldb is provided which can be loaded
so that lldb knows about the libraries as they are loaded.
* shared libraries can directly use the Python API, but libraries they depend on
  (via DT_NEEDED entries in their dynamic segment) may not use Python. In the future, we can
  try to detect whether a sub library uses the Python API and load it with our custom loader.
* TLS initialization and library initialization may occur in a different order than what would happen with dlopen,
  potentially leading to some issues running destructors in TLS segments. Use of these C++ features is relatively rare.

Test Plan: Imported from OSS

Reviewed By: suo

Differential Revision: D28435305

Pulled By: zdevito

fbshipit-source-id: 10f046053dd1d250e3c73f2cce8eb945eeba31b6
  • Loading branch information
zdevito authored and facebook-github-bot committed Jul 24, 2021
1 parent e856a45 commit 7c09de8
Show file tree
Hide file tree
Showing 16 changed files with 1,755 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
- name: Ensure no trailing spaces
if: always()
run: |
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)**.diff' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
- name: Ensure no tabs
if: always()
run: |
Expand Down
34 changes: 34 additions & 0 deletions tools/lldb/deploy_debugger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import lldb  # type: ignore[import]
# lldb helper for torch::deploy: mirrors shared libraries mapped by the custom
# ELF loader into lldb's module list so they can be symbolicated.
#
# Load into an lldb instance with:
#   command script import tools/lldb/deploy_debugger.py

target = lldb.debugger.GetSelectedTarget()
# The deploy loader calls __deploy_register_code each time it maps a library;
# the callback below reads the registration info and re-registers it with lldb.
bp = target.BreakpointCreateByRegex("__deploy_register_code")
# The callback body is Python source executed by lldb at each breakpoint hit.
# It reads __deploy_module_info, a struct of four pointer-sized fields:
#   { name, file_data, file_size, load_bias }
# dumps the in-memory ELF image to a temp file (lldb can only add modules from
# files; delete=False so the file outlives the callback), and loads it at the
# recorded bias. Returning False resumes the process without stopping.
bp.SetScriptCallbackBody("""\
process = frame.thread.GetProcess()
target = process.target
symbol_addr = frame.module.FindSymbol("__deploy_module_info").GetStartAddress()
info_addr = symbol_addr.GetLoadAddress(target)
e = lldb.SBError()
ptr_size = 8
str_addr = process.ReadPointerFromMemory(info_addr, e)
file_addr = process.ReadPointerFromMemory(info_addr + ptr_size, e)
file_size = process.ReadPointerFromMemory(info_addr + 2*ptr_size, e)
load_bias = process.ReadPointerFromMemory(info_addr + 3*ptr_size, e)
name = process.ReadCStringFromMemory(str_addr, 512, e)
r = process.ReadMemory(file_addr, file_size, e)
from tempfile import NamedTemporaryFile
from pathlib import Path
stem = Path(name).stem
with NamedTemporaryFile(prefix=stem, suffix='.so', delete=False) as tf:
    tf.write(r)
print("torch_deploy registering debug information for ", tf.name)
cmd1 = f"target modules add {tf.name}"
# print(cmd1)
lldb.debugger.HandleCommand(cmd1)
cmd2 = f"target modules load -f {tf.name} -s {hex(load_bias)}"
# print(cmd2)
lldb.debugger.HandleCommand(cmd2)
return False
""")
4 changes: 4 additions & 0 deletions torch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ if(USE_DEPLOY)

target_compile_options(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS})

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(torch_python_obj PRIVATE -fno-gnu-unique)
endif()

target_include_directories(torch_python_obj PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES})
target_include_directories(torch_python_obj PRIVATE ../third_party/fmt/include)

Expand Down
24 changes: 22 additions & 2 deletions torch/csrc/deploy/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,32 @@ set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}")

add_subdirectory(interpreter)


# we do not want to have torch_deployinterpreter linked against libstdc++ or libc because
# when loading it with RTLD_DEEPBIND it will resolve std::cout/stdout to the copy in libc++/libc instead of the
# ones in the main process (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42679).
# However, we can't just instruct the linker to not link against these libraries because these
# libraries use function versioning. Without linking them, the shared library would not know the right
# symbol versions and instead try to link against the old ones. Our solution is to link the library
# normally then remove the DT_NEEDED entries in the ELF file that instruct the loader to load the sublibraries.
# This gives us the right version numbers but no direct dependency on libstdc++/libc. When loaded, these
# symbols will fall back to resolution through the main executable and get the correct values.
add_executable(remove_dt_needed remove_dt_needed.cpp)
target_link_libraries(remove_dt_needed PRIVATE fmt::fmt-header-only)

add_custom_command(
OUTPUT libtorch_deployinterpreter.o
COMMAND cp $<TARGET_FILE:torch_deployinterpreter> .
# remove the DT_NEEDED entries
COMMAND $<TARGET_FILE:remove_dt_needed> $<TARGET_FILE:torch_deployinterpreter> libtorch_deployinterpreter.so
# package the result into an object we can link into the libdeploy binary.
COMMAND ld -r -b binary -o libtorch_deployinterpreter.o libtorch_deployinterpreter.so
COMMAND rm libtorch_deployinterpreter.so
DEPENDS torch_deployinterpreter
DEPENDS torch_deployinterpreter remove_dt_needed
VERBATIM
)

add_library(torch_deploy libtorch_deployinterpreter.o ${DEPLOY_DIR}/deploy.cpp)
target_link_libraries(torch_deploy PRIVATE crypt pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deploy PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite)


Expand All @@ -22,6 +38,10 @@ add_executable(test_deploy ${INTERPRETER_TEST_SOURCES})
target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(test_deploy PUBLIC gtest dl torch_deploy)

add_library(test_deploy_lib SHARED test_deploy_lib.cpp)
add_dependencies(test_deploy_lib cpython)
target_include_directories(test_deploy_lib BEFORE PRIVATE ${PYTHON_INC_DIR})

add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp)
target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch)
target_link_libraries(deploy_benchmark PUBLIC torch_deploy)
23 changes: 18 additions & 5 deletions torch/csrc/deploy/deploy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,11 @@ Interpreter::Interpreter(InterpreterManager* manager)

write_tmp_lib(dst, lib_start, lib_end);
fclose(dst);
handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY);
int flags = RTLD_LOCAL | RTLD_LAZY;
#ifndef FBCODE_CAFFE2
flags |= RTLD_DEEPBIND;
#endif
handle_ = dlopen(library_name, flags);
if (!handle_) {
throw std::runtime_error(dlerror());
}
Expand All @@ -212,17 +216,26 @@ Interpreter::Interpreter(InterpreterManager* manager)
// for the debugger to see it.
unlink(library_name_.c_str());

#ifndef FBCODE_CAFFE2
auto deploy_set_self_ptr = (void (*)(void*))dlsym(handle_, "deploy_set_self");
AT_ASSERT(deploy_set_self_ptr);
deploy_set_self_ptr(handle_);
#endif

void* new_interpreter_impl = dlsym(handle_, "new_interpreter_impl");
assert(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(
// NOLINTNEXTLINE(modernize-redundant-void-arg)
((InterpreterImpl * (*)(void)) new_interpreter_impl)());
AT_ASSERT(new_interpreter_impl);
pImpl_ = std::unique_ptr<InterpreterImpl>(((InterpreterImpl * (*)()) new_interpreter_impl)());
}

Interpreter::~Interpreter() {
  if (handle_) {
    // ensure python uninitialization runs before we dlclose the library
    pImpl_.reset();
#ifndef FBCODE_CAFFE2
    // Unload any C extension modules attached via the custom ELF loader before
    // this interpreter library is unmapped; their teardown may use code that
    // lives in the interpreter SO.
    auto deploy_flush_python_libs =
        (void (*)())dlsym(handle_, "deploy_flush_python_libs");
    // Guard against a missing symbol (e.g. an older interpreter SO) instead of
    // calling through a null function pointer.
    if (deploy_flush_python_libs) {
      deploy_flush_python_libs();
    }
#endif
    dlclose(handle_);
  }
}
Expand Down
24 changes: 13 additions & 11 deletions torch/csrc/deploy/interpreter/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ SET(PYTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../../../")
# Build cpython
SET(PYTHON_INSTALL_DIR "${INTERPRETER_DIR}/cpython")
SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8")
SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8" PARENT_SCOPE)
SET(PYTHON_LIB "${PYTHON_INSTALL_DIR}/lib/libpython3.8.a")
SET(PYTHON_BIN "${PYTHON_INSTALL_DIR}/bin/python3")
ExternalProject_Add(
Expand All @@ -14,11 +15,12 @@ ExternalProject_Add(
GIT_REPOSITORY https://github.com/python/cpython.git
GIT_TAG v3.8.6
UPDATE_COMMAND ""
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cpython_patch.diff
BUILD_IN_SOURCE True
CONFIGURE_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC <SOURCE_DIR>/configure --prefix ${PYTHON_INSTALL_DIR}
CONFIGURE_COMMAND PYTHON_INSTALL_DIR=${PYTHON_INSTALL_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/configure_cpython.sh
BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8
INSTALL_COMMAND make install
BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN}
BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a
LOG_OUTPUT_ON_FAILURE True
)

Expand Down Expand Up @@ -83,25 +85,25 @@ target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR})
add_library(torch_python_static STATIC $<TARGET_OBJECTS:torch_python_obj>)
# Build the interpreter lib, designed to be standalone and dlopened
# We bake the python and torch_python binding objs into libinterpreter
set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script")
set(INTERPRETER_LIB_SOURCES
${INTERPRETER_DIR}/interpreter_impl.cpp
${INTERPRETER_DIR}/linker.cpp
${INTERPRETER_DIR}/import_find_sharedfuncptr.cpp
${FROZEN_FILES}
${LINKER_SCRIPT}
)
add_library(torch_deployinterpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT})
set_property(TARGET torch_deployinterpreter APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}")
# need to ensure headers are present before any .cpp in interpreter are compiled,
# but cpp themselves don't clearly depend on cpython so there is a race otherwise
add_dependencies(torch_deployinterpreter cpython)
add_dependencies(torch_python_obj cpython)
target_compile_options(
torch_deployinterpreter PRIVATE
-fvisibility=hidden
)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(torch_deployinterpreter PRIVATE -fno-gnu-unique)
endif()

target_include_directories(torch_deployinterpreter PRIVATE ${INTERPRETER_DIR})
target_include_directories(torch_deployinterpreter PUBLIC ${PYTHON_INC_DIR})
target_include_directories(torch_deployinterpreter BEFORE PUBLIC ${PYTHON_INC_DIR})

target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static)
target_link_libraries(torch_deployinterpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins
target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite)
target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a)
6 changes: 6 additions & 0 deletions torch/csrc/deploy/interpreter/configure_cpython.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Configure step for the cpython ExternalProject build. Run from the cpython
# source tree with PYTHON_INSTALL_DIR set by CMake (see interpreter/CMakeLists.txt).
# Builds a private copy of OpenSSL into the install prefix so cpython's _ssl
# module can link against it, then configures cpython against that OpenSSL.
set -ex
# NOTE(review): the tarball download is not checksum-verified — confirm this is
# acceptable for the environments that run this build.
wget https://www.openssl.org/source/openssl-1.1.1k.tar.gz
tar xf openssl-1.1.1k.tar.gz
# Build/install OpenSSL into the same prefix cpython installs to.
(cd openssl-1.1.1k && ./config --prefix="$PYTHON_INSTALL_DIR" && make -j32 && make install)
# -fPIC: the resulting libpython3.8.a is later linked into a shared library.
CFLAGS=-fPIC CPPFLAGS=-fPIC ./configure --prefix "$PYTHON_INSTALL_DIR" --with-openssl="$PYTHON_INSTALL_DIR"
14 changes: 14 additions & 0 deletions torch/csrc/deploy/interpreter/cpython_patch.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
diff --git a/Python/dynload_shlib.c b/Python/dynload_shlib.c
index c51f97abd2..83f73e351d 100644
--- a/Python/dynload_shlib.c
+++ b/Python/dynload_shlib.c
@@ -54,8 +54,7 @@ static struct {
} handles[128];
static int nhandles = 0;

-
-dl_funcptr
+dl_funcptr __attribute__((weak))
_PyImport_FindSharedFuncptr(const char *prefix,
const char *shortname,
const char *pathname, FILE *fp)
44 changes: 44 additions & 0 deletions torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <torch/csrc/deploy/interpreter/linker.h>

// Include what we use: the original relied on transitive includes for
// std::stringstream, assert, and FILE.
#include <cassert>
#include <cstdio>
#include <sstream>
#include <vector>

using torch::deploy::CustomLibrary;
using torch::deploy::CustomLibraryPtr;
using torch::deploy::SystemLibrary;

// Libraries loaded through the custom ELF loader; owning them here keeps the
// extension modules mapped until deploy_flush_python_libs() is called.
// NOLINTNEXTLINE
std::vector<CustomLibraryPtr> loaded_files_;
// dlopen handle of the enclosing interpreter SO, set via deploy_set_self()
// from deploy.cpp right after dlopen; used so extension modules resolve the
// Python API against *this* interpreter's copy.
// NOLINTNEXTLINE
static void* deploy_self = nullptr;

extern "C" {

// Record the dlopen handle of this interpreter library (called by deploy.cpp).
__attribute__((visibility("default"))) void deploy_set_self(void* v) {
  deploy_self = v;
}

typedef void (*dl_funcptr)();
// Replaces cpython's _PyImport_FindSharedFuncptr (made weak by
// cpython_patch.diff) so C extension modules are loaded with the custom ELF
// loader instead of dlopen. Returns the module's <prefix>_<shortname> init
// entry point. `fp` is unused; it is part of cpython's signature.
extern "C" dl_funcptr _PyImport_FindSharedFuncptr(
    const char* prefix,
    const char* shortname,
    const char* pathname,
    FILE* fp) {
  const char* args[] = {"deploy"};
  // XXX: we have to manually flush loaded_files_ (see deploy_flush_python_libs)
  // when the manager unloads. Otherwise some libraries can live longer than
  // they are needed, and the process of unloading them might use functionality
  // that itself gets unloaded.
  loaded_files_.emplace_back(CustomLibrary::create(pathname, 1, args));
  CustomLibrary& lib = *loaded_files_.back();
  // Resolve undefined symbols first against the process-wide symbol table,
  // then against this interpreter's private copy of the Python API.
  lib.add_search_library(SystemLibrary::create());
  lib.add_search_library(SystemLibrary::create(deploy_self));
  lib.load();
  std::stringstream ss;
  ss << prefix << "_" << shortname;
  auto r = (dl_funcptr)lib.sym(ss.str().c_str()).value();
  assert(r);
  return r;
}
// Drop all custom-loaded extension libraries; must run before this interpreter
// library is dlclosed (see Interpreter::~Interpreter in deploy.cpp).
__attribute__((visibility("default"))) void deploy_flush_python_libs() {
  loaded_files_.clear();
}
}
6 changes: 4 additions & 2 deletions torch/csrc/deploy/interpreter/interpreter_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];

const char* startup = R"RAW(
import _ssl # must come before _hashlib otherwise ssl's locks will be set to a Python that might no longer exist...
import sys
import importlib.abc
import linecache
Expand Down Expand Up @@ -283,7 +284,8 @@ struct InitLockAcquire {
std::mutex& init_lock_;
};

struct ConcreteInterpreterImpl : public torch::deploy::InterpreterImpl {
struct __attribute__((visibility("hidden"))) ConcreteInterpreterImpl
: public torch::deploy::InterpreterImpl {
ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
Expand Down Expand Up @@ -376,7 +378,7 @@ struct ConcreteInterpreterImpl : public torch::deploy::InterpreterImpl {
std::mutex init_lock_;
};

struct ConcreteInterpreterSessionImpl
struct __attribute__((visibility("hidden"))) ConcreteInterpreterSessionImpl
: public torch::deploy::InterpreterSessionImpl {
ConcreteInterpreterSessionImpl(ConcreteInterpreterImpl* interp)
: interp_(interp) {}
Expand Down
Loading

0 comments on commit 7c09de8

Please sign in to comment.