From 04b33b582c1875528b4089eacd5d7b8321075967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20J=2E=20Iglesias=20Garc=C3=ADa?= Date: Mon, 17 Jun 2024 20:51:46 +0200 Subject: [PATCH] Python Interface with Nanobind (#99) Summary cherry-picking the squashed messages: * Nanobind extension * Basic test and example. * CI workflow. * Update extension. --- .github/workflows/nanobind.yml | 37 ++++++++++ CMakeLists.txt | 71 ++++++++++++++++++- examples/go.py | 106 ++++------------------------- examples/nanobind.py | 10 +++ examples/utils.py | 91 +++++++++++++++++++++++++ include/stichwort/parameter.hpp | 1 + include/tapkee/chain_interface.hpp | 4 +- src/cli/util.hpp | 5 ++ src/python/nanobind_extension.cpp | 46 +++++++++++++ test/test_nanobind_extension.py | 21 ++++++ 10 files changed, 295 insertions(+), 97 deletions(-) create mode 100644 .github/workflows/nanobind.yml create mode 100755 examples/nanobind.py create mode 100644 examples/utils.py create mode 100644 src/python/nanobind_extension.cpp create mode 100644 test/test_nanobind_extension.py diff --git a/.github/workflows/nanobind.yml b/.github/workflows/nanobind.yml new file mode 100644 index 0000000..bbcd217 --- /dev/null +++ b/.github/workflows/nanobind.yml @@ -0,0 +1,37 @@ +name: Nanobind + +on: + # TODO probably change/add main later. Ah, perhaps this wf can be part of linux.yml? 
+  push:
+    branches: [ "nanobind" ]
+  # pull_request:
+  #   branches: [ ]
+
+jobs:
+  build:
+    name: "Build Python interface"
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Python setup
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11.9'
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get install -y libarpack2-dev libcxxopts-dev libeigen3-dev libfmt-dev python3.11-dev
+        python -m pip install nanobind numpy
+
+    - name: Configure
+      run: cmake -B ${{github.workspace}}/build -DBUILD_NANOBIND=ON
+
+    - name: Build
+      # TODO there are A LOT of warnings in the logs from nanobind (-pedantic, -Wshadow).
+      run: cmake --build ${{github.workspace}}/build
+
+    - name: Test
+      working-directory: ${{github.workspace}}
+      run: PYTHONPATH=. python test/test_nanobind_extension.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41d8a9a..5302162 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,8 +123,12 @@ if (MSVC)
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
 endif()
 
-if (NOT CMAKE_BUILD_TYPE)
-	set(CMAKE_BUILD_TYPE Release)
+# Part of nanobind's setting up the build system, configuring optimized build
+# unless otherwise specified, to avoid slow binding code and large binaries.
+# https://nanobind.readthedocs.io/en/latest/building.html#preliminaries
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
 if (CMAKE_BUILD_TYPE MATCHES Debug)
@@ -164,6 +168,47 @@ if (BUILD_TESTS)
 	endforeach()
 endif()
 
+option(BUILD_NANOBIND "Build nanobind Python extension" OFF)
+
+# https://nanobind.readthedocs.io/en/latest/building.html
+# TODO for faster nanobind build, I tried removing the
+# add_executable tapkee and that quickly didn't work. It would
+# be nice if it could be done without requiring a new option
+# BUILD_CLI, or so, maybe if there's a CMake command to
+# configure not building tapkee inside the if BUILD_NANOBIND.
+if (BUILD_NANOBIND)
+  message(STATUS "Detecting and configuring nanobind")
+  find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+  # Detect the installed nanobind package and import it into CMake. stdout and
+  # stderr are captured separately so that interpreter warnings cannot end up
+  # in the prefix path, and a failed query aborts the configuration right away.
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
+    RESULT_VARIABLE NB_DIR_RESULT
+    OUTPUT_VARIABLE NB_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_VARIABLE NB_DIR_ERROR ERROR_STRIP_TRAILING_WHITESPACE)
+  if (NOT NB_DIR_RESULT EQUAL 0)
+    message(FATAL_ERROR "Could not query nanobind's CMake directory using ${Python_EXECUTABLE}: ${NB_DIR_ERROR}")
+  endif()
+  list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
+  find_package(nanobind CONFIG REQUIRED)
+
+  # Build extension (the library that'll be imported from the Python interpreter)
+  nanobind_add_module(pytapkee src/python/nanobind_extension.cpp) # TODO fix paths.
+  # The include path is scoped to the extension instead of the whole directory
+  # (about TODO in nanobind_extension.cpp including src/utils.hpp).
+  target_include_directories(pytapkee PRIVATE "${TAPKEE_SRC_DIR}")
+  target_link_libraries(pytapkee PRIVATE "${EIGEN3_LIBRARY_TO_LINK}")
+  target_link_libraries(pytapkee PRIVATE arpack) # TODO ARPACK guard; TODO without ARPACK(?)
+  target_link_libraries(pytapkee PRIVATE "${FMT_LIBRARY_TO_LINK}")
+
+  # Emit the module as tapkee<suffix> so that it can be imported as tapkee iso
+  # pytapkee. The file name is fixed when the build system is generated, so no
+  # post-build rename is needed and any platform's module suffix is handled.
+  set_target_properties(pytapkee PROPERTIES OUTPUT_NAME tapkee)
+  message(STATUS "Detecting and configuring nanobind - done")
+endif()
+
 export(
 	TARGETS tapkee_library
 	NAMESPACE tapkee::
diff --git a/examples/go.py b/examples/go.py index 494d211..25ca7d8 100755 --- a/examples/go.py +++ b/examples/go.py @@ -8,55 +8,25 @@ import tempfile import numpy as np -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D - -def generate_data(type, N=1000, random_state=None): - rng = np.random.RandomState(random_state) - if type=='swissroll': - tt = np.array((3*np.pi/2)*(1+2*rng.rand(N))) - height = np.array((rng.rand(N)-0.5)) - X = np.array([tt*np.cos(tt), 10*height, tt*np.sin(tt)]) - return X, tt - if
type=='scurve': - tt = np.array((3*np.pi*(rng.rand(N)-0.5))) - height = np.array((rng.rand(N)-0.5)) - X = np.array([np.sin(tt), 10*height, np.sign(tt)*(np.cos(tt)-1)]) - return X, tt - if type=='helix': - tt = np.linspace(1,N,N).T / N - tt = tt*2*np.pi - X = np.r_[[(2+np.cos(8*tt))*np.cos(tt)], - [(2+np.cos(8*tt))*np.sin(tt)], - [np.sin(8*tt)]] - return X, tt - if type=='twinpeaks': - X = rng.uniform(-1, 1, size=(N, 2)) - tt = np.sin(np.pi * X[:, 0]) * np.tanh(X[:, 1]) - tt += 0.1 * rng.normal(size=tt.shape) - X = np.vstack([X.T, tt]) - return X, tt - if type=='klein': - u = rng.uniform(0, 2 * np.pi, N) - v = rng.uniform(0, 2 * np.pi, N) - x = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.cos(u) - y = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.sin(u) - z = np.sin(u / 2) * np.sin(v) + np.cos(u / 2) * np.sin(2 * v) - - noise = 0.01 - x += noise * rng.normal(size=x.shape) - y += noise * rng.normal(size=y.shape) - z += noise * rng.normal(size=z.shape) - return np.vstack((x, y, z)), u - - raise Exception('Dataset is not supported') - +from utils import generate_data, plot + +supported_methods = { + 'lle': 'Locally Linear Embedding', + 'ltsa': 'Local Tangent Space Alignment', + 'isomap': 'Isomap', + 'mds': 'Multidimensional Scaling', + 'pca': 'Principal Component Analysis', + 'kpca': 'Kernel Principal Component Analysis', + 't-sne': 't-distributed Stochastic Neighborhood Embedding', + 'dm': 'Diffusion Map', +} def embed(data,method): input_file = tempfile.NamedTemporaryFile(prefix='tapkee_input') output_file = tempfile.NamedTemporaryFile(prefix='tapkee_output') np.savetxt(input_file.name, data.T,delimiter=',') tapkee_binary = 'bin/tapkee' + runner_string = '%s -i %s -o %s -m %s -k 20 --precompute --debug --verbose --transpose-output --benchmark' % ( tapkee_binary, input_file.name, output_file.name, method ) @@ -71,59 +41,9 @@ def embed(data,method): else: used_method = '' - embedded_data = np.loadtxt(output_file, 
delimiter=',') return embedded_data, used_method -def plot(data, embedded_data, colors='m', method=None): - fig = plt.figure() - fig.set_facecolor('white') - - ax_original = fig.add_subplot(121, projection='3d') - scatter_original = ax_original.scatter(data[0], data[1], data[2], c=colors, cmap=plt.cm.Spectral, s=5, picker=True) - plt.axis('tight') - plt.axis('off') - plt.title('Original', fontsize=9) - - ax_embedding = fig.add_subplot(122) - scatter_embedding = ax_embedding.scatter(embedded_data[0], embedded_data[1], c=colors, cmap=plt.cm.Spectral, s=5, picker=True) - plt.axis('tight') - plt.axis('off') - plt.title('Embedding' + (' with ' + method) if method else '', fontsize=9, wrap=True) - - highlighted_points = [] # To store highlighted points - - # Function to highlight points on both plots - def highlight(index): - # Reset previous highlighted points - for point in highlighted_points: - point.remove() - highlighted_points.clear() - - # Highlight the current point on both scatter plots - point1 = ax_original.scatter([data[0][index]], [data[1][index]], [data[2][index]], color='white', s=25, edgecolor='black', zorder=3) - point2 = ax_embedding.scatter([embedded_data[0][index]], [embedded_data[1][index]], color='white', s=25, edgecolor='black', zorder=3) - highlighted_points.append(point1) - highlighted_points.append(point2) - fig.canvas.draw_idle() - - # Event handler for mouse motion - def on_hover(event): - if event.inaxes == ax_original: - cont, ind = scatter_original.contains(event) - elif event.inaxes == ax_embedding: - cont, ind = scatter_embedding.contains(event) - else: - return - - if cont: - index = ind['ind'][0] - highlight(index) - - fig.canvas.mpl_connect('motion_notify_event', on_hover) - - plt.show() - if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='Graphical example of dimension reduction with Tapkee.') diff --git a/examples/nanobind.py b/examples/nanobind.py new file mode 100755 index 0000000..58b386b --- 
/dev/null
+++ b/examples/nanobind.py
@@ -0,0 +1,10 @@
+import lib.tapkee as tapkee
+from utils import generate_data, plot
+
+if __name__=='__main__':
+    parameters = tapkee.ParametersSet()
+    method = tapkee.parse_reduction_method('lle')
+    parameters.add(tapkee.Parameter.create('dimension reduction method', method))
+    data, colors = generate_data('swissroll')
+    embedded_data = tapkee.withParameters(parameters).embedUsing(data).embedding
+    plot(data, embedded_data.T, colors)
diff --git a/examples/utils.py b/examples/utils.py
new file mode 100644
index 0000000..fe432ba
--- /dev/null
+++ b/examples/utils.py
@@ -0,0 +1,105 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+def generate_data(type, N=1000, random_state=None):
+    """Generate a synthetic 3D dataset of N points.
+
+    type is one of 'swissroll', 'scurve', 'helix', 'twinpeaks' or 'klein';
+    random_state seeds the random number generator. Returns the points as
+    an array of shape (3, N), one point per column, and an array of N values
+    suitable for coloring them. Raises ValueError for any other type.
+    """
+    rng = np.random.RandomState(random_state)
+    if type=='swissroll':
+        tt = np.array((3*np.pi/2)*(1+2*rng.rand(N)))
+        height = np.array((rng.rand(N)-0.5))
+        X = np.array([tt*np.cos(tt), 10*height, tt*np.sin(tt)])
+        return X, tt
+    if type=='scurve':
+        tt = np.array((3*np.pi*(rng.rand(N)-0.5)))
+        height = np.array((rng.rand(N)-0.5))
+        X = np.array([np.sin(tt), 10*height, np.sign(tt)*(np.cos(tt)-1)])
+        return X, tt
+    if type=='helix':
+        tt = np.linspace(1,N,N).T / N
+        tt = tt*2*np.pi
+        X = np.r_[[(2+np.cos(8*tt))*np.cos(tt)],
+                  [(2+np.cos(8*tt))*np.sin(tt)],
+                  [np.sin(8*tt)]]
+        return X, tt
+    if type=='twinpeaks':
+        X = rng.uniform(-1, 1, size=(N, 2))
+        tt = np.sin(np.pi * X[:, 0]) * np.tanh(X[:, 1])
+        tt += 0.1 * rng.normal(size=tt.shape)
+        X = np.vstack([X.T, tt])
+        return X, tt
+    if type=='klein':
+        u = rng.uniform(0, 2 * np.pi, N)
+        v = rng.uniform(0, 2 * np.pi, N)
+        x = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.cos(u)
+        y = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.sin(u)
+        z = np.sin(u / 2) * np.sin(v) + np.cos(u / 2) * np.sin(2 * v)
+
+        noise = 0.01
+        x += noise * rng.normal(size=x.shape)
+        y += noise * rng.normal(size=y.shape)
+        z += noise * rng.normal(size=z.shape)
+        return np.vstack((x, y, z)), u
+
+    raise ValueError('Dataset is not supported')
+
+def plot(data, embedded_data, colors='m', method=None):
+    """Show the 3D data next to its 2D embedding, highlighting hovered points.
+
+    data holds one 3D point per column and embedded_data one embedded point per
+    column. colors is passed to both scatter plots and method, when given, is
+    appended to the title of the embedding.
+    """
+    fig = plt.figure()
+    fig.set_facecolor('white')
+
+    ax_original = fig.add_subplot(121, projection='3d')
+    scatter_original = ax_original.scatter(data[0], data[1], data[2], c=colors, cmap=plt.cm.Spectral, s=5, picker=True)
+    plt.axis('tight')
+    plt.axis('off')
+    plt.title('Original', fontsize=9)
+
+    ax_embedding = fig.add_subplot(122)
+    scatter_embedding = ax_embedding.scatter(embedded_data[0], embedded_data[1], c=colors, cmap=plt.cm.Spectral, s=5, picker=True)
+    plt.axis('tight')
+    plt.axis('off')
+    # The conditional is parenthesized so the title stays 'Embedding' when no method is given.
+    plt.title('Embedding' + ((' with ' + method) if method else ''), fontsize=9, wrap=True)
+
+    highlighted_points = [] # To store highlighted points
+
+    # Function to highlight points on both plots
+    def highlight(index):
+        # Reset previous highlighted points
+        for point in highlighted_points:
+            point.remove()
+        highlighted_points.clear()
+
+        # Highlight the current point on both scatter plots
+        point1 = ax_original.scatter([data[0][index]], [data[1][index]], [data[2][index]], color='white', s=25, edgecolor='black', zorder=3)
+        point2 = ax_embedding.scatter([embedded_data[0][index]], [embedded_data[1][index]], color='white', s=25, edgecolor='black', zorder=3)
+        highlighted_points.append(point1)
+        highlighted_points.append(point2)
+        fig.canvas.draw_idle()
+
+    # Event handler for mouse motion
+    def on_hover(event):
+        if event.inaxes == ax_original:
+            cont, ind = scatter_original.contains(event)
+        elif event.inaxes == ax_embedding:
+            cont, ind = scatter_embedding.contains(event)
+        else:
+            return
+
+        if cont:
+            index = ind['ind'][0]
+            highlight(index)
+
+    fig.canvas.mpl_connect('motion_notify_event', on_hover)
+
+    plt.show()
diff --git a/include/stichwort/parameter.hpp b/include/stichwort/parameter.hpp index 5844eba..bd14f11 100644 --- a/include/stichwort/parameter.hpp +++ b/include/stichwort/parameter.hpp @@ -29,6 +29,7 @@ #include +#include
#include #include #include diff --git a/include/tapkee/chain_interface.hpp b/include/tapkee/chain_interface.hpp index 42e08a5..87a5617 100644 --- a/include/tapkee/chain_interface.hpp +++ b/include/tapkee/chain_interface.hpp @@ -420,8 +420,8 @@ class ParametersInitializedState ParametersInitializedState(const ParametersSet& that) : parameters(that) { } - ParametersInitializedState(const ParametersInitializedState&); - ParametersInitializedState& operator=(const ParametersInitializedState&); + ParametersInitializedState(const ParametersInitializedState&) = default; + ParametersInitializedState& operator=(const ParametersInitializedState&) = default; /** Sets kernel callback. * diff --git a/src/cli/util.hpp b/src/cli/util.hpp index 69b5c52..d4c6594 100644 --- a/src/cli/util.hpp +++ b/src/cli/util.hpp @@ -225,6 +225,11 @@ typename Mapping::mapped_type parse_multiple(Mapping mapping, const std::string& throw std::logic_error(str); } +auto parse_reduction_method(const std::string& str) +{ + return parse_multiple(DIMENSION_REDUCTION_METHODS, str); +} + template tapkee::DenseMatrix matrix_from_callback(const tapkee::IndexType N, PairwiseCallback callback) { diff --git a/src/python/nanobind_extension.cpp b/src/python/nanobind_extension.cpp new file mode 100644 index 0000000..7cb45d8 --- /dev/null +++ b/src/python/nanobind_extension.cpp @@ -0,0 +1,46 @@ +#include + +#include // for Parameter::create + +#include +#include // for ParametersSet, TapkeeOutput +#include // for DimensionReductionMethod + +#include +#include // for type caster embedUsing +#include // for type caster calling stichwort::Parameter::create(const std::string&, ... 
+
+namespace nb = nanobind;
+
+using stichwort::Parameter;
+
+using tapkee::DimensionReductionMethod;
+using tapkee::ParametersSet;
+using tapkee::TapkeeOutput;
+using tapkee::with;
+
+using tapkee::tapkee_internal::ParametersInitializedState; // TODO consider making it part of the "external" API
+
+NB_MODULE(tapkee, m) {
+    m.def("withParameters", &with);
+
+    nb::class_<ParametersSet>(m, "ParametersSet")
+        .def(nb::init<>())
+        .def("add", &ParametersSet::add);
+
+    nb::class_<ParametersInitializedState>(m, "ParametersInitializedState")
+        .def(nb::init<const ParametersSet&>())
+        .def("embedUsing", &ParametersInitializedState::embedUsing);
+
+    nb::class_<TapkeeOutput>(m, "TapkeeOutput")
+        .def_rw("embedding", &TapkeeOutput::embedding);
+
+    m.def("parse_reduction_method", &parse_reduction_method);
+
+    nb::class_<DimensionReductionMethod>(m, "DimensionReductionMethod")
+        .def_rw("name", &DimensionReductionMethod::name_);
+
+    nb::class_<Parameter>(m, "Parameter")
+        .def_static("create", &Parameter::create<DimensionReductionMethod>)
+        .def_static("create", &Parameter::create<int>);
+}
diff --git a/test/test_nanobind_extension.py b/test/test_nanobind_extension.py
new file mode 100644
index 0000000..9d824ba
--- /dev/null
+++ b/test/test_nanobind_extension.py
@@ -0,0 +1,23 @@
+import lib.tapkee as tapkee
+import numpy as np
+
+def test_exception_unknown_method():
+    # The failure is raised outside of the try block: a bare except around
+    # assert(False) would swallow the AssertionError and the test could never fail.
+    try:
+        tapkee.parse_reduction_method('unknown')
+    except Exception:
+        return
+    raise AssertionError("parse_reduction_method('unknown') did not raise")
+
+if __name__=='__main__':
+    test_exception_unknown_method()
+    parameters = tapkee.ParametersSet()
+    method = tapkee.parse_reduction_method('spe')
+    assert(method.name == 'Stochastic Proximity Embedding (SPE)')
+    parameters.add(tapkee.Parameter.create('dimension reduction method', method))
+    target_dimension = 2
+    parameters.add(tapkee.Parameter.create('target dimension', target_dimension))
+    data = np.random.randn(124, 3)
+    embedded_data = tapkee.withParameters(parameters).embedUsing(data).embedding
+    assert(embedded_data.shape == tuple([data.shape[1], target_dimension]))