Re-factor AddGHPairs interface

YuanTingHsieh · Nov 27, 2024 · 91602e8 · 91602e8
1 parent 596016b
commit 91602e8
Show file tree

Hide file tree

Showing 10 changed files with 646 additions and 422 deletions.
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_ct_plugin.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_ct_plugin.h
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_plugin.h
@@ -28,28 +28,6 @@
 #include "endec.h"
 
 #define PRECISION 1e9
-#define TIME
-
-class Timer {
-public:
-    Timer() : start_time_(), end_time_() {}
-
-    void start() {
-        start_time_ = std::chrono::high_resolution_clock::now();
-    }
-
-    void stop() {
-        end_time_ = std::chrono::high_resolution_clock::now();
-    }
-
-    double duration() const {
-        return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
-    }
-
-private:
-    std::chrono::high_resolution_clock::time_point start_time_;
-    std::chrono::high_resolution_clock::time_point end_time_;
-};
 
 namespace nvflare {
 
@@ -283,12 +261,13 @@ class CUDAPlugin: public LocalPlugin {
 
       cudaMemcpy(h_ptr, d_plains_ptr, mem_size, cudaMemcpyDeviceToHost);
       std::vector<double> result;
+      result.resize(count);
       for (size_t i = 0; i < count; ++i) {
         mpz_t n;
         mpz_init(n);
         store2Gmp(n, h_ptr + i);
         double output_num = endec_ptr_->decode(n);
-        result.push_back(output_num);
+        result[i] = output_num;
         mpz_clear(n);
       }
       cudaFree(d_plains_ptr);
@@ -297,12 +276,14 @@ class CUDAPlugin: public LocalPlugin {
       return result;
     }
 
-    std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>>& sample_ids) override{
-      if (debug_) std::cout << "Calling AddGHPairs with sample_ids size " << sample_ids.size() << std::endl;
+    void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) override {
+      if (debug_) std::cout << "Calling AddGHPairs with size " << size << std::endl;
       if (!encrypted_gh_pairs_) {
         setGHPairs();
       }
-      std::map<int, Buffer> result;
+
+      std::vector<std::vector<int>> binIndexVec;
+      prepareBinIndexVec(binIndexVec, ridx, size);
 
       CgbnPair* d_res_ptr;
       size_t mem_size = sizeof(CgbnPair);
@@ -316,34 +297,28 @@ class CUDAPlugin: public LocalPlugin {
         throw std::runtime_error("Can't call AddGHPairs if paillier does not have public key.");
       }
 
-      Timer timer;
-      // Iterate through the map
-      for (auto& pair : sample_ids) {
-        int key = pair.first;
-        const int* sample_id = pair.second.data();
-        int count = pair.second.size();
+      for (auto i = 0; i < binIndexVec.size(); i++) {
+        const int* sample_id = binIndexVec[i].data();
+        int count = binIndexVec[i].size();
 
         int* sample_id_d;
         ck(cudaMalloc((void **)&sample_id_d, sizeof(int) * count));
         cudaMemcpy(sample_id_d, sample_id, sizeof(int) * count, cudaMemcpyHostToDevice);
 
-        timer.start();
         paillier_cipher_ptr_->sum<TPI,TPB>(d_res_ptr, encrypted_gh_pairs_, sample_id_d, count);
-        timer.stop();
-        std::cout << "Time for add " << count << " of elements is " << timer.duration() << " US" << std::endl;
 
         void* data = malloc(mem_size);
         cudaMemcpy(data, d_res_ptr, mem_size, cudaMemcpyDeviceToHost);
         Buffer buffer(data, mem_size, true);
-        result[key] = buffer; // Add the Buffer object to the result map
+        result[i] = buffer; // Add the Buffer object to the result map
         cudaFree(sample_id_d);
       }
       cudaFree(d_res_ptr);
       if (debug_) std::cout << "Finish AddGHPairs" << std::endl;
       if (encrypted_gh_pairs_) {
         clearGHPairs();
       }
-      return result;
+
     }
 };
 } // namespace nvflare

diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_utils.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_utils.h
@@ -34,6 +34,7 @@
 #include "cgbn.h"
 #include <cstdlib> // For rand() function
 #include <ctime>   // For time() function
+#include <cuda_runtime.h>
 
 /********** Constant Values **************/
 const static unsigned int bits=2048;
@@ -45,9 +46,6 @@ const static unsigned int key_len=1024;
 //const static unsigned int bits=6144;
 //const static unsigned int key_len=3072;
 
-#define DEBUG
-#define TIME
-
 const static int TPB=512;
 const static int TPI=32;
 const static int window_bits=5;
@@ -273,4 +271,56 @@ bool compare_result(const std::vector<double> &a, const std::vector<double> &b,
     return true;
 }
 
+
+void getGPUMemory() {
+    int deviceCount = 0;
+    cudaGetDeviceCount(&deviceCount);
+
+    if (deviceCount == 0) {
+        std::cerr << "No CUDA-capable devices found!" << std::endl;
+        return;
+    }
+
+    // Iterate over all devices
+    for (int i = 0; i < deviceCount; ++i) {
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, i);
+
+        std::cout << "Device " << i << ": " << deviceProp.name << std::endl;
+
+        // Get the total and available memory on the GPU
+        size_t freeMem, totalMem;
+        cudaMemGetInfo(&freeMem, &totalMem);
+
+        std::cout << "Total GPU Memory: " << totalMem / (1024 * 1024) << " MB" << std::endl;
+        std::cout << "Free GPU Memory: " << freeMem / (1024 * 1024) << " MB" << std::endl;
+        std::cout << "Used GPU Memory: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        printf("Device Number: %d\n", i);
+        printf("  Device name: %s\n", prop.name);
+        printf("  Memory Clock Rate (MHz): %d\n",
+            prop.memoryClockRate/1024);
+        printf("  Memory Bus Width (bits): %d\n",
+            prop.memoryBusWidth);
+        printf("  Peak Memory Bandwidth (GB/s): %.1f\n",
+            2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+        printf("  Total global memory (Gbytes) %.1f\n",(float)(prop.totalGlobalMem)/1024.0/1024.0/1024.0);
+        printf("  Shared memory per block (Kbytes) %.1f\n",(float)(prop.sharedMemPerBlock)/1024.0);
+        printf("  minor-major: %d-%d\n", prop.minor, prop.major);
+        printf("  Warp-size: %d\n", prop.warpSize);
+        printf("  Concurrent kernels: %s\n", prop.concurrentKernels ? "yes" : "no");
+        printf("  Concurrent computation/communication: %s\n\n",prop.deviceOverlap ? "yes" : "no");
+
+
+        // bits is how many bits for G or H
+        // bits / 8 * 2 is the size of one GHPair
+        // each sum kernel needs 2 GHPairs
+        unsigned int max_num_of_kernel_launch_permitted = freeMem / bits * 8 / 4;
+
+        //size_t max_num_of_instances_per_launch = 16777216; // maximum numbers can be processed in a single launch
+        std::cout << "max_num_of_kernel_launch_permitted is " << max_num_of_kernel_launch_permitted << std::endl;
+    }
+}
+
 #endif // CUDA_UTILS_H
diff --git a/integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h b/integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h
@@ -411,13 +411,12 @@ class PaillierCipher{
 	    }
 
         template<unsigned int TPI, unsigned int TPB>
-            int agg_tuple(CgbnPair* d_cell_table, int count, int num_blocks) {
+            int agg_tuple(CgbnPair* d_cell_table, int count, unsigned int num_blocks) {
                 cgbn_error_report_t *report;
                 ck(cgbn_error_report_alloc(&report));
 
 #ifdef TIME
                 CudaTimer cuda_timer(0);
-                float gen_time=0;
                 cuda_timer.start();
 #endif
 
@@ -765,9 +764,9 @@ __global__ void reduce_sum_with_index(cgbn_error_report_t* report, CgbnPair* res
     for (unsigned int window = 0; window < total_windows; window++) {
         int global_position = id + window * IPB * gridDim.x;
 
-// #ifdef DEBUG
-//         printf("id %d shm_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d window %d total_windows %d global_position %d \n", id, shm_id, IPB, threadIdx.x, blockIdx.x, gridDim.x, window, total_windows, global_position);
-// #endif
+#ifdef DEBUG
+        printf("id %d shm_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d window %d total_windows %d global_position %d \n", id, shm_id, IPB, threadIdx.x, blockIdx.x, gridDim.x, window, total_windows, global_position);
+#endif
 
         if (global_position >= count) {
             // Load rand_seed into sdata4 directly for positions exceeding count
@@ -840,7 +839,6 @@ __global__ void add_two(cgbn_error_report_t *report, CgbnPair* arr, int count) {
         typedef cgbn_env_t<context_t, BITS> env_t;
         typedef typename env_t::cgbn_t bn_t;
         typedef typename env_t::cgbn_wide_t bn_w_t;
-        int IPB = blockDim.x / TPI;
 
         context_t bn_context(cgbn_report_monitor, report, item_id);
         env_t bn_env(bn_context);
@@ -851,10 +849,7 @@ __global__ void add_two(cgbn_error_report_t *report, CgbnPair* arr, int count) {
 
         cgbn_load(bn_env, n_square, &c_PubKey.n_square);
 
-        printf("id %d item_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d \n", id, item_id, IPB, threadIdx.x, blockIdx.x, gridDim.x);
-
-
-        // Add 2 numbers up
+        // Add 2 GHPairs up
         // Load pairs of elements and perform reduction
         cgbn_load(bn_env, a, &(arr[item_id].g));
         cgbn_load(bn_env, b, &(arr[item_id + 1].g));

diff --git a/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.cc b/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.cc
@@ -100,31 +100,40 @@ std::vector<double> PassThruPlugin::DecryptVector(const std::vector<Buffer>& cip
   return result;
 }
 
-std::map<int, Buffer> PassThruPlugin::AddGHPairs(const std::map<int, std::vector<int>>& sample_ids) {
+void PassThruPlugin::AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) {
+  size_t total_bin_size = cuts_.back();
   if (debug_) {
-    std::cout << "PassThruPlugin::AddGHPairs called with " << sample_ids.size() << " slots" << std::endl;
+    std::cout << "PassThruPlugin::AddGHPairs called with " << total_bin_size << " bins" << std::endl;
   }
 
   // Can't do this in real plugin. It needs to be broken into encrypted parts
   auto gh_pairs = DecryptVector(std::vector<Buffer>{Buffer(encrypted_gh_.data(), encrypted_gh_.size())});
 
-  auto result = std::map<int, Buffer>();
-  for (auto const &entry : sample_ids) {
-    auto rows = entry.second;
+  std::vector<std::vector<int>> binIndexVec;
+  prepareBinIndexVec(binIndexVec, ridx, size);
+
+  size_t total_sample_ids = 0;
+  for (auto i = 0; i < binIndexVec.size(); ++i) {
+    auto rows = binIndexVec[i];
+    total_sample_ids += rows.size();
     double g = 0.0;
     double h = 0.0;
 
     for (auto row : rows) {
       g += gh_pairs[2 * row];
       h += gh_pairs[2 * row + 1];
     }
+
     // In real plugin, the sum should be still in encrypted state. No need to do this step
     auto encrypted_sum = EncryptVector(std::vector<double>{g, h});
     // print_buffer(reinterpret_cast<uint8_t *>(encrypted_sum.buffer), encrypted_sum.buf_size);
-    result.insert({entry.first, encrypted_sum});
+    result[i] = encrypted_sum;
+  }
+
+  if (debug_) {
+    std::cout << "PassThruPlugin::AddGHPairs finished with " << total_bin_size << " bins and " << total_sample_ids << " ids " << std::endl;
   }
 
-  return result;
 }
 
 } // namespace nvflare
diff --git a/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.h b/integration/xgboost/encryption_plugins/nvflare_plugin/src/pass_thru_plugin.h
@@ -36,6 +36,7 @@ namespace nvflare {
 
     std::vector<double> DecryptVector(const std::vector<Buffer> &ciphertext) override;
 
-    std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>> &sample_ids) override;
+    void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) override;
+
   };
 } // namespace nvflare
diff --git a/integration/xgboost/encryption_plugins/shared/include/local_plugin.h b/integration/xgboost/encryption_plugins/shared/include/local_plugin.h
@@ -79,7 +79,7 @@ class LocalPlugin : public BasePlugin {
    * \return A map of the serialized encrypted sum of G and H for each slot
    *         The input and output maps must have the same size
    */
-  virtual std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>> &sample_ids) = 0;
+  virtual void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) = 0;
 
   /*!
    * \brief Free encrypted data buffer
@@ -94,6 +94,16 @@ class LocalPlugin : public BasePlugin {
     ciphertext.buf_size = 0;
   };
 
+  /**
+   * @brief Prepare the bin index vector
+   *
+   * @param binIndexVec A vector of length "total_bin_size", each item contains a vector of row ID
+   * @param ridx Point to an array of row IDs
+   * @param size Size of the ridx
+   *
+  */
+  void prepareBinIndexVec(std::vector<std::vector<int>>& binIndexVec, const std::uint64_t *ridx, const std::size_t size);
+
 private:
 
   /**

diff --git a/integration/xgboost/encryption_plugins/shared/include/timer.h b/integration/xgboost/encryption_plugins/shared/include/timer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TIMER_H
+#define TIMER_H
+
+#include <chrono>
+
+class Timer {
+public:
+    Timer() : start_time_(), end_time_() {
+      begin_time_ = std::chrono::high_resolution_clock::now();
+    }
+
+    void start() {
+      start_time_ = std::chrono::high_resolution_clock::now();
+    }
+
+    void stop() {
+      end_time_ = std::chrono::high_resolution_clock::now();
+    }
+
+    double duration() const {
+      return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
+    }
+
+    double now() {
+      return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin_time_).count();
+    }
+
+private:
+    std::chrono::high_resolution_clock::time_point begin_time_;
+    std::chrono::high_resolution_clock::time_point start_time_;
+    std::chrono::high_resolution_clock::time_point end_time_;
+};
+
+#endif // TIMER_H