Skip to content

Commit

Permalink
Re-factor AddGHPairs interface
Browse files Browse the repository at this point in the history
  • Loading branch information
YuanTingHsieh committed Nov 27, 2024
1 parent 596016b commit 91602e8
Show file tree
Hide file tree
Showing 10 changed files with 646 additions and 422 deletions.
630 changes: 312 additions & 318 deletions integration/xgboost/encryption_plugins/cuda_plugin/src/cuda_ct_plugin.h

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,6 @@
#include "endec.h"

#define PRECISION 1e9
#define TIME

class Timer {
public:
Timer() : start_time_(), end_time_() {}

void start() {
start_time_ = std::chrono::high_resolution_clock::now();
}

void stop() {
end_time_ = std::chrono::high_resolution_clock::now();
}

double duration() const {
return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
}

private:
std::chrono::high_resolution_clock::time_point start_time_;
std::chrono::high_resolution_clock::time_point end_time_;
};

namespace nvflare {

Expand Down Expand Up @@ -283,12 +261,13 @@ class CUDAPlugin: public LocalPlugin {

cudaMemcpy(h_ptr, d_plains_ptr, mem_size, cudaMemcpyDeviceToHost);
std::vector<double> result;
result.resize(count);
for (size_t i = 0; i < count; ++i) {
mpz_t n;
mpz_init(n);
store2Gmp(n, h_ptr + i);
double output_num = endec_ptr_->decode(n);
result.push_back(output_num);
result[i] = output_num;
mpz_clear(n);
}
cudaFree(d_plains_ptr);
Expand All @@ -297,12 +276,14 @@ class CUDAPlugin: public LocalPlugin {
return result;
}

std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>>& sample_ids) override{
if (debug_) std::cout << "Calling AddGHPairs with sample_ids size " << sample_ids.size() << std::endl;
void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) override {
if (debug_) std::cout << "Calling AddGHPairs with size " << size << std::endl;
if (!encrypted_gh_pairs_) {
setGHPairs();
}
std::map<int, Buffer> result;

std::vector<std::vector<int>> binIndexVec;
prepareBinIndexVec(binIndexVec, ridx, size);

CgbnPair* d_res_ptr;
size_t mem_size = sizeof(CgbnPair);
Expand All @@ -316,34 +297,28 @@ class CUDAPlugin: public LocalPlugin {
throw std::runtime_error("Can't call AddGHPairs if paillier does not have public key.");
}

Timer timer;
// Iterate through the map
for (auto& pair : sample_ids) {
int key = pair.first;
const int* sample_id = pair.second.data();
int count = pair.second.size();
for (auto i = 0; i < binIndexVec.size(); i++) {
const int* sample_id = binIndexVec[i].data();
int count = binIndexVec[i].size();

int* sample_id_d;
ck(cudaMalloc((void **)&sample_id_d, sizeof(int) * count));
cudaMemcpy(sample_id_d, sample_id, sizeof(int) * count, cudaMemcpyHostToDevice);

timer.start();
paillier_cipher_ptr_->sum<TPI,TPB>(d_res_ptr, encrypted_gh_pairs_, sample_id_d, count);
timer.stop();
std::cout << "Time for add " << count << " of elements is " << timer.duration() << " US" << std::endl;

void* data = malloc(mem_size);
cudaMemcpy(data, d_res_ptr, mem_size, cudaMemcpyDeviceToHost);
Buffer buffer(data, mem_size, true);
result[key] = buffer; // Add the Buffer object to the result map
result[i] = buffer; // Add the Buffer object to the result map
cudaFree(sample_id_d);
}
cudaFree(d_res_ptr);
if (debug_) std::cout << "Finish AddGHPairs" << std::endl;
if (encrypted_gh_pairs_) {
clearGHPairs();
}
return result;

}
};
} // namespace nvflare
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "cgbn.h"
#include <cstdlib> // For rand() function
#include <ctime> // For time() function
#include <cuda_runtime.h>

/********** Constant Values **************/
const static unsigned int bits=2048;
Expand All @@ -45,9 +46,6 @@ const static unsigned int key_len=1024;
//const static unsigned int bits=6144;
//const static unsigned int key_len=3072;

#define DEBUG
#define TIME

const static int TPB=512;
const static int TPI=32;
const static int window_bits=5;
Expand Down Expand Up @@ -273,4 +271,56 @@ bool compare_result(const std::vector<double> &a, const std::vector<double> &b,
return true;
}


void getGPUMemory() {
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);

if (deviceCount == 0) {
std::cerr << "No CUDA-capable devices found!" << std::endl;
return;
}

// Iterate over all devices
for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, i);

std::cout << "Device " << i << ": " << deviceProp.name << std::endl;

// Get the total and available memory on the GPU
size_t freeMem, totalMem;
cudaMemGetInfo(&freeMem, &totalMem);

std::cout << "Total GPU Memory: " << totalMem / (1024 * 1024) << " MB" << std::endl;
std::cout << "Free GPU Memory: " << freeMem / (1024 * 1024) << " MB" << std::endl;
std::cout << "Used GPU Memory: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device Number: %d\n", i);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (MHz): %d\n",
prop.memoryClockRate/1024);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %.1f\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
printf(" Total global memory (Gbytes) %.1f\n",(float)(prop.totalGlobalMem)/1024.0/1024.0/1024.0);
printf(" Shared memory per block (Kbytes) %.1f\n",(float)(prop.sharedMemPerBlock)/1024.0);
printf(" minor-major: %d-%d\n", prop.minor, prop.major);
printf(" Warp-size: %d\n", prop.warpSize);
printf(" Concurrent kernels: %s\n", prop.concurrentKernels ? "yes" : "no");
printf(" Concurrent computation/communication: %s\n\n",prop.deviceOverlap ? "yes" : "no");


// bits is how many bits for G or H
// bits / 8 * 2 is the size of one GHPair
// each sum kernel needs 2 GHPairs
unsigned int max_num_of_kernel_launch_permitted = freeMem / bits * 8 / 4;

//size_t max_num_of_instances_per_launch = 16777216; // maximum numbers can be processed in a single launch
std::cout << "max_num_of_kernel_launch_permitted is " << max_num_of_kernel_launch_permitted << std::endl;
}
}

#endif // CUDA_UTILS_H
15 changes: 5 additions & 10 deletions integration/xgboost/encryption_plugins/cuda_plugin/src/paillier.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,13 +411,12 @@ class PaillierCipher{
}

template<unsigned int TPI, unsigned int TPB>
int agg_tuple(CgbnPair* d_cell_table, int count, int num_blocks) {
int agg_tuple(CgbnPair* d_cell_table, int count, unsigned int num_blocks) {
cgbn_error_report_t *report;
ck(cgbn_error_report_alloc(&report));

#ifdef TIME
CudaTimer cuda_timer(0);
float gen_time=0;
cuda_timer.start();
#endif

Expand Down Expand Up @@ -765,9 +764,9 @@ __global__ void reduce_sum_with_index(cgbn_error_report_t* report, CgbnPair* res
for (unsigned int window = 0; window < total_windows; window++) {
int global_position = id + window * IPB * gridDim.x;

// #ifdef DEBUG
// printf("id %d shm_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d window %d total_windows %d global_position %d \n", id, shm_id, IPB, threadIdx.x, blockIdx.x, gridDim.x, window, total_windows, global_position);
// #endif
#ifdef DEBUG
printf("id %d shm_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d window %d total_windows %d global_position %d \n", id, shm_id, IPB, threadIdx.x, blockIdx.x, gridDim.x, window, total_windows, global_position);
#endif

if (global_position >= count) {
// Load rand_seed into sdata4 directly for positions exceeding count
Expand Down Expand Up @@ -840,7 +839,6 @@ __global__ void add_two(cgbn_error_report_t *report, CgbnPair* arr, int count) {
typedef cgbn_env_t<context_t, BITS> env_t;
typedef typename env_t::cgbn_t bn_t;
typedef typename env_t::cgbn_wide_t bn_w_t;
int IPB = blockDim.x / TPI;

context_t bn_context(cgbn_report_monitor, report, item_id);
env_t bn_env(bn_context);
Expand All @@ -851,10 +849,7 @@ __global__ void add_two(cgbn_error_report_t *report, CgbnPair* arr, int count) {

cgbn_load(bn_env, n_square, &c_PubKey.n_square);

printf("id %d item_id %d IPB %d threadIdx.x %d blockIdx.x %d gridDim.x %d \n", id, item_id, IPB, threadIdx.x, blockIdx.x, gridDim.x);


// Add 2 numbers up
// Add 2 GHPairs up
// Load pairs of elements and perform reduction
cgbn_load(bn_env, a, &(arr[item_id].g));
cgbn_load(bn_env, b, &(arr[item_id + 1].g));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,31 +100,40 @@ std::vector<double> PassThruPlugin::DecryptVector(const std::vector<Buffer>& cip
return result;
}

std::map<int, Buffer> PassThruPlugin::AddGHPairs(const std::map<int, std::vector<int>>& sample_ids) {
void PassThruPlugin::AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) {
size_t total_bin_size = cuts_.back();
if (debug_) {
std::cout << "PassThruPlugin::AddGHPairs called with " << sample_ids.size() << " slots" << std::endl;
std::cout << "PassThruPlugin::AddGHPairs called with " << total_bin_size << " bins" << std::endl;
}

// Can't do this in real plugin. It needs to be broken into encrypted parts
auto gh_pairs = DecryptVector(std::vector<Buffer>{Buffer(encrypted_gh_.data(), encrypted_gh_.size())});

auto result = std::map<int, Buffer>();
for (auto const &entry : sample_ids) {
auto rows = entry.second;
std::vector<std::vector<int>> binIndexVec;
prepareBinIndexVec(binIndexVec, ridx, size);

size_t total_sample_ids = 0;
for (auto i = 0; i < binIndexVec.size(); ++i) {
auto rows = binIndexVec[i];
total_sample_ids += rows.size();
double g = 0.0;
double h = 0.0;

for (auto row : rows) {
g += gh_pairs[2 * row];
h += gh_pairs[2 * row + 1];
}

// In real plugin, the sum should be still in encrypted state. No need to do this step
auto encrypted_sum = EncryptVector(std::vector<double>{g, h});
// print_buffer(reinterpret_cast<uint8_t *>(encrypted_sum.buffer), encrypted_sum.buf_size);
result.insert({entry.first, encrypted_sum});
result[i] = encrypted_sum;
}

if (debug_) {
std::cout << "PassThruPlugin::AddGHPairs finished with " << total_bin_size << " bins and " << total_sample_ids << " ids " << std::endl;
}

return result;
}

} // namespace nvflare
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ namespace nvflare {

std::vector<double> DecryptVector(const std::vector<Buffer> &ciphertext) override;

std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>> &sample_ids) override;
void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) override;

};
} // namespace nvflare
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class LocalPlugin : public BasePlugin {
* \return A map of the serialized encrypted sum of G and H for each slot
* The input and output maps must have the same size
*/
virtual std::map<int, Buffer> AddGHPairs(const std::map<int, std::vector<int>> &sample_ids) = 0;
virtual void AddGHPairs(std::vector<Buffer>& result, const std::uint64_t *ridx, const std::size_t size) = 0;

/*!
* \brief Free encrypted data buffer
Expand All @@ -94,6 +94,16 @@ class LocalPlugin : public BasePlugin {
ciphertext.buf_size = 0;
};

/**
* @brief Prepare the bin index vector
*
* @param binIndexVec A vector of length "total_bin_size", each item contains a vector of row ID
* @param ridx Point to an array of row IDs
* @param size Size of the ridx
*
*/
void prepareBinIndexVec(std::vector<std::vector<int>>& binIndexVec, const std::uint64_t *ridx, const std::size_t size);

private:

/**
Expand Down
50 changes: 50 additions & 0 deletions integration/xgboost/encryption_plugins/shared/include/timer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef TIMER_H
#define TIMER_H

#include <chrono>

class Timer {
public:
Timer() : start_time_(), end_time_() {
begin_time_ = std::chrono::high_resolution_clock::now();
}

void start() {
start_time_ = std::chrono::high_resolution_clock::now();
}

void stop() {
end_time_ = std::chrono::high_resolution_clock::now();
}

double duration() const {
return std::chrono::duration_cast<std::chrono::microseconds>(end_time_ - start_time_).count();
}

double now() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin_time_).count();
}

private:
std::chrono::high_resolution_clock::time_point begin_time_;
std::chrono::high_resolution_clock::time_point start_time_;
std::chrono::high_resolution_clock::time_point end_time_;
};

#endif // TIMER_H
Loading

0 comments on commit 91602e8

Please sign in to comment.