From b89715f4e38da968c7a9aa81806bc1ac5bd82e89 Mon Sep 17 00:00:00 2001 From: Nobel Gautam Date: Fri, 4 Dec 2020 15:35:38 -0500 Subject: [PATCH 1/6] chane aes to curand wip --- smt.cu | 57 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/smt.cu b/smt.cu index 6e5df17..ab9d4a2 100644 --- a/smt.cu +++ b/smt.cu @@ -1,6 +1,7 @@ #include #include #include +#include "curand_kernel.h" #include "SMTLIB/Float.h" #include "SMTLIB/BufferRef.h" #include "cuda_aes.h" @@ -32,27 +33,25 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) { } // Note: size is the *unpadded* size of the input vars -__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { +//__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { +__global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) { int bindex = blockIdx.x * blockDim.x + threadIdx.x; - int64_t padded = aes_pad(size); - uint64_t offset = bindex * padded; - extern __shared__ uint8_t sdata[]; - // Get our local chunk - int soff = threadIdx.x * padded; + uint8_t *data = in_data + bindex*size; // i think? - // First time initialize block to i - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; - } + curandState localState = state[bindex]; while (!solved) { atomicAdd(execs, 1); // Randomize input for our slice - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - encrypt_one_table(sdata+soff, key, i); + uint8_t* curr = data; + //TODO: once we confirm 16bytes and we generate 8bytes, replace loop with writes + while (curr < data + size) + { + *curr++ = curand(&localState); //TODO: i think this is 8bytes but not sure, alternative is uint4 } - if (LLVMFuzzerTestOneInput(sdata+soff, size)) { + + if (LLVMFuzzerTestOneInput(data, size)) { *gobuf = bindex; memcpy(in_data+offset, sdata+soff, size); solved = 1; @@ -69,6 +68,13 @@ void CUDART_CB finishedCB(void *data) { finished_dev = *(int *)data; } +__global__ void setup_kernel(curandState *state) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; + int seed = id*37; + curand_init(seed, id, 0, &state[id]); +} + void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_gobuf, unsigned long long **ret_execs) { cudaSetDevice(device); @@ -76,22 +82,14 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g uint64_t *gobuf; unsigned long long *gexecs; - int64_t padded = aes_pad(varsize); - printf("Padding varsize from %d to %ld\n", varsize, padded); - unsigned char ckey[AES_BLOCK_SIZE]; - FILE *rng = fopen("/dev/urandom","rb"); - fread(ckey, AES_BLOCK_SIZE, 1, rng); - fclose(rng); - - // Pre-expand the round keys and copy to device mem - uint8_t rkey[176]; - const uint8_t *drkey; - expand_key(ckey, rkey); - gpuErrchk(cudaMalloc(&drkey, 176)); - gpuErrchk(cudaMemcpy((uint8_t *)drkey, rkey, sizeof(uint8_t) * 176, cudaMemcpyHostToDevice)); + int size = varsize; // i think? + curandState *rngStates; + gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState))); + + setup_kernel<<>>(rngStates); // Alloc GPU buffers - gpuErrchk(cudaMalloc(&gbuf, padded*N*M)); + gpuErrchk(cudaMalloc(&gbuf, size*N*M)); gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t))); gpuErrchk(cudaMalloc(&gexecs, sizeof(unsigned long long))); @@ -105,9 +103,10 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g int *dev = (int *)malloc(sizeof(int)); *dev = device + 1; printf("Launching kernel on GPU%d...\n", device); - fuzz<<>>(gbuf, varsize, drkey, gobuf, gexecs); - gpuErrchk(cudaPeekAtLastError()); + //fuzz<<>>(gbuf, varsize, rnd, gobuf, gexecs); + fuzz<<>>(gbuf, varsize, rngStates, gobuf, gexecs); gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev)); + //gpuErrchk(curandDestroyGenerator(gen)); } int main(int argc, char **argv) { From 5b78c6084263c29378e1ac6aab80e92bc77e8d54 Mon Sep 17 00:00:00 2001 From: Nobel Gautam Date: Fri, 4 Dec 2020 16:40:18 -0500 Subject: [PATCH 2/6] dont need curand init kernel oops --- smt.cu | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/smt.cu b/smt.cu index ab9d4a2..15e933d 100644 --- a/smt.cu +++ b/smt.cu @@ -37,8 +37,9 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) { __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) { int bindex = blockIdx.x * blockDim.x + threadIdx.x; + int seed = bindex*37; + curand_init(seed, bindex, 0, &state[bindex]); uint8_t *data = in_data + bindex*size; // i think? - curandState localState = state[bindex]; while (!solved) { @@ -68,13 +69,6 @@ void CUDART_CB finishedCB(void *data) { finished_dev = *(int *)data; } -__global__ void setup_kernel(curandState *state) -{ - int id = threadIdx.x + blockIdx.x * blockDim.x; - int seed = id*37; - curand_init(seed, id, 0, &state[id]); -} - void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_gobuf, unsigned long long **ret_execs) { cudaSetDevice(device); @@ -86,8 +80,6 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g curandState *rngStates; gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState))); - setup_kernel<<>>(rngStates); - // Alloc GPU buffers gpuErrchk(cudaMalloc(&gbuf, size*N*M)); gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t))); From a17288a1220d886507057729a63d1086f34342a2 Mon Sep 17 00:00:00 2001 From: John McCann Cunniff Jr Date: Mon, 7 Dec 2020 23:41:24 -0500 Subject: [PATCH 3/6] catch up with main --- smt.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/smt.cu b/smt.cu index 15e933d..800bd70 100644 --- a/smt.cu +++ b/smt.cu @@ -36,11 +36,15 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) { //__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) { int bindex = blockIdx.x * blockDim.x + threadIdx.x; + int64_t padded = aes_pad(size); + uint64_t offset = bindex * padded; + int soff = threadIdx.x * padded; int seed = bindex*37; curand_init(seed, bindex, 0, &state[bindex]); uint8_t *data = in_data + bindex*size; // i think? curandState localState = state[bindex]; + extern __shared__ uint8_t sdata[]; while (!solved) { atomicAdd(execs, 1); @@ -62,7 +66,6 @@ __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; } } - return; } void CUDART_CB finishedCB(void *data) { From 407881be60a99c405e3f18827eb3d4efdf1c65b3 Mon Sep 17 00:00:00 2001 From: Nobel Gautam Date: Thu, 10 Dec 2020 04:07:04 -0500 Subject: [PATCH 4/6] report correct buffer on exit --- smt.cu | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/smt.cu b/smt.cu index 800bd70..25e31e3 100644 --- a/smt.cu +++ b/smt.cu @@ -98,10 +98,8 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g int *dev = (int *)malloc(sizeof(int)); *dev = device + 1; printf("Launching kernel on GPU%d...\n", device); - //fuzz<<>>(gbuf, varsize, rnd, gobuf, gexecs); fuzz<<>>(gbuf, varsize, rngStates, gobuf, gexecs); gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev)); - //gpuErrchk(curandDestroyGenerator(gen)); } int main(int argc, char **argv) { @@ -137,11 +135,10 @@ int main(int argc, char **argv) { // Get and print output - int64_t padded = aes_pad(varsize); - uint8_t *buf = (uint8_t *)malloc(padded); + uint8_t *buf = (uint8_t*)malloc(varsize); uint64_t oindex; gpuErrchk(cudaMemcpy(&oindex, gobuf[i], sizeof(uint64_t), cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*padded), padded, cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*varsize), varsize, cudaMemcpyDeviceToHost)); printf("Found a satisfying assignment on device %d thread %lu:\n", i, oindex); for (int k = 0; k < varsize; k++) printf("%02x", buf[k]); printf("\n"); } From 0199d74fc442e4885f02c30d576655687c48b17f Mon Sep 17 00:00:00 2001 From: Nobel Gautam Date: Thu, 10 Dec 2020 07:24:26 -0500 Subject: [PATCH 5/6] use shared data --- smt.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/smt.cu b/smt.cu index 25e31e3..8cd14f6 100644 --- a/smt.cu +++ b/smt.cu @@ -36,15 +36,15 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) { //__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) { int bindex = blockIdx.x * blockDim.x + threadIdx.x; - int64_t padded = aes_pad(size); - uint64_t offset = bindex * padded; - int soff = threadIdx.x * padded; + int offset = bindex * size; int seed = bindex*37; curand_init(seed, bindex, 0, &state[bindex]); - uint8_t *data = in_data + bindex*size; // i think? curandState localState = state[bindex]; + extern __shared__ uint8_t sdata[]; + int soff = threadIdx.x * size; + uint8_t *data = sdata + soff; while (!solved) { atomicAdd(execs, 1); @@ -62,9 +62,9 @@ __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t solved = 1; } // Add increment to randomize (I hope?) - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; - } + //for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { + // *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; + //} } } @@ -98,7 +98,7 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g int *dev = (int *)malloc(sizeof(int)); *dev = device + 1; printf("Launching kernel on GPU%d...\n", device); - fuzz<<>>(gbuf, varsize, rngStates, gobuf, gexecs); + fuzz<<>>(gbuf, varsize, rngStates, gobuf, gexecs); gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev)); } From 0cb03f19766aad5cf8c0c9aa5d1673a8ca756b4d Mon Sep 17 00:00:00 2001 From: John McCann Cunniff Jr Date: Thu, 10 Dec 2020 16:59:25 -0500 Subject: [PATCH 6/6] minor cmake build improvements --- CMakeLists.txt | 39 ++++++++++++++++++++++++++++++++------- Makefile | 4 ++-- generate.sh | 4 ++-- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 087e301..02f8304 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,33 @@ cmake_minimum_required(VERSION 3.14) -project(smt LANGUAGES CXX CUDA) +project(smt) + +# Find CUDA +find_package(CUDA 11) +enable_language(CUDA) +set(ENV{CUDA_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) +set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc) set(CMAKE_CUDA_ARCHETECTURES 35 50 72) +if (NOT DEFINED USEDLTO) + set(USEDLTO ON) +endif() + +message("") +message("CUDA_VERSION ${CUDA_VERSION}") +message("USEDLTO ${USEDLTO}") +message("CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}") +message("CMAKE_CUDA_COMPILER ${CMAKE_CUDA_COMPILER}") +message("CMAKE_CUDA_ARCHETECTURES ${CMAKE_CUDA_ARCHETECTURES}") +message("") + +if (${CUDA_VERSION_MAJOR} EQUAL 11 AND ${USEDLTO}) + set(DLTO -dlto) +endif () + +set(NVCC_FLAGS -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) + +# SMTLIB add_library(smtlib STATIC SMTLIB/Core.cu SMTLIB/Logger.cu @@ -12,18 +37,18 @@ add_library(smtlib STATIC SMTLIB/NativeBitVector.cu) target_include_directories (smtlib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_features(smtlib PUBLIC cxx_std_11) +set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72) +target_compile_options(smtlib PUBLIC ${NVCC_FLAGS}) -add_executable(smt smt.cu smt.h theory.h theory.cu aes.cu cuda_aes.h) +# SMT +add_executable(smt smt.cu smt.h theory.h theory.cu) target_include_directories (smt PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(smt LINK_PUBLIC smtlib) - set_property(TARGET smt PROPERTY CUDA_ARCHITECTURES 35 50 72) -set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72) set_property(TARGET smt PROPERTY CUDA_SEPARABLE_COMPILATION ON) +target_compile_options(smt PUBLIC ${NVCC_FLAGS}) -target_compile_options(smt PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) -target_compile_options(smtlib PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) - +# Move executable after build add_custom_command(TARGET smt POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ../bin/) add_custom_command(TARGET smt diff --git a/Makefile b/Makefile index b5a402c..ff06e18 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: cmake -Bbuild -GNinja . - cmake --build build -j $(shell nproc) + cmake --build build SMTLIB/%.o: SMTLIB/%.cu $(MAKE) -C SMTLIB @@ -26,4 +26,4 @@ smt: theory.o smt.o aes.o $(smtlib-objs) clean: rm -f *.o SMTLIB/*.o smt - rm -rf build + rm -rf build bin cxx diff --git a/generate.sh b/generate.sh index ac591ba..5c51f2f 100755 --- a/generate.sh +++ b/generate.sh @@ -55,6 +55,6 @@ for filename in "$@"; do fi smt2cxx "${filename}" | transpose | format > theory.cu cp theory.cu "cxx/smt-$(basename ${filename} | tr '.' '-')".cxx - make smt - mv smt "bin/smt-$(basename ${filename} | tr '.' '-')" + make + mv bin/smt "bin/smt-$(basename ${filename} | tr '.' '-')" done