diff --git a/CMakeLists.txt b/CMakeLists.txt index 087e301..02f8304 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,33 @@ cmake_minimum_required(VERSION 3.14) -project(smt LANGUAGES CXX CUDA) +project(smt) + +# Find CUDA +find_package(CUDA 11) +enable_language(CUDA) +set(ENV{CUDA_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) +set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc) set(CMAKE_CUDA_ARCHETECTURES 35 50 72) +if (NOT DEFINED USEDLTO) + set(USEDLTO ON) +endif() + +message("") +message("CUDA_VERSION ${CUDA_VERSION}") +message("USEDLTO ${USEDLTO}") +message("CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}") +message("CMAKE_CUDA_COMPILER ${CMAKE_CUDA_COMPILER}") +message("CMAKE_CUDA_ARCHETECTURES ${CMAKE_CUDA_ARCHETECTURES}") +message("") + +if (${CUDA_VERSION_MAJOR} EQUAL 11 AND ${USEDLTO}) + set(DLTO -dlto) +endif () + +set(NVCC_FLAGS -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) + +# SMTLIB add_library(smtlib STATIC SMTLIB/Core.cu SMTLIB/Logger.cu @@ -12,18 +37,18 @@ add_library(smtlib STATIC SMTLIB/NativeBitVector.cu) target_include_directories (smtlib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_features(smtlib PUBLIC cxx_std_11) +set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72) +target_compile_options(smtlib PUBLIC ${NVCC_FLAGS}) -add_executable(smt smt.cu smt.h theory.h theory.cu aes.cu cuda_aes.h) +# SMT +add_executable(smt smt.cu smt.h theory.h theory.cu) target_include_directories (smt PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(smt LINK_PUBLIC smtlib) - set_property(TARGET smt PROPERTY CUDA_ARCHITECTURES 35 50 72) -set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72) set_property(TARGET smt PROPERTY CUDA_SEPARABLE_COMPILATION ON) +target_compile_options(smt PUBLIC ${NVCC_FLAGS}) -target_compile_options(smt PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) -target_compile_options(smtlib PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc) - +# Move executable after build add_custom_command(TARGET smt POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ../bin/) add_custom_command(TARGET smt diff --git a/Makefile b/Makefile index b5a402c..ff06e18 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: cmake -Bbuild -GNinja . - cmake --build build -j $(shell nproc) + cmake --build build SMTLIB/%.o: SMTLIB/%.cu $(MAKE) -C SMTLIB @@ -26,4 +26,4 @@ smt: theory.o smt.o aes.o $(smtlib-objs) clean: rm -f *.o SMTLIB/*.o smt - rm -rf build + rm -rf build bin cxx diff --git a/generate.sh b/generate.sh index ac591ba..5c51f2f 100755 --- a/generate.sh +++ b/generate.sh @@ -55,6 +55,6 @@ for filename in "$@"; do fi smt2cxx "${filename}" | transpose | format > theory.cu cp theory.cu "cxx/smt-$(basename ${filename} | tr '.' '-')".cxx - make smt - mv smt "bin/smt-$(basename ${filename} | tr '.' '-')" + make + mv bin/smt "bin/smt-$(basename ${filename} | tr '.' '-')" done diff --git a/smt.cu b/smt.cu index 6e5df17..8cd14f6 100644 --- a/smt.cu +++ b/smt.cu @@ -1,6 +1,7 @@ #include #include #include +#include "curand_kernel.h" #include "SMTLIB/Float.h" #include "SMTLIB/BufferRef.h" #include "cuda_aes.h" @@ -32,37 +33,39 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) { } // Note: size is the *unpadded* size of the input vars -__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { +//__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) { +__global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) { int bindex = blockIdx.x * blockDim.x + threadIdx.x; - int64_t padded = aes_pad(size); - uint64_t offset = bindex * padded; - extern __shared__ uint8_t sdata[]; + int offset = bindex * size; - // Get our local chunk - int soff = threadIdx.x * padded; + int seed = bindex*37; + curand_init(seed, bindex, 0, &state[bindex]); + curandState localState = state[bindex]; - // First time initialize block to i - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; - } + extern __shared__ uint8_t sdata[]; + int soff = threadIdx.x * size; + uint8_t *data = sdata + soff; while (!solved) { atomicAdd(execs, 1); // Randomize input for our slice - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - encrypt_one_table(sdata+soff, key, i); + uint8_t* curr = data; + //TODO: once we confirm 16bytes and we generate 8bytes, replace loop with writes + while (curr < data + size) + { + *curr++ = curand(&localState); //TODO: i think this is 8bytes but not sure, alternative is uint4 } - if (LLVMFuzzerTestOneInput(sdata+soff, size)) { + + if (LLVMFuzzerTestOneInput(data, size)) { *gobuf = bindex; memcpy(in_data+offset, sdata+soff, size); solved = 1; } // Add increment to randomize (I hope?) - for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { - *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; - } + //for (int i = 0; i < padded; i += AES_BLOCK_SIZE) { + // *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i; + //} } - return; } void CUDART_CB finishedCB(void *data) { @@ -76,22 +79,12 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g uint64_t *gobuf; unsigned long long *gexecs; - int64_t padded = aes_pad(varsize); - printf("Padding varsize from %d to %ld\n", varsize, padded); - unsigned char ckey[AES_BLOCK_SIZE]; - FILE *rng = fopen("/dev/urandom","rb"); - fread(ckey, AES_BLOCK_SIZE, 1, rng); - fclose(rng); - - // Pre-expand the round keys and copy to device mem - uint8_t rkey[176]; - const uint8_t *drkey; - expand_key(ckey, rkey); - gpuErrchk(cudaMalloc(&drkey, 176)); - gpuErrchk(cudaMemcpy((uint8_t *)drkey, rkey, sizeof(uint8_t) * 176, cudaMemcpyHostToDevice)); + int size = varsize; // i think? + curandState *rngStates; + gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState))); // Alloc GPU buffers - gpuErrchk(cudaMalloc(&gbuf, padded*N*M)); + gpuErrchk(cudaMalloc(&gbuf, size*N*M)); gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t))); gpuErrchk(cudaMalloc(&gexecs, sizeof(unsigned long long))); @@ -105,8 +98,7 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g int *dev = (int *)malloc(sizeof(int)); *dev = device + 1; printf("Launching kernel on GPU%d...\n", device); - fuzz<<>>(gbuf, varsize, drkey, gobuf, gexecs); - gpuErrchk(cudaPeekAtLastError()); + fuzz<<>>(gbuf, varsize, rngStates, gobuf, gexecs); gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev)); } @@ -143,11 +135,10 @@ int main(int argc, char **argv) { // Get and print output - int64_t padded = aes_pad(varsize); - uint8_t *buf = (uint8_t *)malloc(padded); + uint8_t *buf = (uint8_t*)malloc(varsize); uint64_t oindex; gpuErrchk(cudaMemcpy(&oindex, gobuf[i], sizeof(uint64_t), cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*padded), padded, cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*varsize), varsize, cudaMemcpyDeviceToHost)); printf("Found a satisfying assignment on device %d thread %lu:\n", i, oindex); for (int k = 0; k < varsize; k++) printf("%02x", buf[k]); printf("\n"); }