From b89715f4e38da968c7a9aa81806bc1ac5bd82e89 Mon Sep 17 00:00:00 2001
From: Nobel Gautam <nobel.gautam@gmail.com>
Date: Fri, 4 Dec 2020 15:35:38 -0500
Subject: [PATCH 1/6] chane aes to curand wip

---
 smt.cu | 57 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/smt.cu b/smt.cu
index 6e5df17..ab9d4a2 100644
--- a/smt.cu
+++ b/smt.cu
@@ -1,6 +1,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sched.h>
+#include "curand_kernel.h"
 #include "SMTLIB/Float.h"
 #include "SMTLIB/BufferRef.h"
 #include "cuda_aes.h"
@@ -32,27 +33,25 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) {
 }
 
 // Note: size is the *unpadded* size of the input vars
-__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
+//__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
+__global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) {
   int bindex = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t padded = aes_pad(size);
-  uint64_t offset = bindex * padded;
-  extern __shared__ uint8_t sdata[];
 
-  // Get our local chunk
-  int soff = threadIdx.x * padded;
+  uint8_t *data = in_data + bindex*size; // i think?
 
-  // First time initialize block to i
-  for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-    *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
-  }
+  curandState localState = state[bindex];
 
   while (!solved) {
     atomicAdd(execs, 1);
     // Randomize input for our slice
-    for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-      encrypt_one_table(sdata+soff, key, i);
+    uint8_t* curr = data;
+    //TODO: once we confirm 16bytes and we generate 8bytes, replace loop with writes
+    while (curr < data + size)
+    {
+        *curr++ = curand(&localState); //TODO: i think this is 8bytes but not sure, alternative is uint4
     }
-    if (LLVMFuzzerTestOneInput(sdata+soff, size)) {
+
+    if (LLVMFuzzerTestOneInput(data, size)) {
       *gobuf = bindex;
       memcpy(in_data+offset, sdata+soff, size);
       solved = 1;
@@ -69,6 +68,13 @@ void CUDART_CB finishedCB(void *data) {
   finished_dev = *(int *)data;
 }
 
+__global__ void setup_kernel(curandState *state)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    int seed = id*37;
+    curand_init(seed, id, 0, &state[id]);
+}
+
 void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_gobuf, unsigned long long **ret_execs) {
   cudaSetDevice(device);
 
@@ -76,22 +82,14 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   uint64_t *gobuf;
   unsigned long long *gexecs;
 
-  int64_t padded = aes_pad(varsize);
-  printf("Padding varsize from %d to %ld\n", varsize, padded);
-  unsigned char ckey[AES_BLOCK_SIZE];
-  FILE *rng = fopen("/dev/urandom","rb");
-  fread(ckey, AES_BLOCK_SIZE, 1, rng);
-  fclose(rng);
-
-  // Pre-expand the round keys and copy to device mem
-  uint8_t rkey[176];
-  const uint8_t *drkey;
-  expand_key(ckey, rkey);
-  gpuErrchk(cudaMalloc(&drkey, 176));
-  gpuErrchk(cudaMemcpy((uint8_t *)drkey, rkey, sizeof(uint8_t) * 176, cudaMemcpyHostToDevice));
+  int size = varsize; // i think?
+  curandState *rngStates;
+  gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState)));
+
+  setup_kernel<<<M,N>>>(rngStates);
 
   // Alloc GPU buffers
-  gpuErrchk(cudaMalloc(&gbuf, padded*N*M));
+  gpuErrchk(cudaMalloc(&gbuf, size*N*M));
   gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t)));
   gpuErrchk(cudaMalloc(&gexecs, sizeof(unsigned long long)));
 
@@ -105,9 +103,10 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   int *dev = (int *)malloc(sizeof(int));
   *dev = device + 1;
   printf("Launching kernel on GPU%d...\n", device);
-  fuzz<<<M,N,N*padded,stream>>>(gbuf, varsize, drkey, gobuf, gexecs);
-  gpuErrchk(cudaPeekAtLastError());
+  //fuzz<<<M,N,0,stream>>>(gbuf, varsize, rnd, gobuf, gexecs);
+  fuzz<<<M,N,0,stream>>>(gbuf, varsize, rngStates, gobuf, gexecs);
   gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev));
+  //gpuErrchk(curandDestroyGenerator(gen));
 }
 
 int main(int argc, char **argv) {

From 5b78c6084263c29378e1ac6aab80e92bc77e8d54 Mon Sep 17 00:00:00 2001
From: Nobel Gautam <nobel.gautam@gmail.com>
Date: Fri, 4 Dec 2020 16:40:18 -0500
Subject: [PATCH 2/6] dont need curand init kernel oops

---
 smt.cu | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/smt.cu b/smt.cu
index ab9d4a2..15e933d 100644
--- a/smt.cu
+++ b/smt.cu
@@ -37,8 +37,9 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) {
 __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) {
   int bindex = blockIdx.x * blockDim.x + threadIdx.x;
 
+  int seed = bindex*37;
+  curand_init(seed, bindex, 0, &state[bindex]);
   uint8_t *data = in_data + bindex*size; // i think?
-
   curandState localState = state[bindex];
 
   while (!solved) {
@@ -68,13 +69,6 @@ void CUDART_CB finishedCB(void *data) {
   finished_dev = *(int *)data;
 }
 
-__global__ void setup_kernel(curandState *state)
-{
-    int id = threadIdx.x + blockIdx.x * blockDim.x;
-    int seed = id*37;
-    curand_init(seed, id, 0, &state[id]);
-}
-
 void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_gobuf, unsigned long long **ret_execs) {
   cudaSetDevice(device);
 
@@ -86,8 +80,6 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   curandState *rngStates;
   gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState)));
 
-  setup_kernel<<<M,N>>>(rngStates);
-
   // Alloc GPU buffers
   gpuErrchk(cudaMalloc(&gbuf, size*N*M));
   gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t)));

From a17288a1220d886507057729a63d1086f34342a2 Mon Sep 17 00:00:00 2001
From: John McCann Cunniff Jr <johncunniff1248@gmail.com>
Date: Mon, 7 Dec 2020 23:41:24 -0500
Subject: [PATCH 3/6] catch up with main

---
 smt.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/smt.cu b/smt.cu
index 15e933d..800bd70 100644
--- a/smt.cu
+++ b/smt.cu
@@ -36,11 +36,15 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) {
 //__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
 __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) {
   int bindex = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t padded = aes_pad(size);
+  uint64_t offset = bindex * padded;
+  int soff = threadIdx.x * padded;
 
   int seed = bindex*37;
   curand_init(seed, bindex, 0, &state[bindex]);
   uint8_t *data = in_data + bindex*size; // i think?
   curandState localState = state[bindex];
+  extern __shared__ uint8_t sdata[];
 
   while (!solved) {
     atomicAdd(execs, 1);
@@ -62,7 +66,6 @@ __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t
       *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
     }
   }
-  return;
 }
 
 void CUDART_CB finishedCB(void *data) {

From 407881be60a99c405e3f18827eb3d4efdf1c65b3 Mon Sep 17 00:00:00 2001
From: Nobel Gautam <nobel.gautam@gmail.com>
Date: Thu, 10 Dec 2020 04:07:04 -0500
Subject: [PATCH 4/6] report correct buffer on exit

---
 smt.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/smt.cu b/smt.cu
index 800bd70..25e31e3 100644
--- a/smt.cu
+++ b/smt.cu
@@ -98,10 +98,8 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   int *dev = (int *)malloc(sizeof(int));
   *dev = device + 1;
   printf("Launching kernel on GPU%d...\n", device);
-  //fuzz<<<M,N,0,stream>>>(gbuf, varsize, rnd, gobuf, gexecs);
   fuzz<<<M,N,0,stream>>>(gbuf, varsize, rngStates, gobuf, gexecs);
   gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev));
-  //gpuErrchk(curandDestroyGenerator(gen));
 }
 
 int main(int argc, char **argv) {
@@ -137,11 +135,10 @@ int main(int argc, char **argv) {
 
 
   // Get and print output
-  int64_t padded = aes_pad(varsize);
-  uint8_t *buf = (uint8_t *)malloc(padded);
+  uint8_t *buf = (uint8_t*)malloc(varsize);
   uint64_t oindex;
   gpuErrchk(cudaMemcpy(&oindex, gobuf[i], sizeof(uint64_t), cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*padded), padded, cudaMemcpyDeviceToHost));
+  gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*varsize), varsize, cudaMemcpyDeviceToHost));
   printf("Found a satisfying assignment on device %d thread %lu:\n", i, oindex);
   for (int k = 0; k < varsize; k++) printf("%02x", buf[k]); printf("\n");
 }

From 0199d74fc442e4885f02c30d576655687c48b17f Mon Sep 17 00:00:00 2001
From: Nobel Gautam <nobel.gautam@gmail.com>
Date: Thu, 10 Dec 2020 07:24:26 -0500
Subject: [PATCH 5/6] use shared data

---
 smt.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/smt.cu b/smt.cu
index 25e31e3..8cd14f6 100644
--- a/smt.cu
+++ b/smt.cu
@@ -36,15 +36,15 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) {
 //__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
 __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) {
   int bindex = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t padded = aes_pad(size);
-  uint64_t offset = bindex * padded;
-  int soff = threadIdx.x * padded;
+  int offset = bindex * size;
 
   int seed = bindex*37;
   curand_init(seed, bindex, 0, &state[bindex]);
-  uint8_t *data = in_data + bindex*size; // i think?
   curandState localState = state[bindex];
+
   extern __shared__ uint8_t sdata[];
+  int soff = threadIdx.x * size;
+  uint8_t *data = sdata + soff;
 
   while (!solved) {
     atomicAdd(execs, 1);
@@ -62,9 +62,9 @@ __global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t
       solved = 1;
     }
     // Add increment to randomize (I hope?)
-    for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-      *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
-    }
+    //for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
+    //  *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
+    //}
   }
 }
 
@@ -98,7 +98,7 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   int *dev = (int *)malloc(sizeof(int));
   *dev = device + 1;
   printf("Launching kernel on GPU%d...\n", device);
-  fuzz<<<M,N,0,stream>>>(gbuf, varsize, rngStates, gobuf, gexecs);
+  fuzz<<<M,N,N*size,stream>>>(gbuf, varsize, rngStates, gobuf, gexecs);
   gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev));
 }
 

From 0cb03f19766aad5cf8c0c9aa5d1673a8ca756b4d Mon Sep 17 00:00:00 2001
From: John McCann Cunniff Jr <johncunniff1248@gmail.com>
Date: Thu, 10 Dec 2020 16:59:25 -0500
Subject: [PATCH 6/6] minor cmake build improvements

---
 CMakeLists.txt | 39 ++++++++++++++++++++++++++++++++-------
 Makefile       |  4 ++--
 generate.sh    |  4 ++--
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 087e301..02f8304 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,33 @@
 cmake_minimum_required(VERSION 3.14)
-project(smt LANGUAGES CXX CUDA)
+project(smt)
 
+
+# Find CUDA
+find_package(CUDA 11)
+enable_language(CUDA)
+set(ENV{CUDA_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
+set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
 set(CMAKE_CUDA_ARCHETECTURES 35 50 72)
 
+if (NOT DEFINED USEDLTO)
+  set(USEDLTO ON)
+endif()
+
+message("")
+message("CUDA_VERSION              ${CUDA_VERSION}")
+message("USEDLTO                   ${USEDLTO}")
+message("CUDA_PATH                 ${CUDA_TOOLKIT_ROOT_DIR}")
+message("CMAKE_CUDA_COMPILER       ${CMAKE_CUDA_COMPILER}")
+message("CMAKE_CUDA_ARCHETECTURES  ${CMAKE_CUDA_ARCHETECTURES}")
+message("")
+
+if (${CUDA_VERSION_MAJOR} EQUAL 11 AND ${USEDLTO})
+    set(DLTO -dlto)
+endif ()
+
+set(NVCC_FLAGS -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
+
+# SMTLIB
 add_library(smtlib STATIC
   SMTLIB/Core.cu
   SMTLIB/Logger.cu
@@ -12,18 +37,18 @@ add_library(smtlib STATIC
   SMTLIB/NativeBitVector.cu)
 target_include_directories (smtlib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_compile_features(smtlib PUBLIC cxx_std_11)
+set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72)
+target_compile_options(smtlib PUBLIC ${NVCC_FLAGS})
 
-add_executable(smt smt.cu smt.h theory.h theory.cu aes.cu cuda_aes.h)
+# SMT
+add_executable(smt smt.cu smt.h theory.h theory.cu)
 target_include_directories (smt PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(smt LINK_PUBLIC smtlib)
-
 set_property(TARGET smt PROPERTY CUDA_ARCHITECTURES 35 50 72)
-set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72)
 set_property(TARGET smt PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+target_compile_options(smt PUBLIC ${NVCC_FLAGS})
 
-target_compile_options(smt PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
-target_compile_options(smtlib PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
-
+# Move executable after build
 add_custom_command(TARGET smt
   POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ../bin/)
 add_custom_command(TARGET smt
diff --git a/Makefile b/Makefile
index b5a402c..ff06e18 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 
 all:
 	cmake -Bbuild -GNinja .
-	cmake --build build -j $(shell nproc)
+	cmake --build build
 
 SMTLIB/%.o: SMTLIB/%.cu
 	$(MAKE) -C SMTLIB
@@ -26,4 +26,4 @@ smt: theory.o smt.o aes.o $(smtlib-objs)
 
 clean:
 	rm -f *.o SMTLIB/*.o smt
-	rm -rf build
+	rm -rf build bin cxx
diff --git a/generate.sh b/generate.sh
index ac591ba..5c51f2f 100755
--- a/generate.sh
+++ b/generate.sh
@@ -55,6 +55,6 @@ for filename in "$@"; do
     fi
     smt2cxx "${filename}" | transpose | format > theory.cu
     cp theory.cu "cxx/smt-$(basename ${filename} | tr '.' '-')".cxx
-    make smt
-    mv smt "bin/smt-$(basename ${filename} | tr '.' '-')"
+    make
+    mv bin/smt "bin/smt-$(basename ${filename} | tr '.' '-')"
 done