diff --git a/CMakeLists.txt b/CMakeLists.txt
index 087e301..02f8304 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,33 @@
 cmake_minimum_required(VERSION 3.14)
-project(smt LANGUAGES CXX CUDA)
+project(smt)
 
+
+# Find CUDA
+find_package(CUDA 11)
+enable_language(CUDA)
+set(ENV{CUDA_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
+set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
 set(CMAKE_CUDA_ARCHETECTURES 35 50 72)
 
+if (NOT DEFINED USEDLTO)
+  set(USEDLTO ON)
+endif()
+
+message("")
+message("CUDA_VERSION              ${CUDA_VERSION}")
+message("USEDLTO                   ${USEDLTO}")
+message("CUDA_PATH                 ${CUDA_TOOLKIT_ROOT_DIR}")
+message("CMAKE_CUDA_COMPILER       ${CMAKE_CUDA_COMPILER}")
+message("CMAKE_CUDA_ARCHETECTURES  ${CMAKE_CUDA_ARCHETECTURES}")
+message("")
+
+if (${CUDA_VERSION_MAJOR} EQUAL 11 AND ${USEDLTO})
+    set(DLTO -dlto)
+endif ()
+
+set(NVCC_FLAGS -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
+
+# SMTLIB
 add_library(smtlib STATIC
   SMTLIB/Core.cu
   SMTLIB/Logger.cu
@@ -12,18 +37,18 @@ add_library(smtlib STATIC
   SMTLIB/NativeBitVector.cu)
 target_include_directories (smtlib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_compile_features(smtlib PUBLIC cxx_std_11)
+set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72)
+target_compile_options(smtlib PUBLIC ${NVCC_FLAGS})
 
-add_executable(smt smt.cu smt.h theory.h theory.cu aes.cu cuda_aes.h)
+# SMT
+add_executable(smt smt.cu smt.h theory.h theory.cu)
 target_include_directories (smt PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(smt LINK_PUBLIC smtlib)
-
 set_property(TARGET smt PROPERTY CUDA_ARCHITECTURES 35 50 72)
-set_property(TARGET smtlib PROPERTY CUDA_ARCHITECTURES 35 50 72)
 set_property(TARGET smt PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+target_compile_options(smt PUBLIC ${NVCC_FLAGS})
 
-target_compile_options(smt PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
-target_compile_options(smtlib PUBLIC --expt-relaxed-constexpr -DJFS_RUNTIME_FAILURE_CALLS_ABORT -dc)
-
+# Move executable after build
 add_custom_command(TARGET smt
   POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ../bin/)
 add_custom_command(TARGET smt
diff --git a/Makefile b/Makefile
index b5a402c..ff06e18 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 
 all:
 	cmake -Bbuild -GNinja .
-	cmake --build build -j $(shell nproc)
+	cmake --build build
 
 SMTLIB/%.o: SMTLIB/%.cu
 	$(MAKE) -C SMTLIB
@@ -26,4 +26,4 @@ smt: theory.o smt.o aes.o $(smtlib-objs)
 
 clean:
 	rm -f *.o SMTLIB/*.o smt
-	rm -rf build
+	rm -rf build bin cxx
diff --git a/generate.sh b/generate.sh
index ac591ba..5c51f2f 100755
--- a/generate.sh
+++ b/generate.sh
@@ -55,6 +55,6 @@ for filename in "$@"; do
     fi
     smt2cxx "${filename}" | transpose | format > theory.cu
     cp theory.cu "cxx/smt-$(basename ${filename} | tr '.' '-')".cxx
-    make smt
-    mv smt "bin/smt-$(basename ${filename} | tr '.' '-')"
+    make
+    mv bin/smt "bin/smt-$(basename ${filename} | tr '.' '-')"
 done
diff --git a/smt.cu b/smt.cu
index 6e5df17..8cd14f6 100644
--- a/smt.cu
+++ b/smt.cu
@@ -1,6 +1,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <sched.h>
+#include "curand_kernel.h"
 #include "SMTLIB/Float.h"
 #include "SMTLIB/BufferRef.h"
 #include "cuda_aes.h"
@@ -32,37 +33,39 @@ __host__ __device__ inline int64_t aes_pad(int64_t num) {
 }
 
 // Note: size is the *unpadded* size of the input vars
-__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
+//__global__ void fuzz(uint8_t *in_data, size_t size, const uint8_t *key, uint64_t *gobuf, unsigned long long *execs) {
+__global__ void fuzz(uint8_t *in_data, size_t size, curandState *state, uint64_t *gobuf, unsigned long long *execs) {
   int bindex = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t padded = aes_pad(size);
-  uint64_t offset = bindex * padded;
-  extern __shared__ uint8_t sdata[];
+  int offset = bindex * size;
 
-  // Get our local chunk
-  int soff = threadIdx.x * padded;
+  int seed = bindex*37;
+  curand_init(seed, bindex, 0, &state[bindex]);
+  curandState localState = state[bindex];
 
-  // First time initialize block to i
-  for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-    *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
-  }
+  extern __shared__ uint8_t sdata[];
+  int soff = threadIdx.x * size;
+  uint8_t *data = sdata + soff;
 
   while (!solved) {
     atomicAdd(execs, 1);
     // Randomize input for our slice
-    for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-      encrypt_one_table(sdata+soff, key, i);
+    uint8_t* curr = data;
+    //TODO: once we confirm 16bytes and we generate 8bytes, replace loop with writes
+    while (curr < data + size)
+    {
+        *curr++ = curand(&localState); //TODO: i think this is 8bytes but not sure, alternative is uint4
     }
-    if (LLVMFuzzerTestOneInput(sdata+soff, size)) {
+
+    if (LLVMFuzzerTestOneInput(data, size)) {
       *gobuf = bindex;
       memcpy(in_data+offset, sdata+soff, size);
       solved = 1;
     }
     // Add increment to randomize (I hope?)
-    for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
-      *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
-    }
+    //for (int i = 0; i < padded; i += AES_BLOCK_SIZE) {
+    //  *(uint64_t *)(sdata+soff+i) = bindex * (padded/AES_BLOCK_SIZE) + i;
+    //}
   }
-  return;
 }
 
 void CUDART_CB finishedCB(void *data) {
@@ -76,22 +79,12 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   uint64_t *gobuf;
   unsigned long long *gexecs;
 
-  int64_t padded = aes_pad(varsize);
-  printf("Padding varsize from %d to %ld\n", varsize, padded);
-  unsigned char ckey[AES_BLOCK_SIZE];
-  FILE *rng = fopen("/dev/urandom","rb");
-  fread(ckey, AES_BLOCK_SIZE, 1, rng);
-  fclose(rng);
-
-  // Pre-expand the round keys and copy to device mem
-  uint8_t rkey[176];
-  const uint8_t *drkey;
-  expand_key(ckey, rkey);
-  gpuErrchk(cudaMalloc(&drkey, 176));
-  gpuErrchk(cudaMemcpy((uint8_t *)drkey, rkey, sizeof(uint8_t) * 176, cudaMemcpyHostToDevice));
+  int size = varsize; // i think?
+  curandState *rngStates;
+  gpuErrchk(cudaMalloc(&rngStates, N*M*sizeof(curandState)));
 
   // Alloc GPU buffers
-  gpuErrchk(cudaMalloc(&gbuf, padded*N*M));
+  gpuErrchk(cudaMalloc(&gbuf, size*N*M));
   gpuErrchk(cudaMalloc(&gobuf, sizeof(uint64_t)));
   gpuErrchk(cudaMalloc(&gexecs, sizeof(unsigned long long)));
 
@@ -105,8 +98,7 @@ void launch_kernel(int device, int varsize, uint8_t **ret_gbuf, uint64_t **ret_g
   int *dev = (int *)malloc(sizeof(int));
   *dev = device + 1;
   printf("Launching kernel on GPU%d...\n", device);
-  fuzz<<<M,N,N*padded,stream>>>(gbuf, varsize, drkey, gobuf, gexecs);
-  gpuErrchk(cudaPeekAtLastError());
+  fuzz<<<M,N,N*size,stream>>>(gbuf, varsize, rngStates, gobuf, gexecs);
   gpuErrchk(cudaLaunchHostFunc(stream, finishedCB, dev));
 }
 
@@ -143,11 +135,10 @@ int main(int argc, char **argv) {
 
 
   // Get and print output
-  int64_t padded = aes_pad(varsize);
-  uint8_t *buf = (uint8_t *)malloc(padded);
+  uint8_t *buf = (uint8_t*)malloc(varsize);
   uint64_t oindex;
   gpuErrchk(cudaMemcpy(&oindex, gobuf[i], sizeof(uint64_t), cudaMemcpyDeviceToHost));
-  gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*padded), padded, cudaMemcpyDeviceToHost));
+  gpuErrchk(cudaMemcpy(buf, gbuf[i]+(oindex*varsize), varsize, cudaMemcpyDeviceToHost));
   printf("Found a satisfying assignment on device %d thread %lu:\n", i, oindex);
   for (int k = 0; k < varsize; k++) printf("%02x", buf[k]); printf("\n");
 }