diff --git a/.github/workflows/sst_integration.yml b/.github/workflows/sst_integration.yml new file mode 100644 index 000000000..03635db64 --- /dev/null +++ b/.github/workflows/sst_integration.yml @@ -0,0 +1,80 @@ +# Workflow with cmake build system +name: SST Integration Test + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the mydev branch + push: + branches-ignore: + - "gh-readonly-queue**" + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-QV100: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + matrix: + # test_type: [simple, medium, long] + test_type: [simple, medium] + container: + image: tgrogers/accel-sim_regress:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly + env: + CONFIG: QV100 + GPU_ARCH: sm_70 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Build GPGPU-Sim in SST mode + run: | + source ./setup_environment + make -j4 + - name: Prepare SST dependencies + run: | + apt install -y openmpi-bin openmpi-common libtool libtool-bin autoconf python3 python3-dev automake build-essential git + # Use personal repo for now + - name: Build SST-Core + run: | + git clone https://github.com/William-An/sst-core.git + cd sst-core + git pull + git checkout devel + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j4 + make install + cd .. + rm -rf ./sst-core + # Use personal repo for now + - name: Build SST-Elements + run: | + git clone https://github.com/William-An/sst-elements.git + source ./setup_environment + cd sst-elements + git pull + git checkout balar-mmio-vanadis-llvm + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j4 + make install + # Have to resource the gpu app + # Also fake a SDK since rodinia 2.0 does not need this, speed things up on github + - name: Balar Test + run: | + pip install testtools blessings pygments + source ./setup_environment + mkdir 4.2 + mkdir fake_sdk + export NVIDIA_COMPUTE_SDK_LOCATION=$(readlink -f ./fake_sdk) + source $GPUAPPS_ROOT/src/setup_environment sst + rm -rf 4.2 + rm -f gpucomputingsdk_4.2.9_linux.run + ./sstcore-install/bin/sst-test-elements -w "*balar*${{ matrix.test_type }}*" \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index f6676bf14..4ef467bae 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,6 +87,41 @@ pipeline { ssh tgrogers@dynamo.ecn.purdue.edu "cd $PLOTDIR && rm -rf latest && cp -r ${BUILD_NUMBER} latest"' } } + stage('sst-core-build') { + steps { + sh 'rm -rf sstcore-install' + sh 'rm -rf sst-core && git clone git@github.com:sstsimulator/sst-core.git' + sh '''#!/bin/bash + cd sst-core + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j 10 + make install''' + } + } + stage('sst-elements-build') { + steps { + sh 'rm -rf sstelements-install' + sh 'rm -rf sst-elements && git clone git@github.com:sstsimulator/sst-elements.git' + // First sourcing the env_setup and setup_environment script for env vars + sh '''#!/bin/bash + 
source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment + cd sst-elements + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j 10 + make install''' + } + } + stage('sst balar test') { + steps { + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment sst + ./sstcore-install/bin/sst-test-elements -p ./sst-elements/src/sst/elements/balar/tests''' + } + } } post { success { diff --git a/Makefile b/Makefile index 82ea39928..37dba0146 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ INTERSIM ?= intersim2 include version_detection.mk +# Check for debug ifeq ($(GPGPUSIM_CONFIG), gcc-$(CC_VERSION)/cuda-$(CUDART_VERSION)/debug) export DEBUG=1 else @@ -168,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi + if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib g++ -dynamiclib -Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1\ diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h index d292e224e..e620e5728 100644 --- a/libcuda/cuda_api_object.h +++ b/libcuda/cuda_api_object.h @@ -1,6 +1,7 @@ #ifndef __cuda_api_object_h__ #define __cuda_api_object_h__ +#include #include #include #include @@ -193,9 +194,25 @@ class cuda_runtime_api { // backward pointer class gpgpu_context *gpgpu_ctx; // member function list + + // For SST and other potential simulator interface + void cuobjdumpInit(const char *fn); + void extract_code_using_cuobjdump(const char *fn); + void extract_ptx_files_using_cuobjdump(CUctx_st *context, const char *fn); + + // For running GPGPUSim alone void cuobjdumpInit(); void extract_code_using_cuobjdump(); void extract_ptx_files_using_cuobjdump(CUctx_st *context); + + // Internal functions for the above public methods + void cuobjdumpInit_internal(std::function ctx_extract_code_func); + void extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func); + void extract_ptx_files_using_cuobjdump_internal(CUctx_st *context, + std::string &app_binary); + std::list pruneSectionList(CUctx_st *context); std::list mergeMatchingSections(std::string identifier); std::list mergeSections(); diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index b540ffd91..a98411f1a 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -109,6 +109,7 @@ #include #include #include +#include #include #include #include @@ -151,6 +152,9 @@ #include #endif +// SST cycle +extern bool SST_Cycle(); + /*DEVICE_BUILTIN*/ struct cudaArray { void *devPtr; @@ -412,6 +416,13 @@ void setCuobjdumpsassfilename( //! processes (e.g. cuobjdump) reading /proc//exe will see the emulator //! executable instead of the application binary. //! 
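// (Annotation) Under SST the /proc/self/exe trick described above resolves to
// the simulator process rather than the CUDA application, so the overload that
// follows takes the binary path explicitly. More generally, the
// cuda_api_object.h changes above split each cuobjdump entry point into a
// public wrapper plus an *_internal worker taking a std::function callback, so
// the SST path (handed the binary name by Balar) and the stand-alone path
// (reading /proc/self/exe) share one implementation. A minimal sketch of that
// pattern as it appears later in this patch:
//
//   void cuda_runtime_api::cuobjdumpInit(const char *fn) {   // SST entry
//     cuobjdumpInit_internal([=]() { extract_code_using_cuobjdump(fn); });
//   }
//   void cuda_runtime_api::cuobjdumpInit() {                 // native entry
//     cuobjdumpInit_internal([=]() { extract_code_using_cuobjdump(); });
//   }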
+// In SST need the string to pass the binary information +// as we cannot get it from /proc/self/exe +std::string get_app_binary(const char *fn) { + printf("self exe links to: %s\n", fn); + return fn; +} + std::string get_app_binary() { char self_exe_path[1025]; #ifdef __APPLE__ @@ -453,19 +464,25 @@ char *get_app_binary_name(std::string abs_path) { return self_exe_path; } -static int get_app_cuda_version() { +static int get_app_cuda_version_internal(std::string app_binary) { int app_cuda_version = 0; char fname[1024]; snprintf(fname, 1024, "_app_cuda_version_XXXXXX"); int fd = mkstemp(fname); close(fd); + // Weili: Add way to extract CUDA version information from Balar Vanadis + // binary (stored as a const string) std::string app_cuda_version_command = - "ldd " + get_app_binary() + + "ldd " + app_binary + " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + + fname + " && strings " + app_binary + + " | grep libcudart_vanadis.a | sed " + "'s/.*libcudart_vanadis.a.\\(.*\\)/\\1/' >> " + fname; int res = system(app_cuda_version_command.c_str()); if (res == -1) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } FILE *cmd = fopen(fname, "r"); @@ -476,12 +493,24 @@ static int get_app_cuda_version() { } fclose(cmd); if (app_cuda_version == 0) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } return app_cuda_version; } +static int get_app_cuda_version(const char *fn) { + // Use for other simulator integration + std::string app_binary = get_app_binary(fn); + return get_app_cuda_version_internal(app_binary); +} + +static int get_app_cuda_version() { + std::string app_binary = get_app_binary(); + return get_app_cuda_version_internal(app_binary); +} + //! Keep track of the association between filename and cubin handle void cuda_runtime_api::cuobjdumpRegisterFatBinary(unsigned int handle, const char *filename, @@ -574,8 +603,11 @@ __host__ cudaError_t CUDARTAPI cudaDeviceGetLimitInternal( return g_last_cudaError = cudaSuccess; } -void **cudaRegisterFatBinaryInternal(void *fatCubin, - gpgpu_context *gpgpu_ctx = NULL) { +// Internal implementation for cudaRegisterFatBiaryInternal +void **cudaRegisterFatBiaryInternal_impl( + void *fatCubin, gpgpu_context *gpgpu_ctx, std::string &app_binary_path, + int app_cuda_version, + std::function ctx_cuobjdumpInit_func) { gpgpu_context *ctx; if (gpgpu_ctx) { ctx = gpgpu_ctx; @@ -606,11 +638,9 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, // compiled with a newer version of CUDA to run apps compiled with older // versions of CUDA. This is especially useful for PTXPLUS execution. 
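// (Annotation) get_app_cuda_version_internal() above probes the application in
// two ways: `ldd <app>` for a dynamically linked libcudart.so, and
// `strings <app> | grep libcudart_vanadis.a` for statically linked
// Vanadis/RISC-V binaries that embed the runtime version as a constant string.
// The major version it finds is then checked against the simulator just below,
// roughly:
//
//   int app_cuda_version = get_app_cuda_version_internal(app_binary);
//   assert(app_cuda_version == CUDART_VERSION / 1000 &&
//          "The app must be compiled with same major version as the simulator.");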
// Skip cuda version check for pytorch application - std::string app_binary_path = get_app_binary(); int pos = app_binary_path.find("python"); if (pos == std::string::npos) { // Not pytorch app : checking cuda version - int app_cuda_version = get_app_cuda_version(); assert( app_cuda_version == CUDART_VERSION / 1000 && "The app must be compiled with same major version as the simulator."); @@ -661,7 +691,7 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, * then for next calls, only returns the appropriate number */ assert(fat_cubin_handle >= 1); - if (fat_cubin_handle == 1) ctx->api->cuobjdumpInit(); + if (fat_cubin_handle == 1) ctx_cuobjdumpInit_func(ctx); ctx->api->cuobjdumpRegisterFatBinary(fat_cubin_handle, filename, context); return (void **)fat_cubin_handle; @@ -753,6 +783,28 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, #endif } +void **cudaRegisterFatBinaryInternal(const char *fn, void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(fn); + int app_cuda_version = get_app_cuda_version(fn); + auto ctx_cuobjdumpInit = [=](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(fn); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + +void **cudaRegisterFatBinaryInternal(void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(); + int app_cuda_version = get_app_cuda_version(); + auto ctx_cuobjdumpInit = [](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + void cudaRegisterFunctionInternal(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, @@ -1057,6 +1109,24 @@ cudaError_t cudaMallocHostInternal(void **ptr, size_t size, } } +// SST malloc done by vanadis, we just need to record the memory addr +cudaError_t CUDARTAPI cudaMallocHostSSTInternal( + void *addr, size_t size, gpgpu_context *gpgpu_ctx = NULL) { + gpgpu_context *ctx; + if (gpgpu_ctx) { + ctx = gpgpu_ctx; + } else { + ctx = GPGPU_Context(); + } + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + // track pinned memory size allocated in the host so that same amount of + // memory is also allocated in GPU. 
+ ctx->api->pinned_memory_size[addr] = size; + return g_last_cudaError = cudaSuccess; +} + __host__ cudaError_t CUDARTAPI cudaMallocPitchInternal(void **devPtr, size_t *pitch, size_t width, size_t height, gpgpu_context *gpgpu_ctx = NULL) { @@ -2301,13 +2371,78 @@ cudaDeviceSynchronizeInternal(gpgpu_context *gpgpu_ctx = NULL) { * * *******************************************************************************/ -extern "C" { - /******************************************************************************* * * - * * + * SST Specific functions, used by Balar * * * *******************************************************************************/ + +/** + * @brief Custom function to get CUDA function parameter size and offset + * from PTX parsing result + * + * @param hostFun + * @param index + * @return std::tuple + */ +std::tuple SST_cudaGetParamConfig( + uint64_t hostFun, unsigned index) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + function_info *entry = context->get_kernel((char *)hostFun); + cudaError_t result = cudaSuccess; + size_t size = 0; + unsigned alignment = 0; + if (index >= entry->num_args()) { + result = cudaErrorAssert; + } else { + std::pair p = entry->get_param_config(index); + size = p.first; + alignment = p.second; + } + return std::tuple(result, size, alignment); +} + +extern "C" { +void SST_receive_mem_reply(unsigned core_id, void *mem_req) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + ->SST_receive_mem_reply(core_id, mem_req); + // printf("GPGPU-sim: Recived Request\n"); +} + +bool SST_gpu_core_cycle() { return SST_Cycle(); } + +void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + ->SST_gpgpusim_numcores_equal_check(sst_numcores); +} + +// TODO Weili: Unify SST and GPGPUSim apis? +uint64_t cudaMallocSST(void **devPtr, size_t size) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + void *test_malloc; + test_malloc = (void *)malloc(size); + void **test_malloc2 = &test_malloc; + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + *test_malloc2 = context->get_device()->get_gpgpu()->gpu_malloc(size); + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", size, + (unsigned long long)*test_malloc2); + if (g_debug_execution >= 3) + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", + size, (unsigned long long)*test_malloc2); + return (uint64_t)*test_malloc2; +} + +__host__ cudaError_t CUDARTAPI cudaMallocHostSST(void *addr, size_t size) { + return cudaMallocHostSSTInternal(addr, size); +} + cudaError_t cudaPeekAtLastError(void) { return g_last_cudaError; } __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) { @@ -2534,6 +2669,7 @@ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( * * * * *******************************************************************************/ + __host__ cudaError_t CUDARTAPI cudaMemset(void *mem, int c, size_t count) { return cudaMemsetInternal(mem, c, count); } @@ -2754,11 +2890,48 @@ __host__ const char *CUDARTAPI cudaGetErrorString(cudaError_t error) { return strdup(buf); } +// SST specific cuda apis +// TODO Weili: Unify SST and GPGPUSim apis? 
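// (Annotation) The SST-specific entry points added in this file form the
// C-style surface that Balar drives from SST. A hedged sketch of how a kernel
// launch might flow through them (ordering and argument names are
// illustrative, not taken from Balar):
//
//   void **handle = __cudaRegisterFatBinarySST("vecadd.bin");      // binary path supplied by SST
//   __cudaRegisterFunctionSST((unsigned)(uintptr_t)handle, hostFunAddr, deviceFunName);
//   uint64_t d_buf = cudaMallocSST(nullptr, bytes);                 // returns the GPU address
//   cudaConfigureCallSST(gridDim, blockDim);                        // or the regular cudaConfigureCall
//   auto [err, sz, align] = SST_cudaGetParamConfig(hostFunAddr, 0); // per-argument size/alignment
//   cudaSetupArgumentSST(d_buf, nullptr, sz, /*offset=*/0);
//   cudaLaunchSST(hostFunAddr);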
+// Weili: Use the regular cudaConfigureCall instead in SST, this following +// Weili: function could be removed +__host__ cudaError_t CUDARTAPI cudaConfigureCallSST(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, + cudaStream_t stream = 0) { + printf("From cudaConfigureCallSST\n"); + // weird setting of stream structs + struct CUstream_st value; + struct CUstream_st *s; + s = (CUstream_st *)malloc(sizeof(CUstream_st)); + memcpy(s, &value, sizeof(CUstream_st)); + return cudaConfigureCallInternal(gridDim, blockDim, sharedMem, s); +} + +__host__ cudaError_t CUDARTAPI cudaSetupArgumentSST(uint64_t arg, + uint8_t value[200], + size_t size, + size_t offset) { + void *local_value; + local_value = (void *)malloc(size); + + if (arg) { + memcpy(local_value, (void *)&arg, size); + } else { + memcpy(local_value, value, size); + } + return cudaSetupArgumentInternal(local_value, size, offset); +} + __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) { return cudaSetupArgumentInternal(arg, size, offset); } +// SST specific cuda apis +// TODO Weili: Unify SST and GPGPUSim apis? +__host__ cudaError_t CUDARTAPI cudaLaunchSST(uint64_t hostFun) { + return cudaLaunchInternal((char *)hostFun); +} + __host__ cudaError_t CUDARTAPI cudaLaunch(const char *hostFun) { return cudaLaunchInternal(hostFun); } @@ -2933,6 +3106,27 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) { return cudaThreadSynchronizeInternal(); } +__host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) { + // For SST, perform a one-time check and let SST_Cycle() + // do the polling test and invoke callback to SST + // to signal ThreadSynchonize done + gpgpu_context *ctx = GPGPU_Context(); + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + + // Called on host side + bool thread_sync_done = ctx->synchronize_check(); + g_last_cudaError = cudaSuccess; + if (thread_sync_done) { + // We are already done, so no need to poll for sync done + ctx->requested_synchronize = false; + return cudaSuccess; + } else { + return cudaErrorNotReady; + } +} + int CUDARTAPI __cudaSynchronizeThreads(void **, void *) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -2992,10 +3186,10 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable( // extracts all ptx files from binary and dumps into // prog_name.unique_no.sm_<>.ptx files -void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { +void cuda_runtime_api::extract_ptx_files_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary) { char command[1000]; char *pytorch_bin = getenv("PYTORCH_BIN"); - std::string app_binary = get_app_binary(); char ptx_list_file_name[1024]; snprintf(ptx_list_file_name, 1024, "_cuobjdump_list_ptx_XXXXXX"); @@ -3062,6 +3256,17 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { } } +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context, + const char *fn) { + std::string app_binary = get_app_binary(fn); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { + std::string app_binary = get_app_binary(); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + //! Call cuobjdump to extract everything (-elf -sass -ptx) /*! 
* This Function extract the whole PTX (for all the files) using cuobjdump @@ -3069,13 +3274,12 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { *with each binary in its own file It is also responsible for extracting the *libraries linked to the binary if the option is enabled * */ -void cuda_runtime_api::extract_code_using_cuobjdump() { - CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - +void cuda_runtime_api::extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func) { // prevent the dumping by cuobjdump everytime we execute the code! const char *override_cuobjdump = getenv("CUOBJDUMP_SIM_FILE"); char command[1000]; - std::string app_binary = get_app_binary(); // Running cuobjdump using dynamic link to current process snprintf(command, 1000, "md5sum %s ", app_binary.c_str()); printf("Running md5sum using \"%s\"\n", command); @@ -3090,7 +3294,7 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { // used by ptxas. int result = 0; #if (CUDART_VERSION >= 6000) - extract_ptx_files_using_cuobjdump(context); + ctx_extract_ptx_func(context); return; #endif // TODO: redundant to dump twice. how can it be prevented? @@ -3222,6 +3426,26 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { } } +void cuda_runtime_api::extract_code_using_cuobjdump(const char *fn) { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(fn); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context, fn); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + +void cuda_runtime_api::extract_code_using_cuobjdump() { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + //! Read file into char* // TODO: convert this to C++ streams, will be way cleaner char *readfile(const std::string filename) { @@ -3466,10 +3690,11 @@ cuobjdumpPTXSection *cuda_runtime_api::findPTXSection( } //! Extract the code using cuobjdump and remove unnecessary sections -void cuda_runtime_api::cuobjdumpInit() { +void cuda_runtime_api::cuobjdumpInit_internal( + std::function ctx_extract_code_func) { CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - extract_code_using_cuobjdump(); // extract all the output of cuobjdump to - // _cuobjdump_*.* + ctx_extract_code_func(); // extract all the output of cuobjdump to + // _cuobjdump_*.* const char *pre_load = getenv("CUOBJDUMP_SIM_FILE"); if (pre_load == NULL || strlen(pre_load) == 0) { cuobjdumpSectionList = pruneSectionList(context); @@ -3477,6 +3702,16 @@ void cuda_runtime_api::cuobjdumpInit() { } } +void cuda_runtime_api::cuobjdumpInit(const char *fn) { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(fn); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + +void cuda_runtime_api::cuobjdumpInit() { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + //! 
Either submit PTX for simulation or convert SASS to PTXPlus and submit it void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { CUctx_st *context = GPGPUSim_Context(this); @@ -3587,6 +3822,11 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { extern "C" { +// TODO Weili: Unify SST and GPGPUSim apis? +void **CUDARTAPI __cudaRegisterFatBinarySST(const char *fn) { + return cudaRegisterFatBinaryInternal(fn, NULL); +} + void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3619,6 +3859,15 @@ cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, return g_last_cudaError = cudaSuccess; } +// TODO Weili: Unify SST and GPGPUSim apis? +void CUDARTAPI __cudaRegisterFunctionSST(unsigned fatCubinHandle, + uint64_t hostFun, + char deviceFun[512]) { + cudaRegisterFunctionInternal((void **)fatCubinHandle, (const char *)hostFun, + (char *)deviceFun, NULL, NULL, NULL, NULL, NULL, + NULL); +} + void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, diff --git a/libcuda/gpgpu_context.h b/libcuda/gpgpu_context.h index d0cd7c48c..5ef21177b 100644 --- a/libcuda/gpgpu_context.h +++ b/libcuda/gpgpu_context.h @@ -44,6 +44,9 @@ class gpgpu_context { s_g_pc_to_insn; // a direct mapping from PC to instruction bool debug_tensorcore; + // SST related + bool requested_synchronize = false; + // objects pointers for each file cuda_runtime_api *api; ptxinfo_data *ptxinfo; @@ -54,6 +57,7 @@ class gpgpu_context { ptx_stats *stats; // member function list void synchronize(); + bool synchronize_check(); void exit_simulation(); void print_simulation_time(); int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid); diff --git a/setup_environment b/setup_environment index 871bb593a..6b39b0149 100644 --- a/setup_environment +++ b/setup_environment @@ -44,7 +44,7 @@ if [ $? 
= 1 ]; then return 1; fi -CC_VERSION=`gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]\.[0-9]\.[0-9]$/)) {print $i; exit 0}}}'` +CC_VERSION=`gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]+\.[0-9]+\.[0-9]+$/)) {print $i; exit 0}}}'` CUDA_VERSION_STRING=`$CUDA_INSTALL_PATH/bin/nvcc --version | awk '/release/ {print $5;}' | sed 's/,//'`; export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($1), 10*$2);}'` @@ -60,13 +60,18 @@ if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then export CUOBJDUMP_SIM_FILE=jj fi +# Simple configure, loop through all positional arguments +# Default config +export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -if [ $# = '1' ] ; -then - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 -else - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -fi +for opt in $@ +do + if [[ $opt == 'debug' ]] ; then + # Debug mode + echo -n "enabled debug mode " + export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 + fi +done export QTINC=/usr/include diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 833d33f5c..2fd90c0e5 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1305,7 +1305,12 @@ void function_info::add_param_name_type_size(unsigned index, std::string name, void function_info::add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args) { const void *data = args->m_start; - + if (g_debug_execution >= 3) { + if (args->m_nbytes == 4) + printf("ADD_PARAM_DATA %d\n", *((uint32_t *)data)); + else + printf("ADD_PARAM_DATA %p\n", *((void **)data)); + } bool scratchpad_memory_param = false; // Is this parameter in CUDA shared memory or OpenCL local memory @@ -1746,6 +1751,17 @@ static unsigned get_tex_datasize(const ptx_instruction *pI, ptx_thread_info *thread) { const operand_info &src1 = pI->src1(); // the name of the texture std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + const operand_info &dst = pI->dst(); + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } /* For programs with many streams, textures can be bound and unbound @@ -2285,15 +2301,24 @@ void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, sym_name = g->second; mem_region = global_space; } - if (g_globals.find(hostVar) != g_globals.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = global_space; - } - if (g_constants.find(hostVar) != g_constants.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = const_space; + + // Weili: Only attempt to find symbol as it is a string + // if we could not find it in previously registered variable. + // This will avoid constructing std::string() from hostVar address + // where it is not a string as + // Use of a string naming a variable as the symbol parameter was deprecated in + // CUDA 4.1 and removed in CUDA 5.0. 
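// (Annotation) Example of the two call styles this guard distinguishes,
// assuming a device variable `__device__ int d_val;`:
//
//   cudaMemcpyToSymbol("d_val", &h_val, sizeof(int));  // by-name form, removed in CUDA 5.0
//   cudaMemcpyToSymbol(d_val,  &h_val, sizeof(int));   // by-address form, current API
//
// Only when hostVar was not found among the previously registered symbol
// addresses does the code below fall back to treating it as a character
// string; otherwise constructing a std::string from an arbitrary device-symbol
// address could read garbage.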
+ if (!found_sym) { + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } } if (!found_sym) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 108de9759..843bf0ba7 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -6055,6 +6055,17 @@ void tex_impl(const ptx_instruction *pI, ptx_thread_info *thread) { // to be fetched std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } + unsigned to_type = pI->get_type(); unsigned c_type = pI->get_type2(); fflush(stdout); diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index 139920930..4e500ccb4 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -139,6 +139,22 @@ symbol *symbol_table::lookup(const char *identifier) { return NULL; } +symbol *symbol_table::lookup_by_addr(addr_t addr) { + for (auto it = m_symbols.begin(); it != m_symbols.end(); ++it) { + symbol *sym = it->second; + + // check if symbol has the addr to be found + if ((!sym->is_reg()) && (sym->has_valid_address()) && + (sym->get_address() == addr)) { + return sym; + } + } + if (m_parent) { + return m_parent->lookup_by_addr(addr); + } + return NULL; +} + symbol *symbol_table::add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line) { diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index d253866db..b08a692d8 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -205,6 +205,7 @@ class symbol { const std::string &name() const { return m_name; } const std::string &decl_location() const { return m_decl_location; } const type_info *type() const { return m_type; } + bool has_valid_address() const { return m_address_valid; } addr_t get_address() const { assert(m_is_label || !m_type->get_key().is_reg()); // todo : other assertions @@ -310,6 +311,7 @@ class symbol_table { void set_ptx_version(float ver, unsigned ext); void set_sm_target(const char *target, const char *ext, const char *ext2); symbol *lookup(const char *identifier); + symbol *lookup_by_addr(addr_t addr); std::string get_scope_name() const { return m_scope_name; } symbol *add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line); diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h index f0c26efc8..8eec922e4 100644 --- a/src/cuda-sim/ptx_sim.h +++ b/src/cuda-sim/ptx_sim.h @@ -459,6 +459,9 @@ class ptx_thread_info { // Jin: get corresponding kernel grid for CDP purpose kernel_info_t &get_kernel() { return m_kernel; } + // Weili: access symbol_table + symbol_table *get_symbol_table() { return m_symbol_table; } + public: addr_t m_last_effective_address; bool m_branch_taken; diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 0c922bdb3..b91d6f9ae 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -319,6 +319,9 @@ void memory_config::reg_options(class OptionParser *opp) { "elimnate_rw_turnaround i.e set tWTR and tRTW = 0", "0"); 
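// (Annotation) The -SST_mode option registered just below is the switch that
// puts GPGPU-Sim into Balar/SST mode; a gpgpusim.config used from SST would
// simply add, alongside its other options:
//
//   -SST_mode 1
//
// When it is set, gpgpu_sim skips building its own memory partitions and
// interconnect and routes memory traffic to SST instead (see the guarded
// constructor and the sst_gpgpu_sim class further down in this patch).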
option_parser_register(opp, "-icnt_flit_size", OPT_UINT32, &icnt_flit_size, "icnt_flit_size", "32"); + // SST mode activate + option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode", + "0"); m_address_mapping.addrdec_setoption(opp); } @@ -955,6 +958,16 @@ void exec_gpgpu_sim::createSIMTCluster() { m_shader_stats, m_memory_stats); } +// SST get its own simt_cluster +void sst_gpgpu_sim::createSIMTCluster() { + m_cluster = new simt_core_cluster *[m_shader_config->n_simt_clusters]; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + m_cluster[i] = + new sst_simt_core_cluster(this, i, m_shader_config, m_memory_config, + m_shader_stats, m_memory_stats); + SST_gpgpu_reply_buffer.resize(m_shader_config->n_simt_clusters); +} + gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) : gpgpu_t(config, ctx), m_config(config) { gpgpu_ctx = ctx; @@ -999,26 +1012,29 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) gpu_kernel_time.clear(); - m_memory_partition_unit = - new memory_partition_unit *[m_memory_config->m_n_mem]; - m_memory_sub_partition = - new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; - for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - m_memory_partition_unit[i] = - new memory_partition_unit(i, m_memory_config, m_memory_stats, this); - for (unsigned p = 0; - p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { - unsigned submpid = - i * m_memory_config->m_n_sub_partition_per_memory_channel + p; - m_memory_sub_partition[submpid] = - m_memory_partition_unit[i]->get_sub_partition(p); + // TODO: somehow move this logic to the sst_gpgpu_sim constructor? + if (!m_config.is_SST_mode()) { + // Init memory if not in SST mode + m_memory_partition_unit = + new memory_partition_unit *[m_memory_config->m_n_mem]; + m_memory_sub_partition = + new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i] = + new memory_partition_unit(i, m_memory_config, m_memory_stats, this); + for (unsigned p = 0; + p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { + unsigned submpid = + i * m_memory_config->m_n_sub_partition_per_memory_channel + p; + m_memory_sub_partition[submpid] = + m_memory_partition_unit[i]->get_sub_partition(p); + } } - } - - icnt_wrapper_init(); - icnt_create(m_shader_config->n_simt_clusters, - m_memory_config->m_n_mem_sub_partition); + icnt_wrapper_init(); + icnt_create(m_shader_config->n_simt_clusters, + m_memory_config->m_n_mem_sub_partition); + } time_vector_create(NUM_MEM_REQ_STAT); fprintf(stdout, "GPGPU-Sim uArch: performance model initialization complete.\n"); @@ -1037,6 +1053,22 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) m_functional_sim_kernel = NULL; } +void sst_gpgpu_sim::SST_receive_mem_reply(unsigned core_id, void *mem_req) { + assert(core_id < m_shader_config->n_simt_clusters); + mem_fetch *mf = (mem_fetch *)mem_req; + + (SST_gpgpu_reply_buffer[core_id]).push_back(mf); +} + +mem_fetch *sst_gpgpu_sim::SST_pop_mem_reply(unsigned core_id) { + if (SST_gpgpu_reply_buffer[core_id].size() > 0) { + mem_fetch *temp = SST_gpgpu_reply_buffer[core_id].front(); + SST_gpgpu_reply_buffer[core_id].pop_front(); + return temp; + } else + return NULL; +} + int gpgpu_sim::shared_mem_size() const { return m_shader_config->gpgpu_shmem_size; } @@ -1132,6 +1164,27 @@ bool gpgpu_sim::active() { return false; } +bool sst_gpgpu_sim::active() { + if 
(m_config.gpu_max_cycle_opt && + (gpu_tot_sim_cycle + gpu_sim_cycle) >= m_config.gpu_max_cycle_opt) + return false; + if (m_config.gpu_max_insn_opt && + (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) + return false; + if (m_config.gpu_max_cta_opt && + (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) + return false; + if (m_config.gpu_max_completed_cta_opt && + (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt)) + return false; + if (m_config.gpu_deadlock_detect && gpu_deadlock) return false; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + if (m_cluster[i]->get_not_completed() > 0) return true; + ; + if (get_more_cta_left()) return true; + return false; +} + void gpgpu_sim::init() { // run a CUDA grid on the GPU microarchitecture simulator gpu_sim_cycle = 0; @@ -2157,6 +2210,11 @@ void gpgpu_sim::cycle() { } } +void sst_gpgpu_sim::cycle() { + SST_cycle(); + return; +} + void shader_core_ctx::dump_warp_state(FILE *fout) const { fprintf(fout, "\n"); fprintf(fout, "per warp functional simulation status:\n"); @@ -2236,3 +2294,114 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() { const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; } simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; } + +void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + if (m_shader_config->n_simt_clusters != sst_numcores) { + assert( + "\nSST core is not equal the GPGPU-sim cores. Open gpgpu-sim.config " + "file and ensure n_simt_clusters" + "is the same as SST gpu cores.\n" && + 0); + } else { + printf("\nSST GPU core is equal the GPGPU-sim cores = %d\n", sst_numcores); + } +} + +void sst_gpgpu_sim::SST_cycle() { + // int clock_mask = next_clock_domain(); + + // if (clock_mask & CORE ) { + // shader core loading (pop from ICNT into core) follows CORE clock + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + static_cast(m_cluster[i])->icnt_cycle_SST(); + + // L1 cache + shader core pipeline stages + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear(); + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) { + if (m_cluster[i]->get_not_completed() || get_more_cta_left()) { + m_cluster[i]->core_cycle(); + *active_sms += m_cluster[i]->get_n_active_sms(); + } + // Update core icnt/cache stats for GPUWattch + m_cluster[i]->get_icnt_stats( + m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); + m_cluster[i]->get_cache_stats( + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); + } + float temp = 0; + for (unsigned i = 0; i < m_shader_config->num_shader(); i++) { + temp += m_shader_stats->m_pipeline_duty_cycle[i]; + } + temp = temp / m_shader_config->num_shader(); + *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp); + // cout<<"Average pipeline duty cycle: "<<*average_pipeline_duty_cycle<= g_single_step)) { + asm("int $03"); + } + gpu_sim_cycle++; + if (g_interactive_debugger_enabled) gpgpu_debug(); + + // McPAT main cycle (interface with McPAT) +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, + gpu_sim_cycle, gpu_tot_sim_insn, gpu_sim_insn, + m_config.g_dvfs_enabled); + } +#endif + + issue_block2core(); + + if (!(gpu_sim_cycle % m_config.gpu_stat_sample_freq)) { + time_t days, hrs, 
minutes, sec; + time_t curr_time; + time(&curr_time); + unsigned long long elapsed_time = + MAX(curr_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1); + if ((elapsed_time - last_liveness_message_time) >= + m_config.liveness_message_freq) { + days = elapsed_time / (3600 * 24); + hrs = elapsed_time / 3600 - 24 * days; + minutes = elapsed_time / 60 - 60 * (hrs + 24 * days); + sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days)); + + last_liveness_message_time = elapsed_time; + } + visualizer_printstat(); + m_memory_stats->memlatstat_lat_pw(); + if (m_config.gpgpu_runtime_stat && (m_config.gpu_runtime_stat_flag != 0)) { + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_BW_STAT) { + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) + m_memory_partition_unit[i]->print_stat(stdout); + printf("maxmrqlatency = %d \n", m_memory_stats->max_mrq_latency); + printf("maxmflatency = %d \n", m_memory_stats->max_mf_latency); + } + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SHD_INFO) + shader_print_runtime_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_L1MISS) + shader_print_l1_miss_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SCHED) + shader_print_scheduler_stat(stdout, false); + } + } + + if (!(gpu_sim_cycle % 20000)) { + // deadlock detection + if (m_config.gpu_deadlock_detect && gpu_sim_insn == last_gpu_sim_insn) { + gpu_deadlock = true; + } else { + last_gpu_sim_insn = gpu_sim_insn; + } + } + try_snap_shot(gpu_sim_cycle); + spill_log_to_file(stdout, 0, gpu_sim_cycle); + +#if (CUDART_VERSION >= 5000) + // launch device kernel + gpgpu_ctx->device_runtime->launch_one_device_kernel(); +#endif + // } +} diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 8e81451b6..d0c2a1763 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -69,6 +69,38 @@ class gpgpu_context; extern tr1_hash_map address_random_interleaving; +// SST communication functions +/** + * @brief Check if SST requests buffer is full + * + * @param core_id + * @return true + * @return false + */ +extern bool is_SST_buffer_full(unsigned core_id); + +/** + * @brief Send loads to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_read_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + +/** + * @brief Send stores to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_write_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { @@ -274,6 +306,14 @@ class memory_config { } void reg_options(class OptionParser *opp); + /** + * @brief Check if the config script is in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { return SST_mode; } + bool m_valid; mutable l2_cache_config m_L2_config; bool m_L2_texure_only; @@ -351,7 +391,7 @@ class memory_config { unsigned write_low_watermark; bool m_perf_sim_memcpy; bool simple_dram_model; - + bool SST_mode; gpgpu_context *gpgpu_ctx; }; @@ -398,6 +438,15 @@ class gpgpu_sim_config : public power_config, unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } + + /** + * @brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { 
return m_memory_config.SST_mode; } + unsigned checkpoint_option; size_t stack_limit() const { return stack_size_limit; } @@ -462,6 +511,7 @@ class gpgpu_sim_config : public power_config, unsigned long long liveness_message_freq; friend class gpgpu_sim; + friend class sst_gpgpu_sim; }; struct occupancy_stats { @@ -600,10 +650,18 @@ class gpgpu_sim : public gpgpu_t { void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd, const ptx_instruction *pI); + /** + * @brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() { return m_config.is_SST_mode(); } + // backward pointer class gpgpu_context *gpgpu_ctx; - private: + protected: // clocks void reinit_clock_domains(void); int next_clock_domain(void); @@ -715,7 +773,7 @@ class gpgpu_sim : public gpgpu_t { void set_cache_config(std::string kernel_name); // Jin: functional simulation for CDP - private: + protected: // set by stream operation every time a functoinal simulation is done bool m_functional_sim; kernel_info_t *m_functional_sim_kernel; @@ -748,4 +806,79 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; +/** + * @brief A GPGPUSim class customized to SST Balar interfacing + * + */ +class sst_gpgpu_sim : public gpgpu_sim { + public: + sst_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) + : gpgpu_sim(config, ctx) { + createSIMTCluster(); + } + + // SST memory handling + std::vector> + SST_gpgpu_reply_buffer; /** SST mem response queue */ + + /** + * @brief Receive mem request's response from SST and put + * it in a buffer (SST_gpgpu_reply_buffer) + * + * @param core_id + * @param mem_req + */ + void SST_receive_mem_reply(unsigned core_id, void *mem_req); + + /** + * @brief Pop the head of the buffer queue to get the + * memory response + * + * @param core_id + * @return mem_fetch* + */ + mem_fetch *SST_pop_mem_reply(unsigned core_id); + + virtual void createSIMTCluster(); + + // SST Balar interfacing + /** + * @brief Advance core and collect stats + * + */ + void SST_cycle(); + + /** + * @brief Wrapper of SST_cycle() + * + */ + void cycle(); + + /** + * @brief Whether the GPU is active, removed test for + * memory system since that is handled in SST + * + * @return true + * @return false + */ + bool active(); + + /** + * @brief SST mode use SST memory system instead, so the memcpy + * is empty here + * + * @param dst_start_addr + * @param count + */ + void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count){}; + + /** + * @brief Check if the SST config matches up with the + * gpgpusim.config in core number + * + * @param sst_numcores SST core count + */ + void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores); +}; + #endif diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 7211a7dd3..809c92081 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -54,9 +54,15 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_sid = sid; m_tpc = tpc; m_wid = wid; - config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); - m_partition_addr = - config->m_address_mapping.partition_address(access.get_addr()); + + if (!config->is_SST_mode()) { + // In SST memory model, the SST memory hierarchy is + // responsible to generate the correct address mapping + config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); + m_partition_addr = + config->m_address_mapping.partition_address(access.get_addr()); + } + m_type = m_access.is_write() ? 
WRITE_REQUEST : READ_REQUEST; m_timestamp = cycle; m_timestamp2 = 0; diff --git a/src/gpgpu-sim/mem_latency_stat.cc b/src/gpgpu-sim/mem_latency_stat.cc index 63d7ee80c..c77a68648 100644 --- a/src/gpgpu-sim/mem_latency_stat.cc +++ b/src/gpgpu-sim/mem_latency_stat.cc @@ -203,7 +203,15 @@ unsigned memory_stats_t::memlatstat_done(mem_fetch *mf) { } void memory_stats_t::memlatstat_read_done(mem_fetch *mf) { - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + unsigned mf_latency; + mf_latency = + (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) - mf->get_timestamp(); + num_mfs++; + mf_total_lat += mf_latency; + if (mf_latency > max_mf_latency) max_mf_latency = mf_latency; + } else if (m_memory_config->gpgpu_memlatency_stat) { unsigned mf_latency = memlatstat_done(mf); if (mf_latency > mf_max_lat_table[mf->get_tlx_addr().chip][mf->get_tlx_addr().bk]) @@ -273,7 +281,12 @@ void memory_stats_t::memlatstat_print(unsigned n_mem, unsigned gpu_mem_n_bk) { unsigned max_bank_accesses, min_bank_accesses, max_chip_accesses, min_chip_accesses; - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + printf("max_mem_SST_latency = %d \n", max_mf_latency); + if (num_mfs) + printf("average_mf_SST_latency = %lld \n", mf_total_lat / num_mfs); + } else if (m_memory_config->gpgpu_memlatency_stat) { printf("maxmflatency = %d \n", max_mf_latency); printf("max_icnt2mem_latency = %d \n", max_icnt2mem_latency); printf("maxmrqlatency = %d \n", max_mrq_latency); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4d4f11277..a19c4abb2 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -162,7 +162,10 @@ void shader_core_ctx::create_front_pipeline() { } // m_icnt = new shader_memory_interface(this,cluster); - if (m_config->gpgpu_perfect_mem) { + if (m_memory_config->SST_mode) { + m_icnt = new sst_memory_interface( + this, static_cast(m_cluster)); + } else if (m_config->gpgpu_perfect_mem) { m_icnt = new perfect_memory_interface(this, m_cluster); } else { m_icnt = new shader_memory_interface(this, m_cluster); @@ -2281,7 +2284,15 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst, inst.is_store() ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE; unsigned size = access.get_size() + control_size; // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size); - if (m_icnt->full(size, inst.is_store() || inst.isatomic())) { + if (m_memory_config->SST_mode && + (static_cast(m_icnt)->full( + size, inst.is_store() || inst.isatomic(), access.get_type()))) { + // SST need mf type here + // Cast it to sst_memory_interface pointer first as this full() method + // is not a virtual method in parent class + stall_cond = ICNT_RC_FAIL; + } else if (!m_memory_config->SST_mode && + (m_icnt->full(size, inst.is_store() || inst.isatomic()))) { stall_cond = ICNT_RC_FAIL; } else { mem_fetch *mf = @@ -2846,7 +2857,10 @@ void ldst_unit::cycle() { } } else { if (mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())) { + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())) { + // SST memory is handled by SST mem hierarchy + // Perfect mem m_core->store_ack(mf); m_response_fifo.pop_front(); delete mf; @@ -4020,7 +4034,8 @@ void shader_core_ctx::accept_ldst_unit_response(mem_fetch *mf) { void shader_core_ctx::store_ack(class mem_fetch *mf) { assert(mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())); + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())); unsigned warp_id = mf->get_wid(); m_warp[warp_id]->dec_store_req(); } @@ -4573,6 +4588,22 @@ bool simt_core_cluster::icnt_injection_buffer_full(unsigned size, bool write) { return !::icnt_has_buffer(m_cluster_id, request_size); } +bool sst_simt_core_cluster::SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type) { + switch (type) { + case CONST_ACC_R: + case TEXTURE_ACC_R: + case INST_ACC_R: { + return response_queue_full(); + break; + } + default: { + return ::is_SST_buffer_full(m_cluster_id); + break; + } + } +} + void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { // stats if (mf->get_is_write()) @@ -4639,6 +4670,82 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { mf->size()); } +void sst_simt_core_cluster::icnt_inject_request_packet_to_SST( + class mem_fetch *mf) { + // stats + if (mf->get_is_write()) + m_stats->made_write_mfs++; + else + m_stats->made_read_mfs++; + switch (mf->get_access_type()) { + case CONST_ACC_R: + m_stats->gpgpu_n_mem_const++; + break; + case TEXTURE_ACC_R: + m_stats->gpgpu_n_mem_texture++; + break; + case GLOBAL_ACC_R: + m_stats->gpgpu_n_mem_read_global++; + break; + case GLOBAL_ACC_W: + m_stats->gpgpu_n_mem_write_global++; + break; + case LOCAL_ACC_R: + m_stats->gpgpu_n_mem_read_local++; + break; + case LOCAL_ACC_W: + m_stats->gpgpu_n_mem_write_local++; + break; + case INST_ACC_R: + m_stats->gpgpu_n_mem_read_inst++; + break; + case L1_WRBK_ACC: + m_stats->gpgpu_n_mem_write_global++; + break; + case L2_WRBK_ACC: + m_stats->gpgpu_n_mem_l2_writeback++; + break; + case L1_WR_ALLOC_R: + m_stats->gpgpu_n_mem_l1_write_allocate++; + break; + case L2_WR_ALLOC_R: + m_stats->gpgpu_n_mem_l2_write_allocate++; + break; + default: + assert(0); + } + + // The packet size varies depending on the type of request: + // - For write request and atomic request, the packet contains the data + // - For read request (i.e. 
not write nor atomic), the packet only has control + // metadata + unsigned int packet_size = mf->size(); + if (!mf->get_is_write() && !mf->isatomic()) { + packet_size = mf->get_ctrl_size(); + } + m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); + mf->set_status(IN_ICNT_TO_MEM, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + switch (mf->get_access_type()) { + case CONST_ACC_R: + case TEXTURE_ACC_R: + case INST_ACC_R: { + push_response_fifo(mf); + break; + } + default: { + if (!mf->get_is_write() && !mf->isatomic()) + ::send_read_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + else + ::send_write_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + + break; + } + } +} + void simt_core_cluster::icnt_cycle() { if (!m_response_fifo.empty()) { mem_fetch *mf = m_response_fifo.front(); @@ -4678,6 +4785,49 @@ void simt_core_cluster::icnt_cycle() { } } +void sst_simt_core_cluster::icnt_cycle_SST() { + if (!m_response_fifo.empty()) { + mem_fetch *mf = m_response_fifo.front(); + unsigned cid = m_config->sid_to_cid(mf->get_sid()); + if (mf->get_access_type() == INST_ACC_R) { + // instruction fetch response + if (!m_core[cid]->fetch_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_core[cid]->accept_fetch_response(mf); + } + } else { + // data response + if (!m_core[cid]->ldst_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_memory_stats->memlatstat_read_done(mf); + m_core[cid]->accept_ldst_unit_response(mf); + } + } + } + + // pop from SST buffers + if (m_response_fifo.size() < m_config->n_simt_ejection_buffer_size) { + mem_fetch *mf = (mem_fetch *)(static_cast(get_gpu()) + ->SST_pop_mem_reply(m_cluster_id)); + if (!mf) return; + assert(mf->get_tpc() == m_cluster_id); + + // do atomic here + // For now, we execute atomic when the mem reply comes back + // This needs to be validated + if (mf && mf->isatomic()) mf->do_atomic(); + + unsigned int packet_size = + (mf->get_is_write()) ? 
mf->get_ctrl_size() : mf->size(); + m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); + mf->set_status(IN_CLUSTER_TO_SHADER_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_memory_stats->memlatstat_read_done(mf,m_shader_config->max_warps_per_shader); + m_response_fifo.push_back(mf); + m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); + } +} + void simt_core_cluster::get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc, unsigned *rpc) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index e658a14c9..dae697a2d 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2015,6 +2015,7 @@ class shader_core_stats : public shader_core_stats_pod { friend class shader_core_ctx; friend class ldst_unit; friend class simt_core_cluster; + friend class sst_simt_core_cluster; friend class scheduler_unit; friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; @@ -2685,6 +2686,50 @@ class exec_simt_core_cluster : public simt_core_cluster { virtual void create_shader_core_ctx(); }; +/** + * @brief SST cluster class + * + */ +class sst_simt_core_cluster : public exec_simt_core_cluster { + public: + sst_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, + const shader_core_config *config, + const memory_config *mem_config, + class shader_core_stats *stats, + class memory_stats_t *mstats) + : exec_simt_core_cluster(gpu, cluster_id, config, mem_config, stats, + mstats) {} + + /** + * @brief Check if SST memory request injection + * buffer is full by using extern + * function is_SST_buffer_full() + * defined in Balar + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type); + + /** + * @brief Send memory request packets to SST + * memory + * + * @param mf + */ + void icnt_inject_request_packet_to_SST(class mem_fetch *mf); + + /** + * @brief Advance ICNT between core and SST + * + */ + void icnt_cycle_SST(); +}; + class shader_memory_interface : public mem_fetch_interface { public: shader_memory_interface(shader_core_ctx *core, simt_core_cluster *cluster) { @@ -2725,6 +2770,61 @@ class perfect_memory_interface : public mem_fetch_interface { simt_core_cluster *m_cluster; }; +/** + * @brief SST memory interface + * + */ +class sst_memory_interface : public mem_fetch_interface { + public: + sst_memory_interface(shader_core_ctx *core, sst_simt_core_cluster *cluster) { + m_core = core; + m_cluster = cluster; + } + /** + * @brief For constant, inst, tex cache access + * + * @param size + * @param write + * @return true + * @return false + */ + virtual bool full(unsigned size, bool write) const { + assert(false && "Use the full() method with access type instead!"); + return true; + } + + /** + * @brief With SST, the core will direct all mem access except for + * constant, tex, and inst reads to SST mem system + * (i.e. 
not modeling constant mem right now), thus + * requiring the mem_access_type information to be passed in + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool full(unsigned size, bool write, mem_access_type type) const { + return m_cluster->SST_injection_buffer_full(size, write, type); + } + + /** + * @brief Push memory request to SST memory system and + * update stats + * + * @param mf + */ + virtual void push(mem_fetch *mf) { + m_core->inc_simt_to_mem(mf->get_num_flits(true)); + m_cluster->icnt_inject_request_packet_to_SST(mf); + } + + private: + shader_core_ctx *m_core; + sst_simt_core_cluster *m_cluster; +}; + inline int scheduler_unit::get_sid() const { return m_shader->get_sid(); } #endif /* SHADER_H */ diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index 42c6981b0..839fef619 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -43,6 +43,20 @@ static int sg_argc = 3; static const char *sg_argv[] = {"", "-config", "gpgpusim.config"}; +// Help funcs to avoid multiple '->' for SST +GPGPUsim_ctx *GPGPUsim_ctx_ptr() { return GPGPU_Context()->the_gpgpusim; } + +class sst_gpgpu_sim *g_the_gpu() { + return static_cast(GPGPUsim_ctx_ptr()->g_the_gpu); +} + +class stream_manager *g_stream_manager() { + return GPGPUsim_ctx_ptr()->g_stream_manager; +} + +// SST callback +extern void SST_callback_cudaThreadSynchronize_done(); + void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; // at most one kernel running at a time @@ -169,6 +183,75 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { return NULL; } +bool sst_sim_cycles = false; + +bool SST_Cycle() { + // Check if Synchronize is done when SST previously requested + // cudaThreadSynchronize + if (GPGPU_Context()->requested_synchronize && + ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) || + GPGPUsim_ctx_ptr()->g_sim_done)) { + SST_callback_cudaThreadSynchronize_done(); + GPGPU_Context()->requested_synchronize = false; + } + + if (g_stream_manager()->empty_protected() && + !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) { + GPGPUsim_ctx_ptr()->g_sim_active = false; + // printf("stream is empty %d \n", g_stream_manager->empty()); + return false; + } + + if (g_stream_manager()->operation(&sst_sim_cycles) && + !g_the_gpu()->active()) { + if (sst_sim_cycles) { + sst_sim_cycles = false; + } + return false; + } + + // printf("GPGPU-Sim: Give GPU Cycle\n"); + GPGPUsim_ctx_ptr()->g_sim_active = true; + + // functional simulation + if (g_the_gpu()->is_functional_sim()) { + kernel_info_t *kernel = g_the_gpu()->get_functional_kernel(); + assert(kernel); + GPGPUsim_ctx_ptr()->gpgpu_ctx->func_sim->gpgpu_cuda_ptx_sim_main_func( + *kernel); + g_the_gpu()->finish_functional_sim(kernel); + } + + // performance simulation + if (g_the_gpu()->active()) { + g_the_gpu()->SST_cycle(); + sst_sim_cycles = true; + g_the_gpu()->deadlock_check(); + } else { + if (g_the_gpu()->cycle_insn_cta_max_hit()) { + g_stream_manager()->stop_all_running_kernels(); + GPGPUsim_ctx_ptr()->g_sim_done = true; + GPGPUsim_ctx_ptr()->g_sim_active = false; + GPGPUsim_ctx_ptr()->break_limit = true; + } + } + + if (!g_the_gpu()->active()) { + g_the_gpu()->print_stats(GPGPUsim_ctx_ptr()->g_the_gpu->last_streamID); + g_the_gpu()->update_stats(); + GPGPU_Context()->print_simulation_time(); + } + + if (GPGPUsim_ctx_ptr()->break_limit) { + printf( + "GPGPU-Sim: ** break due to reaching the maximum cycles (or " + "instructions) **\n"); 
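// (Annotation) SST_Cycle() returns true only on this break-limit path; Balar
// calls it once per GPU clock tick through the extern "C" wrapper
// SST_gpu_core_cycle() defined in cuda_runtime_api.cc. A hedged sketch of the
// SST-side driver, with hypothetical component and handler names:
//
//   extern bool SST_gpu_core_cycle();
//   bool BalarComponent::gpuClockTick(SST::Cycle_t) {   // hypothetical handler
//     bool hit_limit = SST_gpu_core_cycle();
//     return hit_limit;   // returning true would stop clocking the GPU
//   }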
+ return true; + } + + return false; +} + void gpgpu_context::synchronize() { printf("GPGPU-Sim: synchronize waiting for inactive GPU simulation\n"); the_gpgpusim->g_stream_manager->print(stdout); @@ -187,6 +270,27 @@ void gpgpu_context::synchronize() { // sem_post(&g_sim_signal_start); } +bool gpgpu_context::synchronize_check() { + // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n"); + requested_synchronize = true; + the_gpgpusim->g_stream_manager->print(stdout); + fflush(stdout); + // sem_wait(&g_sim_signal_finish); + bool done = false; + pthread_mutex_lock(&(the_gpgpusim->g_sim_lock)); + done = (the_gpgpusim->g_stream_manager->empty() && + !the_gpgpusim->g_sim_active) || + the_gpgpusim->g_sim_done; + pthread_mutex_unlock(&(the_gpgpusim->g_sim_lock)); + if (done) { + printf( + "GPGPU-Sim: synchronize checking: detected inactive GPU simulation " + "thread\n"); + } + fflush(stdout); + return done; +} + void gpgpu_context::exit_simulation() { the_gpgpusim->g_sim_done = true; printf("GPGPU-Sim: exit_simulation called\n"); @@ -220,8 +324,14 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { assert(setlocale(LC_NUMERIC, "C")); the_gpgpusim->g_the_gpu_config->init(); - the_gpgpusim->g_the_gpu = - new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Create SST specific GPGPUSim + the_gpgpusim->g_the_gpu = + new sst_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } else { + the_gpgpusim->g_the_gpu = + new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } the_gpgpusim->g_stream_manager = new stream_manager( (the_gpgpusim->g_the_gpu), func_sim->g_cuda_launch_blocking); @@ -237,12 +347,17 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { void gpgpu_context::start_sim_thread(int api) { if (the_gpgpusim->g_sim_done) { the_gpgpusim->g_sim_done = false; - if (api == 1) { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_concurrent, (void *)this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Do not create concurrent thread in SST mode + g_the_gpu()->init(); } else { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_sequential, (void *)this); + if (api == 1) { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_concurrent, (void *)this); + } else { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_sequential, (void *)this); + } } } } @@ -266,8 +381,13 @@ void gpgpu_context::print_simulation_time() { const unsigned cycles_per_sec = (unsigned)(the_gpgpusim->g_the_gpu->gpu_tot_sim_cycle / difference); printf("gpgpu_simulation_rate = %u (cycle/sec)\n", cycles_per_sec); - printf("gpgpu_silicon_slowdown = %ux\n", - the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + + if (cycles_per_sec == 0) { + printf("gpgpu_silicon_slowdown = Nan\n"); + } else { + printf("gpgpu_silicon_slowdown = %ux\n", + the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + } fflush(stdout); } diff --git a/src/stream_manager.cc b/src/stream_manager.cc index 72f8bb0b2..b974791d0 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -34,6 +34,12 @@ unsigned CUstream_st::sm_next_stream_uid = 0; +// SST memcpy callbacks +extern void SST_callback_memcpy_H2D_done(); +extern void SST_callback_memcpy_D2H_done(); +extern void SST_callback_memcpy_to_symbol_done(); +extern void SST_callback_memcpy_from_symbol_done(); + CUstream_st::CUstream_st() { m_pending = 
false; m_uid = sm_next_stream_uid++; @@ -122,11 +128,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { if (g_debug_execution >= 3) printf("memcpy host-to-device\n"); gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done(); break; case stream_memcpy_device_to_host: if (g_debug_execution >= 3) printf("memcpy device-to-host\n"); gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done(); break; case stream_memcpy_device_to_device: if (g_debug_execution >= 3) printf("memcpy device-to-device\n"); @@ -138,12 +146,14 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_src, m_cnt, m_offset, 1, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_to_symbol_done(); break; case stream_memcpy_from_symbol: if (g_debug_execution >= 3) printf("memcpy from symbol\n"); gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_dst, m_cnt, m_offset, 0, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_from_symbol_done(); break; case stream_kernel_launch: if (m_sim_mode) { // Functional Sim @@ -472,7 +482,7 @@ void stream_manager::push(stream_operation op) { } if (g_debug_execution >= 3) print_impl(stdout); pthread_mutex_unlock(&m_lock); - if (m_cuda_launch_blocking || stream == NULL) { + if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL)) { unsigned int wait_amount = 100; unsigned int wait_cap = 100000; // 100ms while (!empty()) { diff --git a/version_detection.mk b/version_detection.mk index 81c1d2ae7..ca48d2a72 100644 --- a/version_detection.mk +++ b/version_detection.mk @@ -43,7 +43,7 @@ CUDA_VERSION_STRING:=$(shell $(CUDA_INSTALL_PATH)/bin/nvcc --version | awk '/rel CUDART_VERSION:=$(shell echo $(CUDA_VERSION_STRING) | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($$1), 10*$$2);}') # Detect GCC Version -CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]\.[0-9]\.[0-9]$$/)) {print $$i; exit 0 }}}') +CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]+\.[0-9]+\.[0-9]+$$/)) {print $$i; exit 0 }}}') # Detect Support for C++11 (C++0x) from GCC Version GNUC_CPP0X := $(shell gcc --version | perl -ne 'if (/gcc\s+\(.*\)\s+([0-9.]+)/){ if($$1 >= 4.3) {$$n=1} else {$$n=0;} } END { print $$n; }')
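
Editor's note: the hunks above expose a polling-style entrypoint, SST_Cycle(), plus SST_callback_* completion hooks that an external SST component is expected to drive and implement. The sketch below illustrates that contract from the SST side under stated assumptions: the component and handler names (BalarLikeComponent, clockTick) are illustrative only and are not defined by this patch.

    // Illustrative sketch, not part of the patch: shows the intended polling
    // contract of SST_Cycle(). Call it once per GPU core clock tick; a true
    // return value means the cycle/instruction limit was reached.
    extern bool SST_Cycle();  // provided by gpgpusim_entrypoint.cc in SST mode

    class BalarLikeComponent /* : public SST::Component */ {
     public:
      // Registered as the component's clock handler at the GPU core frequency.
      bool clockTick(/* SST::Cycle_t currentCycle */) {
        // Advance GPGPU-Sim by one cycle; pending streams, memcpys, and kernels
        // progress inside, and the SST_callback_* hooks fire on completion.
        bool hit_limit = SST_Cycle();
        if (hit_limit) {
          // Break limit reached. In SST, returning true from a clock handler
          // unregisters it, so the GPU model stops being ticked.
          return true;
        }
        return false;  // keep ticking
      }
    };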