diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 5af244b33..136ed0261 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -1585,25 +1585,62 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) {
     return (get_n_active_cta() < m_config->max_cta(kernel));
   }
 }
-
-int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) {
-  unsigned int step;
-  for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) {
-    unsigned int hw_tid;
-    for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) {
-      if (m_occupied_hwtid.test(hw_tid)) break;
+
+/**
+ * @brief Tries to find a contiguous range of available {hw_tid}s (and, if
+ * requested, marks them as occupied). Wrap-arounds are allowed.
+ *
+ * @param cta_size How many threads this CTA contains. Should already be
+ * "padded" to an integer multiple of the max warp size (m_config->warp_size).
+ * @param kernel The kernel this CTA belongs to (used for logging).
+ * @param occupy Set to "false" for a dry run.
+ * @return -1 if a contiguous range that can fit all threads of this CTA
+ * cannot be found; otherwise the hw_tid to which the first thread of this CTA
+ * maps. Note that since wrap-arounds can happen, naively adding cta_size to the
+ * retval - which is the start_thread - can result in a value exceeding the
+ * simulated hardware limits.
+ */
+int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) {
+  //TODO: use round robin based on dynamic_warp id
+  const unsigned int& warp_size = m_config->warp_size;
+
+  unsigned int step = 0;
+  while (step < m_config->n_thread_per_shader) {
+    //Subcore experiments on Volta V100
+    //show that warps are assigned to subcores in a Round-Robin fashion,
+    //so we should start testing from the successor of the subcore
+    //to which the last warp was assigned.
+
+    //Note: Warp ids are bound to a specific scheduler - which
+    //is equivalent to a subcore - based on (warp_id modulo # of schedulers)
+
+    //m_dynamic_warp_id is incremented after a warp has been initiated,
+    //therefore we don't need to add one to find the "next" subcore
+    //(ref: shader_core_ctx::init_warps)
+    unsigned int i;
+    for (i = step; i < step + cta_size; i++) {
+      unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
+      if (m_occupied_hwtid.test(hw_tid)) break; //break from this inner for-loop
+    }
+    if (i == step + cta_size) // consecutive non-active
+      break; //break from the outer while-loop
+    else {
+      //start from the next warp slot
+      //e.g. if step was 32, i was 35, and warp_size is 32, then step will be updated to 64
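+      //(step and i index the un-rotated search space; the hw_tids actually tested
+      //above additionally include the (m_dynamic_warp_id * warp_size) round-robin
+      //offset, e.g. - illustrative numbers - with warp_size 32 and m_dynamic_warp_id 5,
+      //step 0 corresponds to hw_tids starting at (0 + 5*32) % n_thread_per_shader = 160)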
+      step = (i / warp_size + 1) * warp_size;
     }
-    if (hw_tid == step + cta_size)  // consecutive non-active
-      break;
   }
-  if (step >= m_config->n_thread_per_shader)  // didn't find
+  if (step >= m_config->n_thread_per_shader){ // didn't find
+    DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid());
     return -1;
+  }
+
   else {
     if (occupy) {
-      for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++)
+      DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid());
+      for (unsigned i = step; i < step + cta_size; i++){
+        unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
         m_occupied_hwtid.set(hw_tid);
+      }
     }
-    return step;
+    return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
   }
 }
 
@@ -1619,13 +1656,23 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k,
   if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader)
     return false;
 
-  if (find_available_hwtid(padded_cta_size, false) == -1) return false;
+  //Even if the number of available "thread slots" exceeds our CTA size,
+  //these slots may be fragmented (non-contiguous regions), in which case
+  //we still might not be able to launch this CTA.
+  //Fragmentation can only happen at the granularity of the warp size,
+  //since hwtids are allocated at warp_size granularity.
+  //It remains a TODO to find out if a CTA *can* launch when the warps of this CTA
+  //have no choice but to map to non-contiguous regions of hwtid.
+  if (find_available_hwtid(padded_cta_size, k, false) == -1) return false;
 
   const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel);
 
   if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size)
     return false;
 
+  //TODO: check if each subcore has enough regs for this block.
+  //This requires tracking the amount of available regs per subcore,
+  //plus knowing how many warps are to be issued on each subcore.
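+  // e.g. (illustrative numbers) a kernel using 30 registers per thread is rounded
+  // up to 32 below, since (30 + 3) & ~3 == 32, so a 256-thread padded CTA occupies
+  // 256 * 32 = 8192 registers of the per-core register file.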
   unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3);
   if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers)
     return false;
 
@@ -1649,7 +1696,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k,
 }
 
 void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid,
-                                                     kernel_info_t &k) {
+                                                     const kernel_info_t &k) {
   if (m_config->gpgpu_concurrent_kernel_sm) {
     unsigned threads_per_cta = k.threads_per_cta();
     const class function_info *kernel = k.entry();
@@ -1661,11 +1708,17 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid,
     assert(m_occupied_n_threads >= padded_cta_size);
     m_occupied_n_threads -= padded_cta_size;
 
-    int start_thread = m_occupied_cta_to_hwtid[hw_ctaid];
+    unsigned start_thread = m_occupied_cta_to_hwtid[hw_ctaid];
+    unsigned end_thread = ((start_thread + padded_cta_size) - 1) % m_config->n_thread_per_shader + 1;
+
+    WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+
+    DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid());
 
-    for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size;
-         hwtid++)
+    tid_range.loop([&](const unsigned hwtid){
       m_occupied_hwtid.reset(hwtid);
+    });
+
     m_occupied_cta_to_hwtid.erase(hw_ctaid);
 
     const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel);
@@ -1702,8 +1755,12 @@ unsigned exec_shader_core_ctx::sim_init_thread(
 
 void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   if (!m_config->gpgpu_concurrent_kernel_sm)
     set_max_cta(kernel);
-  else
+  else{
+    //shader_core_ctx::can_issue_1block should have already verified that one block
+    //is indeed issuable on this shader core, therefore we expect
+    //occupy_shader_resource_1block to return true here.
     assert(occupy_shader_resource_1block(kernel, true));
+  }
 
   kernel.inc_running();
 
@@ -1734,15 +1791,26 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
     padded_cta_size =
         ((cta_size / m_config->warp_size) + 1) * (m_config->warp_size);
 
-  unsigned int start_thread, end_thread;
+  //find available hwtids
+  // Note: A lot of legacy functions that take in a range of thread ids
+  // are built on the assumption that no wrap-around happens.
+  // However, with the subcore model this is no longer true,
+  // so it is necessary to perform a wrap-around here.
+  // E.g. to demo the effect of wrap-around: if CTA size is 10,
+  // n_thread_per_shader is 20 and start_thread is 18, end_thread will
+  // not be 28 but 8.
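+  // With the same numbers, the formula used below gives 18 + 10 = 28 and then
+  // (28 - 1) % 20 + 1 = 8, i.e. end_thread wraps back into [0, n_thread_per_shader].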
+  unsigned int start_thread, end_thread;
   if (!m_config->gpgpu_concurrent_kernel_sm) {
     start_thread = free_cta_hw_id * padded_cta_size;
     end_thread = start_thread + cta_size;
+    end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1;
   } else {
-    start_thread = find_available_hwtid(padded_cta_size, true);
+    start_thread = find_available_hwtid(padded_cta_size, kernel, true);
     assert((int)start_thread != -1);
     end_thread = start_thread + cta_size;
+
+    end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1;
     assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) ==
            m_occupied_cta_to_hwtid.end());
     m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread;
@@ -1751,7 +1819,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   // reset the microarchitecture state of the selected hardware thread and warp
   // contexts
   reinit(start_thread, end_thread, false);
-  
+
   // initalize scalar threads and determine which hardware warps they are
   // allocated to bind functional simulation state of threads to hardware
   // resources (simulation)
@@ -1761,11 +1829,16 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   symbol_table *symtab = kernel_func_info->get_symtab();
   unsigned ctaid = kernel.get_next_cta_id_single();
   checkpoint *g_checkpoint = new checkpoint();
-  for (unsigned i = start_thread; i < end_thread; i++) {
+
+  //passed as the "threads_left" argument to sim_init_thread
+  int threads_left = cta_size;
+
+  WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+  tid_range.loop([&](const unsigned i){
     m_threadState[i].m_cta_id = free_cta_hw_id;
     unsigned warp_id = i / m_config->warp_size;
     nthreads_in_block += sim_init_thread(
-        kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread),
+        kernel, &m_thread[i], m_sid, i, threads_left--,
         m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id,
         m_cluster->get_gpu());
     m_threadState[i].m_active = true;
@@ -1774,16 +1847,17 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
         ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) {
       char fname[2048];
       snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt",
-               i % cta_size, ctaid);
+              i % cta_size, ctaid);
       m_thread[i]->resume_reg_thread(fname, symtab);
       char f1name[2048];
       snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt",
-               i % cta_size, ctaid);
+              i % cta_size, ctaid);
       g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name);
     }
     // warps.set(warp_id);
-  }
+  });
+
   assert(nthreads_in_block > 0 &&
          nthreads_in_block <=
             m_config->n_thread_per_shader);  // should be at least one, but
@@ -1803,6 +1877,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
 
   // initialize the SIMT stacks and fetch hardware
   init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel);
+
   m_n_active_cta++;
 
   shader_CTA_count_log(m_sid, 1);
diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h
index de69ef8ce..7b1b38c85 100644
--- a/src/gpgpu-sim/gpu-sim.h
+++ b/src/gpgpu-sim/gpu-sim.h
@@ -1,17 +1,18 @@
-// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas
-// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers
-// The University of British Columbia, Northwestern University, Purdue University
+// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah,
+// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The
+// University of British Columbia, Northwestern University, Purdue University
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
-// 1. Redistributions of source code must retain the above copyright notice, this
+// 1. Redistributions of source code must retain the above copyright notice,
+// this
 // list of conditions and the following disclaimer;
 // 2. Redistributions in binary form must reproduce the above copyright notice,
 // this list of conditions and the following disclaimer in the documentation
 // and/or other materials provided with the distribution;
-// 3. Neither the names of The University of British Columbia, Northwestern 
+// 3. Neither the names of The University of British Columbia, Northwestern
 // University nor the names of their contributors may be used to
 // endorse or promote products derived from this software without specific
 // prior written permission.
@@ -28,7 +29,6 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 
-
 #ifndef GPU_SIM_H
 #define GPU_SIM_H
 
@@ -36,6 +36,8 @@
 #include
 #include
 #include
+#include <functional>
+#include <cassert>
 #include "../abstract_hardware_model.h"
 #include "../option_parser.h"
 #include "../trace.h"
@@ -72,7 +74,7 @@ extern tr1_hash_map address_random_interleaving;
 enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 };
 
 enum hw_perf_t {
-  HW_BENCH_NAME=0,
+  HW_BENCH_NAME = 0,
   HW_KERNEL_NAME,
   HW_L1_RH,
   HW_L1_RM,
@@ -108,7 +110,7 @@ struct power_config {
       s++;
     }
     char buf1[1024];
-    //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date);
+    // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date);
     snprintf(buf1, 1024, "accelwattch_power_report.log");
     g_power_filename = strdup(buf1);
     char buf2[1024];
@@ -155,7 +157,6 @@ struct power_config {
   double gpu_steady_power_deviation;
   double gpu_steady_min_period;
 
-
   char *g_hw_perf_file_name;
   char *g_hw_perf_bench_name;
   int g_power_simulation_mode;
@@ -735,4 +736,106 @@ class exec_gpgpu_sim : public gpgpu_sim {
   virtual void createSIMTCluster();
 };
 
+
+/**
+ * @brief Represents a range of unsigned indices that can wrap around
+ * at a certain threshold value. The purpose of this class is to
+ * provide a programmer-friendly and performant way to run a for loop over
+ * a range of indices that can potentially wrap over at the max value.
+ *
+ * This class comes in handy when looping over a range of hwtids and warp_ids
+ * with the subcore model in effect. Threads of a certain CTA may start mapping
+ * to the higher portion of the hwtid space and wrap around at the max thread id.
+ * E.g. assuming the max threads per SM is 2048, the CTA size is 128 threads, and
+ * the CTA's first thread maps to hwtid=2016, then the last thread maps
+ * to hwtid = (2016 + 128) % 2048 - 1 = 95. Hence the wrap-around.
+ *
+ * Hard-coding a for-loop that can detect wrap-arounds can make the code look
+ * complicated; populating an ordered list of indices to iterate over is
+ * straightforward but both space- and time-inefficient. This class offers the
+ * benefit of functional programming by letting the programmer specify a
+ * lambda function to apply to each index within the specified range.
+ *
+ * The lambda function is required to take in one const unsigned argument and
+ * return void (i.e. std::function<void(const unsigned)>).
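+ * (There is no mechanism to break out of loop() early from inside the lambda;
+ * every index in the range is visited.)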
+ * It is recommended that the programmer use [&] to capture by-reference
+ * everything in the context, so as to mimic the effect of running a naked for-loop.
+ *
+ * E.g. if the original code was
+ * ```
+ * //variables like a, b, c are in the scope
+ * for(unsigned int i=12; i<18; ++i){
+ *   //do things depending on value of i on a, b, and c
+ * }
+ * //use modified values of a, b, and c
+ * ```
+ *
+ * then the code can look like this when using WrappableUnsignedRange:
+ *
+ * ```
+ * //variables like a, b, c are in the scope
+ * WrappableUnsignedRange r(12, 18, 10000);
+ * r.loop(
+ *   [&](const unsigned i){
+ *     //do things depending on value of i on a, b, and c
+ *   }
+ * );
+ * //use modified values of a, b, and c
+ * ```
+ *
+ * Note: When start_index < end_index, the range of indices is [start, end).
+ * When start_index > end_index, the range is [start, wrapping_threshold) plus
+ * [0, end).
+ * When start_index == end_index, the range is considered empty.
+ *
+ * It is required that 0 <= start_index < wrapping_threshold and
+ * 0 < end_index <= wrapping_threshold.
+ */
+class WrappableUnsignedRange {
+ public:
+  unsigned start_index;
+  unsigned end_index;
+  unsigned wrapping_threshold;
+
+  WrappableUnsignedRange(const unsigned start, const unsigned end,
+                         const unsigned threshold)
+      : start_index(start), end_index(end), wrapping_threshold(threshold) {}
+
+  //membership test under the same wrap-around semantics as loop()
+  bool contains(const unsigned index) const {
+    if (start_index < end_index)
+      return (index >= start_index) && (index < end_index);
+    else if (start_index > end_index)
+      return (index >= start_index) || (index < end_index);
+    else
+      return false;  //empty range
+  }
+
+  void loop(std::function<void(const unsigned)> loop_body_function) {
+    assert(start_index >= 0);
+    assert(start_index < wrapping_threshold);
+    assert(end_index > 0);
+    assert(end_index <= wrapping_threshold);
+
+    unsigned index = start_index;
+    while (index != end_index) {
+      if (index >= wrapping_threshold) {
+        index = 0;
+      }
+      loop_body_function(index);
+      ++index;
+    }
+  }
+};
+
 #endif
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
index 814311d1c..6b5813ab3 100644
--- a/src/gpgpu-sim/shader.cc
+++ b/src/gpgpu-sim/shader.cc
@@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu,
                      config->max_barriers_per_cta, config->warp_size),
       m_active_warps(0),
       m_dynamic_warp_id(0) {
+  m_cluster = cluster;
   m_config = config;
   m_memory_config = mem_config;
@@ -518,21 +519,32 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread,
     m_occupied_cta_to_hwtid.clear();
     m_active_warps = 0;
   }
-  for (unsigned i = start_thread; i < end_thread; i++) {
+
+  WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+  tid_range.loop([&](const unsigned i){
     m_threadState[i].n_insn = 0;
-    m_threadState[i].m_cta_id = -1;
-  }
-  for (unsigned i = start_thread / m_config->warp_size;
-       i < end_thread / m_config->warp_size; ++i) {
+    m_threadState[i].m_cta_id = -1;
+  });
+
+  const unsigned start_warp = start_thread / m_config->warp_size;
+  const unsigned end_warp = end_thread / m_config->warp_size +
+                            ((end_thread % m_config->warp_size) ? 1 : 0);
+  WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader);
+  warp_id_range.loop([&](const unsigned i){
     m_warp[i]->reset();
-    m_simt_stack[i]->reset();
-  }
+    m_simt_stack[i]->reset();
+  });
 }
 
+/**
+ * @brief Note: To handle the case of hwtid wrap-around (end_thread < start_thread),
+ * this method iterates over the thread and warp ids through WrappableUnsignedRange
+ * instead of plain for-loops.
+ */
 void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
                                  unsigned end_thread, unsigned ctaid,
                                  int cta_size, kernel_info_t &kernel) {
-  //
+  //when concurrent_sm is enabled,
+  //both start_thread and end_thread are hwtid (0 <= x < n_thread_per_shader)
   address_type start_pc = next_pc(start_thread);
   unsigned kernel_id = kernel.get_uid();
   if (m_config->model == POST_DOMINATOR) {
@@ -540,12 +552,16 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
     unsigned warp_per_cta = cta_size / m_config->warp_size;
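+    // e.g. (illustrative numbers) with 2048 threads per SM and warp_size 32, a
+    // 128-thread CTA starting at hwtid 2016 has end_thread = 96, so start_warp = 63
+    // and end_warp = 3: the warp id range below wraps around and covers warps 63, 0, 1, 2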
     unsigned end_warp = end_thread / m_config->warp_size +
                         ((end_thread % m_config->warp_size) ? 1 : 0);
-    for (unsigned i = start_warp; i < end_warp; ++i) {
+
+    WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+    WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader);
+
+    warp_id_range.loop([&](const unsigned i){
       unsigned n_active = 0;
       simt_mask_t active_threads;
       for (unsigned t = 0; t < m_config->warp_size; t++) {
         unsigned hwtid = i * m_config->warp_size + t;
-        if (hwtid < end_thread) {
+        if ( tid_range.contains(hwtid) ) {
           n_active++;
           assert(!m_active_threads.test(hwtid));
           m_active_threads.set(hwtid);
@@ -574,8 +590,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
       m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id);
       ++m_dynamic_warp_id;
       m_not_completed += n_active;
-      ++m_active_warps;
-    }
+      ++m_active_warps;
+    });
   }
 }
 
@@ -3337,6 +3353,20 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem,
   }
 }
 
+/**
+ * @brief Given the per-CTA resource requirements of a kernel, calculate how
+ * many such CTAs a shader core can sustain when it is "empty". In other
+ * words, it checks whether the CTA is too "fat" to fit on a core and,
+ * if it fits, how many copies fit.
+ *
+ * Although this function is declared const (it promises not to modify any
+ * state of the shader_core_config class), it also checks whether
+ * adaptive_cache_config is enabled and, if so, it may modify some state of
+ * the cache configuration. Read the code yourself if you are concerned!
+ *
+ * @param k The kernel whose CTAs are being sized.
+ * @return unsigned int How many CTAs of the kernel can be sustained on a core.
+ */
 unsigned int shader_core_config::max_cta(const kernel_info_t &k) const {
   unsigned threads_per_cta = k.threads_per_cta();
   const class function_info *kernel = k.entry();
diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h
index c3e6f93ed..155f479c3 100644
--- a/src/gpgpu-sim/shader.h
+++ b/src/gpgpu-sim/shader.h
@@ -957,7 +957,7 @@ class opndcoll_rfu_t {  // operand collector based register file unit
     m_sub_core_model = sub_core_model;
     m_num_warp_scheds = num_warp_scheds;
     if (m_sub_core_model) {
-      m_last_cu_set = new unsigned(m_num_warp_scheds);
+      m_last_cu_set = new unsigned[m_num_warp_scheds];
       for (unsigned i = 0; i < m_num_warp_scheds; i++) {
         m_last_cu_set[i] =
             i * m_num_collectors / m_num_warp_scheds;
@@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t {
  public:
   bool can_issue_1block(kernel_info_t &kernel);
   bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy);
-  void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel);
-  int find_available_hwtid(unsigned int cta_size, bool occupy);
+  void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel);
+  int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy);
 
  private:
   unsigned int m_occupied_n_threads;
@@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx {
 };
 
 class simt_core_cluster {
+  friend class shader_core_ctx;
  public:
   simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
                     const shader_core_config *config,
diff --git a/src/trace_streams.tup b/src/trace_streams.tup
index 074c7c880..4457f6c25 100644
--- a/src/trace_streams.tup
+++ b/src/trace_streams.tup
@@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type )
     TS_TUP( MEMORY_SUBPARTITION_UNIT ),
     TS_TUP( INTERCONNECT ),
     TS_TUP( LIVENESS ),
+    TS_TUP( SUBCORE ),
     TS_TUP( NUM_TRACE_STREAMS )
 TS_TUP_END( trace_streams_type )