diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 5af244b33..136ed0261 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -1585,25 +1585,62 @@ bool shader_core_ctx::can_issue_1block(kernel_info_t &kernel) {
     return (get_n_active_cta() < m_config->max_cta(kernel));
   }
 }
-
-int shader_core_ctx::find_available_hwtid(unsigned int cta_size, bool occupy) {
-  unsigned int step;
-  for (step = 0; step < m_config->n_thread_per_shader; step += cta_size) {
-    unsigned int hw_tid;
-    for (hw_tid = step; hw_tid < step + cta_size; hw_tid++) {
-      if (m_occupied_hwtid.test(hw_tid)) break;
+
+/**
+ * @brief Tries to find a contiguous range of available {hw_tid}s (and, if
+ * requested, marks them as occupied). Wrap-arounds are allowed.
+ *
+ * @param cta_size How many threads this CTA contains. Should already be
+ * "padded" to an integer multiple of the max warp size (m_config->warp_size).
+ * @param kernel The kernel this CTA belongs to (used for logging).
+ * @param occupy Set to "false" for a dry run.
+ * @return -1 if a contiguous range that can fit all threads of this CTA
+ * cannot be found; otherwise the hw_tid to which the first thread of this CTA
+ * maps. Note that since wrap-arounds can happen, naively adding cta_size to the
+ * retval - which is the start_thread - can result in a value exceeding the
+ * simulated hardware limits.
+ */
+int shader_core_ctx::find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy) {
+  //TODO: use round robin based on dynamic_warp id
+  const unsigned int& warp_size = m_config->warp_size;
+
+  unsigned int step = 0;
+  while (step < m_config->n_thread_per_shader) {
+    //Subcore experiments on Volta V100
+    //show that warps are assigned to subcores in a Round-Robin fashion,
+    //so we should start testing from the successor of the subcore
+    //to which the last warp was assigned.
+
+    //Note: Warp ids are bound to a specific scheduler - which
+    //is equivalent to a subcore - based on (warp_id modulo # of schedulers)
+
+    //m_dynamic_warp_id is incremented after a warp has been initiated,
+    //therefore we don't need to add one to find the "next" subcore
+    //(ref: shader_core_ctx::init_warps)
+    unsigned int i;
+    for (i = step; i < step + cta_size; i++) {
+      unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
+      if (m_occupied_hwtid.test(hw_tid)) break; //break from this inner for-loop
+    }
+    if (i == step + cta_size) // consecutive non-active
+      break; //break from the outer while-loop
+    else {
+      //start from the next warp slot
+      //e.g. if step was 32, i was 35, and warp_size is 32, then step will be updated to 64
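+      //(step and i index the un-rotated search space; the hw_tids actually tested
+      //above additionally include the (m_dynamic_warp_id * warp_size) round-robin
+      //offset, e.g. - illustrative numbers - with warp_size 32 and m_dynamic_warp_id 5,
+      //step 0 corresponds to hw_tids starting at (0 + 5*32) % n_thread_per_shader = 160)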
+      step = (i / warp_size + 1) * warp_size;
     }
-    if (hw_tid == step + cta_size)  // consecutive non-active
-      break;
   }
-  if (step >= m_config->n_thread_per_shader)  // didn't find
+  if (step >= m_config->n_thread_per_shader){ // didn't find
+    DPRINTF(SUBCORE, "SM unit %d cannot find proper hwtid to occupy for kernel uid %u\n", this->m_cluster->m_cluster_id, kernel.get_uid());
     return -1;
+  }
+
   else {
     if (occupy) {
-      for (unsigned hw_tid = step; hw_tid < step + cta_size; hw_tid++)
+      DPRINTF(SUBCORE, "SM unit %d tid %d to %d occupied for kernel uid %u\n", this->m_cluster->m_cluster_id, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader, (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader+cta_size-1, kernel.get_uid());
+      for (unsigned i = step; i < step + cta_size; i++){
+        unsigned int hw_tid = (i + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
         m_occupied_hwtid.set(hw_tid);
+      }
     }
-    return step;
+    return (step + m_dynamic_warp_id*warp_size) % m_config->n_thread_per_shader;
   }
 }
 
@@ -1619,13 +1656,23 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k,
   if (m_occupied_n_threads + padded_cta_size > m_config->n_thread_per_shader)
     return false;
 
-  if (find_available_hwtid(padded_cta_size, false) == -1) return false;
+  //Even if the number of available "thread slots" exceeds our CTA size,
+  //these slots may be fragmented (non-contiguous regions), in which case
+  //we still might not be able to launch this CTA.
+  //Fragmentation can only happen at the granularity of the warp size,
+  //since hwtids are allocated at warp_size granularity.
+  //It remains a TODO to find out if a CTA *can* launch when the warps of this CTA
+  //have no choice but to map to non-contiguous regions of hwtid.
+  if (find_available_hwtid(padded_cta_size, k, false) == -1) return false;
 
   const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel);
 
   if (m_occupied_shmem + kernel_info->smem > m_config->gpgpu_shmem_size)
     return false;
 
+  //TODO: check if each subcore has enough regs for this block.
+  //This requires tracking the amount of available regs per subcore,
+  //plus knowing how many warps are to be issued on each subcore.
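+  // e.g. (illustrative numbers) a kernel using 30 registers per thread is rounded
+  // up to 32 below, since (30 + 3) & ~3 == 32, so a 256-thread padded CTA occupies
+  // 256 * 32 = 8192 registers of the per-core register file.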
   unsigned int used_regs = padded_cta_size * ((kernel_info->regs + 3) & ~3);
   if (m_occupied_regs + used_regs > m_config->gpgpu_shader_registers)
     return false;
 
@@ -1649,7 +1696,7 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k,
 }
 
 void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid,
-                                                     kernel_info_t &k) {
+                                                     const kernel_info_t &k) {
   if (m_config->gpgpu_concurrent_kernel_sm) {
     unsigned threads_per_cta = k.threads_per_cta();
     const class function_info *kernel = k.entry();
@@ -1661,11 +1708,17 @@ void shader_core_ctx::release_shader_resource_1block(unsigned hw_ctaid,
     assert(m_occupied_n_threads >= padded_cta_size);
     m_occupied_n_threads -= padded_cta_size;
 
-    int start_thread = m_occupied_cta_to_hwtid[hw_ctaid];
+    unsigned start_thread = m_occupied_cta_to_hwtid[hw_ctaid];
+    unsigned end_thread = ((start_thread + padded_cta_size) - 1) % m_config->n_thread_per_shader + 1;
+
+    WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+
+    DPRINTF(SUBCORE, "SM unit %u tid %d to %d released for kernel uid %u\n", this->m_cluster->m_cluster_id, start_thread, start_thread + padded_cta_size - 1, k.get_uid());
 
-    for (unsigned hwtid = start_thread; hwtid < start_thread + padded_cta_size;
-         hwtid++)
+    tid_range.loop([&](const unsigned hwtid){
       m_occupied_hwtid.reset(hwtid);
+    });
+
     m_occupied_cta_to_hwtid.erase(hw_ctaid);
 
     const struct gpgpu_ptx_sim_info *kernel_info = ptx_sim_kernel_info(kernel);
@@ -1702,8 +1755,12 @@ unsigned exec_shader_core_ctx::sim_init_thread(
 
 void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   if (!m_config->gpgpu_concurrent_kernel_sm)
     set_max_cta(kernel);
-  else
+  else{
+    //shader_core_ctx::can_issue_1block should have already verified that one block
+    //is indeed issuable on this shader core, therefore we expect
+    //occupy_shader_resource_1block to return true here.
     assert(occupy_shader_resource_1block(kernel, true));
+  }
 
   kernel.inc_running();
 
@@ -1734,15 +1791,26 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
     padded_cta_size =
         ((cta_size / m_config->warp_size) + 1) * (m_config->warp_size);
 
-  unsigned int start_thread, end_thread;
+  //find available hwtids
+  // Note: A lot of legacy functions that take in a range of thread ids
+  // are built on the assumption that no wrap-around happens.
+  // However, with the subcore model this is no longer true,
+  // so it is necessary to perform a wrap-around here.
+  // E.g. to demo the effect of wrap-around: if CTA size is 10,
+  // n_thread_per_shader is 20 and start_thread is 18, end_thread will
+  // not be 28 but 8.
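+  // With the same numbers, the formula used below gives 18 + 10 = 28 and then
+  // (28 - 1) % 20 + 1 = 8, i.e. end_thread wraps back into [0, n_thread_per_shader].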
+  unsigned int start_thread, end_thread;
   if (!m_config->gpgpu_concurrent_kernel_sm) {
     start_thread = free_cta_hw_id * padded_cta_size;
     end_thread = start_thread + cta_size;
+    end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1;
   } else {
-    start_thread = find_available_hwtid(padded_cta_size, true);
+    start_thread = find_available_hwtid(padded_cta_size, kernel, true);
     assert((int)start_thread != -1);
     end_thread = start_thread + cta_size;
+
+    end_thread = (end_thread-1) % m_config->n_thread_per_shader + 1;
     assert(m_occupied_cta_to_hwtid.find(free_cta_hw_id) ==
            m_occupied_cta_to_hwtid.end());
     m_occupied_cta_to_hwtid[free_cta_hw_id] = start_thread;
@@ -1751,7 +1819,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   // reset the microarchitecture state of the selected hardware thread and warp
   // contexts
   reinit(start_thread, end_thread, false);
-  
+
   // initalize scalar threads and determine which hardware warps they are
   // allocated to bind functional simulation state of threads to hardware
   // resources (simulation)
@@ -1761,11 +1829,16 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
   symbol_table *symtab = kernel_func_info->get_symtab();
   unsigned ctaid = kernel.get_next_cta_id_single();
   checkpoint *g_checkpoint = new checkpoint();
-  for (unsigned i = start_thread; i < end_thread; i++) {
+
+  //passed as the "threads_left" argument to sim_init_thread
+  int threads_left = cta_size;
+
+  WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+  tid_range.loop([&](const unsigned i){
     m_threadState[i].m_cta_id = free_cta_hw_id;
     unsigned warp_id = i / m_config->warp_size;
     nthreads_in_block += sim_init_thread(
-        kernel, &m_thread[i], m_sid, i, cta_size - (i - start_thread),
+        kernel, &m_thread[i], m_sid, i, threads_left--,
         m_config->n_thread_per_shader, this, free_cta_hw_id, warp_id,
         m_cluster->get_gpu());
     m_threadState[i].m_active = true;
@@ -1774,16 +1847,17 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
         ctaid >= m_gpu->resume_CTA && ctaid < m_gpu->checkpoint_CTA_t) {
       char fname[2048];
       snprintf(fname, 2048, "checkpoint_files/thread_%d_%d_reg.txt",
-               i % cta_size, ctaid);
+              i % cta_size, ctaid);
       m_thread[i]->resume_reg_thread(fname, symtab);
       char f1name[2048];
       snprintf(f1name, 2048, "checkpoint_files/local_mem_thread_%d_%d_reg.txt",
-               i % cta_size, ctaid);
+              i % cta_size, ctaid);
       g_checkpoint->load_global_mem(m_thread[i]->m_local_mem, f1name);
     }
     // warps.set(warp_id);
-  }
+  });
+
   assert(nthreads_in_block > 0 &&
          nthreads_in_block <=
             m_config->n_thread_per_shader);  // should be at least one, but
@@ -1803,6 +1877,7 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) {
 
   // initialize the SIMT stacks and fetch hardware
   init_warps(free_cta_hw_id, start_thread, end_thread, ctaid, cta_size, kernel);
+
   m_n_active_cta++;
 
   shader_CTA_count_log(m_sid, 1);
diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h
index de69ef8ce..7b1b38c85 100644
--- a/src/gpgpu-sim/gpu-sim.h
+++ b/src/gpgpu-sim/gpu-sim.h
@@ -1,17 +1,18 @@
-// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas
-// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers
-// The University of British Columbia, Northwestern University, Purdue University
+// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah,
+// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The
+// University of British Columbia, Northwestern University, Purdue University
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
-// 1. Redistributions of source code must retain the above copyright notice, this
+// 1. Redistributions of source code must retain the above copyright notice,
+// this
 // list of conditions and the following disclaimer;
 // 2. Redistributions in binary form must reproduce the above copyright notice,
 // this list of conditions and the following disclaimer in the documentation
 // and/or other materials provided with the distribution;
-// 3. Neither the names of The University of British Columbia, Northwestern 
+// 3. Neither the names of The University of British Columbia, Northwestern
 // University nor the names of their contributors may be used to
 // endorse or promote products derived from this software without specific
 // prior written permission.
@@ -28,7 +29,6 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 
-
 #ifndef GPU_SIM_H
 #define GPU_SIM_H
 
@@ -36,6 +36,8 @@
 #include
 #include
 #include
+#include <functional>
+#include <cassert>
 #include "../abstract_hardware_model.h"
 #include "../option_parser.h"
 #include "../trace.h"
@@ -72,7 +74,7 @@ extern tr1_hash_map address_random_interleaving;
 enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 };
 
 enum hw_perf_t {
-  HW_BENCH_NAME=0,
+  HW_BENCH_NAME = 0,
   HW_KERNEL_NAME,
   HW_L1_RH,
   HW_L1_RM,
@@ -108,7 +110,7 @@ struct power_config {
       s++;
     }
     char buf1[1024];
-    //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date);
+    // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date);
     snprintf(buf1, 1024, "accelwattch_power_report.log");
     g_power_filename = strdup(buf1);
     char buf2[1024];
@@ -155,7 +157,6 @@ struct power_config {
   double gpu_steady_power_deviation;
   double gpu_steady_min_period;
 
-
   char *g_hw_perf_file_name;
   char *g_hw_perf_bench_name;
   int g_power_simulation_mode;
@@ -735,4 +736,106 @@ class exec_gpgpu_sim : public gpgpu_sim {
   virtual void createSIMTCluster();
 };
 
+
+/**
+ * @brief Represents a range of unsigned indices that can wrap around
+ * at a certain threshold value. The purpose of this class is to
+ * provide a programmer-friendly and performant way to run a for loop over
+ * a range of indices that can potentially wrap over at the max value.
+ *
+ * This class comes in handy when looping over a range of hwtids and warp_ids
+ * with the subcore model in effect. Threads of a certain CTA may start mapping
+ * to the higher portion of the hwtid space and wrap around at the max thread id.
+ * E.g. assuming the max threads per SM is 2048, the CTA size is 128 threads, and
+ * the CTA's first thread maps to hwtid=2016, then the last thread maps
+ * to hwtid = (2016 + 128) % 2048 - 1 = 95. Hence the wrap-around.
+ *
+ * Hard-coding a for-loop that can detect wrap-arounds can make the code look
+ * complicated; populating an ordered list of indices to iterate over is
+ * straightforward but both space- and time-inefficient. This class offers the
+ * benefit of functional programming by letting the programmer specify a
+ * lambda function to apply to each index within the specified range.
+ *
+ * The lambda function is required to take in one const unsigned argument and
+ * return void (i.e. std::function<void(const unsigned)>).
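+ * (There is no mechanism to break out of loop() early from inside the lambda;
+ * every index in the range is visited.)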
+ * It is recommended that the programmer use [&] to capture by-reference
+ * everything in the context, so as to mimic the effect of running a naked for-loop.
+ *
+ * E.g. if the original code was
+ * ```
+ * //variables like a, b, c are in the scope
+ * for(unsigned int i=12; i<18; ++i){
+ *   //do things depending on value of i on a, b, and c
+ * }
+ * //use modified values of a, b, and c
+ * ```
+ *
+ * then the code can look like this when using WrappableUnsignedRange:
+ *
+ * ```
+ * //variables like a, b, c are in the scope
+ * WrappableUnsignedRange r(12, 18, 10000);
+ * r.loop(
+ *   [&](const unsigned i){
+ *     //do things depending on value of i on a, b, and c
+ *   }
+ * );
+ * //use modified values of a, b, and c
+ * ```
+ *
+ * Note: When start_index < end_index, the range of indices is [start, end).
+ * When start_index > end_index, the range is [start, wrapping_threshold) plus
+ * [0, end).
+ * When start_index == end_index, the range is considered empty.
+ *
+ * It is required that 0 <= start_index < wrapping_threshold and
+ * 0 < end_index <= wrapping_threshold.
+ */
+class WrappableUnsignedRange {
+ public:
+  unsigned start_index;
+  unsigned end_index;
+  unsigned wrapping_threshold;
+
+  WrappableUnsignedRange(const unsigned start, const unsigned end,
+                         const unsigned threshold)
+      : start_index(start), end_index(end), wrapping_threshold(threshold) {}
+
+  //membership test under the same wrap-around semantics as loop()
+  bool contains(const unsigned index) const {
+    if (start_index < end_index)
+      return (index >= start_index) && (index < end_index);
+    else if (start_index > end_index)
+      return (index >= start_index) || (index < end_index);
+    else
+      return false;  //empty range
+  }
+
+  void loop(std::function<void(const unsigned)> loop_body_function) {
+    assert(start_index >= 0);
+    assert(start_index < wrapping_threshold);
+    assert(end_index > 0);
+    assert(end_index <= wrapping_threshold);
+
+    unsigned index = start_index;
+    while (index != end_index) {
+      if (index >= wrapping_threshold) {
+        index = 0;
+      }
+      loop_body_function(index);
+      ++index;
+    }
+  }
+};
+
 #endif
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
index 814311d1c..6b5813ab3 100644
--- a/src/gpgpu-sim/shader.cc
+++ b/src/gpgpu-sim/shader.cc
@@ -477,6 +477,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu,
                      config->max_barriers_per_cta, config->warp_size),
       m_active_warps(0),
       m_dynamic_warp_id(0) {
+  m_cluster = cluster;
   m_config = config;
   m_memory_config = mem_config;
@@ -518,21 +519,32 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread,
     m_occupied_cta_to_hwtid.clear();
     m_active_warps = 0;
   }
-  for (unsigned i = start_thread; i < end_thread; i++) {
+
+  WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+  tid_range.loop([&](const unsigned i){
     m_threadState[i].n_insn = 0;
-    m_threadState[i].m_cta_id = -1;
-  }
-  for (unsigned i = start_thread / m_config->warp_size;
-       i < end_thread / m_config->warp_size; ++i) {
+    m_threadState[i].m_cta_id = -1;
+  });
+
+  const unsigned start_warp = start_thread / m_config->warp_size;
+  const unsigned end_warp = end_thread / m_config->warp_size +
+                            ((end_thread % m_config->warp_size) ? 1 : 0);
+  WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader);
+  warp_id_range.loop([&](const unsigned i){
     m_warp[i]->reset();
-    m_simt_stack[i]->reset();
-  }
+    m_simt_stack[i]->reset();
+  });
 }
 
+/**
+ * @brief Note: To handle the case of hwtid wrap-around (end_thread < start_thread),
+ * this method iterates over the thread and warp ids through WrappableUnsignedRange
+ * instead of plain for-loops.
+ */
 void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
                                  unsigned end_thread, unsigned ctaid,
                                  int cta_size, kernel_info_t &kernel) {
-  //
+  //when concurrent_sm is enabled,
+  //both start_thread and end_thread are hwtid (0 <= x < n_thread_per_shader)
   address_type start_pc = next_pc(start_thread);
   unsigned kernel_id = kernel.get_uid();
   if (m_config->model == POST_DOMINATOR) {
@@ -540,12 +552,16 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
     unsigned warp_per_cta = cta_size / m_config->warp_size;
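+    // e.g. (illustrative numbers) with 2048 threads per SM and warp_size 32, a
+    // 128-thread CTA starting at hwtid 2016 has end_thread = 96, so start_warp = 63
+    // and end_warp = 3: the warp id range below wraps around and covers warps 63, 0, 1, 2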
     unsigned end_warp = end_thread / m_config->warp_size +
                         ((end_thread % m_config->warp_size) ? 1 : 0);
-    for (unsigned i = start_warp; i < end_warp; ++i) {
+
+    WrappableUnsignedRange tid_range(start_thread, end_thread, m_config->n_thread_per_shader);
+    WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader);
+
+    warp_id_range.loop([&](const unsigned i){
       unsigned n_active = 0;
       simt_mask_t active_threads;
       for (unsigned t = 0; t < m_config->warp_size; t++) {
         unsigned hwtid = i * m_config->warp_size + t;
-        if (hwtid < end_thread) {
+        if ( tid_range.contains(hwtid) ) {
           n_active++;
           assert(!m_active_threads.test(hwtid));
           m_active_threads.set(hwtid);
@@ -574,8 +590,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
       m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id);
       ++m_dynamic_warp_id;
       m_not_completed += n_active;
-      ++m_active_warps;
-    }
+      ++m_active_warps;
+    });
   }
 }
 
@@ -3337,6 +3353,20 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem,
   }
 }
 
+/**
+ * @brief Given the per-CTA resource requirements of a kernel, calculate how
+ * many such CTAs a shader core can sustain when it is "empty". In other
+ * words, it checks whether the CTA is too "fat" to fit on a core and,
+ * if it fits, how many copies fit.
+ *
+ * Although this function is declared const (it promises not to modify any
+ * state of the shader_core_config class), it also checks whether
+ * adaptive_cache_config is enabled and, if so, it may modify some state of
+ * the cache configuration. Read the code yourself if you are concerned!
+ *
+ * @param k The kernel whose CTAs are being sized.
+ * @return unsigned int How many CTAs of the kernel can be sustained on a core.
+ */
 unsigned int shader_core_config::max_cta(const kernel_info_t &k) const {
   unsigned threads_per_cta = k.threads_per_cta();
   const class function_info *kernel = k.entry();
diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h
index c3e6f93ed..155f479c3 100644
--- a/src/gpgpu-sim/shader.h
+++ b/src/gpgpu-sim/shader.h
@@ -957,7 +957,7 @@ class opndcoll_rfu_t {  // operand collector based register file unit
     m_sub_core_model = sub_core_model;
     m_num_warp_scheds = num_warp_scheds;
     if (m_sub_core_model) {
-      m_last_cu_set = new unsigned(m_num_warp_scheds);
+      m_last_cu_set = new unsigned[m_num_warp_scheds];
       for (unsigned i = 0; i < m_num_warp_scheds; i++) {
         m_last_cu_set[i] =
             i * m_num_collectors / m_num_warp_scheds;
@@ -2514,8 +2514,8 @@ class shader_core_ctx : public core_t {
  public:
   bool can_issue_1block(kernel_info_t &kernel);
   bool occupy_shader_resource_1block(kernel_info_t &kernel, bool occupy);
-  void release_shader_resource_1block(unsigned hw_ctaid, kernel_info_t &kernel);
-  int find_available_hwtid(unsigned int cta_size, bool occupy);
+  void release_shader_resource_1block(unsigned hw_ctaid, const kernel_info_t &kernel);
+  int find_available_hwtid(unsigned int cta_size, const kernel_info_t &kernel, bool occupy);
 
  private:
   unsigned int m_occupied_n_threads;
@@ -2559,6 +2559,7 @@ class exec_shader_core_ctx : public shader_core_ctx {
 };
 
 class simt_core_cluster {
+  friend class shader_core_ctx;
  public:
   simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
                     const shader_core_config *config,
diff --git a/src/trace_streams.tup b/src/trace_streams.tup
index 074c7c880..4457f6c25 100644
--- a/src/trace_streams.tup
+++ b/src/trace_streams.tup
@@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type )
     TS_TUP( MEMORY_SUBPARTITION_UNIT ),
     TS_TUP( INTERCONNECT ),
     TS_TUP( LIVENESS ),
+    TS_TUP( SUBCORE ),
     TS_TUP( NUM_TRACE_STREAMS )
 TS_TUP_END( trace_streams_type )