diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 27724d6b..daf96d9e 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -31,6 +31,16 @@ jobs: sudo apt-get -y install llvm-12 llvm-12-dev llvm-12-tools clang-12 sudo apt-get -y install build-essential + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.7.16' + + - name: Install Python Dependencies + run: | + python -m pip install --upgrade pip + pip install eventlet pandas matplotlib + - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type @@ -105,3 +115,8 @@ jobs: sh compile.sh sh run.sh sh verify.sh + + - name: Test expandable automatic script + working-directory: ${{github.workspace}}/tools/expandable + run: | + ./demo.sh --test y \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4a6b9a93..06b0854f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,12 @@ build*/ *.ll *.dot *.bc +.vscode/ +.venv/ +tmp/ +fig/ +result/ +__pycache__/ +dfg.json +increMapInput.json +config.json \ No newline at end of file diff --git a/src/DFG.cpp b/src/DFG.cpp index fe6e07a5..0d73ddc9 100644 --- a/src/DFG.cpp +++ b/src/DFG.cpp @@ -33,8 +33,13 @@ DFG::DFG(Function& t_F, list* t_loops, bool t_targetFunction, bool needsCycleCalculation = false; for (auto strategy : *t_fusionStrategy) { if (strategy == "default_heterogeneous") { - combineMulAdd("CoT"); - combinePhiAdd("BrT"); + combine("phi", "add", "Ctrl"); + combine("phi", "fadd", "Ctrl"); + combine("fcmp", "select", "Ctrl"); + combine("icmp", "select", "Ctrl"); + combine("icmp", "br", "Ctrl"); + combine("fcmp", "br", "Ctrl"); + tuneForPattern(); needsCycleCalculation = true; } else if (strategy == "nonlinear") { diff --git a/src/Mapper.cpp b/src/Mapper.cpp index 
e4c05379..ab493e8b 100644 --- a/src/Mapper.cpp +++ b/src/Mapper.cpp @@ -664,9 +664,13 @@ void Mapper::showUtilization(CGRA* t_cgra, DFG* t_dfg, int t_II, total_active_tiles += 1; } float avg_tile_overall_utilization = 0.0; + float max_tile_overall_utilization = 0.0; float avg_tile_fu_utilization = 0.0; float avg_tile_xbar_utilization = 0.0; for (int tile = 0; tile < t_cgra->getFUCount(); ++tile) { + if (max_tile_overall_utilization < tile_overall_utilization[tile]) { + max_tile_overall_utilization = tile_overall_utilization[tile]; + } avg_tile_overall_utilization += tile_overall_utilization[tile]; avg_tile_fu_utilization += tile_fu_utilization[tile]; avg_tile_xbar_utilization += tile_xbar_utilization[tile]; @@ -675,8 +679,10 @@ void Mapper::showUtilization(CGRA* t_cgra, DFG* t_dfg, int t_II, avg_tile_overall_utilization /= total_active_tiles; avg_tile_fu_utilization /= total_active_tiles; avg_tile_xbar_utilization /= total_active_tiles; + //max_tile_overall_utilization /= total_active_tiles; - cout << "tile avg fu utilization: " << avg_tile_fu_utilization*100 << "%; avg xbar utilization: " << avg_tile_xbar_utilization*100 << "%; avg overall utilization: " << avg_tile_overall_utilization*100 << "%" << endl; + cout << "tile avg fu utilization: " << avg_tile_fu_utilization*100 << "%; avg xbar utilization: " << avg_tile_xbar_utilization*100 << "%; avg overall utilization: " << avg_tile_overall_utilization*t_II*100 << "%" << endl; + cout << "max overall utilization: " << max_tile_overall_utilization*t_II*100 << "%" << endl; // Collects the histogram of tiles' utilization. // Histogram for the number of tiles that have utilization of 0%. 
@@ -1007,6 +1013,7 @@ void Mapper::showSchedule(CGRA* t_cgra, DFG* t_dfg, int t_II, cout<<"[Mapping II: "<showSchedule(cgra, dfg, II, isStaticElasticCGRA, parameterizableCGRA); // cout << "==================================\n"; - cout << "[show opcode count]\n"; - dfg->showOpcodeDistribution(); + // cout << "[show opcode count]\n"; + // dfg->showOpcodeDistribution(); cout << "[Mapping Success]\n"; cout << "==================================\n"; if (enableExpandableMapping) { @@ -409,11 +409,11 @@ namespace { */ bool canMap(CGRA* t_cgra, DFG* t_dfg) { std::set missing_fus; - + for (auto it = t_dfg->nodes.begin(); it != t_dfg->nodes.end(); ++it) { DFGNode* node = *it; bool nodeSupported = false; - + for (int i = 0; i < t_cgra->getRows() && !nodeSupported; ++i) { for (int j = 0; j < t_cgra->getColumns(); ++j) { CGRANode* fu = t_cgra->nodes[i][j]; @@ -423,12 +423,12 @@ namespace { } } } - + if (!nodeSupported) { missing_fus.insert(node->getOpcodeName()); } } - + if (!missing_fus.empty()) { std::cout << "[canMap] Missing functional units: "; for (const auto& op : missing_fus) { @@ -437,10 +437,10 @@ namespace { std::cout << std::endl; return false; } - + return true; } - + }; } diff --git a/test/compile.sh b/test/compile.sh index eee8141a..7f00f2a2 100755 --- a/test/compile.sh +++ b/test/compile.sh @@ -1,2 +1,3 @@ -clang-12 -emit-llvm -fno-unroll-loops -O3 -o kernel.bc -c kernel.cpp -#llvm-dis fir.bc -o fir.ll +clang-12 -emit-llvm -fno-unroll-loops -O0 -o kernel.bc -c kernel.cpp +llvm-dis-12 kernel.bc -o O0kernel.ll +#clang-12 -emit-llvm -fno-unroll-loops -mllvm -force-vector-width=4 -O3 -o kernel.bc -c ./_matmul/src/matmul.c diff --git a/test/dot.sh b/test/dot.sh index 3bccbd9f..fa80b3c7 100644 --- a/test/dot.sh +++ b/test/dot.sh @@ -1,2 +1 @@ -dot -Tpng _Z6kernelPfS_S_.dot -o kernel.png - +dot -Tpng _Z6kernelPfS_S_.dot -o kernel.png \ No newline at end of file diff --git a/test/inter_edge/compile.sh b/test/inter_edge/compile.sh old mode 100644 new mode 100755 diff 
--git a/test/inter_edge/dot.sh b/test/inter_edge/dot.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/rebuild.sh b/test/inter_edge/rebuild.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/run.sh b/test/inter_edge/run.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/verify.sh b/test/inter_edge/verify.sh old mode 100644 new mode 100755 diff --git a/test/kernels/conv/conv.c b/test/kernels/conv/conv.c index cde86d42..24e95013 100644 --- a/test/kernels/conv/conv.c +++ b/test/kernels/conv/conv.c @@ -76,7 +76,7 @@ int kernel(int ni, int nj, int nk, for (x = 0; x < total; x++) { i = x / NJ; j = x % NJ; - out += A[i][j] * B[i][j]; + out += A [i][j] * B[i][j]; } /* diff --git a/test/kernels/latnrm/latnrm.c b/test/kernels/latnrm/latnrm.c index db886b11..7539acfa 100644 --- a/test/kernels/latnrm/latnrm.c +++ b/test/kernels/latnrm/latnrm.c @@ -45,17 +45,17 @@ void kernel(float input, float *output, float coefficient[16], top = input; q_coef = coefficient[0]; // #pragma clang loop unroll_count(4) - for (i = 0; i < ORDER; i++) { - k_coef = coefficient[2*i]; - left = top; - right = internal_state[i]; - internal_state[i] = bottom; - top = q_coef * left - k_coef * right; - bottom = q_coef * right + k_coef * left; - q_coef = coefficient[2*i+1]; - } - internal_state[i++] = bottom; - internal_state[i] = top; + // for (i = 0; i < ORDER; i++) { + // k_coef = coefficient[2*i]; + // left = top; + // right = internal_state[i]; + // internal_state[i] = bottom; + // top = q_coef * left - k_coef * right; + // bottom = q_coef * right + k_coef * left; + // q_coef = coefficient[2*i+1]; + // } + // internal_state[i++] = bottom; + // internal_state[i] = top; sum = internal_state[1] * q_coef; diff --git a/test/kernels/mvt/mvt.c b/test/kernels/mvt/mvt.c index ed350037..e091d92c 100644 --- a/test/kernels/mvt/mvt.c +++ b/test/kernels/mvt/mvt.c @@ -78,8 +78,8 @@ void kernel(int n, { int i, j; -#pragma scop - #pragma clang loop unroll_count(1) +// #pragma scop 
+ // #pragma clang loop unroll_count(1) for (j = 0; j < N; j++) { // #pragma clang loop unroll_count(1) vectorize(disable) //#pragma clang loop unroll_count(1) vectorize_width(4) @@ -88,7 +88,7 @@ void kernel(int n, x2[i] = x2[i] + A[j][i] * y_2[j]; } } -#pragma endscop +// #pragma endscop } diff --git a/test/kernels/relu+histogram/compile.sh b/test/kernels/relu+histogram/compile.sh new file mode 100644 index 00000000..11be835a --- /dev/null +++ b/test/kernels/relu+histogram/compile.sh @@ -0,0 +1,4 @@ +clang-12 -emit-llvm -O3 -fno-unroll-loops -o kernel.bc -c relu.c +llvm-dis-12 kernel.bc -o kernel.ll +opt-12 --loop-unroll --unroll-count=4 kernel.bc -o kernel_unroll.bc +llvm-dis-12 kernel_unroll.bc -o kernel_unroll.ll diff --git a/test/kernels/relu+histogram/param.json b/test/kernels/relu+histogram/param.json new file mode 100644 index 00000000..b2d46633 --- /dev/null +++ b/test/kernels/relu+histogram/param.json @@ -0,0 +1,19 @@ +{ + "kernel" : "kernel", + "targetFunction" : false, + "targetNested" : false, + "targetLoopsID" : [1], + "doCGRAMapping" : false, + "row" : 4, + "column" : 4, + "diagonalVectorization" : true, + "fusionStrategy" : [], + "isTrimmedDemo" : true, + "heuristicMapping" : false, + "bypassConstraint" : 4, + "isStaticElasticCGRA" : false, + "precisionAware" : false, + "ctrlMemConstraint" : 200, + "regConstraint" : 8 +} + diff --git a/test/kernels/relu+histogram/polybench.h b/test/kernels/relu+histogram/polybench.h new file mode 100644 index 00000000..d1a1f776 --- /dev/null +++ b/test/kernels/relu+histogram/polybench.h @@ -0,0 +1,217 @@ +/* + * Polybench header for instrumentation. + * + * Programs must be compiled with `-I utilities utilities/polybench.c' + * + * Optionally, one can define: + * + * -DPOLYBENCH_TIME, to report the execution time, + * OR (exclusive): + * -DPOLYBENCH_PAPI, to use PAPI H/W counters (defined in polybench.c) + * + * + * See README or utilities/polybench.c for additional options. 
+ * + */ +#ifndef POLYBENCH_H +# define POLYBENCH_H + +# include + +/* Array padding. By default, none is used. */ +# ifndef POLYBENCH_PADDING_FACTOR +/* default: */ +# define POLYBENCH_PADDING_FACTOR 0 +# endif + + +/* C99 arrays in function prototype. By default, do not use. */ +# ifdef POLYBENCH_USE_C99_PROTO +# define POLYBENCH_C99_SELECT(x,y) y +# else +/* default: */ +# define POLYBENCH_C99_SELECT(x,y) x +# endif + + +/* Scalar loop bounds in SCoPs. By default, use parametric loop bounds. */ +# ifdef POLYBENCH_USE_SCALAR_LB +# define POLYBENCH_LOOP_BOUND(x,y) x +# else +/* default: */ +# define POLYBENCH_LOOP_BOUND(x,y) y +# endif + +/* Use the 'restrict' keyword to declare that the different arrays do not + * alias. By default, we do not use it as it is only supported in C99 and + * even here several compilers do not properly get it. + */ +# ifdef POLYBENCH_USE_RESTRICT +# define POLYBENCH_RESTRICT restrict +# else +/* default: */ +# define POLYBENCH_RESTRICT +# endif + +/* Macros to reference an array. Generic for heap and stack arrays + (C99). Each array dimensionality has his own macro, to be used at + declaration or as a function argument. + Example: + int b[x] => POLYBENCH_1D_ARRAY(b, x) + int A[N][N] => POLYBENCH_2D_ARRAY(A, N, N) +*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_ARRAY(x) *x +# define POLYBENCH_FREE_ARRAY(x) free((void*)x); +# define POLYBENCH_DECL_VAR(x) (*x) +# else +# define POLYBENCH_ARRAY(x) x +# define POLYBENCH_FREE_ARRAY(x) +# define POLYBENCH_DECL_VAR(x) x +# endif +/* Macros for using arrays in the function prototypes. 
*/ +# define POLYBENCH_1D(var, dim1,ddim1) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] +/* Macros for using arrays within the functions. 
*/ +# define POLYBENCH_1D_F(var, dim1,ddim1) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D_F(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D_F(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D_F(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D_F(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] + + +/* Macros to allocate heap arrays. 
+ Example: + polybench_alloc_2d_array(N, M, double) => allocates N x M x sizeof(double) + and returns a pointer to the 2d array + */ +# define POLYBENCH_ALLOC_1D_ARRAY(n1, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data (n1 + POLYBENCH_PADDING_FACTOR, sizeof(type)) +# define POLYBENCH_ALLOC_2D_ARRAY(n1, n2, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_3D_ARRAY(n1, n2, n3, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_4D_ARRAY(n1, n2, n3, n4, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_5D_ARRAY(n1, n2, n3, n4, n5, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR][n5 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR) * (n5 + POLYBENCH_PADDING_FACTOR), sizeof(type)) + +/* Macros for array declaration. 
*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); \ + var = POLYBENCH_ALLOC_1D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), type); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); \ + var = POLYBENCH_ALLOC_2D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), type); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); \ + var = POLYBENCH_ALLOC_3D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), type); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); \ + var = POLYBENCH_ALLOC_4D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), type); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); \ + var = POLYBENCH_ALLOC_5D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), POLYBENCH_C99_SELECT(dim5, ddim5), type); +# else +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + 
type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); +# endif + + +/* Dead-code elimination macros. Use argc/argv for the run-time check. */ +# ifndef POLYBENCH_DUMP_ARRAYS +# define POLYBENCH_DCE_ONLY_CODE if (argc > 42 && ! strcmp(argv[0], "")) +# else +# define POLYBENCH_DCE_ONLY_CODE +# endif + +#define POLYBENCH_DUMP_TARGET stderr +#define POLYBENCH_DUMP_START fprintf(POLYBENCH_DUMP_TARGET, "==BEGIN DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_FINISH fprintf(POLYBENCH_DUMP_TARGET, "==END DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_BEGIN(s) fprintf(POLYBENCH_DUMP_TARGET, "begin dump: %s", s) +#define POLYBENCH_DUMP_END(s) fprintf(POLYBENCH_DUMP_TARGET, "\nend dump: %s\n", s) + +# define polybench_prevent_dce(func) \ + POLYBENCH_DCE_ONLY_CODE \ + func + + +/* Performance-related instrumentation. See polybench.c */ +# define polybench_start_instruments +# define polybench_stop_instruments +# define polybench_print_instruments + + +/* PAPI support. 
*/ +# ifdef POLYBENCH_PAPI +extern const unsigned int polybench_papi_eventlist[]; +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_set_papi_thread_report(x) \ + polybench_papi_counters_threadid = x; +# define polybench_start_instruments \ + polybench_prepare_instruments(); \ + polybench_papi_init(); \ + int evid; \ + for (evid = 0; polybench_papi_eventlist[evid] != 0; evid++) \ + { \ + if (polybench_papi_start_counter(evid)) \ + continue; \ + +# define polybench_stop_instruments \ + polybench_papi_stop_counter(evid); \ + } \ + polybench_papi_close(); \ + +# define polybench_print_instruments polybench_papi_print(); +# endif + + +/* Timing support. */ +# if defined(POLYBENCH_TIME) || defined(POLYBENCH_GFLOPS) +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_start_instruments polybench_timer_start(); +# define polybench_stop_instruments polybench_timer_stop(); +# define polybench_print_instruments polybench_timer_print(); +extern double polybench_program_total_flops; +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +/* Function declaration. */ +# ifdef POLYBENCH_TIME +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +# ifdef POLYBENCH_PAPI +extern void polybench_prepare_instruments(); +extern int polybench_papi_start_counter(int evid); +extern void polybench_papi_stop_counter(int evid); +extern void polybench_papi_init(); +extern void polybench_papi_close(); +extern void polybench_papi_print(); +# endif + +/* Function prototypes. 
*/ +extern void* polybench_alloc_data(unsigned long long int n, int elt_size); + + +#endif /* !POLYBENCH_H */ diff --git a/test/kernels/relu+histogram/relu+histogram.c b/test/kernels/relu+histogram/relu+histogram.c new file mode 100644 index 00000000..cd36403d --- /dev/null +++ b/test/kernels/relu+histogram/relu+histogram.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include "polybench.h" +#include "relu.h" +// histogram +#define DATA_LEN 20 +#define BUCKET_LEN 5 +#define MIN 1.0 +#define MAX 19.0 + +float input_data[DATA_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,14,14,14,14,14,19}; +int histogram[BUCKET_LEN] = {0}; + +/* Array initialization. */ +static +void init_array(int ni, int nj, int nk, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NK,NJ,nk,nj)) +{ + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + C[i][j] = (DATA_TYPE) (i*j % ni) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + A[i][j] = (DATA_TYPE) (i*(j+1) % nk) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + B[i][j] = (DATA_TYPE) (i*(j+2) % nj) / nj; +} + + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static +void print_array(int ni, int nj, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj)) +{ + int i, j; + + POLYBENCH_DUMP_START; + POLYBENCH_DUMP_BEGIN("C"); + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + if ((i * ni + j) % 20 == 0) fprintf (POLYBENCH_DUMP_TARGET, "\n"); + fprintf (POLYBENCH_DUMP_TARGET, DATA_PRINTF_MODIFIER, C[i][j]); + } + POLYBENCH_DUMP_END("C"); + POLYBENCH_DUMP_FINISH; +} + + +/* Main computational kernel. The whole function will be timed, + including the call and return. 
*/ +void kernel(int ni, int nj, int nk, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj),float input[], int histogram[]) +{ + int x = 0, i = 0, j = 0, k = 0; + int total = NI * NJ; + float dmin = (float)MIN; + float delt = (float)(MAX - dmin); + //#pragma clang loop vectorize(disable) unroll_count(4) + // #pragma clang loop vectorize(enable) vectorize_width(4) unroll_count(4) + for (x = 0; x < total; x++) { + i = x / NJ; + j = x % NJ; + if (A[i][j] < 0) + C[i][j] = 0; + else + C[i][j] = A[i][j]; + + float r = BUCKET_LEN * (input[x] - dmin) / delt; + int b = (int)(r); + histogram[b]++; + } +} + + +int main(int argc, char** argv) +{ + /* Retrieve problem size. */ + int ni = NI; + int nj = NJ; + int nk = NK; + + /* Variable declaration/allocation. */ + POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj); + + /* Initialize array(s). */ + init_array (ni, nj, nk, + POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B)); + + /* Start timer. */ + polybench_start_instruments; + + /* Run kernel. */ + kernel(ni, nj, nk, + POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B),input_data, histogram); + + /* Stop and print timer. */ + polybench_stop_instruments; + polybench_print_instruments; + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + polybench_prevent_dce(print_array(ni, nj, POLYBENCH_ARRAY(C))); + + /* Be clean. */ + POLYBENCH_FREE_ARRAY(C); + POLYBENCH_FREE_ARRAY(A); + POLYBENCH_FREE_ARRAY(B); + + return 0; +} diff --git a/test/kernels/relu+histogram/relu.h b/test/kernels/relu+histogram/relu.h new file mode 100644 index 00000000..4615e081 --- /dev/null +++ b/test/kernels/relu+histogram/relu.h @@ -0,0 +1,80 @@ +#ifndef _RELU_H +# define _RELU_H + +#define DATA_TYPE_IS_INT + +/* Default to LARGE_DATASET. 
*/ +# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(MEDIUM_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET) +# define LARGE_DATASET +# endif + +# if !defined(NI) && !defined(NJ) && !defined(NK) +/* Define sample dataset sizes. */ +# ifdef MINI_DATASET +# define NI 20 +# define NJ 25 +# define NK 30 +# endif + +# ifdef SMALL_DATASET +# define NI 60 +# define NJ 70 +# define NK 80 +# endif + +# ifdef MEDIUM_DATASET +# define NI 200 +# define NJ 220 +# define NK 240 +# endif + +# ifdef LARGE_DATASET +# define NI 1000 +# define NJ 1100 +# define NK 1200 +# endif + +# ifdef EXTRALARGE_DATASET +# define NI 2000 +# define NJ 2300 +# define NK 2600 +# endif + + +#endif /* !(NI NJ NK) */ + +# define _PB_NI POLYBENCH_LOOP_BOUND(NI,ni) +# define _PB_NJ POLYBENCH_LOOP_BOUND(NJ,nj) +# define _PB_NK POLYBENCH_LOOP_BOUND(NK,nk) + + +/* Default data type */ +# if !defined(DATA_TYPE_IS_INT) && !defined(DATA_TYPE_IS_FLOAT) && !defined(DATA_TYPE_IS_DOUBLE) +# define DATA_TYPE_IS_DOUBLE +# endif + +#ifdef DATA_TYPE_IS_INT +# define DATA_TYPE int +# define DATA_PRINTF_MODIFIER "%d " +#endif + +#ifdef DATA_TYPE_IS_FLOAT +# define DATA_TYPE float +# define DATA_PRINTF_MODIFIER "%0.2f " +# define SCALAR_VAL(x) x##f +# define SQRT_FUN(x) sqrtf(x) +# define EXP_FUN(x) expf(x) +# define POW_FUN(x,y) powf(x,y) +# endif + +#ifdef DATA_TYPE_IS_DOUBLE +# define DATA_TYPE double +# define DATA_PRINTF_MODIFIER "%0.2lf " +# define SCALAR_VAL(x) x +# define SQRT_FUN(x) sqrt(x) +# define EXP_FUN(x) exp(x) +# define POW_FUN(x,y) pow(x,y) +# endif + +#endif /* !_RELU_H */ + diff --git a/test/kernels/relu+histogram/run.sh b/test/kernels/relu+histogram/run.sh new file mode 100644 index 00000000..a5674436 --- /dev/null +++ b/test/kernels/relu+histogram/run.sh @@ -0,0 +1 @@ +opt-12 -load ../../../cgra-mapper/build/src/libmapperPass.so -mapperPass kernel_unroll.bc diff --git a/test/kernels/spmv+conv/compile.sh b/test/kernels/spmv+conv/compile.sh new file mode 100755 
index 00000000..ec8a4182 --- /dev/null +++ b/test/kernels/spmv+conv/compile.sh @@ -0,0 +1,2 @@ +clang-12 -emit-llvm -O3 -fno-unroll-loops -o kernel.bc -c spmv+conv.c +llvm-dis-12 kernel.bc -o kernel.ll diff --git a/test/kernels/spmv+conv/conv.h b/test/kernels/spmv+conv/conv.h new file mode 100644 index 00000000..ead2d3d3 --- /dev/null +++ b/test/kernels/spmv+conv/conv.h @@ -0,0 +1,80 @@ +#ifndef _CONV_H +# define _CONV_H + +#define DATA_TYPE_IS_INT + +/* Default to LARGE_DATASET. */ +# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(MEDIUM_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET) +# define LARGE_DATASET +# endif + +# if !defined(NI) && !defined(NJ) && !defined(NK) +/* Define sample dataset sizes. */ +# ifdef MINI_DATASET +# define NI 20 +# define NJ 25 +# define NK 30 +# endif + +# ifdef SMALL_DATASET +# define NI 60 +# define NJ 70 +# define NK 80 +# endif + +# ifdef MEDIUM_DATASET +# define NI 200 +# define NJ 220 +# define NK 240 +# endif + +# ifdef LARGE_DATASET +# define NI 1000 +# define NJ 1100 +# define NK 1200 +# endif + +# ifdef EXTRALARGE_DATASET +# define NI 2000 +# define NJ 2300 +# define NK 2600 +# endif + + +#endif /* !(NI NJ NK) */ + +# define _PB_NI POLYBENCH_LOOP_BOUND(NI,ni) +# define _PB_NJ POLYBENCH_LOOP_BOUND(NJ,nj) +# define _PB_NK POLYBENCH_LOOP_BOUND(NK,nk) + + +/* Default data type */ +# if !defined(DATA_TYPE_IS_INT) && !defined(DATA_TYPE_IS_FLOAT) && !defined(DATA_TYPE_IS_DOUBLE) +# define DATA_TYPE_IS_DOUBLE +# endif + +#ifdef DATA_TYPE_IS_INT +# define DATA_TYPE int +# define DATA_PRINTF_MODIFIER "%d " +#endif + +#ifdef DATA_TYPE_IS_FLOAT +# define DATA_TYPE float +# define DATA_PRINTF_MODIFIER "%0.2f " +# define SCALAR_VAL(x) x##f +# define SQRT_FUN(x) sqrtf(x) +# define EXP_FUN(x) expf(x) +# define POW_FUN(x,y) powf(x,y) +# endif + +#ifdef DATA_TYPE_IS_DOUBLE +# define DATA_TYPE double +# define DATA_PRINTF_MODIFIER "%0.2lf " +# define SCALAR_VAL(x) x +# define SQRT_FUN(x) sqrt(x) +# define 
EXP_FUN(x) exp(x) +# define POW_FUN(x,y) pow(x,y) +# endif + +#endif /* !_CONV_H */ + diff --git a/test/kernels/spmv+conv/dot.sh b/test/kernels/spmv+conv/dot.sh new file mode 100755 index 00000000..922149a0 --- /dev/null +++ b/test/kernels/spmv+conv/dot.sh @@ -0,0 +1 @@ +dot -Tpng kernel.dot -o spmv+conv.png diff --git a/test/kernels/spmv+conv/param.json b/test/kernels/spmv+conv/param.json new file mode 100644 index 00000000..51661c13 --- /dev/null +++ b/test/kernels/spmv+conv/param.json @@ -0,0 +1,63 @@ +{ + "kernel": "kernel", + "targetFunction": false, + "targetNested": false, + "targetLoopsID": [ + 0 + ], + "doCGRAMapping": true, + "row": 4, + "column": 4, + "precisionAware": false, + "fusionStrategy": [ + "default_heterogeneous" + ], + "isTrimmedDemo": true, + "heuristicMapping": true, + "parameterizableCGRA": false, + "diagonalVectorization": false, + "bypassConstraint": 4, + "isStaticElasticCGRA": false, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping": false, + "vectorFactorForIdiv ": 1, + "testingOpcodeOffset": 0, + "additionalFunc": { + "complex-Ctrl": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "complex-BrT": [ + 4, + 5, + 6, + 7 + ], + "complex-CoT": [ + 8, + 9, + 10, + 11 + ] + }, + "supportDVFS": false, + "DVFSIslandDim": 1, + "DVFSAwareMapping": false, + "enablePowerGating": false, + "expandableMapping": true +} \ No newline at end of file diff --git a/test/kernels/spmv+conv/polybench.h b/test/kernels/spmv+conv/polybench.h new file mode 100644 index 00000000..d1a1f776 --- /dev/null +++ b/test/kernels/spmv+conv/polybench.h @@ -0,0 +1,217 @@ +/* + * Polybench header for instrumentation. 
+ * + * Programs must be compiled with `-I utilities utilities/polybench.c' + * + * Optionally, one can define: + * + * -DPOLYBENCH_TIME, to report the execution time, + * OR (exclusive): + * -DPOLYBENCH_PAPI, to use PAPI H/W counters (defined in polybench.c) + * + * + * See README or utilities/polybench.c for additional options. + * + */ +#ifndef POLYBENCH_H +# define POLYBENCH_H + +# include + +/* Array padding. By default, none is used. */ +# ifndef POLYBENCH_PADDING_FACTOR +/* default: */ +# define POLYBENCH_PADDING_FACTOR 0 +# endif + + +/* C99 arrays in function prototype. By default, do not use. */ +# ifdef POLYBENCH_USE_C99_PROTO +# define POLYBENCH_C99_SELECT(x,y) y +# else +/* default: */ +# define POLYBENCH_C99_SELECT(x,y) x +# endif + + +/* Scalar loop bounds in SCoPs. By default, use parametric loop bounds. */ +# ifdef POLYBENCH_USE_SCALAR_LB +# define POLYBENCH_LOOP_BOUND(x,y) x +# else +/* default: */ +# define POLYBENCH_LOOP_BOUND(x,y) y +# endif + +/* Use the 'restrict' keyword to declare that the different arrays do not + * alias. By default, we do not use it as it is only supported in C99 and + * even here several compilers do not properly get it. + */ +# ifdef POLYBENCH_USE_RESTRICT +# define POLYBENCH_RESTRICT restrict +# else +/* default: */ +# define POLYBENCH_RESTRICT +# endif + +/* Macros to reference an array. Generic for heap and stack arrays + (C99). Each array dimensionality has his own macro, to be used at + declaration or as a function argument. + Example: + int b[x] => POLYBENCH_1D_ARRAY(b, x) + int A[N][N] => POLYBENCH_2D_ARRAY(A, N, N) +*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_ARRAY(x) *x +# define POLYBENCH_FREE_ARRAY(x) free((void*)x); +# define POLYBENCH_DECL_VAR(x) (*x) +# else +# define POLYBENCH_ARRAY(x) x +# define POLYBENCH_FREE_ARRAY(x) +# define POLYBENCH_DECL_VAR(x) x +# endif +/* Macros for using arrays in the function prototypes. 
*/ +# define POLYBENCH_1D(var, dim1,ddim1) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] +/* Macros for using arrays within the functions. 
*/ +# define POLYBENCH_1D_F(var, dim1,ddim1) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D_F(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D_F(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D_F(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D_F(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] + + +/* Macros to allocate heap arrays. 
+ Example: + polybench_alloc_2d_array(N, M, double) => allocates N x M x sizeof(double) + and returns a pointer to the 2d array + */ +# define POLYBENCH_ALLOC_1D_ARRAY(n1, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data (n1 + POLYBENCH_PADDING_FACTOR, sizeof(type)) +# define POLYBENCH_ALLOC_2D_ARRAY(n1, n2, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_3D_ARRAY(n1, n2, n3, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_4D_ARRAY(n1, n2, n3, n4, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_5D_ARRAY(n1, n2, n3, n4, n5, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR][n5 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR) * (n5 + POLYBENCH_PADDING_FACTOR), sizeof(type)) + +/* Macros for array declaration. 
*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); \ + var = POLYBENCH_ALLOC_1D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), type); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); \ + var = POLYBENCH_ALLOC_2D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), type); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); \ + var = POLYBENCH_ALLOC_3D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), type); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); \ + var = POLYBENCH_ALLOC_4D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), type); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); \ + var = POLYBENCH_ALLOC_5D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), POLYBENCH_C99_SELECT(dim5, ddim5), type); +# else +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + 
type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); +# endif + + +/* Dead-code elimination macros. Use argc/argv for the run-time check. */ +# ifndef POLYBENCH_DUMP_ARRAYS +# define POLYBENCH_DCE_ONLY_CODE if (argc > 42 && ! strcmp(argv[0], "")) +# else +# define POLYBENCH_DCE_ONLY_CODE +# endif + +#define POLYBENCH_DUMP_TARGET stderr +#define POLYBENCH_DUMP_START fprintf(POLYBENCH_DUMP_TARGET, "==BEGIN DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_FINISH fprintf(POLYBENCH_DUMP_TARGET, "==END DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_BEGIN(s) fprintf(POLYBENCH_DUMP_TARGET, "begin dump: %s", s) +#define POLYBENCH_DUMP_END(s) fprintf(POLYBENCH_DUMP_TARGET, "\nend dump: %s\n", s) + +# define polybench_prevent_dce(func) \ + POLYBENCH_DCE_ONLY_CODE \ + func + + +/* Performance-related instrumentation. See polybench.c */ +# define polybench_start_instruments +# define polybench_stop_instruments +# define polybench_print_instruments + + +/* PAPI support. 
*/ +# ifdef POLYBENCH_PAPI +extern const unsigned int polybench_papi_eventlist[]; +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_set_papi_thread_report(x) \ + polybench_papi_counters_threadid = x; +# define polybench_start_instruments \ + polybench_prepare_instruments(); \ + polybench_papi_init(); \ + int evid; \ + for (evid = 0; polybench_papi_eventlist[evid] != 0; evid++) \ + { \ + if (polybench_papi_start_counter(evid)) \ + continue; \ + +# define polybench_stop_instruments \ + polybench_papi_stop_counter(evid); \ + } \ + polybench_papi_close(); \ + +# define polybench_print_instruments polybench_papi_print(); +# endif + + +/* Timing support. */ +# if defined(POLYBENCH_TIME) || defined(POLYBENCH_GFLOPS) +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_start_instruments polybench_timer_start(); +# define polybench_stop_instruments polybench_timer_stop(); +# define polybench_print_instruments polybench_timer_print(); +extern double polybench_program_total_flops; +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +/* Function declaration. */ +# ifdef POLYBENCH_TIME +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +# ifdef POLYBENCH_PAPI +extern void polybench_prepare_instruments(); +extern int polybench_papi_start_counter(int evid); +extern void polybench_papi_stop_counter(int evid); +extern void polybench_papi_init(); +extern void polybench_papi_close(); +extern void polybench_papi_print(); +# endif + +/* Function prototypes. 
*/ +extern void* polybench_alloc_data(unsigned long long int n, int elt_size); + + +#endif /* !POLYBENCH_H */ diff --git a/test/kernels/spmv+conv/run.sh b/test/kernels/spmv+conv/run.sh new file mode 100755 index 00000000..9598bd9d --- /dev/null +++ b/test/kernels/spmv+conv/run.sh @@ -0,0 +1 @@ +opt-12 -load ../../../build/src/libmapperPass.so -mapperPass kernel.bc diff --git a/test/kernels/spmv+conv/spmv+conv.c b/test/kernels/spmv+conv/spmv+conv.c new file mode 100644 index 00000000..fc1d7239 --- /dev/null +++ b/test/kernels/spmv+conv/spmv+conv.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include "polybench.h" +#include "conv.h" +#define SIZE 10000 + +int nnz = 400000; +int val[SIZE]; +int col[SIZE]; +int row[SIZE]; +int feature[SIZE]; +int output[SIZE]; + +int kernel(int nnz, int val[], int col[], int row[], int feature[], int output[], + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj)); + +int main() +{ + + // conv + POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj); + + kernel(nnz, val, col, row, feature, output, POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B)); + +// output_dsp (input, NTAPS, 0); +// output_dsp (coefficients, NTAPS, 0); +// output_dsp (output, NTAPS, 0); + return 0; +} + +int kernel(int nnz, int val[], int col[], int row[], int feature[], int output[], + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj)) +{ + int i = 0; + int temp; + + // conv + int x,y; + int out = 0; + + //#pragma clang loop unroll_count(4) + for (i = 0; i < nnz; ++i) { + // spmv + temp = val[i] * feature[ col[i] ]; + output[ row[i] ] += temp; + // conv + x = i / NI; + y = i % NJ; + out += A [x][y] * B[x][y]; + } + return out; +} \ No newline at end of file diff --git a/tools/expandable/NeuraDemo.py 
b/tools/expandable/NeuraDemo.py deleted file mode 100644 index 5ad000f7..00000000 --- a/tools/expandable/NeuraDemo.py +++ /dev/null @@ -1,710 +0,0 @@ -# ---------------------------------------------------------------------------- -# Filename: SORAdemo.py / -# Description: simulate multi-kernel running on multi-CGRA / -# Author: Miaomiao Jiang, start from 2025-02-24 / -# ---------------------------------------------------------------------------- - -import heapq -import subprocess -import json -import eventlet # for time out -import pandas as pd -import math - -# ---------------------------------------------------------------------------- -# global variables / -# ---------------------------------------------------------------------------- - -TEST_BENCHS = ["fir.cpp", "latnrm.c", "fft.c", "dtw.cpp", "spmv.c", "conv.c", "relu.c", "histogram.cpp", "mvt.c", "gemm.c"] -TEST_BENCHS_NUM = len(TEST_BENCHS) -DICT_CSV = {'kernels': "", 'DFG nodes': "", 'DFG edges': "", 'recMII': "", 'mappingII': "", 'expandableII': ""} # column names of generated CSV -DICT_COLUMN = len(DICT_CSV) -JSON_NAME = "./param.json" # name of generated json file -TIME_OUT_SET = 180 -DO_MAPPING = True -KERNEL_DIRECTORY = "../../test/kernels" - - - -# ---------------------------------------------------------------------------- -# class defination / -# ---------------------------------------------------------------------------- - - - -class Kernel: - def __init__(self, kernel_name, kernel_id, arrive_period, unroll_factor, vector_factor, total_iterations, cgra_rows, cgra_columns): - """ - Initialize an instance of the Kernel class. - - Parameters: - kernel_name (str): The name of the kernel. - kernel_id (int): The ID of the kernel. - arrive_period (int): The period at which the same kernel will arrive again. - unroll_factor (int): The unroll factor of the kernel. - vector_factor (int): The vector factor of the kernel. - total_iterations (int): The total number of iterations of the kernel. 
- cgra_rows (int): The number of rows in the CGRA. - cgra_columns (int): The number of columns in the CGRA. - """ - self.kernel_name = kernel_name - self.kernel_id = kernel_id - self.arrive_period = arrive_period - self.unroll_factor = unroll_factor - self.vector_factor = vector_factor - self.df = pd.DataFrame(DICT_CSV, index=[0]) - self.ii_1 = None # II when using 1 CGRA, actual II - self.ii_2 = None # II when using 2 CGRAs, expandable II - self.total_iterations = math.ceil(total_iterations / (self.unroll_factor*self.vector_factor)) - self.rows = cgra_rows - self.columns = cgra_columns - if DO_MAPPING: - self.get_ii() # Perform mapping and populate attributes - else: - self.read_ii() # Read from existing csv - print(f"Kernel {self.kernel_name} initialized with arrive_period={self.arrive_period}, unroll_factor={self.unroll_factor}") - - def __lt__(self, other): - """ - Compare two Kernel by id. - """ - return self.kernel_id < other.kernel_id - - def comp_kernel(self): - """ - This is a func compile a kernel using clang with selected unrolling factor. - - Returns: function name of kernel. 
- """ - file_source = (self.kernel_name.split("."))[0] - - if self.unroll_factor == 1 and self.vector_factor == 1: - compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - elif self.unroll_factor == 1 and self.vector_factor != 1: - compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -O3 -mllvm -force-vector-width={self.vector_factor} -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - elif self.unroll_factor != 1 and self.vector_factor == 1: - compile_command = f"clang-12 -emit-llvm -funroll-loops -mllvm -unroll-count={self.unroll_factor} -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - else: - print("Error, invalid unroll and vector factor combination.") - return - - compile_proc = subprocess.Popen([compile_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - (compile_out, compile_err) = compile_proc.communicate() - - disassemble_command = "llvm-dis-12 kernel.bc -o ./kernel.ll" - disassemble_proc = subprocess.Popen([disassemble_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - (disassemble_out, disassemble_err) = disassemble_proc.communicate() - - - if compile_err: - print(f"Compile warning message for {self.kernel_name}: {compile_err}") - if disassemble_err: - print(f"Disassemble error message for {self.kernel_name}: {disassemble_err}") - return - - # collect the potentially targeting kernel/function from kernel.ll - ir_file = open('kernel.ll', 'r') - ir_lines = ir_file.readlines() - - # strips the newline character - for line in ir_lines: - if "define " in line and "{" in line and "@" in line: - func_name = line.split("@")[1].split("(")[0] - if "kernel" in func_name: - target_kernel = func_name - break - - ir_file.close() - print(f"Target kernel function for {self.kernel_name}: {target_kernel}") - return target_kernel - - def map_kernel(self): - """ - 
This is a func for mapping a kernel and gain information during mapping. - - Returns: NULL - """ - get_map_command = "opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc" - gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - dataS = [] # for get results from subprocess and output to pandas - kernels_source = (self.kernel_name.split("."))[0] - dataS.append(kernels_source) - - try: - eventlet.monkey_patch() - with eventlet.Timeout(TIME_OUT_SET, True): - with gen_map_proc.stdout: - gen_map_proc.stdout.flush() - for line in iter(gen_map_proc.stdout.readline, b''): - output_line = line.decode("ISO-8859-1") - if "DFG node count: " in output_line: - dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0])) - dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0])) - if "[RecMII: " in output_line: - dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0])) - if "[Mapping II: " in output_line: - self.ii_1 = int(output_line.split("[Mapping II: ")[1].split("]")[0]) - dataS.append(self.ii_1) - if "[ExpandableII: " in output_line: - self.ii_2 = int(output_line.split("[ExpandableII: ")[1].split("]")[0]) - dataS.append(self.ii_2) - - except eventlet.timeout.Timeout: - dataS = [0]*(DICT_COLUMN) - print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60 , "minute(s).") - - if len(dataS) != DICT_COLUMN: - dataS.extend([0]*(DICT_COLUMN-len(dataS))) - - print(dataS) - self.df.loc[len(self.df.index)] = dataS - - def map_kernel_skip(self): - """ - This is a func gain DFG information only without mapping. - - Returns: NULL - """ - get_map_command = "opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc" - gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - # Holds the results from subprocess and output to pandas. 
- dataS = [] - kernels_source = (self.kernel_name.split("."))[0] - dataS.append(kernels_source) - # The first 4 element of dataS is not empty: kernelsSource, DFG node count, DFG edge count, RecMII. - k_data_s_head = 4 - - try: - eventlet.monkey_patch() - with eventlet.Timeout(TIME_OUT_SET, True): - with gen_map_proc.stdout: - gen_map_proc.stdout.flush() - for line in iter(gen_map_proc.stdout.readline, b''): - output_line = line.decode("ISO-8859-1") - if "DFG node count: " in output_line: - dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0])) - dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0])) - if "[RecMII: " in output_line: - dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0])) - dataS.extend([0]*(DICT_COLUMN-k_data_s_head)) - break - - except eventlet.timeout.Timeout: - dataS = [0]*(DICT_COLUMN) - print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60, "minute(s).") - - print(dataS) - self.df.loc[len(self.df.index)] = dataS - - def get_ii(self): - """ - This is a func to compile, run and map kernels under sora_json and store the mapping result in csv - - Returns: name of the csv that collects information of mapped kernels - """ - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv' - print("Generating", csv_name) - target_kernel = self.comp_kernel() - - sora_json = { - "kernel": target_kernel, - "targetFunction": False, - "targetNested": False, - "targetLoopsID": [0], - "doCGRAMapping": DO_MAPPING, - "row": self.rows, - "column": self.columns, - "precisionAware": False, - "fusionStrategy": ["default_heterogeneous"], - "isTrimmedDemo": True, - "heuristicMapping": True, - "parameterizableCGRA": False, - "vectorizationMode": "all", - "bypassConstraint": 4, - "isStaticElasticCGRA": False, - "ctrlMemConstraint": 10, - "regConstraint": 8, - "incrementalMapping" : False, - 
"vectorFactorForIdiv " : 1, - "testingOpcodeOffset" : 0, - "additionalFunc" : { - "complex-Ctrl" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], - "div" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - } - } - - json_object = json.dumps(sora_json, indent=4) - - with open(JSON_NAME, "w") as outfile: - outfile.write(json_object) - if DO_MAPPING: - self.map_kernel() - else: - self.map_kernel_skip() - - self.df.to_csv(csv_name) - return csv_name - - def read_ii(self): - """ - This is a func to read from csv generated from get_ii() - - Returns: csv_name - """ - if self.vector_factor > 8: - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector8.csv' - else: - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv' - - try: - df = pd.read_csv(csv_name) - self.ii_1 = int(df['mappingII'].iloc[1]) # the first data line - print(self.ii_1) - self.ii_2 = int(df['expandableII'].iloc[1]) # the first data line - print(self.ii_2) - except FileNotFoundError: - print(f"CSV file {csv_name} not found.") - except ValueError: - print(f"Error extracting II values from {csv_name}.") - - return csv_name - - def return_ii(self, num_cgras): - """ - Get the initiation interval (II) based on the number of CGRAs allocated. - - Parameters: - num_cgras (int): Number of CGRAs allocated. - - Returns: - int: The initiation interval (II). - """ - if num_cgras == 1: - return self.ii_1 - elif num_cgras == 2: - return self.ii_2 - else: - raise ValueError("Number of CGRAs must be 1 or 2.") - - def return_total_iterations(self): - """ - Total iterations for the kernel, affected by unroll_factor and vector_factor - - Returns: - int: Total iterations. - """ - return self.total_iterations - - def create_instance(self, arrival_time): - """ - Create a KernelInstance based on the current kernel. - - Parameters: - arrival_time (int): The time at which the instance arrives. 
- - Returns: - KernelInstance: A new instance of the kernel. - """ - return KernelInstance(self, arrival_time) - - -class KernelInstance: - def __init__(self, kernel, arrival_time): - """ - Initialize a KernelInstance. - - Parameters: - kernel (Kernel): The kernel from which this instance is created. - arrival_time (int): The time at which the instance arrives. - """ - self.kernel = kernel - self.arrival_time = arrival_time - self.start_time = None - self.allocated_cgras = 0 - self.ii = None - self.end_time = None - self.is_valid = True - self.pure_execution_time = 0 # Track pure execution time for this instance - self.pure_waiting_time = 0 # Track pure waiting time for this instance - # Determine the maximum number of CGRAs that can be allocated - self.max_allocate_cgra = 9 - - def __lt__(self, other): - """ - Compare two KernelInstance instances by arrival time. - """ - return self.arrival_time < other.arrival_time - - def calculate_execution_time(self): - """ - Calculate the execution time based on the number of allocated CGRAs - at the beginning running time of current kernel. It may change after. - - Returns: - int: Total execution time in cycles. 
- """ - # if self.vector_factor = 8, then when allocate_cgra <= 2, self.ii = ii_1, when 2 < allocate_cgra <= 4, self.ii = ii_2 - if self.kernel.vector_factor == 8: - if self.allocated_cgras == 1: - # cgra tile only support vector = 4 - # TODO: self.kernel.ii_1/2 - self.ii = self.kernel.ii_1 - elif self.allocated_cgras == 2: - self.ii = self.kernel.ii_1 - else: - # TODO: self.kernel.ii_3/2 - self.ii = self.kernel.ii_2 - else: - if self.allocated_cgras == 1: - self.ii = self.kernel.ii_1 - elif self.allocated_cgras == 2: - self.ii = self.kernel.ii_2 - else: - raise ValueError(f"Number of CGRAs must be between 1 and {self.max_allocate_cgra}.") - execution_time = self.kernel.total_iterations * self.ii - print(f"Calculated execution time for {self.kernel.kernel_name}: {execution_time} cycles (II={self.ii}, iterations={self.kernel.total_iterations})") - return execution_time - - def copy_with_valid(self): - """ - Create a copy of the current instance and set is_valid to True. - - Returns: - KernelInstance: A new instance copy. - """ - new_instance = KernelInstance(self.kernel, self.arrival_time) - new_instance.start_time = self.start_time - new_instance.allocated_cgras = self.allocated_cgras - new_instance.ii = self.ii - new_instance.end_time = self.end_time - new_instance.is_valid = True - new_instance.pure_execution_time = self.pure_execution_time - new_instance.pure_waiting_time = self.pure_waiting_time - new_instance.max_allocate_cgra = self.max_allocate_cgra - return new_instance - - -# ---------------------------------------------------------------------------- -# function defination / -# ---------------------------------------------------------------------------- - -def allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime): - """ - Allocate CGRAs to a kernel instance. - - Parameters: - instance (KernelInstance): The kernel instance to allocate CGRAs to. 
- current_time (int): The current simulation time. - available_cgras (int): The number of available CGRAs. - events (list): The event queue. - running_instances (list): The list of currently running instances. - runned_kernel_names (list): The list of names of the kernels that have been run. - total_cgra_runtime (float): The total runtime of all CGRAs. - - Returns: - int: The updated number of available CGRAs. - float: The updated total runtime of all CGRAs. - """ - runned_kernel_names.append(instance.kernel.kernel_name) - allocate_cgras = min(instance.max_allocate_cgra, available_cgras) - available_cgras -= allocate_cgras - instance.start_time = current_time - instance.allocated_cgras = allocate_cgras - execution_time = instance.calculate_execution_time() - instance.end_time = current_time + execution_time - instance.pure_waiting_time = instance.start_time - instance.arrival_time # Record pure waiting time - print(f"Allocated {allocate_cgras} CGRAs to {instance.kernel.kernel_name} at time {current_time}. Execution will end at {instance.end_time}") - heapq.heappush(events, (instance.end_time, 'end', instance, instance)) - running_instances.append(instance) - total_cgra_runtime += allocate_cgras * execution_time - return available_cgras, total_cgra_runtime - - -def release(instance, current_time, available_cgras, running_instances, completed_instances, kernel_latency, total_cgra_runtime): - """ - Release the CGRAs occupied by a kernel instance. - - Parameters: - instance (KernelInstance): The kernel instance to release CGRAs from. - current_time (int): The current simulation time. - available_cgras (int): The number of available CGRAs. - running_instances (list): The list of currently running instances. - completed_instances (list): The list of completed instances. - kernel_latency (dict): A dictionary used to track the total latency of each kernel. - total_cgra_runtime (float): The total runtime of all CGRAs. 
- - Returns: - int: The updated number of available CGRAs. - float: The updated total runtime of all CGRAs. - """ - available_cgras += instance.allocated_cgras - completed_instances.append(instance) - if instance in running_instances: - running_instances.remove(instance) - # Update per-kernel overall latency - instance.end_time = current_time - latency = instance.end_time - instance.start_time - instance.pure_execution_time = instance.end_time - instance.start_time # Record pure execution time - kernel_latency[instance.kernel.kernel_name] += latency - print(f"Released {instance.allocated_cgras} CGRAs from {instance.kernel.kernel_name} at time {current_time}. Latency added: {latency} cycles") - return available_cgras, total_cgra_runtime - - -def re_allocate(instance, current_time, available_cgras, events, total_cgra_runtime): - """ - Re-allocate additional CGRAs to a kernel instance if possible. - - Parameters: - instance (KernelInstance): The kernel instance to re-allocate CGRAs to. - available_cgras (int): Number of available CGRAs. - events (list): The event queue. - current_time (int): The current simulation time. - total_cgra_runtime (float): Total runtime of all CGRAs. - - Returns: - int: Updated number of available CGRAs. - float: Updated total runtime of all CGRAs. 
- """ - if instance.allocated_cgras < instance.max_allocate_cgra and available_cgras > 0: - possible_alloc = min(instance.max_allocate_cgra - instance.allocated_cgras, available_cgras) - # Update allocation - instance.allocated_cgras += possible_alloc - available_cgras -= possible_alloc - # Recalculate remaining iterations - elapsed_time = current_time - instance.start_time - completed_iters = elapsed_time // instance.ii - remaining_iters = instance.kernel.total_iterations - completed_iters - # Update II - if instance.allocated_cgras == 1: - instance.ii = instance.kernel.ii_1 - elif instance.allocated_cgras in [2, 3, 4]: - instance.ii = instance.kernel.ii_2 - new_execution_time = remaining_iters * instance.ii - # Schedule new end event - new_end_time = current_time + new_execution_time - instance.end_time = new_end_time - print(f"Re-allocated {possible_alloc} CGRAs to {instance.kernel.kernel_name} at time {current_time}. New end time: {new_end_time}") - # Create a new valid instance for the new end event - new_instance = instance.copy_with_valid() # Assume there is a copy method in KernelInstance class - heapq.heappush(events, (new_end_time, 'end', new_instance, new_instance)) - instance.is_valid = False # Old instance is invalid - total_cgra_runtime += possible_alloc * new_execution_time - # Invalidate old end event by leaving it in the heap but ignoring when processed - else: - print(f"Re-allocated CGRAs to {instance.kernel.kernel_name} at time {current_time} Failed.") - return available_cgras, total_cgra_runtime - - -def simulate(num_cgras, kernels, priority_bosting, lcm_time=80000000): - """ - Simulate the execution of multiple kernels on a CGRA architecture. - - Parameters: - num_cgras (int): The number of CGRAs in the CGRA architecture. - kernels (list of Kernel): The list of kernels to simulate. - priority_bosting (bool): Whether to enable priority boosting. - lcm_time (int): The least common multiple of the arrival periods. 
- - Returns: - dict: A dictionary that maps kernel names to their total latencies. - """ - available_cgras = num_cgras - events = [] # when a kernel arrives or ends, it is an event - current_time = 0 - waiting_instances = [] - running_instances = [] - completed_instances = [] - runned_kernel_names = [] - # Dictionary to store per-kernel arrival times - kernel_arrival_count = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel overall latency (cycle) - kernel_latency = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel execution time distribution - kernel_execution_distribution = {kernel.kernel_name: [] for kernel in kernels} - # Dictionary to store per-kernel waiting time distribution - kernel_waiting_distribution = {kernel.kernel_name: [] for kernel in kernels} - # Dictionary to store per-kernel ratio (iterations per cycle) - kernel_execution_ratio = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel ratio (iterations per cycle) - kernel_waiting_ratio = {kernel.kernel_name: 0 for kernel in kernels} - total_cgra_runtime = 0 - arrive_times_list = {"fir.cpp": 12, "latnrm.c":4, "fft.c":10, "dtw.cpp":7, "spmv.c":6, "conv.c":8, "relu.c":5, "mvt.c":12, "gemm.c":2, "histogram.cpp":2} - - if priority_bosting: - print("\033[91mpriority_bosting is on\033[0m") - - for kernel in kernels: - print(f"Kernel {kernel.kernel_name} II_1={kernel.ii_1}, II_2={kernel.ii_2}, total_iterations={kernel.total_iterations}") - - # Schedule initial arrivals for all kernels - for kernel in kernels: - first_arrival = 0 - # heapq keeps a priority queue that contains (event_arrive_end_time (int), event_type (str), Kernel, KernelInstance (needed when 'end')) - heapq.heappush(events, (first_arrival, 'arrival', kernel, None)) - - while events: - event_time, event_type, kernel_or_instance, _ = heapq.heappop(events) - current_time = event_time - print(f"Processing event at time {current_time}: type={event_type}, 
kernel={kernel_or_instance.kernel_name if event_type == 'arrival' else kernel_or_instance.kernel.kernel_name}") - - if event_type == 'arrival': - kernel = kernel_or_instance - kernel_arrival_count[kernel.kernel_name] += 1 - # Create a new instance - instance = kernel.create_instance(current_time) - # Schedule next arrival if within lcm_time - next_arrival = current_time + kernel.arrive_period - if kernel_arrival_count[kernel.kernel_name] < arrive_times_list[kernel.kernel_name]: - heapq.heappush(events, (next_arrival, 'arrival', kernel, None)) - print(f"Scheduled next arrival for {kernel.kernel_name} at time {next_arrival}") - - - # Try to allocate CGRAs - if available_cgras >= 1: - available_cgras, total_cgra_runtime = allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) - else: - waiting_instances.append(instance) - print(f"No available CGRAs for {kernel.kernel_name}. Added to waiting queue.") - - elif event_type == 'end': - instance = kernel_or_instance - if not instance.is_valid: - # If instance is invalid, means it is re_allocated. 
- print(f"Ignoring invalid end event for {instance.kernel.kernel_name}") - continue - # Release CGRAs - available_cgras, total_cgra_runtime = release(instance, current_time, available_cgras, running_instances, completed_instances,kernel_latency, total_cgra_runtime) - - # Update execution time distribution - kernel_execution_distribution[instance.kernel.kernel_name].append(instance.pure_execution_time) - kernel_waiting_distribution[instance.kernel.kernel_name].append(instance.pure_waiting_time) - - # Check waiting queue - while waiting_instances and available_cgras >= 1: - instance = waiting_instances.pop(0) - print(f"Allocating CGRAs to waiting instance {instance.kernel.kernel_name}") - available_cgras, total_cgra_runtime = allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) - - # Check running instances for possible re-allocation - if priority_bosting: - for running in running_instances: - available_cgras, total_cgra_runtime = re_allocate(running, current_time, available_cgras, events, total_cgra_runtime) - - # Calculate ratio for each kernel - for kernel in kernels: - total_execution_time = sum( - [inst.pure_execution_time for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) - total_waiting_time = sum( - [inst.pure_waiting_time for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) - total_time = total_execution_time + total_waiting_time - kernel_execution_ratio[kernel.kernel_name] = total_execution_time / total_time if total_time > 0 else 0 - kernel_waiting_ratio[kernel.kernel_name] = total_waiting_time / total_time if total_time > 0 else 0 - - # Calculate utilization of total CGRAs - cgra_utilization = total_cgra_runtime / (current_time * num_cgras) - overall_latency = current_time # when all kernels are done - - print(f"Simulation completed. 
Kernel latencies: {kernel_latency}") - print(f"Kernel execution_ratio: {kernel_execution_ratio}") - print(f"Kernel execution time distributions: {kernel_execution_distribution}") - print(f"Kernel Runned List: {runned_kernel_names}") - print(f"CGRA utilization: {cgra_utilization}") - print(f"overall latency: {overall_latency}") - return kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency - - -def run_multiple_simulations_and_save_to_csv(kernels_list, csvname, priority_bosting, num_cgras=9): - """ - Run multiple simulations and save the results to a CSV file. - - Parameters: - kernels_list (list of list of Kernel): A list of kernels. - csvname (str): The name of the CSV file. - priority_bosting (bool): Whether to enable priority boosting. - num_cgras (int): The number of CGRAs, default 9. - """ - for i, kernels in enumerate(kernels_list, start = 1): - kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency = simulate(num_cgras, kernels, priority_bosting) - - # Calculate fastest, slowest, and average execution time per kernel - execution_stats = {} - for kernel_name, execution_times in kernel_execution_distribution.items(): - if execution_times: - fastest = min(execution_times) - slowest = max(execution_times) - average = sum(execution_times) / len(execution_times) - total = sum(execution_times) - execution_stats[kernel_name] = { - "fastest_execution_time": fastest, - "slowest_execution_time": slowest, - "average_execution_time": average, - "total_execution_time": total - } - - # Calculate fastest, slowest, and average waiting time per kernel - waiting_stats = {} - for kernel_name, waiting_times in kernel_waiting_distribution.items(): - if waiting_times: - fastest = min(waiting_times) - slowest = max(waiting_times) - average = sum(waiting_times) / len(waiting_times) - 
total = sum(waiting_times) - waiting_stats[kernel_name] = { - "fastest_waiting_time": fastest, - "slowest_waiting_time": slowest, - "average_waiting_time": average, - "total_waiting_time": total - } - - all_results = [] - for kernel in kernels: - kernel_name = kernel.kernel_name - result = { - "Kernel_Name": kernel_name, - "Arrive_Period": kernel.arrive_period, - "Unroll_Factor": kernel.unroll_factor, - "Vector_Factor": kernel.vector_factor, - "fastest_execution_time": execution_stats.get(kernel_name, {}).get("fastest_execution_time", None), - "slowest_execution_time": execution_stats.get(kernel_name, {}).get("slowest_execution_time", None), - "Average_Execution_Time": execution_stats.get(kernel_name, {}).get("average_execution_time", None), - "fastest_waiting_time": waiting_stats.get(kernel_name, {}).get("fastest_waiting_time", None), - "slowest_waiting_time": waiting_stats.get(kernel_name, {}).get("slowest_waiting_time", None), - "Average_Waiting_Time": waiting_stats.get(kernel_name, {}).get("average_waiting_time", None), - "Total_Execution_Time": execution_stats.get(kernel_name, {}).get("total_execution_time", None), - "Total_Waiting_Time": waiting_stats.get(kernel_name, {}).get("total_waiting_time", None), - "Execution_Time Ratio": kernel_execution_ratio[kernel_name], - "Waiting_Time Ratio": kernel_waiting_ratio[kernel_name], - "Overall_Case_Latency": overall_latency, - "CGRA Utilization": cgra_utilization, - "Total_Execution_Time Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_time", None))/overall_latency, - "Total_Waiting_Time Ratio": (waiting_stats.get(kernel_name, {}).get("total_waiting_time", None))/overall_latency, - "Total_Latency Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_time", None) + waiting_stats.get(kernel_name, {}).get("total_waiting_time", None))/overall_latency - } - all_results.append(result) - - - df = pd.DataFrame(all_results) - file_name = f'simulation_{csvname}_case{i}.csv' - df.to_csv(file_name, 
index=False) - - -if __name__ == "__main__": - baselineCase1=[ - [ - Kernel(kernel_name="fir.cpp", kernel_id =0, arrive_period =1500000, unroll_factor =1,vector_factor =1, total_iterations =300000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="conv.c", kernel_id =5, arrive_period =2500000, unroll_factor =1,vector_factor =1, total_iterations =400000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="relu.c", kernel_id =6, arrive_period =4000000, unroll_factor =1,vector_factor =1, total_iterations =1000000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="histogram.cpp", kernel_id =7, arrive_period =1200000, unroll_factor =1,vector_factor =1, total_iterations =262144, cgra_rows= 12, cgra_columns=12) , - ] - ] - taskCase1 = [ - [ - Kernel(kernel_name="fir.cpp", kernel_id =0, arrive_period =300000, unroll_factor =1,vector_factor =8, total_iterations =300000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="conv.c", kernel_id =5, arrive_period =400000, unroll_factor =1,vector_factor =8, total_iterations =400000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="relu.c", kernel_id =6, arrive_period =1000000, unroll_factor =1,vector_factor =8, total_iterations =1000000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="histogram.cpp", kernel_id =7, arrive_period =262144, unroll_factor =1,vector_factor =8, total_iterations =262144, cgra_rows= 4, cgra_columns=4) , - ] - ] - run_multiple_simulations_and_save_to_csv(baselineCase1, "Baseline", priority_bosting = True, num_cgras=1) # one cgra is 4x4 - run_multiple_simulations_and_save_to_csv(taskCase1, "NoBosting", priority_bosting = False, num_cgras=9) - run_multiple_simulations_and_save_to_csv(taskCase1, "Bosting", priority_bosting = True, num_cgras=9) diff --git a/tools/expandable/README.md b/tools/expandable/README.md new file mode 100644 index 00000000..84888f4e --- /dev/null +++ b/tools/expandable/README.md @@ -0,0 +1,29 @@ +# Strcture +tools/ +└── expandable/ + ├── README.md # This 
file + ├── __init__.py + ├── main.py # Neura demo script + ├── designs/ # input json for Neura Scalibility evaluation + │ ├── 2x2baseline.json + │ ├── 2x2task.json + │ ├── 3x3task.json + │ ├── 4x4task.json + │ └── 5x5task.json + ├── util/ # Utility modules + │ ├── __init__.py + │ ├── scheduler.py # Kernel mapping and task scheduling + │ └── visualizer.py # Result visualization + ├── fig/ # Generated figures + ├── result/ # Scheduling results + └── tmp/ # Kernel mapping results + +# Core components +- main.py generates simulated real-world tasks and models the execution progress across different evaluation settings. +- scheduler.py recieves tasks and generates kernel mapping information (stored in /tmp/). It also outputs scheduling results to /result/ directory. +- visulization.py reads csv from /result/ and generates paper figures in /figs/ directory. + +# Outputs +- /fig/Fig9.png: Normalized execution time and improved utilization +- /fig/Fig10.png: Normalized throughput speedup +- /fig/Fig11.png: Scalability -- Normalized execution time and improved utilization \ No newline at end of file diff --git a/tools/expandable/__init__.py b/tools/expandable/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/expandable/demo.sh b/tools/expandable/demo.sh index b4c22d68..1136e515 100755 --- a/tools/expandable/demo.sh +++ b/tools/expandable/demo.sh @@ -1,6 +1,26 @@ #!/usr/bin/env bash -source /WORK_REPO/venv/bin/activate +# source /WORK_REPO/venv/bin/activate +while [[ $# -gt 0 ]]; do + case $1 in + -t|--test) + TEST_FLAG="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + exit 1 + ;; + esac +done + +TEST_FLAG=${TEST_FLAG:-n} + +rm -r ./tmp +rm -r ./result +rm -r ./fig mkdir ./tmp +mkdir ./result +mkdir ./fig -python NEURAdemo.py +python main.py --test=$TEST_FLAG \ No newline at end of file diff --git a/tools/expandable/designs/2x2baseline.json b/tools/expandable/designs/2x2baseline.json new file mode 100644 index 00000000..ac42ab2f --- 
/dev/null +++ b/tools/expandable/designs/2x2baseline.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 1, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + 
"cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": 
"gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + 
"kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/2x2task.json b/tools/expandable/designs/2x2task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/2x2task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", 
+ "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + 
"unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + 
"total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + 
"total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/3x3task.json b/tools/expandable/designs/3x3task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/3x3task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 
65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, 
+ { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + 
"kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + 
"arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/4x4task.json b/tools/expandable/designs/4x4task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/4x4task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 
5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + 
"vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + 
"total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/5x5task.json b/tools/expandable/designs/5x5task.json new file mode 100644 index 
00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/5x5task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + 
"cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + 
"kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": 
"relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/main.py b/tools/expandable/main.py new file mode 100644 index 00000000..9aeb8e6d --- /dev/null +++ b/tools/expandable/main.py @@ -0,0 +1,320 @@ +# ---------------------------------------------------------------------------- +# Filename: main.py / +# Description: load multi-task and schedule them on multi-CGRA / +# ---------------------------------------------------------------------------- + +import argparse +import json +import os +from pathlib import Path +import time +import util.scheduler as scheduler +import util.visualizer as visualizer + +# 
---------------------------------------------------------------------------- +# global variables / +# ---------------------------------------------------------------------------- +VISUALIZATION = True +TESTME = False + +# Static kernel data (name: (sort_id, total_iterations, static_execution_time)) +KERNEL_DATA = { + "fir.cpp": (7, 2048, 4096), + "latnrm.c": (8, 1280, 2560), + "fft.c": (2, 112640, 450560), + "dtw.cpp": (4, 16384, 49152), + "spmv.c": (3, 65536, 262144), + "conv.c": (1, 655360, 1310720), + "mvt.c": (5, 16384, 49152), + "gemm.c": (0, 2097152, 8388608), + "relu+histogram.c": (6, 262144, 2097152) +} + +# Case configuration dictionary (task_id: [A_P, UNROLL_FACTORS, VECTOR_FACTORS]) +TASK_CONFIGS = { + 1: { + 'A_P': [81920, 81920, 81920, 327680, 327680, 1638400, 81920, 1638400, 81920], + 'UNROLL_FACTORS': [1]*9, + 'VECTOR_FACTORS': [1]*9 + }, + 2: { + 'A_P': [102400, 102400, 102400, 327680, 327680, 1638400, 163840, 1638400, 81920], + 'UNROLL_FACTORS': [1,2,2,1,2,2,2,1,1], + 'VECTOR_FACTORS': [4, 1, 1, 4, 1, 1, 1, 4, 1] + }, + 3: { + 'A_P': [102400, 102400, 102400, 409600, 409600, 2621440, 102400, 2621440, 81920], + 'UNROLL_FACTORS': [1,4,2,1,4,4,4,1,1], + 'VECTOR_FACTORS': [8, 1, 1, 8, 1, 1, 1, 8, 1] + }, + 4: { + 'A_P': [163840, 163840, 163840, 655360, 655360, 3276800, 163840, 3276800, 163840], + 'UNROLL_FACTORS': [1,4,1,2,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 1, 16, 1, 1, 1, 16, 1] + }, + 5: { + 'A_P': [204800, 204800, 204800, 819200, 819200, 5242880, 204800, 5242880, 204800], + 'UNROLL_FACTORS': [1,4,1,1,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 16, 16, 1, 1, 1, 16, 1] + }, + 6: { + 'A_P': [327680, 327680, 327680, 819200, 819200, 6553600, 204800, 5242880, 204800], + 'UNROLL_FACTORS': [1,4,1,1,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 16, 16, 1, 1, 1, 16, 1] + } +} + +# ---------------------------------------------------------------------------- +# function defination / +# ---------------------------------------------------------------------------- + 
+def str_to_bool(value): + if isinstance(value, bool): + return value + if str(value).lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif str(value).lower() in ('no', 'false', 'f', 'n', '0'): + return False + raise argparse.ArgumentTypeError('Invalid boolean value (accepted: 0/1, true/false, yes/no)') + + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser( + description='Multi-CGRA Task Scheduling Tool' + ) + # Core application arguments + parser.add_argument('--test', type=str_to_bool, default=TESTME, + help='Run tests in CI/CD [y/n]') + parser.add_argument('--cgra-config', type=int, default= 4, + help='Path to CGRA configuration file') + parser.add_argument('--json-name', type=str, default= "./param.json", + help='JSON configuration file name') + parser.add_argument('--kernel-directory', type=str, default= "../../test/kernels", + help='Kernel directory path') + parser.add_argument('--time-out-set', type=int, default= 180, + help='Timeout setting for operations') + parser.add_argument('--visualize', type=str_to_bool, default=VISUALIZATION, + help='Generate visualization figures [y/n]') + + return parser.parse_args() + + +def load_configuration(): + """Load and merge configurations from multiple sources with priority: + 1. Command line arguments (highest priority) + 2. 
Default values (lowest priority) + """ + # Update global configuration with command line arguments + global VISUALIZATION, TESTME + # Parse command line arguments + args = parse_arguments() + VISUALIZATION = args.visualize + TESTME = args.test + scheduler.init_args(args) + print(f"Test in CI/CD: {args.test}") + print(f"Timeout: {args.time_out_set}") + print(f"Visualization: {args.visualize}") + + +# ========== Task Loading Function ========== +def load_tasks(task_id, task_type="baseline"): + """ + Load task list based on task_id and CGRA type + + Args: + task_id: Configuration case ID + task_type: "baseline" or "task", corresponding to 12x12 and 4x4 CGRA respectively + + Returns: + task_list: List of task objects + """ + global TASK_CONFIGS, KERNEL_DATA + if task_id not in TASK_CONFIGS: + raise ValueError(f"Task{task_id} configuration does not exist") + + config = TASK_CONFIGS[task_id] + A_P = config['A_P'] + UNROLL_FACTORS = config['UNROLL_FACTORS'] + VECTOR_FACTORS = config['VECTOR_FACTORS'] + + # Validate parameter lengths + lists = [KERNEL_DATA, A_P, UNROLL_FACTORS, VECTOR_FACTORS] + if len(set(len(lst) for lst in lists if lst)) > 1: + raise ValueError(f"Task{task_id} parameter length mismatch: {[len(lst) for lst in lists]}") + + # Set CGRA dimensions + if task_type == "baseline": + cgra_rows, cgra_columns = 12, 12 + elif task_type == "task": + cgra_rows, cgra_columns = 4, 4 + else: + raise ValueError("task_type must be either 'baseline' or 'task'") + + # Generate task list + task_list = [] + for i, (kernel_name, (kernel_id, total_iters, _)) in enumerate(KERNEL_DATA.items()): + task = scheduler.Kernel( + kernel_name=kernel_name, + kernel_id=kernel_id, + arrive_period=A_P[i] if A_P else 0, + unroll_factor=UNROLL_FACTORS[i], + vector_factor=VECTOR_FACTORS[i], + total_iterations=total_iters, + cgra_rows=cgra_rows, + cgra_columns=cgra_columns + ) + task_list.append(task) + + return task_list + + +def run_simulation_for_case(task_id, num_task_cgras = 9, file_name = 
"NULL", load_from_file = False): + """ + Complete simulation workflow for specified case + + Args: + task_id: Configuration case ID to run simulation for + """ + print(f"[Step 2] Loading tasks for task {task_id}...") + + if load_from_file: + if file_name is '2x2': + # Load baseline tasks (12x12 CGRA) + baseline_tasks = load_tasks_from_file(f"./designs/{file_name}baseline.json") + # Load task tasks (4x4 CGRA) + task_tasks = load_tasks_from_file(f"./designs/{file_name}task.json") + else: + # Load baseline tasks (12x12 CGRA) + baseline_tasks = load_tasks(task_id, "baseline") + # Load task tasks (4x4 CGRA) + task_tasks = load_tasks(task_id, "task") + + if load_from_file: + case_id = file_name + '_' + str(task_id) + else: + case_id = task_id + + if (not load_from_file) or (file_name is '2x2'): + # Run baseline simulation + scheduler.run_multiple_simulations_and_save_to_csv( + baseline_tasks, + csv_name="Baseline", + priority_boosting=0, + kernel_case=case_id, + num_cgras=1 # one cgra is 12x12 + ) + + # Run task simulation + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L0", + priority_boosting=0, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L1", + priority_boosting=1, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L2", + priority_boosting=2, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura", + priority_boosting=3, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + + +def load_tasks_from_file(filename): + """ + Load task list from JSON file + + Args: + filename: Input JSON filename + + Returns: + task_list: List of reconstructed task objects + """ + if not os.path.exists(filename): + 
raise FileNotFoundError(f"Task file {filename} not found") + + with open(filename, 'r') as f: + tasks_data = json.load(f) + + # Reconstruct task objects from dictionaries + task_list = [] + for task_dict in tasks_data: + task = scheduler.Kernel( + kernel_name=task_dict['kernel_name'], + kernel_id=task_dict['kernel_id'], + arrive_period=task_dict['arrive_period'], + unroll_factor=task_dict['unroll_factor'], + vector_factor=task_dict['vector_factor'], + total_iterations=task_dict['total_iterations'], + cgra_rows=task_dict['cgra_rows'], + cgra_columns=task_dict['cgra_columns'] + ) + task_list.append(task) + + print(f"Tasks loaded from {filename}") + return task_list + + +def main(): + """Main workflow control function""" + start = time.time() + # 1. Load configuration (includes parsing arguments) + print("=== Multi-CGRA Task Scheduling Tool ===") + load_configuration() + + # 2. Create output directory + print(f"Intermediate result in: ./tmp") + output_dir = Path("./tmp") + output_dir.mkdir(parents=True, exist_ok=True) + + # 3. Execute scheduling + print("[Step 1] Loading tasks and Scheduling tasks on 4x4 Multi-CGRA...") + if TESTME: + run_simulation_for_case(1) + # run_simulation_for_case(task_id = 6, num_task_cgras=4, file_name="2x2", load_from_file=True) # 2x2 + else: + for task_case_id in TASK_CONFIGS: + run_simulation_for_case(task_case_id) + + # 4. Execute scheduling + print("[Step 2] Loading tasks and Scheduling tasks on 2x2, 3x3, 5x5 Multi-CGRA...") + run_simulation_for_case(task_id = 6, num_task_cgras=4, file_name="2x2", load_from_file=True) # 2x2 + run_simulation_for_case(task_id = 6, num_task_cgras=9, file_name="3x3", load_from_file=True) # 3x3 + run_simulation_for_case(task_id = 6, num_task_cgras=16, file_name="4x4", load_from_file=True) # 4x4 + run_simulation_for_case(task_id = 6, num_task_cgras=25, file_name="5x5", load_from_file=True) # 5x5 + + # 5. 
Generate visualization + if VISUALIZATION: # Use global variable + print(f"[Step 3] Generating visualization figures...") + + # Generate Fig9 + genFigs = visualizer.SimulationDataAnalyzer(kernel_data=KERNEL_DATA) + genFigs.genFig9("./fig/Fig9.png") + genFigs.genFig10("./fig/Fig10.png") + genFigs.genFig11("./fig/Fig11.png") + + + print("\n=== Scheduling completed successfully! ===") + end = time.time() + execution_time = end - start + print(f"Time cost: {execution_time/60:.2f} min") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/expandable/param.json b/tools/expandable/param.json new file mode 100644 index 00000000..ee865842 --- /dev/null +++ b/tools/expandable/param.json @@ -0,0 +1,100 @@ +{ + "kernel": "kernel", + "targetFunction": false, + "targetNested": false, + "targetLoopsID": [ + 0 + ], + "doCGRAMapping": true, + "row": 12, + "column": 12, + "precisionAware": false, + "fusionStrategy": [ + "default_heterogeneous" + ], + "isTrimmedDemo": true, + "heuristicMapping": true, + "parameterizableCGRA": false, + "vectorizationMode": "all", + "diagonalVectorization": false, + "bypassConstraint": 4, + "isStaticElasticCGRA": false, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping": false, + "vectorFactorForIdiv ": 1, + "testingOpcodeOffset": 0, + "additionalFunc": { + "complex-Ctrl": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "fptosi": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "div": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "complex-BrT": [ + 4, + 5, + 6, + 7 + ], + "complex-CoT": [ + 8, + 9, + 10, + 11 + ] + }, + "supportDVFS": false, + "DVFSIslandDim": 1, + "DVFSAwareMapping": false, + "enablePowerGating": false, + "expandableMapping": true +} \ No newline at end of file diff --git a/tools/expandable/util/__init__.py b/tools/expandable/util/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/tools/expandable/util/scheduler.py b/tools/expandable/util/scheduler.py new file mode 100644 index 00000000..88c7ea30 --- /dev/null +++ b/tools/expandable/util/scheduler.py @@ -0,0 +1,932 @@ +# ---------------------------------------------------------------------------- +# Filename: scheduler.py / +# Description: simulate multi-kernel running on multi-CGRA / +# ---------------------------------------------------------------------------- + +import heapq +import os +import subprocess +import json +import eventlet # for time out +import pandas as pd +import math + +# ---------------------------------------------------------------------------- +# global variables / +# ---------------------------------------------------------------------------- + +DICT_CSV = {'kernels': "", 'DFG nodes': "", 'DFG edges': "", 'recMII': "", 'mappingII': "", 'expandableII': "", 'utilization': ""} # column names of generated CSV +DICT_COLUMN = len(DICT_CSV) +VECTOR_LANE = 2 +JSON_NAME = "./param.json" +TIME_OUT_SET = 180 +KERNEL_DIRECTORY = "../../test/kernels" + + +def init_args(args): + """init config""" + global JSON_NAME, TIME_OUT_SET, KERNEL_DIRECTORY + JSON_NAME = args.json_name + KERNEL_DIRECTORY = args.kernel_directory + TIME_OUT_SET = args.time_out_set + +# ---------------------------------------------------------------------------- +# class defination / +# ---------------------------------------------------------------------------- + +class Kernel: + def __init__(self, kernel_name, kernel_id, arrive_period, unroll_factor, vector_factor, total_iterations, cgra_rows, cgra_columns): + """ + Initialize an instance of the Kernel class. + + Parameters: + kernel_name (str): The name of the kernel. + kernel_id (int): The ID of the kernel. + arrive_period (int): The period at which the same kernel will arrive again. + unroll_factor (int): The unroll factor of the kernel. + vector_factor (int): The vector factor of the kernel. 
        total_iterations (int): The total number of iterations of the kernel.
        cgra_rows (int): The number of rows in the CGRA.
        cgra_columns (int): The number of columns in the CGRA.
        """
        self.kernel_name = kernel_name
        self.kernel_id = kernel_id
        self.arrive_period = arrive_period
        self.unroll_factor = unroll_factor
        self.vector_factor = vector_factor
        # Single-row frame seeded from the module-level CSV column template.
        self.df = pd.DataFrame(DICT_CSV, index=[0])
        self.base_ii = 0  # II when using 1 CGRA, actual II, if fused, base_ii is fused_ii
        self.expandable_ii = 0  # II when using 2 CGRAs, expandable II, if fused, expandable_ii is individual_ii
        self.utilization = 0
        # Unrolling/vectorization shrink the iteration count actually executed.
        self.total_iterations = math.ceil(total_iterations / (self.unroll_factor*self.vector_factor))
        self.rows = cgra_rows
        self.columns = cgra_columns
        self.load_data()


    def __lt__(self, other):
        """
        Order two Kernel objects by their numeric kernel_id.
        """
        return self.kernel_id < other.kernel_id

    def load_data(self):
        """
        Populate base_ii / expandable_ii / utilization for this kernel config:
        read the cached CSV under ./tmp when it exists, otherwise compile and
        map the kernel (get_ii) to generate it. Marks the kernel invalid when
        no mapping II was obtained (base_ii stayed 0).
        """
        prefix = './tmp/t_'
        csv_name = f'{prefix}{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv'
        if os.path.exists(csv_name):
            self.read_ii(csv_name)
        else:
            self.get_ii(csv_name)

        self.is_valid = bool(self.base_ii)
        # print(f"Kernel {self.kernel_name} loaded with arrive_period={self.arrive_period}")

    def comp_kernel(self):
        """
        Compile one kernel source with clang-12 using the selected unroll and
        vector factors, disassemble the bitcode, and scan the IR for the
        target function.

        Returns: name of the kernel function found in kernel.ll, or None when
        the unroll/vector combination is invalid or disassembly failed.
        """
        file_source = (self.kernel_name.split("."))[0]
        # corner case: these configs are known not to compile/map as requested,
        # so silently fall back to a smaller unroll factor.
        if self.kernel_name == "conv.c" and self.unroll_factor == 4:
            self.unroll_factor = 2
        if self.kernel_name == "fft.c" and self.unroll_factor == 2:
            self.unroll_factor = 1
        if self.kernel_name == "relu+histogram.c" and self.unroll_factor == 4 and self.rows == 12:
            self.unroll_factor = 2
        if self.kernel_name == "spmv.c" and self.unroll_factor == 2 and self.rows == 4:
            self.unroll_factor = 1

        if self.unroll_factor == 1 and self.vector_factor == 1:
            compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        elif self.unroll_factor == 1 and self.vector_factor != 1:
            compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -O3 -mllvm -force-vector-width={self.vector_factor} -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        elif self.unroll_factor != 1 and self.vector_factor == 1:
            compile_command = f"clang-12 -emit-llvm -funroll-loops -mllvm -unroll-count={self.unroll_factor} -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        else:
            # Unrolling and vectorizing at the same time is not supported.
            # print("Error, invalid unroll and vector factor combination.")
            return

        # NOTE(review): with shell=True the list's extra '-u' element becomes a
        # shell positional parameter, not a clang flag — presumably harmless,
        # but confirm it was intentional.
        compile_proc = subprocess.Popen([compile_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        (compile_out, compile_err) = compile_proc.communicate()

        disassemble_command = f"llvm-dis-12 kernel.bc -o kernel.ll"
        disassemble_proc = subprocess.Popen([disassemble_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        (disassemble_out, disassemble_err) = disassemble_proc.communicate()


        if compile_err:
            # clang warnings also land on stderr, hence "warning" not "error".
            print(f"Compile warning message for {self.kernel_name}: {compile_err}")
        if disassemble_err:
            # print(f"Disassemble error message for {self.kernel_name}: {disassemble_err}")
            return

        # collect the potentially targeting kernel/function from kernel.ll
        ir_file = open(f'kernel.ll', 'r')
        ir_lines = ir_file.readlines()

        # Scan IR "define ... @name(" lines for a function whose name
        # contains "kernel".
        # NOTE(review): if no such function exists, target_kernel is unbound
        # and the return below raises NameError — worth guarding upstream.
        for line in ir_lines:
            if "define " in line and "{" in line and "@" in line:
                func_name = line.split("@")[1].split("(")[0]
                if "kernel" in func_name:
                    target_kernel = func_name
                    break

        ir_file.close()
        # print(f"Target kernel function for {self.kernel_name}: {target_kernel}")
        return target_kernel

    def map_kernel(self):
        """
        Run the mapper pass (opt-12 + libmapperPass.so) on kernel.bc and scrape
        its stdout for DFG statistics, RecMII, mapping II, expandable II and
        tile utilization. Results are appended as one row to self.df; the run
        is aborted via eventlet after TIME_OUT_SET seconds.

        Returns: NULL
        """
        get_map_command = f"opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc"
        gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        dataS = [] # for get results from subprocess and output to pandas
        kernels_source = (self.kernel_name.split("."))[0]
        dataS.append(kernels_source)

        try:
            eventlet.monkey_patch()
            with eventlet.Timeout(TIME_OUT_SET, True):
                with gen_map_proc.stdout:
                    gen_map_proc.stdout.flush()
                    # Read mapper output line by line until EOF (b'').
                    for line in iter(gen_map_proc.stdout.readline, b''):
                        output_line = line.decode("ISO-8859-1")
                        #print(output_line)
                        if "DFG node count: " in output_line:
                            dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0]))
                            dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0]))
                        if "[RecMII: " in output_line:
                            dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0]))
                        if "[Mapping II: " in output_line:
                            self.base_ii = int(output_line.split("[Mapping II: ")[1].split("]")[0])
                            dataS.append(self.base_ii)
                        if "[ExpandableII: " in output_line:
                            self.expandable_ii = int(output_line.split("[ExpandableII: ")[1].split("]")[0])
                            dataS.append(self.expandable_ii)
                        if "tile avg fu utilization: " in output_line:
                            # Clamp to 100%; mapper may report >100% once II-scaled.
                            self.utilization = min(float(output_line.split("avg overall utilization: ")[1].split("%")[0])/100,1)
                            dataS.append(self.utilization)
                        if "[Mapping Fail]" in output_line:
                            print(f"{self.kernel_name} mapping failed.")
        except eventlet.timeout.Timeout:
            # Timed out: record an all-zero row so the CSV stays rectangular.
            dataS = [0]*(DICT_COLUMN)
            # print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60 , "minute(s).")

        # Pad short rows (e.g. mapping failed mid-way) to the CSV width.
        if len(dataS) != DICT_COLUMN:
            dataS.extend([0]*(DICT_COLUMN-len(dataS)))

        self.df.loc[len(self.df.index)] = dataS


    def map_kernel_skip(self):
        """
        Run the mapper pass but harvest only the DFG statistics and RecMII,
        skipping the (slow) mapping itself; remaining CSV columns are zeroed.

        Returns: NULL
        """
        get_map_command = f"opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc"
        gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        # Holds the results from subprocess and output to pandas.
        dataS = []
        kernels_source = (self.kernel_name.split("."))[0]
        dataS.append(kernels_source)
        # The first 4 element of dataS is not empty: kernelsSource, DFG node count, DFG edge count, RecMII.
        k_data_s_head = 4

        try:
            eventlet.monkey_patch()
            with eventlet.Timeout(TIME_OUT_SET, True):
                with gen_map_proc.stdout:
                    gen_map_proc.stdout.flush()
                    for line in iter(gen_map_proc.stdout.readline, b''):
                        output_line = line.decode("ISO-8859-1")
                        if "DFG node count: " in output_line:
                            dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0]))
                            dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0]))
                        if "[RecMII: " in output_line:
                            dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0]))
                            # RecMII is the last value we need — zero-fill the
                            # rest of the row and stop reading.
                            dataS.extend([0]*(DICT_COLUMN-k_data_s_head))
                            break

        except eventlet.timeout.Timeout:
            dataS = [0]*(DICT_COLUMN)
            # print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60, "minute(s).")

        self.df.loc[len(self.df.index)] = dataS

    def get_ii(self, csv_name):
        """
        Compile, map and measure this kernel configuration under a generated
        neura_json config, then store the mapping result in a CSV cache.

        Returns: name of the csv that collects information of mapped kernels
        """
        # print("Generating", csv_name)
print("Generating", csv_name) + target_kernel = self.comp_kernel() + + neura_json = { + "kernel": target_kernel, + "targetFunction": False, + "targetNested": False, + "targetLoopsID": [0], + "doCGRAMapping": True, + "row": self.rows, + "column": self.columns, + "precisionAware": False, + "fusionStrategy": ["default_heterogeneous"], + "isTrimmedDemo": True, + "heuristicMapping": True, + "parameterizableCGRA": False, + "vectorizationMode": "all", + "diagonalVectorization": False, + "bypassConstraint": 4, + "isStaticElasticCGRA": False, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping" : False, + "vectorFactorForIdiv " : 1, + "testingOpcodeOffset" : 0, + "additionalFunc" : { + "complex-Ctrl" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "fptosi": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "div": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "complex-BrT" : [4,5,6,7], + "complex-CoT" : [8,9,10,11] + }, + "supportDVFS": False, + "DVFSIslandDim": 1, + "DVFSAwareMapping": False, + "enablePowerGating": False, + "expandableMapping" : True + } + + json_object = json.dumps(neura_json, indent=4) + + with open(JSON_NAME, "w") as outfile: + outfile.write(json_object) + if True: + self.map_kernel() + else: + self.map_kernel_skip() + + self.df.to_csv(csv_name) + return csv_name + + def read_ii(self, csv_name): + """ + This is a func to read from csv generated from get_ii() + + Returns: csv_name + """ + try: + df = pd.read_csv(csv_name) + self.base_ii = int(df['mappingII'].iloc[1]) + self.expandable_ii = int(df['expandableII'].iloc[1]) + if 'utilization' in df.columns: + self.utilization = min(float(df['utilization'].iloc[1]),1.0) + else: + self.get_ii() + return csv_name + except FileNotFoundError: + # print(f"CSV file {csv_name} not found.") + self.get_ii() + return csv_name + except ValueError: + # print(f"Error extracting II values from {csv_name}.") + self.get_ii() + return csv_name + + return csv_name + + def return_ii(self, num_cgras): + """ + Get the 
initiation interval (II) based on the number of CGRAs allocated. + + Parameters: + num_cgras (int): Number of CGRAs allocated. + + Returns: + int: The initiation interval (II). + """ + if num_cgras == 1: + return self.base_ii + elif num_cgras == 2: + return self.expandable_ii + else: + raise ValueError("Number of CGRAs must be 1 or 2.") + + def return_total_iterations(self): + """ + Total iterations for the kernel, affected by unroll_factor and vector_factor + + Returns: + int: Total iterations. + """ + return self.total_iterations + + def create_instance(self, arrival_time): + """ + Create a KernelInstance based on the current kernel. + + Parameters: + arrival_time (int): The time at which the instance arrives. + + Returns: + KernelInstance: A new instance of the kernel. + """ + return KernelInstance(self, arrival_time) + + +class KernelInstance: + def __init__(self, kernel, arrival_time): + """ + Initialize a KernelInstance. + + Parameters: + kernel (Kernel): The kernel from which this instance is created. + arrival_time (int): The time at which the instance arrives. + """ + self.kernel = kernel + self.arrival_time = arrival_time + self.start_time = None + self.allocated_cgras = 0 + self.ii = None + self.end_time = None + self.is_valid = self.kernel.is_valid + self.pure_execution_duration = 0 # Track pure execution duration for this instance + self.pure_waiting_duration = 0 # Track pure waiting duration for this instance + # Determine the maximum number of CGRAs that can be allocated + if self.kernel.vector_factor == 1: + self.max_allocate_cgra = 2 + else: + self.max_allocate_cgra = math.ceil(self.kernel.vector_factor/VECTOR_LANE) + + def __lt__(self, other): + """ + Compare two KernelInstance instances by arrival time. + """ + return self.arrival_time < other.arrival_time + + def calculate_execution_duration(self): + """ + Calculate the execution duration based on the number of allocated CGRAs + at the beginning running time of current kernel. 
It may change after. + + Returns: + int: Total execution duration in cycles. + """ + if self.kernel.vector_factor == 1: + if self.allocated_cgras == 1: + self.ii = self.kernel.base_ii + elif self.allocated_cgras == 2: + self.ii = self.kernel.expandable_ii + else: + raise ValueError(f"Number of CGRAs must be between 1 and {self.max_allocate_cgra}.") + execution_duration = self.kernel.total_iterations * self.ii + else: + self.ii = self.kernel.base_ii + execution_duration = self.kernel.total_iterations * self.ii * math.ceil(self.kernel.vector_factor / (VECTOR_LANE * self.allocated_cgras)) + # print(f"Calculated execution duration for {self.kernel.kernel_name}: {execution_duration} cycles (II={self.ii}, iterations={self.kernel.total_iterations})") + return execution_duration + + def copy_with_valid(self): + """ + Create a copy of the current instance and set is_valid to True. + + Returns: + KernelInstance: A new instance copy. + """ + new_instance = KernelInstance(self.kernel, self.arrival_time) + new_instance.start_time = self.start_time + new_instance.allocated_cgras = self.allocated_cgras + new_instance.ii = self.ii + new_instance.end_time = self.end_time + new_instance.is_valid = True + new_instance.pure_execution_duration = 0 + new_instance.pure_waiting_duration = self.pure_waiting_duration + new_instance.max_allocate_cgra = self.max_allocate_cgra + return new_instance + + +class SystemIdleTracker: + def __init__(self, num_cgras): + """Initialize the system idle time tracker + + Args: + num_cgras: Total number of CGRAs in the system + """ + self.num_cgras = num_cgras + self.last_active_time = 0 # Timestamp when system was last active + self.idle_periods = [] # List to store idle periods (start, end) + + # New attributes for waiting time tracking + self.waiting_start_time = None # Timestamp when waiting started + self.waiting_periods = [] # List to store non-overlapping waiting periods (start, end) + + def check_idle_period(self, current_time, available_cgras, 
waiting_kernels): + """Check and record idle periods and waiting times + + Args: + current_time: Current simulation time (passed from simulate function) + available_cgras: Number of currently available CGRAs + waiting_kernels: Number of kernels in the waiting queue + """ + # Detect system-wide idle state (all CGRAs available) + if available_cgras == self.num_cgras and current_time > self.last_active_time: + self.idle_periods.append((self.last_active_time, current_time)) + else: + # Update last active time if system is not fully idle + self.last_active_time = current_time + + # Track waiting time + if waiting_kernels and self.waiting_start_time is None: + # Start tracking waiting time if queue was empty and now has kernels + self.waiting_start_time = current_time + elif not waiting_kernels and self.waiting_start_time is not None: + # End tracking and record waiting period if queue was non-empty and now empty + self.waiting_periods.append((self.waiting_start_time, current_time)) + self.waiting_start_time = None # Reset for next waiting period + + @property + def total_idle_duration(self) -> int: + """Calculate total accumulated idle time + + Returns: + Sum of all idle periods in cycles + """ + return sum(end - start for start, end in self.idle_periods) + + @property + def total_waiting_time_nolap(self) -> int: + """Calculate total non-overlapping waiting time + + Returns: + Sum of all waiting periods in cycles + """ + return sum(end - start for start, end in self.waiting_periods) + + def get_utilization(self, total_cgra_runtime, current_time) -> float: + """Calculate system utilization rate + + Args: + total_cgra_runtime: Sum of busy time across all CGRAs + current_time: Current simulation time + + Returns: + Utilization percentage (0.0 to 1.0) + """ + if current_time <= 0: + return 0.0 + # Utilization = Actual busy time / Possible busy time + possible_busy_time = (current_time - self.total_idle_duration) * self.num_cgras + # print(f"Total idle duration is 
# ----------------------------------------------------------------------------
# function definitions /
# ----------------------------------------------------------------------------

def allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime):
    """
    Allocate CGRAs to a kernel instance and schedule its 'end' event.

    Parameters:
        priority_boosting (int): re-allocation strategy; 0 means greedy
            multi-CGRA allocation here, non-zero starts with 1 CGRA and relies
            on later re-allocation.
        instance (KernelInstance): The kernel instance to allocate CGRAs to.
        current_time (int): The current simulation time.
        available_cgras (int): The number of available CGRAs.
        events (list): The event queue.
        running_instances (list): The list of currently running instances.
        runned_kernel_names (list): The list of names of the kernels that have been run.
        total_cgra_runtime (float): The total runtime of all CGRAs.

    Returns:
        int: The updated number of available CGRAs.
        float: The updated total runtime of all CGRAs.
    """
    runned_kernel_names.append(instance.kernel.kernel_name)

    if priority_boosting == 0:
        if '+' in instance.kernel.kernel_name:
            # Fused kernels ('+' in the name) start on a single CGRA.
            allocate_cgras = min(1, available_cgras)
            # print(f"Kernel {instance.kernel.kernel_name} contains '+', limiting allocation to 1 CGRA")
        elif instance.kernel.vector_factor != 1:  # limit allocated cgra of vector
            allocate_cgras = min(1, available_cgras)
            # print(f"Kernel {instance.kernel.kernel_name} is vectorized, limiting allocation to 1 CGRA")
        elif available_cgras < 6:  # 6 if num_cgras = 9, 3 if num_cgras = 4, 12 if num_cgras = 16, 20 if num_cgras = 25
            # Under scarcity, hand out one CGRA at a time.
            allocate_cgras = min(1, available_cgras)
            # print(f"available_cgras is less than 5, limiting allocation to 1 CGRA")
        else:
            allocate_cgras = min(instance.max_allocate_cgra, available_cgras)
        #allocate_cgras = 1
    else:
        allocate_cgras = 1
    available_cgras -= allocate_cgras
    instance.start_time = current_time
    instance.allocated_cgras = allocate_cgras
    execution_duration = instance.calculate_execution_duration()
    instance.end_time = current_time + execution_duration
    instance.pure_waiting_duration = instance.start_time - instance.arrival_time  # Record pure waiting time
    # print(f"Allocated {allocate_cgras} CGRAs to {instance.kernel.kernel_name} at {current_time}. Execution will end at {instance.end_time}")
    heapq.heappush(events, (instance.end_time, 'end', instance, instance))
    running_instances.append(instance)
    if instance.kernel.rows != 4 and instance.kernel.columns != 4:  # HACK: 4x4 is the neura config
        # Non-4x4 tiles: weight busy time by measured tile utilization.
        total_cgra_runtime += allocate_cgras * execution_duration * instance.kernel.utilization
    else:
        total_cgra_runtime += allocate_cgras * execution_duration
    return available_cgras, total_cgra_runtime


def release(instance, current_time, available_cgras, running_instances, completed_instances, kernel_latency, total_cgra_runtime):
    """
    Release the CGRAs occupied by a kernel instance.

    Parameters:
        instance (KernelInstance): The kernel instance to release CGRAs from.
        current_time (int): The current simulation time.
        available_cgras (int): The number of available CGRAs.
        running_instances (list): The list of currently running instances.
        completed_instances (list): The list of completed instances.
        kernel_latency (dict): A dictionary used to track the total latency of each kernel.
        total_cgra_runtime (float): The total runtime of all CGRAs (returned unchanged).

    Returns:
        int: The updated number of available CGRAs.
        float: The updated total runtime of all CGRAs.
    """
    available_cgras += instance.allocated_cgras
    completed_instances.append(instance)
    if instance in running_instances:
        running_instances.remove(instance)
    # Update per-kernel overall latency
    instance.end_time = current_time
    latency = instance.end_time - instance.start_time
    instance.pure_execution_duration = instance.end_time - instance.start_time  # Record pure execution time
    kernel_latency[instance.kernel.kernel_name] += latency
    # print(f"Released {instance.allocated_cgras} CGRAs from {instance.kernel.kernel_name} at {current_time}. Latency added: {latency} cycles")
    return available_cgras, total_cgra_runtime


def re_allocate(instance, current_time, available_cgras, events, total_cgra_runtime):
    """
    Re-allocate additional CGRAs to a running kernel instance if possible:
    recompute its remaining iterations, schedule a fresh 'end' event via a
    valid copy, and invalidate the old instance so its stale heap event is
    skipped when popped.

    Parameters:
        instance (KernelInstance): The kernel instance to re-allocate CGRAs to.
        current_time (int): The current simulation time.
        available_cgras (int): Number of available CGRAs.
        events (list): The event queue.
        total_cgra_runtime (float): Total runtime of all CGRAs.

    Returns:
        int: Updated number of available CGRAs.
        float: Updated total runtime of all CGRAs.
    """
    if not instance.is_valid:
        # print(f"Instance {instance.kernel.kernel_name} is already invalid, skipping re-allocation.")
        return available_cgras, total_cgra_runtime
    if instance.allocated_cgras < instance.max_allocate_cgra and available_cgras > 0:
        possible_alloc = min(instance.max_allocate_cgra - instance.allocated_cgras, available_cgras)
        original_allocated_cgras = instance.allocated_cgras
        # Update allocation
        instance.allocated_cgras += possible_alloc
        available_cgras -= possible_alloc
        # Recalculate remaining iterations
        elapsed_duration = current_time - instance.start_time
        # Calculate equivalent scalar iteration count (considering vectorization and CGRAs)
        if instance.kernel.vector_factor == 1:
            # scalar
            completed_iters = elapsed_duration // instance.ii
        else:
            # vector
            effective_ii = instance.ii * math.ceil(instance.kernel.vector_factor / (VECTOR_LANE * original_allocated_cgras))
            completed_iters = int(elapsed_duration // effective_ii)
        remaining_iters = instance.kernel.total_iterations - completed_iters
        # print(f"current_time {current_time}, completed_iters {completed_iters}")
        # Update II and remaining_execution_duration
        if instance.kernel.vector_factor == 1:
            # Scalar case
            if instance.allocated_cgras == 1:
                instance.ii = instance.kernel.base_ii
            elif instance.allocated_cgras == 2:
                instance.ii = instance.kernel.expandable_ii
            else:
                raise ValueError(f"Number of CGRAs must be between 1 and {instance.max_allocate_cgra}.")
            remaining_execution_duration = remaining_iters * instance.ii
        else:
            # Vector case
            vector_divisor = VECTOR_LANE * instance.allocated_cgras
            remaining_execution_duration = remaining_iters * instance.ii * math.ceil(instance.kernel.vector_factor / vector_divisor)
        # Schedule new end event
        new_end_time = current_time + remaining_execution_duration
        # print(f"remaining_iters {remaining_iters}, remaining_execution_duration {remaining_execution_duration}")
        # print(f"Re-allocated succeed. {instance.kernel.kernel_name}. Add {possible_alloc} CGRAs at {current_time}. Old end time: {instance.end_time}. New end time: {new_end_time}")
        # Create a new valid instance for the new end event
        new_instance = instance.copy_with_valid()  # Assume there is a copy method in KernelInstance class
        heapq.heappush(events, (new_end_time, 'end', new_instance, new_instance))
        # Invalidate old end event by leaving it in the heap but ignoring when processed
        instance.is_valid = False  # Old instance is invalid
        # Correct the total_cgra_runtime bookkeeping, applying the utilization
        # factor consistently with the original estimate.
        kernel = instance.kernel
        is_12x12 = (kernel.rows == 12 and kernel.columns == 12)
        utilization_factor = kernel.utilization if is_12x12 else 1.0
        # Apply utilization factor uniformly
        old_estimate = original_allocated_cgras * (instance.end_time - instance.start_time) * utilization_factor
        actual_runtime = original_allocated_cgras * elapsed_duration * utilization_factor
        new_allocation_runtime = instance.allocated_cgras * remaining_execution_duration * utilization_factor
        # Update total runtime
        total_cgra_runtime -= old_estimate  # Remove old estimate
        total_cgra_runtime += actual_runtime  # Add actual runtime completed
        total_cgra_runtime += new_allocation_runtime  # Add new allocation runtime
    else:
        # print(f"Re-allocated Failed. ({instance.kernel.kernel_name} at time {current_time})")
        pass
    return available_cgras, total_cgra_runtime


def handle_reallocation(priority_boosting, running, current_time, available_cgras, events, total_cgra_runtime):
    """
    Checks if a running instance should be re-allocated based on the
    priority_boosting strategy.

    Args:
        priority_boosting (int): The strategy for re-allocation.
            0: No re-allocation.
            1: Re-allocate for vector_factor=1 kernels without '+' in name.
            2: Re-allocate for all vector_factor=1 kernels.
            3: Re-allocate for all kernels.
        running (object): The currently running instance to check.
        current_time (float): The current simulation time.
        available_cgras (int): The number of currently available CGRAs.
        events (list): The list of simulation events.
        total_cgra_runtime (float): The accumulated total CGRA runtime.

    Returns:
        tuple: A tuple containing the updated available_cgras and total_cgra_runtime.
    """
+ available_cgras (int): The number of currently available CGRAs. + events (list): The list of simulation events. + total_cgra_runtime (float): The accumulated total CGRA runtime. + + Returns: + tuple: A tuple containing the updated available_cgras and total_cgra_runtime. + """ + if priority_boosting <= 0: + return available_cgras, total_cgra_runtime + + should_reallocate = False + kernel_info = running.kernel + + if priority_boosting == 1: + # Re-allocate only for kernels with vector_factor=1 and no '+' in the name + should_reallocate = (kernel_info.vector_factor == 1 and '+' not in kernel_info.kernel_name) + elif priority_boosting == 2: + # Re-allocate for all kernels with vector_factor=1 (including those with '+') + should_reallocate = (kernel_info.vector_factor == 1) + elif priority_boosting == 3: + # Re-allocate for all kernels + should_reallocate = True + + # If the condition is met, perform the re-allocation + if should_reallocate: + available_cgras, total_cgra_runtime = re_allocate( + running, current_time, available_cgras, events, total_cgra_runtime + ) + + return available_cgras, total_cgra_runtime + + +def simulate(num_cgras, kernels, priority_boosting, lcm_time=26214400): + """ + lcm_time=26214400 + Simulate the execution of multiple kernels on a CGRA architecture. + + Parameters: + num_cgras (int): The number of CGRAs in the CGRA architecture. + kernels (list of Kernel): The list of kernels to simulate. + priority_boosting (bool): Whether to enable priority boosting. + lcm_time (int): The least common multiple of the arrival periods. + + Returns: + dict: A dictionary that maps kernel names to their total latencies. 
+ """ + # Add target check time + CHECK_TIME = 3276800 + # Flag to mark whether result has been output, avoiding duplicate output + checked = False + + available_cgras = num_cgras + events = [] # when a kernel arrives or ends, it is an event + current_time = 0 + waiting_instances = [] + running_instances = [] + completed_instances = [] + runned_kernel_names = [] + # Dictionary to store per-kernel arrival times + kernel_arrival_count = {kernel.kernel_id: 0 for kernel in kernels} + # Dictionary to store per-kernel overall latency (cycle) + kernel_latency = {kernel.kernel_name: 0 for kernel in kernels} + # Dictionary to store per-kernel execution duration distribution + kernel_execution_distribution = {kernel.kernel_name: [] for kernel in kernels} + # Dictionary to store per-kernel waiting duration distribution + kernel_waiting_distribution = {kernel.kernel_name: [] for kernel in kernels} + # Dictionary to store per-kernel ratio (iterations per cycle) + kernel_execution_ratio = {kernel.kernel_name: 0 for kernel in kernels} + # Dictionary to store per-kernel ratio (iterations per cycle) + kernel_waiting_ratio = {kernel.kernel_name: 0 for kernel in kernels} + total_cgra_runtime = 0 + idle_tracker = SystemIdleTracker(num_cgras=num_cgras) + arrive_times_list = { + kernel.kernel_id: ((lcm_time // kernel.arrive_period)) + for kernel in kernels + } + # print(arrive_times_list) + + + # print(f"\033[91mPriority Boosting Level: {priority_boosting}\033[0m") + + for kernel in kernels: + print(f"Kernel {kernel.kernel_name} base_ii={kernel.base_ii}, expandable_ii={kernel.expandable_ii}, \ + iterations={kernel.total_iterations}, utilization={kernel.utilization}, arrive_times, {arrive_times_list[kernel.kernel_id]}, isvalid, {kernel.is_valid}") + + # Schedule initial arrivals for all kernels + for kernel in kernels: + first_arrival = 0 + # heapq keeps a priority queue that contains (event_arrive_end_time (int), event_type (str), Kernel, KernelInstance (needed when 'end')) + 
heapq.heappush(events, (first_arrival, 'arrival', kernel, None)) + + while events: + event_time, event_type, kernel_or_instance, _ = heapq.heappop(events) + if not kernel_or_instance.is_valid: + # tmp_name = kernel_or_instance.kernel_name if kernel_or_instance is Kernel else kernel_or_instance.kernel.kernel_name + # print(f"Skipping invalid event for tmp_name") + continue + + current_time = event_time + # print("="*20) + idle_tracker.check_idle_period(current_time, available_cgras, waiting_instances) + # print(f"Processing event at time {current_time}: type={event_type}, kernel={kernel_or_instance.kernel_name if event_type == 'arrival' else kernel_or_instance.kernel.kernel_name}") + + if event_type == 'arrival' and kernel_or_instance.is_valid: + kernel = kernel_or_instance + kernel_arrival_count[kernel.kernel_id] += 1 + # Create a new instance + instance = kernel.create_instance(current_time) + # Schedule next arrival if within lcm_time + next_arrival = current_time + kernel.arrive_period + if kernel_arrival_count[kernel.kernel_id] < arrive_times_list[kernel.kernel_id]: + heapq.heappush(events, (next_arrival, 'arrival', kernel, None)) + # print(f"Scheduled next arrival for {kernel.kernel_name} at time {next_arrival}") + + + # Try to allocate CGRAs + if available_cgras >= 1: + available_cgras, total_cgra_runtime = allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) + available_cgras, total_cgra_runtime = handle_reallocation(priority_boosting, instance, current_time, available_cgras, events, total_cgra_runtime) + else: + waiting_instances.append(instance) + # print(f"No available CGRAs for {kernel.kernel_name}. 
Added to waiting queue.") + + elif event_type == 'end' and kernel_or_instance.is_valid: + instance = kernel_or_instance + # Release CGRAs + available_cgras, total_cgra_runtime = release(instance, current_time, available_cgras, running_instances, completed_instances,kernel_latency, total_cgra_runtime) + + # Update execution duration distribution + kernel_execution_distribution[instance.kernel.kernel_name].append(instance.pure_execution_duration) + kernel_waiting_distribution[instance.kernel.kernel_name].append(instance.pure_waiting_duration) + + # Check waiting queue + while waiting_instances and available_cgras >= 1: + instance = waiting_instances.pop(0) + # print(f"Allocating CGRAs to waiting instance {instance.kernel.kernel_name}") + available_cgras, total_cgra_runtime = allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) + available_cgras, total_cgra_runtime = handle_reallocation(priority_boosting, instance, current_time, available_cgras, events, total_cgra_runtime) + + # Check running instances for possible re-allocation + # if priority_boosting: + # for running in running_instances: + # available_cgras, total_cgra_runtime = re_allocate(running, current_time, available_cgras, events, total_cgra_runtime) + for running in running_instances[:]: + available_cgras, total_cgra_runtime = handle_reallocation( + priority_boosting, running, current_time, available_cgras, events, total_cgra_runtime + ) + + # Check if the target time has been reached, results haven't been output yet, and current time >= target time + if not checked and current_time >= CHECK_TIME: + # print(f"\n=== At time {CHECK_TIME}, number of completed functions: {len(completed_instances)} ===") + checked_num_kernel = len(completed_instances) + checked = True + + # print("="*20) + + # If the simulation ends before reaching the target time, also output results + if not checked: + # print(f"\n=== Simulation ended before 
{CHECK_TIME}, number of completed functions: {len(completed_instances)}") + checked_num_kernel = len(completed_instances) + + overall_execution = 0 + overall_waiting = 0 + # Calculate ratio for each kernel + for kernel in kernels: + total_execution_duration = sum( + [inst.pure_execution_duration for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) + total_waiting_duration = sum( + [inst.pure_waiting_duration for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) + total_duration = total_execution_duration + total_waiting_duration + kernel_execution_ratio[kernel.kernel_name] = total_execution_duration / total_duration if total_duration > 0 else 0 + kernel_waiting_ratio[kernel.kernel_name] = total_waiting_duration / total_duration if total_duration > 0 else 0 + overall_execution += total_execution_duration + overall_waiting += total_waiting_duration + + # Calculate utilization of total CGRAs + cgra_utilization = idle_tracker.get_utilization(total_cgra_runtime, current_time) + waiting_time_nolap = idle_tracker.total_waiting_time_nolap + overall_latency = current_time # when all kernels are done + + # print(f"Simulation completed. 
Kernel latencies: {kernel_latency}") + # print(f"Kernel execution_ratio: {kernel_execution_ratio}") + # print(f"Kernel execution duration distributions: {kernel_execution_distribution}") + # print(f"Kernel Runned List: {runned_kernel_names}") + # print(f"CGRA utilization: {cgra_utilization}") + # print(f"overall latency: {overall_latency}") + # print(f"overall execution: {overall_execution}") + # print(f"overall waiting_time_nolap: {waiting_time_nolap}") + return kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency, overall_execution, checked_num_kernel, waiting_time_nolap + + +def run_multiple_simulations_and_save_to_csv(kernels_list, csv_name, priority_boosting, kernel_case, num_cgras=9): + """ + Run multiple simulations and save the results to a CSV file. + + Parameters: + kernels_list (list of list of Kernel): A list of kernels. + csvname (str): The name of the CSV file. + priority_boosting (int): Whether to enable priority boosting. + num_cgras (int): The number of CGRAs, default 9. 
+ """ + kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency, overall_execution, checked_num_kernel, waiting_time_nolap = simulate(num_cgras, kernels_list, priority_boosting) + + # Calculate fastest, slowest, and average execution duration per kernel + execution_stats = {} + for kernel_name, execution_durations in kernel_execution_distribution.items(): + if execution_durations: + fastest = min(execution_durations) + slowest = max(execution_durations) + average = sum(execution_durations) / len(execution_durations) + total = sum(execution_durations) + execution_stats[kernel_name] = { + "fastest_execution_duration": fastest, + "slowest_execution_duration": slowest, + "average_execution_duration": average, + "total_execution_duration": total + } + + # Calculate fastest, slowest, and average waiting duration per kernel + waiting_stats = {} + overall_avg_waiting = 0 + for kernel_name, waiting_durations in kernel_waiting_distribution.items(): + if waiting_durations: + fastest = min(waiting_durations) + slowest = max(waiting_durations) + average = sum(waiting_durations) / len(waiting_durations) + overall_avg_waiting += average + total = sum(waiting_durations) + waiting_stats[kernel_name] = { + "fastest_waiting_duration": fastest, + "slowest_waiting_duration": slowest, + "average_waiting_duration": average, + "total_waiting_duration": total + } + + all_results = [] + for kernel in kernels_list: + kernel_name = kernel.kernel_name + result = { + "Kernel_Name": kernel_name, + "Arrive_Period": kernel.arrive_period, + "Unroll_Factor": kernel.unroll_factor, + "Vector_Factor": kernel.vector_factor, + "fastest_execution_duration": execution_stats.get(kernel_name, {}).get("fastest_execution_duration", 0), + "slowest_execution_duration": execution_stats.get(kernel_name, {}).get("slowest_execution_duration", 0), + "Average_Execution_duration": execution_stats.get(kernel_name, 
{}).get("average_execution_duration", 0), + "fastest_waiting_duration": waiting_stats.get(kernel_name, {}).get("fastest_waiting_duration", 0), + "slowest_waiting_duration": waiting_stats.get(kernel_name, {}).get("slowest_waiting_duration", 0), + "Average_Waiting_duration": waiting_stats.get(kernel_name, {}).get("average_waiting_duration", 0), + "Total_Execution_duration": execution_stats.get(kernel_name, {}).get("total_execution_duration", 0), + "Total_Waiting_duration": waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0), + "Execution_duration Ratio": kernel_execution_ratio[kernel_name], + "Waiting_duration Ratio": kernel_waiting_ratio[kernel_name], + "Overall_Case_Latency": overall_latency, + "Overall_Execution": overall_execution, + "Sum_Average_Waiting_duration": overall_avg_waiting, + "CGRA_Utilization": cgra_utilization, + "checked_num_kernel":checked_num_kernel, + "waiting_time_nolap":waiting_time_nolap, + "Total_Execution_duration Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_duration", 0))/overall_latency, + "Total_Waiting_duration Ratio": (waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0))/overall_latency, + "Total_Latency Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_duration", 0) + waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0))/overall_latency + } + all_results.append(result) + + + df = pd.DataFrame(all_results) + file_name = f'./result/simulation_{kernel_case}_{csv_name}.csv' + df.to_csv(file_name, index=False) + print(f"reslut {file_name} saved") \ No newline at end of file diff --git a/tools/expandable/util/visualizer.py b/tools/expandable/util/visualizer.py new file mode 100644 index 00000000..357c147a --- /dev/null +++ b/tools/expandable/util/visualizer.py @@ -0,0 +1,652 @@ +# ---------------------------------------------------------------------------- +# Filename: visualizer.py / +# Description: visualize multi-CGRA scheduling simulation results / +# 
---------------------------------------------------------------------------- + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os +from typing import List, Dict + +# ---------------------------------------------------------------------------- +# class definition / +# ---------------------------------------------------------------------------- + +class SimulationDataAnalyzer: + """Simulation data visualization analysis tool""" + + def __init__(self, kernel_data): + """ + Initialize the analyzer + + Attributes: + data_cache (dict): Cache for loaded data + figure_config (dict): Default configuration for figures + """ + self.execution_cache = {} # Cache for loaded data + self.utilization_cache = {} + self.throughput_cache = {} + self.number_cache = {} + self.waiting_cache = {} + self.scalability_cache = {} + self.latency_cache = {} + self.KERNEL_NAMES = list(kernel_data.keys()) + self.NEURA_CONFIGS = ['Baseline', 'Neura-L0', 'Neura-L1', 'Neura-L2', 'Neura'] + self.KERNEL_COLORS = ['#A4A3A4','#B0C4E6','#8DA9DC','#FEEDB9','#002060', + '#F3B082','#F7CAAB','#C7FAA8','#FFD865'] + self.NEURA_COLORS = ['#7F7F7F','#EDEDED','#FFF2CC','#FFD966','#FFC000'] + + def load_execution_data(self, task_case: str, csv_name: str, normalized_baseline: int): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'Overall_Execution', 'CGRA_Utilization'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + 
raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = f"{task_case}_{csv_name}" + self.execution_cache[cache_key] = df['Total_Execution_duration'] / normalized_baseline + self.utilization_cache[cache_key] = df['CGRA_Utilization'] + return self.execution_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_execution_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_1_Baseline.csv") + normalized_baseline = df['Overall_Execution'].iloc[0] # overall execution time of case 1's Baseline config + + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_execution_data(task_case, csv_name, normalized_baseline) + + return + + def load_throughput_data(self, task_case: str, csv_name: str): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'waiting_time_nolap', 'Average_Execution_duration'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = f"{task_case}_{csv_name}" + self.execution_cache[cache_key] = 
df['Total_Execution_duration'] + self.number_cache[cache_key] = np.where( + (df['Average_Execution_duration'] == 0), + 0, + df['Total_Execution_duration'] / df['Average_Execution_duration'] + ) + self.waiting_cache[cache_key] = df['waiting_time_nolap'] + + return self.execution_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_throughput_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_1_Baseline.csv") + file_path = "./result/simulation_1_Baseline.csv" + normalized_baseline = df['Overall_Execution'].iloc[0] # overall execution time of case 1's Baseline config + + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_throughput_data(task_case, csv_name) + + return + + def load_scalability_data(self, task_case: str, csv_name: str, execution_baseline: int, latency_baseline: int): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + print(file_path) + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'Overall_Execution', 'CGRA_Utilization', 'Overall_Case_Latency'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = 
f"{task_case}_{csv_name}" + self.scalability_cache[cache_key] = df['Total_Execution_duration'] / execution_baseline + self.latency_cache[cache_key] = df['Overall_Case_Latency'] / latency_baseline + self.utilization_cache[cache_key] = df['CGRA_Utilization'] + return self.scalability_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_scalability_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_2x2_6_Baseline.csv") + normalized_baseline = df['Overall_Execution'].iloc[0] + latency_baseline = df['Overall_Case_Latency'].iloc[0] + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_scalability_data(task_case, csv_name, normalized_baseline, latency_baseline) + + return + + def genFig9(self, fig_path: str): + """ + Generate Figure 9: Normalized execution time and improved utilization + """ + cases = ['1', '2', '3', '4', '5', '6'] + self.process_execution_data(cases) + + # Correct data structure - one value per X position + bar_data = {kernel: [] for kernel in self.KERNEL_NAMES} # Bar chart data + line_data = [] # Line chart data + x_labels = [] # X-axis labels + + # Collect data + for case in cases: + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + execution_series = self.execution_cache.get(cache_key) + utilization_series = self.utilization_cache.get(cache_key) + + # Bar chart data - Resource utilization + if execution_series is not None: + if hasattr(execution_series, 'to_dict'): + exec_dict = execution_series.to_dict() + else: + exec_dict = dict(execution_series) + for i, kernel in enumerate(self.KERNEL_NAMES): + kernel_value = float(exec_dict[i]) * 100 + 
bar_data[kernel].append(kernel_value) + else: + for kernel in self.KERNEL_NAMES: + bar_data[kernel].append(0) + + # Line chart data - Execution duration or other metrics + if utilization_series is not None: + line_value = utilization_series.iloc[0] + line_data.append(float(line_value) * 100) + else: + line_data.append(0) + + x_labels.append(f"{group}") + + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + total_bars = len(cases) * len(self.NEURA_CONFIGS) + x_positions = np.arange(total_bars) + bar_width = 0.6 + # Primary Y-axis - Bar chart + color_dict = {kernel: color for kernel, color in zip(self.KERNEL_NAMES, self.KERNEL_COLORS)} + bottom = np.zeros(total_bars) + bars_by_kernel = {} + for kernel in self.KERNEL_NAMES: + data = bar_data[kernel] + bars = ax1.bar(x_positions, data, bar_width, bottom=bottom, + color=color_dict[kernel], alpha=0.8, + edgecolor='black', linewidth=0.5, label=kernel) + bars_by_kernel[kernel] = bars + bottom += np.array(data) + + + # Add black dashed separator lines every group + for i in range(4, len(x_positions)-1, 5): + line_pos = i + 0.5 + ax1.axvline(x=line_pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + # Display values on Neura + arrays = [np.array(heights) for heights in bar_data.values()] + total_heights = np.sum(arrays, axis=0) + for i, (x, y) in enumerate(zip(x_positions, total_heights)): + if (i + 1) % 5 == 0: + ax1.text(x, y + max(total_heights)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized execution time (%)', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black', labelsize=18) + ax1.set_ylim(0, 120) + ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0., + fontsize=12, title="Kernels", title_fontsize=13) + + # Secondary Y-axis - Line chart + ax2 = ax1.twinx() + + # 
Calculate number of complete cases + num_complete_cases = len(x_positions) // len(self.NEURA_CONFIGS) + + # Insert NaN every 5 points + x_with_gaps = [] + y_with_gaps = [] + + for case_idx in range(num_complete_cases): + # Start and end indices for this case + start_idx = case_idx * len(self.NEURA_CONFIGS) + end_idx = start_idx + len(self.NEURA_CONFIGS) + + # Add 5 points for this case + x_with_gaps.extend(x_positions[start_idx:end_idx]) + y_with_gaps.extend(line_data[start_idx:end_idx]) + + # Add NaN after each case (except the last complete case) + if case_idx < num_complete_cases - 1: + x_with_gaps.append(np.nan) + y_with_gaps.append(np.nan) + + + # Convert to numpy arrays + x_with_gaps = np.array(x_with_gaps) + y_with_gaps = np.array(y_with_gaps) + + # Plot line with gaps between cases + line = ax2.plot(x_with_gaps, y_with_gaps, + marker='o', markersize=8, linewidth=2.5, + color='blue', linestyle='--', + markerfacecolor='white', markeredgewidth=2, + label='Utilization') + + ax2.set_ylabel('Resource Utilization (%)', fontsize=20, color='black') + ax2.tick_params(axis='y', labelcolor='black', labelsize=18) + + # Display values on line points + for i, (x, y) in enumerate(zip(x_positions, line_data)): + ax2.text(x, y + max(line_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + ax1.tick_params(axis='x', labelsize=18) + + # Add group labels + group_positions = [3, 8, 13, 17, 22, 27] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, 'case ' + case, transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig9') + plt.tight_layout() + plt.savefig(fig_path) + print(f"Generated fig f{fig_path}") + + def 
genFig10(self, fig_path: str): + """ + Generate Figure 10: Normalized throughput speedup + """ + cases = ['1', '2', '3', '4', '5', '6'] + self.process_throughput_data(cases) + + # Correct data structure - one value per X position + bar_data = [] # Bar chart data + x_labels = [] # X-axis labels + # Collect data + for case in cases: + cache_key = f"{case}_Baseline" + execution_series = self.execution_cache.get(cache_key) + number_series = self.number_cache.get(cache_key) + waiting_series = self.waiting_cache.get(cache_key) + hw_waiting = waiting_series.iloc[0] / int(number_series.sum()) + avg_execution = execution_series.sum() / int(number_series.sum()) + hw_waiting_ratio = hw_waiting / (hw_waiting + avg_execution) + avg_execution_ratio = avg_execution / (hw_waiting + avg_execution) + hw_waiting_baseline = hw_waiting + avg_execution_baseline = avg_execution + throughput_baseline = (hw_waiting_ratio + avg_execution_ratio) + + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + execution_series = self.execution_cache.get(cache_key) + number_series = self.number_cache.get(cache_key) + waiting_series = self.waiting_cache.get(cache_key) + if (execution_series is None or number_series is None or + waiting_series is None): + continue + hw_waiting = waiting_series.iloc[0] / int(number_series.sum()) + avg_execution = execution_series.sum() / int(number_series.sum()) + hw_waiting_ratio = hw_waiting / (hw_waiting_baseline + avg_execution_baseline) + avg_execution_ratio = avg_execution / (hw_waiting_baseline + avg_execution_baseline) + bar_data.append(throughput_baseline / (hw_waiting_ratio + avg_execution_ratio)) + + x_labels.append(f"{group}") + # sum_throughput = throughput_speedup.sum() + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + x_positions = 
np.arange(len(bar_data)) + bar_width = 0.6 + + bars = ax1.bar(x_positions, bar_data, bar_width, + color=self.NEURA_COLORS[:len(bar_data)], + alpha=0.8, + edgecolor='black', + linewidth=0.5) + + # Add black dashed separator lines every group + for i in range(4, len(bar_data)-1, 5): + line_pos = i + 0.5 + ax1.axvline(x=line_pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + for i, (x, y) in enumerate(zip(x_positions, bar_data)): + if (i + 1) % 5 == 0: + ax1.text(x, y + max(bar_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized Throughput Speedup', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black') + ax1.set_ylim(0, 4) + + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + + # Add group labels + group_positions = [3, 8, 13, 17, 22, 27] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, 'case ' + case, transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + # Legends + ax1.legend(loc='upper left') + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig10') + plt.tight_layout() + # plt.legend() + plt.savefig(fig_path) + print(f"Generated fig {fig_path}") + + def genFig11(self, fig_path: str): + """ + Generate Figure 11: Scalability -- Normalized execution time and improved utilization + """ + cases = ['2x2_6', '3x3_6', '4x4_6', '5x5_6'] + self.process_scalability_data(cases) + + # Correct data structure - one value per X position + bar_data = {kernel: [] for kernel in self.KERNEL_NAMES} # Bar chart data + line_data = [] # Line chart data + x_labels = [] # X-axis labels + # Collect data + cache_key = "2x2_6_Baseline" + scalability_series = self.scalability_cache.get(cache_key) + latency_series = self.latency_cache.get(cache_key) + 
throughput_speedup = [0] * len(scalability_series) + for i in range(len(scalability_series)): + throughput_speedup[i] = (1 / (scalability_series[i] * latency_series[i] * 100)) + throughput_baseline = sum(throughput_speedup) + for case in cases: + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + scalability_series = self.scalability_cache.get(cache_key) + utilization_series = self.utilization_cache.get(cache_key) + latency_series = self.latency_cache.get(cache_key) + if (scalability_series is None or latency_series is None or + utilization_series is None): + continue + for i in range(len(scalability_series)): + if scalability_series[i] * latency_series[i] == 0: + tmp = 0 + else: + tmp = (1 / (scalability_series[i] * latency_series[i] * 100)) + throughput_speedup[i] = tmp / throughput_baseline + # Bar chart data + for i, kernel in enumerate(self.KERNEL_NAMES): + bar_data[kernel].append(throughput_speedup[i]) + + # Line chart data + if utilization_series is not None: + line_value = utilization_series.iloc[0] + line_data.append(float(line_value) * 100) + else: + line_data.append(0) + + x_labels.append(f"{group}") + + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + + total_bars = (len(cases) * (len(self.NEURA_CONFIGS) - 1)) + 1 + x_positions = np.arange(total_bars) + bar_width = 0.6 + # Primary Y-axis - Bar chart + color_dict = {kernel: color for kernel, color in zip(self.KERNEL_NAMES, self.KERNEL_COLORS)} + bottom = np.zeros(total_bars) + bars_by_kernel = {} + for kernel in self.KERNEL_NAMES: + data = bar_data[kernel] + bars = ax1.bar(x_positions, data, bar_width, bottom=bottom, + color=color_dict[kernel], alpha=0.8, + edgecolor='black', linewidth=0.5, label=kernel) + bars_by_kernel[kernel] = bars + bottom += np.array(data) + + # Add black dashed 
separator lines every group + group_pattern = [5, 4, 4, 4] + current_position = 0 + line_positions = [] + for group_size in group_pattern: + current_position += group_size + if current_position < len(x_positions): + line_positions.append(current_position - 0.5) + for pos in line_positions: + ax1.axvline(x=pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + # Display values on Neura + display_indices = [] + for i in range(len(x_positions)): + if i >= 4 and (i - 4) % 4 == 0: + display_indices.append(i) + arrays = [np.array(heights) for heights in bar_data.values()] + total_heights = np.sum(arrays, axis=0) + for i, (x, y) in enumerate(zip(x_positions, total_heights)): + if i in display_indices: + ax1.text(x, y + max(total_heights)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized Throughput Speedup', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black') + ax1.set_ylim(0, 26) + ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0., + fontsize=12, title="Kernels", title_fontsize=13) + + # Secondary Y-axis - Line chart + ax2 = ax1.twinx() + + # Define break pattern: first group 5 points, then 4 points for others + break_pattern = [5] # First case: 5 points + remaining_cases = (len(x_positions) - 5) // 4 # Calculate how many 4-point cases + break_pattern.extend([4] * remaining_cases) # Add 4 for each remaining case + + # Insert NaN based on the break pattern + x_with_gaps = [] + y_with_gaps = [] + + current_idx = 0 + for i, num_points in enumerate(break_pattern): + # Add points for this case + end_idx = current_idx + num_points + x_with_gaps.extend(x_positions[current_idx:end_idx]) + y_with_gaps.extend(line_data[current_idx:end_idx]) + + # Add NaN after this case (except the last one) + if i < len(break_pattern) - 1: + x_with_gaps.append(np.nan) + y_with_gaps.append(np.nan) + + current_idx = end_idx + + x_with_gaps = np.array(x_with_gaps) + y_with_gaps = 
np.array(y_with_gaps) + + # Plot line with gaps between cases + line = ax2.plot(x_with_gaps, y_with_gaps, + marker='o', markersize=8, linewidth=2.5, + color='blue', linestyle='--', + markerfacecolor='white', markeredgewidth=2, + label='Utilization') + + ax2.set_ylabel('Resource Utilization (%)', fontsize=20, color='black') + ax2.tick_params(axis='y', labelcolor='black') + ax2.set_ylim(0, 100) + ax2.set_yticks(np.arange(0, 120, 30)) + + # Display values on line points + for i, (x, y) in enumerate(zip(x_positions, line_data)): + ax2.text(x, y + max(line_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + + # Add group labels + group_positions = [3, 7, 11, 15] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, (case.split('_'))[0] + 'Neura', transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig11') + plt.tight_layout() + # plt.legend() + plt.savefig(fig_path) + print(f"Generated fig {fig_path}") + +if __name__ == '__main__': + KERNEL_DATA = { + "fir.cpp": (7, 2048, 4096), + "latnrm.c": (8, 1280, 2560), + "fft.c": (2, 112640, 450560), + "dtw.cpp": (4, 16384, 49152), + "spmv.c": (3, 65536, 262144), + "conv.c": (1, 655360, 1310720), + "mvt.c": (5, 16384, 49152), + "gemm.c": (0, 2097152, 8388608), + "relu+histogram.c": (6, 262144, 2097152) + } + genFigs = SimulationDataAnalyzer(kernel_data=KERNEL_DATA) + genFigs.genFig9("./fig/Fig9Test.png") + #genFigs.genFig10("./fig/Fig10.png") + genFigs.genFig11("./fig/Fig11Test.png") \ No newline at end of file