diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 27724d6b..daf96d9e 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -31,6 +31,16 @@ jobs: sudo apt-get -y install llvm-12 llvm-12-dev llvm-12-tools clang-12 sudo apt-get -y install build-essential + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.7.16' + + - name: Install Python Dependencies + run: | + python -m pip install --upgrade pip + pip install eventlet pandas matplotlib + - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type @@ -105,3 +115,8 @@ jobs: sh compile.sh sh run.sh sh verify.sh + + - name: Test expandable automatic script + working-directory: ${{github.workspace}}/tools/expandable + run: | + ./demo.sh --test y \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4a6b9a93..06b0854f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,12 @@ build*/ *.ll *.dot *.bc +.vscode/ +.venv/ +tmp/ +fig/ +result/ +__pycache__/ +dfg.json +increMapInput.json +config.json \ No newline at end of file diff --git a/src/DFG.cpp b/src/DFG.cpp index fe6e07a5..0d73ddc9 100644 --- a/src/DFG.cpp +++ b/src/DFG.cpp @@ -33,8 +33,13 @@ DFG::DFG(Function& t_F, list* t_loops, bool t_targetFunction, bool needsCycleCalculation = false; for (auto strategy : *t_fusionStrategy) { if (strategy == "default_heterogeneous") { - combineMulAdd("CoT"); - combinePhiAdd("BrT"); + combine("phi", "add", "Ctrl"); + combine("phi", "fadd", "Ctrl"); + combine("fcmp", "select", "Ctrl"); + combine("icmp", "select", "Ctrl"); + combine("icmp", "br", "Ctrl"); + combine("fcmp", "br", "Ctrl"); + tuneForPattern(); needsCycleCalculation = true; } else if (strategy == "nonlinear") { diff --git a/src/Mapper.cpp b/src/Mapper.cpp index 
e4c05379..ab493e8b 100644 --- a/src/Mapper.cpp +++ b/src/Mapper.cpp @@ -664,9 +664,13 @@ void Mapper::showUtilization(CGRA* t_cgra, DFG* t_dfg, int t_II, total_active_tiles += 1; } float avg_tile_overall_utilization = 0.0; + float max_tile_overall_utilization = 0.0; float avg_tile_fu_utilization = 0.0; float avg_tile_xbar_utilization = 0.0; for (int tile = 0; tile < t_cgra->getFUCount(); ++tile) { + if (max_tile_overall_utilization < tile_overall_utilization[tile]) { + max_tile_overall_utilization = tile_overall_utilization[tile]; + } avg_tile_overall_utilization += tile_overall_utilization[tile]; avg_tile_fu_utilization += tile_fu_utilization[tile]; avg_tile_xbar_utilization += tile_xbar_utilization[tile]; @@ -675,8 +679,10 @@ void Mapper::showUtilization(CGRA* t_cgra, DFG* t_dfg, int t_II, avg_tile_overall_utilization /= total_active_tiles; avg_tile_fu_utilization /= total_active_tiles; avg_tile_xbar_utilization /= total_active_tiles; + //max_tile_overall_utilization /= total_active_tiles; - cout << "tile avg fu utilization: " << avg_tile_fu_utilization*100 << "%; avg xbar utilization: " << avg_tile_xbar_utilization*100 << "%; avg overall utilization: " << avg_tile_overall_utilization*100 << "%" << endl; + cout << "tile avg fu utilization: " << avg_tile_fu_utilization*100 << "%; avg xbar utilization: " << avg_tile_xbar_utilization*100 << "%; avg overall utilization: " << avg_tile_overall_utilization*t_II*100 << "%" << endl; + cout << "max overall utilization: " << max_tile_overall_utilization*t_II*100 << "%" << endl; // Collects the histogram of tiles' utilization. // Histogram for the number of tiles that have utilization of 0%. 
@@ -1007,6 +1013,7 @@ void Mapper::showSchedule(CGRA* t_cgra, DFG* t_dfg, int t_II, cout<<"[Mapping II: "<showSchedule(cgra, dfg, II, isStaticElasticCGRA, parameterizableCGRA); // cout << "==================================\n"; - cout << "[show opcode count]\n"; - dfg->showOpcodeDistribution(); + // cout << "[show opcode count]\n"; + // dfg->showOpcodeDistribution(); cout << "[Mapping Success]\n"; cout << "==================================\n"; if (enableExpandableMapping) { @@ -409,11 +409,11 @@ namespace { */ bool canMap(CGRA* t_cgra, DFG* t_dfg) { std::set missing_fus; - + for (auto it = t_dfg->nodes.begin(); it != t_dfg->nodes.end(); ++it) { DFGNode* node = *it; bool nodeSupported = false; - + for (int i = 0; i < t_cgra->getRows() && !nodeSupported; ++i) { for (int j = 0; j < t_cgra->getColumns(); ++j) { CGRANode* fu = t_cgra->nodes[i][j]; @@ -423,12 +423,12 @@ namespace { } } } - + if (!nodeSupported) { missing_fus.insert(node->getOpcodeName()); } } - + if (!missing_fus.empty()) { std::cout << "[canMap] Missing functional units: "; for (const auto& op : missing_fus) { @@ -437,10 +437,10 @@ namespace { std::cout << std::endl; return false; } - + return true; } - + }; } diff --git a/test/compile.sh b/test/compile.sh index eee8141a..7f00f2a2 100755 --- a/test/compile.sh +++ b/test/compile.sh @@ -1,2 +1,3 @@ -clang-12 -emit-llvm -fno-unroll-loops -O3 -o kernel.bc -c kernel.cpp -#llvm-dis fir.bc -o fir.ll +clang-12 -emit-llvm -fno-unroll-loops -O0 -o kernel.bc -c kernel.cpp +llvm-dis-12 kernel.bc -o O0kernel.ll +#clang-12 -emit-llvm -fno-unroll-loops -mllvm -force-vector-width=4 -O3 -o kernel.bc -c ./_matmul/src/matmul.c diff --git a/test/dot.sh b/test/dot.sh index 3bccbd9f..fa80b3c7 100644 --- a/test/dot.sh +++ b/test/dot.sh @@ -1,2 +1 @@ -dot -Tpng _Z6kernelPfS_S_.dot -o kernel.png - +dot -Tpng _Z6kernelPfS_S_.dot -o kernel.png \ No newline at end of file diff --git a/test/inter_edge/compile.sh b/test/inter_edge/compile.sh old mode 100644 new mode 100755 diff 
--git a/test/inter_edge/dot.sh b/test/inter_edge/dot.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/rebuild.sh b/test/inter_edge/rebuild.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/run.sh b/test/inter_edge/run.sh old mode 100644 new mode 100755 diff --git a/test/inter_edge/verify.sh b/test/inter_edge/verify.sh old mode 100644 new mode 100755 diff --git a/test/kernels/conv/conv.c b/test/kernels/conv/conv.c index cde86d42..24e95013 100644 --- a/test/kernels/conv/conv.c +++ b/test/kernels/conv/conv.c @@ -76,7 +76,7 @@ int kernel(int ni, int nj, int nk, for (x = 0; x < total; x++) { i = x / NJ; j = x % NJ; - out += A[i][j] * B[i][j]; + out += A [i][j] * B[i][j]; } /* diff --git a/test/kernels/latnrm/latnrm.c b/test/kernels/latnrm/latnrm.c index db886b11..7539acfa 100644 --- a/test/kernels/latnrm/latnrm.c +++ b/test/kernels/latnrm/latnrm.c @@ -45,17 +45,17 @@ void kernel(float input, float *output, float coefficient[16], top = input; q_coef = coefficient[0]; // #pragma clang loop unroll_count(4) - for (i = 0; i < ORDER; i++) { - k_coef = coefficient[2*i]; - left = top; - right = internal_state[i]; - internal_state[i] = bottom; - top = q_coef * left - k_coef * right; - bottom = q_coef * right + k_coef * left; - q_coef = coefficient[2*i+1]; - } - internal_state[i++] = bottom; - internal_state[i] = top; + // for (i = 0; i < ORDER; i++) { + // k_coef = coefficient[2*i]; + // left = top; + // right = internal_state[i]; + // internal_state[i] = bottom; + // top = q_coef * left - k_coef * right; + // bottom = q_coef * right + k_coef * left; + // q_coef = coefficient[2*i+1]; + // } + // internal_state[i++] = bottom; + // internal_state[i] = top; sum = internal_state[1] * q_coef; diff --git a/test/kernels/mvt/mvt.c b/test/kernels/mvt/mvt.c index ed350037..e091d92c 100644 --- a/test/kernels/mvt/mvt.c +++ b/test/kernels/mvt/mvt.c @@ -78,8 +78,8 @@ void kernel(int n, { int i, j; -#pragma scop - #pragma clang loop unroll_count(1) +// #pragma scop 
+ // #pragma clang loop unroll_count(1) for (j = 0; j < N; j++) { // #pragma clang loop unroll_count(1) vectorize(disable) //#pragma clang loop unroll_count(1) vectorize_width(4) @@ -88,7 +88,7 @@ void kernel(int n, x2[i] = x2[i] + A[j][i] * y_2[j]; } } -#pragma endscop +// #pragma endscop } diff --git a/test/kernels/relu+histogram/compile.sh b/test/kernels/relu+histogram/compile.sh new file mode 100644 index 00000000..11be835a --- /dev/null +++ b/test/kernels/relu+histogram/compile.sh @@ -0,0 +1,4 @@ +clang-12 -emit-llvm -O3 -fno-unroll-loops -o kernel.bc -c relu.c +llvm-dis-12 kernel.bc -o kernel.ll +opt-12 --loop-unroll --unroll-count=4 kernel.bc -o kernel_unroll.bc +llvm-dis-12 kernel_unroll.bc -o kernel_unroll.ll diff --git a/test/kernels/relu+histogram/param.json b/test/kernels/relu+histogram/param.json new file mode 100644 index 00000000..b2d46633 --- /dev/null +++ b/test/kernels/relu+histogram/param.json @@ -0,0 +1,19 @@ +{ + "kernel" : "kernel", + "targetFunction" : false, + "targetNested" : false, + "targetLoopsID" : [1], + "doCGRAMapping" : false, + "row" : 4, + "column" : 4, + "diagonalVectorization" : true, + "fusionStrategy" : [], + "isTrimmedDemo" : true, + "heuristicMapping" : false, + "bypassConstraint" : 4, + "isStaticElasticCGRA" : false, + "precisionAware" : false, + "ctrlMemConstraint" : 200, + "regConstraint" : 8 +} + diff --git a/test/kernels/relu+histogram/polybench.h b/test/kernels/relu+histogram/polybench.h new file mode 100644 index 00000000..d1a1f776 --- /dev/null +++ b/test/kernels/relu+histogram/polybench.h @@ -0,0 +1,217 @@ +/* + * Polybench header for instrumentation. + * + * Programs must be compiled with `-I utilities utilities/polybench.c' + * + * Optionally, one can define: + * + * -DPOLYBENCH_TIME, to report the execution time, + * OR (exclusive): + * -DPOLYBENCH_PAPI, to use PAPI H/W counters (defined in polybench.c) + * + * + * See README or utilities/polybench.c for additional options. 
+ * + */ +#ifndef POLYBENCH_H +# define POLYBENCH_H + +# include + +/* Array padding. By default, none is used. */ +# ifndef POLYBENCH_PADDING_FACTOR +/* default: */ +# define POLYBENCH_PADDING_FACTOR 0 +# endif + + +/* C99 arrays in function prototype. By default, do not use. */ +# ifdef POLYBENCH_USE_C99_PROTO +# define POLYBENCH_C99_SELECT(x,y) y +# else +/* default: */ +# define POLYBENCH_C99_SELECT(x,y) x +# endif + + +/* Scalar loop bounds in SCoPs. By default, use parametric loop bounds. */ +# ifdef POLYBENCH_USE_SCALAR_LB +# define POLYBENCH_LOOP_BOUND(x,y) x +# else +/* default: */ +# define POLYBENCH_LOOP_BOUND(x,y) y +# endif + +/* Use the 'restrict' keyword to declare that the different arrays do not + * alias. By default, we do not use it as it is only supported in C99 and + * even here several compilers do not properly get it. + */ +# ifdef POLYBENCH_USE_RESTRICT +# define POLYBENCH_RESTRICT restrict +# else +/* default: */ +# define POLYBENCH_RESTRICT +# endif + +/* Macros to reference an array. Generic for heap and stack arrays + (C99). Each array dimensionality has his own macro, to be used at + declaration or as a function argument. + Example: + int b[x] => POLYBENCH_1D_ARRAY(b, x) + int A[N][N] => POLYBENCH_2D_ARRAY(A, N, N) +*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_ARRAY(x) *x +# define POLYBENCH_FREE_ARRAY(x) free((void*)x); +# define POLYBENCH_DECL_VAR(x) (*x) +# else +# define POLYBENCH_ARRAY(x) x +# define POLYBENCH_FREE_ARRAY(x) +# define POLYBENCH_DECL_VAR(x) x +# endif +/* Macros for using arrays in the function prototypes. 
*/ +# define POLYBENCH_1D(var, dim1,ddim1) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] +/* Macros for using arrays within the functions. 
*/ +# define POLYBENCH_1D_F(var, dim1,ddim1) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D_F(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D_F(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D_F(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D_F(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] + + +/* Macros to allocate heap arrays. 
+ Example: + polybench_alloc_2d_array(N, M, double) => allocates N x M x sizeof(double) + and returns a pointer to the 2d array + */ +# define POLYBENCH_ALLOC_1D_ARRAY(n1, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data (n1 + POLYBENCH_PADDING_FACTOR, sizeof(type)) +# define POLYBENCH_ALLOC_2D_ARRAY(n1, n2, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_3D_ARRAY(n1, n2, n3, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_4D_ARRAY(n1, n2, n3, n4, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_5D_ARRAY(n1, n2, n3, n4, n5, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR][n5 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR) * (n5 + POLYBENCH_PADDING_FACTOR), sizeof(type)) + +/* Macros for array declaration. 
*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); \ + var = POLYBENCH_ALLOC_1D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), type); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); \ + var = POLYBENCH_ALLOC_2D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), type); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); \ + var = POLYBENCH_ALLOC_3D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), type); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); \ + var = POLYBENCH_ALLOC_4D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), type); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); \ + var = POLYBENCH_ALLOC_5D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), POLYBENCH_C99_SELECT(dim5, ddim5), type); +# else +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + 
type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); +# endif + + +/* Dead-code elimination macros. Use argc/argv for the run-time check. */ +# ifndef POLYBENCH_DUMP_ARRAYS +# define POLYBENCH_DCE_ONLY_CODE if (argc > 42 && ! strcmp(argv[0], "")) +# else +# define POLYBENCH_DCE_ONLY_CODE +# endif + +#define POLYBENCH_DUMP_TARGET stderr +#define POLYBENCH_DUMP_START fprintf(POLYBENCH_DUMP_TARGET, "==BEGIN DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_FINISH fprintf(POLYBENCH_DUMP_TARGET, "==END DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_BEGIN(s) fprintf(POLYBENCH_DUMP_TARGET, "begin dump: %s", s) +#define POLYBENCH_DUMP_END(s) fprintf(POLYBENCH_DUMP_TARGET, "\nend dump: %s\n", s) + +# define polybench_prevent_dce(func) \ + POLYBENCH_DCE_ONLY_CODE \ + func + + +/* Performance-related instrumentation. See polybench.c */ +# define polybench_start_instruments +# define polybench_stop_instruments +# define polybench_print_instruments + + +/* PAPI support. 
*/ +# ifdef POLYBENCH_PAPI +extern const unsigned int polybench_papi_eventlist[]; +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_set_papi_thread_report(x) \ + polybench_papi_counters_threadid = x; +# define polybench_start_instruments \ + polybench_prepare_instruments(); \ + polybench_papi_init(); \ + int evid; \ + for (evid = 0; polybench_papi_eventlist[evid] != 0; evid++) \ + { \ + if (polybench_papi_start_counter(evid)) \ + continue; \ + +# define polybench_stop_instruments \ + polybench_papi_stop_counter(evid); \ + } \ + polybench_papi_close(); \ + +# define polybench_print_instruments polybench_papi_print(); +# endif + + +/* Timing support. */ +# if defined(POLYBENCH_TIME) || defined(POLYBENCH_GFLOPS) +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_start_instruments polybench_timer_start(); +# define polybench_stop_instruments polybench_timer_stop(); +# define polybench_print_instruments polybench_timer_print(); +extern double polybench_program_total_flops; +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +/* Function declaration. */ +# ifdef POLYBENCH_TIME +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +# ifdef POLYBENCH_PAPI +extern void polybench_prepare_instruments(); +extern int polybench_papi_start_counter(int evid); +extern void polybench_papi_stop_counter(int evid); +extern void polybench_papi_init(); +extern void polybench_papi_close(); +extern void polybench_papi_print(); +# endif + +/* Function prototypes. 
*/ +extern void* polybench_alloc_data(unsigned long long int n, int elt_size); + + +#endif /* !POLYBENCH_H */ diff --git a/test/kernels/relu+histogram/relu+histogram.c b/test/kernels/relu+histogram/relu+histogram.c new file mode 100644 index 00000000..cd36403d --- /dev/null +++ b/test/kernels/relu+histogram/relu+histogram.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include "polybench.h" +#include "relu.h" +// histogram +#define DATA_LEN 20 +#define BUCKET_LEN 5 +#define MIN 1.0 +#define MAX 19.0 + +float input_data[DATA_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,14,14,14,14,14,19}; +int histogram[BUCKET_LEN] = {0}; + +/* Array initialization. */ +static +void init_array(int ni, int nj, int nk, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NK,NJ,nk,nj)) +{ + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + C[i][j] = (DATA_TYPE) (i*j % ni) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + A[i][j] = (DATA_TYPE) (i*(j+1) % nk) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + B[i][j] = (DATA_TYPE) (i*(j+2) % nj) / nj; +} + + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static +void print_array(int ni, int nj, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj)) +{ + int i, j; + + POLYBENCH_DUMP_START; + POLYBENCH_DUMP_BEGIN("C"); + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + if ((i * ni + j) % 20 == 0) fprintf (POLYBENCH_DUMP_TARGET, "\n"); + fprintf (POLYBENCH_DUMP_TARGET, DATA_PRINTF_MODIFIER, C[i][j]); + } + POLYBENCH_DUMP_END("C"); + POLYBENCH_DUMP_FINISH; +} + + +/* Main computational kernel. The whole function will be timed, + including the call and return. 
*/ +void kernel(int ni, int nj, int nk, + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj),float input[], int histogram[]) +{ + int x = 0, i = 0, j = 0, k = 0; + int total = NI * NJ; + float dmin = (float)MIN; + float delt = (float)(MAX - dmin); + //#pragma clang loop vectorize(disable) unroll_count(4) + // #pragma clang loop vectorize(enable) vectorize_width(4) unroll_count(4) + for (x = 0; x < total; x++) { + i = x / NJ; + j = x % NJ; + if (A[i][j] < 0) + C[i][j] = 0; + else + C[i][j] = A[i][j]; + + float r = BUCKET_LEN * (input[x] - dmin) / delt; + int b = (int)(r); + histogram[b]++; + } +} + + +int main(int argc, char** argv) +{ + /* Retrieve problem size. */ + int ni = NI; + int nj = NJ; + int nk = NK; + + /* Variable declaration/allocation. */ + POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj); + + /* Initialize array(s). */ + init_array (ni, nj, nk, + POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B)); + + /* Start timer. */ + polybench_start_instruments; + + /* Run kernel. */ + kernel(ni, nj, nk, + POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B),input_data, histogram); + + /* Stop and print timer. */ + polybench_stop_instruments; + polybench_print_instruments; + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + polybench_prevent_dce(print_array(ni, nj, POLYBENCH_ARRAY(C))); + + /* Be clean. */ + POLYBENCH_FREE_ARRAY(C); + POLYBENCH_FREE_ARRAY(A); + POLYBENCH_FREE_ARRAY(B); + + return 0; +} diff --git a/test/kernels/relu+histogram/relu.h b/test/kernels/relu+histogram/relu.h new file mode 100644 index 00000000..4615e081 --- /dev/null +++ b/test/kernels/relu+histogram/relu.h @@ -0,0 +1,80 @@ +#ifndef _RELU_H +# define _RELU_H + +#define DATA_TYPE_IS_INT + +/* Default to LARGE_DATASET. 
*/ +# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(MEDIUM_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET) +# define LARGE_DATASET +# endif + +# if !defined(NI) && !defined(NJ) && !defined(NK) +/* Define sample dataset sizes. */ +# ifdef MINI_DATASET +# define NI 20 +# define NJ 25 +# define NK 30 +# endif + +# ifdef SMALL_DATASET +# define NI 60 +# define NJ 70 +# define NK 80 +# endif + +# ifdef MEDIUM_DATASET +# define NI 200 +# define NJ 220 +# define NK 240 +# endif + +# ifdef LARGE_DATASET +# define NI 1000 +# define NJ 1100 +# define NK 1200 +# endif + +# ifdef EXTRALARGE_DATASET +# define NI 2000 +# define NJ 2300 +# define NK 2600 +# endif + + +#endif /* !(NI NJ NK) */ + +# define _PB_NI POLYBENCH_LOOP_BOUND(NI,ni) +# define _PB_NJ POLYBENCH_LOOP_BOUND(NJ,nj) +# define _PB_NK POLYBENCH_LOOP_BOUND(NK,nk) + + +/* Default data type */ +# if !defined(DATA_TYPE_IS_INT) && !defined(DATA_TYPE_IS_FLOAT) && !defined(DATA_TYPE_IS_DOUBLE) +# define DATA_TYPE_IS_DOUBLE +# endif + +#ifdef DATA_TYPE_IS_INT +# define DATA_TYPE int +# define DATA_PRINTF_MODIFIER "%d " +#endif + +#ifdef DATA_TYPE_IS_FLOAT +# define DATA_TYPE float +# define DATA_PRINTF_MODIFIER "%0.2f " +# define SCALAR_VAL(x) x##f +# define SQRT_FUN(x) sqrtf(x) +# define EXP_FUN(x) expf(x) +# define POW_FUN(x,y) powf(x,y) +# endif + +#ifdef DATA_TYPE_IS_DOUBLE +# define DATA_TYPE double +# define DATA_PRINTF_MODIFIER "%0.2lf " +# define SCALAR_VAL(x) x +# define SQRT_FUN(x) sqrt(x) +# define EXP_FUN(x) exp(x) +# define POW_FUN(x,y) pow(x,y) +# endif + +#endif /* !_RELU_H */ + diff --git a/test/kernels/relu+histogram/run.sh b/test/kernels/relu+histogram/run.sh new file mode 100644 index 00000000..a5674436 --- /dev/null +++ b/test/kernels/relu+histogram/run.sh @@ -0,0 +1 @@ +opt-12 -load ../../../cgra-mapper/build/src/libmapperPass.so -mapperPass kernel_unroll.bc diff --git a/test/kernels/spmv+conv/compile.sh b/test/kernels/spmv+conv/compile.sh new file mode 100755 
index 00000000..ec8a4182 --- /dev/null +++ b/test/kernels/spmv+conv/compile.sh @@ -0,0 +1,2 @@ +clang-12 -emit-llvm -O3 -fno-unroll-loops -o kernel.bc -c spmv+conv.c +llvm-dis-12 kernel.bc -o kernel.ll diff --git a/test/kernels/spmv+conv/conv.h b/test/kernels/spmv+conv/conv.h new file mode 100644 index 00000000..ead2d3d3 --- /dev/null +++ b/test/kernels/spmv+conv/conv.h @@ -0,0 +1,80 @@ +#ifndef _CONV_H +# define _CONV_H + +#define DATA_TYPE_IS_INT + +/* Default to LARGE_DATASET. */ +# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(MEDIUM_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET) +# define LARGE_DATASET +# endif + +# if !defined(NI) && !defined(NJ) && !defined(NK) +/* Define sample dataset sizes. */ +# ifdef MINI_DATASET +# define NI 20 +# define NJ 25 +# define NK 30 +# endif + +# ifdef SMALL_DATASET +# define NI 60 +# define NJ 70 +# define NK 80 +# endif + +# ifdef MEDIUM_DATASET +# define NI 200 +# define NJ 220 +# define NK 240 +# endif + +# ifdef LARGE_DATASET +# define NI 1000 +# define NJ 1100 +# define NK 1200 +# endif + +# ifdef EXTRALARGE_DATASET +# define NI 2000 +# define NJ 2300 +# define NK 2600 +# endif + + +#endif /* !(NI NJ NK) */ + +# define _PB_NI POLYBENCH_LOOP_BOUND(NI,ni) +# define _PB_NJ POLYBENCH_LOOP_BOUND(NJ,nj) +# define _PB_NK POLYBENCH_LOOP_BOUND(NK,nk) + + +/* Default data type */ +# if !defined(DATA_TYPE_IS_INT) && !defined(DATA_TYPE_IS_FLOAT) && !defined(DATA_TYPE_IS_DOUBLE) +# define DATA_TYPE_IS_DOUBLE +# endif + +#ifdef DATA_TYPE_IS_INT +# define DATA_TYPE int +# define DATA_PRINTF_MODIFIER "%d " +#endif + +#ifdef DATA_TYPE_IS_FLOAT +# define DATA_TYPE float +# define DATA_PRINTF_MODIFIER "%0.2f " +# define SCALAR_VAL(x) x##f +# define SQRT_FUN(x) sqrtf(x) +# define EXP_FUN(x) expf(x) +# define POW_FUN(x,y) powf(x,y) +# endif + +#ifdef DATA_TYPE_IS_DOUBLE +# define DATA_TYPE double +# define DATA_PRINTF_MODIFIER "%0.2lf " +# define SCALAR_VAL(x) x +# define SQRT_FUN(x) sqrt(x) +# define 
EXP_FUN(x) exp(x) +# define POW_FUN(x,y) pow(x,y) +# endif + +#endif /* !_CONV_H */ + diff --git a/test/kernels/spmv+conv/dot.sh b/test/kernels/spmv+conv/dot.sh new file mode 100755 index 00000000..922149a0 --- /dev/null +++ b/test/kernels/spmv+conv/dot.sh @@ -0,0 +1 @@ +dot -Tpng kernel.dot -o spmv+conv.png diff --git a/test/kernels/spmv+conv/param.json b/test/kernels/spmv+conv/param.json new file mode 100644 index 00000000..51661c13 --- /dev/null +++ b/test/kernels/spmv+conv/param.json @@ -0,0 +1,63 @@ +{ + "kernel": "kernel", + "targetFunction": false, + "targetNested": false, + "targetLoopsID": [ + 0 + ], + "doCGRAMapping": true, + "row": 4, + "column": 4, + "precisionAware": false, + "fusionStrategy": [ + "default_heterogeneous" + ], + "isTrimmedDemo": true, + "heuristicMapping": true, + "parameterizableCGRA": false, + "diagonalVectorization": false, + "bypassConstraint": 4, + "isStaticElasticCGRA": false, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping": false, + "vectorFactorForIdiv ": 1, + "testingOpcodeOffset": 0, + "additionalFunc": { + "complex-Ctrl": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "complex-BrT": [ + 4, + 5, + 6, + 7 + ], + "complex-CoT": [ + 8, + 9, + 10, + 11 + ] + }, + "supportDVFS": false, + "DVFSIslandDim": 1, + "DVFSAwareMapping": false, + "enablePowerGating": false, + "expandableMapping": true +} \ No newline at end of file diff --git a/test/kernels/spmv+conv/polybench.h b/test/kernels/spmv+conv/polybench.h new file mode 100644 index 00000000..d1a1f776 --- /dev/null +++ b/test/kernels/spmv+conv/polybench.h @@ -0,0 +1,217 @@ +/* + * Polybench header for instrumentation. 
+ * + * Programs must be compiled with `-I utilities utilities/polybench.c' + * + * Optionally, one can define: + * + * -DPOLYBENCH_TIME, to report the execution time, + * OR (exclusive): + * -DPOLYBENCH_PAPI, to use PAPI H/W counters (defined in polybench.c) + * + * + * See README or utilities/polybench.c for additional options. + * + */ +#ifndef POLYBENCH_H +# define POLYBENCH_H + +# include + +/* Array padding. By default, none is used. */ +# ifndef POLYBENCH_PADDING_FACTOR +/* default: */ +# define POLYBENCH_PADDING_FACTOR 0 +# endif + + +/* C99 arrays in function prototype. By default, do not use. */ +# ifdef POLYBENCH_USE_C99_PROTO +# define POLYBENCH_C99_SELECT(x,y) y +# else +/* default: */ +# define POLYBENCH_C99_SELECT(x,y) x +# endif + + +/* Scalar loop bounds in SCoPs. By default, use parametric loop bounds. */ +# ifdef POLYBENCH_USE_SCALAR_LB +# define POLYBENCH_LOOP_BOUND(x,y) x +# else +/* default: */ +# define POLYBENCH_LOOP_BOUND(x,y) y +# endif + +/* Use the 'restrict' keyword to declare that the different arrays do not + * alias. By default, we do not use it as it is only supported in C99 and + * even here several compilers do not properly get it. + */ +# ifdef POLYBENCH_USE_RESTRICT +# define POLYBENCH_RESTRICT restrict +# else +/* default: */ +# define POLYBENCH_RESTRICT +# endif + +/* Macros to reference an array. Generic for heap and stack arrays + (C99). Each array dimensionality has his own macro, to be used at + declaration or as a function argument. + Example: + int b[x] => POLYBENCH_1D_ARRAY(b, x) + int A[N][N] => POLYBENCH_2D_ARRAY(A, N, N) +*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_ARRAY(x) *x +# define POLYBENCH_FREE_ARRAY(x) free((void*)x); +# define POLYBENCH_DECL_VAR(x) (*x) +# else +# define POLYBENCH_ARRAY(x) x +# define POLYBENCH_FREE_ARRAY(x) +# define POLYBENCH_DECL_VAR(x) x +# endif +/* Macros for using arrays in the function prototypes. 
*/ +# define POLYBENCH_1D(var, dim1,ddim1) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_RESTRICT POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] +/* Macros for using arrays within the functions. 
*/ +# define POLYBENCH_1D_F(var, dim1,ddim1) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_2D_F(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_3D_F(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_4D_F(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR] +# define POLYBENCH_5D_F(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR] + + +/* Macros to allocate heap arrays. 
+ Example: + polybench_alloc_2d_array(N, M, double) => allocates N x M x sizeof(double) + and returns a pointer to the 2d array + */ +# define POLYBENCH_ALLOC_1D_ARRAY(n1, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data (n1 + POLYBENCH_PADDING_FACTOR, sizeof(type)) +# define POLYBENCH_ALLOC_2D_ARRAY(n1, n2, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_3D_ARRAY(n1, n2, n3, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_4D_ARRAY(n1, n2, n3, n4, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR), sizeof(type)) +# define POLYBENCH_ALLOC_5D_ARRAY(n1, n2, n3, n4, n5, type) \ + (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR][n5 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR) * (n5 + POLYBENCH_PADDING_FACTOR), sizeof(type)) + +/* Macros for array declaration. 
*/ +# ifndef POLYBENCH_STACK_ARRAYS +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); \ + var = POLYBENCH_ALLOC_1D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), type); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); \ + var = POLYBENCH_ALLOC_2D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), type); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); \ + var = POLYBENCH_ALLOC_3D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), type); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); \ + var = POLYBENCH_ALLOC_4D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), type); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); \ + var = POLYBENCH_ALLOC_5D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), POLYBENCH_C99_SELECT(dim5, ddim5), type); +# else +# define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1) \ + type POLYBENCH_1D_F(POLYBENCH_DECL_VAR(var), dim1, ddim1); +# define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2) \ + type POLYBENCH_2D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); +# define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \ + 
type POLYBENCH_3D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); +# define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \ + type POLYBENCH_4D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4); +# define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \ + type POLYBENCH_5D_F(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); +# endif + + +/* Dead-code elimination macros. Use argc/argv for the run-time check. */ +# ifndef POLYBENCH_DUMP_ARRAYS +# define POLYBENCH_DCE_ONLY_CODE if (argc > 42 && ! strcmp(argv[0], "")) +# else +# define POLYBENCH_DCE_ONLY_CODE +# endif + +#define POLYBENCH_DUMP_TARGET stderr +#define POLYBENCH_DUMP_START fprintf(POLYBENCH_DUMP_TARGET, "==BEGIN DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_FINISH fprintf(POLYBENCH_DUMP_TARGET, "==END DUMP_ARRAYS==\n") +#define POLYBENCH_DUMP_BEGIN(s) fprintf(POLYBENCH_DUMP_TARGET, "begin dump: %s", s) +#define POLYBENCH_DUMP_END(s) fprintf(POLYBENCH_DUMP_TARGET, "\nend dump: %s\n", s) + +# define polybench_prevent_dce(func) \ + POLYBENCH_DCE_ONLY_CODE \ + func + + +/* Performance-related instrumentation. See polybench.c */ +# define polybench_start_instruments +# define polybench_stop_instruments +# define polybench_print_instruments + + +/* PAPI support. 
*/ +# ifdef POLYBENCH_PAPI +extern const unsigned int polybench_papi_eventlist[]; +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_set_papi_thread_report(x) \ + polybench_papi_counters_threadid = x; +# define polybench_start_instruments \ + polybench_prepare_instruments(); \ + polybench_papi_init(); \ + int evid; \ + for (evid = 0; polybench_papi_eventlist[evid] != 0; evid++) \ + { \ + if (polybench_papi_start_counter(evid)) \ + continue; \ + +# define polybench_stop_instruments \ + polybench_papi_stop_counter(evid); \ + } \ + polybench_papi_close(); \ + +# define polybench_print_instruments polybench_papi_print(); +# endif + + +/* Timing support. */ +# if defined(POLYBENCH_TIME) || defined(POLYBENCH_GFLOPS) +# undef polybench_start_instruments +# undef polybench_stop_instruments +# undef polybench_print_instruments +# define polybench_start_instruments polybench_timer_start(); +# define polybench_stop_instruments polybench_timer_stop(); +# define polybench_print_instruments polybench_timer_print(); +extern double polybench_program_total_flops; +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +/* Function declaration. */ +# ifdef POLYBENCH_TIME +extern void polybench_timer_start(); +extern void polybench_timer_stop(); +extern void polybench_timer_print(); +# endif + +# ifdef POLYBENCH_PAPI +extern void polybench_prepare_instruments(); +extern int polybench_papi_start_counter(int evid); +extern void polybench_papi_stop_counter(int evid); +extern void polybench_papi_init(); +extern void polybench_papi_close(); +extern void polybench_papi_print(); +# endif + +/* Function prototypes. 
*/ +extern void* polybench_alloc_data(unsigned long long int n, int elt_size); + + +#endif /* !POLYBENCH_H */ diff --git a/test/kernels/spmv+conv/run.sh b/test/kernels/spmv+conv/run.sh new file mode 100755 index 00000000..9598bd9d --- /dev/null +++ b/test/kernels/spmv+conv/run.sh @@ -0,0 +1 @@ +opt-12 -load ../../../build/src/libmapperPass.so -mapperPass kernel.bc diff --git a/test/kernels/spmv+conv/spmv+conv.c b/test/kernels/spmv+conv/spmv+conv.c new file mode 100644 index 00000000..fc1d7239 --- /dev/null +++ b/test/kernels/spmv+conv/spmv+conv.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include "polybench.h" +#include "conv.h" +#define SIZE 10000 + +int nnz = 400000; +int val[SIZE]; +int col[SIZE]; +int row[SIZE]; +int feature[SIZE]; +int output[SIZE]; + +int kernel(int nnz, int val[], int col[], int row[], int feature[], int output[], + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj)); + +int main() +{ + + // conv + POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj); + POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj); + + kernel(nnz, val, col, row, feature, output, POLYBENCH_ARRAY(C), + POLYBENCH_ARRAY(A), + POLYBENCH_ARRAY(B)); + +// output_dsp (input, NTAPS, 0); +// output_dsp (coefficients, NTAPS, 0); +// output_dsp (output, NTAPS, 0); + return 0; +} + +int kernel(int nnz, int val[], int col[], int row[], int feature[], int output[], + DATA_TYPE POLYBENCH_2D(C,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj), + DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj)) +{ + int i = 0; + int temp; + + // conv + int x,y; + int out = 0; + + //#pragma clang loop unroll_count(4) + for (i = 0; i < nnz; ++i) { + // spmv + temp = val[i] * feature[ col[i] ]; + output[ row[i] ] += temp; + // conv + x = i / NI; + y = i % NJ; + out += A [x][y] * B[x][y]; + } + return out; +} \ No newline at end of file diff --git a/tools/expandable/NeuraDemo.py 
b/tools/expandable/NeuraDemo.py deleted file mode 100644 index 5ad000f7..00000000 --- a/tools/expandable/NeuraDemo.py +++ /dev/null @@ -1,710 +0,0 @@ -# ---------------------------------------------------------------------------- -# Filename: SORAdemo.py / -# Description: simulate multi-kernel running on multi-CGRA / -# Author: Miaomiao Jiang, start from 2025-02-24 / -# ---------------------------------------------------------------------------- - -import heapq -import subprocess -import json -import eventlet # for time out -import pandas as pd -import math - -# ---------------------------------------------------------------------------- -# global variables / -# ---------------------------------------------------------------------------- - -TEST_BENCHS = ["fir.cpp", "latnrm.c", "fft.c", "dtw.cpp", "spmv.c", "conv.c", "relu.c", "histogram.cpp", "mvt.c", "gemm.c"] -TEST_BENCHS_NUM = len(TEST_BENCHS) -DICT_CSV = {'kernels': "", 'DFG nodes': "", 'DFG edges': "", 'recMII': "", 'mappingII': "", 'expandableII': ""} # column names of generated CSV -DICT_COLUMN = len(DICT_CSV) -JSON_NAME = "./param.json" # name of generated json file -TIME_OUT_SET = 180 -DO_MAPPING = True -KERNEL_DIRECTORY = "../../test/kernels" - - - -# ---------------------------------------------------------------------------- -# class defination / -# ---------------------------------------------------------------------------- - - - -class Kernel: - def __init__(self, kernel_name, kernel_id, arrive_period, unroll_factor, vector_factor, total_iterations, cgra_rows, cgra_columns): - """ - Initialize an instance of the Kernel class. - - Parameters: - kernel_name (str): The name of the kernel. - kernel_id (int): The ID of the kernel. - arrive_period (int): The period at which the same kernel will arrive again. - unroll_factor (int): The unroll factor of the kernel. - vector_factor (int): The vector factor of the kernel. - total_iterations (int): The total number of iterations of the kernel. 
- cgra_rows (int): The number of rows in the CGRA. - cgra_columns (int): The number of columns in the CGRA. - """ - self.kernel_name = kernel_name - self.kernel_id = kernel_id - self.arrive_period = arrive_period - self.unroll_factor = unroll_factor - self.vector_factor = vector_factor - self.df = pd.DataFrame(DICT_CSV, index=[0]) - self.ii_1 = None # II when using 1 CGRA, actual II - self.ii_2 = None # II when using 2 CGRAs, expandable II - self.total_iterations = math.ceil(total_iterations / (self.unroll_factor*self.vector_factor)) - self.rows = cgra_rows - self.columns = cgra_columns - if DO_MAPPING: - self.get_ii() # Perform mapping and populate attributes - else: - self.read_ii() # Read from existing csv - print(f"Kernel {self.kernel_name} initialized with arrive_period={self.arrive_period}, unroll_factor={self.unroll_factor}") - - def __lt__(self, other): - """ - Compare two Kernel by id. - """ - return self.kernel_id < other.kernel_id - - def comp_kernel(self): - """ - This is a func compile a kernel using clang with selected unrolling factor. - - Returns: function name of kernel. 
- """ - file_source = (self.kernel_name.split("."))[0] - - if self.unroll_factor == 1 and self.vector_factor == 1: - compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - elif self.unroll_factor == 1 and self.vector_factor != 1: - compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -O3 -mllvm -force-vector-width={self.vector_factor} -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - elif self.unroll_factor != 1 and self.vector_factor == 1: - compile_command = f"clang-12 -emit-llvm -funroll-loops -mllvm -unroll-count={self.unroll_factor} -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}" - else: - print("Error, invalid unroll and vector factor combination.") - return - - compile_proc = subprocess.Popen([compile_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - (compile_out, compile_err) = compile_proc.communicate() - - disassemble_command = "llvm-dis-12 kernel.bc -o ./kernel.ll" - disassemble_proc = subprocess.Popen([disassemble_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - (disassemble_out, disassemble_err) = disassemble_proc.communicate() - - - if compile_err: - print(f"Compile warning message for {self.kernel_name}: {compile_err}") - if disassemble_err: - print(f"Disassemble error message for {self.kernel_name}: {disassemble_err}") - return - - # collect the potentially targeting kernel/function from kernel.ll - ir_file = open('kernel.ll', 'r') - ir_lines = ir_file.readlines() - - # strips the newline character - for line in ir_lines: - if "define " in line and "{" in line and "@" in line: - func_name = line.split("@")[1].split("(")[0] - if "kernel" in func_name: - target_kernel = func_name - break - - ir_file.close() - print(f"Target kernel function for {self.kernel_name}: {target_kernel}") - return target_kernel - - def map_kernel(self): - """ - 
This is a func for mapping a kernel and gain information during mapping. - - Returns: NULL - """ - get_map_command = "opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc" - gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - dataS = [] # for get results from subprocess and output to pandas - kernels_source = (self.kernel_name.split("."))[0] - dataS.append(kernels_source) - - try: - eventlet.monkey_patch() - with eventlet.Timeout(TIME_OUT_SET, True): - with gen_map_proc.stdout: - gen_map_proc.stdout.flush() - for line in iter(gen_map_proc.stdout.readline, b''): - output_line = line.decode("ISO-8859-1") - if "DFG node count: " in output_line: - dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0])) - dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0])) - if "[RecMII: " in output_line: - dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0])) - if "[Mapping II: " in output_line: - self.ii_1 = int(output_line.split("[Mapping II: ")[1].split("]")[0]) - dataS.append(self.ii_1) - if "[ExpandableII: " in output_line: - self.ii_2 = int(output_line.split("[ExpandableII: ")[1].split("]")[0]) - dataS.append(self.ii_2) - - except eventlet.timeout.Timeout: - dataS = [0]*(DICT_COLUMN) - print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60 , "minute(s).") - - if len(dataS) != DICT_COLUMN: - dataS.extend([0]*(DICT_COLUMN-len(dataS))) - - print(dataS) - self.df.loc[len(self.df.index)] = dataS - - def map_kernel_skip(self): - """ - This is a func gain DFG information only without mapping. - - Returns: NULL - """ - get_map_command = "opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc" - gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - # Holds the results from subprocess and output to pandas. 
- dataS = [] - kernels_source = (self.kernel_name.split("."))[0] - dataS.append(kernels_source) - # The first 4 element of dataS is not empty: kernelsSource, DFG node count, DFG edge count, RecMII. - k_data_s_head = 4 - - try: - eventlet.monkey_patch() - with eventlet.Timeout(TIME_OUT_SET, True): - with gen_map_proc.stdout: - gen_map_proc.stdout.flush() - for line in iter(gen_map_proc.stdout.readline, b''): - output_line = line.decode("ISO-8859-1") - if "DFG node count: " in output_line: - dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0])) - dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0])) - if "[RecMII: " in output_line: - dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0])) - dataS.extend([0]*(DICT_COLUMN-k_data_s_head)) - break - - except eventlet.timeout.Timeout: - dataS = [0]*(DICT_COLUMN) - print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60, "minute(s).") - - print(dataS) - self.df.loc[len(self.df.index)] = dataS - - def get_ii(self): - """ - This is a func to compile, run and map kernels under sora_json and store the mapping result in csv - - Returns: name of the csv that collects information of mapped kernels - """ - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv' - print("Generating", csv_name) - target_kernel = self.comp_kernel() - - sora_json = { - "kernel": target_kernel, - "targetFunction": False, - "targetNested": False, - "targetLoopsID": [0], - "doCGRAMapping": DO_MAPPING, - "row": self.rows, - "column": self.columns, - "precisionAware": False, - "fusionStrategy": ["default_heterogeneous"], - "isTrimmedDemo": True, - "heuristicMapping": True, - "parameterizableCGRA": False, - "vectorizationMode": "all", - "bypassConstraint": 4, - "isStaticElasticCGRA": False, - "ctrlMemConstraint": 10, - "regConstraint": 8, - "incrementalMapping" : False, - 
"vectorFactorForIdiv " : 1, - "testingOpcodeOffset" : 0, - "additionalFunc" : { - "complex-Ctrl" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], - "div" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - } - } - - json_object = json.dumps(sora_json, indent=4) - - with open(JSON_NAME, "w") as outfile: - outfile.write(json_object) - if DO_MAPPING: - self.map_kernel() - else: - self.map_kernel_skip() - - self.df.to_csv(csv_name) - return csv_name - - def read_ii(self): - """ - This is a func to read from csv generated from get_ii() - - Returns: csv_name - """ - if self.vector_factor > 8: - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector8.csv' - else: - csv_name = f'./tmp/t_{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv' - - try: - df = pd.read_csv(csv_name) - self.ii_1 = int(df['mappingII'].iloc[1]) # the first data line - print(self.ii_1) - self.ii_2 = int(df['expandableII'].iloc[1]) # the first data line - print(self.ii_2) - except FileNotFoundError: - print(f"CSV file {csv_name} not found.") - except ValueError: - print(f"Error extracting II values from {csv_name}.") - - return csv_name - - def return_ii(self, num_cgras): - """ - Get the initiation interval (II) based on the number of CGRAs allocated. - - Parameters: - num_cgras (int): Number of CGRAs allocated. - - Returns: - int: The initiation interval (II). - """ - if num_cgras == 1: - return self.ii_1 - elif num_cgras == 2: - return self.ii_2 - else: - raise ValueError("Number of CGRAs must be 1 or 2.") - - def return_total_iterations(self): - """ - Total iterations for the kernel, affected by unroll_factor and vector_factor - - Returns: - int: Total iterations. - """ - return self.total_iterations - - def create_instance(self, arrival_time): - """ - Create a KernelInstance based on the current kernel. - - Parameters: - arrival_time (int): The time at which the instance arrives. 
- - Returns: - KernelInstance: A new instance of the kernel. - """ - return KernelInstance(self, arrival_time) - - -class KernelInstance: - def __init__(self, kernel, arrival_time): - """ - Initialize a KernelInstance. - - Parameters: - kernel (Kernel): The kernel from which this instance is created. - arrival_time (int): The time at which the instance arrives. - """ - self.kernel = kernel - self.arrival_time = arrival_time - self.start_time = None - self.allocated_cgras = 0 - self.ii = None - self.end_time = None - self.is_valid = True - self.pure_execution_time = 0 # Track pure execution time for this instance - self.pure_waiting_time = 0 # Track pure waiting time for this instance - # Determine the maximum number of CGRAs that can be allocated - self.max_allocate_cgra = 9 - - def __lt__(self, other): - """ - Compare two KernelInstance instances by arrival time. - """ - return self.arrival_time < other.arrival_time - - def calculate_execution_time(self): - """ - Calculate the execution time based on the number of allocated CGRAs - at the beginning running time of current kernel. It may change after. - - Returns: - int: Total execution time in cycles. 
- """ - # if self.vector_factor = 8, then when allocate_cgra <= 2, self.ii = ii_1, when 2 < allocate_cgra <= 4, self.ii = ii_2 - if self.kernel.vector_factor == 8: - if self.allocated_cgras == 1: - # cgra tile only support vector = 4 - # TODO: self.kernel.ii_1/2 - self.ii = self.kernel.ii_1 - elif self.allocated_cgras == 2: - self.ii = self.kernel.ii_1 - else: - # TODO: self.kernel.ii_3/2 - self.ii = self.kernel.ii_2 - else: - if self.allocated_cgras == 1: - self.ii = self.kernel.ii_1 - elif self.allocated_cgras == 2: - self.ii = self.kernel.ii_2 - else: - raise ValueError(f"Number of CGRAs must be between 1 and {self.max_allocate_cgra}.") - execution_time = self.kernel.total_iterations * self.ii - print(f"Calculated execution time for {self.kernel.kernel_name}: {execution_time} cycles (II={self.ii}, iterations={self.kernel.total_iterations})") - return execution_time - - def copy_with_valid(self): - """ - Create a copy of the current instance and set is_valid to True. - - Returns: - KernelInstance: A new instance copy. - """ - new_instance = KernelInstance(self.kernel, self.arrival_time) - new_instance.start_time = self.start_time - new_instance.allocated_cgras = self.allocated_cgras - new_instance.ii = self.ii - new_instance.end_time = self.end_time - new_instance.is_valid = True - new_instance.pure_execution_time = self.pure_execution_time - new_instance.pure_waiting_time = self.pure_waiting_time - new_instance.max_allocate_cgra = self.max_allocate_cgra - return new_instance - - -# ---------------------------------------------------------------------------- -# function defination / -# ---------------------------------------------------------------------------- - -def allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime): - """ - Allocate CGRAs to a kernel instance. - - Parameters: - instance (KernelInstance): The kernel instance to allocate CGRAs to. 
- current_time (int): The current simulation time. - available_cgras (int): The number of available CGRAs. - events (list): The event queue. - running_instances (list): The list of currently running instances. - runned_kernel_names (list): The list of names of the kernels that have been run. - total_cgra_runtime (float): The total runtime of all CGRAs. - - Returns: - int: The updated number of available CGRAs. - float: The updated total runtime of all CGRAs. - """ - runned_kernel_names.append(instance.kernel.kernel_name) - allocate_cgras = min(instance.max_allocate_cgra, available_cgras) - available_cgras -= allocate_cgras - instance.start_time = current_time - instance.allocated_cgras = allocate_cgras - execution_time = instance.calculate_execution_time() - instance.end_time = current_time + execution_time - instance.pure_waiting_time = instance.start_time - instance.arrival_time # Record pure waiting time - print(f"Allocated {allocate_cgras} CGRAs to {instance.kernel.kernel_name} at time {current_time}. Execution will end at {instance.end_time}") - heapq.heappush(events, (instance.end_time, 'end', instance, instance)) - running_instances.append(instance) - total_cgra_runtime += allocate_cgras * execution_time - return available_cgras, total_cgra_runtime - - -def release(instance, current_time, available_cgras, running_instances, completed_instances, kernel_latency, total_cgra_runtime): - """ - Release the CGRAs occupied by a kernel instance. - - Parameters: - instance (KernelInstance): The kernel instance to release CGRAs from. - current_time (int): The current simulation time. - available_cgras (int): The number of available CGRAs. - running_instances (list): The list of currently running instances. - completed_instances (list): The list of completed instances. - kernel_latency (dict): A dictionary used to track the total latency of each kernel. - total_cgra_runtime (float): The total runtime of all CGRAs. 
- - Returns: - int: The updated number of available CGRAs. - float: The updated total runtime of all CGRAs. - """ - available_cgras += instance.allocated_cgras - completed_instances.append(instance) - if instance in running_instances: - running_instances.remove(instance) - # Update per-kernel overall latency - instance.end_time = current_time - latency = instance.end_time - instance.start_time - instance.pure_execution_time = instance.end_time - instance.start_time # Record pure execution time - kernel_latency[instance.kernel.kernel_name] += latency - print(f"Released {instance.allocated_cgras} CGRAs from {instance.kernel.kernel_name} at time {current_time}. Latency added: {latency} cycles") - return available_cgras, total_cgra_runtime - - -def re_allocate(instance, current_time, available_cgras, events, total_cgra_runtime): - """ - Re-allocate additional CGRAs to a kernel instance if possible. - - Parameters: - instance (KernelInstance): The kernel instance to re-allocate CGRAs to. - available_cgras (int): Number of available CGRAs. - events (list): The event queue. - current_time (int): The current simulation time. - total_cgra_runtime (float): Total runtime of all CGRAs. - - Returns: - int: Updated number of available CGRAs. - float: Updated total runtime of all CGRAs. 
- """ - if instance.allocated_cgras < instance.max_allocate_cgra and available_cgras > 0: - possible_alloc = min(instance.max_allocate_cgra - instance.allocated_cgras, available_cgras) - # Update allocation - instance.allocated_cgras += possible_alloc - available_cgras -= possible_alloc - # Recalculate remaining iterations - elapsed_time = current_time - instance.start_time - completed_iters = elapsed_time // instance.ii - remaining_iters = instance.kernel.total_iterations - completed_iters - # Update II - if instance.allocated_cgras == 1: - instance.ii = instance.kernel.ii_1 - elif instance.allocated_cgras in [2, 3, 4]: - instance.ii = instance.kernel.ii_2 - new_execution_time = remaining_iters * instance.ii - # Schedule new end event - new_end_time = current_time + new_execution_time - instance.end_time = new_end_time - print(f"Re-allocated {possible_alloc} CGRAs to {instance.kernel.kernel_name} at time {current_time}. New end time: {new_end_time}") - # Create a new valid instance for the new end event - new_instance = instance.copy_with_valid() # Assume there is a copy method in KernelInstance class - heapq.heappush(events, (new_end_time, 'end', new_instance, new_instance)) - instance.is_valid = False # Old instance is invalid - total_cgra_runtime += possible_alloc * new_execution_time - # Invalidate old end event by leaving it in the heap but ignoring when processed - else: - print(f"Re-allocated CGRAs to {instance.kernel.kernel_name} at time {current_time} Failed.") - return available_cgras, total_cgra_runtime - - -def simulate(num_cgras, kernels, priority_bosting, lcm_time=80000000): - """ - Simulate the execution of multiple kernels on a CGRA architecture. - - Parameters: - num_cgras (int): The number of CGRAs in the CGRA architecture. - kernels (list of Kernel): The list of kernels to simulate. - priority_bosting (bool): Whether to enable priority boosting. - lcm_time (int): The least common multiple of the arrival periods. 
- - Returns: - dict: A dictionary that maps kernel names to their total latencies. - """ - available_cgras = num_cgras - events = [] # when a kernel arrives or ends, it is an event - current_time = 0 - waiting_instances = [] - running_instances = [] - completed_instances = [] - runned_kernel_names = [] - # Dictionary to store per-kernel arrival times - kernel_arrival_count = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel overall latency (cycle) - kernel_latency = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel execution time distribution - kernel_execution_distribution = {kernel.kernel_name: [] for kernel in kernels} - # Dictionary to store per-kernel waiting time distribution - kernel_waiting_distribution = {kernel.kernel_name: [] for kernel in kernels} - # Dictionary to store per-kernel ratio (iterations per cycle) - kernel_execution_ratio = {kernel.kernel_name: 0 for kernel in kernels} - # Dictionary to store per-kernel ratio (iterations per cycle) - kernel_waiting_ratio = {kernel.kernel_name: 0 for kernel in kernels} - total_cgra_runtime = 0 - arrive_times_list = {"fir.cpp": 12, "latnrm.c":4, "fft.c":10, "dtw.cpp":7, "spmv.c":6, "conv.c":8, "relu.c":5, "mvt.c":12, "gemm.c":2, "histogram.cpp":2} - - if priority_bosting: - print("\033[91mpriority_bosting is on\033[0m") - - for kernel in kernels: - print(f"Kernel {kernel.kernel_name} II_1={kernel.ii_1}, II_2={kernel.ii_2}, total_iterations={kernel.total_iterations}") - - # Schedule initial arrivals for all kernels - for kernel in kernels: - first_arrival = 0 - # heapq keeps a priority queue that contains (event_arrive_end_time (int), event_type (str), Kernel, KernelInstance (needed when 'end')) - heapq.heappush(events, (first_arrival, 'arrival', kernel, None)) - - while events: - event_time, event_type, kernel_or_instance, _ = heapq.heappop(events) - current_time = event_time - print(f"Processing event at time {current_time}: type={event_type}, 
kernel={kernel_or_instance.kernel_name if event_type == 'arrival' else kernel_or_instance.kernel.kernel_name}") - - if event_type == 'arrival': - kernel = kernel_or_instance - kernel_arrival_count[kernel.kernel_name] += 1 - # Create a new instance - instance = kernel.create_instance(current_time) - # Schedule next arrival if within lcm_time - next_arrival = current_time + kernel.arrive_period - if kernel_arrival_count[kernel.kernel_name] < arrive_times_list[kernel.kernel_name]: - heapq.heappush(events, (next_arrival, 'arrival', kernel, None)) - print(f"Scheduled next arrival for {kernel.kernel_name} at time {next_arrival}") - - - # Try to allocate CGRAs - if available_cgras >= 1: - available_cgras, total_cgra_runtime = allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) - else: - waiting_instances.append(instance) - print(f"No available CGRAs for {kernel.kernel_name}. Added to waiting queue.") - - elif event_type == 'end': - instance = kernel_or_instance - if not instance.is_valid: - # If instance is invalid, means it is re_allocated. 
- print(f"Ignoring invalid end event for {instance.kernel.kernel_name}") - continue - # Release CGRAs - available_cgras, total_cgra_runtime = release(instance, current_time, available_cgras, running_instances, completed_instances,kernel_latency, total_cgra_runtime) - - # Update execution time distribution - kernel_execution_distribution[instance.kernel.kernel_name].append(instance.pure_execution_time) - kernel_waiting_distribution[instance.kernel.kernel_name].append(instance.pure_waiting_time) - - # Check waiting queue - while waiting_instances and available_cgras >= 1: - instance = waiting_instances.pop(0) - print(f"Allocating CGRAs to waiting instance {instance.kernel.kernel_name}") - available_cgras, total_cgra_runtime = allocate(instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) - - # Check running instances for possible re-allocation - if priority_bosting: - for running in running_instances: - available_cgras, total_cgra_runtime = re_allocate(running, current_time, available_cgras, events, total_cgra_runtime) - - # Calculate ratio for each kernel - for kernel in kernels: - total_execution_time = sum( - [inst.pure_execution_time for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) - total_waiting_time = sum( - [inst.pure_waiting_time for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) - total_time = total_execution_time + total_waiting_time - kernel_execution_ratio[kernel.kernel_name] = total_execution_time / total_time if total_time > 0 else 0 - kernel_waiting_ratio[kernel.kernel_name] = total_waiting_time / total_time if total_time > 0 else 0 - - # Calculate utilization of total CGRAs - cgra_utilization = total_cgra_runtime / (current_time * num_cgras) - overall_latency = current_time # when all kernels are done - - print(f"Simulation completed. 
Kernel latencies: {kernel_latency}") - print(f"Kernel execution_ratio: {kernel_execution_ratio}") - print(f"Kernel execution time distributions: {kernel_execution_distribution}") - print(f"Kernel Runned List: {runned_kernel_names}") - print(f"CGRA utilization: {cgra_utilization}") - print(f"overall latency: {overall_latency}") - return kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency - - -def run_multiple_simulations_and_save_to_csv(kernels_list, csvname, priority_bosting, num_cgras=9): - """ - Run multiple simulations and save the results to a CSV file. - - Parameters: - kernels_list (list of list of Kernel): A list of kernels. - csvname (str): The name of the CSV file. - priority_bosting (bool): Whether to enable priority boosting. - num_cgras (int): The number of CGRAs, default 9. - """ - for i, kernels in enumerate(kernels_list, start = 1): - kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency = simulate(num_cgras, kernels, priority_bosting) - - # Calculate fastest, slowest, and average execution time per kernel - execution_stats = {} - for kernel_name, execution_times in kernel_execution_distribution.items(): - if execution_times: - fastest = min(execution_times) - slowest = max(execution_times) - average = sum(execution_times) / len(execution_times) - total = sum(execution_times) - execution_stats[kernel_name] = { - "fastest_execution_time": fastest, - "slowest_execution_time": slowest, - "average_execution_time": average, - "total_execution_time": total - } - - # Calculate fastest, slowest, and average waiting time per kernel - waiting_stats = {} - for kernel_name, waiting_times in kernel_waiting_distribution.items(): - if waiting_times: - fastest = min(waiting_times) - slowest = max(waiting_times) - average = sum(waiting_times) / len(waiting_times) - 
total = sum(waiting_times) - waiting_stats[kernel_name] = { - "fastest_waiting_time": fastest, - "slowest_waiting_time": slowest, - "average_waiting_time": average, - "total_waiting_time": total - } - - all_results = [] - for kernel in kernels: - kernel_name = kernel.kernel_name - result = { - "Kernel_Name": kernel_name, - "Arrive_Period": kernel.arrive_period, - "Unroll_Factor": kernel.unroll_factor, - "Vector_Factor": kernel.vector_factor, - "fastest_execution_time": execution_stats.get(kernel_name, {}).get("fastest_execution_time", None), - "slowest_execution_time": execution_stats.get(kernel_name, {}).get("slowest_execution_time", None), - "Average_Execution_Time": execution_stats.get(kernel_name, {}).get("average_execution_time", None), - "fastest_waiting_time": waiting_stats.get(kernel_name, {}).get("fastest_waiting_time", None), - "slowest_waiting_time": waiting_stats.get(kernel_name, {}).get("slowest_waiting_time", None), - "Average_Waiting_Time": waiting_stats.get(kernel_name, {}).get("average_waiting_time", None), - "Total_Execution_Time": execution_stats.get(kernel_name, {}).get("total_execution_time", None), - "Total_Waiting_Time": waiting_stats.get(kernel_name, {}).get("total_waiting_time", None), - "Execution_Time Ratio": kernel_execution_ratio[kernel_name], - "Waiting_Time Ratio": kernel_waiting_ratio[kernel_name], - "Overall_Case_Latency": overall_latency, - "CGRA Utilization": cgra_utilization, - "Total_Execution_Time Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_time", None))/overall_latency, - "Total_Waiting_Time Ratio": (waiting_stats.get(kernel_name, {}).get("total_waiting_time", None))/overall_latency, - "Total_Latency Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_time", None) + waiting_stats.get(kernel_name, {}).get("total_waiting_time", None))/overall_latency - } - all_results.append(result) - - - df = pd.DataFrame(all_results) - file_name = f'simulation_{csvname}_case{i}.csv' - df.to_csv(file_name, 
index=False) - - -if __name__ == "__main__": - baselineCase1=[ - [ - Kernel(kernel_name="fir.cpp", kernel_id =0, arrive_period =1500000, unroll_factor =1,vector_factor =1, total_iterations =300000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="conv.c", kernel_id =5, arrive_period =2500000, unroll_factor =1,vector_factor =1, total_iterations =400000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="relu.c", kernel_id =6, arrive_period =4000000, unroll_factor =1,vector_factor =1, total_iterations =1000000, cgra_rows= 12, cgra_columns=12) , - Kernel(kernel_name="histogram.cpp", kernel_id =7, arrive_period =1200000, unroll_factor =1,vector_factor =1, total_iterations =262144, cgra_rows= 12, cgra_columns=12) , - ] - ] - taskCase1 = [ - [ - Kernel(kernel_name="fir.cpp", kernel_id =0, arrive_period =300000, unroll_factor =1,vector_factor =8, total_iterations =300000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="conv.c", kernel_id =5, arrive_period =400000, unroll_factor =1,vector_factor =8, total_iterations =400000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="relu.c", kernel_id =6, arrive_period =1000000, unroll_factor =1,vector_factor =8, total_iterations =1000000, cgra_rows= 4, cgra_columns=4) , - Kernel(kernel_name="histogram.cpp", kernel_id =7, arrive_period =262144, unroll_factor =1,vector_factor =8, total_iterations =262144, cgra_rows= 4, cgra_columns=4) , - ] - ] - run_multiple_simulations_and_save_to_csv(baselineCase1, "Baseline", priority_bosting = True, num_cgras=1) # one cgra is 4x4 - run_multiple_simulations_and_save_to_csv(taskCase1, "NoBosting", priority_bosting = False, num_cgras=9) - run_multiple_simulations_and_save_to_csv(taskCase1, "Bosting", priority_bosting = True, num_cgras=9) diff --git a/tools/expandable/README.md b/tools/expandable/README.md new file mode 100644 index 00000000..84888f4e --- /dev/null +++ b/tools/expandable/README.md @@ -0,0 +1,29 @@ +# Strcture +tools/ +└── expandable/ + ├── README.md # This 
file + ├── __init__.py + ├── main.py # Neura demo script + ├── designs/ # input json for Neura Scalibility evaluation + │ ├── 2x2baseline.json + │ ├── 2x2task.json + │ ├── 3x3task.json + │ ├── 4x4task.json + │ └── 5x5task.json + ├── util/ # Utility modules + │ ├── __init__.py + │ ├── scheduler.py # Kernel mapping and task scheduling + │ └── visualizer.py # Result visualization + ├── fig/ # Generated figures + ├── result/ # Scheduling results + └── tmp/ # Kernel mapping results + +# Core components +- main.py generates simulated real-world tasks and models the execution progress across different evaluation settings. +- scheduler.py recieves tasks and generates kernel mapping information (stored in /tmp/). It also outputs scheduling results to /result/ directory. +- visulization.py reads csv from /result/ and generates paper figures in /figs/ directory. + +# Outputs +- /fig/Fig9.png: Normalized execution time and improved utilization +- /fig/Fig10.png: Normalized throughput speedup +- /fig/Fig11.png: Scalability -- Normalized execution time and improved utilization \ No newline at end of file diff --git a/tools/expandable/__init__.py b/tools/expandable/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/expandable/demo.sh b/tools/expandable/demo.sh index b4c22d68..1136e515 100755 --- a/tools/expandable/demo.sh +++ b/tools/expandable/demo.sh @@ -1,6 +1,26 @@ #!/usr/bin/env bash -source /WORK_REPO/venv/bin/activate +# source /WORK_REPO/venv/bin/activate +while [[ $# -gt 0 ]]; do + case $1 in + -t|--test) + TEST_FLAG="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + exit 1 + ;; + esac +done + +TEST_FLAG=${TEST_FLAG:-n} + +rm -r ./tmp +rm -r ./result +rm -r ./fig mkdir ./tmp +mkdir ./result +mkdir ./fig -python NEURAdemo.py +python main.py --test=$TEST_FLAG \ No newline at end of file diff --git a/tools/expandable/designs/2x2baseline.json b/tools/expandable/designs/2x2baseline.json new file mode 100644 index 00000000..ac42ab2f --- 
/dev/null +++ b/tools/expandable/designs/2x2baseline.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 1, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + 
"cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": 
"gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + 
"kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 2, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 8, + "cgra_columns": 8 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/2x2task.json b/tools/expandable/designs/2x2task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/2x2task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", 
+ "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + 
"unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + 
"total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + 
"total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/3x3task.json b/tools/expandable/designs/3x3task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/3x3task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 
65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, 
+ { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + 
"kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + 
"arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/4x4task.json b/tools/expandable/designs/4x4task.json new file mode 100644 index 00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/4x4task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 
5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + 
"vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + 
"total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/designs/5x5task.json b/tools/expandable/designs/5x5task.json new file mode 100644 index 
00000000..ae9162eb --- /dev/null +++ b/tools/expandable/designs/5x5task.json @@ -0,0 +1,372 @@ +[ + { + "kernel_name": "fir.cpp", + "kernel_id": 7, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2048, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "latnrm.c", + "kernel_id": 8, + "arrive_period": 327680, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 1280, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 2, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 4, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "spmv.c", + "kernel_id": 3, + "arrive_period": 819200, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 65536, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "conv.c", + "kernel_id": 1, + "arrive_period": 6553600, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 655360, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 5, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 0, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 6, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 12, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + 
"cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 9, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 10, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 11, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 13, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 14, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 15, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + 
"kernel_name": "gemm.c", + "kernel_id": 16, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 17, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 18, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 19, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "gemm.c", + "kernel_id": 20, + "arrive_period": 5242880, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 2097152, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 29, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 30, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "fft.c", + "kernel_id": 31, + "arrive_period": 327680, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 112640, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "dtw.cpp", + "kernel_id": 32, + "arrive_period": 819200, + "unroll_factor": 1, + "vector_factor": 16, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 21, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": 
"relu+histogram.c", + "kernel_id": 22, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 23, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "relu+histogram.c", + "kernel_id": 24, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 262144, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 25, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 26, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 27, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + }, + { + "kernel_name": "mvt.c", + "kernel_id": 28, + "arrive_period": 204800, + "unroll_factor": 4, + "vector_factor": 1, + "total_iterations": 16384, + "cgra_rows": 4, + "cgra_columns": 4 + } +] \ No newline at end of file diff --git a/tools/expandable/main.py b/tools/expandable/main.py new file mode 100644 index 00000000..9aeb8e6d --- /dev/null +++ b/tools/expandable/main.py @@ -0,0 +1,320 @@ +# ---------------------------------------------------------------------------- +# Filename: main.py / +# Description: load multi-task and schedule them on multi-CGRA / +# ---------------------------------------------------------------------------- + +import argparse +import json +import os +from pathlib import Path +import time +import util.scheduler as scheduler +import util.visualizer as visualizer + +# 
---------------------------------------------------------------------------- +# global variables / +# ---------------------------------------------------------------------------- +VISUALIZATION = True +TESTME = False + +# Static kernel data (name: (sort_id, total_iterations, static_execution_time)) +KERNEL_DATA = { + "fir.cpp": (7, 2048, 4096), + "latnrm.c": (8, 1280, 2560), + "fft.c": (2, 112640, 450560), + "dtw.cpp": (4, 16384, 49152), + "spmv.c": (3, 65536, 262144), + "conv.c": (1, 655360, 1310720), + "mvt.c": (5, 16384, 49152), + "gemm.c": (0, 2097152, 8388608), + "relu+histogram.c": (6, 262144, 2097152) +} + +# Case configuration dictionary (task_id: [A_P, UNROLL_FACTORS, VECTOR_FACTORS]) +TASK_CONFIGS = { + 1: { + 'A_P': [81920, 81920, 81920, 327680, 327680, 1638400, 81920, 1638400, 81920], + 'UNROLL_FACTORS': [1]*9, + 'VECTOR_FACTORS': [1]*9 + }, + 2: { + 'A_P': [102400, 102400, 102400, 327680, 327680, 1638400, 163840, 1638400, 81920], + 'UNROLL_FACTORS': [1,2,2,1,2,2,2,1,1], + 'VECTOR_FACTORS': [4, 1, 1, 4, 1, 1, 1, 4, 1] + }, + 3: { + 'A_P': [102400, 102400, 102400, 409600, 409600, 2621440, 102400, 2621440, 81920], + 'UNROLL_FACTORS': [1,4,2,1,4,4,4,1,1], + 'VECTOR_FACTORS': [8, 1, 1, 8, 1, 1, 1, 8, 1] + }, + 4: { + 'A_P': [163840, 163840, 163840, 655360, 655360, 3276800, 163840, 3276800, 163840], + 'UNROLL_FACTORS': [1,4,1,2,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 1, 16, 1, 1, 1, 16, 1] + }, + 5: { + 'A_P': [204800, 204800, 204800, 819200, 819200, 5242880, 204800, 5242880, 204800], + 'UNROLL_FACTORS': [1,4,1,1,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 16, 16, 1, 1, 1, 16, 1] + }, + 6: { + 'A_P': [327680, 327680, 327680, 819200, 819200, 6553600, 204800, 5242880, 204800], + 'UNROLL_FACTORS': [1,4,1,1,4,4,4,1,1], + 'VECTOR_FACTORS': [16, 1, 16, 16, 1, 1, 1, 16, 1] + } +} + +# ---------------------------------------------------------------------------- +# function defination / +# ---------------------------------------------------------------------------- + 
+def str_to_bool(value): + if isinstance(value, bool): + return value + if str(value).lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif str(value).lower() in ('no', 'false', 'f', 'n', '0'): + return False + raise argparse.ArgumentTypeError('Invalid boolean value (accepted: 0/1, true/false, yes/no)') + + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser( + description='Multi-CGRA Task Scheduling Tool' + ) + # Core application arguments + parser.add_argument('--test', type=str_to_bool, default=TESTME, + help='Run tests in CI/CD [y/n]') + parser.add_argument('--cgra-config', type=int, default= 4, + help='Path to CGRA configuration file') + parser.add_argument('--json-name', type=str, default= "./param.json", + help='JSON configuration file name') + parser.add_argument('--kernel-directory', type=str, default= "../../test/kernels", + help='Kernel directory path') + parser.add_argument('--time-out-set', type=int, default= 180, + help='Timeout setting for operations') + parser.add_argument('--visualize', type=str_to_bool, default=VISUALIZATION, + help='Generate visualization figures [y/n]') + + return parser.parse_args() + + +def load_configuration(): + """Load and merge configurations from multiple sources with priority: + 1. Command line arguments (highest priority) + 2. 
Default values (lowest priority) + """ + # Update global configuration with command line arguments + global VISUALIZATION, TESTME + # Parse command line arguments + args = parse_arguments() + VISUALIZATION = args.visualize + TESTME = args.test + scheduler.init_args(args) + print(f"Test in CI/CD: {args.test}") + print(f"Timeout: {args.time_out_set}") + print(f"Visualization: {args.visualize}") + + +# ========== Task Loading Function ========== +def load_tasks(task_id, task_type="baseline"): + """ + Load task list based on task_id and CGRA type + + Args: + task_id: Configuration case ID + task_type: "baseline" or "task", corresponding to 12x12 and 4x4 CGRA respectively + + Returns: + task_list: List of task objects + """ + global TASK_CONFIGS, KERNEL_DATA + if task_id not in TASK_CONFIGS: + raise ValueError(f"Task{task_id} configuration does not exist") + + config = TASK_CONFIGS[task_id] + A_P = config['A_P'] + UNROLL_FACTORS = config['UNROLL_FACTORS'] + VECTOR_FACTORS = config['VECTOR_FACTORS'] + + # Validate parameter lengths + lists = [KERNEL_DATA, A_P, UNROLL_FACTORS, VECTOR_FACTORS] + if len(set(len(lst) for lst in lists if lst)) > 1: + raise ValueError(f"Task{task_id} parameter length mismatch: {[len(lst) for lst in lists]}") + + # Set CGRA dimensions + if task_type == "baseline": + cgra_rows, cgra_columns = 12, 12 + elif task_type == "task": + cgra_rows, cgra_columns = 4, 4 + else: + raise ValueError("task_type must be either 'baseline' or 'task'") + + # Generate task list + task_list = [] + for i, (kernel_name, (kernel_id, total_iters, _)) in enumerate(KERNEL_DATA.items()): + task = scheduler.Kernel( + kernel_name=kernel_name, + kernel_id=kernel_id, + arrive_period=A_P[i] if A_P else 0, + unroll_factor=UNROLL_FACTORS[i], + vector_factor=VECTOR_FACTORS[i], + total_iterations=total_iters, + cgra_rows=cgra_rows, + cgra_columns=cgra_columns + ) + task_list.append(task) + + return task_list + + +def run_simulation_for_case(task_id, num_task_cgras = 9, file_name = 
"NULL", load_from_file = False): + """ + Complete simulation workflow for specified case + + Args: + task_id: Configuration case ID to run simulation for + """ + print(f"[Step 2] Loading tasks for task {task_id}...") + + if load_from_file: + if file_name is '2x2': + # Load baseline tasks (12x12 CGRA) + baseline_tasks = load_tasks_from_file(f"./designs/{file_name}baseline.json") + # Load task tasks (4x4 CGRA) + task_tasks = load_tasks_from_file(f"./designs/{file_name}task.json") + else: + # Load baseline tasks (12x12 CGRA) + baseline_tasks = load_tasks(task_id, "baseline") + # Load task tasks (4x4 CGRA) + task_tasks = load_tasks(task_id, "task") + + if load_from_file: + case_id = file_name + '_' + str(task_id) + else: + case_id = task_id + + if (not load_from_file) or (file_name is '2x2'): + # Run baseline simulation + scheduler.run_multiple_simulations_and_save_to_csv( + baseline_tasks, + csv_name="Baseline", + priority_boosting=0, + kernel_case=case_id, + num_cgras=1 # one cgra is 12x12 + ) + + # Run task simulation + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L0", + priority_boosting=0, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L1", + priority_boosting=1, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura-L2", + priority_boosting=2, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + scheduler.run_multiple_simulations_and_save_to_csv( + task_tasks, + csv_name="Neura", + priority_boosting=3, + kernel_case=case_id, + num_cgras=num_task_cgras # 9 of 4x4 CGRAs + ) + + +def load_tasks_from_file(filename): + """ + Load task list from JSON file + + Args: + filename: Input JSON filename + + Returns: + task_list: List of reconstructed task objects + """ + if not os.path.exists(filename): + 
raise FileNotFoundError(f"Task file {filename} not found") + + with open(filename, 'r') as f: + tasks_data = json.load(f) + + # Reconstruct task objects from dictionaries + task_list = [] + for task_dict in tasks_data: + task = scheduler.Kernel( + kernel_name=task_dict['kernel_name'], + kernel_id=task_dict['kernel_id'], + arrive_period=task_dict['arrive_period'], + unroll_factor=task_dict['unroll_factor'], + vector_factor=task_dict['vector_factor'], + total_iterations=task_dict['total_iterations'], + cgra_rows=task_dict['cgra_rows'], + cgra_columns=task_dict['cgra_columns'] + ) + task_list.append(task) + + print(f"Tasks loaded from {filename}") + return task_list + + +def main(): + """Main workflow control function""" + start = time.time() + # 1. Load configuration (includes parsing arguments) + print("=== Multi-CGRA Task Scheduling Tool ===") + load_configuration() + + # 2. Create output directory + print(f"Intermediate result in: ./tmp") + output_dir = Path("./tmp") + output_dir.mkdir(parents=True, exist_ok=True) + + # 3. Execute scheduling + print("[Step 1] Loading tasks and Scheduling tasks on 4x4 Multi-CGRA...") + if TESTME: + run_simulation_for_case(1) + # run_simulation_for_case(task_id = 6, num_task_cgras=4, file_name="2x2", load_from_file=True) # 2x2 + else: + for task_case_id in TASK_CONFIGS: + run_simulation_for_case(task_case_id) + + # 4. Execute scheduling + print("[Step 2] Loading tasks and Scheduling tasks on 2x2, 3x3, 5x5 Multi-CGRA...") + run_simulation_for_case(task_id = 6, num_task_cgras=4, file_name="2x2", load_from_file=True) # 2x2 + run_simulation_for_case(task_id = 6, num_task_cgras=9, file_name="3x3", load_from_file=True) # 3x3 + run_simulation_for_case(task_id = 6, num_task_cgras=16, file_name="4x4", load_from_file=True) # 4x4 + run_simulation_for_case(task_id = 6, num_task_cgras=25, file_name="5x5", load_from_file=True) # 5x5 + + # 5. 
Generate visualization + if VISUALIZATION: # Use global variable + print(f"[Step 3] Generating visualization figures...") + + # Generate Fig9 + genFigs = visualizer.SimulationDataAnalyzer(kernel_data=KERNEL_DATA) + genFigs.genFig9("./fig/Fig9.png") + genFigs.genFig10("./fig/Fig10.png") + genFigs.genFig11("./fig/Fig11.png") + + + print("\n=== Scheduling completed successfully! ===") + end = time.time() + execution_time = end - start + print(f"Time cost: {execution_time/60:.2f} min") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/expandable/param.json b/tools/expandable/param.json new file mode 100644 index 00000000..ee865842 --- /dev/null +++ b/tools/expandable/param.json @@ -0,0 +1,100 @@ +{ + "kernel": "kernel", + "targetFunction": false, + "targetNested": false, + "targetLoopsID": [ + 0 + ], + "doCGRAMapping": true, + "row": 12, + "column": 12, + "precisionAware": false, + "fusionStrategy": [ + "default_heterogeneous" + ], + "isTrimmedDemo": true, + "heuristicMapping": true, + "parameterizableCGRA": false, + "vectorizationMode": "all", + "diagonalVectorization": false, + "bypassConstraint": 4, + "isStaticElasticCGRA": false, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping": false, + "vectorFactorForIdiv ": 1, + "testingOpcodeOffset": 0, + "additionalFunc": { + "complex-Ctrl": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "fptosi": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "div": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + "complex-BrT": [ + 4, + 5, + 6, + 7 + ], + "complex-CoT": [ + 8, + 9, + 10, + 11 + ] + }, + "supportDVFS": false, + "DVFSIslandDim": 1, + "DVFSAwareMapping": false, + "enablePowerGating": false, + "expandableMapping": true +} \ No newline at end of file diff --git a/tools/expandable/util/__init__.py b/tools/expandable/util/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/tools/expandable/util/scheduler.py b/tools/expandable/util/scheduler.py new file mode 100644 index 00000000..88c7ea30 --- /dev/null +++ b/tools/expandable/util/scheduler.py @@ -0,0 +1,932 @@ +# ---------------------------------------------------------------------------- +# Filename: scheduler.py / +# Description: simulate multi-kernel running on multi-CGRA / +# ---------------------------------------------------------------------------- + +import heapq +import os +import subprocess +import json +import eventlet # for time out +import pandas as pd +import math + +# ---------------------------------------------------------------------------- +# global variables / +# ---------------------------------------------------------------------------- + +DICT_CSV = {'kernels': "", 'DFG nodes': "", 'DFG edges': "", 'recMII': "", 'mappingII': "", 'expandableII': "", 'utilization': ""} # column names of generated CSV +DICT_COLUMN = len(DICT_CSV) +VECTOR_LANE = 2 +JSON_NAME = "./param.json" +TIME_OUT_SET = 180 +KERNEL_DIRECTORY = "../../test/kernels" + + +def init_args(args): + """init config""" + global JSON_NAME, TIME_OUT_SET, KERNEL_DIRECTORY + JSON_NAME = args.json_name + KERNEL_DIRECTORY = args.kernel_directory + TIME_OUT_SET = args.time_out_set + +# ---------------------------------------------------------------------------- +# class defination / +# ---------------------------------------------------------------------------- + +class Kernel: + def __init__(self, kernel_name, kernel_id, arrive_period, unroll_factor, vector_factor, total_iterations, cgra_rows, cgra_columns): + """ + Initialize an instance of the Kernel class. + + Parameters: + kernel_name (str): The name of the kernel. + kernel_id (int): The ID of the kernel. + arrive_period (int): The period at which the same kernel will arrive again. + unroll_factor (int): The unroll factor of the kernel. + vector_factor (int): The vector factor of the kernel. 
        total_iterations (int): The total number of iterations of the kernel.
        cgra_rows (int): The number of rows in the CGRA.
        cgra_columns (int): The number of columns in the CGRA.
        """
        self.kernel_name = kernel_name
        self.kernel_id = kernel_id
        self.arrive_period = arrive_period
        self.unroll_factor = unroll_factor
        self.vector_factor = vector_factor
        # Single-row frame seeded from the module-level CSV column template.
        self.df = pd.DataFrame(DICT_CSV, index=[0])
        self.base_ii = 0  # II when using 1 CGRA, actual II, if fused, base_ii is fused_ii
        self.expandable_ii = 0  # II when using 2 CGRAs, expandable II, if fused, expandable_ii is individual_ii
        self.utilization = 0
        # Unrolling/vectorization shrink the iteration count actually executed.
        self.total_iterations = math.ceil(total_iterations / (self.unroll_factor*self.vector_factor))
        self.rows = cgra_rows
        self.columns = cgra_columns
        self.load_data()


    def __lt__(self, other):
        """
        Order two Kernel objects by their numeric kernel_id.
        """
        return self.kernel_id < other.kernel_id

    def load_data(self):
        """
        Populate base_ii / expandable_ii / utilization for this kernel config:
        read the cached CSV under ./tmp when it exists, otherwise compile and
        map the kernel (get_ii) to generate it. Marks the kernel invalid when
        no mapping II was obtained (base_ii stayed 0).
        """
        prefix = './tmp/t_'
        csv_name = f'{prefix}{self.kernel_name}_{self.rows}x{self.columns}_unroll{self.unroll_factor}_vector{self.vector_factor}.csv'
        if os.path.exists(csv_name):
            self.read_ii(csv_name)
        else:
            self.get_ii(csv_name)

        self.is_valid = bool(self.base_ii)
        # print(f"Kernel {self.kernel_name} loaded with arrive_period={self.arrive_period}")

    def comp_kernel(self):
        """
        Compile one kernel source with clang-12 using the selected unroll and
        vector factors, disassemble the bitcode, and scan the IR for the
        target function.

        Returns: name of the kernel function found in kernel.ll, or None when
        the unroll/vector combination is invalid or disassembly failed.
        """
        file_source = (self.kernel_name.split("."))[0]
        # corner case: these configs are known not to compile/map as requested,
        # so silently fall back to a smaller unroll factor.
        if self.kernel_name == "conv.c" and self.unroll_factor == 4:
            self.unroll_factor = 2
        if self.kernel_name == "fft.c" and self.unroll_factor == 2:
            self.unroll_factor = 1
        if self.kernel_name == "relu+histogram.c" and self.unroll_factor == 4 and self.rows == 12:
            self.unroll_factor = 2
        if self.kernel_name == "spmv.c" and self.unroll_factor == 2 and self.rows == 4:
            self.unroll_factor = 1

        if self.unroll_factor == 1 and self.vector_factor == 1:
            compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        elif self.unroll_factor == 1 and self.vector_factor != 1:
            compile_command = f"clang-12 -emit-llvm -fno-unroll-loops -O3 -mllvm -force-vector-width={self.vector_factor} -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        elif self.unroll_factor != 1 and self.vector_factor == 1:
            compile_command = f"clang-12 -emit-llvm -funroll-loops -mllvm -unroll-count={self.unroll_factor} -fno-vectorize -O3 -o kernel.bc -c {KERNEL_DIRECTORY}/{file_source}/{self.kernel_name}"
        else:
            # Unrolling and vectorizing at the same time is not supported.
            # print("Error, invalid unroll and vector factor combination.")
            return

        # NOTE(review): with shell=True the list's extra '-u' element becomes a
        # shell positional parameter, not a clang flag — presumably harmless,
        # but confirm it was intentional.
        compile_proc = subprocess.Popen([compile_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        (compile_out, compile_err) = compile_proc.communicate()

        disassemble_command = f"llvm-dis-12 kernel.bc -o kernel.ll"
        disassemble_proc = subprocess.Popen([disassemble_command, '-u'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        (disassemble_out, disassemble_err) = disassemble_proc.communicate()


        if compile_err:
            # clang warnings also land on stderr, hence "warning" not "error".
            print(f"Compile warning message for {self.kernel_name}: {compile_err}")
        if disassemble_err:
            # print(f"Disassemble error message for {self.kernel_name}: {disassemble_err}")
            return

        # collect the potentially targeting kernel/function from kernel.ll
        ir_file = open(f'kernel.ll', 'r')
        ir_lines = ir_file.readlines()

        # Scan IR "define ... @name(" lines for a function whose name
        # contains "kernel".
        # NOTE(review): if no such function exists, target_kernel is unbound
        # and the return below raises NameError — worth guarding upstream.
        for line in ir_lines:
            if "define " in line and "{" in line and "@" in line:
                func_name = line.split("@")[1].split("(")[0]
                if "kernel" in func_name:
                    target_kernel = func_name
                    break

        ir_file.close()
        # print(f"Target kernel function for {self.kernel_name}: {target_kernel}")
        return target_kernel

    def map_kernel(self):
        """
        Run the mapper pass (opt-12 + libmapperPass.so) on kernel.bc and scrape
        its stdout for DFG statistics, RecMII, mapping II, expandable II and
        tile utilization. Results are appended as one row to self.df; the run
        is aborted via eventlet after TIME_OUT_SET seconds.

        Returns: NULL
        """
        get_map_command = f"opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc"
        gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        dataS = [] # for get results from subprocess and output to pandas
        kernels_source = (self.kernel_name.split("."))[0]
        dataS.append(kernels_source)

        try:
            eventlet.monkey_patch()
            with eventlet.Timeout(TIME_OUT_SET, True):
                with gen_map_proc.stdout:
                    gen_map_proc.stdout.flush()
                    # Read mapper output line by line until EOF (b'').
                    for line in iter(gen_map_proc.stdout.readline, b''):
                        output_line = line.decode("ISO-8859-1")
                        #print(output_line)
                        if "DFG node count: " in output_line:
                            dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0]))
                            dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0]))
                        if "[RecMII: " in output_line:
                            dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0]))
                        if "[Mapping II: " in output_line:
                            self.base_ii = int(output_line.split("[Mapping II: ")[1].split("]")[0])
                            dataS.append(self.base_ii)
                        if "[ExpandableII: " in output_line:
                            self.expandable_ii = int(output_line.split("[ExpandableII: ")[1].split("]")[0])
                            dataS.append(self.expandable_ii)
                        if "tile avg fu utilization: " in output_line:
                            # Clamp to 100%; mapper may report >100% once II-scaled.
                            self.utilization = min(float(output_line.split("avg overall utilization: ")[1].split("%")[0])/100,1)
                            dataS.append(self.utilization)
                        if "[Mapping Fail]" in output_line:
                            print(f"{self.kernel_name} mapping failed.")
        except eventlet.timeout.Timeout:
            # Timed out: record an all-zero row so the CSV stays rectangular.
            dataS = [0]*(DICT_COLUMN)
            # print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60 , "minute(s).")

        # Pad short rows (e.g. mapping failed mid-way) to the CSV width.
        if len(dataS) != DICT_COLUMN:
            dataS.extend([0]*(DICT_COLUMN-len(dataS)))

        self.df.loc[len(self.df.index)] = dataS


    def map_kernel_skip(self):
        """
        Run the mapper pass but harvest only the DFG statistics and RecMII,
        skipping the (slow) mapping itself; remaining CSV columns are zeroed.

        Returns: NULL
        """
        get_map_command = f"opt-12 -load ../../build/src/libmapperPass.so -mapperPass kernel.bc"
        gen_map_proc = subprocess.Popen([get_map_command, "-u"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        # Holds the results from subprocess and output to pandas.
        dataS = []
        kernels_source = (self.kernel_name.split("."))[0]
        dataS.append(kernels_source)
        # The first 4 element of dataS is not empty: kernelsSource, DFG node count, DFG edge count, RecMII.
        k_data_s_head = 4

        try:
            eventlet.monkey_patch()
            with eventlet.Timeout(TIME_OUT_SET, True):
                with gen_map_proc.stdout:
                    gen_map_proc.stdout.flush()
                    for line in iter(gen_map_proc.stdout.readline, b''):
                        output_line = line.decode("ISO-8859-1")
                        if "DFG node count: " in output_line:
                            dataS.append(int(output_line.split("DFG node count: ")[1].split(";")[0]))
                            dataS.append(int(output_line.split("DFG edge count: ")[1].split(";")[0]))
                        if "[RecMII: " in output_line:
                            dataS.append(int(output_line.split("[RecMII: ")[1].split("]")[0]))
                            # RecMII is the last value we need — zero-fill the
                            # rest of the row and stop reading.
                            dataS.extend([0]*(DICT_COLUMN-k_data_s_head))
                            break

        except eventlet.timeout.Timeout:
            dataS = [0]*(DICT_COLUMN)
            # print("Skipping a specific config for kernel: ", self.kernel_name, "Because it runs more than", TIME_OUT_SET/60, "minute(s).")

        self.df.loc[len(self.df.index)] = dataS

    def get_ii(self, csv_name):
        """
        Compile, map and measure this kernel configuration under a generated
        neura_json config, then store the mapping result in a CSV cache.

        Returns: name of the csv that collects information of mapped kernels
        """
        # print("Generating", csv_name)
print("Generating", csv_name) + target_kernel = self.comp_kernel() + + neura_json = { + "kernel": target_kernel, + "targetFunction": False, + "targetNested": False, + "targetLoopsID": [0], + "doCGRAMapping": True, + "row": self.rows, + "column": self.columns, + "precisionAware": False, + "fusionStrategy": ["default_heterogeneous"], + "isTrimmedDemo": True, + "heuristicMapping": True, + "parameterizableCGRA": False, + "vectorizationMode": "all", + "diagonalVectorization": False, + "bypassConstraint": 4, + "isStaticElasticCGRA": False, + "ctrlMemConstraint": 10, + "regConstraint": 8, + "incrementalMapping" : False, + "vectorFactorForIdiv " : 1, + "testingOpcodeOffset" : 0, + "additionalFunc" : { + "complex-Ctrl" : [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "fptosi": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "div": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + "complex-BrT" : [4,5,6,7], + "complex-CoT" : [8,9,10,11] + }, + "supportDVFS": False, + "DVFSIslandDim": 1, + "DVFSAwareMapping": False, + "enablePowerGating": False, + "expandableMapping" : True + } + + json_object = json.dumps(neura_json, indent=4) + + with open(JSON_NAME, "w") as outfile: + outfile.write(json_object) + if True: + self.map_kernel() + else: + self.map_kernel_skip() + + self.df.to_csv(csv_name) + return csv_name + + def read_ii(self, csv_name): + """ + This is a func to read from csv generated from get_ii() + + Returns: csv_name + """ + try: + df = pd.read_csv(csv_name) + self.base_ii = int(df['mappingII'].iloc[1]) + self.expandable_ii = int(df['expandableII'].iloc[1]) + if 'utilization' in df.columns: + self.utilization = min(float(df['utilization'].iloc[1]),1.0) + else: + self.get_ii() + return csv_name + except FileNotFoundError: + # print(f"CSV file {csv_name} not found.") + self.get_ii() + return csv_name + except ValueError: + # print(f"Error extracting II values from {csv_name}.") + self.get_ii() + return csv_name + + return csv_name + + def return_ii(self, num_cgras): + """ + Get the 
initiation interval (II) based on the number of CGRAs allocated. + + Parameters: + num_cgras (int): Number of CGRAs allocated. + + Returns: + int: The initiation interval (II). + """ + if num_cgras == 1: + return self.base_ii + elif num_cgras == 2: + return self.expandable_ii + else: + raise ValueError("Number of CGRAs must be 1 or 2.") + + def return_total_iterations(self): + """ + Total iterations for the kernel, affected by unroll_factor and vector_factor + + Returns: + int: Total iterations. + """ + return self.total_iterations + + def create_instance(self, arrival_time): + """ + Create a KernelInstance based on the current kernel. + + Parameters: + arrival_time (int): The time at which the instance arrives. + + Returns: + KernelInstance: A new instance of the kernel. + """ + return KernelInstance(self, arrival_time) + + +class KernelInstance: + def __init__(self, kernel, arrival_time): + """ + Initialize a KernelInstance. + + Parameters: + kernel (Kernel): The kernel from which this instance is created. + arrival_time (int): The time at which the instance arrives. + """ + self.kernel = kernel + self.arrival_time = arrival_time + self.start_time = None + self.allocated_cgras = 0 + self.ii = None + self.end_time = None + self.is_valid = self.kernel.is_valid + self.pure_execution_duration = 0 # Track pure execution duration for this instance + self.pure_waiting_duration = 0 # Track pure waiting duration for this instance + # Determine the maximum number of CGRAs that can be allocated + if self.kernel.vector_factor == 1: + self.max_allocate_cgra = 2 + else: + self.max_allocate_cgra = math.ceil(self.kernel.vector_factor/VECTOR_LANE) + + def __lt__(self, other): + """ + Compare two KernelInstance instances by arrival time. + """ + return self.arrival_time < other.arrival_time + + def calculate_execution_duration(self): + """ + Calculate the execution duration based on the number of allocated CGRAs + at the beginning running time of current kernel. 
It may change after. + + Returns: + int: Total execution duration in cycles. + """ + if self.kernel.vector_factor == 1: + if self.allocated_cgras == 1: + self.ii = self.kernel.base_ii + elif self.allocated_cgras == 2: + self.ii = self.kernel.expandable_ii + else: + raise ValueError(f"Number of CGRAs must be between 1 and {self.max_allocate_cgra}.") + execution_duration = self.kernel.total_iterations * self.ii + else: + self.ii = self.kernel.base_ii + execution_duration = self.kernel.total_iterations * self.ii * math.ceil(self.kernel.vector_factor / (VECTOR_LANE * self.allocated_cgras)) + # print(f"Calculated execution duration for {self.kernel.kernel_name}: {execution_duration} cycles (II={self.ii}, iterations={self.kernel.total_iterations})") + return execution_duration + + def copy_with_valid(self): + """ + Create a copy of the current instance and set is_valid to True. + + Returns: + KernelInstance: A new instance copy. + """ + new_instance = KernelInstance(self.kernel, self.arrival_time) + new_instance.start_time = self.start_time + new_instance.allocated_cgras = self.allocated_cgras + new_instance.ii = self.ii + new_instance.end_time = self.end_time + new_instance.is_valid = True + new_instance.pure_execution_duration = 0 + new_instance.pure_waiting_duration = self.pure_waiting_duration + new_instance.max_allocate_cgra = self.max_allocate_cgra + return new_instance + + +class SystemIdleTracker: + def __init__(self, num_cgras): + """Initialize the system idle time tracker + + Args: + num_cgras: Total number of CGRAs in the system + """ + self.num_cgras = num_cgras + self.last_active_time = 0 # Timestamp when system was last active + self.idle_periods = [] # List to store idle periods (start, end) + + # New attributes for waiting time tracking + self.waiting_start_time = None # Timestamp when waiting started + self.waiting_periods = [] # List to store non-overlapping waiting periods (start, end) + + def check_idle_period(self, current_time, available_cgras, 
waiting_kernels): + """Check and record idle periods and waiting times + + Args: + current_time: Current simulation time (passed from simulate function) + available_cgras: Number of currently available CGRAs + waiting_kernels: Number of kernels in the waiting queue + """ + # Detect system-wide idle state (all CGRAs available) + if available_cgras == self.num_cgras and current_time > self.last_active_time: + self.idle_periods.append((self.last_active_time, current_time)) + else: + # Update last active time if system is not fully idle + self.last_active_time = current_time + + # Track waiting time + if waiting_kernels and self.waiting_start_time is None: + # Start tracking waiting time if queue was empty and now has kernels + self.waiting_start_time = current_time + elif not waiting_kernels and self.waiting_start_time is not None: + # End tracking and record waiting period if queue was non-empty and now empty + self.waiting_periods.append((self.waiting_start_time, current_time)) + self.waiting_start_time = None # Reset for next waiting period + + @property + def total_idle_duration(self) -> int: + """Calculate total accumulated idle time + + Returns: + Sum of all idle periods in cycles + """ + return sum(end - start for start, end in self.idle_periods) + + @property + def total_waiting_time_nolap(self) -> int: + """Calculate total non-overlapping waiting time + + Returns: + Sum of all waiting periods in cycles + """ + return sum(end - start for start, end in self.waiting_periods) + + def get_utilization(self, total_cgra_runtime, current_time) -> float: + """Calculate system utilization rate + + Args: + total_cgra_runtime: Sum of busy time across all CGRAs + current_time: Current simulation time + + Returns: + Utilization percentage (0.0 to 1.0) + """ + if current_time <= 0: + return 0.0 + # Utilization = Actual busy time / Possible busy time + possible_busy_time = (current_time - self.total_idle_duration) * self.num_cgras + # print(f"Total idle duration is 
# ----------------------------------------------------------------------------
# function definitions /
# ----------------------------------------------------------------------------

def allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime):
    """
    Allocate CGRAs to a kernel instance and schedule its 'end' event.

    Parameters:
        priority_boosting (int): re-allocation strategy; 0 means greedy
            multi-CGRA allocation here, non-zero starts with 1 CGRA and relies
            on later re-allocation.
        instance (KernelInstance): The kernel instance to allocate CGRAs to.
        current_time (int): The current simulation time.
        available_cgras (int): The number of available CGRAs.
        events (list): The event queue.
        running_instances (list): The list of currently running instances.
        runned_kernel_names (list): The list of names of the kernels that have been run.
        total_cgra_runtime (float): The total runtime of all CGRAs.

    Returns:
        int: The updated number of available CGRAs.
        float: The updated total runtime of all CGRAs.
    """
    runned_kernel_names.append(instance.kernel.kernel_name)

    if priority_boosting == 0:
        if '+' in instance.kernel.kernel_name:
            # Fused kernels ('+' in the name) start on a single CGRA.
            allocate_cgras = min(1, available_cgras)
            # print(f"Kernel {instance.kernel.kernel_name} contains '+', limiting allocation to 1 CGRA")
        elif instance.kernel.vector_factor != 1:  # limit allocated cgra of vector
            allocate_cgras = min(1, available_cgras)
            # print(f"Kernel {instance.kernel.kernel_name} is vectorized, limiting allocation to 1 CGRA")
        elif available_cgras < 6:  # 6 if num_cgras = 9, 3 if num_cgras = 4, 12 if num_cgras = 16, 20 if num_cgras = 25
            # Under scarcity, hand out one CGRA at a time.
            allocate_cgras = min(1, available_cgras)
            # print(f"available_cgras is less than 5, limiting allocation to 1 CGRA")
        else:
            allocate_cgras = min(instance.max_allocate_cgra, available_cgras)
        #allocate_cgras = 1
    else:
        allocate_cgras = 1
    available_cgras -= allocate_cgras
    instance.start_time = current_time
    instance.allocated_cgras = allocate_cgras
    execution_duration = instance.calculate_execution_duration()
    instance.end_time = current_time + execution_duration
    instance.pure_waiting_duration = instance.start_time - instance.arrival_time  # Record pure waiting time
    # print(f"Allocated {allocate_cgras} CGRAs to {instance.kernel.kernel_name} at {current_time}. Execution will end at {instance.end_time}")
    heapq.heappush(events, (instance.end_time, 'end', instance, instance))
    running_instances.append(instance)
    if instance.kernel.rows != 4 and instance.kernel.columns != 4:  # HACK: 4x4 is the neura config
        # Non-4x4 tiles: weight busy time by measured tile utilization.
        total_cgra_runtime += allocate_cgras * execution_duration * instance.kernel.utilization
    else:
        total_cgra_runtime += allocate_cgras * execution_duration
    return available_cgras, total_cgra_runtime


def release(instance, current_time, available_cgras, running_instances, completed_instances, kernel_latency, total_cgra_runtime):
    """
    Release the CGRAs occupied by a kernel instance.

    Parameters:
        instance (KernelInstance): The kernel instance to release CGRAs from.
        current_time (int): The current simulation time.
        available_cgras (int): The number of available CGRAs.
        running_instances (list): The list of currently running instances.
        completed_instances (list): The list of completed instances.
        kernel_latency (dict): A dictionary used to track the total latency of each kernel.
        total_cgra_runtime (float): The total runtime of all CGRAs (returned unchanged).

    Returns:
        int: The updated number of available CGRAs.
        float: The updated total runtime of all CGRAs.
    """
    available_cgras += instance.allocated_cgras
    completed_instances.append(instance)
    if instance in running_instances:
        running_instances.remove(instance)
    # Update per-kernel overall latency
    instance.end_time = current_time
    latency = instance.end_time - instance.start_time
    instance.pure_execution_duration = instance.end_time - instance.start_time  # Record pure execution time
    kernel_latency[instance.kernel.kernel_name] += latency
    # print(f"Released {instance.allocated_cgras} CGRAs from {instance.kernel.kernel_name} at {current_time}. Latency added: {latency} cycles")
    return available_cgras, total_cgra_runtime


def re_allocate(instance, current_time, available_cgras, events, total_cgra_runtime):
    """
    Re-allocate additional CGRAs to a running kernel instance if possible:
    recompute its remaining iterations, schedule a fresh 'end' event via a
    valid copy, and invalidate the old instance so its stale heap event is
    skipped when popped.

    Parameters:
        instance (KernelInstance): The kernel instance to re-allocate CGRAs to.
        current_time (int): The current simulation time.
        available_cgras (int): Number of available CGRAs.
        events (list): The event queue.
        total_cgra_runtime (float): Total runtime of all CGRAs.

    Returns:
        int: Updated number of available CGRAs.
        float: Updated total runtime of all CGRAs.
    """
    if not instance.is_valid:
        # print(f"Instance {instance.kernel.kernel_name} is already invalid, skipping re-allocation.")
        return available_cgras, total_cgra_runtime
    if instance.allocated_cgras < instance.max_allocate_cgra and available_cgras > 0:
        possible_alloc = min(instance.max_allocate_cgra - instance.allocated_cgras, available_cgras)
        original_allocated_cgras = instance.allocated_cgras
        # Update allocation
        instance.allocated_cgras += possible_alloc
        available_cgras -= possible_alloc
        # Recalculate remaining iterations
        elapsed_duration = current_time - instance.start_time
        # Calculate equivalent scalar iteration count (considering vectorization and CGRAs)
        if instance.kernel.vector_factor == 1:
            # scalar
            completed_iters = elapsed_duration // instance.ii
        else:
            # vector
            effective_ii = instance.ii * math.ceil(instance.kernel.vector_factor / (VECTOR_LANE * original_allocated_cgras))
            completed_iters = int(elapsed_duration // effective_ii)
        remaining_iters = instance.kernel.total_iterations - completed_iters
        # print(f"current_time {current_time}, completed_iters {completed_iters}")
        # Update II and remaining_execution_duration
        if instance.kernel.vector_factor == 1:
            # Scalar case
            if instance.allocated_cgras == 1:
                instance.ii = instance.kernel.base_ii
            elif instance.allocated_cgras == 2:
                instance.ii = instance.kernel.expandable_ii
            else:
                raise ValueError(f"Number of CGRAs must be between 1 and {instance.max_allocate_cgra}.")
            remaining_execution_duration = remaining_iters * instance.ii
        else:
            # Vector case
            vector_divisor = VECTOR_LANE * instance.allocated_cgras
            remaining_execution_duration = remaining_iters * instance.ii * math.ceil(instance.kernel.vector_factor / vector_divisor)
        # Schedule new end event
        new_end_time = current_time + remaining_execution_duration
        # print(f"remaining_iters {remaining_iters}, remaining_execution_duration {remaining_execution_duration}")
        # print(f"Re-allocated succeed. {instance.kernel.kernel_name}. Add {possible_alloc} CGRAs at {current_time}. Old end time: {instance.end_time}. New end time: {new_end_time}")
        # Create a new valid instance for the new end event
        new_instance = instance.copy_with_valid()  # Assume there is a copy method in KernelInstance class
        heapq.heappush(events, (new_end_time, 'end', new_instance, new_instance))
        # Invalidate old end event by leaving it in the heap but ignoring when processed
        instance.is_valid = False  # Old instance is invalid
        # Correct the total_cgra_runtime bookkeeping, applying the utilization
        # factor consistently with the original estimate.
        kernel = instance.kernel
        is_12x12 = (kernel.rows == 12 and kernel.columns == 12)
        utilization_factor = kernel.utilization if is_12x12 else 1.0
        # Apply utilization factor uniformly
        old_estimate = original_allocated_cgras * (instance.end_time - instance.start_time) * utilization_factor
        actual_runtime = original_allocated_cgras * elapsed_duration * utilization_factor
        new_allocation_runtime = instance.allocated_cgras * remaining_execution_duration * utilization_factor
        # Update total runtime
        total_cgra_runtime -= old_estimate  # Remove old estimate
        total_cgra_runtime += actual_runtime  # Add actual runtime completed
        total_cgra_runtime += new_allocation_runtime  # Add new allocation runtime
    else:
        # print(f"Re-allocated Failed. ({instance.kernel.kernel_name} at time {current_time})")
        pass
    return available_cgras, total_cgra_runtime


def handle_reallocation(priority_boosting, running, current_time, available_cgras, events, total_cgra_runtime):
    """
    Checks if a running instance should be re-allocated based on the
    priority_boosting strategy.

    Args:
        priority_boosting (int): The strategy for re-allocation.
            0: No re-allocation.
            1: Re-allocate for vector_factor=1 kernels without '+' in name.
            2: Re-allocate for all vector_factor=1 kernels.
            3: Re-allocate for all kernels.
        running (object): The currently running instance to check.
        current_time (float): The current simulation time.
        available_cgras (int): The number of currently available CGRAs.
        events (list): The list of simulation events.
        total_cgra_runtime (float): The accumulated total CGRA runtime.

    Returns:
        tuple: A tuple containing the updated available_cgras and total_cgra_runtime.
    """
+ available_cgras (int): The number of currently available CGRAs. + events (list): The list of simulation events. + total_cgra_runtime (float): The accumulated total CGRA runtime. + + Returns: + tuple: A tuple containing the updated available_cgras and total_cgra_runtime. + """ + if priority_boosting <= 0: + return available_cgras, total_cgra_runtime + + should_reallocate = False + kernel_info = running.kernel + + if priority_boosting == 1: + # Re-allocate only for kernels with vector_factor=1 and no '+' in the name + should_reallocate = (kernel_info.vector_factor == 1 and '+' not in kernel_info.kernel_name) + elif priority_boosting == 2: + # Re-allocate for all kernels with vector_factor=1 (including those with '+') + should_reallocate = (kernel_info.vector_factor == 1) + elif priority_boosting == 3: + # Re-allocate for all kernels + should_reallocate = True + + # If the condition is met, perform the re-allocation + if should_reallocate: + available_cgras, total_cgra_runtime = re_allocate( + running, current_time, available_cgras, events, total_cgra_runtime + ) + + return available_cgras, total_cgra_runtime + + +def simulate(num_cgras, kernels, priority_boosting, lcm_time=26214400): + """ + lcm_time=26214400 + Simulate the execution of multiple kernels on a CGRA architecture. + + Parameters: + num_cgras (int): The number of CGRAs in the CGRA architecture. + kernels (list of Kernel): The list of kernels to simulate. + priority_boosting (bool): Whether to enable priority boosting. + lcm_time (int): The least common multiple of the arrival periods. + + Returns: + dict: A dictionary that maps kernel names to their total latencies. 
+ """ + # Add target check time + CHECK_TIME = 3276800 + # Flag to mark whether result has been output, avoiding duplicate output + checked = False + + available_cgras = num_cgras + events = [] # when a kernel arrives or ends, it is an event + current_time = 0 + waiting_instances = [] + running_instances = [] + completed_instances = [] + runned_kernel_names = [] + # Dictionary to store per-kernel arrival times + kernel_arrival_count = {kernel.kernel_id: 0 for kernel in kernels} + # Dictionary to store per-kernel overall latency (cycle) + kernel_latency = {kernel.kernel_name: 0 for kernel in kernels} + # Dictionary to store per-kernel execution duration distribution + kernel_execution_distribution = {kernel.kernel_name: [] for kernel in kernels} + # Dictionary to store per-kernel waiting duration distribution + kernel_waiting_distribution = {kernel.kernel_name: [] for kernel in kernels} + # Dictionary to store per-kernel ratio (iterations per cycle) + kernel_execution_ratio = {kernel.kernel_name: 0 for kernel in kernels} + # Dictionary to store per-kernel ratio (iterations per cycle) + kernel_waiting_ratio = {kernel.kernel_name: 0 for kernel in kernels} + total_cgra_runtime = 0 + idle_tracker = SystemIdleTracker(num_cgras=num_cgras) + arrive_times_list = { + kernel.kernel_id: ((lcm_time // kernel.arrive_period)) + for kernel in kernels + } + # print(arrive_times_list) + + + # print(f"\033[91mPriority Boosting Level: {priority_boosting}\033[0m") + + for kernel in kernels: + print(f"Kernel {kernel.kernel_name} base_ii={kernel.base_ii}, expandable_ii={kernel.expandable_ii}, \ + iterations={kernel.total_iterations}, utilization={kernel.utilization}, arrive_times, {arrive_times_list[kernel.kernel_id]}, isvalid, {kernel.is_valid}") + + # Schedule initial arrivals for all kernels + for kernel in kernels: + first_arrival = 0 + # heapq keeps a priority queue that contains (event_arrive_end_time (int), event_type (str), Kernel, KernelInstance (needed when 'end')) + 
heapq.heappush(events, (first_arrival, 'arrival', kernel, None)) + + while events: + event_time, event_type, kernel_or_instance, _ = heapq.heappop(events) + if not kernel_or_instance.is_valid: + # tmp_name = kernel_or_instance.kernel_name if kernel_or_instance is Kernel else kernel_or_instance.kernel.kernel_name + # print(f"Skipping invalid event for tmp_name") + continue + + current_time = event_time + # print("="*20) + idle_tracker.check_idle_period(current_time, available_cgras, waiting_instances) + # print(f"Processing event at time {current_time}: type={event_type}, kernel={kernel_or_instance.kernel_name if event_type == 'arrival' else kernel_or_instance.kernel.kernel_name}") + + if event_type == 'arrival' and kernel_or_instance.is_valid: + kernel = kernel_or_instance + kernel_arrival_count[kernel.kernel_id] += 1 + # Create a new instance + instance = kernel.create_instance(current_time) + # Schedule next arrival if within lcm_time + next_arrival = current_time + kernel.arrive_period + if kernel_arrival_count[kernel.kernel_id] < arrive_times_list[kernel.kernel_id]: + heapq.heappush(events, (next_arrival, 'arrival', kernel, None)) + # print(f"Scheduled next arrival for {kernel.kernel_name} at time {next_arrival}") + + + # Try to allocate CGRAs + if available_cgras >= 1: + available_cgras, total_cgra_runtime = allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) + available_cgras, total_cgra_runtime = handle_reallocation(priority_boosting, instance, current_time, available_cgras, events, total_cgra_runtime) + else: + waiting_instances.append(instance) + # print(f"No available CGRAs for {kernel.kernel_name}. 
Added to waiting queue.") + + elif event_type == 'end' and kernel_or_instance.is_valid: + instance = kernel_or_instance + # Release CGRAs + available_cgras, total_cgra_runtime = release(instance, current_time, available_cgras, running_instances, completed_instances,kernel_latency, total_cgra_runtime) + + # Update execution duration distribution + kernel_execution_distribution[instance.kernel.kernel_name].append(instance.pure_execution_duration) + kernel_waiting_distribution[instance.kernel.kernel_name].append(instance.pure_waiting_duration) + + # Check waiting queue + while waiting_instances and available_cgras >= 1: + instance = waiting_instances.pop(0) + # print(f"Allocating CGRAs to waiting instance {instance.kernel.kernel_name}") + available_cgras, total_cgra_runtime = allocate(priority_boosting, instance, current_time, available_cgras, events, running_instances, runned_kernel_names, total_cgra_runtime) + available_cgras, total_cgra_runtime = handle_reallocation(priority_boosting, instance, current_time, available_cgras, events, total_cgra_runtime) + + # Check running instances for possible re-allocation + # if priority_boosting: + # for running in running_instances: + # available_cgras, total_cgra_runtime = re_allocate(running, current_time, available_cgras, events, total_cgra_runtime) + for running in running_instances[:]: + available_cgras, total_cgra_runtime = handle_reallocation( + priority_boosting, running, current_time, available_cgras, events, total_cgra_runtime + ) + + # Check if the target time has been reached, results haven't been output yet, and current time >= target time + if not checked and current_time >= CHECK_TIME: + # print(f"\n=== At time {CHECK_TIME}, number of completed functions: {len(completed_instances)} ===") + checked_num_kernel = len(completed_instances) + checked = True + + # print("="*20) + + # If the simulation ends before reaching the target time, also output results + if not checked: + # print(f"\n=== Simulation ended before 
{CHECK_TIME}, number of completed functions: {len(completed_instances)}") + checked_num_kernel = len(completed_instances) + + overall_execution = 0 + overall_waiting = 0 + # Calculate ratio for each kernel + for kernel in kernels: + total_execution_duration = sum( + [inst.pure_execution_duration for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) + total_waiting_duration = sum( + [inst.pure_waiting_duration for inst in completed_instances if inst.kernel.kernel_name == kernel.kernel_name]) + total_duration = total_execution_duration + total_waiting_duration + kernel_execution_ratio[kernel.kernel_name] = total_execution_duration / total_duration if total_duration > 0 else 0 + kernel_waiting_ratio[kernel.kernel_name] = total_waiting_duration / total_duration if total_duration > 0 else 0 + overall_execution += total_execution_duration + overall_waiting += total_waiting_duration + + # Calculate utilization of total CGRAs + cgra_utilization = idle_tracker.get_utilization(total_cgra_runtime, current_time) + waiting_time_nolap = idle_tracker.total_waiting_time_nolap + overall_latency = current_time # when all kernels are done + + # print(f"Simulation completed. 
Kernel latencies: {kernel_latency}") + # print(f"Kernel execution_ratio: {kernel_execution_ratio}") + # print(f"Kernel execution duration distributions: {kernel_execution_distribution}") + # print(f"Kernel Runned List: {runned_kernel_names}") + # print(f"CGRA utilization: {cgra_utilization}") + # print(f"overall latency: {overall_latency}") + # print(f"overall execution: {overall_execution}") + # print(f"overall waiting_time_nolap: {waiting_time_nolap}") + return kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency, overall_execution, checked_num_kernel, waiting_time_nolap + + +def run_multiple_simulations_and_save_to_csv(kernels_list, csv_name, priority_boosting, kernel_case, num_cgras=9): + """ + Run multiple simulations and save the results to a CSV file. + + Parameters: + kernels_list (list of list of Kernel): A list of kernels. + csvname (str): The name of the CSV file. + priority_boosting (int): Whether to enable priority boosting. + num_cgras (int): The number of CGRAs, default 9. 
+ """ + kernel_latency, kernel_waiting_distribution, kernel_execution_ratio, kernel_waiting_ratio, kernel_execution_distribution, cgra_utilization, overall_latency, overall_execution, checked_num_kernel, waiting_time_nolap = simulate(num_cgras, kernels_list, priority_boosting) + + # Calculate fastest, slowest, and average execution duration per kernel + execution_stats = {} + for kernel_name, execution_durations in kernel_execution_distribution.items(): + if execution_durations: + fastest = min(execution_durations) + slowest = max(execution_durations) + average = sum(execution_durations) / len(execution_durations) + total = sum(execution_durations) + execution_stats[kernel_name] = { + "fastest_execution_duration": fastest, + "slowest_execution_duration": slowest, + "average_execution_duration": average, + "total_execution_duration": total + } + + # Calculate fastest, slowest, and average waiting duration per kernel + waiting_stats = {} + overall_avg_waiting = 0 + for kernel_name, waiting_durations in kernel_waiting_distribution.items(): + if waiting_durations: + fastest = min(waiting_durations) + slowest = max(waiting_durations) + average = sum(waiting_durations) / len(waiting_durations) + overall_avg_waiting += average + total = sum(waiting_durations) + waiting_stats[kernel_name] = { + "fastest_waiting_duration": fastest, + "slowest_waiting_duration": slowest, + "average_waiting_duration": average, + "total_waiting_duration": total + } + + all_results = [] + for kernel in kernels_list: + kernel_name = kernel.kernel_name + result = { + "Kernel_Name": kernel_name, + "Arrive_Period": kernel.arrive_period, + "Unroll_Factor": kernel.unroll_factor, + "Vector_Factor": kernel.vector_factor, + "fastest_execution_duration": execution_stats.get(kernel_name, {}).get("fastest_execution_duration", 0), + "slowest_execution_duration": execution_stats.get(kernel_name, {}).get("slowest_execution_duration", 0), + "Average_Execution_duration": execution_stats.get(kernel_name, 
{}).get("average_execution_duration", 0), + "fastest_waiting_duration": waiting_stats.get(kernel_name, {}).get("fastest_waiting_duration", 0), + "slowest_waiting_duration": waiting_stats.get(kernel_name, {}).get("slowest_waiting_duration", 0), + "Average_Waiting_duration": waiting_stats.get(kernel_name, {}).get("average_waiting_duration", 0), + "Total_Execution_duration": execution_stats.get(kernel_name, {}).get("total_execution_duration", 0), + "Total_Waiting_duration": waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0), + "Execution_duration Ratio": kernel_execution_ratio[kernel_name], + "Waiting_duration Ratio": kernel_waiting_ratio[kernel_name], + "Overall_Case_Latency": overall_latency, + "Overall_Execution": overall_execution, + "Sum_Average_Waiting_duration": overall_avg_waiting, + "CGRA_Utilization": cgra_utilization, + "checked_num_kernel":checked_num_kernel, + "waiting_time_nolap":waiting_time_nolap, + "Total_Execution_duration Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_duration", 0))/overall_latency, + "Total_Waiting_duration Ratio": (waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0))/overall_latency, + "Total_Latency Ratio": (execution_stats.get(kernel_name, {}).get("total_execution_duration", 0) + waiting_stats.get(kernel_name, {}).get("total_waiting_duration", 0))/overall_latency + } + all_results.append(result) + + + df = pd.DataFrame(all_results) + file_name = f'./result/simulation_{kernel_case}_{csv_name}.csv' + df.to_csv(file_name, index=False) + print(f"reslut {file_name} saved") \ No newline at end of file diff --git a/tools/expandable/util/visualizer.py b/tools/expandable/util/visualizer.py new file mode 100644 index 00000000..357c147a --- /dev/null +++ b/tools/expandable/util/visualizer.py @@ -0,0 +1,652 @@ +# ---------------------------------------------------------------------------- +# Filename: visualizer.py / +# Description: visualize multi-CGRA scheduling simulation results / +# 
---------------------------------------------------------------------------- + +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os +from typing import List, Dict + +# ---------------------------------------------------------------------------- +# class definition / +# ---------------------------------------------------------------------------- + +class SimulationDataAnalyzer: + """Simulation data visualization analysis tool""" + + def __init__(self, kernel_data): + """ + Initialize the analyzer + + Attributes: + data_cache (dict): Cache for loaded data + figure_config (dict): Default configuration for figures + """ + self.execution_cache = {} # Cache for loaded data + self.utilization_cache = {} + self.throughput_cache = {} + self.number_cache = {} + self.waiting_cache = {} + self.scalability_cache = {} + self.latency_cache = {} + self.KERNEL_NAMES = list(kernel_data.keys()) + self.NEURA_CONFIGS = ['Baseline', 'Neura-L0', 'Neura-L1', 'Neura-L2', 'Neura'] + self.KERNEL_COLORS = ['#A4A3A4','#B0C4E6','#8DA9DC','#FEEDB9','#002060', + '#F3B082','#F7CAAB','#C7FAA8','#FFD865'] + self.NEURA_COLORS = ['#7F7F7F','#EDEDED','#FFF2CC','#FFD966','#FFC000'] + + def load_execution_data(self, task_case: str, csv_name: str, normalized_baseline: int): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'Overall_Execution', 'CGRA_Utilization'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + 
raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = f"{task_case}_{csv_name}" + self.execution_cache[cache_key] = df['Total_Execution_duration'] / normalized_baseline + self.utilization_cache[cache_key] = df['CGRA_Utilization'] + return self.execution_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_execution_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_1_Baseline.csv") + normalized_baseline = df['Overall_Execution'].iloc[0] # overall execution time of case 1's Baseline config + + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_execution_data(task_case, csv_name, normalized_baseline) + + return + + def load_throughput_data(self, task_case: str, csv_name: str): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'waiting_time_nolap', 'Average_Execution_duration'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = f"{task_case}_{csv_name}" + self.execution_cache[cache_key] = 
df['Total_Execution_duration'] + self.number_cache[cache_key] = np.where( + (df['Average_Execution_duration'] == 0), + 0, + df['Total_Execution_duration'] / df['Average_Execution_duration'] + ) + self.waiting_cache[cache_key] = df['waiting_time_nolap'] + + return self.execution_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_throughput_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_1_Baseline.csv") + file_path = "./result/simulation_1_Baseline.csv" + normalized_baseline = df['Overall_Execution'].iloc[0] # overall execution time of case 1's Baseline config + + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_throughput_data(task_case, csv_name) + + return + + def load_scalability_data(self, task_case: str, csv_name: str, execution_baseline: int, latency_baseline: int): + """ + Load data from a single CSV file + + Args: + task_case (str): Kernel case identifier + csv_name (str): CSV file name identifier + + Returns: + pd.DataFrame: DataFrame containing specified columns, or None if file doesn't exist + """ + file_path = f'./result/simulation_{task_case}_{csv_name}.csv' + print(file_path) + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + return None + + # Read specified columns from the data + try: + df = pd.read_csv(file_path) + required_columns = ['Total_Execution_duration', 'Overall_Execution', 'CGRA_Utilization', 'Overall_Case_Latency'] + + # Check if required columns exist + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"File is missing required columns: {', '.join(missing_columns)}") + + # Cache the data + cache_key = 
f"{task_case}_{csv_name}" + self.scalability_cache[cache_key] = df['Total_Execution_duration'] / execution_baseline + self.latency_cache[cache_key] = df['Overall_Case_Latency'] / latency_baseline + self.utilization_cache[cache_key] = df['CGRA_Utilization'] + return self.scalability_cache[cache_key] + + except Exception as e: + print(f"Failed to read file: {file_path}, Error: {str(e)}") + return None + + def process_scalability_data(self, task_cases: List[str]): + """ + Batch load data from multiple CSV files + + Args: + task_cases (List[str]): List of kernel case identifiers + + Returns: + Dict[str, pd.DataFrame]: Mapping from cache keys to DataFrames + """ + df = pd.read_csv("./result/simulation_2x2_6_Baseline.csv") + normalized_baseline = df['Overall_Execution'].iloc[0] + latency_baseline = df['Overall_Case_Latency'].iloc[0] + for task_case in task_cases: + for csv_name in self.NEURA_CONFIGS: + self.load_scalability_data(task_case, csv_name, normalized_baseline, latency_baseline) + + return + + def genFig9(self, fig_path: str): + """ + Generate Figure 9: Normalized execution time and improved utilization + """ + cases = ['1', '2', '3', '4', '5', '6'] + self.process_execution_data(cases) + + # Correct data structure - one value per X position + bar_data = {kernel: [] for kernel in self.KERNEL_NAMES} # Bar chart data + line_data = [] # Line chart data + x_labels = [] # X-axis labels + + # Collect data + for case in cases: + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + execution_series = self.execution_cache.get(cache_key) + utilization_series = self.utilization_cache.get(cache_key) + + # Bar chart data - Resource utilization + if execution_series is not None: + if hasattr(execution_series, 'to_dict'): + exec_dict = execution_series.to_dict() + else: + exec_dict = dict(execution_series) + for i, kernel in enumerate(self.KERNEL_NAMES): + kernel_value = float(exec_dict[i]) * 100 + 
bar_data[kernel].append(kernel_value) + else: + for kernel in self.KERNEL_NAMES: + bar_data[kernel].append(0) + + # Line chart data - Execution duration or other metrics + if utilization_series is not None: + line_value = utilization_series.iloc[0] + line_data.append(float(line_value) * 100) + else: + line_data.append(0) + + x_labels.append(f"{group}") + + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + total_bars = len(cases) * len(self.NEURA_CONFIGS) + x_positions = np.arange(total_bars) + bar_width = 0.6 + # Primary Y-axis - Bar chart + color_dict = {kernel: color for kernel, color in zip(self.KERNEL_NAMES, self.KERNEL_COLORS)} + bottom = np.zeros(total_bars) + bars_by_kernel = {} + for kernel in self.KERNEL_NAMES: + data = bar_data[kernel] + bars = ax1.bar(x_positions, data, bar_width, bottom=bottom, + color=color_dict[kernel], alpha=0.8, + edgecolor='black', linewidth=0.5, label=kernel) + bars_by_kernel[kernel] = bars + bottom += np.array(data) + + + # Add black dashed separator lines every group + for i in range(4, len(x_positions)-1, 5): + line_pos = i + 0.5 + ax1.axvline(x=line_pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + # Display values on Neura + arrays = [np.array(heights) for heights in bar_data.values()] + total_heights = np.sum(arrays, axis=0) + for i, (x, y) in enumerate(zip(x_positions, total_heights)): + if (i + 1) % 5 == 0: + ax1.text(x, y + max(total_heights)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized execution time (%)', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black', labelsize=18) + ax1.set_ylim(0, 120) + ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0., + fontsize=12, title="Kernels", title_fontsize=13) + + # Secondary Y-axis - Line chart + ax2 = ax1.twinx() + + # 
Calculate number of complete cases + num_complete_cases = len(x_positions) // len(self.NEURA_CONFIGS) + + # Insert NaN every 5 points + x_with_gaps = [] + y_with_gaps = [] + + for case_idx in range(num_complete_cases): + # Start and end indices for this case + start_idx = case_idx * len(self.NEURA_CONFIGS) + end_idx = start_idx + len(self.NEURA_CONFIGS) + + # Add 5 points for this case + x_with_gaps.extend(x_positions[start_idx:end_idx]) + y_with_gaps.extend(line_data[start_idx:end_idx]) + + # Add NaN after each case (except the last complete case) + if case_idx < num_complete_cases - 1: + x_with_gaps.append(np.nan) + y_with_gaps.append(np.nan) + + + # Convert to numpy arrays + x_with_gaps = np.array(x_with_gaps) + y_with_gaps = np.array(y_with_gaps) + + # Plot line with gaps between cases + line = ax2.plot(x_with_gaps, y_with_gaps, + marker='o', markersize=8, linewidth=2.5, + color='blue', linestyle='--', + markerfacecolor='white', markeredgewidth=2, + label='Utilization') + + ax2.set_ylabel('Resource Utilization (%)', fontsize=20, color='black') + ax2.tick_params(axis='y', labelcolor='black', labelsize=18) + + # Display values on line points + for i, (x, y) in enumerate(zip(x_positions, line_data)): + ax2.text(x, y + max(line_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + ax1.tick_params(axis='x', labelsize=18) + + # Add group labels + group_positions = [3, 8, 13, 17, 22, 27] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, 'case ' + case, transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig9') + plt.tight_layout() + plt.savefig(fig_path) + print(f"Generated fig f{fig_path}") + + def 
genFig10(self, fig_path: str): + """ + Generate Figure 10: Normalized throughput speedup + """ + cases = ['1', '2', '3', '4', '5', '6'] + self.process_throughput_data(cases) + + # Correct data structure - one value per X position + bar_data = [] # Bar chart data + x_labels = [] # X-axis labels + # Collect data + for case in cases: + cache_key = f"{case}_Baseline" + execution_series = self.execution_cache.get(cache_key) + number_series = self.number_cache.get(cache_key) + waiting_series = self.waiting_cache.get(cache_key) + hw_waiting = waiting_series.iloc[0] / int(number_series.sum()) + avg_execution = execution_series.sum() / int(number_series.sum()) + hw_waiting_ratio = hw_waiting / (hw_waiting + avg_execution) + avg_execution_ratio = avg_execution / (hw_waiting + avg_execution) + hw_waiting_baseline = hw_waiting + avg_execution_baseline = avg_execution + throughput_baseline = (hw_waiting_ratio + avg_execution_ratio) + + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + execution_series = self.execution_cache.get(cache_key) + number_series = self.number_cache.get(cache_key) + waiting_series = self.waiting_cache.get(cache_key) + if (execution_series is None or number_series is None or + waiting_series is None): + continue + hw_waiting = waiting_series.iloc[0] / int(number_series.sum()) + avg_execution = execution_series.sum() / int(number_series.sum()) + hw_waiting_ratio = hw_waiting / (hw_waiting_baseline + avg_execution_baseline) + avg_execution_ratio = avg_execution / (hw_waiting_baseline + avg_execution_baseline) + bar_data.append(throughput_baseline / (hw_waiting_ratio + avg_execution_ratio)) + + x_labels.append(f"{group}") + # sum_throughput = throughput_speedup.sum() + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + x_positions = 
np.arange(len(bar_data)) + bar_width = 0.6 + + bars = ax1.bar(x_positions, bar_data, bar_width, + color=self.NEURA_COLORS[:len(bar_data)], + alpha=0.8, + edgecolor='black', + linewidth=0.5) + + # Add black dashed separator lines every group + for i in range(4, len(bar_data)-1, 5): + line_pos = i + 0.5 + ax1.axvline(x=line_pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + for i, (x, y) in enumerate(zip(x_positions, bar_data)): + if (i + 1) % 5 == 0: + ax1.text(x, y + max(bar_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized Throughput Speedup', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black') + ax1.set_ylim(0, 4) + + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + + # Add group labels + group_positions = [3, 8, 13, 17, 22, 27] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, 'case ' + case, transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + # Legends + ax1.legend(loc='upper left') + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig10') + plt.tight_layout() + # plt.legend() + plt.savefig(fig_path) + print(f"Generated fig {fig_path}") + + def genFig11(self, fig_path: str): + """ + Generate Figure 11: Scalability -- Normalized execution time and improved utilization + """ + cases = ['2x2_6', '3x3_6', '4x4_6', '5x5_6'] + self.process_scalability_data(cases) + + # Correct data structure - one value per X position + bar_data = {kernel: [] for kernel in self.KERNEL_NAMES} # Bar chart data + line_data = [] # Line chart data + x_labels = [] # X-axis labels + # Collect data + cache_key = "2x2_6_Baseline" + scalability_series = self.scalability_cache.get(cache_key) + latency_series = self.latency_cache.get(cache_key) + 
throughput_speedup = [0] * len(scalability_series) + for i in range(len(scalability_series)): + throughput_speedup[i] = (1 / (scalability_series[i] * latency_series[i] * 100)) + throughput_baseline = sum(throughput_speedup) + for case in cases: + for group in self.NEURA_CONFIGS: + cache_key = f"{case}_{group}" # Adjust based on your actual naming convention + scalability_series = self.scalability_cache.get(cache_key) + utilization_series = self.utilization_cache.get(cache_key) + latency_series = self.latency_cache.get(cache_key) + if (scalability_series is None or latency_series is None or + utilization_series is None): + continue + for i in range(len(scalability_series)): + if scalability_series[i] * latency_series[i] == 0: + tmp = 0 + else: + tmp = (1 / (scalability_series[i] * latency_series[i] * 100)) + throughput_speedup[i] = tmp / throughput_baseline + # Bar chart data + for i, kernel in enumerate(self.KERNEL_NAMES): + bar_data[kernel].append(throughput_speedup[i]) + + # Line chart data + if utilization_series is not None: + line_value = utilization_series.iloc[0] + line_data.append(float(line_value) * 100) + else: + line_data.append(0) + + x_labels.append(f"{group}") + + # Create chart + fig, ax1 = plt.subplots(figsize=(20, 8)) + plt.style.use({ + 'font.size': 20, + 'axes.labelsize': 18, + 'axes.titlesize': 18, + 'xtick.labelsize': 18, + 'ytick.labelsize': 18 + }) + + + total_bars = (len(cases) * (len(self.NEURA_CONFIGS) - 1)) + 1 + x_positions = np.arange(total_bars) + bar_width = 0.6 + # Primary Y-axis - Bar chart + color_dict = {kernel: color for kernel, color in zip(self.KERNEL_NAMES, self.KERNEL_COLORS)} + bottom = np.zeros(total_bars) + bars_by_kernel = {} + for kernel in self.KERNEL_NAMES: + data = bar_data[kernel] + bars = ax1.bar(x_positions, data, bar_width, bottom=bottom, + color=color_dict[kernel], alpha=0.8, + edgecolor='black', linewidth=0.5, label=kernel) + bars_by_kernel[kernel] = bars + bottom += np.array(data) + + # Add black dashed 
separator lines every group + group_pattern = [5, 4, 4, 4] + current_position = 0 + line_positions = [] + for group_size in group_pattern: + current_position += group_size + if current_position < len(x_positions): + line_positions.append(current_position - 0.5) + for pos in line_positions: + ax1.axvline(x=pos, + color='black', + linestyle='--', + linewidth=0.8, + alpha=0.8) + + # Display values on Neura + display_indices = [] + for i in range(len(x_positions)): + if i >= 4 and (i - 4) % 4 == 0: + display_indices.append(i) + arrays = [np.array(heights) for heights in bar_data.values()] + total_heights = np.sum(arrays, axis=0) + for i, (x, y) in enumerate(zip(x_positions, total_heights)): + if i in display_indices: + ax1.text(x, y + max(total_heights)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + ax1.set_ylabel('Normalized Throughput Speedup', fontsize=20, color='black') + ax1.tick_params(axis='y', labelcolor='black') + ax1.set_ylim(0, 26) + ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0., + fontsize=12, title="Kernels", title_fontsize=13) + + # Secondary Y-axis - Line chart + ax2 = ax1.twinx() + + # Define break pattern: first group 5 points, then 4 points for others + break_pattern = [5] # First case: 5 points + remaining_cases = (len(x_positions) - 5) // 4 # Calculate how many 4-point cases + break_pattern.extend([4] * remaining_cases) # Add 4 for each remaining case + + # Insert NaN based on the break pattern + x_with_gaps = [] + y_with_gaps = [] + + current_idx = 0 + for i, num_points in enumerate(break_pattern): + # Add points for this case + end_idx = current_idx + num_points + x_with_gaps.extend(x_positions[current_idx:end_idx]) + y_with_gaps.extend(line_data[current_idx:end_idx]) + + # Add NaN after this case (except the last one) + if i < len(break_pattern) - 1: + x_with_gaps.append(np.nan) + y_with_gaps.append(np.nan) + + current_idx = end_idx + + x_with_gaps = np.array(x_with_gaps) + y_with_gaps = 
np.array(y_with_gaps) + + # Plot line with gaps between cases + line = ax2.plot(x_with_gaps, y_with_gaps, + marker='o', markersize=8, linewidth=2.5, + color='blue', linestyle='--', + markerfacecolor='white', markeredgewidth=2, + label='Utilization') + + ax2.set_ylabel('Resource Utilization (%)', fontsize=20, color='black') + ax2.tick_params(axis='y', labelcolor='black') + ax2.set_ylim(0, 100) + ax2.set_yticks(np.arange(0, 120, 30)) + + # Display values on line points + for i, (x, y) in enumerate(zip(x_positions, line_data)): + ax2.text(x, y + max(line_data)*0.02, f'{y:.1f}', + ha='center', va='bottom', fontsize=10) + + # Set X-axis labels and grouping + ax1.set_xticks(x_positions) + ax1.set_xticklabels(x_labels, rotation=90) + + # Add group labels + group_positions = [3, 7, 11, 15] # Middle position of each group + for case, pos in zip(cases, group_positions): + ax1.text(pos, -0.15, (case.split('_'))[0] + 'Neura', transform=ax1.get_xaxis_transform(), + ha='center', va='top', fontsize=20, fontweight='bold', + bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgray', alpha=0.8)) + + ax1.grid(True, linestyle='--', alpha=0.3, axis='y') + plt.title('ExampleFig11') + plt.tight_layout() + # plt.legend() + plt.savefig(fig_path) + print(f"Generated fig {fig_path}") + +if __name__ == '__main__': + KERNEL_DATA = { + "fir.cpp": (7, 2048, 4096), + "latnrm.c": (8, 1280, 2560), + "fft.c": (2, 112640, 450560), + "dtw.cpp": (4, 16384, 49152), + "spmv.c": (3, 65536, 262144), + "conv.c": (1, 655360, 1310720), + "mvt.c": (5, 16384, 49152), + "gemm.c": (0, 2097152, 8388608), + "relu+histogram.c": (6, 262144, 2097152) + } + genFigs = SimulationDataAnalyzer(kernel_data=KERNEL_DATA) + genFigs.genFig9("./fig/Fig9Test.png") + #genFigs.genFig10("./fig/Fig10.png") + genFigs.genFig11("./fig/Fig11Test.png") \ No newline at end of file