Avoid an unnecessary copy of the per-block scale values in the for loop.

solrex · solrex · commit 2b2a88b03604 · 2025-05-26T19:37:38.000+08:00
diff --git a/examples/85_ada_ampere_gemm_with_blockwise_scaling/85a_ada_fp8_gemm_with_groupwise_scaling_cute.cu b/examples/85_ada_ampere_gemm_with_blockwise_scaling/85a_ada_fp8_gemm_with_groupwise_scaling_cute.cu
@@ -248,7 +248,7 @@ struct Options {
   float alpha = 1.f, beta = 0.f;
   int iterations = 1000;
   int warmup = 1000;
-  int m = 1024, n = 512, k = 1024, l = 1;
+  int m = 1024, n = 1024, k = 1024, l = 1;
   float epsilon = 0.02f;
   float non_zero_floor = 1.f;
 
diff --git a/examples/85_ada_ampere_gemm_with_blockwise_scaling/85b_ada_fp8_gemm_with_blockwise_scaling_cute.cu b/examples/85_ada_ampere_gemm_with_blockwise_scaling/85b_ada_fp8_gemm_with_blockwise_scaling_cute.cu
@@ -250,7 +250,7 @@ struct Options {
   float alpha = 1.f, beta = 0.f;
   int iterations = 1000;
   int warmup = 1000;
-  int m = 1024, n = 512, k = 1024, l = 1;
+  int m = 1024, n = 1024, k = 1024, l = 1;
   float epsilon = 0.02f;
   float non_zero_floor = 1.f;
 
diff --git a/examples/85_ada_ampere_gemm_with_blockwise_scaling/85c_ampere_int8_gemm_with_groupwise_scaling_cute.cu b/examples/85_ada_ampere_gemm_with_blockwise_scaling/85c_ampere_int8_gemm_with_groupwise_scaling_cute.cu
@@ -250,7 +250,7 @@ struct Options {
   float alpha = 1.f, beta = 0.f;
   int iterations = 1000;
   int warmup = 1000;
-  int m = 1024, n = 512, k = 1024, l = 1;
+  int m = 1024, n = 1024, k = 1024, l = 1;
   float epsilon = 0.02f;
   float non_zero_floor = 1.f;
 
diff --git a/examples/85_ada_ampere_gemm_with_blockwise_scaling/85d_ampere_int8_gemm_with_blockwise_scaling_cute.cu b/examples/85_ada_ampere_gemm_with_blockwise_scaling/85d_ampere_int8_gemm_with_blockwise_scaling_cute.cu
@@ -251,7 +251,7 @@ struct Options {
   float alpha = 1.f, beta = 0.f;
   int iterations = 1000;
   int warmup = 1000;
-  int m = 1024, n = 512, k = 1024, l = 1;
+  int m = 1024, n = 1024, k = 1024, l = 1;
   float epsilon = 0.02f;
   float non_zero_floor = 1.f;
 
diff --git a/include/cutlass/gemm/collective/sm80_mma_multistage_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm80_mma_multistage_blockwise_scaling.hpp
@@ -454,14 +454,14 @@ struct CollectiveMma<
       // Prefetch the first rmem from the first k-tile
       copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
       copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
-      // Load per block scale values from shared memory to registers
-      copy(tCsSFA(_,_,_,make_coord(_0{}, _0{})), tCrSFA);
-      copy(tCsSFB(_,_,_,make_coord(_0{}, _0{})), tCrSFB);
     }
 
     CUTLASS_PRAGMA_NO_UNROLL
     for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)
     {
+      // Load per block scale values from shared memory to registers
+      copy(tCsSFA(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFA);
+      copy(tCsSFB(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFB);
       // Pipeline the outer products with a static for loop.
       //
       // Note, the for_each() function is required here to ensure `k_block` is of type Int<N>.
@@ -552,9 +552,6 @@ struct CollectiveMma<
           tCrAccum(i) = 0;
         }
       }
-      // Load per block scale values from shared memory to registers
-      copy(tCsSFA(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFA);
-      copy(tCsSFB(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFB);
     }
 
     cp_async_wait<0>();

Original file line number	Diff line number	Diff line change
`@@ -454,14 +454,14 @@ struct CollectiveMma<`
`454`	`454`	`// Prefetch the first rmem from the first k-tile`
`455`	`455`	`copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));`
`456`	`456`	`copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));`
`457`		`- // Load per block scale values from shared memory to registers`
`458`		`- copy(tCsSFA(_,_,_,make_coord(_0{}, _0{})), tCrSFA);`
`459`		`- copy(tCsSFB(_,_,_,make_coord(_0{}, _0{})), tCrSFB);`
`460`	`457`	`}`
`461`	`458`
`462`	`459`	`CUTLASS_PRAGMA_NO_UNROLL`
`463`	`460`	`for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)`
`464`	`461`	`{`
	`462`	`+ // Load per block scale values from shared memory to registers`
	`463`	`+ copy(tCsSFA(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFA);`
	`464`	`+ copy(tCsSFB(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFB);`
`465`	`465`	`// Pipeline the outer products with a static for loop.`
`466`	`466`	`//`
`467`	`467`	``// Note, the for_each() function is required here to ensure `k_block` is of type Int<N>.``
`@@ -552,9 +552,6 @@ struct CollectiveMma<`
`552`	`552`	`tCrAccum(i) = 0;`
`553`	`553`	`}`
`554`	`554`	`}`
`555`		`- // Load per block scale values from shared memory to registers`
`556`		`- copy(tCsSFA(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFA);`
`557`		`- copy(tCsSFB(_,_,_,make_coord(_0{}, smem_pipe_read)), tCrSFB);`
`558`	`555`	`}`
`559`	`556`
`560`	`557`	`cp_async_wait<0>();`