@@ -489,7 +489,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
             move_tile_window(k_dram_window, {kN0, 0});
             async_load_tile(v_lds_write_window, v_dram_window);
             move_tile_window(v_dram_window, {kN0, 0});
-            // __builtin_amdgcn_s_waitcnt(0);
+            s_waitcnt</* vmcnt= */ 0>();
             k_reg_tensor = load_tile(k_lds_read_window);
             v_reg_tensor = load_tile(v_lds_read_window);
             kt_reg_tensor = load_tile_transpose(kt_lds_read_window);
@@ -636,7 +636,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                 }
             }();
             store_tile(bias_lds_write_window, dbias);
-            __builtin_amdgcn_s_waitcnt(3952);
+            s_waitcnt</* vmcnt= */ 0>();
             block_sync_lds();
             auto shuffled_dbias_tile = load_tile(dbias_lds_read_window);
             auto dbias_tile = make_static_distributed_tensor<BiasGradDataType>(
@@ -664,7 +664,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
                 }
                 store_tile(ds_lds_window, ds_gemm);
             }
-            __builtin_amdgcn_s_waitcnt(3952);
+            s_waitcnt</* vmcnt= */ 0>();
             block_sync_lds();
             if constexpr (is_epilogue)
             {
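All three hunks replace a raw __builtin_amdgcn_s_waitcnt immediate with the templated s_waitcnt</* vmcnt= */ 0>() helper before the LDS is read back. The magic constant 3952 (0xF70) in the old code encodes vmcnt = 0 with expcnt and lgkmcnt left at their maximum, i.e. wait only for outstanding vector-memory operations, which is the same semantics the new named call expresses. As a minimal sketch only (not the library's actual implementation), such a wrapper could look roughly like the following, assuming the gfx9 SIMM16 layout where vmcnt occupies bits [3:0] and [15:14], expcnt bits [6:4], and lgkmcnt bits [11:8]:

    #include <hip/hip_runtime.h>

    // Sketch of a typed s_waitcnt wrapper; the real helper in the library may differ.
    template <int vmcnt>
    __device__ inline void s_waitcnt()
    {
        static_assert(vmcnt >= 0 && vmcnt < 64, "vmcnt is a 6-bit field on gfx9");
        // Keep expcnt (max 7) and lgkmcnt (max 15) unconstrained so that only
        // vector-memory operations are waited on. With vmcnt = 0 this evaluates
        // to 0xF70 == 3952, the immediate used by the old code.
        constexpr int simm16 =
            (vmcnt & 0xf) | ((vmcnt & 0x30) << 10) | (7 << 4) | (15 << 8);
        __builtin_amdgcn_s_waitcnt(simm16);
    }

Expressing the wait as s_waitcnt</* vmcnt= */ 0>() documents the intent (wait for the asynchronous global loads to land before reading the tiles from LDS) instead of relying on a hand-encoded immediate.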