Skip to content

Commit 6cf3fdd

Browse files
authored
[CK_TILE] FMHA BWD Fix Decode Accuracy (#2881)
* [CK_TILE] FMHA BWD Fix Decode Accuracy
* use s_waitcnt utils
1 parent 86dd59c commit 6cf3fdd

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -489,7 +489,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
489 489   move_tile_window(k_dram_window, {kN0, 0});
490 490   async_load_tile(v_lds_write_window, v_dram_window);
491 491   move_tile_window(v_dram_window, {kN0, 0});
492     - // __builtin_amdgcn_s_waitcnt(0);
    492 + s_waitcnt</*vmcnt=*/0>();
493 493   k_reg_tensor = load_tile(k_lds_read_window);
494 494   v_reg_tensor = load_tile(v_lds_read_window);
495 495   kt_reg_tensor = load_tile_transpose(kt_lds_read_window);
@@ -636,7 +636,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
636 636   }
637 637   }();
638 638   store_tile(bias_lds_write_window, dbias);
639     - __builtin_amdgcn_s_waitcnt(3952);
    639 + s_waitcnt</*vmcnt=*/0>();
640 640   block_sync_lds();
641 641   auto shuffled_dbias_tile = load_tile(dbias_lds_read_window);
642 642   auto dbias_tile = make_static_distributed_tensor<BiasGradDataType>(
@@ -664,7 +664,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
664 664   }
665 665   store_tile(ds_lds_window, ds_gemm);
666 666   }
667     - __builtin_amdgcn_s_waitcnt(3952);
    667 + s_waitcnt</*vmcnt=*/0>();
668 668   block_sync_lds();
669 669   if constexpr(is_epilogue)
670 670   {

0 commit comments

Comments
 (0)