@@ -496,46 +496,55 @@ <h2>Functions<a class="headerlink" href="#functions" title="Link to this heading
496496< tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.dec_max_nreg " title ="tilelang.language.builtin.dec_max_nreg "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> dec_max_nreg</ span > </ code > </ a > (reg_count)</ p > </ td >
497497< td > < p > Decrement the maximum number of registers to use.</ p > </ td >
498498</ tr >
499+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc " title ="tilelang.language.builtin.annotate_producer_reg_dealloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> annotate_producer_reg_dealloc</ span > </ code > </ a > ([reg_count])</ p > </ td >
500+ < td > < p > Annotate the producer reg dealloc.</ p > </ td >
501+ </ tr >
502+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc " title ="tilelang.language.builtin.annotate_consumer_reg_alloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> annotate_consumer_reg_alloc</ span > </ code > </ a > ([reg_count])</ p > </ td >
503+ < td > < p > Annotate the consumer reg alloc.</ p > </ td >
504+ </ tr >
499505< tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.no_set_max_nreg " title ="tilelang.language.builtin.no_set_max_nreg "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> no_set_max_nreg</ span > </ code > </ a > ()</ p > </ td >
500506< td > < p > Disable the maximum register limit setting.</ p > </ td >
501507</ tr >
502- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="tilelang.language.builtin.mbarrier_wait_parity "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity</ span > </ code > </ a > (mbarrier, parity)</ p > </ td >
508+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc " title ="tilelang.language.builtin.disable_warp_group_reg_alloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> disable_warp_group_reg_alloc</ span > </ code > </ a > ()</ p > </ td >
509+ < td > < p > Disable the warp group reg alloc.</ p > </ td >
510+ </ tr >
511+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="tilelang.language.builtin.mbarrier_wait_parity "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity</ span > </ code > </ a > (mbarrier, parity)</ p > </ td >
503512< td > < p > Wait for memory barrier parity condition.</ p > </ td >
504513</ tr >
505- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive " title ="tilelang.language.builtin.mbarrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_arrive</ span > </ code > </ a > (mbarrier)</ p > </ td >
514+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive " title ="tilelang.language.builtin.mbarrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_arrive</ span > </ code > </ a > (mbarrier)</ p > </ td >
506515< td > < p > Arrive at memory barrier.</ p > </ td >
507516</ tr >
508- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx " title ="tilelang.language.builtin.mbarrier_expect_tx "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx</ span > </ code > </ a > (*args)</ p > </ td >
517+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx " title ="tilelang.language.builtin.mbarrier_expect_tx "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx</ span > </ code > </ a > (*args)</ p > </ td >
509518< td > < p > Set expected transaction count for memory barrier.</ p > </ td >
510519</ tr >
511- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.wait_wgmma " title ="tilelang.language.builtin.wait_wgmma "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> wait_wgmma</ span > </ code > </ a > (id)</ p > </ td >
520+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.wait_wgmma " title ="tilelang.language.builtin.wait_wgmma "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> wait_wgmma</ span > </ code > </ a > (id)</ p > </ td >
512521< td > < p > Wait for WGMMA (Warp Group Matrix Multiply-Accumulate) operations to complete.</ p > </ td >
513522</ tr >
514- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_wait " title ="tilelang.language.builtin.barrier_wait "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_wait</ span > </ code > </ a > (barrier_id[, parity])</ p > </ td >
523+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_wait " title ="tilelang.language.builtin.barrier_wait "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_wait</ span > </ code > </ a > (barrier_id[, parity])</ p > </ td >
515524< td > < p > Wait for a memory barrier to complete.</ p > </ td >
516525</ tr >
517- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_arrive " title ="tilelang.language.builtin.barrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_arrive</ span > </ code > </ a > (barrier_id)</ p > </ td >
526+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_arrive " title ="tilelang.language.builtin.barrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_arrive</ span > </ code > </ a > (barrier_id)</ p > </ td >
518527< td > < p > Arrive at a memory barrier.</ p > </ td >
519528</ tr >
520- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_xor " title ="tilelang.language.builtin.shfl_xor "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_xor</ span > </ code > </ a > (value, offset)</ p > </ td >
529+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_xor " title ="tilelang.language.builtin.shfl_xor "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_xor</ span > </ code > </ a > (value, offset)</ p > </ td >
521530< td > < p > Perform a shuffle operation with XOR offset.</ p > </ td >
522531</ tr >
523- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_down " title ="tilelang.language.builtin.shfl_down "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_down</ span > </ code > </ a > (value, offset)</ p > </ td >
532+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_down " title ="tilelang.language.builtin.shfl_down "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_down</ span > </ code > </ a > (value, offset)</ p > </ td >
524533< td > < p > Perform a shuffle operation with down offset.</ p > </ td >
525534</ tr >
526- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_up " title ="tilelang.language.builtin.shfl_up "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_up</ span > </ code > </ a > (value, offset)</ p > </ td >
535+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_up " title ="tilelang.language.builtin.shfl_up "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_up</ span > </ code > </ a > (value, offset)</ p > </ td >
527536< td > < p > Perform a shuffle operation with up offset.</ p > </ td >
528537</ tr >
529- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_threads " title ="tilelang.language.builtin.sync_threads "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_threads</ span > </ code > </ a > ()</ p > </ td >
538+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_threads " title ="tilelang.language.builtin.sync_threads "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_threads</ span > </ code > </ a > ()</ p > </ td >
530539< td > < p > Synchronize all threads in a warp.</ p > </ td >
531540</ tr >
532- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_thread_partial " title ="tilelang.language.builtin.sync_thread_partial "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_thread_partial</ span > </ code > </ a > (barrier_id)</ p > </ td >
541+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_thread_partial " title ="tilelang.language.builtin.sync_thread_partial "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_thread_partial</ span > </ code > </ a > (barrier_id)</ p > </ td >
533542< td > < p > Synchronize threads within a warp.</ p > </ td >
534543</ tr >
535- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_global " title ="tilelang.language.builtin.sync_global "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_global</ span > </ code > </ a > ()</ p > </ td >
544+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_global " title ="tilelang.language.builtin.sync_global "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_global</ span > </ code > </ a > ()</ p > </ td >
536545< td > < p > Synchronize all threads in a block.</ p > </ td >
537546</ tr >
538- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_grid " title ="tilelang.language.builtin.sync_grid "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_grid</ span > </ code > </ a > ()</ p > </ td >
547+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_grid " title ="tilelang.language.builtin.sync_grid "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_grid</ span > </ code > </ a > ()</ p > </ td >
539548< td > < p > Synchronize all threads in a grid.</ p > </ td >
540549</ tr >
541550</ tbody >
@@ -718,12 +727,40 @@ <h2>Module Contents<a class="headerlink" href="#module-contents" title="Link to
718727</ dl >
719728</ dd > </ dl >
720729
730+ < dl class ="py function ">
731+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.annotate_producer_reg_dealloc ">
732+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> annotate_producer_reg_dealloc</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> reg_count</ span > </ span > < span class ="o "> < span class ="pre "> =</ span > </ span > < span class ="default_value "> < span class ="pre "> 24</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc " title ="Link to this definition "> ¶</ a > </ dt >
733+ < dd > < p > Annotate the producer reg dealloc.</ p >
734+ < dl class ="field-list simple ">
735+ < dt class ="field-odd "> Parameters< span class ="colon "> :</ span > </ dt >
736+ < dd class ="field-odd "> < p > < strong > reg_count</ strong > (< em > int</ em > )</ p >
737+ </ dd >
738+ </ dl >
739+ </ dd > </ dl >
740+
741+ < dl class ="py function ">
742+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.annotate_consumer_reg_alloc ">
743+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> annotate_consumer_reg_alloc</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> reg_count</ span > </ span > < span class ="o "> < span class ="pre "> =</ span > </ span > < span class ="default_value "> < span class ="pre "> 240</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc " title ="Link to this definition "> ¶</ a > </ dt >
744+ < dd > < p > Annotate the consumer reg alloc.</ p >
745+ < dl class ="field-list simple ">
746+ < dt class ="field-odd "> Parameters< span class ="colon "> :</ span > </ dt >
747+ < dd class ="field-odd "> < p > < strong > reg_count</ strong > (< em > int</ em > )</ p >
748+ </ dd >
749+ </ dl >
750+ </ dd > </ dl >
751+
721752< dl class ="py function ">
722753< dt class ="sig sig-object py " id ="tilelang.language.builtin.no_set_max_nreg ">
723754< span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> no_set_max_nreg</ span > </ span > < span class ="sig-paren "> (</ span > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.no_set_max_nreg " title ="Link to this definition "> ¶</ a > </ dt >
724755< dd > < p > Disable the maximum register limit setting.</ p >
725756</ dd > </ dl >
726757
758+ < dl class ="py function ">
759+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.disable_warp_group_reg_alloc ">
760+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> disable_warp_group_reg_alloc</ span > </ span > < span class ="sig-paren "> (</ span > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc " title ="Link to this definition "> ¶</ a > </ dt >
761+ < dd > < p > Disable the warp group reg alloc.</ p >
762+ </ dd > </ dl >
763+
727764< dl class ="py function ">
728765< dt class ="sig sig-object py " id ="tilelang.language.builtin.mbarrier_wait_parity ">
729766< span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> mbarrier_wait_parity</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> mbarrier</ span > </ span > </ em > , < em class ="sig-param "> < span class ="n "> < span class ="pre "> parity</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="Link to this definition "> ¶</ a > </ dt >
@@ -1023,7 +1060,10 @@ <h2>Module Contents<a class="headerlink" href="#module-contents" title="Link to
10231060< li > < a class ="reference internal " href ="#tilelang.language.builtin.set_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> set_max_nreg()</ span > </ code > </ a > </ li >
10241061< li > < a class ="reference internal " href ="#tilelang.language.builtin.inc_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> inc_max_nreg()</ span > </ code > </ a > </ li >
10251062< li > < a class ="reference internal " href ="#tilelang.language.builtin.dec_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> dec_max_nreg()</ span > </ code > </ a > </ li >
1063+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc "> < code class ="docutils literal notranslate "> < span class ="pre "> annotate_producer_reg_dealloc()</ span > </ code > </ a > </ li >
1064+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc "> < code class ="docutils literal notranslate "> < span class ="pre "> annotate_consumer_reg_alloc()</ span > </ code > </ a > </ li >
10261065< li > < a class ="reference internal " href ="#tilelang.language.builtin.no_set_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> no_set_max_nreg()</ span > </ code > </ a > </ li >
1066+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc "> < code class ="docutils literal notranslate "> < span class ="pre "> disable_warp_group_reg_alloc()</ span > </ code > </ a > </ li >
10271067< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity()</ span > </ code > </ a > </ li >
10281068< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_arrive()</ span > </ code > </ a > </ li >
10291069< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx()</ span > </ code > </ a > </ li >
0 commit comments