2 changes: 1 addition & 1 deletion python/CuTeDSL/base_dsl/ast_helpers.py
@@ -413,7 +413,7 @@ def bool_cast(value):
if executor._is_dynamic_expression(value):
raise DSLRuntimeError(
"Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.",
suggestion = "Please explicitly convert to boolean with expressions like comparision."
suggestion = "Please explicitly convert to boolean with expressions like comparison."
)
return bool(value)

2 changes: 1 addition & 1 deletion python/CuTeDSL/base_dsl/ast_preprocessor.py
@@ -873,7 +873,7 @@ def _handle_negative_step(self, node, start_expr, stop_expr, step_expr):
extra_exprs.append(step)
extra_exprs.append(offset)

# Add this to begining of loop body
# Add this to beginning of loop body
# for i in range(start, stop, step):
# i = offset - i if isNegative else i
assert isinstance(node.target, ast.Name)
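The comment fixed above describes the preprocessor's rewrite of a negative-step range: iterate an equivalent positive-step range and remap the index at the top of the loop body (i = offset - i). A minimal standalone sketch of that equivalence, with an illustrative offset rather than the exact formula the preprocessor emits:

def remapped_negative_range(start, stop, step):
    # Yield the same values as range(start, stop, step) for step < 0,
    # but by iterating a positive-step range and remapping the index,
    # mirroring the `i = offset - i` line in the comment above.
    assert step < 0
    offset = start + stop  # illustrative choice; any fixed offset works
    for i in range(offset - start, offset - stop, -step):
        yield offset - i  # undo the flip, recovering the original index

assert list(remapped_negative_range(10, 0, -3)) == list(range(10, 0, -3))
assert list(remapped_negative_range(5, -1, -1)) == list(range(5, -1, -1))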
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/cute/arch/smem.py
@@ -77,7 +77,7 @@ def get_dyn_smem(
:param alignment: An optional pointer alignment, the result pointer is offset appropriately
:type alignment: int
:return: A pointer to the start of the dynamic SMEM allocation with a correct
alignement
alignment
:rtype: Pointer
"""
if not isinstance(element_type, NumericMeta):
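The docstring above says the returned pointer is offset so that it satisfies the requested alignment. The arithmetic behind such an offset is the usual round-up-to-a-power-of-two, sketched here on plain integer addresses rather than the DSL's Pointer type:

def align_up(addr: int, alignment: int) -> int:
    # Round addr up to the next multiple of alignment (a power of two).
    assert alignment > 0 and (alignment & (alignment - 1)) == 0
    return (addr + alignment - 1) & ~(alignment - 1)

assert align_up(0x1004, 16) == 0x1010  # misaligned base gets bumped forward
assert align_up(0x1000, 16) == 0x1000  # already aligned, zero offset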
14 changes: 7 additions & 7 deletions python/CuTeDSL/cutlass/cute/core.py
@@ -1717,7 +1717,7 @@ def print_tensor(tensor: Tensor, *, verbose: bool = False, loc=None, ip=None):


#
# Utilties
# Utilities
#


@@ -4282,7 +4282,7 @@ def flat_divide(target, tiler: Tiler, *, loc=None, ip=None):


#
# Higher-level utilties
# Higher-level utilities
#


@@ -6563,7 +6563,7 @@ class StorageA:
intA : cutlass.Int16


# Supports aligment for its elements:
# Supports alignment for its elements:
@cute.struct
class StorageB:
a: cute.struct.Align[
@@ -6738,7 +6738,7 @@ def __getitem__(cls, params) -> Any:
return new_obj
else:
raise TypeError(
"align only can be applied to sturct/MemRange/base_dsl scalar"
"align only can be applied to struct/MemRange/base_dsl scalar"
)

class Align(metaclass=_AlignMeta):
@@ -6814,10 +6814,10 @@ def add_offset(val):
offset = add_offset(object.__sizeof__())
else:
raise TypeError(
f"Struct element only support sturct/array/base_dsl scalar, "
f"Struct element only support struct/array/base_dsl scalar, "
f"but got {object}"
)
# Total aligment determined by the strictest requirement
# Total alignment determined by the strictest requirement
alignment = max(alignment, sub_align)
# Total size determined by alignment
self._align_of = alignment
@@ -6851,7 +6851,7 @@ def __call__(self, base: Any) -> None:
setattr(cls, name, new_obj)
else:
raise TypeError(
f"Struct element only support sturct/array/base_dsl scalar, "
f"Struct element only support struct/array/base_dsl scalar, "
f"but got {obj}"
)
return cls
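The fragments above (the Align wrapper and the "strictest requirement" comment) follow the standard aggregate-layout rule: each field starts at an offset rounded up to its own alignment, and the struct's overall alignment is the maximum over its fields. A simplified standalone sketch of that rule, not the cute.struct implementation itself:

def layout_fields(fields):
    # fields: list of (size, align) pairs; returns (offsets, total_size, total_align).
    offset, total_align, offsets = 0, 1, []
    for size, align in fields:
        offset = (offset + align - 1) // align * align  # pad up to the field's alignment
        offsets.append(offset)
        offset += size
        total_align = max(total_align, align)  # strictest requirement wins
    total_size = (offset + total_align - 1) // total_align * total_align
    return offsets, total_size, total_align

# An Int16 followed by a 16-byte-aligned block of four 4-byte floats:
assert layout_fields([(2, 2), (16, 16)]) == ([0, 16], 32, 16)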
8 changes: 4 additions & 4 deletions python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py
@@ -27,7 +27,7 @@

####################################################################################################
#
# Aynchronous copies
# Asynchronous copies
#
####################################################################################################

@@ -119,7 +119,7 @@ class CopyG2STrait(Trait):
@dataclass(frozen=True)
class CopyBulkTensorTileG2SOp(CopyOp):
"""
Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous GMEM to SMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
@@ -221,7 +221,7 @@ def unpack(
@dataclass(frozen=True)
class CopyBulkTensorTileG2SMulticastOp(CopyOp):
"""
Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous multicast GMEM to SMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
@@ -330,7 +330,7 @@ def unpack(
@dataclass(frozen=True)
class CopyBulkTensorTileS2GOp(CopyOp):
"""
Bulk tensor asynchrnous SMEM to GMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous SMEM to GMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py
@@ -257,7 +257,7 @@ def update_tma_descriptor(
:type tma_atom: CopyAtom
:param gmem_tensor: The GMEM tensor
:type gmem_tensor: Tensor
:param tensormap_ptr: The pointer to the memory location of the descriptor to udpate
:param tensormap_ptr: The pointer to the memory location of the descriptor to update
:type tensormap_ptr: Pointer
"""
_cute_nvgpu_ir.update_tma_desc(
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/pipeline/sm100.py
@@ -400,7 +400,7 @@ def create(
producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk)

if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1:
# Set mask to None if not using 2CTA intructions
# Set mask to None if not using 2CTA instructions
consumer_mask = None
else:
consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank()
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/torch.py
@@ -148,7 +148,7 @@ def convert_cute_tensor(
) -> Tensor:
"""
Change the value of the cute tensor to make its value converted from a fp32 torch tensor.
Used for fp8 types tensor creatation now.
Used for fp8 types tensor creation now.
"""
# if torch_tensor is on cpu, create a gpu copy
if f32_torch_tensor.device.type == "cpu":
6 changes: 3 additions & 3 deletions python/CuTeDSL/cutlass/utils/README.md
@@ -1,9 +1,9 @@
# Utilities

This folder contains various utilties for kernel authoring. Specifically, the implementation of the
followings can be considered experimental and subject to breaking changes:
This folder contains various utilities for kernel authoring. Specifically, the implementation of the
following can be considered experimental and subject to breaking changes:

- static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py)
- pipeline abstractions defined in [`pipeline.py`](./pipeline.py)
- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py)
- grouped GEMM utilities defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py)
and [`tensormap_manager.py`](./tensormap_manager.py)
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass_dsl/cutlass.py
@@ -675,7 +675,7 @@ def count_values(args):


# =============================================================================
# DSL implementation of Python Build-in Operators
# DSL implementation of Python Built-in Operators
# =============================================================================


2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py
@@ -194,7 +194,7 @@ def scf_execute_dynamic(
original_idx = unpacked_idx
break
raise DSLRuntimeError(
f"`{op_type_name}` expects {expected_type} type for varible `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.",
f"`{op_type_name}` expects {expected_type} type for variable `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.",
suggestion=f"Please make sure `{mix_iter_arg_names[original_idx]}` type is not changed inside of `{op_type_name}`.",
)
scf.YieldOp(region_values)
2 changes: 1 addition & 1 deletion python/cutlass/backend/compiler.py
@@ -349,7 +349,7 @@ def emit_compile_(self, operation_list, compilation_options, host_compilation_op
cmd.extend(host_compilation_options.get_str().split(" "))
cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])

# Comile and load the library
# Compile and load the library
compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
host_lib = ctypes.CDLL(temp_dst.name)

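The corrected comment sits next to a pattern worth spelling out: build a shared library with nvcc and open it with ctypes. A rough standalone sketch of that pattern, with illustrative paths and flags rather than the options emit_compile_ actually assembles:

import ctypes
import subprocess
import tempfile

def compile_and_load(cuda_source: str, arch: str = "sm_80") -> ctypes.CDLL:
    # Write the source to a temp file, compile it into a shared object, load it.
    src = tempfile.NamedTemporaryFile(suffix=".cu", delete=False)
    src.write(cuda_source.encode())
    src.flush()
    dst = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    cmd = ["nvcc", f"-arch={arch}", "-Xcompiler", "-fPIC", "-shared",
           "-o", dst.name, src.name, "-lcudart", "-lcuda"]
    subprocess.run(cmd, check=True)  # raises CalledProcessError on a failed compile
    return ctypes.CDLL(dst.name)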
@@ -53,7 +53,7 @@ def __init__(self, dag_ir: DAGIR) -> None:

def call(self):
self.layout_nodes_worklist = self.get_all_layout_nodes()
# Run while loop utill all layout nodes are eliminated
# Run while loop until all layout nodes are eliminated
while(len(self.layout_nodes_worklist) > 0):
node = self.layout_nodes_worklist.pop(0)
# for node in layout_nodes:
6 changes: 3 additions & 3 deletions python/cutlass/backend/gemm_operation.py
@@ -113,7 +113,7 @@

def leading_dimension(layout: LayoutType, shape: MatrixCoord) -> int:
"""
Returns the leading dimenson of a tensor with layout ``layout`` and shape ``shape``.
Returns the leading dimension of a tensor with layout ``layout`` and shape ``shape``.

:param layout: layout of the tensor
:type layout: cutlass_cppgen.shape.LayoutType
@@ -1510,7 +1510,7 @@ def __init__(

# Optionally swap the TensorDescriptions for operands A and B and transpose their
# layouts. This is needed to mimic the transpose performed by device::GemmUniversal.
# The code below uses deep copy to avoid overwritting the original TensorDescription
# The code below uses deep copy to avoid overwriting the original TensorDescription
self.switched = (self.api != ApiVersion.v3x and
self.emission_type == EmissionType.Kernel and
C.layout == LayoutType.ColumnMajor)
@@ -1775,7 +1775,7 @@ def __init__(self, arch, tile_description: TileDescription, A: TensorDescription
epilogue_functor, swizzling_functor=SwizzlingFunctor.Identity1, **kwargs):
super(GemmOperationGrouped, self).__init__(GemmKind.Grouped, arch, tile_description,
A, B, C, epilogue_functor, swizzling_functor, **kwargs)
assert "precompute_mode" in kwargs.keys(), "missing keyword arguement 'precompute_mode'."
assert "precompute_mode" in kwargs.keys(), "missing keyword argument 'precompute_mode'."
self.precompute_mode = kwargs["precompute_mode"]
self.rt_module = GemmRTGrouped(self)
self.argument_type = self.rt_module.argument_type
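For a 2-D tensor, the leading dimension that leading_dimension computes is the distance between consecutive rows (row-major) or consecutive columns (column-major). A hedged sketch of that rule, using plain strings in place of cutlass_cppgen's LayoutType enum:

def leading_dimension(layout: str, shape: tuple) -> int:
    # shape is (rows, columns); layout is "row" or "column" in this sketch.
    rows, columns = shape
    if layout == "row":
        return columns  # consecutive rows are `columns` elements apart
    if layout == "column":
        return rows     # consecutive columns are `rows` elements apart
    raise ValueError(f"unsupported layout: {layout}")

assert leading_dimension("row", (128, 64)) == 64
assert leading_dimension("column", (128, 64)) == 128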
2 changes: 1 addition & 1 deletion python/cutlass/backend/library.py
@@ -258,7 +258,7 @@ def __init__(
"""
:param threadblock_shape: shape of a threadblock tyle
:type threadblock_shape: list or tuple
:param stages: number of pipline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
:param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
number of stages that can be supported for an operation on a given architecture will be computed at a later time
:type stages: int or None
:param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile
2 changes: 1 addition & 1 deletion python/cutlass/backend/reduction_operation.py
@@ -377,7 +377,7 @@ def configuration_name(self):
)

def procedural_name(self):
"""The full procedural name indicates architeture, extended name, tile size"""
"""The full procedural name indicates architecture, extended name, tile size"""
return self.configuration_name()

def run(self, arguments: ReductionArguments) -> cuda.CUresult:
2 changes: 1 addition & 1 deletion python/cutlass/backend/utils/device.py
@@ -93,7 +93,7 @@ def device_sm_count(device: int = -1):
)
if err != cuda.CUresult.CUDA_SUCCESS:
raise Exception(
"Failed to retireve SM count. "
"Failed to retrieve SM count. "
f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}"
)

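The snippet above retrieves the SM count through the cuda-python driver bindings. A rough sketch of that query on its own, assuming the cuda package is installed (error handling reduced to asserts, unlike the exception raised in device_sm_count):

from cuda import cuda

def sm_count(device_ordinal: int = 0) -> int:
    # Query the multiprocessor count of one device via the CUDA driver API.
    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS
    err, dev = cuda.cuDeviceGet(device_ordinal)
    assert err == cuda.CUresult.CUDA_SUCCESS
    err, count = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev
    )
    assert err == cuda.CUresult.CUDA_SUCCESS
    return count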
4 changes: 2 additions & 2 deletions python/cutlass/emit/pytorch.py
@@ -622,15 +622,15 @@ class _ArchListSetter:
Utility context manager for temporarily setting the value of the ``TORCH_CUDA_ARCH_LIST``
environment variable when building a PyTorch CUDA module.

``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilites for which a PyTorch
``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilities for which a PyTorch
CUDA module should be compiled.

For example, ``TORCH_CUDA_ARCH_LIST="7.0 8.0"`` would result in the inclusion of
``-gencode=arch=compute_70,code=sm_70`` and ``-gencode=arch=compute_80,code=sm_80`` in the
compilation of the module.

This utility wraps the building of a PyTorch CUDA module with a setting of this environment
variable according to the current compute capability being targetted.
variable according to the current compute capability being targeted.

Example usage:

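Since the usage example is collapsed in this view, here is a rough sketch of the idea _ArchListSetter describes: pin TORCH_CUDA_ARCH_LIST for the duration of a build and restore the previous value afterwards. The names below are illustrative, not the class's actual interface:

import os
from contextlib import contextmanager

@contextmanager
def torch_arch_list(arch_list: str):
    # e.g. torch_arch_list("7.0 8.0") makes the PyTorch extension build emit
    # -gencode=arch=compute_70,code=sm_70 and -gencode=arch=compute_80,code=sm_80.
    key = "TORCH_CUDA_ARCH_LIST"
    previous = os.environ.get(key)
    os.environ[key] = arch_list
    try:
        yield
    finally:
        if previous is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = previous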
2 changes: 1 addition & 1 deletion python/cutlass/epilogue/epilogue.py
@@ -122,7 +122,7 @@ def trace(fn, example_tensors, **kwargs):
:param example_tensors: example inputs for fn
:type example_tensors: dict

.. hightlight:: python
.. highlight:: python
.. code-block:: python
import cutlass_cppgen.backend.evt

6 changes: 3 additions & 3 deletions python/cutlass/op/conv.py
@@ -183,7 +183,7 @@ class Conv2d(OperationBase):
:param B: tensor representing data type of operand B
:param C: tensor representing data type of operand C
:param D: tensor representing data type of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type
:type element: cutlass_cppgen.DataType
@@ -749,7 +749,7 @@ def run(self, A=None, B=None, C=None, D=None,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: tensor representing data type and layout of operand A
@@ -759,7 +759,7 @@ def run(self, A=None, B=None, C=None, D=None,
:param stride: (stride_h, stride_w) describing the convolution stride. Default: (1, 1)
:param padding: (pad_h, pad_w) describing the convolution padding. Default: (0, 0)
:param dilation: (dilation_h, dilation_w) describing the dilation of convolution. Default: (1, 1)
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param split_k: a tuple (split_k_mode, split_k_slices)
:param sync: whether the call should wait for the kernel to complete before returning
6 changes: 3 additions & 3 deletions python/cutlass/op/gemm.py
@@ -189,7 +189,7 @@ class Gemm(OperationBase):
:param B: tensor representing data type and layout of operand B
:param C: tensor representing data type and layout of operand C
:param D: tensor representing data type and layout of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element_accumulator: data type to be used in accumulation of the product of operands A and B
:type element_accumulator: cutlass_cppgen.DataType
@@ -635,14 +635,14 @@ def run(self, A=None, B=None, C=None, D=None,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: tensor representing data type and layout of operand A
:param B: tensor representing data type and layout of operand B
:param C: tensor representing data type and layout of operand C
:param D: tensor representing data type and layout of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param sync: whether the call should wait for the kernel to complete before returning
:type sync: bool
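The alpha and beta parameters documented above follow the standard GEMM epilogue D = alpha * (A @ B) + beta * C. A quick NumPy reference of that scaling convention:

import numpy as np

def gemm_reference(A, B, C, alpha=1.0, beta=0.0):
    # alpha scales the product of operands A and B; beta scales operand C.
    return alpha * (A @ B) + beta * C

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 8))
B = rng.standard_normal((8, 3))
C = rng.standard_normal((4, 3))
D = gemm_reference(A, B, C, alpha=2.0, beta=0.5)
assert D.shape == (4, 3)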
6 changes: 3 additions & 3 deletions python/cutlass/op/gemm_grouped.py
@@ -87,7 +87,7 @@ class GroupedGemm(Gemm):
:param B: tensor representing data type and layout of operands B
:param C: tensor representing data type and layout of operands C
:param D: tensor representing data type and layout of operands D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element_accumulator: data type to be used in accumulation of the product of operands A and B
:type element_accumulator: cutlass_cppgen.DataType
@@ -204,7 +204,7 @@ def run(self, A, B, C, D,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: list of tensors representing data type and layout of operand A
@@ -215,7 +215,7 @@ def run(self, A, B, C, D,
:type C: list
:param D: list of tensors representing data type and layout of operand D
:type D: list
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param sync: whether the call should wait for the kernel to complete before returning
:type sync: bool
2 changes: 1 addition & 1 deletion python/cutlass/op/op.py
@@ -424,7 +424,7 @@ def epilogue_visitor(self, visitor):

def run_setup(self):
"""
Steps that must be taken before caling `plan.run()`
Steps that must be taken before calling `plan.run()`
"""
# Initialize the memory pool if, if not already done
cutlass_cppgen.get_memory_pool()
2 changes: 1 addition & 1 deletion python/cutlass/utils/check.py
@@ -138,7 +138,7 @@ def valid_stage_count(
f"Details:\n"
f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and "
f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n"
f"The maxmium amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")
f"The maximum amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")

return (True, "")

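The reworded error message reflects a straightforward capacity check: the mainloop's shared memory per stage times the stage count has to fit in the per-block shared memory of the target compute capability. A hedged sketch of that arithmetic (a simplification of valid_stage_count, which works from the full TileDescription):

def mainloop_stages_fit(smem_per_stage: int, stages: int, smem_arch: int):
    # Returns (ok, message) in the spirit of the check quoted above.
    smem_usage_mainloop = smem_per_stage * stages
    if smem_usage_mainloop > smem_arch:
        return False, (f"{stages} stages x {smem_per_stage} bytes = "
                       f"{smem_usage_mainloop} bytes exceeds the "
                       f"{smem_arch} bytes available per block")
    return True, ""

# e.g. CC 90 allows up to 227 KB of shared memory per block
assert not mainloop_stages_fit(32 * 1024, 8, 227 * 1024)[0]
assert mainloop_stages_fit(16 * 1024, 8, 227 * 1024)[0]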