2 changes: 1 addition & 1 deletion python/CuTeDSL/base_dsl/ast_helpers.py
@@ -413,7 +413,7 @@ def bool_cast(value):
if executor._is_dynamic_expression(value):
raise DSLRuntimeError(
"Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.",
suggestion = "Please explicitly convert to boolean with expressions like comparision."
suggestion = "Please explicitly convert to boolean with expressions like comparison."
)
return bool(value)

2 changes: 1 addition & 1 deletion python/CuTeDSL/base_dsl/ast_preprocessor.py
@@ -873,7 +873,7 @@ def _handle_negative_step(self, node, start_expr, stop_expr, step_expr):
extra_exprs.append(step)
extra_exprs.append(offset)

# Add this to begining of loop body
# Add this to beginning of loop body
# for i in range(start, stop, step):
# i = offset - i if isNegative else i
assert isinstance(node.target, ast.Name)
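The comment fixed above describes the preprocessor's rewrite of a negative-step range: iterate an equivalent positive-step range and remap the index at the top of the loop body (i = offset - i). A minimal standalone sketch of that equivalence, with an illustrative offset rather than the exact formula the preprocessor emits:

def remapped_negative_range(start, stop, step):
    # Yield the same values as range(start, stop, step) for step < 0,
    # but by iterating a positive-step range and remapping the index,
    # mirroring the `i = offset - i` line in the comment above.
    assert step < 0
    offset = start + stop  # illustrative choice; any fixed offset works
    for i in range(offset - start, offset - stop, -step):
        yield offset - i  # undo the flip, recovering the original index

assert list(remapped_negative_range(10, 0, -3)) == list(range(10, 0, -3))
assert list(remapped_negative_range(5, -1, -1)) == list(range(5, -1, -1))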
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/cute/arch/smem.py
@@ -77,7 +77,7 @@ def get_dyn_smem(
:param alignment: An optional pointer alignment, the result pointer is offset appropriately
:type alignment: int
:return: A pointer to the start of the dynamic SMEM allocation with a correct
alignement
alignment
:rtype: Pointer
"""
if not isinstance(element_type, NumericMeta):
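The docstring above says the returned pointer is offset so that it satisfies the requested alignment. The arithmetic behind such an offset is the usual round-up-to-a-power-of-two, sketched here on plain integer addresses rather than the DSL's Pointer type:

def align_up(addr: int, alignment: int) -> int:
    # Round addr up to the next multiple of alignment (a power of two).
    assert alignment > 0 and (alignment & (alignment - 1)) == 0
    return (addr + alignment - 1) & ~(alignment - 1)

assert align_up(0x1004, 16) == 0x1010  # misaligned base gets bumped forward
assert align_up(0x1000, 16) == 0x1000  # already aligned, zero offset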
14 changes: 7 additions & 7 deletions python/CuTeDSL/cutlass/cute/core.py
@@ -1717,7 +1717,7 @@ def print_tensor(tensor: Tensor, *, verbose: bool = False, loc=None, ip=None):


#
# Utilties
# Utilities
#


@@ -4282,7 +4282,7 @@ def flat_divide(target, tiler: Tiler, *, loc=None, ip=None):


#
# Higher-level utilties
# Higher-level utilities
#


@@ -6563,7 +6563,7 @@ class StorageA:
intA : cutlass.Int16


# Supports aligment for its elements:
# Supports alignment for its elements:
@cute.struct
class StorageB:
a: cute.struct.Align[
@@ -6738,7 +6738,7 @@ def __getitem__(cls, params) -> Any:
return new_obj
else:
raise TypeError(
"align only can be applied to sturct/MemRange/base_dsl scalar"
"align only can be applied to struct/MemRange/base_dsl scalar"
)

class Align(metaclass=_AlignMeta):
@@ -6814,10 +6814,10 @@ def add_offset(val):
offset = add_offset(object.__sizeof__())
else:
raise TypeError(
f"Struct element only support sturct/array/base_dsl scalar, "
f"Struct element only support struct/array/base_dsl scalar, "
f"but got {object}"
)
# Total aligment determined by the strictest requirement
# Total alignment determined by the strictest requirement
alignment = max(alignment, sub_align)
# Total size determined by alignment
self._align_of = alignment
@@ -6851,7 +6851,7 @@ def __call__(self, base: Any) -> None:
setattr(cls, name, new_obj)
else:
raise TypeError(
f"Struct element only support sturct/array/base_dsl scalar, "
f"Struct element only support struct/array/base_dsl scalar, "
f"but got {obj}"
)
return cls
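The fragments above (the Align wrapper and the "strictest requirement" comment) follow the standard aggregate-layout rule: each field starts at an offset rounded up to its own alignment, and the struct's overall alignment is the maximum over its fields. A simplified standalone sketch of that rule, not the cute.struct implementation itself:

def layout_fields(fields):
    # fields: list of (size, align) pairs; returns (offsets, total_size, total_align).
    offset, total_align, offsets = 0, 1, []
    for size, align in fields:
        offset = (offset + align - 1) // align * align  # pad up to the field's alignment
        offsets.append(offset)
        offset += size
        total_align = max(total_align, align)  # strictest requirement wins
    total_size = (offset + total_align - 1) // total_align * total_align
    return offsets, total_size, total_align

# An Int16 followed by a 16-byte-aligned block of four 4-byte floats:
assert layout_fields([(2, 2), (16, 16)]) == ([0, 16], 32, 16)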
8 changes: 4 additions & 4 deletions python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py
@@ -27,7 +27,7 @@

####################################################################################################
#
# Aynchronous copies
# Asynchronous copies
#
####################################################################################################

@@ -119,7 +119,7 @@ class CopyG2STrait(Trait):
@dataclass(frozen=True)
class CopyBulkTensorTileG2SOp(CopyOp):
"""
Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous GMEM to SMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
@@ -221,7 +221,7 @@ def unpack(
@dataclass(frozen=True)
class CopyBulkTensorTileG2SMulticastOp(CopyOp):
"""
Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous multicast GMEM to SMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
@@ -330,7 +330,7 @@ def unpack(
@dataclass(frozen=True)
class CopyBulkTensorTileS2GOp(CopyOp):
"""
Bulk tensor asynchrnous SMEM to GMEM Copy Operation using the TMA unit.
Bulk tensor asynchronous SMEM to GMEM Copy Operation using the TMA unit.

See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
This Operation uses TMA in the ``.tile`` mode.
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py
@@ -257,7 +257,7 @@ def update_tma_descriptor(
:type tma_atom: CopyAtom
:param gmem_tensor: The GMEM tensor
:type gmem_tensor: Tensor
:param tensormap_ptr: The pointer to the memory location of the descriptor to udpate
:param tensormap_ptr: The pointer to the memory location of the descriptor to update
:type tensormap_ptr: Pointer
"""
_cute_nvgpu_ir.update_tma_desc(
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/pipeline/sm100.py
@@ -400,7 +400,7 @@ def create(
producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk)

if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1:
# Set mask to None if not using 2CTA intructions
# Set mask to None if not using 2CTA instructions
consumer_mask = None
else:
consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank()
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass/torch.py
@@ -148,7 +148,7 @@ def convert_cute_tensor(
) -> Tensor:
"""
Change the value of the cute tensor to make its value converted from a fp32 torch tensor.
Used for fp8 types tensor creatation now.
Used for fp8 types tensor creation now.
"""
# if torch_tensor is on cpu, create a gpu copy
if f32_torch_tensor.device.type == "cpu":
6 changes: 3 additions & 3 deletions python/CuTeDSL/cutlass/utils/README.md
@@ -1,9 +1,9 @@
# Utilities

This folder contains various utilties for kernel authoring. Specifically, the implementation of the
followings can be considered experimental and subject to breaking changes:
This folder contains various utilities for kernel authoring. Specifically, the implementation of the
following can be considered experimental and subject to breaking changes:

- static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py)
- pipeline abstractions defined in [`pipeline.py`](./pipeline.py)
- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py)
- grouped GEMM utilities defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py)
and [`tensormap_manager.py`](./tensormap_manager.py)
2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass_dsl/cutlass.py
@@ -675,7 +675,7 @@ def count_values(args):


# =============================================================================
# DSL implementation of Python Build-in Operators
# DSL implementation of Python Built-in Operators
# =============================================================================


2 changes: 1 addition & 1 deletion python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py
@@ -194,7 +194,7 @@ def scf_execute_dynamic(
original_idx = unpacked_idx
break
raise DSLRuntimeError(
f"`{op_type_name}` expects {expected_type} type for varible `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.",
f"`{op_type_name}` expects {expected_type} type for variable `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.",
suggestion=f"Please make sure `{mix_iter_arg_names[original_idx]}` type is not changed inside of `{op_type_name}`.",
)
scf.YieldOp(region_values)
2 changes: 1 addition & 1 deletion python/cutlass/backend/compiler.py
@@ -349,7 +349,7 @@ def emit_compile_(self, operation_list, compilation_options, host_compilation_op
cmd.extend(host_compilation_options.get_str().split(" "))
cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])

# Comile and load the library
# Compile and load the library
compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
host_lib = ctypes.CDLL(temp_dst.name)

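The corrected comment sits next to a pattern worth spelling out: build a shared library with nvcc and open it with ctypes. A rough standalone sketch of that pattern, with illustrative paths and flags rather than the options emit_compile_ actually assembles:

import ctypes
import subprocess
import tempfile

def compile_and_load(cuda_source: str, arch: str = "sm_80") -> ctypes.CDLL:
    # Write the source to a temp file, compile it into a shared object, load it.
    src = tempfile.NamedTemporaryFile(suffix=".cu", delete=False)
    src.write(cuda_source.encode())
    src.flush()
    dst = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    cmd = ["nvcc", f"-arch={arch}", "-Xcompiler", "-fPIC", "-shared",
           "-o", dst.name, src.name, "-lcudart", "-lcuda"]
    subprocess.run(cmd, check=True)  # raises CalledProcessError on a failed compile
    return ctypes.CDLL(dst.name)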
@@ -53,7 +53,7 @@ def __init__(self, dag_ir: DAGIR) -> None:

def call(self):
self.layout_nodes_worklist = self.get_all_layout_nodes()
# Run while loop utill all layout nodes are eliminated
# Run while loop until all layout nodes are eliminated
while(len(self.layout_nodes_worklist) > 0):
node = self.layout_nodes_worklist.pop(0)
# for node in layout_nodes:
6 changes: 3 additions & 3 deletions python/cutlass/backend/gemm_operation.py
@@ -113,7 +113,7 @@

def leading_dimension(layout: LayoutType, shape: MatrixCoord) -> int:
"""
Returns the leading dimenson of a tensor with layout ``layout`` and shape ``shape``.
Returns the leading dimension of a tensor with layout ``layout`` and shape ``shape``.

:param layout: layout of the tensor
:type layout: cutlass_cppgen.shape.LayoutType
@@ -1510,7 +1510,7 @@ def __init__(

# Optionally swap the TensorDescriptions for operands A and B and transpose their
# layouts. This is needed to mimic the transpose performed by device::GemmUniversal.
# The code below uses deep copy to avoid overwritting the original TensorDescription
# The code below uses deep copy to avoid overwriting the original TensorDescription
self.switched = (self.api != ApiVersion.v3x and
self.emission_type == EmissionType.Kernel and
C.layout == LayoutType.ColumnMajor)
@@ -1775,7 +1775,7 @@ def __init__(self, arch, tile_description: TileDescription, A: TensorDescription
epilogue_functor, swizzling_functor=SwizzlingFunctor.Identity1, **kwargs):
super(GemmOperationGrouped, self).__init__(GemmKind.Grouped, arch, tile_description,
A, B, C, epilogue_functor, swizzling_functor, **kwargs)
assert "precompute_mode" in kwargs.keys(), "missing keyword arguement 'precompute_mode'."
assert "precompute_mode" in kwargs.keys(), "missing keyword argument 'precompute_mode'."
self.precompute_mode = kwargs["precompute_mode"]
self.rt_module = GemmRTGrouped(self)
self.argument_type = self.rt_module.argument_type
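For a 2-D tensor, the leading dimension that leading_dimension computes is the distance between consecutive rows (row-major) or consecutive columns (column-major). A hedged sketch of that rule, using plain strings in place of cutlass_cppgen's LayoutType enum:

def leading_dimension(layout: str, shape: tuple) -> int:
    # shape is (rows, columns); layout is "row" or "column" in this sketch.
    rows, columns = shape
    if layout == "row":
        return columns  # consecutive rows are `columns` elements apart
    if layout == "column":
        return rows     # consecutive columns are `rows` elements apart
    raise ValueError(f"unsupported layout: {layout}")

assert leading_dimension("row", (128, 64)) == 64
assert leading_dimension("column", (128, 64)) == 128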
2 changes: 1 addition & 1 deletion python/cutlass/backend/library.py
@@ -258,7 +258,7 @@ def __init__(
"""
:param threadblock_shape: shape of a threadblock tyle
:type threadblock_shape: list or tuple
:param stages: number of pipline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
:param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
number of stages that can be supported for an operation on a given architecture will be computed at a later time
:type stages: int or None
:param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile
2 changes: 1 addition & 1 deletion python/cutlass/backend/reduction_operation.py
@@ -377,7 +377,7 @@ def configuration_name(self):
)

def procedural_name(self):
"""The full procedural name indicates architeture, extended name, tile size"""
"""The full procedural name indicates architecture, extended name, tile size"""
return self.configuration_name()

def run(self, arguments: ReductionArguments) -> cuda.CUresult:
2 changes: 1 addition & 1 deletion python/cutlass/backend/utils/device.py
@@ -93,7 +93,7 @@ def device_sm_count(device: int = -1):
)
if err != cuda.CUresult.CUDA_SUCCESS:
raise Exception(
"Failed to retireve SM count. "
"Failed to retrieve SM count. "
f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}"
)

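The snippet above retrieves the SM count through the cuda-python driver bindings. A rough sketch of that query on its own, assuming the cuda package is installed (error handling reduced to asserts, unlike the exception raised in device_sm_count):

from cuda import cuda

def sm_count(device_ordinal: int = 0) -> int:
    # Query the multiprocessor count of one device via the CUDA driver API.
    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS
    err, dev = cuda.cuDeviceGet(device_ordinal)
    assert err == cuda.CUresult.CUDA_SUCCESS
    err, count = cuda.cuDeviceGetAttribute(
        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev
    )
    assert err == cuda.CUresult.CUDA_SUCCESS
    return count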
4 changes: 2 additions & 2 deletions python/cutlass/emit/pytorch.py
@@ -622,15 +622,15 @@ class _ArchListSetter:
Utility context manager for temporarily setting the value of the ``TORCH_CUDA_ARCH_LIST``
environment variable when building a PyTorch CUDA module.

``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilites for which a PyTorch
``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilities for which a PyTorch
CUDA module should be compiled.

For example, ``TORCH_CUDA_ARCH_LIST="7.0 8.0"`` would result in the inclusion of
``-gencode=arch=compute_70,code=sm_70`` and ``-gencode=arch=compute_80,code=sm_80`` in the
compilation of the module.

This utility wraps the building of a PyTorch CUDA module with a setting of this environment
variable according to the current compute capability being targetted.
variable according to the current compute capability being targeted.

Example usage:

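Since the usage example is collapsed in this view, here is a rough sketch of the idea _ArchListSetter describes: pin TORCH_CUDA_ARCH_LIST for the duration of a build and restore the previous value afterwards. The names below are illustrative, not the class's actual interface:

import os
from contextlib import contextmanager

@contextmanager
def torch_arch_list(arch_list: str):
    # e.g. torch_arch_list("7.0 8.0") makes the PyTorch extension build emit
    # -gencode=arch=compute_70,code=sm_70 and -gencode=arch=compute_80,code=sm_80.
    key = "TORCH_CUDA_ARCH_LIST"
    previous = os.environ.get(key)
    os.environ[key] = arch_list
    try:
        yield
    finally:
        if previous is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = previous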
2 changes: 1 addition & 1 deletion python/cutlass/epilogue/epilogue.py
@@ -122,7 +122,7 @@ def trace(fn, example_tensors, **kwargs):
:param example_tensors: example inputs for fn
:type example_tensors: dict

.. hightlight:: python
.. highlight:: python
.. code-block:: python
import cutlass_cppgen.backend.evt

6 changes: 3 additions & 3 deletions python/cutlass/op/conv.py
@@ -183,7 +183,7 @@ class Conv2d(OperationBase):
:param B: tensor representing data type of operand B
:param C: tensor representing data type of operand C
:param D: tensor representing data type of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type
:type element: cutlass_cppgen.DataType
@@ -749,7 +749,7 @@ def run(self, A=None, B=None, C=None, D=None,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: tensor representing data type and layout of operand A
@@ -759,7 +759,7 @@ def run(self, A=None, B=None, C=None, D=None,
:param stride: (stride_h, stride_w) describing the convolution stride. Default: (1, 1)
:param padding: (pad_h, pad_w) describing the convolution padding. Default: (0, 0)
:param dilation: (dilation_h, dilation_w) describing the dilation of convolution. Default: (1, 1)
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param split_k: a tuple (split_k_mode, split_k_slices)
:param sync: whether the call should wait for the kernel to complete before returning
6 changes: 3 additions & 3 deletions python/cutlass/op/gemm.py
@@ -189,7 +189,7 @@ class Gemm(OperationBase):
:param B: tensor representing data type and layout of operand B
:param C: tensor representing data type and layout of operand C
:param D: tensor representing data type and layout of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element_accumulator: data type to be used in accumulation of the product of operands A and B
:type element_accumulator: cutlass_cppgen.DataType
@@ -635,14 +635,14 @@ def run(self, A=None, B=None, C=None, D=None,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: tensor representing data type and layout of operand A
:param B: tensor representing data type and layout of operand B
:param C: tensor representing data type and layout of operand C
:param D: tensor representing data type and layout of operand D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param sync: whether the call should wait for the kernel to complete before returning
:type sync: bool
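The alpha and beta parameters documented above follow the standard GEMM epilogue D = alpha * (A @ B) + beta * C. A quick NumPy reference of that scaling convention:

import numpy as np

def gemm_reference(A, B, C, alpha=1.0, beta=0.0):
    # alpha scales the product of operands A and B; beta scales operand C.
    return alpha * (A @ B) + beta * C

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 8))
B = rng.standard_normal((8, 3))
C = rng.standard_normal((4, 3))
D = gemm_reference(A, B, C, alpha=2.0, beta=0.5)
assert D.shape == (4, 3)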
6 changes: 3 additions & 3 deletions python/cutlass/op/gemm_grouped.py
@@ -87,7 +87,7 @@ class GroupedGemm(Gemm):
:param B: tensor representing data type and layout of operands B
:param C: tensor representing data type and layout of operands C
:param D: tensor representing data type and layout of operands D
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param element_accumulator: data type to be used in accumulation of the product of operands A and B
:type element_accumulator: cutlass_cppgen.DataType
@@ -204,7 +204,7 @@ def run(self, A, B, C, D,

By default, this call returns only once the kernel has completed. To launch the kernel
and immediately return, set ``sync=False``. In this case, it is the responsibility of the
caller to syncrhonize the results of the kernel before attempting to access outputs
caller to synchronize the results of the kernel before attempting to access outputs
by calling ``sync()`` on the arguments returned from this call.

:param A: list of tensors representing data type and layout of operand A
@@ -215,7 +215,7 @@ def run(self, A, B, C, D,
:type C: list
:param D: list of tensors representing data type and layout of operand D
:type D: list
:param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
:param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
:param beta: scalar parameter beta from GEMM operation that scales operand C
:param sync: whether the call should wait for the kernel to complete before returning
:type sync: bool
2 changes: 1 addition & 1 deletion python/cutlass/op/op.py
@@ -424,7 +424,7 @@ def epilogue_visitor(self, visitor):

def run_setup(self):
"""
Steps that must be taken before caling `plan.run()`
Steps that must be taken before calling `plan.run()`
"""
# Initialize the memory pool if, if not already done
cutlass_cppgen.get_memory_pool()
2 changes: 1 addition & 1 deletion python/cutlass/utils/check.py
@@ -138,7 +138,7 @@ def valid_stage_count(
f"Details:\n"
f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and "
f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n"
f"The maxmium amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")
f"The maximum amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")

return (True, "")

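The reworded error message reflects a straightforward capacity check: the mainloop's shared memory per stage times the stage count has to fit in the per-block shared memory of the target compute capability. A hedged sketch of that arithmetic (a simplification of valid_stage_count, which works from the full TileDescription):

def mainloop_stages_fit(smem_per_stage: int, stages: int, smem_arch: int):
    # Returns (ok, message) in the spirit of the check quoted above.
    smem_usage_mainloop = smem_per_stage * stages
    if smem_usage_mainloop > smem_arch:
        return False, (f"{stages} stages x {smem_per_stage} bytes = "
                       f"{smem_usage_mainloop} bytes exceeds the "
                       f"{smem_arch} bytes available per block")
    return True, ""

# e.g. CC 90 allows up to 227 KB of shared memory per block
assert not mainloop_stages_fit(32 * 1024, 8, 227 * 1024)[0]
assert mainloop_stages_fit(16 * 1024, 8, 227 * 1024)[0]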