diff --git a/python/CuTeDSL/base_dsl/ast_helpers.py b/python/CuTeDSL/base_dsl/ast_helpers.py index b857e40e68..51b8ec0c39 100644 --- a/python/CuTeDSL/base_dsl/ast_helpers.py +++ b/python/CuTeDSL/base_dsl/ast_helpers.py @@ -413,7 +413,7 @@ def bool_cast(value): if executor._is_dynamic_expression(value): raise DSLRuntimeError( "Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.", - suggestion = "Please explicitly convert to boolean with expressions like comparision." + suggestion = "Please explicitly convert to boolean with expressions like comparison." ) return bool(value) diff --git a/python/CuTeDSL/base_dsl/ast_preprocessor.py b/python/CuTeDSL/base_dsl/ast_preprocessor.py index bffbc7f2e1..6d9c3dde72 100644 --- a/python/CuTeDSL/base_dsl/ast_preprocessor.py +++ b/python/CuTeDSL/base_dsl/ast_preprocessor.py @@ -873,7 +873,7 @@ def _handle_negative_step(self, node, start_expr, stop_expr, step_expr): extra_exprs.append(step) extra_exprs.append(offset) - # Add this to begining of loop body + # Add this to beginning of loop body # for i in range(start, stop, step): # i = offset - i if isNegative else i assert isinstance(node.target, ast.Name) diff --git a/python/CuTeDSL/cutlass/cute/arch/smem.py b/python/CuTeDSL/cutlass/cute/arch/smem.py index 37f87ea64d..bfff87d538 100644 --- a/python/CuTeDSL/cutlass/cute/arch/smem.py +++ b/python/CuTeDSL/cutlass/cute/arch/smem.py @@ -77,7 +77,7 @@ def get_dyn_smem( :param alignment: An optional pointer alignment, the result pointer is offset appropriately :type alignment: int :return: A pointer to the start of the dynamic SMEM allocation with a correct - alignement + alignment :rtype: Pointer """ if not isinstance(element_type, NumericMeta): diff --git a/python/CuTeDSL/cutlass/cute/core.py b/python/CuTeDSL/cutlass/cute/core.py index e3f6b1e78a..004e75a1f0 100644 --- a/python/CuTeDSL/cutlass/cute/core.py +++ b/python/CuTeDSL/cutlass/cute/core.py @@ -1717,7 +1717,7 @@ def print_tensor(tensor: 
Tensor, *, verbose: bool = False, loc=None, ip=None): # -# Utilties +# Utilities # @@ -4282,7 +4282,7 @@ def flat_divide(target, tiler: Tiler, *, loc=None, ip=None): # -# Higher-level utilties +# Higher-level utilities # @@ -6563,7 +6563,7 @@ class StorageA: intA : cutlass.Int16 - # Supports aligment for its elements: + # Supports alignment for its elements: @cute.struct class StorageB: a: cute.struct.Align[ @@ -6738,7 +6738,7 @@ def __getitem__(cls, params) -> Any: return new_obj else: raise TypeError( - "align only can be applied to sturct/MemRange/base_dsl scalar" + "align only can be applied to struct/MemRange/base_dsl scalar" ) class Align(metaclass=_AlignMeta): @@ -6814,10 +6814,10 @@ def add_offset(val): offset = add_offset(object.__sizeof__()) else: raise TypeError( - f"Struct element only support sturct/array/base_dsl scalar, " + f"Struct element only support struct/array/base_dsl scalar, " f"but got {object}" ) - # Total aligment determined by the strictest requirement + # Total alignment determined by the strictest requirement alignment = max(alignment, sub_align) # Total size determined by alignment self._align_of = alignment @@ -6851,7 +6851,7 @@ def __call__(self, base: Any) -> None: setattr(cls, name, new_obj) else: raise TypeError( - f"Struct element only support sturct/array/base_dsl scalar, " + f"Struct element only support struct/array/base_dsl scalar, " f"but got {obj}" ) return cls diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py index 8744a37600..7f35f4f004 100644 --- a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py @@ -27,7 +27,7 @@ #################################################################################################### # -# Aynchronous copies +# Asynchronous copies # #################################################################################################### @@ -119,7 +119,7 @@ class 
CopyG2STrait(Trait): @dataclass(frozen=True) class CopyBulkTensorTileG2SOp(CopyOp): """ - Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit. + Bulk tensor asynchronous GMEM to SMEM Copy Operation using the TMA unit. See the `PTX documentation `__. This Operation uses TMA in the ``.tile`` mode. @@ -221,7 +221,7 @@ def unpack( @dataclass(frozen=True) class CopyBulkTensorTileG2SMulticastOp(CopyOp): """ - Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit. + Bulk tensor asynchronous multicast GMEM to SMEM Copy Operation using the TMA unit. See the `PTX documentation `__. This Operation uses TMA in the ``.tile`` mode. @@ -330,7 +330,7 @@ def unpack( @dataclass(frozen=True) class CopyBulkTensorTileS2GOp(CopyOp): """ - Bulk tensor asynchrnous SMEM to GMEM Copy Operation using the TMA unit. + Bulk tensor asynchronous SMEM to GMEM Copy Operation using the TMA unit. See the `PTX documentation `__. This Operation uses TMA in the ``.tile`` mode. diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py index f83744076f..ae640ee8e2 100644 --- a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py @@ -257,7 +257,7 @@ def update_tma_descriptor( :type tma_atom: CopyAtom :param gmem_tensor: The GMEM tensor :type gmem_tensor: Tensor - :param tensormap_ptr: The pointer to the memory location of the descriptor to udpate + :param tensormap_ptr: The pointer to the memory location of the descriptor to update :type tensormap_ptr: Pointer """ _cute_nvgpu_ir.update_tma_desc( diff --git a/python/CuTeDSL/cutlass/pipeline/sm100.py b/python/CuTeDSL/cutlass/pipeline/sm100.py index 591e1d7a60..801a413980 100644 --- a/python/CuTeDSL/cutlass/pipeline/sm100.py +++ b/python/CuTeDSL/cutlass/pipeline/sm100.py @@ -400,7 +400,7 @@ def create( producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk) if cta_layout_vmnk 
is None or cute.size(cta_layout_vmnk, mode=[0]) == 1: - # Set mask to None if not using 2CTA intructions + # Set mask to None if not using 2CTA instructions consumer_mask = None else: consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank() diff --git a/python/CuTeDSL/cutlass/torch.py b/python/CuTeDSL/cutlass/torch.py index 066c281612..52ea8000bf 100644 --- a/python/CuTeDSL/cutlass/torch.py +++ b/python/CuTeDSL/cutlass/torch.py @@ -148,7 +148,7 @@ def convert_cute_tensor( ) -> Tensor: """ Change the value of the cute tensor to make its value converted from a fp32 torch tensor. - Used for fp8 types tensor creatation now. + Used for fp8 types tensor creation now. """ # if torch_tensor is on cpu, create a gpu copy if f32_torch_tensor.device.type == "cpu": diff --git a/python/CuTeDSL/cutlass/utils/README.md b/python/CuTeDSL/cutlass/utils/README.md index 3a583ed49f..d6a6860a58 100644 --- a/python/CuTeDSL/cutlass/utils/README.md +++ b/python/CuTeDSL/cutlass/utils/README.md @@ -1,9 +1,9 @@ # Utilities -This folder contains various utilties for kernel authoring. Specifically, the implementation of the -followings can be considered experimental and subject to breaking changes: +This folder contains various utilities for kernel authoring. 
Specifically, the implementation of the +following can be considered experimental and subject to breaking changes: - static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py) - pipeline abstractions defined in [`pipeline.py`](./pipeline.py) -- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) +- grouped GEMM utilities defined in [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) and [`tensormap_manager.py`](./tensormap_manager.py) diff --git a/python/CuTeDSL/cutlass_dsl/cutlass.py b/python/CuTeDSL/cutlass_dsl/cutlass.py index e2461d5077..75cb7b6a16 100644 --- a/python/CuTeDSL/cutlass_dsl/cutlass.py +++ b/python/CuTeDSL/cutlass_dsl/cutlass.py @@ -675,7 +675,7 @@ def count_values(args): # ============================================================================= -# DSL implementation of Python Build-in Operators +# DSL implementation of Python Built-in Operators # ============================================================================= diff --git a/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py index 370a0c9ff5..ce4c0bc713 100644 --- a/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py +++ b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py @@ -194,7 +194,7 @@ def scf_execute_dynamic( original_idx = unpacked_idx break raise DSLRuntimeError( - f"`{op_type_name}` expects {expected_type} type for varible `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.", + f"`{op_type_name}` expects {expected_type} type for variable `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.", suggestion=f"Please make sure `{mix_iter_arg_names[original_idx]}` type is not changed inside of `{op_type_name}`.", ) scf.YieldOp(region_values) diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index 
1b78b51387..3f8bcec291 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -349,7 +349,7 @@ def emit_compile_(self, operation_list, compilation_options, host_compilation_op cmd.extend(host_compilation_options.get_str().split(" ")) cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"]) - # Comile and load the library + # Compile and load the library compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt") host_lib = ctypes.CDLL(temp_dst.name) diff --git a/python/cutlass/backend/evt/passes/pass_layout_elimination.py b/python/cutlass/backend/evt/passes/pass_layout_elimination.py index af147969f0..5e53119811 100644 --- a/python/cutlass/backend/evt/passes/pass_layout_elimination.py +++ b/python/cutlass/backend/evt/passes/pass_layout_elimination.py @@ -53,7 +53,7 @@ def __init__(self, dag_ir: DAGIR) -> None: def call(self): self.layout_nodes_worklist = self.get_all_layout_nodes() - # Run while loop utill all layout nodes are eliminated + # Run while loop until all layout nodes are eliminated while(len(self.layout_nodes_worklist) > 0): node = self.layout_nodes_worklist.pop(0) # for node in layout_nodes: diff --git a/python/cutlass/backend/gemm_operation.py b/python/cutlass/backend/gemm_operation.py index cf6bcc184d..db581aa685 100644 --- a/python/cutlass/backend/gemm_operation.py +++ b/python/cutlass/backend/gemm_operation.py @@ -113,7 +113,7 @@ def leading_dimension(layout: LayoutType, shape: MatrixCoord) -> int: """ - Returns the leading dimenson of a tensor with layout ``layout`` and shape ``shape``. + Returns the leading dimension of a tensor with layout ``layout`` and shape ``shape``. :param layout: layout of the tensor :type layout: cutlass_cppgen.shape.LayoutType @@ -1510,7 +1510,7 @@ def __init__( # Optionally swap the TensorDescriptions for operands A and B and transpose their # layouts. 
This is needed to mimic the transpose performed by device::GemmUniversal. - # The code below uses deep copy to avoid overwritting the original TensorDescription + # The code below uses deep copy to avoid overwriting the original TensorDescription self.switched = (self.api != ApiVersion.v3x and self.emission_type == EmissionType.Kernel and C.layout == LayoutType.ColumnMajor) @@ -1775,7 +1775,7 @@ def __init__(self, arch, tile_description: TileDescription, A: TensorDescription epilogue_functor, swizzling_functor=SwizzlingFunctor.Identity1, **kwargs): super(GemmOperationGrouped, self).__init__(GemmKind.Grouped, arch, tile_description, A, B, C, epilogue_functor, swizzling_functor, **kwargs) - assert "precompute_mode" in kwargs.keys(), "missing keyword arguement 'precompute_mode'." + assert "precompute_mode" in kwargs.keys(), "missing keyword argument 'precompute_mode'." self.precompute_mode = kwargs["precompute_mode"] self.rt_module = GemmRTGrouped(self) self.argument_type = self.rt_module.argument_type diff --git a/python/cutlass/backend/library.py b/python/cutlass/backend/library.py index a8b113b4e9..ef8fd30046 100644 --- a/python/cutlass/backend/library.py +++ b/python/cutlass/backend/library.py @@ -258,7 +258,7 @@ def __init__( """ :param threadblock_shape: shape of a threadblock tyle :type threadblock_shape: list or tuple - :param stages: number of pipline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum + :param stages: number of pipeline stages in the operation. 
For SM90 kernels, this can be set to `None` and the maximum number of stages that can be supported for an operation on a given architecture will be computed at a later time :type stages: int or None :param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile diff --git a/python/cutlass/backend/reduction_operation.py b/python/cutlass/backend/reduction_operation.py index 535cea2cb2..8fb6ed162d 100644 --- a/python/cutlass/backend/reduction_operation.py +++ b/python/cutlass/backend/reduction_operation.py @@ -377,7 +377,7 @@ def configuration_name(self): ) def procedural_name(self): - """The full procedural name indicates architeture, extended name, tile size""" + """The full procedural name indicates architecture, extended name, tile size""" return self.configuration_name() def run(self, arguments: ReductionArguments) -> cuda.CUresult: diff --git a/python/cutlass/backend/utils/device.py b/python/cutlass/backend/utils/device.py index 9ed4096a6f..23c458c8dc 100644 --- a/python/cutlass/backend/utils/device.py +++ b/python/cutlass/backend/utils/device.py @@ -93,7 +93,7 @@ def device_sm_count(device: int = -1): ) if err != cuda.CUresult.CUDA_SUCCESS: raise Exception( - "Failed to retireve SM count. " + "Failed to retrieve SM count. " f"cuDeviceGetAttribute() failed with error: {cuda.cuGetErrorString(err)[1]}" ) diff --git a/python/cutlass/emit/pytorch.py b/python/cutlass/emit/pytorch.py index 86374b8b0c..ce0a295fdd 100644 --- a/python/cutlass/emit/pytorch.py +++ b/python/cutlass/emit/pytorch.py @@ -622,7 +622,7 @@ class _ArchListSetter: Utility context manager for temporarily setting the value of the ``TORCH_CUDA_ARCH_LIST`` environment variable when building a PyTorch CUDA module. - ``TORCH_CUDA_ARCH_LIST`` is a space-delmited list of compute capabilites for which a PyTorch + ``TORCH_CUDA_ARCH_LIST`` is a space-delimited list of compute capabilities for which a PyTorch CUDA module should be compiled. 
For example, ``TORCH_CUDA_ARCH_LIST="7.0 8.0"`` would result in the inclusion of @@ -630,7 +630,7 @@ class _ArchListSetter: compilation of the module. This utility wraps the building of a PyTorch CUDA module with a setting of this environment - variable according to the current compute capability being targetted. + variable according to the current compute capability being targeted. Example usage: diff --git a/python/cutlass/epilogue/epilogue.py b/python/cutlass/epilogue/epilogue.py index 16d1fec8fa..d3887c4842 100644 --- a/python/cutlass/epilogue/epilogue.py +++ b/python/cutlass/epilogue/epilogue.py @@ -122,7 +122,7 @@ def trace(fn, example_tensors, **kwargs): :param example_tensors: example inputs for fn :type example_tensors: dict - .. hightlight:: python + .. highlight:: python .. code-block:: python import cutlass_cppgen.backend.evt diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py index 4f21d85436..f32f34a3a0 100644 --- a/python/cutlass/op/conv.py +++ b/python/cutlass/op/conv.py @@ -183,7 +183,7 @@ class Conv2d(OperationBase): :param B: tensor representing data type of operand B :param C: tensor representing data type of operand C :param D: tensor representing data type of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type :type element: cutlass_cppgen.DataType @@ -749,7 +749,7 @@ def run(self, A=None, B=None, C=None, D=None, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. 
In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: tensor representing data type and layout of operand A @@ -759,7 +759,7 @@ def run(self, A=None, B=None, C=None, D=None, :param stride: (stride_h, stride_w) describing the convolution stride. Default: (1, 1) :param padding: (pad_h, pad_w) describing the convolution padding. Default: (0, 0) :param dilation: (dilation_h, dilation_w) describing the dilation of convolution. Default: (1, 1) - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param split_k: a tuple (split_k_mode, split_k_slices) :param sync: whether the call should wait for the kernel to complete before returning diff --git a/python/cutlass/op/gemm.py b/python/cutlass/op/gemm.py index fddd0c095e..9dcb4b2226 100644 --- a/python/cutlass/op/gemm.py +++ b/python/cutlass/op/gemm.py @@ -189,7 +189,7 @@ class Gemm(OperationBase): :param B: tensor representing data type and layout of operand B :param C: tensor representing data type and layout of operand C :param D: tensor representing data type and layout of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element_accumulator: data type to be used in accumulation of the product of operands A and B :type element_accumulator: cutlass_cppgen.DataType @@ -635,14 +635,14 @@ def run(self, A=None, 
B=None, C=None, D=None, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: tensor representing data type and layout of operand A :param B: tensor representing data type and layout of operand B :param C: tensor representing data type and layout of operand C :param D: tensor representing data type and layout of operand D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param sync: whether the call should wait for the kernel to complete before returning :type sync: bool diff --git a/python/cutlass/op/gemm_grouped.py b/python/cutlass/op/gemm_grouped.py index 594106f2d1..0e1f9d588d 100644 --- a/python/cutlass/op/gemm_grouped.py +++ b/python/cutlass/op/gemm_grouped.py @@ -87,7 +87,7 @@ class GroupedGemm(Gemm): :param B: tensor representing data type and layout of operands B :param C: tensor representing data type and layout of operands C :param D: tensor representing data type and layout of operands D - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param element_accumulator: data type to be used in accumulation of the product of operands A and B :type element_accumulator: cutlass_cppgen.DataType @@ -204,7 +204,7 @@ def run(self, A, B, C, 
D, By default, this call returns only once the kernel has completed. To launch the kernel and immediately return, set ``sync=False``. In this case, it is the responsibility of the - caller to syncrhonize the results of the kernel before attempting to access outputs + caller to synchronize the results of the kernel before attempting to access outputs by calling ``sync()`` on the arguments returned from this call. :param A: list of tensors representing data type and layout of operand A @@ -215,7 +215,7 @@ def run(self, A, B, C, D, :type C: list :param D: list of tensors representing data type and layout of operand D :type D: list - :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B + :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B :param beta: scalar parameter beta from GEMM operation that scales operand C :param sync: whether the call should wait for the kernel to complete before returning :type sync: bool diff --git a/python/cutlass/op/op.py b/python/cutlass/op/op.py index 88ccd26e07..b7234f31a3 100644 --- a/python/cutlass/op/op.py +++ b/python/cutlass/op/op.py @@ -424,7 +424,7 @@ def epilogue_visitor(self, visitor): def run_setup(self): """ - Steps that must be taken before caling `plan.run()` + Steps that must be taken before calling `plan.run()` """ # Initialize the memory pool if, if not already done cutlass_cppgen.get_memory_pool() diff --git a/python/cutlass/utils/check.py b/python/cutlass/utils/check.py index ff76a42b6f..55dc54dfe9 100644 --- a/python/cutlass/utils/check.py +++ b/python/cutlass/utils/check.py @@ -138,7 +138,7 @@ def valid_stage_count( f"Details:\n" f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and " f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n" - f"The maxmium amount of shared memory that can be used per block on CC {cc} is {smem_arch}.") + f"The maximum amount of shared memory that can be used 
per block on CC {cc} is {smem_arch}.") return (True, "") diff --git a/python/cutlass_library/heuristics.py b/python/cutlass_library/heuristics.py index 83421a0642..89923384df 100644 --- a/python/cutlass_library/heuristics.py +++ b/python/cutlass_library/heuristics.py @@ -67,7 +67,7 @@ def serialize_heuristics_results_to_json(problems_with_configs, outfile_path): """ - Utilitiy function to write heuristics results to a json file for debug + Utility function to write heuristics results to a json file for debug args: problems_with_configs: List of problems provided to the heuristic, with a list of operations added to each problem dict @@ -142,7 +142,7 @@ def get_gemm_configs(problems, provider=None, count=1): - 'batch_count': Number of GEMM operations in batch (default: 1) - 'use_fast_acc': Enable fast accumulation for FP8 on Hopper (default: True) provider: Heuristics provider to use - count: Number of configurations to return per problem (defualt: 1) + count: Number of configurations to return per problem (default: 1) returns: A copy of the input dictionary, with key `configs` added containing the selected gemm configs diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index baaaac28a8..f0ea6500bf 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -31,7 +31,7 @@ ################################################################################################# """ -Utilities for filtering CUTLASS library kernels and emitting library intitialization +Utilities for filtering CUTLASS library kernels and emitting library initialization and building code """ diff --git a/python/docs/externals/00_basic_gemm.ipynb b/python/docs/externals/00_basic_gemm.ipynb index a18b320a84..f6396493f3 100644 --- a/python/docs/externals/00_basic_gemm.ipynb +++ b/python/docs/externals/00_basic_gemm.ipynb @@ -47,7 +47,7 @@ "\n", "import cutlass\n", "\n", - "# This controls whether ther C++ GEMM declaration will be 
printed at each step. Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n", @@ -154,7 +154,7 @@ "id": "4a5856de", "metadata": {}, "source": [ - "There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor." + "There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifying the types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor." ] }, { @@ -236,7 +236,7 @@ "\n", "As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n", "\n", - "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization." + "Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on initialization." ] }, { diff --git a/python/docs/externals/01_epilogue.ipynb b/python/docs/externals/01_epilogue.ipynb index 2669802337..54a4864bad 100644 --- a/python/docs/externals/01_epilogue.ipynb +++ b/python/docs/externals/01_epilogue.ipynb @@ -47,7 +47,7 @@ "\n", "import cutlass\n", "\n", - "# This controls whether ther C++ GEMM declaration will be printed at each step. 
Set to `false` to\n", + "# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n", "# omit this information.\n", "print_module = True\n", "\n",