vector_addition.mojo
# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #

from gpu.host import Dim
from gpu.id import block_dim, block_idx, thread_idx
from layout import LayoutTensor, Layout
from math import ceildiv
from max.driver import (
    Accelerator,
    Device,
    Tensor,
    accelerator,
    cpu,
)
from sys import has_nvidia_gpu_accelerator

alias float_dtype = DType.float32
alias tensor_rank = 1


fn vector_addition[
    lhs_layout: Layout,
    rhs_layout: Layout,
    out_layout: Layout,
](
    lhs: LayoutTensor[float_dtype, lhs_layout, MutableAnyOrigin],
    rhs: LayoutTensor[float_dtype, rhs_layout, MutableAnyOrigin],
    out: LayoutTensor[float_dtype, out_layout, MutableAnyOrigin],
):
    """The calculation to perform across the vector on the GPU."""
    tid = block_dim.x * block_idx.x + thread_idx.x
    if tid < out.layout.size():
        out[tid] = lhs[tid] + rhs[tid]


def main():
    # Attempt to connect to a compatible GPU. If one is not found, this will
    # error out and exit.
    gpu_device = accelerator()
    host_device = cpu()
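    # The CPU host device is used to stage data before it is moved to the
    # accelerator, and to read back the results afterwards.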

    alias VECTOR_WIDTH = 10

    # Allocate the two input tensors on the host.
    lhs_tensor = Tensor[float_dtype, tensor_rank]((VECTOR_WIDTH), host_device)
    rhs_tensor = Tensor[float_dtype, tensor_rank]((VECTOR_WIDTH), host_device)

    # Fill them with initial values.
    for i in range(VECTOR_WIDTH):
        lhs_tensor[i] = 1.25
        rhs_tensor[i] = 2.5

    # Move the input tensors to the accelerator.
    lhs_tensor = lhs_tensor.move_to(gpu_device)
    rhs_tensor = rhs_tensor.move_to(gpu_device)

    # Allocate a tensor on the accelerator to host the calculation results.
    out_tensor = Tensor[float_dtype, tensor_rank]((VECTOR_WIDTH), gpu_device)

    lhs_layout_tensor = lhs_tensor.to_layout_tensor()
    rhs_layout_tensor = rhs_tensor.to_layout_tensor()
    out_layout_tensor = out_tensor.to_layout_tensor()
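    # The LayoutTensor views carry the compile-time layouts that parameterize
    # the kernel below and match the argument types it accepts.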

    # Compile the function to run across a grid on the GPU.
    gpu_function = Accelerator.compile[
        vector_addition[
            lhs_layout_tensor.layout,
            rhs_layout_tensor.layout,
            out_layout_tensor.layout,
        ]
    ](gpu_device)
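    # Compilation specializes vector_addition on the three layouts, yielding a
    # callable that can then be launched on the selected accelerator.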

    # The grid is divided up into blocks, making sure there's an extra
    # full block for any remainder. This hasn't been tuned for any specific
    # GPU.
    alias BLOCK_SIZE = 16
    var num_blocks = ceildiv(VECTOR_WIDTH, BLOCK_SIZE)
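    # Here, ceildiv(10, 16) == 1, so a single block of 16 threads covers the
    # 10-element vector; the bounds check in the kernel masks off the 6
    # surplus threads.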

    # Launch the compiled function on the GPU. The target device is specified
    # first, followed by all function arguments. The last two named parameters
    # are the dimensions of the grid in blocks, and the block dimensions.
    gpu_function(
        gpu_device,
        lhs_layout_tensor,
        rhs_layout_tensor,
        out_layout_tensor,
        grid_dim=Dim(num_blocks),
        block_dim=Dim(BLOCK_SIZE),
    )

    # Move the output tensor back onto the CPU so that we can read the results.
    out_tensor = out_tensor.move_to(host_device)

    print("Resulting vector:", out_tensor)