Skip to content

Commit 74afe5e

Browse files
committed
Feature: cleanup the memory aspects
Mojo has started enforcing its memory model. Use this as an opportunity to clean up the design of the various types and their relationships.
1 parent 195ef64 commit 74afe5e

18 files changed

+410
-326
lines changed

firebolt/MEMORY.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
Starting with the Sept 2025 version of Mojo, the compiler has begun to enforce
2+
[lifetimes](https://docs.modular.com/mojo/manual/values/lifetimes/). This document proposes an approach to using memory.
3+
4+
1. Low level memory format
5+
6+
Apache Arrow defines a columnar memory [format](https://arrow.apache.org/docs/format/Columnar.html) that can
7+
be accessed in many languages, including Python. Firebolt defines
8+
an API to access this format in Mojo. One of the goals is to allow high performance
9+
integration between Python and Mojo when it comes to processing vast amounts of data.
10+
11+
1. ArrayData owns the data
12+
13+
In Firebolt ArrayData is the low level API that will access the Arrow memory block.
14+
15+
As such it should own the data type, bitmap, buffers and children.
16+
17+
2. Typed arrays own ArrayData
18+
19+
The next level in the API are the typed arrays: PrimitiveArray, ListArray, StructArray and so on.
20+
21+
When constructing a typed array from an ArrayData, the typed array will own the ArrayData.
22+
23+
The typed arrays provide convenient accessors into the ArrayData. For example PrimitiveArray
24+
provides a `bitmap` and a `buffer`. Since Mojo doesn't currently provide properties, these
25+
helper accessors will be implemented as functions.
26+
27+
3. The Array trait
28+
29+
All of the typed Arrays are expected to implement the Array trait by providing 2 methods:
30+
31+
- `fn take_data(deinit self) -> ArrayData` creates a standalone ArrayData by consuming (destroying) self.
32+
- `fn data(self) -> ref [self] ArrayData` returns a read-only reference to the ArrayData held by the typed array.

firebolt/arrays/base.mojo

Lines changed: 64 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,19 @@ from sys.info import sizeof
44

55

66
trait Array(Movable, Representable, Sized, Stringable, Writable):
7-
fn as_data(self) -> ArrayData:
8-
...
7+
fn take_data(deinit self) -> ArrayData:
8+
"""Construct an ArrayData by consuming self."""
9+
pass
10+
11+
fn as_data[
12+
self_origin: ImmutableOrigin
13+
](ref [self_origin]self) -> UnsafePointer[ArrayData, mut=False]:
14+
"""Return a read only reference to the ArrayData wrapped by self.
15+
16+
Note that ideally the output type would be `ref [self_origin] ArrayData` but this is not supported yet.
17+
https://forum.modular.com/t/how-to-mark-a-trait-as-applying-to-not-register-passable/2265/6?u=mseritan
18+
"""
19+
pass
920

1021

1122
@fieldwise_init
@@ -22,47 +33,55 @@ struct ArrayData(Copyable, Movable, Representable, Stringable, Writable):
2233
var children: List[ArcPointer[ArrayData]]
2334
var offset: Int
2435

36+
fn __copyinit__(out self, existing: Self):
37+
self.dtype = existing.dtype.copy()
38+
self.length = existing.length
39+
self.bitmap = existing.bitmap
40+
self.buffers = existing.buffers.copy()
41+
self.children = existing.children.copy()
42+
self.offset = existing.offset
43+
2544
fn is_valid(self, index: Int) -> Bool:
2645
return self.bitmap[].unsafe_get(index + self.offset)
2746

28-
fn as_primitive[T: DataType](self) raises -> PrimitiveArray[T]:
29-
return PrimitiveArray[T](self)
47+
fn as_primitive[T: DataType](var self) raises -> PrimitiveArray[T]:
48+
return PrimitiveArray[T](self^)
3049

31-
fn as_int8(self) raises -> Int8Array:
32-
return Int8Array(self)
50+
fn as_int8(var self) raises -> Int8Array:
51+
return Int8Array(self^)
3352

34-
fn as_int16(self) raises -> Int16Array:
35-
return Int16Array(self)
53+
fn as_int16(var self) raises -> Int16Array:
54+
return Int16Array(self^)
3655

37-
fn as_int32(self) raises -> Int32Array:
38-
return Int32Array(self)
56+
fn as_int32(var self) raises -> Int32Array:
57+
return Int32Array(self^)
3958

40-
fn as_int64(self) raises -> Int64Array:
41-
return Int64Array(self)
59+
fn as_int64(var self) raises -> Int64Array:
60+
return Int64Array(self^)
4261

43-
fn as_uint8(self) raises -> UInt8Array:
44-
return UInt8Array(self)
62+
fn as_uint8(var self) raises -> UInt8Array:
63+
return UInt8Array(self^)
4564

46-
fn as_uint16(self) raises -> UInt16Array:
47-
return UInt16Array(self)
65+
fn as_uint16(var self) raises -> UInt16Array:
66+
return UInt16Array(self^)
4867

49-
fn as_uint32(self) raises -> UInt32Array:
50-
return UInt32Array(self)
68+
fn as_uint32(var self) raises -> UInt32Array:
69+
return UInt32Array(self^)
5170

52-
fn as_uint64(self) raises -> UInt64Array:
53-
return UInt64Array(self)
71+
fn as_uint64(var self) raises -> UInt64Array:
72+
return UInt64Array(self^)
5473

55-
fn as_float32(self) raises -> Float32Array:
56-
return Float32Array(self)
74+
fn as_float32(var self) raises -> Float32Array:
75+
return Float32Array(self^)
5776

58-
fn as_float64(self) raises -> Float64Array:
59-
return Float64Array(self)
77+
fn as_float64(var self) raises -> Float64Array:
78+
return Float64Array(self^)
6079

61-
fn as_string(self) raises -> StringArray:
62-
return StringArray(self)
80+
fn as_string(var self) raises -> StringArray:
81+
return StringArray(self^)
6382

64-
fn as_list(self) raises -> ListArray:
65-
return ListArray(self)
83+
fn as_list(var self) raises -> ListArray:
84+
return ListArray(self^)
6685

6786
fn _dynamic_write[W: Writer](self, index: Int, mut writer: W):
6887
"""Write to the given stream dispatching on the dtype."""
@@ -117,3 +136,20 @@ struct ArrayData(Copyable, Movable, Representable, Stringable, Writable):
117136

118137
fn __repr__(self) -> String:
119138
return String.write(self)
139+
140+
fn append_to_array(
141+
deinit self: ArrayData, mut combined: ArrayData, start: Int
142+
) -> Int:
143+
"""Append the content of self to the combined array, consuming self.
144+
145+
Args:
146+
combined: Array to append to.
147+
start: Position where to append.
148+
149+
Returns:
150+
The new start position.
151+
"""
152+
combined.bitmap[].extend(self.bitmap[], start, self.length)
153+
combined.buffers.extend(self.buffers^)
154+
combined.children.extend(self.children^)
155+
return start + self.length

firebolt/arrays/binary.mojo

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,27 @@ from ..dtypes import *
66

77
struct StringArray(Array):
88
var data: ArrayData
9-
var bitmap: ArcPointer[Bitmap]
10-
var offsets: ArcPointer[Buffer]
11-
var values: ArcPointer[Buffer]
129
var capacity: Int
1310

14-
fn __init__(out self, data: ArrayData) raises:
11+
fn __init__(out self, var data: ArrayData) raises:
1512
if data.dtype != string:
1613
raise Error(
1714
"Unexpected dtype '{}' instead of 'string'.".format(data.dtype)
1815
)
1916
elif len(data.buffers) != 2:
2017
raise Error("StringArray requires exactly two buffers")
2118

22-
self.data = data
23-
self.bitmap = data.bitmap
24-
self.offsets = data.buffers[0]
25-
self.values = data.buffers[1]
2619
self.capacity = data.length
20+
self.data = data^
21+
22+
fn bitmap(self) -> ref [self.data.bitmap] ArcPointer[Bitmap]:
23+
return self.data.bitmap
24+
25+
fn offsets(self) -> ref [self.data.buffers] ArcPointer[Buffer]:
26+
return self.data.buffers[0]
27+
28+
fn values(self) -> ref [self.data.buffers] ArcPointer[Buffer]:
29+
return self.data.buffers[1]
2730

2831
fn __init__(out self, capacity: Int = 0):
2932
var bitmap = Bitmap.alloc(capacity)
@@ -33,68 +36,67 @@ struct StringArray(Array):
3336
offsets.unsafe_set[DType.uint32](0, 0)
3437

3538
self.capacity = capacity
36-
self.bitmap = ArcPointer(bitmap^)
37-
self.offsets = ArcPointer(offsets^)
38-
self.values = ArcPointer(values^)
3939
self.data = ArrayData(
4040
dtype=string,
4141
length=0,
42-
bitmap=self.bitmap,
43-
buffers=List(self.offsets, self.values),
42+
bitmap=ArcPointer(bitmap^),
43+
buffers=List(ArcPointer(offsets^), ArcPointer(values^)),
4444
children=List[ArcPointer[ArrayData]](),
4545
offset=0,
4646
)
4747

4848
fn __moveinit__(out self, deinit existing: Self):
4949
self.data = existing.data^
50-
self.bitmap = existing.bitmap^
51-
self.offsets = existing.offsets^
52-
self.values = existing.values^
5350
self.capacity = existing.capacity
5451

5552
fn __len__(self) -> Int:
5653
return self.data.length
5754

58-
fn as_data(self) -> ArrayData:
59-
return self.data
55+
fn as_data[
56+
self_origin: ImmutableOrigin
57+
](ref [self_origin]self) -> UnsafePointer[ArrayData, mut=False]:
58+
return UnsafePointer(to=self.data)
59+
60+
fn take_data(deinit self) -> ArrayData:
61+
return self.data^
6062

6163
fn grow(mut self, capacity: Int):
62-
self.bitmap[].grow(capacity)
63-
self.offsets[].grow[DType.uint32](capacity + 1)
64+
self.bitmap()[].grow(capacity)
65+
self.offsets()[].grow[DType.uint32](capacity + 1)
6466
self.capacity = capacity
6567

6668
# fn shrink_to_fit(out self):
6769

6870
fn is_valid(self, index: Int) -> Bool:
69-
return self.bitmap[].unsafe_get(index)
71+
return self.bitmap()[].unsafe_get(index)
7072

7173
fn unsafe_append(mut self, value: String):
7274
# todo(kszucs): use unsafe set
7375
var index = self.data.length
74-
var last_offset = self.offsets[].unsafe_get[DType.uint32](index)
76+
var last_offset = self.offsets()[].unsafe_get[DType.uint32](index)
7577
var next_offset = last_offset + len(value)
7678
self.data.length += 1
77-
self.bitmap[].unsafe_set(index, True)
78-
self.offsets[].unsafe_set[DType.uint32](index + 1, next_offset)
79-
self.values[].grow[DType.uint8](next_offset)
80-
var dst_address = self.values[].get_ptr_at(Int(last_offset))
79+
self.bitmap()[].unsafe_set(index, True)
80+
self.offsets()[].unsafe_set[DType.uint32](index + 1, next_offset)
81+
self.values()[].grow[DType.uint8](next_offset)
82+
var dst_address = self.values()[].get_ptr_at(Int(last_offset))
8183
var src_address = value.unsafe_ptr()
8284
memcpy(dst_address, src_address, len(value))
8385

8486
fn unsafe_get(self, index: UInt) -> StringSlice[__origin_of(self)]:
85-
var start_offset = self.offsets[].unsafe_get[DType.uint32](
87+
var start_offset = self.offsets()[].unsafe_get[DType.uint32](
8688
index + self.data.offset
8789
)
88-
var end_offset = self.offsets[].unsafe_get[DType.uint32](
90+
var end_offset = self.offsets()[].unsafe_get[DType.uint32](
8991
index + 1 + self.data.offset
9092
)
91-
var address = self.values[].get_ptr_at(Int(start_offset))
93+
var address = self.values()[].get_ptr_at(Int(start_offset))
9294
var length = UInt(Int(end_offset - start_offset))
9395
return StringSlice[__origin_of(self)](ptr=address, length=length)
9496

9597
fn unsafe_set(mut self, index: Int, value: String) raises:
96-
var start_offset = self.offsets[].unsafe_get[DType.int32](index)
97-
var end_offset = self.offsets[].unsafe_get[DType.int32](index + 1)
98+
var start_offset = self.offsets()[].unsafe_get[DType.int32](index)
99+
var end_offset = self.offsets()[].unsafe_get[DType.int32](index + 1)
98100
var length = Int(end_offset - start_offset)
99101

100102
if length != len(value):
@@ -103,7 +105,7 @@ struct StringArray(Array):
103105
" length"
104106
)
105107

106-
var dst_address = self.values[].get_ptr_at(Int(start_offset))
108+
var dst_address = self.values()[].get_ptr_at(Int(start_offset))
107109
var src_address = value.unsafe_ptr()
108110
memcpy(dst_address, src_address, length)
109111

firebolt/arrays/chunked_array.mojo

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ struct ChunkedArray:
2121
total_length += chunk.length
2222
self.length = total_length
2323

24-
fn __init__(out self, dtype: DataType, chunks: List[ArrayData]):
25-
self.dtype = dtype
26-
self.chunks = chunks
24+
fn __init__(out self, var dtype: DataType, var chunks: List[ArrayData]):
25+
self.dtype = dtype^
26+
self.chunks = chunks^
2727
self.length = 0
2828
self._compute_length()
2929

@@ -38,21 +38,19 @@ struct ChunkedArray:
3838
"""
3939
return self.chunks[index]
4040

41-
fn combine_chunks(self) -> ArrayData:
41+
fn combine_chunks(var self, out combined: ArrayData):
4242
"""Combines all chunks into a single array."""
4343
var bitmap = ArcPointer(Bitmap.alloc(self.length))
44-
var combined = ArrayData(
45-
dtype=self.dtype,
44+
combined = ArrayData(
45+
dtype=self.dtype.copy(),
4646
length=self.length,
4747
bitmap=bitmap,
4848
buffers=List[ArcPointer[Buffer]](),
4949
children=List[ArcPointer[ArrayData]](),
5050
offset=0,
5151
)
5252
var start = 0
53-
for chunk in self.chunks:
54-
combined.bitmap[].extend(chunk.bitmap[], start, chunk.length)
55-
start += chunk.length
56-
combined.buffers.extend(chunk.buffers)
57-
combined.children.extend(chunk.children)
58-
return combined
53+
while self.chunks:
54+
var chunk = self.chunks.pop(0)
55+
start = chunk^.append_to_array(combined, start)
56+
return combined^

0 commit comments

Comments
 (0)