diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml
index f1885bc3..27f75e71 100644
--- a/.github/workflows/run_tests.yaml
+++ b/.github/workflows/run_tests.yaml
@@ -44,7 +44,39 @@ jobs:
           pixi run mojo package numojo
           cp numojo.mojopkg tests/
 
-      - name: Run tests
+      - name: Run Core Tests
         run: |
-          pixi run mojo test tests -I .
-          pixi run mojo test tests/core/test_matrix.mojo -I . -D F_CONTIGUOUS
+          echo "Testing core modules..."
+          pixi run mojo run -I tests/ tests/core/test_array_indexing_and_slicing.mojo
+          pixi run mojo run -I tests/ tests/core/test_array_methods.mojo
+          pixi run mojo run -I tests/ tests/core/test_bool_masks.mojo
+          pixi run mojo run -I tests/ tests/core/test_complexArray.mojo
+          pixi run mojo run -I tests/ tests/core/test_complexSIMD.mojo
+          pixi run mojo run -I tests/ tests/core/test_matrix.mojo
+          pixi run mojo run -I tests/ -D F_CONTIGUOUS tests/core/test_matrix.mojo
+          pixi run mojo run -I tests/ tests/core/test_shape_strides_item.mojo
+
+      - name: Run Routine Tests
+        run: |
+          echo "Testing routines..."
+          pixi run mojo run -I tests/ tests/routines/test_creation.mojo
+          pixi run mojo run -I tests/ tests/routines/test_functional.mojo
+          pixi run mojo run -I tests/ tests/routines/test_indexing.mojo
+          pixi run mojo run -I tests/ tests/routines/test_io.mojo
+          pixi run mojo run -I tests/ tests/routines/test_linalg.mojo
+          pixi run mojo run -I tests/ tests/routines/test_manipulation.mojo
+          pixi run mojo run -I tests/ tests/routines/test_math.mojo
+          pixi run mojo run -I tests/ tests/routines/test_random.mojo
+          pixi run mojo run -I tests/ tests/routines/test_statistics.mojo
+          pixi run mojo run -I tests/ tests/routines/test_sorting.mojo
+          pixi run mojo run -I tests/ tests/routines/test_searching.mojo
+
+      - name: Run Science Tests
+        run: |
+          echo "Testing science modules..."
+          pixi run mojo run -I tests/ tests/science/test_signal.mojo
+
+      - name: Cleanup
+        if: always()
+        run: |
+          rm -f tests/numojo.mojopkg
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 84d1b604..5f6311b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@ numojo.mojopkg
 /bench.mojo
 /test*.mojo
 /test*.ipynb
+bench_*.mojo
 /tempCodeRunnerFile.mojo
 kgen.trace.*
 
diff --git a/assets/matrix_test.mojo b/assets/matrix_test.mojo
new file mode 100644
index 00000000..a9ceec04
--- /dev/null
+++ b/assets/matrix_test.mojo
@@ -0,0 +1,1811 @@
+"""
+`numojo.Matrix` provides:
+
+- `Matrix` type (2DArray).
+- `_MatrixIter` type (for iteration).
+- Dunder methods for initialization, indexing, slicing, and arithmetics.
+- Auxiliary functions.
+"""
+
+from algorithm import parallelize, vectorize
+from memory import UnsafePointer, memcpy, memset_zero
+from random import random_float64
+from sys import simd_width_of
+from python import PythonObject, Python
+
+from numojo.core.flags import Flags
+from numojo.core.ndarray import NDArray
+from numojo.core.data_container import DataContainer
+from numojo.core.utility import _get_offset
+from numojo.routines.manipulation import broadcast_to, reorder_layout
+from numojo.routines.linalg.misc import issymmetric
+
+
+# ===----------------------------------------------------------------------===#
+# Matrix struct
+# ===----------------------------------------------------------------------===#
+
+
+struct Matrix[dtype: DType = DType.float64](
+    ImplicitlyCopyable, Movable, Sized, Stringable, Writable
+):
+    # TODO: Add buffer_type in the parameters.
+    """
+    `Matrix` is a special case of `NDArray` (2DArray) but has some targeted
+    optimization since the number of dimensions is known at the compile time.
+    It has simpler indexing and slicing methods, which is very useful when users
+    only want to work with 2-dimensional arrays.
+
+    NuMojo's `Matrix` is `NDArray` with fixed `ndim` known at compile time.
+    It may be different in some behaviors compared to `numpy.matrix`.
+
+    - For `__getitem__`, passing in two `Int` returns a scalar,
+    and passing in one `Int` or two `Slice` returns a `Matrix`.
+    - We do not need auxiliary types `NDArrayShape` and `NDArrayStrides`
+    as the shape and strides information is fixed in length `Tuple[Int,Int]`.
+
+    Parameters:
+        dtype: Type of item in NDArray. Default type is DType.float64.
+
+    The matrix can be uniquely defined by the following features:
+        1. The data buffer of all items.
+        2. The shape of the matrix.
+        3. The data type of the elements (compile-time known).
+
+    Attributes:
+        - _buf (saved as row-majored, C-type)
+        - shape
+        - size (shape[0] * shape[1])
+        - strides (shape[1], 1)
+
+    Default constructor:
+    - [dtype], shape
+    - [dtype], data
+
+    [checklist] CORE METHODS that have been implemented:
+    - [x] `Matrix.any` and `mat.logic.all`
+    - [x] `Matrix.any` and `mat.logic.any`
+    - [x] `Matrix.argmax` and `mat.sorting.argmax`
+    - [x] `Matrix.argmin` and `mat.sorting.argmin`
+    - [x] `Matrix.argsort` and `mat.sorting.argsort`
+    - [x] `Matrix.astype`
+    - [x] `Matrix.cumprod` and `mat.mathematics.cumprod`
+    - [x] `Matrix.cumsum` and `mat.mathematics.cumsum`
+    - [x] `Matrix.fill` and `mat.creation.full`
+    - [x] `Matrix.flatten`
+    - [x] `Matrix.inv` and `mat.linalg.inv`
+    - [x] `Matrix.max` and `mat.sorting.max`
+    - [x] `Matrix.mean` and `mat.statistics.mean`
+    - [x] `Matrix.min` and `mat.sorting.min`
+    - [x] `Matrix.prod` and `mat.mathematics.prod`
+    - [x] `Matrix.reshape`
+    - [x] `Matrix.resize`
+    - [x] `Matrix.round` and `mat.mathematics.round` (TODO: Check this after next Mojo update)
+    - [x] `Matrix.std` and `mat.statistics.std`
+    - [x] `Matrix.sum` and `mat.mathematics.sum`
+    - [x] `Matrix.trace` and `mat.linalg.trace`
+    - [x] `Matrix.transpose` and `mat.linalg.transpose` (also `Matrix.T`)
+    - [x] `Matrix.variance` and `mat.statistics.variance` (`var` is primitive)
+    """
+
+    alias width: Int = simd_width_of[dtype]()  #
+    """Vector size of the data type."""
+
+    # var _buf: DataContainer[dtype]
+    var _buf: UnsafePointer[Scalar[dtype], **_]
+    """Data buffer of the items in the NDArray."""
+
+    var shape: Tuple[Int, Int]
+    """Shape of Matrix."""
+
+    var size: Int
+    """Size of Matrix."""
+
+    var strides: Tuple[Int, Int]
+    """Strides of matrix."""
+
+    var flags: Flags
+    "Information about the memory layout of the array."
+
+    # ===-------------------------------------------------------------------===#
+    # Life cycle methods
+    # ===-------------------------------------------------------------------===#
+
+    @always_inline("nodebug")
+    fn __init__(
+        out self,
+        shape: Tuple[Int, Int],
+        order: String = "C",
+    ):
+        """
+        Create a new matrix of the given shape,without initializing data.
+
+        Args:
+            shape: Tuple representing (rows, columns).
+            order: Use "C" for row-major (C-style) layout or "F" for column-major
+               (Fortran-style) layout. Defaults to "C".
+        """
+
+        self.shape = (shape[0], shape[1])
+        if order == "C":
+            self.strides = (shape[1], 1)
+        else:
+            self.strides = (1, shape[0])
+        self.size = shape[0] * shape[1]
+        # self._buf = DataContainer[dtype](size=self.size)
+        self._buf = UnsafePointer[
+            Scalar[dtype], mut=True, origin = MutableOrigin.empty
+        ].alloc(self.size)
+        self.flags = Flags(
+            self.shape, self.strides, owndata=True, writeable=True
+        )
+
+    # * Should we take var ref and transfer ownership or take a read ref and copy it?
+    # @always_inline("nodebug")
+    # fn __init__(
+    #     out self,
+    #     var data: Self,
+    # ):
+    #     """
+    #     Construct a matrix from matrix.
+    #     """
+
+    #     self = data^
+
+    # @always_inline("nodebug")
+    # fn __init__(
+    #     out self,
+    #     data: NDArray[dtype],
+    # ) raises:
+    #     """
+    #     Construct a matrix from array.
+    #     """
+
+    #     if data.ndim == 1:
+    #         self.shape = (1, data.shape[0])
+    #         self.strides = (data.shape[0], 1)
+    #         self.size = data.shape[0]
+    #     elif data.ndim == 2:
+    #         self.shape = (data.shape[0], data.shape[1])
+    #         self.strides = (data.shape[1], 1)
+    #         self.size = data.shape[0] * data.shape[1]
+    #     else:
+    #         raise Error(String("Shape too large to be a matrix."))
+
+    #     self._buf = DataContainer[dtype](self.size)
+
+    #     self.flags = Flags(
+    #         self.shape, self.strides, owndata=True, writeable=True
+    #     )
+
+    #     if data.flags["C_CONTIGUOUS"]:
+    #         for i in range(data.shape[0]):
+    #             memcpy(
+    #                 self._buf.ptr.offset(i * self.shape[0]),
+    #                 data._buf.ptr.offset(i * data.shape[0]),
+    #                 self.shape[0],
+    #             )
+    #     else:
+    #         for i in range(data.shape[0]):
+    #             for j in range(data.shape[1]):
+    #                 self._store(i, j, data._getitem(i, j))
+
+    @always_inline("nodebug")
+    fn __init__(
+        out self,
+        shape: Tuple[Int, Int],
+        strides: Tuple[Int, Int],
+        offset: Int,
+        ptr: UnsafePointer[Scalar[dtype], *_],
+    ):
+        """
+        Initialize Matrix that does not own the data.
+        The data is owned by another Matrix.
+
+        Args:
+            shape: Shape of the view.
+            strides: Strides of the view.
+            offset: Offset in pointer of the data buffer.
+            ptr: Pointer to the data buffer of the original array.
+        """
+        self.shape = shape
+        self.strides = strides
+        self.size = shape[0] * shape[1]
+        # self._buf = DataContainer(ptr=ptr.offset(offset))
+        self._buf = ptr.offset(offset)
+        self.flags = Flags(
+            self.shape, self.strides, owndata=False, writeable=False
+        )
+
+    @always_inline("nodebug")
+    fn __copyinit__(out self, other: Self):
+        """
+        Copy other into self.
+        """
+        self.shape = (other.shape[0], other.shape[1])
+        self.strides = (other.strides[0], other.strides[1])
+        self.size = other.size
+        # self._buf = DataContainer[dtype](other.size)
+        # memcpy(self._buf.ptr, other._buf.ptr, other.size)
+        self._buf = UnsafePointer[
+            Scalar[dtype], mut=True, origin = MutableOrigin.empty
+        ].alloc(other.size)
+        memcpy(self._buf, other._buf, other.size)
+        self.flags = other.flags
+
+    @always_inline("nodebug")
+    fn __moveinit__(out self, deinit other: Self):
+        """
+        Move other into self.
+        """
+        self.shape = other.shape^
+        self.strides = other.strides^
+        self.size = other.size
+        self._buf = other._buf
+        self.flags = other.flags^
+
+    @always_inline("nodebug")
+    fn __del__(deinit self):
+        var owndata: Bool = self.flags.OWNDATA
+        if owndata:
+            print("Matrix __del__ called", self.size, self.flags.OWNDATA)
+            self._buf.free()
+
+    # ===-------------------------------------------------------------------===#
+    # Slicing and indexing methods
+    # ===-------------------------------------------------------------------===#
+
+    fn __getitem__(self, var x: Int, var y: Int) raises -> Scalar[dtype]:
+        """
+        Return the scalar at the index.
+
+        Args:
+            x: The row number.
+            y: The column number.
+
+        Returns:
+            A scalar matching the dtype of the array.
+        """
+
+        if x < 0:
+            x = self.shape[0] + x
+
+        if y < 0:
+            y = self.shape[1] + y
+
+        if (x >= self.shape[0]) or (y >= self.shape[1]):
+            raise Error(
+                String(
+                    "Index ({}, {}) exceed the matrix shape ({}, {})"
+                ).format(x, y, self.shape[0], self.shape[1])
+            )
+
+        # return self._buf.load(x * self.strides[0] + y * self.strides[1])
+        return self._buf.load(x * self.strides[0] + y * self.strides[1])
+
+    fn __getitem__(ref self: Self, var x: Int) -> Matrix[dtype]:
+        """
+        Return the corresponding row at the index.
+
+        Args:
+            x: The row number.
+        """
+        print("_getitem__ called")
+        # var new_ptr = self._buf.origin_cast[
+        #     target_mut = True,
+        #     target_origin=MutableOrigin.cast_from[__origin_of(self)],
+        # ]()
+        var new_ptr = self._buf.origin_cast[
+            Origin(__origin_of(self)).mut, __origin_of(self)
+        ]()
+        return Matrix[dtype](
+            shape=(1, self.shape[1]),
+            strides=(self.strides[0], self.strides[1]),
+            offset=x * self.strides[0],
+            ptr=new_ptr,
+            # ptr = self._buf.get_ptr()
+        )
+
+    fn _store[
+        width: Int = 1
+    ](mut self, var x: Int, simd: SIMD[dtype, width]) raises:
+        """
+        `__setitem__` for row with width.
+        Unsafe: No boundary check!
+        """
+        self._buf.store(x, simd)
+
+    # fn __getitem__(self, var x: Int) raises -> Self:
+    #     """
+    #     Return the corresponding row at the index.
+
+    #     Args:
+    #         x: The row number.
+    #     """
+
+    #     if x < 0:
+    #         x = self.shape[0] + x
+
+    #     if x >= self.shape[0]:
+    #         raise Error(
+    #             String("Index {} exceed the row number {}").format(
+    #                 x, self.shape[0]
+    #             )
+    #         )
+
+    #     var res = Self(shape=(1, self.shape[1]), order=self.order())
+
+    #     if self.flags.C_CONTIGUOUS:
+    #         var ptr = self._buf.ptr.offset(x * self.strides[0])
+    #         memcpy(res._buf.ptr, ptr, self.shape[1])
+    #     else:
+    #         for j in range(self.shape[1]):
+    #             res[0, j] = self[x, j]
+
+    #     return res^
+
+    fn __getitem__(self, x: Slice, y: Slice) -> Self:
+        """
+        Get item from two slices.
+        """
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_x = range(start_x, end_x, step_x)
+        var range_y = range(start_y, end_y, step_y)
+
+        # The new matrix with the corresponding shape
+        var B = Matrix[dtype](
+            shape=(len(range_x), len(range_y)), order=self.order()
+        )
+
+        # Fill in the values at the corresponding index
+        var row = 0
+        for i in range_x:
+            var col = 0
+            for j in range_y:
+                B._store(row, col, self._load(i, j))
+                col += 1
+            row += 1
+
+        return B^
+
+    # fn __getitem__(self, x: Slice, var y: Int) -> Self:
+    #     """
+    #     Get item from one slice and one int.
+    #     """
+    #     if y < 0:
+    #         y = self.shape[1] + y
+
+    #     var start_x: Int
+    #     var end_x: Int
+    #     var step_x: Int
+    #     start_x, end_x, step_x = x.indices(self.shape[0])
+    #     var range_x = range(start_x, end_x, step_x)
+
+    #     # The new matrix with the corresponding shape
+    #     var B = Matrix[dtype](shape=(len(range_x), 1), order=self.order())
+
+    #     # Fill in the values at the corresponding index
+    #     var row = 0
+    #     for i in range_x:
+    #         B._store(row, 0, self._load(i, y))
+    #         row += 1
+
+    #     return B^
+
+    # fn __getitem__(self, var x: Int, y: Slice) -> Self:
+    #     """
+    #     Get item from one int and one slice.
+    #     """
+    #     if x < 0:
+    #         x = self.shape[0] + x
+
+    #     var start_y: Int
+    #     var end_y: Int
+    #     var step_y: Int
+    #     start_y, end_y, step_y = y.indices(self.shape[1])
+    #     var range_y = range(start_y, end_y, step_y)
+
+    #     # The new matrix with the corresponding shape
+    #     var B = Matrix[dtype](shape=(1, len(range_y)), order=self.order())
+
+    #     # Fill in the values at the corresponding index
+    #     var col = 0
+    #     for j in range_y:
+    #         B._store(0, col, self._load(x, j))
+    #         col += 1
+
+    #     return B^
+
+    # fn __getitem__(self, indices: List[Int]) raises -> Self:
+    #     """
+    #     Get item by a list of integers.
+    #     """
+
+    #     var ncol = self.shape[1]
+    #     var nrow = len(indices)
+    #     var res = Matrix.zeros[dtype](shape=(nrow, ncol))
+    #     for i in range(nrow):
+    #         res[i] = self[indices[i]]
+    #     return res^
+
+    fn _load[width: Int = 1](self, x: Int, y: Int) -> SIMD[dtype, width]:
+        """
+        `__getitem__` with width.
+        Unsafe: No boundary check!
+        """
+        return self._buf.load[width=width](
+            x * self.strides[0] + y * self.strides[1]
+        )
+
+    fn __setitem__(self, x: Int, y: Int, value: Scalar[dtype]) raises:
+        """
+        Return the scalar at the index.
+
+        Args:
+            x: The row number.
+            y: The column number.
+            value: The value to be set.
+        """
+
+        if (x >= self.shape[0]) or (y >= self.shape[1]):
+            raise Error(
+                String(
+                    "Index ({}, {}) exceed the matrix shape ({}, {})"
+                ).format(x, y, self.shape[0], self.shape[1])
+            )
+
+        self._buf.store(x * self.strides[0] + y * self.strides[1], value)
+
+    fn __setitem__(self, var x: Int, value: Self) raises:
+        """
+        Set the corresponding row at the index with the given matrix.
+
+        Args:
+            x: The row number.
+            value: Matrix (row vector).
+        """
+
+        if x < 0:
+            x = self.shape[0] + x
+
+        if x >= self.shape[0]:
+            raise Error(
+                String(
+                    "Error: Elements of `index` ({}) \n"
+                    "exceed the matrix shape ({})."
+                ).format(x, self.shape[0])
+            )
+
+        if value.shape[0] != 1:
+            raise Error(
+                String(
+                    "Error: The value should has only 1 row, "
+                    "but it has {} rows."
+                ).format(value.shape[0])
+            )
+
+        if self.shape[1] != value.shape[1]:
+            raise Error(
+                String(
+                    "Error: Matrix has {} columns, "
+                    "but the value has {} columns."
+                ).format(self.shape[1], value.shape[1])
+            )
+
+        var ptr = self._buf.offset(x * self.shape[1])
+        memcpy(ptr, value._buf, value.size)
+
+    fn _store[
+        width: Int = 1
+    ](mut self, x: Int, y: Int, simd: SIMD[dtype, width]):
+        """
+        `__setitem__` with width.
+        Unsafe: No boundary check!
+        """
+        self._buf.store(x * self.strides[0] + y * self.strides[1], simd)
+
+    # ===-------------------------------------------------------------------===#
+    # Other dunders and auxiliary methods
+    # ===-------------------------------------------------------------------===#
+
+    fn __iter__(self) raises -> _MatrixIter[__origin_of(self), dtype]:
+        """Iterate over elements of the Matrix, returning copied value.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand((4,4))
+        for i in A:
+            print(i)
+        ```
+
+        Returns:
+            An iterator of Matrix elements.
+        """
+
+        return _MatrixIter[__origin_of(self), dtype](
+            matrix=self,
+            length=self.shape[0],
+        )
+
+    fn __len__(self) -> Int:
+        """
+        Returns length of 0-th dimension.
+        """
+        return self.shape[0]
+
+    fn __reversed__(
+        self,
+    ) raises -> _MatrixIter[__origin_of(self), dtype, forward=False]:
+        """Iterate backwards over elements of the Matrix, returning
+        copied value.
+
+        Returns:
+            A reversed iterator of Matrix elements.
+        """
+
+        return _MatrixIter[__origin_of(self), dtype, forward=False](
+            matrix=self,
+            length=self.shape[0],
+        )
+
+    fn __str__(self) -> String:
+        return String.write(self)
+
+    fn write_to[W: Writer](self, mut writer: W):
+        fn print_row(self: Self, i: Int, sep: String) raises -> String:
+            var result: String = String("[")
+            var number_of_sep: Int = 1
+            if self.shape[1] <= 6:
+                for j in range(self.shape[1]):
+                    if j == self.shape[1] - 1:
+                        number_of_sep = 0
+                    result += String(self[i, j]) + sep * number_of_sep
+            else:
+                for j in range(3):
+                    result += String(self[i, j]) + sep
+                result += String("...") + sep
+                for j in range(self.shape[1] - 3, self.shape[1]):
+                    if j == self.shape[1] - 1:
+                        number_of_sep = 0
+                    result += String(self[i, j]) + sep * number_of_sep
+            result += String("]")
+            return result
+
+        var sep: String = String("\t")
+        var newline: String = String("\n ")
+        var number_of_newline: Int = 1
+        var result: String = "["
+
+        try:
+            if self.shape[0] <= 6:
+                for i in range(self.shape[0]):
+                    if i == self.shape[0] - 1:
+                        number_of_newline = 0
+                    result += (
+                        print_row(self, i, sep) + newline * number_of_newline
+                    )
+            else:
+                for i in range(3):
+                    result += print_row(self, i, sep) + newline
+                result += String("...") + newline
+                for i in range(self.shape[0] - 3, self.shape[0]):
+                    if i == self.shape[0] - 1:
+                        number_of_newline = 0
+                    result += (
+                        print_row(self, i, sep) + newline * number_of_newline
+                    )
+            result += String("]")
+            writer.write(
+                result
+                + "\nDType: "
+                + String(self.dtype)
+                + "  Shape: "
+                + String(self.shape[0])
+                + "x"
+                + String(self.shape[1])
+                + "  Strides: "
+                + String(self.strides[0])
+                + ","
+                + String(self.strides[1])
+                + "  C: "
+                + String(self.flags["C_CONTIGUOUS"])
+                + "  F: "
+                + String(self.flags["F_CONTIGUOUS"])
+                + "  Own: "
+                + String(self.flags["OWNDATA"])
+            )
+        except e:
+            print("Cannot transfer matrix to string!", e)
+
+    # ===-------------------------------------------------------------------===#
+    # Arithmetic dunder methods
+    # ===-------------------------------------------------------------------===#
+
+    fn __add__(
+        read self: Matrix[dtype, *_], read other: Matrix[dtype, *_]
+    ) raises -> Matrix[dtype, *_]:
+        # if (self.shape[0] == other.shape[0]) and (
+        #     self.shape[1] == other.shape[1]
+        # ):
+        return _arithmetic_func_matrix_matrix_to_matrix[dtype, SIMD.__add__](
+            self, other
+        )
+
+    # fn __add__(self, other: Self) raises -> Self:
+    #     if (self.shape[0] == other.shape[0]) and (
+    #         self.shape[1] == other.shape[1]
+    #     ):
+    #         return _arithmetic_func_matrix_matrix_to_matrix[
+    #             dtype, SIMD.__add__
+    #         ](self, other)
+    #     elif (self.shape[0] < other.shape[0]) or (
+    #         self.shape[1] < other.shape[1]
+    #     ):
+    #         return _arithmetic_func_matrix_matrix_to_matrix[
+    #             dtype, SIMD.__add__
+    #         ](broadcast_to(self.copy(), other.shape, self.order()), other)
+    #     else:
+    #         return _arithmetic_func_matrix_matrix_to_matrix[
+    #             dtype, SIMD.__add__
+    #         ](self, broadcast_to(other.copy(), self.shape, self.order()))
+
+    # fn __add__(self, other: Scalar[dtype]) raises -> Self:
+    #     """Add matrix to scalar.
+
+    #     ```mojo
+    #     from numojo import Matrix
+    #     var A = Matrix.ones(shape=(4, 4))
+    #     print(A + 2)
+    #     ```
+    #     """
+    #     return self + broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __radd__(self, other: Scalar[dtype]) raises -> Self:
+        """
+        Right-add.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(2 + A)
+        ```
+        """
+        return broadcast_to[dtype](other, self.shape, self.order()) + self
+
+    fn __sub__(self, other: Self) raises -> Self:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__sub__
+            ](self, other)
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__sub__
+            ](broadcast_to(self.copy(), other.shape, self.order()), other)
+        else:
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__sub__
+            ](self, broadcast_to(other.copy(), self.shape, self.order()))
+
+    fn __sub__(self, other: Scalar[dtype]) raises -> Self:
+        """Subtract matrix by scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix(shape=(4, 4))
+        print(A - 2)
+        ```
+        """
+        return self - broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __rsub__(self, other: Scalar[dtype]) raises -> Self:
+        """
+        Right-sub.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(2 - A)
+        ```
+        """
+        return broadcast_to[dtype](other, self.shape, self.order()) - self
+
+    fn __mul__(self, other: Self) raises -> Self:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__mul__
+            ](self, other)
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__mul__
+            ](broadcast_to(self.copy(), other.shape, self.order()), other)
+        else:
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__mul__
+            ](self, broadcast_to(other.copy(), self.shape, self.order()))
+
+    fn __mul__(self, other: Scalar[dtype]) raises -> Self:
+        """Mutiply matrix by scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A * 2)
+        ```
+        """
+        return self * broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __rmul__(self, other: Scalar[dtype]) raises -> Self:
+        """
+        Right-mul.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(2 * A)
+        ```
+        """
+        return broadcast_to[dtype](other, self.shape, self.order()) * self
+
+    fn __truediv__(self, other: Self) raises -> Self:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__truediv__
+            ](self, other)
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__truediv__
+            ](broadcast_to(self.copy(), other.shape, self.order()), other)
+        else:
+            return _arithmetic_func_matrix_matrix_to_matrix[
+                dtype, SIMD.__truediv__
+            ](self, broadcast_to(other.copy(), self.shape, self.order()))
+
+    fn __truediv__(self, other: Scalar[dtype]) raises -> Self:
+        """Divide matrix by scalar."""
+        return self / broadcast_to[dtype](other, self.shape, order=self.order())
+
+    # Shouldn't we do the operation inplace?
+    fn __pow__(self, rhs: Scalar[dtype]) raises -> Self:
+        """Power of items."""
+        var result: Self = self.copy()
+        for i in range(self.size):
+            result._buf.ptr[i] = self._buf.ptr[i].__pow__(rhs)
+        return result^
+
+    fn __lt__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.lt](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.lt](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.lt](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __lt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix less than scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A < 2)
+        ```
+        """
+        return self < broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __le__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.le](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.le](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.le](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __le__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix less than and equal to scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A <= 2)
+        ```
+        """
+        return self <= broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __gt__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.gt](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.gt](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.gt](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __gt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix greater than scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A > 2)
+        ```
+        """
+        return self > broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __ge__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ge](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ge](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ge](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __ge__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix greater than and equal to scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A >= 2)
+        ```
+        """
+        return self >= broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __eq__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.eq](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.eq](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.eq](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __eq__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix less than and equal to scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A == 2)
+        ```
+        """
+        return self == broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __ne__(self, other: Self) raises -> Matrix[DType.bool]:
+        if (self.shape[0] == other.shape[0]) and (
+            self.shape[1] == other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ne](
+                self, other
+            )
+        elif (self.shape[0] < other.shape[0]) or (
+            self.shape[1] < other.shape[1]
+        ):
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ne](
+                broadcast_to(self.copy(), other.shape, self.order()), other
+            )
+        else:
+            return _logic_func_matrix_matrix_to_matrix[dtype, SIMD.ne](
+                self, broadcast_to(other.copy(), self.shape, self.order())
+            )
+
+    fn __ne__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+        """Matrix less than and equal to scalar.
+
+        ```mojo
+        from numojo import Matrix
+        A = Matrix.ones(shape=(4, 4))
+        print(A != 2)
+        ```
+        """
+        return self != broadcast_to[dtype](other, self.shape, self.order())
+
+    fn __matmul__(self, other: Self) raises -> Self:
+        return numojo.linalg.matmul(self, other)
+
+    # ===-------------------------------------------------------------------===#
+    # Core methods
+    # ===-------------------------------------------------------------------===#
+
+    fn all(self) -> Scalar[dtype]:
+        """
+        Test whether all array elements evaluate to True.
+        """
+        return numojo.logic.all(self)
+
+    fn all(self, axis: Int) raises -> Self:
+        """
+        Test whether all array elements evaluate to True along axis.
+        """
+        return numojo.logic.all(self, axis=axis)
+
+    fn any(self) -> Scalar[dtype]:
+        """
+        Test whether any array elements evaluate to True.
+        """
+        return numojo.logic.any(self)
+
+    fn any(self, axis: Int) raises -> Self:
+        """
+        Test whether any array elements evaluate to True along axis.
+        """
+        return numojo.logic.any(self, axis=axis)
+
+    fn argmax(self) raises -> Scalar[DType.int]:
+        """
+        Index of the max. It is first flattened before sorting.
+        """
+        return numojo.math.argmax(self)
+
+    fn argmax(self, axis: Int) raises -> Matrix[DType.int]:
+        """
+        Index of the max along the given axis.
+        """
+        return numojo.math.argmax(self, axis=axis)
+
+    fn argmin(self) raises -> Scalar[DType.int]:
+        """
+        Index of the min. It is first flattened before sorting.
+        """
+        return numojo.math.argmin(self)
+
+    fn argmin(self, axis: Int) raises -> Matrix[DType.int]:
+        """
+        Index of the min along the given axis.
+        """
+        return numojo.math.argmin(self, axis=axis)
+
+    fn argsort(self) raises -> Matrix[DType.int]:
+        """
+        Argsort the Matrix. It is first flattened before sorting.
+        """
+        return numojo.math.argsort(self)
+
+    fn argsort(self, axis: Int) raises -> Matrix[DType.int]:
+        """
+        Argsort the Matrix along the given axis.
+        """
+        return numojo.math.argsort(self.copy(), axis=axis)
+
+    fn astype[asdtype: DType](self) -> Matrix[asdtype]:
+        """
+        Copy of the matrix, cast to a specified type.
+        """
+        var res = Matrix[asdtype](
+            shape=(self.shape[0], self.shape[1]), order=self.order()
+        )
+        for i in range(self.size):
+            res._buf.ptr[i] = self._buf.ptr[i].cast[asdtype]()
+        return res^
+
+    fn cumprod(self) raises -> Matrix[dtype]:
+        """
+        Cumprod of flattened matrix.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand(shape=(100, 100))
+        print(A.cumprod())
+        ```
+        """
+        return numojo.math.cumprod(self.copy())
+
+    fn cumprod(self, axis: Int) raises -> Matrix[dtype]:
+        """
+        Cumprod of Matrix along the axis.
+
+        Args:
+            axis: 0 or 1.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand(shape=(100, 100))
+        print(A.cumprod(axis=0))
+        print(A.cumprod(axis=1))
+        ```
+        """
+        return numojo.math.cumprod(self.copy(), axis=axis)
+
+    fn cumsum(self) raises -> Matrix[dtype]:
+        return numojo.math.cumsum(self.copy())
+
+    fn cumsum(self, axis: Int) raises -> Matrix[dtype]:
+        return numojo.math.cumsum(self.copy(), axis=axis)
+
+    fn fill(self, fill_value: Scalar[dtype]):
+        """
+        Fill the matrix with value.
+
+        See also function `mat.creation.full`.
+        """
+        for i in range(self.size):
+            self._buf.ptr[i] = fill_value
+
+    fn flatten(self) -> Self:
+        """
+        Return a flattened copy of the matrix.
+        """
+        var res = Self(shape=(1, self.size), order=self.order())
+        memcpy(res._buf.ptr, self._buf.ptr, res.size)
+        return res^
+
+    fn inv(self) raises -> Self:
+        """
+        Inverse of matrix.
+        """
+        return numojo.linalg.inv(self)
+
+    fn order(self) -> String:
+        """
+        Returns the order.
+        """
+        var order: String = "F"
+        if self.flags.C_CONTIGUOUS:
+            order = "C"
+        return order
+
+    fn max(self) raises -> Scalar[dtype]:
+        """
+        Find max item. It is first flattened before sorting.
+        """
+        return numojo.math.extrema.max(self)
+
+    fn max(self, axis: Int) raises -> Self:
+        """
+        Find max item along the given axis.
+        """
+        return numojo.math.extrema.max(self, axis=axis)
+
+    fn mean[
+        returned_dtype: DType = DType.float64
+    ](self) raises -> Scalar[returned_dtype]:
+        """
+        Calculate the arithmetic average of all items in the Matrix.
+        """
+        return numojo.statistics.mean[returned_dtype](self)
+
+    fn mean[
+        returned_dtype: DType = DType.float64
+    ](self, axis: Int) raises -> Matrix[returned_dtype]:
+        """
+        Calculate the arithmetic average of a Matrix along the axis.
+
+        Args:
+            axis: 0 or 1.
+        """
+        return numojo.statistics.mean[returned_dtype](self, axis=axis)
+
+    fn min(self) raises -> Scalar[dtype]:
+        """
+        Find min item. It is first flattened before sorting.
+        """
+        return numojo.math.extrema.min(self)
+
+    fn min(self, axis: Int) raises -> Self:
+        """
+        Find min item along the given axis.
+        """
+        return numojo.math.extrema.min(self, axis=axis)
+
+    fn prod(self) -> Scalar[dtype]:
+        """
+        Product of all items in the Matrix.
+        """
+        return numojo.math.prod(self)
+
+    fn prod(self, axis: Int) raises -> Self:
+        """
+        Product of items in a Matrix along the axis.
+
+        Args:
+            axis: 0 or 1.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand(shape=(100, 100))
+        print(A.prod(axis=0))
+        print(A.prod(axis=1))
+        ```
+        """
+        return numojo.math.prod(self, axis=axis)
+
+    fn reshape(self, shape: Tuple[Int, Int]) raises -> Self:
+        """
+        Change shape and size of matrix and return a new matrix.
+        """
+        if shape[0] * shape[1] != self.size:
+            raise Error(
+                String(
+                    "Cannot reshape matrix of size {} into shape ({}, {})."
+                ).format(self.size, shape[0], shape[1])
+            )
+        var res = Self(shape=shape, order="C")
+        if self.flags.F_CONTIGUOUS:
+            var temp = self.reorder_layout()
+            memcpy(res._buf.ptr, temp._buf.ptr, res.size)
+            res = res.reorder_layout()
+        else:
+            memcpy(res._buf.ptr, self._buf.ptr, res.size)
+        return res^
+
+    fn resize(mut self, shape: Tuple[Int, Int]) raises:
+        """
+        Change shape and size of matrix in-place.
+        """
+        if shape[0] * shape[1] > self.size:
+            var other = Self(shape=shape)
+            if self.flags.C_CONTIGUOUS:
+                memcpy(other._buf.ptr, self._buf.ptr, self.size)
+                for i in range(self.size, other.size):
+                    other._buf.ptr[i] = 0
+            else:
+                var idx = 0
+                for i in range(other.size):
+                    other._buf.ptr.store(i, 0.0)
+                    if idx < self.size:
+                        other._buf.ptr[i] = self._buf.ptr[
+                            (i % self.shape[1]) * self.shape[0]
+                            + (i // self.shape[1])
+                        ]
+                        idx += 1
+                other = other.reorder_layout()
+            self = other^
+        else:
+            self.shape[0] = shape[0]
+            self.shape[1] = shape[1]
+            self.size = shape[0] * shape[1]
+
+            if self.flags.C_CONTIGUOUS:
+                self.strides[0] = shape[1]
+            else:
+                self.strides[1] = shape[0]
+
+    fn round(self, decimals: Int) raises -> Self:
+        return numojo.math.rounding.round(self.copy(), decimals=decimals)
+
+    fn std[
+        returned_dtype: DType = DType.float64
+    ](self, ddof: Int = 0) raises -> Scalar[returned_dtype]:
+        """
+        Compute the standard deviation.
+
+        Args:
+            ddof: Delta degree of freedom.
+        """
+        return numojo.statistics.std[returned_dtype](self, ddof=ddof)
+
+    fn std[
+        returned_dtype: DType = DType.float64
+    ](self, axis: Int, ddof: Int = 0) raises -> Matrix[returned_dtype]:
+        """
+        Compute the standard deviation along axis.
+
+        Args:
+            axis: 0 or 1.
+            ddof: Delta degree of freedom.
+        """
+        return numojo.statistics.std[returned_dtype](self, axis=axis, ddof=ddof)
+
+    fn sum(self) -> Scalar[dtype]:
+        """
+        Sum up all items in the Matrix.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand(shape=(100, 100))
+        print(A.sum())
+        ```
+        """
+        return numojo.math.sum(self)
+
+    fn sum(self, axis: Int) raises -> Self:
+        """
+        Sum up the items in a Matrix along the axis.
+
+        Args:
+            axis: 0 or 1.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand(shape=(100, 100))
+        print(A.sum(axis=0))
+        print(A.sum(axis=1))
+        ```
+        """
+        return numojo.math.sum(self, axis=axis)
+
+    fn trace(self) raises -> Scalar[dtype]:
+        """
+        Trace of matrix.
+        """
+        return numojo.linalg.trace(self)
+
+    fn issymmetric(self) -> Bool:
+        """
+        Transpose of matrix.
+        """
+        return issymmetric(self)
+
+    fn transpose(self) -> Self:
+        """
+        Transpose of matrix.
+        """
+        return transpose(self)
+
+    fn reorder_layout(self) raises -> Self:
+        """
+        Reorder_layout matrix.
+        """
+        return reorder_layout(self)
+
+    fn T(self) -> Self:
+        return transpose(self)
+
+    fn variance[
+        returned_dtype: DType = DType.float64
+    ](self, ddof: Int = 0) raises -> Scalar[returned_dtype]:
+        """
+        Compute the variance.
+
+        Args:
+            ddof: Delta degree of freedom.
+        """
+        return numojo.statistics.variance[returned_dtype](self, ddof=ddof)
+
+    fn variance[
+        returned_dtype: DType = DType.float64
+    ](self, axis: Int, ddof: Int = 0) raises -> Matrix[returned_dtype]:
+        """
+        Compute the variance along axis.
+
+        Args:
+            axis: 0 or 1.
+            ddof: Delta degree of freedom.
+        """
+        return numojo.statistics.variance[returned_dtype](
+            self, axis=axis, ddof=ddof
+        )
+
+    # ===-------------------------------------------------------------------===#
+    # To other data types
+    # ===-------------------------------------------------------------------===#
+
+    fn to_ndarray(self) raises -> NDArray[dtype]:
+        """Create `NDArray` from `Matrix`.
+
+        It makes a copy of the buffer of the matrix.
+        """
+
+        var ndarray: NDArray[dtype] = NDArray[dtype](
+            shape=List[Int](self.shape[0], self.shape[1]), order="C"
+        )
+        memcpy(ndarray._buf.ptr, self._buf.ptr, ndarray.size)
+
+        return ndarray^
+
+    fn to_numpy(self) raises -> PythonObject:
+        """See `numojo.core.utility.to_numpy`."""
+        try:
+            var np = Python.import_module("numpy")
+
+            var np_arr_dim = Python.list()
+            np_arr_dim.append(self.shape[0])
+            np_arr_dim.append(self.shape[1])
+
+            np.set_printoptions(4)
+
+            # Implement a dictionary for this later
+            var numpyarray: PythonObject
+            var np_dtype = np.float64
+            if dtype == DType.float16:
+                np_dtype = np.float16
+            elif dtype == DType.float32:
+                np_dtype = np.float32
+            elif dtype == DType.int64:
+                np_dtype = np.int64
+            elif dtype == DType.int32:
+                np_dtype = np.int32
+            elif dtype == DType.int16:
+                np_dtype = np.int16
+            elif dtype == DType.int8:
+                np_dtype = np.int8
+            elif dtype == DType.uint64:
+                np_dtype = np.uint64
+            elif dtype == DType.uint32:
+                np_dtype = np.uint32
+            elif dtype == DType.uint16:
+                np_dtype = np.uint16
+            elif dtype == DType.uint8:
+                np_dtype = np.uint8
+            elif dtype == DType.bool:
+                np_dtype = np.bool_
+            elif dtype == DType.int:
+                np_dtype = np.int64
+
+            var order = "C" if self.flags.C_CONTIGUOUS else "F"
+            numpyarray = np.empty(np_arr_dim, dtype=np_dtype, order=order)
+            var pointer_d = numpyarray.__array_interface__["data"][
+                0
+            ].unsafe_get_as_pointer[dtype]()
+            memcpy(pointer_d, self._buf.ptr, self.size)
+
+            return numpyarray^
+
+        except e:
+            print("Error in converting to numpy", e)
+            return PythonObject()
+
+    # ===-----------------------------------------------------------------------===#
+    # Static methods to construct matrix
+    # ===-----------------------------------------------------------------------===#
+
+    @staticmethod
+    fn full[
+        dtype: DType = DType.float64
+    ](
+        shape: Tuple[Int, Int],
+        fill_value: Scalar[dtype] = 0,
+        order: String = "C",
+    ) -> Matrix[dtype]:
+        """Return a matrix with given shape and filled value.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.full(shape=(10, 10), fill_value=100)
+        ```
+        """
+
+        var matrix = Matrix[dtype](shape, order)
+        for i in range(shape[0] * shape[1]):
+            matrix._buf.ptr.store(i, fill_value)
+
+        return matrix^
+
+    @staticmethod
+    fn zeros[
+        dtype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        """Return a matrix with given shape and filled with zeros.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.ones(shape=(10, 10))
+        ```
+        """
+
+        var M = Matrix[dtype](shape, order)
+        memset_zero(M._buf.ptr, M.size)
+        return M^
+
+    @staticmethod
+    fn ones[
+        dtype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        """Return a matrix with given shape and filled with ones.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.ones(shape=(10, 10))
+        ```
+        """
+
+        return Matrix.full[dtype](shape=shape, fill_value=1)
+
+    @staticmethod
+    fn identity[
+        dtype: DType = DType.float64
+    ](len: Int, order: String = "C") -> Matrix[dtype]:
+        """Return an identity matrix with given size.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.identity(12)
+        ```
+        """
+        var matrix = Matrix.zeros[dtype]((len, len), order)
+        for i in range(len):
+            matrix._buf.ptr.store(
+                i * matrix.strides[0] + i * matrix.strides[1], 1
+            )
+        return matrix^
+
+    @staticmethod
+    fn rand[
+        dtype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        """Return a matrix with random values uniformed distributed between 0 and 1.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        var A = Matrix.rand((12, 12))
+        ```
+
+        Parameters:
+            dtype: The data type of the NDArray elements.
+
+        Args:
+            shape: The shape of the Matrix.
+            order: The order of the Matrix. "C" or "F".
+        """
+        var result = Matrix[dtype](shape, order)
+        for i in range(result.size):
+            result._buf.ptr.store(i, random_float64(0, 1).cast[dtype]())
+        return result^
+
+    @staticmethod
+    fn fromlist[
+        dtype: DType
+    ](
+        object: List[Scalar[dtype]],
+        shape: Tuple[Int, Int] = (0, 0),
+        order: String = "C",
+    ) raises -> Matrix[dtype]:
+        """Create a matrix from a 1-dimensional list into given shape.
+
+        If no shape is passed, the return matrix will be a row vector.
+
+        Example:
+        ```mojo
+        from numojo import Matrix
+        fn main() raises:
+            print(Matrix.fromlist(List[Float64](1, 2, 3, 4, 5), (5, 1)))
+        ```
+        """
+
+        if (shape[0] == 0) and (shape[1] == 0):
+            var M = Matrix[dtype](shape=(1, len(object)))
+            memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+            return M^
+
+        if shape[0] * shape[1] != len(object):
+            var message = String(
+                "The input has {} elements, but the target has the shape {}x{}"
+            ).format(len(object), shape[0], shape[1])
+            raise Error(message)
+        var M = Matrix[dtype](shape=shape, order="C")
+        memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+        if order == "F":
+            M = M.reorder_layout()
+        return M^
+
+    @staticmethod
+    fn fromstring[
+        dtype: DType = DType.float64
+    ](
+        text: String, shape: Tuple[Int, Int] = (0, 0), order: String = "C"
+    ) raises -> Matrix[dtype]:
+        """Matrix initialization from string representation of an matrix.
+
+        Comma, right brackets, and whitespace are treated as seperators of numbers.
+        Digits, underscores, and minus signs are treated as a part of the numbers.
+
+        If now shape is passed, the return matrix will be a row vector.
+
+        Example:
+        ```mojo
+        from numojo.prelude import *
+        from numojo import Matrix
+        fn main() raises:
+            var A = Matrix.fromstring[f32](
+            "1 2 .3 4 5 6.5 7 1_323.12 9 10, 11.12, 12 13 14 15 16", (4, 4))
+        ```
+        ```console
+        [[1.0   2.0     0.30000001192092896     4.0]
+        [5.0   6.5     7.0     1323.1199951171875]
+        [9.0   10.0    11.119999885559082      12.0]
+        [13.0  14.0    15.0    16.0]]
+        Size: 4x4  DType: float32
+        ```
+
+        Args:
+            text: String representation of a matrix.
+            shape: Shape of the matrix.
+            order: Order of the matrix. "C" or "F".
+        """
+
+        var data = List[Scalar[dtype]]()
+        var bytes = text.as_bytes()
+        var number_as_str: String = ""
+        var size = shape[0] * shape[1]
+
+        for i in range(len(bytes)):
+            var b = bytes[i]
+            if (
+                chr(Int(b)).isdigit()
+                or (chr(Int(b)) == ".")
+                or (chr(Int(b)) == "-")
+            ):
+                number_as_str = number_as_str + chr(Int(b))
+                if i == len(bytes) - 1:  # Last byte
+                    var number = atof(number_as_str).cast[dtype]()
+                    data.append(number)  # Add the number to the data buffer
+                    number_as_str = ""  # Clean the number cache
+            if (
+                (chr(Int(b)) == ",")
+                or (chr(Int(b)) == "]")
+                or (chr(Int(b)) == " ")
+            ):
+                if number_as_str != "":
+                    var number = atof(number_as_str).cast[dtype]()
+                    data.append(number)  # Add the number to the data buffer
+                    number_as_str = ""  # Clean the number cache
+
+        if (shape[0] == 0) and (shape[1] == 0):
+            return Matrix.fromlist(data)
+
+        if size != len(data):
+            var message = String(
+                "The number of items in the string is {}, which does not match"
+                " the given shape {}x{}."
+            ).format(len(data), shape[0], shape[1])
+            raise Error(message)
+
+        var result = Matrix[dtype](shape=shape)
+        for i in range(len(data)):
+            result._buf.ptr[i] = data[i]
+        return result^
+
+
+# ===-----------------------------------------------------------------------===#
+# MatrixIter struct
+# ===-----------------------------------------------------------------------===#
+
+
+# ! Should the iterator be mutable or not?
+struct _MatrixIter[
+    is_mutable: Bool, //,
+    lifetime: Origin[is_mutable],
+    dtype: DType,
+    forward: Bool = True,
+](Copyable, Movable):
+    """Iterator for Matrix.
+
+    Parameters:
+        is_mutable: Whether the iterator is mutable.
+        lifetime: The lifetime of the underlying Matrix data.
+        dtype: The data type of the item.
+        forward: The iteration direction. `False` is backwards.
+    """
+
+    var index: Int
+    var matrix: Matrix[dtype]
+    var length: Int
+
+    fn __init__(
+        out self,
+        matrix: Matrix[dtype],
+        length: Int,
+    ):
+        self.index = 0 if forward else length
+        self.length = length
+        self.matrix = matrix.copy()
+
+    fn __iter__(self) -> Self:
+        return self.copy()
+
+    fn __next__(mut self) raises -> Matrix[dtype]:
+        @parameter
+        if forward:
+            var current_index = self.index
+            self.index += 1
+            return self.matrix[current_index]
+        else:
+            var current_index = self.index
+            self.index -= 1
+            return self.matrix[current_index]
+
+    @always_inline
+    fn __has_next__(self) -> Bool:
+        @parameter
+        if forward:
+            return self.index < self.length
+        else:
+            return self.index > 0
+
+    fn __len__(self) -> Int:
+        @parameter
+        if forward:
+            return self.length - self.index
+        else:
+            return self.index
+
+
+# ===-----------------------------------------------------------------------===#
+# Backend fucntions using SMID functions
+# ===-----------------------------------------------------------------------===#
+
+
+fn _arithmetic_func_matrix_matrix_to_matrix[
+    dtype: DType,
+    simd_func: fn[type: DType, simd_width: Int] (
+        SIMD[type, simd_width], SIMD[type, simd_width]
+    ) -> SIMD[type, simd_width],
+](A: Matrix[dtype], B: Matrix[dtype]) raises -> Matrix[dtype]:
+    """
+    Matrix[dtype] & Matrix[dtype] -> Matrix[dtype]
+
+    For example: `__add__`, `__sub__`, etc.
+    """
+    alias simd_width = simd_width_of[dtype]()
+    if A.order() != B.order():
+        raise Error(
+            String("Matrix order {} does not match {}.").format(
+                A.order(), B.order()
+            )
+        )
+
+    if (A.shape[0] != B.shape[0]) or (A.shape[1] != B.shape[1]):
+        raise Error(
+            String("Shape {}x{} does not match {}x{}.").format(
+                A.shape[0], A.shape[1], B.shape[0], B.shape[1]
+            )
+        )
+
+    var C = Matrix[dtype](shape=A.shape, order=A.order())
+
+    @parameter
+    fn vec_func[simd_width: Int](i: Int):
+        C._buf.store(
+            i,
+            simd_func(
+                A._buf.load[width=simd_width](i),
+                B._buf.load[width=simd_width](i),
+            ),
+        )
+
+    vectorize[vec_func, simd_width](A.size)
+
+    return C^
+
+
+fn _arithmetic_func_matrix_to_matrix[
+    dtype: DType,
+    simd_func: fn[type: DType, simd_width: Int] (
+        SIMD[type, simd_width]
+    ) -> SIMD[type, simd_width],
+](A: Matrix[dtype]) -> Matrix[dtype]:
+    """
+    Matrix[dtype] -> Matrix[dtype]
+
+    For example: `sin`, `cos`, etc.
+    """
+    alias simd_width: Int = simd_width_of[dtype]()
+
+    var C: Matrix[dtype] = Matrix[dtype](shape=A.shape, order=A.order())
+
+    @parameter
+    fn vec_func[simd_width: Int](i: Int):
+        C._buf.ptr.store(i, simd_func(A._buf.ptr.load[width=simd_width](i)))
+
+    vectorize[vec_func, simd_width](A.size)
+
+    return C^
+
+
+fn _logic_func_matrix_matrix_to_matrix[
+    dtype: DType,
+    simd_func: fn[type: DType, simd_width: Int] (
+        SIMD[type, simd_width], SIMD[type, simd_width]
+    ) -> SIMD[DType.bool, simd_width],
+](A: Matrix[dtype], B: Matrix[dtype]) raises -> Matrix[DType.bool]:
+    """
+    Matrix[dtype] & Matrix[dtype] -> Matrix[bool]
+    """
+    alias width = simd_width_of[dtype]()
+
+    if A.order() != B.order():
+        raise Error(
+            String("Matrix order {} does not match {}.").format(
+                A.order(), B.order()
+            )
+        )
+
+    if (A.shape[0] != B.shape[0]) or (A.shape[1] != B.shape[1]):
+        raise Error(
+            String("Shape {}x{} does not match {}x{}.").format(
+                A.shape[0], A.shape[1], B.shape[0], B.shape[1]
+            )
+        )
+
+    var t0 = A.shape[0]
+    var t1 = A.shape[1]
+    var C = Matrix[DType.bool](shape=A.shape, order=A.order())
+
+    @parameter
+    fn calculate_CC(m: Int):
+        @parameter
+        fn vec_func[simd_width: Int](n: Int):
+            C._store[simd_width](
+                m,
+                n,
+                simd_func(A._load[simd_width](m, n), B._load[simd_width](m, n)),
+            )
+
+        vectorize[vec_func, width](t1)
+
+    parallelize[calculate_CC](t0, t0)
+
+    var _t0 = t0
+    var _t1 = t1
+    var _A = (
+        A.copy()
+    )  # ! perhaps remove this explicit copy if we don't need to extend it's lifetime.
+    var _B = B.copy()
+
+    return C^
diff --git a/numojo/__init__.mojo b/numojo/__init__.mojo
index 50a16eee..9488b874 100644
--- a/numojo/__init__.mojo
+++ b/numojo/__init__.mojo
@@ -208,7 +208,7 @@ from numojo.routines.creation import (
 )
 
 from numojo.routines import indexing
-from numojo.routines.indexing import where, compress, take_along_axis
+from numojo.routines.indexing import `where`, compress, take_along_axis
 
 from numojo.routines.functional import apply_along_axis
 
diff --git a/numojo/core/__init__.mojo b/numojo/core/__init__.mojo
index e90a56d7..b8cf4ec5 100644
--- a/numojo/core/__init__.mojo
+++ b/numojo/core/__init__.mojo
@@ -5,6 +5,8 @@ from .ndarray import NDArray
 from .item import Item
 from .ndshape import NDArrayShape
 from .ndstrides import NDArrayStrides
+from .own_data import OwnData
+from .ref_data import RefData
 
 from .complex import (
     ComplexSIMD,
diff --git a/numojo/core/complex/complex_ndarray.mojo b/numojo/core/complex/complex_ndarray.mojo
index cf207431..5d509303 100644
--- a/numojo/core/complex/complex_ndarray.mojo
+++ b/numojo/core/complex/complex_ndarray.mojo
@@ -40,7 +40,8 @@ import builtin.math as builtin_math
 from builtin.type_aliases import Origin
 from collections.optional import Optional
 from math import log10, sqrt
-from memory import UnsafePointer, memset_zero, memcpy
+from memory import memset_zero, memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 from python import PythonObject
 from sys import simd_width_of
 from utils import Variant
@@ -393,7 +394,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
     # fn __getitem__(self, *slices: Variant[Slice, Int]) raises -> Self         # Get by mix of slices/ints
     #
     # 4. Advanced Indexing
-    # fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self        # Get by index array
+    # fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self        # Get by index array
     # fn __getitem__(self, indices: List[Int]) raises -> Self                   # Get by list of indices
     # fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self            # Get by boolean mask
     # fn __getitem__(self, mask: List[Bool]) raises -> Self                     # Get by boolean list
@@ -646,13 +647,21 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         # Fast path for C-contiguous
         if self.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(result._re._buf.ptr, self._re._buf.ptr + norm * block, block)
-            memcpy(result._im._buf.ptr, self._im._buf.ptr + norm * block, block)
+            memcpy(
+                dest=result._re._buf.ptr,
+                src=self._re._buf.ptr + norm * block,
+                count=block,
+            )
+            memcpy(
+                dest=result._im._buf.ptr,
+                src=self._im._buf.ptr + norm * block,
+                count=block,
+            )
             return result^
 
         # F layout
-        self._re._copy_first_axis_slice[Self.dtype](self._re, norm, result._re)
-        self._im._copy_first_axis_slice[Self.dtype](self._im, norm, result._im)
+        self[Self.dtype]._re._copy_first_axis_slice(self._re, norm, result._re)
+        self[Self.dtype]._im._copy_first_axis_slice(self._im, norm, result._im)
         return result^
 
     fn __getitem__(self, var *slices: Slice) raises -> Self:
@@ -927,7 +936,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         narr = self.__getitem__(slice_list^)
         return narr^
 
-    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
+    fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self:
         """
         Get items from 0-th dimension of a ComplexNDArray of indices.
         If the original array is of shape (i,j,k) and
@@ -969,14 +978,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                     )
                 )
             memcpy(
-                result._re._buf.ptr + i * size_per_item,
-                self._re._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._re._buf.ptr + i * size_per_item,
+                src=self._re._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
             memcpy(
-                result._im._buf.ptr + i * size_per_item,
-                self._im._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._im._buf.ptr + i * size_per_item,
+                src=self._im._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
 
         return result^
@@ -985,7 +994,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         """
         Get items from 0-th dimension of a ComplexNDArray of indices.
         It is an overload of
-        `__getitem__(self, indices: NDArray[DType.index]) raises -> Self`.
+        `__getitem__(self, indices: NDArray[DType.int]) raises -> Self`.
 
         Args:
             indices: A list of Int.
@@ -998,7 +1007,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
         """
 
-        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
+        var indices_array = NDArray[DType.int](shape=Shape(len(indices)))
         for i in range(len(indices)):
             (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
 
@@ -1108,14 +1117,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
         for i in range(mask.size):
             if mask.item(i):
                 memcpy(
-                    result._re._buf.ptr + offset * size_per_item,
-                    self._re._buf.ptr + i * size_per_item,
-                    size_per_item,
+                    dest=result._re._buf.ptr + offset * size_per_item,
+                    src=self._re._buf.ptr + i * size_per_item,
+                    count=size_per_item,
                 )
                 memcpy(
-                    result._im._buf.ptr + offset * size_per_item,
-                    self._im._buf.ptr + i * size_per_item,
-                    size_per_item,
+                    dest=result._im._buf.ptr + offset * size_per_item,
+                    src=self._im._buf.ptr + i * size_per_item,
+                    count=size_per_item,
                 )
                 offset += 1
 
@@ -1657,13 +1666,21 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                         ),
                     )
                 )
-            memcpy(self._re._buf.ptr + norm * block, val._re._buf.ptr, block)
-            memcpy(self._im._buf.ptr + norm * block, val._im._buf.ptr, block)
+            memcpy(
+                dest=self._re._buf.ptr + norm * block,
+                src=val._re._buf.ptr,
+                count=block,
+            )
+            memcpy(
+                dest=self._im._buf.ptr + norm * block,
+                src=val._im._buf.ptr,
+                count=block,
+            )
             return
 
         # F order
-        self._re._write_first_axis_slice[Self.dtype](self._re, norm, val._re)
-        self._im._write_first_axis_slice[Self.dtype](self._im, norm, val._im)
+        self[Self.dtype]._re._write_first_axis_slice(self._re, norm, val._re)
+        self[Self.dtype]._im._write_first_axis_slice(self._im, norm, val._im)
 
     fn __setitem__(mut self, index: Item, val: ComplexSIMD[cdtype]) raises:
         """
@@ -1854,7 +1871,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
     #     self.__setitem__(slices=slice_list, val=val)
 
-    fn __setitem__(self, index: NDArray[DType.index], val: Self) raises:
+    fn __setitem__(self, index: NDArray[DType.int], val: Self) raises:
         """
         Returns the items of the ComplexNDArray from an array of indices.
 
@@ -3141,7 +3158,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
 
     fn __iter__(
         self,
-    ) raises -> _ComplexNDArrayIter[__origin_of(self._re), cdtype]:
+    ) raises -> _ComplexNDArrayIter[origin_of(self._re), cdtype]:
         """
         Iterates over elements of the ComplexNDArray and return sub-arrays as view.
 
@@ -3149,16 +3166,14 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             An iterator of ComplexNDArray elements.
         """
 
-        return _ComplexNDArrayIter[__origin_of(self._re), cdtype](
+        return _ComplexNDArrayIter[origin_of(self._re), cdtype](
             self,
             dimension=0,
         )
 
     fn __reversed__(
         self,
-    ) raises -> _ComplexNDArrayIter[
-        __origin_of(self._re), cdtype, forward=False
-    ]:
+    ) raises -> _ComplexNDArrayIter[origin_of(self._re), cdtype, forward=False]:
         """
         Iterates backwards over elements of the ComplexNDArray, returning
         copied value.
@@ -3167,9 +3182,7 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             A reversed iterator of NDArray elements.
         """
 
-        return _ComplexNDArrayIter[
-            __origin_of(self._re), cdtype, forward=False
-        ](
+        return _ComplexNDArrayIter[origin_of(self._re), cdtype, forward=False](
             self,
             dimension=0,
         )
@@ -3272,13 +3285,13 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
             var result: NDArray[dtype = Self.dtype] = NDArray[
                 dtype = Self.dtype
             ](self.shape)
-            memcpy(result._buf.ptr, self._re._buf.ptr, self.size)
+            memcpy(dest=result._buf.ptr, src=self._re._buf.ptr, count=self.size)
             return result^
         elif type == "im":
             var result: NDArray[dtype = Self.dtype] = NDArray[
                 dtype = Self.dtype
             ](self.shape)
-            memcpy(result._buf.ptr, self._im._buf.ptr, self.size)
+            memcpy(dest=result._buf.ptr, src=self._im._buf.ptr, count=self.size)
             return result^
         else:
             raise Error(
@@ -3939,8 +3952,8 @@ struct ComplexNDArray[cdtype: ComplexDType = ComplexDType.float64](
                 )
             )
 
-        var diag_re = self._re.diagonal[Self.dtype](offset)
-        var diag_im = self._im.diagonal[Self.dtype](offset)
+        var diag_re = self[Self.dtype]._re.diagonal(offset)
+        var diag_im = self[Self.dtype]._im.diagonal(offset)
         return Self(diag_re^, diag_im^)
 
     fn trace(self) raises -> ComplexSIMD[cdtype]:
diff --git a/numojo/core/data_container.mojo b/numojo/core/data_container.mojo
index dbab9d19..b27aee00 100644
--- a/numojo/core/data_container.mojo
+++ b/numojo/core/data_container.mojo
@@ -3,14 +3,14 @@
 #
 # TODO: fields in traits are not supported yet by Mojo
 # Currently use `get_ptr()` to get pointer, in future, use `ptr` directly.
-# var ptr: UnsafePointer[Scalar[dtype]]
+# var ptr: LegacyUnsafePointer[Scalar[dtype]]
 # ===----------------------------------------------------------------------===
 
-from memory import UnsafePointer
+from memory import LegacyUnsafePointer
 
 
 struct DataContainer[dtype: DType]():
-    var ptr: UnsafePointer[Scalar[dtype]]
+    var ptr: LegacyUnsafePointer[Scalar[dtype]]
 
     fn __init__(out self, size: Int):
         """
@@ -21,9 +21,9 @@ struct DataContainer[dtype: DType]():
         `ndarray.flags['OWN_DATA']` should be set as True.
         The memory should be freed by `__del__`.
         """
-        self.ptr = UnsafePointer[Scalar[dtype]]().alloc(size)
+        self.ptr = LegacyUnsafePointer[Scalar[dtype]]().alloc(size)
 
-    fn __init__(out self, ptr: UnsafePointer[Scalar[dtype]]):
+    fn __init__(out self, ptr: LegacyUnsafePointer[Scalar[dtype]]):
         """
         Do not use this if you know what it means.
         If the pointer is associated with another array, it might cause
@@ -38,5 +38,5 @@ struct DataContainer[dtype: DType]():
     fn __moveinit__(out self, deinit other: Self):
         self.ptr = other.ptr
 
-    fn get_ptr(self) -> UnsafePointer[Scalar[dtype]]:
+    fn get_ptr(self) -> LegacyUnsafePointer[Scalar[dtype]]:
         return self.ptr
diff --git a/numojo/core/item.mojo b/numojo/core/item.mojo
index 9d3d8f9a..57a33d42 100644
--- a/numojo/core/item.mojo
+++ b/numojo/core/item.mojo
@@ -6,7 +6,8 @@ Implements Item type.
 
 from builtin.type_aliases import Origin
 from builtin.int import index as index_int
-from memory import UnsafePointer, memset_zero, memcpy
+from memory import memset_zero, memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 from memory import memcmp
 from os import abort
 from sys import simd_width_of
@@ -131,7 +132,7 @@ struct Item(
         """
         self.ndim = other.ndim
         self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
-        memcpy(self._buf, other._buf, self.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=self.ndim)
 
     @always_inline("nodebug")
     fn __del__(deinit self):
@@ -357,7 +358,7 @@ struct Item(
             A new Item with the same values.
         """
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         return res^
 
     fn swapaxes(self, axis1: Int, axis2: Int) raises -> Self:
@@ -529,7 +530,7 @@ struct Item(
 
     fn _compute_slice_params(
         self, slice_index: Slice
-    ) raises -> (Int, Int, Int):
+    ) raises -> Tuple[Int, Int, Int]:
         """
         Compute normalized slice parameters (start, step, length).
 
@@ -725,7 +726,7 @@ struct _ItemIter[
         else:
             return self.index > 0
 
-    fn __next__(mut self) raises -> Scalar[DType.index]:
+    fn __next__(mut self) raises -> Scalar[DType.int]:
         @parameter
         if forward:
             var current_index = self.index
diff --git a/numojo/core/matrix.mojo b/numojo/core/matrix.mojo
index 7c8594dd..72bbcdc1 100644
--- a/numojo/core/matrix.mojo
+++ b/numojo/core/matrix.mojo
@@ -12,10 +12,14 @@ from memory import UnsafePointer, memcpy, memset_zero
 from random import random_float64
 from sys import simd_width_of
 from python import PythonObject, Python
+from math import ceil
 
 from numojo.core.flags import Flags
 from numojo.core.ndarray import NDArray
 from numojo.core.data_container import DataContainer
+from numojo.core.traits.buffered import Buffered
+from numojo.core.own_data import OwnData
+from numojo.core.ref_data import RefData
 from numojo.core.utility import _get_offset
 from numojo.routines.manipulation import broadcast_to, reorder_layout
 from numojo.routines.linalg.misc import issymmetric
@@ -26,10 +30,9 @@ from numojo.routines.linalg.misc import issymmetric
 # ===----------------------------------------------------------------------===#
 
 
-struct Matrix[dtype: DType = DType.float64](
+struct Matrix[dtype: DType = DType.float64, BufType: Buffered = OwnData](
     Copyable, Movable, Sized, Stringable, Writable
 ):
-    # TODO: Add buffer_type in the parameters.
     """
     `Matrix` is a special case of `NDArray` (2DArray) but has some targeted
     optimization since the number of dimensions is known at the compile time.
@@ -46,6 +49,7 @@ struct Matrix[dtype: DType = DType.float64](
 
     Parameters:
         dtype: Type of item in NDArray. Default type is DType.float64.
+        BufType: This is only for internal use! The buffer type of the Matrix, denotes whether the instance owns the data or is a view. Default is `OwnData`. Manipulating it can lead to undefined behaviors.
 
     The matrix can be uniquely defined by the following features:
         1. The data buffer of all items.
@@ -92,7 +96,10 @@ struct Matrix[dtype: DType = DType.float64](
     """Vector size of the data type."""
 
     var _buf: DataContainer[dtype]
-    """Data buffer of the items in the NDArray."""
+    """Data buffer of the items in the Matrix."""
+
+    var buf_type: BufType
+    """View information of the Matrix."""
 
     var shape: Tuple[Int, Int]
     """Shape of Matrix."""
@@ -117,13 +124,17 @@ struct Matrix[dtype: DType = DType.float64](
         order: String = "C",
     ):
         """
-        Create a new matrix of the given shape,without initializing data.
+        Create a new matrix of the given shape, without initializing data.
 
         Args:
             shape: Tuple representing (rows, columns).
             order: Use "C" for row-major (C-style) layout or "F" for column-major
                (Fortran-style) layout. Defaults to "C".
         """
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to create matrix that owns data.",
+        ]()
 
         self.shape = (shape[0], shape[1])
         if order == "C":
@@ -132,6 +143,7 @@ struct Matrix[dtype: DType = DType.float64](
             self.strides = (1, shape[0])
         self.size = shape[0] * shape[1]
         self._buf = DataContainer[dtype](size=self.size)
+        self.buf_type = BufType()
         self.flags = Flags(
             self.shape, self.strides, owndata=True, writeable=True
         )
@@ -145,7 +157,10 @@ struct Matrix[dtype: DType = DType.float64](
         """
         Construct a matrix from matrix.
         """
-
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to create matrix that owns data.",
+        ]()
         self = data^
 
     @always_inline("nodebug")
@@ -156,7 +171,10 @@ struct Matrix[dtype: DType = DType.float64](
         """
         Construct a matrix from array.
         """
-
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to create matrix that owns data.",
+        ]()
         if data.ndim == 1:
             self.shape = (1, data.shape[0])
             self.strides = (data.shape[0], 1)
@@ -169,7 +187,7 @@ struct Matrix[dtype: DType = DType.float64](
             raise Error(String("Shape too large to be a matrix."))
 
         self._buf = DataContainer[dtype](self.size)
-
+        self.buf_type = BufType()
         self.flags = Flags(
             self.shape, self.strides, owndata=True, writeable=True
         )
@@ -177,15 +195,47 @@ struct Matrix[dtype: DType = DType.float64](
         if data.flags["C_CONTIGUOUS"]:
             for i in range(data.shape[0]):
                 memcpy(
-                    self._buf.ptr.offset(i * self.shape[0]),
-                    data._buf.ptr.offset(i * data.shape[0]),
-                    self.shape[0],
+                    dest=self._buf.ptr.offset(i * self.shape[0]),
+                    src=data._buf.ptr.offset(i * data.shape[0]),
+                    count=self.shape[0],
                 )
         else:
             for i in range(data.shape[0]):
                 for j in range(data.shape[1]):
                     self._store(i, j, data._getitem(i, j))
 
+    # to construct views
+    @always_inline("nodebug")
+    fn __init__(
+        out self,
+        shape: Tuple[Int, Int],
+        strides: Tuple[Int, Int],
+        offset: Int,
+        ptr: UnsafePointer[Scalar[dtype]],
+    ):
+        """
+        Initialize Matrix that does not own the data.
+        The data is owned by another Matrix.
+
+        Args:
+            shape: Shape of the view.
+            strides: Strides of the view.
+            offset: Offset in pointer of the data buffer.
+            ptr: Pointer to the data buffer of the original array.
+        """
+        constrained[
+            BufType.is_ref_data(),
+            "Buffer type must be RefData to create matrix view.",
+        ]()
+        self.shape = shape
+        self.strides = strides
+        self.size = shape[0] * shape[1]
+        self._buf = DataContainer(ptr=ptr.offset(offset))
+        self.buf_type = BufType()
+        self.flags = Flags(
+            self.shape, self.strides, owndata=False, writeable=False
+        )
+
     @always_inline("nodebug")
     fn __copyinit__(out self, other: Self):
         """
@@ -195,8 +245,11 @@ struct Matrix[dtype: DType = DType.float64](
         self.strides = (other.strides[0], other.strides[1])
         self.size = other.size
         self._buf = DataContainer[dtype](other.size)
-        memcpy(self._buf.ptr, other._buf.ptr, other.size)
-        self.flags = other.flags
+        memcpy(dest=self._buf.ptr, src=other._buf.ptr, count=other.size)
+        self.buf_type = BufType()
+        self.flags = Flags(
+            other.shape, other.strides, owndata=True, writeable=True
+        )
 
     @always_inline("nodebug")
     fn __moveinit__(out self, deinit other: Self):
@@ -207,24 +260,46 @@ struct Matrix[dtype: DType = DType.float64](
         self.strides = other.strides^
         self.size = other.size
         self._buf = other._buf^
+        self.buf_type = other.buf_type^
         self.flags = other.flags^
 
     @always_inline("nodebug")
     fn __del__(deinit self):
-        var owndata: Bool
-        try:
-            owndata = self.flags["OWNDATA"]
-        except:
-            owndata = True
-            print("Invalid `OWNDATA` flag. Treat as `True`.")
-        if owndata:
+        var owndata: Bool = self.flags.OWNDATA
+        # Free the buffer only if it owns the data, but its redudant rn. move buf type checks into compile time and remove redundant check here.
+        if owndata and self.buf_type.is_own_data():
             self._buf.ptr.free()
 
+    fn create_copy(self) raises -> Matrix[dtype, OwnData]:
+        """
+        Create a copy of the matrix with OwnData buffer type.
+        """
+        var result = Matrix[dtype, OwnData](
+            shape=self.shape, order=self.order()
+        )
+        if self.flags.C_CONTIGUOUS:
+            memcpy(dest=result._buf.ptr, src=self._buf.ptr, count=self.size)
+        else:
+            for i in range(self.shape[0]):
+                for j in range(self.shape[1]):
+                    result[i, j] = self[i, j]
+
+        return result^
+
     # ===-------------------------------------------------------------------===#
     # Slicing and indexing methods
     # ===-------------------------------------------------------------------===#
 
-    fn __getitem__(self, var x: Int, var y: Int) raises -> Scalar[dtype]:
+    fn normalize(self, idx: Int, dim: Int) -> Int:
+        """
+        Normalize negative indices.
+        """
+        var idx_norm = idx
+        if idx_norm < 0:
+            idx_norm = dim + idx_norm
+        return idx_norm
+
+    fn __getitem__(self, x: Int, y: Int) raises -> Scalar[dtype]:
         """
         Return the scalar at the index.
 
@@ -235,52 +310,125 @@ struct Matrix[dtype: DType = DType.float64](
         Returns:
             A scalar matching the dtype of the array.
         """
-
-        if x < 0:
-            x = self.shape[0] + x
-
-        if y < 0:
-            y = self.shape[1] + y
-
-        if (x >= self.shape[0]) or (y >= self.shape[1]):
+        if (
+            x >= self.shape[0]
+            or x < -self.shape[0]
+            or y >= self.shape[1]
+            or y < -self.shape[1]
+        ):
             raise Error(
                 String(
                     "Index ({}, {}) exceed the matrix shape ({}, {})"
                 ).format(x, y, self.shape[0], self.shape[1])
             )
+        var x_norm = self.normalize(x, self.shape[0])
+        var y_norm = self.normalize(y, self.shape[1])
+        return self._buf.ptr.load(
+            x_norm * self.strides[0] + y_norm * self.strides[1]
+        )
 
-        return self._buf.ptr.load(x * self.strides[0] + y * self.strides[1])
-
-    fn __getitem__(self, var x: Int) raises -> Self:
+    # TODO: temporarily renaming all view returning functions to be `get` or `set` due to a Mojo bug with overloading `__getitem__` and `__setitem__` with different argument types. Created an issue in Mojo GitHub
+    fn get(
+        ref self, x: Int
+    ) raises -> Matrix[dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]]:
         """
         Return the corresponding row at the index.
 
         Args:
             x: The row number.
         """
-
-        if x < 0:
-            x = self.shape[0] + x
-
-        if x >= self.shape[0]:
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to get a reference row.",
+        ]()
+        if x >= self.shape[0] or x < -self.shape[0]:
             raise Error(
                 String("Index {} exceed the row number {}").format(
                     x, self.shape[0]
                 )
             )
 
-        var res = Self(shape=(1, self.shape[1]), order=self.order())
+        var x_norm = self.normalize(x, self.shape[0])
+        var res = Matrix[
+            dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]
+        ](
+            shape=(1, self.shape[1]),
+            strides=(self.strides[0], self.strides[1]),
+            offset=x_norm * self.strides[0],
+            ptr=self._buf.get_ptr()
+            .mut_cast[target_mut=False]()
+            .unsafe_origin_cast[
+                target_origin = ImmutOrigin.cast_from[origin_of(self)]
+            ](),
+        )
+        return res^
+
+    # for creating a copy of the row.
+    fn __getitem__(self, var x: Int) raises -> Matrix[dtype, OwnData]:
+        """
+        Return the corresponding row at the index.
+
+        Args:
+            x: The row number.
+
+        Returns:
+            A new Matrix (row vector) copied from the original matrix.
 
+        Notes:
+            This function is for internal use only. Users should use `create_copy` to create a copy of the whole matrix instead.
+        """
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String("Index {} exceed the row size {}").format(
+                    x, self.shape[0]
+                )
+            )
+        var x_norm = self.normalize(x, self.shape[0])
+        var result = Matrix[dtype, OwnData](
+            shape=(1, self.shape[1]), order=self.order()
+        )
         if self.flags.C_CONTIGUOUS:
-            var ptr = self._buf.ptr.offset(x * self.strides[0])
-            memcpy(res._buf.ptr, ptr, self.shape[1])
+            var ptr = self._buf.ptr.offset(x_norm * self.strides[0])
+            memcpy(dest=result._buf.ptr, src=ptr, count=self.shape[1])
         else:
             for j in range(self.shape[1]):
-                res[0, j] = self[x, j]
+                result[0, j] = self[x_norm, j]
+
+        return result^
+
+    fn get(
+        ref self, x: Slice, y: Slice
+    ) -> Matrix[dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]]:
+        """
+        Get item from two slices.
+        """
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to get a reference row.",
+        ]()
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        start_y, end_y, step_y = y.indices(self.shape[1])
+
+        var res = Matrix[
+            dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]
+        ](
+            shape=(
+                Int(ceil((end_x - start_x) / step_x)),
+                Int(ceil((end_y - start_y) / step_y)),
+            ),
+            strides=(step_x * self.strides[0], step_y * self.strides[1]),
+            offset=start_x * self.strides[0] + start_y * self.strides[1],
+            ptr=self._buf.get_ptr()
+            .mut_cast[target_mut=False]()
+            .unsafe_origin_cast[
+                target_origin = ImmutOrigin.cast_from[origin_of(self)]
+            ](),
+        )
 
         return res^
 
-    fn __getitem__(self, x: Slice, y: Slice) -> Self:
+    # for creating a copy of the slice.
+    fn __getitem__(self, x: Slice, y: Slice) -> Matrix[dtype]:
         """
         Get item from two slices.
         """
@@ -295,12 +443,9 @@ struct Matrix[dtype: DType = DType.float64](
         var range_x = range(start_x, end_x, step_x)
         var range_y = range(start_y, end_y, step_y)
 
-        # The new matrix with the corresponding shape
         var B = Matrix[dtype](
             shape=(len(range_x), len(range_y)), order=self.order()
         )
-
-        # Fill in the values at the corresponding index
         var row = 0
         for i in range_x:
             var col = 0
@@ -311,7 +456,49 @@ struct Matrix[dtype: DType = DType.float64](
 
         return B^
 
-    fn __getitem__(self, x: Slice, var y: Int) -> Self:
+    fn get(
+        ref self, x: Slice, var y: Int
+    ) raises -> Matrix[dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]]:
+        """
+        Get item from one slice and one int.
+        """
+        # we could remove this constraint if we wanna allow users to create views from views. But that may complicate the origin tracking?
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to get a reference slice.",
+        ]()
+        if y >= self.shape[1] or y < -self.shape[1]:
+            raise Error(
+                String("Index {} exceed the column number {}").format(
+                    y, self.shape[1]
+                )
+            )
+        y = self.normalize(y, self.shape[1])
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+
+        var res = Matrix[
+            dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]
+        ](
+            shape=(
+                Int(ceil((end_x - start_x) / step_x)),
+                1,
+            ),
+            strides=(step_x * self.strides[0], self.strides[1]),
+            offset=start_x * self.strides[0] + y * self.strides[1],
+            ptr=self._buf.get_ptr()
+            .mut_cast[target_mut=False]()
+            .unsafe_origin_cast[
+                target_origin = ImmutOrigin.cast_from[origin_of(self)]
+            ](),
+        )
+
+        return res^
+
+    # for creating a copy of the slice.
+    fn __getitem__(self, x: Slice, var y: Int) -> Matrix[dtype, OwnData]:
         """
         Get item from one slice and one int.
         """
@@ -323,35 +510,79 @@ struct Matrix[dtype: DType = DType.float64](
         var step_x: Int
         start_x, end_x, step_x = x.indices(self.shape[0])
         var range_x = range(start_x, end_x, step_x)
-
-        # The new matrix with the corresponding shape
-        var B = Matrix[dtype](shape=(len(range_x), 1), order=self.order())
-
-        # Fill in the values at the corresponding index
+        var res = Matrix[dtype, OwnData](
+            shape=(
+                len(range_x),
+                1,
+            ),
+            order=self.order(),
+        )
         var row = 0
         for i in range_x:
-            B._store(row, 0, self._load(i, y))
+            res._store(row, 0, self._load(i, y))
             row += 1
+        return res^
 
-        return B^
-
-    fn __getitem__(self, var x: Int, y: Slice) -> Self:
+    fn get(
+        ref self, var x: Int, y: Slice
+    ) raises -> Matrix[dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]]:
         """
         Get item from one int and one slice.
         """
-        if x < 0:
-            x = self.shape[0] + x
+        constrained[
+            BufType.is_own_data(),
+            "Buffer type must be OwnData to get a reference slice.",
+        ]()
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String("Index {} exceed the row size {}").format(
+                    x, self.shape[0]
+                )
+            )
+        x = self.normalize(x, self.shape[0])
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_y = range(start_y, end_y, step_y)
 
+        var res = Matrix[
+            dtype, RefData[ImmutOrigin.cast_from[origin_of(self)]]
+        ](
+            shape=(
+                1,
+                Int(ceil((end_y - start_y) / step_y)),
+            ),
+            strides=(self.strides[0], step_y * self.strides[1]),
+            offset=x * self.strides[0] + start_y * self.strides[1],
+            ptr=self._buf.get_ptr()
+            .mut_cast[target_mut=False]()
+            .unsafe_origin_cast[
+                target_origin = ImmutOrigin.cast_from[origin_of(self)]
+            ](),
+        )
+
+        return res^
+
+    # for creating a copy of the slice.
+    fn __getitem__(self, var x: Int, y: Slice) raises -> Matrix[dtype]:
+        """
+        Get item from one int and one slice.
+        """
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String("Index {} exceed the row size {}").format(
+                    x, self.shape[0]
+                )
+            )
+        x = self.normalize(x, self.shape[0])
         var start_y: Int
         var end_y: Int
         var step_y: Int
         start_y, end_y, step_y = y.indices(self.shape[1])
         var range_y = range(start_y, end_y, step_y)
 
-        # The new matrix with the corresponding shape
         var B = Matrix[dtype](shape=(1, len(range_y)), order=self.order())
-
-        # Fill in the values at the corresponding index
         var col = 0
         for j in range_y:
             B._store(0, col, self._load(x, j))
@@ -359,18 +590,39 @@ struct Matrix[dtype: DType = DType.float64](
 
         return B^
 
-    fn __getitem__(self, indices: List[Int]) raises -> Self:
+    fn __getitem__(self, indices: List[Int]) raises -> Matrix[dtype, OwnData]:
         """
         Get item by a list of integers.
         """
-
         var ncol = self.shape[1]
         var nrow = len(indices)
         var res = Matrix.zeros[dtype](shape=(nrow, ncol))
         for i in range(nrow):
-            res[i] = self[indices[i]]
+            res.__setitem__(i, self[indices[i]])
         return res^
 
+    fn load[width: Int = 1](self, idx: Int) raises -> SIMD[dtype, width]:
+        """
+        Returns a SIMD element with width `width` at the given index.
+
+        Parameters:
+            width: The width of the SIMD element.
+
+        Args:
+            idx: The linear index.
+
+        Returns:
+            A SIMD element with width `width`.
+        """
+        if idx >= self.size or idx < -self.size:
+            raise Error(
+                String("Index {} exceed the matrix size {}").format(
+                    idx, self.size
+                )
+            )
+        var idx_norm = self.normalize(idx, self.size)
+        return self._buf.ptr.load[width=width](idx_norm)
+
     fn _load[width: Int = 1](self, x: Int, y: Int) -> SIMD[dtype, width]:
         """
         `__getitem__` with width.
@@ -380,6 +632,13 @@ struct Matrix[dtype: DType = DType.float64](
             x * self.strides[0] + y * self.strides[1]
         )
 
+    fn _load[width: Int = 1](self, idx: Int) -> SIMD[dtype, width]:
+        """
+        `__getitem__` with width.
+        Unsafe: No boundary check!
+        """
+        return self._buf.ptr.load[width=width](idx)
+
     fn __setitem__(self, x: Int, y: Int, value: Scalar[dtype]) raises:
         """
         Return the scalar at the index.
@@ -389,29 +648,84 @@ struct Matrix[dtype: DType = DType.float64](
             y: The column number.
             value: The value to be set.
         """
-
-        if (x >= self.shape[0]) or (y >= self.shape[1]):
+        if (
+            x >= self.shape[0]
+            or x < -self.shape[0]
+            or y >= self.shape[1]
+            or y < -self.shape[1]
+        ):
             raise Error(
                 String(
                     "Index ({}, {}) exceed the matrix shape ({}, {})"
                 ).format(x, y, self.shape[0], self.shape[1])
             )
+        var x_norm = self.normalize(x, self.shape[0])
+        var y_norm = self.normalize(y, self.shape[1])
 
-        self._buf.ptr.store(x * self.strides[0] + y * self.strides[1], value)
+        self._buf.ptr.store(
+            x_norm * self.strides[0] + y_norm * self.strides[1], value
+        )
 
-    fn __setitem__(self, var x: Int, value: Self) raises:
+    fn __setitem__(self, var x: Int, value: Matrix[dtype, **_]) raises:
         """
         Set the corresponding row at the index with the given matrix.
 
         Args:
             x: The row number.
-            value: Matrix (row vector).
+            value: Matrix (row vector). Can be either C or F order.
         """
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String(
+                    "Error: Elements of `index` ({}) \n"
+                    "exceed the matrix shape ({})."
+                ).format(x, self.shape[0])
+            )
 
-        if x < 0:
-            x = self.shape[0] + x
+        if value.shape[0] != 1:
+            raise Error(
+                String(
+                    "Error: The value should have only 1 row, "
+                    "but it has {} rows."
+                ).format(value.shape[0])
+            )
 
-        if x >= self.shape[0]:
+        if self.shape[1] != value.shape[1]:
+            raise Error(
+                String(
+                    "Error: Matrix has {} columns, "
+                    "but the value has {} columns."
+                ).format(self.shape[1], value.shape[1])
+            )
+
+        if self.flags.C_CONTIGUOUS:
+            if value.flags.C_CONTIGUOUS:
+                var dest_ptr = self._buf.ptr.offset(x * self.strides[0])
+                memcpy(dest=dest_ptr, src=value._buf.ptr, count=self.shape[1])
+            else:
+                for j in range(self.shape[1]):
+                    self._store(x, j, value._load(0, j))
+
+        # For F-contiguous
+        else:
+            if value.flags.F_CONTIGUOUS:
+                for j in range(self.shape[1]):
+                    self._buf.ptr.offset(x + j * self.strides[1]).store(
+                        value._buf.ptr.load(j * value.strides[1])
+                    )
+            else:
+                for j in range(self.shape[1]):
+                    self._store(x, j, value._load(0, j))
+
+    fn set(self, var x: Int, value: Matrix[dtype, **_]) raises:
+        """
+        Set the corresponding row at the index with the given matrix.
+
+        Args:
+            x: The row number.
+            value: Matrix (row vector). Can be either C or F order.
+        """
+        if x >= self.shape[0] or x < -self.shape[0]:
             raise Error(
                 String(
                     "Error: Elements of `index` ({}) \n"
@@ -422,7 +736,7 @@ struct Matrix[dtype: DType = DType.float64](
         if value.shape[0] != 1:
             raise Error(
                 String(
-                    "Error: The value should has only 1 row, "
+                    "Error: The value should have only 1 row, "
                     "but it has {} rows."
                 ).format(value.shape[0])
             )
@@ -435,23 +749,234 @@ struct Matrix[dtype: DType = DType.float64](
                 ).format(self.shape[1], value.shape[1])
             )
 
-        var ptr = self._buf.ptr.offset(x * self.shape[1])
-        memcpy(ptr, value._buf.ptr, value.size)
+        if self.flags.C_CONTIGUOUS:
+            if value.flags.C_CONTIGUOUS:
+                var dest_ptr = self._buf.ptr.offset(x * self.strides[0])
+                memcpy(dest=dest_ptr, src=value._buf.ptr, count=self.shape[1])
+            else:
+                for j in range(self.shape[1]):
+                    self._store(x, j, value._load(0, j))
+
+        # For F-contiguous
+        else:
+            if value.flags.F_CONTIGUOUS:
+                for j in range(self.shape[1]):
+                    self._buf.ptr.offset(x + j * self.strides[1]).store(
+                        value._buf.ptr.load(j * value.strides[1])
+                    )
+            else:
+                for j in range(self.shape[1]):
+                    self._store(x, j, value._load(0, j))
 
-    fn _store[
-        width: Int = 1
-    ](mut self, x: Int, y: Int, simd: SIMD[dtype, width]):
+    fn __setitem__(self, x: Slice, y: Int, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from one slice and one int.
+        """
+        if y >= self.shape[1] or y < -self.shape[1]:
+            raise Error(
+                String("Index {} exceed the column number {}").format(
+                    y, self.shape[1]
+                )
+            )
+        var y_norm = self.normalize(y, self.shape[1])
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        var range_x = range(start_x, end_x, step_x)
+        var len_range_x: Int = len(range_x)
+
+        if len_range_x != value.shape[0] or value.shape[1] != 1:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(len_range_x, 1, value.shape[0], value.shape[1])
+            )
+
+        var row = 0
+        for i in range_x:
+            self._store(i, y_norm, value._load(row, 0))
+            row += 1
+
+    fn set(self, x: Slice, y: Int, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from one slice and one int.
+        """
+        if y >= self.shape[1] or y < -self.shape[1]:
+            raise Error(
+                String("Index {} exceed the column number {}").format(
+                    y, self.shape[1]
+                )
+            )
+        var y_norm = self.normalize(y, self.shape[1])
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        var range_x = range(start_x, end_x, step_x)
+        var len_range_x: Int = len(range_x)
+
+        if len_range_x != value.shape[0] or value.shape[1] != 1:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(len_range_x, 1, value.shape[0], value.shape[1])
+            )
+
+        var row = 0
+        for i in range_x:
+            self._store(i, y_norm, value._load(row, 0))
+            row += 1
+
+    fn __setitem__(self, x: Int, y: Slice, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from one int and one slice.
+        """
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String("Index {} exceed the row size {}").format(
+                    x, self.shape[0]
+                )
+            )
+        var x_norm = self.normalize(x, self.shape[0])
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_y = range(start_y, end_y, step_y)
+        var len_range_y: Int = len(range_y)
+
+        if len_range_y != value.shape[1] or value.shape[0] != 1:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(1, len_range_y, value.shape[0], value.shape[1])
+            )
+
+        var col = 0
+        for j in range_y:
+            self._store(x_norm, j, value._load(0, col))
+            col += 1
+
+    fn set(self, x: Int, y: Slice, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from one int and one slice.
+        """
+        if x >= self.shape[0] or x < -self.shape[0]:
+            raise Error(
+                String("Index {} exceed the row size {}").format(
+                    x, self.shape[0]
+                )
+            )
+        var x_norm = self.normalize(x, self.shape[0])
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_y = range(start_y, end_y, step_y)
+        var len_range_y: Int = len(range_y)
+
+        if len_range_y != value.shape[1] or value.shape[0] != 1:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(1, len_range_y, value.shape[0], value.shape[1])
+            )
+
+        var col = 0
+        for j in range_y:
+            self._store(x_norm, j, value._load(0, col))
+            col += 1
+
+    fn __setitem__(self, x: Slice, y: Slice, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from two slices.
+        """
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_x = range(start_x, end_x, step_x)
+        var range_y = range(start_y, end_y, step_y)
+
+        if len(range_x) != value.shape[0] or len(range_y) != value.shape[1]:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(
+                    len(range_x), len(range_y), value.shape[0], value.shape[1]
+                )
+            )
+
+        var row = 0
+        for i in range_x:
+            var col = 0
+            for j in range_y:
+                self._store(i, j, value._load(row, col))
+                col += 1
+            row += 1
+
+    fn set(self, x: Slice, y: Slice, value: Matrix[dtype, **_]) raises:
+        """
+        Set item from two slices.
+        """
+        var start_x: Int
+        var end_x: Int
+        var step_x: Int
+        var start_y: Int
+        var end_y: Int
+        var step_y: Int
+        start_x, end_x, step_x = x.indices(self.shape[0])
+        start_y, end_y, step_y = y.indices(self.shape[1])
+        var range_x = range(start_x, end_x, step_x)
+        var range_y = range(start_y, end_y, step_y)
+
+        if len(range_x) != value.shape[0] or len(range_y) != value.shape[1]:
+            raise Error(
+                String(
+                    "Shape mismatch when assigning to slice: "
+                    "target shape ({}, {}) vs value shape ({}, {})"
+                ).format(
+                    len(range_x), len(range_y), value.shape[0], value.shape[1]
+                )
+            )
+
+        var row = 0
+        for i in range_x:
+            var col = 0
+            for j in range_y:
+                self._store(i, j, value._load(row, col))
+                col += 1
+            row += 1
+
+    fn _store[width: Int = 1](self, x: Int, y: Int, simd: SIMD[dtype, width]):
         """
         `__setitem__` with width.
         Unsafe: No boundary check!
         """
         self._buf.ptr.store(x * self.strides[0] + y * self.strides[1], simd)
 
+    fn _store_idx[width: Int = 1](self, idx: Int, val: SIMD[dtype, width]):
+        """
+        `__setitem__` with width.
+        Unsafe: No boundary check!
+        """
+        self._buf.ptr.store(idx, val)
+
     # ===-------------------------------------------------------------------===#
     # Other dunders and auxiliary methods
     # ===-------------------------------------------------------------------===#
 
-    fn __iter__(self) raises -> _MatrixIter[__origin_of(self), dtype]:
+    fn __iter__(self) raises -> _MatrixIter[origin_of(self), dtype, BufType]:
         """Iterate over elements of the Matrix, returning copied value.
 
         Example:
@@ -466,7 +991,7 @@ struct Matrix[dtype: DType = DType.float64](
             An iterator of Matrix elements.
         """
 
-        return _MatrixIter[__origin_of(self), dtype](
+        return _MatrixIter[origin_of(self), dtype, BufType](
             matrix=self,
             length=self.shape[0],
         )
@@ -479,7 +1004,7 @@ struct Matrix[dtype: DType = DType.float64](
 
     fn __reversed__(
         self,
-    ) raises -> _MatrixIter[__origin_of(self), dtype, forward=False]:
+    ) raises -> _MatrixIter[origin_of(self), dtype, BufType, forward=False]:
         """Iterate backwards over elements of the Matrix, returning
         copied value.
 
@@ -487,7 +1012,7 @@ struct Matrix[dtype: DType = DType.float64](
             A reversed iterator of Matrix elements.
         """
 
-        return _MatrixIter[__origin_of(self), dtype, forward=False](
+        return _MatrixIter[origin_of(self), dtype, BufType, forward=False](
             matrix=self,
             length=self.shape[0],
         )
@@ -551,10 +1076,8 @@ struct Matrix[dtype: DType = DType.float64](
                 + String(self.strides[0])
                 + ","
                 + String(self.strides[1])
-                + "  C: "
-                + String(self.flags["C_CONTIGUOUS"])
-                + "  F: "
-                + String(self.flags["F_CONTIGUOUS"])
+                + "  order: "
+                + String("C" if self.flags["C_CONTIGUOUS"] else "F")
                 + "  Own: "
                 + String(self.flags["OWNDATA"])
             )
@@ -565,7 +1088,9 @@ struct Matrix[dtype: DType = DType.float64](
     # Arithmetic dunder methods
     # ===-------------------------------------------------------------------===#
 
-    fn __add__(self, other: Self) raises -> Self:
+    fn __add__(
+        read self, read other: Matrix[dtype, *_]
+    ) raises -> Matrix[dtype, OwnData]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -577,13 +1102,13 @@ struct Matrix[dtype: DType = DType.float64](
         ):
             return _arithmetic_func_matrix_matrix_to_matrix[
                 dtype, SIMD.__add__
-            ](broadcast_to(self.copy(), other.shape, self.order()), other)
+            ](broadcast_to[dtype](self, other.shape, self.order()), other)
         else:
             return _arithmetic_func_matrix_matrix_to_matrix[
                 dtype, SIMD.__add__
-            ](self, broadcast_to(other.copy(), self.shape, self.order()))
+            ](self, broadcast_to[dtype](other, self.shape, self.order()))
 
-    fn __add__(self, other: Scalar[dtype]) raises -> Self:
+    fn __add__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """Add matrix to scalar.
 
         ```mojo
@@ -594,7 +1119,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self + broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __radd__(self, other: Scalar[dtype]) raises -> Self:
+    fn __radd__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """
         Right-add.
 
@@ -606,7 +1131,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return broadcast_to[dtype](other, self.shape, self.order()) + self
 
-    fn __sub__(self, other: Self) raises -> Self:
+    fn __sub__(
+        read self, read other: Matrix[dtype, *_]
+    ) raises -> Matrix[dtype, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -624,7 +1151,7 @@ struct Matrix[dtype: DType = DType.float64](
                 dtype, SIMD.__sub__
             ](self, broadcast_to(other.copy(), self.shape, self.order()))
 
-    fn __sub__(self, other: Scalar[dtype]) raises -> Self:
+    fn __sub__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """Subtract matrix by scalar.
 
         ```mojo
@@ -635,7 +1162,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self - broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __rsub__(self, other: Scalar[dtype]) raises -> Self:
+    fn __rsub__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """
         Right-sub.
 
@@ -647,7 +1174,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return broadcast_to[dtype](other, self.shape, self.order()) - self
 
-    fn __mul__(self, other: Self) raises -> Self:
+    fn __mul__(self, other: Matrix[dtype, **_]) raises -> Matrix[dtype, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -665,7 +1192,7 @@ struct Matrix[dtype: DType = DType.float64](
                 dtype, SIMD.__mul__
             ](self, broadcast_to(other.copy(), self.shape, self.order()))
 
-    fn __mul__(self, other: Scalar[dtype]) raises -> Self:
+    fn __mul__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """Mutiply matrix by scalar.
 
         ```mojo
@@ -676,7 +1203,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self * broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __rmul__(self, other: Scalar[dtype]) raises -> Self:
+    fn __rmul__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """
         Right-mul.
 
@@ -688,7 +1215,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return broadcast_to[dtype](other, self.shape, self.order()) * self
 
-    fn __truediv__(self, other: Self) raises -> Self:
+    fn __truediv__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[dtype, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -706,19 +1235,23 @@ struct Matrix[dtype: DType = DType.float64](
                 dtype, SIMD.__truediv__
             ](self, broadcast_to(other.copy(), self.shape, self.order()))
 
-    fn __truediv__(self, other: Scalar[dtype]) raises -> Self:
+    fn __truediv__(self, other: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """Divide matrix by scalar."""
         return self / broadcast_to[dtype](other, self.shape, order=self.order())
 
     # Shouldn't we do the operation inplace?
-    fn __pow__(self, rhs: Scalar[dtype]) raises -> Self:
+    fn __pow__(self, rhs: Scalar[dtype]) raises -> Matrix[dtype, **_]:
         """Power of items."""
-        var result: Self = self.copy()
+        var result: Matrix[dtype, OwnData] = Matrix[dtype, OwnData](
+            shape=self.shape, order=self.order()
+        )
         for i in range(self.size):
             result._buf.ptr[i] = self._buf.ptr[i].__pow__(rhs)
         return result^
 
-    fn __lt__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __lt__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -736,7 +1269,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __lt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __lt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix less than scalar.
 
         ```mojo
@@ -747,7 +1280,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self < broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __le__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __le__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -765,7 +1300,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __le__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __le__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix less than and equal to scalar.
 
         ```mojo
@@ -776,7 +1311,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self <= broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __gt__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __gt__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -794,7 +1331,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __gt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __gt__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix greater than scalar.
 
         ```mojo
@@ -805,7 +1342,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self > broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __ge__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __ge__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -823,7 +1362,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __ge__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __ge__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix greater than and equal to scalar.
 
         ```mojo
@@ -834,7 +1373,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self >= broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __eq__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __eq__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -852,7 +1393,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __eq__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __eq__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix less than and equal to scalar.
 
         ```mojo
@@ -863,7 +1404,9 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self == broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __ne__(self, other: Self) raises -> Matrix[DType.bool]:
+    fn __ne__(
+        self, other: Matrix[dtype, **_]
+    ) raises -> Matrix[DType.bool, **_]:
         if (self.shape[0] == other.shape[0]) and (
             self.shape[1] == other.shape[1]
         ):
@@ -881,7 +1424,7 @@ struct Matrix[dtype: DType = DType.float64](
                 self, broadcast_to(other.copy(), self.shape, self.order())
             )
 
-    fn __ne__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool]:
+    fn __ne__(self, other: Scalar[dtype]) raises -> Matrix[DType.bool, **_]:
         """Matrix less than and equal to scalar.
 
         ```mojo
@@ -892,7 +1435,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return self != broadcast_to[dtype](other, self.shape, self.order())
 
-    fn __matmul__(self, other: Self) raises -> Self:
+    fn __matmul__(self, other: Matrix[dtype, **_]) raises -> Matrix[dtype, **_]:
         return numojo.linalg.matmul(self, other)
 
     # ===-------------------------------------------------------------------===#
@@ -905,11 +1448,11 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.logic.all(self)
 
-    fn all(self, axis: Int) raises -> Self:
+    fn all(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Test whether all array elements evaluate to True along axis.
         """
-        return numojo.logic.all(self, axis=axis)
+        return numojo.logic.all[dtype](self, axis=axis)
 
     fn any(self) -> Scalar[dtype]:
         """
@@ -917,47 +1460,47 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.logic.any(self)
 
-    fn any(self, axis: Int) raises -> Self:
+    fn any(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Test whether any array elements evaluate to True along axis.
         """
         return numojo.logic.any(self, axis=axis)
 
-    fn argmax(self) raises -> Scalar[DType.index]:
+    fn argmax(self) raises -> Scalar[DType.int]:
         """
         Index of the max. It is first flattened before sorting.
         """
         return numojo.math.argmax(self)
 
-    fn argmax(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argmax(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Index of the max along the given axis.
         """
         return numojo.math.argmax(self, axis=axis)
 
-    fn argmin(self) raises -> Scalar[DType.index]:
+    fn argmin(self) raises -> Scalar[DType.int]:
         """
         Index of the min. It is first flattened before sorting.
         """
         return numojo.math.argmin(self)
 
-    fn argmin(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argmin(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Index of the min along the given axis.
         """
         return numojo.math.argmin(self, axis=axis)
 
-    fn argsort(self) raises -> Matrix[DType.index]:
+    fn argsort(self) raises -> Matrix[DType.int]:
         """
         Argsort the Matrix. It is first flattened before sorting.
         """
         return numojo.math.argsort(self)
 
-    fn argsort(self, axis: Int) raises -> Matrix[DType.index]:
+    fn argsort(self, axis: Int) raises -> Matrix[DType.int]:
         """
         Argsort the Matrix along the given axis.
         """
-        return numojo.math.argsort(self.copy(), axis=axis)
+        return numojo.math.argsort(self, axis=axis)
 
     fn astype[asdtype: DType](self) -> Matrix[asdtype]:
         """
@@ -970,7 +1513,7 @@ struct Matrix[dtype: DType = DType.float64](
             res._buf.ptr[i] = self._buf.ptr[i].cast[asdtype]()
         return res^
 
-    fn cumprod(self) raises -> Matrix[dtype]:
+    fn cumprod(self) raises -> Matrix[dtype, OwnData]:
         """
         Cumprod of flattened matrix.
 
@@ -983,7 +1526,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.cumprod(self.copy())
 
-    fn cumprod(self, axis: Int) raises -> Matrix[dtype]:
+    fn cumprod(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Cumprod of Matrix along the axis.
 
@@ -1000,10 +1543,10 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.cumprod(self.copy(), axis=axis)
 
-    fn cumsum(self) raises -> Matrix[dtype]:
+    fn cumsum(self) raises -> Matrix[dtype, OwnData]:
         return numojo.math.cumsum(self.copy())
 
-    fn cumsum(self, axis: Int) raises -> Matrix[dtype]:
+    fn cumsum(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         return numojo.math.cumsum(self.copy(), axis=axis)
 
     fn fill(self, fill_value: Scalar[dtype]):
@@ -1015,15 +1558,18 @@ struct Matrix[dtype: DType = DType.float64](
         for i in range(self.size):
             self._buf.ptr[i] = fill_value
 
-    fn flatten(self) -> Self:
+    # * Make it inplace?
+    fn flatten(self) -> Matrix[dtype, OwnData]:
         """
         Return a flattened copy of the matrix.
         """
-        var res = Self(shape=(1, self.size), order=self.order())
-        memcpy(res._buf.ptr, self._buf.ptr, res.size)
+        var res = Matrix[dtype, OwnData](
+            shape=(1, self.size), order=self.order()
+        )
+        memcpy(dest=res._buf.ptr, src=self._buf.ptr, count=res.size)
         return res^
 
-    fn inv(self) raises -> Self:
+    fn inv(self) raises -> Matrix[dtype, OwnData]:
         """
         Inverse of matrix.
         """
@@ -1044,7 +1590,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.extrema.max(self)
 
-    fn max(self, axis: Int) raises -> Self:
+    fn max(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Find max item along the given axis.
         """
@@ -1075,7 +1621,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.extrema.min(self)
 
-    fn min(self, axis: Int) raises -> Self:
+    fn min(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Find min item along the given axis.
         """
@@ -1087,7 +1633,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.prod(self)
 
-    fn prod(self, axis: Int) raises -> Self:
+    fn prod(self, axis: Int) raises -> Matrix[dtype]:
         """
         Product of items in a Matrix along the axis.
 
@@ -1104,7 +1650,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.prod(self, axis=axis)
 
-    fn reshape(self, shape: Tuple[Int, Int]) raises -> Self:
+    fn reshape(self, shape: Tuple[Int, Int]) raises -> Matrix[dtype]:
         """
         Change shape and size of matrix and return a new matrix.
         """
@@ -1114,13 +1660,13 @@ struct Matrix[dtype: DType = DType.float64](
                     "Cannot reshape matrix of size {} into shape ({}, {})."
                 ).format(self.size, shape[0], shape[1])
             )
-        var res = Self(shape=shape, order="C")
+        var res = Matrix[dtype](shape=shape, order="C")
         if self.flags.F_CONTIGUOUS:
             var temp = self.reorder_layout()
-            memcpy(res._buf.ptr, temp._buf.ptr, res.size)
+            memcpy(dest=res._buf.ptr, src=temp._buf.ptr, count=res.size)
             res = res.reorder_layout()
         else:
-            memcpy(res._buf.ptr, self._buf.ptr, res.size)
+            memcpy(dest=res._buf.ptr, src=self._buf.ptr, count=res.size)
         return res^
 
     fn resize(mut self, shape: Tuple[Int, Int]) raises:
@@ -1128,9 +1674,9 @@ struct Matrix[dtype: DType = DType.float64](
         Change shape and size of matrix in-place.
         """
         if shape[0] * shape[1] > self.size:
-            var other = Self(shape=shape)
+            var other = Matrix[dtype, Self.BufType](shape=shape)
             if self.flags.C_CONTIGUOUS:
-                memcpy(other._buf.ptr, self._buf.ptr, self.size)
+                memcpy(dest=other._buf.ptr, src=self._buf.ptr, count=self.size)
                 for i in range(self.size, other.size):
                     other._buf.ptr[i] = 0
             else:
@@ -1155,8 +1701,8 @@ struct Matrix[dtype: DType = DType.float64](
             else:
                 self.strides[1] = shape[0]
 
-    fn round(self, decimals: Int) raises -> Self:
-        return numojo.math.rounding.round(self.copy(), decimals=decimals)
+    fn round(self, decimals: Int) raises -> Matrix[dtype]:
+        return numojo.math.rounding.round(self, decimals=decimals)
 
     fn std[
         returned_dtype: DType = DType.float64
@@ -1194,7 +1740,7 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return numojo.math.sum(self)
 
-    fn sum(self, axis: Int) raises -> Self:
+    fn sum(self, axis: Int) raises -> Matrix[dtype, OwnData]:
         """
         Sum up the items in a Matrix along the axis.
 
@@ -1223,19 +1769,19 @@ struct Matrix[dtype: DType = DType.float64](
         """
         return issymmetric(self)
 
-    fn transpose(self) -> Self:
+    fn transpose(self) -> Matrix[dtype, OwnData]:
         """
         Transpose of matrix.
         """
         return transpose(self)
 
-    fn reorder_layout(self) raises -> Self:
+    fn reorder_layout(self) raises -> Matrix[dtype, Self.BufType]:
         """
         Reorder_layout matrix.
         """
         return reorder_layout(self)
 
-    fn T(self) -> Self:
+    fn T(self) -> Matrix[dtype, OwnData]:
         return transpose(self)
 
     fn variance[
@@ -1276,7 +1822,7 @@ struct Matrix[dtype: DType = DType.float64](
         var ndarray: NDArray[dtype] = NDArray[dtype](
             shape=List[Int](self.shape[0], self.shape[1]), order="C"
         )
-        memcpy(ndarray._buf.ptr, self._buf.ptr, ndarray.size)
+        memcpy(dest=ndarray._buf.ptr, src=self._buf.ptr, count=ndarray.size)
 
         return ndarray^
 
@@ -1316,7 +1862,7 @@ struct Matrix[dtype: DType = DType.float64](
                 np_dtype = np.uint8
             elif dtype == DType.bool:
                 np_dtype = np.bool_
-            elif dtype == DType.index:
+            elif dtype == DType.int:
                 np_dtype = np.int64
 
             var order = "C" if self.flags.C_CONTIGUOUS else "F"
@@ -1324,7 +1870,7 @@ struct Matrix[dtype: DType = DType.float64](
             var pointer_d = numpyarray.__array_interface__["data"][
                 0
             ].unsafe_get_as_pointer[dtype]()
-            memcpy(pointer_d, self._buf.ptr, self.size)
+            memcpy(dest=pointer_d, src=self._buf.ptr, count=self.size)
 
             return numpyarray^
 
@@ -1338,12 +1884,12 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn full[
-        dtype: DType = DType.float64
+        datatype: DType = DType.float64
     ](
         shape: Tuple[Int, Int],
-        fill_value: Scalar[dtype] = 0,
+        fill_value: Scalar[datatype] = 0,
         order: String = "C",
-    ) -> Matrix[dtype]:
+    ) -> Matrix[datatype, OwnData]:
         """Return a matrix with given shape and filled value.
 
         Example:
@@ -1353,7 +1899,7 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        var matrix = Matrix[dtype](shape, order)
+        var matrix = Matrix[datatype, OwnData](shape, order)
         for i in range(shape[0] * shape[1]):
             matrix._buf.ptr.store(i, fill_value)
 
@@ -1361,8 +1907,8 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn zeros[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype, OwnData]:
         """Return a matrix with given shape and filled with zeros.
 
         Example:
@@ -1372,14 +1918,14 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        var M = Matrix[dtype](shape, order)
-        memset_zero(M._buf.ptr, M.size)
-        return M^
+        var res = Matrix[datatype, OwnData](shape, order)
+        memset_zero(res._buf.ptr, res.size)
+        return res^
 
     @staticmethod
     fn ones[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype, OwnData]:
         """Return a matrix with given shape and filled with ones.
 
         Example:
@@ -1389,12 +1935,12 @@ struct Matrix[dtype: DType = DType.float64](
         ```
         """
 
-        return Matrix.full[dtype](shape=shape, fill_value=1)
+        return Matrix.full[datatype](shape=shape, fill_value=1)
 
     @staticmethod
     fn identity[
-        dtype: DType = DType.float64
-    ](len: Int, order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](len: Int, order: String = "C") -> Matrix[datatype, OwnData]:
         """Return an identity matrix with given size.
 
         Example:
@@ -1403,7 +1949,7 @@ struct Matrix[dtype: DType = DType.float64](
         var A = Matrix.identity(12)
         ```
         """
-        var matrix = Matrix.zeros[dtype]((len, len), order)
+        var matrix = Matrix.zeros[datatype]((len, len), order)
         for i in range(len):
             matrix._buf.ptr.store(
                 i * matrix.strides[0] + i * matrix.strides[1], 1
@@ -1412,8 +1958,8 @@ struct Matrix[dtype: DType = DType.float64](
 
     @staticmethod
     fn rand[
-        dtype: DType = DType.float64
-    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[dtype]:
+        datatype: DType = DType.float64
+    ](shape: Tuple[Int, Int], order: String = "C") -> Matrix[datatype, OwnData]:
         """Return a matrix with random values uniformed distributed between 0 and 1.
 
         Example:
@@ -1422,26 +1968,23 @@ struct Matrix[dtype: DType = DType.float64](
         var A = Matrix.rand((12, 12))
         ```
 
-        Parameters:
-            dtype: The data type of the NDArray elements.
-
         Args:
             shape: The shape of the Matrix.
             order: The order of the Matrix. "C" or "F".
         """
-        var result = Matrix[dtype](shape, order)
+        var result = Matrix[datatype](shape, order)
         for i in range(result.size):
-            result._buf.ptr.store(i, random_float64(0, 1).cast[dtype]())
+            result._buf.ptr.store(i, random_float64(0, 1).cast[datatype]())
         return result^
 
     @staticmethod
     fn fromlist[
-        dtype: DType
+        datatype: DType = DType.float64
     ](
-        object: List[Scalar[dtype]],
+        object: List[Scalar[datatype]],
         shape: Tuple[Int, Int] = (0, 0),
         order: String = "C",
-    ) raises -> Matrix[dtype]:
+    ) raises -> Matrix[datatype, OwnData]:
         """Create a matrix from a 1-dimensional list into given shape.
 
         If no shape is passed, the return matrix will be a row vector.
@@ -1455,8 +1998,8 @@ struct Matrix[dtype: DType = DType.float64](
         """
 
         if (shape[0] == 0) and (shape[1] == 0):
-            var M = Matrix[dtype](shape=(1, len(object)))
-            memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+            var M = Matrix[datatype](shape=(1, len(object)))
+            memcpy(dest=M._buf.ptr, src=object.unsafe_ptr(), count=M.size)
             return M^
 
         if shape[0] * shape[1] != len(object):
@@ -1464,18 +2007,18 @@ struct Matrix[dtype: DType = DType.float64](
                 "The input has {} elements, but the target has the shape {}x{}"
             ).format(len(object), shape[0], shape[1])
             raise Error(message)
-        var M = Matrix[dtype](shape=shape, order="C")
-        memcpy(M._buf.ptr, object.unsafe_ptr(), M.size)
+        var M = Matrix[datatype](shape=shape, order="C")
+        memcpy(dest=M._buf.ptr, src=object.unsafe_ptr(), count=M.size)
         if order == "F":
             M = M.reorder_layout()
         return M^
 
     @staticmethod
     fn fromstring[
-        dtype: DType = DType.float64
+        datatype: DType = DType.float64
     ](
         text: String, shape: Tuple[Int, Int] = (0, 0), order: String = "C"
-    ) raises -> Matrix[dtype]:
+    ) raises -> Matrix[datatype, OwnData]:
         """Matrix initialization from string representation of an matrix.
 
         Comma, right brackets, and whitespace are treated as seperators of numbers.
@@ -1488,7 +2031,7 @@ struct Matrix[dtype: DType = DType.float64](
         from numojo.prelude import *
         from numojo import Matrix
         fn main() raises:
-            var A = Matrix.fromstring[f32](
+            var A = Matrix[f32].fromstring(
             "1 2 .3 4 5 6.5 7 1_323.12 9 10, 11.12, 12 13 14 15 16", (4, 4))
         ```
         ```console
@@ -1496,7 +2039,7 @@ struct Matrix[dtype: DType = DType.float64](
         [5.0   6.5     7.0     1323.1199951171875]
         [9.0   10.0    11.119999885559082      12.0]
         [13.0  14.0    15.0    16.0]]
-        Size: 4x4  DType: float32
+        Size: 4x4  datatype: float32
         ```
 
         Args:
@@ -1505,7 +2048,7 @@ struct Matrix[dtype: DType = DType.float64](
             order: Order of the matrix. "C" or "F".
         """
 
-        var data = List[Scalar[dtype]]()
+        var data = List[Scalar[datatype]]()
         var bytes = text.as_bytes()
         var number_as_str: String = ""
         var size = shape[0] * shape[1]
@@ -1519,7 +2062,7 @@ struct Matrix[dtype: DType = DType.float64](
             ):
                 number_as_str = number_as_str + chr(Int(b))
                 if i == len(bytes) - 1:  # Last byte
-                    var number = atof(number_as_str).cast[dtype]()
+                    var number = atof(number_as_str).cast[datatype]()
                     data.append(number)  # Add the number to the data buffer
                     number_as_str = ""  # Clean the number cache
             if (
@@ -1528,7 +2071,7 @@ struct Matrix[dtype: DType = DType.float64](
                 or (chr(Int(b)) == " ")
             ):
                 if number_as_str != "":
-                    var number = atof(number_as_str).cast[dtype]()
+                    var number = atof(number_as_str).cast[datatype]()
                     data.append(number)  # Add the number to the data buffer
                     number_as_str = ""  # Clean the number cache
 
@@ -1542,7 +2085,7 @@ struct Matrix[dtype: DType = DType.float64](
             ).format(len(data), shape[0], shape[1])
             raise Error(message)
 
-        var result = Matrix[dtype](shape=shape)
+        var result = Matrix[datatype](shape=shape)
         for i in range(len(data)):
             result._buf.ptr[i] = data[i]
         return result^
@@ -1558,6 +2101,7 @@ struct _MatrixIter[
     is_mutable: Bool, //,
     lifetime: Origin[is_mutable],
     dtype: DType,
+    buf_type: Buffered,
     forward: Bool = True,
 ](Copyable, Movable):
     """Iterator for Matrix.
@@ -1566,16 +2110,17 @@ struct _MatrixIter[
         is_mutable: Whether the iterator is mutable.
         lifetime: The lifetime of the underlying Matrix data.
         dtype: The data type of the item.
+        buf_type: The buffer type of the underlying Matrix, OwnData or RefData.
         forward: The iteration direction. `False` is backwards.
     """
 
     var index: Int
-    var matrix: Matrix[dtype]
+    var matrix: Matrix[dtype, buf_type]
     var length: Int
 
     fn __init__(
         out self,
-        matrix: Matrix[dtype],
+        matrix: Matrix[dtype, buf_type],
         length: Int,
     ):
         self.index = 0 if forward else length
@@ -1585,16 +2130,20 @@ struct _MatrixIter[
     fn __iter__(self) -> Self:
         return self.copy()
 
-    fn __next__(mut self) raises -> Matrix[dtype]:
+    fn __next__(
+        mut self,
+    ) raises -> Matrix[
+        dtype, RefData[ImmutOrigin.cast_from[origin_of(self.matrix)]]
+    ]:
         @parameter
         if forward:
             var current_index = self.index
             self.index += 1
-            return self.matrix[current_index]
+            return self.matrix.get(current_index)
         else:
             var current_index = self.index
             self.index -= 1
-            return self.matrix[current_index]
+            return self.matrix.get(current_index)
 
     @always_inline
     fn __has_next__(self) -> Bool:
@@ -1622,7 +2171,9 @@ fn _arithmetic_func_matrix_matrix_to_matrix[
     simd_func: fn[type: DType, simd_width: Int] (
         SIMD[type, simd_width], SIMD[type, simd_width]
     ) -> SIMD[type, simd_width],
-](A: Matrix[dtype], B: Matrix[dtype]) raises -> Matrix[dtype]:
+](read A: Matrix[dtype, **_], read B: Matrix[dtype, **_]) raises -> Matrix[
+    dtype, OwnData
+]:
     """
     Matrix[dtype] & Matrix[dtype] -> Matrix[dtype]
 
@@ -1643,11 +2194,11 @@ fn _arithmetic_func_matrix_matrix_to_matrix[
             )
         )
 
-    var C = Matrix[dtype](shape=A.shape, order=A.order())
+    var res = Matrix[dtype](shape=A.shape, order=A.order())
 
     @parameter
     fn vec_func[simd_width: Int](i: Int):
-        C._buf.ptr.store(
+        res._buf.ptr.store(
             i,
             simd_func(
                 A._buf.ptr.load[width=simd_width](i),
@@ -1656,8 +2207,7 @@ fn _arithmetic_func_matrix_matrix_to_matrix[
         )
 
     vectorize[vec_func, simd_width](A.size)
-
-    return C^
+    return res^
 
 
 fn _arithmetic_func_matrix_to_matrix[
@@ -1689,7 +2239,9 @@ fn _logic_func_matrix_matrix_to_matrix[
     simd_func: fn[type: DType, simd_width: Int] (
         SIMD[type, simd_width], SIMD[type, simd_width]
     ) -> SIMD[DType.bool, simd_width],
-](A: Matrix[dtype], B: Matrix[dtype]) raises -> Matrix[DType.bool]:
+](A: Matrix[dtype, **_], B: Matrix[dtype, **_]) raises -> Matrix[
+    DType.bool, **_
+]:
     """
     Matrix[dtype] & Matrix[dtype] -> Matrix[bool]
     """
diff --git a/numojo/core/ndarray.mojo b/numojo/core/ndarray.mojo
index 696b259a..af700ace 100644
--- a/numojo/core/ndarray.mojo
+++ b/numojo/core/ndarray.mojo
@@ -54,7 +54,8 @@ import builtin.math as builtin_math
 from builtin.type_aliases import Origin
 from collections.optional import Optional
 from math import log10
-from memory import UnsafePointer, memset_zero, memcpy
+from memory import memset_zero, memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 from python import PythonObject
 from sys import simd_width_of
 from utils import Variant
@@ -315,7 +316,7 @@ struct NDArray[dtype: DType = DType.float64](
         self.size = other.size
         self.strides = other.strides
         self._buf = DataContainer[dtype](self.size)
-        memcpy(self._buf.ptr, other._buf.ptr, other.size)
+        memcpy(dest=self._buf.ptr, src=other._buf.ptr, count=other.size)
         self.flags = Flags(
             c_contiguous=other.flags.C_CONTIGUOUS,
             f_contiguous=other.flags.F_CONTIGUOUS,
@@ -371,7 +372,7 @@ struct NDArray[dtype: DType = DType.float64](
     # fn __getitem__(self, *slices: Variant[Slice, Int]) raises -> Self         # Get by mix of slices/ints
     #
     # 4. Advanced Indexing
-    # fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self        # Get by index array
+    # fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self        # Get by index array
     # fn __getitem__(self, indices: List[Int]) raises -> Self                   # Get by list of indices
     # fn __getitem__(self, mask: NDArray[DType.bool]) raises -> Self            # Get by boolean mask
     # fn __getitem__(self, mask: List[Bool]) raises -> Self                     # Get by boolean list
@@ -611,18 +612,22 @@ struct NDArray[dtype: DType = DType.float64](
         # Fast path for C-contiguous arrays
         if self.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(result._buf.ptr, self._buf.ptr + norm * block, block)
+            memcpy(
+                dest=result._buf.ptr,
+                src=self._buf.ptr + norm * block,
+                count=block,
+            )
             return result^
 
         # (F-order or arbitrary stride layout)
         # TODO: Optimize this further (multi-axis unrolling / smarter linear index without div/mod)
-        self._copy_first_axis_slice[dtype](self, norm, result)
+        self._copy_first_axis_slice(self, norm, result)
         return result^
 
     # perhaps move these to a utility module
-    fn _copy_first_axis_slice[
-        dtype: DType
-    ](self, src: NDArray[dtype], norm_idx: Int, mut dst: NDArray[dtype]):
+    fn _copy_first_axis_slice(
+        self, src: NDArray[dtype], norm_idx: Int, mut dst: NDArray[dtype]
+    ):
         """Generic stride-based copier for first-axis slice (works for any layout).
         """
         var out_ndim = dst.ndim
@@ -1177,7 +1182,7 @@ struct NDArray[dtype: DType = DType.float64](
         narr = self.__getitem__(slice_list^)
         return narr^
 
-    fn __getitem__(self, indices: NDArray[DType.index]) raises -> Self:
+    fn __getitem__(self, indices: NDArray[DType.int]) raises -> Self:
         """
         Get items from 0-th dimension of an ndarray of indices.
         If the original array is of shape (i,j,k) and
@@ -1243,14 +1248,14 @@ struct NDArray[dtype: DType = DType.float64](
                             " ({})."
                         ).format(self.shape[0]),
                         location=String(
-                            "NDArray.__getitem__(indices: NDArray[DType.index])"
+                            "NDArray.__getitem__(indices: NDArray[DType.int])"
                         ),
                     )
                 )
             memcpy(
-                result._buf.ptr + i * size_per_item,
-                self._buf.ptr + indices.item(i) * size_per_item,
-                size_per_item,
+                dest=result._buf.ptr + i * size_per_item,
+                src=self._buf.ptr + indices.item(i) * size_per_item,
+                count=size_per_item,
             )
 
         return result^
@@ -1259,7 +1264,7 @@ struct NDArray[dtype: DType = DType.float64](
         # TODO: Use trait IntLike when it is supported by Mojo.
         """
         Get items from 0-th dimension of an array. It is an overload of
-        `__getitem__(self, indices: NDArray[DType.index]) raises -> Self`.
+        `__getitem__(self, indices: NDArray[DType.int]) raises -> Self`.
 
         Args:
             indices: A list of Int.
@@ -1299,7 +1304,7 @@ struct NDArray[dtype: DType = DType.float64](
         ```.
         """
 
-        var indices_array = NDArray[DType.index](shape=Shape(len(indices)))
+        var indices_array = NDArray[DType.int](shape=Shape(len(indices)))
         for i in range(len(indices)):
             (indices_array._buf.ptr + i).init_pointee_copy(indices[i])
 
@@ -1393,9 +1398,9 @@ struct NDArray[dtype: DType = DType.float64](
             for i in range(mask.size):
                 if mask.item(i):
                     memcpy(
-                        result._buf.ptr + offset * size_per_item,
-                        self._buf.ptr + i * size_per_item,
-                        size_per_item,
+                        dest=result._buf.ptr + offset * size_per_item,
+                        src=self._buf.ptr + i * size_per_item,
+                        count=size_per_item,
                     )
                     offset += 1
 
@@ -1806,7 +1811,7 @@ struct NDArray[dtype: DType = DType.float64](
     # fn __setitem__(mut self, *slices: Variant[Slice, Int], val: Self) raises  # Set by mix of slices/ints
 
     # Index-based Setters
-    # fn __setitem__(self, indices: NDArray[DType.index], val: NDArray) raises  # Set by index array
+    # fn __setitem__(self, indices: NDArray[DType.int], val: NDArray) raises  # Set by index array
     # fn __setitem__(mut self, mask: NDArray[DType.bool], val: NDArray[dtype])  # Set by boolean mask array
 
     # Helper Methods
@@ -1943,16 +1948,18 @@ struct NDArray[dtype: DType = DType.float64](
         # Fast path for C-contiguous arrays (single block)
         if self.flags.C_CONTIGUOUS and val.flags.C_CONTIGUOUS:
             var block = self.size // self.shape[0]
-            memcpy(self._buf.ptr + norm * block, val._buf.ptr, block)
+            memcpy(
+                dest=self._buf.ptr + norm * block, src=val._buf.ptr, count=block
+            )
             return
 
         # Generic stride path (F-order or irregular)
-        self._write_first_axis_slice[dtype](self, norm, val)
+        self._write_first_axis_slice(self, norm, val)
 
     # perhaps move these to a utility module
-    fn _write_first_axis_slice[
-        dtype: DType
-    ](self, dst: NDArray[dtype], norm_idx: Int, src: NDArray[dtype]):
+    fn _write_first_axis_slice(
+        self, dst: NDArray[dtype], norm_idx: Int, src: NDArray[dtype]
+    ):
         var out_ndim = src.ndim
         var total = src.size
         if total == 0:
@@ -2317,7 +2324,7 @@ struct NDArray[dtype: DType = DType.float64](
 
     # TODO: fix this setter, add bound checks. Not sure about it's use case.
     fn __setitem__(
-        mut self, index: NDArray[DType.index], val: NDArray[dtype]
+        mut self, index: NDArray[DType.int], val: NDArray[dtype]
     ) raises:
         """
         Returns the items of the array from an array of indices.
@@ -2352,7 +2359,7 @@ struct NDArray[dtype: DType = DType.float64](
                         " each axis separately."
                     ),
                     location=String(
-                        "NDArray.__setitem__(index: NDArray[DType.index], val:"
+                        "NDArray.__setitem__(index: NDArray[DType.int], val:"
                         " NDArray)"
                     ),
                 )
@@ -2370,7 +2377,7 @@ struct NDArray[dtype: DType = DType.float64](
                         " first dimension ({})."
                     ).format(self.shape[0]),
                     location=String(
-                        "NDArray.__setitem__(index: NDArray[DType.index], val:"
+                        "NDArray.__setitem__(index: NDArray[DType.int], val:"
                         " NDArray)"
                     ),
                 )
@@ -2397,7 +2404,7 @@ struct NDArray[dtype: DType = DType.float64](
                             " ({})."
                         ).format(self.shape[0]),
                         location=String(
-                            "NDArray.__setitem__(index: NDArray[DType.index],"
+                            "NDArray.__setitem__(index: NDArray[DType.int],"
                             " val: NDArray)"
                         ),
                     )
@@ -3819,7 +3826,7 @@ struct NDArray[dtype: DType = DType.float64](
 
     fn __iter__(
         self,
-    ) raises -> _NDArrayIter[__origin_of(self), dtype]:
+    ) raises -> _NDArrayIter[origin_of(self), dtype]:
         """
         Iterates over elements of the NDArray and return sub-arrays as view.
 
@@ -3843,14 +3850,14 @@ struct NDArray[dtype: DType = DType.float64](
         ```.
         """
 
-        return _NDArrayIter[__origin_of(self), dtype](
+        return _NDArrayIter[origin_of(self), dtype](
             self,
             dimension=0,
         )
 
     fn __reversed__(
         self,
-    ) raises -> _NDArrayIter[__origin_of(self), dtype, forward=False]:
+    ) raises -> _NDArrayIter[origin_of(self), dtype, forward=False]:
         """
         Iterates backwards over elements of the NDArray, returning
         copied value.
@@ -3859,7 +3866,7 @@ struct NDArray[dtype: DType = DType.float64](
             A reversed iterator of NDArray elements.
         """
 
-        return _NDArrayIter[__origin_of(self), dtype, forward=False](
+        return _NDArrayIter[origin_of(self), dtype, forward=False](
             self,
             dimension=0,
         )
@@ -4177,33 +4184,33 @@ struct NDArray[dtype: DType = DType.float64](
         vectorize[vectorized_any, self.width](self.size)
         return result
 
-    fn argmax(self) raises -> Scalar[DType.index]:
+    fn argmax(self) raises -> Scalar[DType.int]:
         """Returns the indices of the maximum values along an axis.
         When no axis is specified, the array is flattened.
         See `numojo.argmax()` for more details.
         """
         return searching.argmax(self)
 
-    fn argmax(self, axis: Int) raises -> NDArray[DType.index]:
+    fn argmax(self, axis: Int) raises -> NDArray[DType.int]:
         """Returns the indices of the maximum values along an axis.
         See `numojo.argmax()` for more details.
         """
         return searching.argmax(self, axis=axis)
 
-    fn argmin(self) raises -> Scalar[DType.index]:
+    fn argmin(self) raises -> Scalar[DType.int]:
         """Returns the indices of the minimum values along an axis.
         When no axis is specified, the array is flattened.
         See `numojo.argmin()` for more details.
         """
         return searching.argmin(self)
 
-    fn argmin(self, axis: Int) raises -> NDArray[DType.index]:
+    fn argmin(self, axis: Int) raises -> NDArray[DType.int]:
         """Returns the indices of the minimum values along an axis.
         See `numojo.argmin()` for more details.
         """
         return searching.argmin(self, axis=axis)
 
-    fn argsort(mut self) raises -> NDArray[DType.index]:
+    fn argsort(mut self) raises -> NDArray[DType.int]:
         """
         Sort the NDArray and return the sorted indices.
         See `numojo.argsort()` for more details.
@@ -4214,7 +4221,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.sorting.argsort(self)
 
-    fn argsort(mut self, axis: Int) raises -> NDArray[DType.index]:
+    fn argsort(mut self, axis: Int) raises -> NDArray[DType.int]:
         """
         Sort the NDArray and return the sorted indices.
         See `numojo.argsort()` for more details.
@@ -4253,17 +4260,12 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.clip(self, a_min, a_max)
 
-    fn compress[
-        dtype: DType
-    ](self, condition: NDArray[DType.bool], axis: Int) raises -> Self:
+    fn compress(self, condition: NDArray[DType.bool], axis: Int) raises -> Self:
         # TODO: @forFudan try using parallelization for this function
         """
         Return selected slices of an array along given axis.
         If no axis is provided, the array is flattened before use.
 
-        Parameters:
-            dtype: DType.
-
         Args:
             condition: 1-D array of booleans that selects which entries to return.
                 If length of condition is less than the size of the array along the
@@ -4283,17 +4285,12 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.compress(condition=condition, a=self, axis=axis)
 
-    fn compress[
-        dtype: DType
-    ](self, condition: NDArray[DType.bool]) raises -> Self:
+    fn compress(self, condition: NDArray[DType.bool]) raises -> Self:
         """
         Return selected slices of an array along given axis.
         If no axis is provided, the array is flattened before use.
         This is a function ***OVERLOAD***.
 
-        Parameters:
-            dtype: DType.
-
         Args:
             condition: 1-D array of booleans that selects which entries to return.
                 If length of condition is less than the size of the array along the
@@ -4392,23 +4389,20 @@ struct NDArray[dtype: DType = DType.float64](
         """
         return numojo.math.cumsum[dtype](self.copy(), axis=axis)
 
-    fn diagonal[dtype: DType](self, offset: Int = 0) raises -> Self:
+    fn diagonal(self, offset: Int = 0) raises -> Self:
         """
         Returns specific diagonals.
         Currently supports only 2D arrays.
 
-        Raises:
-            Error: If the array is not 2D.
-            Error: If the offset is beyond the shape of the array.
-
-        Parameters:
-            dtype: Data type of the array.
-
         Args:
             offset: Offset of the diagonal from the main diagonal.
 
         Returns:
             The diagonal of the NDArray.
+
+        Raises:
+            Error: If the array is not 2D.
+            Error: If the offset is beyond the shape of the array.
         """
         return numojo.linalg.diagonal(self, offset=offset)
 
@@ -4438,7 +4432,7 @@ struct NDArray[dtype: DType = DType.float64](
     fn iter_along_axis[
         forward: Bool = True
     ](self, axis: Int, order: String = "C") raises -> _NDAxisIter[
-        __origin_of(self), dtype, forward
+        origin_of(self), dtype, forward
     ]:
         """
         Returns an iterator yielding 1-d array slices along the given axis.
@@ -4531,7 +4525,7 @@ struct NDArray[dtype: DType = DType.float64](
                 ).format(axis, -self.ndim, self.ndim)
             )
 
-        return _NDAxisIter[__origin_of(self), dtype, forward](
+        return _NDAxisIter[origin_of(self), dtype, forward](
             self,
             axis=normalized_axis,
             order=order,
@@ -4540,7 +4534,7 @@ struct NDArray[dtype: DType = DType.float64](
     fn iter_over_dimension[
         forward: Bool = True
     ](read self, dimension: Int) raises -> _NDArrayIter[
-        __origin_of(self), dtype, forward
+        origin_of(self), dtype, forward
     ]:
         """
         Returns an iterator yielding `ndim-1` arrays over the given dimension.
@@ -4570,7 +4564,7 @@ struct NDArray[dtype: DType = DType.float64](
                 ).format(dimension, -self.ndim, self.ndim)
             )
 
-        return _NDArrayIter[__origin_of(self), dtype, forward](
+        return _NDArrayIter[origin_of(self), dtype, forward](
             a=self,
             dimension=normalized_dim,
         )
@@ -4728,7 +4722,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return numojo.math.min(self, axis=axis)
 
-    fn nditer(self) raises -> _NDIter[__origin_of(self), dtype]:
+    fn nditer(self) raises -> _NDIter[origin_of(self), dtype]:
         """
         ***Overload*** Return an iterator yielding the array elements according
         to the memory layout of the array.
@@ -4759,7 +4753,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         return self.nditer(order=order)
 
-    fn nditer(self, order: String) raises -> _NDIter[__origin_of(self), dtype]:
+    fn nditer(self, order: String) raises -> _NDIter[origin_of(self), dtype]:
         """
         Return an iterator yielding the array elements according to the order.
 
@@ -4798,7 +4792,7 @@ struct NDArray[dtype: DType = DType.float64](
         else:
             axis = 0
 
-        return _NDIter[__origin_of(self), dtype](a=self, order=order, axis=axis)
+        return _NDIter[origin_of(self), dtype](a=self, order=order, axis=axis)
 
     fn num_elements(self) -> Int:
         """
@@ -4908,7 +4902,7 @@ struct NDArray[dtype: DType = DType.float64](
 
         if shape.size_of_array() > self.size:
             var other = Self(shape=shape, order=order)
-            memcpy(other._buf.ptr, self._buf.ptr, self.size)
+            memcpy(dest=other._buf.ptr, src=self._buf.ptr, count=self.size)
             for i in range(self.size, other.size):
                 (other._buf.ptr + i).init_pointee_copy(0)
             self = other^
@@ -5166,8 +5160,8 @@ struct NDArray[dtype: DType = DType.float64](
         ref self,
     ) -> UnsafePointer[
         Scalar[dtype],
-        mut = Origin(__origin_of(self)).mut,
-        origin = __origin_of(self),
+        mut = Origin(origin_of(self)).mut,
+        origin = origin_of(self),
     ]:
         """
         Retreive pointer without taking ownership.
@@ -5176,9 +5170,9 @@ struct NDArray[dtype: DType = DType.float64](
             Unsafe pointer to the data buffer.
 
         """
-        return self._buf.ptr.origin_cast[
-            Origin(__origin_of(self)).mut, __origin_of(self)
-        ]()
+        return self._buf.ptr.mut_cast[
+            Origin(origin_of(self)).mut
+        ]().unsafe_origin_cast[origin_of(self)]()
 
     fn variance[
         returned_dtype: DType = DType.float64
@@ -5628,9 +5622,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous or F-contiguous
             memcpy(
-                res._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=res._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
 
         else:
@@ -5690,9 +5684,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous or F-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
         else:
             for j in range(self.size_of_item):
@@ -5705,7 +5699,7 @@ struct _NDAxisIter[
 
     fn ith_with_offsets(
         self, index: Int
-    ) raises -> Tuple[NDArray[DType.index], NDArray[dtype]]:
+    ) raises -> Tuple[NDArray[DType.int], NDArray[dtype]]:
         """
         Gets the i-th 1-d array of the iterator and the offsets (in C-order)
         of its elements.
@@ -5717,7 +5711,7 @@ struct _NDAxisIter[
             Offsets (in C-order) and elements of the i-th 1-d array of the
             iterator.
         """
-        var offsets: NDArray[DType.index] = NDArray[DType.index](
+        var offsets: NDArray[DType.int] = NDArray[DType.int](
             Shape(self.size_of_item)
         )
         var elements: NDArray[dtype] = NDArray[dtype](Shape(self.size_of_item))
@@ -5746,9 +5740,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is C-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
             var begin_offset = _get_offset(item, new_strides)
             for j in range(self.size_of_item):
@@ -5759,9 +5753,9 @@ struct _NDAxisIter[
         ):
             # The memory layout is F-contiguous
             memcpy(
-                elements._buf.ptr,
-                self.ptr + _get_offset(item, self.strides),
-                self.size_of_item,
+                dest=elements._buf.ptr,
+                src=self.ptr + _get_offset(item, self.strides),
+                count=self.size_of_item,
             )
             for j in range(self.size_of_item):
                 (offsets._buf.ptr + j).init_pointee_copy(
diff --git a/numojo/core/ndshape.mojo b/numojo/core/ndshape.mojo
index e4c4b1dc..65831193 100644
--- a/numojo/core/ndshape.mojo
+++ b/numojo/core/ndshape.mojo
@@ -8,7 +8,8 @@
 Implements NDArrayShape type.
 """
 
-from memory import UnsafePointer, memcpy, memcmp
+from memory import memcpy, memcmp
+from memory import LegacyUnsafePointer as UnsafePointer
 
 from numojo.core.error import IndexError, ShapeError, ValueError
 
@@ -388,7 +389,7 @@ struct NDArrayShape(
         """
         self.ndim = shape.ndim
         self._buf = UnsafePointer[Scalar[Self._type]]().alloc(shape.ndim)
-        memcpy(self._buf, shape._buf, shape.ndim)
+        memcpy(dest=self._buf, src=shape._buf, count=shape.ndim)
         for i in range(self.ndim):
             (self._buf + i).init_pointee_copy(shape._buf[i])
 
@@ -491,7 +492,7 @@ struct NDArrayShape(
         """
         self.ndim = other.ndim
         self._buf = UnsafePointer[Scalar[Self._type]]().alloc(other.ndim)
-        memcpy(self._buf, other._buf, other.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=other.ndim)
 
     fn __del__(deinit self):
         """
@@ -547,7 +548,7 @@ struct NDArrayShape(
     @always_inline("nodebug")
     fn _compute_slice_params(
         self, slice_index: Slice
-    ) raises -> (Int, Int, Int):
+    ) raises -> Tuple[Int, Int, Int]:
         var n = self.ndim
         if n == 0:
             return (0, 1, 0)
@@ -771,7 +772,7 @@ struct NDArrayShape(
         """
 
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         return res
 
     fn join(self, *shapes: Self) raises -> Self:
@@ -1057,7 +1058,7 @@ struct _ShapeIter[
         else:
             return self.index > 0
 
-    fn __next__(mut self) raises -> Scalar[DType.index]:
+    fn __next__(mut self) raises -> Scalar[DType.int]:
         @parameter
         if forward:
             var current_index = self.index
diff --git a/numojo/core/ndstrides.mojo b/numojo/core/ndstrides.mojo
index c8994cb6..83f0db33 100644
--- a/numojo/core/ndstrides.mojo
+++ b/numojo/core/ndstrides.mojo
@@ -8,7 +8,8 @@
 Implements NDArrayStrides type.
 """
 
-from memory import UnsafePointer, memcmp, memcpy
+from memory import memcmp, memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 
 from numojo.core.error import IndexError, ValueError
 
@@ -133,7 +134,7 @@ struct NDArrayStrides(
 
         self.ndim = strides.ndim
         self._buf = UnsafePointer[Scalar[Self._type]]().alloc(self.ndim)
-        memcpy(self._buf, strides._buf, strides.ndim)
+        memcpy(dest=self._buf, src=strides._buf, count=strides.ndim)
 
     @always_inline("nodebug")
     fn __init__(
@@ -339,7 +340,7 @@ struct NDArrayStrides(
         """
         self.ndim = other.ndim
         self._buf = UnsafePointer[Scalar[Self._type]]().alloc(other.ndim)
-        memcpy(self._buf, other._buf, other.ndim)
+        memcpy(dest=self._buf, src=other._buf, count=other.ndim)
 
     fn __del__(deinit self):
         """
@@ -394,7 +395,7 @@ struct NDArrayStrides(
     @always_inline("nodebug")
     fn _compute_slice_params(
         self, slice_index: Slice
-    ) raises -> (Int, Int, Int):
+    ) raises -> Tuple[Int, Int, Int]:
         """
         Compute normalized slice parameters (start, step, length).
 
@@ -644,7 +645,7 @@ struct NDArrayStrides(
         """
 
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         return res
 
     fn swapaxes(self, axis1: Int, axis2: Int) raises -> Self:
@@ -683,7 +684,7 @@ struct NDArrayStrides(
             )
 
         var res = Self(ndim=self.ndim, initialized=False)
-        memcpy(res._buf, self._buf, self.ndim)
+        memcpy(dest=res._buf, src=self._buf, count=self.ndim)
         res[axis1] = self[axis2]
         res[axis2] = self[axis1]
         return res^
@@ -957,7 +958,7 @@ struct _StrideIter[
         else:
             return self.index > 0
 
-    fn __next__(mut self) raises -> Scalar[DType.index]:
+    fn __next__(mut self) raises -> Scalar[DType.int]:
         @parameter
         if forward:
             var current_index = self.index
diff --git a/numojo/core/own_data.mojo b/numojo/core/own_data.mojo
index 11cc4bcd..67f88115 100644
--- a/numojo/core/own_data.mojo
+++ b/numojo/core/own_data.mojo
@@ -11,8 +11,13 @@ struct OwnData(Buffered, ImplicitlyCopyable, Movable):
     fn __init__(out self):
         pass
 
-    fn is_own_data(self) -> Bool:
+    @staticmethod
+    fn is_own_data() -> Bool:
         return True
 
-    fn is_ref_data(self) -> Bool:
+    @staticmethod
+    fn is_ref_data() -> Bool:
         return False
+
+    fn __str__(self) -> String:
+        return "OwnData"
diff --git a/numojo/core/ref_data.mojo b/numojo/core/ref_data.mojo
index f324fc9d..e268d8c8 100644
--- a/numojo/core/ref_data.mojo
+++ b/numojo/core/ref_data.mojo
@@ -17,8 +17,13 @@ struct RefData[is_mutable: Bool, //, origin: Origin[is_mutable]](
     fn __init__(out self):
         pass
 
-    fn is_own_data(self) -> Bool:
+    @staticmethod
+    fn is_own_data() -> Bool:
         return False
 
-    fn is_ref_data(self) -> Bool:
+    @staticmethod
+    fn is_ref_data() -> Bool:
         return True
+
+    fn __str__(self) -> String:
+        return "RefData"
diff --git a/numojo/core/traits/buffered.mojo b/numojo/core/traits/buffered.mojo
index 8a28911d..0675544f 100644
--- a/numojo/core/traits/buffered.mojo
+++ b/numojo/core/traits/buffered.mojo
@@ -16,8 +16,13 @@ trait Buffered(ImplicitlyCopyable, Movable):
     fn __init__(out self):
         ...
 
-    fn is_own_data(self) -> Bool:
+    @staticmethod
+    fn is_own_data() -> Bool:
         ...
 
-    fn is_ref_data(self) -> Bool:
+    @staticmethod
+    fn is_ref_data() -> Bool:
+        ...
+
+    fn __str__(self) -> String:
         ...
diff --git a/numojo/core/utility.mojo b/numojo/core/utility.mojo
index e6c035e7..4407a748 100644
--- a/numojo/core/utility.mojo
+++ b/numojo/core/utility.mojo
@@ -21,7 +21,8 @@ Implements N-DIMENSIONAL ARRAY UTILITY FUNCTIONS
 
 from algorithm.functional import vectorize, parallelize
 from collections import Dict
-from memory import UnsafePointer, memcpy
+from memory import memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 from python import Python, PythonObject
 from sys import simd_width_of
 
@@ -169,7 +170,7 @@ fn _transfer_offset(offset: Int, strides: NDArrayStrides) raises -> Int:
 
 
 fn _traverse_buffer_according_to_shape_and_strides(
-    mut ptr: UnsafePointer[Scalar[DType.index]],
+    mut ptr: UnsafePointer[Scalar[DType.int]],
     shape: NDArrayShape,
     strides: NDArrayStrides,
     current_dim: Int = 0,
@@ -194,7 +195,7 @@ fn _traverse_buffer_according_to_shape_and_strides(
     Example:
     ```console
     # A is a 2x3x4 array
-    var I = nm.NDArray[DType.index](nm.Shape(A.size))
+    var I = nm.NDArray[DType.int](nm.Shape(A.size))
     var ptr = I._buf
     _traverse_buffer_according_to_shape_and_strides(
         ptr, A.shape._flip(), A.strides._flip()
@@ -399,7 +400,7 @@ fn to_numpy[dtype: DType](array: NDArray[dtype]) raises -> PythonObject:
             np_dtype = np.int16
         elif dtype == DType.int8:
             np_dtype = np.int8
-        elif dtype == DType.index:
+        elif dtype == DType.int:
             np_dtype = np.intp
         elif dtype == DType.uint64:
             np_dtype = np.uint64
@@ -417,7 +418,7 @@ fn to_numpy[dtype: DType](array: NDArray[dtype]) raises -> PythonObject:
         var pointer_d = numpyarray.__array_interface__["data"][
             0
         ].unsafe_get_as_pointer[dtype]()
-        memcpy(pointer_d, array.unsafe_ptr(), array.size)
+        memcpy(dest=pointer_d, src=array.unsafe_ptr(), count=array.size)
         _ = array
 
         return numpyarray^
diff --git a/numojo/routines/creation.mojo b/numojo/routines/creation.mojo
index 284afb16..770a89dd 100644
--- a/numojo/routines/creation.mojo
+++ b/numojo/routines/creation.mojo
@@ -2556,7 +2556,7 @@ fn array[
         np_dtype = np.int16
     elif dtype == DType.int8:
         np_dtype = np.int8
-    elif dtype == DType.index:
+    elif dtype == DType.int:
         np_dtype = np.intp
     elif dtype == DType.uint64:
         np_dtype = np.uint64
@@ -2575,7 +2575,7 @@ fn array[
         dtype
     ]()
     var A: NDArray[dtype] = NDArray[dtype](array_shape, order)
-    memcpy[Scalar[dtype]](A._buf.ptr, pointer, A.size)
+    memcpy[Scalar[dtype]](dest=A._buf.ptr, src=pointer, count=A.size)
     return A^
 
 
@@ -2634,7 +2634,7 @@ fn array[
         np_dtype = np.int16
     elif dtype == DType.int8:
         np_dtype = np.int8
-    elif dtype == DType.index:
+    elif dtype == DType.int:
         np_dtype = np.intp
     elif dtype == DType.uint64:
         np_dtype = np.uint64
@@ -2657,8 +2657,10 @@ fn array[
         0
     ].unsafe_get_as_pointer[dtype]()
     var A: ComplexNDArray[cdtype] = ComplexNDArray[cdtype](array_shape, order)
-    memcpy[Scalar[dtype]](A._re._buf.ptr, pointer, A._re.size)
-    memcpy[Scalar[dtype]](A._im._buf.ptr, pointer_imag, A._im.size)
+    memcpy[Scalar[dtype]](dest=A._re._buf.ptr, src=pointer, count=A._re.size)
+    memcpy[Scalar[dtype]](
+        dest=A._im._buf.ptr, src=pointer_imag, count=A._im.size
+    )
     return A^
 
 
diff --git a/numojo/routines/functional.mojo b/numojo/routines/functional.mojo
index b5eb8ac5..cafdbc60 100644
--- a/numojo/routines/functional.mojo
+++ b/numojo/routines/functional.mojo
@@ -82,12 +82,12 @@ fn apply_along_axis[
 fn apply_along_axis[
     dtype: DType,
     func1d: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> Scalar[
-        DType.index
+        DType.int
     ],
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Applies a function to a NDArray by axis and reduce that dimension.
-    The returned data type is DType.index.
+    The returned data type is DType.int.
     When the array is 1-d, the returned array will be a 0-d array.
 
     Parameters:
@@ -105,14 +105,14 @@ fn apply_along_axis[
     # The iterator along the axis
     var iterator = a.iter_along_axis(axis=axis)
     # The final output array will have 1 less dimension than the input array
-    var res: NDArray[DType.index]
+    var res: NDArray[DType.int]
 
     if a.ndim == 1:
-        res = numojo.creation._0darray[DType.index](0)
+        res = numojo.creation._0darray[DType.int](0)
         (res._buf.ptr).init_pointee_copy(func1d[dtype](a))
 
     else:
-        res = NDArray[DType.index](a.shape._pop(axis=axis))
+        res = NDArray[DType.int](a.shape._pop(axis=axis))
 
         @parameter
         fn parallelized_func(i: Int):
@@ -221,9 +221,9 @@ fn apply_along_axis[
             try:
                 var elements: NDArray[dtype] = func1d[dtype](iterator.ith(i))
                 memcpy(
-                    result._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=result._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -236,7 +236,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -293,9 +293,9 @@ fn apply_along_axis[
                 var elements: NDArray[dtype] = iterator.ith(i)
                 func1d[dtype](elements)
                 memcpy(
-                    a._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=a._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -308,7 +308,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -333,9 +333,9 @@ fn apply_along_axis[
 fn apply_along_axis[
     dtype: DType,
     func1d: fn[dtype_func: DType] (NDArray[dtype_func]) raises -> NDArray[
-        DType.index
+        DType.int
     ],
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Applies a function to a NDArray by axis without reducing that dimension.
     The resulting array will have the same shape as the input array.
@@ -357,20 +357,20 @@ fn apply_along_axis[
     # The iterator along the axis
     var iterator = a.iter_along_axis(axis=axis)
     # The final output array will have the same shape as the input array
-    var res = NDArray[DType.index](a.shape)
+    var res = NDArray[DType.int](a.shape)
 
     if a.flags.C_CONTIGUOUS and (axis == a.ndim - 1):
         # The memory layout is C-contiguous
         @parameter
         fn parallelized_func_c(i: Int):
             try:
-                var elements: NDArray[DType.index] = func1d[dtype](
+                var elements: NDArray[DType.int] = func1d[dtype](
                     iterator.ith(i)
                 )
                 memcpy(
-                    res._buf.ptr + i * elements.size,
-                    elements._buf.ptr,
-                    elements.size,
+                    dest=res._buf.ptr + i * elements.size,
+                    src=elements._buf.ptr,
+                    count=elements.size,
                 )
             except e:
                 print("Error in parallelized_func", e)
@@ -383,7 +383,7 @@ fn apply_along_axis[
         fn parallelized_func(i: Int):
             try:
                 # The indices of the input array in each iteration
-                var indices: NDArray[DType.index]
+                var indices: NDArray[DType.int]
                 # The elements of the input array in each iteration
                 var elements: NDArray[dtype]
                 # The array after applied the function
@@ -391,9 +391,7 @@ fn apply_along_axis[
                 indices = indices_elements[0].copy()
                 elements = indices_elements[1].copy()
 
-                var res_along_axis: NDArray[DType.index] = func1d[dtype](
-                    elements
-                )
+                var res_along_axis: NDArray[DType.int] = func1d[dtype](elements)
 
                 for j in range(a.shape[axis]):
                     (res._buf.ptr + Int(indices[j])).init_pointee_copy(
diff --git a/numojo/routines/indexing.mojo b/numojo/routines/indexing.mojo
index 4e928391..aefdd23b 100644
--- a/numojo/routines/indexing.mojo
+++ b/numojo/routines/indexing.mojo
@@ -25,7 +25,7 @@ import numojo.core.utility as utility
 # ===----------------------------------------------------------------------=== #
 
 
-fn where[
+fn `where`[
     dtype: DType
 ](
     mut x: NDArray[dtype], scalar: SIMD[dtype, 1], mask: NDArray[DType.bool]
@@ -48,7 +48,7 @@ fn where[
 
 
 # TODO: do it with vectorization
-fn where[
+fn `where`[
     dtype: DType
 ](mut x: NDArray[dtype], y: NDArray[dtype], mask: NDArray[DType.bool]) raises:
     """
@@ -237,7 +237,7 @@ fn compress[
 fn take_along_axis[
     dtype: DType, //,
 ](
-    arr: NDArray[dtype], indices: NDArray[DType.index], axis: Int = 0
+    arr: NDArray[dtype], indices: NDArray[DType.int], axis: Int = 0
 ) raises -> NDArray[dtype]:
     """
     Takes values from the input array along the given axis based on indices.
@@ -303,7 +303,7 @@ fn take_along_axis[
     # except along the axis
 
     var broadcasted_indices: NDArray[
-        DType.index
+        DType.int
     ] = indices.copy()  # make this owned and don't copy
 
     if arr.shape != indices.shape:
@@ -337,15 +337,15 @@ fn take_along_axis[
                 indices_slice
             ]
             memcpy(
-                result._buf.ptr + i * result.shape[normalized_axis],
-                arr_slice_after_applying_indices._buf.ptr,
-                result.shape[normalized_axis],
+                dest=result._buf.ptr + i * result.shape[normalized_axis],
+                src=arr_slice_after_applying_indices._buf.ptr,
+                count=result.shape[normalized_axis],
             )
     else:
         # If axis is not the last axis, the data is not contiguous.
         for i in range(length_of_iterator):
-            var indices_slice_offsets: NDArray[DType.index]
-            var indices_slice: NDArray[DType.index]
+            var indices_slice_offsets: NDArray[DType.int]
+            var indices_slice: NDArray[DType.int]
             var indices_slice_offsets_slice = indices_iterator.ith_with_offsets(
                 i
             )
diff --git a/numojo/routines/io/files.mojo b/numojo/routines/io/files.mojo
index f9781b6b..dc774c44 100644
--- a/numojo/routines/io/files.mojo
+++ b/numojo/routines/io/files.mojo
@@ -79,7 +79,7 @@ fn load[
 #         return "'<f4'"
 #     elif dtype == DType.float64:
 #         return "'<f8'"
-#     elif dtype == DType.index:
+#     elif dtype == DType.int:
 #         # Assuming index is 64-bit signed integer
 #         return "'<i8'"
 #     else:
diff --git a/numojo/routines/linalg/decompositions.mojo b/numojo/routines/linalg/decompositions.mojo
index 62dbd652..7eb8e267 100644
--- a/numojo/routines/linalg/decompositions.mojo
+++ b/numojo/routines/linalg/decompositions.mojo
@@ -3,10 +3,12 @@
 # ===----------------------------------------------------------------------=== #
 from sys import simd_width_of
 from algorithm import parallelize, vectorize
+from memory import UnsafePointer, memcpy, memset_zero
 
 import math as builtin_math
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.matrix import Matrix, issymmetric
 from numojo.routines.creation import zeros, eye, full
 
@@ -213,7 +215,7 @@ fn lu_decomposition[
 
 fn lu_decomposition[
     dtype: DType
-](A: Matrix[dtype]) raises -> Tuple[Matrix[dtype], Matrix[dtype]]:
+](A: Matrix[dtype, **_]) raises -> Tuple[Matrix[dtype], Matrix[dtype]]:
     """
     Perform LU (lower-upper) decomposition for matrix.
     """
@@ -226,8 +228,8 @@ fn lu_decomposition[
     var n: Int = A.shape[0]
 
     # Initiate upper and lower triangular matrices
-    var U: Matrix[dtype] = Matrix.full[dtype](shape=(n, n), order=A.order())
-    var L: Matrix[dtype] = Matrix.full[dtype](shape=(n, n), order=A.order())
+    var U: Matrix[dtype] = Matrix.zeros[dtype](shape=(n, n), order=A.order())
+    var L: Matrix[dtype] = Matrix.zeros[dtype](shape=(n, n), order=A.order())
 
     # Fill in L and U
     for i in range(0, n):
@@ -305,32 +307,38 @@ fn partial_pivoting[
 
 fn partial_pivoting[
     dtype: DType
-](var A: Matrix[dtype]) raises -> Tuple[Matrix[dtype], Matrix[dtype], Int]:
+](A: Matrix[dtype, **_]) raises -> Tuple[
+    Matrix[dtype, **_], Matrix[dtype, **_], Int
+]:
     """
     Perform partial pivoting for matrix.
     """
     var n = A.shape[0]
-    var P = Matrix.identity[dtype](n)
-    if A.flags.F_CONTIGUOUS:
-        A = A.reorder_layout()
-    var s: Int = 0  # Number of exchanges, for determinant
+    # Work on a copy that preserves the original layout
+    var result = A.create_copy()
+    var P = Matrix.identity[dtype](n, order=A.order())
+    var s: Int = 0  # Number of row exchanges
+
     for col in range(n):
-        var max_p = abs(A[col, col])
+        var max_p = abs(result[col, col])
         var max_p_row = col
         for row in range(col + 1, n):
-            if abs(A[row, col]) > max_p:
-                max_p = abs(A[row, col])
+            if abs(result[row, col]) > max_p:
+                max_p = abs(result[row, col])
                 max_p_row = row
-        A[col], A[max_p_row] = A[max_p_row], A[col]
-        P[col], P[max_p_row] = P[max_p_row], P[col]
 
         if max_p_row != col:
+            # Swap rows in result and permutation matrix using element-wise swap
+            for j in range(n):
+                var t = result._load(col, j)
+                result._store(col, j, result._load(max_p_row, j))
+                result._store(max_p_row, j, t)
+                var tp = P._load(col, j)
+                P._store(col, j, P._load(max_p_row, j))
+                P._store(max_p_row, j, tp)
             s = s + 1
-    if A.flags.F_CONTIGUOUS:
-        A = A.reorder_layout()
-        P = P.reorder_layout()
 
-    return Tuple(A^, P^, s)
+    return Tuple(result^, P^, s)
 
 
 fn qr[
@@ -368,7 +376,7 @@ fn qr[
     else:
         raise Error(String("Invalid mode: {}").format(mode))
 
-    var R: Matrix[dtype]
+    var R: Matrix[dtype, OwnData]
 
     if A.flags.C_CONTIGUOUS:
         reorder = True
@@ -376,7 +384,10 @@ fn qr[
     if reorder:
         R = A.reorder_layout()
     else:
-        R = A.copy()
+        R = Matrix.zeros[dtype](shape=(m, n), order="F")
+        for i in range(m):
+            for j in range(n):
+                R._store(i, j, A._load(i, j))
 
     var H = Matrix.zeros[dtype](shape=(m, min_n), order="F")
 
@@ -392,16 +403,25 @@ fn qr[
         _apply_householder(H, i, Q, i, i)
 
     if reorder:
-        Q = Q.reorder_layout()
+        var Q_reordered = Q.reorder_layout()
         if reduce:
-            R = R[:inner, :].reorder_layout()
+            var R_reduced = Matrix.zeros[dtype](shape=(inner, n), order="C")
+            for i in range(inner):
+                for j in range(n):
+                    R_reduced._store(i, j, R._load(i, j))
+            return Q_reordered^, R_reduced^
         else:
-            R = R.reorder_layout()
+            var R_reordered = R.reorder_layout()
+            return Q_reordered^, R_reordered^
     else:
         if reduce:
-            R = R[:inner, :]
-
-    return Q^, R^
+            var R_reduced = Matrix.zeros[dtype](shape=(inner, n), order="F")
+            for i in range(inner):
+                for j in range(n):
+                    R_reduced._store(i, j, R._load(i, j))
+            return Q^, R_reduced^
+        else:
+            return Q^, R^
 
 
 # ===----------------------------------------------------------------------=== #
diff --git a/numojo/routines/linalg/misc.mojo b/numojo/routines/linalg/misc.mojo
index f45776b2..b10b7df0 100644
--- a/numojo/routines/linalg/misc.mojo
+++ b/numojo/routines/linalg/misc.mojo
@@ -67,7 +67,9 @@ fn diagonal[
 fn issymmetric[
     dtype: DType
 ](
-    A: Matrix[dtype], rtol: Scalar[dtype] = 1e-5, atol: Scalar[dtype] = 1e-8
+    A: Matrix[dtype, **_],
+    rtol: Scalar[dtype] = 1e-5,
+    atol: Scalar[dtype] = 1e-8,
 ) -> Bool:
     """
     Returns True if A is symmetric, False otherwise.
diff --git a/numojo/routines/linalg/norms.mojo b/numojo/routines/linalg/norms.mojo
index 21fd5f5d..5da66a1e 100644
--- a/numojo/routines/linalg/norms.mojo
+++ b/numojo/routines/linalg/norms.mojo
@@ -121,7 +121,7 @@ fn trace[
 
 fn trace[
     dtype: DType
-](A: Matrix[dtype], offset: Int = 0) raises -> Scalar[dtype]:
+](A: Matrix[dtype, **_], offset: Int = 0) raises -> Scalar[dtype]:
     """
     Return the sum along diagonals of the array.
 
diff --git a/numojo/routines/linalg/products.mojo b/numojo/routines/linalg/products.mojo
index 64961039..efaac811 100644
--- a/numojo/routines/linalg/products.mojo
+++ b/numojo/routines/linalg/products.mojo
@@ -339,27 +339,27 @@ fn matmul[
 
     for i in range(result.size // result_sub_matrix.size):
         memcpy(
-            A_sub_matrix._buf.ptr,
-            A._buf.ptr + (i * A_sub_matrix.size),
-            A_sub_matrix.size,
+            dest=A_sub_matrix._buf.ptr,
+            src=A._buf.ptr + (i * A_sub_matrix.size),
+            count=A_sub_matrix.size,
         )
         memcpy(
-            B_sub_matrix._buf.ptr,
-            B._buf.ptr + (i * B_sub_matrix.size),
-            B_sub_matrix.size,
+            dest=B_sub_matrix._buf.ptr,
+            src=B._buf.ptr + (i * B_sub_matrix.size),
+            count=B_sub_matrix.size,
         )
         result_sub_matrix = matmul_2darray(A_sub_matrix, B_sub_matrix)
         memcpy(
-            result._buf.ptr + (i * result_sub_matrix.size),
-            result_sub_matrix._buf.ptr,
-            result_sub_matrix.size,
+            dest=result._buf.ptr + (i * result_sub_matrix.size),
+            src=result_sub_matrix._buf.ptr,
+            count=result_sub_matrix.size,
         )
     return result^
 
 
 fn matmul[
     dtype: DType
-](A: Matrix[dtype], B: Matrix[dtype]) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], B: Matrix[dtype, **_]) raises -> Matrix[dtype, **_]:
     """
     Matrix multiplication.
 
diff --git a/numojo/routines/linalg/solving.mojo b/numojo/routines/linalg/solving.mojo
index 6303f7c8..79fd7b6e 100644
--- a/numojo/routines/linalg/solving.mojo
+++ b/numojo/routines/linalg/solving.mojo
@@ -13,11 +13,15 @@ Provides:
 from algorithm import parallelize
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.item import Item
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
 from numojo.routines.creation import zeros, eye, full
-from numojo.routines.linalg.decompositions import partial_pivoting
+from numojo.routines.linalg.decompositions import (
+    partial_pivoting,
+    lu_decomposition,
+)
 
 
 fn forward_substitution[
@@ -113,7 +117,7 @@ fn inv[dtype: DType](A: NDArray[dtype]) raises -> NDArray[dtype]:
     return solve(A, I)
 
 
-fn inv[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
+fn inv[dtype: DType](A: Matrix[dtype, **_]) raises -> Matrix[dtype, OwnData]:
     """
     Inverse of matrix.
     """
@@ -208,13 +212,13 @@ fn lstsq[
     """Caclulate the OLS estimates.
 
     Example:
-    ```mojo
+    ```text
     from numojo import Matrix
     X = Matrix.rand((1000000, 5))
     y = Matrix.rand((1000000, 1))
-    print(mat.lstsq(X, y))
+    print(lstsq(X, y))
     ```
-    ```console
+    ```text
     [[0.18731374756029967]
      [0.18821352688798607]
      [0.18717162200411439]
@@ -369,7 +373,9 @@ fn solve[
 
 fn solve[
     dtype: DType
-](A: Matrix[dtype], Y: Matrix[dtype]) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], Y: Matrix[dtype, **_]) raises -> Matrix[
+    dtype, OwnData
+]:
     """
     Solve `AX = Y` using LUP decomposition.
     """
@@ -381,11 +387,13 @@ fn solve[
 
     var A_pivoted_Pair: Tuple[
         Matrix[dtype], Matrix[dtype], Int
-    ] = partial_pivoting(A.copy())
-    A_pivoted = A_pivoted_Pair[0].copy()
-    P = A_pivoted_Pair[1].copy()
+    ] = partial_pivoting(A.create_copy())
+
+    var pivoted_A = A_pivoted_Pair[0].copy()
+    var P = A_pivoted_Pair[1].copy()
+
     var L_U: Tuple[Matrix[dtype], Matrix[dtype]] = lu_decomposition[dtype](
-        A_pivoted
+        pivoted_A
     )
     L = L_U[0].copy()
     U = L_U[1].copy()
@@ -393,9 +401,8 @@ fn solve[
     var m: Int = A.shape[0]
     var n: Int = Y.shape[1]
 
-    var Z: Matrix[dtype] = Matrix.full[dtype]((m, n), order=A.order())
-    var X: Matrix[dtype] = Matrix.full[dtype]((m, n), order=A.order())
-
+    var Z: Matrix[dtype] = Matrix.zeros[dtype]((m, n), order=A.order())
+    var X: Matrix[dtype] = Matrix.zeros[dtype]((m, n), order=A.order())
     var PY = P @ Y
 
     @parameter
@@ -432,7 +439,9 @@ fn solve[
 
 fn solve_lu[
     dtype: DType
-](A: Matrix[dtype], Y: Matrix[dtype]) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], Y: Matrix[dtype, **_]) raises -> Matrix[
+    dtype, OwnData
+]:
     """
     Solve `AX = Y` using LU decomposition.
     """
@@ -457,8 +466,6 @@ fn solve_lu[
                 _temp = _temp - L._load(i, j) * Z._load(j, col)
             _temp = _temp / L._load(i, i)
             Z._store(i, col, _temp)
-
-        # Solve `UZ = Z` for `X` for each col
         for i in range(m - 1, -1, -1):
             var _temp2 = Z._load(i, col)
             for j in range(i + 1, m):
diff --git a/numojo/routines/logic/truth.mojo b/numojo/routines/logic/truth.mojo
index 0a5c5cac..a3d6064f 100644
--- a/numojo/routines/logic/truth.mojo
+++ b/numojo/routines/logic/truth.mojo
@@ -8,10 +8,11 @@ from sys import simd_width_of
 
 import numojo.routines.math._math_funcs as _mf
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.matrix import Matrix
 
 
-fn all[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
+fn all[dtype: DType](A: Matrix[dtype, **_]) -> Scalar[dtype]:
     """
     Test whether all array elements evaluate to True.
 
@@ -29,7 +30,9 @@ fn all[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
     return res
 
 
-fn all[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn all[
+    dtype: DType
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Test whether all array elements evaluate to True along axis.
     """
@@ -121,7 +124,7 @@ fn any(array: NDArray[DType.bool]) raises -> Scalar[DType.bool]:
     return result
 
 
-fn any[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
+fn any[dtype: DType](A: Matrix[dtype, **_]) -> Scalar[dtype]:
     """
     Test whether any array elements evaluate to True.
 
@@ -139,7 +142,9 @@ fn any[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
     return res
 
 
-fn any[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn any[
+    dtype: DType
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Test whether any array elements evaluate to True along axis.
     """
diff --git a/numojo/routines/manipulation.mojo b/numojo/routines/manipulation.mojo
index a1030705..00f1043b 100644
--- a/numojo/routines/manipulation.mojo
+++ b/numojo/routines/manipulation.mojo
@@ -9,11 +9,13 @@
 Array manipulation routines.
 """
 
-from memory import UnsafePointer, memcpy
+from memory import memcpy
+from memory import LegacyUnsafePointer as UnsafePointer
 from sys import simd_width_of
 from algorithm import vectorize
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.complex import ComplexNDArray
 from numojo.core.ndshape import NDArrayShape, Shape
 from numojo.core.ndstrides import NDArrayStrides
@@ -207,7 +209,7 @@ fn ravel[
 # TODO: Remove this one if the following function is working well:
 # `numojo.core.utility._traverse_buffer_according_to_shape_and_strides`
 fn _set_values_according_to_shape_and_strides(
-    mut I: NDArray[DType.index],
+    mut I: NDArray[DType.int],
     mut index: Int,
     current_dim: Int,
     previous_sum: Int,
@@ -285,8 +287,8 @@ fn transpose[
         new_strides._buf[i] = A.strides[axes[i]]
 
     var array_order: String = "C" if A.flags.C_CONTIGUOUS else "F"
-    var I = NDArray[DType.index](Shape(A.size), order=array_order)
-    var ptr: UnsafePointer[Scalar[DType.index]] = I._buf.ptr
+    var I = NDArray[DType.int](Shape(A.size), order=array_order)
+    var ptr: UnsafePointer[Scalar[DType.int]] = I._buf.ptr
     numojo.core.utility._traverse_buffer_according_to_shape_and_strides(
         ptr, new_shape, new_strides
     )
@@ -310,7 +312,7 @@ fn transpose[dtype: DType](A: NDArray[dtype]) raises -> NDArray[dtype]:
         var array_order = "C" if A.flags.C_CONTIGUOUS else "F"
         var B = NDArray[dtype](Shape(A.shape[1], A.shape[0]), order=array_order)
         if A.shape[0] == 1 or A.shape[1] == 1:
-            memcpy(B._buf.ptr, A._buf.ptr, A.size)
+            memcpy(dest=B._buf.ptr, src=A._buf.ptr, count=A.size)
         else:
             for i in range(B.shape[0]):
                 for j in range(B.shape[1]):
@@ -324,7 +326,7 @@ fn transpose[dtype: DType](A: NDArray[dtype]) raises -> NDArray[dtype]:
         return transpose(A, axes=flipped_axes)
 
 
-fn transpose[dtype: DType](A: Matrix[dtype]) -> Matrix[dtype]:
+fn transpose[dtype: DType](A: Matrix[dtype, **_]) -> Matrix[dtype]:
     """
     Transpose of matrix.
     """
@@ -335,7 +337,7 @@ fn transpose[dtype: DType](A: Matrix[dtype]) -> Matrix[dtype]:
     var B = Matrix[dtype](Tuple(A.shape[1], A.shape[0]), order=order)
 
     if A.shape[0] == 1 or A.shape[1] == 1:
-        memcpy(B._buf.ptr, A._buf.ptr, A.size)
+        memcpy(dest=B._buf.ptr, src=A._buf.ptr, count=A.size)
     else:
         for i in range(B.shape[0]):
             for j in range(B.shape[1]):
@@ -343,7 +345,9 @@ fn transpose[dtype: DType](A: Matrix[dtype]) -> Matrix[dtype]:
     return B^
 
 
-fn reorder_layout[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
+fn reorder_layout[
+    dtype: DType
+](A: Matrix[dtype, **_]) raises -> Matrix[dtype, A.BufType]:
     """
     Create a new Matrix with the opposite layout from A:
     if A is C-contiguous, then create a new F-contiguous matrix of the same shape.
@@ -368,8 +372,7 @@ fn reorder_layout[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
             )
         )
 
-    var B: Matrix[dtype] = Matrix[dtype](Tuple(rows, cols), new_order)
-
+    var B = Matrix[dtype, A.BufType](Tuple(rows, cols), new_order)
     if new_order == "C":
         for i in range(rows):
             for j in range(cols):
@@ -447,8 +450,10 @@ fn broadcast_to[
 fn broadcast_to[
     dtype: DType
 ](
-    var A: Matrix[dtype], shape: Tuple[Int, Int], override_order: String = ""
-) raises -> Matrix[dtype]:
+    read A: Matrix[dtype, **_],
+    shape: Tuple[Int, Int],
+    override_order: String = "",
+) raises -> Matrix[dtype, **_]:
     """
     Broadcasts the vector to the given shape.
 
@@ -485,11 +490,12 @@ fn broadcast_to[
     else:
         ord = override_order
 
-    var B = Matrix[dtype](shape, order=ord)
+    var B: Matrix[dtype, OwnData] = Matrix[dtype, OwnData](shape, order=ord)
     if (A.shape[0] == shape[0]) and (A.shape[1] == shape[1]):
-        return A^
+        # return A.copy()
+        memcpy(dest=B._buf.ptr, src=A._buf.ptr, count=A.size)
     elif (A.shape[0] == 1) and (A.shape[1] == 1):
-        B = Matrix.full[dtype](shape, A[0, 0], order=ord)
+        B = Matrix[dtype].full(shape, A[0, 0], order=ord)
     elif (A.shape[0] == 1) and (A.shape[1] == shape[1]):
         for i in range(shape[0]):
             memcpy(
@@ -518,7 +524,7 @@ fn broadcast_to[
     Broadcasts the scalar to the given shape.
     """
 
-    var B: Matrix[dtype] = Matrix.full[dtype](shape, A, order=order)
+    var B: Matrix[dtype] = Matrix[dtype].full(shape, A, order=order)
     return B^
 
 
@@ -614,7 +620,7 @@ fn flip[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     numojo.core.utility._traverse_buffer_according_to_shape_and_strides(
diff --git a/numojo/routines/math/_math_funcs.mojo b/numojo/routines/math/_math_funcs.mojo
index c99b08d6..b739206d 100644
--- a/numojo/routines/math/_math_funcs.mojo
+++ b/numojo/routines/math/_math_funcs.mojo
@@ -11,7 +11,7 @@ from testing import assert_raises
 from algorithm.functional import parallelize, vectorize
 from sys.info import num_physical_cores
 from sys import simd_width_of
-from memory import UnsafePointer
+from memory import LegacyUnsafePointer as UnsafePointer
 
 from numojo.core.traits.backend import Backend
 from numojo.core.ndarray import NDArray
diff --git a/numojo/routines/math/extrema.mojo b/numojo/routines/math/extrema.mojo
index 85b2abee..e126ec2c 100644
--- a/numojo/routines/math/extrema.mojo
+++ b/numojo/routines/math/extrema.mojo
@@ -1,4 +1,4 @@
-# ===----------------------------------------------------------------------=== #
+# views ===----------------------------------------------------------------------=== #
 # Distributed under the Apache 2.0 License with LLVM Exceptions.
 # See LICENSE and the LLVM License for more information.
 # https://github.com/Mojo-Numerics-and-Algorithms-group/NuMojo/blob/main/LICENSE
@@ -30,6 +30,7 @@ from sys import simd_width_of
 from numojo.core.matrix import Matrix
 import numojo.core.matrix as matrix
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 import numojo.core.utility as utility
 from numojo.routines.creation import full
 from numojo.routines.sorting import binary_sort
@@ -144,7 +145,7 @@ fn max[dtype: DType](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
 @always_inline
 fn matrix_extrema[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype]) raises -> Scalar[dtype]:
+](A: Matrix[dtype, **_]) raises -> Scalar[dtype]:
     """
     Generic implementation for finding global min/max in a matrix.
     Works with any memory layout (row-major or column-major).
@@ -167,7 +168,7 @@ fn matrix_extrema[
 @always_inline
 fn matrix_extrema_axis[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Generic implementation for finding min/max along an axis in a matrix.
     Works with any memory layout (row-major or column-major).
@@ -213,14 +214,16 @@ fn matrix_extrema_axis[
     return B^
 
 
-fn max[dtype: DType](A: Matrix[dtype]) raises -> Scalar[dtype]:
+fn max[dtype: DType](A: Matrix[dtype, **_]) raises -> Scalar[dtype]:
     """
     Find max item. It is first flattened before sorting.
     """
     return matrix_extrema[dtype, True](A)
 
 
-fn max[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn max[
+    dtype: DType
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Find max item along the given axis.
     """
@@ -230,7 +233,7 @@ fn max[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 fn _max[
     dtype: DType
 ](A: Matrix[dtype], start: Int, end: Int) raises -> Tuple[
-    Scalar[dtype], Scalar[DType.index]
+    Scalar[dtype], Scalar[DType.int]
 ]:
     """
     Auxiliary function that find the max value in a range of the buffer.
@@ -243,7 +246,7 @@ fn _max[
             ).format(start, end, A.size)
         )
 
-    var max_index: Scalar[DType.index] = start
+    var max_index: Scalar[DType.int] = start
 
     var rows = A.shape[0]
     var cols = A.shape[1]
@@ -333,14 +336,14 @@ fn min[dtype: DType](a: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
     )
 
 
-fn min[dtype: DType](A: Matrix[dtype]) raises -> Scalar[dtype]:
+fn min[dtype: DType](A: Matrix[dtype, **_]) raises -> Scalar[dtype]:
     """
     Find min item.
     """
     return matrix_extrema[dtype, False](A)
 
 
-fn min[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn min[dtype: DType](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype]:
     """
     Find min item along the given axis.
     """
@@ -350,7 +353,7 @@ fn min[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 fn _min[
     dtype: DType
 ](A: Matrix[dtype], start: Int, end: Int) raises -> Tuple[
-    Scalar[dtype], Scalar[DType.index]
+    Scalar[dtype], Scalar[DType.int]
 ]:
     """
     Auxiliary function that find the min value in a range of the buffer.
@@ -363,7 +366,7 @@ fn _min[
             ).format(start, end, A.size)
         )
 
-    var min_index: Scalar[DType.index] = start
+    var min_index: Scalar[DType.int] = start
 
     var rows = A.shape[0]
     var cols = A.shape[1]
diff --git a/numojo/routines/math/products.mojo b/numojo/routines/math/products.mojo
index 38268dea..dc0c7829 100644
--- a/numojo/routines/math/products.mojo
+++ b/numojo/routines/math/products.mojo
@@ -1,7 +1,9 @@
 from algorithm.functional import parallelize, vectorize
 from sys import simd_width_of
+from memory import UnsafePointer, memcpy, memset_zero
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
 from numojo.routines.creation import ones
@@ -81,7 +83,7 @@ fn prod[
     return result^
 
 
-fn prod[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
+fn prod[dtype: DType](A: Matrix[dtype, **_]) -> Scalar[dtype]:
     """
     Product of all items in the Matrix.
 
@@ -99,7 +101,7 @@ fn prod[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
     return res
 
 
-fn prod[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn prod[dtype: DType](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype]:
     """
     Product of items in a Matrix along the axis.
 
@@ -205,7 +207,7 @@ fn cumprod[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     var _shape = B.shape._move_axis_to_end(axis)
@@ -222,7 +224,9 @@ fn cumprod[
     return B^
 
 
-fn cumprod[dtype: DType](var A: Matrix[dtype]) raises -> Matrix[dtype]:
+fn cumprod[
+    dtype: DType
+](A: Matrix[dtype, **_]) raises -> Matrix[dtype, OwnData]:
     """
     Cumprod of flattened matrix.
 
@@ -236,25 +240,26 @@ fn cumprod[dtype: DType](var A: Matrix[dtype]) raises -> Matrix[dtype]:
     print(mat.cumprod(A))
     ```
     """
-    var reorder = False
-    if A.flags.F_CONTIGUOUS:
-        reorder = True
-        A = A.reorder_layout()
+    alias width: Int = simd_width_of[dtype]()
+    var result: Matrix[dtype] = Matrix.zeros[dtype](A.shape, "C")
 
-    A.resize(shape=(1, A.size))
+    if A.flags.C_CONTIGUOUS:
+        memcpy(dest=result._buf.ptr, src=A._buf.ptr, count=A.size)
+    else:
+        for i in range(A.shape[0]):
+            for j in range(A.shape[1]):
+                result[i, j] = A[i, j]
 
     for i in range(1, A.size):
-        A._buf.ptr[i] *= A._buf.ptr[i - 1]
-
-    if reorder:
-        A = A.reorder_layout()
+        result._buf.ptr[i] *= result._buf.ptr[i - 1]
 
-    return A^
+    result.resize(shape=(1, result.size))
+    return result^
 
 
 fn cumprod[
     dtype: DType
-](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, **_]:
     """
     Cumprod of Matrix along the axis.
 
@@ -271,6 +276,19 @@ fn cumprod[
     ```
     """
     alias width: Int = simd_width_of[dtype]()
+    var order: String = "C" if A.flags.C_CONTIGUOUS else "F"
+    var result: Matrix[dtype] = Matrix.zeros[dtype](A.shape, order)
+
+    if order == "C":
+        memcpy(dest=result._buf.ptr, src=A._buf.ptr, count=A.size)
+    else:
+        for j in range(result.shape[1]):
+
+            @parameter
+            fn copy_col[width: Int](i: Int):
+                result._store[width](i, j, A._load[width](i, j))
+
+            vectorize[copy_col, width](A.shape[0])
 
     if axis == 0:
         if A.flags.C_CONTIGUOUS:
@@ -278,34 +296,40 @@ fn cumprod[
 
                 @parameter
                 fn cal_vec_row[width: Int](j: Int):
-                    A._store[width](
-                        i, j, A._load[width](i - 1, j) * A._load[width](i, j)
+                    result._store[width](
+                        i,
+                        j,
+                        result._load[width](i - 1, j)
+                        * result._load[width](i, j),
                     )
 
                 vectorize[cal_vec_row, width](A.shape[1])
-            return A^
+            return result^
         else:
             for j in range(A.shape[1]):
                 for i in range(1, A.shape[0]):
-                    A[i, j] = A[i - 1, j] * A[i, j]
-            return A^
+                    result[i, j] = result[i - 1, j] * result[i, j]
+            return result^
 
     elif axis == 1:
         if A.flags.C_CONTIGUOUS:
             for i in range(A.shape[0]):
                 for j in range(1, A.shape[1]):
-                    A[i, j] = A[i, j - 1] * A[i, j]
-            return A^
+                    result[i, j] = result[i, j - 1] * result[i, j]
+            return result^
         else:
             for j in range(1, A.shape[1]):
 
                 @parameter
                 fn cal_vec_column[width: Int](i: Int):
-                    A._store[width](
-                        i, j, A._load[width](i, j - 1) * A._load[width](i, j)
+                    result._store[width](
+                        i,
+                        j,
+                        result._load[width](i, j - 1)
+                        * result._load[width](i, j),
                     )
 
                 vectorize[cal_vec_column, width](A.shape[0])
-            return A^
+            return result^
     else:
         raise Error(String("The axis can either be 1 or 0!"))
diff --git a/numojo/routines/math/rounding.mojo b/numojo/routines/math/rounding.mojo
index bb45c538..4ae05076 100644
--- a/numojo/routines/math/rounding.mojo
+++ b/numojo/routines/math/rounding.mojo
@@ -16,15 +16,14 @@ from numojo.core.matrix import Matrix
 
 fn round[
     dtype: DType
-](var A: Matrix[dtype], decimals: Int = 0) -> Matrix[dtype]:
+](A: Matrix[dtype, **_], decimals: Int = 0) -> Matrix[dtype]:
     # FIXME
     # The built-in `round` function is not working now.
     # It will be fixed in future.
-
+    var res = Matrix.zeros[dtype](A.shape)
     for i in range(A.size):
-        A._buf.ptr[i] = builtin_math.round(A._buf.ptr[i], ndigits=decimals)
-
-    return A^
+        res._buf.ptr[i] = builtin_math.round(A._buf.ptr[i], ndigits=decimals)
+    return res^
 
 
 fn tabs[
diff --git a/numojo/routines/math/sums.mojo b/numojo/routines/math/sums.mojo
index 0d1ee799..132a7303 100644
--- a/numojo/routines/math/sums.mojo
+++ b/numojo/routines/math/sums.mojo
@@ -1,7 +1,9 @@
 from sys import simd_width_of
 from algorithm import parallelize, vectorize
+from memory import UnsafePointer, memset_zero, memcpy
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.matrix import Matrix
 from numojo.routines.creation import zeros
 
@@ -108,7 +110,7 @@ fn sum[dtype: DType](A: NDArray[dtype], axis: Int) raises -> NDArray[dtype]:
     return result^
 
 
-fn sum[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
+fn sum[dtype: DType](A: Matrix[dtype, **_]) -> Scalar[dtype]:
     """
     Sum up all items in the Matrix.
 
@@ -133,7 +135,9 @@ fn sum[dtype: DType](A: Matrix[dtype]) -> Scalar[dtype]:
     return res
 
 
-fn sum[dtype: DType](A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+fn sum[
+    dtype: DType
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Sum up the items in a Matrix along the axis.
 
@@ -263,7 +267,7 @@ fn cumsum[
             String("Invalid index: index out of bound [0, {}).").format(A.ndim)
         )
 
-    var I = NDArray[DType.index](Shape(A.size))
+    var I = NDArray[DType.int](Shape(A.size))
     var ptr = I._buf.ptr
 
     var _shape = B.shape._move_axis_to_end(axis)
@@ -282,7 +286,7 @@ fn cumsum[
     return B^
 
 
-fn cumsum[dtype: DType](var A: Matrix[dtype]) raises -> Matrix[dtype]:
+fn cumsum[dtype: DType](A: Matrix[dtype, **_]) raises -> Matrix[dtype, OwnData]:
     """
     Cumsum of flattened matrix.
 
@@ -297,24 +301,28 @@ fn cumsum[dtype: DType](var A: Matrix[dtype]) raises -> Matrix[dtype]:
     ```
     """
     var reorder = False
+    var order = "C" if A.flags.C_CONTIGUOUS else "F"
+    var result: Matrix[dtype, OwnData] = Matrix.zeros[dtype](A.shape, order)
+    memcpy(dest=result._buf.ptr, src=A._buf.ptr, count=A.size)
+
     if A.flags.F_CONTIGUOUS:
         reorder = True
-        A = A.reorder_layout()
+        result = result.reorder_layout()
 
-    A.resize(shape=(1, A.size))
+    result.resize(shape=(1, A.size))
 
     for i in range(1, A.size):
-        A._buf.ptr[i] += A._buf.ptr[i - 1]
+        result._buf.ptr[i] += result._buf.ptr[i - 1]
 
     if reorder:
-        A = A.reorder_layout()
+        result = result.reorder_layout()
 
-    return A^
+    return result^
 
 
 fn cumsum[
     dtype: DType
-](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[dtype, OwnData]:
     """
     Cumsum of Matrix along the axis.
 
@@ -332,41 +340,50 @@ fn cumsum[
     """
 
     alias width: Int = simd_width_of[dtype]()
+    var order = "C" if A.flags.C_CONTIGUOUS else "F"
+    var result: Matrix[dtype, OwnData] = Matrix.zeros[dtype](A.shape, order)
+    memcpy(dest=result._buf.ptr, src=A._buf.ptr, count=A.size)
 
     if axis == 0:
-        if A.flags.C_CONTIGUOUS:
+        if result.flags.C_CONTIGUOUS:
             for i in range(1, A.shape[0]):
 
                 @parameter
                 fn cal_vec_sum_column[width: Int](j: Int):
-                    A._store[width](
-                        i, j, A._load[width](i - 1, j) + A._load[width](i, j)
+                    result._store[width](
+                        i,
+                        j,
+                        result._load[width](i - 1, j)
+                        + result._load[width](i, j),
                     )
 
-                vectorize[cal_vec_sum_column, width](A.shape[1])
-            return A^
+                vectorize[cal_vec_sum_column, width](result.shape[1])
+            return result^
         else:
             for j in range(A.shape[1]):
                 for i in range(1, A.shape[0]):
-                    A[i, j] = A[i - 1, j] + A[i, j]
-            return A^
+                    result[i, j] = result[i - 1, j] + result[i, j]
+            return result^
 
     elif axis == 1:
         if A.flags.C_CONTIGUOUS:
             for i in range(A.shape[0]):
                 for j in range(1, A.shape[1]):
-                    A[i, j] = A[i, j - 1] + A[i, j]
-            return A^
+                    result[i, j] = result[i, j - 1] + result[i, j]
+            return result^
         else:
             for j in range(1, A.shape[1]):
 
                 @parameter
                 fn cal_vec_sum_row[width: Int](i: Int):
-                    A._store[width](
-                        i, j, A._load[width](i, j - 1) + A._load[width](i, j)
+                    result._store[width](
+                        i,
+                        j,
+                        result._load[width](i, j - 1)
+                        + result._load[width](i, j),
                     )
 
                 vectorize[cal_vec_sum_row, width](A.shape[0])
-            return A^
+            return result^
     else:
         raise Error(String("The axis can either be 1 or 0!"))
diff --git a/numojo/routines/searching.mojo b/numojo/routines/searching.mojo
index a6bb2fe2..aab5bc32 100644
--- a/numojo/routines/searching.mojo
+++ b/numojo/routines/searching.mojo
@@ -17,7 +17,7 @@ from numojo.routines.sorting import binary_sort
 from numojo.routines.math.extrema import _max, _min
 
 
-fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the index of the maximum value in the buffer.
     Regardless of the shape of input, it is treated as a 1-d array.
 
@@ -44,7 +44,7 @@ fn argmax_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
     return result
 
 
-fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the index of the minimum value in the buffer.
     Regardless of the shape of input, it is treated as a 1-d array.
 
@@ -71,7 +71,7 @@ fn argmin_1d[dtype: DType](a: NDArray[dtype]) raises -> Scalar[DType.index]:
     return result
 
 
-fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the indices of the maximum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -98,7 +98,7 @@ fn argmax[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
 
 fn argmax[
     dtype: DType, //
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """Returns the indices of the maximum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -161,11 +161,11 @@ fn argmax[
 @always_inline
 fn find_extrema_index[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+](A: Matrix[dtype, **_]) raises -> Scalar[DType.int, **_]:
     """Find index of min/max value, either in whole matrix or along an axis."""
 
     var extreme_val = A[0, 0]
-    var extreme_idx: Scalar[DType.index] = 0
+    var extreme_idx: Scalar[DType.int] = 0
 
     for i in range(A.shape[0]):
         for j in range(A.shape[1]):
@@ -187,13 +187,13 @@ fn find_extrema_index[
 @always_inline
 fn find_extrema_index[
     dtype: DType, find_max: Bool
-](A: Matrix[dtype], axis: Optional[Int]) raises -> Matrix[DType.index]:
+](A: Matrix[dtype, **_], axis: Optional[Int]) raises -> Matrix[DType.int, **_]:
     """Find index of min/max value, either in whole matrix or along an axis."""
 
     if axis != 0 and axis != 1:
         raise Error(String("The axis can either be 1 or 0!"))
 
-    var B = Matrix[DType.index](
+    var B = Matrix[DType.int](
         shape=(A.shape[0], 1) if axis == 1 else (1, A.shape[1])
     )
 
@@ -237,19 +237,19 @@ fn find_extrema_index[
     return B^
 
 
-fn argmax[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+fn argmax[dtype: DType](A: Matrix[dtype, **_]) raises -> Scalar[DType.int]:
     """Find index of max value in a flattened matrix."""
     return find_extrema_index[dtype, True](A)
 
 
 fn argmax[
     dtype: DType
-](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[DType.int, **_]:
     """Find indices of max values along the given axis."""
     return find_extrema_index[dtype, True](A, axis)
 
 
-fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
+fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.int]:
     """Returns the indices of the minimum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -276,7 +276,7 @@ fn argmin[dtype: DType, //](a: NDArray[dtype]) raises -> Scalar[DType.index]:
 
 fn argmin[
     dtype: DType, //
-](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """Returns the indices of the minimum values of the array along an axis.
     When no axis is specified, the array is flattened.
 
@@ -309,7 +309,7 @@ fn argmin[
     return numojo.apply_along_axis[func1d=argmin_1d](a=a, axis=normalized_axis)
 
 
-fn argmin[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
+fn argmin[dtype: DType](A: Matrix[dtype, **_]) raises -> Scalar[DType.int]:
     """
     Index of the min. It is first flattened before sorting.
     """
@@ -318,7 +318,7 @@ fn argmin[dtype: DType](A: Matrix[dtype]) raises -> Scalar[DType.index]:
 
 fn argmin[
     dtype: DType
-](A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[DType.int, **_]:
     """
     Index of the min along the given axis.
     """
diff --git a/numojo/routines/sorting.mojo b/numojo/routines/sorting.mojo
index 54408959..835d91f6 100644
--- a/numojo/routines/sorting.mojo
+++ b/numojo/routines/sorting.mojo
@@ -23,6 +23,7 @@ import math
 from algorithm import vectorize
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 from numojo.core.ndshape import NDArrayShape
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
@@ -149,7 +150,7 @@ fn sort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[dtype]:
     """
     Sort the Matrix. It is first flattened before sorting.
     """
-    var I = Matrix.zeros[DType.index](shape=A.shape)
+    var I = Matrix[DType.int].zeros(shape=A.shape)
     var B = A.flatten()
     _quick_sort_inplace(B, I, 0, A.size - 1)
 
@@ -167,7 +168,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 
         for i in range(A.shape[0]):
             var row = Matrix[dtype](shape=(1, A.shape[1]), order="C")
-            var indices = Matrix.zeros[DType.index](
+            var indices = Matrix[DType.int].zeros(
                 shape=(1, A.shape[1]), order="C"
             )
 
@@ -186,7 +187,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
 
         for j in range(A.shape[1]):
             var col = Matrix[dtype](shape=(A.shape[0], 1), order="C")
-            var indices = Matrix.zeros[DType.index](
+            var indices = Matrix[DType.int].zeros(
                 shape=(A.shape[0], 1), order="C"
             )
 
@@ -203,7 +204,7 @@ fn sort[dtype: DType](var A: Matrix[dtype], axis: Int) raises -> Matrix[dtype]:
         raise Error(String("The axis can either be 1 or 0!"))
 
 
-fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
+fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort an array.
     It is not guaranteed to be unstable.
@@ -224,7 +225,7 @@ fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
     else:
         a_flattened = ravel(a)
 
-    var indices = arange[DType.index](a_flattened.size)
+    var indices = arange[DType.int](a_flattened.size)
 
     _quick_sort_inplace(a_flattened, indices)
 
@@ -233,7 +234,7 @@ fn argsort[dtype: DType](a: NDArray[dtype]) raises -> NDArray[DType.index]:
 
 fn argsort[
     dtype: DType
-](mut a: NDArray[dtype], axis: Int) raises -> NDArray[DType.index]:
+](mut a: NDArray[dtype], axis: Int) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort an array.
     It is not guaranteed to be unstable.
@@ -272,11 +273,13 @@ fn argsort[
     )
 
 
-fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
+fn argsort[
+    dtype: DType
+](A: Matrix[dtype, **_]) raises -> Matrix[DType.int, OwnData]:
     """
     Argsort the Matrix. It is first flattened before sorting.
     """
-    var I = Matrix[DType.index](shape=(1, A.size), order=A.order())
+    var I = Matrix[DType.int](shape=(1, A.size), order=A.order())
     for i in range(I.size):
         I._buf.ptr[i] = i
     var B: Matrix[dtype]
@@ -291,18 +294,18 @@ fn argsort[dtype: DType](A: Matrix[dtype]) raises -> Matrix[DType.index]:
 
 fn argsort[
     dtype: DType
-](var A: Matrix[dtype], axis: Int) raises -> Matrix[DType.index]:
+](A: Matrix[dtype, **_], axis: Int) raises -> Matrix[DType.int, OwnData]:
     """
     Argsort the Matrix along the given axis.
     """
     var order = A.order()
 
     if axis == 1:
-        var result = Matrix[DType.index](shape=A.shape, order=order)
+        var result = Matrix[DType.int, OwnData](shape=A.shape, order=order)
 
         for i in range(A.shape[0]):
             var row = Matrix[dtype](shape=(1, A.shape[1]), order="C")
-            var idx = Matrix[DType.index](shape=(1, A.shape[1]), order="C")
+            var idx = Matrix[DType.int](shape=(1, A.shape[1]), order="C")
 
             for j in range(A.shape[1]):
                 row._store(0, j, A._load(i, j))
@@ -316,11 +319,11 @@ fn argsort[
         return result^
 
     elif axis == 0:
-        var result = Matrix[DType.index](shape=A.shape, order=order)
+        var result = Matrix[DType.int, OwnData](shape=A.shape, order=order)
 
         for j in range(A.shape[1]):
             var col = Matrix[dtype](shape=(A.shape[0], 1), order="C")
-            var idx = Matrix[DType.index](shape=(A.shape[0], 1), order="C")
+            var idx = Matrix[DType.int](shape=(A.shape[0], 1), order="C")
 
             for i in range(A.shape[0]):
                 col._store(i, 0, A._load(i, j))
@@ -542,7 +545,7 @@ fn quick_sort_stable_inplace_1d[dtype: DType](mut a: NDArray[dtype]) raises:
 
 fn argsort_quick_sort_1d[
     dtype: DType
-](a: NDArray[dtype]) raises -> NDArray[DType.index]:
+](a: NDArray[dtype]) raises -> NDArray[DType.int]:
     """
     Returns the indices that would sort the buffer of an array.
     Regardless of the shape of input, it is treated as a 1-d array.
@@ -559,7 +562,7 @@ fn argsort_quick_sort_1d[
     """
 
     var result: NDArray[dtype] = a.copy()
-    var indices = arange[DType.index](result.size)
+    var indices = arange[DType.int](result.size)
     _quick_sort_inplace(result, indices)
     return indices^
 
@@ -831,7 +834,7 @@ fn _quick_sort_inplace[dtype: DType](mut A: NDArray[dtype]) raises:
 
 fn _quick_sort_inplace[
     dtype: DType
-](mut A: NDArray[dtype], mut I: NDArray[DType.index]) raises:
+](mut A: NDArray[dtype], mut I: NDArray[DType.int]) raises:
     """
     Sort in-place array's buffer using quick sort method.
     The indices are also sorted.
diff --git a/numojo/routines/statistics/averages.mojo b/numojo/routines/statistics/averages.mojo
index d8f5e406..d8718b11 100644
--- a/numojo/routines/statistics/averages.mojo
+++ b/numojo/routines/statistics/averages.mojo
@@ -15,6 +15,7 @@ from collections.optional import Optional
 import math as mt
 
 from numojo.core.ndarray import NDArray
+from numojo.core.own_data import OwnData
 import numojo.core.matrix as matrix
 from numojo.core.matrix import Matrix
 import numojo.core.utility as utility
@@ -102,7 +103,7 @@ fn mean[
 
 fn mean[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](a: Matrix[dtype]) -> Scalar[returned_dtype]:
+](a: Matrix[dtype, **_]) -> Scalar[returned_dtype]:
     """
     Calculate the arithmetic average of all items in the Matrix.
 
@@ -122,7 +123,7 @@ fn mean[
 
 fn mean[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](a: Matrix[dtype], axis: Int) raises -> Matrix[returned_dtype]:
+](a: Matrix[dtype, **_], axis: Int) raises -> Matrix[returned_dtype, OwnData]:
     """
     Calculate the arithmetic average of a Matrix along the axis.
 
@@ -373,7 +374,7 @@ fn std[
 
 fn std[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](A: Matrix[dtype], ddof: Int = 0) raises -> Scalar[returned_dtype]:
+](A: Matrix[dtype, **_], ddof: Int = 0) raises -> Scalar[returned_dtype]:
     """
     Compute the standard deviation.
 
@@ -398,7 +399,9 @@ fn std[
 
 fn std[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](A: Matrix[dtype], axis: Int, ddof: Int = 0) raises -> Matrix[returned_dtype]:
+](A: Matrix[dtype, **_], axis: Int, ddof: Int = 0) raises -> Matrix[
+    returned_dtype
+]:
     """
     Compute the standard deviation along axis.
 
@@ -505,7 +508,7 @@ fn variance[
 
 fn variance[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](A: Matrix[dtype], ddof: Int = 0) raises -> Scalar[returned_dtype]:
+](A: Matrix[dtype, **_], ddof: Int = 0) raises -> Scalar[returned_dtype]:
     """
     Compute the variance.
 
@@ -533,7 +536,9 @@ fn variance[
 
 fn variance[
     dtype: DType, //, returned_dtype: DType = DType.float64
-](A: Matrix[dtype], axis: Int, ddof: Int = 0) raises -> Matrix[returned_dtype]:
+](A: Matrix[dtype, **_], axis: Int, ddof: Int = 0) raises -> Matrix[
+    returned_dtype
+]:
     """
     Compute the variance along axis.
 
diff --git a/pixi.toml b/pixi.toml
index 02f9a46d..dd6561f6 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -1,5 +1,6 @@
 [workspace]
 channels = [
+    "https://conda.modular.com/max-nightly",
     "conda-forge",
     "https://conda.modular.com/max",
     "https://repo.prefix.dev/modular-community",
@@ -50,22 +51,48 @@ p = "clear && pixi run package"
 # format the package
 format = "pixi run mojo format ./"
 
-# test whether tests pass on the built package
-test = "pixi run package && pixi run mojo test tests -I tests/ && rm tests/numojo.mojopkg"
-t = "clear && pixi run test"
-
-# run individual tests to avoid overheat
-test_core = "pixi run package && pixi run mojo test tests/core -I tests/ && rm tests/numojo.mojopkg"
-test_creation = "pixi run package && pixi run mojo test tests/routines/test_creation.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_functional = "pixi run package && pixi run mojo test tests/routines/test_functional.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_indexing = "pixi run package && pixi run mojo test tests/routines/test_indexing.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_linalg = "pixi run package && pixi run mojo test tests/routines/test_linalg.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_manipulation = "pixi run package && pixi run mojo test tests/routines/test_manipulation.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_math = "pixi run package && pixi run mojo test tests/routines/test_math.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_random = "pixi run package && pixi run mojo test tests/routines/test_random.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_statistics = "pixi run package && pixi run mojo test tests/routines/test_statistics.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_sorting = "pixi run package && pixi run mojo test tests/routines/test_sorting.mojo -I tests/ && rm tests/numojo.mojopkg"
-test_searching = "pixi run package && pixi run mojo test tests/routines/test_searching.mojo -I tests/ && rm tests/numojo.mojopkg"
+# to run individual test files
+run-test = { cmd = "pixi run mojo run -I tests/ $TEST_FILE", env = { TEST_FILE = "" } }
+
+# Test core category
+test_core = """
+pixi run package && \
+pixi run mojo run -I tests/ tests/core/test_array_indexing_and_slicing.mojo && \
+pixi run mojo run -I tests/ tests/core/test_array_methods.mojo && \
+pixi run mojo run -I tests/ tests/core/test_bool_masks.mojo && \
+pixi run mojo run -I tests/ tests/core/test_complexArray.mojo && \
+pixi run mojo run -I tests/ tests/core/test_complexSIMD.mojo && \
+pixi run mojo run -I tests/ tests/core/test_matrix.mojo && \
+pixi run mojo run -I tests/ -D F_CONTIGUOUS tests/core/test_matrix.mojo && \
+pixi run mojo run -I tests/ tests/core/test_shape_strides_item.mojo && \
+rm tests/numojo.mojopkg
+"""
+
+# Test routines category
+test_routines = """
+pixi run package && \
+pixi run mojo run -I tests/ tests/routines/test_creation.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_functional.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_indexing.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_io.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_linalg.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_manipulation.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_math.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_random.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_statistics.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_sorting.mojo && \
+pixi run mojo run -I tests/ tests/routines/test_searching.mojo && \
+rm tests/numojo.mojopkg
+"""
+
+# Test science category
+test_signal = "pixi run package && pixi run mojo run -I tests/ tests/science/test_signal.mojo && rm tests/numojo.mojopkg"
+
+test = """
+pixi run test_core && \
+pixi run test_routines && \
+pixi run test_signal
+"""
 
 # run all final checks before a commit
 final = "pixi run format && pixi run test"
@@ -78,7 +105,7 @@ doc_pages = "mojo doc numojo/ -o docs.json"
 release = "clear && pixi run final && pixi run doc_pages"
 
 [dependencies]
-python = ">=3.13.9,<3.14"
-numpy = ">=2.3.3,<3"
-scipy = ">=1.16.2,<2"
-modular = ">=25.6.1,<26"
+python = ">=3.14.0,<3.15"
+numpy = ">=2.3.4,<3"
+scipy = ">=1.16.3,<2"
+modular = ">=26.1.0.dev2025111405,<27"
diff --git a/tests/core/test_array_indexing_and_slicing.mojo b/tests/core/test_array_indexing_and_slicing.mojo
index 8c08098f..4e9ed527 100644
--- a/tests/core/test_array_indexing_and_slicing.mojo
+++ b/tests/core/test_array_indexing_and_slicing.mojo
@@ -3,6 +3,7 @@ from numojo.prelude import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
 from python import Python
+from testing import TestSuite
 
 
 def test_getitem_scalar():
@@ -620,3 +621,7 @@ def test_3d_array_basic_slicing():
 #     nm_slice3 = nm_arr[::2, 1::2]
 #     np_sliced3 = np_arr[::2, 1::2]
 #     check(nm_slice3, np_sliced3, "F-order step [::2, 1::2] failed")
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_array_methods.mojo b/tests/core/test_array_methods.mojo
index c49a79df..6d072a8a 100644
--- a/tests/core/test_array_methods.mojo
+++ b/tests/core/test_array_methods.mojo
@@ -3,6 +3,7 @@ from python import Python
 from numojo.prelude import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close, check_values_close
+from testing import TestSuite
 
 
 def test_constructors():
@@ -148,3 +149,7 @@ def test_iterator():
             fnp_nditer_f.__next__(),
             "`_NDIter` or `nditer()` of F array by order F breaks",
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_bool_masks.mojo b/tests/core/test_bool_masks.mojo
index 33c99dab..aa65b7bb 100644
--- a/tests/core/test_bool_masks.mojo
+++ b/tests/core/test_bool_masks.mojo
@@ -3,6 +3,7 @@ from numojo import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check
 from python import Python
+from testing import TestSuite
 
 
 # TODO: there's something wrong with bool comparision even though result looks same.
@@ -65,3 +66,7 @@ def test_bool_masks_eq():
     var np_mask = np_A[np_A > 10]
     var mask = A[A > Scalar[nm.i16](10)]
     check(mask, np_mask, "Masked array")
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_complexArray.mojo b/tests/core/test_complexArray.mojo
index 1c260695..9d2fcf39 100644
--- a/tests/core/test_complexArray.mojo
+++ b/tests/core/test_complexArray.mojo
@@ -1,5 +1,6 @@
 from testing import assert_equal, assert_almost_equal
 from numojo import *
+from testing import TestSuite
 
 # TODO: Added getter and setter tests
 
@@ -104,3 +105,7 @@ fn test_complex_array_div() raises:
 
     assert_almost_equal(quot.item(0).re, 0.44, "div failed")
     assert_almost_equal(quot.item(0).im, 0.08, "div failed")
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_complexSIMD.mojo b/tests/core/test_complexSIMD.mojo
index fa0a9ec0..74f9bc66 100644
--- a/tests/core/test_complexSIMD.mojo
+++ b/tests/core/test_complexSIMD.mojo
@@ -1,5 +1,6 @@
 from testing import assert_equal, assert_almost_equal
 from numojo import *
+from testing import TestSuite
 
 
 fn test_complex_init() raises:
@@ -69,3 +70,7 @@ fn test_complex_div() raises:
     var quot = c1 / c2
     assert_almost_equal(quot.re, 0.44, " division failed")
     assert_almost_equal(quot.im, 0.08, " division failed")
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_matrix.mojo b/tests/core/test_matrix.mojo
index c22a4a23..4c0affff 100644
--- a/tests/core/test_matrix.mojo
+++ b/tests/core/test_matrix.mojo
@@ -4,6 +4,7 @@ from numojo.core.matrix import Matrix
 from python import Python, PythonObject
 from testing.testing import assert_raises, assert_true
 from sys import is_defined
+from testing import assert_equal, TestSuite
 
 alias order: String = String("F") if is_defined["F_CONTIGUOUS"]() else String(
     "C"
@@ -346,9 +347,6 @@ def test_eigen_decomposition():
     namedtuple = np.linalg.eig(Anp)
 
     np_eigenvalues = namedtuple.eigenvalues
-    print(np_eigenvalues)
-    print(Lambda.to_numpy())
-    print(np.diag(Lambda.to_numpy()))
 
     # Sort eigenvalues and eigenvectors for comparison (numpy doesn't guarantee order)
     var np_sorted_eigenvalues = np.sort(np_eigenvalues)
@@ -369,7 +367,6 @@ def test_eigen_decomposition():
 
     # Check that A = Q * Lambda * Q^T (eigendecomposition property)
     var A_reconstructed = Q @ Lambda @ Q.transpose()
-    print(A_reconstructed - A)
     assert_true(
         np.allclose(A_reconstructed.to_numpy(), Anp, atol=1e-10),
         "A ≠ Q * Lambda * Q^T",
@@ -437,19 +434,19 @@ def test_math():
         )
 
     check_matrices_close(
-        nm.cumsum(A.copy()),
+        nm.cumsum(A),
         np.cumsum(Anp),
         "`cumsum` is broken",
     )
     for i in range(2):
         check_matrices_close(
-            nm.cumsum(A.copy(), axis=i),
+            nm.cumsum(A, axis=i),
             np.cumsum(Anp, axis=i),
             String("`cumsum` by axis {i} is broken"),
         )
 
     check_matrices_close(
-        nm.cumprod(A.copy()),
+        nm.cumprod(A),
         np.cumprod(Anp),
         "`cumprod` is broken",
     )
@@ -565,3 +562,7 @@ def test_searching():
             np.argmin(Anp, axis=i),
             String("`argmin` by axis {} is broken").format(i),
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/core/test_shape_strides_item.mojo b/tests/core/test_shape_strides_item.mojo
index 163b65da..e96d712f 100644
--- a/tests/core/test_shape_strides_item.mojo
+++ b/tests/core/test_shape_strides_item.mojo
@@ -1,6 +1,7 @@
 from numojo.prelude import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 
 def test_shape():
@@ -29,3 +30,7 @@ def test_item():
         A[-1] == 4,
         msg=String("`NDArrayStrides.__getitem__()` fails: may overflow"),
     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_creation.mojo b/tests/routines/test_creation.mojo
index e32060ac..cc8c7205 100644
--- a/tests/routines/test_creation.mojo
+++ b/tests/routines/test_creation.mojo
@@ -10,6 +10,7 @@ from testing.testing import (
 from python import Python, PythonObject
 import random as builtin_random
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 
 def test_arange():
@@ -357,3 +358,7 @@ def test_arr_manipulation():
 #     assert_equal(
 #         image == image_converted_via_array, True, "Tensor conversion is broken"
 #     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_functional.mojo b/tests/routines/test_functional.mojo
index e6fef22b..592d9207 100644
--- a/tests/routines/test_functional.mojo
+++ b/tests/routines/test_functional.mojo
@@ -10,6 +10,7 @@ Test functional programming module `numojo.routines.functional`.
 from python import Python
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 from numojo.prelude import *
 
@@ -36,3 +37,7 @@ fn test_apply_along_axis() raises:
                 "`apply_along_axis` F-order array along axis {} is broken"
             ).format(i),
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_indexing.mojo b/tests/routines/test_indexing.mojo
index 72c44e2c..0a2ea4d7 100644
--- a/tests/routines/test_indexing.mojo
+++ b/tests/routines/test_indexing.mojo
@@ -10,6 +10,7 @@ Test indexing module `numojo.routines.indexing`.
 from python import Python
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 from numojo.prelude import *
 
@@ -298,3 +299,7 @@ fn test_take_along_axis_fortran_order() raises:
             " array is broken"
         ),
     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_io.mojo b/tests/routines/test_io.mojo
index 4c81167d..4b5895eb 100644
--- a/tests/routines/test_io.mojo
+++ b/tests/routines/test_io.mojo
@@ -2,6 +2,7 @@ from numojo.routines.io.files import load, save, loadtxt, savetxt
 from numojo import ones, full
 from python import Python
 import os
+from testing import TestSuite
 
 
 fn test_save_and_load() raises:
@@ -32,3 +33,7 @@ fn test_savetxt_and_loadtxt() raises:
     np.allclose(arr2.to_numpy(), arr.to_numpy())
     # Clean up
     os.remove(fname)
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_linalg.mojo b/tests/routines/test_linalg.mojo
index 81f03f8d..3de07f31 100644
--- a/tests/routines/test_linalg.mojo
+++ b/tests/routines/test_linalg.mojo
@@ -2,6 +2,7 @@ import numojo as nm
 from numojo.prelude import *
 from python import Python, PythonObject
 from utils_for_test import check, check_is_close, check_values_close
+from testing import TestSuite
 
 # ===-----------------------------------------------------------------------===#
 # Matmul
@@ -117,3 +118,7 @@ def test_misc():
             np.diagonal(np_arr, offset=i),
             String("`diagonal` by axis {} is broken").format(i),
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_manipulation.mojo b/tests/routines/test_manipulation.mojo
index ef0716b7..fc784cc3 100644
--- a/tests/routines/test_manipulation.mojo
+++ b/tests/routines/test_manipulation.mojo
@@ -3,6 +3,7 @@ from numojo import *
 from testing.testing import assert_true, assert_almost_equal, assert_equal
 from utils_for_test import check, check_is_close
 from python import Python
+from testing import TestSuite
 
 
 fn test_arr_manipulation() raises:
@@ -137,3 +138,7 @@ def test_broadcast():
         np.broadcast_to(a.to_numpy(), Python.tuple(2, 2, 2, 3)),
         "`broadcast_to` fails.",
     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_math.mojo b/tests/routines/test_math.mojo
index aa230a67..a73b62d0 100644
--- a/tests/routines/test_math.mojo
+++ b/tests/routines/test_math.mojo
@@ -8,6 +8,7 @@ from utils_for_test import (
     check_values_close,
     check_with_dtype,
 )
+from testing import TestSuite
 
 # ===-----------------------------------------------------------------------===#
 # Sums, products, differences
@@ -444,3 +445,7 @@ fn test_misc() raises:
         np.clip(cfnp, 0.02, -0.01),
         String("`clip` 3d f-order is broken"),
     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_random.mojo b/tests/routines/test_random.mojo
index 9c741000..d192e553 100644
--- a/tests/routines/test_random.mojo
+++ b/tests/routines/test_random.mojo
@@ -4,6 +4,7 @@ from numojo.prelude import *
 from python import Python, PythonObject
 from utils_for_test import check, check_is_close
 from testing.testing import assert_true, assert_almost_equal
+from testing import TestSuite
 
 
 def test_rand():
@@ -36,8 +37,8 @@ def test_randminmax():
 
 def test_randint():
     """Test random int array generation with min and max values."""
-    var arr_low_high = nm.random.randint(Shape(10, 10, 10), 0, 10)
-    var arr_high = nm.random.randint(Shape(10, 10, 10), 6)
+    var arr_low_high = nm.random.randint(Shape(30, 30, 30), 0, 10)
+    var arr_high = nm.random.randint(Shape(30, 30, 30), 6)
     var arr_low_high_mean = nm.mean(arr_low_high)
     var arr_high_mean = nm.mean(arr_high)
     assert_almost_equal(
@@ -215,3 +216,7 @@ def test_rand_exponential():
             arr_list._buf.ptr[i] >= 0,
             "Exponential distribution should only produce non-negative values",
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_searching.mojo b/tests/routines/test_searching.mojo
index 54816c80..ea77c2b9 100644
--- a/tests/routines/test_searching.mojo
+++ b/tests/routines/test_searching.mojo
@@ -1,6 +1,7 @@
 from numojo.prelude import *
 from python import Python, PythonObject
 from utils_for_test import check, check_is_close, check_values_close
+from testing import TestSuite
 
 
 fn test_argmax() raises:
@@ -223,3 +224,7 @@ fn test_take_along_axis_with_argmax_argmin() raises:
         np.take_along_axis(a2d_np, reshaped_min_indices_np, axis=1),
         "`take_along_axis` with argmin is broken",
     )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_sorting.mojo b/tests/routines/test_sorting.mojo
index 501b8683..68b53188 100644
--- a/tests/routines/test_sorting.mojo
+++ b/tests/routines/test_sorting.mojo
@@ -1,6 +1,7 @@
 import numojo as nm
 from python import Python, PythonObject
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 
 fn test_sorting() raises:
@@ -125,3 +126,7 @@ fn test_sorting() raises:
             np.sort(S.to_numpy(), axis=i, stable=True),
             String("`sort` 6d stably by axis {} is broken").format(i),
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/routines/test_statistics.mojo b/tests/routines/test_statistics.mojo
index 17e75845..11567091 100644
--- a/tests/routines/test_statistics.mojo
+++ b/tests/routines/test_statistics.mojo
@@ -4,6 +4,7 @@ from numojo.core.matrix import Matrix
 from python import Python, PythonObject
 from testing.testing import assert_raises, assert_true
 from utils_for_test import check, check_is_close
+from testing import TestSuite
 
 # ===-----------------------------------------------------------------------===#
 # Statistics
@@ -74,3 +75,7 @@ def test_mean_median_var_std():
             np.std(Anp, axis),
             String("`std` is broken for axis {}").format(axis),
         )
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()
diff --git a/tests/science/test_signal.mojo b/tests/science/test_signal.mojo
index 46e2d496..7249fc17 100644
--- a/tests/science/test_signal.mojo
+++ b/tests/science/test_signal.mojo
@@ -3,6 +3,7 @@ from numojo.prelude import *
 from python import Python, PythonObject
 from utils_for_test import check, check_is_close
 from testing.testing import assert_raises
+from testing import TestSuite
 
 
 def test_convolve2d():
@@ -16,3 +17,7 @@ def test_convolve2d():
     res1 = nm.science.signal.convolve2d(in1, in2)
     res2 = sp.signal.convolve2d(npin1, npin2, mode="valid")
     check(res1, res2, "test_convolve2d failed #2\n")
+
+
+def main():
+    TestSuite.discover_tests[__functions_in_module()]().run()