From 055e036da571aa575e072dd56366a563bbd5e843 Mon Sep 17 00:00:00 2001
From: Corey Lammie <corey.lammie@jcu.edu.au>
Date: Thu, 8 Jul 2021 10:05:39 +1000
Subject: [PATCH] Added C++ and CUDA bindings to `tile_matmul` for 1.1.2
 Release (#66)

* Added C++ and CUDA bindings for `memtorch.bh.crossbar.Tile.tile_matmul`.
* Added `Eigen` integration with C++ and CUDA bindings.
* Modularized C++ and CUDA `quantize` bindings.
---
 .clang-format-ignore                          |   1 +
 .github/workflows/build_release.yml           |  12 +-
 .github/workflows/push_pull.yml               |   6 +-
 .gitignore                                    |   3 +-
 .gitmodules                                   |   9 +-
 .pre-commit-config.yaml                       |   6 +-
 CHANGELOG.md                                  |  24 +-
 MANIFEST.in                                   |   5 +-
 docs/conf.py                                  |   2 +-
 memtorch/bh/Quantize.py                       |  95 ++++----
 memtorch/bh/crossbar/Crossbar.py              |   9 +-
 memtorch/bh/crossbar/Tile.py                  | 201 ++++++++++------
 .../bh/nonideality/FiniteConductanceStates.py |  26 +--
 memtorch/cpp/bindings.cpp                     |  14 ++
 memtorch/cpp/quantize.cpp                     | 218 ++++++++++++++++++
 memtorch/cpp/quantize.h                       |   3 +
 memtorch/cpp/quantize/quant.cpp               |  54 -----
 memtorch/cpp/tile_matmul.cpp                  | 108 +++++++++
 memtorch/cpp/tile_matmul.h                    |   1 +
 memtorch/cu/bindings.cpp                      |   9 +
 memtorch/cu/quantize.cuh                      |  77 +++++++
 memtorch/cu/quantize/gpu.cuh                  |   7 -
 memtorch/cu/quantize/quant.cu                 |  87 -------
 memtorch/cu/quantize/quant_cuda.cpp           |  38 ---
 memtorch/cu/tile_matmul.cpp                   |  54 +++++
 memtorch/cu/tile_matmul.h                     |   1 +
 memtorch/cu/tile_matmul_kernels.cu            | 166 +++++++++++++
 memtorch/cu/tile_matmul_kernels.cuh           |   5 +
 memtorch/cu/utils.cuh                         |  64 +++++
 memtorch/map/Module.py                        |   5 +-
 memtorch/mn/Conv1d.py                         |   2 +-
 memtorch/mn/Conv2d.py                         |   2 +-
 memtorch/mn/Conv3d.py                         |   2 +-
 memtorch/mn/Linear.py                         |  17 +-
 memtorch/submodules/__init__.py               |   2 -
 memtorch/submodules/eigen                     |   1 +
 .../memtorch/submodules/pytorch-playground    |   1 -
 memtorch/submodules/pytorch-playground        |   1 -
 memtorch/version.py                           |   2 +-
 profile_tile_matmul.py                        | 112 +++++++++
 setup.py                                      |  30 ++-
 tests/test_cpp_extensions.py                  |  65 +++---
 42 files changed, 1155 insertions(+), 392 deletions(-)
 create mode 100644 .clang-format-ignore
 create mode 100644 memtorch/cpp/bindings.cpp
 create mode 100644 memtorch/cpp/quantize.cpp
 create mode 100644 memtorch/cpp/quantize.h
 delete mode 100644 memtorch/cpp/quantize/quant.cpp
 create mode 100644 memtorch/cpp/tile_matmul.cpp
 create mode 100644 memtorch/cpp/tile_matmul.h
 create mode 100644 memtorch/cu/bindings.cpp
 create mode 100644 memtorch/cu/quantize.cuh
 delete mode 100644 memtorch/cu/quantize/gpu.cuh
 delete mode 100644 memtorch/cu/quantize/quant.cu
 delete mode 100644 memtorch/cu/quantize/quant_cuda.cpp
 create mode 100644 memtorch/cu/tile_matmul.cpp
 create mode 100644 memtorch/cu/tile_matmul.h
 create mode 100644 memtorch/cu/tile_matmul_kernels.cu
 create mode 100644 memtorch/cu/tile_matmul_kernels.cuh
 create mode 100644 memtorch/cu/utils.cuh
 create mode 160000 memtorch/submodules/eigen
 delete mode 160000 memtorch/submodules/memtorch/submodules/pytorch-playground
 delete mode 160000 memtorch/submodules/pytorch-playground
 create mode 100644 profile_tile_matmul.py

diff --git a/.clang-format-ignore b/.clang-format-ignore
new file mode 100644
index 00000000..42b14174
--- /dev/null
+++ b/.clang-format-ignore
@@ -0,0 +1 @@
+memtorch/cu/*
\ No newline at end of file
diff --git a/.github/workflows/build_release.yml b/.github/workflows/build_release.yml
index d6df9bf9..6f3737ab 100644
--- a/.github/workflows/build_release.yml
+++ b/.github/workflows/build_release.yml
@@ -12,6 +12,8 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
+        with:
+          submodules: recursive
       - name: Create release
         id: create_release
         uses: actions/create-release@v1
@@ -32,6 +34,8 @@ jobs:
         os: [windows-2019, macOS-10.15, ubuntu-20.04]
     steps:
       - uses: actions/checkout@v2
+        with:
+          submodules: recursive
       - uses: actions/setup-python@v2
         with:
           python-version: 3.9
@@ -41,9 +45,9 @@ jobs:
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
         env:
-          CIBW_BEFORE_BUILD_WINDOWS: pip3 install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
-          CIBW_BEFORE_BUILD_MACOS: pip3 install torch==1.8.1 -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
-          CIBW_BEFORE_BUILD_LINUX: pip3 install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
+          CIBW_BEFORE_BUILD_WINDOWS: pip3 install torch==1.9.0+cpu -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
+          CIBW_BEFORE_BUILD_MACOS: pip3 install torch==1.9.0 -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
+          CIBW_BEFORE_BUILD_LINUX: pip3 install torch==1.9.0+cpu -f https://download.pytorch.org/whl/torch_stable.html --ignore-installed
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_BUILD: cp37-* cp38-* cp39-*
           CIBW_SKIP: "*-manylinux_i686 *-win32"
@@ -66,6 +70,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
+        with:
+          submodules: recursive
       - uses: actions/setup-python@v2
         name: Install Python
         with:
diff --git a/.github/workflows/push_pull.yml b/.github/workflows/push_pull.yml
index f71b45ee..40e52c10 100644
--- a/.github/workflows/push_pull.yml
+++ b/.github/workflows/push_pull.yml
@@ -1,5 +1,9 @@
 name: CI
-on: [push, pull_request]
+on:
+  push:
+    tags-ignore:
+      - "v*"
+  pull_request:
 jobs:
   linter:
     name: Validate code formatting
diff --git a/.gitignore b/.gitignore
index 8d4ab724..a97ad3fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,6 @@ MemTorch_cpu.egg-info/
 memtorch/examples/reproduce/*.csv
 tmp/
 **/.pytest_cache/
+.vscode/
 .eggs/
-.vscode/
\ No newline at end of file
+tmp.py
diff --git a/.gitmodules b/.gitmodules
index 363d3fba..f7a479ab 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "memtorch/submodules/pytorch-playground"]
-	path = memtorch/submodules/pytorch-playground
-	url = https://github.com/coreylammie/pytorch-playground
-[submodule "memtorch/submodules/memtorch/submodules/pytorch-playground"]
-	path = memtorch/submodules/memtorch/submodules/pytorch-playground
-	url = https://github.com/coreylammie/pytorch-playground
+[submodule "memtorch/submodules/eigen"]
+	path = memtorch/submodules/eigen
+	url = https://gitlab.com/libeigen/eigen.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1eca9387..0efc805f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,14 +1,14 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 20.8b1
+    rev: 21.6b0
     hooks:
       - id: black
         language_version: python3
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.8.0
+    rev: 5.9.1
     hooks:
       - id: isort
   - repo: https://github.com/pocc/pre-commit-hooks
-    rev: python
+    rev: v1.1.1
     hooks:
       - id: clang-format
diff --git a/CHANGELOG.md b/CHANGELOG.md
index af1c16a6..b3cadca2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1 +1,23 @@
-- Transitioned from TravisCI to GitHub Actions.
+## Added
+
+1. C++ and CUDA bindings for `memtorch.bh.crossbar.Tile.tile_matmul`.
+
+Using an NVIDIA GeForce GTX 1080, a tile shape of (25, 25), and two tensors of size (500, 500), the runtime of `tile_matmul` without quantization support is reduced by 2.45x and 5.48x, for CPU-bound and GPU-bound operation, respectively. With an ADC resolution of 4 bits and an overflow rate of 0.0, the runtime of `tile_matmul` with quantization support is reduced by 2.30x and 105.27x, for CPU-bound and GPU-bound operation, respectively.
+
+| Implementation         | Runtime Without Quantization Support (s) | Runtime With Quantization Support (s) |
+| ---------------------- | ---------------------------------------- | ------------------------------------- |
+| Pure Python (Previous) | 6.917784                                 | 27.099764                             |
+| C++ (CPU-bound)        | 2.822265                                 | 11.736974                             |
+| CUDA (GPU-bound)       | 1.262861                                 | 0.2574267                             |
+
+3. `Eigen` integration with C++ and CUDA bindings.
+4. Additional unit tests.
+
+## Enhanced
+
+1. Modularized C++ and CUDA `quantize` bindings.
+2. Enhanced functionality of `naive_progam` and added additional input arguments to dictate logic for stuck devices.
+
+## Fixed
+
+1. Removed debugging code from `naive_progam`.
diff --git a/MANIFEST.in b/MANIFEST.in
index a0451b72..1d52dc83 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,4 @@
-include memtorch/cu/quantize/gpu.cuh
+graft memtorch/submodules/eigen
+include memtorch/cpp/*.h
+include memtorch/cu/*.h
+include memtorch/cu/*.cuh
diff --git a/docs/conf.py b/docs/conf.py
index 2f26d8ea..cf9fe943 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,7 +21,7 @@
 author = "Corey Lammie"
 
 # The full version, including alpha/beta/rc tags
-release = "1.1.1"
+release = "1.1.2"
 autodoc_inherit_docstrings = False
 
 # -- General configuration ---------------------------------------------------
diff --git a/memtorch/bh/Quantize.py b/memtorch/bh/Quantize.py
index 46dacc0c..d6ff8e2e 100644
--- a/memtorch/bh/Quantize.py
+++ b/memtorch/bh/Quantize.py
@@ -1,30 +1,41 @@
-# Wrapper for the pytorch-playground quant.py script
-import importlib
+import copy
 
-utee = importlib.import_module(".utee", "memtorch.submodules.pytorch-playground")
 import numpy as np
 import torch
 
-quant_methods = ["linear", "log", "tanh"]
+import memtorch
+import memtorch_bindings
 
+quant_methods = ["linear", "log"]
 
-def quantize(input, bits, overflow_rate, quant_method="linear", min=None, max=None):
+
+def quantize(
+    tensor,
+    quant,
+    overflow_rate=0.0,
+    quant_method=None,
+    min=float("nan"),
+    max=float("nan"),
+    override_original=False,
+):
     """Method to quantize a tensor.
 
     Parameters
     ----------
-    input : tensor
+    tensor : tensor
         Input tensor.
-    bits : int
-        Bit width.
-    overflow_rate : float
-        Overflow rate threshold for linear quanitzation.
-    quant_method : str
-        Quantization method. Must be in ['linear', 'log', 'tanh'].
-    min : float
-        Minimum value to clip values to.
-    max : float
-        Maximum value to clip values to.
+    quant : int
+        Bit width (if quant_method is not None) or the number of discrete quantization levels (if quant_method is None).
+    overflow_rate : float, optional
+        Overflow rate threshold for linear quantization.
+    quant_method : str, optional
+        Quantization method. Must be in quant_methods.
+    min : float or tensor, optional
+        Minimum value(s) to clip numbers to.
+    max : float or tensor, optional
+        Maximum value(s) to clip numbers to.
+    override_original : bool, optional
+        Whether to override the original tensor (True) or not (False).
 
     Returns
     -------
@@ -32,26 +43,32 @@ def quantize(input, bits, overflow_rate, quant_method="linear", min=None, max=No
         Quantized tensor.
 
     """
-    assert type(bits) == int and bits > 0, "bits must be an integer > 0."
-    assert overflow_rate >= 0 and overflow_rate <= 1, "overflow_rate value invalid."
-    assert quant_method in quant_methods, "quant_method is not valid."
-    pass
-    if min is not None:
-        input = input.clip(min=min)
-
-    if max is not None:
-        input = input.clip(max=max)
-
-    if torch.unique(input).numel() == 1:
-        return input
-
-    if quant_method == "linear":
-        sf = bits - 1 - utee.compute_integral_part(input, overflow_rate)
-        return utee.linear_quantize(input, sf, bits)
-    elif quant_method == "log":
-        log_abs_input = torch.log(torch.abs(input))
-        log_abs_input[log_abs_input == float("-inf")] = 1e-12
-        sf = bits - 1 - utee.compute_integral_part(log_abs_input, overflow_rate)
-        return utee.log_linear_quantize(input, sf, bits)
-    elif quant_method == "tanh":
-        return utee.tanh_quantize(input, bits)
+    device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
+    assert (
+        overflow_rate >= 0 and overflow_rate <= 1
+    ), "overflow_rate must be >= 0 and <= 1."
+    assert (
+        type(quant) == int and quant > 0
+    ), "The bit width or number of discrete quantization levels must be a positive integer."
+    if type(min) == int:
+        min = float(min)
+    if type(max) == int:
+        max = float(max)
+    if not override_original:
+        tensor = copy.deepcopy(tensor)
+    if quant_method is not None:
+        assert quant_method in quant_methods, "quant_method is invalid."
+        tensor = tensor.cpu()
+        memtorch_bindings.quantize(
+            tensor,
+            bits=quant,
+            overflow_rate=overflow_rate,
+            quant_method=quant_methods.index(quant_method),
+            min=min,
+            max=max,
+        )
+    else:
+        tensor = tensor.cpu()
+        memtorch_bindings.quantize(tensor, n_quant_levels=quant, min=min, max=max)
+
+    return tensor.to(device)
diff --git a/memtorch/bh/crossbar/Crossbar.py b/memtorch/bh/crossbar/Crossbar.py
index 0f094265..296b7fbf 100644
--- a/memtorch/bh/crossbar/Crossbar.py
+++ b/memtorch/bh/crossbar/Crossbar.py
@@ -10,9 +10,8 @@
 import torch.nn as nn
 
 import memtorch
-from memtorch.utils import pad_tensor
 
-from .Tile import gen_tiles, tile_matmul
+from .Tile import gen_tiles
 
 
 @unique
@@ -443,7 +442,7 @@ def simulate_matmul(
     ADC_overflow_rate : float
         Overflow rate threshold for linear quanitzation (if ADC_resolution is not None).
     quant_method:
-        Quantization method. Must be in ['linear', 'log', 'log_minmax', 'minmax', 'tanh'], or None.
+        Quantization method. Must be in memtorch.bh.Quantize.quant_methods.
 
     Returns
     -------
@@ -497,7 +496,7 @@ def simulate_matmul(
         if quant_method is not None:
             mat_res_ = memtorch.bh.Quantize.quantize(
                 mat_res_,
-                bits=ADC_resolution,
+                quant=ADC_resolution,
                 overflow_rate=ADC_overflow_rate,
                 quant_method=quant_method,
             )
@@ -552,7 +551,7 @@ def tile_simulate_matmul_row(
                     if quant_method is not None:
                         partial_sum[j] += memtorch.bh.Quantize.quantize(
                             mat_res.squeeze(),
-                            bits=ADC_resolution,
+                            quant=ADC_resolution,
                             overflow_rate=ADC_overflow_rate,
                             quant_method=quant_method,
                         )
diff --git a/memtorch/bh/crossbar/Tile.py b/memtorch/bh/crossbar/Tile.py
index c0424267..f892cc08 100644
--- a/memtorch/bh/crossbar/Tile.py
+++ b/memtorch/bh/crossbar/Tile.py
@@ -7,6 +7,11 @@
 
 import memtorch
 
+if "cpu" in memtorch.__version__:
+    import memtorch_bindings
+else:
+    import memtorch_cuda_bindings as memtorch_bindings
+
 
 class Tile:
     """Class used to create modular crossbar tiles to represent 2D matrices.
@@ -139,6 +144,79 @@ def gen_tiles(tensor, tile_shape, input=False):
     return tiles, tiles_map
 
 
+def tile_matmul_row(
+    mat_a_row_tiles,
+    mat_a_tiles_map,
+    mat_b_tiles,
+    mat_b_tiles_map,
+    mat_b_shape,
+    ADC_resolution=None,
+    ADC_overflow_rate=0.0,
+    quant_method=None,
+):
+    """Method to perform row-wise tile matrix multiplication, given two sets of tiles, using a pythonic approach.
+
+    Parameters
+    ----------
+    mat_a_row_tiles : torch.tensor
+        Tiles representing a row of matrix A.
+    mat_a_tiles_map : torch.tensor
+        Tiles map for matrix A.
+    mat_b_tiles : torch.tensor
+        Tiles representing matrix B.
+    mat_b_tiles_map : torch.tensor
+        Tiles map for matrix B.
+    mat_b_shape : (int, int)
+        Shape of matrix B.
+    ADC_resolution : int
+        ADC resolution (bit width). If None, quantization noise is not accounted for.
+    ADC_overflow_rate : float
+        Overflow rate threshold for linear quanitzation (if ADC_resolution is not None).
+    quant_method: str
+        Quantization method. Must be in memtorch.bh.Quantize.quant_methods.
+
+    Returns
+    -------
+    torch.tensor
+        Output tensor.
+    """
+    device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
+    if quant_method is not None:
+        assert (
+            ADC_resolution is not None
+            and type(ADC_resolution) == int
+            and ADC_resolution > 0
+        ), "ADC resolution is invalid."
+        assert (
+            quant_method in memtorch.bh.Quantize.quant_methods
+        ), "quant_method is not valid."
+        assert (
+            ADC_overflow_rate is not None
+        ), "ADC_overflow_rate must be specified if quant_method is not None."
+
+    tile_shape = mat_b_tiles.shape[-2:]
+    partial_sum = torch.zeros((mat_b_tiles_map.shape[1], tile_shape[1])).to(device)
+    for j in range(mat_b_tiles_map.shape[1]):
+        for i in range(mat_b_tiles_map.shape[0]):
+            tile_a = mat_a_row_tiles[int(mat_a_tiles_map[i])]
+            tile_b = mat_b_tiles[int(mat_b_tiles_map[i][j])]
+            if quant_method is not None:
+                partial_sum[j] += memtorch.bh.Quantize.quantize(
+                    torch.matmul(tile_a.to(device), tile_b.to(device)).squeeze(),
+                    quant=ADC_resolution,
+                    overflow_rate=ADC_overflow_rate,
+                    quant_method=quant_method,
+                )
+            else:
+                partial_sum[j] += torch.matmul(
+                    tile_a.to(device), tile_b.to(device)
+                ).squeeze()
+
+    output_act = partial_sum.flatten()
+    output_act = output_act[: mat_b_shape[1]]
+    return output_act
+
+
 def tile_matmul(
     mat_a_tiles,
     mat_a_tiles_map,
@@ -149,6 +227,8 @@ def tile_matmul(
     ADC_resolution=None,
     ADC_overflow_rate=0.0,
     quant_method=None,
+    use_bindings=True,
+    cuda_malloc_heap_size=50,
 ):
     """Method to perform 2D matrix multiplication, given two sets of tiles.
 
@@ -170,75 +250,69 @@ def tile_matmul(
         ADC resolution (bit width). If None, quantization noise is not accounted for.
     ADC_overflow_rate : float
         Overflow rate threshold for linear quanitzation (if ADC_resolution is not None).
-    quant_method:
-        Quantization method. Must be in ['linear', 'log', 'log_minmax', 'minmax', 'tanh'], or None.
+    quant_method: str
+        Quantization method. Must be in memtorch.bh.Quantize.quant_methods.
+    use_bindings : bool
+        Use C++/CUDA bindings to parallelize tile_matmul operations (True).
+    cuda_malloc_heap_size : int
+        cudaLimitMallocHeapSize (in MB) to determine allocatable kernel heap memory if CUDA is used.
 
     Returns
     -------
     torch.tensor
         Output tensor.
     """
-
-    def tile_matmul_row(
-        mat_a_row_tiles,
-        mat_a_tiles_map,
-        mat_a_shape,
-        mat_b_tiles,
-        mat_b_tiles_map,
-        mat_b_shape,
-        ADC_resolution=None,
-        ADC_overflow_rate=0.0,
-        quant_method=None,
-    ):
-        device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
-        if quant_method is not None:
-            assert (
-                ADC_resolution is not None
-                and type(ADC_resolution) == int
-                and ADC_resolution > 0
-            ), "ADC resolution is invalid."
-            assert (
-                quant_method in memtorch.bh.Quantize.quant_methods
-            ), "quant_method is not valid."
-            assert (
-                ADC_overflow_rate is not None
-            ), "ADC_overflow_rate must be specified if quant_method is not None."
-
-        tile_shape = mat_b_tiles.shape[-2:]
-        partial_sum = torch.zeros((mat_b_tiles_map.shape[1], tile_shape[1])).to(device)
-        for j in range(mat_b_tiles_map.shape[1]):
-            for i in range(mat_b_tiles_map.shape[0]):
-                tile_a = mat_a_row_tiles[int(mat_a_tiles_map[i])]
-                tile_b = mat_b_tiles[int(mat_b_tiles_map[i][j])]
-                if quant_method is not None:
-                    partial_sum[j] += memtorch.bh.Quantize.quantize(
-                        torch.matmul(tile_a.to(device), tile_b.to(device)).squeeze(),
-                        bits=ADC_resolution,
-                        overflow_rate=ADC_overflow_rate,
-                        quant_method=quant_method,
-                    )
-                else:
-                    partial_sum[j] += torch.matmul(
-                        tile_a.to(device), tile_b.to(device)
-                    ).squeeze()
-
-        output_act = partial_sum.flatten()
-        output_act = output_act[: mat_b_shape[1]]
-        return output_act
-
     assert (
         mat_a_tiles.shape[-1] == mat_b_tiles.shape[-2]
         and len(mat_a_tiles.shape) == 3
         and len(mat_b_tiles.shape) == 3
         and mat_a_tiles.shape[-2] != 0
     ), "Incompatible tile shapes used."
-    result = torch.zeros((mat_a_shape[0], mat_b_shape[1]))
-    if mat_a_tiles.shape[-2] > 1:
-        for row_idx in range(mat_a_tiles.shape[-2]):
-            result[row_idx] = tile_matmul_row(
-                mat_a_tiles[:, row_idx, :],
-                mat_a_tiles_map,
+    if use_bindings:
+        if quant_method is None:
+            return memtorch_bindings.tile_matmul(
+                mat_a_tiles.contiguous(),
+                mat_a_tiles_map.contiguous(),
                 mat_a_shape,
+                mat_b_tiles.contiguous(),
+                mat_b_tiles_map.contiguous(),
+                mat_b_shape,
+                cuda_malloc_heap_size,
+            )
+        else:
+            assert (
+                quant_method in memtorch.bh.Quantize.quant_methods
+            ), "quant_method is invalid."
+            return memtorch_bindings.tile_matmul(
+                mat_a_tiles.contiguous(),
+                mat_a_tiles_map.contiguous(),
+                mat_a_shape,
+                mat_b_tiles.contiguous(),
+                mat_b_tiles_map.contiguous(),
+                mat_b_shape,
+                ADC_resolution,
+                ADC_overflow_rate,
+                memtorch.bh.Quantize.quant_methods.index(quant_method),
+                cuda_malloc_heap_size,
+            )
+    else:
+        result = torch.zeros((mat_a_shape[0], mat_b_shape[1]))
+        if mat_a_tiles.shape[-2] > 1:
+            for row_idx in range(mat_a_tiles.shape[-2]):
+                result[row_idx] = tile_matmul_row(
+                    mat_a_tiles[:, row_idx, :],
+                    mat_a_tiles_map,
+                    mat_b_tiles,
+                    mat_b_tiles_map,
+                    mat_b_shape,
+                    ADC_resolution,
+                    ADC_overflow_rate,
+                    quant_method,
+                )
+        else:
+            result = tile_matmul_row(
+                mat_a_tiles,
+                mat_a_tiles_map,
                 mat_b_tiles,
                 mat_b_tiles_map,
                 mat_b_shape,
@@ -246,17 +320,4 @@ def tile_matmul_row(
                 ADC_overflow_rate,
                 quant_method,
             )
-    else:
-        result = tile_matmul_row(
-            mat_a_tiles,
-            mat_a_tiles_map,
-            mat_a_shape,
-            mat_b_tiles,
-            mat_b_tiles_map,
-            mat_b_shape,
-            ADC_resolution,
-            ADC_overflow_rate,
-            quant_method,
-        )
-
-    return result
+        return result
diff --git a/memtorch/bh/nonideality/FiniteConductanceStates.py b/memtorch/bh/nonideality/FiniteConductanceStates.py
index e7033534..c5dd538d 100644
--- a/memtorch/bh/nonideality/FiniteConductanceStates.py
+++ b/memtorch/bh/nonideality/FiniteConductanceStates.py
@@ -6,20 +6,15 @@
 
 import memtorch
 
-if "cpu" in memtorch.__version__:
-    import quantization
-else:
-    import cuda_quantization as quantization
 
-
-def apply_finite_conductance_states(layer, num_conductance_states):
+def apply_finite_conductance_states(layer, n_conductance_states):
     """Method to model a finite number of conductance states for devices within a memristive layer.
 
     Parameters
     ----------
     layer : memtorch.mn
         A memrstive layer.
-    num_conductance_states : int
+    n_conductance_states : int
         Number of finite conductance states to model.
 
     Returns
@@ -29,10 +24,10 @@ def apply_finite_conductance_states(layer, num_conductance_states):
     """
     device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
     assert (
-        int(num_conductance_states) == num_conductance_states
-    ), "num_conductance_states must be a whole number."
+        int(n_conductance_states) == n_conductance_states
+    ), "n_conductance_states must be a whole number."
 
-    def apply_finite_conductance_states_to_crossbar(crossbar, num_conductance_states):
+    def apply_finite_conductance_states_to_crossbar(crossbar, n_conductance_states):
         crossbar.update()
         conductance_matrix_ = copy.deepcopy(crossbar.conductance_matrix)
         try:
@@ -68,11 +63,12 @@ def apply_finite_conductance_states_to_crossbar(crossbar, num_conductance_states
 
             conductance_matrix_shape = crossbar.conductance_matrix.shape
             conductance_matrix = crossbar.conductance_matrix.view(-1)
-            quantization.quantize(
+            memtorch.bh.Quantize.quantize(
                 conductance_matrix,
-                num_conductance_states,
-                1 / r_off.view(-1),
-                1 / r_on.view(-1),
+                n_conductance_states,
+                min=1 / r_off.view(-1),
+                max=1 / r_on.view(-1),
+                override_original=True,
             )
             conductance_matrix = conductance_matrix.view(conductance_matrix_shape)
             conductance_matrix[0]
@@ -86,7 +82,7 @@ def apply_finite_conductance_states_to_crossbar(crossbar, num_conductance_states
 
     for i in range(len(layer.crossbars)):
         layer.crossbars[i] = apply_finite_conductance_states_to_crossbar(
-            layer.crossbars[i], num_conductance_states
+            layer.crossbars[i], n_conductance_states
         )
 
     return layer
diff --git a/memtorch/cpp/bindings.cpp b/memtorch/cpp/bindings.cpp
new file mode 100644
index 00000000..f7c76dec
--- /dev/null
+++ b/memtorch/cpp/bindings.cpp
@@ -0,0 +1,14 @@
+#include <ATen/ATen.h>
+#include <cmath>
+#include <torch/extension.h>
+
+#include "quantize.h"
+#include "tile_matmul.h"
+
+void quantize_bindings(py::module_ &);
+void tile_matmul_bindings(py::module_ &);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  quantize_bindings(m);
+  tile_matmul_bindings(m);
+}
\ No newline at end of file
diff --git a/memtorch/cpp/quantize.cpp b/memtorch/cpp/quantize.cpp
new file mode 100644
index 00000000..d3a63a66
--- /dev/null
+++ b/memtorch/cpp/quantize.cpp
@@ -0,0 +1,218 @@
+#include <ATen/ATen.h>
+#include <cmath>
+#include <torch/extension.h>
+
+void quantize_element(float *tensor, int index, float *quant_levels,
+                      int num_quant_levels) {
+  int middle_point;         // Middle point
+  int optimal_point = 0;    // Optimal point
+  int l = 0;                // Lower bound
+  int h = num_quant_levels; // Higher bound
+  float difference =
+      std::numeric_limits<float>().max(); // Difference between a given point
+                                          // and the current middle point
+  while (l <= h) {
+    middle_point = l + (h - l) / 2;
+    if (fabs(tensor[index] - quant_levels[middle_point]) < difference) {
+      difference = fabs(tensor[index] - quant_levels[middle_point]);
+      optimal_point = middle_point;
+    }
+    if (quant_levels[middle_point] < tensor[index]) {
+      l = middle_point + 1;
+    } else {
+      h = middle_point - 1;
+    }
+  }
+  tensor[index] = quant_levels[optimal_point];
+}
+
+float det_integral(at::Tensor tensor, float overflow_rate, float min,
+                   float max) {
+  if (overflow_rate > 1.0) {
+    throw std::invalid_argument("Invalid overflow_rate value.");
+  } else {
+    tensor = std::get<0>(at::sort(at::flatten(at::abs(tensor)), -1, true));
+    int64_t tensor_numel = tensor.numel();
+    if ((min != NULL) || (max != NULL)) {
+      float max_bound;
+      if ((min != NULL) && (max != NULL)) {
+        max_bound = std::max(std::abs(min), std::abs(max));
+      } else if (min != NULL) {
+        max_bound = std::abs(min);
+      } else if (max != NULL) {
+        max_bound = std::abs(max);
+      }
+      if (max_bound > tensor[0].item<float>()) {
+        tensor[0] = max_bound;
+      }
+    }
+    return ceilf(
+        log2f(tensor[std::min<float>((int)round(overflow_rate * tensor_numel),
+                                     tensor_numel - 1)]
+                  .item<float>() +
+              1e-12f));
+  }
+}
+
+float det_sf(at::Tensor tensor, int bits, float overflow_rate, float min,
+             float max) {
+  return 1 - bits + det_integral(tensor, overflow_rate, min, max);
+}
+
+at::Tensor linear_quantize(at::Tensor tensor, float sf, int bits,
+                           float overflow_rate) {
+  float delta = powf(2.0f, sf);
+  float bound = powf(2.0f, bits - 1);
+  return at::clamp(at::floor(tensor / powf(2.0f, sf) + 0.5), -bound,
+                   bound - 1) *
+         delta;
+}
+
+void set_average(at::Tensor tensor, float *input_tensor_ptr) {
+  float mean_value = at::flatten(tensor).mean().item<float>();
+#pragma omp parallel for
+  for (int i = 0; i < tensor.numel(); i++) {
+    input_tensor_ptr[i] = mean_value;
+  }
+}
+
+void parse_min_max(float *min, float *max) {
+  if (isnan(*min)) {
+    *min = NULL;
+  }
+  if (isnan(*max)) {
+    *max = NULL;
+  }
+}
+
+void quantize(at::Tensor tensor, int n_quant_levels, float min = NULL,
+              float max = NULL) {
+  parse_min_max(&min, &max);
+  float *input_tensor_ptr = tensor.data_ptr<float>();
+  if (n_quant_levels == 1) {
+    set_average(tensor, input_tensor_ptr);
+    return;
+  }
+  if (min == NULL) {
+    min = at::flatten(tensor).min().item<float>();
+  }
+  if (max == NULL) {
+    max = at::flatten(tensor).max().item<float>();
+  }
+  at::Tensor quant_levels = at::linspace(min, max, n_quant_levels);
+#pragma omp parallel for
+  for (int i = 0; i < tensor.numel(); i++) {
+    quantize_element(input_tensor_ptr, i, quant_levels.data_ptr<float>(),
+                     n_quant_levels);
+  }
+  return;
+}
+
+void quantize(at::Tensor tensor, int n_quant_levels, at::Tensor min,
+              at::Tensor max) {
+  float *input_tensor_ptr = tensor.data_ptr<float>();
+  if (n_quant_levels == 1) {
+    set_average(tensor, input_tensor_ptr);
+    return;
+  }
+  float *min_ptr = min.data_ptr<float>();
+  float *max_ptr = max.data_ptr<float>();
+
+#pragma omp parallel for
+  for (int i = 0; i < tensor.numel(); i += 1) {
+    torch::Tensor quant_levels =
+        at::linspace(min_ptr[i], max_ptr[i], n_quant_levels);
+    quantize_element(input_tensor_ptr, i, quant_levels.data_ptr<float>(),
+                     n_quant_levels);
+  }
+}
+
+void quantize(at::Tensor tensor, int bits, float overflow_rate,
+              int quant_method = 0, float min = NULL, float max = NULL) {
+  parse_min_max(&min, &max);
+  float *input_tensor_ptr = tensor.data_ptr<float>();
+  float *quantized_tensor_ptr = nullptr;
+  if ((int)at::numel(std::get<0>(at::unique_consecutive(tensor))) == 1) {
+    return;
+  } else {
+    if (bits == 1) {
+      set_average(tensor, input_tensor_ptr);
+      return;
+    } else {
+      if (min != NULL) {
+        tensor = at::clamp_min(tensor, min);
+      }
+      if (max != NULL) {
+        tensor = at::clamp_max(tensor, max);
+      }
+      if ((quant_method == 0) || (quant_method == 1)) {
+        if (quant_method == 0) {
+          // linear
+          at::Tensor quantized_tensor = linear_quantize(
+              tensor, det_sf(tensor, bits, overflow_rate, min, max), bits,
+              overflow_rate);
+          float *quantized_tensor_ptr = quantized_tensor.data_ptr<float>();
+#pragma omp parallel for
+          for (int i = 0; i < tensor.numel(); i++) {
+            input_tensor_ptr[i] = quantized_tensor_ptr[i];
+          }
+        } else {
+          // log
+          at::Tensor s = at::sign(tensor);
+          float sf = det_sf(tensor, bits, overflow_rate, min, max);
+          tensor = at::log(at::abs(tensor)).clamp_min_(1e-20f);
+          at::Tensor quantized_tensor =
+              at::exp(linear_quantize(tensor, sf, bits - 1, overflow_rate)) * s;
+          float *quantized_tensor_ptr = quantized_tensor.data_ptr<float>();
+#pragma omp parallel for
+          for (int i = 0; i < tensor.numel(); i++) {
+            input_tensor_ptr[i] = quantized_tensor_ptr[i];
+          }
+        }
+      } else {
+        throw std::invalid_argument(
+            "Invalid quant_method: 0 -> linear, 1 -> log.");
+      }
+    }
+  }
+}
+
+void quantize_bindings(py::module_ &m) {
+  // Binding for void quantize(at::Tensor tensor, int n_quant_levels, float min
+  // = NULL, float max = NULL)
+  m.def(
+      "quantize",
+      [](at::Tensor tensor, int n_quant_levels, float min, float max) {
+        return quantize(tensor, n_quant_levels, min, max);
+      },
+      py::arg("tensor"), py::arg("n_quant_levels"), py::arg("min") = NULL,
+      py::arg("max") = NULL);
+  // Binding for void quantize(at::Tensor tensor, int n_quant_levels, at::Tensor
+  // min, at::Tensor max)
+  m.def(
+      "quantize",
+      [](at::Tensor tensor, int n_quant_levels, at::Tensor min,
+         at::Tensor max) { return quantize(tensor, n_quant_levels, min, max); },
+      py::arg("tensor"), py::arg("n_quant_levels"), py::arg("min"),
+      py::arg("max"));
+  // Bindings for void quantize(at::Tensor tensor, int bits, float
+  // overflow_rate, int quant_method = 0, float min = NULL, float max = NULL)
+  m.def(
+      "quantize",
+      [](at::Tensor tensor, int bits, float overflow_rate, int quant_method,
+         float min, float max) {
+        return quantize(tensor, bits, overflow_rate, quant_method, min, max);
+      },
+      py::arg("tensor"), py::arg("bits"), py::arg("overflow_rate"),
+      py::arg("quant_method") = 0, py::arg("min") = NULL,
+      py::arg("max") = NULL);
+  m.def(
+      "quantize",
+      [](at::Tensor tensor, int bits, float overflow_rate, int quant_method,
+         float min, float max) {
+        return quantize(tensor, bits, overflow_rate, quant_method, min, max);
+      },
+      py::arg("tensor"), py::arg("bits"), py::arg("overflow_rate") = 0.,
+      py::arg("quant_method") = 0, py::arg("min") = NULL,
+      py::arg("max") = NULL);
+}
\ No newline at end of file
diff --git a/memtorch/cpp/quantize.h b/memtorch/cpp/quantize.h
new file mode 100644
index 00000000..5ad0f5ce
--- /dev/null
+++ b/memtorch/cpp/quantize.h
@@ -0,0 +1,3 @@
+void quantize_bindings(py::module_ &m);
+void quantize(at::Tensor tensor, int bits, float overflow_rate,
+              int quant_method = 0, float min = NULL, float max = NULL);
\ No newline at end of file
diff --git a/memtorch/cpp/quantize/quant.cpp b/memtorch/cpp/quantize/quant.cpp
deleted file mode 100644
index b7b59d15..00000000
--- a/memtorch/cpp/quantize/quant.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#include <ATen/ATen.h>
-#include <cmath>
-#include <torch/extension.h>
-
-void quantize_element(float *tensor, int index, float *quant_levels,
-                      int num_quant_levels) {
-  int middle_point;         // Middle point
-  int optimal_point = 0;    // Optimal point
-  int l = 0;                // Lower bound
-  int h = num_quant_levels; // Higher bound
-  float difference =
-      1.0f; // Difference between a given point and the current middle point
-  while (l <= h) {
-    middle_point = l + (h - l) / 2;
-    if (fabs(tensor[index] - quant_levels[middle_point]) < difference) {
-      difference = fabs(tensor[index] - quant_levels[middle_point]);
-      optimal_point = middle_point;
-    }
-    if (quant_levels[middle_point] < tensor[index]) {
-      l = middle_point + 1;
-    } else {
-      h = middle_point - 1;
-    }
-  }
-  tensor[index] = quant_levels[optimal_point];
-}
-
-void quant(at::Tensor tensor, int num_quant_levels, float min_value,
-           float max_value) {
-  torch::Tensor quant_levels =
-      at::linspace(min_value, max_value, num_quant_levels);
-  for (int i = 0; i < tensor.numel(); i += 1) {
-    quantize_element(tensor.data_ptr<float>(), i,
-                     quant_levels.data_ptr<float>(), num_quant_levels);
-  }
-}
-
-void quant(at::Tensor tensor, int num_quant_levels, at::Tensor min_values,
-           at::Tensor max_values) {
-  float *min_values_ = min_values.data_ptr<float>();
-  float *max_values_ = max_values.data_ptr<float>();
-  for (int i = 0; i < tensor.numel(); i += 1) {
-    torch::Tensor quant_levels =
-        at::linspace(min_values_[i], max_values_[i], num_quant_levels);
-    quantize_element(tensor.data_ptr<float>(), i,
-                     quant_levels.data_ptr<float>(), num_quant_levels);
-  }
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("quantize", (void (*)(at::Tensor, int, float, float)) & quant, "tbd");
-  m.def("quantize", (void (*)(at::Tensor, int, at::Tensor, at::Tensor)) & quant,
-        "tbd");
-}
diff --git a/memtorch/cpp/tile_matmul.cpp b/memtorch/cpp/tile_matmul.cpp
new file mode 100644
index 00000000..be39ef09
--- /dev/null
+++ b/memtorch/cpp/tile_matmul.cpp
@@ -0,0 +1,108 @@
+#include <ATen/ATen.h>
+#include <cmath>
+#include <torch/extension.h>
+
+#include "quantize.h"
+using namespace torch::indexing;
+
+at::Tensor tile_matmul(at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+                       int mat_a_shape[2], at::Tensor mat_b_tiles,
+                       at::Tensor mat_b_tiles_map, int mat_b_shape[2]) {
+  int mat_a_rows = mat_a_tiles.sizes().end()[-2];
+  c10::IntArrayRef mat_b_tiles_shape = mat_b_tiles.sizes();
+  c10::IntArrayRef mat_b_tiles_map_shape = mat_b_tiles_map.sizes();
+  at::Tensor partial_sum =
+      at::zeros({mat_b_tiles_map_shape[1], mat_b_tiles_shape.back()});
+  at::Tensor result = at::zeros({mat_a_shape[0], mat_b_shape[1]});
+#pragma omp parallel for
+  for (int i = 0; i < mat_a_rows; i++) {
+    at::Tensor mat_a_row_tiles = mat_a_tiles.index({Slice(), i, Slice()});
+    for (int j = 0; j < mat_b_tiles_map_shape[0]; j++) {
+      at::Tensor tile_a = mat_a_row_tiles[mat_a_tiles_map[j].item<int>()];
+      for (int k = 0; k < mat_b_tiles_map_shape[1]; k++) {
+        at::Tensor tile_b = mat_b_tiles[mat_b_tiles_map[j][k].item<int>()];
+        partial_sum[k] += at::matmul(tile_a, tile_b).squeeze();
+      }
+      result.index_put_({i, Slice()}, result.index({i, Slice()}) +
+                                          partial_sum.flatten().index(
+                                              {Slice(0, mat_b_shape[1])}));
+      partial_sum = partial_sum.zero_();
+    }
+  }
+  return result;
+}
+
+at::Tensor tile_matmul(at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+                       int mat_a_shape[2], at::Tensor mat_b_tiles,
+                       at::Tensor mat_b_tiles_map, int mat_b_shape[2],
+                       int ADC_resolution, float ADC_overflow_rate,
+                       int quant_method) {
+  int mat_a_rows = mat_a_tiles.sizes().end()[-2];
+  c10::IntArrayRef mat_b_tiles_shape = mat_b_tiles.sizes();
+  c10::IntArrayRef mat_b_tiles_map_shape = mat_b_tiles_map.sizes();
+  at::Tensor partial_sum =
+      at::zeros({mat_b_tiles_map_shape[1], mat_b_tiles_shape.back()});
+  at::Tensor result = at::zeros({mat_a_shape[0], mat_b_shape[1]});
+#pragma omp parallel for
+  for (int i = 0; i < mat_a_rows; i++) {
+    at::Tensor mat_a_row_tiles = mat_a_tiles.index({Slice(), i, Slice()});
+    for (int j = 0; j < mat_b_tiles_map_shape[0]; j++) {
+      partial_sum =
+          at::zeros({mat_b_tiles_map_shape[1], mat_b_tiles_shape.back()});
+      at::Tensor tile_a = mat_a_row_tiles[mat_a_tiles_map[j].item<int>()];
+      for (int k = 0; k < mat_b_tiles_map_shape[1]; k++) {
+        at::Tensor tile_b = mat_b_tiles[mat_b_tiles_map[j][k].item<int>()];
+        at::Tensor result = at::matmul(tile_a, tile_b).squeeze();
+        quantize(result, ADC_resolution, ADC_overflow_rate, quant_method);
+        partial_sum[k] += result;
+      }
+      partial_sum = partial_sum.flatten().index({Slice(0, mat_b_shape[1])});
+      result.index_put_({i, Slice()}, result.index({i, Slice()}) + partial_sum);
+    }
+  }
+  return result;
+}
+
+void tile_matmul_bindings(py::module_ &m) {
+  m.def(
+      "tile_matmul",
+      [](at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+         std::tuple<int, float> mat_a_shape, at::Tensor mat_b_tiles,
+         at::Tensor mat_b_tiles_map, std::tuple<int, float> mat_b_shape,
+         int cuda_malloc_heap_size) {
+        assert((std::tuple_size<int, float>(mat_a_shape) == 2) &&
+               (std::tuple_size<int, float>(mat_b_shape) == 2));
+        int mat_a_shape_array[2] = {(int)std::get<0>(mat_a_shape),
+                                    (int)std::get<1>(mat_a_shape)};
+        int mat_b_shape_array[2] = {(int)std::get<0>(mat_b_shape),
+                                    (int)std::get<1>(mat_b_shape)};
+        return tile_matmul(mat_a_tiles, mat_a_tiles_map, mat_a_shape_array,
+                           mat_b_tiles, mat_b_tiles_map, mat_b_shape_array);
+      },
+      py::arg("mat_a_tiles"), py::arg("mat_a_tiles_map"),
+      py::arg("mat_a_shape"), py::arg("mat_b_tiles"),
+      py::arg("mat_b_tiles_map"), py::arg("mat_b_shape"),
+      py::arg("cuda_malloc_heap_size") = NULL);
+  m.def(
+      "tile_matmul",
+      [](at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+         std::tuple<int, float> mat_a_shape, at::Tensor mat_b_tiles,
+         at::Tensor mat_b_tiles_map, std::tuple<int, float> mat_b_shape,
+         int ADC_resolution, float ADC_overflow_rate, int quant_method,
+         int cuda_malloc_heap_size) {
+        assert((std::tuple_size<int, float>(mat_a_shape) == 2) &&
+               (std::tuple_size<int, float>(mat_b_shape) == 2));
+        int mat_a_shape_array[2] = {(int)std::get<0>(mat_a_shape),
+                                    (int)std::get<1>(mat_a_shape)};
+        int mat_b_shape_array[2] = {(int)std::get<0>(mat_b_shape),
+                                    (int)std::get<1>(mat_b_shape)};
+        return tile_matmul(mat_a_tiles, mat_a_tiles_map, mat_a_shape_array,
+                           mat_b_tiles, mat_b_tiles_map, mat_b_shape_array,
+                           ADC_resolution, ADC_overflow_rate, quant_method);
+      },
+      py::arg("mat_a_tiles"), py::arg("mat_a_tiles_map"),
+      py::arg("mat_a_shape"), py::arg("mat_b_tiles"),
+      py::arg("mat_b_tiles_map"), py::arg("mat_b_shape"),
+      py::arg("ADC_resolution"), py::arg("ADC_overflow_rate"),
+      py::arg("quant_method"), py::arg("cuda_malloc_heap_size") = NULL);
+}
\ No newline at end of file
diff --git a/memtorch/cpp/tile_matmul.h b/memtorch/cpp/tile_matmul.h
new file mode 100644
index 00000000..42f7ff2a
--- /dev/null
+++ b/memtorch/cpp/tile_matmul.h
@@ -0,0 +1 @@
+void tile_matmul_bindings(py::module_ &m);
\ No newline at end of file
diff --git a/memtorch/cu/bindings.cpp b/memtorch/cu/bindings.cpp
new file mode 100644
index 00000000..fe1758c9
--- /dev/null
+++ b/memtorch/cu/bindings.cpp
@@ -0,0 +1,9 @@
+#include <ATen/ATen.h>
+#include <cmath>
+#include <torch/extension.h>
+
+#include "tile_matmul.h"
+
+void tile_matmul_bindings(py::module_ &);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { tile_matmul_bindings(m); }
\ No newline at end of file
diff --git a/memtorch/cu/quantize.cuh b/memtorch/cu/quantize.cuh
new file mode 100644
index 00000000..e5217fb9
--- /dev/null
+++ b/memtorch/cu/quantize.cuh
@@ -0,0 +1,77 @@
+__device__ float det_integral(float *tensor, int tensor_numel,
+                              float overflow_rate, float min, float max) {
+  if ((min != NULL) || (max != NULL)) {
+    float max_bound;
+    if ((min != NULL) && (max != NULL)) {
+      max_bound = max_(abs(min), abs(max));
+    } else if (min != NULL) {
+      max_bound = abs(min);
+    } else if (max != NULL) {
+      max_bound = abs(max);
+    }
+    if (max_bound > tensor[0]) {
+      tensor[0] = max_bound;
+    }
+  }
+  return ceilf(
+      log2f(tensor[(int)round(overflow_rate * tensor_numel)] + 1e-12f));
+}
+
+__device__ float det_sf(float *tensor, int tensor_numel, int bits,
+                        float overflow_rate, float min, float max) {
+  assert(overflow_rate <= 1.0);
+  sort_<float>(tensor, tensor_numel);
+  return 1 - bits + det_integral(tensor, tensor_numel, overflow_rate, min, max);
+}
+
+__device__ Eigen::VectorXf linear_quantize(Eigen::VectorXf tensor, float sf,
+                                           int bits, float overflow_rate) {
+  float delta = powf(2.0f, sf);
+  float bound = powf(2.0f, bits - 1);
+  return (tensor / powf(2.0f, sf)).unaryExpr([&](float x) {
+    float x_ = clamp_<float>(floorf(x + 0.5f), -bound, bound - 1) * delta;
+    if (isnan(x_)) {
+      return 0.0f;
+    } else {
+      return x_;
+    }
+  });
+}
+
+__device__ Eigen::VectorXf quantize(Eigen::VectorXf tensor, int bits,
+                                    float overflow_rate, int quant_method) {
+  if (quant_method == 0) {
+    // linear
+    float *tensor_data = (float *)malloc(sizeof(float) * tensor.size());
+    memcpy(tensor_data, tensor.data(), sizeof(float) * tensor.size());
+    float sf =
+        det_sf(tensor_data, tensor.size(), bits, overflow_rate, NULL, NULL);
+    delete tensor_data;
+    return linear_quantize(tensor, sf, bits, overflow_rate);
+  } else if (quant_method == 1) {
+    // log
+    float *tensor_data = (float *)malloc(sizeof(float) * tensor.size());
+    memcpy(tensor_data, tensor.data(), sizeof(float) * tensor.size());
+    float sf =
+        det_sf(tensor_data, tensor.size(), bits, overflow_rate, NULL, NULL);
+    delete tensor_data;
+    bool *s = (bool *)malloc(sizeof(bool) * tensor.size());
+    for (int i = 0; i < tensor.size(); i++) {
+      s[i] = tensor[i] >= 0.0f;
+    }
+    tensor = tensor.unaryExpr(
+        [&](float x) { return max_<float>(logf(abs_<float>(x)), 1e-20f); });
+    tensor = linear_quantize(tensor, sf, bits - 1, overflow_rate);
+    for (int i = 0; i < tensor.size(); i++) {
+      if (s[i]) {
+        tensor[i] = expf(tensor[i]);
+      } else {
+        tensor[i] = -expf(tensor[i]);
+      }
+    }
+    delete s;
+    return tensor;
+  } else {
+    return tensor;
+  }
+}
diff --git a/memtorch/cu/quantize/gpu.cuh b/memtorch/cu/quantize/gpu.cuh
deleted file mode 100644
index 35356610..00000000
--- a/memtorch/cu/quantize/gpu.cuh
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <algorithm>
-constexpr int CUDA_NUM_THREADS = 128;
-constexpr int MAXIMUM_NUM_BLOCKS = 4096;
-
-inline int GET_BLOCKS(const int N) {
-  return std::max(std::min((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1);
-}
diff --git a/memtorch/cu/quantize/quant.cu b/memtorch/cu/quantize/quant.cu
deleted file mode 100644
index 959a1c2c..00000000
--- a/memtorch/cu/quantize/quant.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "cuda_runtime.h"
-#include "gpu.cuh"
-#include <ATen/cuda/CUDAContext.h>
-#include <cmath>
-#include <iostream>
-#include <limits>
-#include <math.h>
-#include <torch/types.h>
-
-__device__ float quantize_element(float element, float *quant_levels,
-                                  int num_quant_levels) {
-  int middle_point;         // Middle point
-  int optimal_point = 0;    // Optimal point
-  int l = 0;                // Lower bound
-  int h = num_quant_levels; // Higher bound
-  float difference =
-      1.0f; // Difference between a given point and the current middle point
-  while (l <= h) {
-    middle_point = l + (h - l) / 2;
-    if (fabs(element - quant_levels[middle_point]) < difference) {
-      difference = fabs(element - quant_levels[middle_point]);
-      optimal_point = middle_point;
-    }
-    if (quant_levels[middle_point] < element) {
-      l = middle_point + 1;
-    } else {
-      h = middle_point - 1;
-    }
-  }
-  return quant_levels[optimal_point];
-}
-
-__global__ void quantize(int num_quant_levels, float *quant_levels,
-                         int num_elements, float *tensor) {
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for (int i = index; i < num_elements; i += stride) {
-    tensor[i] = quantize_element(tensor[i], quant_levels, num_quant_levels);
-  }
-}
-
-__global__ void quantize_(int num_quant_levels, float *min_values,
-                          float *max_values, int num_elements, float *tensor) {
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  float *quant_levels = new float[num_quant_levels];
-  for (int i = index; i < num_elements; i += stride) {
-    // Manually generate linspace vectors
-    float step_size = (max_values[i] - min_values[i]) / (num_quant_levels - 1);
-    for (int j = 0; j < num_quant_levels; ++j) {
-      quant_levels[j] = min_values[i] + j * step_size;
-    }
-    quant_levels[num_quant_levels - 1] = max_values[i];
-    tensor[i] = quantize_element(tensor[i], quant_levels, num_quant_levels);
-  }
-  free(quant_levels);
-}
-
-void quant_cuda(at::Tensor tensor, int num_quant_levels, float min_value,
-                float max_value) {
-  torch::Tensor quant_levels =
-      at::linspace(min_value, max_value, num_quant_levels);
-  float *quant_levels_gpu;
-  cudaMalloc(&quant_levels_gpu, sizeof(float) * quant_levels.numel());
-  cudaMemcpy(quant_levels_gpu, quant_levels.data<float>(),
-             sizeof(float) * quant_levels.numel(), cudaMemcpyHostToDevice);
-  quantize<<<GET_BLOCKS(tensor.numel()), CUDA_NUM_THREADS, 0,
-             at::cuda::getCurrentCUDAStream()>>>(
-      num_quant_levels, quant_levels_gpu, tensor.numel(), tensor.data<float>());
-  cudaDeviceSynchronize();
-  cudaStreamSynchronize(at::cuda::getCurrentCUDAStream());
-  cudaFree(quant_levels_gpu);
-}
-
-void quant_cuda(at::Tensor tensor, int num_quant_levels, at::Tensor min_values,
-                at::Tensor max_values) {
-  quantize_<<<GET_BLOCKS(tensor.numel()), CUDA_NUM_THREADS, 0,
-              at::cuda::getCurrentCUDAStream()>>>(
-      num_quant_levels, min_values.data<float>(), max_values.data<float>(),
-      tensor.numel(), tensor.data<float>());
-  cudaDeviceSynchronize();
-  cudaStreamSynchronize(at::cuda::getCurrentCUDAStream());
-}
diff --git a/memtorch/cu/quantize/quant_cuda.cpp b/memtorch/cu/quantize/quant_cuda.cpp
deleted file mode 100644
index 9911ea93..00000000
--- a/memtorch/cu/quantize/quant_cuda.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/extension.h>
-#include <vector>
-
-// CUDA kernels
-void quant_cuda(at::Tensor tensor, int num_quant_levels, float min_value,
-                float max_value);
-void quant_cuda(at::Tensor tensor, int num_quant_levels, at::Tensor min_values,
-                at::Tensor max_values);
-
-void quant(at::Tensor tensor, int num_quant_levels, float min_value,
-           float max_value) {
-  if (at::cuda::is_available()) {
-    tensor.to(torch::Device("cuda:0"));
-    quant_cuda(tensor, num_quant_levels, min_value, max_value);
-  } else {
-    printf("To be supported.\n");
-  }
-}
-
-void quant(at::Tensor tensor, int num_quant_levels, at::Tensor min_values,
-           at::Tensor max_values) {
-  if (at::cuda::is_available()) {
-    assert(tensor.numel() == min_values.numel() == max_values.numel());
-    tensor.to(torch::Device("cuda:0"));
-    min_values.to(torch::Device("cuda:0"));
-    max_values.to(torch::Device("cuda:0"));
-    quant_cuda(tensor, num_quant_levels, min_values, max_values);
-  } else {
-    printf("To be supported.\n");
-  }
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("quantize", (void (*)(at::Tensor, int, float, float)) & quant, "tbd");
-  m.def("quantize", (void (*)(at::Tensor, int, at::Tensor, at::Tensor)) & quant,
-        "tbd");
-}
diff --git a/memtorch/cu/tile_matmul.cpp b/memtorch/cu/tile_matmul.cpp
new file mode 100644
index 00000000..6b6cf584
--- /dev/null
+++ b/memtorch/cu/tile_matmul.cpp
@@ -0,0 +1,54 @@
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <iostream>
+#include <torch/extension.h>
+#include <vector>
+
+#include "tile_matmul_kernels.cuh"
+
+void tile_matmul_bindings(py::module_ &m) {
+  m.def(
+      "tile_matmul",
+      [&](at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+          std::tuple<int, float> mat_a_shape, at::Tensor mat_b_tiles,
+          at::Tensor mat_b_tiles_map, std::tuple<int, float> mat_b_shape,
+          int cuda_malloc_heap_size) {
+        assert((std::tuple_size<int, float>(mat_a_shape) == 2) &&
+               (std::tuple_size<int, float>(mat_b_shape) == 2));
+        int mat_a_shape_array[2] = {(int)std::get<0>(mat_a_shape),
+                                    (int)std::get<1>(mat_a_shape)};
+        int mat_b_shape_array[2] = {(int)std::get<0>(mat_b_shape),
+                                    (int)std::get<1>(mat_b_shape)};
+        return tile_matmul(mat_a_tiles, mat_a_tiles_map, mat_a_shape_array,
+                           mat_b_tiles, mat_b_tiles_map, mat_b_shape_array, -1,
+                           -1, -1, cuda_malloc_heap_size);
+      },
+      py::arg("mat_a_tiles"), py::arg("mat_a_tiles_map"),
+      py::arg("mat_a_shape"), py::arg("mat_b_tiles"),
+      py::arg("mat_b_tiles_map"), py::arg("mat_b_shape"),
+      py::arg("cuda_malloc_heap_size") = 50);
+  m.def(
+      "tile_matmul",
+      [&](at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+          std::tuple<int, float> mat_a_shape, at::Tensor mat_b_tiles,
+          at::Tensor mat_b_tiles_map, std::tuple<int, float> mat_b_shape,
+          int ADC_resolution, float overflow_rate, int quant_method,
+          int cuda_malloc_heap_size) {
+        assert((std::tuple_size<int, float>(mat_a_shape) == 2) &&
+               (std::tuple_size<int, float>(mat_b_shape) == 2));
+        int mat_a_shape_array[2] = {(int)std::get<0>(mat_a_shape),
+                                    (int)std::get<1>(mat_a_shape)};
+        int mat_b_shape_array[2] = {(int)std::get<0>(mat_b_shape),
+                                    (int)std::get<1>(mat_b_shape)};
+        return tile_matmul(mat_a_tiles, mat_a_tiles_map, mat_a_shape_array,
+                           mat_b_tiles, mat_b_tiles_map, mat_b_shape_array,
+                           ADC_resolution, overflow_rate, quant_method,
+                           cuda_malloc_heap_size);
+      },
+      py::arg("mat_a_tiles"), py::arg("mat_a_tiles_map"),
+      py::arg("mat_a_shape"), py::arg("mat_b_tiles"),
+      py::arg("mat_b_tiles_map"), py::arg("mat_b_shape"),
+      py::arg("ADC_resolution"), py::arg("ADC_overflow_rate"),
+      py::arg("quant_method"), py::arg("cuda_malloc_heap_size") = 50);
+}
\ No newline at end of file
diff --git a/memtorch/cu/tile_matmul.h b/memtorch/cu/tile_matmul.h
new file mode 100644
index 00000000..42f7ff2a
--- /dev/null
+++ b/memtorch/cu/tile_matmul.h
@@ -0,0 +1 @@
+void tile_matmul_bindings(py::module_ &m);
\ No newline at end of file
diff --git a/memtorch/cu/tile_matmul_kernels.cu b/memtorch/cu/tile_matmul_kernels.cu
new file mode 100644
index 00000000..99bff258
--- /dev/null
+++ b/memtorch/cu/tile_matmul_kernels.cu
@@ -0,0 +1,166 @@
+#include "cuda_runtime.h"
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <iostream>
+#include <limits>
+#include <math.h>
+#include <torch/types.h>
+
+#include <Eigen/Core>
+
+#include "utils.cuh"
+
+#include "quantize.cuh"
+
+__global__ void tile_matmul_kernel(
+    float *mat_a_tiles_accessor,
+    torch::PackedTensorAccessor32<float, 1> mat_a_tiles_map_accessor,
+    int64_t *mat_a_tiles_shape, float *mat_b_tiles_accessor,
+    torch::PackedTensorAccessor32<float, 2> mat_b_tiles_map_accessor,
+    int64_t *mat_b_tiles_shape, int mat_b_shape_back, int limit_i, int limit_j,
+    int limit_k, float *result) {
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+  int k = threadIdx.z + blockIdx.z * blockDim.z;
+  if (i < limit_i && j < limit_j && k < limit_k) {
+    Eigen::Map<Eigen::MatrixXf> tile_a(
+        &mat_a_tiles_accessor[transform_3d_index(mat_a_tiles_map_accessor[k], i,
+                                                 0, mat_a_tiles_shape[1],
+                                                 mat_a_tiles_shape[2])],
+        1, mat_a_tiles_shape[2]);
+    Eigen::Map<Eigen::MatrixXf, Eigen::RowMajor,
+               Eigen::Stride<1, Eigen::Dynamic>>
+        tile_b(&mat_b_tiles_accessor[transform_3d_index(
+                   mat_b_tiles_map_accessor[k][j], 0, 0, mat_b_tiles_shape[1],
+                   mat_b_tiles_shape[2])],
+               mat_b_tiles_shape[1], mat_b_tiles_shape[2],
+               Eigen::Stride<1, Eigen::Dynamic>(1, mat_b_tiles_shape[2]));
+    Eigen::VectorXf partial_sum = (tile_a * tile_b).transpose();
+#pragma omp parallel for
+    for (int ii = 0; ii < partial_sum.size(); ii++) {
+      result[transform_2d_index(i, j * mat_b_tiles_shape[2] + ii,
+                                mat_b_shape_back)] += partial_sum[ii];
+    }
+    free(&partial_sum);
+  }
+}
+
+__global__ void tile_matmul_kernel(
+    float *mat_a_tiles_accessor,
+    torch::PackedTensorAccessor32<float, 1> mat_a_tiles_map_accessor,
+    int64_t *mat_a_tiles_shape, float *mat_b_tiles_accessor,
+    torch::PackedTensorAccessor32<float, 2> mat_b_tiles_map_accessor,
+    int64_t *mat_b_tiles_shape, int mat_b_shape_back, int ADC_resolution,
+    float overflow_rate, int quant_method, int limit_i, int limit_j,
+    int limit_k, float *result) {
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+  int k = threadIdx.z + blockIdx.z * blockDim.z;
+  if (i < limit_i && j < limit_j && k < limit_k) {
+    Eigen::Map<Eigen::MatrixXf> tile_a(
+        &mat_a_tiles_accessor[transform_3d_index(mat_a_tiles_map_accessor[k], i,
+                                                 0, mat_a_tiles_shape[1],
+                                                 mat_a_tiles_shape[2])],
+        1, mat_a_tiles_shape[2]);
+
+    Eigen::Map<Eigen::MatrixXf, Eigen::RowMajor,
+               Eigen::Stride<1, Eigen::Dynamic>>
+        tile_b(&mat_b_tiles_accessor[transform_3d_index(
+                   mat_b_tiles_map_accessor[k][j], 0, 0, mat_b_tiles_shape[1],
+                   mat_b_tiles_shape[2])],
+               mat_b_tiles_shape[1], mat_b_tiles_shape[2],
+               Eigen::Stride<1, Eigen::Dynamic>(1, mat_b_tiles_shape[2]));
+    Eigen::VectorXf partial_sum = (tile_a * tile_b).transpose();
+    partial_sum =
+        quantize(partial_sum, ADC_resolution, overflow_rate, quant_method);
+#pragma omp parallel for
+    for (int ii = 0; ii < partial_sum.size(); ii++) {
+      result[transform_2d_index(i, j * mat_b_tiles_shape[2] + ii,
+                                mat_b_shape_back)] += partial_sum[ii];
+    }
+    free(&partial_sum);
+  }
+}
+
+at::Tensor tile_matmul(at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+                       int mat_a_shape[2], at::Tensor mat_b_tiles,
+                       at::Tensor mat_b_tiles_map, int mat_b_shape[2],
+                       int ADC_resolution, float overflow_rate,
+                       int quant_method, int cuda_malloc_heap_size) {
+  assert(at::cuda::is_available());
+  mat_a_tiles = mat_a_tiles.to(torch::Device("cuda:0"));
+  mat_a_tiles_map = mat_a_tiles_map.to(torch::Device("cuda:0"));
+  mat_b_tiles = mat_b_tiles.to(torch::Device("cuda:0"));
+  mat_b_tiles_map = mat_b_tiles_map.to(torch::Device("cuda:0"));
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  int *max_threads_dim = prop.maxThreadsDim;
+  int64_t *mat_a_tiles_shape_host = (int64_t *)malloc(sizeof(int64_t) * 3);
+  int64_t *mat_b_tiles_shape_host = (int64_t *)malloc(sizeof(int64_t) * 3);
+  for (int i = 0; i < 3; i++) {
+    mat_a_tiles_shape_host[i] = mat_a_tiles.sizes()[i];
+    mat_b_tiles_shape_host[i] = mat_b_tiles.sizes()[i];
+  }
+  int64_t *mat_a_tiles_shape;
+  int64_t *mat_b_tiles_shape;
+  cudaSafeCall(cudaMalloc(&mat_a_tiles_shape, sizeof(int64_t) * 3));
+  cudaSafeCall(cudaMalloc(&mat_b_tiles_shape, sizeof(int64_t) * 3));
+  cudaSafeCall(cudaMemcpy(mat_a_tiles_shape, mat_a_tiles_shape_host,
+                          sizeof(int64_t) * 3, cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpy(mat_b_tiles_shape, mat_b_tiles_shape_host,
+                          sizeof(int64_t) * 3, cudaMemcpyHostToDevice));
+  float *mat_a_tiles_accessor = mat_a_tiles.data_ptr<float>();
+  float *mat_b_tiles_accessor = mat_b_tiles.data_ptr<float>();
+  torch::PackedTensorAccessor32<float, 1> mat_a_tiles_map_accessor =
+      mat_a_tiles_map.packed_accessor32<float, 1>();
+  torch::PackedTensorAccessor32<float, 2> mat_b_tiles_map_accessor =
+      mat_b_tiles_map.packed_accessor32<float, 2>();
+  int limit_i = mat_a_tiles.sizes().end()[-2];
+  int limit_j = mat_b_tiles_map.sizes()[1];
+  int limit_k = mat_b_tiles_map.sizes()[0];
+  at::Tensor result =
+      at::zeros({mat_a_shape[0], mat_b_shape[1]}, torch::device(torch::kCUDA));
+  cudaDeviceSetLimit(cudaLimitMallocHeapSize,
+                     1024 * 1024 * cuda_malloc_heap_size);
+  if (max_threads_dim[0] >= limit_i && max_threads_dim[1] >= limit_j &&
+      max_threads_dim[2] >= limit_k) {
+    // If multiple blocks are not required
+    dim3 grid(limit_i, limit_j, limit_k);
+    dim3 block(1, 1, 1);
+    if (ADC_resolution == -1) {
+      tile_matmul_kernel<<<grid, block>>>(
+          mat_a_tiles_accessor, mat_a_tiles_map_accessor, mat_a_tiles_shape,
+          mat_b_tiles_accessor, mat_b_tiles_map_accessor, mat_b_tiles_shape,
+          mat_b_shape[1], limit_i, limit_j, limit_k, result.data_ptr<float>());
+    } else {
+      tile_matmul_kernel<<<grid, block>>>(
+          mat_a_tiles_accessor, mat_a_tiles_map_accessor, mat_a_tiles_shape,
+          mat_b_tiles_accessor, mat_b_tiles_map_accessor, mat_b_tiles_shape,
+          mat_b_shape[1], ADC_resolution, overflow_rate, quant_method, limit_i,
+          limit_j, limit_k, result.data_ptr<float>());
+    }
+  } else {
+    // If multiple blocks are required
+    dim3 grid(max_threads_dim[0], max_threads_dim[1], max_threads_dim[2]);
+    dim3 block(ceil_int_div(limit_i, max_threads_dim[0]),
+               ceil_int_div(limit_j, max_threads_dim[1]),
+               ceil_int_div(limit_k, max_threads_dim[2]));
+    if (ADC_resolution == -1) {
+      tile_matmul_kernel<<<grid, block>>>(
+          mat_a_tiles_accessor, mat_a_tiles_map_accessor, mat_a_tiles_shape,
+          mat_b_tiles_accessor, mat_b_tiles_map_accessor, mat_b_tiles_shape,
+          mat_b_shape[1], limit_i, limit_j, limit_k, result.data_ptr<float>());
+    } else {
+      tile_matmul_kernel<<<grid, block>>>(
+          mat_a_tiles_accessor, mat_a_tiles_map_accessor, mat_a_tiles_shape,
+          mat_b_tiles_accessor, mat_b_tiles_map_accessor, mat_b_tiles_shape,
+          mat_b_shape[1], ADC_resolution, overflow_rate, quant_method, limit_i,
+          limit_j, limit_k, result.data_ptr<float>());
+    }
+  }
+  cudaSafeCall(cudaDeviceSynchronize());
+  cudaSafeCall(cudaFree(mat_a_tiles_shape));
+  cudaSafeCall(cudaFree(mat_b_tiles_shape));
+  cudaStreamSynchronize(at::cuda::getCurrentCUDAStream());
+  return result;
+}
\ No newline at end of file
diff --git a/memtorch/cu/tile_matmul_kernels.cuh b/memtorch/cu/tile_matmul_kernels.cuh
new file mode 100644
index 00000000..d434d172
--- /dev/null
+++ b/memtorch/cu/tile_matmul_kernels.cuh
@@ -0,0 +1,5 @@
+at::Tensor tile_matmul(at::Tensor mat_a_tiles, at::Tensor mat_a_tiles_map,
+                       int mat_a_shape[2], at::Tensor mat_b_tiles,
+                       at::Tensor mat_b_tiles_map, int mat_b_shape[2],
+                       int ADC_resolution, float overflow_rate,
+                       int quant_method, int cuda_malloc_heap_size);
\ No newline at end of file
diff --git a/memtorch/cu/utils.cuh b/memtorch/cu/utils.cuh
new file mode 100644
index 00000000..0ee5cd77
--- /dev/null
+++ b/memtorch/cu/utils.cuh
@@ -0,0 +1,64 @@
+#define cudaSafeCall(call)                                                     \
+  do {                                                                         \
+    cudaError_t err = call;                                                    \
+    if (cudaSuccess != err) {                                                  \
+      std::cerr << "CUDA error in " << __FILE__ << "(" << __LINE__             \
+                << "): " << cudaGetErrorString(err);                           \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)
+
+template <class T> __host__ __device__ T min_(T a, T b) {
+  return !(b < a) ? a : b;
+};
+
+template <class T> __host__ __device__ T max_(T a, T b) {
+  return (a < b) ? b : a;
+};
+
+template <class T> __host__ __device__ T clamp_(T x, T min, T max) {
+  if (x < min)
+    x = min;
+  if (x > max)
+    x = max;
+  return x;
+}
+
+template <class T> __host__ __device__ T sign_(T x) {
+  if (x > (T)0)
+    return 1;
+  if (x < (T)0)
+    return -1;
+  return (T)0.0;
+}
+
+template <class T> __host__ __device__ T abs_(T x) {
+  if (x < 0)
+    return -x;
+  if (x >= 0)
+    return x;
+}
+
+template <class T> __device__ void sort_(T *tensor, int tensor_numel) {
+  T temp;
+#pragma omp parallel for
+  for (int i = 0; i < tensor_numel; i++) {
+    for (int j = i + 1; j < tensor_numel; j++) {
+      if (tensor[i] < tensor[j]) {
+        temp = tensor[i];
+        tensor[i] = tensor[j];
+        tensor[j] = temp;
+      }
+    }
+  }
+}
+
+__device__ int transform_2d_index(int x, int y, int len_y) {
+  return x * len_y + y;
+}
+
+__device__ int transform_3d_index(int x, int y, int z, int len_y, int len_z) {
+  return x * len_y * len_z + y * len_z + z;
+}
+
+int ceil_int_div(int a, int b) { return (a + b - 1) / b; }
\ No newline at end of file
diff --git a/memtorch/map/Module.py b/memtorch/map/Module.py
index 2ea91a51..8afd3146 100644
--- a/memtorch/map/Module.py
+++ b/memtorch/map/Module.py
@@ -41,7 +41,10 @@ def naive_tune(module, input_shape, verbose=True):
     reg = linear_model.LinearRegression(fit_intercept=True).fit(output, legacy_output)
     coef = np.array(reg.coef_).item()
     intercept = np.array(reg.intercept_).item()
-    transform_output = lambda x: x * coef + intercept
+
+    def transform_output(x):
+        return x * coef + intercept
+
     module.bias = tmp
     if verbose:
         print(
diff --git a/memtorch/mn/Conv1d.py b/memtorch/mn/Conv1d.py
index ee59bcb0..49bc925c 100644
--- a/memtorch/mn/Conv1d.py
+++ b/memtorch/mn/Conv1d.py
@@ -257,7 +257,7 @@ def forward(self, input):
                         if self.quant_method is not None:
                             out_ = memtorch.bh.Quantize.quantize(
                                 out_,
-                                bits=self.ADC_resolution,
+                                quant=self.ADC_resolution,
                                 overflow_rate=self.ADC_overflow_rate,
                                 quant_method=self.quant_method,
                             )
diff --git a/memtorch/mn/Conv2d.py b/memtorch/mn/Conv2d.py
index 00125164..ee1650bf 100644
--- a/memtorch/mn/Conv2d.py
+++ b/memtorch/mn/Conv2d.py
@@ -279,7 +279,7 @@ def forward(self, input):
                         if self.quant_method is not None:
                             out_ = memtorch.bh.Quantize.quantize(
                                 out_,
-                                bits=self.ADC_resolution,
+                                quant=self.ADC_resolution,
                                 overflow_rate=self.ADC_overflow_rate,
                                 quant_method=self.quant_method,
                             )
diff --git a/memtorch/mn/Conv3d.py b/memtorch/mn/Conv3d.py
index 701639a5..944502f9 100644
--- a/memtorch/mn/Conv3d.py
+++ b/memtorch/mn/Conv3d.py
@@ -297,7 +297,7 @@ def forward(self, input):
                         if self.quant_method is not None:
                             out_ = memtorch.bh.Quantize.quantize(
                                 out_,
-                                bits=self.ADC_resolution,
+                                quant=self.ADC_resolution,
                                 overflow_rate=self.ADC_overflow_rate,
                                 quant_method=self.quant_method,
                             )
diff --git a/memtorch/mn/Linear.py b/memtorch/mn/Linear.py
index 6fc7237b..4305394f 100644
--- a/memtorch/mn/Linear.py
+++ b/memtorch/mn/Linear.py
@@ -155,7 +155,6 @@ def forward(self, input):
                 ) and self.max_input_voltage > 0, (
                     "The maximum input voltage (max_input_voltage) must be >0."
                 )
-                # if torch.amax(abs(input)) > self.max_input_voltage:
                 input_range = torch.amax(torch.abs(input))
                 input = convert_range(
                     input,
@@ -178,7 +177,7 @@ def forward(self, input):
                 else:
                     nl = True
 
-                out = self.crossbar_operation(
+                out_ = self.crossbar_operation(
                     self.crossbars,
                     lambda crossbar, input_: simulate_matmul(
                         input,
@@ -195,12 +194,12 @@ def forward(self, input):
                 ).to(self.device)
             else:
                 if self.tile_shape is not None:
-                    input_tiles, input_tiles_map = gen_tiles(
+                    (input_tiles, input_tiles_map) = gen_tiles(
                         input, self.tile_shape, input=True
                     )
                     crossbar_shape = (self.crossbars[0].rows, self.crossbars[0].columns)
                     tiles_map = self.crossbars[0].tiles_map
-                    out = tile_matmul(
+                    out_ = tile_matmul(
                         input_tiles,
                         input_tiles_map,
                         input_shape,
@@ -214,21 +213,21 @@ def forward(self, input):
                         self.quant_method,
                     )
                 else:
-                    out = torch.matmul(
+                    out_ = torch.matmul(
                         input.to(self.device),
                         self.crossbar_operation(
                             self.crossbars, lambda crossbar: crossbar.conductance_matrix
                         ),
                     )
                     if self.quant_method is not None:
-                        out = memtorch.bh.Quantize.quantize(
-                            out,
-                            bits=self.ADC_resolution,
+                        out_ = memtorch.bh.Quantize.quantize(
+                            out_,
+                            quant=self.ADC_resolution,
                             overflow_rate=self.ADC_overflow_rate,
                             quant_method=self.quant_method,
                         )
 
-            out = self.transform_output(out).to(self.device)
+            out = self.transform_output(out_).to(self.device)
             if self.bias is not None:
                 out += self.bias.data.view(1, -1).to(self.device).expand_as(out)
 
diff --git a/memtorch/submodules/__init__.py b/memtorch/submodules/__init__.py
index 0d75fdc5..cb8bac77 100644
--- a/memtorch/submodules/__init__.py
+++ b/memtorch/submodules/__init__.py
@@ -1,3 +1 @@
 import importlib
-
-importlib.import_module(".pytorch-playground", "memtorch.submodules")
diff --git a/memtorch/submodules/eigen b/memtorch/submodules/eigen
new file mode 160000
index 00000000..1f4c0311
--- /dev/null
+++ b/memtorch/submodules/eigen
@@ -0,0 +1 @@
+Subproject commit 1f4c0311cda3403999b702c996898af5707973a9
diff --git a/memtorch/submodules/memtorch/submodules/pytorch-playground b/memtorch/submodules/memtorch/submodules/pytorch-playground
deleted file mode 160000
index ff7dd3a6..00000000
--- a/memtorch/submodules/memtorch/submodules/pytorch-playground
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ff7dd3a6c40481326120895065e120b4fefa1c9e
diff --git a/memtorch/submodules/pytorch-playground b/memtorch/submodules/pytorch-playground
deleted file mode 160000
index ff7dd3a6..00000000
--- a/memtorch/submodules/pytorch-playground
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ff7dd3a6c40481326120895065e120b4fefa1c9e
diff --git a/memtorch/version.py b/memtorch/version.py
index 3d9036eb..72f26f59 100644
--- a/memtorch/version.py
+++ b/memtorch/version.py
@@ -1 +1 @@
-__version__ = "1.1.1-cpu"
+__version__ = "1.1.2"
diff --git a/profile_tile_matmul.py b/profile_tile_matmul.py
new file mode 100644
index 00000000..074871ec
--- /dev/null
+++ b/profile_tile_matmul.py
@@ -0,0 +1,112 @@
+import time
+
+import torch
+
+import memtorch
+import memtorch_bindings
+import memtorch_cuda_bindings
+from memtorch.bh.crossbar.Tile import gen_tiles
+
+tile_shape = (25, 25)
+test_shape_a = (500, 500)
+test_shape_b = (500, 500)
+a = torch.zeros(test_shape_a).uniform_(0, 1)
+b = torch.zeros(test_shape_b).uniform_(0, 1)
+tile_a_tiles, tile_a_map = gen_tiles(a, tile_shape, input=True)
+tile_b_tiles, tile_b_map = gen_tiles(b, tile_shape, input=False)
+ADC_resolution = 4
+overflow_rate = 0.0
+# Without quantization
+print("----------------------------")
+print("Without quantization")
+print("----------------------------")
+start_time = time.time()
+python_res = memtorch.bh.crossbar.tile_matmul(
+    tile_a_tiles,
+    tile_a_map,
+    test_shape_a,
+    tile_b_tiles,
+    tile_b_map,
+    test_shape_b,
+    use_bindings=False,
+)
+elapsed_time = time.time() - start_time
+print("Pure Python")
+print(python_res)
+print(python_res.shape)
+print(elapsed_time)
+start_time = time.time()
+cpp_res = memtorch_bindings.tile_matmul(
+    tile_a_tiles, tile_a_map, test_shape_a, tile_b_tiles, tile_b_map, test_shape_b
+)
+elapsed_time = time.time() - start_time
+print("C++ (CPU)")
+print(cpp_res)
+print(cpp_res.shape)
+print(elapsed_time)
+start_time = time.time()
+cuda_res = memtorch_cuda_bindings.tile_matmul(
+    tile_a_tiles, tile_a_map, test_shape_a, tile_b_tiles, tile_b_map, test_shape_b
+)
+elapsed_time = time.time() - start_time
+print("CUDA (GPU)")
+print(cuda_res)
+print(cuda_res.shape)
+print(elapsed_time)
+
+# With quantization
+print("----------------------------")
+print("With quantization")
+print("----------------------------")
+start_time = time.time()
+python_res = memtorch.bh.crossbar.tile_matmul(
+    tile_a_tiles,
+    tile_a_map,
+    test_shape_a,
+    tile_b_tiles,
+    tile_b_map,
+    test_shape_b,
+    ADC_resolution=ADC_resolution,
+    ADC_overflow_rate=overflow_rate,
+    quant_method="linear",
+    use_bindings=False,
+)
+elapsed_time = time.time() - start_time
+print("Pure Python")
+print(python_res)
+print(python_res.shape)
+print(elapsed_time)
+start_time = time.time()
+cpp_res = memtorch_bindings.tile_matmul(
+    tile_a_tiles,
+    tile_a_map,
+    test_shape_a,
+    tile_b_tiles,
+    tile_b_map,
+    test_shape_b,
+    ADC_resolution,
+    overflow_rate,
+    0,
+)
+elapsed_time = time.time() - start_time
+print("C++ (CPU)")
+print(cpp_res)
+print(cpp_res.shape)
+print(elapsed_time)
+start_time = time.time()
+cuda_res = memtorch_cuda_bindings.tile_matmul(
+    tile_a_tiles,
+    tile_a_map,
+    test_shape_a,
+    tile_b_tiles,
+    tile_b_map,
+    test_shape_b,
+    ADC_resolution,
+    overflow_rate,
+    0,
+)
+elapsed_time = time.time() - start_time
+print("CUDA (GPU)")
+print(cuda_res)
+print(cuda_res.shape)
+print(elapsed_time)
diff --git a/setup.py b/setup.py
index 9296cfde..6290089c 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,11 @@
+import glob
+import os
+
 import torch
 from setuptools import find_packages, setup
+from torch.utils.cpp_extension import include_paths
 
-version = "1.1.1"
+version = "1.1.2"
 CUDA = False
 
 
@@ -22,21 +26,27 @@ def create_version_py(version, CUDA):
 
     ext_modules = [
         CUDAExtension(
-            name="cuda_quantization",
-            sources=[
-                "memtorch/cu/quantize/quant_cuda.cpp",
-                "memtorch/cu/quantize/quant.cu",
-            ],
-            include_paths="memtorch/cu/quantize",
+            name="memtorch_cuda_bindings",
+            sources=glob.glob("memtorch/cu/*.cu") + glob.glob("memtorch/cu/*.cpp"),
+            library_dirs=["memtorch/submodules"],
+            include_dirs=["memtorch/cu/", "memtorch/submodules/eigen/"],
+        ),
+        CppExtension(
+            name="memtorch_bindings",
+            sources=glob.glob("memtorch/cpp/*.cpp"),
+            include_dirs=["memtorch/cpp/"],
         ),
-        CppExtension(name="quantization", sources=["memtorch/cpp/quantize/quant.cpp"]),
     ]
     name = "memtorch"
 else:
     from torch.utils.cpp_extension import BuildExtension, CppExtension
 
     ext_modules = [
-        CppExtension(name="quantization", sources=["memtorch/cpp/quantize/quant.cpp"])
+        CppExtension(
+            name="memtorch_bindings",
+            sources=glob.glob("memtorch/cpp/*.cpp"),
+            include_dirs=["memtorch/cpp/"],
+        )
     ]
     name = "memtorch-cpu"
 
@@ -66,6 +76,6 @@ def create_version_py(version, CUDA):
             "ipython",
             "lmfit",
         ],
-        include_package_data=CUDA,
+        include_package_data=True,
         python_requires=">=3.6",
     )
diff --git a/tests/test_cpp_extensions.py b/tests/test_cpp_extensions.py
index 5a903cd7..63d91fd9 100644
--- a/tests/test_cpp_extensions.py
+++ b/tests/test_cpp_extensions.py
@@ -3,10 +3,10 @@
 
 import memtorch
 
-if "cpu" in memtorch.__version__:
-    import quantization
-else:
-    import cuda_quantization as quantization
+# if "cpu" in memtorch.__version__:
+#     import quantization
+# else:
+#     import cuda_quantization as quantization
 
 import copy
 import math
@@ -16,31 +16,32 @@
 import numpy as np
 
 
-@pytest.mark.parametrize(
-    "shape, quantization_levels", [((20, 50), 10), ((100, 100), 5)]
-)
-def test_quantize(shape, quantization_levels):
-    device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
-    tensor = torch.zeros(shape).uniform_(0, 1).to(device)
-    quantized_tensor = copy.deepcopy(tensor)
-    quantization.quantize(quantized_tensor, quantization_levels, 0, 1)
-    valid_values = torch.linspace(0, 1, quantization_levels)
-    quantized_tensor_unique = quantized_tensor.unique()
-    assert any(
-        [
-            bool(val)
-            for val in [
-                torch.isclose(quantized_tensor_unique, valid_value).any()
-                for valid_value in valid_values
-            ]
-        ]
-    )
-    assert tensor.shape == quantized_tensor.shape
-    assert math.isclose(
-        min(valid_values.tolist(), key=lambda x: abs(x - tensor[0][0])),
-        quantized_tensor[0][0],
-    )
-    assert math.isclose(
-        min(valid_values.tolist(), key=lambda x: abs(x - tensor[0][1])),
-        quantized_tensor[0][1],
-    )
+# TBD
+# @pytest.mark.parametrize(
+#     "shape, quantization_levels", [((20, 50), 10), ((100, 100), 5)]
+# )
+# def test_quantize(shape, quantization_levels):
+#     device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda")
+#     tensor = torch.zeros(shape).uniform_(0, 1).to(device)
+#     quantized_tensor = copy.deepcopy(tensor)
+#     quantization.quantize(quantized_tensor, quantization_levels, 0, 1)
+#     valid_values = torch.linspace(0, 1, quantization_levels)
+#     quantized_tensor_unique = quantized_tensor.unique()
+#     assert any(
+#         [
+#             bool(val)
+#             for val in [
+#                 torch.isclose(quantized_tensor_unique, valid_value).any()
+#                 for valid_value in valid_values
+#             ]
+#         ]
+#     )
+#     assert tensor.shape == quantized_tensor.shape
+#     assert math.isclose(
+#         min(valid_values.tolist(), key=lambda x: abs(x - tensor[0][0])),
+#         quantized_tensor[0][0],
+#     )
+#     assert math.isclose(
+#         min(valid_values.tolist(), key=lambda x: abs(x - tensor[0][1])),
+#         quantized_tensor[0][1],
+#     )