From 4ce968b60bf3a04045a717634ca29bf8a0229e84 Mon Sep 17 00:00:00 2001 From: Philippe Drolet <83427055+Philippe-Drolet@users.noreply.github.com> Date: Wed, 9 Feb 2022 19:46:37 -0500 Subject: [PATCH] Implementation of CUDA Accelerated Passive Crossbar Programming Routines for the 2021 Data-Driven Model (#125) Implementation of CUDA accelerated passive crossbar programming routines for the 2021 Data-Driven model (#125) as a partial solution to (#53). --- memtorch/bh/crossbar/Crossbar.py | 73 +++- memtorch/bh/memristor/Data_Driven2021.py | 1 + memtorch/cu/bindings.cpp | 2 + memtorch/cu/simulate_passive.cpp | 109 +++++ memtorch/cu/simulate_passive.cuh | 1 + memtorch/cu/simulate_passive.h | 1 + memtorch/cu/simulate_passive_kernels.cu | 524 +++++++++++++++++++++++ memtorch/cu/simulate_passive_kernels.cuh | 28 ++ memtorch/mn/Conv1d.py | 1 + memtorch/mn/Conv2d.py | 1 + memtorch/mn/Conv3d.py | 1 + memtorch/mn/Linear.py | 1 + memtorch/version.py | 2 +- 13 files changed, 727 insertions(+), 18 deletions(-) create mode 100644 memtorch/cu/simulate_passive.cpp create mode 100644 memtorch/cu/simulate_passive.cuh create mode 100644 memtorch/cu/simulate_passive.h create mode 100644 memtorch/cu/simulate_passive_kernels.cu create mode 100644 memtorch/cu/simulate_passive_kernels.cuh diff --git a/memtorch/bh/crossbar/Crossbar.py b/memtorch/bh/crossbar/Crossbar.py index ce1aef2e..049c6a0f 100644 --- a/memtorch/bh/crossbar/Crossbar.py +++ b/memtorch/bh/crossbar/Crossbar.py @@ -10,9 +10,15 @@ import torch.nn as nn import memtorch +from memtorch.bh.memristor import Data_Driven2021 + +if "cpu" not in memtorch.__version__: + import memtorch_cuda_bindings from .Tile import gen_tiles +CUDA_supported_memristor_models = [Data_Driven2021] + @unique class Scheme(Enum): @@ -48,14 +54,17 @@ def __init__( shape, tile_shape=None, use_bindings=True, + cuda_malloc_heap_size=50, random_crossbar_init=False, ): + self.memristor_model_params = memristor_model_params self.time_series_resolution = memristor_model_params.get( "time_series_resolution" ) self.device = torch.device("cpu" if "cpu" in memtorch.__version__ else "cuda") self.tile_shape = tile_shape self.use_bindings = use_bindings + self.cuda_malloc_heap_size = cuda_malloc_heap_size if hasattr(memristor_model_params, "r_off"): self.r_off_mean = memristor_model_params["r_off"] if callable(self.r_off_mean): @@ -201,7 +210,6 @@ def write_conductance_matrix( ) else: raise Exception("Unsupported crossbar shape.") - if self.tile_shape is not None: conductance_matrix, tiles_map = gen_tiles( conductance_matrix, @@ -231,27 +239,53 @@ def write_conductance_matrix( ) self.update(from_devices=False) else: - if self.tile_shape is not None: - for i in range(0, self.devices.shape[0]): - for j in range(0, self.devices.shape[1]): - for k in range(0, self.devices.shape[2]): + if ( + self.use_bindings + and type(self.devices.any()) in CUDA_supported_memristor_models + and "cpu" not in memtorch.__version__ + ): + device_matrix = torch.FloatTensor(self.g_np(self.devices)) + device_matrix_aug = device_matrix + conductance_matrix_aug = conductance_matrix + if ( + len(device_matrix.shape) == 2 + ): # To ensure compatibility with CUDA code + device_matrix_aug = device_matrix[:, :, None] + conductance_matrix_aug = conductance_matrix[:, :, None] + + self.conductance_matrix = memtorch_cuda_bindings.simulate_passive( + conductance_matrix_aug, + device_matrix_aug, + self.cuda_malloc_heap_size, + **programming_routine_params, + **self.memristor_model_params + ) + self.max_abs_conductance = ( + 
torch.abs(self.conductance_matrix).flatten().max() + ) + self.update(from_devices=False) + else: + if self.tile_shape is not None: + for i in range(0, self.devices.shape[0]): + for j in range(0, self.devices.shape[1]): + for k in range(0, self.devices.shape[2]): + self.devices = programming_routine( + self, + (i, j, k), + conductance_matrix[i][j][k], + **programming_routine_params + ) + else: + for i in range(0, self.rows): + for j in range(0, self.columns): self.devices = programming_routine( self, - (i, j, k), - conductance_matrix[i][j][k], + (i, j), + conductance_matrix[i][j], **programming_routine_params ) - else: - for i in range(0, self.rows): - for j in range(0, self.columns): - self.devices = programming_routine( - self, - (i, j), - conductance_matrix[i][j], - **programming_routine_params - ) - self.update(from_devices=True) + self.update(from_devices=True) def init_crossbar( @@ -266,6 +300,7 @@ def init_crossbar( scheme=Scheme.DoubleColumn, tile_shape=(128, 128), use_bindings=True, + cuda_malloc_heap_size=50, random_crossbar_init=False, ): """Method to initialise and construct memristive crossbars. @@ -319,6 +354,7 @@ def init_crossbar( channel_weights.shape, tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size=cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) ) @@ -329,6 +365,7 @@ def init_crossbar( channel_weights.shape, tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size=cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) ) @@ -413,6 +450,7 @@ def out(crossbars, operation, idx=(0, 1), **kwargs): channel_weights.shape, tile_shape, use_bindings=use_bindings, + random_crossbar_init=random_crossbar_init, ) ) conductance_matrix = mapping_routine( @@ -437,6 +475,7 @@ def out(crossbars, operation, idx=(0, 1), **kwargs): weights.shape, tile_shape, use_bindings=use_bindings, + random_crossbar_init=random_crossbar_init, ) ) conductance_matrix = mapping_routine( diff --git a/memtorch/bh/memristor/Data_Driven2021.py b/memtorch/bh/memristor/Data_Driven2021.py index ac50893f..062ed1f9 100644 --- a/memtorch/bh/memristor/Data_Driven2021.py +++ b/memtorch/bh/memristor/Data_Driven2021.py @@ -4,6 +4,7 @@ import memtorch from memtorch.utils import clip + from .Memristor import Memristor as Memristor diff --git a/memtorch/cu/bindings.cpp b/memtorch/cu/bindings.cpp index 96a691eb..eb9b9c73 100644 --- a/memtorch/cu/bindings.cpp +++ b/memtorch/cu/bindings.cpp @@ -6,11 +6,13 @@ #include "inference.h" #include "solve_passive.h" #include "tile_matmul.h" +#include "simulate_passive.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { gen_tiles_bindings_gpu(m); tile_matmul_bindings(m); inference_bindings(m); + simulate_passive_bindings(m); solve_passive_bindings(m); } \ No newline at end of file diff --git a/memtorch/cu/simulate_passive.cpp b/memtorch/cu/simulate_passive.cpp new file mode 100644 index 00000000..777aa7bd --- /dev/null +++ b/memtorch/cu/simulate_passive.cpp @@ -0,0 +1,109 @@ +#include +#include +#include + +#include + +#include + +#include + +#include "simulate_passive_kernels.cuh" + +//Default values +std::vector r_p{2699.2336, -672.930205}; +std::vector r_n{649.413746, -1474.32358}; + +void simulate_passive_bindings(py::module_ &m) { + + //Data_Driven2021 model + m.def( + "simulate_passive", + [&](at::Tensor conductance_matrix, at::Tensor device_matrix,int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float 
force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float A_p, float A_n, float t_p, float t_n, + float k_p, float k_n, std::vector r_p, std::vector r_n, float a_p, float a_n, float b_p, float b_n, bool sim_neighbors) { + return simulate_passive_dd(conductance_matrix, device_matrix,cuda_malloc_heap_size, rel_tol, + pulse_duration, refactory_period, pos_voltage_level, neg_voltage_level, + timeout, force_adjustment, force_adjustment_rel_tol, force_adjustment_pos_voltage_threshold, + force_adjustment_neg_voltage_threshold,time_series_resolution,r_off,r_on,A_p,A_n,t_p,t_n,k_p,k_n,r_p,r_n,a_p,a_n,b_p,b_n, sim_neighbors); + }, + py::arg("conductance_matrix"), py::arg("device_matrix"),py::arg("cuda_malloc_heap_size")=50, py::arg("rel_tol")=0.1, + py::arg("pulse_duration") = 1e-3, py::arg("refactory_period") = 0, py::arg("pos_voltage_level") = 1.0, + py::arg("neg_voltage_level") = -1.0, py::arg("timeout") = 5, py::arg("force_adjustment") = 1e-3, + py::arg("force_adjustment_rel_tol") = 1e-1, py::arg("force_adjustment_pos_voltage_threshold") = 0, + py::arg("force_adjustment_neg_voltage_threshold") = 0, py::arg("time_series_resolution") = 1e-10, py::arg("r_off") = 10000, py::arg("r_on") = 1000, py::arg("A_p") = 600.10075, + py::arg("A_n")=-34.5988399, py::arg("t_p") = -0.0212028, py::arg("t_n") = -0.05343997, py::arg("k_p") = 5.11e-4, py::arg("k_n") = 1.17e-3, + py::arg("r_p") = r_p, py::arg("r_n") = r_n, py::arg("a_p")=0.32046175, + py::arg("a_n")=0.32046175, py::arg("b_p")=2.71689828, py::arg("b_n")=2.71689828, py::arg("simulate_neighbours") = true); //Maybe change order of simulate_neighbours to before memristor args + + //Linear Ion Drift + m.def( + "simulate_passive", + [&](at::Tensor conductance_matrix, at::Tensor device_matrix,int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float u_v, + float d,float pos_write_threshold, float neg_write_threshold, float p, bool sim_neighbors) { + return simulate_passive_linearIonDrift(conductance_matrix, device_matrix,cuda_malloc_heap_size, rel_tol, + pulse_duration, refactory_period, pos_voltage_level, neg_voltage_level, + timeout, force_adjustment, force_adjustment_rel_tol, force_adjustment_pos_voltage_threshold, + force_adjustment_neg_voltage_threshold,time_series_resolution,r_off,r_on, u_v, + d, pos_write_threshold, neg_write_threshold, p,sim_neighbors); + }, + py::arg("conductance_matrix"), py::arg("device_matrix"),py::arg("cuda_malloc_heap_size")=50, py::arg("rel_tol")=0.1, + py::arg("pulse_duration") = 1e-3, py::arg("refactory_period") = 0, py::arg("pos_voltage_level") = 1.0, + py::arg("neg_voltage_level") = -1.0, py::arg("timeout") = 5, py::arg("force_adjustment") = 1e-3, + py::arg("force_adjustment_rel_tol") = 1e-1, py::arg("force_adjustment_pos_voltage_threshold") = 0, + py::arg("force_adjustment_neg_voltage_threshold") = 0, py::arg("time_series_resolution") = 1e-4, py::arg("r_off") = 10000, py::arg("r_on") = 1000, py::arg("u_v") = 1e-14, + py::arg("d") = 10e-9, py::arg("pos_write_threshold") = 0.55, py::arg("neg_write_threshold") = -0.55, py::arg("p") = 1, py::arg("simulate_neighbours") = true); + + //VTEAM + m.def( + 
"simulate_passive", + [&](at::Tensor conductance_matrix, at::Tensor device_matrix,int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float d, + float k_on, float k_off, float alpha_on, float alpha_off, float v_on, float v_off, float x_on, float x_off, bool sim_neighbors) { + return simulate_passive_VTEAM(conductance_matrix, device_matrix,cuda_malloc_heap_size, rel_tol, + pulse_duration, refactory_period, pos_voltage_level, neg_voltage_level, + timeout, force_adjustment, force_adjustment_rel_tol, force_adjustment_pos_voltage_threshold, + force_adjustment_neg_voltage_threshold,time_series_resolution,r_off,r_on,d, + k_on, k_off, alpha_on, alpha_off, v_on, v_off, x_on, x_off, sim_neighbors); + }, + py::arg("conductance_matrix"), py::arg("device_matrix"),py::arg("cuda_malloc_heap_size")=50, py::arg("rel_tol")=0.1, + py::arg("pulse_duration") = 1e-3, py::arg("refactory_period") = 0, py::arg("pos_voltage_level") = 1.0, + py::arg("neg_voltage_level") = -1.0, py::arg("timeout") = 5, py::arg("force_adjustment") = 1e-3, + py::arg("force_adjustment_rel_tol") = 1e-1, py::arg("force_adjustment_pos_voltage_threshold") = 0, + py::arg("force_adjustment_neg_voltage_threshold") = 0, py::arg("time_series_resolution") = 1e-10, py::arg("r_off") = 10000, py::arg("r_on") = 1000, py::arg("d") = 3e-9, + py::arg("k_on") =-10, py::arg("k_off") = 5e-4, py::arg("alpha_on") =3, py::arg("alpha_off") = 1, py::arg("v_on") = 0.2, py::arg("v_off") = 0.02, py::arg("x_on") = 0, + py::arg("x_off") = 3e-9, py::arg("simulate_neighbours") = true); + + //Stanford_PKU + m.def( + "simulate_passive", + [&](at::Tensor conductance_matrix, at::Tensor device_matrix,int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float gap_init, + float g_0, float V_0, float I_0, float read_voltage, float T_init, float R_th, float gamma_init, + float beta, float t_ox, float F_min, float vel_0, float E_a, float a_0, float delta_g_init, + float model_switch, float T_crit, float T_smth, bool sim_neighbors) { + return simulate_passive_Stanford_PKU(conductance_matrix, device_matrix,cuda_malloc_heap_size, rel_tol, + pulse_duration, refactory_period, pos_voltage_level, neg_voltage_level, + timeout, force_adjustment, force_adjustment_rel_tol, force_adjustment_pos_voltage_threshold, + force_adjustment_neg_voltage_threshold,time_series_resolution,r_off,r_on, gap_init, + g_0, V_0, I_0, read_voltage, T_init, R_th, gamma_init, beta, t_ox, F_min, vel_0, E_a, a_0, + delta_g_init, model_switch, T_crit, T_smth, sim_neighbors); + }, + py::arg("conductance_matrix"), py::arg("device_matrix"),py::arg("cuda_malloc_heap_size")=50, py::arg("rel_tol")=0.1, + py::arg("pulse_duration") = 1e-3, py::arg("refactory_period") = 0, py::arg("pos_voltage_level") = 1.0, + py::arg("neg_voltage_level") = -1.0, py::arg("timeout") = 5, py::arg("force_adjustment") = 1e-3, + py::arg("force_adjustment_rel_tol") = 1e-1, py::arg("force_adjustment_pos_voltage_threshold") = 0, + 
py::arg("force_adjustment_neg_voltage_threshold") = 0, py::arg("time_series_resolution") = 1e-10, py::arg("r_off") = 10000, py::arg("r_on") = 1000, py::arg("gap_init") = 2e-10, + py::arg("g_0") = 0.25e-9, py::arg("V_0") = 0.25, py::arg("I_0") = 1000e-6, py::arg("read_voltage") = 0.1, py::arg("T_init") = 298, py::arg("R_th") = 2.1e3, + py::arg("gamma_init") = 16, py::arg("beta") = 0.8, py::arg("t_ox") = 12e-9,py::arg("F_min") = 1.4e9, py::arg("vel_0") = 10, py::arg("E_a") = 0.6, py::arg("a_0") = 0.25e-9, + py::arg("delta_g_init") = 0.02, py::arg("model_switch") = 0, py::arg("T_crit") = 450, py::arg("T_smth") = 500, py::arg("simulate_neighbours") = true); +} \ No newline at end of file diff --git a/memtorch/cu/simulate_passive.cuh b/memtorch/cu/simulate_passive.cuh new file mode 100644 index 00000000..65ec0cc3 --- /dev/null +++ b/memtorch/cu/simulate_passive.cuh @@ -0,0 +1 @@ +//unused so far diff --git a/memtorch/cu/simulate_passive.h b/memtorch/cu/simulate_passive.h new file mode 100644 index 00000000..dabda189 --- /dev/null +++ b/memtorch/cu/simulate_passive.h @@ -0,0 +1 @@ +void simulate_passive_bindings(py::module_ &m); \ No newline at end of file diff --git a/memtorch/cu/simulate_passive_kernels.cu b/memtorch/cu/simulate_passive_kernels.cu new file mode 100644 index 00000000..d1ff7f71 --- /dev/null +++ b/memtorch/cu/simulate_passive_kernels.cu @@ -0,0 +1,524 @@ +#include "cuda_runtime.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "simulate_passive.h" +#include "solve_sparse_linear.h" +#include "utils.cuh" + +__constant__ float positive_voltage; +__constant__ float negative_voltage; + +__constant__ int NX; //number of rows +__constant__ int NY; //number of columns +__constant__ int NZ; //number of tiles + +__constant__ float tsr_global; //time series resolution +__constant__ float r_off_global; +__constant__ float r_on_global; +__constant__ float pulse_dur_global; //pulse duration +__constant__ float writing_tol_global; //rel_tol +__constant__ float res_adjustment; //adjustment made if the resistance is stuck +__constant__ float res_adjustment_rel_tol; + +//Data_Driven Global variables +__constant__ float t_n_global; +__constant__ float t_p_global; + +__constant__ float s_n_global; +__constant__ float s_p_global; +__constant__ float r_p_global; +__constant__ float r_n_global; + +__constant__ float r_p_0; +__constant__ float r_p_1; +__constant__ float r_n_0; +__constant__ float r_n_1; +__constant__ float A_p_global; +__constant__ float A_n_global; + + + +/** + * Cuda kernel to simulate the devices with neighbors + * + * @param device_matrix the device matrix of conductances + * @param current_i the position in i for the current device to simulate + * @param current_j the position in j for the current device to simulate + * @param instruction_array array of integers corresponding to the current instruction for the tile being programmed + * @param r_n_arr array of r_n values shared across devices + * @param s_n_arr array of s_n values shared across devices + * @param r_p_arr array of r_n values shared across devices + * @param s_p_arr array of s_n values shared across devices + * @param r_n_half_arr array of r_n/2 values shared across devices + * @param s_n_half_arr array of s_n/2 values shared across devices + * @param r_p_half_arr array of r_n/2 values shared across devices + * @param s_p_half_arr array of s_n/2 values shared across devices + */ + +__global__ void simulate_device_dd(float 
*device_matrix, int current_i, int current_j, int *instruction_array, float *r_n_arr, float *s_n_arr, float *r_n_half_arr, float *s_n_half_arr, float *r_p_arr, float *s_p_arr, float *r_p_half_arr, float *s_p_half_arr)
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x; // for (int i = 0; i < NX; i++)
+  int j = threadIdx.y + blockIdx.y * blockDim.y; // for (int j = 0; j < NY; j++)
+  int k = threadIdx.z + blockIdx.z * blockDim.z; // for (int k = 0; k < NZ; k++)
+  if (i < NX && j < NY && k < NZ)
+  {
+    float resistance_;
+    int index = (k * NX * NY) + (j * NX) + i;
+    if (i == current_i && j == current_j) // if this is the device being programmed
+    {
+      float R0 = 1 / device_matrix[index];
+      if (instruction_array[k] == 1)
+      {
+        resistance_ = (R0 + (s_p_arr[k] * r_p_arr[k] * (r_p_arr[k] - R0)) * pulse_dur_global) / (1 + s_p_arr[k] * (r_p_arr[k] - R0) * pulse_dur_global);
+        if (resistance_ < r_p_arr[k]) {
+          resistance_ = R0;
+        }
+      }
+      else if (instruction_array[k] == 2)
+      {
+        resistance_ = (R0 + (s_n_arr[k] * r_n_arr[k] * (r_n_arr[k] - R0)) * pulse_dur_global) / (1 + s_n_arr[k] * (r_n_arr[k] - R0) * pulse_dur_global);
+        if (resistance_ > r_n_arr[k]) {
+          resistance_ = R0;
+        }
+      }
+      if (instruction_array[k] != 0)
+      {
+        // Force a small adjustment if the pulse left the resistance essentially unchanged
+        if (resistance_ >= R0 - res_adjustment_rel_tol * R0 && resistance_ <= R0 + res_adjustment_rel_tol * R0) {
+          if (instruction_array[k] == 2 && resistance_ < r_off_global)
+            resistance_ += res_adjustment * resistance_;
+          else if (instruction_array[k] == 1 && resistance_ > r_on_global)
+            resistance_ -= res_adjustment * resistance_;
+        }
+      }
+    }
+    else if (i == current_i || j == current_j) // if the device shares a row or column (half-voltage disturb)
+    {
+      float R0 = 1 / device_matrix[index];
+      if (instruction_array[k] == 1)
+      {
+        resistance_ = (R0 + (s_p_half_arr[k] * r_p_half_arr[k] * (r_p_half_arr[k] - R0)) * pulse_dur_global) / (1 + s_p_half_arr[k] * (r_p_half_arr[k] - R0) * pulse_dur_global);
+      }
+      else if (instruction_array[k] == 2)
+      {
+        resistance_ = (R0 + (s_n_half_arr[k] * r_n_half_arr[k] * (r_n_half_arr[k] - R0)) * pulse_dur_global) / (1 + s_n_half_arr[k] * (r_n_half_arr[k] - R0) * pulse_dur_global);
+      }
+    }
+    if ((i == current_i && j == current_j) || (i == current_i || j == current_j)) {
+      if (instruction_array[k] != 0)
+      {
+        // Check to ensure that the resistance remains within the possible range
+        if (resistance_ > r_off_global)
+        {
+          resistance_ = r_off_global;
+        }
+        if (resistance_ < r_on_global)
+        {
+          resistance_ = r_on_global;
+        }
+        device_matrix[index] = 1 / resistance_;
+      }
+    }
+  }
+}
+
+/**
+ * CUDA kernel to simulate the devices without neighbours
+ *
+ * @param device_matrix the device matrix of conductances
+ * @param conductance_matrix the matrix of target conductances
+ * @param force_adjustment_pos_voltage_threshold maximum positive voltage level
+ * @param force_adjustment_neg_voltage_threshold minimum negative voltage level
+ */
+__global__ void simulate_device_dd_no_neighbours(float *device_matrix, float *conductance_matrix, float force_adjustment_pos_voltage_threshold, float force_adjustment_neg_voltage_threshold)
+{
+  int i = threadIdx.x + blockIdx.x * blockDim.x; // for (int i = 0; i < NX; i++)
+  int j = threadIdx.y + blockIdx.y * blockDim.y; // for (int j = 0; j < NY; j++)
+  int k = threadIdx.z + blockIdx.z * blockDim.z; // for (int k = 0; k < NZ; k++)
+  if (i < NX && j < NY && k < NZ)
+  {
+    int index = (k * NX * NY) + (j * NX) + i;
+    float R0 = 1 / device_matrix[index];
+    float target_R = 1 / conductance_matrix[index];
+    float resistance_;
+    float s_n;
+    float r_n;
+    float s_p;
+    float r_p;
+    float neg_voltage_level = negative_voltage;
+    float pos_voltage_level = positive_voltage;
+    int iterations = 0;
+    while ((R0 < target_R - writing_tol_global * target_R || R0 > target_R + writing_tol_global * target_R) && iterations < 1000)
+    {
+      iterations += 1;
+      if (R0 < target_R - writing_tol_global * target_R)
+      {
+        s_n = A_n_global * (exp(abs(neg_voltage_level) / t_n_global) - 1);
+        r_n = r_n_0 + r_n_1 * neg_voltage_level;
+        resistance_ = (R0 + (s_n * r_n * (r_n - R0)) * pulse_dur_global) / (1 + s_n * (r_n - R0) * pulse_dur_global);
+        if (resistance_ > r_n)
+        {
+          resistance_ = R0;
+        }
+        pos_voltage_level = positive_voltage;
+        if (neg_voltage_level > force_adjustment_neg_voltage_threshold) {
+          neg_voltage_level -= 0.02;
+        }
+        if (resistance_ >= R0 - res_adjustment_rel_tol * R0 && resistance_ <= R0 + res_adjustment_rel_tol * R0) {
+          resistance_ += res_adjustment * resistance_;
+        }
+        R0 = resistance_;
+      }
+      else if (R0 > target_R + writing_tol_global * target_R)
+      {
+        s_p = A_p_global * (exp(abs(pos_voltage_level) / t_p_global) - 1);
+        r_p = r_p_0 + r_p_1 * pos_voltage_level;
+        resistance_ = (R0 + (s_p * r_p * (r_p - R0)) * pulse_dur_global) / (1 + s_p * (r_p - R0) * pulse_dur_global);
+        neg_voltage_level = negative_voltage;
+        if (resistance_ < r_p)
+        {
+          resistance_ = R0; // Artificially confine the resistance between r_on and r_off
+        }
+        if (resistance_ >= R0 - res_adjustment_rel_tol * R0 && resistance_ <= R0 + res_adjustment_rel_tol * R0) {
+          resistance_ -= res_adjustment * resistance_;
+        }
+        if (pos_voltage_level < force_adjustment_pos_voltage_threshold) {
+          pos_voltage_level += 0.02;
+        }
+        R0 = resistance_;
+      }
+    }
+    // Check to ensure that the resistance remains within the possible range
+    if (R0 > r_off_global)
+    {
+      R0 = r_off_global;
+    }
+    if (R0 < r_on_global)
+    {
+      R0 = r_on_global;
+    }
+    device_matrix[index] = 1 / R0;
+  }
+}
+
+/**
+ * Simulates passive crossbar programming using CUDA. The amplitudes of the voltages are increased or decreased by 0.02 V (TODO: make this a parameter) every time the desired resistance is not achieved.
+ * The voltages are reset to their initial values if the desired resistance is overshot.
+ *
+ * @param conductance_matrix matrix of target conductances
+ * @param device_matrix matrix of current device conductances
+ * @param cuda_malloc_heap_size maximum CUDA malloc heap size (MiB)
+ * @param rel_tol acceptable relative tolerance on the achieved resistance value
+ * @param pulse_duration duration of the pulses
+ * @param refactory_period refractory period in between every pulse sent //Not currently used//
+ * @param pos_voltage_level initial positive voltage level
+ * @param neg_voltage_level initial negative voltage level
+ * @param timeout timeout in between every pulse sent //Not currently used//
+ * @param force_adjustment percentage of the current resistance value used to artificially force the resistance towards the target if programming has not changed it significantly (+/- force_adjustment * current_resistance)
+ * @param force_adjustment_rel_tol percentage of the previous resistance used to determine whether the resistance has changed enough in one cycle to warrant a forced adjustment via force_adjustment (i.e., if current_resistance lies within previous_resistance +/- force_adjustment_rel_tol * previous_resistance, where the sign depends on the polarity)
+ * @param force_adjustment_pos_voltage_threshold maximum voltage that the 0.02 V incrementation can reach (the voltage always remains below this threshold)
+ * @param force_adjustment_neg_voltage_threshold minimum voltage that the 0.02 V decrementation can reach (the voltage always remains above this threshold)
+ * @param time_series_resolution time series resolution used in the simulation //Not currently used//
+ * @param r_off resistance at HRS
+ * @param r_on resistance at LRS
+ * @param A_p A_p parameter of the Data-Driven model
+ * @param A_n A_n parameter of the Data-Driven model
+ * @param t_p t_p parameter of the Data-Driven model
+ * @param t_n t_n parameter of the Data-Driven model
+ * @param k_p k_p parameter of the Data-Driven model
+ * @param k_n k_n parameter of the Data-Driven model
+ * @param r_p r_p parameters (r_p_0, r_p_1) of the Data-Driven model
+ * @param r_n r_n parameters (r_n_0, r_n_1) of the Data-Driven model
+ * @param a_p a_p parameter of the Data-Driven model
+ * @param a_n a_n parameter of the Data-Driven model
+ * @param b_p b_p parameter of the Data-Driven model
+ * @param b_n b_n parameter of the Data-Driven model
+ * @param sim_neighbors boolean to determine whether neighbouring devices are simulated
+ * @return tensor of new devices
+ */
+at::Tensor simulate_passive_dd(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol,
+                               float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level,
+                               float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold,
+                               float force_adjustment_neg_voltage_threshold, float time_series_resolution, float r_off, float r_on, float A_p, float A_n, float t_p, float t_n,
+                               float k_p, float k_n, std::vector<float> r_p, std::vector<float> r_n, float a_p, float a_n, float b_p, float b_n, bool sim_neighbors)
+{
+  assert(at::cuda::is_available());
+  // Assign the global (__constant__) variables their values
+  float original_pos_voltage = pos_voltage_level;
+  float original_neg_voltage = neg_voltage_level;
+  const size_t sz = sizeof(float);
+  const size_t si = sizeof(int);
+  cudaDeviceSetLimit(cudaLimitMallocHeapSize,
+                     size_t(1024) * size_t(1024) *
+                     size_t(cuda_malloc_heap_size));
+  cudaSafeCall(cudaMemcpyToSymbol(res_adjustment_rel_tol, &force_adjustment_rel_tol, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(res_adjustment, &force_adjustment, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(writing_tol_global, &rel_tol, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(tsr_global, &time_series_resolution, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_off_global, &r_off, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_on_global, &r_on, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(t_p_global, &t_p, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(t_n_global, &t_n, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(A_p_global, &A_p, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(A_n_global, &A_n, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(pulse_dur_global, &pulse_duration, sz, size_t(0), cudaMemcpyHostToDevice));
+  // Initialise the remaining __constant__ symbols used by simulate_device_dd_no_neighbours,
+  // which were otherwise left unset
+  cudaSafeCall(cudaMemcpyToSymbol(positive_voltage, &pos_voltage_level, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(negative_voltage, &neg_voltage_level, sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_p_0, &r_p[0], sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_p_1, &r_p[1], sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_n_0, &r_n[0], sz, size_t(0), cudaMemcpyHostToDevice));
+  cudaSafeCall(cudaMemcpyToSymbol(r_n_1, &r_n[1], sz, size_t(0), cudaMemcpyHostToDevice));
+  float *device_matrix_accessor = device_matrix.data_ptr<float>();
+  float *conductance_matrix_accessor = conductance_matrix.data_ptr<float>();
+  float *device_matrix_device;
+  float *conductance_matrix_device;
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  const int nz = conductance_matrix.sizes()[0]; // n_tiles
+  const int ny = conductance_matrix.sizes()[2]; // n_columns
+  const int nx = conductance_matrix.sizes()[1]; // n_rows
+  int max_threads = prop.maxThreadsDim[0];
+  dim3 grid;
+  dim3 block;
+  at::Tensor new_device_matrix;
+  if (nx * ny * nz > max_threads)
+  {
+    int n_grid = ceil_int_div(nx * ny * nz, max_threads);
+    grid = dim3(n_grid, n_grid, n_grid);
+    block = dim3(ceil_int_div(nx, n_grid), ceil_int_div(ny, n_grid), ceil_int_div(nz, n_grid));
+  }
+  else
+  {
+    grid = dim3(1, 1, 1);
+    block = dim3(nx, ny, nz);
+  }
+  cudaMemcpyToSymbol(NX, &nx, si, size_t(0), cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(NY, &ny, si, size_t(0), cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(NZ, &nz, si, size_t(0), cudaMemcpyHostToDevice);
+  // Boolean set to true when all tiles are programmed and false otherwise
+  bool all_tiles_programmed = false;
+  if (!sim_neighbors)
+  {
+    cudaMalloc(&conductance_matrix_device, sizeof(float) * nz * nx * ny);
+    cudaMemcpy(conductance_matrix_device, conductance_matrix_accessor, nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMalloc(&device_matrix_device, sizeof(float) * nz * nx * ny);
+    cudaMemcpy(device_matrix_device, device_matrix_accessor, nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);
+    simulate_device_dd_no_neighbours<<<grid, block>>>(device_matrix_device, conductance_matrix_device, force_adjustment_pos_voltage_threshold, force_adjustment_neg_voltage_threshold);
+    cudaSafeCall(cudaDeviceSynchronize());
+    cudaMemcpy(device_matrix_accessor, device_matrix_device, nz * ny * nx * sizeof(float), cudaMemcpyDeviceToHost);
+    new_device_matrix = torch::from_blob(device_matrix_accessor, {nz, nx, ny}, at::kFloat);
+    cudaSafeCall(cudaFree(device_matrix_device));
+    cudaSafeCall(cudaFree(conductance_matrix_device));
+  }
+  else
+  {
+    int iterations = 0;
+    float *neg_voltage_levels;
+    float *pos_voltage_levels;
+    // Vector passed to the threads to determine the amplitude of the negative voltages
+    neg_voltage_levels = new float[nz];
+    // Vector passed to the threads to determine the amplitude of the positive voltages
+    pos_voltage_levels = new float[nz];
+    // Set all the voltages to be passed to the threads to their initial values
+    for (int k = 0; k < nz; k++)
+    {
+      neg_voltage_levels[k] = original_neg_voltage;
+      pos_voltage_levels[k] = original_pos_voltage;
+    }
+    int *instruction_array;
+    // Vector passed to the threads to determine the polarity of the voltage on each tile:
+    // 0 -> no voltage, 1 -> positive_voltage, 2 -> negative_voltage
+    // The size of this vector corresponds to the number of tiles
+    instruction_array = (int *)malloc(nz * si);
+    float *r_n_array;
+    float *r_p_array;
+    float *s_n_array;
+    float *s_p_array;
+    // Vectors of Data-Driven parameters for each tile, as they will be the same for all devices
+    r_n_array = (float *)malloc(nz * sz);
+    r_p_array = (float *)malloc(nz * sz);
+    s_n_array = (float *)malloc(nz * sz);
+    s_p_array = (float *)malloc(nz * sz);
+    float *r_n_half_array;
+    float *r_p_half_array;
+    float *s_n_half_array;
+    float *s_p_half_array;
+    r_n_half_array = (float *)malloc(nz * sz);
+    r_p_half_array = (float *)malloc(nz * sz);
+    s_n_half_array = (float *)malloc(nz * sz);
+    s_p_half_array = (float *)malloc(nz * sz);
+    int *i_a;
+    // n_programmed corresponds to the number of tiles programmed
+    int n_programmed;
+    float *r_n_arr;
+    float *r_p_arr;
+    float *s_n_arr;
+    float *s_p_arr;
+    float *r_n_half_arr;
+    float *r_p_half_arr;
+    float *s_n_half_arr;
+    float *s_p_half_arr;
+    // Allocate the memory for all necessary arrays on the GPU
+    cudaMalloc(&i_a, sizeof(int) * nz);
+    cudaMalloc(&r_n_arr, sizeof(float) * nz);
+    cudaMalloc(&r_p_arr, sizeof(float) * nz);
+    cudaMalloc(&s_n_arr, sizeof(float) * nz);
+    cudaMalloc(&s_p_arr, sizeof(float) * nz);
+    cudaMalloc(&r_n_half_arr, sizeof(float) * nz);
+    cudaMalloc(&r_p_half_arr, sizeof(float) * nz);
+    cudaMalloc(&s_n_half_arr, sizeof(float) * nz);
+    cudaMalloc(&s_p_half_arr, sizeof(float) * nz);
+    cudaMalloc(&device_matrix_device, sizeof(float) * nz * nx * ny);
+
+    for (int i = 0; i < nx; i++)
+    { // for all i rows
+      for (int j = 0; j < ny; j++)
+      { // for all j columns
+        all_tiles_programmed = false;
+        iterations = 0;
+        while (!all_tiles_programmed)
+        {
+          if (iterations == 100)
+          { // Safety to ensure we do not get stuck on a device (TODO: make this a parameter)
+            printf("unable to program device(s) at row %d and column %d\n", i, j);
+            all_tiles_programmed = true;
+            iterations = 0;
+          }
+          iterations++;
+          n_programmed = 0;
+          for (int k = 0; k < nz; k++)
+          { // should not be a very big array (corresponds to the number of tiles)
+            int index = i + j * nx + k * nx * ny;
+            float target_R = 1 / conductance_matrix[k][i][j].item<float>();
+            if (target_R - rel_tol * target_R > 1 / device_matrix_accessor[index])
+            {
+              instruction_array[k] = 2;
+              s_n_array[k] = A_n * (exp(abs(neg_voltage_levels[k]) / t_n) - 1);
+              r_n_array[k] = r_n[0] + r_n[1] * neg_voltage_levels[k];
+              s_n_half_array[k] = A_n * (exp(abs(neg_voltage_levels[k] / 2) / t_n) - 1);
+              r_n_half_array[k] = r_n[0] + r_n[1] * neg_voltage_levels[k] / 2;
+              s_p_array[k] = 0;
+              r_p_array[k] = 0;
+              s_p_half_array[k] = 0;
+              r_p_half_array[k] = 0;
+              pos_voltage_levels[k] = original_pos_voltage;
+              // 0.02 hard coded so far
+              if (neg_voltage_levels[k] > force_adjustment_neg_voltage_threshold) {
+                neg_voltage_levels[k] -= 0.02;
+              }
+            }
+            else if (target_R + rel_tol * target_R < 1 / device_matrix_accessor[index])
+            {
+              instruction_array[k] = 1;
+              s_p_array[k] = A_p * (exp(abs(pos_voltage_levels[k]) / t_p) - 1);
+              r_p_array[k] = r_p[0] + r_p[1] * pos_voltage_levels[k];
+              s_p_half_array[k] = A_p * (exp(abs(pos_voltage_levels[k] / 2) / t_p) - 1);
+              r_p_half_array[k] = r_p[0] + r_p[1] * pos_voltage_levels[k] / 2;
+              s_n_array[k] = 0;
+              r_n_array[k] = 0;
+              s_n_half_array[k] = 0;
+              r_n_half_array[k] = 0;
+              neg_voltage_levels[k] = original_neg_voltage;
+              if (pos_voltage_levels[k] < force_adjustment_pos_voltage_threshold) {
+                pos_voltage_levels[k] += 0.02;
+              }
+            }
+            else
+            {
+              instruction_array[k] = 0;
+              n_programmed++;
+            }
+          }
+          if (n_programmed == nz && nz != 0)
+          {
+            all_tiles_programmed = true;
+            iterations = 0;
+            n_programmed = 0;
+          }
+          // The speedup is likely bounded by these host-to-device transfers, but remains significant
+          cudaMemcpy(i_a, instruction_array, nz * sizeof(int), cudaMemcpyHostToDevice);
+          cudaMemcpy(r_n_arr, r_n_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(s_n_arr, s_n_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(r_n_half_arr, r_n_half_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(s_n_half_arr, s_n_half_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(r_p_arr, r_p_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(s_p_arr, s_p_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(r_p_half_arr, r_p_half_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(s_p_half_arr, s_p_half_array, nz * sizeof(float), cudaMemcpyHostToDevice);
+          cudaMemcpy(device_matrix_device, device_matrix_accessor, nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);
+          simulate_device_dd<<<grid, block>>>(device_matrix_device, i, j, i_a, r_n_arr, s_n_arr, r_n_half_arr, s_n_half_arr, r_p_arr, s_p_arr, r_p_half_arr, s_p_half_arr);
+          cudaSafeCall(cudaDeviceSynchronize());
+          cudaMemcpy(device_matrix_accessor, device_matrix_device, nz * ny * nx * sizeof(float), cudaMemcpyDeviceToHost);
+        }
+      }
+    }
+    new_device_matrix = torch::from_blob(device_matrix_accessor, {nz, nx, ny}, at::kFloat);
+    cudaSafeCall(cudaFree(i_a));
+    cudaSafeCall(cudaFree(r_n_arr));
+    cudaSafeCall(cudaFree(s_n_arr));
+    cudaSafeCall(cudaFree(r_n_half_arr));
+    cudaSafeCall(cudaFree(s_n_half_arr));
+    cudaSafeCall(cudaFree(r_p_arr));
+    cudaSafeCall(cudaFree(s_p_arr));
+    cudaSafeCall(cudaFree(r_p_half_arr));
+    cudaSafeCall(cudaFree(s_p_half_arr));
+    cudaSafeCall(cudaFree(device_matrix_device));
+    // Free host-side staging buffers
+    delete[] neg_voltage_levels;
+    delete[] pos_voltage_levels;
+    free(instruction_array);
+    free(r_n_array);
+    free(r_p_array);
+    free(s_n_array);
+    free(s_p_array);
+    free(r_n_half_array);
+    free(r_p_half_array);
+    free(s_n_half_array);
+    free(s_p_half_array);
+  }
+  cudaStreamSynchronize(at::cuda::getCurrentCUDAStream());
+  return new_device_matrix;
+}
+
+at::Tensor simulate_passive_linearIonDrift(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol,
+                                           float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level,
+                                           float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold,
+                                           float force_adjustment_neg_voltage_threshold, float time_series_resolution, float r_off, float r_on, float u_v,
+                                           float d, float pos_write_threshold, float neg_write_threshold, float p, bool sim_neighbors)
+{
+  assert(at::cuda::is_available());
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  int max_threads = prop.maxThreadsDim[0];
+  throw std::runtime_error(std::string("Failed: simulate_passive_linearIonDrift has yet to be implemented"));
+  return conductance_matrix;
+}
+
+at::Tensor simulate_passive_Stanford_PKU(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol,
+                                         float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level,
+                                         float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold,
+                                         float force_adjustment_neg_voltage_threshold, float time_series_resolution, float r_off, float r_on, float gap_init,
+                                         float g_0, float V_0, float I_0,
float read_voltage, float T_init, float R_th, float gamma_init, + float beta, float t_ox, float F_min, float vel_0, float E_a, float a_0, float delta_g_init, + float model_switch, float T_crit, float T_smth, bool sim_neighbors) +{ + + assert(at::cuda::is_available()); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + int max_threads = prop.maxThreadsDim[0]; + throw std::runtime_error(std::string("Failed: simulate_passive_Stanford_PKU has yet to be implemented")); + return conductance_matrix; +} + +at::Tensor simulate_passive_VTEAM(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution, float r_off, float r_on, float d, + float k_on, float k_off, float alpha_on, float alpha_off, float v_on, float v_off, float x_on, float x_off, bool sim_neighbors) +{ + + assert(at::cuda::is_available()); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + int max_threads = prop.maxThreadsDim[0]; + throw std::runtime_error(std::string("Failed: simulate_passive_VTEAM has yet to be implemented")); + return conductance_matrix; +} diff --git a/memtorch/cu/simulate_passive_kernels.cuh b/memtorch/cu/simulate_passive_kernels.cuh new file mode 100644 index 00000000..670ba7a3 --- /dev/null +++ b/memtorch/cu/simulate_passive_kernels.cuh @@ -0,0 +1,28 @@ +at::Tensor simulate_passive_dd(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float A_p, float A_n, float t_p, float t_n, + float k_p, float k_n, std::vector r_p, std::vector r_n, float a_p, float a_n, float b_p, float b_n, bool sim_neighbors); + +at::Tensor simulate_passive_linearIonDrift(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float u_v, + float d,float pos_write_threshold, float neg_write_threshold, float p, bool sim_neighbors); + +at::Tensor simulate_passive_Stanford_PKU(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float gap_init, + float g_0, float V_0, float I_0, float read_voltage, float T_init, float R_th, float gamma_init, + float beta, float t_ox, float F_min, float vel_0, float E_a, float a_0, float delta_g_init, + float model_switch, float T_crit, float T_smth, bool sim_neighbors); + +at::Tensor 
simulate_passive_VTEAM(at::Tensor conductance_matrix, at::Tensor device_matrix, int cuda_malloc_heap_size, float rel_tol, + float pulse_duration, float refactory_period, float pos_voltage_level, float neg_voltage_level, + float timeout, float force_adjustment, float force_adjustment_rel_tol, float force_adjustment_pos_voltage_threshold, + float force_adjustment_neg_voltage_threshold, float time_series_resolution , float r_off, float r_on, float d, + float k_on, float k_off, float alpha_on, float alpha_off, float v_on, float v_off, float x_on, float x_off, bool sim_neighbors); + + +int countOccurrences(int arr[], int n, int x); \ No newline at end of file diff --git a/memtorch/mn/Conv1d.py b/memtorch/mn/Conv1d.py index 777fcabd..2587d18f 100644 --- a/memtorch/mn/Conv1d.py +++ b/memtorch/mn/Conv1d.py @@ -167,6 +167,7 @@ def __init__( scheme=scheme, tile_shape=tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size = self.cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) self.transform_output = lambda x: x diff --git a/memtorch/mn/Conv2d.py b/memtorch/mn/Conv2d.py index 3ff1cb46..5ca9263a 100644 --- a/memtorch/mn/Conv2d.py +++ b/memtorch/mn/Conv2d.py @@ -168,6 +168,7 @@ def __init__( scheme=scheme, tile_shape=tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size=self.cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) self.transform_output = lambda x: x diff --git a/memtorch/mn/Conv3d.py b/memtorch/mn/Conv3d.py index c95148aa..e59e8c66 100644 --- a/memtorch/mn/Conv3d.py +++ b/memtorch/mn/Conv3d.py @@ -167,6 +167,7 @@ def __init__( scheme=scheme, tile_shape=tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size=self.cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) self.transform_output = lambda x: x diff --git a/memtorch/mn/Linear.py b/memtorch/mn/Linear.py index 7dc9d6fa..ee21fe26 100644 --- a/memtorch/mn/Linear.py +++ b/memtorch/mn/Linear.py @@ -159,6 +159,7 @@ def __init__( scheme=scheme, tile_shape=tile_shape, use_bindings=use_bindings, + cuda_malloc_heap_size=self.cuda_malloc_heap_size, random_crossbar_init=random_crossbar_init, ) self.transform_output = lambda x: x diff --git a/memtorch/version.py b/memtorch/version.py index d14425a9..21533d04 100644 --- a/memtorch/version.py +++ b/memtorch/version.py @@ -1 +1 @@ -__version__ = "1.1.5-cpu" +__version__ = "1.1.5-cpu" \ No newline at end of file
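---

For reference, both CUDA kernels above apply the same per-pulse update rule from the 2021 Data-Driven model. The following Python restatement is an illustrative sketch only (the helper name data_driven_pulse and its standalone form are not part of the patch; the constants are the defaults registered in memtorch/cu/simulate_passive.cpp):

    import math

    def data_driven_pulse(R0, voltage, A, t, r_0, r_1, pulse_duration=1e-3):
        # As in the kernels: s = A * (exp(|V| / t) - 1), r = r_0 + r_1 * V
        s = A * (math.exp(abs(voltage) / t) - 1)
        r = r_0 + r_1 * voltage
        # One pulse: R' = (R0 + s*r*(r - R0)*t_pulse) / (1 + s*(r - R0)*t_pulse)
        return (R0 + s * r * (r - R0) * pulse_duration) / (1 + s * (r - R0) * pulse_duration)

    # A single positive (SET) pulse at +1.0 V with the default A_p, t_p, and r_p values;
    # the resistance moves from R0 towards r = r_0 + r_1 * V.
    R_new = data_driven_pulse(R0=5000.0, voltage=1.0, A=600.10075, t=-0.0212028,
                              r_0=2699.2336, r_1=-672.930205)

The programming routines repeat this update, ramping the pulse amplitude by 0.02 V until the target resistance is reached within rel_tol, while clamping the result to [r_on, r_off].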
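A minimal usage sketch of the new binding, assuming a CUDA build of MemTorch (so that memtorch_cuda_bindings is importable); the keyword arguments follow the py::arg names registered above, with omitted Data-Driven parameters falling back to their defaults:

    import torch
    import memtorch_cuda_bindings

    # Tile-stacked conductance tensors of shape (n_tiles, n_rows, n_columns),
    # the layout expected by the CUDA kernels.
    n_tiles, n_rows, n_columns = 2, 128, 128
    conductance_matrix = torch.empty(n_tiles, n_rows, n_columns).uniform_(1 / 10000, 1 / 1000)
    device_matrix = torch.full((n_tiles, n_rows, n_columns), 1 / 5000)

    programmed = memtorch_cuda_bindings.simulate_passive(
        conductance_matrix,
        device_matrix,
        cuda_malloc_heap_size=50,   # MiB reserved for device-side allocations
        rel_tol=0.1,                # +/- 10% tolerance on the programmed resistance
        simulate_neighbours=True,   # also apply half-voltage disturbs to shared rows/columns
    )

Within MemTorch itself this path is taken automatically by Crossbar.write_conductance_matrix whenever use_bindings is set, the crossbar's devices are Data_Driven2021 instances, and a CUDA build is installed.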