From 55f155ffcc466511f676a5b559a9929ec26bfdfa Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 9 Oct 2025 18:33:12 -0400
Subject: [PATCH 01/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu-kernels/src/bit_unpack.rs    |   1 +
 vortex-gpu-kernels/src/lib.rs           |   1 +
 vortex-gpu/Cargo.toml                   |   1 -
 vortex-gpu/kernels/dict_take.cu         |   4 +-
 vortex-gpu/kernels/fused_bitpack_for.cu |  22 +-
 vortex-gpu/src/bit_unpack.rs            |   2 +-
 vortex-gpu/src/for_bp.rs                |  29 +--
 vortex-gpu/src/indent.rs                |  34 +++
 vortex-gpu/src/jit.rs                   | 310 ++++++++++++++++++++++++
 vortex-gpu/src/lib.rs                   |   2 +
 vortex-gpu/src/task.rs                  |   4 +-
 11 files changed, 389 insertions(+), 21 deletions(-)
 create mode 100644 vortex-gpu/src/indent.rs
 create mode 100644 vortex-gpu/src/jit.rs
diff --git a/vortex-gpu-kernels/src/bit_unpack.rs b/vortex-gpu-kernels/src/bit_unpack.rs
index 976b66e9286..743c072a9a0 100644
--- a/vortex-gpu-kernels/src/bit_unpack.rs
+++ b/vortex-gpu-kernels/src/bit_unpack.rs
@@ -157,6 +157,7 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
         writeln!(output)?;
         generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
         writeln!(output)?;
+
         generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
         writeln!(output)?;
     }
diff --git a/vortex-gpu-kernels/src/lib.rs b/vortex-gpu-kernels/src/lib.rs
index 2130b890b5f..ff4e2a95799 100644
--- a/vortex-gpu-kernels/src/lib.rs
+++ b/vortex-gpu-kernels/src/lib.rs
@@ -5,3 +5,4 @@ mod bit_unpack;
 mod indent;
 
 pub use bit_unpack::generate_unpack;
+pub use indent::IndentedWriter;
diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml
index df1239457c5..01ddc50d151 100644
--- a/vortex-gpu/Cargo.toml
+++ b/vortex-gpu/Cargo.toml
@@ -14,7 +14,6 @@ rust-version = { workspace = true }
 version = { workspace = true }
 
 [dependencies]
-#askama = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
 parking_lot = { workspace = true }
 vortex-array = { workspace = true }
diff --git a/vortex-gpu/kernels/dict_take.cu b/vortex-gpu/kernels/dict_take.cu
index 3f3b1b0a551..9f222769302 100644
--- a/vortex-gpu/kernels/dict_take.cu
+++ b/vortex-gpu/kernels/dict_take.cu
@@ -20,8 +20,8 @@ __device__ void dict_take(
     const int thread_ops = 32;
 
     for (auto j = 0; j < thread_ops; j++) {
-            auto idx = i * thread_ops + j;
-            out[idx] = values[codes[idx]];
+        auto idx = i * thread_ops + j;
+        out[idx] = values[codes[idx]];
     }
 }
 
diff --git a/vortex-gpu/kernels/fused_bitpack_for.cu b/vortex-gpu/kernels/fused_bitpack_for.cu
index 4fd4bc48149..0ba14fedbe5 100644
--- a/vortex-gpu/kernels/fused_bitpack_for.cu
+++ b/vortex-gpu/kernels/fused_bitpack_for.cu
@@ -108,13 +108,28 @@ __device__ __forceinline__ void for_device(
     }
 }
 
+template<typename ValueT>
+__device__ __forceinline__ void float_device(
+    ValueT *__restrict values_in,
+    float *__restrict values_out,
+    int thread_idx
+) {
+    auto i = thread_idx;
+    const int thread_ops = blockDim.x;
+
+    for (auto j = 0; j < thread_ops; j++) {
+        auto idx = INDEX(j, i);
+        values_out[idx] = (float)values_in[idx];
+    }
+}
+
 
 // Fused kernel: bitpack unpack (3bw) + FoR addition in one pass
 // This eliminates the intermediate write-to-memory and read-from-memory
 // by keeping unpacked values in registers/L1 cache and immediately adding the reference
 extern "C" __global__ void fused_bitpack6_for_u32(
     const uint32_t *__restrict packed_in,
-    uint32_t *__restrict unpacked_out,
+    float *__restrict unpacked_out,
     uint32_t reference
 ) {
     int i = threadIdx.x;
@@ -124,13 +139,16 @@ extern "C" __global__ void fused_bitpack6_for_u32(
     auto out = unpacked_out + (blockIdx.x * 1024);
 
     __shared__ uint32_t shared_data[1024];
+    __shared__ float shared_data2[1024];
 
     fls_unpack_6bw_32ow_device(in, shared_data, i);
 
     for_device(shared_data, reference, i);
 
+    float_device(shared_data, shared_data2, i);
+
     for (int i = 0; i < 32; i++) {
         auto idx = i * 32 + threadIdx.x;
-        out[idx] = shared_data[idx];
+        out[idx] = shared_data2[idx];
     }
 }
diff --git a/vortex-gpu/src/bit_unpack.rs b/vortex-gpu/src/bit_unpack.rs
index 86f3d54a007..288b73d10c7 100644
--- a/vortex-gpu/src/bit_unpack.rs
+++ b/vortex-gpu/src/bit_unpack.rs
@@ -173,7 +173,7 @@ pub fn new_task(
 
         let launch_config = LaunchConfig {
             grid_dim: (num_chunks, 1, 1),
-            block_dim: (if size_of::<P>() == 8 { 16 } else { 32 }, 1, 1),
+            block_dim: (if P::BITS == 64 { 16 } else { 32 }, 1, 1),
             shared_mem_bytes: 0,
         };
 
diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs
index e2b8fb19e40..291459bf146 100644
--- a/vortex-gpu/src/for_bp.rs
+++ b/vortex-gpu/src/for_bp.rs
@@ -26,7 +26,7 @@ struct FoRBPTask<P> {
     launch_config: LaunchConfig,
 
     packed: CudaSlice<P>,
-    unpacked: CudaSlice<P>,
+    unpacked: CudaSlice<f32>,
     reference: P,
 
     len: usize,
@@ -53,7 +53,7 @@ pub fn new_task(
         .map_err(|e| vortex_err!("Failed to copy to device: {e}"))?;
     let cu_out = unsafe {
         stream
-            .alloc::<u32>(array.len().next_multiple_of(1024))
+            .alloc::<f32>(array.len().next_multiple_of(1024))
             .map_err(|e| vortex_err!("Failed to allocate stream: {e}"))?
     };
 
@@ -101,7 +101,7 @@ impl<P: NativePType + DeviceRepr> GPUTask for FoRBPTask<P> {
 
     fn export_result(&mut self) -> VortexResult<Canonical> {
         let len = self.len();
-        let mut buffer = BufferMut::<P>::with_capacity(len);
+        let mut buffer = BufferMut::<f32>::with_capacity(len);
 
         unsafe { buffer.set_len(len) }
         self.stream
@@ -186,16 +186,17 @@ mod tests {
         ctx.set_blocking_synchronize().unwrap();
         let unpacked = cuda_for_bp_unpack(&array, ctx).unwrap();
         let primitive_array = array.into_array().to_primitive();
-        assert_eq!(
-            primitive_array.as_slice::<u32>(),
-            unpacked.as_slice::<u32>()
-        );
-        for i in 0..primitive_array.len() {
-            assert_eq!(
-                primitive_array.as_slice::<u32>()[i],
-                unpacked.as_slice::<u32>()[i],
-                "i {i}"
-            );
-        }
+        println!("unpacked {:?}", unpacked.as_slice::<f32>());
+        // assert_eq!(
+        //     primitive_array.as_slice::<u32>(),
+        //     unpacked.as_slice::<u32>()
+        // );
+        // for i in 0..primitive_array.len() {
+        //     assert_eq!(
+        //         primitive_array.as_slice::<u32>()[i],
+        //         unpacked.as_slice::<u32>()[i],
+        //         "i {i}"
+        //     );
+        // }
     }
 }
diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs
new file mode 100644
index 00000000000..0518ef22a54
--- /dev/null
+++ b/vortex-gpu/src/indent.rs
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Write;
+use std::fmt;
+
+pub struct IndentedWriter<W: Write> {
+    write: W,
+    indent: String,
+}
+
+impl<W: Write> IndentedWriter<W> {
+    pub fn new(write: W) -> Self {
+        Self {
+            write,
+            indent: "".to_string(),
+        }
+    }
+
+    pub fn indent<F>(&mut self, indented: F) -> fmt::Result
+    where
+        F: FnOnce(&mut IndentedWriter<W>) -> fmt::Result,
+    {
+        let original_ident = self.indent.clone();
+        self.indent += "    ";
+        let res = indented(self);
+        self.indent = original_ident;
+        res
+    }
+
+    pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> fmt::Result {
+        write!(self.write, "{}{}", self.indent, fmt)
+    }
+}
diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
new file mode 100644
index 00000000000..7d5033aacc5
--- /dev/null
+++ b/vortex-gpu/src/jit.rs
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::{Display, Write};
+
+use vortex_array::ArrayRef;
+use vortex_dtype::PType;
+use vortex_error::{VortexResult, vortex_err};
+use vortex_fastlanes::{BitPackedVTable, FoRVTable};
+
+use crate::indent::IndentedWriter;
+
+pub enum IterationOrder {
+    InOrder,
+    FastLanesTransposed,
+}
+
+struct GPUKernelParameter {
+    name: String,
+    type_: String,
+}
+
+// struct GPUPipelineParameters {
+//     inputs: Vec<GPUKernelParameter>,
+//     output: GPUKernelParameter,
+//     block_width: usize,
+//     grid_width: usize,
+//     iteration_order: IterationOrder,
+// }
+//
+// struct GPUPipeline {
+//     body: String,
+//     parameters: GPUPipelineParameters,
+// }
+
+// bp -> output
+// tmp = ...
+// out[i] = tmp;
+
+// bp -> for -> output
+
+// tmp = ....
+// tmp_for = tmp + ref
+// output[i] = tmp_for[i]
+
+// have leaves only bp for now.
+
+// step-type (each one has a unique step_id)
+// step_id
+// in_params
+// decls/setup
+// kernel-step body // fn body(var, writer) -> str
+// output_var + output_type
+
+trait GPUPipelineJIT {
+    fn step_id(&self) -> usize;
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result;
+
+    fn output_var(&self) -> String;
+
+    fn output_type(&self) -> PType;
+}
+
+struct CUDAType(&'static str);
+
+impl Display for CUDAType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.0)
+    }
+}
+
+impl From<PType> for CUDAType {
+    fn from(value: PType) -> Self {
+        CUDAType(match value {
+            PType::U8 => "uint8_t",
+            PType::U16 => "uint16_t",
+            PType::U32 => "uint32_t",
+            PType::U64 => "uint64_t",
+            PType::I8 => "int8_t",
+            PType::I16 => "int16_t",
+            PType::I32 => "int32_t",
+            PType::I64 => "int64_t",
+            PType::F32 => "float",
+            PType::F64 => "double",
+            PType::F16 => todo!(),
+        })
+    }
+}
+
+struct BitPack {
+    step_id: usize,
+    output_type: PType,
+}
+
+impl GPUPipelineJIT for BitPack {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
+        p.push(GPUKernelParameter {
+            name: self.src_var(),
+            type_: format!("{} *__restrict", CUDAType::from(self.output_type)),
+        });
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.output_type);
+        // TODO: all types
+        writeln!(w, "unsigned int LANE_COUNT = 32;")?;
+        writeln!(w, "{output_cuda_type} src{};", self.step_id)?;
+        writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        for i in 0..4 {
+            let src = self.src_var();
+            let tmp = self.tmp_var();
+            writeln!(w, "{src} = in[thread_ix + {i}];")?;
+            writeln!(w, "{tmp} = ({src} >> 0) & MASK(uint32_t, 1);")?;
+            f(w)?;
+            writeln!(w)?;
+        }
+        Ok(())
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.output_type
+    }
+}
+
+impl BitPack {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn src_var(&self) -> String {
+        format!("src{}", self.step_id)
+    }
+}
+
+struct FoR {
+    step_id: usize,
+    reference_type: PType,
+    child: Box<dyn GPUPipelineJIT>,
+}
+
+impl FoR {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn ref_var(&self) -> String {
+        format!("ref{}", self.step_id)
+    }
+}
+
+impl GPUPipelineJIT for FoR {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
+        self.child.in_params(p);
+        p.push(GPUKernelParameter {
+            name: self.ref_var(),
+            type_: CUDAType::from(self.output_type()).to_string(),
+        })
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        self.child.decls(w)?;
+        let output_cuda_type = CUDAType::from(self.output_type());
+        // TODO: supprort all types
+        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        assert_eq!(self.output_type(), self.child.output_type());
+        let in_var = self.child.output_var();
+        let out_var = self.tmp_var();
+        let ref_var = self.ref_var();
+        self.child
+            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
+                f(w)
+            })
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.reference_type
+    }
+}
+
+fn handle_array(a: &ArrayRef, step_id: usize) -> Box<dyn GPUPipelineJIT> {
+    if let Some(bp) = a.as_opt::<BitPackedVTable>() {
+        return Box::new(BitPack {
+            step_id,
+            output_type: bp.ptype(),
+        });
+    };
+
+    if let Some(for_) = a.as_opt::<FoRVTable>() {
+        return Box::new(FoR {
+            step_id,
+            reference_type: for_.reference_scalar().as_primitive().ptype(),
+            child: handle_array(for_.encoded(), step_id + 1),
+        });
+    }
+
+    todo!()
+}
+
+fn _create_jit(a: &ArrayRef) -> fmt::Result {
+    let output = handle_array(a, 0);
+
+    let mut s = String::new();
+    let w = &mut s as &mut dyn Write;
+    let mut ind = IndentedWriter::new(w);
+    let w = &mut ind;
+
+    let mut params = Vec::new();
+    output.in_params(&mut params);
+    params.push(GPUKernelParameter {
+        name: "output".to_string(),
+        type_: format!("{} *__restrict", CUDAType::from(output.output_type())),
+    });
+
+    writeln!(w, "__global__ void kernel(")?;
+    w.indent(|w| {
+        params
+            .iter()
+            .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name))
+    })?;
+    writeln!(w, ") {{")?;
+
+    w.indent(|w| {
+        output.decls(w)?;
+        writeln!(w)?;
+        output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+            writeln!(w, "output[idx] = {}", output.output_var())
+        })
+    })?;
+    writeln!(w, "}}")?;
+
+    println!("{}", s);
+
+    Ok(())
+}
+
+fn create_jit(a: &ArrayRef) -> VortexResult<()> {
+    _create_jit(a).map_err(|e| vortex_err!("failed to write decls {e}"))
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_error::VortexResult;
+    use vortex_fastlanes::{BitPackedArray, FoRArray};
+
+    use crate::jit::create_jit;
+
+    #[test]
+    fn jit_arr() -> VortexResult<()> {
+        let for_ = FoRArray::try_new(
+            BitPackedArray::encode(
+                (0u32..10)
+                    .map(|_| 1u32)
+                    .collect::<PrimitiveArray>()
+                    .as_ref(),
+                2,
+            )?
+            .into_array(),
+            2u32.into(),
+        )?;
+
+        create_jit(&for_.into_array())?;
+
+        Ok(())
+    }
+}
diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs
index 0af2782f515..2a9b5ed1b20 100644
--- a/vortex-gpu/src/lib.rs
+++ b/vortex-gpu/src/lib.rs
@@ -4,6 +4,8 @@
 pub mod bit_unpack;
 pub mod for_;
 mod for_bp;
+mod indent;
+mod jit;
 mod take;
 mod task;
 
diff --git a/vortex-gpu/src/task.rs b/vortex-gpu/src/task.rs
index 7bf2c079f42..1b678e74508 100644
--- a/vortex-gpu/src/task.rs
+++ b/vortex-gpu/src/task.rs
@@ -6,7 +6,7 @@ use vortex_array::Canonical;
 use vortex_error::VortexResult;
 
 pub trait GPUTask {
-    // Must call `launch_task` or `launch_task_timed` once
+    // Must call `launch_task` once
     fn launch_task(&mut self) -> VortexResult<()>;
 
     // Must call this after launch_task
@@ -16,4 +16,6 @@ pub trait GPUTask {
     fn output(&mut self) -> CudaViewMut<'_, u8>;
 
     fn len(&self) -> usize;
+
+    // fn jit(&mut self) -> OptBox<dyn JITGPUTask> ;
 }

From a8e86227ec2e2108ab99494794501fab378f71c2 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Fri, 10 Oct 2025 16:43:23 -0400
Subject: [PATCH 02/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock               |   1 +
 vortex-gpu/Cargo.toml    |   1 +
 vortex-gpu/src/indent.rs |   2 +-
 vortex-gpu/src/jit.rs    | 350 ++++++++++++++++++++++++++++++++++-----
 4 files changed, 313 insertions(+), 41 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 04425053515..f9ecf0f3b06 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8978,6 +8978,7 @@ dependencies = [
  "parking_lot",
  "rand 0.9.2",
  "rstest",
+ "vortex-alp",
  "vortex-array",
  "vortex-buffer",
  "vortex-dict",
diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml
index 01ddc50d151..6a9e6a1aa28 100644
--- a/vortex-gpu/Cargo.toml
+++ b/vortex-gpu/Cargo.toml
@@ -17,6 +17,7 @@ version = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
 parking_lot = { workspace = true }
 vortex-array = { workspace = true }
+vortex-alp = { workspace = true }
 vortex-buffer = { workspace = true, features = ["cuda"] }
 vortex-dict = { workspace = true }
 vortex-dtype = { workspace = true }
diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs
index 0518ef22a54..5e7d587d202 100644
--- a/vortex-gpu/src/indent.rs
+++ b/vortex-gpu/src/indent.rs
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-use std::fmt::Write;
 use std::fmt;
+use std::fmt::Write;
 
 pub struct IndentedWriter<W: Write> {
     write: W,
diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
index 7d5033aacc5..34617e81401 100644
--- a/vortex-gpu/src/jit.rs
+++ b/vortex-gpu/src/jit.rs
@@ -2,12 +2,17 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt;
-use std::fmt::{Display, Write};
+use std::fmt::{Debug, Display, Write};
+use std::sync::Arc;
 
+use cudarc::driver::{CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
+use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype};
 use vortex_array::ArrayRef;
-use vortex_dtype::PType;
-use vortex_error::{VortexResult, vortex_err};
-use vortex_fastlanes::{BitPackedVTable, FoRVTable};
+use vortex_buffer::ByteBuffer;
+use vortex_dtype::half::f16;
+use vortex_dtype::{NativePType, PType, match_each_native_ptype};
+use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
+use vortex_fastlanes::{BitPackedArray, BitPackedVTable, FoRVTable};
 
 use crate::indent::IndentedWriter;
 
@@ -21,6 +26,11 @@ struct GPUKernelParameter {
     type_: String,
 }
 
+struct GPULaunchConfig {
+    block_width: usize,
+    grid_width: usize,
+}
+
 // struct GPUPipelineParameters {
 //     inputs: Vec<GPUKernelParameter>,
 //     output: GPUKernelParameter,
@@ -53,11 +63,17 @@ struct GPUKernelParameter {
 // kernel-step body // fn body(var, writer) -> str
 // output_var + output_type
 
+trait GPUVisitor<'a> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
+}
+
 trait GPUPipelineJIT {
     fn step_id(&self) -> usize;
 
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
 
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
+
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
 
     fn kernel_body(
@@ -69,6 +85,13 @@ trait GPUPipelineJIT {
     fn output_var(&self) -> String;
 
     fn output_type(&self) -> PType;
+
+    // always pass the output iteration aligned child last.
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>;
+
+    fn launch_config(&self) -> Option<GPULaunchConfig> {
+        None
+    }
 }
 
 struct CUDAType(&'static str);
@@ -97,12 +120,15 @@ impl From<PType> for CUDAType {
     }
 }
 
-struct BitPack {
+struct BitPack<P> {
     step_id: usize,
+    bit_width: u8,
+    packed: ByteBuffer,
     output_type: PType,
+    cuda_slice: CudaSlice<P>,
 }
 
-impl GPUPipelineJIT for BitPack {
+impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
     fn step_id(&self) -> usize {
         self.step_id
     }
@@ -110,14 +136,31 @@ impl GPUPipelineJIT for BitPack {
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
             name: self.src_var(),
-            type_: format!("{} *__restrict", CUDAType::from(self.output_type)),
+            type_: format!(
+                "{type_} *__restrict",
+                type_ = CUDAType::from(self.output_type)
+            ),
         });
     }
 
+    fn args<'a>(
+        &'a self,
+        stream: &Arc<CudaStream>,
+        launch_args: &mut LaunchArgs<'a>, // args: &mut Vec<Box<dyn DeviceRepr>>,
+    ) -> VortexResult<()> {
+        launch_args.arg(&self.cuda_slice);
+
+        Ok(())
+    }
+
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
         let output_cuda_type = CUDAType::from(self.output_type);
         // TODO: all types
-        writeln!(w, "unsigned int LANE_COUNT = 32;")?;
+        writeln!(
+            w,
+            "unsigned int LANE_COUNT = {bits};",
+            bits = 1024 / self.output_type.bit_width()
+        )?;
         writeln!(w, "{output_cuda_type} src{};", self.step_id)?;
         writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?;
         Ok(())
@@ -128,13 +171,59 @@ impl GPUPipelineJIT for BitPack {
         w: &mut IndentedWriter<&mut dyn Write>,
         f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
     ) -> fmt::Result {
-        for i in 0..4 {
+        let output = w;
+        let bit_width = self.bit_width as usize;
+        let bits = self.output_type.bit_width();
+        if bit_width == 0 {
+            writeln!(output, "uint{bits}_t zero = 0ULL;")?;
+            writeln!(output)?;
+            for row in 0..bits {
+                writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
+            }
+        } else if bit_width == bits {
+            writeln!(output)?;
+            for row in 0..bits {
+                writeln!(
+                    output,
+                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                )?;
+            }
+        } else {
             let src = self.src_var();
             let tmp = self.tmp_var();
-            writeln!(w, "{src} = in[thread_ix + {i}];")?;
-            writeln!(w, "{tmp} = ({src} >> 0) & MASK(uint32_t, 1);")?;
-            f(w)?;
-            writeln!(w)?;
+
+            writeln!(output)?;
+            writeln!(output, "{src} = in[lane];")?;
+            for row in 0..bits {
+                let curr_word = (row * bit_width) / bits;
+                let next_word = ((row + 1) * bit_width) / bits;
+                let shift = (row * bit_width) % bits;
+
+                if next_word > curr_word {
+                    let remaining_bits = ((row + 1) * bit_width) % bits;
+                    let current_bits = bit_width - remaining_bits;
+                    writeln!(
+                        output,
+                        "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {current_bits});"
+                    )?;
+
+                    if next_word < bit_width {
+                        writeln!(output, "{src} = in[lane + LANE_COUNT * {next_word}];")?;
+                        writeln!(
+                            output,
+                            "{tmp} |= ({src} & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
+                        )?;
+                    }
+                } else {
+                    writeln!(
+                        output,
+                        "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {bit_width});"
+                    )?;
+                }
+
+                f(output)?;
+                writeln!(output)?;
+            }
         }
         Ok(())
     }
@@ -146,9 +235,20 @@ impl GPUPipelineJIT for BitPack {
     fn output_type(&self) -> PType {
         self.output_type
     }
+
+    fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> {
+        Ok(())
+    }
+
+    fn launch_config(&self) -> Option<GPULaunchConfig> {
+        Some(GPULaunchConfig {
+            block_width: 1024,
+            grid_width: 1,
+        })
+    }
 }
 
-impl BitPack {
+impl<P> BitPack<P> {
     fn tmp_var(&self) -> String {
         format!("tmp{}", self.step_id)
     }
@@ -158,13 +258,13 @@ impl BitPack {
     }
 }
 
-struct FoR {
+struct FoR<P> {
     step_id: usize,
-    reference_type: PType,
+    reference: P,
     child: Box<dyn GPUPipelineJIT>,
 }
 
-impl FoR {
+impl<P> FoR<P> {
     fn tmp_var(&self) -> String {
         format!("tmp{}", self.step_id)
     }
@@ -174,21 +274,28 @@ impl FoR {
     }
 }
 
-impl GPUPipelineJIT for FoR {
+impl<P: NativePType + DeviceRepr> GPUPipelineJIT for FoR<P> {
     fn step_id(&self) -> usize {
         self.step_id
     }
 
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
-        self.child.in_params(p);
         p.push(GPUKernelParameter {
             name: self.ref_var(),
             type_: CUDAType::from(self.output_type()).to_string(),
         })
     }
 
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
+        args.arg(&self.reference);
+        Ok(())
+    }
+
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        self.child.decls(w)?;
         let output_cuda_type = CUDAType::from(self.output_type());
         // TODO: supprort all types
         writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
@@ -216,40 +323,195 @@ impl GPUPipelineJIT for FoR {
     }
 
     fn output_type(&self) -> PType {
-        self.reference_type
+        P::PTYPE
+    }
+
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
+        visitor.accept(self.child.as_ref())
     }
 }
 
-fn handle_array(a: &ArrayRef, step_id: usize) -> Box<dyn GPUPipelineJIT> {
+fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<dyn GPUPipelineJIT> {
+    if let Some(alp) = a.as_opt::<ALPVTable>() {
+        match_each_alp_float_ptype!(alp.ptype(), |A| {
+            return Box::new(ALP {
+                step_id,
+                float_type: alp.ptype(),
+                child: handle_array(alp.encoded(), stream, step_id + 1),
+                f: A::F10[alp.exponents().f as usize],
+                e: A::IF10[alp.exponents().e as usize],
+            });
+        })
+    }
     if let Some(bp) = a.as_opt::<BitPackedVTable>() {
+        assert_eq!(bp.offset(), 0);
+        assert!(bp.patches().is_none());
+        let cuda_slice = stream
+            .memcpy_stod(bp.packed().as_slice())
+            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+            .vortex_unwrap();
         return Box::new(BitPack {
             step_id,
+            bit_width: bp.bit_width(),
+            packed: bp.packed().clone(),
             output_type: bp.ptype(),
+            cuda_slice,
         });
     };
 
     if let Some(for_) = a.as_opt::<FoRVTable>() {
-        return Box::new(FoR {
-            step_id,
-            reference_type: for_.reference_scalar().as_primitive().ptype(),
-            child: handle_array(for_.encoded(), step_id + 1),
-        });
+        match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| {
+            return Box::new(FoR {
+                step_id,
+                reference: for_
+                    .reference_scalar()
+                    .as_primitive()
+                    .as_::<P>()
+                    .vortex_expect("cannot have a null reference"),
+                child: handle_array(for_.encoded(), stream, step_id + 1),
+            });
+        })
     }
 
     todo!()
 }
 
+struct ALP<A: ALPFloat> {
+    step_id: usize,
+    float_type: PType,
+    child: Box<dyn GPUPipelineJIT>,
+    f: A,
+    e: A,
+}
+
+impl<A: ALPFloat> ALP<A> {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn e_var(&self) -> String {
+        format!("e{}", self.step_id)
+    }
+
+    fn f_var(&self) -> String {
+        format!("f{}", self.step_id)
+    }
+}
+
+impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
+        params.extend([
+            GPUKernelParameter {
+                name: self.e_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+            GPUKernelParameter {
+                name: self.f_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+        ])
+    }
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
+        args.arg(&self.e);
+        args.arg(&self.f);
+        Ok(())
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.float_type);
+        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        self.child
+            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(
+                    w,
+                    "{out} = ((({type_}){tmp}) * {f}) * {e};",
+                    tmp = self.tmp_var(),
+                    type_ = CUDAType::from(self.float_type),
+                    out = self.child.output_var(),
+                    f = self.f_var(),
+                    e = self.e_var(),
+                )?;
+                f(w)
+            })
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.float_type
+    }
+
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
+        visitor.accept(self.child.as_ref())
+    }
+}
+
+struct DeclPrinter<'a, 'b: 'a> {
+    w: &'a mut IndentedWriter<&'b mut dyn Write>,
+}
+
+impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.decls(self.w)
+            .map_err(|e| vortex_err!("cannot write {}", e))
+    }
+}
+
+struct InParamPrinter {
+    params: Vec<GPUKernelParameter>,
+}
+
+impl GPUVisitor<'_> for InParamPrinter {
+    fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.in_params(&mut self.params);
+        Ok(())
+    }
+}
+
+struct ArgCollector<'a> {
+    stream: Arc<CudaStream>,
+    params: &'a mut LaunchArgs<'a>,
+}
+
+impl<'a> GPUVisitor<'a> for ArgCollector<'a> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.args(&self.stream, self.params)?;
+        Ok(())
+    }
+}
+
 fn _create_jit(a: &ArrayRef) -> fmt::Result {
-    let output = handle_array(a, 0);
+    let ctx = CudaContext::new(0).unwrap();
+    ctx.set_blocking_synchronize().unwrap();
+    let output = handle_array(a, &ctx.default_stream(), 0);
 
     let mut s = String::new();
     let w = &mut s as &mut dyn Write;
     let mut ind = IndentedWriter::new(w);
     let w = &mut ind;
 
-    let mut params = Vec::new();
-    output.in_params(&mut params);
-    params.push(GPUKernelParameter {
+    let mut params = InParamPrinter { params: Vec::new() };
+    params.accept(output.as_ref()).vortex_expect("cannot fail");
+
+    params.params.push(GPUKernelParameter {
         name: "output".to_string(),
         type_: format!("{} *__restrict", CUDAType::from(output.output_type())),
     });
@@ -257,13 +519,15 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
     writeln!(w, "__global__ void kernel(")?;
     w.indent(|w| {
         params
+            .params
             .iter()
             .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name))
     })?;
     writeln!(w, ") {{")?;
 
     w.indent(|w| {
-        output.decls(w)?;
+        let mut decl = DeclPrinter { w };
+        decl.accept(output.as_ref()).vortex_expect("cannot fail");
         writeln!(w)?;
         output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
             writeln!(w, "output[idx] = {}", output.output_var())
@@ -282,6 +546,7 @@ fn create_jit(a: &ArrayRef) -> VortexResult<()> {
 
 #[cfg(test)]
 mod tests {
+    use vortex_alp::{ALPArray, Exponents};
     use vortex_array::IntoArray;
     use vortex_array::arrays::PrimitiveArray;
     use vortex_error::VortexResult;
@@ -291,16 +556,21 @@ mod tests {
 
     #[test]
     fn jit_arr() -> VortexResult<()> {
-        let for_ = FoRArray::try_new(
-            BitPackedArray::encode(
-                (0u32..10)
-                    .map(|_| 1u32)
-                    .collect::<PrimitiveArray>()
-                    .as_ref(),
-                2,
+        let for_ = ALPArray::try_new(
+            FoRArray::try_new(
+                BitPackedArray::encode(
+                    (0i32..10)
+                        .map(|_| 1i32)
+                        .collect::<PrimitiveArray>()
+                        .as_ref(),
+                    2,
+                )?
+                .into_array(),
+                2i32.into(),
             )?
             .into_array(),
-            2u32.into(),
+            Exponents { e: 4, f: 5 },
+            None,
         )?;
 
         create_jit(&for_.into_array())?;

From 4bab23bccc688ca985dfa06ffa96d0f5ace4dd12 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Mon, 13 Oct 2025 14:50:33 -0400
Subject: [PATCH 03/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock            |   1 +
 vortex-gpu/Cargo.toml |   1 +
 vortex-gpu/src/jit.rs | 190 +++++++++++++++++++++++++++++++++---------
 3 files changed, 151 insertions(+), 41 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 692d6f8af52..604f37a8144 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8977,6 +8977,7 @@ dependencies = [
  "anyhow",
  "criterion",
  "cudarc",
+ "itertools 0.14.0",
  "parking_lot",
  "rand 0.9.2",
  "rstest",
diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml
index 2cc4875097e..0a81e79367c 100644
--- a/vortex-gpu/Cargo.toml
+++ b/vortex-gpu/Cargo.toml
@@ -17,6 +17,7 @@ publish = false
 [dependencies]
 cudarc = { workspace = true, features = ["f16"] }
 parking_lot = { workspace = true }
+itertools = { workspace = true }
 vortex-array = { workspace = true }
 vortex-alp = { workspace = true }
 vortex-buffer = { workspace = true, features = ["cuda"] }
diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
index 34617e81401..a9609c3332e 100644
--- a/vortex-gpu/src/jit.rs
+++ b/vortex-gpu/src/jit.rs
@@ -5,10 +5,15 @@ use std::fmt;
 use std::fmt::{Debug, Display, Write};
 use std::sync::Arc;
 
-use cudarc::driver::{CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
+use cudarc::driver::{
+    CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg,
+};
+use itertools::Itertools;
 use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype};
-use vortex_array::ArrayRef;
-use vortex_buffer::ByteBuffer;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::validity::Validity;
+use vortex_array::{Array, ArrayRef, Canonical, IntoArray};
+use vortex_buffer::{Buffer, BufferMut, ByteBuffer};
 use vortex_dtype::half::f16;
 use vortex_dtype::{NativePType, PType, match_each_native_ptype};
 use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
@@ -105,14 +110,14 @@ impl Display for CUDAType {
 impl From<PType> for CUDAType {
     fn from(value: PType) -> Self {
         CUDAType(match value {
-            PType::U8 => "uint8_t",
-            PType::U16 => "uint16_t",
-            PType::U32 => "uint32_t",
-            PType::U64 => "uint64_t",
-            PType::I8 => "int8_t",
-            PType::I16 => "int16_t",
-            PType::I32 => "int32_t",
-            PType::I64 => "int64_t",
+            PType::U8 => "unsigned char",
+            PType::U16 => "unsigned short",
+            PType::U32 => "unsigned int",
+            PType::U64 => "unsigned long long",
+            PType::I8 => "char",
+            PType::I16 => "short",
+            PType::I32 => "int",
+            PType::I64 => "long long",
             PType::F32 => "float",
             PType::F64 => "double",
             PType::F16 => todo!(),
@@ -135,7 +140,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
 
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
-            name: self.src_var(),
+            name: self.in_var(),
             type_: format!(
                 "{type_} *__restrict",
                 type_ = CUDAType::from(self.output_type)
@@ -163,6 +168,8 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         )?;
         writeln!(w, "{output_cuda_type} src{};", self.step_id)?;
         writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?;
+        writeln!(w, "unsigned int out_idx;")?;
+        writeln!(w, "unsigned int lane = threadIdx.x;")?;
         Ok(())
     }
 
@@ -174,6 +181,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         let output = w;
         let bit_width = self.bit_width as usize;
         let bits = self.output_type.bit_width();
+        let in_ = self.in_var();
         if bit_width == 0 {
             writeln!(output, "uint{bits}_t zero = 0ULL;")?;
             writeln!(output)?;
@@ -185,15 +193,24 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
             for row in 0..bits {
                 writeln!(
                     output,
-                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                    "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];",
                 )?;
             }
         } else {
             let src = self.src_var();
             let tmp = self.tmp_var();
+            println!("P {}", P::PTYPE);
+
+            let mask_fn = |bits: usize| {
+                format!(
+                    "((({type_})1 << {width}) - 1)",
+                    type_ = CUDAType::from(P::PTYPE),
+                    width = bit_width
+                )
+            };
 
             writeln!(output)?;
-            writeln!(output, "{src} = in[lane];")?;
+            writeln!(output, "{src} = {in}[lane];", in = self.in_var())?;
             for row in 0..bits {
                 let curr_word = (row * bit_width) / bits;
                 let next_word = ((row + 1) * bit_width) / bits;
@@ -204,23 +221,26 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                     let current_bits = bit_width - remaining_bits;
                     writeln!(
                         output,
-                        "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {current_bits});"
+                        "{tmp} = ({src} >> {shift}) & {mask};",
+                        mask = mask_fn(current_bits)
                     )?;
 
                     if next_word < bit_width {
-                        writeln!(output, "{src} = in[lane + LANE_COUNT * {next_word}];")?;
+                        writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
                         writeln!(
                             output,
-                            "{tmp} |= ({src} & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
+                            "{tmp} |= ({src} & {mask}) << {current_bits};",
+                            mask = mask_fn(remaining_bits)
                         )?;
                     }
                 } else {
                     writeln!(
                         output,
-                        "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {bit_width});"
+                        "{tmp} = ({src} >> {shift}) & {mask};",
+                        mask = mask_fn(bit_width)
                     )?;
                 }
-
+                writeln!(output, "out_idx = INDEX({row}, lane);")?;
                 f(output)?;
                 writeln!(output)?;
             }
@@ -256,6 +276,14 @@ impl<P> BitPack<P> {
     fn src_var(&self) -> String {
         format!("src{}", self.step_id)
     }
+
+    fn in_var(&self) -> String {
+        format!("in{}", self.step_id)
+    }
+
+    fn out_idx(&self) -> String {
+        format!("out_idx{}", self.step_id)
+    }
 }
 
 struct FoR<P> {
@@ -346,17 +374,20 @@ fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<d
     if let Some(bp) = a.as_opt::<BitPackedVTable>() {
         assert_eq!(bp.offset(), 0);
         assert!(bp.patches().is_none());
-        let cuda_slice = stream
-            .memcpy_stod(bp.packed().as_slice())
-            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
-            .vortex_unwrap();
-        return Box::new(BitPack {
-            step_id,
-            bit_width: bp.bit_width(),
-            packed: bp.packed().clone(),
-            output_type: bp.ptype(),
-            cuda_slice,
-        });
+        match_each_native_ptype!(bp.ptype(), |P| {
+            let values = Buffer::<P>::from_byte_buffer(bp.packed().clone());
+            let cuda_slice = stream
+                .memcpy_stod(values.as_slice())
+                .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+                .vortex_unwrap();
+            return Box::new(BitPack::<P> {
+                step_id,
+                bit_width: bp.bit_width(),
+                packed: bp.packed().clone(),
+                output_type: bp.ptype(),
+                cuda_slice,
+            });
+        })
     };
 
     if let Some(for_) = a.as_opt::<FoRVTable>() {
@@ -419,6 +450,7 @@ impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
     fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
         args.arg(&self.e);
         args.arg(&self.f);
+        println!("---e {}, f {}", self.e, self.f);
         Ok(())
     }
 
@@ -438,9 +470,9 @@ impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
                 writeln!(
                     w,
                     "{out} = ((({type_}){tmp}) * {f}) * {e};",
-                    tmp = self.tmp_var(),
+                    out = self.tmp_var(),
                     type_ = CUDAType::from(self.float_type),
-                    out = self.child.output_var(),
+                    tmp = self.child.output_var(),
                     f = self.f_var(),
                     e = self.e_var(),
                 )?;
@@ -501,7 +533,8 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> {
 fn _create_jit(a: &ArrayRef) -> fmt::Result {
     let ctx = CudaContext::new(0).unwrap();
     ctx.set_blocking_synchronize().unwrap();
-    let output = handle_array(a, &ctx.default_stream(), 0);
+    let stream = ctx.default_stream();
+    let output = handle_array(a, &stream, 0);
 
     let mut s = String::new();
     let w = &mut s as &mut dyn Write;
@@ -513,15 +546,31 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
 
     params.params.push(GPUKernelParameter {
         name: "output".to_string(),
-        type_: format!("{} *__restrict", CUDAType::from(output.output_type())),
+        type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
     });
 
-    writeln!(w, "__global__ void kernel(")?;
+    writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?;
+    writeln!(
+        w,
+        "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
+    )?;
+    writeln!(w, "extern \"C\" __global__ void kernel(")?;
     w.indent(|w| {
-        params
-            .params
-            .iter()
-            .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name))
+        params.params.iter().enumerate().try_for_each(|(idx, p)| {
+            writeln!(
+                w,
+                "{} {}{end}",
+                p.type_,
+                p.name,
+                end = if idx == params.params.len() - 1 {
+                    ""
+                } else {
+                    ","
+                }
+            )
+        })
+
+        // .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name))
     })?;
     writeln!(w, ") {{")?;
 
@@ -530,12 +579,71 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
         decl.accept(output.as_ref()).vortex_expect("cannot fail");
         writeln!(w)?;
         output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-            writeln!(w, "output[idx] = {}", output.output_var())
+            writeln!(
+                w,
+                "output[out_idx] = {output};",
+                output = output.output_var()
+            )
         })
     })?;
     writeln!(w, "}}")?;
 
     println!("{}", s);
+    let module = cudarc::nvrtc::compile_ptx(s.clone())
+        .map_err(|e| vortex_err!("compile ptx {e}"))
+        .vortex_unwrap();
+    println!("{}", module.to_src());
+
+    // Dynamically load it into the device
+    let module = ctx
+        .load_module(module)
+        .map_err(|e| vortex_err!("load module {e}"))
+        .vortex_unwrap();
+
+    let kernel = module
+        .load_function("kernel")
+        .map_err(|e| vortex_err!("get function {e}"))
+        .vortex_unwrap();
+
+    let num_chunks = u32::try_from(a.len().div_ceil(1024)).vortex_expect("Too many grid elements");
+
+    let mut launch_builder = stream.launch_builder(&kernel);
+
+    let mut collector = ArgCollector {
+        stream: stream.clone(),
+
+        params: &mut launch_builder,
+    };
+    collector
+        .accept(output.as_ref())
+        .vortex_expect("cannot fail");
+
+    let launch_config = LaunchConfig {
+        grid_dim: (num_chunks, 1, 1),
+        block_dim: (32, 1, 1),
+        shared_mem_bytes: 0,
+    };
+
+    let mut out = stream.alloc_zeros::<f32>(a.len()).unwrap();
+    collector.params.arg(&mut out);
+
+    let _ = unsafe { collector.params.launch(launch_config) };
+
+    let mut buffer = BufferMut::<f32>::with_capacity(a.len());
+    unsafe { buffer.set_len(a.len()) }
+
+    stream
+        .memcpy_dtoh(&out, &mut buffer)
+        .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+        .vortex_unwrap();
+    stream
+        .synchronize()
+        .map_err(|e| vortex_err!("Failed to synchronize: {e}"))
+        .vortex_unwrap();
+    let c = Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array();
+
+    println!("c {}", c.display_tree());
+    println!("c {}", c.display_values());
 
     Ok(())
 }
@@ -559,7 +667,7 @@ mod tests {
         let for_ = ALPArray::try_new(
             FoRArray::try_new(
                 BitPackedArray::encode(
-                    (0i32..10)
+                    (0i32..1024)
                         .map(|_| 1i32)
                         .collect::<PrimitiveArray>()
                         .as_ref(),

From eec4cc93a20f83b509be4649b2d85b0b58d2a9a4 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Mon, 13 Oct 2025 17:53:24 -0400
Subject: [PATCH 04/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/benches/gpu_bitunpack.rs |  70 +++++++-
 vortex-gpu/src/jit.rs               | 268 +++++++++++++++++-----------
 vortex-gpu/src/lib.rs               |   1 +
 3 files changed, 234 insertions(+), 105 deletions(-)

diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs
index 98b5d5b633a..71e18c0ebcd 100644
--- a/vortex-gpu/benches/gpu_bitunpack.rs
+++ b/vortex-gpu/benches/gpu_bitunpack.rs
@@ -10,12 +10,15 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m
 use cudarc::driver::CudaContext;
 use rand::prelude::StdRng;
 use rand::{Rng, SeedableRng};
-use vortex_array::{IntoArray, ToCanonical};
+use vortex_alp::{ALPArray, Exponents};
+use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical};
 use vortex_buffer::BufferMut;
 use vortex_dtype::NativePType;
 use vortex_error::VortexUnwrap;
 use vortex_fastlanes::{BitPackedArray, FoRArray};
-use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed};
+use vortex_gpu::{
+    create_jit, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
+};
 
 // Data sizes: 1GB, 2.5GB, 5GB, 10GB
 // These are approximate sizes in bytes, accounting for bit-packing compression
@@ -61,6 +64,32 @@ fn make_for_bitpackable_array(len: usize) -> FoRArray {
     FoRArray::try_new(bitpacked.into_array(), reference.into()).vortex_unwrap()
 }
 
+fn make_alp_array(len: usize) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(42);
+    let reference = 100i32;
+
+    // Generate values that fit in 6 bits (0-63)
+    let values = (0..len)
+        .map(|_| rng.random_range(0..64))
+        .collect::<BufferMut<i32>>()
+        .into_array()
+        .to_primitive();
+
+    // Create bitpacked array first
+    let bitpacked = BitPackedArray::encode(values.as_ref(), 6).unwrap();
+
+    // Wrap in FoR encoding with reference value
+    ALPArray::try_new(
+        FoRArray::try_new(bitpacked.into_array(), reference.into())
+            .vortex_unwrap()
+            .into_array(),
+        Exponents { e: 4, f: 5 },
+        None,
+    )
+    .vortex_unwrap()
+    .into_array()
+}
+
 fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("gpu_decompress_kernel_only");
 
@@ -155,6 +184,38 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
+fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
+    let mut group = c.benchmark_group("benchmark_gpu_for_bp_jit_decompress_kernel_only");
+
+    group.sample_size(10);
+
+    for (len, label) in DATA_SIZES {
+        let len = len.next_multiple_of(1024);
+        let array = make_alp_array(len).into_array();
+
+        let ctx = CudaContext::new(0).unwrap();
+        let ctx = Arc::new(ctx);
+        ctx.set_blocking_synchronize().unwrap();
+
+        group.throughput(Throughput::Bytes(
+            (len * array.dtype().as_ptype().byte_width()) as u64,
+        ));
+        group.bench_with_input(BenchmarkId::new("for/jit", label), &array, |b, array| {
+            b.iter_custom(|iters| {
+                let mut total_time = Duration::ZERO;
+                for _ in 0..iters {
+                    // This only measures kernel execution time, not memory transfers
+                    let (_result, kernel_time) = create_jit(array, Arc::clone(&ctx)).unwrap();
+                    total_time += kernel_time;
+                }
+                total_time
+            });
+        });
+    }
+
+    group.finish();
+}
+
 #[allow(dead_code)]
 fn benchmark_cpu_canonicalize(c: &mut Criterion) {
     let mut group = c.benchmark_group("cpu_canonicalize");
@@ -176,6 +237,9 @@ criterion_group!(
     benches,
     benchmark_gpu_decompress_kernel_only,
     benchmark_gpu_for_decompress_kernel_only,
-    benchmark_gpu_for_bp_fused_decompress_kernel_only
+    benchmark_gpu_for_bp_fused_decompress_kernel_only,
+    benchmark_gpu_for_bp_jit_decompress_kernel_only
 );
+
+// criterion_group!(benches, benchmark_gpu_for_bp_jit_decompress_kernel_only);
 criterion_main!(benches);
diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
index a9609c3332e..0b813c60e0f 100644
--- a/vortex-gpu/src/jit.rs
+++ b/vortex-gpu/src/jit.rs
@@ -2,30 +2,25 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt;
-use std::fmt::{Debug, Display, Write};
+use std::fmt::{Display, Write};
 use std::sync::Arc;
+use std::time::Duration;
 
+use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
 use cudarc::driver::{
     CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg,
 };
-use itertools::Itertools;
-use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype};
+use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype};
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::validity::Validity;
 use vortex_array::{Array, ArrayRef, Canonical, IntoArray};
-use vortex_buffer::{Buffer, BufferMut, ByteBuffer};
-use vortex_dtype::half::f16;
+use vortex_buffer::{Buffer, BufferMut};
 use vortex_dtype::{NativePType, PType, match_each_native_ptype};
 use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
-use vortex_fastlanes::{BitPackedArray, BitPackedVTable, FoRVTable};
+use vortex_fastlanes::{BitPackedVTable, FoRVTable};
 
 use crate::indent::IndentedWriter;
 
-pub enum IterationOrder {
-    InOrder,
-    FastLanesTransposed,
-}
-
 struct GPUKernelParameter {
     name: String,
     type_: String,
@@ -128,7 +123,6 @@ impl From<PType> for CUDAType {
 struct BitPack<P> {
     step_id: usize,
     bit_width: u8,
-    packed: ByteBuffer,
     output_type: PType,
     cuda_slice: CudaSlice<P>,
 }
@@ -140,17 +134,17 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
 
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
-            name: self.in_var(),
+            name: self.in_var_g(),
             type_: format!(
                 "{type_} *__restrict",
-                type_ = CUDAType::from(self.output_type)
+                type_ = CUDAType::from(self.output_type.to_unsigned())
             ),
         });
     }
 
     fn args<'a>(
         &'a self,
-        stream: &Arc<CudaStream>,
+        _stream: &Arc<CudaStream>,
         launch_args: &mut LaunchArgs<'a>, // args: &mut Vec<Box<dyn DeviceRepr>>,
     ) -> VortexResult<()> {
         launch_args.arg(&self.cuda_slice);
@@ -160,16 +154,26 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
 
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
         let output_cuda_type = CUDAType::from(self.output_type);
+        let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned());
         // TODO: all types
         writeln!(
             w,
             "unsigned int LANE_COUNT = {bits};",
             bits = 1024 / self.output_type.bit_width()
         )?;
-        writeln!(w, "{output_cuda_type} src{};", self.step_id)?;
-        writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?;
+        writeln!(w, "{output_cuda_type} {};", self.tmp_var())?;
+        writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?;
+        writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?;
         writeln!(w, "unsigned int out_idx;")?;
         writeln!(w, "unsigned int lane = threadIdx.x;")?;
+        writeln!(
+            w,
+            "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});",
+            in_l = self.in_var_l(),
+            in_g = self.in_var_g(),
+            bit_width = self.bit_width,
+            bit_size = P::PTYPE.byte_width()
+        )?;
         Ok(())
     }
 
@@ -181,7 +185,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         let output = w;
         let bit_width = self.bit_width as usize;
         let bits = self.output_type.bit_width();
-        let in_ = self.in_var();
+        let in_ = self.in_var_l();
         if bit_width == 0 {
             writeln!(output, "uint{bits}_t zero = 0ULL;")?;
             writeln!(output)?;
@@ -198,19 +202,19 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
             }
         } else {
             let src = self.src_var();
+            let utmp = self.utmp_var();
             let tmp = self.tmp_var();
-            println!("P {}", P::PTYPE);
 
             let mask_fn = |bits: usize| {
                 format!(
                     "((({type_})1 << {width}) - 1)",
-                    type_ = CUDAType::from(P::PTYPE),
-                    width = bit_width
+                    type_ = CUDAType::from(P::PTYPE.to_unsigned()),
+                    width = bits
                 )
             };
 
             writeln!(output)?;
-            writeln!(output, "{src} = {in}[lane];", in = self.in_var())?;
+            writeln!(output, "{src} = {in}[lane];", in = self.in_var_l())?;
             for row in 0..bits {
                 let curr_word = (row * bit_width) / bits;
                 let next_word = ((row + 1) * bit_width) / bits;
@@ -221,7 +225,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                     let current_bits = bit_width - remaining_bits;
                     writeln!(
                         output,
-                        "{tmp} = ({src} >> {shift}) & {mask};",
+                        "{utmp} = ({src} >> {shift}) & {mask};",
                         mask = mask_fn(current_bits)
                     )?;
 
@@ -229,17 +233,22 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                         writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
                         writeln!(
                             output,
-                            "{tmp} |= ({src} & {mask}) << {current_bits};",
+                            "{utmp} |= ({src} & {mask}) << {current_bits};",
                             mask = mask_fn(remaining_bits)
                         )?;
                     }
                 } else {
                     writeln!(
                         output,
-                        "{tmp} = ({src} >> {shift}) & {mask};",
+                        "{utmp} = ({src} >> {shift}) & {mask};",
                         mask = mask_fn(bit_width)
                     )?;
                 }
+                writeln!(
+                    output,
+                    "{tmp} = ({type_}){utmp};",
+                    type_ = CUDAType::from(self.output_type),
+                )?;
                 writeln!(output, "out_idx = INDEX({row}, lane);")?;
                 f(output)?;
                 writeln!(output)?;
@@ -277,10 +286,18 @@ impl<P> BitPack<P> {
         format!("src{}", self.step_id)
     }
 
-    fn in_var(&self) -> String {
+    fn utmp_var(&self) -> String {
+        format!("utmp{}", self.step_id)
+    }
+
+    fn in_var_l(&self) -> String {
         format!("in{}", self.step_id)
     }
 
+    fn in_var_g(&self) -> String {
+        format!("_in{}", self.step_id)
+    }
+
     fn out_idx(&self) -> String {
         format!("out_idx{}", self.step_id)
     }
@@ -383,7 +400,6 @@ fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<d
             return Box::new(BitPack::<P> {
                 step_id,
                 bit_width: bp.bit_width(),
-                packed: bp.packed().clone(),
                 output_type: bp.ptype(),
                 cuda_slice,
             });
@@ -447,10 +463,13 @@ impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
         ])
     }
 
-    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
         args.arg(&self.e);
         args.arg(&self.f);
-        println!("---e {}, f {}", self.e, self.f);
         Ok(())
     }
 
@@ -530,22 +549,12 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> {
     }
 }
 
-fn _create_jit(a: &ArrayRef) -> fmt::Result {
-    let ctx = CudaContext::new(0).unwrap();
-    ctx.set_blocking_synchronize().unwrap();
-    let stream = ctx.default_stream();
-    let output = handle_array(a, &stream, 0);
-
-    let mut s = String::new();
-    let w = &mut s as &mut dyn Write;
-    let mut ind = IndentedWriter::new(w);
-    let w = &mut ind;
-
+fn jit_str(w: &mut IndentedWriter<&mut dyn Write>, output: &dyn GPUPipelineJIT) -> fmt::Result {
     let mut params = InParamPrinter { params: Vec::new() };
-    params.accept(output.as_ref()).vortex_expect("cannot fail");
+    params.accept(output).vortex_expect("cannot fail");
 
     params.params.push(GPUKernelParameter {
-        name: "output".to_string(),
+        name: "_output".to_string(),
         type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
     });
 
@@ -569,43 +578,64 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
                 }
             )
         })
-
-        // .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name))
     })?;
     writeln!(w, ") {{")?;
 
     w.indent(|w| {
+        writeln!(
+            w,
+            "{output_type} *output = _output + (blockIdx.x * 1024);",
+            output_type = CUDAType::from(output.output_type())
+        )?;
+
+        writeln!(w, "__shared__ float s_output[1024];")?;
+
         let mut decl = DeclPrinter { w };
-        decl.accept(output.as_ref()).vortex_expect("cannot fail");
+        decl.accept(output).vortex_expect("cannot fail");
         writeln!(w)?;
         output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-            writeln!(
-                w,
-                "output[out_idx] = {output};",
-                output = output.output_var()
-            )
+            writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
         })
     })?;
-    writeln!(w, "}}")?;
 
-    println!("{}", s);
-    let module = cudarc::nvrtc::compile_ptx(s.clone())
-        .map_err(|e| vortex_err!("compile ptx {e}"))
-        .vortex_unwrap();
-    println!("{}", module.to_src());
+    writeln!(
+        w,
+        "    for (int i = 0; i < 32; i++) {{
+        auto idx = i * 32 + threadIdx.x;
+        output[idx] = s_output[idx];
+    }}"
+    )?;
+
+    writeln!(w, "}}")
+}
+
+pub fn create_jit(array: &ArrayRef, ctx: Arc<CudaContext>) -> VortexResult<(ArrayRef, Duration)> {
+    let stream = ctx.default_stream();
+
+    let output = handle_array(array, &stream, 0);
+
+    let mut s = String::new();
+    let w = &mut s as &mut dyn Write;
+    let mut ind = IndentedWriter::new(w);
+    let w = &mut ind;
+
+    let _ = jit_str(w, output.as_ref()).map_err(|e| vortex_err!("jit str cannot fail {e}"));
+    // println!("s {}", s);
+
+    let module =
+        cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
 
     // Dynamically load it into the device
     let module = ctx
         .load_module(module)
-        .map_err(|e| vortex_err!("load module {e}"))
-        .vortex_unwrap();
+        .map_err(|e| vortex_err!("load module {e}"))?;
 
     let kernel = module
         .load_function("kernel")
-        .map_err(|e| vortex_err!("get function {e}"))
-        .vortex_unwrap();
+        .map_err(|e| vortex_err!("get function {e}"))?;
 
-    let num_chunks = u32::try_from(a.len().div_ceil(1024)).vortex_expect("Too many grid elements");
+    let num_chunks =
+        u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements");
 
     let mut launch_builder = stream.launch_builder(&kernel);
 
@@ -614,9 +644,7 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
 
         params: &mut launch_builder,
     };
-    collector
-        .accept(output.as_ref())
-        .vortex_expect("cannot fail");
+    collector.accept(output.as_ref())?;
 
     let launch_config = LaunchConfig {
         grid_dim: (num_chunks, 1, 1),
@@ -624,64 +652,100 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result {
         shared_mem_bytes: 0,
     };
 
-    let mut out = stream.alloc_zeros::<f32>(a.len()).unwrap();
-    collector.params.arg(&mut out);
-
-    let _ = unsafe { collector.params.launch(launch_config) };
-
-    let mut buffer = BufferMut::<f32>::with_capacity(a.len());
-    unsafe { buffer.set_len(a.len()) }
-
-    stream
-        .memcpy_dtoh(&out, &mut buffer)
-        .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
-        .vortex_unwrap();
-    stream
-        .synchronize()
-        .map_err(|e| vortex_err!("Failed to synchronize: {e}"))
-        .vortex_unwrap();
-    let c = Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array();
-
-    println!("c {}", c.display_tree());
-    println!("c {}", c.display_values());
-
-    Ok(())
-}
-
-fn create_jit(a: &ArrayRef) -> VortexResult<()> {
-    _create_jit(a).map_err(|e| vortex_err!("failed to write decls {e}"))
+    match_each_native_ptype!(array.dtype().as_ptype(), |P| {
+        let mut out = stream.alloc_zeros::<P>(array.len()).unwrap();
+        collector.params.arg(&mut out);
+        stream
+            .synchronize()
+            .map_err(|e| vortex_err!("failed to sync {e}"))?;
+        let start = stream
+            .record_event(Some(CU_EVENT_DEFAULT))
+            .ok()
+            .vortex_expect("Failed to record event");
+        let _ = unsafe { collector.params.launch(launch_config) };
+        ctx.synchronize()
+            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
+        let end = stream
+            .record_event(Some(CU_EVENT_DEFAULT))
+            .ok()
+            .vortex_expect("Failed to record event");
+
+        let duration = start.elapsed_ms(&end).unwrap();
+
+        let mut buffer = BufferMut::<P>::with_capacity(array.len());
+        unsafe { buffer.set_len(array.len()) }
+
+        stream
+            .memcpy_dtoh(&out, &mut buffer)
+            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+            .vortex_unwrap();
+        stream
+            .synchronize()
+            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))
+            .vortex_unwrap();
+        let c =
+            Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array();
+
+        Ok((c, Duration::from_secs_f32(duration / 1000.0)))
+    })
 }
 
 #[cfg(test)]
 mod tests {
+    use cudarc::driver::CudaContext;
     use vortex_alp::{ALPArray, Exponents};
-    use vortex_array::IntoArray;
     use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::{IntoArray, ToCanonical};
     use vortex_error::VortexResult;
     use vortex_fastlanes::{BitPackedArray, FoRArray};
 
     use crate::jit::create_jit;
 
     #[test]
-    fn jit_arr() -> VortexResult<()> {
+    fn jit_arr_f32() -> VortexResult<()> {
+        let ctx = CudaContext::new(0).unwrap();
+        ctx.set_blocking_synchronize().unwrap();
         let for_ = ALPArray::try_new(
             FoRArray::try_new(
-                BitPackedArray::encode(
-                    (0i32..1024)
-                        .map(|_| 1i32)
-                        .collect::<PrimitiveArray>()
-                        .as_ref(),
-                    2,
-                )?
-                .into_array(),
+                BitPackedArray::encode((0i32..1024 * 2).collect::<PrimitiveArray>().as_ref(), 12)?
+                    .into_array(),
                 2i32.into(),
             )?
             .into_array(),
             Exponents { e: 4, f: 5 },
             None,
-        )?;
+        )?
+        .into_array();
+
+        let (d, _) = create_jit(&for_, ctx)?;
+        let prim = d.to_primitive();
+        let expect = for_.to_primitive();
+
+        for i in 0..prim.len() {
+            assert_eq!(
+                prim.as_slice::<f32>()[i],
+                expect.as_slice::<f32>()[i],
+                "i = {i}"
+            );
+        }
 
-        create_jit(&for_.into_array())?;
+        Ok(())
+    }
+
+    #[test]
+    fn jit_arrs() -> VortexResult<()> {
+        let ctx = CudaContext::new(0).unwrap();
+        ctx.set_blocking_synchronize().unwrap();
+        let for_ = BitPackedArray::encode(
+            (0i32..1024)
+                .map(|_| 1u32)
+                .collect::<PrimitiveArray>()
+                .as_ref(),
+            2,
+        )?
+        .into_array();
+
+        create_jit(&for_.into_array(), ctx)?;
 
         Ok(())
     }
diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs
index c8f4bdf9d24..2b1a63bbadc 100644
--- a/vortex-gpu/src/lib.rs
+++ b/vortex-gpu/src/lib.rs
@@ -13,4 +13,5 @@ mod task;
 pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed};
 pub use for_::{cuda_for_unpack, cuda_for_unpack_timed};
 pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed};
+pub use jit::create_jit;
 pub use take::cuda_take;

From 3db750da658e1dfd8be8f91042a291e21fc61fec Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 15:09:40 -0400
Subject: [PATCH 05/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/benches/gpu_bitunpack.rs |   5 +-
 vortex-gpu/src/for_bp.rs            |  23 +-
 vortex-gpu/src/jit.rs               | 354 +++++++++++++++++-----------
 vortex-gpu/src/lib.rs               |   2 +-
 4 files changed, 230 insertions(+), 154 deletions(-)

diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs
index 71e18c0ebcd..c9a5d8428df 100644
--- a/vortex-gpu/benches/gpu_bitunpack.rs
+++ b/vortex-gpu/benches/gpu_bitunpack.rs
@@ -17,7 +17,7 @@ use vortex_dtype::NativePType;
 use vortex_error::VortexUnwrap;
 use vortex_fastlanes::{BitPackedArray, FoRArray};
 use vortex_gpu::{
-    create_jit, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
+    create_run_jit_kernel, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
 };
 
 // Data sizes: 1GB, 2.5GB, 5GB, 10GB
@@ -194,7 +194,6 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
         let array = make_alp_array(len).into_array();
 
         let ctx = CudaContext::new(0).unwrap();
-        let ctx = Arc::new(ctx);
         ctx.set_blocking_synchronize().unwrap();
 
         group.throughput(Throughput::Bytes(
@@ -205,7 +204,7 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
                 let mut total_time = Duration::ZERO;
                 for _ in 0..iters {
                     // This only measures kernel execution time, not memory transfers
-                    let (_result, kernel_time) = create_jit(array, Arc::clone(&ctx)).unwrap();
+                    let (_result, kernel_time) = create_run_jit_kernel(ctx.clone(), array).unwrap();
                     total_time += kernel_time;
                 }
                 total_time
diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs
index 291459bf146..f9e7fe6860c 100644
--- a/vortex-gpu/src/for_bp.rs
+++ b/vortex-gpu/src/for_bp.rs
@@ -186,17 +186,16 @@ mod tests {
         ctx.set_blocking_synchronize().unwrap();
         let unpacked = cuda_for_bp_unpack(&array, ctx).unwrap();
         let primitive_array = array.into_array().to_primitive();
-        println!("unpacked {:?}", unpacked.as_slice::<f32>());
-        // assert_eq!(
-        //     primitive_array.as_slice::<u32>(),
-        //     unpacked.as_slice::<u32>()
-        // );
-        // for i in 0..primitive_array.len() {
-        //     assert_eq!(
-        //         primitive_array.as_slice::<u32>()[i],
-        //         unpacked.as_slice::<u32>()[i],
-        //         "i {i}"
-        //     );
-        // }
+        assert_eq!(
+            primitive_array.as_slice::<u32>(),
+            unpacked.as_slice::<u32>()
+        );
+        for i in 0..primitive_array.len() {
+            assert_eq!(
+                primitive_array.as_slice::<u32>()[i],
+                unpacked.as_slice::<u32>()[i],
+                "i {i}"
+            );
+        }
     }
 }
diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
index 0b813c60e0f..967a8c00387 100644
--- a/vortex-gpu/src/jit.rs
+++ b/vortex-gpu/src/jit.rs
@@ -8,7 +8,8 @@ use std::time::Duration;
 
 use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
 use cudarc::driver::{
-    CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg,
+    CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig,
+    PushKernelArg,
 };
 use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype};
 use vortex_array::arrays::PrimitiveArray;
@@ -27,22 +28,8 @@ struct GPUKernelParameter {
 }
 
 struct GPULaunchConfig {
-    block_width: usize,
-    grid_width: usize,
-}
-
-// struct GPUPipelineParameters {
-//     inputs: Vec<GPUKernelParameter>,
-//     output: GPUKernelParameter,
-//     block_width: usize,
-//     grid_width: usize,
-//     iteration_order: IterationOrder,
-// }
-//
-// struct GPUPipeline {
-//     body: String,
-//     parameters: GPUPipelineParameters,
-// }
+    block_width: u32,
+}
 
 // bp -> output
 // tmp = ...
@@ -89,8 +76,74 @@ trait GPUPipelineJIT {
     // always pass the output iteration aligned child last.
     fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>;
 
-    fn launch_config(&self) -> Option<GPULaunchConfig> {
-        None
+    fn launch_config(&self) -> GPULaunchConfig;
+}
+
+trait ScalarGPUPipelineJIT {
+    fn step_id(&self) -> usize;
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result;
+
+    fn output_var(&self) -> String;
+
+    fn output_type(&self) -> PType;
+
+    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT;
+}
+
+struct ScalarGPUPipelineJITNode<T> {
+    inner: T,
+}
+
+impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
+    fn step_id(&self) -> usize {
+        self.inner.step_id()
+    }
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
+        self.inner.in_params(params)
+    }
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
+        self.inner.args(stream, args)
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        self.inner.decls(w)
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        self.inner.kernel_body(w, f)
+    }
+
+    fn output_var(&self) -> String {
+        self.inner.output_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.inner.output_type()
+    }
+
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
+        visitor.accept(self.inner.child())
+    }
+
+    fn launch_config(&self) -> GPULaunchConfig {
+        self.inner.child().launch_config()
     }
 }
 
@@ -145,7 +198,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
     fn args<'a>(
         &'a self,
         _stream: &Arc<CudaStream>,
-        launch_args: &mut LaunchArgs<'a>, // args: &mut Vec<Box<dyn DeviceRepr>>,
+        launch_args: &mut LaunchArgs<'a>,
     ) -> VortexResult<()> {
         launch_args.arg(&self.cuda_slice);
 
@@ -155,7 +208,6 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
         let output_cuda_type = CUDAType::from(self.output_type);
         let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned());
-        // TODO: all types
         writeln!(
             w,
             "unsigned int LANE_COUNT = {bits};",
@@ -182,21 +234,20 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         w: &mut IndentedWriter<&mut dyn Write>,
         f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
     ) -> fmt::Result {
-        let output = w;
         let bit_width = self.bit_width as usize;
         let bits = self.output_type.bit_width();
         let in_ = self.in_var_l();
         if bit_width == 0 {
-            writeln!(output, "uint{bits}_t zero = 0ULL;")?;
-            writeln!(output)?;
+            writeln!(w, "uint{bits}_t zero = 0ULL;")?;
+            writeln!(w)?;
             for row in 0..bits {
-                writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
+                writeln!(w, "out[INDEX({row}, lane)] = zero;")?;
             }
         } else if bit_width == bits {
-            writeln!(output)?;
+            writeln!(w)?;
             for row in 0..bits {
                 writeln!(
-                    output,
+                    w,
                     "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];",
                 )?;
             }
@@ -213,8 +264,8 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                 )
             };
 
-            writeln!(output)?;
-            writeln!(output, "{src} = {in}[lane];", in = self.in_var_l())?;
+            writeln!(w)?;
+            writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?;
             for row in 0..bits {
                 let curr_word = (row * bit_width) / bits;
                 let next_word = ((row + 1) * bit_width) / bits;
@@ -224,34 +275,35 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                     let remaining_bits = ((row + 1) * bit_width) % bits;
                     let current_bits = bit_width - remaining_bits;
                     writeln!(
-                        output,
+                        w,
                         "{utmp} = ({src} >> {shift}) & {mask};",
                         mask = mask_fn(current_bits)
                     )?;
 
                     if next_word < bit_width {
-                        writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
+                        writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
                         writeln!(
-                            output,
+                            w,
                             "{utmp} |= ({src} & {mask}) << {current_bits};",
                             mask = mask_fn(remaining_bits)
                         )?;
                     }
                 } else {
                     writeln!(
-                        output,
+                        w,
                         "{utmp} = ({src} >> {shift}) & {mask};",
                         mask = mask_fn(bit_width)
                     )?;
                 }
                 writeln!(
-                    output,
+                    w,
                     "{tmp} = ({type_}){utmp};",
                     type_ = CUDAType::from(self.output_type),
                 )?;
-                writeln!(output, "out_idx = INDEX({row}, lane);")?;
-                f(output)?;
-                writeln!(output)?;
+
+                writeln!(w, "out_idx = INDEX({row}, lane);")?;
+                f(w)?;
+                writeln!(w)?;
             }
         }
         Ok(())
@@ -269,11 +321,10 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         Ok(())
     }
 
-    fn launch_config(&self) -> Option<GPULaunchConfig> {
-        Some(GPULaunchConfig {
-            block_width: 1024,
-            grid_width: 1,
-        })
+    fn launch_config(&self) -> GPULaunchConfig {
+        GPULaunchConfig {
+            block_width: if P::PTYPE == PType::U64 { 16 } else { 32 },
+        }
     }
 }
 
@@ -297,10 +348,6 @@ impl<P> BitPack<P> {
     fn in_var_g(&self) -> String {
         format!("_in{}", self.step_id)
     }
-
-    fn out_idx(&self) -> String {
-        format!("out_idx{}", self.step_id)
-    }
 }
 
 struct FoR<P> {
@@ -319,7 +366,7 @@ impl<P> FoR<P> {
     }
 }
 
-impl<P: NativePType + DeviceRepr> GPUPipelineJIT for FoR<P> {
+impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
     fn step_id(&self) -> usize {
         self.step_id
     }
@@ -342,7 +389,6 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for FoR<P> {
 
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
         let output_cuda_type = CUDAType::from(self.output_type());
-        // TODO: supprort all types
         writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
         Ok(())
     }
@@ -371,20 +417,22 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for FoR<P> {
         P::PTYPE
     }
 
-    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
-        visitor.accept(self.child.as_ref())
+    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT {
+        self.child.as_ref()
     }
 }
 
 fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<dyn GPUPipelineJIT> {
     if let Some(alp) = a.as_opt::<ALPVTable>() {
         match_each_alp_float_ptype!(alp.ptype(), |A| {
-            return Box::new(ALP {
-                step_id,
-                float_type: alp.ptype(),
-                child: handle_array(alp.encoded(), stream, step_id + 1),
-                f: A::F10[alp.exponents().f as usize],
-                e: A::IF10[alp.exponents().e as usize],
+            return Box::new(ScalarGPUPipelineJITNode {
+                inner: ALP {
+                    step_id,
+                    float_type: alp.ptype(),
+                    child: handle_array(alp.encoded(), stream, step_id + 1),
+                    f: A::F10[alp.exponents().f as usize],
+                    e: A::IF10[alp.exponents().e as usize],
+                },
             });
         })
     }
@@ -408,14 +456,16 @@ fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<d
 
     if let Some(for_) = a.as_opt::<FoRVTable>() {
         match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| {
-            return Box::new(FoR {
-                step_id,
-                reference: for_
-                    .reference_scalar()
-                    .as_primitive()
-                    .as_::<P>()
-                    .vortex_expect("cannot have a null reference"),
-                child: handle_array(for_.encoded(), stream, step_id + 1),
+            return Box::new(ScalarGPUPipelineJITNode {
+                inner: FoR {
+                    step_id,
+                    reference: for_
+                        .reference_scalar()
+                        .as_primitive()
+                        .as_::<P>()
+                        .vortex_expect("cannot have a null reference"),
+                    child: handle_array(for_.encoded(), stream, step_id + 1),
+                },
             });
         })
     }
@@ -445,7 +495,7 @@ impl<A: ALPFloat> ALP<A> {
     }
 }
 
-impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
+impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
     fn step_id(&self) -> usize {
         self.step_id
     }
@@ -507,8 +557,8 @@ impl<A: ALPFloat + DeviceRepr> GPUPipelineJIT for ALP<A> {
         self.float_type
     }
 
-    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
-        visitor.accept(self.child.as_ref())
+    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT {
+        self.child.as_ref()
     }
 }
 
@@ -516,6 +566,11 @@ struct DeclPrinter<'a, 'b: 'a> {
     w: &'a mut IndentedWriter<&'b mut dyn Write>,
 }
 
+fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) {
+    let mut decl = DeclPrinter { w };
+    decl.accept(node).vortex_expect("write decl cannot fail");
+}
+
 impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> {
     fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
         node.children(self)?;
@@ -524,11 +579,11 @@ impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> {
     }
 }
 
-struct InParamPrinter {
+struct InParamCollector {
     params: Vec<GPUKernelParameter>,
 }
 
-impl GPUVisitor<'_> for InParamPrinter {
+impl GPUVisitor<'_> for InParamCollector {
     fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> {
         node.children(self)?;
         node.in_params(&mut self.params);
@@ -536,12 +591,18 @@ impl GPUVisitor<'_> for InParamPrinter {
     }
 }
 
-struct ArgCollector<'a> {
+fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult<Vec<GPUKernelParameter>> {
+    let mut params = InParamCollector { params: Vec::new() };
+    params.accept(node)?;
+    Ok(params.params)
+}
+
+struct ArgCollector<'a, 'b> {
     stream: Arc<CudaStream>,
-    params: &'a mut LaunchArgs<'a>,
+    params: &'b mut LaunchArgs<'a>,
 }
 
-impl<'a> GPUVisitor<'a> for ArgCollector<'a> {
+impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> {
     fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
         node.children(self)?;
         node.args(&self.stream, self.params)?;
@@ -549,78 +610,87 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> {
     }
 }
 
-fn jit_str(w: &mut IndentedWriter<&mut dyn Write>, output: &dyn GPUPipelineJIT) -> fmt::Result {
-    let mut params = InParamPrinter { params: Vec::new() };
-    params.accept(output).vortex_expect("cannot fail");
+fn collect_args<'a, 'b>(
+    node: &'a dyn GPUPipelineJIT,
+    stream: Arc<CudaStream>,
+    args: &'b mut LaunchArgs<'a>,
+) -> VortexResult<()> {
+    let mut collector = ArgCollector {
+        stream: stream.clone(),
+
+        params: args,
+    };
+    collector.accept(node)?;
 
-    params.params.push(GPUKernelParameter {
+    Ok(())
+}
+
+fn create_jit_str(
+    w: &mut IndentedWriter<&mut dyn Write>,
+    output: &dyn GPUPipelineJIT,
+) -> VortexResult<()> {
+    let mut params = collect_in_param(output)?;
+    params.push(GPUKernelParameter {
         name: "_output".to_string(),
         type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
     });
 
-    writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?;
-    writeln!(
-        w,
-        "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
-    )?;
-    writeln!(w, "extern \"C\" __global__ void kernel(")?;
-    w.indent(|w| {
-        params.params.iter().enumerate().try_for_each(|(idx, p)| {
-            writeln!(
-                w,
-                "{} {}{end}",
-                p.type_,
-                p.name,
-                end = if idx == params.params.len() - 1 {
-                    ""
-                } else {
-                    ","
-                }
-            )
-        })
-    })?;
-    writeln!(w, ") {{")?;
-
-    w.indent(|w| {
+    (|| {
+        writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?;
         writeln!(
             w,
-            "{output_type} *output = _output + (blockIdx.x * 1024);",
-            output_type = CUDAType::from(output.output_type())
+            "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
         )?;
+        writeln!(w, "extern \"C\" __global__ void kernel(")?;
+        w.indent(|w| {
+            params.iter().enumerate().try_for_each(|(idx, p)| {
+                writeln!(
+                    w,
+                    "{} {}{end}",
+                    p.type_,
+                    p.name,
+                    end = if idx == params.len() - 1 { "" } else { "," }
+                )
+            })
+        })?;
+        writeln!(w, ") {{")?;
 
-        writeln!(w, "__shared__ float s_output[1024];")?;
+        w.indent(|w| {
+            writeln!(
+                w,
+                "{output_type} *output = _output + (blockIdx.x * 1024);",
+                output_type = CUDAType::from(output.output_type())
+            )?;
 
-        let mut decl = DeclPrinter { w };
-        decl.accept(output).vortex_expect("cannot fail");
-        writeln!(w)?;
-        output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-            writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
-        })
-    })?;
+            writeln!(w, "__shared__ float s_output[1024];")?;
 
-    writeln!(
-        w,
-        "    for (int i = 0; i < 32; i++) {{
+            write_kernel_declarations(w, output);
+            writeln!(w)?;
+            output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
+            })
+        })?;
+
+        writeln!(
+            w,
+            "    for (int i = 0; i < 32; i++) {{
         auto idx = i * 32 + threadIdx.x;
         output[idx] = s_output[idx];
     }}"
-    )?;
+        )?;
 
-    writeln!(w, "}}")
+        writeln!(w, "}}")
+    })()
+    .map_err(|e| vortex_err!("format err {e}"))
 }
 
-pub fn create_jit(array: &ArrayRef, ctx: Arc<CudaContext>) -> VortexResult<(ArrayRef, Duration)> {
-    let stream = ctx.default_stream();
-
-    let output = handle_array(array, &stream, 0);
-
+fn create_kernel(ctx: Arc<CudaContext>, array: &dyn GPUPipelineJIT) -> VortexResult<CudaFunction> {
     let mut s = String::new();
     let w = &mut s as &mut dyn Write;
     let mut ind = IndentedWriter::new(w);
     let w = &mut ind;
 
-    let _ = jit_str(w, output.as_ref()).map_err(|e| vortex_err!("jit str cannot fail {e}"));
-    // println!("s {}", s);
+    let _ = create_jit_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"));
 
     let module =
         cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
@@ -630,31 +700,39 @@ pub fn create_jit(array: &ArrayRef, ctx: Arc<CudaContext>) -> VortexResult<(Arra
         .load_module(module)
         .map_err(|e| vortex_err!("load module {e}"))?;
 
-    let kernel = module
+    module
         .load_function("kernel")
-        .map_err(|e| vortex_err!("get function {e}"))?;
+        .map_err(|e| vortex_err!("load_function {e}"))
+}
+
+pub fn create_run_jit_kernel(
+    ctx: Arc<CudaContext>,
+    array: &ArrayRef,
+) -> VortexResult<(ArrayRef, Duration)> {
+    let stream = ctx.default_stream();
+
+    let output = handle_array(array, &stream, 0);
+    let kernel = create_kernel(ctx.clone(), output.as_ref())?;
 
     let num_chunks =
         u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements");
 
     let mut launch_builder = stream.launch_builder(&kernel);
 
-    let mut collector = ArgCollector {
-        stream: stream.clone(),
-
-        params: &mut launch_builder,
-    };
-    collector.accept(output.as_ref())?;
-
     let launch_config = LaunchConfig {
         grid_dim: (num_chunks, 1, 1),
-        block_dim: (32, 1, 1),
-        shared_mem_bytes: 0,
+        block_dim: (output.launch_config().block_width, 1, 1),
+        shared_mem_bytes: u32::try_from(output.output_type().byte_width())
+            .vortex_expect("oversized output type byte width")
+            * 1024,
     };
 
+    collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?;
+
     match_each_native_ptype!(array.dtype().as_ptype(), |P| {
+        // append final argument (output) of the kernel
         let mut out = stream.alloc_zeros::<P>(array.len()).unwrap();
-        collector.params.arg(&mut out);
+        launch_builder.arg(&mut out);
         stream
             .synchronize()
             .map_err(|e| vortex_err!("failed to sync {e}"))?;
@@ -662,7 +740,7 @@ pub fn create_jit(array: &ArrayRef, ctx: Arc<CudaContext>) -> VortexResult<(Arra
             .record_event(Some(CU_EVENT_DEFAULT))
             .ok()
             .vortex_expect("Failed to record event");
-        let _ = unsafe { collector.params.launch(launch_config) };
+        let _ = unsafe { launch_builder.launch(launch_config) };
         ctx.synchronize()
             .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
         let end = stream
@@ -699,7 +777,7 @@ mod tests {
     use vortex_error::VortexResult;
     use vortex_fastlanes::{BitPackedArray, FoRArray};
 
-    use crate::jit::create_jit;
+    use crate::jit::create_run_jit_kernel;
 
     #[test]
     fn jit_arr_f32() -> VortexResult<()> {
@@ -717,7 +795,7 @@ mod tests {
         )?
         .into_array();
 
-        let (d, _) = create_jit(&for_, ctx)?;
+        let (d, _) = create_run_jit_kernel(ctx, &for_)?;
         let prim = d.to_primitive();
         let expect = for_.to_primitive();
 
@@ -745,7 +823,7 @@ mod tests {
         )?
         .into_array();
 
-        create_jit(&for_.into_array(), ctx)?;
+        create_run_jit_kernel(ctx, &for_.into_array())?;
 
         Ok(())
     }
diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs
index 2b1a63bbadc..2afd0baa566 100644
--- a/vortex-gpu/src/lib.rs
+++ b/vortex-gpu/src/lib.rs
@@ -13,5 +13,5 @@ mod task;
 pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed};
 pub use for_::{cuda_for_unpack, cuda_for_unpack_timed};
 pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed};
-pub use jit::create_jit;
+pub use jit::create_run_jit_kernel;
 pub use take::cuda_take;

From 11bde6a528904feda0625c5ec5380ef6fe2df606 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 16:08:22 -0400
Subject: [PATCH 06/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit.rs                | 830 ---------------------------
 vortex-gpu/src/jit/arrays/alp.rs     | 127 ++++
 vortex-gpu/src/jit/arrays/bitpack.rs | 218 +++++++
 vortex-gpu/src/jit/arrays/for_.rs    | 111 ++++
 vortex-gpu/src/jit/arrays/mod.rs     |   6 +
 vortex-gpu/src/jit/convert.rs        |  35 ++
 vortex-gpu/src/jit/kernel_fmt.rs     | 127 ++++
 vortex-gpu/src/jit/mod.rs            | 138 +++++
 vortex-gpu/src/jit/run.rs            | 175 ++++++
 vortex-gpu/src/jit/type_.rs          |  33 ++
 10 files changed, 970 insertions(+), 830 deletions(-)
 delete mode 100644 vortex-gpu/src/jit.rs
 create mode 100644 vortex-gpu/src/jit/arrays/alp.rs
 create mode 100644 vortex-gpu/src/jit/arrays/bitpack.rs
 create mode 100644 vortex-gpu/src/jit/arrays/for_.rs
 create mode 100644 vortex-gpu/src/jit/arrays/mod.rs
 create mode 100644 vortex-gpu/src/jit/convert.rs
 create mode 100644 vortex-gpu/src/jit/kernel_fmt.rs
 create mode 100644 vortex-gpu/src/jit/mod.rs
 create mode 100644 vortex-gpu/src/jit/run.rs
 create mode 100644 vortex-gpu/src/jit/type_.rs

diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs
deleted file mode 100644
index 967a8c00387..00000000000
--- a/vortex-gpu/src/jit.rs
+++ /dev/null
@@ -1,830 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-use std::fmt;
-use std::fmt::{Display, Write};
-use std::sync::Arc;
-use std::time::Duration;
-
-use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
-use cudarc::driver::{
-    CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig,
-    PushKernelArg,
-};
-use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype};
-use vortex_array::arrays::PrimitiveArray;
-use vortex_array::validity::Validity;
-use vortex_array::{Array, ArrayRef, Canonical, IntoArray};
-use vortex_buffer::{Buffer, BufferMut};
-use vortex_dtype::{NativePType, PType, match_each_native_ptype};
-use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
-use vortex_fastlanes::{BitPackedVTable, FoRVTable};
-
-use crate::indent::IndentedWriter;
-
-struct GPUKernelParameter {
-    name: String,
-    type_: String,
-}
-
-struct GPULaunchConfig {
-    block_width: u32,
-}
-
-// bp -> output
-// tmp = ...
-// out[i] = tmp;
-
-// bp -> for -> output
-
-// tmp = ....
-// tmp_for = tmp + ref
-// output[i] = tmp_for[i]
-
-// have leaves only bp for now.
-
-// step-type (each one has a unique step_id)
-// step_id
-// in_params
-// decls/setup
-// kernel-step body // fn body(var, writer) -> str
-// output_var + output_type
-
-trait GPUVisitor<'a> {
-    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
-}
-
-trait GPUPipelineJIT {
-    fn step_id(&self) -> usize;
-
-    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
-
-    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result;
-
-    fn output_var(&self) -> String;
-
-    fn output_type(&self) -> PType;
-
-    // always pass the output iteration aligned child last.
-    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>;
-
-    fn launch_config(&self) -> GPULaunchConfig;
-}
-
-trait ScalarGPUPipelineJIT {
-    fn step_id(&self) -> usize;
-
-    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
-
-    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result;
-
-    fn output_var(&self) -> String;
-
-    fn output_type(&self) -> PType;
-
-    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT;
-}
-
-struct ScalarGPUPipelineJITNode<T> {
-    inner: T,
-}
-
-impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
-    fn step_id(&self) -> usize {
-        self.inner.step_id()
-    }
-
-    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
-        self.inner.in_params(params)
-    }
-
-    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
-        self.inner.args(stream, args)
-    }
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        self.inner.decls(w)
-    }
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
-        self.inner.kernel_body(w, f)
-    }
-
-    fn output_var(&self) -> String {
-        self.inner.output_var()
-    }
-
-    fn output_type(&self) -> PType {
-        self.inner.output_type()
-    }
-
-    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
-        visitor.accept(self.inner.child())
-    }
-
-    fn launch_config(&self) -> GPULaunchConfig {
-        self.inner.child().launch_config()
-    }
-}
-
-struct CUDAType(&'static str);
-
-impl Display for CUDAType {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_str(self.0)
-    }
-}
-
-impl From<PType> for CUDAType {
-    fn from(value: PType) -> Self {
-        CUDAType(match value {
-            PType::U8 => "unsigned char",
-            PType::U16 => "unsigned short",
-            PType::U32 => "unsigned int",
-            PType::U64 => "unsigned long long",
-            PType::I8 => "char",
-            PType::I16 => "short",
-            PType::I32 => "int",
-            PType::I64 => "long long",
-            PType::F32 => "float",
-            PType::F64 => "double",
-            PType::F16 => todo!(),
-        })
-    }
-}
-
-struct BitPack<P> {
-    step_id: usize,
-    bit_width: u8,
-    output_type: PType,
-    cuda_slice: CudaSlice<P>,
-}
-
-impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
-    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
-        p.push(GPUKernelParameter {
-            name: self.in_var_g(),
-            type_: format!(
-                "{type_} *__restrict",
-                type_ = CUDAType::from(self.output_type.to_unsigned())
-            ),
-        });
-    }
-
-    fn args<'a>(
-        &'a self,
-        _stream: &Arc<CudaStream>,
-        launch_args: &mut LaunchArgs<'a>,
-    ) -> VortexResult<()> {
-        launch_args.arg(&self.cuda_slice);
-
-        Ok(())
-    }
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        let output_cuda_type = CUDAType::from(self.output_type);
-        let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned());
-        writeln!(
-            w,
-            "unsigned int LANE_COUNT = {bits};",
-            bits = 1024 / self.output_type.bit_width()
-        )?;
-        writeln!(w, "{output_cuda_type} {};", self.tmp_var())?;
-        writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?;
-        writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?;
-        writeln!(w, "unsigned int out_idx;")?;
-        writeln!(w, "unsigned int lane = threadIdx.x;")?;
-        writeln!(
-            w,
-            "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});",
-            in_l = self.in_var_l(),
-            in_g = self.in_var_g(),
-            bit_width = self.bit_width,
-            bit_size = P::PTYPE.byte_width()
-        )?;
-        Ok(())
-    }
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
-        let bit_width = self.bit_width as usize;
-        let bits = self.output_type.bit_width();
-        let in_ = self.in_var_l();
-        if bit_width == 0 {
-            writeln!(w, "uint{bits}_t zero = 0ULL;")?;
-            writeln!(w)?;
-            for row in 0..bits {
-                writeln!(w, "out[INDEX({row}, lane)] = zero;")?;
-            }
-        } else if bit_width == bits {
-            writeln!(w)?;
-            for row in 0..bits {
-                writeln!(
-                    w,
-                    "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];",
-                )?;
-            }
-        } else {
-            let src = self.src_var();
-            let utmp = self.utmp_var();
-            let tmp = self.tmp_var();
-
-            let mask_fn = |bits: usize| {
-                format!(
-                    "((({type_})1 << {width}) - 1)",
-                    type_ = CUDAType::from(P::PTYPE.to_unsigned()),
-                    width = bits
-                )
-            };
-
-            writeln!(w)?;
-            writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?;
-            for row in 0..bits {
-                let curr_word = (row * bit_width) / bits;
-                let next_word = ((row + 1) * bit_width) / bits;
-                let shift = (row * bit_width) % bits;
-
-                if next_word > curr_word {
-                    let remaining_bits = ((row + 1) * bit_width) % bits;
-                    let current_bits = bit_width - remaining_bits;
-                    writeln!(
-                        w,
-                        "{utmp} = ({src} >> {shift}) & {mask};",
-                        mask = mask_fn(current_bits)
-                    )?;
-
-                    if next_word < bit_width {
-                        writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
-                        writeln!(
-                            w,
-                            "{utmp} |= ({src} & {mask}) << {current_bits};",
-                            mask = mask_fn(remaining_bits)
-                        )?;
-                    }
-                } else {
-                    writeln!(
-                        w,
-                        "{utmp} = ({src} >> {shift}) & {mask};",
-                        mask = mask_fn(bit_width)
-                    )?;
-                }
-                writeln!(
-                    w,
-                    "{tmp} = ({type_}){utmp};",
-                    type_ = CUDAType::from(self.output_type),
-                )?;
-
-                writeln!(w, "out_idx = INDEX({row}, lane);")?;
-                f(w)?;
-                writeln!(w)?;
-            }
-        }
-        Ok(())
-    }
-
-    fn output_var(&self) -> String {
-        self.tmp_var()
-    }
-
-    fn output_type(&self) -> PType {
-        self.output_type
-    }
-
-    fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> {
-        Ok(())
-    }
-
-    fn launch_config(&self) -> GPULaunchConfig {
-        GPULaunchConfig {
-            block_width: if P::PTYPE == PType::U64 { 16 } else { 32 },
-        }
-    }
-}
-
-impl<P> BitPack<P> {
-    fn tmp_var(&self) -> String {
-        format!("tmp{}", self.step_id)
-    }
-
-    fn src_var(&self) -> String {
-        format!("src{}", self.step_id)
-    }
-
-    fn utmp_var(&self) -> String {
-        format!("utmp{}", self.step_id)
-    }
-
-    fn in_var_l(&self) -> String {
-        format!("in{}", self.step_id)
-    }
-
-    fn in_var_g(&self) -> String {
-        format!("_in{}", self.step_id)
-    }
-}
-
-struct FoR<P> {
-    step_id: usize,
-    reference: P,
-    child: Box<dyn GPUPipelineJIT>,
-}
-
-impl<P> FoR<P> {
-    fn tmp_var(&self) -> String {
-        format!("tmp{}", self.step_id)
-    }
-
-    fn ref_var(&self) -> String {
-        format!("ref{}", self.step_id)
-    }
-}
-
-impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
-    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
-        p.push(GPUKernelParameter {
-            name: self.ref_var(),
-            type_: CUDAType::from(self.output_type()).to_string(),
-        })
-    }
-
-    fn args<'a>(
-        &'a self,
-        _stream: &Arc<CudaStream>,
-        args: &mut LaunchArgs<'a>,
-    ) -> VortexResult<()> {
-        args.arg(&self.reference);
-        Ok(())
-    }
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        let output_cuda_type = CUDAType::from(self.output_type());
-        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
-        Ok(())
-    }
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
-        assert_eq!(self.output_type(), self.child.output_type());
-        let in_var = self.child.output_var();
-        let out_var = self.tmp_var();
-        let ref_var = self.ref_var();
-        self.child
-            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-                writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
-                f(w)
-            })
-    }
-
-    fn output_var(&self) -> String {
-        self.tmp_var()
-    }
-
-    fn output_type(&self) -> PType {
-        P::PTYPE
-    }
-
-    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT {
-        self.child.as_ref()
-    }
-}
-
-fn handle_array(a: &ArrayRef, stream: &Arc<CudaStream>, step_id: usize) -> Box<dyn GPUPipelineJIT> {
-    if let Some(alp) = a.as_opt::<ALPVTable>() {
-        match_each_alp_float_ptype!(alp.ptype(), |A| {
-            return Box::new(ScalarGPUPipelineJITNode {
-                inner: ALP {
-                    step_id,
-                    float_type: alp.ptype(),
-                    child: handle_array(alp.encoded(), stream, step_id + 1),
-                    f: A::F10[alp.exponents().f as usize],
-                    e: A::IF10[alp.exponents().e as usize],
-                },
-            });
-        })
-    }
-    if let Some(bp) = a.as_opt::<BitPackedVTable>() {
-        assert_eq!(bp.offset(), 0);
-        assert!(bp.patches().is_none());
-        match_each_native_ptype!(bp.ptype(), |P| {
-            let values = Buffer::<P>::from_byte_buffer(bp.packed().clone());
-            let cuda_slice = stream
-                .memcpy_stod(values.as_slice())
-                .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
-                .vortex_unwrap();
-            return Box::new(BitPack::<P> {
-                step_id,
-                bit_width: bp.bit_width(),
-                output_type: bp.ptype(),
-                cuda_slice,
-            });
-        })
-    };
-
-    if let Some(for_) = a.as_opt::<FoRVTable>() {
-        match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| {
-            return Box::new(ScalarGPUPipelineJITNode {
-                inner: FoR {
-                    step_id,
-                    reference: for_
-                        .reference_scalar()
-                        .as_primitive()
-                        .as_::<P>()
-                        .vortex_expect("cannot have a null reference"),
-                    child: handle_array(for_.encoded(), stream, step_id + 1),
-                },
-            });
-        })
-    }
-
-    todo!()
-}
-
-struct ALP<A: ALPFloat> {
-    step_id: usize,
-    float_type: PType,
-    child: Box<dyn GPUPipelineJIT>,
-    f: A,
-    e: A,
-}
-
-impl<A: ALPFloat> ALP<A> {
-    fn tmp_var(&self) -> String {
-        format!("tmp{}", self.step_id)
-    }
-
-    fn e_var(&self) -> String {
-        format!("e{}", self.step_id)
-    }
-
-    fn f_var(&self) -> String {
-        format!("f{}", self.step_id)
-    }
-}
-
-impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
-    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
-        params.extend([
-            GPUKernelParameter {
-                name: self.e_var(),
-                type_: CUDAType::from(A::PTYPE).to_string(),
-            },
-            GPUKernelParameter {
-                name: self.f_var(),
-                type_: CUDAType::from(A::PTYPE).to_string(),
-            },
-        ])
-    }
-
-    fn args<'a>(
-        &'a self,
-        _stream: &Arc<CudaStream>,
-        args: &mut LaunchArgs<'a>,
-    ) -> VortexResult<()> {
-        args.arg(&self.e);
-        args.arg(&self.f);
-        Ok(())
-    }
-
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        let output_cuda_type = CUDAType::from(self.float_type);
-        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
-        Ok(())
-    }
-
-    fn kernel_body(
-        &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
-        self.child
-            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-                writeln!(
-                    w,
-                    "{out} = ((({type_}){tmp}) * {f}) * {e};",
-                    out = self.tmp_var(),
-                    type_ = CUDAType::from(self.float_type),
-                    tmp = self.child.output_var(),
-                    f = self.f_var(),
-                    e = self.e_var(),
-                )?;
-                f(w)
-            })
-    }
-
-    fn output_var(&self) -> String {
-        self.tmp_var()
-    }
-
-    fn output_type(&self) -> PType {
-        self.float_type
-    }
-
-    fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT {
-        self.child.as_ref()
-    }
-}
-
-struct DeclPrinter<'a, 'b: 'a> {
-    w: &'a mut IndentedWriter<&'b mut dyn Write>,
-}
-
-fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) {
-    let mut decl = DeclPrinter { w };
-    decl.accept(node).vortex_expect("write decl cannot fail");
-}
-
-impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> {
-    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
-        node.children(self)?;
-        node.decls(self.w)
-            .map_err(|e| vortex_err!("cannot write {}", e))
-    }
-}
-
-struct InParamCollector {
-    params: Vec<GPUKernelParameter>,
-}
-
-impl GPUVisitor<'_> for InParamCollector {
-    fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> {
-        node.children(self)?;
-        node.in_params(&mut self.params);
-        Ok(())
-    }
-}
-
-fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult<Vec<GPUKernelParameter>> {
-    let mut params = InParamCollector { params: Vec::new() };
-    params.accept(node)?;
-    Ok(params.params)
-}
-
-struct ArgCollector<'a, 'b> {
-    stream: Arc<CudaStream>,
-    params: &'b mut LaunchArgs<'a>,
-}
-
-impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> {
-    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
-        node.children(self)?;
-        node.args(&self.stream, self.params)?;
-        Ok(())
-    }
-}
-
-fn collect_args<'a, 'b>(
-    node: &'a dyn GPUPipelineJIT,
-    stream: Arc<CudaStream>,
-    args: &'b mut LaunchArgs<'a>,
-) -> VortexResult<()> {
-    let mut collector = ArgCollector {
-        stream: stream.clone(),
-
-        params: args,
-    };
-    collector.accept(node)?;
-
-    Ok(())
-}
-
-fn create_jit_str(
-    w: &mut IndentedWriter<&mut dyn Write>,
-    output: &dyn GPUPipelineJIT,
-) -> VortexResult<()> {
-    let mut params = collect_in_param(output)?;
-    params.push(GPUKernelParameter {
-        name: "_output".to_string(),
-        type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
-    });
-
-    (|| {
-        writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?;
-        writeln!(
-            w,
-            "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
-        )?;
-        writeln!(w, "extern \"C\" __global__ void kernel(")?;
-        w.indent(|w| {
-            params.iter().enumerate().try_for_each(|(idx, p)| {
-                writeln!(
-                    w,
-                    "{} {}{end}",
-                    p.type_,
-                    p.name,
-                    end = if idx == params.len() - 1 { "" } else { "," }
-                )
-            })
-        })?;
-        writeln!(w, ") {{")?;
-
-        w.indent(|w| {
-            writeln!(
-                w,
-                "{output_type} *output = _output + (blockIdx.x * 1024);",
-                output_type = CUDAType::from(output.output_type())
-            )?;
-
-            writeln!(w, "__shared__ float s_output[1024];")?;
-
-            write_kernel_declarations(w, output);
-            writeln!(w)?;
-            output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-                writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
-            })
-        })?;
-
-        writeln!(
-            w,
-            "    for (int i = 0; i < 32; i++) {{
-        auto idx = i * 32 + threadIdx.x;
-        output[idx] = s_output[idx];
-    }}"
-        )?;
-
-        writeln!(w, "}}")
-    })()
-    .map_err(|e| vortex_err!("format err {e}"))
-}
-
-fn create_kernel(ctx: Arc<CudaContext>, array: &dyn GPUPipelineJIT) -> VortexResult<CudaFunction> {
-    let mut s = String::new();
-    let w = &mut s as &mut dyn Write;
-    let mut ind = IndentedWriter::new(w);
-    let w = &mut ind;
-
-    let _ = create_jit_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"));
-
-    let module =
-        cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
-
-    // Dynamically load it into the device
-    let module = ctx
-        .load_module(module)
-        .map_err(|e| vortex_err!("load module {e}"))?;
-
-    module
-        .load_function("kernel")
-        .map_err(|e| vortex_err!("load_function {e}"))
-}
-
-pub fn create_run_jit_kernel(
-    ctx: Arc<CudaContext>,
-    array: &ArrayRef,
-) -> VortexResult<(ArrayRef, Duration)> {
-    let stream = ctx.default_stream();
-
-    let output = handle_array(array, &stream, 0);
-    let kernel = create_kernel(ctx.clone(), output.as_ref())?;
-
-    let num_chunks =
-        u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements");
-
-    let mut launch_builder = stream.launch_builder(&kernel);
-
-    let launch_config = LaunchConfig {
-        grid_dim: (num_chunks, 1, 1),
-        block_dim: (output.launch_config().block_width, 1, 1),
-        shared_mem_bytes: u32::try_from(output.output_type().byte_width())
-            .vortex_expect("oversized output type byte width")
-            * 1024,
-    };
-
-    collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?;
-
-    match_each_native_ptype!(array.dtype().as_ptype(), |P| {
-        // append final argument (output) of the kernel
-        let mut out = stream.alloc_zeros::<P>(array.len()).unwrap();
-        launch_builder.arg(&mut out);
-        stream
-            .synchronize()
-            .map_err(|e| vortex_err!("failed to sync {e}"))?;
-        let start = stream
-            .record_event(Some(CU_EVENT_DEFAULT))
-            .ok()
-            .vortex_expect("Failed to record event");
-        let _ = unsafe { launch_builder.launch(launch_config) };
-        ctx.synchronize()
-            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
-        let end = stream
-            .record_event(Some(CU_EVENT_DEFAULT))
-            .ok()
-            .vortex_expect("Failed to record event");
-
-        let duration = start.elapsed_ms(&end).unwrap();
-
-        let mut buffer = BufferMut::<P>::with_capacity(array.len());
-        unsafe { buffer.set_len(array.len()) }
-
-        stream
-            .memcpy_dtoh(&out, &mut buffer)
-            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
-            .vortex_unwrap();
-        stream
-            .synchronize()
-            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))
-            .vortex_unwrap();
-        let c =
-            Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array();
-
-        Ok((c, Duration::from_secs_f32(duration / 1000.0)))
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use cudarc::driver::CudaContext;
-    use vortex_alp::{ALPArray, Exponents};
-    use vortex_array::arrays::PrimitiveArray;
-    use vortex_array::{IntoArray, ToCanonical};
-    use vortex_error::VortexResult;
-    use vortex_fastlanes::{BitPackedArray, FoRArray};
-
-    use crate::jit::create_run_jit_kernel;
-
-    #[test]
-    fn jit_arr_f32() -> VortexResult<()> {
-        let ctx = CudaContext::new(0).unwrap();
-        ctx.set_blocking_synchronize().unwrap();
-        let for_ = ALPArray::try_new(
-            FoRArray::try_new(
-                BitPackedArray::encode((0i32..1024 * 2).collect::<PrimitiveArray>().as_ref(), 12)?
-                    .into_array(),
-                2i32.into(),
-            )?
-            .into_array(),
-            Exponents { e: 4, f: 5 },
-            None,
-        )?
-        .into_array();
-
-        let (d, _) = create_run_jit_kernel(ctx, &for_)?;
-        let prim = d.to_primitive();
-        let expect = for_.to_primitive();
-
-        for i in 0..prim.len() {
-            assert_eq!(
-                prim.as_slice::<f32>()[i],
-                expect.as_slice::<f32>()[i],
-                "i = {i}"
-            );
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn jit_arrs() -> VortexResult<()> {
-        let ctx = CudaContext::new(0).unwrap();
-        ctx.set_blocking_synchronize().unwrap();
-        let for_ = BitPackedArray::encode(
-            (0i32..1024)
-                .map(|_| 1u32)
-                .collect::<PrimitiveArray>()
-                .as_ref(),
-            2,
-        )?
-        .into_array();
-
-        create_run_jit_kernel(ctx, &for_.into_array())?;
-
-        Ok(())
-    }
-}
diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
new file mode 100644
index 00000000000..3b2515900a9
--- /dev/null
+++ b/vortex-gpu/src/jit/arrays/alp.rs
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs};
+use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype};
+use vortex_dtype::PType;
+use vortex_error::VortexResult;
+
+use crate::indent::IndentedWriter;
+use crate::jit::convert::handle_array;
+use crate::jit::{
+    GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
+    StepIdAllocator,
+};
+
+struct ALP<A: ALPFloat> {
+    step_id: usize,
+    float_type: PType,
+    child: Box<dyn GPUPipelineJIT>,
+    f: A,
+    e: A,
+}
+
+pub fn new_jit(
+    alp: &ALPArray,
+    stream: &Arc<CudaStream>,
+    allocator: &mut StepIdAllocator,
+) -> Box<dyn GPUPipelineJIT> {
+    match_each_alp_float_ptype!(alp.ptype(), |A| {
+        let child = handle_array(alp.encoded(), stream, allocator);
+        let step_id = allocator.get_id();
+        Box::new(ScalarGPUPipelineJITNode {
+            inner: ALP {
+                step_id,
+                float_type: alp.ptype(),
+                child,
+                f: A::F10[alp.exponents().f as usize],
+                e: A::IF10[alp.exponents().e as usize],
+            },
+        })
+    })
+}
+
+impl<A: ALPFloat> ALP<A> {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn e_var(&self) -> String {
+        format!("e{}", self.step_id)
+    }
+
+    fn f_var(&self) -> String {
+        format!("f{}", self.step_id)
+    }
+}
+
+impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
+        params.extend([
+            GPUKernelParameter {
+                name: self.e_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+            GPUKernelParameter {
+                name: self.f_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+        ])
+    }
+
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
+        args.arg(&self.e);
+        args.arg(&self.f);
+        Ok(())
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.float_type);
+        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        self.child
+            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(
+                    w,
+                    "{out} = ((({type_}){tmp}) * {f}) * {e};",
+                    out = self.tmp_var(),
+                    type_ = CUDAType::from(self.float_type),
+                    tmp = self.child.output_var(),
+                    f = self.f_var(),
+                    e = self.e_var(),
+                )?;
+                f(w)
+            })
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.float_type
+    }
+
+    fn child(&self) -> &dyn GPUPipelineJIT {
+        self.child.as_ref()
+    }
+}
diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs
new file mode 100644
index 00000000000..9bdf6e50000
--- /dev/null
+++ b/vortex-gpu/src/jit/arrays/bitpack.rs
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
+use vortex_alp::{ALPArray, match_each_alp_float_ptype};
+use vortex_buffer::Buffer;
+use vortex_dtype::{NativePType, PType, match_each_native_ptype};
+use vortex_error::{VortexResult, VortexUnwrap, vortex_err};
+use vortex_fastlanes::BitPackedArray;
+
+use crate::indent::IndentedWriter;
+use crate::jit::{
+    CUDAType, GPUKernelParameter, GPULaunchConfig, GPUPipelineJIT, GPUVisitor, StepIdAllocator,
+};
+
+struct BitPack<P> {
+    step_id: usize,
+    bit_width: u8,
+    output_type: PType,
+    cuda_slice: CudaSlice<P>,
+}
+
+pub fn new_jit(
+    bp: &BitPackedArray,
+    stream: &Arc<CudaStream>,
+    allocator: &mut StepIdAllocator,
+) -> Box<dyn GPUPipelineJIT> {
+    assert_eq!(bp.offset(), 0);
+    assert!(bp.patches().is_none());
+    match_each_native_ptype!(bp.ptype(), |P| {
+        let values = Buffer::<P>::from_byte_buffer(bp.packed().clone());
+        let cuda_slice = stream
+            .memcpy_stod(values.as_slice())
+            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+            .vortex_unwrap();
+        let step_id = allocator.get_id();
+        Box::new(BitPack::<P> {
+            step_id,
+            bit_width: bp.bit_width(),
+            output_type: bp.ptype(),
+            cuda_slice,
+        })
+    })
+}
+
+impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
+        p.push(GPUKernelParameter {
+            name: self.in_var_g(),
+            type_: format!(
+                "{type_} *__restrict",
+                type_ = CUDAType::from(self.output_type.to_unsigned())
+            ),
+        });
+    }
+
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        launch_args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
+        launch_args.arg(&self.cuda_slice);
+
+        Ok(())
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.output_type);
+        let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned());
+        writeln!(
+            w,
+            "unsigned int LANE_COUNT = {bits};",
+            bits = 1024 / self.output_type.bit_width()
+        )?;
+        writeln!(w, "{output_cuda_type} {};", self.tmp_var())?;
+        writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?;
+        writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?;
+        writeln!(w, "unsigned int out_idx;")?;
+        writeln!(w, "unsigned int lane = threadIdx.x;")?;
+        writeln!(
+            w,
+            "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});",
+            in_l = self.in_var_l(),
+            in_g = self.in_var_g(),
+            bit_width = self.bit_width,
+            bit_size = P::PTYPE.byte_width()
+        )?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        let bit_width = self.bit_width as usize;
+        let bits = self.output_type.bit_width();
+        let in_ = self.in_var_l();
+        if bit_width == 0 {
+            writeln!(w, "uint{bits}_t zero = 0ULL;")?;
+            writeln!(w)?;
+            for row in 0..bits {
+                writeln!(w, "out[INDEX({row}, lane)] = zero;")?;
+            }
+        } else if bit_width == bits {
+            writeln!(w)?;
+            for row in 0..bits {
+                writeln!(
+                    w,
+                    "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];",
+                )?;
+            }
+        } else {
+            let src = self.src_var();
+            let utmp = self.utmp_var();
+            let tmp = self.tmp_var();
+
+            let mask_fn = |bits: usize| {
+                format!(
+                    "((({type_})1 << {width}) - 1)",
+                    type_ = CUDAType::from(P::PTYPE.to_unsigned()),
+                    width = bits
+                )
+            };
+
+            writeln!(w)?;
+            writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?;
+            for row in 0..bits {
+                let curr_word = (row * bit_width) / bits;
+                let next_word = ((row + 1) * bit_width) / bits;
+                let shift = (row * bit_width) % bits;
+
+                if next_word > curr_word {
+                    let remaining_bits = ((row + 1) * bit_width) % bits;
+                    let current_bits = bit_width - remaining_bits;
+                    writeln!(
+                        w,
+                        "{utmp} = ({src} >> {shift}) & {mask};",
+                        mask = mask_fn(current_bits)
+                    )?;
+
+                    if next_word < bit_width {
+                        writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?;
+                        writeln!(
+                            w,
+                            "{utmp} |= ({src} & {mask}) << {current_bits};",
+                            mask = mask_fn(remaining_bits)
+                        )?;
+                    }
+                } else {
+                    writeln!(
+                        w,
+                        "{utmp} = ({src} >> {shift}) & {mask};",
+                        mask = mask_fn(bit_width)
+                    )?;
+                }
+                writeln!(
+                    w,
+                    "{tmp} = ({type_}){utmp};",
+                    type_ = CUDAType::from(self.output_type),
+                )?;
+
+                writeln!(w, "out_idx = INDEX({row}, lane);")?;
+                f(w)?;
+                writeln!(w)?;
+            }
+        }
+        Ok(())
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.output_type
+    }
+
+    fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> {
+        Ok(())
+    }
+
+    fn launch_config(&self) -> GPULaunchConfig {
+        GPULaunchConfig {
+            block_width: if P::PTYPE == PType::U64 { 16 } else { 32 },
+        }
+    }
+}
+
+impl<P> BitPack<P> {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn src_var(&self) -> String {
+        format!("src{}", self.step_id)
+    }
+
+    fn utmp_var(&self) -> String {
+        format!("utmp{}", self.step_id)
+    }
+
+    fn in_var_l(&self) -> String {
+        format!("in{}", self.step_id)
+    }
+
+    fn in_var_g(&self) -> String {
+        format!("_in{}", self.step_id)
+    }
+}
diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs
new file mode 100644
index 00000000000..adca961968a
--- /dev/null
+++ b/vortex-gpu/src/jit/arrays/for_.rs
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs};
+use vortex_dtype::{NativePType, PType, match_each_native_ptype};
+use vortex_error::{VortexExpect, VortexResult};
+use vortex_fastlanes::{BitPackedArray, FoRArray};
+
+use crate::indent::IndentedWriter;
+use crate::jit::convert::handle_array;
+use crate::jit::{
+    CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
+    StepIdAllocator,
+};
+
+struct FoR<P> {
+    step_id: usize,
+    reference: P,
+    child: Box<dyn GPUPipelineJIT>,
+}
+
+pub fn new_jit(
+    for_: &FoRArray,
+    stream: &Arc<CudaStream>,
+    allocator: &mut StepIdAllocator,
+) -> Box<dyn GPUPipelineJIT> {
+    match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| {
+        let child = handle_array(for_.encoded(), stream, allocator);
+        Box::new(ScalarGPUPipelineJITNode {
+            inner: FoR {
+                step_id,
+                reference: for_
+                    .reference_scalar()
+                    .as_primitive()
+                    .as_::<P>()
+                    .vortex_expect("cannot have a null reference"),
+                child,
+            },
+        })
+    })
+}
+
+impl<P> FoR<P> {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn ref_var(&self) -> String {
+        format!("ref{}", self.step_id)
+    }
+}
+
+impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
+    fn step_id(&self) -> usize {
+        self.step_id
+    }
+
+    fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
+        p.push(GPUKernelParameter {
+            name: self.ref_var(),
+            type_: CUDAType::from(self.output_type()).to_string(),
+        })
+    }
+
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
+        args.arg(&self.reference);
+        Ok(())
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.output_type());
+        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        assert_eq!(self.output_type(), self.child.output_type());
+        let in_var = self.child.output_var();
+        let out_var = self.tmp_var();
+        let ref_var = self.ref_var();
+        self.child
+            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
+                f(w)
+            })
+    }
+
+    fn output_var(&self) -> String {
+        self.tmp_var()
+    }
+
+    fn output_type(&self) -> PType {
+        P::PTYPE
+    }
+
+    fn child(&self) -> &dyn GPUPipelineJIT {
+        self.child.as_ref()
+    }
+}
diff --git a/vortex-gpu/src/jit/arrays/mod.rs b/vortex-gpu/src/jit/arrays/mod.rs
new file mode 100644
index 00000000000..ab897d6dc2c
--- /dev/null
+++ b/vortex-gpu/src/jit/arrays/mod.rs
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+pub mod alp;
+pub mod bitpack;
+pub mod for_;
diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs
new file mode 100644
index 00000000000..ceb1b7138c5
--- /dev/null
+++ b/vortex-gpu/src/jit/convert.rs
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+
+use cudarc::driver::CudaStream;
+use itertools::all;
+use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype};
+use vortex_array::{Array, ArrayRef};
+use vortex_buffer::Buffer;
+use vortex_dtype::match_each_native_ptype;
+use vortex_error::{VortexUnwrap, vortex_err};
+use vortex_fastlanes::{BitPackedVTable, FoRVTable};
+
+use crate::jit::arrays::{alp, bitpack, for_};
+use crate::jit::{GPUPipelineJIT, ScalarGPUPipelineJITNode, StepIdAllocator};
+
+pub fn handle_array(
+    a: &ArrayRef,
+    stream: &Arc<CudaStream>,
+    allocator: &mut StepIdAllocator,
+) -> Box<dyn GPUPipelineJIT> {
+    if let Some(alp) = a.as_opt::<ALPVTable>() {
+        return alp::new_jit(alp, stream, allocator);
+    }
+    if let Some(bp) = a.as_opt::<BitPackedVTable>() {
+        return bitpack::new_jit(bp, stream, allocator);
+    };
+
+    if let Some(for_) = a.as_opt::<FoRVTable>() {
+        return for_::new_jit(for_, stream, allocator);
+    }
+
+    todo!("unimplemented jit for {}", a.encoding_id())
+}
diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs
new file mode 100644
index 00000000000..6ed585f1055
--- /dev/null
+++ b/vortex-gpu/src/jit/kernel_fmt.rs
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaContext, CudaFunction, CudaStream, LaunchArgs};
+use vortex_error::{VortexExpect, VortexResult, vortex_err};
+
+use crate::indent::IndentedWriter;
+use crate::jit::type_::CUDAType;
+use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor};
+
+struct DeclPrinter<'a, 'b: 'a> {
+    w: &'a mut IndentedWriter<&'b mut dyn Write>,
+}
+
+fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) {
+    let mut decl = DeclPrinter { w };
+    decl.accept(node).vortex_expect("write decl cannot fail");
+}
+
+impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.decls(self.w)
+            .map_err(|e| vortex_err!("cannot write {}", e))
+    }
+}
+
+struct InParamCollector {
+    params: Vec<GPUKernelParameter>,
+}
+
+impl GPUVisitor<'_> for InParamCollector {
+    fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.in_params(&mut self.params);
+        Ok(())
+    }
+}
+
+fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult<Vec<GPUKernelParameter>> {
+    let mut params = InParamCollector { params: Vec::new() };
+    params.accept(node)?;
+    Ok(params.params)
+}
+
+pub fn create_kernel_str(
+    w: &mut IndentedWriter<&mut dyn Write>,
+    output: &dyn GPUPipelineJIT,
+) -> VortexResult<()> {
+    let mut params = collect_in_param(output)?;
+    params.push(GPUKernelParameter {
+        name: "_output".to_string(),
+        type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
+    });
+
+    (|| {
+        // TODO: include when only for fast lanes codecs
+        writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?;
+        writeln!(
+            w,
+            "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
+        )?;
+        writeln!(w, "extern \"C\" __global__ void kernel(")?;
+        w.indent(|w| {
+            for (idx, p) in params.iter().enumerate() {
+                let separator = if idx < params.len() - 1 { "," } else { "" };
+                writeln!(w, "{} {}{}", p.type_, p.name, separator)?;
+            }
+            Ok(())
+        })?;
+        writeln!(w, ") {{")?;
+
+        w.indent(|w| {
+            writeln!(
+                w,
+                "{output_type} *output = _output + (blockIdx.x * 1024);",
+                output_type = CUDAType::from(output.output_type())
+            )?;
+
+            writeln!(w, "__shared__ float s_output[1024];")?;
+
+            write_kernel_declarations(w, output);
+            writeln!(w)?;
+            output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+                writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
+            })?;
+            writeln!(w)?;
+
+            writeln!(w, "for (int i = 0; i < 32; i++) {{")?;
+            w.indent(|w| {
+                writeln!(w, "auto idx = i * 32 + threadIdx.x;")?;
+                writeln!(w, "output[idx] = s_output[idx];")
+            })?;
+            writeln!(w, "}}")
+        })?;
+
+        writeln!(w, "}}")
+    })()
+    .map_err(|e| vortex_err!("format err {e}"))
+}
+
+pub fn create_kernel(
+    ctx: Arc<CudaContext>,
+    array: &dyn GPUPipelineJIT,
+) -> VortexResult<CudaFunction> {
+    let mut s = String::new();
+    let w = &mut s as &mut dyn Write;
+    let mut ind = IndentedWriter::new(w);
+    let w = &mut ind;
+
+    create_kernel_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"))?;
+
+    let module =
+        cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
+
+    // Dynamically load it into the device
+    let module = ctx
+        .load_module(module)
+        .map_err(|e| vortex_err!("load module {e}"))?;
+
+    module
+        .load_function("kernel")
+        .map_err(|e| vortex_err!("load_function {e}"))
+}
diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
new file mode 100644
index 00000000000..fe558996055
--- /dev/null
+++ b/vortex-gpu/src/jit/mod.rs
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+mod arrays;
+mod convert;
+mod kernel_fmt;
+mod run;
+mod type_;
+
+use std::fmt;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaStream, LaunchArgs};
+pub use run::create_run_jit_kernel;
+use vortex_dtype::PType;
+use vortex_error::VortexResult;
+
+use crate::indent::IndentedWriter;
+
+pub trait GPUPipelineJIT {
+    fn step_id(&self) -> usize;
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result;
+
+    fn output_var(&self) -> String;
+
+    fn output_type(&self) -> PType;
+
+    // always pass the output iteration aligned child last.
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>;
+
+    fn launch_config(&self) -> GPULaunchConfig;
+}
+
+pub trait ScalarGPUPipelineJIT {
+    fn step_id(&self) -> usize;
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result;
+
+    fn output_var(&self) -> String;
+
+    fn output_type(&self) -> PType;
+
+    fn child(&self) -> &dyn GPUPipelineJIT;
+}
+
+#[derive(Default)]
+struct StepIdAllocator {
+    next_id: usize,
+}
+
+impl StepIdAllocator {
+    pub fn get_id(&mut self) -> usize {
+        let id = self.next_id;
+        self.next_id += 1;
+        id
+    }
+}
+
+trait GPUVisitor<'a> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
+}
+
+struct GPUKernelParameter {
+    name: String,
+    type_: String,
+}
+
+struct GPULaunchConfig {
+    block_width: u32,
+}
+
+struct ScalarGPUPipelineJITNode<T> {
+    inner: T,
+}
+
+impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
+    fn step_id(&self) -> usize {
+        self.inner.step_id()
+    }
+
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
+        self.inner.in_params(params)
+    }
+
+    fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()> {
+        self.inner.args(stream, args)
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        self.inner.decls(w)
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+    ) -> fmt::Result {
+        self.inner.kernel_body(w, f)
+    }
+
+    fn output_var(&self) -> String {
+        self.inner.output_var()
+    }
+
+    fn output_type(&self) -> PType {
+        self.inner.output_type()
+    }
+
+    fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> {
+        visitor.accept(self.inner.child())
+    }
+
+    fn launch_config(&self) -> GPULaunchConfig {
+        self.inner.child().launch_config()
+    }
+}
diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs
new file mode 100644
index 00000000000..c16f9f4db36
--- /dev/null
+++ b/vortex-gpu/src/jit/run.rs
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+use std::time::Duration;
+
+use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
+use cudarc::driver::{CudaContext, CudaStream, LaunchArgs, LaunchConfig, PushKernelArg};
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::validity::Validity;
+use vortex_array::{ArrayRef, Canonical, IntoArray};
+use vortex_buffer::BufferMut;
+use vortex_dtype::match_each_native_ptype;
+use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
+
+use crate::jit::convert::handle_array;
+use crate::jit::kernel_fmt::create_kernel;
+use crate::jit::{GPUPipelineJIT, GPUVisitor, StepIdAllocator};
+
+pub fn create_run_jit_kernel(
+    ctx: Arc<CudaContext>,
+    array: &ArrayRef,
+) -> VortexResult<(ArrayRef, Duration)> {
+    let stream = ctx.default_stream();
+
+    let mut allocator = StepIdAllocator::new();
+    let output = handle_array(array, &stream, &mut allocator);
+    let kernel = create_kernel(ctx.clone(), output.as_ref())?;
+
+    let num_chunks =
+        u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements");
+
+    let mut launch_builder = stream.launch_builder(&kernel);
+
+    let launch_config = LaunchConfig {
+        grid_dim: (num_chunks, 1, 1),
+        block_dim: (output.launch_config().block_width, 1, 1),
+        shared_mem_bytes: u32::try_from(output.output_type().byte_width())
+            .vortex_expect("oversized output type byte width")
+            * 1024,
+    };
+
+    collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?;
+
+    match_each_native_ptype!(array.dtype().as_ptype(), |P| {
+        // append final argument (output) of the kernel
+        let mut out = stream
+            .alloc_zeros::<P>(array.len())
+            .map_err(|e| vortex_err!("failed to alloc zeros {e}"))?;
+        launch_builder.arg(&mut out);
+        stream
+            .synchronize()
+            .map_err(|e| vortex_err!("failed to sync {e}"))?;
+        let start = stream
+            .record_event(Some(CU_EVENT_DEFAULT))
+            .ok()
+            .vortex_expect("Failed to record event");
+        let _ = unsafe { launch_builder.launch(launch_config) };
+        ctx.synchronize()
+            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
+        let end = stream
+            .record_event(Some(CU_EVENT_DEFAULT))
+            .ok()
+            .vortex_expect("Failed to record event");
+
+        let duration = start
+            .elapsed_ms(&end)
+            .map_err(|e| vortex_err!("failed to get elapsed time {e}"))?;
+
+        let mut buffer = BufferMut::<P>::with_capacity(array.len());
+        unsafe { buffer.set_len(array.len()) }
+
+        stream
+            .memcpy_dtoh(&out, &mut buffer)
+            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
+            .vortex_unwrap();
+        stream
+            .synchronize()
+            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))
+            .vortex_unwrap();
+        let c =
+            Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array();
+
+        Ok((c, Duration::from_secs_f32(duration / 1000.0)))
+    })
+}
+
+struct ArgCollector<'a, 'b> {
+    stream: Arc<CudaStream>,
+    params: &'b mut LaunchArgs<'a>,
+}
+
+impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> {
+    fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> {
+        node.children(self)?;
+        node.args(&self.stream, self.params)?;
+        Ok(())
+    }
+}
+
+fn collect_args<'a>(
+    node: &'a dyn GPUPipelineJIT,
+    stream: Arc<CudaStream>,
+    args: &mut LaunchArgs<'a>,
+) -> VortexResult<()> {
+    let mut collector = ArgCollector {
+        stream,
+
+        params: args,
+    };
+    collector.accept(node)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use cudarc::driver::CudaContext;
+    use vortex_alp::{ALPArray, Exponents};
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::{IntoArray, ToCanonical};
+    use vortex_error::VortexResult;
+    use vortex_fastlanes::{BitPackedArray, FoRArray};
+
+    use crate::jit::create_run_jit_kernel;
+
+    #[test]
+    fn jit_arr_f32() -> VortexResult<()> {
+        let ctx = CudaContext::new(0).unwrap();
+        ctx.set_blocking_synchronize().unwrap();
+        let for_ = ALPArray::try_new(
+            FoRArray::try_new(
+                BitPackedArray::encode((0i32..1024 * 2).collect::<PrimitiveArray>().as_ref(), 12)?
+                    .into_array(),
+                2i32.into(),
+            )?
+            .into_array(),
+            Exponents { e: 4, f: 5 },
+            None,
+        )?
+        .into_array();
+
+        let (d, _) = create_run_jit_kernel(ctx, &for_)?;
+        let prim = d.to_primitive();
+        let expect = for_.to_primitive();
+
+        for i in 0..prim.len() {
+            assert_eq!(
+                prim.as_slice::<f32>()[i],
+                expect.as_slice::<f32>()[i],
+                "i = {i}"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn jit_arrs() -> VortexResult<()> {
+        let ctx = CudaContext::new(0).unwrap();
+        ctx.set_blocking_synchronize().unwrap();
+        let for_ = BitPackedArray::encode(
+            (0i32..1024)
+                .map(|_| 1u32)
+                .collect::<PrimitiveArray>()
+                .as_ref(),
+            2,
+        )?
+        .into_array();
+
+        create_run_jit_kernel(ctx, &for_.into_array())?;
+
+        Ok(())
+    }
+}
diff --git a/vortex-gpu/src/jit/type_.rs b/vortex-gpu/src/jit/type_.rs
new file mode 100644
index 00000000000..8ae20f3a75d
--- /dev/null
+++ b/vortex-gpu/src/jit/type_.rs
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Display;
+
+use vortex_dtype::PType;
+
+pub struct CUDAType(&'static str);
+
+impl Display for CUDAType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.0)
+    }
+}
+
+impl From<PType> for CUDAType {
+    fn from(value: PType) -> Self {
+        CUDAType(match value {
+            PType::U8 => "unsigned char",
+            PType::U16 => "unsigned short",
+            PType::U32 => "unsigned int",
+            PType::U64 => "unsigned long long",
+            PType::I8 => "char",
+            PType::I16 => "short",
+            PType::I32 => "int",
+            PType::I64 => "long long",
+            PType::F32 => "float",
+            PType::F64 => "double",
+            PType::F16 => todo!(),
+        })
+    }
+}

From fcdde32d71a89569229ea952dac4b39931a0c24d Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 16:25:35 -0400
Subject: [PATCH 07/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/indent.rs             |  2 ++
 vortex-gpu/src/jit/arrays/alp.rs     |  6 +++---
 vortex-gpu/src/jit/arrays/bitpack.rs | 11 +++++------
 vortex-gpu/src/jit/arrays/for_.rs    | 21 ++++++++++-----------
 vortex-gpu/src/jit/convert.rs        | 12 ++++++------
 vortex-gpu/src/jit/kernel_fmt.rs     | 15 ++++++---------
 vortex-gpu/src/jit/mod.rs            |  7 ++++---
 vortex-gpu/src/jit/run.rs            |  7 +++----
 8 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs
index 5e7d587d202..7594963be69 100644
--- a/vortex-gpu/src/indent.rs
+++ b/vortex-gpu/src/indent.rs
@@ -32,3 +32,5 @@ impl<W: Write> IndentedWriter<W> {
         write!(self.write, "{}{}", self.indent, fmt)
     }
 }
+
+pub type IndentedWrite<'a> = IndentedWriter<&'a mut dyn Write>;
diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
index 3b2515900a9..a254a3e69b3 100644
--- a/vortex-gpu/src/jit/arrays/alp.rs
+++ b/vortex-gpu/src/jit/arrays/alp.rs
@@ -5,7 +5,7 @@ use std::fmt;
 use std::fmt::Write;
 use std::sync::Arc;
 
-use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs};
+use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
 use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype};
 use vortex_dtype::PType;
 use vortex_error::VortexResult;
@@ -13,7 +13,7 @@ use vortex_error::VortexResult;
 use crate::indent::IndentedWriter;
 use crate::jit::convert::handle_array;
 use crate::jit::{
-    GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
+    CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
     StepIdAllocator,
 };
 
@@ -32,7 +32,7 @@ pub fn new_jit(
 ) -> Box<dyn GPUPipelineJIT> {
     match_each_alp_float_ptype!(alp.ptype(), |A| {
         let child = handle_array(alp.encoded(), stream, allocator);
-        let step_id = allocator.get_id();
+        let step_id = allocator.fresh_id();
         Box::new(ScalarGPUPipelineJITNode {
             inner: ALP {
                 step_id,
diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs
index 9bdf6e50000..b93b592aaa9 100644
--- a/vortex-gpu/src/jit/arrays/bitpack.rs
+++ b/vortex-gpu/src/jit/arrays/bitpack.rs
@@ -6,13 +6,12 @@ use std::fmt::Write;
 use std::sync::Arc;
 
 use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
-use vortex_alp::{ALPArray, match_each_alp_float_ptype};
 use vortex_buffer::Buffer;
 use vortex_dtype::{NativePType, PType, match_each_native_ptype};
 use vortex_error::{VortexResult, VortexUnwrap, vortex_err};
 use vortex_fastlanes::BitPackedArray;
 
-use crate::indent::IndentedWriter;
+use crate::indent::IndentedWrite;
 use crate::jit::{
     CUDAType, GPUKernelParameter, GPULaunchConfig, GPUPipelineJIT, GPUVisitor, StepIdAllocator,
 };
@@ -37,7 +36,7 @@ pub fn new_jit(
             .memcpy_stod(values.as_slice())
             .map_err(|e| vortex_err!("Failed to copy to device: {e}"))
             .vortex_unwrap();
-        let step_id = allocator.get_id();
+        let step_id = allocator.fresh_id();
         Box::new(BitPack::<P> {
             step_id,
             bit_width: bp.bit_width(),
@@ -72,7 +71,7 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         Ok(())
     }
 
-    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+    fn decls(&self, w: &mut IndentedWrite) -> fmt::Result {
         let output_cuda_type = CUDAType::from(self.output_type);
         let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned());
         writeln!(
@@ -98,8 +97,8 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
 
     fn kernel_body(
         &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+        w: &mut IndentedWrite,
+        f: &dyn Fn(&mut IndentedWrite) -> fmt::Result,
     ) -> fmt::Result {
         let bit_width = self.bit_width as usize;
         let bits = self.output_type.bit_width();
diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs
index adca961968a..f0752d32a0e 100644
--- a/vortex-gpu/src/jit/arrays/for_.rs
+++ b/vortex-gpu/src/jit/arrays/for_.rs
@@ -5,12 +5,12 @@ use std::fmt;
 use std::fmt::Write;
 use std::sync::Arc;
 
-use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs};
+use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
 use vortex_dtype::{NativePType, PType, match_each_native_ptype};
 use vortex_error::{VortexExpect, VortexResult};
-use vortex_fastlanes::{BitPackedArray, FoRArray};
+use vortex_fastlanes::FoRArray;
 
-use crate::indent::IndentedWriter;
+use crate::indent::{IndentedWrite, IndentedWriter};
 use crate::jit::convert::handle_array;
 use crate::jit::{
     CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
@@ -32,7 +32,7 @@ pub fn new_jit(
         let child = handle_array(for_.encoded(), stream, allocator);
         Box::new(ScalarGPUPipelineJITNode {
             inner: FoR {
-                step_id,
+                step_id: allocator.fresh_id(),
                 reference: for_
                     .reference_scalar()
                     .as_primitive()
@@ -83,18 +83,17 @@ impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
 
     fn kernel_body(
         &self,
-        w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
+        w: &mut IndentedWrite,
+        f: &dyn Fn(&mut IndentedWrite) -> fmt::Result,
     ) -> fmt::Result {
         assert_eq!(self.output_type(), self.child.output_type());
         let in_var = self.child.output_var();
         let out_var = self.tmp_var();
         let ref_var = self.ref_var();
-        self.child
-            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
-                writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
-                f(w)
-            })
+        self.child.kernel_body(w, &|w: &mut IndentedWrite| {
+            writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
+            f(w)
+        })
     }
 
     fn output_var(&self) -> String {
diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs
index ceb1b7138c5..60d0740a1ca 100644
--- a/vortex-gpu/src/jit/convert.rs
+++ b/vortex-gpu/src/jit/convert.rs
@@ -4,16 +4,16 @@
 use std::sync::Arc;
 
 use cudarc::driver::CudaStream;
-use itertools::all;
-use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype};
+use vortex_alp::ALPVTable;
 use vortex_array::{Array, ArrayRef};
-use vortex_buffer::Buffer;
-use vortex_dtype::match_each_native_ptype;
-use vortex_error::{VortexUnwrap, vortex_err};
 use vortex_fastlanes::{BitPackedVTable, FoRVTable};
 
 use crate::jit::arrays::{alp, bitpack, for_};
-use crate::jit::{GPUPipelineJIT, ScalarGPUPipelineJITNode, StepIdAllocator};
+use crate::jit::{GPUPipelineJIT, StepIdAllocator};
+
+pub fn new_jit_array(a: &ArrayRef, stream: &Arc<CudaStream>) -> Box<dyn GPUPipelineJIT> {
+    handle_array(a, stream, &mut StepIdAllocator::default())
+}
 
 pub fn handle_array(
     a: &ArrayRef,
diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs
index 6ed585f1055..d4420ac8002 100644
--- a/vortex-gpu/src/jit/kernel_fmt.rs
+++ b/vortex-gpu/src/jit/kernel_fmt.rs
@@ -4,18 +4,18 @@
 use std::fmt::Write;
 use std::sync::Arc;
 
-use cudarc::driver::{CudaContext, CudaFunction, CudaStream, LaunchArgs};
+use cudarc::driver::{CudaContext, CudaFunction};
 use vortex_error::{VortexExpect, VortexResult, vortex_err};
 
-use crate::indent::IndentedWriter;
+use crate::indent::{IndentedWrite, IndentedWriter};
 use crate::jit::type_::CUDAType;
 use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor};
 
 struct DeclPrinter<'a, 'b: 'a> {
-    w: &'a mut IndentedWriter<&'b mut dyn Write>,
+    w: &'a mut IndentedWrite<'b>,
 }
 
-fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) {
+fn write_kernel_declarations(w: &mut IndentedWrite, node: &dyn GPUPipelineJIT) {
     let mut decl = DeclPrinter { w };
     decl.accept(node).vortex_expect("write decl cannot fail");
 }
@@ -46,10 +46,7 @@ fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult<Vec<GPUKernelPara
     Ok(params.params)
 }
 
-pub fn create_kernel_str(
-    w: &mut IndentedWriter<&mut dyn Write>,
-    output: &dyn GPUPipelineJIT,
-) -> VortexResult<()> {
+pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) -> VortexResult<()> {
     let mut params = collect_in_param(output)?;
     params.push(GPUKernelParameter {
         name: "_output".to_string(),
@@ -84,7 +81,7 @@ pub fn create_kernel_str(
 
             write_kernel_declarations(w, output);
             writeln!(w)?;
-            output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+            output.kernel_body(w, &|w: &mut IndentedWrite| {
                 writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
             })?;
             writeln!(w)?;
diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index fe558996055..911de7e0bc7 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -13,6 +13,7 @@ use std::sync::Arc;
 
 use cudarc::driver::{CudaStream, LaunchArgs};
 pub use run::create_run_jit_kernel;
+pub use type_::CUDAType;
 use vortex_dtype::PType;
 use vortex_error::VortexResult;
 
@@ -71,7 +72,7 @@ struct StepIdAllocator {
 }
 
 impl StepIdAllocator {
-    pub fn get_id(&mut self) -> usize {
+    pub fn fresh_id(&mut self) -> usize {
         let id = self.next_id;
         self.next_id += 1;
         id
@@ -82,12 +83,12 @@ trait GPUVisitor<'a> {
     fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
 }
 
-struct GPUKernelParameter {
+pub struct GPUKernelParameter {
     name: String,
     type_: String,
 }
 
-struct GPULaunchConfig {
+pub struct GPULaunchConfig {
     block_width: u32,
 }
 
diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs
index c16f9f4db36..85ce12b92f5 100644
--- a/vortex-gpu/src/jit/run.rs
+++ b/vortex-gpu/src/jit/run.rs
@@ -13,9 +13,9 @@ use vortex_buffer::BufferMut;
 use vortex_dtype::match_each_native_ptype;
 use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err};
 
-use crate::jit::convert::handle_array;
+use crate::jit::convert::new_jit_array;
 use crate::jit::kernel_fmt::create_kernel;
-use crate::jit::{GPUPipelineJIT, GPUVisitor, StepIdAllocator};
+use crate::jit::{GPUPipelineJIT, GPUVisitor};
 
 pub fn create_run_jit_kernel(
     ctx: Arc<CudaContext>,
@@ -23,8 +23,7 @@ pub fn create_run_jit_kernel(
 ) -> VortexResult<(ArrayRef, Duration)> {
     let stream = ctx.default_stream();
 
-    let mut allocator = StepIdAllocator::new();
-    let output = handle_array(array, &stream, &mut allocator);
+    let output = new_jit_array(array, &stream);
     let kernel = create_kernel(ctx.clone(), output.as_ref())?;
 
     let num_chunks =

From dcddbbbbcb5a2bc81f1bd089f476a03cf1c93881 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 16:26:55 -0400
Subject: [PATCH 08/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/kernels/fused_bitpack_for.cu | 22 ++--------------------
 vortex-gpu/src/for_bp.rs                |  6 +++---
 vortex-gpu/src/task.rs                  |  2 --
 3 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/vortex-gpu/kernels/fused_bitpack_for.cu b/vortex-gpu/kernels/fused_bitpack_for.cu
index 0ba14fedbe5..4fd4bc48149 100644
--- a/vortex-gpu/kernels/fused_bitpack_for.cu
+++ b/vortex-gpu/kernels/fused_bitpack_for.cu
@@ -108,28 +108,13 @@ __device__ __forceinline__ void for_device(
     }
 }
 
-template<typename ValueT>
-__device__ __forceinline__ void float_device(
-    ValueT *__restrict values_in,
-    float *__restrict values_out,
-    int thread_idx
-) {
-    auto i = thread_idx;
-    const int thread_ops = blockDim.x;
-
-    for (auto j = 0; j < thread_ops; j++) {
-        auto idx = INDEX(j, i);
-        values_out[idx] = (float)values_in[idx];
-    }
-}
-
 
 // Fused kernel: bitpack unpack (3bw) + FoR addition in one pass
 // This eliminates the intermediate write-to-memory and read-from-memory
 // by keeping unpacked values in registers/L1 cache and immediately adding the reference
 extern "C" __global__ void fused_bitpack6_for_u32(
     const uint32_t *__restrict packed_in,
-    float *__restrict unpacked_out,
+    uint32_t *__restrict unpacked_out,
     uint32_t reference
 ) {
     int i = threadIdx.x;
@@ -139,16 +124,13 @@ extern "C" __global__ void fused_bitpack6_for_u32(
     auto out = unpacked_out + (blockIdx.x * 1024);
 
     __shared__ uint32_t shared_data[1024];
-    __shared__ float shared_data2[1024];
 
     fls_unpack_6bw_32ow_device(in, shared_data, i);
 
     for_device(shared_data, reference, i);
 
-    float_device(shared_data, shared_data2, i);
-
     for (int i = 0; i < 32; i++) {
         auto idx = i * 32 + threadIdx.x;
-        out[idx] = shared_data2[idx];
+        out[idx] = shared_data[idx];
     }
 }
diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs
index f9e7fe6860c..e2b8fb19e40 100644
--- a/vortex-gpu/src/for_bp.rs
+++ b/vortex-gpu/src/for_bp.rs
@@ -26,7 +26,7 @@ struct FoRBPTask<P> {
     launch_config: LaunchConfig,
 
     packed: CudaSlice<P>,
-    unpacked: CudaSlice<f32>,
+    unpacked: CudaSlice<P>,
     reference: P,
 
     len: usize,
@@ -53,7 +53,7 @@ pub fn new_task(
         .map_err(|e| vortex_err!("Failed to copy to device: {e}"))?;
     let cu_out = unsafe {
         stream
-            .alloc::<f32>(array.len().next_multiple_of(1024))
+            .alloc::<u32>(array.len().next_multiple_of(1024))
             .map_err(|e| vortex_err!("Failed to allocate stream: {e}"))?
     };
 
@@ -101,7 +101,7 @@ impl<P: NativePType + DeviceRepr> GPUTask for FoRBPTask<P> {
 
     fn export_result(&mut self) -> VortexResult<Canonical> {
         let len = self.len();
-        let mut buffer = BufferMut::<f32>::with_capacity(len);
+        let mut buffer = BufferMut::<P>::with_capacity(len);
 
         unsafe { buffer.set_len(len) }
         self.stream
diff --git a/vortex-gpu/src/task.rs b/vortex-gpu/src/task.rs
index 1b678e74508..b0bfdd38f03 100644
--- a/vortex-gpu/src/task.rs
+++ b/vortex-gpu/src/task.rs
@@ -16,6 +16,4 @@ pub trait GPUTask {
     fn output(&mut self) -> CudaViewMut<'_, u8>;
 
     fn len(&self) -> usize;
-
-    // fn jit(&mut self) -> OptBox<dyn JITGPUTask> ;
 }

From c0b4a8d7ab2937161ec1b93602227eff5e8d18fc Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 17:41:04 -0400
Subject: [PATCH 09/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/mod.rs | 63 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index 911de7e0bc7..244897ad4b4 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -1,6 +1,51 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+//! # JIT Kernel Composition System
+//!
+//! This module generates CUDA kernels by composing encoding steps into a single fused kernel.
+//!
+//! ## How Kernels Are Built
+//!
+//! Each encoding step (BitPack, FoR, ALP) implements `GPUPipelineJIT` and contributes:
+//! 1. **Input parameters** - Data passed from host to device (arrays, scalars)
+//! 2. **Declarations** - Local variables needed for the step
+//! 3. **Kernel body** - The actual computation logic
+//! 4. **Output variable** - The result of this step (e.g., `tmp0`, `tmp1`)
+//!
+//! Steps are composed in a tree structure. For example: `ALP -> FoR -> BitPack`
+//!
+//! ## Data Flow Between Steps
+//!
+//! Each step produces an **output variable** that the parent step consumes:
+//!
+//! ```text
+//! BitPack:  unpacks data → produces `tmp0`
+//! FoR:      reads `tmp0` → adds reference → produces `tmp1` = tmp0 + ref0
+//! ALP:      reads `tmp1` → scales → produces `tmp2` = tmp1 * f2 * e2
+//! ```
+//!
+//! The `output_var()` method returns the variable name (e.g., "tmp2") that subsequent
+//! steps or the final output can read from.
+//!
+//! ## Writing to Final Output
+//!
+//! Each step computes a value and passes `out_idx` to its continuation:
+//! - **out_idx**: The index in the output array where this value should be written
+//! - The innermost step calculates `out_idx` based on thread/block layout
+//! - Parent steps pass this index through to their continuation function
+//! - The root continuation writes: `output[out_idx] = <final_output_var>`
+//!
+//! Example flow:
+//! ```cuda
+//! // BitPack calculates out_idx
+//! out_idx = INDEX(row, lane);
+//! // Then calls continuation with out_idx available
+//! // FoR does: tmp1 = tmp0 + ref0; calls its continuation
+//! // ALP does: tmp2 = tmp1 * f2 * e2; calls its continuation
+//! // Final: output[out_idx] = tmp2;
+//! ```
+
 mod arrays;
 mod convert;
 mod kernel_fmt;
@@ -19,28 +64,44 @@ use vortex_error::VortexResult;
 
 use crate::indent::IndentedWriter;
 
+/// Trait for encoding steps that can be JIT-compiled into a CUDA kernel.
+///
+/// Each step contributes a piece of the kernel and specifies its output variable
+/// that subsequent steps can read from.
 pub trait GPUPipelineJIT {
+    /// Unique identifier for this step (used to generate unique variable names)
     fn step_id(&self) -> usize;
 
+    /// Adds input parameters (e.g., device pointers, scalars) to the kernel signature
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
 
+    /// Adds arguments to the kernel launch (actual values passed at runtime)
     fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
 
+    /// Writes variable declarations needed by this step
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result;
 
+    /// Writes the kernel body for this step.
+    ///
+    /// The continuation function `f` should be called after computing this step's output,
+    /// allowing parent steps to consume the output variable via `output_var()`.
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
         f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
     ) -> fmt::Result;
 
+    /// Returns the output variable name (e.g., "tmp0") that this step produces.
+    /// Parent steps read this variable to consume the output.
     fn output_var(&self) -> String;
 
+    /// Returns the type of the output variable
     fn output_type(&self) -> PType;
 
-    // always pass the output iteration aligned child last.
+    /// Visits child steps in the pipeline tree
     fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>;
 
+    /// Returns the launch configuration (block size, etc.) for this kernel
     fn launch_config(&self) -> GPULaunchConfig;
 }
 

From 22b1b45fc6d16373988500e4be4435b2adab9b91 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 17:43:03 -0400
Subject: [PATCH 10/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/Cargo.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml
index 0a81e79367c..23fbd517361 100644
--- a/vortex-gpu/Cargo.toml
+++ b/vortex-gpu/Cargo.toml
@@ -16,10 +16,10 @@ publish = false
 
 [dependencies]
 cudarc = { workspace = true, features = ["f16"] }
-parking_lot = { workspace = true }
 itertools = { workspace = true }
-vortex-array = { workspace = true }
+parking_lot = { workspace = true }
 vortex-alp = { workspace = true }
+vortex-array = { workspace = true }
 vortex-buffer = { workspace = true, features = ["cuda"] }
 vortex-dict = { workspace = true }
 vortex-dtype = { workspace = true }

From 060c666b4abd130d83ccb67fba63140b71e0b19f Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 14 Oct 2025 17:54:09 -0400
Subject: [PATCH 11/15] wip

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/arrays/alp.rs     |  4 ----
 vortex-gpu/src/jit/arrays/bitpack.rs |  5 -----
 vortex-gpu/src/jit/arrays/for_.rs    |  4 ----
 vortex-gpu/src/jit/mod.rs            | 11 +----------
 4 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
index a254a3e69b3..3d7db15603b 100644
--- a/vortex-gpu/src/jit/arrays/alp.rs
+++ b/vortex-gpu/src/jit/arrays/alp.rs
@@ -60,10 +60,6 @@ impl<A: ALPFloat> ALP<A> {
 }
 
 impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
         params.extend([
             GPUKernelParameter {
diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs
index b93b592aaa9..6276accb4d4 100644
--- a/vortex-gpu/src/jit/arrays/bitpack.rs
+++ b/vortex-gpu/src/jit/arrays/bitpack.rs
@@ -2,7 +2,6 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt;
-use std::fmt::Write;
 use std::sync::Arc;
 
 use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
@@ -47,10 +46,6 @@ pub fn new_jit(
 }
 
 impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
             name: self.in_var_g(),
diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs
index f0752d32a0e..1e97c6ae47b 100644
--- a/vortex-gpu/src/jit/arrays/for_.rs
+++ b/vortex-gpu/src/jit/arrays/for_.rs
@@ -55,10 +55,6 @@ impl<P> FoR<P> {
 }
 
 impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
-    fn step_id(&self) -> usize {
-        self.step_id
-    }
-
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
             name: self.ref_var(),
diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index 244897ad4b4..497543e7f1e 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -69,9 +69,6 @@ use crate::indent::IndentedWriter;
 /// Each step contributes a piece of the kernel and specifies its output variable
 /// that subsequent steps can read from.
 pub trait GPUPipelineJIT {
-    /// Unique identifier for this step (used to generate unique variable names)
-    fn step_id(&self) -> usize;
-
     /// Adds input parameters (e.g., device pointers, scalars) to the kernel signature
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
 
@@ -106,8 +103,6 @@ pub trait GPUPipelineJIT {
 }
 
 pub trait ScalarGPUPipelineJIT {
-    fn step_id(&self) -> usize;
-
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>);
 
     fn args<'a>(&'a self, stream: &Arc<CudaStream>, args: &mut LaunchArgs<'a>) -> VortexResult<()>;
@@ -140,7 +135,7 @@ impl StepIdAllocator {
     }
 }
 
-trait GPUVisitor<'a> {
+pub trait GPUVisitor<'a> {
     fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
 }
 
@@ -158,10 +153,6 @@ struct ScalarGPUPipelineJITNode<T> {
 }
 
 impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
-    fn step_id(&self) -> usize {
-        self.inner.step_id()
-    }
-
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
         self.inner.in_params(params)
     }

From ec961836ef322c465b0aa1a644d15ab97751f06e Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 16 Oct 2025 11:57:22 -0400
Subject: [PATCH 12/15] fixup

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/arrays/alp.rs     | 27 ++++++++++------
 vortex-gpu/src/jit/arrays/bitpack.rs | 48 ++++++++++++++++++++++------
 vortex-gpu/src/jit/arrays/for_.rs    | 43 ++++++++++++++++---------
 vortex-gpu/src/jit/convert.rs        | 15 ++++++---
 vortex-gpu/src/jit/kernel_fmt.rs     | 23 +++++++------
 vortex-gpu/src/jit/mod.rs            | 37 +++++++++++++--------
 vortex-gpu/src/jit/run.rs            |  9 ++++--
 7 files changed, 137 insertions(+), 65 deletions(-)

diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
index 3d7db15603b..f71d8248341 100644
--- a/vortex-gpu/src/jit/arrays/alp.rs
+++ b/vortex-gpu/src/jit/arrays/alp.rs
@@ -10,7 +10,7 @@ use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype};
 use vortex_dtype::PType;
 use vortex_error::VortexResult;
 
-use crate::indent::IndentedWriter;
+use crate::indent::{IndentedWrite, IndentedWriter};
 use crate::jit::convert::handle_array;
 use crate::jit::{
     CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
@@ -29,9 +29,10 @@ pub fn new_jit(
     alp: &ALPArray,
     stream: &Arc<CudaStream>,
     allocator: &mut StepIdAllocator,
+    output_array: String,
 ) -> Box<dyn GPUPipelineJIT> {
     match_each_alp_float_ptype!(alp.ptype(), |A| {
-        let child = handle_array(alp.encoded(), stream, allocator);
+        let child = handle_array(alp.encoded(), stream, allocator, output_array);
         let step_id = allocator.fresh_id();
         Box::new(ScalarGPUPipelineJITNode {
             inner: ALP {
@@ -92,25 +93,31 @@ impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
+        f: &dyn Fn(
+            &mut IndentedWriter<&mut dyn Write>,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error> {
         self.child
-            .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| {
+            .kernel_body(w, &|w: &mut IndentedWrite, in_: GPUKernelParameter| {
+                let in_var = in_.name;
                 writeln!(
                     w,
-                    "{out} = ((({type_}){tmp}) * {f}) * {e};",
+                    "{out} = ((({type_}){in_var}) * {f}) * {e};",
                     out = self.tmp_var(),
                     type_ = CUDAType::from(self.float_type),
-                    tmp = self.child.output_var(),
                     f = self.f_var(),
                     e = self.e_var(),
                 )?;
-                f(w)
+                f(w, self.output_parameter())
             })
     }
 
-    fn output_var(&self) -> String {
-        self.tmp_var()
+    fn output_parameter(&self) -> GPUKernelParameter {
+        GPUKernelParameter {
+            name: self.tmp_var(),
+            type_: CUDAType::from(self.float_type).to_string(),
+        }
     }
 
     fn output_type(&self) -> PType {
diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs
index 6276accb4d4..da9a4a28fce 100644
--- a/vortex-gpu/src/jit/arrays/bitpack.rs
+++ b/vortex-gpu/src/jit/arrays/bitpack.rs
@@ -20,12 +20,14 @@ struct BitPack<P> {
     bit_width: u8,
     output_type: PType,
     cuda_slice: CudaSlice<P>,
+    output_array: String,
 }
 
 pub fn new_jit(
     bp: &BitPackedArray,
     stream: &Arc<CudaStream>,
     allocator: &mut StepIdAllocator,
+    output_array: String,
 ) -> Box<dyn GPUPipelineJIT> {
     assert_eq!(bp.offset(), 0);
     assert!(bp.patches().is_none());
@@ -41,6 +43,7 @@ pub fn new_jit(
             bit_width: bp.bit_width(),
             output_type: bp.ptype(),
             cuda_slice,
+            output_array,
         })
     })
 }
@@ -77,7 +80,6 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
         writeln!(w, "{output_cuda_type} {};", self.tmp_var())?;
         writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?;
         writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?;
-        writeln!(w, "unsigned int out_idx;")?;
         writeln!(w, "unsigned int lane = threadIdx.x;")?;
         writeln!(
             w,
@@ -93,8 +95,11 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
     fn kernel_body(
         &self,
         w: &mut IndentedWrite,
-        f: &dyn Fn(&mut IndentedWrite) -> fmt::Result,
-    ) -> fmt::Result {
+        f: &dyn Fn(
+            &mut IndentedWrite,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error> {
         let bit_width = self.bit_width as usize;
         let bits = self.output_type.bit_width();
         let in_ = self.in_var_l();
@@ -104,6 +109,10 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
             for row in 0..bits {
                 writeln!(w, "out[INDEX({row}, lane)] = zero;")?;
             }
+            Ok(GPUKernelParameter {
+                name: "none".to_string(),
+                type_: "t_none_".to_string(),
+            })
         } else if bit_width == bits {
             writeln!(w)?;
             for row in 0..bits {
@@ -112,6 +121,10 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                     "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];",
                 )?;
             }
+            Ok(GPUKernelParameter {
+                name: "none".to_string(),
+                type_: "t_none_".to_string(),
+            })
         } else {
             let src = self.src_var();
             let utmp = self.utmp_var();
@@ -162,20 +175,37 @@ impl<P: NativePType + DeviceRepr> GPUPipelineJIT for BitPack<P> {
                     type_ = CUDAType::from(self.output_type),
                 )?;
 
-                writeln!(w, "out_idx = INDEX({row}, lane);")?;
-                f(w)?;
+                let out = f(
+                    w,
+                    GPUKernelParameter {
+                        name: tmp.to_string(),
+                        type_: "unsigned int".to_string(),
+                    },
+                )?;
+                writeln!(
+                    w,
+                    "{output_a}[INDEX({row}, lane)] = {in_var};",
+                    output_a = self.output_array,
+                    in_var = out.name
+                )?;
                 writeln!(w)?;
             }
+            Ok(GPUKernelParameter {
+                name: "none___".to_string(),
+                type_: "t_none_".to_string(),
+            })
         }
-        Ok(())
     }
 
-    fn output_var(&self) -> String {
-        self.tmp_var()
+    fn output_parameter(&self) -> GPUKernelParameter {
+        GPUKernelParameter {
+            name: self.tmp_var(),
+            type_: CUDAType::from(self.output_type).to_string(),
+        }
     }
 
     fn output_type(&self) -> PType {
-        self.output_type
+        P::PTYPE
     }
 
     fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> {
diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs
index 1e97c6ae47b..420f6a0bfcb 100644
--- a/vortex-gpu/src/jit/arrays/for_.rs
+++ b/vortex-gpu/src/jit/arrays/for_.rs
@@ -27,9 +27,10 @@ pub fn new_jit(
     for_: &FoRArray,
     stream: &Arc<CudaStream>,
     allocator: &mut StepIdAllocator,
+    output_array: String,
 ) -> Box<dyn GPUPipelineJIT> {
     match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| {
-        let child = handle_array(for_.encoded(), stream, allocator);
+        let child = handle_array(for_.encoded(), stream, allocator, output_array);
         Box::new(ScalarGPUPipelineJITNode {
             inner: FoR {
                 step_id: allocator.fresh_id(),
@@ -58,7 +59,7 @@ impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
     fn in_params(&self, p: &mut Vec<GPUKernelParameter>) {
         p.push(GPUKernelParameter {
             name: self.ref_var(),
-            type_: CUDAType::from(self.output_type()).to_string(),
+            type_: self.output_parameter().type_,
         })
     }
 
@@ -72,28 +73,40 @@ impl<P: NativePType + DeviceRepr> ScalarGPUPipelineJIT for FoR<P> {
     }
 
     fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
-        let output_cuda_type = CUDAType::from(self.output_type());
-        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        let output_param = self.output_parameter();
+        writeln!(w, "{} {};", output_param.type_, output_param.name)?;
         Ok(())
     }
 
     fn kernel_body(
         &self,
         w: &mut IndentedWrite,
-        f: &dyn Fn(&mut IndentedWrite) -> fmt::Result,
-    ) -> fmt::Result {
-        assert_eq!(self.output_type(), self.child.output_type());
-        let in_var = self.child.output_var();
-        let out_var = self.tmp_var();
+        f: &dyn Fn(
+            &mut IndentedWrite,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error> {
+        let output_param = self.output_parameter();
+        let child_output_type = self.child.output_parameter().type_;
+        assert_eq!(output_param.type_, child_output_type);
         let ref_var = self.ref_var();
-        self.child.kernel_body(w, &|w: &mut IndentedWrite| {
-            writeln!(w, "{out_var} = {in_var} + {ref_var};")?;
-            f(w)
-        })
+        self.child
+            .kernel_body(w, &move |w: &mut IndentedWrite, in_: GPUKernelParameter| {
+                let in_var = in_.name;
+                writeln!(
+                    w,
+                    "{out_var} = {in_var} + {ref_var};",
+                    out_var = output_param.name
+                )?;
+                f(w, output_param.clone())
+            })
     }
 
-    fn output_var(&self) -> String {
-        self.tmp_var()
+    fn output_parameter(&self) -> GPUKernelParameter {
+        GPUKernelParameter {
+            name: self.tmp_var(),
+            type_: CUDAType::from(P::PTYPE).to_string(),
+        }
     }
 
     fn output_type(&self) -> PType {
diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs
index 60d0740a1ca..cb08e6814c0 100644
--- a/vortex-gpu/src/jit/convert.rs
+++ b/vortex-gpu/src/jit/convert.rs
@@ -11,24 +11,29 @@ use vortex_fastlanes::{BitPackedVTable, FoRVTable};
 use crate::jit::arrays::{alp, bitpack, for_};
 use crate::jit::{GPUPipelineJIT, StepIdAllocator};
 
-pub fn new_jit_array(a: &ArrayRef, stream: &Arc<CudaStream>) -> Box<dyn GPUPipelineJIT> {
-    handle_array(a, stream, &mut StepIdAllocator::default())
+pub fn new_jit_array(
+    a: &ArrayRef,
+    stream: &Arc<CudaStream>,
+    output_array: String,
+) -> Box<dyn GPUPipelineJIT> {
+    handle_array(a, stream, &mut StepIdAllocator::default(), output_array)
 }
 
 pub fn handle_array(
     a: &ArrayRef,
     stream: &Arc<CudaStream>,
     allocator: &mut StepIdAllocator,
+    output_array: String,
 ) -> Box<dyn GPUPipelineJIT> {
     if let Some(alp) = a.as_opt::<ALPVTable>() {
-        return alp::new_jit(alp, stream, allocator);
+        return alp::new_jit(alp, stream, allocator, output_array);
     }
     if let Some(bp) = a.as_opt::<BitPackedVTable>() {
-        return bitpack::new_jit(bp, stream, allocator);
+        return bitpack::new_jit(bp, stream, allocator, output_array);
     };
 
     if let Some(for_) = a.as_opt::<FoRVTable>() {
-        return for_::new_jit(for_, stream, allocator);
+        return for_::new_jit(for_, stream, allocator, output_array);
     }
 
     todo!("unimplemented jit for {}", a.encoding_id())
diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs
index d4420ac8002..ce19006468f 100644
--- a/vortex-gpu/src/jit/kernel_fmt.rs
+++ b/vortex-gpu/src/jit/kernel_fmt.rs
@@ -8,7 +8,6 @@ use cudarc::driver::{CudaContext, CudaFunction};
 use vortex_error::{VortexExpect, VortexResult, vortex_err};
 
 use crate::indent::{IndentedWrite, IndentedWriter};
-use crate::jit::type_::CUDAType;
 use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor};
 
 struct DeclPrinter<'a, 'b: 'a> {
@@ -46,11 +45,15 @@ fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult<Vec<GPUKernelPara
     Ok(params.params)
 }
 
-pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) -> VortexResult<()> {
+pub fn create_kernel_str(
+    w: &mut IndentedWrite,
+    output: &dyn GPUPipelineJIT,
+    kernel_out_array: &str,
+) -> VortexResult<()> {
     let mut params = collect_in_param(output)?;
     params.push(GPUKernelParameter {
         name: "_output".to_string(),
-        type_: format!("{} *__restrict__", CUDAType::from(output.output_type())),
+        type_: format!("{} *__restrict__", output.output_parameter().type_),
     });
 
     (|| {
@@ -74,22 +77,22 @@ pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) ->
             writeln!(
                 w,
                 "{output_type} *output = _output + (blockIdx.x * 1024);",
-                output_type = CUDAType::from(output.output_type())
+                output_type = output.output_parameter().type_
             )?;
 
-            writeln!(w, "__shared__ float s_output[1024];")?;
+            writeln!(w, "__shared__ float {kernel_out_array}[1024];")?;
 
             write_kernel_declarations(w, output);
             writeln!(w)?;
-            output.kernel_body(w, &|w: &mut IndentedWrite| {
-                writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var())
+            output.kernel_body(w, &|_w: &mut IndentedWrite, in_: GPUKernelParameter| {
+                Ok(in_)
             })?;
             writeln!(w)?;
 
             writeln!(w, "for (int i = 0; i < 32; i++) {{")?;
             w.indent(|w| {
                 writeln!(w, "auto idx = i * 32 + threadIdx.x;")?;
-                writeln!(w, "output[idx] = s_output[idx];")
+                writeln!(w, "output[idx] = {kernel_out_array}[idx];")
             })?;
             writeln!(w, "}}")
         })?;
@@ -102,13 +105,15 @@ pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) ->
 pub fn create_kernel(
     ctx: Arc<CudaContext>,
     array: &dyn GPUPipelineJIT,
+    kernel_out_array: &str,
 ) -> VortexResult<CudaFunction> {
     let mut s = String::new();
     let w = &mut s as &mut dyn Write;
     let mut ind = IndentedWriter::new(w);
     let w = &mut ind;
 
-    create_kernel_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"))?;
+    create_kernel_str(w, array, kernel_out_array)
+        .map_err(|e| vortex_err!("jit str cannot fail {e}"))?;
 
     let module =
         cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?;
diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index 497543e7f1e..27ff0ce377c 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -85,14 +85,15 @@ pub trait GPUPipelineJIT {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result;
+        f: &dyn Fn(
+            &mut IndentedWriter<&mut dyn Write>,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error>;
 
-    /// Returns the output variable name (e.g., "tmp0") that this step produces.
-    /// Parent steps read this variable to consume the output.
-    fn output_var(&self) -> String;
+    /// Returns the name+type of the output variable
+    fn output_parameter(&self) -> GPUKernelParameter;
 
-    /// Returns the type of the output variable
     fn output_type(&self) -> PType;
 
     /// Visits child steps in the pipeline tree
@@ -112,10 +113,14 @@ pub trait ScalarGPUPipelineJIT {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result;
+        f: &dyn Fn(
+            &mut IndentedWriter<&mut dyn Write>,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error>;
 
-    fn output_var(&self) -> String;
+    /// Returns the name+type of the output variable
+    fn output_parameter(&self) -> GPUKernelParameter;
 
     fn output_type(&self) -> PType;
 
@@ -139,6 +144,7 @@ pub trait GPUVisitor<'a> {
     fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>;
 }
 
+#[derive(Clone)]
 pub struct GPUKernelParameter {
     name: String,
     type_: String,
@@ -168,13 +174,16 @@ impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result,
-    ) -> fmt::Result {
-        self.inner.kernel_body(w, f)
+        f: &dyn Fn(
+            &mut IndentedWriter<&mut dyn Write>,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error> {
+        self.inner.kernel_body(w, &f)
     }
 
-    fn output_var(&self) -> String {
-        self.inner.output_var()
+    fn output_parameter(&self) -> GPUKernelParameter {
+        self.inner.output_parameter()
     }
 
     fn output_type(&self) -> PType {
diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs
index 85ce12b92f5..242fae84bd7 100644
--- a/vortex-gpu/src/jit/run.rs
+++ b/vortex-gpu/src/jit/run.rs
@@ -23,17 +23,20 @@ pub fn create_run_jit_kernel(
 ) -> VortexResult<(ArrayRef, Duration)> {
     let stream = ctx.default_stream();
 
-    let output = new_jit_array(array, &stream);
-    let kernel = create_kernel(ctx.clone(), output.as_ref())?;
+    let kernel_output_arr_name = "s_output";
+    let output = new_jit_array(array, &stream, kernel_output_arr_name.to_string());
+    let kernel = create_kernel(ctx.clone(), output.as_ref(), kernel_output_arr_name)?;
 
     let num_chunks =
         u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements");
 
     let mut launch_builder = stream.launch_builder(&kernel);
 
+    let config = output.launch_config();
+
     let launch_config = LaunchConfig {
         grid_dim: (num_chunks, 1, 1),
-        block_dim: (output.launch_config().block_width, 1, 1),
+        block_dim: (config.block_width, 1, 1),
         shared_mem_bytes: u32::try_from(output.output_type().byte_width())
             .vortex_expect("oversized output type byte width")
             * 1024,

From 8fd80c1f9577ece9987286e9b26e29a1b96437c9 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 16 Oct 2025 12:08:00 -0400
Subject: [PATCH 13/15] fixup

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/mod.rs | 51 +++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index 27ff0ce377c..a8ade0de4cd 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -28,23 +28,48 @@
 //! The `output_var()` method returns the variable name (e.g., "tmp2") that subsequent
 //! steps or the final output can read from.
 //!
-//! ## Writing to Final Output
+//! ## Final Kernel Result
 //!
-//! Each step computes a value and passes `out_idx` to its continuation:
-//! - **out_idx**: The index in the output array where this value should be written
-//! - The innermost step calculates `out_idx` based on thread/block layout
-//! - Parent steps pass this index through to their continuation function
-//! - The root continuation writes: `output[out_idx] = <final_output_var>`
+//! The composed kernel computes the final value and writes it to the output array:
 //!
-//! Example flow:
 //! ```cuda
-//! // BitPack calculates out_idx
-//! out_idx = INDEX(row, lane);
-//! // Then calls continuation with out_idx available
-//! // FoR does: tmp1 = tmp0 + ref0; calls its continuation
-//! // ALP does: tmp2 = tmp1 * f2 * e2; calls its continuation
-//! // Final: output[out_idx] = tmp2;
+//! // BitPack unpacks data
+//! tmp0 = unpack(...)
+//! // FoR adds reference value
+//! tmp1 = tmp0 + ref0
+//! // ALP scales the result
+//! tmp2 = tmp1 * scale
+//! // Final write to output
+//! output[out_idx] = tmp2
+//! ```
+//!
+//! ## Continuation-Based Composition
+//!
+//! Each step calls a continuation function after computing its output:
+//! - The continuation function receives a `GPUKernelParameter` (the child's output variable)
+//! - The step uses this variable to perform its computation
+//! - The step then calls its own continuation with its output variable
+//! - This creates a chain: innermost → ... → outermost → final write
+//!
+//! Example flow:
+//! ```text
+//! BitPack.kernel_body(w, continuation):
+//!   // Unpacks data
+//!   tmp0 = unpack(...)
+//!   continuation(w, GPUKernelParameter{name: "tmp0", type_: "int32_t"})
+//!
+//! FoR.kernel_body(w, continuation):
+//!   child_var = self.child.kernel_body(w, continuation)  // Gets "tmp0"
+//!   tmp1 = child_var + ref0
+//!   continuation(w, GPUKernelParameter{name: "tmp1", type_: "int32_t"})
+//!
+//! ALP.kernel_body(w, continuation):
+//!   child_var = self.child.kernel_body(w, continuation)  // Gets "tmp1"
+//!   tmp2 = child_var * scale
+//!   continuation(w, GPUKernelParameter{name: "tmp2", type_: "float"})
 //! ```
+//!
+//! The root continuation writes the final result to the output array.
 
 mod arrays;
 mod convert;

From 4264275bc0dd76afff10c6807005c073ab297b71 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 16 Oct 2025 12:12:53 -0400
Subject: [PATCH 14/15] fixup

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/arrays/alp.rs |  8 ++++----
 vortex-gpu/src/jit/mod.rs        | 27 ++++++++++++++-------------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
index f71d8248341..6db6ae393c0 100644
--- a/vortex-gpu/src/jit/arrays/alp.rs
+++ b/vortex-gpu/src/jit/arrays/alp.rs
@@ -17,7 +17,7 @@ use crate::jit::{
     StepIdAllocator,
 };
 
-struct ALP<A: ALPFloat> {
+struct Alp<A: ALPFloat> {
     step_id: usize,
     float_type: PType,
     child: Box<dyn GPUPipelineJIT>,
@@ -35,7 +35,7 @@ pub fn new_jit(
         let child = handle_array(alp.encoded(), stream, allocator, output_array);
         let step_id = allocator.fresh_id();
         Box::new(ScalarGPUPipelineJITNode {
-            inner: ALP {
+            inner: Alp {
                 step_id,
                 float_type: alp.ptype(),
                 child,
@@ -46,7 +46,7 @@ pub fn new_jit(
     })
 }
 
-impl<A: ALPFloat> ALP<A> {
+impl<A: ALPFloat> Alp<A> {
     fn tmp_var(&self) -> String {
         format!("tmp{}", self.step_id)
     }
@@ -60,7 +60,7 @@ impl<A: ALPFloat> ALP<A> {
     }
 }
 
-impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for ALP<A> {
+impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for Alp<A> {
     fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
         params.extend([
             GPUKernelParameter {
diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index a8ade0de4cd..c74d6ef1f6a 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -89,6 +89,16 @@ use vortex_error::VortexResult;
 
 use crate::indent::IndentedWriter;
 
+/// Type alias for the continuation function passed to `kernel_body`.
+///
+/// The continuation receives the output parameter from a child step and returns
+/// the final output parameter after all parent steps have been applied.
+pub type KernelContinuation<'a> = dyn Fn(
+    &mut IndentedWriter<&mut dyn Write>,
+    GPUKernelParameter,
+) -> Result<GPUKernelParameter, fmt::Error>
+    + 'a;
+
 /// Trait for encoding steps that can be JIT-compiled into a CUDA kernel.
 ///
 /// Each step contributes a piece of the kernel and specifies its output variable
@@ -110,10 +120,7 @@ pub trait GPUPipelineJIT {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(
-            &mut IndentedWriter<&mut dyn Write>,
-            GPUKernelParameter,
-        ) -> Result<GPUKernelParameter, fmt::Error>,
+        f: &KernelContinuation,
     ) -> Result<GPUKernelParameter, fmt::Error>;
 
     /// Returns the name+type of the output variable
@@ -138,10 +145,7 @@ pub trait ScalarGPUPipelineJIT {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(
-            &mut IndentedWriter<&mut dyn Write>,
-            GPUKernelParameter,
-        ) -> Result<GPUKernelParameter, fmt::Error>,
+        f: &KernelContinuation,
     ) -> Result<GPUKernelParameter, fmt::Error>;
 
     /// Returns the name+type of the output variable
@@ -199,12 +203,9 @@ impl<T: ScalarGPUPipelineJIT> GPUPipelineJIT for ScalarGPUPipelineJITNode<T> {
     fn kernel_body(
         &self,
         w: &mut IndentedWriter<&mut dyn Write>,
-        f: &dyn Fn(
-            &mut IndentedWriter<&mut dyn Write>,
-            GPUKernelParameter,
-        ) -> Result<GPUKernelParameter, fmt::Error>,
+        f: &KernelContinuation,
     ) -> Result<GPUKernelParameter, fmt::Error> {
-        self.inner.kernel_body(w, &f)
+        self.inner.kernel_body(w, f)
     }
 
     fn output_parameter(&self) -> GPUKernelParameter {

From a81472fa9a6c762709865cd994dc6d35aa9d7f98 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 16 Oct 2025 12:20:54 -0400
Subject: [PATCH 15/15] fixup

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-gpu/src/jit/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs
index c74d6ef1f6a..4beb334c845 100644
--- a/vortex-gpu/src/jit/mod.rs
+++ b/vortex-gpu/src/jit/mod.rs
@@ -94,9 +94,9 @@ use crate::indent::IndentedWriter;
 /// The continuation receives the output parameter from a child step and returns
 /// the final output parameter after all parent steps have been applied.
 pub type KernelContinuation<'a> = dyn Fn(
-    &mut IndentedWriter<&mut dyn Write>,
-    GPUKernelParameter,
-) -> Result<GPUKernelParameter, fmt::Error>
+        &mut IndentedWriter<&mut dyn Write>,
+        GPUKernelParameter,
+    ) -> Result<GPUKernelParameter, fmt::Error>
     + 'a;
 
 /// Trait for encoding steps that can be JIT-compiled into a CUDA kernel.