vortex-data · joseph-isaacs · Oct 16, 2025 · Oct 9, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/vortex-gpu-kernels/src/bit_unpack.rs b/vortex-gpu-kernels/src/bit_unpack.rs
@@ -157,6 +157,7 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
         writeln!(output)?;
         generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
         writeln!(output)?;
+
         generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
         writeln!(output)?;
     }

diff --git a/vortex-gpu-kernels/src/lib.rs b/vortex-gpu-kernels/src/lib.rs
@@ -5,3 +5,4 @@ mod bit_unpack;
 mod indent;
 
 pub use bit_unpack::generate_unpack;
+pub use indent::IndentedWriter;
diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml
@@ -15,9 +15,10 @@ version = { workspace = true }
 publish = false
 
 [dependencies]
-#askama = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
+itertools = { workspace = true }
 parking_lot = { workspace = true }
+vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
 vortex-buffer = { workspace = true, features = ["cuda"] }
 vortex-dict = { workspace = true }

diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs
@@ -10,12 +10,15 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m
 use cudarc::driver::CudaContext;
 use rand::prelude::StdRng;
 use rand::{Rng, SeedableRng};
-use vortex_array::{IntoArray, ToCanonical};
+use vortex_alp::{ALPArray, Exponents};
+use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical};
 use vortex_buffer::BufferMut;
 use vortex_dtype::NativePType;
 use vortex_error::VortexUnwrap;
 use vortex_fastlanes::{BitPackedArray, FoRArray};
-use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed};
+use vortex_gpu::{
+    create_run_jit_kernel, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
+};
 
 // Data sizes: 1GB, 2.5GB, 5GB, 10GB
 // These are approximate sizes in bytes, accounting for bit-packing compression
@@ -61,6 +64,32 @@ fn make_for_bitpackable_array(len: usize) -> FoRArray {
     FoRArray::try_new(bitpacked.into_array(), reference.into()).vortex_unwrap()
 }
 
+fn make_alp_array(len: usize) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(42);
+    let reference = 100i32;
+
+    // Generate values that fit in 6 bits (0-63)
+    let values = (0..len)
+        .map(|_| rng.random_range(0..64))
+        .collect::<BufferMut<i32>>()
+        .into_array()
+        .to_primitive();
+
+    // Bit-pack the values with a 6-bit width; this is the innermost encoding
+    let bitpacked = BitPackedArray::encode(values.as_ref(), 6).unwrap();
+
+    // Wrap the bit-packed array in FoR with the reference value, then wrap that in ALP
+    ALPArray::try_new(
+        FoRArray::try_new(bitpacked.into_array(), reference.into())
+            .vortex_unwrap()
+            .into_array(),
+        Exponents { e: 4, f: 5 },
+        None,
+    )
+    .vortex_unwrap()
+    .into_array()
+}
+
 fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
     let mut group = c.benchmark_group("gpu_decompress_kernel_only");
 
@@ -155,6 +184,37 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
     group.finish();
 }
 
+fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
+    let mut group = c.benchmark_group("benchmark_gpu_for_bp_jit_decompress_kernel_only");
+
+    group.sample_size(10);
+
+    for (len, label) in DATA_SIZES {
+        let len = len.next_multiple_of(1024);
+        let array = make_alp_array(len).into_array();
+
+        let ctx = CudaContext::new(0).unwrap();
+        ctx.set_blocking_synchronize().unwrap();
+
+        group.throughput(Throughput::Bytes(
+            (len * array.dtype().as_ptype().byte_width()) as u64,
+        ));
+        group.bench_with_input(BenchmarkId::new("for/jit", label), &array, |b, array| {
+            b.iter_custom(|iters| {
+                let mut total_time = Duration::ZERO;
+                for _ in 0..iters {
+                    // This only measures kernel execution time, not memory transfers
+                    let (_result, kernel_time) = create_run_jit_kernel(ctx.clone(), array).unwrap();
+                    total_time += kernel_time;
+                }
+                total_time
+            });
+        });
+    }
+
+    group.finish();
+}
+
 #[allow(dead_code)]
 fn benchmark_cpu_canonicalize(c: &mut Criterion) {
     let mut group = c.benchmark_group("cpu_canonicalize");
@@ -176,6 +236,9 @@ criterion_group!(
     benches,
     benchmark_gpu_decompress_kernel_only,
     benchmark_gpu_for_decompress_kernel_only,
-    benchmark_gpu_for_bp_fused_decompress_kernel_only
+    benchmark_gpu_for_bp_fused_decompress_kernel_only,
+    benchmark_gpu_for_bp_jit_decompress_kernel_only
 );
+
+// criterion_group!(benches, benchmark_gpu_for_bp_jit_decompress_kernel_only);
 criterion_main!(benches);
diff --git a/vortex-gpu/src/bit_unpack.rs b/vortex-gpu/src/bit_unpack.rs
@@ -163,7 +163,7 @@ pub fn new_task(
 
         let launch_config = LaunchConfig {
             grid_dim: (num_chunks, 1, 1),
-            block_dim: (if size_of::<P>() == 8 { 16 } else { 32 }, 1, 1),
+            block_dim: (if P::BITS == 64 { 16 } else { 32 }, 1, 1),
             shared_mem_bytes: 0,
         };
 

diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Write;
+
+pub struct IndentedWriter<W: Write> {
+    write: W,
+    indent: String,
+}
+
+impl<W: Write> IndentedWriter<W> {
+    pub fn new(write: W) -> Self {
+        Self {
+            write,
+            indent: "".to_string(),
+        }
+    }
+
+    pub fn indent<F>(&mut self, indented: F) -> fmt::Result
+    where
+        F: FnOnce(&mut IndentedWriter<W>) -> fmt::Result,
+    {
+        let original_ident = self.indent.clone();
+        self.indent += "    ";
+        let res = indented(self);
+        self.indent = original_ident;
+        res
+    }
+
+    pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> fmt::Result {
+        write!(self.write, "{}{}", self.indent, fmt)
+    }
+}
+
+pub type IndentedWrite<'a> = IndentedWriter<&'a mut dyn Write>;
diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
+use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype};
+use vortex_dtype::PType;
+use vortex_error::VortexResult;
+
+use crate::indent::{IndentedWrite, IndentedWriter};
+use crate::jit::convert::handle_array;
+use crate::jit::{
+    CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
+    StepIdAllocator,
+};
+
+struct Alp<A: ALPFloat> {
+    step_id: usize,
+    float_type: PType,
+    child: Box<dyn GPUPipelineJIT>,
+    f: A,
+    e: A,
+}
+
+pub fn new_jit(
+    alp: &ALPArray,
+    stream: &Arc<CudaStream>,
+    allocator: &mut StepIdAllocator,
+    output_array: String,
+) -> Box<dyn GPUPipelineJIT> {
+    match_each_alp_float_ptype!(alp.ptype(), |A| {
+        let child = handle_array(alp.encoded(), stream, allocator, output_array);
+        let step_id = allocator.fresh_id();
+        Box::new(ScalarGPUPipelineJITNode {
+            inner: Alp {
+                step_id,
+                float_type: alp.ptype(),
+                child,
+                f: A::F10[alp.exponents().f as usize],
+                e: A::IF10[alp.exponents().e as usize],
+            },
+        })
+    })
+}
+
+impl<A: ALPFloat> Alp<A> {
+    fn tmp_var(&self) -> String {
+        format!("tmp{}", self.step_id)
+    }
+
+    fn e_var(&self) -> String {
+        format!("e{}", self.step_id)
+    }
+
+    fn f_var(&self) -> String {
+        format!("f{}", self.step_id)
+    }
+}
+
+impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for Alp<A> {
+    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
+        params.extend([
+            GPUKernelParameter {
+                name: self.e_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+            GPUKernelParameter {
+                name: self.f_var(),
+                type_: CUDAType::from(A::PTYPE).to_string(),
+            },
+        ])
+    }
+
+    fn args<'a>(
+        &'a self,
+        _stream: &Arc<CudaStream>,
+        args: &mut LaunchArgs<'a>,
+    ) -> VortexResult<()> {
+        args.arg(&self.e);
+        args.arg(&self.f);
+        Ok(())
+    }
+
+    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
+        let output_cuda_type = CUDAType::from(self.float_type);
+        writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?;
+        Ok(())
+    }
+
+    fn kernel_body(
+        &self,
+        w: &mut IndentedWriter<&mut dyn Write>,
+        f: &dyn Fn(
+            &mut IndentedWriter<&mut dyn Write>,
+            GPUKernelParameter,
+        ) -> Result<GPUKernelParameter, fmt::Error>,
+    ) -> Result<GPUKernelParameter, fmt::Error> {
+        self.child
+            .kernel_body(w, &|w: &mut IndentedWrite, in_: GPUKernelParameter| {
+                let in_var = in_.name;
+                writeln!(
+                    w,
+                    "{out} = ((({type_}){in_var}) * {f}) * {e};",
+                    out = self.tmp_var(),
+                    type_ = CUDAType::from(self.float_type),
+                    f = self.f_var(),
+                    e = self.e_var(),
+                )?;
+                f(w, self.output_parameter())
+            })
+    }
+
+    fn output_parameter(&self) -> GPUKernelParameter {
+        GPUKernelParameter {
+            name: self.tmp_var(),
+            type_: CUDAType::from(self.float_type).to_string(),
+        }
+    }
+
+    fn output_type(&self) -> PType {
+        self.float_type
+    }
+
+    fn child(&self) -> &dyn GPUPipelineJIT {
+        self.child.as_ref()
+    }
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,4 @@ mod bit_unpack;
		mod indent;

		pub use bit_unpack::generate_unpack;
		pub use indent::IndentedWriter;