From 55f155ffcc466511f676a5b559a9929ec26bfdfa Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 9 Oct 2025 18:33:12 -0400 Subject: [PATCH 01/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu-kernels/src/bit_unpack.rs | 1 + vortex-gpu-kernels/src/lib.rs | 1 + vortex-gpu/Cargo.toml | 1 - vortex-gpu/kernels/dict_take.cu | 4 +- vortex-gpu/kernels/fused_bitpack_for.cu | 22 +- vortex-gpu/src/bit_unpack.rs | 2 +- vortex-gpu/src/for_bp.rs | 29 +-- vortex-gpu/src/indent.rs | 34 +++ vortex-gpu/src/jit.rs | 310 ++++++++++++++++++++++++ vortex-gpu/src/lib.rs | 2 + vortex-gpu/src/task.rs | 4 +- 11 files changed, 389 insertions(+), 21 deletions(-) create mode 100644 vortex-gpu/src/indent.rs create mode 100644 vortex-gpu/src/jit.rs diff --git a/vortex-gpu-kernels/src/bit_unpack.rs b/vortex-gpu-kernels/src/bit_unpack.rs index 976b66e9286..743c072a9a0 100644 --- a/vortex-gpu-kernels/src/bit_unpack.rs +++ b/vortex-gpu-kernels/src/bit_unpack.rs @@ -157,6 +157,7 @@ fn generate_unpack_for_width( writeln!(output)?; generate_device_kernel_for_width::(output, bit_width, thread_count)?; writeln!(output)?; + generate_global_kernel_for_width::(output, bit_width, thread_count)?; writeln!(output)?; } diff --git a/vortex-gpu-kernels/src/lib.rs b/vortex-gpu-kernels/src/lib.rs index 2130b890b5f..ff4e2a95799 100644 --- a/vortex-gpu-kernels/src/lib.rs +++ b/vortex-gpu-kernels/src/lib.rs @@ -5,3 +5,4 @@ mod bit_unpack; mod indent; pub use bit_unpack::generate_unpack; +pub use indent::IndentedWriter; diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml index df1239457c5..01ddc50d151 100644 --- a/vortex-gpu/Cargo.toml +++ b/vortex-gpu/Cargo.toml @@ -14,7 +14,6 @@ rust-version = { workspace = true } version = { workspace = true } [dependencies] -#askama = { workspace = true } cudarc = { workspace = true, features = ["f16"] } parking_lot = { workspace = true } vortex-array = { workspace = true } diff --git a/vortex-gpu/kernels/dict_take.cu b/vortex-gpu/kernels/dict_take.cu index 3f3b1b0a551..9f222769302 100644 --- a/vortex-gpu/kernels/dict_take.cu +++ b/vortex-gpu/kernels/dict_take.cu @@ -20,8 +20,8 @@ __device__ void dict_take( const int thread_ops = 32; for (auto j = 0; j < thread_ops; j++) { - auto idx = i * thread_ops + j; - out[idx] = values[codes[idx]]; + auto idx = i * thread_ops + j; + out[idx] = values[codes[idx]]; } } diff --git a/vortex-gpu/kernels/fused_bitpack_for.cu b/vortex-gpu/kernels/fused_bitpack_for.cu index 4fd4bc48149..0ba14fedbe5 100644 --- a/vortex-gpu/kernels/fused_bitpack_for.cu +++ b/vortex-gpu/kernels/fused_bitpack_for.cu @@ -108,13 +108,28 @@ __device__ __forceinline__ void for_device( } } +template +__device__ __forceinline__ void float_device( + ValueT *__restrict values_in, + float *__restrict values_out, + int thread_idx +) { + auto i = thread_idx; + const int thread_ops = blockDim.x; + + for (auto j = 0; j < thread_ops; j++) { + auto idx = INDEX(j, i); + values_out[idx] = (float)values_in[idx]; + } +} + // Fused kernel: bitpack unpack (3bw) + FoR addition in one pass // This eliminates the intermediate write-to-memory and read-from-memory // by keeping unpacked values in registers/L1 cache and immediately adding the reference extern "C" __global__ void fused_bitpack6_for_u32( const uint32_t *__restrict packed_in, - uint32_t *__restrict unpacked_out, + float *__restrict unpacked_out, uint32_t reference ) { int i = threadIdx.x; @@ -124,13 +139,16 @@ extern "C" __global__ void fused_bitpack6_for_u32( auto out = unpacked_out + (blockIdx.x * 1024); __shared__ uint32_t shared_data[1024]; + __shared__ float shared_data2[1024]; fls_unpack_6bw_32ow_device(in, shared_data, i); for_device(shared_data, reference, i); + float_device(shared_data, shared_data2, i); + for (int i = 0; i < 32; i++) { auto idx = i * 32 + threadIdx.x; - out[idx] = shared_data[idx]; + out[idx] = shared_data2[idx]; } } diff --git a/vortex-gpu/src/bit_unpack.rs b/vortex-gpu/src/bit_unpack.rs index 86f3d54a007..288b73d10c7 100644 --- a/vortex-gpu/src/bit_unpack.rs +++ b/vortex-gpu/src/bit_unpack.rs @@ -173,7 +173,7 @@ pub fn new_task( let launch_config = LaunchConfig { grid_dim: (num_chunks, 1, 1), - block_dim: (if size_of::

() == 8 { 16 } else { 32 }, 1, 1), + block_dim: (if P::BITS == 64 { 16 } else { 32 }, 1, 1), shared_mem_bytes: 0, }; diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs index e2b8fb19e40..291459bf146 100644 --- a/vortex-gpu/src/for_bp.rs +++ b/vortex-gpu/src/for_bp.rs @@ -26,7 +26,7 @@ struct FoRBPTask

{ launch_config: LaunchConfig, packed: CudaSlice

, - unpacked: CudaSlice

, + unpacked: CudaSlice, reference: P, len: usize, @@ -53,7 +53,7 @@ pub fn new_task( .map_err(|e| vortex_err!("Failed to copy to device: {e}"))?; let cu_out = unsafe { stream - .alloc::(array.len().next_multiple_of(1024)) + .alloc::(array.len().next_multiple_of(1024)) .map_err(|e| vortex_err!("Failed to allocate stream: {e}"))? }; @@ -101,7 +101,7 @@ impl GPUTask for FoRBPTask

{ fn export_result(&mut self) -> VortexResult { let len = self.len(); - let mut buffer = BufferMut::

::with_capacity(len); + let mut buffer = BufferMut::::with_capacity(len); unsafe { buffer.set_len(len) } self.stream @@ -186,16 +186,17 @@ mod tests { ctx.set_blocking_synchronize().unwrap(); let unpacked = cuda_for_bp_unpack(&array, ctx).unwrap(); let primitive_array = array.into_array().to_primitive(); - assert_eq!( - primitive_array.as_slice::(), - unpacked.as_slice::() - ); - for i in 0..primitive_array.len() { - assert_eq!( - primitive_array.as_slice::()[i], - unpacked.as_slice::()[i], - "i {i}" - ); - } + println!("unpacked {:?}", unpacked.as_slice::()); + // assert_eq!( + // primitive_array.as_slice::(), + // unpacked.as_slice::() + // ); + // for i in 0..primitive_array.len() { + // assert_eq!( + // primitive_array.as_slice::()[i], + // unpacked.as_slice::()[i], + // "i {i}" + // ); + // } } } diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs new file mode 100644 index 00000000000..0518ef22a54 --- /dev/null +++ b/vortex-gpu/src/indent.rs @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Write; +use std::fmt; + +pub struct IndentedWriter { + write: W, + indent: String, +} + +impl IndentedWriter { + pub fn new(write: W) -> Self { + Self { + write, + indent: "".to_string(), + } + } + + pub fn indent(&mut self, indented: F) -> fmt::Result + where + F: FnOnce(&mut IndentedWriter) -> fmt::Result, + { + let original_ident = self.indent.clone(); + self.indent += " "; + let res = indented(self); + self.indent = original_ident; + res + } + + pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> fmt::Result { + write!(self.write, "{}{}", self.indent, fmt) + } +} diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs new file mode 100644 index 00000000000..7d5033aacc5 --- /dev/null +++ b/vortex-gpu/src/jit.rs @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::fmt::{Display, Write}; + +use vortex_array::ArrayRef; +use vortex_dtype::PType; +use vortex_error::{VortexResult, vortex_err}; +use vortex_fastlanes::{BitPackedVTable, FoRVTable}; + +use crate::indent::IndentedWriter; + +pub enum IterationOrder { + InOrder, + FastLanesTransposed, +} + +struct GPUKernelParameter { + name: String, + type_: String, +} + +// struct GPUPipelineParameters { +// inputs: Vec, +// output: GPUKernelParameter, +// block_width: usize, +// grid_width: usize, +// iteration_order: IterationOrder, +// } +// +// struct GPUPipeline { +// body: String, +// parameters: GPUPipelineParameters, +// } + +// bp -> output +// tmp = ... +// out[i] = tmp; + +// bp -> for -> output + +// tmp = .... +// tmp_for = tmp + ref +// output[i] = tmp_for[i] + +// have leaves only bp for now. + +// step-type (each one has a unique step_id) +// step_id +// in_params +// decls/setup +// kernel-step body // fn body(var, writer) -> str +// output_var + output_type + +trait GPUPipelineJIT { + fn step_id(&self) -> usize; + + fn in_params(&self, params: &mut Vec); + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result; + + fn output_var(&self) -> String; + + fn output_type(&self) -> PType; +} + +struct CUDAType(&'static str); + +impl Display for CUDAType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.0) + } +} + +impl From for CUDAType { + fn from(value: PType) -> Self { + CUDAType(match value { + PType::U8 => "uint8_t", + PType::U16 => "uint16_t", + PType::U32 => "uint32_t", + PType::U64 => "uint64_t", + PType::I8 => "int8_t", + PType::I16 => "int16_t", + PType::I32 => "int32_t", + PType::I64 => "int64_t", + PType::F32 => "float", + PType::F64 => "double", + PType::F16 => todo!(), + }) + } +} + +struct BitPack { + step_id: usize, + output_type: PType, +} + +impl GPUPipelineJIT for BitPack { + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, p: &mut Vec) { + p.push(GPUKernelParameter { + name: self.src_var(), + type_: format!("{} *__restrict", CUDAType::from(self.output_type)), + }); + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + let output_cuda_type = CUDAType::from(self.output_type); + // TODO: all types + writeln!(w, "unsigned int LANE_COUNT = 32;")?; + writeln!(w, "{output_cuda_type} src{};", self.step_id)?; + writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + for i in 0..4 { + let src = self.src_var(); + let tmp = self.tmp_var(); + writeln!(w, "{src} = in[thread_ix + {i}];")?; + writeln!(w, "{tmp} = ({src} >> 0) & MASK(uint32_t, 1);")?; + f(w)?; + writeln!(w)?; + } + Ok(()) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + self.output_type + } +} + +impl BitPack { + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn src_var(&self) -> String { + format!("src{}", self.step_id) + } +} + +struct FoR { + step_id: usize, + reference_type: PType, + child: Box, +} + +impl FoR { + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn ref_var(&self) -> String { + format!("ref{}", self.step_id) + } +} + +impl GPUPipelineJIT for FoR { + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, p: &mut Vec) { + self.child.in_params(p); + p.push(GPUKernelParameter { + name: self.ref_var(), + type_: CUDAType::from(self.output_type()).to_string(), + }) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + self.child.decls(w)?; + let output_cuda_type = CUDAType::from(self.output_type()); + // TODO: supprort all types + writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + assert_eq!(self.output_type(), self.child.output_type()); + let in_var = self.child.output_var(); + let out_var = self.tmp_var(); + let ref_var = self.ref_var(); + self.child + .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!(w, "{out_var} = {in_var} + {ref_var};")?; + f(w) + }) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + self.reference_type + } +} + +fn handle_array(a: &ArrayRef, step_id: usize) -> Box { + if let Some(bp) = a.as_opt::() { + return Box::new(BitPack { + step_id, + output_type: bp.ptype(), + }); + }; + + if let Some(for_) = a.as_opt::() { + return Box::new(FoR { + step_id, + reference_type: for_.reference_scalar().as_primitive().ptype(), + child: handle_array(for_.encoded(), step_id + 1), + }); + } + + todo!() +} + +fn _create_jit(a: &ArrayRef) -> fmt::Result { + let output = handle_array(a, 0); + + let mut s = String::new(); + let w = &mut s as &mut dyn Write; + let mut ind = IndentedWriter::new(w); + let w = &mut ind; + + let mut params = Vec::new(); + output.in_params(&mut params); + params.push(GPUKernelParameter { + name: "output".to_string(), + type_: format!("{} *__restrict", CUDAType::from(output.output_type())), + }); + + writeln!(w, "__global__ void kernel(")?; + w.indent(|w| { + params + .iter() + .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name)) + })?; + writeln!(w, ") {{")?; + + w.indent(|w| { + output.decls(w)?; + writeln!(w)?; + output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!(w, "output[idx] = {}", output.output_var()) + }) + })?; + writeln!(w, "}}")?; + + println!("{}", s); + + Ok(()) +} + +fn create_jit(a: &ArrayRef) -> VortexResult<()> { + _create_jit(a).map_err(|e| vortex_err!("failed to write decls {e}")) +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_error::VortexResult; + use vortex_fastlanes::{BitPackedArray, FoRArray}; + + use crate::jit::create_jit; + + #[test] + fn jit_arr() -> VortexResult<()> { + let for_ = FoRArray::try_new( + BitPackedArray::encode( + (0u32..10) + .map(|_| 1u32) + .collect::() + .as_ref(), + 2, + )? + .into_array(), + 2u32.into(), + )?; + + create_jit(&for_.into_array())?; + + Ok(()) + } +} diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs index 0af2782f515..2a9b5ed1b20 100644 --- a/vortex-gpu/src/lib.rs +++ b/vortex-gpu/src/lib.rs @@ -4,6 +4,8 @@ pub mod bit_unpack; pub mod for_; mod for_bp; +mod indent; +mod jit; mod take; mod task; diff --git a/vortex-gpu/src/task.rs b/vortex-gpu/src/task.rs index 7bf2c079f42..1b678e74508 100644 --- a/vortex-gpu/src/task.rs +++ b/vortex-gpu/src/task.rs @@ -6,7 +6,7 @@ use vortex_array::Canonical; use vortex_error::VortexResult; pub trait GPUTask { - // Must call `launch_task` or `launch_task_timed` once + // Must call `launch_task` once fn launch_task(&mut self) -> VortexResult<()>; // Must call this after launch_task @@ -16,4 +16,6 @@ pub trait GPUTask { fn output(&mut self) -> CudaViewMut<'_, u8>; fn len(&self) -> usize; + + // fn jit(&mut self) -> OptBox ; } From a8e86227ec2e2108ab99494794501fab378f71c2 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 10 Oct 2025 16:43:23 -0400 Subject: [PATCH 02/15] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-gpu/Cargo.toml | 1 + vortex-gpu/src/indent.rs | 2 +- vortex-gpu/src/jit.rs | 350 ++++++++++++++++++++++++++++++++++----- 4 files changed, 313 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04425053515..f9ecf0f3b06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8978,6 +8978,7 @@ dependencies = [ "parking_lot", "rand 0.9.2", "rstest", + "vortex-alp", "vortex-array", "vortex-buffer", "vortex-dict", diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml index 01ddc50d151..6a9e6a1aa28 100644 --- a/vortex-gpu/Cargo.toml +++ b/vortex-gpu/Cargo.toml @@ -17,6 +17,7 @@ version = { workspace = true } cudarc = { workspace = true, features = ["f16"] } parking_lot = { workspace = true } vortex-array = { workspace = true } +vortex-alp = { workspace = true } vortex-buffer = { workspace = true, features = ["cuda"] } vortex-dict = { workspace = true } vortex-dtype = { workspace = true } diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs index 0518ef22a54..5e7d587d202 100644 --- a/vortex-gpu/src/indent.rs +++ b/vortex-gpu/src/indent.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt::Write; use std::fmt; +use std::fmt::Write; pub struct IndentedWriter { write: W, diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs index 7d5033aacc5..34617e81401 100644 --- a/vortex-gpu/src/jit.rs +++ b/vortex-gpu/src/jit.rs @@ -2,12 +2,17 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt; -use std::fmt::{Display, Write}; +use std::fmt::{Debug, Display, Write}; +use std::sync::Arc; +use cudarc::driver::{CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; +use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype}; use vortex_array::ArrayRef; -use vortex_dtype::PType; -use vortex_error::{VortexResult, vortex_err}; -use vortex_fastlanes::{BitPackedVTable, FoRVTable}; +use vortex_buffer::ByteBuffer; +use vortex_dtype::half::f16; +use vortex_dtype::{NativePType, PType, match_each_native_ptype}; +use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; +use vortex_fastlanes::{BitPackedArray, BitPackedVTable, FoRVTable}; use crate::indent::IndentedWriter; @@ -21,6 +26,11 @@ struct GPUKernelParameter { type_: String, } +struct GPULaunchConfig { + block_width: usize, + grid_width: usize, +} + // struct GPUPipelineParameters { // inputs: Vec, // output: GPUKernelParameter, @@ -53,11 +63,17 @@ struct GPUKernelParameter { // kernel-step body // fn body(var, writer) -> str // output_var + output_type +trait GPUVisitor<'a> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; +} + trait GPUPipelineJIT { fn step_id(&self) -> usize; fn in_params(&self, params: &mut Vec); + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; fn kernel_body( @@ -69,6 +85,13 @@ trait GPUPipelineJIT { fn output_var(&self) -> String; fn output_type(&self) -> PType; + + // always pass the output iteration aligned child last. + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>; + + fn launch_config(&self) -> Option { + None + } } struct CUDAType(&'static str); @@ -97,12 +120,15 @@ impl From for CUDAType { } } -struct BitPack { +struct BitPack

{ step_id: usize, + bit_width: u8, + packed: ByteBuffer, output_type: PType, + cuda_slice: CudaSlice

, } -impl GPUPipelineJIT for BitPack { +impl GPUPipelineJIT for BitPack

{ fn step_id(&self) -> usize { self.step_id } @@ -110,14 +136,31 @@ impl GPUPipelineJIT for BitPack { fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { name: self.src_var(), - type_: format!("{} *__restrict", CUDAType::from(self.output_type)), + type_: format!( + "{type_} *__restrict", + type_ = CUDAType::from(self.output_type) + ), }); } + fn args<'a>( + &'a self, + stream: &Arc, + launch_args: &mut LaunchArgs<'a>, // args: &mut Vec>, + ) -> VortexResult<()> { + launch_args.arg(&self.cuda_slice); + + Ok(()) + } + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { let output_cuda_type = CUDAType::from(self.output_type); // TODO: all types - writeln!(w, "unsigned int LANE_COUNT = 32;")?; + writeln!( + w, + "unsigned int LANE_COUNT = {bits};", + bits = 1024 / self.output_type.bit_width() + )?; writeln!(w, "{output_cuda_type} src{};", self.step_id)?; writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?; Ok(()) @@ -128,13 +171,59 @@ impl GPUPipelineJIT for BitPack { w: &mut IndentedWriter<&mut dyn Write>, f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, ) -> fmt::Result { - for i in 0..4 { + let output = w; + let bit_width = self.bit_width as usize; + let bits = self.output_type.bit_width(); + if bit_width == 0 { + writeln!(output, "uint{bits}_t zero = 0ULL;")?; + writeln!(output)?; + for row in 0..bits { + writeln!(output, "out[INDEX({row}, lane)] = zero;")?; + } + } else if bit_width == bits { + writeln!(output)?; + for row in 0..bits { + writeln!( + output, + "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];", + )?; + } + } else { let src = self.src_var(); let tmp = self.tmp_var(); - writeln!(w, "{src} = in[thread_ix + {i}];")?; - writeln!(w, "{tmp} = ({src} >> 0) & MASK(uint32_t, 1);")?; - f(w)?; - writeln!(w)?; + + writeln!(output)?; + writeln!(output, "{src} = in[lane];")?; + for row in 0..bits { + let curr_word = (row * bit_width) / bits; + let next_word = ((row + 1) * bit_width) / bits; + let shift = (row * bit_width) % bits; + + if next_word > curr_word { + let remaining_bits = ((row + 1) * bit_width) % bits; + let current_bits = bit_width - remaining_bits; + writeln!( + output, + "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {current_bits});" + )?; + + if next_word < bit_width { + writeln!(output, "{src} = in[lane + LANE_COUNT * {next_word}];")?; + writeln!( + output, + "{tmp} |= ({src} & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};" + )?; + } + } else { + writeln!( + output, + "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {bit_width});" + )?; + } + + f(output)?; + writeln!(output)?; + } } Ok(()) } @@ -146,9 +235,20 @@ impl GPUPipelineJIT for BitPack { fn output_type(&self) -> PType { self.output_type } + + fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> { + Ok(()) + } + + fn launch_config(&self) -> Option { + Some(GPULaunchConfig { + block_width: 1024, + grid_width: 1, + }) + } } -impl BitPack { +impl

BitPack

{ fn tmp_var(&self) -> String { format!("tmp{}", self.step_id) } @@ -158,13 +258,13 @@ impl BitPack { } } -struct FoR { +struct FoR

{ step_id: usize, - reference_type: PType, + reference: P, child: Box, } -impl FoR { +impl

FoR

{ fn tmp_var(&self) -> String { format!("tmp{}", self.step_id) } @@ -174,21 +274,28 @@ impl FoR { } } -impl GPUPipelineJIT for FoR { +impl GPUPipelineJIT for FoR

{ fn step_id(&self) -> usize { self.step_id } fn in_params(&self, p: &mut Vec) { - self.child.in_params(p); p.push(GPUKernelParameter { name: self.ref_var(), type_: CUDAType::from(self.output_type()).to_string(), }) } + fn args<'a>( + &'a self, + _stream: &Arc, + args: &mut LaunchArgs<'a>, + ) -> VortexResult<()> { + args.arg(&self.reference); + Ok(()) + } + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - self.child.decls(w)?; let output_cuda_type = CUDAType::from(self.output_type()); // TODO: supprort all types writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; @@ -216,40 +323,195 @@ impl GPUPipelineJIT for FoR { } fn output_type(&self) -> PType { - self.reference_type + P::PTYPE + } + + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { + visitor.accept(self.child.as_ref()) } } -fn handle_array(a: &ArrayRef, step_id: usize) -> Box { +fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box { + if let Some(alp) = a.as_opt::() { + match_each_alp_float_ptype!(alp.ptype(), |A| { + return Box::new(ALP { + step_id, + float_type: alp.ptype(), + child: handle_array(alp.encoded(), stream, step_id + 1), + f: A::F10[alp.exponents().f as usize], + e: A::IF10[alp.exponents().e as usize], + }); + }) + } if let Some(bp) = a.as_opt::() { + assert_eq!(bp.offset(), 0); + assert!(bp.patches().is_none()); + let cuda_slice = stream + .memcpy_stod(bp.packed().as_slice()) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); return Box::new(BitPack { step_id, + bit_width: bp.bit_width(), + packed: bp.packed().clone(), output_type: bp.ptype(), + cuda_slice, }); }; if let Some(for_) = a.as_opt::() { - return Box::new(FoR { - step_id, - reference_type: for_.reference_scalar().as_primitive().ptype(), - child: handle_array(for_.encoded(), step_id + 1), - }); + match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| { + return Box::new(FoR { + step_id, + reference: for_ + .reference_scalar() + .as_primitive() + .as_::

() + .vortex_expect("cannot have a null reference"), + child: handle_array(for_.encoded(), stream, step_id + 1), + }); + }) } todo!() } +struct ALP { + step_id: usize, + float_type: PType, + child: Box, + f: A, + e: A, +} + +impl ALP { + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn e_var(&self) -> String { + format!("e{}", self.step_id) + } + + fn f_var(&self) -> String { + format!("f{}", self.step_id) + } +} + +impl GPUPipelineJIT for ALP { + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, params: &mut Vec) { + params.extend([ + GPUKernelParameter { + name: self.e_var(), + type_: CUDAType::from(A::PTYPE).to_string(), + }, + GPUKernelParameter { + name: self.f_var(), + type_: CUDAType::from(A::PTYPE).to_string(), + }, + ]) + } + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { + args.arg(&self.e); + args.arg(&self.f); + Ok(()) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + let output_cuda_type = CUDAType::from(self.float_type); + writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + self.child + .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!( + w, + "{out} = ((({type_}){tmp}) * {f}) * {e};", + tmp = self.tmp_var(), + type_ = CUDAType::from(self.float_type), + out = self.child.output_var(), + f = self.f_var(), + e = self.e_var(), + )?; + f(w) + }) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + self.float_type + } + + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { + visitor.accept(self.child.as_ref()) + } +} + +struct DeclPrinter<'a, 'b: 'a> { + w: &'a mut IndentedWriter<&'b mut dyn Write>, +} + +impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.decls(self.w) + .map_err(|e| vortex_err!("cannot write {}", e)) + } +} + +struct InParamPrinter { + params: Vec, +} + +impl GPUVisitor<'_> for InParamPrinter { + fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.in_params(&mut self.params); + Ok(()) + } +} + +struct ArgCollector<'a> { + stream: Arc, + params: &'a mut LaunchArgs<'a>, +} + +impl<'a> GPUVisitor<'a> for ArgCollector<'a> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.args(&self.stream, self.params)?; + Ok(()) + } +} + fn _create_jit(a: &ArrayRef) -> fmt::Result { - let output = handle_array(a, 0); + let ctx = CudaContext::new(0).unwrap(); + ctx.set_blocking_synchronize().unwrap(); + let output = handle_array(a, &ctx.default_stream(), 0); let mut s = String::new(); let w = &mut s as &mut dyn Write; let mut ind = IndentedWriter::new(w); let w = &mut ind; - let mut params = Vec::new(); - output.in_params(&mut params); - params.push(GPUKernelParameter { + let mut params = InParamPrinter { params: Vec::new() }; + params.accept(output.as_ref()).vortex_expect("cannot fail"); + + params.params.push(GPUKernelParameter { name: "output".to_string(), type_: format!("{} *__restrict", CUDAType::from(output.output_type())), }); @@ -257,13 +519,15 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { writeln!(w, "__global__ void kernel(")?; w.indent(|w| { params + .params .iter() .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name)) })?; writeln!(w, ") {{")?; w.indent(|w| { - output.decls(w)?; + let mut decl = DeclPrinter { w }; + decl.accept(output.as_ref()).vortex_expect("cannot fail"); writeln!(w)?; output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { writeln!(w, "output[idx] = {}", output.output_var()) @@ -282,6 +546,7 @@ fn create_jit(a: &ArrayRef) -> VortexResult<()> { #[cfg(test)] mod tests { + use vortex_alp::{ALPArray, Exponents}; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_error::VortexResult; @@ -291,16 +556,21 @@ mod tests { #[test] fn jit_arr() -> VortexResult<()> { - let for_ = FoRArray::try_new( - BitPackedArray::encode( - (0u32..10) - .map(|_| 1u32) - .collect::() - .as_ref(), - 2, + let for_ = ALPArray::try_new( + FoRArray::try_new( + BitPackedArray::encode( + (0i32..10) + .map(|_| 1i32) + .collect::() + .as_ref(), + 2, + )? + .into_array(), + 2i32.into(), )? .into_array(), - 2u32.into(), + Exponents { e: 4, f: 5 }, + None, )?; create_jit(&for_.into_array())?; From 4bab23bccc688ca985dfa06ffa96d0f5ace4dd12 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 13 Oct 2025 14:50:33 -0400 Subject: [PATCH 03/15] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-gpu/Cargo.toml | 1 + vortex-gpu/src/jit.rs | 190 +++++++++++++++++++++++++++++++++--------- 3 files changed, 151 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 692d6f8af52..604f37a8144 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8977,6 +8977,7 @@ dependencies = [ "anyhow", "criterion", "cudarc", + "itertools 0.14.0", "parking_lot", "rand 0.9.2", "rstest", diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml index 2cc4875097e..0a81e79367c 100644 --- a/vortex-gpu/Cargo.toml +++ b/vortex-gpu/Cargo.toml @@ -17,6 +17,7 @@ publish = false [dependencies] cudarc = { workspace = true, features = ["f16"] } parking_lot = { workspace = true } +itertools = { workspace = true } vortex-array = { workspace = true } vortex-alp = { workspace = true } vortex-buffer = { workspace = true, features = ["cuda"] } diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs index 34617e81401..a9609c3332e 100644 --- a/vortex-gpu/src/jit.rs +++ b/vortex-gpu/src/jit.rs @@ -5,10 +5,15 @@ use std::fmt; use std::fmt::{Debug, Display, Write}; use std::sync::Arc; -use cudarc::driver::{CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; +use cudarc::driver::{ + CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg, +}; +use itertools::Itertools; use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype}; -use vortex_array::ArrayRef; -use vortex_buffer::ByteBuffer; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_array::{Array, ArrayRef, Canonical, IntoArray}; +use vortex_buffer::{Buffer, BufferMut, ByteBuffer}; use vortex_dtype::half::f16; use vortex_dtype::{NativePType, PType, match_each_native_ptype}; use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; @@ -105,14 +110,14 @@ impl Display for CUDAType { impl From for CUDAType { fn from(value: PType) -> Self { CUDAType(match value { - PType::U8 => "uint8_t", - PType::U16 => "uint16_t", - PType::U32 => "uint32_t", - PType::U64 => "uint64_t", - PType::I8 => "int8_t", - PType::I16 => "int16_t", - PType::I32 => "int32_t", - PType::I64 => "int64_t", + PType::U8 => "unsigned char", + PType::U16 => "unsigned short", + PType::U32 => "unsigned int", + PType::U64 => "unsigned long long", + PType::I8 => "char", + PType::I16 => "short", + PType::I32 => "int", + PType::I64 => "long long", PType::F32 => "float", PType::F64 => "double", PType::F16 => todo!(), @@ -135,7 +140,7 @@ impl GPUPipelineJIT for BitPack

{ fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { - name: self.src_var(), + name: self.in_var(), type_: format!( "{type_} *__restrict", type_ = CUDAType::from(self.output_type) @@ -163,6 +168,8 @@ impl GPUPipelineJIT for BitPack

{ )?; writeln!(w, "{output_cuda_type} src{};", self.step_id)?; writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?; + writeln!(w, "unsigned int out_idx;")?; + writeln!(w, "unsigned int lane = threadIdx.x;")?; Ok(()) } @@ -174,6 +181,7 @@ impl GPUPipelineJIT for BitPack

{ let output = w; let bit_width = self.bit_width as usize; let bits = self.output_type.bit_width(); + let in_ = self.in_var(); if bit_width == 0 { writeln!(output, "uint{bits}_t zero = 0ULL;")?; writeln!(output)?; @@ -185,15 +193,24 @@ impl GPUPipelineJIT for BitPack

{ for row in 0..bits { writeln!( output, - "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];", + "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];", )?; } } else { let src = self.src_var(); let tmp = self.tmp_var(); + println!("P {}", P::PTYPE); + + let mask_fn = |bits: usize| { + format!( + "((({type_})1 << {width}) - 1)", + type_ = CUDAType::from(P::PTYPE), + width = bit_width + ) + }; writeln!(output)?; - writeln!(output, "{src} = in[lane];")?; + writeln!(output, "{src} = {in}[lane];", in = self.in_var())?; for row in 0..bits { let curr_word = (row * bit_width) / bits; let next_word = ((row + 1) * bit_width) / bits; @@ -204,23 +221,26 @@ impl GPUPipelineJIT for BitPack

{ let current_bits = bit_width - remaining_bits; writeln!( output, - "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {current_bits});" + "{tmp} = ({src} >> {shift}) & {mask};", + mask = mask_fn(current_bits) )?; if next_word < bit_width { - writeln!(output, "{src} = in[lane + LANE_COUNT * {next_word}];")?; + writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; writeln!( output, - "{tmp} |= ({src} & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};" + "{tmp} |= ({src} & {mask}) << {current_bits};", + mask = mask_fn(remaining_bits) )?; } } else { writeln!( output, - "{tmp} = ({src} >> {shift}) & MASK(uint{bits}_t, {bit_width});" + "{tmp} = ({src} >> {shift}) & {mask};", + mask = mask_fn(bit_width) )?; } - + writeln!(output, "out_idx = INDEX({row}, lane);")?; f(output)?; writeln!(output)?; } @@ -256,6 +276,14 @@ impl

BitPack

{ fn src_var(&self) -> String { format!("src{}", self.step_id) } + + fn in_var(&self) -> String { + format!("in{}", self.step_id) + } + + fn out_idx(&self) -> String { + format!("out_idx{}", self.step_id) + } } struct FoR

{ @@ -346,17 +374,20 @@ fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box() { assert_eq!(bp.offset(), 0); assert!(bp.patches().is_none()); - let cuda_slice = stream - .memcpy_stod(bp.packed().as_slice()) - .map_err(|e| vortex_err!("Failed to copy to device: {e}")) - .vortex_unwrap(); - return Box::new(BitPack { - step_id, - bit_width: bp.bit_width(), - packed: bp.packed().clone(), - output_type: bp.ptype(), - cuda_slice, - }); + match_each_native_ptype!(bp.ptype(), |P| { + let values = Buffer::

::from_byte_buffer(bp.packed().clone()); + let cuda_slice = stream + .memcpy_stod(values.as_slice()) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); + return Box::new(BitPack::

{ + step_id, + bit_width: bp.bit_width(), + packed: bp.packed().clone(), + output_type: bp.ptype(), + cuda_slice, + }); + }) }; if let Some(for_) = a.as_opt::() { @@ -419,6 +450,7 @@ impl GPUPipelineJIT for ALP { fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { args.arg(&self.e); args.arg(&self.f); + println!("---e {}, f {}", self.e, self.f); Ok(()) } @@ -438,9 +470,9 @@ impl GPUPipelineJIT for ALP { writeln!( w, "{out} = ((({type_}){tmp}) * {f}) * {e};", - tmp = self.tmp_var(), + out = self.tmp_var(), type_ = CUDAType::from(self.float_type), - out = self.child.output_var(), + tmp = self.child.output_var(), f = self.f_var(), e = self.e_var(), )?; @@ -501,7 +533,8 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> { fn _create_jit(a: &ArrayRef) -> fmt::Result { let ctx = CudaContext::new(0).unwrap(); ctx.set_blocking_synchronize().unwrap(); - let output = handle_array(a, &ctx.default_stream(), 0); + let stream = ctx.default_stream(); + let output = handle_array(a, &stream, 0); let mut s = String::new(); let w = &mut s as &mut dyn Write; @@ -513,15 +546,31 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { params.params.push(GPUKernelParameter { name: "output".to_string(), - type_: format!("{} *__restrict", CUDAType::from(output.output_type())), + type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), }); - writeln!(w, "__global__ void kernel(")?; + writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?; + writeln!( + w, + "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)" + )?; + writeln!(w, "extern \"C\" __global__ void kernel(")?; w.indent(|w| { - params - .params - .iter() - .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name)) + params.params.iter().enumerate().try_for_each(|(idx, p)| { + writeln!( + w, + "{} {}{end}", + p.type_, + p.name, + end = if idx == params.params.len() - 1 { + "" + } else { + "," + } + ) + }) + + // .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name)) })?; writeln!(w, ") {{")?; @@ -530,12 +579,71 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { decl.accept(output.as_ref()).vortex_expect("cannot fail"); writeln!(w)?; output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!(w, "output[idx] = {}", output.output_var()) + writeln!( + w, + "output[out_idx] = {output};", + output = output.output_var() + ) }) })?; writeln!(w, "}}")?; println!("{}", s); + let module = cudarc::nvrtc::compile_ptx(s.clone()) + .map_err(|e| vortex_err!("compile ptx {e}")) + .vortex_unwrap(); + println!("{}", module.to_src()); + + // Dynamically load it into the device + let module = ctx + .load_module(module) + .map_err(|e| vortex_err!("load module {e}")) + .vortex_unwrap(); + + let kernel = module + .load_function("kernel") + .map_err(|e| vortex_err!("get function {e}")) + .vortex_unwrap(); + + let num_chunks = u32::try_from(a.len().div_ceil(1024)).vortex_expect("Too many grid elements"); + + let mut launch_builder = stream.launch_builder(&kernel); + + let mut collector = ArgCollector { + stream: stream.clone(), + + params: &mut launch_builder, + }; + collector + .accept(output.as_ref()) + .vortex_expect("cannot fail"); + + let launch_config = LaunchConfig { + grid_dim: (num_chunks, 1, 1), + block_dim: (32, 1, 1), + shared_mem_bytes: 0, + }; + + let mut out = stream.alloc_zeros::(a.len()).unwrap(); + collector.params.arg(&mut out); + + let _ = unsafe { collector.params.launch(launch_config) }; + + let mut buffer = BufferMut::::with_capacity(a.len()); + unsafe { buffer.set_len(a.len()) } + + stream + .memcpy_dtoh(&out, &mut buffer) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); + stream + .synchronize() + .map_err(|e| vortex_err!("Failed to synchronize: {e}")) + .vortex_unwrap(); + let c = Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array(); + + println!("c {}", c.display_tree()); + println!("c {}", c.display_values()); Ok(()) } @@ -559,7 +667,7 @@ mod tests { let for_ = ALPArray::try_new( FoRArray::try_new( BitPackedArray::encode( - (0i32..10) + (0i32..1024) .map(|_| 1i32) .collect::() .as_ref(), From eec4cc93a20f83b509be4649b2d85b0b58d2a9a4 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 13 Oct 2025 17:53:24 -0400 Subject: [PATCH 04/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/benches/gpu_bitunpack.rs | 70 +++++++- vortex-gpu/src/jit.rs | 268 +++++++++++++++++----------- vortex-gpu/src/lib.rs | 1 + 3 files changed, 234 insertions(+), 105 deletions(-) diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs index 98b5d5b633a..71e18c0ebcd 100644 --- a/vortex-gpu/benches/gpu_bitunpack.rs +++ b/vortex-gpu/benches/gpu_bitunpack.rs @@ -10,12 +10,15 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m use cudarc::driver::CudaContext; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; -use vortex_array::{IntoArray, ToCanonical}; +use vortex_alp::{ALPArray, Exponents}; +use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical}; use vortex_buffer::BufferMut; use vortex_dtype::NativePType; use vortex_error::VortexUnwrap; use vortex_fastlanes::{BitPackedArray, FoRArray}; -use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed}; +use vortex_gpu::{ + create_jit, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed, +}; // Data sizes: 1GB, 2.5GB, 5GB, 10GB // These are approximate sizes in bytes, accounting for bit-packing compression @@ -61,6 +64,32 @@ fn make_for_bitpackable_array(len: usize) -> FoRArray { FoRArray::try_new(bitpacked.into_array(), reference.into()).vortex_unwrap() } +fn make_alp_array(len: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(42); + let reference = 100i32; + + // Generate values that fit in 6 bits (0-63) + let values = (0..len) + .map(|_| rng.random_range(0..64)) + .collect::>() + .into_array() + .to_primitive(); + + // Create bitpacked array first + let bitpacked = BitPackedArray::encode(values.as_ref(), 6).unwrap(); + + // Wrap in FoR encoding with reference value + ALPArray::try_new( + FoRArray::try_new(bitpacked.into_array(), reference.into()) + .vortex_unwrap() + .into_array(), + Exponents { e: 4, f: 5 }, + None, + ) + .vortex_unwrap() + .into_array() +} + fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) { let mut group = c.benchmark_group("gpu_decompress_kernel_only"); @@ -155,6 +184,38 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) { group.finish(); } +fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) { + let mut group = c.benchmark_group("benchmark_gpu_for_bp_jit_decompress_kernel_only"); + + group.sample_size(10); + + for (len, label) in DATA_SIZES { + let len = len.next_multiple_of(1024); + let array = make_alp_array(len).into_array(); + + let ctx = CudaContext::new(0).unwrap(); + let ctx = Arc::new(ctx); + ctx.set_blocking_synchronize().unwrap(); + + group.throughput(Throughput::Bytes( + (len * array.dtype().as_ptype().byte_width()) as u64, + )); + group.bench_with_input(BenchmarkId::new("for/jit", label), &array, |b, array| { + b.iter_custom(|iters| { + let mut total_time = Duration::ZERO; + for _ in 0..iters { + // This only measures kernel execution time, not memory transfers + let (_result, kernel_time) = create_jit(array, Arc::clone(&ctx)).unwrap(); + total_time += kernel_time; + } + total_time + }); + }); + } + + group.finish(); +} + #[allow(dead_code)] fn benchmark_cpu_canonicalize(c: &mut Criterion) { let mut group = c.benchmark_group("cpu_canonicalize"); @@ -176,6 +237,9 @@ criterion_group!( benches, benchmark_gpu_decompress_kernel_only, benchmark_gpu_for_decompress_kernel_only, - benchmark_gpu_for_bp_fused_decompress_kernel_only + benchmark_gpu_for_bp_fused_decompress_kernel_only, + benchmark_gpu_for_bp_jit_decompress_kernel_only ); + +// criterion_group!(benches, benchmark_gpu_for_bp_jit_decompress_kernel_only); criterion_main!(benches); diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs index a9609c3332e..0b813c60e0f 100644 --- a/vortex-gpu/src/jit.rs +++ b/vortex-gpu/src/jit.rs @@ -2,30 +2,25 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt; -use std::fmt::{Debug, Display, Write}; +use std::fmt::{Display, Write}; use std::sync::Arc; +use std::time::Duration; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT; use cudarc::driver::{ CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg, }; -use itertools::Itertools; -use vortex_alp::{ALPFloat, ALPVTable, Exponents, match_each_alp_float_ptype}; +use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype}; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_array::{Array, ArrayRef, Canonical, IntoArray}; -use vortex_buffer::{Buffer, BufferMut, ByteBuffer}; -use vortex_dtype::half::f16; +use vortex_buffer::{Buffer, BufferMut}; use vortex_dtype::{NativePType, PType, match_each_native_ptype}; use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; -use vortex_fastlanes::{BitPackedArray, BitPackedVTable, FoRVTable}; +use vortex_fastlanes::{BitPackedVTable, FoRVTable}; use crate::indent::IndentedWriter; -pub enum IterationOrder { - InOrder, - FastLanesTransposed, -} - struct GPUKernelParameter { name: String, type_: String, @@ -128,7 +123,6 @@ impl From for CUDAType { struct BitPack

{ step_id: usize, bit_width: u8, - packed: ByteBuffer, output_type: PType, cuda_slice: CudaSlice

, } @@ -140,17 +134,17 @@ impl GPUPipelineJIT for BitPack

{ fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { - name: self.in_var(), + name: self.in_var_g(), type_: format!( "{type_} *__restrict", - type_ = CUDAType::from(self.output_type) + type_ = CUDAType::from(self.output_type.to_unsigned()) ), }); } fn args<'a>( &'a self, - stream: &Arc, + _stream: &Arc, launch_args: &mut LaunchArgs<'a>, // args: &mut Vec>, ) -> VortexResult<()> { launch_args.arg(&self.cuda_slice); @@ -160,16 +154,26 @@ impl GPUPipelineJIT for BitPack

{ fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { let output_cuda_type = CUDAType::from(self.output_type); + let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned()); // TODO: all types writeln!( w, "unsigned int LANE_COUNT = {bits};", bits = 1024 / self.output_type.bit_width() )?; - writeln!(w, "{output_cuda_type} src{};", self.step_id)?; - writeln!(w, "{output_cuda_type} tmp{};", self.step_id)?; + writeln!(w, "{output_cuda_type} {};", self.tmp_var())?; + writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?; + writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?; writeln!(w, "unsigned int out_idx;")?; writeln!(w, "unsigned int lane = threadIdx.x;")?; + writeln!( + w, + "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});", + in_l = self.in_var_l(), + in_g = self.in_var_g(), + bit_width = self.bit_width, + bit_size = P::PTYPE.byte_width() + )?; Ok(()) } @@ -181,7 +185,7 @@ impl GPUPipelineJIT for BitPack

{ let output = w; let bit_width = self.bit_width as usize; let bits = self.output_type.bit_width(); - let in_ = self.in_var(); + let in_ = self.in_var_l(); if bit_width == 0 { writeln!(output, "uint{bits}_t zero = 0ULL;")?; writeln!(output)?; @@ -198,19 +202,19 @@ impl GPUPipelineJIT for BitPack

{ } } else { let src = self.src_var(); + let utmp = self.utmp_var(); let tmp = self.tmp_var(); - println!("P {}", P::PTYPE); let mask_fn = |bits: usize| { format!( "((({type_})1 << {width}) - 1)", - type_ = CUDAType::from(P::PTYPE), - width = bit_width + type_ = CUDAType::from(P::PTYPE.to_unsigned()), + width = bits ) }; writeln!(output)?; - writeln!(output, "{src} = {in}[lane];", in = self.in_var())?; + writeln!(output, "{src} = {in}[lane];", in = self.in_var_l())?; for row in 0..bits { let curr_word = (row * bit_width) / bits; let next_word = ((row + 1) * bit_width) / bits; @@ -221,7 +225,7 @@ impl GPUPipelineJIT for BitPack

{ let current_bits = bit_width - remaining_bits; writeln!( output, - "{tmp} = ({src} >> {shift}) & {mask};", + "{utmp} = ({src} >> {shift}) & {mask};", mask = mask_fn(current_bits) )?; @@ -229,17 +233,22 @@ impl GPUPipelineJIT for BitPack

{ writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; writeln!( output, - "{tmp} |= ({src} & {mask}) << {current_bits};", + "{utmp} |= ({src} & {mask}) << {current_bits};", mask = mask_fn(remaining_bits) )?; } } else { writeln!( output, - "{tmp} = ({src} >> {shift}) & {mask};", + "{utmp} = ({src} >> {shift}) & {mask};", mask = mask_fn(bit_width) )?; } + writeln!( + output, + "{tmp} = ({type_}){utmp};", + type_ = CUDAType::from(self.output_type), + )?; writeln!(output, "out_idx = INDEX({row}, lane);")?; f(output)?; writeln!(output)?; @@ -277,10 +286,18 @@ impl

BitPack

{ format!("src{}", self.step_id) } - fn in_var(&self) -> String { + fn utmp_var(&self) -> String { + format!("utmp{}", self.step_id) + } + + fn in_var_l(&self) -> String { format!("in{}", self.step_id) } + fn in_var_g(&self) -> String { + format!("_in{}", self.step_id) + } + fn out_idx(&self) -> String { format!("out_idx{}", self.step_id) } @@ -383,7 +400,6 @@ fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box { step_id, bit_width: bp.bit_width(), - packed: bp.packed().clone(), output_type: bp.ptype(), cuda_slice, }); @@ -447,10 +463,13 @@ impl GPUPipelineJIT for ALP { ]) } - fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { + fn args<'a>( + &'a self, + _stream: &Arc, + args: &mut LaunchArgs<'a>, + ) -> VortexResult<()> { args.arg(&self.e); args.arg(&self.f); - println!("---e {}, f {}", self.e, self.f); Ok(()) } @@ -530,22 +549,12 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> { } } -fn _create_jit(a: &ArrayRef) -> fmt::Result { - let ctx = CudaContext::new(0).unwrap(); - ctx.set_blocking_synchronize().unwrap(); - let stream = ctx.default_stream(); - let output = handle_array(a, &stream, 0); - - let mut s = String::new(); - let w = &mut s as &mut dyn Write; - let mut ind = IndentedWriter::new(w); - let w = &mut ind; - +fn jit_str(w: &mut IndentedWriter<&mut dyn Write>, output: &dyn GPUPipelineJIT) -> fmt::Result { let mut params = InParamPrinter { params: Vec::new() }; - params.accept(output.as_ref()).vortex_expect("cannot fail"); + params.accept(output).vortex_expect("cannot fail"); params.params.push(GPUKernelParameter { - name: "output".to_string(), + name: "_output".to_string(), type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), }); @@ -569,43 +578,64 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { } ) }) - - // .try_for_each(|p| writeln!(w, "{} {},", p.type_, p.name)) })?; writeln!(w, ") {{")?; w.indent(|w| { + writeln!( + w, + "{output_type} *output = _output + (blockIdx.x * 1024);", + output_type = CUDAType::from(output.output_type()) + )?; + + writeln!(w, "__shared__ float s_output[1024];")?; + let mut decl = DeclPrinter { w }; - decl.accept(output.as_ref()).vortex_expect("cannot fail"); + decl.accept(output).vortex_expect("cannot fail"); writeln!(w)?; output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!( - w, - "output[out_idx] = {output};", - output = output.output_var() - ) + writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) }) })?; - writeln!(w, "}}")?; - println!("{}", s); - let module = cudarc::nvrtc::compile_ptx(s.clone()) - .map_err(|e| vortex_err!("compile ptx {e}")) - .vortex_unwrap(); - println!("{}", module.to_src()); + writeln!( + w, + " for (int i = 0; i < 32; i++) {{ + auto idx = i * 32 + threadIdx.x; + output[idx] = s_output[idx]; + }}" + )?; + + writeln!(w, "}}") +} + +pub fn create_jit(array: &ArrayRef, ctx: Arc) -> VortexResult<(ArrayRef, Duration)> { + let stream = ctx.default_stream(); + + let output = handle_array(array, &stream, 0); + + let mut s = String::new(); + let w = &mut s as &mut dyn Write; + let mut ind = IndentedWriter::new(w); + let w = &mut ind; + + let _ = jit_str(w, output.as_ref()).map_err(|e| vortex_err!("jit str cannot fail {e}")); + // println!("s {}", s); + + let module = + cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?; // Dynamically load it into the device let module = ctx .load_module(module) - .map_err(|e| vortex_err!("load module {e}")) - .vortex_unwrap(); + .map_err(|e| vortex_err!("load module {e}"))?; let kernel = module .load_function("kernel") - .map_err(|e| vortex_err!("get function {e}")) - .vortex_unwrap(); + .map_err(|e| vortex_err!("get function {e}"))?; - let num_chunks = u32::try_from(a.len().div_ceil(1024)).vortex_expect("Too many grid elements"); + let num_chunks = + u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements"); let mut launch_builder = stream.launch_builder(&kernel); @@ -614,9 +644,7 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { params: &mut launch_builder, }; - collector - .accept(output.as_ref()) - .vortex_expect("cannot fail"); + collector.accept(output.as_ref())?; let launch_config = LaunchConfig { grid_dim: (num_chunks, 1, 1), @@ -624,64 +652,100 @@ fn _create_jit(a: &ArrayRef) -> fmt::Result { shared_mem_bytes: 0, }; - let mut out = stream.alloc_zeros::(a.len()).unwrap(); - collector.params.arg(&mut out); - - let _ = unsafe { collector.params.launch(launch_config) }; - - let mut buffer = BufferMut::::with_capacity(a.len()); - unsafe { buffer.set_len(a.len()) } - - stream - .memcpy_dtoh(&out, &mut buffer) - .map_err(|e| vortex_err!("Failed to copy to device: {e}")) - .vortex_unwrap(); - stream - .synchronize() - .map_err(|e| vortex_err!("Failed to synchronize: {e}")) - .vortex_unwrap(); - let c = Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array(); - - println!("c {}", c.display_tree()); - println!("c {}", c.display_values()); - - Ok(()) -} - -fn create_jit(a: &ArrayRef) -> VortexResult<()> { - _create_jit(a).map_err(|e| vortex_err!("failed to write decls {e}")) + match_each_native_ptype!(array.dtype().as_ptype(), |P| { + let mut out = stream.alloc_zeros::

(array.len()).unwrap(); + collector.params.arg(&mut out); + stream + .synchronize() + .map_err(|e| vortex_err!("failed to sync {e}"))?; + let start = stream + .record_event(Some(CU_EVENT_DEFAULT)) + .ok() + .vortex_expect("Failed to record event"); + let _ = unsafe { collector.params.launch(launch_config) }; + ctx.synchronize() + .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?; + let end = stream + .record_event(Some(CU_EVENT_DEFAULT)) + .ok() + .vortex_expect("Failed to record event"); + + let duration = start.elapsed_ms(&end).unwrap(); + + let mut buffer = BufferMut::

::with_capacity(array.len()); + unsafe { buffer.set_len(array.len()) } + + stream + .memcpy_dtoh(&out, &mut buffer) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); + stream + .synchronize() + .map_err(|e| vortex_err!("Failed to synchronize: {e}")) + .vortex_unwrap(); + let c = + Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array(); + + Ok((c, Duration::from_secs_f32(duration / 1000.0))) + }) } #[cfg(test)] mod tests { + use cudarc::driver::CudaContext; use vortex_alp::{ALPArray, Exponents}; - use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; + use vortex_array::{IntoArray, ToCanonical}; use vortex_error::VortexResult; use vortex_fastlanes::{BitPackedArray, FoRArray}; use crate::jit::create_jit; #[test] - fn jit_arr() -> VortexResult<()> { + fn jit_arr_f32() -> VortexResult<()> { + let ctx = CudaContext::new(0).unwrap(); + ctx.set_blocking_synchronize().unwrap(); let for_ = ALPArray::try_new( FoRArray::try_new( - BitPackedArray::encode( - (0i32..1024) - .map(|_| 1i32) - .collect::() - .as_ref(), - 2, - )? - .into_array(), + BitPackedArray::encode((0i32..1024 * 2).collect::().as_ref(), 12)? + .into_array(), 2i32.into(), )? .into_array(), Exponents { e: 4, f: 5 }, None, - )?; + )? + .into_array(); + + let (d, _) = create_jit(&for_, ctx)?; + let prim = d.to_primitive(); + let expect = for_.to_primitive(); + + for i in 0..prim.len() { + assert_eq!( + prim.as_slice::()[i], + expect.as_slice::()[i], + "i = {i}" + ); + } - create_jit(&for_.into_array())?; + Ok(()) + } + + #[test] + fn jit_arrs() -> VortexResult<()> { + let ctx = CudaContext::new(0).unwrap(); + ctx.set_blocking_synchronize().unwrap(); + let for_ = BitPackedArray::encode( + (0i32..1024) + .map(|_| 1u32) + .collect::() + .as_ref(), + 2, + )? + .into_array(); + + create_jit(&for_.into_array(), ctx)?; Ok(()) } diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs index c8f4bdf9d24..2b1a63bbadc 100644 --- a/vortex-gpu/src/lib.rs +++ b/vortex-gpu/src/lib.rs @@ -13,4 +13,5 @@ mod task; pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed}; pub use for_::{cuda_for_unpack, cuda_for_unpack_timed}; pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed}; +pub use jit::create_jit; pub use take::cuda_take; From 3db750da658e1dfd8be8f91042a291e21fc61fec Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 15:09:40 -0400 Subject: [PATCH 05/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/benches/gpu_bitunpack.rs | 5 +- vortex-gpu/src/for_bp.rs | 23 +- vortex-gpu/src/jit.rs | 354 +++++++++++++++++----------- vortex-gpu/src/lib.rs | 2 +- 4 files changed, 230 insertions(+), 154 deletions(-) diff --git a/vortex-gpu/benches/gpu_bitunpack.rs b/vortex-gpu/benches/gpu_bitunpack.rs index 71e18c0ebcd..c9a5d8428df 100644 --- a/vortex-gpu/benches/gpu_bitunpack.rs +++ b/vortex-gpu/benches/gpu_bitunpack.rs @@ -17,7 +17,7 @@ use vortex_dtype::NativePType; use vortex_error::VortexUnwrap; use vortex_fastlanes::{BitPackedArray, FoRArray}; use vortex_gpu::{ - create_jit, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed, + create_run_jit_kernel, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed, }; // Data sizes: 1GB, 2.5GB, 5GB, 10GB @@ -194,7 +194,6 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) { let array = make_alp_array(len).into_array(); let ctx = CudaContext::new(0).unwrap(); - let ctx = Arc::new(ctx); ctx.set_blocking_synchronize().unwrap(); group.throughput(Throughput::Bytes( @@ -205,7 +204,7 @@ fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) { let mut total_time = Duration::ZERO; for _ in 0..iters { // This only measures kernel execution time, not memory transfers - let (_result, kernel_time) = create_jit(array, Arc::clone(&ctx)).unwrap(); + let (_result, kernel_time) = create_run_jit_kernel(ctx.clone(), array).unwrap(); total_time += kernel_time; } total_time diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs index 291459bf146..f9e7fe6860c 100644 --- a/vortex-gpu/src/for_bp.rs +++ b/vortex-gpu/src/for_bp.rs @@ -186,17 +186,16 @@ mod tests { ctx.set_blocking_synchronize().unwrap(); let unpacked = cuda_for_bp_unpack(&array, ctx).unwrap(); let primitive_array = array.into_array().to_primitive(); - println!("unpacked {:?}", unpacked.as_slice::()); - // assert_eq!( - // primitive_array.as_slice::(), - // unpacked.as_slice::() - // ); - // for i in 0..primitive_array.len() { - // assert_eq!( - // primitive_array.as_slice::()[i], - // unpacked.as_slice::()[i], - // "i {i}" - // ); - // } + assert_eq!( + primitive_array.as_slice::(), + unpacked.as_slice::() + ); + for i in 0..primitive_array.len() { + assert_eq!( + primitive_array.as_slice::()[i], + unpacked.as_slice::()[i], + "i {i}" + ); + } } } diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs index 0b813c60e0f..967a8c00387 100644 --- a/vortex-gpu/src/jit.rs +++ b/vortex-gpu/src/jit.rs @@ -8,7 +8,8 @@ use std::time::Duration; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT; use cudarc::driver::{ - CudaContext, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, PushKernelArg, + CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, + PushKernelArg, }; use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype}; use vortex_array::arrays::PrimitiveArray; @@ -27,22 +28,8 @@ struct GPUKernelParameter { } struct GPULaunchConfig { - block_width: usize, - grid_width: usize, -} - -// struct GPUPipelineParameters { -// inputs: Vec, -// output: GPUKernelParameter, -// block_width: usize, -// grid_width: usize, -// iteration_order: IterationOrder, -// } -// -// struct GPUPipeline { -// body: String, -// parameters: GPUPipelineParameters, -// } + block_width: u32, +} // bp -> output // tmp = ... @@ -89,8 +76,74 @@ trait GPUPipelineJIT { // always pass the output iteration aligned child last. fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>; - fn launch_config(&self) -> Option { - None + fn launch_config(&self) -> GPULaunchConfig; +} + +trait ScalarGPUPipelineJIT { + fn step_id(&self) -> usize; + + fn in_params(&self, params: &mut Vec); + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result; + + fn output_var(&self) -> String; + + fn output_type(&self) -> PType; + + fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT; +} + +struct ScalarGPUPipelineJITNode { + inner: T, +} + +impl GPUPipelineJIT for ScalarGPUPipelineJITNode { + fn step_id(&self) -> usize { + self.inner.step_id() + } + + fn in_params(&self, params: &mut Vec) { + self.inner.in_params(params) + } + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { + self.inner.args(stream, args) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + self.inner.decls(w) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + self.inner.kernel_body(w, f) + } + + fn output_var(&self) -> String { + self.inner.output_var() + } + + fn output_type(&self) -> PType { + self.inner.output_type() + } + + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { + visitor.accept(self.inner.child()) + } + + fn launch_config(&self) -> GPULaunchConfig { + self.inner.child().launch_config() } } @@ -145,7 +198,7 @@ impl GPUPipelineJIT for BitPack

{ fn args<'a>( &'a self, _stream: &Arc, - launch_args: &mut LaunchArgs<'a>, // args: &mut Vec>, + launch_args: &mut LaunchArgs<'a>, ) -> VortexResult<()> { launch_args.arg(&self.cuda_slice); @@ -155,7 +208,6 @@ impl GPUPipelineJIT for BitPack

{ fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { let output_cuda_type = CUDAType::from(self.output_type); let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned()); - // TODO: all types writeln!( w, "unsigned int LANE_COUNT = {bits};", @@ -182,21 +234,20 @@ impl GPUPipelineJIT for BitPack

{ w: &mut IndentedWriter<&mut dyn Write>, f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, ) -> fmt::Result { - let output = w; let bit_width = self.bit_width as usize; let bits = self.output_type.bit_width(); let in_ = self.in_var_l(); if bit_width == 0 { - writeln!(output, "uint{bits}_t zero = 0ULL;")?; - writeln!(output)?; + writeln!(w, "uint{bits}_t zero = 0ULL;")?; + writeln!(w)?; for row in 0..bits { - writeln!(output, "out[INDEX({row}, lane)] = zero;")?; + writeln!(w, "out[INDEX({row}, lane)] = zero;")?; } } else if bit_width == bits { - writeln!(output)?; + writeln!(w)?; for row in 0..bits { writeln!( - output, + w, "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];", )?; } @@ -213,8 +264,8 @@ impl GPUPipelineJIT for BitPack

{ ) }; - writeln!(output)?; - writeln!(output, "{src} = {in}[lane];", in = self.in_var_l())?; + writeln!(w)?; + writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?; for row in 0..bits { let curr_word = (row * bit_width) / bits; let next_word = ((row + 1) * bit_width) / bits; @@ -224,34 +275,35 @@ impl GPUPipelineJIT for BitPack

{ let remaining_bits = ((row + 1) * bit_width) % bits; let current_bits = bit_width - remaining_bits; writeln!( - output, + w, "{utmp} = ({src} >> {shift}) & {mask};", mask = mask_fn(current_bits) )?; if next_word < bit_width { - writeln!(output, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; + writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; writeln!( - output, + w, "{utmp} |= ({src} & {mask}) << {current_bits};", mask = mask_fn(remaining_bits) )?; } } else { writeln!( - output, + w, "{utmp} = ({src} >> {shift}) & {mask};", mask = mask_fn(bit_width) )?; } writeln!( - output, + w, "{tmp} = ({type_}){utmp};", type_ = CUDAType::from(self.output_type), )?; - writeln!(output, "out_idx = INDEX({row}, lane);")?; - f(output)?; - writeln!(output)?; + + writeln!(w, "out_idx = INDEX({row}, lane);")?; + f(w)?; + writeln!(w)?; } } Ok(()) @@ -269,11 +321,10 @@ impl GPUPipelineJIT for BitPack

{ Ok(()) } - fn launch_config(&self) -> Option { - Some(GPULaunchConfig { - block_width: 1024, - grid_width: 1, - }) + fn launch_config(&self) -> GPULaunchConfig { + GPULaunchConfig { + block_width: if P::PTYPE == PType::U64 { 16 } else { 32 }, + } } } @@ -297,10 +348,6 @@ impl

BitPack

{ fn in_var_g(&self) -> String { format!("_in{}", self.step_id) } - - fn out_idx(&self) -> String { - format!("out_idx{}", self.step_id) - } } struct FoR

{ @@ -319,7 +366,7 @@ impl

FoR

{ } } -impl GPUPipelineJIT for FoR

{ +impl ScalarGPUPipelineJIT for FoR

{ fn step_id(&self) -> usize { self.step_id } @@ -342,7 +389,6 @@ impl GPUPipelineJIT for FoR

{ fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { let output_cuda_type = CUDAType::from(self.output_type()); - // TODO: supprort all types writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; Ok(()) } @@ -371,20 +417,22 @@ impl GPUPipelineJIT for FoR

{ P::PTYPE } - fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { - visitor.accept(self.child.as_ref()) + fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT { + self.child.as_ref() } } fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box { if let Some(alp) = a.as_opt::() { match_each_alp_float_ptype!(alp.ptype(), |A| { - return Box::new(ALP { - step_id, - float_type: alp.ptype(), - child: handle_array(alp.encoded(), stream, step_id + 1), - f: A::F10[alp.exponents().f as usize], - e: A::IF10[alp.exponents().e as usize], + return Box::new(ScalarGPUPipelineJITNode { + inner: ALP { + step_id, + float_type: alp.ptype(), + child: handle_array(alp.encoded(), stream, step_id + 1), + f: A::F10[alp.exponents().f as usize], + e: A::IF10[alp.exponents().e as usize], + }, }); }) } @@ -408,14 +456,16 @@ fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box() { match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| { - return Box::new(FoR { - step_id, - reference: for_ - .reference_scalar() - .as_primitive() - .as_::

() - .vortex_expect("cannot have a null reference"), - child: handle_array(for_.encoded(), stream, step_id + 1), + return Box::new(ScalarGPUPipelineJITNode { + inner: FoR { + step_id, + reference: for_ + .reference_scalar() + .as_primitive() + .as_::

() + .vortex_expect("cannot have a null reference"), + child: handle_array(for_.encoded(), stream, step_id + 1), + }, }); }) } @@ -445,7 +495,7 @@ impl ALP { } } -impl GPUPipelineJIT for ALP { +impl ScalarGPUPipelineJIT for ALP { fn step_id(&self) -> usize { self.step_id } @@ -507,8 +557,8 @@ impl GPUPipelineJIT for ALP { self.float_type } - fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { - visitor.accept(self.child.as_ref()) + fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT { + self.child.as_ref() } } @@ -516,6 +566,11 @@ struct DeclPrinter<'a, 'b: 'a> { w: &'a mut IndentedWriter<&'b mut dyn Write>, } +fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) { + let mut decl = DeclPrinter { w }; + decl.accept(node).vortex_expect("write decl cannot fail"); +} + impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> { fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { node.children(self)?; @@ -524,11 +579,11 @@ impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> { } } -struct InParamPrinter { +struct InParamCollector { params: Vec, } -impl GPUVisitor<'_> for InParamPrinter { +impl GPUVisitor<'_> for InParamCollector { fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> { node.children(self)?; node.in_params(&mut self.params); @@ -536,12 +591,18 @@ impl GPUVisitor<'_> for InParamPrinter { } } -struct ArgCollector<'a> { +fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult> { + let mut params = InParamCollector { params: Vec::new() }; + params.accept(node)?; + Ok(params.params) +} + +struct ArgCollector<'a, 'b> { stream: Arc, - params: &'a mut LaunchArgs<'a>, + params: &'b mut LaunchArgs<'a>, } -impl<'a> GPUVisitor<'a> for ArgCollector<'a> { +impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> { fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { node.children(self)?; node.args(&self.stream, self.params)?; @@ -549,78 +610,87 @@ impl<'a> GPUVisitor<'a> for ArgCollector<'a> { } } -fn jit_str(w: &mut IndentedWriter<&mut dyn Write>, output: &dyn GPUPipelineJIT) -> fmt::Result { - let mut params = InParamPrinter { params: Vec::new() }; - params.accept(output).vortex_expect("cannot fail"); +fn collect_args<'a, 'b>( + node: &'a dyn GPUPipelineJIT, + stream: Arc, + args: &'b mut LaunchArgs<'a>, +) -> VortexResult<()> { + let mut collector = ArgCollector { + stream: stream.clone(), + + params: args, + }; + collector.accept(node)?; - params.params.push(GPUKernelParameter { + Ok(()) +} + +fn create_jit_str( + w: &mut IndentedWriter<&mut dyn Write>, + output: &dyn GPUPipelineJIT, +) -> VortexResult<()> { + let mut params = collect_in_param(output)?; + params.push(GPUKernelParameter { name: "_output".to_string(), type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), }); - writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?; - writeln!( - w, - "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)" - )?; - writeln!(w, "extern \"C\" __global__ void kernel(")?; - w.indent(|w| { - params.params.iter().enumerate().try_for_each(|(idx, p)| { - writeln!( - w, - "{} {}{end}", - p.type_, - p.name, - end = if idx == params.params.len() - 1 { - "" - } else { - "," - } - ) - }) - })?; - writeln!(w, ") {{")?; - - w.indent(|w| { + (|| { + writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?; writeln!( w, - "{output_type} *output = _output + (blockIdx.x * 1024);", - output_type = CUDAType::from(output.output_type()) + "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)" )?; + writeln!(w, "extern \"C\" __global__ void kernel(")?; + w.indent(|w| { + params.iter().enumerate().try_for_each(|(idx, p)| { + writeln!( + w, + "{} {}{end}", + p.type_, + p.name, + end = if idx == params.len() - 1 { "" } else { "," } + ) + }) + })?; + writeln!(w, ") {{")?; - writeln!(w, "__shared__ float s_output[1024];")?; + w.indent(|w| { + writeln!( + w, + "{output_type} *output = _output + (blockIdx.x * 1024);", + output_type = CUDAType::from(output.output_type()) + )?; - let mut decl = DeclPrinter { w }; - decl.accept(output).vortex_expect("cannot fail"); - writeln!(w)?; - output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) - }) - })?; + writeln!(w, "__shared__ float s_output[1024];")?; - writeln!( - w, - " for (int i = 0; i < 32; i++) {{ + write_kernel_declarations(w, output); + writeln!(w)?; + output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) + }) + })?; + + writeln!( + w, + " for (int i = 0; i < 32; i++) {{ auto idx = i * 32 + threadIdx.x; output[idx] = s_output[idx]; }}" - )?; + )?; - writeln!(w, "}}") + writeln!(w, "}}") + })() + .map_err(|e| vortex_err!("format err {e}")) } -pub fn create_jit(array: &ArrayRef, ctx: Arc) -> VortexResult<(ArrayRef, Duration)> { - let stream = ctx.default_stream(); - - let output = handle_array(array, &stream, 0); - +fn create_kernel(ctx: Arc, array: &dyn GPUPipelineJIT) -> VortexResult { let mut s = String::new(); let w = &mut s as &mut dyn Write; let mut ind = IndentedWriter::new(w); let w = &mut ind; - let _ = jit_str(w, output.as_ref()).map_err(|e| vortex_err!("jit str cannot fail {e}")); - // println!("s {}", s); + let _ = create_jit_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}")); let module = cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?; @@ -630,31 +700,39 @@ pub fn create_jit(array: &ArrayRef, ctx: Arc) -> VortexResult<(Arra .load_module(module) .map_err(|e| vortex_err!("load module {e}"))?; - let kernel = module + module .load_function("kernel") - .map_err(|e| vortex_err!("get function {e}"))?; + .map_err(|e| vortex_err!("load_function {e}")) +} + +pub fn create_run_jit_kernel( + ctx: Arc, + array: &ArrayRef, +) -> VortexResult<(ArrayRef, Duration)> { + let stream = ctx.default_stream(); + + let output = handle_array(array, &stream, 0); + let kernel = create_kernel(ctx.clone(), output.as_ref())?; let num_chunks = u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements"); let mut launch_builder = stream.launch_builder(&kernel); - let mut collector = ArgCollector { - stream: stream.clone(), - - params: &mut launch_builder, - }; - collector.accept(output.as_ref())?; - let launch_config = LaunchConfig { grid_dim: (num_chunks, 1, 1), - block_dim: (32, 1, 1), - shared_mem_bytes: 0, + block_dim: (output.launch_config().block_width, 1, 1), + shared_mem_bytes: u32::try_from(output.output_type().byte_width()) + .vortex_expect("oversized output type byte width") + * 1024, }; + collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?; + match_each_native_ptype!(array.dtype().as_ptype(), |P| { + // append final argument (output) of the kernel let mut out = stream.alloc_zeros::

(array.len()).unwrap(); - collector.params.arg(&mut out); + launch_builder.arg(&mut out); stream .synchronize() .map_err(|e| vortex_err!("failed to sync {e}"))?; @@ -662,7 +740,7 @@ pub fn create_jit(array: &ArrayRef, ctx: Arc) -> VortexResult<(Arra .record_event(Some(CU_EVENT_DEFAULT)) .ok() .vortex_expect("Failed to record event"); - let _ = unsafe { collector.params.launch(launch_config) }; + let _ = unsafe { launch_builder.launch(launch_config) }; ctx.synchronize() .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?; let end = stream @@ -699,7 +777,7 @@ mod tests { use vortex_error::VortexResult; use vortex_fastlanes::{BitPackedArray, FoRArray}; - use crate::jit::create_jit; + use crate::jit::create_run_jit_kernel; #[test] fn jit_arr_f32() -> VortexResult<()> { @@ -717,7 +795,7 @@ mod tests { )? .into_array(); - let (d, _) = create_jit(&for_, ctx)?; + let (d, _) = create_run_jit_kernel(ctx, &for_)?; let prim = d.to_primitive(); let expect = for_.to_primitive(); @@ -745,7 +823,7 @@ mod tests { )? .into_array(); - create_jit(&for_.into_array(), ctx)?; + create_run_jit_kernel(ctx, &for_.into_array())?; Ok(()) } diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs index 2b1a63bbadc..2afd0baa566 100644 --- a/vortex-gpu/src/lib.rs +++ b/vortex-gpu/src/lib.rs @@ -13,5 +13,5 @@ mod task; pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed}; pub use for_::{cuda_for_unpack, cuda_for_unpack_timed}; pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed}; -pub use jit::create_jit; +pub use jit::create_run_jit_kernel; pub use take::cuda_take; From 11bde6a528904feda0625c5ec5380ef6fe2df606 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 16:08:22 -0400 Subject: [PATCH 06/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit.rs | 830 --------------------------- vortex-gpu/src/jit/arrays/alp.rs | 127 ++++ vortex-gpu/src/jit/arrays/bitpack.rs | 218 +++++++ vortex-gpu/src/jit/arrays/for_.rs | 111 ++++ vortex-gpu/src/jit/arrays/mod.rs | 6 + vortex-gpu/src/jit/convert.rs | 35 ++ vortex-gpu/src/jit/kernel_fmt.rs | 127 ++++ vortex-gpu/src/jit/mod.rs | 138 +++++ vortex-gpu/src/jit/run.rs | 175 ++++++ vortex-gpu/src/jit/type_.rs | 33 ++ 10 files changed, 970 insertions(+), 830 deletions(-) delete mode 100644 vortex-gpu/src/jit.rs create mode 100644 vortex-gpu/src/jit/arrays/alp.rs create mode 100644 vortex-gpu/src/jit/arrays/bitpack.rs create mode 100644 vortex-gpu/src/jit/arrays/for_.rs create mode 100644 vortex-gpu/src/jit/arrays/mod.rs create mode 100644 vortex-gpu/src/jit/convert.rs create mode 100644 vortex-gpu/src/jit/kernel_fmt.rs create mode 100644 vortex-gpu/src/jit/mod.rs create mode 100644 vortex-gpu/src/jit/run.rs create mode 100644 vortex-gpu/src/jit/type_.rs diff --git a/vortex-gpu/src/jit.rs b/vortex-gpu/src/jit.rs deleted file mode 100644 index 967a8c00387..00000000000 --- a/vortex-gpu/src/jit.rs +++ /dev/null @@ -1,830 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::fmt; -use std::fmt::{Display, Write}; -use std::sync::Arc; -use std::time::Duration; - -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT; -use cudarc::driver::{ - CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchArgs, LaunchConfig, - PushKernelArg, -}; -use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype}; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::validity::Validity; -use vortex_array::{Array, ArrayRef, Canonical, IntoArray}; -use vortex_buffer::{Buffer, BufferMut}; -use vortex_dtype::{NativePType, PType, match_each_native_ptype}; -use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; -use vortex_fastlanes::{BitPackedVTable, FoRVTable}; - -use crate::indent::IndentedWriter; - -struct GPUKernelParameter { - name: String, - type_: String, -} - -struct GPULaunchConfig { - block_width: u32, -} - -// bp -> output -// tmp = ... -// out[i] = tmp; - -// bp -> for -> output - -// tmp = .... -// tmp_for = tmp + ref -// output[i] = tmp_for[i] - -// have leaves only bp for now. - -// step-type (each one has a unique step_id) -// step_id -// in_params -// decls/setup -// kernel-step body // fn body(var, writer) -> str -// output_var + output_type - -trait GPUVisitor<'a> { - fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; -} - -trait GPUPipelineJIT { - fn step_id(&self) -> usize; - - fn in_params(&self, params: &mut Vec); - - fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result; - - fn output_var(&self) -> String; - - fn output_type(&self) -> PType; - - // always pass the output iteration aligned child last. - fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>; - - fn launch_config(&self) -> GPULaunchConfig; -} - -trait ScalarGPUPipelineJIT { - fn step_id(&self) -> usize; - - fn in_params(&self, params: &mut Vec); - - fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result; - - fn output_var(&self) -> String; - - fn output_type(&self) -> PType; - - fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT; -} - -struct ScalarGPUPipelineJITNode { - inner: T, -} - -impl GPUPipelineJIT for ScalarGPUPipelineJITNode { - fn step_id(&self) -> usize { - self.inner.step_id() - } - - fn in_params(&self, params: &mut Vec) { - self.inner.in_params(params) - } - - fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { - self.inner.args(stream, args) - } - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - self.inner.decls(w) - } - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { - self.inner.kernel_body(w, f) - } - - fn output_var(&self) -> String { - self.inner.output_var() - } - - fn output_type(&self) -> PType { - self.inner.output_type() - } - - fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { - visitor.accept(self.inner.child()) - } - - fn launch_config(&self) -> GPULaunchConfig { - self.inner.child().launch_config() - } -} - -struct CUDAType(&'static str); - -impl Display for CUDAType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(self.0) - } -} - -impl From for CUDAType { - fn from(value: PType) -> Self { - CUDAType(match value { - PType::U8 => "unsigned char", - PType::U16 => "unsigned short", - PType::U32 => "unsigned int", - PType::U64 => "unsigned long long", - PType::I8 => "char", - PType::I16 => "short", - PType::I32 => "int", - PType::I64 => "long long", - PType::F32 => "float", - PType::F64 => "double", - PType::F16 => todo!(), - }) - } -} - -struct BitPack

{ - step_id: usize, - bit_width: u8, - output_type: PType, - cuda_slice: CudaSlice

, -} - -impl GPUPipelineJIT for BitPack

{ - fn step_id(&self) -> usize { - self.step_id - } - - fn in_params(&self, p: &mut Vec) { - p.push(GPUKernelParameter { - name: self.in_var_g(), - type_: format!( - "{type_} *__restrict", - type_ = CUDAType::from(self.output_type.to_unsigned()) - ), - }); - } - - fn args<'a>( - &'a self, - _stream: &Arc, - launch_args: &mut LaunchArgs<'a>, - ) -> VortexResult<()> { - launch_args.arg(&self.cuda_slice); - - Ok(()) - } - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - let output_cuda_type = CUDAType::from(self.output_type); - let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned()); - writeln!( - w, - "unsigned int LANE_COUNT = {bits};", - bits = 1024 / self.output_type.bit_width() - )?; - writeln!(w, "{output_cuda_type} {};", self.tmp_var())?; - writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?; - writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?; - writeln!(w, "unsigned int out_idx;")?; - writeln!(w, "unsigned int lane = threadIdx.x;")?; - writeln!( - w, - "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});", - in_l = self.in_var_l(), - in_g = self.in_var_g(), - bit_width = self.bit_width, - bit_size = P::PTYPE.byte_width() - )?; - Ok(()) - } - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { - let bit_width = self.bit_width as usize; - let bits = self.output_type.bit_width(); - let in_ = self.in_var_l(); - if bit_width == 0 { - writeln!(w, "uint{bits}_t zero = 0ULL;")?; - writeln!(w)?; - for row in 0..bits { - writeln!(w, "out[INDEX({row}, lane)] = zero;")?; - } - } else if bit_width == bits { - writeln!(w)?; - for row in 0..bits { - writeln!( - w, - "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];", - )?; - } - } else { - let src = self.src_var(); - let utmp = self.utmp_var(); - let tmp = self.tmp_var(); - - let mask_fn = |bits: usize| { - format!( - "((({type_})1 << {width}) - 1)", - type_ = CUDAType::from(P::PTYPE.to_unsigned()), - width = bits - ) - }; - - writeln!(w)?; - writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?; - for row in 0..bits { - let curr_word = (row * bit_width) / bits; - let next_word = ((row + 1) * bit_width) / bits; - let shift = (row * bit_width) % bits; - - if next_word > curr_word { - let remaining_bits = ((row + 1) * bit_width) % bits; - let current_bits = bit_width - remaining_bits; - writeln!( - w, - "{utmp} = ({src} >> {shift}) & {mask};", - mask = mask_fn(current_bits) - )?; - - if next_word < bit_width { - writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; - writeln!( - w, - "{utmp} |= ({src} & {mask}) << {current_bits};", - mask = mask_fn(remaining_bits) - )?; - } - } else { - writeln!( - w, - "{utmp} = ({src} >> {shift}) & {mask};", - mask = mask_fn(bit_width) - )?; - } - writeln!( - w, - "{tmp} = ({type_}){utmp};", - type_ = CUDAType::from(self.output_type), - )?; - - writeln!(w, "out_idx = INDEX({row}, lane);")?; - f(w)?; - writeln!(w)?; - } - } - Ok(()) - } - - fn output_var(&self) -> String { - self.tmp_var() - } - - fn output_type(&self) -> PType { - self.output_type - } - - fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> { - Ok(()) - } - - fn launch_config(&self) -> GPULaunchConfig { - GPULaunchConfig { - block_width: if P::PTYPE == PType::U64 { 16 } else { 32 }, - } - } -} - -impl

BitPack

{ - fn tmp_var(&self) -> String { - format!("tmp{}", self.step_id) - } - - fn src_var(&self) -> String { - format!("src{}", self.step_id) - } - - fn utmp_var(&self) -> String { - format!("utmp{}", self.step_id) - } - - fn in_var_l(&self) -> String { - format!("in{}", self.step_id) - } - - fn in_var_g(&self) -> String { - format!("_in{}", self.step_id) - } -} - -struct FoR

{ - step_id: usize, - reference: P, - child: Box, -} - -impl

FoR

{ - fn tmp_var(&self) -> String { - format!("tmp{}", self.step_id) - } - - fn ref_var(&self) -> String { - format!("ref{}", self.step_id) - } -} - -impl ScalarGPUPipelineJIT for FoR

{ - fn step_id(&self) -> usize { - self.step_id - } - - fn in_params(&self, p: &mut Vec) { - p.push(GPUKernelParameter { - name: self.ref_var(), - type_: CUDAType::from(self.output_type()).to_string(), - }) - } - - fn args<'a>( - &'a self, - _stream: &Arc, - args: &mut LaunchArgs<'a>, - ) -> VortexResult<()> { - args.arg(&self.reference); - Ok(()) - } - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - let output_cuda_type = CUDAType::from(self.output_type()); - writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; - Ok(()) - } - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { - assert_eq!(self.output_type(), self.child.output_type()); - let in_var = self.child.output_var(); - let out_var = self.tmp_var(); - let ref_var = self.ref_var(); - self.child - .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!(w, "{out_var} = {in_var} + {ref_var};")?; - f(w) - }) - } - - fn output_var(&self) -> String { - self.tmp_var() - } - - fn output_type(&self) -> PType { - P::PTYPE - } - - fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT { - self.child.as_ref() - } -} - -fn handle_array(a: &ArrayRef, stream: &Arc, step_id: usize) -> Box { - if let Some(alp) = a.as_opt::() { - match_each_alp_float_ptype!(alp.ptype(), |A| { - return Box::new(ScalarGPUPipelineJITNode { - inner: ALP { - step_id, - float_type: alp.ptype(), - child: handle_array(alp.encoded(), stream, step_id + 1), - f: A::F10[alp.exponents().f as usize], - e: A::IF10[alp.exponents().e as usize], - }, - }); - }) - } - if let Some(bp) = a.as_opt::() { - assert_eq!(bp.offset(), 0); - assert!(bp.patches().is_none()); - match_each_native_ptype!(bp.ptype(), |P| { - let values = Buffer::

::from_byte_buffer(bp.packed().clone()); - let cuda_slice = stream - .memcpy_stod(values.as_slice()) - .map_err(|e| vortex_err!("Failed to copy to device: {e}")) - .vortex_unwrap(); - return Box::new(BitPack::

{ - step_id, - bit_width: bp.bit_width(), - output_type: bp.ptype(), - cuda_slice, - }); - }) - }; - - if let Some(for_) = a.as_opt::() { - match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| { - return Box::new(ScalarGPUPipelineJITNode { - inner: FoR { - step_id, - reference: for_ - .reference_scalar() - .as_primitive() - .as_::

() - .vortex_expect("cannot have a null reference"), - child: handle_array(for_.encoded(), stream, step_id + 1), - }, - }); - }) - } - - todo!() -} - -struct ALP { - step_id: usize, - float_type: PType, - child: Box, - f: A, - e: A, -} - -impl ALP { - fn tmp_var(&self) -> String { - format!("tmp{}", self.step_id) - } - - fn e_var(&self) -> String { - format!("e{}", self.step_id) - } - - fn f_var(&self) -> String { - format!("f{}", self.step_id) - } -} - -impl ScalarGPUPipelineJIT for ALP { - fn step_id(&self) -> usize { - self.step_id - } - - fn in_params(&self, params: &mut Vec) { - params.extend([ - GPUKernelParameter { - name: self.e_var(), - type_: CUDAType::from(A::PTYPE).to_string(), - }, - GPUKernelParameter { - name: self.f_var(), - type_: CUDAType::from(A::PTYPE).to_string(), - }, - ]) - } - - fn args<'a>( - &'a self, - _stream: &Arc, - args: &mut LaunchArgs<'a>, - ) -> VortexResult<()> { - args.arg(&self.e); - args.arg(&self.f); - Ok(()) - } - - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - let output_cuda_type = CUDAType::from(self.float_type); - writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; - Ok(()) - } - - fn kernel_body( - &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { - self.child - .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!( - w, - "{out} = ((({type_}){tmp}) * {f}) * {e};", - out = self.tmp_var(), - type_ = CUDAType::from(self.float_type), - tmp = self.child.output_var(), - f = self.f_var(), - e = self.e_var(), - )?; - f(w) - }) - } - - fn output_var(&self) -> String { - self.tmp_var() - } - - fn output_type(&self) -> PType { - self.float_type - } - - fn child<'a>(&'a self) -> &'a dyn GPUPipelineJIT { - self.child.as_ref() - } -} - -struct DeclPrinter<'a, 'b: 'a> { - w: &'a mut IndentedWriter<&'b mut dyn Write>, -} - -fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) { - let mut decl = DeclPrinter { w }; - decl.accept(node).vortex_expect("write decl cannot fail"); -} - -impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> { - fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { - node.children(self)?; - node.decls(self.w) - .map_err(|e| vortex_err!("cannot write {}", e)) - } -} - -struct InParamCollector { - params: Vec, -} - -impl GPUVisitor<'_> for InParamCollector { - fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> { - node.children(self)?; - node.in_params(&mut self.params); - Ok(()) - } -} - -fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult> { - let mut params = InParamCollector { params: Vec::new() }; - params.accept(node)?; - Ok(params.params) -} - -struct ArgCollector<'a, 'b> { - stream: Arc, - params: &'b mut LaunchArgs<'a>, -} - -impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> { - fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { - node.children(self)?; - node.args(&self.stream, self.params)?; - Ok(()) - } -} - -fn collect_args<'a, 'b>( - node: &'a dyn GPUPipelineJIT, - stream: Arc, - args: &'b mut LaunchArgs<'a>, -) -> VortexResult<()> { - let mut collector = ArgCollector { - stream: stream.clone(), - - params: args, - }; - collector.accept(node)?; - - Ok(()) -} - -fn create_jit_str( - w: &mut IndentedWriter<&mut dyn Write>, - output: &dyn GPUPipelineJIT, -) -> VortexResult<()> { - let mut params = collect_in_param(output)?; - params.push(GPUKernelParameter { - name: "_output".to_string(), - type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), - }); - - (|| { - writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?; - writeln!( - w, - "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)" - )?; - writeln!(w, "extern \"C\" __global__ void kernel(")?; - w.indent(|w| { - params.iter().enumerate().try_for_each(|(idx, p)| { - writeln!( - w, - "{} {}{end}", - p.type_, - p.name, - end = if idx == params.len() - 1 { "" } else { "," } - ) - }) - })?; - writeln!(w, ") {{")?; - - w.indent(|w| { - writeln!( - w, - "{output_type} *output = _output + (blockIdx.x * 1024);", - output_type = CUDAType::from(output.output_type()) - )?; - - writeln!(w, "__shared__ float s_output[1024];")?; - - write_kernel_declarations(w, output); - writeln!(w)?; - output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) - }) - })?; - - writeln!( - w, - " for (int i = 0; i < 32; i++) {{ - auto idx = i * 32 + threadIdx.x; - output[idx] = s_output[idx]; - }}" - )?; - - writeln!(w, "}}") - })() - .map_err(|e| vortex_err!("format err {e}")) -} - -fn create_kernel(ctx: Arc, array: &dyn GPUPipelineJIT) -> VortexResult { - let mut s = String::new(); - let w = &mut s as &mut dyn Write; - let mut ind = IndentedWriter::new(w); - let w = &mut ind; - - let _ = create_jit_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}")); - - let module = - cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?; - - // Dynamically load it into the device - let module = ctx - .load_module(module) - .map_err(|e| vortex_err!("load module {e}"))?; - - module - .load_function("kernel") - .map_err(|e| vortex_err!("load_function {e}")) -} - -pub fn create_run_jit_kernel( - ctx: Arc, - array: &ArrayRef, -) -> VortexResult<(ArrayRef, Duration)> { - let stream = ctx.default_stream(); - - let output = handle_array(array, &stream, 0); - let kernel = create_kernel(ctx.clone(), output.as_ref())?; - - let num_chunks = - u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements"); - - let mut launch_builder = stream.launch_builder(&kernel); - - let launch_config = LaunchConfig { - grid_dim: (num_chunks, 1, 1), - block_dim: (output.launch_config().block_width, 1, 1), - shared_mem_bytes: u32::try_from(output.output_type().byte_width()) - .vortex_expect("oversized output type byte width") - * 1024, - }; - - collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?; - - match_each_native_ptype!(array.dtype().as_ptype(), |P| { - // append final argument (output) of the kernel - let mut out = stream.alloc_zeros::

(array.len()).unwrap(); - launch_builder.arg(&mut out); - stream - .synchronize() - .map_err(|e| vortex_err!("failed to sync {e}"))?; - let start = stream - .record_event(Some(CU_EVENT_DEFAULT)) - .ok() - .vortex_expect("Failed to record event"); - let _ = unsafe { launch_builder.launch(launch_config) }; - ctx.synchronize() - .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?; - let end = stream - .record_event(Some(CU_EVENT_DEFAULT)) - .ok() - .vortex_expect("Failed to record event"); - - let duration = start.elapsed_ms(&end).unwrap(); - - let mut buffer = BufferMut::

::with_capacity(array.len()); - unsafe { buffer.set_len(array.len()) } - - stream - .memcpy_dtoh(&out, &mut buffer) - .map_err(|e| vortex_err!("Failed to copy to device: {e}")) - .vortex_unwrap(); - stream - .synchronize() - .map_err(|e| vortex_err!("Failed to synchronize: {e}")) - .vortex_unwrap(); - let c = - Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array(); - - Ok((c, Duration::from_secs_f32(duration / 1000.0))) - }) -} - -#[cfg(test)] -mod tests { - use cudarc::driver::CudaContext; - use vortex_alp::{ALPArray, Exponents}; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::{IntoArray, ToCanonical}; - use vortex_error::VortexResult; - use vortex_fastlanes::{BitPackedArray, FoRArray}; - - use crate::jit::create_run_jit_kernel; - - #[test] - fn jit_arr_f32() -> VortexResult<()> { - let ctx = CudaContext::new(0).unwrap(); - ctx.set_blocking_synchronize().unwrap(); - let for_ = ALPArray::try_new( - FoRArray::try_new( - BitPackedArray::encode((0i32..1024 * 2).collect::().as_ref(), 12)? - .into_array(), - 2i32.into(), - )? - .into_array(), - Exponents { e: 4, f: 5 }, - None, - )? - .into_array(); - - let (d, _) = create_run_jit_kernel(ctx, &for_)?; - let prim = d.to_primitive(); - let expect = for_.to_primitive(); - - for i in 0..prim.len() { - assert_eq!( - prim.as_slice::()[i], - expect.as_slice::()[i], - "i = {i}" - ); - } - - Ok(()) - } - - #[test] - fn jit_arrs() -> VortexResult<()> { - let ctx = CudaContext::new(0).unwrap(); - ctx.set_blocking_synchronize().unwrap(); - let for_ = BitPackedArray::encode( - (0i32..1024) - .map(|_| 1u32) - .collect::() - .as_ref(), - 2, - )? - .into_array(); - - create_run_jit_kernel(ctx, &for_.into_array())?; - - Ok(()) - } -} diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs new file mode 100644 index 00000000000..3b2515900a9 --- /dev/null +++ b/vortex-gpu/src/jit/arrays/alp.rs @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::fmt::Write; +use std::sync::Arc; + +use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs}; +use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype}; +use vortex_dtype::PType; +use vortex_error::VortexResult; + +use crate::indent::IndentedWriter; +use crate::jit::convert::handle_array; +use crate::jit::{ + GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, + StepIdAllocator, +}; + +struct ALP { + step_id: usize, + float_type: PType, + child: Box, + f: A, + e: A, +} + +pub fn new_jit( + alp: &ALPArray, + stream: &Arc, + allocator: &mut StepIdAllocator, +) -> Box { + match_each_alp_float_ptype!(alp.ptype(), |A| { + let child = handle_array(alp.encoded(), stream, allocator); + let step_id = allocator.get_id(); + Box::new(ScalarGPUPipelineJITNode { + inner: ALP { + step_id, + float_type: alp.ptype(), + child, + f: A::F10[alp.exponents().f as usize], + e: A::IF10[alp.exponents().e as usize], + }, + }) + }) +} + +impl ALP { + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn e_var(&self) -> String { + format!("e{}", self.step_id) + } + + fn f_var(&self) -> String { + format!("f{}", self.step_id) + } +} + +impl ScalarGPUPipelineJIT for ALP { + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, params: &mut Vec) { + params.extend([ + GPUKernelParameter { + name: self.e_var(), + type_: CUDAType::from(A::PTYPE).to_string(), + }, + GPUKernelParameter { + name: self.f_var(), + type_: CUDAType::from(A::PTYPE).to_string(), + }, + ]) + } + + fn args<'a>( + &'a self, + _stream: &Arc, + args: &mut LaunchArgs<'a>, + ) -> VortexResult<()> { + args.arg(&self.e); + args.arg(&self.f); + Ok(()) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + let output_cuda_type = CUDAType::from(self.float_type); + writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + self.child + .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!( + w, + "{out} = ((({type_}){tmp}) * {f}) * {e};", + out = self.tmp_var(), + type_ = CUDAType::from(self.float_type), + tmp = self.child.output_var(), + f = self.f_var(), + e = self.e_var(), + )?; + f(w) + }) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + self.float_type + } + + fn child(&self) -> &dyn GPUPipelineJIT { + self.child.as_ref() + } +} diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs new file mode 100644 index 00000000000..9bdf6e50000 --- /dev/null +++ b/vortex-gpu/src/jit/arrays/bitpack.rs @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::fmt::Write; +use std::sync::Arc; + +use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; +use vortex_alp::{ALPArray, match_each_alp_float_ptype}; +use vortex_buffer::Buffer; +use vortex_dtype::{NativePType, PType, match_each_native_ptype}; +use vortex_error::{VortexResult, VortexUnwrap, vortex_err}; +use vortex_fastlanes::BitPackedArray; + +use crate::indent::IndentedWriter; +use crate::jit::{ + CUDAType, GPUKernelParameter, GPULaunchConfig, GPUPipelineJIT, GPUVisitor, StepIdAllocator, +}; + +struct BitPack

{ + step_id: usize, + bit_width: u8, + output_type: PType, + cuda_slice: CudaSlice

, +} + +pub fn new_jit( + bp: &BitPackedArray, + stream: &Arc, + allocator: &mut StepIdAllocator, +) -> Box { + assert_eq!(bp.offset(), 0); + assert!(bp.patches().is_none()); + match_each_native_ptype!(bp.ptype(), |P| { + let values = Buffer::

::from_byte_buffer(bp.packed().clone()); + let cuda_slice = stream + .memcpy_stod(values.as_slice()) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); + let step_id = allocator.get_id(); + Box::new(BitPack::

{ + step_id, + bit_width: bp.bit_width(), + output_type: bp.ptype(), + cuda_slice, + }) + }) +} + +impl GPUPipelineJIT for BitPack

{ + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, p: &mut Vec) { + p.push(GPUKernelParameter { + name: self.in_var_g(), + type_: format!( + "{type_} *__restrict", + type_ = CUDAType::from(self.output_type.to_unsigned()) + ), + }); + } + + fn args<'a>( + &'a self, + _stream: &Arc, + launch_args: &mut LaunchArgs<'a>, + ) -> VortexResult<()> { + launch_args.arg(&self.cuda_slice); + + Ok(()) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + let output_cuda_type = CUDAType::from(self.output_type); + let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned()); + writeln!( + w, + "unsigned int LANE_COUNT = {bits};", + bits = 1024 / self.output_type.bit_width() + )?; + writeln!(w, "{output_cuda_type} {};", self.tmp_var())?; + writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?; + writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?; + writeln!(w, "unsigned int out_idx;")?; + writeln!(w, "unsigned int lane = threadIdx.x;")?; + writeln!( + w, + "{uoutput_cuda_type} *{in_l} = {in_g} + (blockIdx.x * 128 * {bit_width} / {bit_size});", + in_l = self.in_var_l(), + in_g = self.in_var_g(), + bit_width = self.bit_width, + bit_size = P::PTYPE.byte_width() + )?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + let bit_width = self.bit_width as usize; + let bits = self.output_type.bit_width(); + let in_ = self.in_var_l(); + if bit_width == 0 { + writeln!(w, "uint{bits}_t zero = 0ULL;")?; + writeln!(w)?; + for row in 0..bits { + writeln!(w, "out[INDEX({row}, lane)] = zero;")?; + } + } else if bit_width == bits { + writeln!(w)?; + for row in 0..bits { + writeln!( + w, + "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];", + )?; + } + } else { + let src = self.src_var(); + let utmp = self.utmp_var(); + let tmp = self.tmp_var(); + + let mask_fn = |bits: usize| { + format!( + "((({type_})1 << {width}) - 1)", + type_ = CUDAType::from(P::PTYPE.to_unsigned()), + width = bits + ) + }; + + writeln!(w)?; + writeln!(w, "{src} = {in}[lane];", in = self.in_var_l())?; + for row in 0..bits { + let curr_word = (row * bit_width) / bits; + let next_word = ((row + 1) * bit_width) / bits; + let shift = (row * bit_width) % bits; + + if next_word > curr_word { + let remaining_bits = ((row + 1) * bit_width) % bits; + let current_bits = bit_width - remaining_bits; + writeln!( + w, + "{utmp} = ({src} >> {shift}) & {mask};", + mask = mask_fn(current_bits) + )?; + + if next_word < bit_width { + writeln!(w, "{src} = {in_}[lane + LANE_COUNT * {next_word}];")?; + writeln!( + w, + "{utmp} |= ({src} & {mask}) << {current_bits};", + mask = mask_fn(remaining_bits) + )?; + } + } else { + writeln!( + w, + "{utmp} = ({src} >> {shift}) & {mask};", + mask = mask_fn(bit_width) + )?; + } + writeln!( + w, + "{tmp} = ({type_}){utmp};", + type_ = CUDAType::from(self.output_type), + )?; + + writeln!(w, "out_idx = INDEX({row}, lane);")?; + f(w)?; + writeln!(w)?; + } + } + Ok(()) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + self.output_type + } + + fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> { + Ok(()) + } + + fn launch_config(&self) -> GPULaunchConfig { + GPULaunchConfig { + block_width: if P::PTYPE == PType::U64 { 16 } else { 32 }, + } + } +} + +impl

BitPack

{ + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn src_var(&self) -> String { + format!("src{}", self.step_id) + } + + fn utmp_var(&self) -> String { + format!("utmp{}", self.step_id) + } + + fn in_var_l(&self) -> String { + format!("in{}", self.step_id) + } + + fn in_var_g(&self) -> String { + format!("_in{}", self.step_id) + } +} diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs new file mode 100644 index 00000000000..adca961968a --- /dev/null +++ b/vortex-gpu/src/jit/arrays/for_.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::fmt::Write; +use std::sync::Arc; + +use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs}; +use vortex_dtype::{NativePType, PType, match_each_native_ptype}; +use vortex_error::{VortexExpect, VortexResult}; +use vortex_fastlanes::{BitPackedArray, FoRArray}; + +use crate::indent::IndentedWriter; +use crate::jit::convert::handle_array; +use crate::jit::{ + CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, + StepIdAllocator, +}; + +struct FoR

{ + step_id: usize, + reference: P, + child: Box, +} + +pub fn new_jit( + for_: &FoRArray, + stream: &Arc, + allocator: &mut StepIdAllocator, +) -> Box { + match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| { + let child = handle_array(for_.encoded(), stream, allocator); + Box::new(ScalarGPUPipelineJITNode { + inner: FoR { + step_id, + reference: for_ + .reference_scalar() + .as_primitive() + .as_::

() + .vortex_expect("cannot have a null reference"), + child, + }, + }) + }) +} + +impl

FoR

{ + fn tmp_var(&self) -> String { + format!("tmp{}", self.step_id) + } + + fn ref_var(&self) -> String { + format!("ref{}", self.step_id) + } +} + +impl ScalarGPUPipelineJIT for FoR

{ + fn step_id(&self) -> usize { + self.step_id + } + + fn in_params(&self, p: &mut Vec) { + p.push(GPUKernelParameter { + name: self.ref_var(), + type_: CUDAType::from(self.output_type()).to_string(), + }) + } + + fn args<'a>( + &'a self, + _stream: &Arc, + args: &mut LaunchArgs<'a>, + ) -> VortexResult<()> { + args.arg(&self.reference); + Ok(()) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + let output_cuda_type = CUDAType::from(self.output_type()); + writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; + Ok(()) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + assert_eq!(self.output_type(), self.child.output_type()); + let in_var = self.child.output_var(); + let out_var = self.tmp_var(); + let ref_var = self.ref_var(); + self.child + .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!(w, "{out_var} = {in_var} + {ref_var};")?; + f(w) + }) + } + + fn output_var(&self) -> String { + self.tmp_var() + } + + fn output_type(&self) -> PType { + P::PTYPE + } + + fn child(&self) -> &dyn GPUPipelineJIT { + self.child.as_ref() + } +} diff --git a/vortex-gpu/src/jit/arrays/mod.rs b/vortex-gpu/src/jit/arrays/mod.rs new file mode 100644 index 00000000000..ab897d6dc2c --- /dev/null +++ b/vortex-gpu/src/jit/arrays/mod.rs @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +pub mod alp; +pub mod bitpack; +pub mod for_; diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs new file mode 100644 index 00000000000..ceb1b7138c5 --- /dev/null +++ b/vortex-gpu/src/jit/convert.rs @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use cudarc::driver::CudaStream; +use itertools::all; +use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype}; +use vortex_array::{Array, ArrayRef}; +use vortex_buffer::Buffer; +use vortex_dtype::match_each_native_ptype; +use vortex_error::{VortexUnwrap, vortex_err}; +use vortex_fastlanes::{BitPackedVTable, FoRVTable}; + +use crate::jit::arrays::{alp, bitpack, for_}; +use crate::jit::{GPUPipelineJIT, ScalarGPUPipelineJITNode, StepIdAllocator}; + +pub fn handle_array( + a: &ArrayRef, + stream: &Arc, + allocator: &mut StepIdAllocator, +) -> Box { + if let Some(alp) = a.as_opt::() { + return alp::new_jit(alp, stream, allocator); + } + if let Some(bp) = a.as_opt::() { + return bitpack::new_jit(bp, stream, allocator); + }; + + if let Some(for_) = a.as_opt::() { + return for_::new_jit(for_, stream, allocator); + } + + todo!("unimplemented jit for {}", a.encoding_id()) +} diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs new file mode 100644 index 00000000000..6ed585f1055 --- /dev/null +++ b/vortex-gpu/src/jit/kernel_fmt.rs @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Write; +use std::sync::Arc; + +use cudarc::driver::{CudaContext, CudaFunction, CudaStream, LaunchArgs}; +use vortex_error::{VortexExpect, VortexResult, vortex_err}; + +use crate::indent::IndentedWriter; +use crate::jit::type_::CUDAType; +use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor}; + +struct DeclPrinter<'a, 'b: 'a> { + w: &'a mut IndentedWriter<&'b mut dyn Write>, +} + +fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) { + let mut decl = DeclPrinter { w }; + decl.accept(node).vortex_expect("write decl cannot fail"); +} + +impl<'a> GPUVisitor<'a> for DeclPrinter<'a, '_> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.decls(self.w) + .map_err(|e| vortex_err!("cannot write {}", e)) + } +} + +struct InParamCollector { + params: Vec, +} + +impl GPUVisitor<'_> for InParamCollector { + fn accept(&mut self, node: &dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.in_params(&mut self.params); + Ok(()) + } +} + +fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult> { + let mut params = InParamCollector { params: Vec::new() }; + params.accept(node)?; + Ok(params.params) +} + +pub fn create_kernel_str( + w: &mut IndentedWriter<&mut dyn Write>, + output: &dyn GPUPipelineJIT, +) -> VortexResult<()> { + let mut params = collect_in_param(output)?; + params.push(GPUKernelParameter { + name: "_output".to_string(), + type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), + }); + + (|| { + // TODO: include when only for fast lanes codecs + writeln!(w, "__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};")?; + writeln!( + w, + "#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)" + )?; + writeln!(w, "extern \"C\" __global__ void kernel(")?; + w.indent(|w| { + for (idx, p) in params.iter().enumerate() { + let separator = if idx < params.len() - 1 { "," } else { "" }; + writeln!(w, "{} {}{}", p.type_, p.name, separator)?; + } + Ok(()) + })?; + writeln!(w, ") {{")?; + + w.indent(|w| { + writeln!( + w, + "{output_type} *output = _output + (blockIdx.x * 1024);", + output_type = CUDAType::from(output.output_type()) + )?; + + writeln!(w, "__shared__ float s_output[1024];")?; + + write_kernel_declarations(w, output); + writeln!(w)?; + output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) + })?; + writeln!(w)?; + + writeln!(w, "for (int i = 0; i < 32; i++) {{")?; + w.indent(|w| { + writeln!(w, "auto idx = i * 32 + threadIdx.x;")?; + writeln!(w, "output[idx] = s_output[idx];") + })?; + writeln!(w, "}}") + })?; + + writeln!(w, "}}") + })() + .map_err(|e| vortex_err!("format err {e}")) +} + +pub fn create_kernel( + ctx: Arc, + array: &dyn GPUPipelineJIT, +) -> VortexResult { + let mut s = String::new(); + let w = &mut s as &mut dyn Write; + let mut ind = IndentedWriter::new(w); + let w = &mut ind; + + create_kernel_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"))?; + + let module = + cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?; + + // Dynamically load it into the device + let module = ctx + .load_module(module) + .map_err(|e| vortex_err!("load module {e}"))?; + + module + .load_function("kernel") + .map_err(|e| vortex_err!("load_function {e}")) +} diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs new file mode 100644 index 00000000000..fe558996055 --- /dev/null +++ b/vortex-gpu/src/jit/mod.rs @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod arrays; +mod convert; +mod kernel_fmt; +mod run; +mod type_; + +use std::fmt; +use std::fmt::Write; +use std::sync::Arc; + +use cudarc::driver::{CudaStream, LaunchArgs}; +pub use run::create_run_jit_kernel; +use vortex_dtype::PType; +use vortex_error::VortexResult; + +use crate::indent::IndentedWriter; + +pub trait GPUPipelineJIT { + fn step_id(&self) -> usize; + + fn in_params(&self, params: &mut Vec); + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result; + + fn output_var(&self) -> String; + + fn output_type(&self) -> PType; + + // always pass the output iteration aligned child last. + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>; + + fn launch_config(&self) -> GPULaunchConfig; +} + +pub trait ScalarGPUPipelineJIT { + fn step_id(&self) -> usize; + + fn in_params(&self, params: &mut Vec); + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result; + + fn output_var(&self) -> String; + + fn output_type(&self) -> PType; + + fn child(&self) -> &dyn GPUPipelineJIT; +} + +#[derive(Default)] +struct StepIdAllocator { + next_id: usize, +} + +impl StepIdAllocator { + pub fn get_id(&mut self) -> usize { + let id = self.next_id; + self.next_id += 1; + id + } +} + +trait GPUVisitor<'a> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; +} + +struct GPUKernelParameter { + name: String, + type_: String, +} + +struct GPULaunchConfig { + block_width: u32, +} + +struct ScalarGPUPipelineJITNode { + inner: T, +} + +impl GPUPipelineJIT for ScalarGPUPipelineJITNode { + fn step_id(&self) -> usize { + self.inner.step_id() + } + + fn in_params(&self, params: &mut Vec) { + self.inner.in_params(params) + } + + fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()> { + self.inner.args(stream, args) + } + + fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + self.inner.decls(w) + } + + fn kernel_body( + &self, + w: &mut IndentedWriter<&mut dyn Write>, + f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + ) -> fmt::Result { + self.inner.kernel_body(w, f) + } + + fn output_var(&self) -> String { + self.inner.output_var() + } + + fn output_type(&self) -> PType { + self.inner.output_type() + } + + fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()> { + visitor.accept(self.inner.child()) + } + + fn launch_config(&self) -> GPULaunchConfig { + self.inner.child().launch_config() + } +} diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs new file mode 100644 index 00000000000..c16f9f4db36 --- /dev/null +++ b/vortex-gpu/src/jit/run.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; +use std::time::Duration; + +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT; +use cudarc::driver::{CudaContext, CudaStream, LaunchArgs, LaunchConfig, PushKernelArg}; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_array::{ArrayRef, Canonical, IntoArray}; +use vortex_buffer::BufferMut; +use vortex_dtype::match_each_native_ptype; +use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; + +use crate::jit::convert::handle_array; +use crate::jit::kernel_fmt::create_kernel; +use crate::jit::{GPUPipelineJIT, GPUVisitor, StepIdAllocator}; + +pub fn create_run_jit_kernel( + ctx: Arc, + array: &ArrayRef, +) -> VortexResult<(ArrayRef, Duration)> { + let stream = ctx.default_stream(); + + let mut allocator = StepIdAllocator::new(); + let output = handle_array(array, &stream, &mut allocator); + let kernel = create_kernel(ctx.clone(), output.as_ref())?; + + let num_chunks = + u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements"); + + let mut launch_builder = stream.launch_builder(&kernel); + + let launch_config = LaunchConfig { + grid_dim: (num_chunks, 1, 1), + block_dim: (output.launch_config().block_width, 1, 1), + shared_mem_bytes: u32::try_from(output.output_type().byte_width()) + .vortex_expect("oversized output type byte width") + * 1024, + }; + + collect_args(output.as_ref(), stream.clone(), &mut launch_builder)?; + + match_each_native_ptype!(array.dtype().as_ptype(), |P| { + // append final argument (output) of the kernel + let mut out = stream + .alloc_zeros::

(array.len()) + .map_err(|e| vortex_err!("failed to alloc zeros {e}"))?; + launch_builder.arg(&mut out); + stream + .synchronize() + .map_err(|e| vortex_err!("failed to sync {e}"))?; + let start = stream + .record_event(Some(CU_EVENT_DEFAULT)) + .ok() + .vortex_expect("Failed to record event"); + let _ = unsafe { launch_builder.launch(launch_config) }; + ctx.synchronize() + .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?; + let end = stream + .record_event(Some(CU_EVENT_DEFAULT)) + .ok() + .vortex_expect("Failed to record event"); + + let duration = start + .elapsed_ms(&end) + .map_err(|e| vortex_err!("failed to get elapsed time {e}"))?; + + let mut buffer = BufferMut::

::with_capacity(array.len()); + unsafe { buffer.set_len(array.len()) } + + stream + .memcpy_dtoh(&out, &mut buffer) + .map_err(|e| vortex_err!("Failed to copy to device: {e}")) + .vortex_unwrap(); + stream + .synchronize() + .map_err(|e| vortex_err!("Failed to synchronize: {e}")) + .vortex_unwrap(); + let c = + Canonical::Primitive(PrimitiveArray::new(buffer, Validity::NonNullable)).into_array(); + + Ok((c, Duration::from_secs_f32(duration / 1000.0))) + }) +} + +struct ArgCollector<'a, 'b> { + stream: Arc, + params: &'b mut LaunchArgs<'a>, +} + +impl<'a> GPUVisitor<'a> for ArgCollector<'a, '_> { + fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()> { + node.children(self)?; + node.args(&self.stream, self.params)?; + Ok(()) + } +} + +fn collect_args<'a>( + node: &'a dyn GPUPipelineJIT, + stream: Arc, + args: &mut LaunchArgs<'a>, +) -> VortexResult<()> { + let mut collector = ArgCollector { + stream, + + params: args, + }; + collector.accept(node)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use cudarc::driver::CudaContext; + use vortex_alp::{ALPArray, Exponents}; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::{IntoArray, ToCanonical}; + use vortex_error::VortexResult; + use vortex_fastlanes::{BitPackedArray, FoRArray}; + + use crate::jit::create_run_jit_kernel; + + #[test] + fn jit_arr_f32() -> VortexResult<()> { + let ctx = CudaContext::new(0).unwrap(); + ctx.set_blocking_synchronize().unwrap(); + let for_ = ALPArray::try_new( + FoRArray::try_new( + BitPackedArray::encode((0i32..1024 * 2).collect::().as_ref(), 12)? + .into_array(), + 2i32.into(), + )? + .into_array(), + Exponents { e: 4, f: 5 }, + None, + )? + .into_array(); + + let (d, _) = create_run_jit_kernel(ctx, &for_)?; + let prim = d.to_primitive(); + let expect = for_.to_primitive(); + + for i in 0..prim.len() { + assert_eq!( + prim.as_slice::()[i], + expect.as_slice::()[i], + "i = {i}" + ); + } + + Ok(()) + } + + #[test] + fn jit_arrs() -> VortexResult<()> { + let ctx = CudaContext::new(0).unwrap(); + ctx.set_blocking_synchronize().unwrap(); + let for_ = BitPackedArray::encode( + (0i32..1024) + .map(|_| 1u32) + .collect::() + .as_ref(), + 2, + )? + .into_array(); + + create_run_jit_kernel(ctx, &for_.into_array())?; + + Ok(()) + } +} diff --git a/vortex-gpu/src/jit/type_.rs b/vortex-gpu/src/jit/type_.rs new file mode 100644 index 00000000000..8ae20f3a75d --- /dev/null +++ b/vortex-gpu/src/jit/type_.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::fmt::Display; + +use vortex_dtype::PType; + +pub struct CUDAType(&'static str); + +impl Display for CUDAType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.0) + } +} + +impl From for CUDAType { + fn from(value: PType) -> Self { + CUDAType(match value { + PType::U8 => "unsigned char", + PType::U16 => "unsigned short", + PType::U32 => "unsigned int", + PType::U64 => "unsigned long long", + PType::I8 => "char", + PType::I16 => "short", + PType::I32 => "int", + PType::I64 => "long long", + PType::F32 => "float", + PType::F64 => "double", + PType::F16 => todo!(), + }) + } +} From fcdde32d71a89569229ea952dac4b39931a0c24d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 16:25:35 -0400 Subject: [PATCH 07/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/src/indent.rs | 2 ++ vortex-gpu/src/jit/arrays/alp.rs | 6 +++--- vortex-gpu/src/jit/arrays/bitpack.rs | 11 +++++------ vortex-gpu/src/jit/arrays/for_.rs | 21 ++++++++++----------- vortex-gpu/src/jit/convert.rs | 12 ++++++------ vortex-gpu/src/jit/kernel_fmt.rs | 15 ++++++--------- vortex-gpu/src/jit/mod.rs | 7 ++++--- vortex-gpu/src/jit/run.rs | 7 +++---- 8 files changed, 39 insertions(+), 42 deletions(-) diff --git a/vortex-gpu/src/indent.rs b/vortex-gpu/src/indent.rs index 5e7d587d202..7594963be69 100644 --- a/vortex-gpu/src/indent.rs +++ b/vortex-gpu/src/indent.rs @@ -32,3 +32,5 @@ impl IndentedWriter { write!(self.write, "{}{}", self.indent, fmt) } } + +pub type IndentedWrite<'a> = IndentedWriter<&'a mut dyn Write>; diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs index 3b2515900a9..a254a3e69b3 100644 --- a/vortex-gpu/src/jit/arrays/alp.rs +++ b/vortex-gpu/src/jit/arrays/alp.rs @@ -5,7 +5,7 @@ use std::fmt; use std::fmt::Write; use std::sync::Arc; -use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs}; +use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype}; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -13,7 +13,7 @@ use vortex_error::VortexResult; use crate::indent::IndentedWriter; use crate::jit::convert::handle_array; use crate::jit::{ - GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, + CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, StepIdAllocator, }; @@ -32,7 +32,7 @@ pub fn new_jit( ) -> Box { match_each_alp_float_ptype!(alp.ptype(), |A| { let child = handle_array(alp.encoded(), stream, allocator); - let step_id = allocator.get_id(); + let step_id = allocator.fresh_id(); Box::new(ScalarGPUPipelineJITNode { inner: ALP { step_id, diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs index 9bdf6e50000..b93b592aaa9 100644 --- a/vortex-gpu/src/jit/arrays/bitpack.rs +++ b/vortex-gpu/src/jit/arrays/bitpack.rs @@ -6,13 +6,12 @@ use std::fmt::Write; use std::sync::Arc; use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; -use vortex_alp::{ALPArray, match_each_alp_float_ptype}; use vortex_buffer::Buffer; use vortex_dtype::{NativePType, PType, match_each_native_ptype}; use vortex_error::{VortexResult, VortexUnwrap, vortex_err}; use vortex_fastlanes::BitPackedArray; -use crate::indent::IndentedWriter; +use crate::indent::IndentedWrite; use crate::jit::{ CUDAType, GPUKernelParameter, GPULaunchConfig, GPUPipelineJIT, GPUVisitor, StepIdAllocator, }; @@ -37,7 +36,7 @@ pub fn new_jit( .memcpy_stod(values.as_slice()) .map_err(|e| vortex_err!("Failed to copy to device: {e}")) .vortex_unwrap(); - let step_id = allocator.get_id(); + let step_id = allocator.fresh_id(); Box::new(BitPack::

{ step_id, bit_width: bp.bit_width(), @@ -72,7 +71,7 @@ impl GPUPipelineJIT for BitPack

{ Ok(()) } - fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { + fn decls(&self, w: &mut IndentedWrite) -> fmt::Result { let output_cuda_type = CUDAType::from(self.output_type); let uoutput_cuda_type = CUDAType::from(self.output_type.to_unsigned()); writeln!( @@ -98,8 +97,8 @@ impl GPUPipelineJIT for BitPack

{ fn kernel_body( &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + w: &mut IndentedWrite, + f: &dyn Fn(&mut IndentedWrite) -> fmt::Result, ) -> fmt::Result { let bit_width = self.bit_width as usize; let bits = self.output_type.bit_width(); diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs index adca961968a..f0752d32a0e 100644 --- a/vortex-gpu/src/jit/arrays/for_.rs +++ b/vortex-gpu/src/jit/arrays/for_.rs @@ -5,12 +5,12 @@ use std::fmt; use std::fmt::Write; use std::sync::Arc; -use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs}; +use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; use vortex_dtype::{NativePType, PType, match_each_native_ptype}; use vortex_error::{VortexExpect, VortexResult}; -use vortex_fastlanes::{BitPackedArray, FoRArray}; +use vortex_fastlanes::FoRArray; -use crate::indent::IndentedWriter; +use crate::indent::{IndentedWrite, IndentedWriter}; use crate::jit::convert::handle_array; use crate::jit::{ CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, @@ -32,7 +32,7 @@ pub fn new_jit( let child = handle_array(for_.encoded(), stream, allocator); Box::new(ScalarGPUPipelineJITNode { inner: FoR { - step_id, + step_id: allocator.fresh_id(), reference: for_ .reference_scalar() .as_primitive() @@ -83,18 +83,17 @@ impl ScalarGPUPipelineJIT for FoR

{ fn kernel_body( &self, - w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, + w: &mut IndentedWrite, + f: &dyn Fn(&mut IndentedWrite) -> fmt::Result, ) -> fmt::Result { assert_eq!(self.output_type(), self.child.output_type()); let in_var = self.child.output_var(); let out_var = self.tmp_var(); let ref_var = self.ref_var(); - self.child - .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { - writeln!(w, "{out_var} = {in_var} + {ref_var};")?; - f(w) - }) + self.child.kernel_body(w, &|w: &mut IndentedWrite| { + writeln!(w, "{out_var} = {in_var} + {ref_var};")?; + f(w) + }) } fn output_var(&self) -> String { diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs index ceb1b7138c5..60d0740a1ca 100644 --- a/vortex-gpu/src/jit/convert.rs +++ b/vortex-gpu/src/jit/convert.rs @@ -4,16 +4,16 @@ use std::sync::Arc; use cudarc::driver::CudaStream; -use itertools::all; -use vortex_alp::{ALPFloat, ALPVTable, match_each_alp_float_ptype}; +use vortex_alp::ALPVTable; use vortex_array::{Array, ArrayRef}; -use vortex_buffer::Buffer; -use vortex_dtype::match_each_native_ptype; -use vortex_error::{VortexUnwrap, vortex_err}; use vortex_fastlanes::{BitPackedVTable, FoRVTable}; use crate::jit::arrays::{alp, bitpack, for_}; -use crate::jit::{GPUPipelineJIT, ScalarGPUPipelineJITNode, StepIdAllocator}; +use crate::jit::{GPUPipelineJIT, StepIdAllocator}; + +pub fn new_jit_array(a: &ArrayRef, stream: &Arc) -> Box { + handle_array(a, stream, &mut StepIdAllocator::default()) +} pub fn handle_array( a: &ArrayRef, diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs index 6ed585f1055..d4420ac8002 100644 --- a/vortex-gpu/src/jit/kernel_fmt.rs +++ b/vortex-gpu/src/jit/kernel_fmt.rs @@ -4,18 +4,18 @@ use std::fmt::Write; use std::sync::Arc; -use cudarc::driver::{CudaContext, CudaFunction, CudaStream, LaunchArgs}; +use cudarc::driver::{CudaContext, CudaFunction}; use vortex_error::{VortexExpect, VortexResult, vortex_err}; -use crate::indent::IndentedWriter; +use crate::indent::{IndentedWrite, IndentedWriter}; use crate::jit::type_::CUDAType; use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor}; struct DeclPrinter<'a, 'b: 'a> { - w: &'a mut IndentedWriter<&'b mut dyn Write>, + w: &'a mut IndentedWrite<'b>, } -fn write_kernel_declarations(w: &mut IndentedWriter<&mut dyn Write>, node: &dyn GPUPipelineJIT) { +fn write_kernel_declarations(w: &mut IndentedWrite, node: &dyn GPUPipelineJIT) { let mut decl = DeclPrinter { w }; decl.accept(node).vortex_expect("write decl cannot fail"); } @@ -46,10 +46,7 @@ fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult, - output: &dyn GPUPipelineJIT, -) -> VortexResult<()> { +pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) -> VortexResult<()> { let mut params = collect_in_param(output)?; params.push(GPUKernelParameter { name: "_output".to_string(), @@ -84,7 +81,7 @@ pub fn create_kernel_str( write_kernel_declarations(w, output); writeln!(w)?; - output.kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + output.kernel_body(w, &|w: &mut IndentedWrite| { writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) })?; writeln!(w)?; diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index fe558996055..911de7e0bc7 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -13,6 +13,7 @@ use std::sync::Arc; use cudarc::driver::{CudaStream, LaunchArgs}; pub use run::create_run_jit_kernel; +pub use type_::CUDAType; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -71,7 +72,7 @@ struct StepIdAllocator { } impl StepIdAllocator { - pub fn get_id(&mut self) -> usize { + pub fn fresh_id(&mut self) -> usize { let id = self.next_id; self.next_id += 1; id @@ -82,12 +83,12 @@ trait GPUVisitor<'a> { fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; } -struct GPUKernelParameter { +pub struct GPUKernelParameter { name: String, type_: String, } -struct GPULaunchConfig { +pub struct GPULaunchConfig { block_width: u32, } diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs index c16f9f4db36..85ce12b92f5 100644 --- a/vortex-gpu/src/jit/run.rs +++ b/vortex-gpu/src/jit/run.rs @@ -13,9 +13,9 @@ use vortex_buffer::BufferMut; use vortex_dtype::match_each_native_ptype; use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_err}; -use crate::jit::convert::handle_array; +use crate::jit::convert::new_jit_array; use crate::jit::kernel_fmt::create_kernel; -use crate::jit::{GPUPipelineJIT, GPUVisitor, StepIdAllocator}; +use crate::jit::{GPUPipelineJIT, GPUVisitor}; pub fn create_run_jit_kernel( ctx: Arc, @@ -23,8 +23,7 @@ pub fn create_run_jit_kernel( ) -> VortexResult<(ArrayRef, Duration)> { let stream = ctx.default_stream(); - let mut allocator = StepIdAllocator::new(); - let output = handle_array(array, &stream, &mut allocator); + let output = new_jit_array(array, &stream); let kernel = create_kernel(ctx.clone(), output.as_ref())?; let num_chunks = From dcddbbbbcb5a2bc81f1bd089f476a03cf1c93881 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 16:26:55 -0400 Subject: [PATCH 08/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/kernels/fused_bitpack_for.cu | 22 ++-------------------- vortex-gpu/src/for_bp.rs | 6 +++--- vortex-gpu/src/task.rs | 2 -- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/vortex-gpu/kernels/fused_bitpack_for.cu b/vortex-gpu/kernels/fused_bitpack_for.cu index 0ba14fedbe5..4fd4bc48149 100644 --- a/vortex-gpu/kernels/fused_bitpack_for.cu +++ b/vortex-gpu/kernels/fused_bitpack_for.cu @@ -108,28 +108,13 @@ __device__ __forceinline__ void for_device( } } -template -__device__ __forceinline__ void float_device( - ValueT *__restrict values_in, - float *__restrict values_out, - int thread_idx -) { - auto i = thread_idx; - const int thread_ops = blockDim.x; - - for (auto j = 0; j < thread_ops; j++) { - auto idx = INDEX(j, i); - values_out[idx] = (float)values_in[idx]; - } -} - // Fused kernel: bitpack unpack (3bw) + FoR addition in one pass // This eliminates the intermediate write-to-memory and read-from-memory // by keeping unpacked values in registers/L1 cache and immediately adding the reference extern "C" __global__ void fused_bitpack6_for_u32( const uint32_t *__restrict packed_in, - float *__restrict unpacked_out, + uint32_t *__restrict unpacked_out, uint32_t reference ) { int i = threadIdx.x; @@ -139,16 +124,13 @@ extern "C" __global__ void fused_bitpack6_for_u32( auto out = unpacked_out + (blockIdx.x * 1024); __shared__ uint32_t shared_data[1024]; - __shared__ float shared_data2[1024]; fls_unpack_6bw_32ow_device(in, shared_data, i); for_device(shared_data, reference, i); - float_device(shared_data, shared_data2, i); - for (int i = 0; i < 32; i++) { auto idx = i * 32 + threadIdx.x; - out[idx] = shared_data2[idx]; + out[idx] = shared_data[idx]; } } diff --git a/vortex-gpu/src/for_bp.rs b/vortex-gpu/src/for_bp.rs index f9e7fe6860c..e2b8fb19e40 100644 --- a/vortex-gpu/src/for_bp.rs +++ b/vortex-gpu/src/for_bp.rs @@ -26,7 +26,7 @@ struct FoRBPTask

{ launch_config: LaunchConfig, packed: CudaSlice

, - unpacked: CudaSlice, + unpacked: CudaSlice

, reference: P, len: usize, @@ -53,7 +53,7 @@ pub fn new_task( .map_err(|e| vortex_err!("Failed to copy to device: {e}"))?; let cu_out = unsafe { stream - .alloc::(array.len().next_multiple_of(1024)) + .alloc::(array.len().next_multiple_of(1024)) .map_err(|e| vortex_err!("Failed to allocate stream: {e}"))? }; @@ -101,7 +101,7 @@ impl GPUTask for FoRBPTask

{ fn export_result(&mut self) -> VortexResult { let len = self.len(); - let mut buffer = BufferMut::::with_capacity(len); + let mut buffer = BufferMut::

::with_capacity(len); unsafe { buffer.set_len(len) } self.stream diff --git a/vortex-gpu/src/task.rs b/vortex-gpu/src/task.rs index 1b678e74508..b0bfdd38f03 100644 --- a/vortex-gpu/src/task.rs +++ b/vortex-gpu/src/task.rs @@ -16,6 +16,4 @@ pub trait GPUTask { fn output(&mut self) -> CudaViewMut<'_, u8>; fn len(&self) -> usize; - - // fn jit(&mut self) -> OptBox ; } From c0b4a8d7ab2937161ec1b93602227eff5e8d18fc Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 17:41:04 -0400 Subject: [PATCH 09/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/mod.rs | 63 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index 911de7e0bc7..244897ad4b4 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -1,6 +1,51 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! # JIT Kernel Composition System +//! +//! This module generates CUDA kernels by composing encoding steps into a single fused kernel. +//! +//! ## How Kernels Are Built +//! +//! Each encoding step (BitPack, FoR, ALP) implements `GPUPipelineJIT` and contributes: +//! 1. **Input parameters** - Data passed from host to device (arrays, scalars) +//! 2. **Declarations** - Local variables needed for the step +//! 3. **Kernel body** - The actual computation logic +//! 4. **Output variable** - The result of this step (e.g., `tmp0`, `tmp1`) +//! +//! Steps are composed in a tree structure. For example: `ALP -> FoR -> BitPack` +//! +//! ## Data Flow Between Steps +//! +//! Each step produces an **output variable** that the parent step consumes: +//! +//! ```text +//! BitPack: unpacks data → produces `tmp0` +//! FoR: reads `tmp0` → adds reference → produces `tmp1` = tmp0 + ref0 +//! ALP: reads `tmp1` → scales → produces `tmp2` = tmp1 * f2 * e2 +//! ``` +//! +//! The `output_var()` method returns the variable name (e.g., "tmp2") that subsequent +//! steps or the final output can read from. +//! +//! ## Writing to Final Output +//! +//! Each step computes a value and passes `out_idx` to its continuation: +//! - **out_idx**: The index in the output array where this value should be written +//! - The innermost step calculates `out_idx` based on thread/block layout +//! - Parent steps pass this index through to their continuation function +//! - The root continuation writes: `output[out_idx] = ` +//! +//! Example flow: +//! ```cuda +//! // BitPack calculates out_idx +//! out_idx = INDEX(row, lane); +//! // Then calls continuation with out_idx available +//! // FoR does: tmp1 = tmp0 + ref0; calls its continuation +//! // ALP does: tmp2 = tmp1 * f2 * e2; calls its continuation +//! // Final: output[out_idx] = tmp2; +//! ``` + mod arrays; mod convert; mod kernel_fmt; @@ -19,28 +64,44 @@ use vortex_error::VortexResult; use crate::indent::IndentedWriter; +/// Trait for encoding steps that can be JIT-compiled into a CUDA kernel. +/// +/// Each step contributes a piece of the kernel and specifies its output variable +/// that subsequent steps can read from. pub trait GPUPipelineJIT { + /// Unique identifier for this step (used to generate unique variable names) fn step_id(&self) -> usize; + /// Adds input parameters (e.g., device pointers, scalars) to the kernel signature fn in_params(&self, params: &mut Vec); + /// Adds arguments to the kernel launch (actual values passed at runtime) fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; + /// Writes variable declarations needed by this step fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result; + /// Writes the kernel body for this step. + /// + /// The continuation function `f` should be called after computing this step's output, + /// allowing parent steps to consume the output variable via `output_var()`. fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, ) -> fmt::Result; + /// Returns the output variable name (e.g., "tmp0") that this step produces. + /// Parent steps read this variable to consume the output. fn output_var(&self) -> String; + /// Returns the type of the output variable fn output_type(&self) -> PType; - // always pass the output iteration aligned child last. + /// Visits child steps in the pipeline tree fn children<'a>(&'a self, visitor: &mut dyn GPUVisitor<'a>) -> VortexResult<()>; + /// Returns the launch configuration (block size, etc.) for this kernel fn launch_config(&self) -> GPULaunchConfig; } From 22b1b45fc6d16373988500e4be4435b2adab9b91 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 17:43:03 -0400 Subject: [PATCH 10/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-gpu/Cargo.toml b/vortex-gpu/Cargo.toml index 0a81e79367c..23fbd517361 100644 --- a/vortex-gpu/Cargo.toml +++ b/vortex-gpu/Cargo.toml @@ -16,10 +16,10 @@ publish = false [dependencies] cudarc = { workspace = true, features = ["f16"] } -parking_lot = { workspace = true } itertools = { workspace = true } -vortex-array = { workspace = true } +parking_lot = { workspace = true } vortex-alp = { workspace = true } +vortex-array = { workspace = true } vortex-buffer = { workspace = true, features = ["cuda"] } vortex-dict = { workspace = true } vortex-dtype = { workspace = true } From 060c666b4abd130d83ccb67fba63140b71e0b19f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 14 Oct 2025 17:54:09 -0400 Subject: [PATCH 11/15] wip Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/arrays/alp.rs | 4 ---- vortex-gpu/src/jit/arrays/bitpack.rs | 5 ----- vortex-gpu/src/jit/arrays/for_.rs | 4 ---- vortex-gpu/src/jit/mod.rs | 11 +---------- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs index a254a3e69b3..3d7db15603b 100644 --- a/vortex-gpu/src/jit/arrays/alp.rs +++ b/vortex-gpu/src/jit/arrays/alp.rs @@ -60,10 +60,6 @@ impl ALP { } impl ScalarGPUPipelineJIT for ALP { - fn step_id(&self) -> usize { - self.step_id - } - fn in_params(&self, params: &mut Vec) { params.extend([ GPUKernelParameter { diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs index b93b592aaa9..6276accb4d4 100644 --- a/vortex-gpu/src/jit/arrays/bitpack.rs +++ b/vortex-gpu/src/jit/arrays/bitpack.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt; -use std::fmt::Write; use std::sync::Arc; use cudarc::driver::{CudaSlice, CudaStream, DeviceRepr, LaunchArgs, PushKernelArg}; @@ -47,10 +46,6 @@ pub fn new_jit( } impl GPUPipelineJIT for BitPack

{ - fn step_id(&self) -> usize { - self.step_id - } - fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { name: self.in_var_g(), diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs index f0752d32a0e..1e97c6ae47b 100644 --- a/vortex-gpu/src/jit/arrays/for_.rs +++ b/vortex-gpu/src/jit/arrays/for_.rs @@ -55,10 +55,6 @@ impl

FoR

{ } impl ScalarGPUPipelineJIT for FoR

{ - fn step_id(&self) -> usize { - self.step_id - } - fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { name: self.ref_var(), diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index 244897ad4b4..497543e7f1e 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -69,9 +69,6 @@ use crate::indent::IndentedWriter; /// Each step contributes a piece of the kernel and specifies its output variable /// that subsequent steps can read from. pub trait GPUPipelineJIT { - /// Unique identifier for this step (used to generate unique variable names) - fn step_id(&self) -> usize; - /// Adds input parameters (e.g., device pointers, scalars) to the kernel signature fn in_params(&self, params: &mut Vec); @@ -106,8 +103,6 @@ pub trait GPUPipelineJIT { } pub trait ScalarGPUPipelineJIT { - fn step_id(&self) -> usize; - fn in_params(&self, params: &mut Vec); fn args<'a>(&'a self, stream: &Arc, args: &mut LaunchArgs<'a>) -> VortexResult<()>; @@ -140,7 +135,7 @@ impl StepIdAllocator { } } -trait GPUVisitor<'a> { +pub trait GPUVisitor<'a> { fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; } @@ -158,10 +153,6 @@ struct ScalarGPUPipelineJITNode { } impl GPUPipelineJIT for ScalarGPUPipelineJITNode { - fn step_id(&self) -> usize { - self.inner.step_id() - } - fn in_params(&self, params: &mut Vec) { self.inner.in_params(params) } From ec961836ef322c465b0aa1a644d15ab97751f06e Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 16 Oct 2025 11:57:22 -0400 Subject: [PATCH 12/15] fixup Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/arrays/alp.rs | 27 ++++++++++------ vortex-gpu/src/jit/arrays/bitpack.rs | 48 ++++++++++++++++++++++------ vortex-gpu/src/jit/arrays/for_.rs | 43 ++++++++++++++++--------- vortex-gpu/src/jit/convert.rs | 15 ++++++--- vortex-gpu/src/jit/kernel_fmt.rs | 23 +++++++------ vortex-gpu/src/jit/mod.rs | 37 +++++++++++++-------- vortex-gpu/src/jit/run.rs | 9 ++++-- 7 files changed, 137 insertions(+), 65 deletions(-) diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs index 3d7db15603b..f71d8248341 100644 --- a/vortex-gpu/src/jit/arrays/alp.rs +++ b/vortex-gpu/src/jit/arrays/alp.rs @@ -10,7 +10,7 @@ use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype}; use vortex_dtype::PType; use vortex_error::VortexResult; -use crate::indent::IndentedWriter; +use crate::indent::{IndentedWrite, IndentedWriter}; use crate::jit::convert::handle_array; use crate::jit::{ CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode, @@ -29,9 +29,10 @@ pub fn new_jit( alp: &ALPArray, stream: &Arc, allocator: &mut StepIdAllocator, + output_array: String, ) -> Box { match_each_alp_float_ptype!(alp.ptype(), |A| { - let child = handle_array(alp.encoded(), stream, allocator); + let child = handle_array(alp.encoded(), stream, allocator, output_array); let step_id = allocator.fresh_id(); Box::new(ScalarGPUPipelineJITNode { inner: ALP { @@ -92,25 +93,31 @@ impl ScalarGPUPipelineJIT for ALP { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { + f: &dyn Fn( + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, + ) -> Result, + ) -> Result { self.child - .kernel_body(w, &|w: &mut IndentedWriter<&mut dyn Write>| { + .kernel_body(w, &|w: &mut IndentedWrite, in_: GPUKernelParameter| { + let in_var = in_.name; writeln!( w, - "{out} = ((({type_}){tmp}) * {f}) * {e};", + "{out} = ((({type_}){in_var}) * {f}) * {e};", out = self.tmp_var(), type_ = CUDAType::from(self.float_type), - tmp = self.child.output_var(), f = self.f_var(), e = self.e_var(), )?; - f(w) + f(w, self.output_parameter()) }) } - fn output_var(&self) -> String { - self.tmp_var() + fn output_parameter(&self) -> GPUKernelParameter { + GPUKernelParameter { + name: self.tmp_var(), + type_: CUDAType::from(self.float_type).to_string(), + } } fn output_type(&self) -> PType { diff --git a/vortex-gpu/src/jit/arrays/bitpack.rs b/vortex-gpu/src/jit/arrays/bitpack.rs index 6276accb4d4..da9a4a28fce 100644 --- a/vortex-gpu/src/jit/arrays/bitpack.rs +++ b/vortex-gpu/src/jit/arrays/bitpack.rs @@ -20,12 +20,14 @@ struct BitPack

{ bit_width: u8, output_type: PType, cuda_slice: CudaSlice

, + output_array: String, } pub fn new_jit( bp: &BitPackedArray, stream: &Arc, allocator: &mut StepIdAllocator, + output_array: String, ) -> Box { assert_eq!(bp.offset(), 0); assert!(bp.patches().is_none()); @@ -41,6 +43,7 @@ pub fn new_jit( bit_width: bp.bit_width(), output_type: bp.ptype(), cuda_slice, + output_array, }) }) } @@ -77,7 +80,6 @@ impl GPUPipelineJIT for BitPack

{ writeln!(w, "{output_cuda_type} {};", self.tmp_var())?; writeln!(w, "{uoutput_cuda_type} {};", self.src_var())?; writeln!(w, "{uoutput_cuda_type} {};", self.utmp_var())?; - writeln!(w, "unsigned int out_idx;")?; writeln!(w, "unsigned int lane = threadIdx.x;")?; writeln!( w, @@ -93,8 +95,11 @@ impl GPUPipelineJIT for BitPack

{ fn kernel_body( &self, w: &mut IndentedWrite, - f: &dyn Fn(&mut IndentedWrite) -> fmt::Result, - ) -> fmt::Result { + f: &dyn Fn( + &mut IndentedWrite, + GPUKernelParameter, + ) -> Result, + ) -> Result { let bit_width = self.bit_width as usize; let bits = self.output_type.bit_width(); let in_ = self.in_var_l(); @@ -104,6 +109,10 @@ impl GPUPipelineJIT for BitPack

{ for row in 0..bits { writeln!(w, "out[INDEX({row}, lane)] = zero;")?; } + Ok(GPUKernelParameter { + name: "none".to_string(), + type_: "t_none_".to_string(), + }) } else if bit_width == bits { writeln!(w)?; for row in 0..bits { @@ -112,6 +121,10 @@ impl GPUPipelineJIT for BitPack

{ "out[INDEX({row}, lane)] = {in_}[LANE_COUNT * {row} + lane];", )?; } + Ok(GPUKernelParameter { + name: "none".to_string(), + type_: "t_none_".to_string(), + }) } else { let src = self.src_var(); let utmp = self.utmp_var(); @@ -162,20 +175,37 @@ impl GPUPipelineJIT for BitPack

{ type_ = CUDAType::from(self.output_type), )?; - writeln!(w, "out_idx = INDEX({row}, lane);")?; - f(w)?; + let out = f( + w, + GPUKernelParameter { + name: tmp.to_string(), + type_: "unsigned int".to_string(), + }, + )?; + writeln!( + w, + "{output_a}[INDEX({row}, lane)] = {in_var};", + output_a = self.output_array, + in_var = out.name + )?; writeln!(w)?; } + Ok(GPUKernelParameter { + name: "none___".to_string(), + type_: "t_none_".to_string(), + }) } - Ok(()) } - fn output_var(&self) -> String { - self.tmp_var() + fn output_parameter(&self) -> GPUKernelParameter { + GPUKernelParameter { + name: self.tmp_var(), + type_: CUDAType::from(self.output_type).to_string(), + } } fn output_type(&self) -> PType { - self.output_type + P::PTYPE } fn children(&self, _visitor: &mut dyn GPUVisitor) -> VortexResult<()> { diff --git a/vortex-gpu/src/jit/arrays/for_.rs b/vortex-gpu/src/jit/arrays/for_.rs index 1e97c6ae47b..420f6a0bfcb 100644 --- a/vortex-gpu/src/jit/arrays/for_.rs +++ b/vortex-gpu/src/jit/arrays/for_.rs @@ -27,9 +27,10 @@ pub fn new_jit( for_: &FoRArray, stream: &Arc, allocator: &mut StepIdAllocator, + output_array: String, ) -> Box { match_each_native_ptype!(for_.reference_scalar().as_primitive().ptype(), |P| { - let child = handle_array(for_.encoded(), stream, allocator); + let child = handle_array(for_.encoded(), stream, allocator, output_array); Box::new(ScalarGPUPipelineJITNode { inner: FoR { step_id: allocator.fresh_id(), @@ -58,7 +59,7 @@ impl ScalarGPUPipelineJIT for FoR

{ fn in_params(&self, p: &mut Vec) { p.push(GPUKernelParameter { name: self.ref_var(), - type_: CUDAType::from(self.output_type()).to_string(), + type_: self.output_parameter().type_, }) } @@ -72,28 +73,40 @@ impl ScalarGPUPipelineJIT for FoR

{ } fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result { - let output_cuda_type = CUDAType::from(self.output_type()); - writeln!(w, "{} tmp{};", output_cuda_type, self.step_id)?; + let output_param = self.output_parameter(); + writeln!(w, "{} {};", output_param.type_, output_param.name)?; Ok(()) } fn kernel_body( &self, w: &mut IndentedWrite, - f: &dyn Fn(&mut IndentedWrite) -> fmt::Result, - ) -> fmt::Result { - assert_eq!(self.output_type(), self.child.output_type()); - let in_var = self.child.output_var(); - let out_var = self.tmp_var(); + f: &dyn Fn( + &mut IndentedWrite, + GPUKernelParameter, + ) -> Result, + ) -> Result { + let output_param = self.output_parameter(); + let child_output_type = self.child.output_parameter().type_; + assert_eq!(output_param.type_, child_output_type); let ref_var = self.ref_var(); - self.child.kernel_body(w, &|w: &mut IndentedWrite| { - writeln!(w, "{out_var} = {in_var} + {ref_var};")?; - f(w) - }) + self.child + .kernel_body(w, &move |w: &mut IndentedWrite, in_: GPUKernelParameter| { + let in_var = in_.name; + writeln!( + w, + "{out_var} = {in_var} + {ref_var};", + out_var = output_param.name + )?; + f(w, output_param.clone()) + }) } - fn output_var(&self) -> String { - self.tmp_var() + fn output_parameter(&self) -> GPUKernelParameter { + GPUKernelParameter { + name: self.tmp_var(), + type_: CUDAType::from(P::PTYPE).to_string(), + } } fn output_type(&self) -> PType { diff --git a/vortex-gpu/src/jit/convert.rs b/vortex-gpu/src/jit/convert.rs index 60d0740a1ca..cb08e6814c0 100644 --- a/vortex-gpu/src/jit/convert.rs +++ b/vortex-gpu/src/jit/convert.rs @@ -11,24 +11,29 @@ use vortex_fastlanes::{BitPackedVTable, FoRVTable}; use crate::jit::arrays::{alp, bitpack, for_}; use crate::jit::{GPUPipelineJIT, StepIdAllocator}; -pub fn new_jit_array(a: &ArrayRef, stream: &Arc) -> Box { - handle_array(a, stream, &mut StepIdAllocator::default()) +pub fn new_jit_array( + a: &ArrayRef, + stream: &Arc, + output_array: String, +) -> Box { + handle_array(a, stream, &mut StepIdAllocator::default(), output_array) } pub fn handle_array( a: &ArrayRef, stream: &Arc, allocator: &mut StepIdAllocator, + output_array: String, ) -> Box { if let Some(alp) = a.as_opt::() { - return alp::new_jit(alp, stream, allocator); + return alp::new_jit(alp, stream, allocator, output_array); } if let Some(bp) = a.as_opt::() { - return bitpack::new_jit(bp, stream, allocator); + return bitpack::new_jit(bp, stream, allocator, output_array); }; if let Some(for_) = a.as_opt::() { - return for_::new_jit(for_, stream, allocator); + return for_::new_jit(for_, stream, allocator, output_array); } todo!("unimplemented jit for {}", a.encoding_id()) diff --git a/vortex-gpu/src/jit/kernel_fmt.rs b/vortex-gpu/src/jit/kernel_fmt.rs index d4420ac8002..ce19006468f 100644 --- a/vortex-gpu/src/jit/kernel_fmt.rs +++ b/vortex-gpu/src/jit/kernel_fmt.rs @@ -8,7 +8,6 @@ use cudarc::driver::{CudaContext, CudaFunction}; use vortex_error::{VortexExpect, VortexResult, vortex_err}; use crate::indent::{IndentedWrite, IndentedWriter}; -use crate::jit::type_::CUDAType; use crate::jit::{GPUKernelParameter, GPUPipelineJIT, GPUVisitor}; struct DeclPrinter<'a, 'b: 'a> { @@ -46,11 +45,15 @@ fn collect_in_param(node: &dyn GPUPipelineJIT) -> VortexResult VortexResult<()> { +pub fn create_kernel_str( + w: &mut IndentedWrite, + output: &dyn GPUPipelineJIT, + kernel_out_array: &str, +) -> VortexResult<()> { let mut params = collect_in_param(output)?; params.push(GPUKernelParameter { name: "_output".to_string(), - type_: format!("{} *__restrict__", CUDAType::from(output.output_type())), + type_: format!("{} *__restrict__", output.output_parameter().type_), }); (|| { @@ -74,22 +77,22 @@ pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) -> writeln!( w, "{output_type} *output = _output + (blockIdx.x * 1024);", - output_type = CUDAType::from(output.output_type()) + output_type = output.output_parameter().type_ )?; - writeln!(w, "__shared__ float s_output[1024];")?; + writeln!(w, "__shared__ float {kernel_out_array}[1024];")?; write_kernel_declarations(w, output); writeln!(w)?; - output.kernel_body(w, &|w: &mut IndentedWrite| { - writeln!(w, "s_output[out_idx] = {tmp};", tmp = output.output_var()) + output.kernel_body(w, &|_w: &mut IndentedWrite, in_: GPUKernelParameter| { + Ok(in_) })?; writeln!(w)?; writeln!(w, "for (int i = 0; i < 32; i++) {{")?; w.indent(|w| { writeln!(w, "auto idx = i * 32 + threadIdx.x;")?; - writeln!(w, "output[idx] = s_output[idx];") + writeln!(w, "output[idx] = {kernel_out_array}[idx];") })?; writeln!(w, "}}") })?; @@ -102,13 +105,15 @@ pub fn create_kernel_str(w: &mut IndentedWrite, output: &dyn GPUPipelineJIT) -> pub fn create_kernel( ctx: Arc, array: &dyn GPUPipelineJIT, + kernel_out_array: &str, ) -> VortexResult { let mut s = String::new(); let w = &mut s as &mut dyn Write; let mut ind = IndentedWriter::new(w); let w = &mut ind; - create_kernel_str(w, array).map_err(|e| vortex_err!("jit str cannot fail {e}"))?; + create_kernel_str(w, array, kernel_out_array) + .map_err(|e| vortex_err!("jit str cannot fail {e}"))?; let module = cudarc::nvrtc::compile_ptx(s.clone()).map_err(|e| vortex_err!("compile ptx {e}"))?; diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index 497543e7f1e..27ff0ce377c 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -85,14 +85,15 @@ pub trait GPUPipelineJIT { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result; + f: &dyn Fn( + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, + ) -> Result, + ) -> Result; - /// Returns the output variable name (e.g., "tmp0") that this step produces. - /// Parent steps read this variable to consume the output. - fn output_var(&self) -> String; + /// Returns the name+type of the output variable + fn output_parameter(&self) -> GPUKernelParameter; - /// Returns the type of the output variable fn output_type(&self) -> PType; /// Visits child steps in the pipeline tree @@ -112,10 +113,14 @@ pub trait ScalarGPUPipelineJIT { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result; + f: &dyn Fn( + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, + ) -> Result, + ) -> Result; - fn output_var(&self) -> String; + /// Returns the name+type of the output variable + fn output_parameter(&self) -> GPUKernelParameter; fn output_type(&self) -> PType; @@ -139,6 +144,7 @@ pub trait GPUVisitor<'a> { fn accept(&mut self, node: &'a dyn GPUPipelineJIT) -> VortexResult<()>; } +#[derive(Clone)] pub struct GPUKernelParameter { name: String, type_: String, @@ -168,13 +174,16 @@ impl GPUPipelineJIT for ScalarGPUPipelineJITNode { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn(&mut IndentedWriter<&mut dyn Write>) -> fmt::Result, - ) -> fmt::Result { - self.inner.kernel_body(w, f) + f: &dyn Fn( + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, + ) -> Result, + ) -> Result { + self.inner.kernel_body(w, &f) } - fn output_var(&self) -> String { - self.inner.output_var() + fn output_parameter(&self) -> GPUKernelParameter { + self.inner.output_parameter() } fn output_type(&self) -> PType { diff --git a/vortex-gpu/src/jit/run.rs b/vortex-gpu/src/jit/run.rs index 85ce12b92f5..242fae84bd7 100644 --- a/vortex-gpu/src/jit/run.rs +++ b/vortex-gpu/src/jit/run.rs @@ -23,17 +23,20 @@ pub fn create_run_jit_kernel( ) -> VortexResult<(ArrayRef, Duration)> { let stream = ctx.default_stream(); - let output = new_jit_array(array, &stream); - let kernel = create_kernel(ctx.clone(), output.as_ref())?; + let kernel_output_arr_name = "s_output"; + let output = new_jit_array(array, &stream, kernel_output_arr_name.to_string()); + let kernel = create_kernel(ctx.clone(), output.as_ref(), kernel_output_arr_name)?; let num_chunks = u32::try_from(array.len().div_ceil(1024)).vortex_expect("Too many grid elements"); let mut launch_builder = stream.launch_builder(&kernel); + let config = output.launch_config(); + let launch_config = LaunchConfig { grid_dim: (num_chunks, 1, 1), - block_dim: (output.launch_config().block_width, 1, 1), + block_dim: (config.block_width, 1, 1), shared_mem_bytes: u32::try_from(output.output_type().byte_width()) .vortex_expect("oversized output type byte width") * 1024, From 8fd80c1f9577ece9987286e9b26e29a1b96437c9 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 16 Oct 2025 12:08:00 -0400 Subject: [PATCH 13/15] fixup Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/mod.rs | 51 +++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index 27ff0ce377c..a8ade0de4cd 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -28,23 +28,48 @@ //! The `output_var()` method returns the variable name (e.g., "tmp2") that subsequent //! steps or the final output can read from. //! -//! ## Writing to Final Output +//! ## Final Kernel Result //! -//! Each step computes a value and passes `out_idx` to its continuation: -//! - **out_idx**: The index in the output array where this value should be written -//! - The innermost step calculates `out_idx` based on thread/block layout -//! - Parent steps pass this index through to their continuation function -//! - The root continuation writes: `output[out_idx] = ` +//! The composed kernel computes the final value and writes it to the output array: //! -//! Example flow: //! ```cuda -//! // BitPack calculates out_idx -//! out_idx = INDEX(row, lane); -//! // Then calls continuation with out_idx available -//! // FoR does: tmp1 = tmp0 + ref0; calls its continuation -//! // ALP does: tmp2 = tmp1 * f2 * e2; calls its continuation -//! // Final: output[out_idx] = tmp2; +//! // BitPack unpacks data +//! tmp0 = unpack(...) +//! // FoR adds reference value +//! tmp1 = tmp0 + ref0 +//! // ALP scales the result +//! tmp2 = tmp1 * scale +//! // Final write to output +//! output[out_idx] = tmp2 +//! ``` +//! +//! ## Continuation-Based Composition +//! +//! Each step calls a continuation function after computing its output: +//! - The continuation function receives a `GPUKernelParameter` (the child's output variable) +//! - The step uses this variable to perform its computation +//! - The step then calls its own continuation with its output variable +//! - This creates a chain: innermost → ... → outermost → final write +//! +//! Example flow: +//! ```text +//! BitPack.kernel_body(w, continuation): +//! // Unpacks data +//! tmp0 = unpack(...) +//! continuation(w, GPUKernelParameter{name: "tmp0", type_: "int32_t"}) +//! +//! FoR.kernel_body(w, continuation): +//! child_var = self.child.kernel_body(w, continuation) // Gets "tmp0" +//! tmp1 = child_var + ref0 +//! continuation(w, GPUKernelParameter{name: "tmp1", type_: "int32_t"}) +//! +//! ALP.kernel_body(w, continuation): +//! child_var = self.child.kernel_body(w, continuation) // Gets "tmp1" +//! tmp2 = child_var * scale +//! continuation(w, GPUKernelParameter{name: "tmp2", type_: "float"}) //! ``` +//! +//! The root continuation writes the final result to the output array. mod arrays; mod convert; From 4264275bc0dd76afff10c6807005c073ab297b71 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 16 Oct 2025 12:12:53 -0400 Subject: [PATCH 14/15] fixup Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/arrays/alp.rs | 8 ++++---- vortex-gpu/src/jit/mod.rs | 27 ++++++++++++++------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/vortex-gpu/src/jit/arrays/alp.rs b/vortex-gpu/src/jit/arrays/alp.rs index f71d8248341..6db6ae393c0 100644 --- a/vortex-gpu/src/jit/arrays/alp.rs +++ b/vortex-gpu/src/jit/arrays/alp.rs @@ -17,7 +17,7 @@ use crate::jit::{ StepIdAllocator, }; -struct ALP { +struct Alp { step_id: usize, float_type: PType, child: Box, @@ -35,7 +35,7 @@ pub fn new_jit( let child = handle_array(alp.encoded(), stream, allocator, output_array); let step_id = allocator.fresh_id(); Box::new(ScalarGPUPipelineJITNode { - inner: ALP { + inner: Alp { step_id, float_type: alp.ptype(), child, @@ -46,7 +46,7 @@ pub fn new_jit( }) } -impl ALP { +impl Alp { fn tmp_var(&self) -> String { format!("tmp{}", self.step_id) } @@ -60,7 +60,7 @@ impl ALP { } } -impl ScalarGPUPipelineJIT for ALP { +impl ScalarGPUPipelineJIT for Alp { fn in_params(&self, params: &mut Vec) { params.extend([ GPUKernelParameter { diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index a8ade0de4cd..c74d6ef1f6a 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -89,6 +89,16 @@ use vortex_error::VortexResult; use crate::indent::IndentedWriter; +/// Type alias for the continuation function passed to `kernel_body`. +/// +/// The continuation receives the output parameter from a child step and returns +/// the final output parameter after all parent steps have been applied. +pub type KernelContinuation<'a> = dyn Fn( + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, +) -> Result + + 'a; + /// Trait for encoding steps that can be JIT-compiled into a CUDA kernel. /// /// Each step contributes a piece of the kernel and specifies its output variable @@ -110,10 +120,7 @@ pub trait GPUPipelineJIT { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn( - &mut IndentedWriter<&mut dyn Write>, - GPUKernelParameter, - ) -> Result, + f: &KernelContinuation, ) -> Result; /// Returns the name+type of the output variable @@ -138,10 +145,7 @@ pub trait ScalarGPUPipelineJIT { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn( - &mut IndentedWriter<&mut dyn Write>, - GPUKernelParameter, - ) -> Result, + f: &KernelContinuation, ) -> Result; /// Returns the name+type of the output variable @@ -199,12 +203,9 @@ impl GPUPipelineJIT for ScalarGPUPipelineJITNode { fn kernel_body( &self, w: &mut IndentedWriter<&mut dyn Write>, - f: &dyn Fn( - &mut IndentedWriter<&mut dyn Write>, - GPUKernelParameter, - ) -> Result, + f: &KernelContinuation, ) -> Result { - self.inner.kernel_body(w, &f) + self.inner.kernel_body(w, f) } fn output_parameter(&self) -> GPUKernelParameter { From a81472fa9a6c762709865cd994dc6d35aa9d7f98 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 16 Oct 2025 12:20:54 -0400 Subject: [PATCH 15/15] fixup Signed-off-by: Joe Isaacs --- vortex-gpu/src/jit/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vortex-gpu/src/jit/mod.rs b/vortex-gpu/src/jit/mod.rs index c74d6ef1f6a..4beb334c845 100644 --- a/vortex-gpu/src/jit/mod.rs +++ b/vortex-gpu/src/jit/mod.rs @@ -94,9 +94,9 @@ use crate::indent::IndentedWriter; /// The continuation receives the output parameter from a child step and returns /// the final output parameter after all parent steps have been applied. pub type KernelContinuation<'a> = dyn Fn( - &mut IndentedWriter<&mut dyn Write>, - GPUKernelParameter, -) -> Result + &mut IndentedWriter<&mut dyn Write>, + GPUKernelParameter, + ) -> Result + 'a; /// Trait for encoding steps that can be JIT-compiled into a CUDA kernel.