Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions vortex-gpu-kernels/src/bit_unpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
writeln!(output)?;
generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
writeln!(output)?;

generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
writeln!(output)?;
}
Expand Down
1 change: 1 addition & 0 deletions vortex-gpu-kernels/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ mod bit_unpack;
mod indent;

pub use bit_unpack::generate_unpack;
pub use indent::IndentedWriter;
3 changes: 2 additions & 1 deletion vortex-gpu/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ version = { workspace = true }
publish = false

[dependencies]
#askama = { workspace = true }
cudarc = { workspace = true, features = ["f16"] }
itertools = { workspace = true }
parking_lot = { workspace = true }
vortex-alp = { workspace = true }
vortex-array = { workspace = true }
vortex-buffer = { workspace = true, features = ["cuda"] }
vortex-dict = { workspace = true }
Expand Down
69 changes: 66 additions & 3 deletions vortex-gpu/benches/gpu_bitunpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_m
use cudarc::driver::CudaContext;
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
use vortex_array::{IntoArray, ToCanonical};
use vortex_alp::{ALPArray, Exponents};
use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical};
use vortex_buffer::BufferMut;
use vortex_dtype::NativePType;
use vortex_error::VortexUnwrap;
use vortex_fastlanes::{BitPackedArray, FoRArray};
use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed};
use vortex_gpu::{
create_run_jit_kernel, cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed,
};

// Data sizes: 1GB, 2.5GB, 5GB, 10GB
// These are approximate sizes in bytes, accounting for bit-packing compression
Expand Down Expand Up @@ -61,6 +64,32 @@ fn make_for_bitpackable_array(len: usize) -> FoRArray {
FoRArray::try_new(bitpacked.into_array(), reference.into()).vortex_unwrap()
}

/// Builds the input array for the JIT benchmark: ALP layered over FoR layered over bit-packing.
///
/// `len` pseudo-random `i32` values in `0..64` (fixed seed 42) are bit-packed at width 6,
/// wrapped in a frame-of-reference array with reference `100`, and finally wrapped in an
/// ALP array with exponents `e = 4`, `f = 5`. The last `ALPArray::try_new` argument is `None`.
///
/// Panics if any of the encoding steps fails.
fn make_alp_array(len: usize) -> ArrayRef {
    let mut rng = StdRng::seed_from_u64(42);
    let reference = 100i32;

    // Every generated value lies in 0..64, so six bits per value are enough.
    let primitive = (0..len)
        .map(|_| rng.random_range(0..64))
        .collect::<BufferMut<i32>>()
        .into_array()
        .to_primitive();

    // Innermost layer: bit-pack the raw values at width 6.
    let packed = BitPackedArray::encode(primitive.as_ref(), 6).unwrap();

    // Middle layer: frame-of-reference over the bit-packed child.
    let frame_of_reference = FoRArray::try_new(packed.into_array(), reference.into())
        .vortex_unwrap()
        .into_array();

    // Outermost layer: ALP over the FoR child.
    let exponents = Exponents { e: 4, f: 5 };
    ALPArray::try_new(frame_of_reference, exponents, None)
        .vortex_unwrap()
        .into_array()
}

fn benchmark_gpu_decompress_kernel_only(c: &mut Criterion) {
let mut group = c.benchmark_group("gpu_decompress_kernel_only");

Expand Down Expand Up @@ -155,6 +184,37 @@ fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
group.finish();
}

/// Criterion benchmark that runs the JIT kernel over the array produced by `make_alp_array`.
///
/// For each entry of `DATA_SIZES` the length is rounded up to a multiple of 1024, a fresh
/// CUDA context on device 0 is created, and the duration returned by
/// `create_run_jit_kernel` is accumulated across the requested iterations.
fn benchmark_gpu_for_bp_jit_decompress_kernel_only(c: &mut Criterion) {
    let mut group = c.benchmark_group("benchmark_gpu_for_bp_jit_decompress_kernel_only");
    group.sample_size(10);

    for (len, label) in DATA_SIZES {
        let padded_len = len.next_multiple_of(1024);
        let array = make_alp_array(padded_len).into_array();

        let ctx = CudaContext::new(0).unwrap();
        ctx.set_blocking_synchronize().unwrap();

        // Throughput is reported as element count times the byte width of the array's ptype.
        let decoded_bytes = padded_len * array.dtype().as_ptype().byte_width();
        group.throughput(Throughput::Bytes(decoded_bytes as u64));

        group.bench_with_input(BenchmarkId::new("for/jit", label), &array, |b, array| {
            b.iter_custom(|iters| {
                // Only the duration reported by the kernel run is summed; the wall-clock time
                // of the surrounding call is deliberately not measured.
                (0..iters)
                    .map(|_| {
                        let (_result, kernel_time) =
                            create_run_jit_kernel(ctx.clone(), array).unwrap();
                        kernel_time
                    })
                    .sum::<Duration>()
            });
        });
    }

    group.finish();
}

#[allow(dead_code)]
fn benchmark_cpu_canonicalize(c: &mut Criterion) {
let mut group = c.benchmark_group("cpu_canonicalize");
Expand All @@ -176,6 +236,9 @@ criterion_group!(
benches,
benchmark_gpu_decompress_kernel_only,
benchmark_gpu_for_decompress_kernel_only,
benchmark_gpu_for_bp_fused_decompress_kernel_only
benchmark_gpu_for_bp_fused_decompress_kernel_only,
benchmark_gpu_for_bp_jit_decompress_kernel_only
);

// criterion_group!(benches, benchmark_gpu_for_bp_jit_decompress_kernel_only);
criterion_main!(benches);
2 changes: 1 addition & 1 deletion vortex-gpu/src/bit_unpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ pub fn new_task(

let launch_config = LaunchConfig {
grid_dim: (num_chunks, 1, 1),
block_dim: (if size_of::<P>() == 8 { 16 } else { 32 }, 1, 1),
block_dim: (if P::BITS == 64 { 16 } else { 32 }, 1, 1),
shared_mem_bytes: 0,
};

Expand Down
36 changes: 36 additions & 0 deletions vortex-gpu/src/indent.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::fmt;
use std::fmt::Write;

/// Adapter around a [`fmt::Write`] sink that prefixes every formatted write with the
/// current indentation.
///
/// `write!` / `writeln!` invoked on an `IndentedWriter` resolve to the inherent
/// [`IndentedWriter::write_fmt`], so each macro invocation emits the indentation once,
/// followed by the formatted text.
pub struct IndentedWriter<W: Write> {
    write: W,
    indent: String,
}

impl<W: Write> IndentedWriter<W> {
    /// Wraps `write`, starting with no indentation.
    pub fn new(write: W) -> Self {
        Self {
            write,
            indent: String::new(),
        }
    }

    /// Runs `indented` with the indentation extended by one unit.
    ///
    /// The previous indentation is restored before returning, whether the closure
    /// succeeded or returned an error; the closure's result is passed through unchanged.
    pub fn indent<F>(&mut self, indented: F) -> fmt::Result
    where
        F: FnOnce(&mut IndentedWriter<W>) -> fmt::Result,
    {
        let deeper = format!("{} ", self.indent);
        let saved = std::mem::replace(&mut self.indent, deeper);
        let outcome = indented(self);
        self.indent = saved;
        outcome
    }

    /// Writes the current indentation followed by the formatted arguments to the sink.
    pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> fmt::Result {
        self.write
            .write_fmt(format_args!("{}{}", self.indent, fmt))
    }
}

/// An [`IndentedWriter`] over a borrowed, type-erased [`fmt::Write`] sink.
pub type IndentedWrite<'a> = IndentedWriter<&'a mut dyn Write>;
130 changes: 130 additions & 0 deletions vortex-gpu/src/jit/arrays/alp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::fmt;
use std::fmt::Write;
use std::sync::Arc;

use cudarc::driver::{CudaStream, DeviceRepr, LaunchArgs, PushKernelArg};
use vortex_alp::{ALPArray, ALPFloat, match_each_alp_float_ptype};
use vortex_dtype::PType;
use vortex_error::VortexResult;

use crate::indent::{IndentedWrite, IndentedWriter};
use crate::jit::convert::handle_array;
use crate::jit::{
CUDAType, GPUKernelParameter, GPUPipelineJIT, ScalarGPUPipelineJIT, ScalarGPUPipelineJITNode,
StepIdAllocator,
};

/// JIT pipeline step that turns its child's output into an ALP-decoded float.
///
/// The generated statement casts the child's value to `float_type` and multiplies it by the
/// `f` and `e` factors, which are passed to the kernel as scalar parameters.
struct Alp<A: ALPFloat> {
/// Identifier obtained from the `StepIdAllocator`; used to build unique variable names.
step_id: usize,
/// Float ptype of the ALP array; also the type of this step's output variable.
float_type: PType,
/// Pipeline step producing the encoded input consumed by this step.
child: Box<dyn GPUPipelineJIT>,
/// `A::F10` entry selected by the array's `f` exponent (presumably a power of ten — confirm
/// against `vortex_alp`).
f: A,
/// `A::IF10` entry selected by the array's `e` exponent (presumably an inverse power of ten —
/// confirm against `vortex_alp`).
e: A,
}

/// Builds the JIT pipeline node for an ALP array.
///
/// The encoded child of `alp` is converted first via `handle_array`, and only afterwards is
/// this node's step id drawn from `allocator`. The multiplication factors are looked up in
/// `A::F10` / `A::IF10` using the array's exponents, where `A` is the float type selected by
/// `alp.ptype()`.
pub fn new_jit(
    alp: &ALPArray,
    stream: &Arc<CudaStream>,
    allocator: &mut StepIdAllocator,
    output_array: String,
) -> Box<dyn GPUPipelineJIT> {
    match_each_alp_float_ptype!(alp.ptype(), |A| {
        // The child must be built before this node asks for its own id.
        let child = handle_array(alp.encoded(), stream, allocator, output_array);
        let step_id = allocator.fresh_id();

        let f_index = alp.exponents().f as usize;
        let e_index = alp.exponents().e as usize;
        let inner = Alp {
            step_id,
            float_type: alp.ptype(),
            child,
            f: A::F10[f_index],
            e: A::IF10[e_index],
        };
        Box::new(ScalarGPUPipelineJITNode { inner })
    })
}

impl<A: ALPFloat> Alp<A> {
    /// Name of the variable holding this step's decoded value: `tmp<step_id>`.
    fn tmp_var(&self) -> String {
        format!("tmp{id}", id = self.step_id)
    }

    /// Name of the kernel parameter carrying the `e` factor: `e<step_id>`.
    fn e_var(&self) -> String {
        format!("e{id}", id = self.step_id)
    }

    /// Name of the kernel parameter carrying the `f` factor: `f<step_id>`.
    fn f_var(&self) -> String {
        format!("f{id}", id = self.step_id)
    }
}

impl<A: ALPFloat + DeviceRepr> ScalarGPUPipelineJIT for Alp<A> {
    /// Declares the two scalar kernel parameters of this step, `e` first and then `f`.
    ///
    /// The order must match the order in which `args` pushes the values.
    fn in_params(&self, params: &mut Vec<GPUKernelParameter>) {
        let scalar_type = CUDAType::from(A::PTYPE).to_string();
        params.push(GPUKernelParameter {
            name: self.e_var(),
            type_: scalar_type.clone(),
        });
        params.push(GPUKernelParameter {
            name: self.f_var(),
            type_: scalar_type,
        });
    }

    /// Pushes the `e` and `f` factors as launch arguments, in the order declared by `in_params`.
    fn args<'a>(
        &'a self,
        _stream: &Arc<CudaStream>,
        args: &mut LaunchArgs<'a>,
    ) -> VortexResult<()> {
        args.arg(&self.e);
        args.arg(&self.f);
        Ok(())
    }

    /// Emits the declaration of this step's output variable.
    fn decls(&self, w: &mut IndentedWriter<&mut dyn Write>) -> fmt::Result {
        writeln!(
            w,
            "{ty} {var};",
            ty = CUDAType::from(self.float_type),
            var = self.tmp_var(),
        )
    }

    /// Emits the decode statement inside the child's kernel body.
    ///
    /// The child is asked to generate its body with a continuation that writes the
    /// cast-and-multiply statement for the child's output and then hands this step's
    /// output parameter to the caller-supplied continuation `f`.
    fn kernel_body(
        &self,
        w: &mut IndentedWriter<&mut dyn Write>,
        f: &dyn Fn(
            &mut IndentedWriter<&mut dyn Write>,
            GPUKernelParameter,
        ) -> Result<GPUKernelParameter, fmt::Error>,
    ) -> Result<GPUKernelParameter, fmt::Error> {
        let emit_decode = |w: &mut IndentedWrite, in_: GPUKernelParameter| {
            let in_var = in_.name;
            writeln!(
                w,
                "{out} = ((({type_}){in_var}) * {f}) * {e};",
                out = self.tmp_var(),
                type_ = CUDAType::from(self.float_type),
                f = self.f_var(),
                e = self.e_var(),
            )?;
            // Continue with whatever consumes this step's output.
            f(w, self.output_parameter())
        };
        self.child.kernel_body(w, &emit_decode)
    }

    /// Describes the variable that holds this step's result.
    fn output_parameter(&self) -> GPUKernelParameter {
        let name = self.tmp_var();
        let type_ = CUDAType::from(self.float_type).to_string();
        GPUKernelParameter { name, type_ }
    }

    /// The ptype produced by this step, i.e. the ALP array's float type.
    fn output_type(&self) -> PType {
        self.float_type
    }

    /// The step that produces this step's input.
    fn child(&self) -> &dyn GPUPipelineJIT {
        &*self.child
    }
}
Loading
Loading