Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions thirdparty/dietgpu/Makefile.lib
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Makefile for building libdietgpu.so (standalone, no PyTorch dependency)
#
# Usage:
#   make -f Makefile.lib          # build libdietgpu.so
#   make -f Makefile.lib clean    # remove build artifacts
#   make -f Makefile.lib SM=80    # override GPU architecture

CUDA_HOME ?= /usr/local/cuda
NVCC := $(CUDA_HOME)/bin/nvcc
CXX := g++

# GPU architecture (override with: make SM=80)
SM ?= 90

# Directories
DIETGPU := dietgpu
BUILDDIR := build_lib

# Output
LIB := libdietgpu.so

# Source files
CU_SRCS := \
	$(DIETGPU)/ans/GpuANSEncode.cu \
	$(DIETGPU)/ans/GpuANSDecode.cu \
	$(DIETGPU)/ans/GpuANSInfo.cu \
	$(DIETGPU)/float/GpuFloatCompress.cu \
	$(DIETGPU)/float/GpuFloatDecompress.cu \
	$(DIETGPU)/float/GpuFloatInfo.cu

CPP_SRCS := \
	$(DIETGPU)/utils/DeviceUtils.cpp \
	$(DIETGPU)/utils/StackDeviceMemory.cpp

# Object files
CU_OBJS := $(patsubst $(DIETGPU)/%.cu,$(BUILDDIR)/%.o,$(CU_SRCS))
CPP_OBJS := $(patsubst $(DIETGPU)/%.cpp,$(BUILDDIR)/%.o,$(CPP_SRCS))
LIB_OBJS := $(CU_OBJS) $(CPP_OBJS)

# Auto-generated header dependency files (one .d per .o), so that editing a
# .cuh/.h header rebuilds every object that includes it.
DEPS := $(LIB_OBJS:.o=.d)

# -I. resolves #include "dietgpu/ans/..." to ./dietgpu/ans/...
INCLUDES := -I. -I$(CUDA_HOME)/include

# Compiler flags (-MMD -MP write the .d dependency files as a side effect
# of compilation; -MP adds phony targets so deleted headers don't break make)
CXXFLAGS := -O3 -fPIC -std=c++17 -MMD -MP \
	-Wno-deprecated-declarations \
	-Wno-unused-variable \
	-Wno-sign-compare \
	-Wno-reorder \
	-Wno-attributes \
	-Wno-unused-result \
	-Wno-unused-function

# nvcc accepts -MMD directly (CUDA >= 10.2): it emits a .d file next to each
# object while compiling, mirroring the g++ behavior above.
NVCCFLAGS := -O3 -std=c++17 -MMD --compiler-options '-fPIC' \
	-gencode=arch=compute_$(SM),code=sm_$(SM) \
	-gencode=arch=compute_$(SM),code=compute_$(SM) \
	-DDISABLE_AGGRESSIVE_PTX_INSTRS

LDLIBS := -lglog -L$(CUDA_HOME)/lib64 -lcudart

# ---- Targets ----

all: $(LIB)

$(LIB): $(LIB_OBJS)
	$(CXX) -shared -o $@ $^ $(LDLIBS)
	@echo "Built: $@"

$(BUILDDIR)/%.o: $(DIETGPU)/%.cu
	@mkdir -p $(dir $@)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

$(BUILDDIR)/%.o: $(DIETGPU)/%.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@

# Pull in recorded header dependencies from previous builds, if present.
-include $(DEPS)

clean:
	rm -rf $(BUILDDIR) $(LIB)

.PHONY: all clean
4 changes: 2 additions & 2 deletions thirdparty/dietgpu/dietgpu/ans/GpuANSEncode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
#include <vector>

namespace dietgpu {
#if defined(__HIP_PLATFORM_AMD__)
// HIP does not support thrust exec check disabling
#if defined(__HIP_PLATFORM_AMD__) || !defined(__thrust_exec_check_disable__)
// HIP or newer CUDA versions do not support __thrust_exec_check_disable__
#define THRUST_DISABLE_EXEC_CHECK
#else
#define THRUST_DISABLE_EXEC_CHECK __thrust_exec_check_disable__
Expand Down
4 changes: 4 additions & 0 deletions thirdparty/dietgpu/test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
obj/
*.so
test_float_compress
!test_float_compress.cu
55 changes: 55 additions & 0 deletions thirdparty/dietgpu/test/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Makefile for the dietgpu float compression test
#
# Requires libdietgpu.so to be built first:
#   make -f Makefile.lib   (from the parent directory)
#
# Usage:
#   make          # build the test binary
#   make run      # build and run
#   make clean    # remove build artifacts
#   make SM=80    # override GPU architecture

CUDA_HOME ?= /usr/local/cuda
NVCC := $(CUDA_HOME)/bin/nvcc
CXX := g++

SM ?= 90

BUILDDIR := obj
TEST := test_float_compress
LIB := ../libdietgpu.so

# This directory is thirdparty/dietgpu/test/ and the sources use
# #include "dietgpu/float/...", so -I.. resolves those includes
# to ../dietgpu/...
INCLUDES := -I.. -I$(CUDA_HOME)/include

# -MMD (CUDA >= 10.2) emits a .d header-dependency file next to the object,
# so edits to dietgpu headers retrigger compilation of the test.
NVCCFLAGS := -O3 -std=c++17 -MMD --compiler-options '-fPIC' \
	-gencode=arch=compute_$(SM),code=sm_$(SM) \
	-gencode=arch=compute_$(SM),code=compute_$(SM) \
	-DDISABLE_AGGRESSIVE_PTX_INSTRS

LDLIBS := -lglog -L$(CUDA_HOME)/lib64 -lcudart

# ---- Targets ----

all: $(TEST)

$(TEST): $(BUILDDIR)/test_float_compress.o $(LIB)
	$(CXX) -o $@ $< -L.. -ldietgpu $(LDLIBS)
	@echo "Built: $@"

$(BUILDDIR)/test_float_compress.o: test_float_compress.cu
	@mkdir -p $(dir $@)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

# Build the library in the parent directory if it is missing.
$(LIB):
	$(MAKE) -C .. -f Makefile.lib SM=$(SM)

# Pull in recorded header dependencies from previous builds, if present.
-include $(BUILDDIR)/test_float_compress.d

clean:
	rm -rf $(BUILDDIR) $(TEST)

run: $(TEST)
	LD_LIBRARY_PATH=..:$(CUDA_HOME)/lib64:$$LD_LIBRARY_PATH ./$(TEST)

.PHONY: all clean run
199 changes: 199 additions & 0 deletions thirdparty/dietgpu/test/test_float_compress.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/**
* Standalone C++ test for dietgpu float compression/decompression.
* Verifies the float codec by:
* 1. Generating random float16 data on GPU
* 2. Compressing it with floatCompress
* 3. Decompressing the result with floatDecompress
* 4. Comparing the decompressed output against the original
*/

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <glog/logging.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <vector>

#include "dietgpu/float/GpuFloatCodec.h"
#include "dietgpu/utils/StackDeviceMemory.h"

using namespace dietgpu;

// Abort the process with a file:line diagnostic if a CUDA runtime call
// fails.  Wrap every cudaXxx() call site; note that asynchronous errors
// surface at the next synchronizing call, which this macro then reports.
#define CHECK_CUDA(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(1); \
} \
} while (0)

// Fill a freshly cudaMalloc'd device buffer with deterministic pseudo-random
// float16 values in [-2, 2].  The caller owns the returned allocation in
// *d_data and must cudaFree it.
void generateTestData(half** d_data, uint32_t numElements) {
  const size_t bytes = numElements * sizeof(half);
  std::vector<half> staging(numElements);

  srand(42);  // fixed seed keeps runs reproducible
  for (uint32_t idx = 0; idx < numElements; idx++) {
    // Normal-ish distribution: values in [-2, 2]
    const float r = ((float)rand() / RAND_MAX) * 4.0f - 2.0f;
    staging[idx] = __float2half(r);
  }

  CHECK_CUDA(cudaMalloc(d_data, bytes));
  CHECK_CUDA(cudaMemcpy(*d_data, staging.data(), bytes,
                        cudaMemcpyHostToDevice));
}

/**
 * Round-trips numElements random float16 values through dietgpu's float
 * codec (compress -> decompress) and verifies the output is bit-exact.
 *
 * Returns true on success.  All device buffers and the stream are released
 * on every path — the previous version leaked them (and the stream) on each
 * early FAIL return — and the compression-ratio print is guarded against a
 * zero-byte compressed size.
 */
bool runTest(uint32_t numElements) {
  printf("=== Testing with %u float16 elements (%.2f KB) ===\n",
         numElements, numElements * sizeof(half) / 1024.0f);

  // 1. Setup
  int device = 0;
  CHECK_CUDA(cudaSetDevice(device));

  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));

  // Allocate 256 MB of temp memory for dietgpu's internal scratch use
  StackDeviceMemory res(device, 256 * 1024 * 1024);

  // 2. Generate input data
  half* d_input = nullptr;
  generateTestData(&d_input, numElements);

  // 3. Allocate all remaining device buffers up front so a single cleanup
  //    block at the end can free them on every exit path.
  uint32_t maxCompSize =
      getMaxFloatCompressedSize(FloatType::kFloat16, numElements);
  void* d_compressed = nullptr;
  CHECK_CUDA(cudaMalloc(&d_compressed, maxCompSize));
  printf("  Max compressed size: %u bytes\n", maxCompSize);

  uint32_t* d_compressedSize = nullptr;
  CHECK_CUDA(cudaMalloc(&d_compressedSize, sizeof(uint32_t)));

  half* d_output = nullptr;
  CHECK_CUDA(cudaMalloc(&d_output, numElements * sizeof(half)));
  CHECK_CUDA(cudaMemset(d_output, 0, numElements * sizeof(half)));

  uint8_t* d_success = nullptr;
  CHECK_CUDA(cudaMalloc(&d_success, sizeof(uint8_t)));

  uint32_t* d_decompSize = nullptr;
  CHECK_CUDA(cudaMalloc(&d_decompSize, sizeof(uint32_t)));

  bool passed = true;

  // 4. Compress (batch of 1)
  const void* inPtrs[1] = {d_input};
  uint32_t inSizes[1] = {numElements};  // in float words, not bytes
  void* outPtrs[1] = {d_compressed};

  FloatCompressConfig compConfig;
  compConfig.floatType = FloatType::kFloat16;
  compConfig.useChecksum = false;
  compConfig.is16ByteAligned = false;

  floatCompress(res, compConfig, 1, inPtrs, inSizes, outPtrs,
                d_compressedSize, stream);
  CHECK_CUDA(cudaStreamSynchronize(stream));

  // Read back the compressed size; guard against division by zero when
  // reporting the ratio.
  uint32_t compressedSize = 0;
  CHECK_CUDA(cudaMemcpy(&compressedSize, d_compressedSize, sizeof(uint32_t),
                        cudaMemcpyDeviceToHost));

  if (compressedSize > 0) {
    float ratio = (float)(numElements * sizeof(half)) / compressedSize;
    printf("  Compressed size: %u bytes (ratio: %.2fx)\n",
           compressedSize, ratio);
  } else {
    printf("  FAIL: Compressed size is 0 bytes\n");
    passed = false;
  }

  // 5. Decompress
  if (passed) {
    const void* decInPtrs[1] = {d_compressed};
    void* decOutPtrs[1] = {d_output};
    uint32_t outCapacities[1] = {numElements};  // in float words

    FloatDecompressConfig decConfig;
    decConfig.floatType = FloatType::kFloat16;
    decConfig.useChecksum = false;
    decConfig.is16ByteAligned = false;

    FloatDecompressStatus status = floatDecompress(
        res, decConfig, 1, decInPtrs, decOutPtrs, outCapacities,
        d_success, d_decompSize, stream);
    CHECK_CUDA(cudaStreamSynchronize(stream));

    if (status.error != FloatDecompressError::None) {
      printf("  FAIL: Decompression returned error\n");
      for (auto& e : status.errorInfo) {
        printf("    Batch %d: %s\n", e.first, e.second.c_str());
      }
      passed = false;
    }
  }

  // Check the per-batch success flag written by the decompressor
  if (passed) {
    uint8_t success = 0;
    CHECK_CUDA(cudaMemcpy(&success, d_success, sizeof(uint8_t),
                          cudaMemcpyDeviceToHost));
    if (!success) {
      printf("  FAIL: Decompression success flag is false\n");
      passed = false;
    }
  }

  // 6. Verify: bitwise-compare original vs decompressed on the host
  if (passed) {
    std::vector<half> origHost(numElements);
    std::vector<half> decompHost(numElements);
    CHECK_CUDA(cudaMemcpy(origHost.data(), d_input,
                          numElements * sizeof(half),
                          cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(decompHost.data(), d_output,
                          numElements * sizeof(half),
                          cudaMemcpyDeviceToHost));

    for (uint32_t i = 0; i < numElements; i++) {
      if (memcmp(&origHost[i], &decompHost[i], sizeof(half)) != 0) {
        printf("  FAIL: Mismatch at index %u: orig=0x%04x, decomp=0x%04x\n",
               i, *(uint16_t*)&origHost[i], *(uint16_t*)&decompHost[i]);
        passed = false;
        break;
      }
    }

    if (passed) {
      printf("  PASS: Decompressed data matches original exactly\n");
    }
  }

  // Cleanup — reached on every path, success or failure
  CHECK_CUDA(cudaFree(d_input));
  CHECK_CUDA(cudaFree(d_compressed));
  CHECK_CUDA(cudaFree(d_compressedSize));
  CHECK_CUDA(cudaFree(d_output));
  CHECK_CUDA(cudaFree(d_success));
  CHECK_CUDA(cudaFree(d_decompSize));
  CHECK_CUDA(cudaStreamDestroy(stream));

  return passed;
}

/**
 * Entry point: verifies a CUDA device exists, reports it, then runs the
 * float-codec round-trip test over several input sizes.  Exit code 0 iff
 * every test passes.
 */
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);

  int numDevices = 0;
  CHECK_CUDA(cudaGetDeviceCount(&numDevices));
  if (numDevices == 0) {
    fprintf(stderr, "No CUDA devices found\n");
    return 1;
  }

  cudaDeviceProp props;
  CHECK_CUDA(cudaGetDeviceProperties(&props, 0));
  printf("Using GPU: %s (compute %d.%d)\n\n",
         props.name, props.major, props.minor);

  // Test with various sizes — dietgpu works best with >= 512 KiB of data,
  // and the tiny case exercises the small-input path.
  const uint32_t testSizes[] = {
      512 * 1024,        // 512K elements = 1 MB
      128 * 512 * 1024,  // 64M elements = 128 MB
      1024,              // small test (1K elements)
  };

  bool allPassed = true;
  for (uint32_t n : testSizes) {
    allPassed = runTest(n) && allPassed;
  }

  printf("\n%s\n", allPassed ? "ALL TESTS PASSED" : "SOME TESTS FAILED");
  return allPassed ? 0 : 1;
}