Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions thirdparty/dietgpu/Makefile.lib
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Makefile for building libdietgpu.so (standalone, no PyTorch dependency)
#
# Usage:
#   make -f Makefile.lib          # build libdietgpu.so
#   make -f Makefile.lib clean    # remove build artifacts
#   make -f Makefile.lib SM=80    # override GPU architecture

CUDA_HOME ?= /usr/local/cuda
NVCC := $(CUDA_HOME)/bin/nvcc
CXX := g++

# GPU architecture (override with: make SM=80)
SM ?= 90

# Directories
DIETGPU := dietgpu
BUILDDIR := build_lib

# Output
LIB := libdietgpu.so

# Source files
CU_SRCS := \
	$(DIETGPU)/ans/GpuANSEncode.cu \
	$(DIETGPU)/ans/GpuANSDecode.cu \
	$(DIETGPU)/ans/GpuANSInfo.cu \
	$(DIETGPU)/float/GpuFloatCompress.cu \
	$(DIETGPU)/float/GpuFloatDecompress.cu \
	$(DIETGPU)/float/GpuFloatInfo.cu

CPP_SRCS := \
	$(DIETGPU)/utils/DeviceUtils.cpp \
	$(DIETGPU)/utils/StackDeviceMemory.cpp

# Object files
CU_OBJS := $(patsubst $(DIETGPU)/%.cu,$(BUILDDIR)/%.o,$(CU_SRCS))
CPP_OBJS := $(patsubst $(DIETGPU)/%.cpp,$(BUILDDIR)/%.o,$(CPP_SRCS))
LIB_OBJS := $(CU_OBJS) $(CPP_OBJS)

# Auto-generated header dependency files (one .d per .o), so that editing a
# .cuh/.h header rebuilds every object that includes it.
DEPS := $(LIB_OBJS:.o=.d)

# -I. resolves #include "dietgpu/ans/..." to ./dietgpu/ans/...
INCLUDES := -I. -I$(CUDA_HOME)/include

# Compiler flags (-MMD -MP write the .d dependency files as a side effect
# of compilation; -MP adds phony targets so deleted headers don't break make)
CXXFLAGS := -O3 -fPIC -std=c++17 -MMD -MP \
	-Wno-deprecated-declarations \
	-Wno-unused-variable \
	-Wno-sign-compare \
	-Wno-reorder \
	-Wno-attributes \
	-Wno-unused-result \
	-Wno-unused-function

# nvcc accepts -MMD directly (CUDA >= 10.2): it emits a .d file next to each
# object while compiling, mirroring the g++ behavior above.
NVCCFLAGS := -O3 -std=c++17 -MMD --compiler-options '-fPIC' \
	-gencode=arch=compute_$(SM),code=sm_$(SM) \
	-gencode=arch=compute_$(SM),code=compute_$(SM) \
	-DDISABLE_AGGRESSIVE_PTX_INSTRS

LDLIBS := -lglog -L$(CUDA_HOME)/lib64 -lcudart

# ---- Targets ----

all: $(LIB)

$(LIB): $(LIB_OBJS)
	$(CXX) -shared -o $@ $^ $(LDLIBS)
	@echo "Built: $@"

$(BUILDDIR)/%.o: $(DIETGPU)/%.cu
	@mkdir -p $(dir $@)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

$(BUILDDIR)/%.o: $(DIETGPU)/%.cpp
	@mkdir -p $(dir $@)
	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@

# Pull in recorded header dependencies from previous builds, if present.
-include $(DEPS)

clean:
	rm -rf $(BUILDDIR) $(LIB)

.PHONY: all clean
4 changes: 2 additions & 2 deletions thirdparty/dietgpu/dietgpu/ans/GpuANSEncode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
#include <vector>

namespace dietgpu {
#if defined(__HIP_PLATFORM_AMD__)
// HIP does not support thrust exec check disabling
#if defined(__HIP_PLATFORM_AMD__) || !defined(__thrust_exec_check_disable__)
// HIP or newer CUDA versions do not support __thrust_exec_check_disable__
#define THRUST_DISABLE_EXEC_CHECK
#else
#define THRUST_DISABLE_EXEC_CHECK __thrust_exec_check_disable__
Expand Down
4 changes: 4 additions & 0 deletions thirdparty/dietgpu/test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
obj/
*.so
test_float_compress
!test_float_compress.cu
55 changes: 55 additions & 0 deletions thirdparty/dietgpu/test/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Makefile for the dietgpu float compression test
#
# Requires libdietgpu.so to be built first:
#   make -f Makefile.lib   (from the parent directory)
#
# Usage:
#   make          # build the test binary
#   make run      # build and run
#   make clean    # remove build artifacts
#   make SM=80    # override GPU architecture

CUDA_HOME ?= /usr/local/cuda
NVCC := $(CUDA_HOME)/bin/nvcc
CXX := g++

SM ?= 90

BUILDDIR := obj
TEST := test_float_compress
LIB := ../libdietgpu.so

# This directory is thirdparty/dietgpu/test/ and the sources use
# #include "dietgpu/float/...", so -I.. resolves those includes
# to ../dietgpu/...
INCLUDES := -I.. -I$(CUDA_HOME)/include

# -MMD (CUDA >= 10.2) emits a .d header-dependency file next to the object,
# so edits to dietgpu headers retrigger compilation of the test.
NVCCFLAGS := -O3 -std=c++17 -MMD --compiler-options '-fPIC' \
	-gencode=arch=compute_$(SM),code=sm_$(SM) \
	-gencode=arch=compute_$(SM),code=compute_$(SM) \
	-DDISABLE_AGGRESSIVE_PTX_INSTRS

LDLIBS := -lglog -L$(CUDA_HOME)/lib64 -lcudart

# ---- Targets ----

all: $(TEST)

$(TEST): $(BUILDDIR)/test_float_compress.o $(LIB)
	$(CXX) -o $@ $< -L.. -ldietgpu $(LDLIBS)
	@echo "Built: $@"

$(BUILDDIR)/test_float_compress.o: test_float_compress.cu
	@mkdir -p $(dir $@)
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@

# Build the library in the parent directory if it is missing.
$(LIB):
	$(MAKE) -C .. -f Makefile.lib SM=$(SM)

# Pull in recorded header dependencies from previous builds, if present.
-include $(BUILDDIR)/test_float_compress.d

clean:
	rm -rf $(BUILDDIR) $(TEST)

run: $(TEST)
	LD_LIBRARY_PATH=..:$(CUDA_HOME)/lib64:$$LD_LIBRARY_PATH ./$(TEST)

.PHONY: all clean run
199 changes: 199 additions & 0 deletions thirdparty/dietgpu/test/test_float_compress.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/**
* Standalone C++ test for dietgpu float compression/decompression.
* Verifies the float codec by:
* 1. Generating random float16 data on GPU
* 2. Compressing it with floatCompress
* 3. Decompressing the result with floatDecompress
* 4. Comparing the decompressed output against the original
*/

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <glog/logging.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <vector>

#include "dietgpu/float/GpuFloatCodec.h"
#include "dietgpu/utils/StackDeviceMemory.h"

using namespace dietgpu;

// Abort the process with a file:line diagnostic if a CUDA runtime call
// fails.  Wrap every cudaXxx() call site; note that asynchronous errors
// surface at the next synchronizing call, which this macro then reports.
#define CHECK_CUDA(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(1); \
} \
} while (0)

// Fill a freshly cudaMalloc'd device buffer with deterministic pseudo-random
// float16 values in [-2, 2].  The caller owns the returned allocation in
// *d_data and must cudaFree it.
void generateTestData(half** d_data, uint32_t numElements) {
  const size_t bytes = numElements * sizeof(half);
  std::vector<half> staging(numElements);

  srand(42);  // fixed seed keeps runs reproducible
  for (uint32_t idx = 0; idx < numElements; idx++) {
    // Normal-ish distribution: values in [-2, 2]
    const float r = ((float)rand() / RAND_MAX) * 4.0f - 2.0f;
    staging[idx] = __float2half(r);
  }

  CHECK_CUDA(cudaMalloc(d_data, bytes));
  CHECK_CUDA(cudaMemcpy(*d_data, staging.data(), bytes,
                        cudaMemcpyHostToDevice));
}

/**
 * Round-trips numElements random float16 values through dietgpu's float
 * codec (compress -> decompress) and verifies the output is bit-exact.
 *
 * Returns true on success.  All device buffers and the stream are released
 * on every path — the previous version leaked them (and the stream) on each
 * early FAIL return — and the compression-ratio print is guarded against a
 * zero-byte compressed size.
 */
bool runTest(uint32_t numElements) {
  printf("=== Testing with %u float16 elements (%.2f KB) ===\n",
         numElements, numElements * sizeof(half) / 1024.0f);

  // 1. Setup
  int device = 0;
  CHECK_CUDA(cudaSetDevice(device));

  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));

  // Allocate 256 MB of temp memory for dietgpu's internal scratch use
  StackDeviceMemory res(device, 256 * 1024 * 1024);

  // 2. Generate input data
  half* d_input = nullptr;
  generateTestData(&d_input, numElements);

  // 3. Allocate all remaining device buffers up front so a single cleanup
  //    block at the end can free them on every exit path.
  uint32_t maxCompSize =
      getMaxFloatCompressedSize(FloatType::kFloat16, numElements);
  void* d_compressed = nullptr;
  CHECK_CUDA(cudaMalloc(&d_compressed, maxCompSize));
  printf("  Max compressed size: %u bytes\n", maxCompSize);

  uint32_t* d_compressedSize = nullptr;
  CHECK_CUDA(cudaMalloc(&d_compressedSize, sizeof(uint32_t)));

  half* d_output = nullptr;
  CHECK_CUDA(cudaMalloc(&d_output, numElements * sizeof(half)));
  CHECK_CUDA(cudaMemset(d_output, 0, numElements * sizeof(half)));

  uint8_t* d_success = nullptr;
  CHECK_CUDA(cudaMalloc(&d_success, sizeof(uint8_t)));

  uint32_t* d_decompSize = nullptr;
  CHECK_CUDA(cudaMalloc(&d_decompSize, sizeof(uint32_t)));

  bool passed = true;

  // 4. Compress (batch of 1)
  const void* inPtrs[1] = {d_input};
  uint32_t inSizes[1] = {numElements};  // in float words, not bytes
  void* outPtrs[1] = {d_compressed};

  FloatCompressConfig compConfig;
  compConfig.floatType = FloatType::kFloat16;
  compConfig.useChecksum = false;
  compConfig.is16ByteAligned = false;

  floatCompress(res, compConfig, 1, inPtrs, inSizes, outPtrs,
                d_compressedSize, stream);
  CHECK_CUDA(cudaStreamSynchronize(stream));

  // Read back the compressed size; guard against division by zero when
  // reporting the ratio.
  uint32_t compressedSize = 0;
  CHECK_CUDA(cudaMemcpy(&compressedSize, d_compressedSize, sizeof(uint32_t),
                        cudaMemcpyDeviceToHost));

  if (compressedSize > 0) {
    float ratio = (float)(numElements * sizeof(half)) / compressedSize;
    printf("  Compressed size: %u bytes (ratio: %.2fx)\n",
           compressedSize, ratio);
  } else {
    printf("  FAIL: Compressed size is 0 bytes\n");
    passed = false;
  }

  // 5. Decompress
  if (passed) {
    const void* decInPtrs[1] = {d_compressed};
    void* decOutPtrs[1] = {d_output};
    uint32_t outCapacities[1] = {numElements};  // in float words

    FloatDecompressConfig decConfig;
    decConfig.floatType = FloatType::kFloat16;
    decConfig.useChecksum = false;
    decConfig.is16ByteAligned = false;

    FloatDecompressStatus status = floatDecompress(
        res, decConfig, 1, decInPtrs, decOutPtrs, outCapacities,
        d_success, d_decompSize, stream);
    CHECK_CUDA(cudaStreamSynchronize(stream));

    if (status.error != FloatDecompressError::None) {
      printf("  FAIL: Decompression returned error\n");
      for (auto& e : status.errorInfo) {
        printf("    Batch %d: %s\n", e.first, e.second.c_str());
      }
      passed = false;
    }
  }

  // Check the per-batch success flag written by the decompressor
  if (passed) {
    uint8_t success = 0;
    CHECK_CUDA(cudaMemcpy(&success, d_success, sizeof(uint8_t),
                          cudaMemcpyDeviceToHost));
    if (!success) {
      printf("  FAIL: Decompression success flag is false\n");
      passed = false;
    }
  }

  // 6. Verify: bitwise-compare original vs decompressed on the host
  if (passed) {
    std::vector<half> origHost(numElements);
    std::vector<half> decompHost(numElements);
    CHECK_CUDA(cudaMemcpy(origHost.data(), d_input,
                          numElements * sizeof(half),
                          cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(decompHost.data(), d_output,
                          numElements * sizeof(half),
                          cudaMemcpyDeviceToHost));

    for (uint32_t i = 0; i < numElements; i++) {
      if (memcmp(&origHost[i], &decompHost[i], sizeof(half)) != 0) {
        printf("  FAIL: Mismatch at index %u: orig=0x%04x, decomp=0x%04x\n",
               i, *(uint16_t*)&origHost[i], *(uint16_t*)&decompHost[i]);
        passed = false;
        break;
      }
    }

    if (passed) {
      printf("  PASS: Decompressed data matches original exactly\n");
    }
  }

  // Cleanup — reached on every path, success or failure
  CHECK_CUDA(cudaFree(d_input));
  CHECK_CUDA(cudaFree(d_compressed));
  CHECK_CUDA(cudaFree(d_compressedSize));
  CHECK_CUDA(cudaFree(d_output));
  CHECK_CUDA(cudaFree(d_success));
  CHECK_CUDA(cudaFree(d_decompSize));
  CHECK_CUDA(cudaStreamDestroy(stream));

  return passed;
}

/**
 * Entry point: verifies a CUDA device exists, reports it, then runs the
 * float-codec round-trip test over several input sizes.  Exit code 0 iff
 * every test passes.
 */
int main(int argc, char** argv) {
  google::InitGoogleLogging(argv[0]);

  int numDevices = 0;
  CHECK_CUDA(cudaGetDeviceCount(&numDevices));
  if (numDevices == 0) {
    fprintf(stderr, "No CUDA devices found\n");
    return 1;
  }

  cudaDeviceProp props;
  CHECK_CUDA(cudaGetDeviceProperties(&props, 0));
  printf("Using GPU: %s (compute %d.%d)\n\n",
         props.name, props.major, props.minor);

  // Test with various sizes — dietgpu works best with >= 512 KiB of data,
  // and the tiny case exercises the small-input path.
  const uint32_t testSizes[] = {
      512 * 1024,        // 512K elements = 1 MB
      128 * 512 * 1024,  // 64M elements = 128 MB
      1024,              // small test (1K elements)
  };

  bool allPassed = true;
  for (uint32_t n : testSizes) {
    allPassed = runTest(n) && allPassed;
  }

  printf("\n%s\n", allPassed ? "ALL TESTS PASSED" : "SOME TESTS FAILED");
  return allPassed ? 0 : 1;
}