Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@ fn main() {
println!("cargo:rerun-if-changed=src/hvm.c");
println!("cargo:rerun-if-changed=src/hvm.cu");

match cc::Build::new()
let mut build = cc::Build::new();

if cfg!(target_env = "msvc") {
build.flag("/experimental:c11atomics");
}

match build
.file("src/hvm.c")
.opt_level(3)
.std("c11")
.warnings(false)
.define("TPC_L2", &*tpcl2.to_string())
.try_compile("hvm-c") {
Expand Down Expand Up @@ -38,4 +45,4 @@ fn main() {
println!("cargo:warning=\x1b[1m\x1b[31mWARNING: CUDA compiler not found.\x1b[0m \x1b[1mHVM will not be able to run on GPU.\x1b[0m");
}

}
}
44 changes: 33 additions & 11 deletions src/hvm.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#include <inttypes.h>
#include <math.h>
#include <pthread.h>
#include <threads.h>
#ifdef _WIN32
#include <windows.h>
#endif
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
Expand Down Expand Up @@ -113,9 +116,9 @@ typedef u32 Numb; // Numb ::= 29-bit (rounded up to u32)
#define G_RBAG_LEN (TPC * RLEN)

typedef struct Net {
APair node_buf[G_NODE_LEN]; // global node buffer
APort vars_buf[G_VARS_LEN]; // global vars buffer
APair rbag_buf[G_RBAG_LEN]; // global rbag buffer
APair* node_buf; // global node buffer, size = G_NODE_LEN
APort* vars_buf; // global vars buffer, size = G_VARS_LEN
APair* rbag_buf; // global rbag buffer, size = G_RBAG_LEN
a64 itrs; // interaction count
a32 idle; // idle thread counter
} Net;
Expand Down Expand Up @@ -268,9 +271,11 @@ static inline void swap(Port *a, Port *b) {
Port x = *a; *a = *b; *b = x;
}

#ifndef _WIN32
u32 min(u32 a, u32 b) {
return (a < b) ? a : b;
}
#endif

// A simple spin-wait barrier using atomic operations
a64 a_reached = 0; // number of threads that reached the current barrier
Expand All @@ -284,7 +289,7 @@ void sync_threads() {
} else {
u32 tries = 0;
while (atomic_load_explicit(&a_barrier, memory_order_acquire) == barrier_old) {
sched_yield();
thrd_yield();
}
}
}
Expand All @@ -301,10 +306,22 @@ u32 global_sum(u32 x) {
}

// TODO: write a time64() function that returns the time as fast as possible as a u64
// The time should be in nanoseconds, but not related to UTC time
static inline u64 time64() {

// if not on windows
#ifndef _WIN32
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec;
#else
// @developedby: We dont care about system time, this is just a timer.
LARGE_INTEGER freq;
LARGE_INTEGER counter;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&counter);
return (u64)((counter.QuadPart * 1000000000ULL) / freq.QuadPart);
#endif
}

// Ports / Pairs / Rules
Expand Down Expand Up @@ -645,6 +662,11 @@ static inline void net_init(Net* net) {
// is that needed?
atomic_store(&net->itrs, 0);
atomic_store(&net->idle, 0);

// allocates global buffers
net->node_buf = malloc(G_NODE_LEN * sizeof(APair));
net->vars_buf = malloc(G_VARS_LEN * sizeof(APort));
net->rbag_buf = malloc(G_RBAG_LEN * sizeof(APair));
}

// Allocator
Expand Down Expand Up @@ -1153,7 +1175,7 @@ void evaluator(Net* net, TM* tm, Book* book) {
}

// Chill...
sched_yield();
thrd_yield();
// Halt if all threads are idle
if (tick % 256 == 0) {
if (atomic_load_explicit(&net->idle, memory_order_relaxed) == TPC) {
Expand All @@ -1179,7 +1201,7 @@ typedef struct {
Book* book;
} ThreadArg;

void* thread_func(void* arg) {
thrd_start_t thread_func(void* arg) {
ThreadArg* data = (ThreadArg*)arg;
evaluator(data->net, data->tm, data->book);
return NULL;
Expand All @@ -1203,14 +1225,14 @@ void normalize(Net* net, Book* book) {
}

// Spawns the evaluation threads
pthread_t threads[TPC];
thrd_t threads[TPC];
for (u32 t = 0; t < TPC; ++t) {
pthread_create(&threads[t], NULL, thread_func, &thread_arg[t]);
thrd_create(&threads[t], thread_func, &thread_arg[t]);
}

// Wait for the threads to finish
for (u32 t = 0; t < TPC; ++t) {
pthread_join(threads[t], NULL);
thrd_join(threads[t], NULL);
}
}

Expand Down Expand Up @@ -1532,7 +1554,7 @@ Port io_sleep(Net* net, Book* book, u32 argc, Port* argv) {
struct timespec ts;
ts.tv_sec = dur_ns / 1000000000;
ts.tv_nsec = dur_ns % 1000000000;
nanosleep(&ts, NULL);
thrd_sleep(&ts, NULL);

// Return an eraser
return new_port(ERA, 0);
Expand Down
41 changes: 30 additions & 11 deletions src/hvm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -346,13 +346,13 @@ const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes
struct GNet {
u32 rbag_use_A; // total rbag redex count (buffer A)
u32 rbag_use_B; // total rbag redex count (buffer B)
Pair rbag_buf_A[G_RBAG_LEN]; // global redex bag (buffer A)
Pair rbag_buf_B[G_RBAG_LEN]; // global redex bag (buffer B)
Pair node_buf[G_NODE_LEN]; // global node buffer
Port vars_buf[G_VARS_LEN]; // global vars buffer
u32 node_put[TPB*BPG];
u32 vars_put[TPB*BPG];
u32 rbag_pos[TPB*BPG];
Pair* rbag_buf_A; // global redex bag (buffer A), size = G_RBAG_LEN
Pair* rbag_buf_B; // global redex bag (buffer B), size = G_RBAG_LEN
Pair* node_buf; // global node buffer, size = G_NODE_LEN
Port* vars_buf; // global vars buffer, size = G_VARS_LEN
u32* node_put; // size = TPB*BPG
u32* vars_put; // size = TPB*BPG
u32* rbag_pos; // size = TPB*BPG
u8 mode; // evaluation mode (curr)
u64 itrs; // interaction count
u64 iadd; // interaction count adder
Expand Down Expand Up @@ -1895,10 +1895,29 @@ __global__ void evaluator(GNet* gnet) {
// -------------------

GNet* gnet_create() {
GNet *gnet;
cudaMalloc((void**)&gnet, sizeof(GNet));
cudaMemset(gnet, 0, sizeof(GNet));
return gnet;
GNet gnet;
memset(&gnet, 0, sizeof(GNet));

#define ALLOCATE_HOST_POINTER(__host_pointer, __size) \
do { \
cudaMalloc((void**)&(__host_pointer), __size); \
cudaMemset(__host_pointer, 0, __size); \
} while(0)

ALLOCATE_HOST_POINTER(gnet.rbag_buf_A, G_RBAG_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.rbag_buf_B, G_RBAG_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.node_buf, G_NODE_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.vars_buf, G_VARS_LEN * sizeof(Port));
ALLOCATE_HOST_POINTER(gnet.node_put, BPG * TPB * sizeof(u32));
ALLOCATE_HOST_POINTER(gnet.vars_put, BPG * TPB * sizeof(u32));
ALLOCATE_HOST_POINTER(gnet.rbag_pos, BPG * TPB * sizeof(u32));

#undef ALLOCATE_HOST_POINTER

GNet* gnet_d;
cudaMalloc(&gnet_d, sizeof(GNet));
cudaMemcpy(gnet_d, &gnet, sizeof(GNet), cudaMemcpyHostToDevice);
return gnet_d;
}

u32 gnet_get_rlen(GNet* gnet, u32 turn) {
Expand Down