diff --git a/Cargo.toml b/Cargo.toml
index 41920e80..5c0b2d8c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,10 @@ build = "build.rs"
 description = "A massively parallel, optimal functional runtime in Rust."
 license = "Apache-2.0"
 
+[lib]
+name = "hvm"
+path = "src/lib.rs"
+
 [dependencies]
 TSPL = "0.0.12"
 clap = "4.5.2"
diff --git a/POST.md b/POST.md
deleted file mode 100644
index 7d50612b..00000000
--- a/POST.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# Bend: a high-level parallel language
-
-Bend is a massively parallel, high-level programming language. Unlike existing
-alternatives like CUDA, OpenCL and Metal, which are low-level and limited, Bend
-has the feel and features of a modern language like Python and Haskell. Yet, for
-the first time ever, this language runs directly on GPUs, using
-thousands of cores with 0 annotations, powered by the [HVM2](https://github.com/HigherOrderCO/hvm2).
-
-HVM2 is the successor to HVM1, a 2022 prototype of this concept. Through the
-last year, we put massive effort into polishing, simplifying, and verifying
-HVM's correctness. As a result, we're now finally able to run it smoothly on
-GPUs. It still has many limitations, but it is finally stable, production-ready,
-and a solid foundation for all that is to come. HOC will provide long-term
-support for all features listed on HVM2's whitepaper.
-
-## Using Bend
-
-Using Bend is simple. Just install it with:
-
-//TODO
-
-And then run `bend `.
-
-## Examples
-
-### Parallel Sum
-
-In Bend, any work that can be done in parallel *will* be done in parallel, with
-maximum granularity. For example, in `f(A) + g(B)`, both calls are independent,
-so Bend will execute them in parallel. This gives us a very general way to
-implement parallelism: via recursion. For example, the program below adds all
-numbers from 0 to ~1 billion in a *parallelizable fashion*:
-
-```
-# Sums all numbers from 0 til 2^depth:
-def sum(depth, x):
-  switch depth:
-    case 0:
-      return x
-    case _:
-      fst = sum(depth-1, x*2+0) # adds the fst half
-      snd = sum(depth-1, x*2+1) # adds the snd half
-      return fst + snd
-
-def main:
-  return sum(30, 0)
-```
-
-Here is how it performs:
-
-- CPU, Apple M3 Max, 1 thread: **3.5 minutes**
-
-- CPU, Apple M3 Max, 16 threads: **10.26 seconds**
-
-- GPU, NVIDIA RTX 4090, 32k threads: **1.88 seconds**
-
-Of course, adding numbers recursively isn't the best use case for Bend, as you
-could do it very fast with a low-level sequential loop. But not everything can
-be done that way.
-
-### Parallel Bitonic Sort
-
-The Bitonic Sort algorithm is based on the following network:
-
-![bitonic_sort](https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/BitonicSort1.svg/1686px-BitonicSort1.svg.png)
-
-Basically, numbers "pass through" the network along its 16 horizontal wires.
-Every time they reach two ends of an arrow, they're swapped to ensure that the
-arrow points towards to largest. At the end, all numbers will be sorted!
-
-Implementing a Bitonic Sort in C-like languages can be challenging, as it
-involves coordination of shared memory accesses. A less known fact is that all
-sorting algorithms can be described as a series of tree rotations. Bend supports
-a functional style, allowing one to implement it with just 9 equations:
-
-```
-data Tree = (Leaf val) | (Node fst snd)
-
-// Swaps distant values in parallel; corresponds to a Red Box
-(warp s (Leaf a) (Leaf b)) = (U60.swap (^ (> a b) s) (Leaf a) (Leaf b))
-(warp s (Node a b) (Node c d)) = (join (warp s a c) (warp s b d))
-
-// Rebuilds the warped tree in the original order
-(join (Node a b) (Node c d)) = (Node (Node a c) (Node b d))
-
-// Recursively warps each sub-tree; corresponds to a Blue/Green Box
-(flow s (Leaf a)) = (Leaf a)
-(flow s (Node a b)) = (down s (warp s a b))
-
-// Propagates Flow downwards
-(down s (Leaf a)) = (Leaf a)
-(down s (Node a b)) = (Node (flow s a) (flow s b))
-
-// Bitonic Sort
-(sort s (Leaf a)) = (Leaf a)
-(sort s (Node a b)) = (flow s (Node (sort 0 a) (sort 1 b)))
-```
-
-Unlike the CUDA counterpart, this version of the algorithm is extremely
-high-level, relying on millions of small allocations, tree rotations and
-recursion. It isn't even possible to express it that way in any existing
-GPU-targeting language. Bend is capable of compiling that algorithm, as is,
-to in more than 32 thousand CUDA threads, achieving a near-ideal speedup:
-
-... graphics or numbers here ...
-
-### 3D Graphics
-
-TODO: perhaps write a mendelbrot set renderer in Bend, render with the
-DRAW_IMAGE IO, and measure the time it took, compared to, say, a sequential C
-implementation?
-
-## Limitations
-
-- only 1 dup label
-
-- no lazy mode (loops etc.)
-
-- no loops (TCO off)
-
-- no mutable arrays, only immutable trees
-
-- no immutable strings / buffers / textures
-
-- ... what else ... ?
diff --git a/src/hvm.c b/src/hvm.c
index 44790811..ca113d08 100644
--- a/src/hvm.c
+++ b/src/hvm.c
@@ -390,11 +390,11 @@ static inline u32 get_u24(Numb word) {
 
 // Constructor and getters for I24 (signed 24-bit integer)
 static inline Numb new_i24(i32 val) {
-  return (((u32)val << 4) & 0xFFFFFF) | I24;
+  return (((u32)val & 0xFFFFFF) << 4) | I24;
 }
 
 static inline i32 get_i24(Numb word) {
-  return (((word >> 4) & 0xFFFFFF) << 8) >> 8;
+  return ((i32)((word >> 4) & 0xFFFFFF)) << 8 >> 8;
 }
 
 // Constructor and getters for F24 (24-bit float)
diff --git a/src/hvm.cu b/src/hvm.cu
index 4fb3aa1d..122f5aaf 100644
--- a/src/hvm.cu
+++ b/src/hvm.cu
@@ -694,11 +694,11 @@ __device__ __host__ inline u32 get_u24(Numb word) {
 
 // Constructor and getters for I24 (signed 24-bit integer)
 __device__ __host__ inline Numb new_i24(i32 val) {
-  return (((u32)val << 4) & 0xFFFFFF) | I24;
+  return (((u32)val & 0xFFFFFF) << 4) | I24;
 }
 
 __device__ __host__ inline i32 get_i24(Numb word) {
-  return (((word >> 4) & 0xFFFFFF) << 8) >> 8;
+  return ((i32)((word >> 4) & 0xFFFFFF)) << 8 >> 8;
 }
 
 // Constructor and getters for F24 (24-bit float)
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 00000000..6772f631
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,3 @@
+pub mod ast;
+pub mod cmp;
+pub mod hvm;
diff --git a/src/main.rs b/src/main.rs
index 22dd02bc..3379ef66 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,15 +3,12 @@
 #![allow(unused_variables)]
 
 use clap::{Arg, ArgAction, Command};
+use ::hvm::{ast, cmp, hvm};
 use std::fs;
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Command as SysCommand;
 
-mod ast;
-mod cmp;
-mod hvm;
-
 #[cfg(feature = "c")]
 extern "C" {
   fn hvm_c(book_buffer: *const u32, run_io: bool);
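
Note on the `new_i24`/`get_i24` change above: the old encoder masked *after* shifting, silently dropping the top four bits of the 24-bit payload, and the old getter performed the `<< 8 >> 8` sign-extension trick entirely in unsigned arithmetic, so negative values were never sign-extended. The sketch below is not from the repository; it re-implements both versions in a standalone C program to show the round-trip difference. The `I24` tag value and the `u32`/`i32`/`Numb` aliases are local stand-ins for the definitions in `src/hvm.c` (only the 4-bit-tag / 24-bit-payload layout is assumed), and the arithmetic right shift relies on the usual gcc/clang behavior, just as the patched code does.

```c
// Standalone demonstration (not part of the repo) of the I24 encode/decode fix.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef int32_t  i32;
typedef u32      Numb;

static const u32 I24 = 0x2; // stand-in numeric tag, stored in the low 4 bits

// Old encoder: shifts before masking, so the top 4 bits of the payload are lost.
static Numb new_i24_old(i32 val) { return (((u32)val << 4) & 0xFFFFFF) | I24; }
// Patched encoder: masks to 24 bits first, then moves the payload above the tag.
static Numb new_i24_new(i32 val) { return (((u32)val & 0xFFFFFF) << 4) | I24; }

// Old getter: every operand is unsigned, so `<< 8 >> 8` never sign-extends.
static i32 get_i24_old(Numb word) { return (((word >> 4) & 0xFFFFFF) << 8) >> 8; }
// Patched getter: the cast to i32 makes the right shift arithmetic, copying
// bit 23 (the sign bit of the 24-bit value) into the upper byte.
static i32 get_i24_new(Numb word) { return ((i32)((word >> 4) & 0xFFFFFF)) << 8 >> 8; }

int main(void) {
  i32 samples[] = { 42, 8388607, -1, -1234567, -8388608 };
  for (int i = 0; i < (int)(sizeof samples / sizeof samples[0]); ++i) {
    i32 v = samples[i];
    printf("%10d -> old round-trip %10d, new round-trip %10d\n",
           v, get_i24_old(new_i24_old(v)), get_i24_new(new_i24_new(v)));
    assert(get_i24_new(new_i24_new(v)) == v); // patched pair is lossless on the i24 range
  }
  return 0;
}
```

Compiled with plain `cc` on a typical gcc/clang target, only the patched pair round-trips the full signed 24-bit range; the old pair agrees with its input only for non-negative values that fit in 20 bits. The same reasoning applies verbatim to the CUDA copies of these functions in `src/hvm.cu`.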