diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fc651ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/target
+node_modules
+dist
+package-lock.json
+Cargo.lock
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0bc55e0
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,39 @@
+cargo-features = ["edition2024"]
+[package]
+name = "solana-huffman-encoding-challenge"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+crate-type = ["rlib","cdylib"]
+
+[dependencies]
+heapless = "0.8.0"
+hex = "0.4.3"
+pinocchio = "0.8.4"
+pinocchio-pubkey = "0.2.4"
+pinocchio-system = "0.2.3"
+
+[dev-dependencies]
+bincode = "1.3.3"
+serde = { version = "1.0.213", features = ["derive"] }
+assert_matches = "1.5.0"
+arbitrary = { version = "1.4.1", features = ["derive"] }
+mollusk-svm = { version = "=0.0.15", features = ["all-builtins"] }
+mollusk-svm-bencher = { version = "=0.0.15" }
+# mollusk-svm = { version = "=0.1.5", features = ["all-builtins"] }
+proptest = "1.6.0"
+rand = "0.8.5"
+solana-account = { version = "=2.1.10", features = ["bincode"] }
+solana-program = "=2.1.10"
+solana-program-test = "=2.1.10"
+solana-program-runtime = "=2.1.10"
+solana-config-program = "=2.1.10"
+solana-vote-program = "=2.1.10"
+solana-sdk = "=2.1.10"
+
+[features]
+no-entrypoint = []
+std = []
+test-default = ["no-entrypoint", "std"]
+bench-default = ["no-entrypoint", "std"]
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..3fcc0a7
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,254 @@
+#![no_std]
+#![allow(unexpected_cfgs)]
+
+#[cfg(not(feature = "no-entrypoint"))]
+pinocchio_pubkey::declare_id!("ADUtWaDe3cn7V3oskWD7UWkdq9zxc6DcZKHoUH8vWBcD");
+
+use pinocchio::{no_allocator, nostd_panic_handler};
+
+// lazy_program_entrypoint!(process_instruction);
+no_allocator!();
+nostd_panic_handler!();
+
+/// Raw SVM entrypoint.
+///
+/// Assumes the instruction carries zero accounts, so the runtime input buffer
+/// is laid out as: bytes 0..8 = account count, bytes 8..16 = instruction-data
+/// length (LE u64; only the low byte is read, so data must be < 256 bytes).
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn entrypoint(input: *mut u8) -> u8 {
+    let instruction_length = unsafe { *input.add(8) };
+    let instruction_data_start = unsafe { input.add(16) };
+
+
+
+    // Create slice from the pointer and length
+    let instruction_data =
+        unsafe { core::slice::from_raw_parts(instruction_data_start, instruction_length as usize) };
+    let (_decoded_len, _decoded_bytes) = unsafe { huffman_decode_url(instruction_data) };
+    // let res_str = unsafe {
+    //     core::str::from_utf8_unchecked(_decoded_bytes.get_unchecked(.._decoded_len))
+    // };
+    // sol_log(res_str);
+    0
+}
+
+// #[inline(always)]
+// fn process_instruction(context: InstructionContext) -> ProgramResult {
+//     let instruction_data = unsafe { context.instruction_data_unchecked() };
+
+//     let (_decoded_len, _decoded_bytes) = unsafe { huffman_decode_url(instruction_data) };
+//     // let res_str = unsafe {
+//     //     core::str::from_utf8_unchecked(_decoded_bytes.get_unchecked(.._decoded_len))
+//     // };
+//     // sol_log(&res_str);
+
+//     Ok(())
+// }
+
+#[derive(Clone, Copy)]
+#[repr(C)]
+pub struct Node {
+    value: u8,
+    left: u8,
+    right: u8,
+    is_leaf: bool,
+}
+
+impl Node {
+    #[inline(always)]
+    fn new_leaf(value: u8) -> Self {
+        Self {
+            is_leaf: true,
+            value,
+            left: 0,
+            right: 0,
+        }
+    }
+
+    #[inline(always)]
+    fn new_internal(left: u8, right: u8) -> Self {
+        Self {
+            is_leaf: false,
+            value: 0,
+            left,
+            right,
+        }
+    }
+}
+
+/// Decodes Huffman-encoded data laid out as: [0] = original length,
+/// [1..3] = serialized-tree size (LE u16), then the tree, then packed bits.
+///
+/// # Safety
+/// `instruction_data` must be a well-formed payload from the matching encoder
+/// (tests/encoder.rs); all slice/array indexing here is unchecked.
+#[inline(always)]
+pub unsafe fn huffman_decode_url(instruction_data: &[u8]) -> (usize, [u8; 256]) {
+    let mut result = [0u8; 256];
+
+    let original_len = unsafe { *instruction_data.get_unchecked(0) } as usize;
+
+    let encoded_data = unsafe { instruction_data.get_unchecked(1..) };
+
+    // Parse tree size from encoded data
+    let tree_size = unsafe {
+        u16::from_le_bytes([
+            *encoded_data.get_unchecked(0),
+            *encoded_data.get_unchecked(1),
+        ])
+    } as usize;
+
+    let data_start = 2 + tree_size;
+
+    // Build tree iteratively
+    let mut nodes: [Node; 128] = [Node::new_leaf(0); 128];
+    let mut node_count = 0u8;
+    let root_idx = unsafe {
+        build_tree_iterative(
+            encoded_data.get_unchecked(2..2 + tree_size),
+            &mut nodes,
+            &mut node_count,
+        )
+    };
+
+    // Decode bits with target length constraint
+    let mut result_len = 0;
+    let mut current_node = root_idx;
+    let encoded_bits = unsafe { encoded_data.get_unchecked(data_start..) };
+
+    for &byte in encoded_bits {
+        for bit_offset in (0..8).rev() {
+            if result_len >= original_len {
+                break;
+            }
+
+            let bit = (byte >> bit_offset) & 1;
+            let node = unsafe { *nodes.get_unchecked(current_node as usize) };
+
+            if node.is_leaf {
+                if result_len < result.len() {
+                    unsafe {
+                        *result.get_unchecked_mut(result_len) = node.value;
+                    }
+                    result_len += 1;
+                }
+                current_node = root_idx;
+
+                // Process this bit with root if not done
+                if result_len < original_len
+                    && !unsafe { nodes.get_unchecked(root_idx as usize) }.is_leaf
+                {
+                    let root_node = unsafe { *nodes.get_unchecked(root_idx as usize) };
+                    current_node = if bit == 0 {
+                        root_node.left
+                    } else {
+                        root_node.right
+                    };
+                }
+            } else {
+                current_node = if bit == 0 { node.left } else { node.right };
+            }
+        }
+
+        if result_len >= original_len {
+            break;
+        }
+    }
+
+    (result_len, result)
+}
+
+/// Rebuilds the Huffman tree from its pre-order serialization (marker 1 =
+/// leaf followed by the byte value, marker 0 = internal node) and returns
+/// the root's index into `nodes`.
+///
+/// # Safety
+/// `tree_data` must encode at most 128 nodes with internal-node depth <= 16.
+#[inline(always)]
+pub unsafe fn build_tree_iterative(
+    tree_data: &[u8],
+    nodes: &mut [Node; 128],
+    node_count: &mut u8,
+) -> u8 {
+    let mut pos = 0;
+    let mut stack: [u8; 16] = [0; 16];
+    let mut _stack_top = 0;
+
+    // Read first node
+    let node_type = unsafe { *tree_data.get_unchecked(pos) };
+    pos += 1;
+
+    let root_idx = *node_count;
+
+    if node_type == 1 {
+        // Single leaf node case
+        unsafe {
+            *nodes.get_unchecked_mut(*node_count as usize) =
+                Node::new_leaf(*tree_data.get_unchecked(pos));
+        }
+        *node_count += 1;
+        return root_idx;
+    } else {
+        // Internal root node
+        unsafe {
+            *nodes.get_unchecked_mut(*node_count as usize) = Node::new_internal(0, 0);
+            *stack.get_unchecked_mut(0) = *node_count;
+        }
+        _stack_top = 1;
+        *node_count += 1;
+    }
+
+    // Process remaining nodes
+    while pos < tree_data.len() && _stack_top > 0 {
+        let node_type = unsafe { *tree_data.get_unchecked(pos) };
+        pos += 1;
+        let current_idx = *node_count;
+
+        if node_type == 1 {
+            // Leaf node
+            unsafe {
+                *nodes.get_unchecked_mut(current_idx as usize) =
+                    Node::new_leaf(*tree_data.get_unchecked(pos));
+            }
+            pos += 1;
+            *node_count += 1;
+
+            // Attach to parent
+            let parent_idx = unsafe { *stack.get_unchecked(_stack_top - 1) };
+            let parent = unsafe { nodes.get_unchecked_mut(parent_idx as usize) };
+
+            if parent.left == 0 {
+                parent.left = current_idx;
+            } else {
+                parent.right = current_idx;
+                _stack_top -= 1; // Parent complete
+            }
+        } else {
+            // Internal node
+            unsafe {
+                *nodes.get_unchecked_mut(current_idx as usize) = Node::new_internal(0, 0);
+            }
+            *node_count += 1;
+
+            // Attach to parent
+            let parent_idx = unsafe { *stack.get_unchecked(_stack_top - 1) };
+            let parent = unsafe { nodes.get_unchecked_mut(parent_idx as usize) };
+
+            if parent.left == 0 {
+                parent.left = current_idx;
+            } else {
+                parent.right = current_idx;
+                _stack_top -= 1; // Parent complete
+            }
+
+            // Push to stack
+            unsafe {
+                *stack.get_unchecked_mut(_stack_top) = current_idx;
+            }
+            _stack_top += 1;
+        }
+    }
+
+    root_idx
+}
diff --git a/tests/encoder.rs b/tests/encoder.rs
new file mode 100644
index 0000000..077a65f
--- /dev/null
+++ b/tests/encoder.rs
@@ -0,0 +1,177 @@
+use std::collections::{BTreeMap, BinaryHeap, HashMap};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum HuffmanTree {
+    Leaf {
+        freq: u32,
+        byte: u8,
+    },
+    Node {
+        freq: u32,
+        left: Box<HuffmanTree>,
+        right: Box<HuffmanTree>,
+    },
+}
+
+impl HuffmanTree {
+    fn freq(&self) -> u32 {
+        match self {
+            HuffmanTree::Leaf { freq, .. } => *freq,
+            HuffmanTree::Node { freq, .. } => *freq,
+        }
+    }
+}
+
+impl Ord for HuffmanTree {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        other.freq().cmp(&self.freq()) // Reverse for min-heap
+    }
+}
+
+impl PartialOrd for HuffmanTree {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Builds per-byte Huffman codes and the serialized tree the decoder rebuilds.
+pub struct HuffmanEncoder {
+    codes: HashMap<u8, (u32, u8)>, // byte -> (code, bit_length)
+    tree_bytes: Vec<u8>,           // Serialized tree for decoder
+}
+
+impl HuffmanEncoder {
+    pub fn new(input: &str) -> Self {
+        let frequencies = Self::calculate_frequencies(input);
+        let tree = Self::build_tree(&frequencies);
+        let mut codes = HashMap::new();
+
+        if frequencies.len() == 1 {
+            // Special case: single character
+            let byte = frequencies.keys().next().unwrap();
+            codes.insert(*byte, (0, 1));
+        } else {
+            Self::generate_codes(&tree, &mut codes, 0, 0);
+        }
+
+        let tree_bytes = Self::serialize_tree(&tree);
+
+        Self { codes, tree_bytes }
+    }
+
+    fn calculate_frequencies(input: &str) -> BTreeMap<u8, u32> {
+        let mut frequencies = BTreeMap::new();
+        for byte in input.bytes() {
+            *frequencies.entry(byte).or_insert(0) += 1;
+        }
+        frequencies
+    }
+
+    fn build_tree(frequencies: &BTreeMap<u8, u32>) -> HuffmanTree {
+        let mut heap = BinaryHeap::new();
+
+        // Create leaf nodes
+        for (&byte, &freq) in frequencies {
+            heap.push(HuffmanTree::Leaf { freq, byte });
+        }
+
+        // Build tree
+        while heap.len() > 1 {
+            let right = heap.pop().unwrap();
+            let left = heap.pop().unwrap();
+            let freq = left.freq() + right.freq();
+            heap.push(HuffmanTree::Node {
+                freq,
+                left: Box::new(left),
+                right: Box::new(right),
+            });
+        }
+
+        heap.pop().unwrap()
+    }
+
+    fn generate_codes(
+        tree: &HuffmanTree,
+        codes: &mut HashMap<u8, (u32, u8)>,
+        code: u32,
+        depth: u8,
+    ) {
+        match tree {
+            HuffmanTree::Leaf { byte, .. } => {
+                codes.insert(*byte, (code, depth.max(1)));
+            }
+            HuffmanTree::Node { left, right, .. } => {
+                Self::generate_codes(left, codes, code << 1, depth + 1);
+                Self::generate_codes(right, codes, (code << 1) | 1, depth + 1);
+            }
+        }
+    }
+
+    fn serialize_tree(tree: &HuffmanTree) -> Vec<u8> {
+        let mut bytes = Vec::new();
+        Self::serialize_tree_recursive(tree, &mut bytes);
+        bytes
+    }
+
+    fn serialize_tree_recursive(tree: &HuffmanTree, bytes: &mut Vec<u8>) {
+        match tree {
+            HuffmanTree::Leaf { byte, .. } => {
+                bytes.push(1); // Leaf marker
+                bytes.push(*byte);
+            }
+            HuffmanTree::Node { left, right, .. } => {
+                bytes.push(0); // Internal node marker
+                Self::serialize_tree_recursive(left, bytes);
+                Self::serialize_tree_recursive(right, bytes);
+            }
+        }
+    }
+
+    pub fn encode(&self, input: &str) -> Vec<u8> {
+        let mut result = Vec::new();
+        let mut current_byte = 0u8;
+        let mut bit_count = 0u8;
+
+        // First, write the tree
+        result.extend_from_slice(&(self.tree_bytes.len() as u16).to_le_bytes());
+        result.extend_from_slice(&self.tree_bytes);
+
+        // Then encode the data
+        for byte in input.bytes() {
+            if let Some(&(code, bit_length)) = self.codes.get(&byte) {
+                for i in (0..bit_length).rev() {
+                    let bit = ((code >> i) & 1) as u8;
+                    current_byte |= bit << (7 - bit_count);
+                    bit_count += 1;
+
+                    if bit_count == 8 {
+                        result.push(current_byte);
+                        current_byte = 0;
+                        bit_count = 0;
+                    }
+                }
+            }
+        }
+
+        // Push remaining bits
+        if bit_count > 0 {
+            result.push(current_byte);
+        }
+
+        result
+    }
+}
+
+/// Encodes `url` as [tree_len: u16 LE][pre-order tree][MSB-first code bits].
+pub fn huffman_encode_url(url: &str) -> Vec<u8> {
+    let encoder = HuffmanEncoder::new(url);
+    encoder.encode(url)
+}
+
+
+// fn main() {
+//     let url = "https://a-really-long-url-that-probably-would-be-so-hard-to-actually-use-but-whatever.com";
+//     println!("Original URL: {}", url.len());
+//     let encoded = huffman_encode_url(url);
+//     println!("Encoded URL: {:?}", encoded);
+// }
\ No newline at end of file
diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs
new file mode 100644
index 0000000..fa6d859
--- /dev/null
+++ b/tests/unit_tests.rs
@@ -0,0 +1,87 @@
+use mollusk_svm::Mollusk;
+use solana_sdk::pubkey::Pubkey;
+extern crate alloc;
+use alloc::vec;
+pub mod encoder;
+use std::fs::File;
+use std::io::Write;
+
+use crate::encoder::huffman_encode_url;
+
+pub const PROGRAM: Pubkey = Pubkey::new_from_array(solana_huffman_encoding_challenge::ID);
+
+pub fn mollusk() -> Mollusk {
+    Mollusk::new(&PROGRAM, "target/deploy/solana_huffman_encoding_challenge")
+}
+
+pub fn create_instruction_data(encoded_url: &[u8], original_size: u8) -> Vec<u8> {
+    let mut data = Vec::new();
+    data.extend_from_slice(&original_size.to_le_bytes());
+    data.extend_from_slice(encoded_url);
+    data
+}
+
+#[test]
+pub fn test_all_challenge_urls_and_store_metrics() {
+    let mollusk = mollusk();
+    let test_urls = vec![
+        "http://localhost:3000",
+        "http://subdomain.localhost:3000",
+        "https://localhost.net",
+        "https://google.com",
+        "https://a.a",
+        "https://a.com",
+        "https://git@github.com:username/repo.git",
+        "https://a-really-long-url-that-probably-would-be-so-hard-to-actually-use-but-whatever.com",
+        "https://🦝👀🍹🌏.net",
+        "https://something.yourcooldomain.com?query_param=123&val=true",
+    ];
+
+    // Prepare CSV header
+    let mut report = String::new();
+    report.push_str(
+        "URL,Original Size (bytes),Compressed Size (bytes),Compression Ratio,CU Consumed\n",
+    );
+
+    for test_url in test_urls {
+        let original_size = test_url.len();
+        let encoded_url = huffman_encode_url(test_url);
+        let compressed_size = encoded_url.len();
+        let compression_ratio = (original_size as f64) / (compressed_size as f64);
+
+        let instruction_data = create_instruction_data(&encoded_url, test_url.len() as u8);
+        let ix_accounts = vec![];
+        let ix = solana_sdk::instruction::Instruction::new_with_bytes(
+            PROGRAM,
+            &instruction_data,
+            ix_accounts.clone(),
+        );
+        let tx_accounts = &vec![];
+
+        let result = mollusk.process_and_validate_instruction(
+            &ix,
+            tx_accounts,
+            &[mollusk_svm::result::Check::success()],
+        );
+        assert_eq!(
+            result.program_result,
+            mollusk_svm::result::ProgramResult::Success
+        );
+
+        let cu_consumed = result.compute_units_consumed;
+
+        // Escape commas if necessary and add the record to our report.
+        let record = format!(
+            "{},{},{},{:.2}x,{}\n",
+            test_url, original_size, compressed_size, compression_ratio, cu_consumed
+        );
+        report.push_str(&record);
+    }
+
+    // Store the report in the target directory.
+    // Adjust the path as needed.
+    let report_path = "target/url_metrics.csv";
+    let mut file = File::create(report_path).expect("failed to create url_metrics.csv file");
+    file.write_all(report.as_bytes())
+        .expect("failed to write metrics to file");
+}
\ No newline at end of file