
Commit 62bd2dd

Authored Feb 8, 2025
Add files via upload
1 parent 11f181c commit 62bd2dd

8 files changed: +1170, -0 lines
 

Cargo.toml (+34 lines)

@@ -0,0 +1,34 @@
[package]
name = "phpd"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/TechfaneTechnologies/phpd"
documentation = "https://github.com/TechfaneTechnologies/phpd"
repository = "https://github.com/TechfaneTechnologies/phpd"
authors = ["DrJuneMoone <96371033+MooneDrJune@users.noreply.github.com>"]

[[bin]]
name = "process_dummy_data"
path = "src/process_dummy_data.rs"

[[bin]]
name = "generate_dummy_data"
path = "src/generate_dummy_data.rs"

[features]
default = []
parallel = []

[dependencies]
walkdir = "2"
dirs = "6"
rand = "0.9"
csv = "1"
itertools = "0.14"
rayon = "1"
crossbeam-channel = "0.5"
bytes = "1.10"
memmap2 = "0.9"
tempfile = "3"
time = { version = "0.3", features = ["serde", "formatting", "parsing", "macros", "local-offset", "quickcheck"] }

Readme.md (+110 lines)

@@ -0,0 +1,110 @@
# PHPD (Process Hive Partitioned Data)

## Usage Guidelines

To run this program, ensure that Rust is installed on your system. You can download and install Rust from the official website: [Rust Installation Guide](https://www.rust-lang.org/tools/install).

### Installation

Clone the repository using the following command:

```bash
git clone https://github.com/TechfaneTechnologies/phpd.git
```

Then, navigate to the project directory and build the project:

```bash
cd phpd
cargo build --release
```

### Generating Dummy Data

To generate dummy data, execute the following command:

```bash
cargo run --release --bin generate_dummy_data
```

Alternatively, you can run the compiled binary directly:

```bash
./target/release/generate_dummy_data
```

#### Example Terminal Output:

```bash
$ ./target/release/generate_dummy_data
Proceeding with the generation of dummy data for the following instruments: ["BANKNIFTY", "BANKEX", "FINNIFTY", "MIDCPNIFTY", "NIFTY", "NIFTYNXT50", "SENSEX"]
For the year 2024
At directory: /Users/DrJuneMoone/Document/hive_partitioned_data
Successfully generated dummy data at: /Users/DrJuneMoone/Document/hive_partitioned_data
Generated 5502 CSV files across 1841 subfolders, totaling 8.69 GiB
Processing speed: 923.87 MiB per second in 9.63 seconds
```
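
The binary is a thin wrapper over the library's `CsvGenerationConfig::generate_dummy_data` helper, so the same step can also be driven from your own Rust code. The sketch below mirrors what `generate_dummy_data` does; the output directory and the two-instrument list are placeholders you would replace:

```rust
use {phpd::CsvGenerationConfig, std::path::PathBuf};

fn main() -> std::io::Result<()> {
    // Example only: point this at any writable directory.
    let base_path = PathBuf::from("/data/market_data");
    // Passing None instead of Some(...) falls back to the full built-in instrument list.
    let instruments = ["NIFTY", "BANKNIFTY"];
    CsvGenerationConfig::generate_dummy_data(
        2024,                         // year to generate business days for
        Some(instruments.as_slice()), // instruments to generate
        3,                            // CSV files per instrument per trading day
        base_path,                    // root of the hive-partitioned tree
        None,                         // None = default number of rows per CSV
    )?;
    Ok(())
}
```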

### Processing Hive Partitioned Dummy Data

To process the generated hive partitioned dummy data, run the following command:

```bash
cargo run --release --bin process_dummy_data
```

Or execute the binary directly:

```bash
./target/release/process_dummy_data
```

#### Example Terminal Output:

```bash
$ ./target/release/process_dummy_data
Found 7 instruments

Instrument: SENSEX Grouped CSV Files {2: [CsvFile { path: "/Users/DrJuneMoone/Document/hive_partitioned_data/SENSEX/20240101/SENSEX-2.csv", date: "20240101", seq_id: 2 }, CsvFile { path: "/Users/DrJuneMoone/Document/hive_partitioned_data/SENSEX/20240102/SENSEX-2.csv", date: "20240102", seq_id: 2 }, .... ]}

Processing instrument: BANKNIFTY
Found 3 sequence groups
Processing sequence group: 2

Processing file: /Users/DrJuneMoone/Document/hive_partitioned_data/NIFTYNXT50/20240102/NIFTYNXT50-1.csv
Processing file: /Users/DrJuneMoone/Document/hive_partitioned_data/NIFTY/20240102/NIFTY-2.csv
Processing file: /Users/DrJuneMoone/Document/hive_partitioned_data/SENSEX/20240103/SENSEX-2.csv
............
............
Successfully merged dummy data at: /Users/DrJuneMoone/Document/hive_partitioned_data
Generated 21 sequentially merged CSV files, totaling 8.69 GiB
Processed 5502 CSV files across 1841 subfolders, totaling 8.69 GiB
Processing speed: 3.30 GiB per second in 5.27 seconds
```

### Changing the Data Directory

To change the location of the hive partitioned data, modify the `base_path` variable in the source code:

1. **For `generate_dummy_data`**: Edit `src/generate_dummy_data.rs`, lines 18-20.
2. **For `process_dummy_data`**: Edit `src/process_dummy_data.rs`, lines 8-10.

After making the necessary changes, rebuild and run the program to regenerate and process the data with the updated location.

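For reference, both binaries build `base_path` from the home directory. A minimal before/after sketch is shown below; the `default_base_path`/`custom_base_path` helpers and the `/data/market_data` path are hypothetical and only keep the snippet self-contained:

```rust
use {dirs::home_dir, std::path::PathBuf};

// Default used by both binaries (src/generate_dummy_data.rs lines 18-20,
// src/process_dummy_data.rs lines 8-10).
fn default_base_path() -> PathBuf {
    let mut base_path = home_dir().expect("Failed To Get Home Directory Path");
    base_path.push("Document");
    base_path.push("hive_partitioned_data");
    base_path
}

// Example replacement: any absolute path works; "/data/market_data" is a placeholder.
fn custom_base_path() -> PathBuf {
    PathBuf::from("/data/market_data")
}
```
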
### Processing Actual Hive Partitioned Data

To process your actual hive partitioned data, update the `base_path` variable in `src/process_dummy_data.rs` (lines 8-10) and run the following command:

```bash
cargo run --release --bin process_dummy_data
```

Or execute the compiled binary:

```bash
./target/release/process_dummy_data
```
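
The binary is likewise a thin wrapper over the library's `MergeConfig`; if you prefer to drive the merge from your own code, a minimal sketch (with a placeholder data directory) looks like this:

```rust
use phpd::{MergeConfig, ThreadSafeError};

fn main() -> Result<(), ThreadSafeError> {
    // Point the merger at the root of the hive-partitioned tree (placeholder path).
    let mut config = MergeConfig::new("/data/market_data");
    // true (the default) writes one combined CSV per instrument;
    // false writes one merged CSV per sequence id instead.
    config.create_combined = true;
    config.merge_all_instruments()?;
    Ok(())
}
```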

## Example Video

A video demonstrating the processing performance of the program will be available soon.

src/constants.rs (+249 lines)

@@ -0,0 +1,249 @@
pub(crate) const ALL_INSTRUMENTS: [&str; 232] = [
    "AARTIIND",
    "ABB",
    "ABBOTINDIA",
    "ABCAPITAL",
    "ABFRL",
    "ACC",
    "ADANIENSOL",
    "ADANIENT",
    "ADANIGREEN",
    "ADANIPORTS",
    "ALKEM",
    "AMBUJACEM",
    "ANGELONE",
    "APLAPOLLO",
    "APOLLOHOSP",
    "APOLLOTYRE",
    "ASHOKLEY",
    "ASIANPAINT",
    "ASTRAL",
    "ATGL",
    "ATUL",
    "AUBANK",
    "AUROPHARMA",
    "AXISBANK",
    "BAJAJ-AUTO",
    "BAJAJFINSV",
    "BAJFINANCE",
    "BALKRISIND",
    "BANDHANBNK",
    "BANKBARODA",
    "BANKINDIA",
    "BANKNIFTY",
    "BATAINDIA",
    "BEL",
    "BERGEPAINT",
    "BHARATFORG",
    "BHARTIARTL",
    "BHEL",
    "BIOCON",
    "BOSCHLTD",
    "BPCL",
    "BRITANNIA",
    "BSE",
    "BSOFT",
    "CAMS",
    "CANBK",
    "CANFINHOME",
    "CDSL",
    "CESC",
    "CGPOWER",
    "CHAMBLFERT",
    "CHOLAFIN",
    "CIPLA",
    "COALINDIA",
    "COFORGE",
    "COLPAL",
    "CONCOR",
    "COROMANDEL",
    "CROMPTON",
    "CUB",
    "CUMMINSIND",
    "CYIENT",
    "DABUR",
    "DALBHARAT",
    "DEEPAKNTR",
    "DELHIVERY",
    "DIVISLAB",
    "DIXON",
    "DLF",
    "DMART",
    "DRREDDY",
    "EICHERMOT",
    "ESCORTS",
    "EXIDEIND",
    "FEDERALBNK",
    "FINNIFTY",
    "GAIL",
    "GLENMARK",
    "GMRAIRPORT",
    "GNFC",
    "GODREJCP",
    "GODREJPROP",
    "GRANULES",
    "GRASIM",
    "GUJGASLTD",
    "HAL",
    "HAVELLS",
    "HCLTECH",
    "HDFCAMC",
    "HDFCBANK",
    "HDFCLIFE",
    "HEROMOTOCO",
    "HFCL",
    "HINDALCO",
    "HINDCOPPER",
    "HINDPETRO",
    "HINDUNILVR",
    "HUDCO",
    "ICICIBANK",
    "ICICIGI",
    "ICICIPRULI",
    "IDEA",
    "IDFCFIRSTB",
    "IEX",
    "IGL",
    "INDHOTEL",
    "INDIAMART",
    "INDIANB",
    "INDIGO",
    "INDUSINDBK",
    "INDUSTOWER",
    "INFY",
    "IOC",
    "IPCALAB",
    "IRB",
    "IRCTC",
    "IRFC",
    "ITC",
    "JINDALSTEL",
    "JIOFIN",
    "JKCEMENT",
    "JSL",
    "JSWENERGY",
    "JSWSTEEL",
    "JUBLFOOD",
    "KALYANKJIL",
    "KEI",
    "KOTAKBANK",
    "KPITTECH",
    "LALPATHLAB",
    "LAURUSLABS",
    "LICHSGFIN",
    "LICI",
    "LODHA",
    "LT",
    "LTF",
    "LTIM",
    "LTTS",
    "LUPIN",
    "M&M",
    "M&MFIN",
    "MANAPPURAM",
    "MARICO",
    "MARUTI",
    "MAXHEALTH",
    "MCX",
    "METROPOLIS",
    "MFSL",
    "MGL",
    "MIDCPNIFTY",
    "MOTHERSON",
    "MPHASIS",
    "MRF",
    "MUTHOOTFIN",
    "NATIONALUM",
    "NAUKRI",
    "NAVINFLUOR",
    "NBCC",
    "NCC",
    "NESTLEIND",
    "NHPC",
    "NIFTY",
    "NIFTYNXT50",
    "NMDC",
    "NTPC",
    "NYKAA",
    "OBEROIRLTY",
    "OFSS",
    "OIL",
    "ONGC",
    "PAGEIND",
    "PAYTM",
    "PEL",
    "PERSISTENT",
    "PETRONET",
    "PFC",
    "PHOENIXLTD",
    "PIDILITIND",
    "PIIND",
    "PNB",
    "POLICYBZR",
    "POLYCAB",
    "POONAWALLA",
    "POWERGRID",
    "PRESTIGE",
    "PVRINOX",
    "RAMCOCEM",
    "RBLBANK",
    "RECLTD",
    "RELIANCE",
    "SAIL",
    "SBICARD",
    "SBILIFE",
    "SBIN",
    "SHREECEM",
    "SHRIRAMFIN",
    "SIEMENS",
    "SJVN",
    "SOLARINDS",
    "SONACOMS",
    "SRF",
    "SUNPHARMA",
    "SUNTV",
    "SUPREMEIND",
    "SYNGENE",
    "TATACHEM",
    "TATACOMM",
    "TATACONSUM",
    "TATAELXSI",
    "TATAMOTORS",
    "TATAPOWER",
    "TATASTEEL",
    "TCS",
    "TECHM",
    "TIINDIA",
    "TITAN",
    "TORNTPHARM",
    "TORNTPOWER",
    "TRENT",
    "TVSMOTOR",
    "UBL",
    "ULTRACEMCO",
    "UNIONBANK",
    "UNITDSPR",
    "UPL",
    "VBL",
    "VEDL",
    "VOLTAS",
    "WIPRO",
    "YESBANK",
    "ZOMATO",
    "ZYDUSLIFE",
];

pub(crate) const OHLCVOI_HEADER: &[u8; 50] = b"Timestamp,Open,High,Low,Close,Volume,OpenInterest\n";
#[allow(dead_code)]
pub(crate) const OHLCVOI_HEADER_ARRAY: [&str; 7] = [
    "Timestamp",
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "OpenInterest",
];
pub(crate) const BUFFER_SIZE: usize = 64 * 1024; // 64KB
pub(crate) const CHUNK_SIZE: usize = 5_000;
pub(crate) const NSE_OPERATING_TIME_IN_SECONDS: usize = 22_500;

src/generate_dummy_data.rs (+51 lines)

@@ -0,0 +1,51 @@
use {
    dirs::home_dir,
    phpd::{format_size, get_folder_stats, CsvGenerationConfig},
    std::time::Instant,
};

const INSTRUMENTS: [&str; 7] = [
    "BANKNIFTY",
    "BANKEX",
    "FINNIFTY",
    "MIDCPNIFTY",
    "NIFTY",
    "NIFTYNXT50",
    "SENSEX",
];

fn main() -> std::io::Result<()> {
    let mut base_path = home_dir().expect("Failed To Get Home Directory Path");
    base_path.push("Document");
    base_path.push("hive_partitioned_data");
    let base_path_str = base_path.display().to_string();
    let year = 2024;
    let instruments = Some(INSTRUMENTS.as_slice());
    let num_csvs_per_instrument_per_day = 3;
    eprintln!(
        "Proceeding With The Generation of Dummy Data of The Following Instruments: {:?}\nFor The Year 2024\nAt Directory: {}",
        INSTRUMENTS,
        &base_path_str
    );
    let start = Instant::now();
    CsvGenerationConfig::generate_dummy_data(
        year,
        instruments,
        num_csvs_per_instrument_per_day,
        base_path,
        None,
    )?;
    let (csv_count, subfolder_count, total_size) = get_folder_stats(&base_path_str);
    let elapsed_time = start.elapsed().as_secs_f64();
    let processing_speed = (total_size as f64) / elapsed_time;
    eprintln!(
        "Successfully Generated The Dummy Data At Directory: {}\nGenerated {} CSV Files Across {} Subfolders of Total Size {}\nAt The Processing Speed of {} Per Second In {} Seconds",
        &base_path_str,
        csv_count,
        subfolder_count,
        format_size(total_size as f64),
        format_size(processing_speed),
        elapsed_time
    );
    Ok(())
}

src/generate_hive_partitioned_data.rs (+300 lines)

@@ -0,0 +1,300 @@
use {
    crate::constants::{
        ALL_INSTRUMENTS, BUFFER_SIZE, CHUNK_SIZE, NSE_OPERATING_TIME_IN_SECONDS, OHLCVOI_HEADER,
        OHLCVOI_HEADER_ARRAY,
    },
    csv::{ByteRecord, WriterBuilder},
    itertools::izip,
    rand::{rng, Rng},
    rayon::iter::{IntoParallelRefIterator, ParallelIterator},
    std::{
        fs::File,
        io::{BufWriter, Write as _},
        path::{Path, PathBuf},
    },
    time::{
        macros::format_description, Date, Duration, Month, PrimitiveDateTime, Time, UtcOffset,
        Weekday,
    },
};

#[derive(Debug)]
pub struct Ohlcvoi {
    pub timestamp: Vec<String>,
    pub open: Vec<f32>,
    pub high: Vec<f32>,
    pub low: Vec<f32>,
    pub close: Vec<f32>,
    pub volume: Vec<u32>,
    pub oi: Vec<u32>,
}

impl Ohlcvoi {
    pub fn random(date: Date, no_of_seconds: Option<usize>) -> Self {
        let total_points = no_of_seconds.unwrap_or(NSE_OPERATING_TIME_IN_SECONDS);
        let start_time = Time::from_hms(9, 15, 0).unwrap();
        // let end_time = Time::from_hms(15, 30, 0).unwrap();
        let offset = UtcOffset::from_whole_seconds(5 * 3600 + 30 * 60).unwrap();
        let timestamp: Vec<String> = (0..total_points)
            .filter_map(|seconds| {
                let duration = time::Duration::seconds(seconds as i64);
                let datetime = PrimitiveDateTime::new(date, start_time) + duration;
                datetime.assume_offset(offset)
                    .format(format_description!(
                        "[year]-[month]-[day]T[hour]:[minute]:[second][offset_hour sign:mandatory]:[offset_minute]"
                    ))
                    .ok()
            })
            .collect();
        let mut rng = rng();
        let open: Vec<f32> = (0..total_points)
            .map(|_| rng.random_range(100.0..100_000.0))
            .collect();
        let high: Vec<f32> = (0..total_points)
            .map(|_| rng.random_range(100.0..100_000.0))
            .collect();
        let low: Vec<f32> = (0..total_points)
            .map(|_| rng.random_range(100.0..100_000.0))
            .collect();
        let close: Vec<f32> = (0..total_points)
            .map(|_| rng.random_range(100.0..100_000.0))
            .collect();
        let volume: Vec<u32> = (0..total_points)
            .map(|_| rng.random_range(100..1_000_000))
            .collect();
        let oi: Vec<u32> = (0..total_points)
            .map(|_| rng.random_range(100..1_000_000))
            .collect();
        Self {
            timestamp,
            open,
            high,
            low,
            close,
            volume,
            oi,
        }
    }

    #[allow(dead_code)]
    fn write_csv<P: AsRef<Path>>(self, output_path: P) -> std::io::Result<()> {
        let len = self.timestamp.len();
        if ![
            &self.open.len(),
            &self.high.len(),
            &self.low.len(),
            &self.close.len(),
            &self.volume.len(),
            &self.oi.len(),
        ]
        .iter()
        .all(|&x| *x == len)
        {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "All vectors must have the same length",
            ));
        }
        let output_file = File::create(output_path)?;
        let mut writer = csv::WriterBuilder::new()
            .buffer_capacity(BUFFER_SIZE)
            .from_writer(output_file);
        writer.write_record(OHLCVOI_HEADER_ARRAY)?;
        for chunk_start in (0..self.timestamp.len()).step_by(CHUNK_SIZE) {
            eprintln!(
                "Chunk Start {} Chunk End {}, {}",
                chunk_start,
                chunk_start + CHUNK_SIZE,
                self.timestamp.len()
            );
            let chunk_end = (chunk_start + CHUNK_SIZE).min(self.timestamp.len());
            eprintln!("Chunk Start {} Chunk End {}", chunk_start, chunk_end);
            for i in chunk_start..chunk_end {
                writer.write_record([
                    &self.timestamp[i],
                    &self.open[i].to_string(),
                    &self.high[i].to_string(),
                    &self.low[i].to_string(),
                    &self.close[i].to_string(),
                    &self.volume[i].to_string(),
                    &self.oi[i].to_string(),
                ])?;
            }
            writer.flush()?;
        }
        Ok(())
    }

    #[allow(dead_code)]
    fn to_csv<P: AsRef<Path>>(&self, output_path: P) -> std::io::Result<()> {
        let output_file = File::create(output_path)?;
        let mut writer = WriterBuilder::new()
            .buffer_capacity(BUFFER_SIZE)
            .from_writer(output_file);
        writer.write_record(OHLCVOI_HEADER_ARRAY)?;
        let mut record = ByteRecord::with_capacity(128, 7);
        let mut buffer = Vec::with_capacity(128);
        for chunk_start in (0..self.timestamp.len()).step_by(CHUNK_SIZE) {
            let chunk_end = (chunk_start + CHUNK_SIZE).min(self.timestamp.len());
            eprintln!("Chunk Start {} Chunk End {}", chunk_start, chunk_end);
            for i in chunk_start..chunk_end {
                record.clear();
                buffer.clear();
                record.push_field(self.timestamp[i].as_bytes());
                write!(buffer, "{:.2}", self.open[i])?;
                record.push_field(&buffer);
                buffer.clear();
                write!(buffer, "{:.2}", self.high[i])?;
                record.push_field(&buffer);
                buffer.clear();
                write!(buffer, "{:.2}", self.low[i])?;
                record.push_field(&buffer);
                buffer.clear();
                write!(buffer, "{:.2}", self.close[i])?;
                record.push_field(&buffer);
                buffer.clear();
                write!(buffer, "{}", self.volume[i])?;
                record.push_field(&buffer);
                buffer.clear();
                write!(buffer, "{}", self.oi[i])?;
                record.push_field(&buffer);
                writer.write_byte_record(&record)?;
            }
            writer.flush()?;
        }
        Ok(())
    }

    pub fn write_to_csv<P: AsRef<Path>>(&self, output_path: P) -> std::io::Result<()> {
        let output_file = File::create(output_path)?;
        let mut writer = BufWriter::with_capacity(BUFFER_SIZE, output_file);
        writer.write_all(OHLCVOI_HEADER)?;
        for chunk_start in (0..self.timestamp.len()).step_by(CHUNK_SIZE) {
            let chunk_end = (chunk_start + CHUNK_SIZE).min(self.timestamp.len());
            for (timestamp, open, high, low, close, volume, oi) in izip!(
                &self.timestamp[chunk_start..chunk_end],
                &self.open[chunk_start..chunk_end],
                &self.high[chunk_start..chunk_end],
                &self.low[chunk_start..chunk_end],
                &self.close[chunk_start..chunk_end],
                &self.volume[chunk_start..chunk_end],
                &self.oi[chunk_start..chunk_end]
            ) {
                writeln!(
                    writer,
                    "{},{:.2},{:.2},{:.2},{:.2},{},{}",
                    timestamp, open, high, low, close, volume, oi
                )?;
            }
            writer.flush()?;
        }
        writer.flush()?;
        Ok(())
    }

    #[cfg(feature = "parallel")]
    pub fn generate_multiple(dates: &[Date], no_of_seconds: Option<usize>) -> Vec<Self> {
        dates
            .par_iter()
            .map(|&date| Self::random(date, no_of_seconds))
            .collect()
    }
}

pub struct CsvGenerationConfig<'a> {
    pub base_path: PathBuf,
    pub instruments: Option<&'a [&'a str]>,
    pub num_csvs_per_instrument_per_day: usize,
    pub total_points_per_csv: Option<usize>,
}

impl<'a> CsvGenerationConfig<'a> {
    pub fn new(
        base_path: PathBuf,
        instruments: Option<&'a [&'a str]>,
        num_csvs_per_instrument_per_day: usize,
        total_points_per_csv: Option<usize>,
    ) -> Self {
        CsvGenerationConfig {
            base_path,
            instruments,
            num_csvs_per_instrument_per_day,
            total_points_per_csv,
        }
    }

    fn generate_instrument_day_csvs(&self, instrument: &str, date: Date) -> std::io::Result<()> {
        let instrument_dir = self.base_path.join(instrument);
        std::fs::create_dir_all(&instrument_dir)?;
        let date_dir = instrument_dir.join(
            date.format(format_description!("[year][month][day]"))
                .unwrap_or_default(),
        );
        std::fs::create_dir_all(&date_dir)?;
        for seq_no in 0..self.num_csvs_per_instrument_per_day {
            let ohlcvoi = Ohlcvoi::random(date, self.total_points_per_csv);
            let filename = format!("{}-{}.csv", instrument, seq_no);
            let csv_path = date_dir.join(filename);
            ohlcvoi.write_to_csv(&csv_path)?;
        }
        Ok(())
    }

    fn generate_all_instrument_csvs(&self, year: i32) -> std::io::Result<()> {
        let business_days = generate_business_days(year);
        let instruments = self.instruments.unwrap_or(ALL_INSTRUMENTS.as_slice());
        instruments.par_iter().try_for_each(|instrument| {
            business_days
                .par_iter()
                .try_for_each(|&date| self.generate_instrument_day_csvs(instrument, date))
        })?;
        Ok(())
    }

    pub fn generate_dummy_data(
        year: i32,
        instruments: Option<&'a [&'a str]>,
        num_csvs_per_instrument_per_day: usize,
        base_path: PathBuf,
        total_points_per_csv: Option<usize>,
    ) -> std::io::Result<()> {
        let config = CsvGenerationConfig {
            base_path,
            instruments,
            num_csvs_per_instrument_per_day,
            total_points_per_csv,
        };
        config.generate_all_instrument_csvs(year)?;
        Ok(())
    }
}

#[inline]
fn generate_business_days(year: i32) -> Vec<Date> {
    let start_date = Date::from_calendar_date(year, Month::January, 1).unwrap();
    let end_date = Date::from_calendar_date(year, Month::December, 31).unwrap();
    let total_days = (end_date - start_date).whole_days();
    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let estimated_business_days = if is_leap_year { 262 } else { 261 };
    let mut business_days = Vec::with_capacity(estimated_business_days);
    (0..total_days + 1)
        .map(|days| start_date + Duration::days(days))
        .filter(|date| !matches!(date.weekday(), Weekday::Saturday | Weekday::Sunday))
        .for_each(|date| business_days.push(date));
    business_days.shrink_to_fit();
    business_days
}

#[cfg(test)]
mod tests {
    use time::macros::date;

    use super::*;

    #[test]
    fn test_generate_business_days() {
        let twenty_twenty_four_dates = generate_business_days(2024);
        eprintln!("{:?}", twenty_twenty_four_dates);
        assert_eq!(twenty_twenty_four_dates.len(), 262);
    }
}

src/lib.rs (+38 lines)

@@ -0,0 +1,38 @@
mod constants;
pub mod generate_hive_partitioned_data;
pub mod process_hive_partitioned_data;

pub use generate_hive_partitioned_data::*;
pub use process_hive_partitioned_data::*;

use {std::path::Path, walkdir::WalkDir};

pub fn get_folder_stats<P: AsRef<Path>>(path: P) -> (usize, usize, u64) {
    let mut csv_count = 0;
    let mut subfolder_count = 0;
    let mut total_size = 0;

    for entry in WalkDir::new(path.as_ref())
        .into_iter()
        .filter_map(|e| e.ok())
    {
        if entry.file_type().is_dir() && entry.path() != path.as_ref() {
            subfolder_count += 1;
        } else if entry.path().extension().and_then(|s| s.to_str()) == Some("csv") {
            csv_count += 1;
            if let Ok(metadata) = entry.metadata() {
                total_size += metadata.len();
            }
        }
    }

    (csv_count, subfolder_count, total_size)
}

pub fn format_size(size_bytes: f64) -> String {
    if size_bytes >= 1024.0 * 1024.0 * 1024.0 {
        format!("{:.2} GiB", size_bytes / (1024.0 * 1024.0 * 1024.0))
    } else {
        format!("{:.2} MiB", size_bytes / (1024.0 * 1024.0))
    }
}

src/process_dummy_data.rs (+42 lines)

@@ -0,0 +1,42 @@
use {
    dirs::home_dir,
    phpd::{format_size, get_folder_stats, MergeConfig, ThreadSafeError},
    std::time::Instant,
};

fn main() -> Result<(), ThreadSafeError> {
    let mut base_path = home_dir().expect("Failed To Get Home Directory Path");
    base_path.push("Document");
    base_path.push("hive_partitioned_data");
    let base_path_str = base_path.display().to_string();
    let (csv_count, subfolder_count, total_size) = get_folder_stats(&base_path_str);

    let mut config = MergeConfig::new(base_path);

    // Disable combined CSV creation so that sequence-wise
    // merged CSVs are produced instead.
    config.create_combined = false;

    let start = Instant::now();
    config.merge_all_instruments()?;
    let elapsed_time = start.elapsed().as_secs_f64();
    let (new_csv_count, _, new_total_size) = get_folder_stats(&base_path_str);
    let merged_csv_count = new_csv_count - csv_count;
    let merged_csv_size = new_total_size - total_size;
    let processing_speed = (new_total_size as f64) / elapsed_time;
    eprintln!(
        "Successfully Merged The Dummy Data At Directory: {}\nGenerated Sequentially Merged {} CSV Files of Total Size {}",
        &base_path_str,
        merged_csv_count,
        format_size(merged_csv_size as f64)
    );
    eprintln!(
        "From {} CSV Files Across {} Subfolders of Total Size {}\nAt The Processing Speed of {} Per Second In {} Seconds",
        csv_count,
        subfolder_count,
        format_size(total_size as f64),
        format_size(processing_speed),
        elapsed_time
    );
    Ok(())
}

src/process_hive_partitioned_data.rs (+346 lines)

@@ -0,0 +1,346 @@
use {
    bytes::{Bytes, BytesMut},
    crossbeam_channel::{bounded, Receiver, Sender},
    memmap2::MmapOptions,
    rayon::prelude::*,
    std::{
        collections::HashMap,
        error::Error,
        fs::{self, File},
        io::{BufReader, BufWriter, Read, Write},
        path::{Path, PathBuf},
        sync::Arc,
    },
    tempfile::{NamedTempFile, TempDir},
    walkdir::WalkDir,
};

pub type ThreadSafeError = Box<dyn Error + Send + Sync>;

#[derive(Clone)]
pub struct MergeConfig {
    pub base_path: PathBuf,
    pub output_base_path: PathBuf,
    pub chunk_size: usize,
    pub buffer_capacity: usize,
    pub create_combined: bool,
    pub wal_enabled: bool,
    wal: Option<Arc<WalWriter>>,
}

#[derive(Debug, Clone)]
struct CsvFile {
    path: PathBuf,
    date: String,
    seq_id: usize,
}

#[allow(dead_code)]
struct WalWriter {
    wal_dir: TempDir,
    current_file: NamedTempFile,
    wal_path: PathBuf,
}

impl WalWriter {
    pub fn new() -> Result<Self, ThreadSafeError> {
        let wal_dir = tempfile::tempdir()?;
        let current_file = NamedTempFile::new_in(&wal_dir)?;
        let wal_path = current_file.path().to_owned();

        Ok(Self {
            wal_dir,
            current_file,
            wal_path,
        })
    }

    pub fn write(&self, data: &[u8]) -> Result<(), ThreadSafeError> {
        self.current_file.as_file().write_all(data)?;
        Ok(())
    }

    pub fn commit(&self) -> Result<(), ThreadSafeError> {
        self.current_file.as_file().sync_all()?;
        Ok(())
    }
}

impl MergeConfig {
    pub fn new<P: AsRef<Path>>(base_path: P) -> Self {
        MergeConfig {
            base_path: base_path.as_ref().to_path_buf(),
            output_base_path: base_path.as_ref().to_path_buf(),
            chunk_size: 1024 * 1024,          // 1MB chunks
            buffer_capacity: 8 * 1024 * 1024, // 8MB buffer
            create_combined: true,            // Default to creating combined CSV
            wal_enabled: false,
            wal: None,
        }
    }

    pub fn new_with_wal<P: AsRef<Path>>(base_path: P) -> Result<Self, ThreadSafeError> {
        Ok(MergeConfig {
            base_path: base_path.as_ref().to_path_buf(),
            output_base_path: base_path.as_ref().to_path_buf(),
            chunk_size: 1024 * 1024,          // 1MB chunks
            buffer_capacity: 8 * 1024 * 1024, // 8MB buffer
            create_combined: true,            // Default to creating combined CSV
            wal_enabled: true,
            wal: Some(Arc::new(WalWriter::new()?)),
        })
    }

    fn find_instruments(&self) -> Result<Vec<String>, ThreadSafeError> {
        let mut instruments = Vec::new();
        for entry in fs::read_dir(&self.base_path)? {
            let entry = entry?;
            let path = entry.path();
            if path.is_dir() {
                if let Some(instrument_name) = path.file_name() {
                    instruments.push(instrument_name.to_string_lossy().to_string());
                }
            }
        }
        Ok(instruments)
    }

    fn find_grouped_csv_files(
        &self,
        instrument: &str,
    ) -> Result<HashMap<usize, Vec<CsvFile>>, ThreadSafeError> {
        let mut grouped_csv_files: HashMap<usize, Vec<CsvFile>> = HashMap::new();
        for date_dir in WalkDir::new(self.base_path.join(instrument))
            .min_depth(1)
            .max_depth(1)
            .into_iter()
            .filter_map(|e| e.ok())
            .filter(|e| e.file_type().is_dir())
        {
            let date_str = date_dir.file_name().to_string_lossy().to_string();
            for csv_entry in WalkDir::new(date_dir.path())
                .min_depth(1)
                .max_depth(1)
                .into_iter()
                .filter_map(|e| e.ok())
                .filter(|e| {
                    e.file_type().is_file()
                        && e.path()
                            .extension()
                            .map_or_else(|| false, |ext| ext == "csv")
                })
            {
                let filename = csv_entry.file_name().to_string_lossy();
                let seq_id = filename
                    .split('-')
                    .nth(1)
                    .and_then(|s| s.split('.').next())
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(usize::MAX);

                let csv_file = CsvFile {
                    path: csv_entry.path().to_path_buf(),
                    date: date_str.clone(),
                    seq_id,
                };
                grouped_csv_files.entry(seq_id).or_default().push(csv_file);
            }
        }
        for files in grouped_csv_files.values_mut() {
            files.sort_by(|a, b| a.date.cmp(&b.date));
        }
        eprintln!(
            "Instrument: {} Grouped CSV Files {:?}",
            &instrument, grouped_csv_files
        );
        Ok(grouped_csv_files)
    }

    fn process_sequence_group(
        &self,
        files: &[CsvFile],
        tx: Sender<Bytes>,
    ) -> Result<(), ThreadSafeError> {
        for (file_index, csv_file) in files.iter().enumerate() {
            eprintln!("Processing file: {}", csv_file.path.display());
            let file = File::open(&csv_file.path)?;
            if file.metadata()?.len() as usize > self.buffer_capacity * 4 {
                self.process_large_file(&csv_file.path, tx.clone(), file_index)?;
            } else {
                self.process_small_file(&csv_file.path, tx.clone(), file_index)?;
            };
        }
        Ok(())
    }

    fn process_small_file(
        &self,
        path: &PathBuf,
        tx: Sender<Bytes>,
        file_index: usize,
    ) -> Result<(), ThreadSafeError> {
        let file = File::open(path)?;
        let mut reader = BufReader::with_capacity(self.buffer_capacity, file);
        let mut mmap = Vec::new();
        reader.read_to_end(&mut mmap)?;
        if file_index > 0 {
            let mut start_pos = 0;
            while start_pos < mmap.len() && mmap[start_pos] != b'\n' {
                start_pos += 1;
            }
            mmap.drain(..start_pos + 1);
        }
        let mut pos = 0;
        let mut buffer = BytesMut::with_capacity(self.chunk_size);

        while pos < mmap.len() {
            let chunk_end = (pos + self.chunk_size).min(mmap.len());
            let mut actual_end = chunk_end;
            while actual_end > pos && mmap[actual_end - 1] != b'\n' {
                actual_end -= 1;
            }
            if actual_end > pos {
                buffer.extend_from_slice(&mmap[pos..actual_end]);
                tx.send(buffer.split().freeze())?;
                pos = actual_end;
            } else {
                buffer.extend_from_slice(&mmap[pos..chunk_end]);
                pos = chunk_end;
            }
        }
        if !buffer.is_empty() {
            tx.send(buffer.freeze())?;
        }
        Ok(())
    }

    fn process_large_file(
        &self,
        path: &PathBuf,
        tx: Sender<Bytes>,
        file_index: usize,
    ) -> Result<(), ThreadSafeError> {
        let file = File::open(path)?;
        let mmap = unsafe { MmapOptions::new().map(&file)? };
        let mut start_pos = 0;
        if file_index > 0 {
            while start_pos < mmap.len() && mmap[start_pos] != b'\n' {
                start_pos += 1;
            }
            start_pos += 1;
        }
        let mut pos = start_pos;
        let mut buffer = BytesMut::with_capacity(self.chunk_size);
        while pos < mmap.len() {
            let chunk_end = (pos + self.chunk_size).min(mmap.len());
            buffer.extend_from_slice(&mmap[pos..chunk_end]);
            let mut last_newline = buffer.len();
            while last_newline > 0 && buffer[last_newline - 1] != b'\n' {
                last_newline -= 1;
            }
            if last_newline > 0 {
                let chunk = buffer.split_to(last_newline).freeze();
                tx.send(chunk)?;
            }
            pos = chunk_end;
        }
        if !buffer.is_empty() {
            tx.send(buffer.freeze())?;
        }
        Ok(())
    }

    fn merge_instrument_csvs(&self, instrument: &str) -> Result<(), ThreadSafeError> {
        let grouped_csv_files = self.find_grouped_csv_files(instrument)?;
        eprintln!("Processing instrument: {}", instrument);
        eprintln!("Found {} sequence groups", grouped_csv_files.len());
        if self.create_combined {
            self.create_combined_instrument_csv(instrument, &grouped_csv_files)?;
        } else {
            for (seq_id, files) in grouped_csv_files {
                eprintln!("Processing sequence group: {}", seq_id);
                let seq_output_path = self
                    .output_base_path
                    .join(instrument)
                    .join(format!("{}-{}.csv", instrument, seq_id));
                fs::create_dir_all(seq_output_path.parent().unwrap())?;
                let seq_output_file = File::create(&seq_output_path)?;
                let seq_output_writer =
                    BufWriter::with_capacity(self.buffer_capacity, seq_output_file);
                let (tx, rx) = bounded(10); // Bounded channel with some buffer
                let files_arc = Arc::new(files.clone());
                let files_clone = Arc::clone(&files_arc);
                let config_clone = self.clone();
                let handle = std::thread::spawn(move || {
                    if let Err(e) = config_clone.process_sequence_group(&files_clone, tx) {
                        eprintln!("Error processing sequence group {}: {}", seq_id, e);
                    }
                });
                self.write_records(seq_output_writer, rx)?;
                handle.join().map_err(|_| "Thread panicked")?;
            }
        }
        Ok(())
    }

    fn write_records(
        &self,
        mut writer: BufWriter<File>,
        rx: Receiver<Bytes>,
    ) -> Result<(), ThreadSafeError> {
        while let Ok(chunk) = rx.recv() {
            if let Some(wal) = &self.wal {
                wal.write(&chunk)?;
            }
            writer.write_all(&chunk)?;
        }
        writer.flush()?;
        if let Some(wal) = &self.wal {
            wal.commit()?;
        }
        Ok(())
    }

    fn create_combined_instrument_csv(
        &self,
        instrument: &str,
        grouped_csv_files: &HashMap<usize, Vec<CsvFile>>,
    ) -> Result<(), ThreadSafeError> {
        eprintln!("Creating combined CSV for instrument: {}", instrument);
        let combined_output_path = self
            .output_base_path
            .join(instrument)
            .join(format!("{}.csv", instrument));
        fs::create_dir_all(combined_output_path.parent().unwrap())?;
        let combined_output_file = File::create(&combined_output_path)?;
        let combined_output_writer =
            BufWriter::with_capacity(self.buffer_capacity, combined_output_file);
        let mut all_files: Vec<CsvFile> = grouped_csv_files.values().flatten().cloned().collect();
        all_files.sort_by(|a, b| a.date.cmp(&b.date).then(a.seq_id.cmp(&b.seq_id)));
        eprintln!(
            "For Instrument {} Sorted all files: {:?}",
            instrument, all_files
        );
        let (tx, rx) = bounded(10); // Bounded channel with some buffer
        let files_arc = Arc::new(all_files);
        let files_clone = Arc::clone(&files_arc);
        let config_clone = self.clone();
        let handle = std::thread::spawn(move || {
            if let Err(e) = config_clone.process_sequence_group(&files_clone, tx) {
                eprintln!("Error processing combined CSV: {}", e);
            }
        });
        self.write_records(combined_output_writer, rx)?;
        handle.join().map_err(|_| "Thread panicked")?;
        Ok(())
    }

    pub fn merge_all_instruments(&self) -> Result<(), ThreadSafeError> {
        let instruments = self.find_instruments()?;
        eprintln!("Found {} instruments", instruments.len());
        instruments
            .par_iter()
            .try_for_each(|instrument| self.merge_instrument_csvs(instrument))?;
        Ok(())
    }
}
