Merge pull request #1553 from jqnatividad/jsonl-refactor
`jsonl`: major perf refactor
jqnatividad authored Jan 20, 2024
2 parents 472f121 + b8db7cc commit 145b2cf
Showing 2 changed files with 102 additions and 27 deletions.
README.md: 2 changes (1 addition & 1 deletion)
@@ -54,7 +54,7 @@
| [input](/src/cmd/input.rs#L2) | Read CSV data with special commenting, quoting, trimming, line-skipping & non-UTF8 encoding handling rules. Typically used to "normalize" a CSV for further processing with other qsv commands. |
| [join](/src/cmd/join.rs#L2) | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. |
| [joinp](/src/cmd/joinp.rs#L2)<br>✨🚀🐻‍❄️ | Inner, outer, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multi-threaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output doesn't have duplicate columns. However, `joinp` doesn't have an --ignore-case option & it doesn't support right outer joins. |
-| [jsonl](/src/cmd/jsonl.rs#L2)<br>🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL.
+| [jsonl](/src/cmd/jsonl.rs#L2)<br>🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL.
| <a name="luau_deeplink"></a><br>[luau](/src/cmd/luau.rs#L2) 👑<br>✨📇🌐🔣 ![CKAN](docs/images/ckan.png) | Create multiple new computed columns, filter rows, compute aggregations and build complex data pipelines by executing a [Luau](https://luau-lang.org) [0.606](https://github.com/Roblox/luau/releases/tag/0.606) expression/script for every row of a CSV file ([sequential mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L254-L298)), or using [random access](https://www.webopedia.com/definitions/random-access/) with an index ([random access mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L367-L415)).<br>Can process a single Luau expression or [full-fledged data-wrangling scripts using lookup tables](https://github.com/dathere/qsv-lookup-tables#example) with discrete BEGIN, MAIN and END sections.<br> It is not just another qsv command, it is qsv's [Domain-specific Language](https://en.wikipedia.org/wiki/Domain-specific_language) (DSL) with [numerous qsv-specific helper functions](https://github.com/jqnatividad/qsv/blob/113eee17b97882dc368b2e65fec52b86df09f78b/src/cmd/luau.rs#L1356-L2290) to build production data pipelines. |
| [partition](/src/cmd/partition.rs#L2) | Partition a CSV based on a column value. |
| [pseudo](/src/cmd/pseudo.rs#L2)<br>🔣 | [Pseudonymise](https://en.wikipedia.org/wiki/Pseudonymization) the value of the given column by replacing them with an incremental identifier. |
src/cmd/jsonl.rs: 127 changes (101 additions & 26 deletions)
@@ -6,7 +6,7 @@ straightforwardly convert JSON lines to CSV, the process might lose some complex
fields from the input.
Also, it will fail if the JSON documents are not consistent with one another,
-as the first JSON line will be use to infer the headers of the CSV output.
+as the first JSON line will be used to infer the headers of the CSV output.
For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_jsonl.rs.
@@ -16,6 +16,11 @@ Usage:
jsonl options:
--ignore-errors Skip malformed input lines.
+-j, --jobs <arg>  The number of jobs to run in parallel.
+                  When not set, the number of jobs is set to the
+                  number of CPUs detected.
+-b, --batch <size>  The number of rows per batch to load into memory,
+                    before running in parallel. [default: 50000]
Common options:
-h, --help Display this message
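As a usage illustration (not part of the commit; the flag values and file names are hypothetical), the two new options can be tuned together:

    qsv jsonl --jobs 4 --batch 10000 data.jsonl --output data.csv

Larger batches trade memory for fewer sequential read/write pauses between parallel runs; more jobs spread the JSON parsing across cores.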
@@ -29,11 +34,15 @@ use std::{
io::{self, BufRead, BufReader},
};

+use rayon::{
+iter::{IndexedParallelIterator, ParallelIterator},
+prelude::IntoParallelRefIterator,
+};
use serde::Deserialize;
use serde_json::Value;

use crate::{
-config::{Config, Delimiter},
+config::{Config, Delimiter, DEFAULT_RDR_BUFFER_CAPACITY},
util, CliResult,
};

@@ -43,10 +52,11 @@ struct Args {
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_ignore_errors: bool,
+flag_jobs: Option<usize>,
+flag_batch: u32,
}

-#[allow(clippy::needless_pass_by_value)]
-fn recurse_to_infer_headers(value: &Value, headers: &mut Vec<Vec<String>>, path: Vec<String>) {
+fn recurse_to_infer_headers(value: &Value, headers: &mut Vec<Vec<String>>, path: &[String]) {
match value {
Value::Object(map) => {
for (key, value) in map {
@@ -56,16 +66,16 @@ fn recurse_to_infer_headers(value: &Value, headers: &mut Vec<Vec<String>>, path:
| Value::Number(_)
| Value::String(_)
| Value::Array(_) => {
-let mut full_path = path.clone();
+let mut full_path = path.to_owned();
full_path.push(key.to_string());

headers.push(full_path);
},
Value::Object(_) => {
-let mut new_path = path.clone();
+let mut new_path = path.to_owned();
new_path.push(key.to_string());

-recurse_to_infer_headers(value, headers, new_path);
+recurse_to_infer_headers(value, headers, &new_path);
},
#[allow(unreachable_patterns)]
_ => {},
@@ -81,7 +91,7 @@ fn recurse_to_infer_headers(value: &Value, headers: &mut Vec<Vec<String>>, path:
fn infer_headers(value: &Value) -> Vec<Vec<String>> {
let mut headers: Vec<Vec<String>> = Vec::new();

-recurse_to_infer_headers(value, &mut headers, Vec::new());
+recurse_to_infer_headers(value, &mut headers, &Vec::new());

headers
}
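To make the inference concrete, here is a minimal, self-contained sketch of the same flattening idea (editor's illustration, not the commit's code): nested object keys become dot-joined CSV headers, and any scalar or array terminates a path.

```rust
// Editor's sketch mirroring recurse_to_infer_headers/infer_headers above;
// not the commit's code. Nested object keys become dot-joined CSV headers.
use serde_json::{json, Value};

fn flatten_paths(value: &Value, path: &[String], out: &mut Vec<Vec<String>>) {
    if let Value::Object(map) = value {
        for (key, val) in map {
            let mut next = path.to_owned();
            next.push(key.clone());
            match val {
                // recurse into nested objects
                Value::Object(_) => flatten_paths(val, &next, out),
                // any leaf (null, bool, number, string, array) ends a path
                _ => out.push(next),
            }
        }
    }
}

fn main() {
    // the first JSONL line drives header inference
    let first_line = json!({"name": "qsv", "repo": {"owner": "jqnatividad", "stars": 2000}});
    let mut headers = Vec::new();
    flatten_paths(&first_line, &[], &mut headers);
    let formatted: Vec<String> = headers.iter().map(|p| p.join(".")).collect();
    assert_eq!(formatted, ["name", "repo.owner", "repo.stars"]);
}
```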
@@ -103,6 +113,7 @@ fn get_value_at_path(value: &Value, path: &[String]) -> Option<Value> {
Some(current.clone())
}

+#[inline]
fn json_line_to_csv_record(value: &Value, headers: &[Vec<String>]) -> csv::StringRecord {
let mut record = csv::StringRecord::new();

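The now-inlined `json_line_to_csv_record` builds each CSV row by walking the inferred header paths and pulling the matching nested value, which is the job of `get_value_at_path` above. A standalone sketch of that lookup (editor's illustration; the commit's version clones and returns `Option<Value>` rather than a borrow):

```rust
// Editor's sketch of path lookup, mirroring get_value_at_path above;
// not the commit's code.
use serde_json::{json, Value};

fn value_at<'a>(mut current: &'a Value, path: &[String]) -> Option<&'a Value> {
    for key in path {
        // descend one object level per path segment; None if the key is absent
        current = current.get(key)?;
    }
    Some(current)
}

fn main() {
    let v = json!({"repo": {"owner": "jqnatividad"}});
    let path = vec!["repo".to_string(), "owner".to_string()];
    assert_eq!(value_at(&v, &path), Some(&json!("jqnatividad")));
}
```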
@@ -141,31 +152,67 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
.delimiter(args.flag_delimiter)
.writer()?;

-let rdr: Box<dyn BufRead> = match args.arg_input {
+let mut rdr: Box<dyn BufRead> = match args.arg_input {
None => Box::new(BufReader::new(io::stdin())),
-Some(p) => Box::new(BufReader::new(fs::File::open(p)?)),
+Some(p) => Box::new(BufReader::with_capacity(
+DEFAULT_RDR_BUFFER_CAPACITY,
+fs::File::open(p)?,
+)),
};

let mut headers: Vec<Vec<String>> = Vec::new();
let mut headers_emitted: bool = false;

-for (rowidx, line) in rdr.lines().enumerate() {
-let value: Value = match serde_json::from_str(&line?) {
-Ok(v) => v,
-Err(e) => {
-if args.flag_ignore_errors {
-continue;
-}
-let human_idx = rowidx + 1; // not zero based, for readability
-return fail_clierror!(
-r#"Could not parse line {human_idx} as JSON!: {e}
+// amortize memory allocation by reusing record
+let mut batch_line = String::new();
+
+// reuse batch buffers
+let batchsize: usize = args.flag_batch as usize;
+let mut batch = Vec::with_capacity(batchsize);
+let mut batch_results = Vec::with_capacity(batchsize);
+
+// set RAYON_NUM_THREADS
+util::njobs(args.flag_jobs);
+
+let mut result_idx = 0_u64;
+
+'batch_loop: loop {
+for _ in 0..batchsize {
+batch_line.clear();
+match rdr.read_line(&mut batch_line) {
+Ok(0) => {
+// EOF
+break;
+},
+Ok(_) => {
+batch.push(batch_line.clone());
+},
+Err(e) => {
+if args.flag_ignore_errors {
+continue;
+}
+return fail_clierror!(
+r#"Could not read input line!: {e}
Use `--ignore-errors` option to skip malformed input lines.
Use `tojsonl` command to convert _to_ jsonl instead of _from_ jsonl."#,
-);
-},
-};
+);
+},
+}
+}

+if batch.is_empty() {
+break 'batch_loop; // EOF
+}

if !headers_emitted {
+let value: Value = match serde_json::from_str(&batch[0]) {
+Ok(v) => v,
+Err(e) => {
+return fail_clierror!(
+"Could not parse first input line as JSON to infer headers: {e}",
+);
+},
+};
headers = infer_headers(&value);

let headers_formatted = headers.iter().map(|v| v.join(".")).collect::<Vec<String>>();
@@ -175,9 +222,37 @@ Use `tojsonl` command to convert _to_ jsonl instead of _from_ jsonl."#,
headers_emitted = true;
}

-let record = json_line_to_csv_record(&value, &headers);
-wtr.write_record(&record)?;
-}
+// do actual work via rayon
+batch
+.par_iter()
+.map(|json_line| match serde_json::from_str(json_line) {
+Ok(v) => Some(json_line_to_csv_record(&v, &headers)),
+Err(e) => {
+if !args.flag_ignore_errors {
+log::error!("serde_json::from_str error: {:#?}", e);
+}
+None
+},
+})
+.collect_into_vec(&mut batch_results);
+
+// rayon collect() guarantees original order, so we can just append results of each batch
+for result_record in &batch_results {
+result_idx += 1;
+if let Some(record) = result_record {
+wtr.write_record(record)?;
+} else if !args.flag_ignore_errors {
+// there was an error parsing a json line
+return fail_clierror!(
+r#"Could not parse input line {result_idx} as JSON
+Use `--ignore-errors` option to skip malformed input lines.
+Use `tojsonl` command to convert _to_ jsonl instead of _from_ jsonl."#,
+);
+}
+}
+
+batch.clear();
+} // end batch loop

Ok(wtr.flush()?)
}
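Taken together, `run()` now follows a read-batch, parse-in-parallel, write-in-order loop. Below is a condensed, self-contained sketch of that pattern (editor's illustration, not the commit's code: a toy integer parser stands in for `serde_json::from_str` plus `json_line_to_csv_record`, and `println!` stands in for the CSV writer; `par_iter`/`collect_into_vec` are the same rayon calls the commit uses):

```rust
// Editor's sketch of the batch-parallel pattern used above; not the commit's code.
use rayon::prelude::*;
use std::io::{BufRead, BufReader};

fn main() -> std::io::Result<()> {
    // stand-in for file/stdin input; "x" simulates a malformed line
    let input = "1\n2\nx\n3\n";
    let mut rdr = BufReader::new(input.as_bytes());

    const BATCH_SIZE: usize = 2; // the commit defaults to 50000
    let mut line = String::new();
    let mut batch: Vec<String> = Vec::with_capacity(BATCH_SIZE);
    let mut results: Vec<Option<i64>> = Vec::with_capacity(BATCH_SIZE);

    loop {
        // 1) fill a batch sequentially, reusing the line buffer
        for _ in 0..BATCH_SIZE {
            line.clear();
            if rdr.read_line(&mut line)? == 0 {
                break; // EOF
            }
            batch.push(line.trim_end().to_owned());
        }
        if batch.is_empty() {
            break; // nothing left to process
        }

        // 2) parse the batch in parallel; collect_into_vec on an indexed
        //    parallel iterator preserves input order
        batch
            .par_iter()
            .map(|s| s.parse::<i64>().ok())
            .collect_into_vec(&mut results);

        // 3) drain results sequentially, in order, to the writer
        for r in &results {
            match r {
                Some(n) => println!("{n}"),
                None => eprintln!("skipping malformed line"),
            }
        }
        batch.clear();
    }
    Ok(())
}
```

The sequential fill step keeps I/O single-threaded while rayon fans the CPU-bound parsing out across the thread pool, and the ordered collect lets the writer stay a plain loop.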
