From a96d5d7e0be3de4e6c1beb0142ac5c8d6d405e07 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 06:36:49 -0500 Subject: [PATCH 01/10] docs: document load_lookup_table --- src/lookup.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lookup.rs b/src/lookup.rs index 6747b1fd3..5a46ed6f9 100644 --- a/src/lookup.rs +++ b/src/lookup.rs @@ -51,6 +51,44 @@ pub fn set_qsv_cache_dir(cache_dir: &str) -> Result { Ok(qsv_cache_dir) } +/// Loads a lookup table from a local file, cache, or remote source. +/// +/// # Arguments +/// +/// * `opts` - Options for loading the lookup table, including: +/// - `name`: Name of the lookup table +/// - `uri`: URI/path to the lookup table file (http/https/ckan/dathere schemes supported) +/// - `cache_age_secs`: How long to keep cached files (negative to delete cache) +/// - `cache_dir`: Directory to store cached files +/// - `delimiter`: Optional CSV delimiter +/// - `ckan_api_url`: Optional CKAN API URL for CKAN resources +/// - `ckan_token`: Optional CKAN API token +/// - `timeout_secs`: Timeout in seconds for HTTP requests +/// +/// # Returns +/// +/// Returns a `LookupTableResult` containing: +/// - `filepath`: Path to the loaded lookup table file +/// - `headers`: CSV headers from the lookup table +/// +/// # Functionality +/// +/// 1. Checks if lookup table exists as local file +/// 2. If not local, checks cache: +/// - Uses cache if valid and not expired +/// - Deletes cache if cache_age_secs is negative +/// 3. For remote files: +/// - Handles dathere:// prefix for GitHub lookup tables +/// - Handles ckan:// prefix for CKAN resources +/// - Downloads HTTP(S) URLs to cache +/// 4. Reads and returns headers from the lookup table +/// +/// # Errors +/// +/// Returns error if: +/// - File operations fail (create/delete/read) +/// - Remote downloads fail +/// - CSV parsing fails pub fn load_lookup_table( opts: &LookupTableOptions, ) -> Result> { From 4166b9f6171d3f8546d1f102e3f874d827d75f48 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 06:37:37 -0500 Subject: [PATCH 02/10] feat: add lookup support to `template` command --- src/cmd/template.rs | 311 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 301 insertions(+), 10 deletions(-) diff --git a/src/cmd/template.rs b/src/cmd/template.rs index 73f5d3f3b..c46f35230 100644 --- a/src/cmd/template.rs +++ b/src/cmd/template.rs @@ -51,10 +51,11 @@ template arguments: The CSV file to read. If not given, input is read from STDIN. The directory where the output files will be written. If it does not exist, it will be created. + If not set, output will be sent to stdout or the specified --output. template options: - --template Template string to use (alternative to --template-file) - -t, --template-file Template file to use - --outfilename Template string to use to create the filename of the output + --template MiniJinja template string to use (alternative to --template-file) + -t, --template-file MiniJinja template file to use + --outfilename MiniJinja template string to use to create the filename of the output files to write to . If set to just QSV_ROWNO, the filestem is set to the current rowno of the record, padded with leading zeroes, with the ".txt" extension (e.g. 001.txt, 002.txt, etc.) @@ -69,6 +70,17 @@ template options: -b, --batch The number of rows per batch to load into memory, before running in parallel. Set to 0 to load all rows in one batch. [default: 50000] + --timeout Timeout for downloading lookups on URLs. [default: 30] + --cache-dir The directory to use for caching downloaded lookup resources. + If the directory does not exist, qsv will attempt to create it. + If the QSV_CACHE_DIR envvar is set, it will be used instead. + [default: ~/.qsv-cache] + --ckan-api The URL of the CKAN API to use for downloading lookup resources + with the "ckan://" scheme. + If the QSV_CKAN_API envvar is set, it will be used instead. + [default: https://data.dathere.com/api/3/action] + --ckan-token The CKAN API token to use. Only required if downloading private resources. + If the QSV_CKAN_TOKEN envvar is set, it will be used instead. Common options: -h, --help Display this message @@ -83,7 +95,10 @@ Common options: use std::{ fs, io::{BufWriter, Write}, - sync::OnceLock, + sync::{ + atomic::{AtomicU16, Ordering}, + OnceLock, RwLock, + }, }; use indicatif::{ProgressBar, ProgressDrawTarget}; @@ -98,6 +113,8 @@ use simd_json::BorrowedValue; use crate::{ config::{Config, Delimiter, DEFAULT_WTR_BUFFER_CAPACITY}, + lookup, + lookup::LookupTableOptions, util, CliError, CliResult, }; @@ -117,6 +134,10 @@ struct Args { flag_delimiter: Option, flag_no_headers: bool, flag_progressbar: bool, + flag_timeout: u16, + flag_cache_dir: String, + flag_ckan_api: String, + flag_ckan_token: Option, } static FILTER_ERROR: OnceLock = OnceLock::new(); @@ -127,6 +148,41 @@ impl From for CliError { } } +use std::hash::{Hash, Hasher}; + +use ahash::{HashMap, HashMapExt, HashSet}; + +#[derive(Clone, Eq)] +struct LookupEntry { + fields: Vec<(String, String)>, +} + +impl PartialEq for LookupEntry { + fn eq(&self, other: &Self) -> bool { + self.fields == other.fields + } +} + +impl Hash for LookupEntry { + fn hash(&self, state: &mut H) { + self.fields.hash(state); + } +} + +// type alias for LookupStruct first +type LookupStruct = HashSet; + +type LookupMap = HashMap; + +// Change the static storage to use RwLock +static LOOKUP_MAP: OnceLock> = OnceLock::new(); + +static QSV_CACHE_DIR: OnceLock = OnceLock::new(); +static TIMEOUT_SECS: AtomicU16 = AtomicU16::new(30); +static CKAN_API: OnceLock = OnceLock::new(); +static CKAN_TOKEN: OnceLock> = OnceLock::new(); +static DELIMITER: OnceLock> = OnceLock::new(); + pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; @@ -153,6 +209,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { return fail!("Cannot initialize custom filter error message."); } + TIMEOUT_SECS.store( + util::timeout_secs(args.flag_timeout)? as u16, + Ordering::Relaxed, + ); + // Set up minijinja environment let mut env = Environment::new(); @@ -161,14 +222,17 @@ pub fn run(argv: &[&str]) -> CliResult<()> { minijinja_contrib::add_to_environment(&mut env); env.set_unknown_method_callback(unknown_method_callback); - // Add our own custom filters + // add custom function + env.add_function("register_lookup", register_lookup); + + // add our own custom filters env.add_filter("substr", substr); env.add_filter("format_float", format_float); env.add_filter("human_count", human_count); env.add_filter("human_float_count", human_float_count); env.add_filter("round_banker", round_banker); env.add_filter("str_to_bool", str_to_bool); - // TODO: Add lookup filter + env.add_filter("lookup", lookup_filter); // Set up template env.add_template("template", &template_content)?; @@ -230,6 +294,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } else { // we use a bigger BufWriter buffer here than the default 8k as ALL the output // is going to one destination and we want to minimize I/O syscalls + // we optimize the size of the BufWriter buffer here + // so that it's only one I/O syscall per row Some(match args.flag_output { Some(file) => Box::new(BufWriter::with_capacity( DEFAULT_WTR_BUFFER_CAPACITY, @@ -256,6 +322,31 @@ pub fn run(argv: &[&str]) -> CliResult<()> { progress.set_draw_target(ProgressDrawTarget::hidden()); } + DELIMITER.set(args.flag_delimiter).unwrap(); + + #[cfg(not(feature = "lite"))] + let qsv_cache_dir = lookup::set_qsv_cache_dir(&args.flag_cache_dir)?; + #[cfg(not(feature = "lite"))] + QSV_CACHE_DIR.set(qsv_cache_dir)?; + + // check the QSV_CKAN_API environment variable + #[cfg(not(feature = "lite"))] + CKAN_API.set(if let Ok(api) = std::env::var("QSV_CKAN_API") { + api + } else { + args.flag_ckan_api.clone() + })?; + + // check the QSV_CKAN_TOKEN environment variable + #[cfg(not(feature = "lite"))] + CKAN_TOKEN + .set(if let Ok(token) = std::env::var("QSV_CKAN_TOKEN") { + Some(token) + } else { + args.flag_ckan_token.clone() + }) + .unwrap(); + // reuse batch buffers #[allow(unused_assignments)] let mut batch_record = csv::StringRecord::new(); @@ -339,10 +430,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } } - // Render template with record data using context - let rendered = template - .render(&context) - .unwrap_or_else(|_| "RENDERING ERROR".to_owned()); + let rendered = match template.render(&context) { + Ok(s) => s, + Err(e) => format!("RENDERING ERROR ({row_number}): {e}\n"), + }; if output_to_dir { let outfilename = if use_rowno_filename { @@ -471,3 +562,203 @@ fn str_to_bool(value: &str) -> bool { "true" | "1" | "yes" | "t" | "y" ) } + +/// Registers a lookup table for use with the lookup filter. +/// +/// This function loads a CSV file as a lookup table and registers it in memory for use with +/// the lookup filter in templates. The lookup table is stored as a set of LookupEntry objects +/// containing key-value pairs from the CSV. +/// +/// # Arguments +/// +/// * `lookup_name` - Name to register the lookup table under +/// * `lookup_table_uri` - Path/URI to the CSV file (supports local files, HTTP(S), CKAN resources) +/// * `cache_age_secs` - Optional cache duration in seconds for remote files. Defaults to 3600 (1 +/// hour) +/// +/// # Returns +/// +/// Returns `Ok(true)` if successful, or a `minijinja::Error` with details if registration fails. +/// +/// # Example +/// +/// ```text +/// {% set result = register_lookup('products', 'lookup.csv') %} +/// {% if result %} +/// {{ product_id|lookup('products', 'id', 'name') }} +/// {% else %} +/// Error: {{ result.err }} +/// {% endif %} +/// ``` +fn register_lookup( + lookup_name: &str, + lookup_table_uri: &str, + cache_age_secs: Option, +) -> Result { + // Validate inputs + if lookup_name.is_empty() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "lookup name cannot be empty", + )); + } + + if lookup_table_uri.is_empty() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "lookup table URI cannot be empty", + )); + } + + // Check if lookup_name already exists in LOOKUP_MAP + if let Some(lock) = LOOKUP_MAP.get() { + if let Ok(map) = lock.read() { + if map.contains_key(lookup_name) { + // Lookup table already registered + return Ok(true); + } + } + } + + let lookup_opts = LookupTableOptions { + name: lookup_name.to_string(), + uri: lookup_table_uri.to_string(), + cache_dir: QSV_CACHE_DIR + .get() + .ok_or_else(|| { + minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "cache directory not initialized", + ) + })? + .to_string(), + cache_age_secs: cache_age_secs.unwrap_or(3600), + delimiter: DELIMITER.get().copied().flatten(), + ckan_api_url: CKAN_API.get().cloned(), + ckan_token: CKAN_TOKEN.get().and_then(std::clone::Clone::clone), + timeout_secs: TIMEOUT_SECS.load(Ordering::Relaxed), + }; + + let lookup_table = lookup::load_lookup_table(&lookup_opts).map_err(|e| { + minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + format!("failed to load lookup table: {}", e), + ) + })?; + + let mut rdr = csv::Reader::from_path(&lookup_table.filepath).map_err(|e| { + minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + format!("failed to read CSV file: {}", e), + ) + })?; + + // Convert CSV records to LookupEntry objects + let lookup_data: HashSet = rdr + .records() + .filter_map(|record| { + record.ok().map(|record| { + let fields: Vec<(String, String)> = lookup_table + .headers + .iter() + .zip(record.iter()) + .map(|(header, value)| (header.to_string(), value.to_string())) + .collect(); + LookupEntry { fields } + }) + }) + .collect(); + + // Initialize the lookup map if it doesn't exist + if LOOKUP_MAP.get().is_none() && LOOKUP_MAP.set(RwLock::new(HashMap::new())).is_err() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "failed to initialize lookup map", + )); + } + + // Safely get write access to the map + match LOOKUP_MAP.get().unwrap().write() { + Ok(mut map) => { + map.insert(lookup_name.to_string(), lookup_data); + Ok(true) + }, + Err(_) => Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "failed to acquire write lock on lookup map", + )), + } +} + +/// A filter function for looking up values in a registered lookup table. +/// +/// This function is used as a template filter to look up values from a previously registered +/// lookup table. It searches for a record in the lookup table where the `lookup_key` column +/// matches the input `value`, and returns the corresponding value from the `field` column. +/// +/// # Arguments +/// +/// * `value` - The value to look up in the lookup table +/// * `lookup_name` - The name of the registered lookup table to search in +/// * `lookup_key` - The column name in the lookup table to match against the input value +/// * `field` - The column name in the lookup table whose value should be returned +/// +/// # Returns +/// +/// Returns a `Result` containing either: +/// - `Ok(String)` - The looked up value if found, or the configured error string if not found +/// - `Err(minijinja::Error)` - If any of the required parameters are empty strings +/// +/// # Example +/// +/// ```text +/// # Given a lookup table "products" with columns "id", "name": +/// {{ product_id|lookup('products', 'id', 'name') }} +/// ``` +fn lookup_filter( + value: &str, + lookup_name: &str, + lookup_key: &str, + field: &str, +) -> Result { + fn lookup_find(lookup_name: &str, key: &str, value: &str) -> Option> { + LOOKUP_MAP.get().and_then(|lock| { + lock.read().ok().and_then(|map| { + map.get(lookup_name).and_then(|set| { + set.iter() + .find(|entry| entry.fields.iter().any(|(k, v)| k == key && v == value)) + .map(|entry| entry.fields.clone()) + }) + }) + }) + } + + if lookup_name.is_empty() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "lookup name not provided", + )); + } + + if lookup_key.is_empty() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "lookup key not provided", + )); + } + + if field.is_empty() { + return Err(minijinja::Error::new( + minijinja::ErrorKind::InvalidOperation, + "lookup field not provided", + )); + } + + Ok(match lookup_find(lookup_name, lookup_key, value) { + Some(fields) => fields + .iter() + .find(|(k, _)| k == field) + .map_or_else(|| FILTER_ERROR.get().unwrap().clone(), |(_, v)| v.clone()), + None => FILTER_ERROR.get().unwrap().clone(), + }) +} From ee9a4a72805b513ee3e9e10c2ca4093b2ad8c546 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 06:37:51 -0500 Subject: [PATCH 03/10] tests: add `template` lookup tests --- tests/test_template.rs | 217 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) diff --git a/tests/test_template.rs b/tests/test_template.rs index 9e28c3c06..6ff9201c9 100644 --- a/tests/test_template.rs +++ b/tests/test_template.rs @@ -658,3 +658,220 @@ fn template_pycompat_filters() { ); assert_eq!(got, expected); } + +#[test] +fn template_custom_filters_error_handling() { + let wrk = Workdir::new("template_custom_filters_error"); + wrk.create( + "data.csv", + vec![ + svec!["value", "number"], + svec!["abc", "123.456"], + svec!["def", "not_a_number"], + ], + ); + + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "format_float: {{number|format_float(2)}}\n", + "human_count: {{number|human_count}}\n", + "human_float_count: {{number|human_float_count}}\n", + "round_banker: {{number|round_banker(3)}}\n", + "\n" + )) + .arg("--customfilter-error") + .arg("ERROR") + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + let expected = concat!( + "format_float: 123.46\n", + "human_count: ERROR\n", + "human_float_count: 123.456\n", + "round_banker: 123.456\n", + "format_float: ERROR\n", + "human_count: ERROR\n", + "human_float_count: ERROR\n", + "round_banker: ERROR" + ); + assert_eq!(got, expected); +} + +#[test] +fn template_str_to_bool_filter() { + let wrk = Workdir::new("template_str_to_bool"); + wrk.create( + "data.csv", + vec![ + svec!["value"], + svec!["true"], + svec!["1"], + svec!["yes"], + svec!["t"], + svec!["y"], + svec!["false"], + svec!["0"], + svec!["no"], + ], + ); + + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg("{{value|str_to_bool}}\n\n") + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + let expected = + concat!("true\n", "true\n", "true\n", "true\n", "true\n", "false\n", "false\n", "false"); + assert_eq!(got, expected); +} + +#[test] +fn template_substr_filter() { + let wrk = Workdir::new("template_substr"); + wrk.create( + "data.csv", + vec![svec!["text"], svec!["Hello World"], svec!["Testing 123"]], + ); + + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "start_only: {{text|substr(0)}}\n", + "start_end: {{text|substr(0,5)}}\n", + "middle: {{text|substr(6,11)}}\n", + "invalid: {{text|substr(100)}}\n", + "\n" + )) + .arg("--customfilter-error") + .arg("ERROR") + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + let expected = concat!( + "start_only: Hello World\n", + "start_end: Hello\n", + "middle: World\n", + "invalid: ERROR\n", + "start_only: Testing 123\n", + "start_end: Testi\n", + "middle: g 123\n", + "invalid: ERROR" + ); + assert_eq!(got, expected); +} + +#[test] +fn template_lookup_filter_simple() { + let wrk = Workdir::new("template_lookup"); + + // Create a lookup table CSV + wrk.create( + "lookup.csv", + vec![ + svec!["id", "name", "description"], + svec!["1", "apple", "A red fruit"], + svec!["2", "banana", "A yellow fruit"], + svec!["3", "orange", "A citrus fruit"], + ], + ); + + // Create main data CSV + wrk.create( + "data.csv", + vec![ + svec!["product_id", "quantity"], + svec!["1", "5"], + svec!["2", "3"], + svec!["4", "1"], // Invalid ID to test error handling + ], + ); + + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "{% set result = register_lookup('products', 'lookup.csv') %}", + "{% if result %}", + "{{product_id}}: {{product_id|lookup('products', 'id', 'name')}} - \ + {{product_id|lookup('products', 'id', 'description')}}\n", + "{% else %}", + "Error: Failed to register lookup table 'products' {{ result.err }} \n", + "{% endif %}" + )) + .arg("--customfilter-error") + .arg("") + .arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got: String = wrk.stdout(&mut cmd); + let expected = concat!( + "1: apple - A red fruit\n", + "2: banana - A yellow fruit\n", + "4: - " + ); + assert_eq!(got, expected); +} + +#[test] +fn template_lookup_filter_errors() { + let wrk = Workdir::new("template_lookup_errors"); + + wrk.create("data.csv", vec![svec!["id"], svec!["1"]]); + + // Test missing lookup key + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg("{{id|lookup('', 'name')}}") + .arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got: String = wrk.stdout(&mut cmd); + assert!(got.contains("RENDERING ERROR")); + + // Test missing field name + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg("{{id|lookup('id', '')}}") + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + assert!(got.contains("RENDERING ERROR")); + + // Test unregistered lookup table + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg("{{id|lookup('id', 'name')}}") + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + assert!(got.contains("RENDERING ERROR")); +} + +#[test] +fn template_lookup_register_errors() { + let wrk = Workdir::new("template_lookup_register"); + + wrk.create("data.csv", vec![svec!["id"], svec!["1"]]); + + // Test non-existent lookup file + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "{% if register_lookup('test', 'nonexistent.csv') %}\n", + "success\n", + "{% else %}\n", + "failed\n", + "{% endif %}" + )) + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + assert_eq!( + got, + "RENDERING ERROR (1): invalid operation: failed to load lookup table: failed to open \ + nonexistent.csv: No such file or directory (os error 2) (in template:1)" + ); +} From 7f226e0493413c53aca6666875033e7137402583 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 07:53:09 -0500 Subject: [PATCH 04/10] tests: make template test platform-independent --- tests/test_template.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_template.rs b/tests/test_template.rs index 6ff9201c9..8c531998f 100644 --- a/tests/test_template.rs +++ b/tests/test_template.rs @@ -869,9 +869,8 @@ fn template_lookup_register_errors() { .arg("data.csv"); let got: String = wrk.stdout(&mut cmd); - assert_eq!( - got, + assert!(got.starts_with( "RENDERING ERROR (1): invalid operation: failed to load lookup table: failed to open \ - nonexistent.csv: No such file or directory (os error 2) (in template:1)" - ); + nonexistent.csv:" + )); } From 91a7f1949d4e4f009ff0b17bc368602a43136545 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:17:48 -0500 Subject: [PATCH 05/10] refactor: add case-sensitive arg to lookup filter; more verbose lookup error messages --- src/cmd/template.rs | 57 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/cmd/template.rs b/src/cmd/template.rs index c46f35230..6e0d80286 100644 --- a/src/cmd/template.rs +++ b/src/cmd/template.rs @@ -702,6 +702,7 @@ fn register_lookup( /// * `lookup_name` - The name of the registered lookup table to search in /// * `lookup_key` - The column name in the lookup table to match against the input value /// * `field` - The column name in the lookup table whose value should be returned +/// * `case_sensitive` - Optional boolean to control case-sensitive matching (defaults to true) /// /// # Returns /// @@ -712,21 +713,41 @@ fn register_lookup( /// # Example /// /// ```text -/// # Given a lookup table "products" with columns "id", "name": +/// # Case-sensitive lookup (default) /// {{ product_id|lookup('products', 'id', 'name') }} +/// # Case-insensitive lookup (supports Unicode) +/// {{ product_id|lookup('products', 'id', 'name', false) }} /// ``` fn lookup_filter( value: &str, lookup_name: &str, lookup_key: &str, field: &str, + case_sensitive: Option, ) -> Result { - fn lookup_find(lookup_name: &str, key: &str, value: &str) -> Option> { + fn lookup_find( + lookup_name: &str, + key: &str, + value: &str, + case_sensitive: bool, + ) -> Option> { + let value_lowercase = value.to_lowercase(); + let mut v_lowercase = String::new(); LOOKUP_MAP.get().and_then(|lock| { lock.read().ok().and_then(|map| { map.get(lookup_name).and_then(|set| { set.iter() - .find(|entry| entry.fields.iter().any(|(k, v)| k == key && v == value)) + .find(|entry| { + entry.fields.iter().any(|(k, v)| { + k == key + && if case_sensitive { + v == value + } else { + util::to_lowercase_into(v, &mut v_lowercase); + v_lowercase == value_lowercase + } + }) + }) .map(|entry| entry.fields.clone()) }) }) @@ -754,11 +775,27 @@ fn lookup_filter( )); } - Ok(match lookup_find(lookup_name, lookup_key, value) { - Some(fields) => fields - .iter() - .find(|(k, _)| k == field) - .map_or_else(|| FILTER_ERROR.get().unwrap().clone(), |(_, v)| v.clone()), - None => FILTER_ERROR.get().unwrap().clone(), - }) + Ok( + match lookup_find( + lookup_name, + lookup_key, + value, + case_sensitive.unwrap_or(true), + ) { + Some(fields) => fields.iter().find(|(k, _)| k == field).map_or_else( + || { + format!( + "{}: lookup: {lookup_name} key: {lookup_key} not found for: {value}", + FILTER_ERROR.get().unwrap() + ) + }, + |(_, v)| v.clone(), + ), + None => format!( + "{}: lookup: {lookup_name} key: {lookup_key} not found for: {value}", + FILTER_ERROR.get().unwrap() + ), + // None => FILTER_ERROR.get().unwrap().clone(), + }, + ) } From 73466b2dce087837107c6ad6fcc076bd71ff091c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:18:11 -0500 Subject: [PATCH 06/10] test: lookup case sensitivity tests --- tests/test_template.rs | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_template.rs b/tests/test_template.rs index 8c531998f..f6e4d989d 100644 --- a/tests/test_template.rs +++ b/tests/test_template.rs @@ -874,3 +874,50 @@ fn template_lookup_register_errors() { nonexistent.csv:" )); } +#[test] +fn template_lookup_case_sensitivity() { + let wrk = Workdir::new("template_lookup_case"); + + // Create lookup table with mixed case values + wrk.create( + "lookup.csv", + vec![ + svec!["code", "value"], + svec!["ABC", "first"], + svec!["def", "second"], + svec!["GHI", "third"], + ], + ); + + // Create input data with different cases + wrk.create( + "data.csv", + vec![svec!["code"], svec!["abc"], svec!["DEF"], svec!["ghi"]], + ); + + // Test case-sensitive lookup (default) + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "{% if register_lookup('codes', 'lookup.csv') %}\n", + "{{code|lookup('codes', 'code', 'value')}}\n", + "{% endif %}" + )) + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + assert_eq!(got, "NOT_FOUND\nNOT_FOUND\nNOT_FOUND\n"); + + // Test case-insensitive lookup + let mut cmd = wrk.command("template"); + cmd.arg("--template") + .arg(concat!( + "{% if register_lookup('codes', 'lookup.csv') %}", + "{{code|lookup('codes', 'code', 'value', false)}}\n", + "{% endif %}" + )) + .arg("data.csv"); + + let got: String = wrk.stdout(&mut cmd); + assert_eq!(got, "first\nsecond\nthird"); +} From 159c0519ec126b155512ee97f7feadb892980cd0 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:24:56 -0500 Subject: [PATCH 07/10] refactor: add rowcount to LookupTableResult struct so we can use it to prealloc Hashmap in template, for better memory allocation and cache locality --- src/lookup.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lookup.rs b/src/lookup.rs index 5a46ed6f9..dc939ad21 100644 --- a/src/lookup.rs +++ b/src/lookup.rs @@ -10,7 +10,7 @@ use reqwest::blocking::Client; use serde_json::Value; use simple_expand_tilde::expand_tilde; -use crate::CliError; +use crate::{util, CliError}; pub struct LookupTableOptions { pub name: String, @@ -26,6 +26,7 @@ pub struct LookupTableOptions { pub struct LookupTableResult { pub filepath: String, pub headers: csv::StringRecord, + pub rowcount: usize, } pub fn set_qsv_cache_dir(cache_dir: &str) -> Result { @@ -188,10 +189,12 @@ pub fn load_lookup_table( let mut rdr = conf.reader()?; let headers = rdr.headers()?.clone(); + let rowcount = util::count_rows(&conf).unwrap_or_default() as usize; let lur = LookupTableResult { filepath: lookup_table_uri, headers, + rowcount, }; Ok(lur) From 81753cd05ece647a395b3682dece8f8c85f27216 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:30:35 -0500 Subject: [PATCH 08/10] refactor: changed LookupMap data structure from HashSet to a nested Hashmap which is both more memory efficient and faster (O(1) vs O(n)) --- src/cmd/template.rs | 149 +++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 93 deletions(-) diff --git a/src/cmd/template.rs b/src/cmd/template.rs index 6e0d80286..797d8d8b7 100644 --- a/src/cmd/template.rs +++ b/src/cmd/template.rs @@ -148,33 +148,13 @@ impl From for CliError { } } -use std::hash::{Hash, Hasher}; +use ahash::{HashMap, HashMapExt}; -use ahash::{HashMap, HashMapExt, HashSet}; +// An efficient structure for lookups: +// First HashMap: Maps lookup table names to their indices +// Second HashMap: Maps key column values to a HashMap of field name -> field value +type LookupMap = HashMap>>; -#[derive(Clone, Eq)] -struct LookupEntry { - fields: Vec<(String, String)>, -} - -impl PartialEq for LookupEntry { - fn eq(&self, other: &Self) -> bool { - self.fields == other.fields - } -} - -impl Hash for LookupEntry { - fn hash(&self, state: &mut H) { - self.fields.hash(state); - } -} - -// type alias for LookupStruct first -type LookupStruct = HashSet; - -type LookupMap = HashMap; - -// Change the static storage to use RwLock static LOOKUP_MAP: OnceLock> = OnceLock::new(); static QSV_CACHE_DIR: OnceLock = OnceLock::new(); @@ -642,32 +622,35 @@ fn register_lookup( let lookup_table = lookup::load_lookup_table(&lookup_opts).map_err(|e| { minijinja::Error::new( minijinja::ErrorKind::InvalidOperation, - format!("failed to load lookup table: {}", e), + format!("failed to load lookup table: {e}"), ) })?; let mut rdr = csv::Reader::from_path(&lookup_table.filepath).map_err(|e| { minijinja::Error::new( minijinja::ErrorKind::InvalidOperation, - format!("failed to read CSV file: {}", e), + format!("failed to read CSV file: {e}"), ) })?; - // Convert CSV records to LookupEntry objects - let lookup_data: HashSet = rdr - .records() - .filter_map(|record| { - record.ok().map(|record| { - let fields: Vec<(String, String)> = lookup_table - .headers - .iter() - .zip(record.iter()) - .map(|(header, value)| (header.to_string(), value.to_string())) - .collect(); - LookupEntry { fields } - }) - }) - .collect(); + // Create nested HashMaps for efficient lookups + let mut lookup_data: HashMap> = + HashMap::with_capacity(lookup_table.rowcount); + + let row_len = lookup_table.headers.len(); + for record in rdr.records().flatten() { + let mut row_data: HashMap = HashMap::with_capacity(row_len); + + // Store all fields for this row + for (header, value) in lookup_table.headers.iter().zip(record.iter()) { + row_data.insert(header.to_string(), value.to_string()); + } + + // Use the first column as the key by default + if let Some(key_value) = record.get(0) { + lookup_data.insert(key_value.to_string(), row_data); + } + } // Initialize the lookup map if it doesn't exist if LOOKUP_MAP.get().is_none() && LOOKUP_MAP.set(RwLock::new(HashMap::new())).is_err() { @@ -725,35 +708,6 @@ fn lookup_filter( field: &str, case_sensitive: Option, ) -> Result { - fn lookup_find( - lookup_name: &str, - key: &str, - value: &str, - case_sensitive: bool, - ) -> Option> { - let value_lowercase = value.to_lowercase(); - let mut v_lowercase = String::new(); - LOOKUP_MAP.get().and_then(|lock| { - lock.read().ok().and_then(|map| { - map.get(lookup_name).and_then(|set| { - set.iter() - .find(|entry| { - entry.fields.iter().any(|(k, v)| { - k == key - && if case_sensitive { - v == value - } else { - util::to_lowercase_into(v, &mut v_lowercase); - v_lowercase == value_lowercase - } - }) - }) - .map(|entry| entry.fields.clone()) - }) - }) - }) - } - if lookup_name.is_empty() { return Err(minijinja::Error::new( minijinja::ErrorKind::InvalidOperation, @@ -775,27 +729,36 @@ fn lookup_filter( )); } - Ok( - match lookup_find( - lookup_name, - lookup_key, - value, - case_sensitive.unwrap_or(true), - ) { - Some(fields) => fields.iter().find(|(k, _)| k == field).map_or_else( - || { - format!( - "{}: lookup: {lookup_name} key: {lookup_key} not found for: {value}", - FILTER_ERROR.get().unwrap() - ) - }, - |(_, v)| v.clone(), - ), - None => format!( - "{}: lookup: {lookup_name} key: {lookup_key} not found for: {value}", + let case_sensitive = case_sensitive.unwrap_or(true); + let value_lower = if case_sensitive { + value.to_string() + } else { + value.to_lowercase() + }; + + let mut lowercase_buffer = String::new(); + Ok(LOOKUP_MAP + .get() + .and_then(|lock| lock.read().ok()) + .and_then(|map| map.get(lookup_name).cloned()) + .and_then(|table| { + // Find the matching row + if case_sensitive { + table.get(value).and_then(|row| row.get(field).cloned()) + } else { + table + .iter() + .find(|(k, _)| { + util::to_lowercase_into(k, &mut lowercase_buffer); + lowercase_buffer == value_lower + }) + .and_then(|(_, row)| row.get(field).cloned()) + } + }) + .unwrap_or_else(|| { + format!( + r#"{}: lookup: "{lookup_name}" key: "{lookup_key}" not found for: "{value}""#, FILTER_ERROR.get().unwrap() - ), - // None => FILTER_ERROR.get().unwrap().clone(), - }, - ) + ) + })) } From b7ecd336ff8933ace0d31844fb572461a6b22239 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:30:58 -0500 Subject: [PATCH 09/10] tests: updated `template` lookup tests --- tests/test_template.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_template.rs b/tests/test_template.rs index f6e4d989d..c59ca1b38 100644 --- a/tests/test_template.rs +++ b/tests/test_template.rs @@ -809,7 +809,8 @@ fn template_lookup_filter_simple() { let expected = concat!( "1: apple - A red fruit\n", "2: banana - A yellow fruit\n", - "4: - " + "4: : lookup: \"products\" key: \"id\" not found for: \"4\" - : \ + lookup: \"products\" key: \"id\" not found for: \"4\"" ); assert_eq!(got, expected); } @@ -899,14 +900,19 @@ fn template_lookup_case_sensitivity() { let mut cmd = wrk.command("template"); cmd.arg("--template") .arg(concat!( - "{% if register_lookup('codes', 'lookup.csv') %}\n", + "{% if register_lookup('codes', 'lookup.csv') %}", "{{code|lookup('codes', 'code', 'value')}}\n", "{% endif %}" )) .arg("data.csv"); let got: String = wrk.stdout(&mut cmd); - assert_eq!(got, "NOT_FOUND\nNOT_FOUND\nNOT_FOUND\n"); + assert_eq!( + got, + r#": lookup: "codes" key: "code" not found for: "abc" +: lookup: "codes" key: "code" not found for: "DEF" +: lookup: "codes" key: "code" not found for: "ghi""# + ); // Test case-insensitive lookup let mut cmd = wrk.command("template"); From 4ac4b95e86c7a29ac10674ef30f663d760306d12 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:33:53 -0500 Subject: [PATCH 10/10] deps: update lock file --- Cargo.lock | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a80a018d6..f2c124c3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -902,9 +902,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -3498,9 +3498,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.164" +version = "0.2.166" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "c2ccc108bbc0b1331bd061864e7cd823c0cab660bbe6970e66e2c0614decde36" [[package]] name = "libflate" @@ -4376,9 +4376,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pathdiff" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c5ce1153ab5b689d0c074c4e7fc613e942dfb7dd9eea5ab202d2ad91fe361" +checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" [[package]] name = "pbkdf2" @@ -6756,9 +6756,9 @@ dependencies = [ [[package]] name = "sysinfo" -version = "0.32.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b5ae3f4f7d64646c46c4cae4e3f01d1c5d255c7406fdd7c7f999a94e488791" +checksum = "4c33cd241af0f2e9e3b5c32163b873b29956890b5342e6745b917ce9d490f4af" dependencies = [ "core-foundation-sys", "libc", @@ -7089,9 +7089,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", @@ -7100,9 +7100,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable",