diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index e54ddc5c4..bbe60a296 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -73,6 +73,7 @@ Validate arguments: The file can be a local file or a URL. Validate options: + --trim Trim leading and trailing whitespace from fields before validating. --fail-fast Stops on first error. --valid Valid record output file suffix. [default: valid] --invalid Invalid record output file suffix. [default: invalid] @@ -97,6 +98,7 @@ Common options: -d, --delimiter The field delimiter for reading CSV data. Must be a single character. [default: ,] -p, --progressbar Show progress bars. Not valid for stdin. + -Q, --quiet Do not display validation summary message. "#; use std::{ @@ -138,6 +140,7 @@ static TIMEOUT_SECS: AtomicU16 = AtomicU16::new(15); #[derive(Deserialize)] #[allow(dead_code)] struct Args { + flag_trim: bool, flag_fail_fast: bool, flag_valid: Option, flag_invalid: Option, @@ -148,6 +151,7 @@ struct Args { flag_no_headers: bool, flag_delimiter: Option, flag_progressbar: bool, + flag_quiet: bool, arg_input: Option, arg_json_schema: Option, flag_timeout: u16, @@ -384,7 +388,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { HumanCount(record_idx) ) }; - woutinfo!("{msg}"); + if !args.flag_quiet { + woutinfo!("{msg}"); + } // we're done when validating without a schema return Ok(()); @@ -462,6 +468,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut validation_results = Vec::with_capacity(batch_size); let mut valid_flags: Vec = Vec::with_capacity(batch_size); let mut validation_error_messages: Vec = Vec::with_capacity(50); + let flag_trim = args.flag_trim; // set RAYON_NUM_THREADS util::njobs(args.flag_jobs); @@ -477,10 +484,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { if has_data { row_number += 1; record.push_field(buffer.format(row_number).as_bytes()); - - // non-allocating trimming in place is much faster on the record level - // with our csv fork than doing per field std::str::trim which is allocating - record.trim(); + if flag_trim { + record.trim(); + } batch.push(record.clone()); } else { // nothing else to add to batch @@ -582,7 +588,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { ); } - winfo!("All {} records valid.", HumanCount(row_number)); + if !args.flag_quiet { + winfo!("All {} records valid.", HumanCount(row_number)); + } Ok(()) } diff --git a/tests/test_schema.rs b/tests/test_schema.rs index 2e00ad065..560c3c2fd 100644 --- a/tests/test_schema.rs +++ b/tests/test_schema.rs @@ -9,10 +9,10 @@ use crate::workdir::Workdir; #[test] #[file_serial] -fn generate_schema_with_defaults_and_validate_with_no_errors() { +fn generate_schema_with_defaults_and_validate_trim_with_no_errors() { // create workspace and invoke schema command with value constraints flag - let wrk = - Workdir::new("fn generate_schema_with_defaults_and_validate_with_no_errors").flexible(true); + let wrk = Workdir::new("fn generate_schema_with_defaults_and_validate_trim_with_no_errors") + .flexible(true); wrk.clear_contents().unwrap(); // copy csv file to workdir @@ -44,6 +44,7 @@ fn generate_schema_with_defaults_and_validate_with_no_errors() { // invoke validate command from schema created above let mut cmd2 = wrk.command("validate"); cmd2.arg("adur-public-toilets.csv"); + cmd2.arg("--trim"); cmd2.arg("adur-public-toilets.csv.schema.json"); wrk.output(&mut cmd2); @@ -58,10 +59,10 @@ fn generate_schema_with_defaults_and_validate_with_no_errors() { #[test] #[file_serial] -fn generate_schema_with_optional_flags_and_validate_with_errors() { +fn generate_schema_with_optional_flags_notrim_and_validate_with_errors() { // create workspace and invoke schema command with value constraints flag - let wrk = - Workdir::new("generate_schema_with_optional_flags_and_validate_with_errors").flexible(true); + let wrk = Workdir::new("generate_schema_with_optional_flags_notrim_and_validate_with_errors") + .flexible(true); wrk.clear_contents().unwrap(); // copy csv file to workdir @@ -101,6 +102,92 @@ fn generate_schema_with_optional_flags_and_validate_with_errors() { cmd2.arg("adur-public-toilets.csv.schema.json"); wrk.output(&mut cmd2); + // validation report + let validation_errors_expected = r#"row_number field error +1 OpeningHours "S = 09:00 - 21:00 W = 09:00 - 17:00 " is not one of ["09.00 - 17.00","S = 08:00 - 21:00 W = 08:00 - 17:00","S = 09:00 - 15:00 W = 09:00 - 15:00","S = 09:00 - 21:00 W = 09:00 - 17:00",null] +2 ExtractDate "07/07/2014 00:00" is not a "date" +3 ExtractDate "2014-07-07 00:00" is not a "date" +4 ExtractDate "07/07/2014 00:00" is not a "date" +5 ExtractDate "07/07/2014 00:00" is not a "date" +6 ExtractDate "07/07/2014 00:00" is not a "date" +7 ExtractDate "07/07/2014 00:00" is not a "date" +8 ExtractDate "07/07/2014 00:00" is not a "date" +9 ExtractDate "07/07/2014 00:00" is not a "date" +10 ExtractDate "07/07/2014 00:00" is not a "date" +11 ExtractDate "07/07/2014 00:00" is not a "date" +12 ExtractDate "07/07/2014 00:00" is not a "date" +13 ExtractDate "07/07/2014 00:00" is not a "date" +14 ExtractDate "07/07/2014 00:00" is not a "date" +15 ExtractDate "07/07/2014 00:00" is not a "date" +"#; + + // expecting invalid rows, so confirm there ARE output files generated + let validation_error_path = &wrk.path("adur-public-toilets.csv.validation-errors.tsv"); + println!("expecting validation error file at: {validation_error_path:?}"); + + assert!(Path::new(validation_error_path).exists()); + assert!(Path::new(&wrk.path("adur-public-toilets.csv.valid")).exists()); + assert!(Path::new(&wrk.path("adur-public-toilets.csv.invalid")).exists()); + + // check validation error output + let validation_error_output: String = + wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.tsv")); + + assert!(!validation_error_output.is_empty()); + + assert_eq!( + validation_errors_expected.to_string(), + validation_error_output + ); + wrk.assert_err(&mut cmd2); +} + +#[test] +#[file_serial] +fn generate_schema_with_optional_flags_trim_and_validate_with_errors() { + // create workspace and invoke schema command with value constraints flag + let wrk = Workdir::new("generate_schema_with_optional_flags_trim_and_validate_with_errors") + .flexible(true); + wrk.clear_contents().unwrap(); + + // copy csv file to workdir + let csv = wrk.load_test_resource("adur-public-toilets.csv"); + wrk.create_from_string("adur-public-toilets.csv", &csv); + + // run schema command with value constraints option + let mut cmd = wrk.command("schema"); + cmd.arg("adur-public-toilets.csv"); + cmd.arg("--enum-threshold"); + cmd.arg("13"); + cmd.arg("--pattern-columns"); + cmd.arg("ReportEmail,OpeningHours"); + cmd.arg("--strict-dates"); + wrk.output(&mut cmd); + + // load output schema file + let output_schema_string: String = + wrk.from_str(&wrk.path("adur-public-toilets.csv.schema.json")); + let output_schema_json = + serde_json::from_str(&output_schema_string).expect("parse schema json"); + + // make sure it's a valid JSON Schema by compiling with jsonschema library + jsonschema::JSONSchema::options() + .compile(&output_schema_json) + .expect("valid JSON Schema"); + + // diff output json with expected json + let expected_schema: String = + wrk.load_test_resource("adur-public-toilets.csv.schema-strict.expected.json"); + let expected_schema_json: Value = serde_json::from_str(&expected_schema).unwrap(); + assert_json_eq!(expected_schema_json, output_schema_json); + + // invoke validate command from schema created above + let mut cmd2 = wrk.command("validate"); + cmd2.arg("adur-public-toilets.csv"); + cmd2.arg("--trim"); + cmd2.arg("adur-public-toilets.csv.schema.json"); + wrk.output(&mut cmd2); + // validation report let validation_errors_expected = r#"row_number field error 2 ExtractDate "07/07/2014 00:00" is not a "date"