From d15c51203c981b989b4e328a797ee552e1da6a4e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:27:29 -0400 Subject: [PATCH 1/2] `dedup`: preliminary sorting before dedup respects --ignore-case option also expanded usage text explaining sorting options before deduping --- src/cmd/dedup.rs | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs index 70445ebe2..7ec7c2f91 100644 --- a/src/cmd/dedup.rs +++ b/src/cmd/dedup.rs @@ -1,15 +1,20 @@ static USAGE: &str = r#" Deduplicates CSV rows. -Note that this requires reading all of the CSV data into memory because because the -rows need to be sorted first. +This requires reading all of the CSV data into memory because the rows need +to be sorted first. -That is, unless the --sorted option is used to indicate the CSV is already sorted -(typically, with the extsort command). This will make dedup run in streaming mode -with constant memory. +That is, unless the --sorted option is used to indicate the CSV is already sorted - +typically, with the sort cmd for more sorting options or the extsort cmd for larger +than memory CSV files. This will make dedup run in streaming mode with constant memory. Either way, the output will not only be deduplicated, it will also be sorted. +Note that dedup's sorting will only be done alphabetically, not numerically. That is, +10 will come before 2. If you need to sort numerically, use the sort command first with +the --numeric option and pipe it to dedup with the --sorted option. +(i.e. qsv sort --numeric in.csv | qsv dedup --sorted) + A duplicate count will also be sent to <stderr>. For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs. 
@@ -147,11 +152,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> { util::njobs(args.flag_jobs); let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?; - all.par_sort_by(|r1, r2| { - let a = sel.select(r1); - let b = sel.select(r2); - iter_cmp(a, b) - }); + if ignore_case { + all.par_sort_by(|r1, r2| { + let a = sel.select(r1); + let b = sel.select(r2); + iter_cmp_ignore_case(a, b) + }); + } else { + all.par_sort_by(|r1, r2| { + let a = sel.select(r1); + let b = sel.select(r2); + iter_cmp(a, b) + }); + } for (current, current_record) in all.iter().enumerate() { let a = sel.select(current_record); From 684a71bdf73ed2a10fcc751ce259c66d80bce23e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:28:25 -0400 Subject: [PATCH 2/2] `dedup`: adjust tests to correctly check --ignore-case option --- tests/test_dedup.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs index 02afdd5ce..b2e7a9138 100644 --- a/tests/test_dedup.rs +++ b/tests/test_dedup.rs @@ -45,7 +45,32 @@ fn dedup_no_case() { cmd.arg("-i").arg("in.csv"); let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); - let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "b"]]; + let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "B"]]; + assert_eq!(got, expected); +} + +#[test] +fn dedup_issue_1381() { + let wrk = Workdir::new("dedup_issue_1381"); + wrk.create( + "in.csv", + vec![ + svec!["office"], + svec!["Member of legislative assembly"], + svec!["Member of Legislative Assembly"], + svec!["Member of Tamil Nadu Legislative Assembly"], + ], + ); + + let mut cmd = wrk.command("dedup"); + cmd.arg("-i").arg("in.csv"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["office"], + svec!["Member of Legislative Assembly"], + svec!["Member of Tamil Nadu Legislative Assembly"], + ]; assert_eq!(got, expected); } @@ -183,7 +208,7 @@ 
fn dedup_alreadysorted_nocase() { svec!["N", "S"], svec!["10", "a"], svec!["100", "a"], - svec!["20", "b"], + svec!["20", "B"], svec!["3", "c"], svec!["4", "d"], ];