Merge pull request #1387 from jqnatividad/1381-dedup-ignorecase

`dedup`: fix --ignore-case option
dathere · Oct 25, 2023 · 3191335 · 3191335
2 parents 1b1a9bc + 684a71b
commit 3191335
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 12 deletions.
diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs
@@ -1,15 +1,20 @@
 static USAGE: &str = r#"
 Deduplicates CSV rows. 
 
-Note that this requires reading all of the CSV data into memory because because the 
-rows need to be sorted first. 
+This requires reading all of the CSV data into memory because the rows need
+to be sorted first.
 
-That is, unless the --sorted option is used to indicate the CSV is already sorted
-(typically, with the extsort command). This will make dedup run in streaming mode 
-with constant memory.
+That is, unless the --sorted option is used to indicate the CSV is already sorted -
+typically, with the sort cmd for more sorting options or the extsort cmd for larger
+than memory CSV files. This will make dedup run in streaming mode with constant memory.
 
 Either way, the output will not only be deduplicated, it will also be sorted.
 
+Note that dedup's sorting will only be done alphabetically, not numerically. That is,
+10 will come before 2. If you need to sort numerically, use the sort command first with
+the --numeric option and pipe it to dedup with the --sorted option.
+(e.g. qsv sort --numeric in.csv | qsv dedup --sorted)
+
 A duplicate count will also be sent to <stderr>.
 
 For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
@@ -147,11 +152,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         util::njobs(args.flag_jobs);
 
         let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
-        all.par_sort_by(|r1, r2| {
-            let a = sel.select(r1);
-            let b = sel.select(r2);
-            iter_cmp(a, b)
-        });
+        if ignore_case {
+            all.par_sort_by(|r1, r2| {
+                let a = sel.select(r1);
+                let b = sel.select(r2);
+                iter_cmp_ignore_case(a, b)
+            });
+        } else {
+            all.par_sort_by(|r1, r2| {
+                let a = sel.select(r1);
+                let b = sel.select(r2);
+                iter_cmp(a, b)
+            });
+        }
 
         for (current, current_record) in all.iter().enumerate() {
             let a = sel.select(current_record);

diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs
@@ -45,7 +45,32 @@ fn dedup_no_case() {
     cmd.arg("-i").arg("in.csv");
 
     let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
-    let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "b"]];
+    let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "B"]];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn dedup_issue_1381() {
+    let wrk = Workdir::new("dedup_issue_1381");
+    wrk.create(
+        "in.csv",
+        vec![
+            svec!["office"],
+            svec!["Member of legislative assembly"],
+            svec!["Member of Legislative Assembly"],
+            svec!["Member of Tamil Nadu Legislative Assembly"],
+        ],
+    );
+
+    let mut cmd = wrk.command("dedup");
+    cmd.arg("-i").arg("in.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["office"],
+        svec!["Member of Legislative Assembly"],
+        svec!["Member of Tamil Nadu Legislative Assembly"],
+    ];
     assert_eq!(got, expected);
 }
 
@@ -183,7 +208,7 @@ fn dedup_alreadysorted_nocase() {
         svec!["N", "S"],
         svec!["10", "a"],
         svec!["100", "a"],
-        svec!["20", "b"],
+        svec!["20", "B"],
         svec!["3", "c"],
         svec!["4", "d"],
     ];