From d15c51203c981b989b4e328a797ee552e1da6a4e Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:27:29 -0400
Subject: [PATCH 1/2] `dedup`: preliminary sorting before dedup respects
 --ignore-case option

also expanded usage text explaining sorting options before deduping
---
 src/cmd/dedup.rs | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)
diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs
index 70445ebe2..7ec7c2f91 100644
--- a/src/cmd/dedup.rs
+++ b/src/cmd/dedup.rs
@@ -1,15 +1,20 @@
 static USAGE: &str = r#"
 Deduplicates CSV rows. 
 
-Note that this requires reading all of the CSV data into memory because because the 
-rows need to be sorted first. 
+This requires reading all of the CSV data into memory because the rows need
+to be sorted first.
 
-That is, unless the --sorted option is used to indicate the CSV is already sorted
-(typically, with the extsort command). This will make dedup run in streaming mode 
-with constant memory.
+That is, unless the --sorted option is used to indicate the CSV is already sorted,
+typically with the sort cmd (more sorting options) or the extsort cmd (for CSV files
+larger than memory). This will make dedup run in streaming mode with constant memory.
 
 Either way, the output will not only be deduplicated, it will also be sorted.
 
+Note that dedup's sorting will only be done alphabetically, not numerically. That is,
+10 will come before 2. If you need to sort numerically, use the sort command first with
+the --numeric option and pipe it to dedup with the --sorted option.
+(e.g. qsv sort --numeric in.csv | qsv dedup --sorted)
+
 A duplicate count will also be sent to <stderr>.
 
 For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
@@ -147,11 +152,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         util::njobs(args.flag_jobs);
 
         let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
-        all.par_sort_by(|r1, r2| {
-            let a = sel.select(r1);
-            let b = sel.select(r2);
-            iter_cmp(a, b)
-        });
+        if ignore_case {
+            all.par_sort_by(|r1, r2| {
+                let a = sel.select(r1);
+                let b = sel.select(r2);
+                iter_cmp_ignore_case(a, b)
+            });
+        } else {
+            all.par_sort_by(|r1, r2| {
+                let a = sel.select(r1);
+                let b = sel.select(r2);
+                iter_cmp(a, b)
+            });
+        }
 
         for (current, current_record) in all.iter().enumerate() {
             let a = sel.select(current_record);

From 684a71bdf73ed2a10fcc751ce259c66d80bce23e Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:28:25 -0400
Subject: [PATCH 2/2] `dedup`: adjust tests to correctly check --ignore-case
 option

---
 tests/test_dedup.rs | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs
index 02afdd5ce..b2e7a9138 100644
--- a/tests/test_dedup.rs
+++ b/tests/test_dedup.rs
@@ -45,7 +45,32 @@ fn dedup_no_case() {
     cmd.arg("-i").arg("in.csv");
 
     let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
-    let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "b"]];
+    let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "B"]];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn dedup_issue_1381() {
+    let wrk = Workdir::new("dedup_issue_1381");
+    wrk.create(
+        "in.csv",
+        vec![
+            svec!["office"],
+            svec!["Member of legislative assembly"],
+            svec!["Member of Legislative Assembly"],
+            svec!["Member of Tamil Nadu Legislative Assembly"],
+        ],
+    );
+
+    let mut cmd = wrk.command("dedup");
+    cmd.arg("-i").arg("in.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["office"],
+        svec!["Member of Legislative Assembly"],
+        svec!["Member of Tamil Nadu Legislative Assembly"],
+    ];
     assert_eq!(got, expected);
 }
 
@@ -183,7 +208,7 @@ fn dedup_alreadysorted_nocase() {
         svec!["N", "S"],
         svec!["10", "a"],
         svec!["100", "a"],
-        svec!["20", "b"],
+        svec!["20", "B"],
         svec!["3", "c"],
         svec!["4", "d"],
     ];