Skip to content

Commit

Permalink
Merge pull request #1387 from jqnatividad/1381-dedup-ignorecase
Browse files Browse the repository at this point in the history
`dedup`: fix --ignore-case option
  • Loading branch information
jqnatividad authored Oct 25, 2023
2 parents 1b1a9bc + 684a71b commit 3191335
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 12 deletions.
33 changes: 23 additions & 10 deletions src/cmd/dedup.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
static USAGE: &str = r#"
Deduplicates CSV rows.
Note that this requires reading all of the CSV data into memory because the
rows need to be sorted first.
This requires reading all of the CSV data into memory because the rows need
to be sorted first.
That is, unless the --sorted option is used to indicate the CSV is already sorted
(typically, with the extsort command). This will make dedup run in streaming mode
with constant memory.
That is, unless the --sorted option is used to indicate the CSV is already sorted -
typically, with the sort cmd for more sorting options or the extsort cmd for larger
than memory CSV files. This will make dedup run in streaming mode with constant memory.
Either way, the output will not only be deduplicated, it will also be sorted.
Note that dedup's sorting will only be done alphabetically, not numerically. That is,
10 will come before 2. If you need to sort numerically, use the sort command first with
the --numeric option and pipe it to dedup with the --sorted option.
(i.e. qsv sort --numeric in.csv | qsv dedup --sorted)
A duplicate count will also be sent to <stderr>.
For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs.
Expand Down Expand Up @@ -147,11 +152,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
util::njobs(args.flag_jobs);

let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?;
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp(a, b)
});
if ignore_case {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp_ignore_case(a, b)
});
} else {
all.par_sort_by(|r1, r2| {
let a = sel.select(r1);
let b = sel.select(r2);
iter_cmp(a, b)
});
}

for (current, current_record) in all.iter().enumerate() {
let a = sel.select(current_record);
Expand Down
29 changes: 27 additions & 2 deletions tests/test_dedup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,32 @@ fn dedup_no_case() {
cmd.arg("-i").arg("in.csv");

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "b"]];
let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "B"]];
assert_eq!(got, expected);
}

/// Regression test for issue #1381: running `dedup` with `-i` on rows that
/// differ only in letter case must collapse them into a single row, while a
/// genuinely different row is kept.
#[test]
fn dedup_issue_1381() {
    let workdir = Workdir::new("dedup_issue_1381");

    // Header plus three data rows; the first two data rows differ only in
    // the capitalisation of "legislative assembly".
    let input_rows = vec![
        svec!["office"],
        svec!["Member of legislative assembly"],
        svec!["Member of Legislative Assembly"],
        svec!["Member of Tamil Nadu Legislative Assembly"],
    ];
    workdir.create("in.csv", input_rows);

    let mut dedup_cmd = workdir.command("dedup");
    dedup_cmd.arg("-i").arg("in.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut dedup_cmd);

    // Only one of the two case-variant rows may survive, followed by the
    // distinct "Tamil Nadu" row.
    assert_eq!(
        actual,
        vec![
            svec!["office"],
            svec!["Member of Legislative Assembly"],
            svec!["Member of Tamil Nadu Legislative Assembly"],
        ]
    );
}

Expand Down Expand Up @@ -183,7 +208,7 @@ fn dedup_alreadysorted_nocase() {
svec!["N", "S"],
svec!["10", "a"],
svec!["100", "a"],
svec!["20", "b"],
svec!["20", "B"],
svec!["3", "c"],
svec!["4", "d"],
];
Expand Down

0 comments on commit 3191335

Please sign in to comment.