Skip to content

Commit

Permalink
Merge pull request #1541 from jqnatividad/pseudo-improvements
Browse files Browse the repository at this point in the history
`pseudo`: major refactor
  • Loading branch information
jqnatividad authored Jan 11, 2024
2 parents ae170e3 + 3e6f344 commit 3a3676b
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 35 deletions.
18 changes: 5 additions & 13 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,7 @@ csvs_convert = { version = "0.8", default-features = false, features = [
], optional = true }
data-encoding = { version = "2.5", optional = true }
dotenvy = "0.15"
dynfmt = { version = "0.1", default-features = false, features = [
"curly",
], optional = true }
dynfmt = { version = "0.1", default-features = false, features = ["curly"] }
eudex = { version = "0.1", optional = true }
ext-sort = { version = "0.1", features = [
"memory-limit",
Expand Down Expand Up @@ -129,10 +127,7 @@ jsonxf = { version = "1", optional = true }
jql-runner = { version = "7.1", default-features = false, optional = true }
log = "0.4"
mimalloc = { version = "0.1", default-features = false, optional = true }
mlua = { version = "0.9", features = [
"luau",
"serialize",
], optional = true }
mlua = { version = "0.9", features = ["luau", "serialize"], optional = true }
num_cpus = "1"
odht = "0.3"
phf = { version = "0.11", features = ["macros"], optional = true }
Expand Down Expand Up @@ -234,8 +229,8 @@ serial_test = { version = "3.0", features = ["file_locks"] }

[patch.crates-io]
console = { git = "https://github.com/console-rs/console", rev = "0567bdcb6a523466586dc9ff0692c2e776f20a5f" }
dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps"}
grex = { git = "https://github.com/pemistahl/grex", rev="8f6b35cee5f911311c2e0ef6e56f333e4c896112" }
dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" }
grex = { git = "https://github.com/pemistahl/grex", rev = "8f6b35cee5f911311c2e0ef6e56f333e4c896112" }

[features]
default = ["mimalloc"]
Expand All @@ -256,7 +251,6 @@ apply = [
"censor",
"cpc",
"data-encoding",
"dynfmt",
"eudex",
"hashbrown",
"qsv_currency",
Expand All @@ -269,7 +263,6 @@ apply = [
fetch = [
"cached",
"console",
"dynfmt",
"flate2",
"governor",
"hashbrown",
Expand All @@ -283,7 +276,6 @@ foreach = []
geocode = [
"anyhow",
"cached",
"dynfmt",
"geosuggest-core",
"geosuggest-utils",
"phf",
Expand All @@ -294,7 +286,7 @@ python = ["pyo3"]
to = ["csvs_convert"]
to_parquet = ["csvs_convert/parquet"]
lite = []
datapusher_plus = ["dynfmt", "self_update"]
datapusher_plus = ["self_update"]
polars = ["dep:polars", "smartstring"]
feature_capable = []
nightly = [
Expand Down
119 changes: 97 additions & 22 deletions src/cmd/pseudo.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,66 @@
static USAGE: &str = r#"
Pseudonymise the value of the given column by replacing them by an
Pseudonymise the value of the given column by replacing it with an
incremental identifier.
For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_pseudo.rs.
Once a value is pseudonymised, it will always be replaced with the same
identifier, even if it appears in different rows.
The incremental identifier is generated by using the given format string
and the starting number and increment.
EXAMPLE:
Pseudonymise the value of the "Name" column by replacing it with an
incremental identifier starting at 1000 and incrementing by 5:
$ qsv pseudo Name --start 1000 --increment 5 --formatstr "ID-{}" data.csv
If run on the following CSV data:
Name,Color
Mary,yellow
John,blue
Mary,purple
Sue,orange
John,magenta
Mary,cyan
will replace the value of the "Name" column with the following values:
Name,Color
ID-1000,yellow
ID-1005,blue
ID-1000,purple
ID-1010,orange
ID-1005,magenta
ID-1000,cyan
For more examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_pseudo.rs.
Usage:
qsv pseudo [options] <column> [<input>]
qsv pseudo --help
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-h, --help Display this message
--start <number> The starting number for the incremental identifier.
[default: 0]
--increment <number> The increment for the incremental identifier.
[default: 1]
--formatstr <template> The format string for the incremental identifier.
The format string must contain a single "{}" which
will be replaced with the incremental identifier.
[default: {}]
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;

use ahash::AHashMap;
use dynfmt::Format;
use serde::Deserialize;

use crate::{
Expand All @@ -32,12 +75,16 @@ use crate::{
struct Args {
arg_column: SelectColumns,
arg_input: Option<String>,
flag_start: u64,
flag_increment: u64,
flag_formatstr: String,
flag_output: Option<String>,
flag_no_headers: bool,
flag_delimiter: Option<Delimiter>,
}

type Values = AHashMap<String, u64>;
type Values = AHashMap<String, String>;
type ValuesNum = AHashMap<String, u64>;

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
Expand Down Expand Up @@ -70,20 +117,48 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}

let mut record = csv::StringRecord::new();
let mut values = Values::new();
let mut counter: u64 = 0;

while rdr.read_record(&mut record)? {
let value = record[column_index].to_owned();

if let Some(id) = values.get(&value) {
record = replace_column_value(&record, column_index, &id.to_string());
} else {
values.insert(value, counter);
record = replace_column_value(&record, column_index, &counter.to_string());
counter += 1;
let mut counter: u64 = args.flag_start;
let increment = args.flag_increment;

if args.flag_formatstr == "{}" {
// we don't need to use dynfmt::SimpleCurlyFormat if the format string is "{}"
let mut values_num = ValuesNum::with_capacity(1000);
let mut curr_counter: u64 = 0;

while rdr.read_record(&mut record)? {
let value = record[column_index].to_owned();
let new_value = values_num.entry(value.clone()).or_insert_with(|| {
curr_counter = counter;
counter = counter.wrapping_add(increment);
curr_counter
});
record = replace_column_value(&record, column_index, &new_value.to_string());

wtr.write_record(&record)?;
}
} else {
// we need to use dynfmt::SimpleCurlyFormat if the format string is not "{}"
let mut values = Values::with_capacity(1000);
while rdr.read_record(&mut record)? {
let value = record[column_index].to_owned();

let new_value = values.entry(value.clone()).or_insert_with(|| {
if let Ok(nvalue) =
dynfmt::SimpleCurlyFormat.format(&args.flag_formatstr, [counter])
{
counter = counter.wrapping_add(increment);
nvalue.to_string()
} else {
// safety: the unwrap() is here because we're in a closure
return fail_clierror!("Invalid format string: {}", args.flag_formatstr)
.unwrap();
}
});

record = replace_column_value(&record, column_index, new_value);
wtr.write_record(&record)?;
}
wtr.write_record(&record)?;
}

Ok(wtr.flush()?)
}
102 changes: 102 additions & 0 deletions tests/test_pseudo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,105 @@ fn pseudo_no_headers() {
];
assert_eq!(got, expected);
}

#[test]
fn pseudo_formatstr() {
    // Only --formatstr is supplied, so the identifiers rely on the default
    // start (0) and increment (1) and are rendered through the "ID-{}" template.
    let workdir = Workdir::new("pseudo_formatstr");

    let input_rows = vec![
        svec!["name", "colors"],
        svec!["Mary", "yellow"],
        svec!["John", "blue"],
        svec!["Mary", "purple"],
        svec!["Sue", "orange"],
        svec!["John", "magenta"],
        svec!["Mary", "cyan"],
    ];
    workdir.create("data.csv", input_rows);

    let mut pseudo_cmd = workdir.command("pseudo");
    pseudo_cmd.args(["name", "--formatstr", "ID-{}", "data.csv"]);

    // Repeated names must map back to the identifier assigned on first sight.
    let expected = vec![
        svec!["name", "colors"],
        svec!["ID-0", "yellow"],
        svec!["ID-1", "blue"],
        svec!["ID-0", "purple"],
        svec!["ID-2", "orange"],
        svec!["ID-1", "magenta"],
        svec!["ID-0", "cyan"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut pseudo_cmd);
    assert_eq!(got, expected);
}

#[test]
fn pseudo_formatstr_increment() {
    // --formatstr combined with --increment 5: each newly seen name gets an
    // identifier five higher than the previous one, starting from the default 0.
    let workdir = Workdir::new("pseudo_formatstr_increment");

    let input_rows = vec![
        svec!["name", "colors"],
        svec!["Mary", "yellow"],
        svec!["John", "blue"],
        svec!["Mary", "purple"],
        svec!["Sue", "orange"],
        svec!["John", "magenta"],
        svec!["Mary", "cyan"],
    ];
    workdir.create("data.csv", input_rows);

    let mut pseudo_cmd = workdir.command("pseudo");
    pseudo_cmd.args([
        "name",
        "--formatstr",
        "ID-{}",
        "--increment",
        "5",
        "data.csv",
    ]);

    // Repeated names must map back to the identifier assigned on first sight.
    let expected = vec![
        svec!["name", "colors"],
        svec!["ID-0", "yellow"],
        svec!["ID-5", "blue"],
        svec!["ID-0", "purple"],
        svec!["ID-10", "orange"],
        svec!["ID-5", "magenta"],
        svec!["ID-0", "cyan"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut pseudo_cmd);
    assert_eq!(got, expected);
}

#[test]
fn pseudo_formatstr_start_increment() {
    // All three options together: identifiers start at 1000, advance by 5 for
    // each newly seen name, and are rendered through the "ID-{}" template.
    let workdir = Workdir::new("pseudo_formatstr_start_increment");

    let input_rows = vec![
        svec!["name", "colors"],
        svec!["Mary", "yellow"],
        svec!["John", "blue"],
        svec!["Mary", "purple"],
        svec!["Sue", "orange"],
        svec!["John", "magenta"],
        svec!["Mary", "cyan"],
    ];
    workdir.create("data.csv", input_rows);

    let mut pseudo_cmd = workdir.command("pseudo");
    pseudo_cmd.args([
        "name",
        "--start",
        "1000",
        "--formatstr",
        "ID-{}",
        "--increment",
        "5",
        "data.csv",
    ]);

    // Repeated names must map back to the identifier assigned on first sight.
    let expected = vec![
        svec!["name", "colors"],
        svec!["ID-1000", "yellow"],
        svec!["ID-1005", "blue"],
        svec!["ID-1000", "purple"],
        svec!["ID-1010", "orange"],
        svec!["ID-1005", "magenta"],
        svec!["ID-1000", "cyan"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut pseudo_cmd);
    assert_eq!(got, expected);
}

0 comments on commit 3a3676b

Please sign in to comment.