Skip to content

Commit

Permalink
Merge pull request #2173 from jqnatividad/2172-enum-new_column
Browse files Browse the repository at this point in the history
2172 enum new column
  • Loading branch information
jqnatividad authored Sep 28, 2024
2 parents 5cb909b + 36d2923 commit 326cafb
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 42 deletions.
88 changes: 46 additions & 42 deletions src/cmd/enumerate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,26 @@ enum options:
Only applies in Increment mode.
(default: 1)
--constant <value> Fill a new column with the given value.
Changes the default column name to "constant".
Changes the default column name to "constant" unless
overridden by --new-column.
To specify a null value, pass the literal "<NULL>".
--copy <column> Name of a column to copy.
Changes the default column name to "{column}_copy".
Changes the default column name to "{column}_copy"
unless overridden by --new-column.
--uuid4 When set, the column will be populated with
uuids (v4) instead of the incremental identifier.
Changes the default column name to "uuid4".
Changes the default column name to "uuid4" unless
overridden by --new-column.
--uuid7 When set, the column will be populated with
uuids (v7) instead of the incremental identifier.
uuid v7 is a time-based uuid and is monotonically increasing.
See https://buildkite.com/blog/goodbye-integers-hello-uuids
Changes the default column name to "uuid7".
Changes the default column name to "uuid7" unless
overridden by --new-column.
--hash <columns> Create a new column filled with the hash of the
given column/s. Use "1-" to hash all columns.
Changes the default column name to "hash".
Changes the default column name to "hash" unless
overridden by --new-column.
Will remove an existing "hash" column if it exists.
The <columns> argument specifies the columns to use
Expand Down Expand Up @@ -114,6 +119,7 @@ struct Args {
flag_delimiter: Option<Delimiter>,
}

#[derive(PartialEq)]
enum EnumOperation {
Increment,
Uuid4,
Expand Down Expand Up @@ -146,7 +152,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}

let mut hash_sel = None;
let mut hash_operation = false;

if let Some(hash_columns) = &args.flag_hash {
// get the index of the column named "hash", if it exists
Expand Down Expand Up @@ -178,42 +183,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// Update the configuration with the new selection
rconfig = rconfig.select(no_hash_column_selection);
hash_sel = Some(rconfig.selection(&headers)?);

hash_operation = true;
}

if !rconfig.no_headers {
if let Some(column_name) = &args.flag_new_column {
headers.push_field(column_name.as_bytes());
} else if args.flag_uuid4 {
headers.push_field(b"uuid4");
} else if args.flag_uuid7 {
headers.push_field(b"uuid7");
} else if args.flag_constant.is_some() {
headers.push_field(b"constant");
} else if copy_operation {
let current_header = match simdutf8::compat::from_utf8(&headers[copy_index]) {
Ok(s) => s,
Err(e) => return fail_clierror!("Could not parse header as utf-8!: {e}"),
};
headers.push_field(format!("{current_header}_copy").as_bytes());
} else if hash_operation {
// Remove an existing "hash" column from the header, if it exists
headers = if let Some(hash_index) = hash_index {
headers
.into_iter()
.enumerate()
.filter_map(|(i, field)| if i == hash_index { None } else { Some(field) })
.collect()
} else {
headers
};
headers.push_field(b"hash");
} else {
headers.push_field(b"index");
};

wtr.write_record(&headers)?;
}

let constant_value = if args.flag_constant == Some(NULL_VALUE.to_string()) {
Expand All @@ -236,6 +205,41 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
EnumOperation::Increment
};

if !rconfig.no_headers {
if enum_operation == EnumOperation::Hash {
// Remove an existing "hash" column from the header, if it exists
headers = if let Some(hash_index) = hash_index {
headers
.into_iter()
.enumerate()
.filter_map(|(i, field)| if i == hash_index { None } else { Some(field) })
.collect()
} else {
headers
};
}
let column_name = if let Some(new_column_name) = args.flag_new_column {
new_column_name
} else {
match enum_operation {
EnumOperation::Increment => "index".to_string(),
EnumOperation::Uuid4 => "uuid4".to_string(),
EnumOperation::Uuid7 => "uuid7".to_string(),
EnumOperation::Constant => "constant".to_string(),
EnumOperation::Copy => {
let current_header = match simdutf8::compat::from_utf8(&headers[copy_index]) {
Ok(s) => s,
Err(e) => return fail_clierror!("Could not parse header as utf-8!: {e}"),
};
format!("{current_header}_copy")
},
EnumOperation::Hash => "hash".to_string(),
}
};
headers.push_field(column_name.as_bytes());
wtr.write_byte_record(&headers)?;
}

// amortize allocations
let mut record = csv::ByteRecord::new();
let mut counter: u64 = args.flag_start;
Expand Down
106 changes: 106 additions & 0 deletions tests/test_enumerate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,3 +508,109 @@ fn enumerate_uuid7() {
assert!(got[2][2] < got[3][2]);
assert!(got[3][2] < got[4][2]);
}

// Regression test for issue 2172 (`enum --constant`).
//
// Checks both naming paths for the generated column:
//   1. without `--new-column`, the header defaults to "constant";
//   2. with `--new-column`, the user-supplied name replaces that default.
// The original version of this test only covered (1) despite its name.
#[test]
fn enumerate_constant_issue_2172_new_column() {
    let wrk = Workdir::new("enumerate_constant_issue_2172_new_column");
    wrk.create(
        "data.csv",
        vec![
            svec!["name", "numcol"],
            svec!["Fred", "0"],
            svec!["Joe", "1"],
            svec!["Mary", "2"],
        ],
    );

    // (1) default column name
    let mut cmd = wrk.command("enum");
    cmd.arg("--constant").arg("test").arg("data.csv");

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
    let expected = vec![
        svec!["name", "numcol", "constant"],
        svec!["Fred", "0", "test"],
        svec!["Joe", "1", "test"],
        svec!["Mary", "2", "test"],
    ];
    assert_eq!(got, expected);

    // (2) default overridden by --new-column; the row values must be unaffected
    let mut cmd = wrk.command("enum");
    cmd.args(["--constant", "test"])
        .args(["--new-column", "fixed"])
        .arg("data.csv");

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
    let expected = vec![
        svec!["name", "numcol", "fixed"],
        svec!["Fred", "0", "test"],
        svec!["Joe", "1", "test"],
        svec!["Mary", "2", "test"],
    ];
    assert_eq!(got, expected);
}

// Issue 2172: when `--copy numcol` is combined with `-c chiffre`
// (presumably the short form of `--new-column` -- confirm against the usage
// text), the copied column must be titled "chiffre" instead of the default
// "{column}_copy" name.
#[test]
fn enumerate_copy_issue_2172_new_column() {
    let workdir = Workdir::new("enumerate_copy_issue_2172_new_column");

    let input_rows = vec![
        svec!["name", "numcol"],
        svec!["Fred", "0"],
        svec!["Joe", "1"],
        svec!["Mary", "2"],
    ];
    workdir.create("data.csv", input_rows);

    // Same argument sequence as before, supplied in a single call.
    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(["--copy", "numcol", "-c", "chiffre", "data.csv"]);

    // The third column mirrors `numcol` under the requested header.
    let want = vec![
        svec!["name", "numcol", "chiffre"],
        svec!["Fred", "0", "0"],
        svec!["Joe", "1", "1"],
        svec!["Mary", "2", "2"],
    ];
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, want);
}

// Issue 2172: `--hash name --new-column id` on input that already contains a
// column literally named "hash". The expected output shows the pre-existing
// "hash" column dropped from both the header and the rows, with the computed
// hash appended under the header "id".
#[test]
fn enumerate_hash_issue_2172_new_column() {
    let workdir = Workdir::new("enumerate_hash_issue_2172_new_column");

    let input_rows = vec![
        svec!["name", "hash"],
        svec!["Fred", "0"],
        svec!["Joe", "1"],
        svec!["Mary", "2"],
    ];
    workdir.create("data.csv", input_rows);

    // Same argument sequence as before, supplied in a single call.
    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(["--hash", "name", "--new-column", "id", "data.csv"]);

    let want = vec![
        svec!["name", "id"],
        svec!["Fred", "7744023578077004230"],
        svec!["Joe", "1162351066380295090"],
        svec!["Mary", "13526984025446498287"],
    ];
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, want);
}

// Issue 2172: `--hash name --new-column id` on input with no column named
// "hash". Every original column must be kept, and the hash of `name` is
// appended as a trailing column titled "id".
#[test]
fn enumerate_hash_issue_2172() {
    let workdir = Workdir::new("enumerate_hash_issue_2172");

    let input_rows = vec![
        svec!["name", "some_other_column"],
        svec!["Fred", "0"],
        svec!["Joe", "1"],
        svec!["Mary", "2"],
    ];
    workdir.create("data.csv", input_rows);

    // Same argument sequence as before, supplied in a single call.
    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(["--hash", "name", "--new-column", "id", "data.csv"]);

    let want = vec![
        svec!["name", "some_other_column", "id"],
        svec!["Fred", "0", "7744023578077004230"],
        svec!["Joe", "1", "1162351066380295090"],
        svec!["Mary", "2", "13526984025446498287"],
    ];
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, want);
}

0 comments on commit 326cafb

Please sign in to comment.