From 2d3f6aa69cb6c61440080ef7d2a4454d8eb3aa29 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 28 Sep 2024 00:11:53 -0400 Subject: [PATCH 1/2] `enum`: `--new-column` now applies to all modes, not just --increment --- src/cmd/enumerate.rs | 88 +++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/src/cmd/enumerate.rs b/src/cmd/enumerate.rs index 041b25299..5e23c0af2 100644 --- a/src/cmd/enumerate.rs +++ b/src/cmd/enumerate.rs @@ -54,21 +54,26 @@ enum options: Only applies in Increment mode. (default: 1) --constant Fill a new column with the given value. - Changes the default column name to "constant". + Changes the default column name to "constant" unless + overridden by --new-column. To specify a null value, pass the literal "". --copy Name of a column to copy. - Changes the default column name to "{column}_copy". + Changes the default column name to "{column}_copy" + unless overridden by --new-column. --uuid4 When set, the column will be populated with uuids (v4) instead of the incremental identifier. - Changes the default column name to "uuid4". + Changes the default column name to "uuid4" unless + overridden by --new-column. --uuid7 When set, the column will be populated with uuids (v7) instead of the incremental identifier. uuid v7 is a time-based uuid and is monotonically increasing. See https://buildkite.com/blog/goodbye-integers-hello-uuids - Changes the default column name to "uuid7". + Changes the default column name to "uuid7" unless + overridden by --new-column. --hash Create a new column filled with the hash of the given column/s. Use "1-" to hash all columns. - Changes the default column name to "hash". + Changes the default column name to "hash" unless + overridden by --new-column. Will remove an existing "hash" column if it exists. The argument specify the columns to use @@ -114,6 +119,7 @@ struct Args { flag_delimiter: Option, } +#[derive(PartialEq)] enum EnumOperation { Increment, Uuid4, @@ -146,7 +152,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } let mut hash_sel = None; - let mut hash_operation = false; if let Some(hash_columns) = &args.flag_hash { // get the index of the column named "hash", if it exists @@ -178,42 +183,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // Update the configuration with the new selection rconfig = rconfig.select(no_hash_column_selection); hash_sel = Some(rconfig.selection(&headers)?); - - hash_operation = true; - } - - if !rconfig.no_headers { - if let Some(column_name) = &args.flag_new_column { - headers.push_field(column_name.as_bytes()); - } else if args.flag_uuid4 { - headers.push_field(b"uuid4"); - } else if args.flag_uuid7 { - headers.push_field(b"uuid7"); - } else if args.flag_constant.is_some() { - headers.push_field(b"constant"); - } else if copy_operation { - let current_header = match simdutf8::compat::from_utf8(&headers[copy_index]) { - Ok(s) => s, - Err(e) => return fail_clierror!("Could not parse header as utf-8!: {e}"), - }; - headers.push_field(format!("{current_header}_copy").as_bytes()); - } else if hash_operation { - // Remove an existing "hash" column from the header, if it exists - headers = if let Some(hash_index) = hash_index { - headers - .into_iter() - .enumerate() - .filter_map(|(i, field)| if i == hash_index { None } else { Some(field) }) - .collect() - } else { - headers - }; - headers.push_field(b"hash"); - } else { - headers.push_field(b"index"); - }; - - wtr.write_record(&headers)?; } let constant_value = if args.flag_constant == Some(NULL_VALUE.to_string()) { @@ -236,6 +205,41 @@ pub fn run(argv: &[&str]) -> CliResult<()> { EnumOperation::Increment }; + if !rconfig.no_headers { + if enum_operation == EnumOperation::Hash { + // Remove an existing "hash" column from the header, if it exists + headers = if let Some(hash_index) = hash_index { + headers + .into_iter() + .enumerate() + .filter_map(|(i, field)| if i == hash_index { None } else { Some(field) }) + .collect() + } else { + headers + }; + } + let column_name = if let Some(new_column_name) = args.flag_new_column { + new_column_name + } else { + match enum_operation { + EnumOperation::Increment => "index".to_string(), + EnumOperation::Uuid4 => "uuid4".to_string(), + EnumOperation::Uuid7 => "uuid7".to_string(), + EnumOperation::Constant => "constant".to_string(), + EnumOperation::Copy => { + let current_header = match simdutf8::compat::from_utf8(&headers[copy_index]) { + Ok(s) => s, + Err(e) => return fail_clierror!("Could not parse header as utf-8!: {e}"), + }; + format!("{current_header}_copy") + }, + EnumOperation::Hash => "hash".to_string(), + } + }; + headers.push_field(column_name.as_bytes()); + wtr.write_byte_record(&headers)?; + } + // amortize allocations let mut record = csv::ByteRecord::new(); let mut counter: u64 = args.flag_start; From 36d2923e8a69ff26c605653b59c65a5faf88027a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 28 Sep 2024 00:12:37 -0400 Subject: [PATCH 2/2] `tests`: add `enumerate` tests for issue 2172 --- tests/test_enumerate.rs | 106 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/tests/test_enumerate.rs b/tests/test_enumerate.rs index 7f0f09c58..d783ca3a2 100644 --- a/tests/test_enumerate.rs +++ b/tests/test_enumerate.rs @@ -508,3 +508,109 @@ fn enumerate_uuid7() { assert!(got[2][2] < got[3][2]); assert!(got[3][2] < got[4][2]); } + +#[test] +fn enumerate_constant_issue_2172_new_column() { + let wrk = Workdir::new("enumerate_constant_issue_2172_new_column"); + wrk.create( + "data.csv", + vec![ + svec!["name", "numcol"], + svec!["Fred", "0"], + svec!["Joe", "1"], + svec!["Mary", "2"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.arg("--constant").arg("test").arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["name", "numcol", "constant"], + svec!["Fred", "0", "test"], + svec!["Joe", "1", "test"], + svec!["Mary", "2", "test"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_copy_issue_2172_new_column() { + let wrk = Workdir::new("enumerate_copy_issue_2172_new_column"); + wrk.create( + "data.csv", + vec![ + svec!["name", "numcol"], + svec!["Fred", "0"], + svec!["Joe", "1"], + svec!["Mary", "2"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(["--copy", "numcol"]) + .args(["-c", "chiffre"]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["name", "numcol", "chiffre"], + svec!["Fred", "0", "0"], + svec!["Joe", "1", "1"], + svec!["Mary", "2", "2"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_issue_2172_new_column() { + let wrk = Workdir::new("enumerate_hash_issue_2172_new_column"); + wrk.create( + "data.csv", + vec![ + svec!["name", "hash"], + svec!["Fred", "0"], + svec!["Joe", "1"], + svec!["Mary", "2"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(["--hash", "name"]) + .args(["--new-column", "id"]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["name", "id"], + svec!["Fred", "7744023578077004230"], + svec!["Joe", "1162351066380295090"], + svec!["Mary", "13526984025446498287"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_issue_2172() { + let wrk = Workdir::new("enumerate_hash_issue_2172"); + wrk.create( + "data.csv", + vec![ + svec!["name", "some_other_column"], + svec!["Fred", "0"], + svec!["Joe", "1"], + svec!["Mary", "2"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(["--hash", "name"]) + .args(["--new-column", "id"]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["name", "some_other_column", "id"], + svec!["Fred", "0", "7744023578077004230"], + svec!["Joe", "1", "1162351066380295090"], + svec!["Mary", "2", "13526984025446498287"], + ]; + assert_eq!(got, expected); +}