diff --git a/Cargo.lock b/Cargo.lock index cedf85ed1..c5aa03327 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1924,6 +1924,11 @@ dependencies = [ "slab", ] +[[package]] +name = "gender_guesser" +version = "0.2.0" +source = "git+https://github.com/jqnatividad/gender_guesser?branch=bundle_namdict_txt#bbd8c88601607c4e66493664fd584f8e33b7d841" + [[package]] name = "generator" version = "0.7.5" @@ -4019,6 +4024,7 @@ dependencies = [ "flexi_logger", "futures", "futures-util", + "gender_guesser", "geosuggest-core", "geosuggest-utils", "governor", diff --git a/Cargo.toml b/Cargo.toml index bbd2b947a..4b1a5fd23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,6 +106,7 @@ flexi_logger = { version = "0.27", features = [ ], default-features = false } futures = "0.3" futures-util = "0.3" +gender_guesser = { version = "0.2", optional = true } geosuggest-core = { version = "0.6", optional = true } geosuggest-utils = { version = "0.6", optional = true } governor = { version = "0.6", optional = true } @@ -235,6 +236,7 @@ serial_test = { version = "3.0", features = ["file_locks"] } dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } grex = { git = "https://github.com/pemistahl/grex", rev = "8f6b35cee5f911311c2e0ef6e56f333e4c896112" } halfbrown = { git = "https://github.com/licenser/halfbrown", rev = "7cecc29422ae2775abe35a2e430f1678b4f1aa76" } +gender_guesser = { git = "https://github.com/jqnatividad/gender_guesser", branch = "bundle_namdict_txt"} [features] default = ["mimalloc"] @@ -256,6 +258,7 @@ apply = [ "cpc", "data-encoding", "eudex", + "gender_guesser", "hashbrown", "qsv_currency", "strsim", diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index 81f0581c5..1abab6ac8 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -3,7 +3,7 @@ Apply a series of transformation functions to given CSV column/s. This can be us perform typical data-wrangling tasks and/or to harmonize some values, etc. It has five subcommands: - 1. 
operations* - 36 string, format, currency, regex & NLP operators. + 1. operations* - 37 string, format, currency, regex & NLP operators. 2. emptyreplace* - replace empty cells with <--replacement> string. 3. datefmt* - Formats recognized date/s (19 formats recognized) to a specified date format using <--formatstr>. @@ -73,6 +73,7 @@ It has 36 supported operations: with --comparand. Automatically rounds values to two decimal places. Specify "euro" formatting (e.g. 1.000,00 instead of 1,000.00 ) by setting --formatstr to "euro". Specify conversion rate by setting --replacement to a number. + * gender_guess: Guess the gender of a name. * copy: Mark a column for copying * simdl: Damerau-Levenshtein similarity to --comparand * simdln: Normalized Damerau-Levenshtein similarity to --comparand (between 0.0 & 1.0) @@ -358,6 +359,7 @@ use cpc::{eval, units::Unit}; use data_encoding::BASE64; use dynfmt::Format; use eudex::Hash; +use gender_guesser::Gender; use indicatif::{ProgressBar, ProgressDrawTarget}; use log::debug; use qsv_currency::Currency; @@ -402,6 +404,7 @@ enum Operations { Encode, Escape, Eudex, + Gender_Guess, Len, Lower, Ltrim, @@ -999,6 +1002,13 @@ fn validate_operations( } whatlang_invokes = whatlang_invokes.saturating_add(1); }, + Operations::Gender_Guess => { + if flag_new_column.is_none() { + return fail_incorrectusage_clierror!( + "--new-column (-c) is required for Gender_Guess" + ); + } + }, _ => {}, } ops_vec.push(operation); @@ -1083,6 +1093,22 @@ fn apply_operations( Err(e) => format!("decoding error: {e:?}"), }; }, + Operations::Gender_Guess => { + // Constructing a Detector does not depend on the cell, so build it once on + // first use and share it across all cells instead of rebuilding it per cell. + static GENDER_DETECTOR: std::sync::OnceLock<gender_guesser::Detector> = + std::sync::OnceLock::new(); + let gender_detector = GENDER_DETECTOR.get_or_init(gender_guesser::Detector::new); + *cell = match gender_detector.get_gender(cell) { + Gender::Male => "Male".to_string(), + Gender::Female => "Female".to_string(), + Gender::MayBeMale => "MayBeMale".to_string(), + Gender::MayBeFemale => "MayBeFemale".to_string(), + Gender::BothMaleFemale => "BothMaleFemale".to_string(), + Gender::NotSure => "NotSure".to_string(), + Gender::NotFound => 
"NotFound".to_string(), + }; + }, Operations::Escape => { *cell = cell.escape_default().to_string(); }, diff --git a/tests/test_apply.rs b/tests/test_apply.rs index 11467fc33..47ee275c0 100644 --- a/tests/test_apply.rs +++ b/tests/test_apply.rs @@ -47,6 +47,62 @@ fn apply_ops_upper() { assert_eq!(got, expected); } +// `apply operations gender_guess` must write the guessed gender of each name into the new "Gender" column. +#[test] +fn apply_ops_gender_guess() { + let wrk = Workdir::new("apply"); + wrk.create( + "data.csv", + vec![ + svec!["name"], + svec!["Peter"], + svec!["Michael"], + svec!["Joel"], + svec!["Hussein"], + svec!["Ian"], + svec!["Enrique"], + svec!["Ana"], + svec!["Olivia"], + svec!["Mackenzie"], + svec!["Adair"], + svec!["Aaf"], + svec!["Voldemort"], + svec!["Sami"], + svec!["Minhaj"], + svec!["Abdurrahman"], + svec!["Abbe"], + ], + ); + let mut cmd = wrk.command("apply"); + cmd.arg("operations") + .arg("gender_guess") + .arg("name") + .args(["--new-column", "Gender"]) + .arg("data.csv"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["name", "Gender"], + svec!["Peter", "Male"], + svec!["Michael", "Male"], + svec!["Joel", "Male"], + svec!["Hussein", "Male"], + svec!["Ian", "Male"], + svec!["Enrique", "Male"], + svec!["Ana", "Female"], + svec!["Olivia", "Female"], + svec!["Mackenzie", "NotSure"], + svec!["Adair", "MayBeMale"], + svec!["Aaf", "MayBeFemale"], + svec!["Voldemort", "NotFound"], + svec!["Sami", "Male"], + svec!["Minhaj", "NotFound"], + svec!["Abdurrahman", "Male"], + svec!["Abbe", "NotSure"], + ]; + assert_eq!(got, expected); +} + #[test] fn apply_ops_escape() { let wrk = Workdir::new("apply");