diff --git a/src/cmd/geocode.rs b/src/cmd/geocode.rs index 62e4d9d77..45f0fd8b6 100644 --- a/src/cmd/geocode.rs +++ b/src/cmd/geocode.rs @@ -130,7 +130,8 @@ geocode options: e.g. "City: {name}, State: {admin1}, Country: {country}" [default: %+] - + --invalid-result The string to use when the geocode result is empty/invalid. + If not set, the original value is used. -j, --jobs The number of jobs to run in parallel. When not set, the number of jobs is set to the number of CPUs detected. -b, --batch The number of rows per batch to load into memory, before running in parallel. @@ -183,28 +184,29 @@ use crate::{ #[derive(Deserialize, Debug)] struct Args { - arg_column: String, - cmd_suggest: bool, - cmd_reverse: bool, - cmd_index_check: bool, - cmd_index_update: bool, - cmd_index_load: bool, - cmd_index_reset: bool, - arg_input: Option, - arg_index_file: Option, - flag_rename: Option, - flag_min_score: f32, - flag_k_weight: Option, - flag_formatstr: String, - flag_batch: u32, - flag_timeout: u16, - flag_languages: String, - flag_cache_dir: String, - flag_jobs: Option, - flag_new_column: Option, - flag_output: Option, - flag_delimiter: Option, - flag_progressbar: bool, + arg_column: String, + cmd_suggest: bool, + cmd_reverse: bool, + cmd_index_check: bool, + cmd_index_update: bool, + cmd_index_load: bool, + cmd_index_reset: bool, + arg_input: Option, + arg_index_file: Option, + flag_rename: Option, + flag_min_score: f32, + flag_k_weight: Option, + flag_formatstr: String, + flag_invalid_result: Option, + flag_batch: u32, + flag_timeout: u16, + flag_languages: String, + flag_cache_dir: String, + flag_jobs: Option, + flag_new_column: Option, + flag_output: Option, + flag_delimiter: Option, + flag_progressbar: bool, } static QSV_VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -340,7 +342,7 @@ async fn geocode_main(args: Args) -> CliResult<()> { if index_cmd { match geocode_cmd { GeocodeSubCmd::IndexCheck => { - // load geocode engine + // check if we have updates 
winfo!("Checking main Geonames website for updates..."); check_index_file(&geocode_index_file)?; let engine = @@ -357,6 +359,8 @@ async fn geocode_main(args: Args) -> CliResult<()> { } }, GeocodeSubCmd::IndexUpdate => { + // update/rebuild Geonames index from Geonames website + // will only update if there are changes check_index_file(&geocode_index_file)?; let engine = load_engine(geocode_index_file.clone().into(), args.flag_progressbar).await?; @@ -410,6 +414,7 @@ async fn geocode_main(args: Args) -> CliResult<()> { } // we're not doing an index subcommand, so we're doing a suggest or reverse + // load the current local Geonames index let engine = load_engine(geocode_index_file.clone().into(), args.flag_progressbar).await?; let rconfig = Config::new(&args.arg_input) @@ -465,6 +470,8 @@ async fn geocode_main(args: Args) -> CliResult<()> { // set RAYON_NUM_THREADS util::njobs(args.flag_jobs); + let invalid_result = args.flag_invalid_result; + // main loop to read CSV and construct batches for parallel processing. // each batch is processed via Rayon parallel iterator. // loop exits when batch is empty. 
@@ -507,6 +514,12 @@ async fn geocode_main(args: Args) -> CliResult<()> { ); if let Some(geocoded_result) = search_result { cell = geocoded_result; + } else { + // if --invalid-result was given (even as an empty string), use it for this cell; + // otherwise, we leave cell untouched, and the original value remains + if let Some(replacement) = &invalid_result { + cell = replacement.clone(); + } } } if args.flag_new_column.is_some() { @@ -592,7 +605,7 @@ async fn load_engine(geocode_index_file: PathBuf, show_progress: bool) -> CliRes key = "String", convert = r#"{ format!("{cell}") }"#, option = true, - sync_writes = true + sync_writes = false )] fn search_cached( engine: &Engine, @@ -604,10 +617,6 @@ fn search_cached( ) -> Option { static EMPTY_STRING: String = String::new(); - if cell.is_empty() { - return None; - } - if mode == GeocodeSubCmd::Suggest { let search_result = engine.suggest(cell, 1, Some(min_score)); let Some(cityrecord) = search_result.into_iter().next() else { diff --git a/tests/test_geocode.rs b/tests/test_geocode.rs index 0bd2157c1..47c2be891 100644 --- a/tests/test_geocode.rs +++ b/tests/test_geocode.rs @@ -10,10 +10,10 @@ fn geocode_suggest() { svec!["Melrose, New York"], svec!["East Flatbush, New York"], svec!["Manhattan, New York"], - svec!["East Harlem, New York"], + svec!["Brooklyn, New York"], svec!["East Harlem, New York"], svec!["This is not a Location and it will not be geocoded"], - svec!["East Flatbush, New York"], + svec!["Jersey City, New Jersey"], svec!["95.213424, 190,1234565"], // invalid lat, long svec!["Makati, Metro Manila, Philippines"], ], @@ -27,11 +27,51 @@ fn geocode_suggest() { svec!["(41.90059, -87.85673)"], svec!["(40.65371, -73.93042)"], svec!["(40.71427, -74.00597)"], - svec!["(40.79472, -73.9425)"], + svec!["(45.09413, -93.35634)"], svec!["(40.79472, -73.9425)"], svec!["This is not a Location and it will not be geocoded"], + svec!["(40.72816, -74.07764)"], + svec!["95.213424, 190,1234565"], // suggest expects a city name, not lat, long + svec!["(14.55027, 
121.03269)"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn geocode_suggest_invalid() { + let wrk = Workdir::new("geocode_suggest_invalid"); + wrk.create( + "data.csv", + vec![ + svec!["Location"], + svec!["Melrose, New York"], + svec!["East Flatbush, New York"], + svec!["Manhattan, New York"], + svec!["East Harlem, New York"], + svec!["Brooklyn, New York"], + svec!["This is not a Location and it will not be geocoded"], + svec!["Jersey City, New Jersey"], + svec!["95.213424, 190,1234565"], // invalid lat, long + svec!["Makati, Metro Manila, Philippines"], + ], + ); + let mut cmd = wrk.command("geocode"); + cmd.arg("suggest") + .arg("Location") + .args(["--invalid-result", ""]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["Location"], + svec!["(41.90059, -87.85673)"], svec!["(40.65371, -73.93042)"], - svec!["95.213424, 190,1234565"], // invalid lat, long + svec!["(40.71427, -74.00597)"], + svec!["(40.79472, -73.9425)"], + svec!["(45.09413, -93.35634)"], + svec![""], + svec!["(40.72816, -74.07764)"], + svec![""], // suggest expects a city name, not lat, long svec!["(14.55027, 121.03269)"], ]; assert_eq!(got, expected);