Skip to content

Commit

Permalink
geocode: add --invalid-result option
Browse files Browse the repository at this point in the history
and corresponding tests
  • Loading branch information
jqnatividad committed Aug 29, 2023
1 parent ebbbdf6 commit 2a7f6be
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 33 deletions.
67 changes: 38 additions & 29 deletions src/cmd/geocode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ geocode options:
e.g. "City: {name}, State: {admin1}, Country: {country}"
[default: %+]
--invalid-result <string> The string to use when the geocode result is empty/invalid.
If not set, the original value is used.
-j, --jobs <arg> The number of jobs to run in parallel.
When not set, the number of jobs is set to the number of CPUs detected.
-b, --batch <size> The number of rows per batch to load into memory, before running in parallel.
Expand Down Expand Up @@ -183,28 +184,29 @@ use crate::{

#[derive(Deserialize, Debug)]
struct Args {
arg_column: String,
cmd_suggest: bool,
cmd_reverse: bool,
cmd_index_check: bool,
cmd_index_update: bool,
cmd_index_load: bool,
cmd_index_reset: bool,
arg_input: Option<String>,
arg_index_file: Option<String>,
flag_rename: Option<String>,
flag_min_score: f32,
flag_k_weight: Option<f32>,
flag_formatstr: String,
flag_batch: u32,
flag_timeout: u16,
flag_languages: String,
flag_cache_dir: String,
flag_jobs: Option<usize>,
flag_new_column: Option<String>,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_progressbar: bool,
arg_column: String,
cmd_suggest: bool,
cmd_reverse: bool,
cmd_index_check: bool,
cmd_index_update: bool,
cmd_index_load: bool,
cmd_index_reset: bool,
arg_input: Option<String>,
arg_index_file: Option<String>,
flag_rename: Option<String>,
flag_min_score: f32,
flag_k_weight: Option<f32>,
flag_formatstr: String,
flag_invalid_result: Option<String>,
flag_batch: u32,
flag_timeout: u16,
flag_languages: String,
flag_cache_dir: String,
flag_jobs: Option<usize>,
flag_new_column: Option<String>,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_progressbar: bool,
}

static QSV_VERSION: &str = env!("CARGO_PKG_VERSION");
Expand Down Expand Up @@ -340,7 +342,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {
if index_cmd {
match geocode_cmd {
GeocodeSubCmd::IndexCheck => {
// load geocode engine
// check if we have updates
winfo!("Checking main Geonames website for updates...");
check_index_file(&geocode_index_file)?;
let engine =
Expand All @@ -357,6 +359,8 @@ async fn geocode_main(args: Args) -> CliResult<()> {
}
},
GeocodeSubCmd::IndexUpdate => {
// update/rebuild Geonames index from Geonames website
// will only update if there are changes
check_index_file(&geocode_index_file)?;
let engine =
load_engine(geocode_index_file.clone().into(), args.flag_progressbar).await?;
Expand Down Expand Up @@ -410,6 +414,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {
}

// we're not doing an index subcommand, so we're doing a suggest or reverse
// load the current local Geonames index
let engine = load_engine(geocode_index_file.clone().into(), args.flag_progressbar).await?;

let rconfig = Config::new(&args.arg_input)
Expand Down Expand Up @@ -465,6 +470,8 @@ async fn geocode_main(args: Args) -> CliResult<()> {
// set RAYON_NUM_THREADS
util::njobs(args.flag_jobs);

let invalid_result = args.flag_invalid_result.unwrap_or_default();

// main loop to read CSV and construct batches for parallel processing.
// each batch is processed via Rayon parallel iterator.
// loop exits when batch is empty.
Expand Down Expand Up @@ -507,6 +514,12 @@ async fn geocode_main(args: Args) -> CliResult<()> {
);
if let Some(geocoded_result) = search_result {
cell = geocoded_result;
} else {
// if --invalid-result is set (i.e. non-empty), use that instead;
// otherwise, we leave cell untouched, and the original value remains
if !invalid_result.is_empty() {
cell = invalid_result.clone();
}
}
}
if args.flag_new_column.is_some() {
Expand Down Expand Up @@ -592,7 +605,7 @@ async fn load_engine(geocode_index_file: PathBuf, show_progress: bool) -> CliRes
key = "String",
convert = r#"{ format!("{cell}") }"#,
option = true,
sync_writes = true
sync_writes = false
)]
fn search_cached(
engine: &Engine,
Expand All @@ -604,10 +617,6 @@ fn search_cached(
) -> Option<String> {
static EMPTY_STRING: String = String::new();

if cell.is_empty() {
return None;
}

if mode == GeocodeSubCmd::Suggest {
let search_result = engine.suggest(cell, 1, Some(min_score));
let Some(cityrecord) = search_result.into_iter().next() else {
Expand Down
48 changes: 44 additions & 4 deletions tests/test_geocode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ fn geocode_suggest() {
svec!["Melrose, New York"],
svec!["East Flatbush, New York"],
svec!["Manhattan, New York"],
svec!["East Harlem, New York"],
svec!["Brooklyn, New York"],
svec!["East Harlem, New York"],
svec!["This is not a Location and it will not be geocoded"],
svec!["East Flatbush, New York"],
svec!["Jersey City, New Jersey"],
svec!["95.213424, 190,1234565"], // invalid lat, long
svec!["Makati, Metro Manila, Philippines"],
],
Expand All @@ -27,11 +27,51 @@ fn geocode_suggest() {
svec!["(41.90059, -87.85673)"],
svec!["(40.65371, -73.93042)"],
svec!["(40.71427, -74.00597)"],
svec!["(40.79472, -73.9425)"],
svec!["(45.09413, -93.35634)"],
svec!["(40.79472, -73.9425)"],
svec!["This is not a Location and it will not be geocoded"],
svec!["(40.72816, -74.07764)"],
svec!["95.213424, 190,1234565"], // suggest expects a city name, not lat, long
svec!["(14.55027, 121.03269)"],
];
assert_eq!(got, expected);
}

#[test]
fn geocode_suggest_invalid() {
let wrk = Workdir::new("geocode_suggest_invalid");
wrk.create(
"data.csv",
vec![
svec!["Location"],
svec!["Melrose, New York"],
svec!["East Flatbush, New York"],
svec!["Manhattan, New York"],
svec!["East Harlem, New York"],
svec!["Brooklyn, New York"],
svec!["This is not a Location and it will not be geocoded"],
svec!["Jersey City, New Jersey"],
svec!["95.213424, 190,1234565"], // invalid lat, long
svec!["Makati, Metro Manila, Philippines"],
],
);
let mut cmd = wrk.command("geocode");
cmd.arg("suggest")
.arg("Location")
.args(["--invalid-result", "<ERROR>"])
.arg("data.csv");

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["Location"],
svec!["(41.90059, -87.85673)"],
svec!["(40.65371, -73.93042)"],
svec!["95.213424, 190,1234565"], // invalid lat, long
svec!["(40.71427, -74.00597)"],
svec!["(40.79472, -73.9425)"],
svec!["(45.09413, -93.35634)"],
svec!["<ERROR>"],
svec!["(40.72816, -74.07764)"],
svec!["<ERROR>"], // suggest expects a city name, not lat, long
svec!["(14.55027, 121.03269)"],
];
assert_eq!(got, expected);
Expand Down

0 comments on commit 2a7f6be

Please sign in to comment.