Skip to content

Commit

Permalink
geocode: misc improvements
Browse files Browse the repository at this point in the history
- improve usage text further
- expand implementation comments
- add logic to trap invalid usage of --admin1 option with reversenow subcommand
- added test for reversenow --admin1 error
  • Loading branch information
jqnatividad committed Sep 7, 2023
1 parent dd1986a commit 61ee1fd
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 32 deletions.
69 changes: 43 additions & 26 deletions src/cmd/geocode.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
static USAGE: &str = r#"
Geocodes a location in CSV data against an updatable local copy of the Geonames cities index.
When its first run, the geocode command will download a prebuilt Geonames cities index
from the qsv GitHub repo, and will use it going forward. You can operate on the local
When you run the command for the first time, it will download a prebuilt Geonames cities
index from the qsv GitHub repo and use it going forward. You can operate on the local
index using the index-* subcommands.
By default, the prebuilt index uses the Geonames Gazeteer cities15000.zip file using
Expand Down Expand Up @@ -94,7 +94,7 @@ Accepts the same options as reverse, but does not require an input file.
$ qsv geocode reversenow "40.71427, -74.00597"
$ qsv geocode reversenow --country US -f %cityrecord "40.71427, -74.00597"
$ qsv geocode reversenow --admin1 "US:OH" "39.32924, -82.10126"
$ qsv geocode reversenow --admin1 "US:OH" "(39.32924, -82.10126)"
INDEX-<operation>
Updates the local Geonames cities index used by the geocode command.
Expand Down Expand Up @@ -134,7 +134,7 @@ geocode arguments:
<input> The input file to read from. If not specified, reads from stdin.
<column> The column to geocode.
<location> The location to geocode. For suggestnow, its a string pattern.
<location> The location to geocode. For suggestnow, its a City string pattern.
For reversenow, it must be a WGS 84 coordinate "lat, long" or
"(lat, long)" format.
<index-file> The alternate geonames index file to use. It must be a .bincode file.
Expand Down Expand Up @@ -224,15 +224,23 @@ geocode options:
If an invalid template is specified, "Invalid dynfmt template" is returned.
Finally, you can use the special format "%dyncols:" to add columns to the CSV.
To do so, set the --formatstr to "%dyncols:" followed by a comma-delimited list
Both predefined and dynamic formatting are cached. Subsequent calls
with the same result will be faster as it will use the cached result instead
of searching the Geonames index.
Finally, you can use the special format "%dyncols:" to dynamically add multiple
columns to the output CSV for each field in a geocode result.
To do so, set --formatstr to "%dyncols:" followed by a comma-delimited list
of key:value pairs enclosed in curly braces.
The key is the desired column name, and the value is are the same ten fields
The key is the desired column name and the value is one of the same ten fields
available for dynamic formatting.
e.g. "%dyncols: {city_col:name}, {state_col:admin1}, {country_col:country}"
will add three columns to the CSV named city_col, state_col and country_col.
will add three columns to the output CSV named city_col, state_col & country_col.
Note that using "%dyncols:" will cause the the command to geocode EACH row without
using the cache, so it will be slower than predefined or dynamic formatting.
[default: %+]
--invalid-result <string> The string to return when the geocode result is empty/invalid.
Expand Down Expand Up @@ -459,7 +467,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {
GeocodeSubCmd::IndexReset
} else {
// should not happen as docopt won't allow it
unreachable!("No geocode subcommand specified.");
unreachable!();
};

// setup cache directory
Expand Down Expand Up @@ -636,7 +644,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {
let _ = load_engine(geocode_index_file.clone().into(), &progress).await?;
winfo!("Default Geonames index file reset to {QSV_VERSION} release.");
},
_ => unreachable!("index_cmd is true, so this is unreachable."),
_ => unreachable!(), // index_cmd is true, so this is unreachable.
}
return Ok(());
}
Expand Down Expand Up @@ -681,7 +689,6 @@ async fn geocode_main(args: Args) -> CliResult<()> {
// first, parse the formatstr to get the column names and values in parallel vectors
let mut column_names = Vec::new();
let mut column_values = Vec::new();

let dyncols_len = if args.flag_formatstr.starts_with("%dyncols:") {
for column in args.flag_formatstr[9..].split(',') {
let column = column.trim();
Expand All @@ -693,18 +700,16 @@ async fn geocode_main(args: Args) -> CliResult<()> {
}

// now, validate the column values
// the valid column values are: id, name, latitude, longitude, country, admin1, admin2,
// capital, timezone, population
// the valid column values are in VALID_DYNCOLS
for column_value in &column_values {
if !VALID_DYNCOLS.contains(column_value) {
return fail_incorrectusage_clierror!(
"Invalid column value: {column_value}. Valid values are: id, name, latitude, \
longitude, country, admin1, admin2, capital, timezone, population"
"Invalid column value: {column_value}. Valid values are: {VALID_DYNCOLS:?}"
);
}
}

// now, we add the columns to the CSV headers
// its valid, add the columns to the CSV headers
for column in column_names {
headers.push_field(column);
}
Expand Down Expand Up @@ -761,9 +766,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {
// if admin1 is set, country must also be set
// however, if all admin1 codes have the same prefix, we can infer the country from
// the admin1 codes. Otherwise, we can't infer the country from the
// admin1 code, so we error out. Note that country inferencing only
// works if all admin1 codes have the same prefix, so it only works
// for one country at a time.
// admin1 code, so we error out.
if args.flag_admin1.is_some() && flag_country.is_none() {
if !admin1_code_prefix.is_empty() && admin1_same_prefix {
admin1_code_prefix.pop(); // remove the dot
Expand All @@ -780,8 +783,15 @@ async fn geocode_main(args: Args) -> CliResult<()> {
None
}
},
// reverse/now doesn't support admin1 filters
_ => None,
_ => {
// reverse/now subcommands don't support admin1 filter
if args.flag_admin1.is_some() {
return fail_incorrectusage_clierror!(
"reverse & reversenow subcommands do not support the --admin1 filter option."
);
}
None
},
}; // end setup admin1 filters

// setup country filter - both suggest/now and reverse/now support country filters
Expand Down Expand Up @@ -853,10 +863,9 @@ async fn geocode_main(args: Args) -> CliResult<()> {
}
}
} else if dyncols_len > 0 {
// we're in dyncols mode, so use a non-cached search_index fn
// as we need to inject the column values into the output csv
// and each row have different column values and the result is
// actually in the mutable record variable
// we're in dyncols mode, so use search_index_NO_CACHE fn
// as we need to inject the column values into each row of the output csv
// so we can't use the cache
let search_results = search_index_no_cache(
&engine,
geocode_cmd,
Expand Down Expand Up @@ -939,7 +948,7 @@ async fn geocode_main(args: Args) -> CliResult<()> {

if show_progress {
// the geocode result cache is NOT used in dyncols mode,
// so don't update the cache info when dyncols_len > 0
// so update the cache info only when dyncols_len == 0
if dyncols_len == 0 {
util::update_cache_info!(progress, SEARCH_INDEX);
}
Expand Down Expand Up @@ -997,6 +1006,12 @@ async fn load_engine(geocode_index_file: PathBuf, progressbar: &ProgressBar) ->
Ok(engine)
}

/// search_index is a cached function that returns a geocode result for a given cell value.
/// It uses an LRU cache using the cell value as the key, storing the formatted geocoded result
/// in the cache. As such, we CANNOT use the cache when in dyncols mode as the cached result is
/// the formatted result, not the individual fields.
/// search_index_no_cache() is automatically derived from search_index() by the cached macro.
/// search_index_no_cache() is used in dyncols mode, and as the name implies, does not use a cache.
#[cached(
type = "SizedCache<String, String>",
create = "{ SizedCache::try_with_size(CACHE_SIZE).unwrap_or_else(|_| \
Expand Down Expand Up @@ -1126,6 +1141,8 @@ fn search_index(

// we're doing a Reverse/Now command and expect a WGS 84 coordinate
// the regex validates for "(lat, long)" or "lat, long"
// note that it is not pinned to the start of the string, so it can be in the middle
// of a string, e.g. "The location of the incident is 40.7128, -74.0060"
let locregex: &'static Regex =
regex_oncelock!(r"(?-u)([+-]?[0-9]+\.?[0-9]*|\.[0-9]+),\s*([+-]?[0-9]+\.?[0-9]*|\.[0-9]+)");

Expand Down
26 changes: 20 additions & 6 deletions tests/test_geocode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,24 @@ fn geocode_suggestnow() {
fn geocode_reversenow() {
let wrk = Workdir::new("geocode_reversenow");

let mut cmd = wrk.command("geocode");
cmd.arg("reversenow").arg("(40.67, -73.94)").args([
"-f",
"{name}, {admin2} County, {admin1} - {population} {timezone}",
]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["Location"],
svec!["East Flatbush, Kings County, New York - 178464 America/New_York"],
];
assert_eq!(got, expected);
}

#[test]
fn geocode_reversenow_error() {
let wrk = Workdir::new("geocode_reversenow_error");

let mut cmd = wrk.command("geocode");
cmd.arg("reversenow")
.arg("(40.67, -73.94)")
Expand All @@ -183,12 +201,8 @@ fn geocode_reversenow() {
"{name}, {admin2} County, {admin1} - {population} {timezone}",
]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec!["Location"],
svec!["East Flatbush, Kings County, New York - 178464 America/New_York"],
];
assert_eq!(got, expected);
// reversenow does not support admin1 filter
wrk.assert_err(&mut cmd);
}

#[test]
Expand Down

0 comments on commit 61ee1fd

Please sign in to comment.