diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md index 66476a700a..62eb416163 100644 --- a/docs/ENVIRONMENT_VARIABLES.md +++ b/docs/ENVIRONMENT_VARIABLES.md @@ -6,7 +6,7 @@ | `QSV_SNIFF_DELIMITER` | if set, the delimiter is automatically detected. Overrides `QSV_DEFAULT_DELIMITER` & `--delimiter` option. Note that this does not work with stdin. | | `QSV_NO_HEADERS` | if set, the first row will **NOT** be interpreted as headers. Supersedes `QSV_TOGGLE_HEADERS`. | | `QSV_TOGGLE_HEADERS` | if set to `1`, toggles header setting - i.e. inverts qsv header behavior, with no headers being the default, & setting `--no-headers` will actually mean headers will not be ignored. | -| `QSV_AUTOINDEX` | if set, automatically create an index when none is detected. Also automatically updates stale indices. | +| `QSV_AUTOINDEX_SIZE` | if set, specifies the minimum file size (in bytes) of a CSV file before an index is automatically created. Note that stale indices are automatically updated regardless of this setting. | | `QSV_CACHE_DIR` | The directory to use for caching downloaded lookup_table resources using the `luau` qsv_register_lookup() helper function. | | `QSV_CKAN_API` | The CKAN Action API endpoint to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. | | `QSV_CKAN_TOKEN`| The CKAN token to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. Only required to access private resources. | diff --git a/dotenv.template.yaml b/dotenv.template.yaml index 4fc30068b0..afbe3883ab 100644 --- a/dotenv.template.yaml +++ b/dotenv.template.yaml @@ -40,9 +40,10 @@ QSV_NO_HEADERS = False # mean headers will not be ignored. # QSV_TOGGLE_HEADERS = 1 -# if true, automatically create an index when none is detected. -# Also automatically updates stale indices. -QSV_AUTOINDEX = False +# if set, specifies the minimum file size (in bytes) of a CSV file before an +# index is automatically created. 
Note that stale indices are automatically +# updated regardless of this setting. +# QSV_AUTOINDEX_SIZE = 1000000 # The directory to use for caching various qsv files. # Used by the `geocode` command for downloaded geocoding resources. diff --git a/src/config.rs b/src/config.rs index 9c26910fba..8d93b79cab 100644 --- a/src/config.rs +++ b/src/config.rs @@ -83,7 +83,7 @@ pub struct Config { quoting: bool, pub preamble_rows: u64, trim: csv::Trim, - autoindex: bool, + autoindex_size: u64, prefer_dmy: bool, comment: Option, snappy: bool, // flag to enable snappy compression/decompression @@ -171,7 +171,10 @@ impl Config { quoting: true, preamble_rows: preamble, trim: csv::Trim::None, - autoindex: util::get_envvar_flag("QSV_AUTOINDEX"), + autoindex_size: std::env::var("QSV_AUTOINDEX_SIZE") + .unwrap_or_else(|_| "0".to_owned()) + .parse() + .unwrap_or(0), prefer_dmy: util::get_envvar_flag("QSV_PREFER_DMY"), comment: None, snappy, @@ -359,8 +362,15 @@ impl Config { } } + /// Look for an index file for the CSV. If one is found, return the CSV reader and the index file handle. + /// If no index file is found, return None - unless QSV_AUTOINDEX_SIZE is greater than zero, the file is not + /// snappy-compressed and the CSV's file size >= QSV_AUTOINDEX_SIZE, in which case an index is created automatically. + /// An auto-located index that is stale (i.e. the CSV is newer than the index) is also automatically updated. pub fn index_files(&self) -> io::Result, fs::File)>> { - let (csv_file, idx_file) = match (&self.path, &self.idx_path) { + let mut data_modified = 0_u64; + let data_fsize; + let mut idx_path_work = PathBuf::new(); + let (csv_file, mut idx_file) = match (&self.path, &self.idx_path) { (&None, &None) => return Ok(None), (&None, &Some(_)) => { return Err(io::Error::new( @@ -368,34 +378,39 @@ impl Config { "Cannot use with indexes", )); }, + (Some(p), Some(ip)) => (fs::File::open(p)?, fs::File::open(ip)?), (Some(p), &None) => { // We generally don't want to report an error here, since we're - // passively trying to find an index, so we just log the warning... 
- let idx_file = match fs::File::open(util::idx_path(p)) { - Err(e) => { - if self.autoindex && !self.snappy { - // however, if QSV_AUTOINDEX is set, we create the index automatically + // passively trying to find an index. + + (data_modified, data_fsize) = util::file_metadata(&p.metadata()?); + idx_path_work = util::idx_path(p); + let idx_file = match fs::File::open(&idx_path_work) { + Err(_) => { + // the index file could not be opened (e.g. it doesn't exist) + if self.snappy { + // cannot index snappy compressed files + return Ok(None); + } else if self.autoindex_size > 0 && data_fsize >= self.autoindex_size { + // if CSV file size >= QSV_AUTOINDEX_SIZE, and + // it's not a snappy file, create an index automatically self.autoindex_file(); - fs::File::open(util::idx_path(p))? + fs::File::open(&idx_path_work)? + } else if data_fsize >= NO_INDEX_WARNING_FILESIZE { + // warn user that the CSV file is large and not indexed. NOTE(review): data_fsize appears to be in bytes, yet the values logged below are multiplied by 100 and labelled "MB" - confirm the intended units + use thousands::Separable; + + warn!( + "The {} MB CSV file is larger than the {} MB \ + NO_INDEX_WARNING_FILESIZE threshold. Consider creating an index \ + file as it will make qsv commands much faster.", + (data_fsize * 100).separate_with_commas(), + (NO_INDEX_WARNING_FILESIZE * 100).separate_with_commas() + ); + return Ok(None); } else { - warn!("No index file found - {p:?}: {e}"); - - let (_, data_fsize) = util::file_metadata(&p.metadata()?); - - // If the CSV file is larger than NO_INDEX_WARNING_FILESIZE, - // log a warning that the user should consider creating an index file - // for faster access. - if data_fsize > NO_INDEX_WARNING_FILESIZE { - use thousands::Separable; - - warn!( - "The {} MB CSV file is larger than the {} MB \ - NO_INDEX_WARNING_FILESIZE threshold. 
Consider creating an \ - index file for faster access.", - (data_fsize * 100).separate_with_commas(), - (NO_INDEX_WARNING_FILESIZE * 100).separate_with_commas() - ); - } + // autoindexing is disabled or the CSV is smaller than QSV_AUTOINDEX_SIZE, and the CSV is also + // smaller than NO_INDEX_WARNING_FILESIZE, so we neither create an index nor warn return Ok(None); } }, @@ -403,25 +418,16 @@ impl Config { }; (fs::File::open(p)?, idx_file) }, - (Some(p), Some(ip)) => (fs::File::open(p)?, fs::File::open(ip)?), }; // If the CSV data was last modified after the index file was last - // modified, then return an error and demand the user regenerate the index. - // Unless QSV_AUTOINDEX is set, in which case, we'll recreate the - // stale index automatically - let (data_modified, _) = util::file_metadata(&csv_file.metadata()?); + // modified, recreate the stale index automatically. NOTE(review): data_modified is only assigned in the (Some(p), &None) arm, so with an explicit index path it stays 0 and this check never fires - confirm this is intended let (idx_modified, _) = util::file_metadata(&idx_file.metadata()?); if data_modified > idx_modified { - if self.autoindex && !self.snappy { - info!("index stale... autoindexing..."); - self.autoindex_file(); - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - "The CSV file was modified after the index file. Please re-create the index.", - )); - } + info!("index stale... 
autoindexing..."); + self.autoindex_file(); + idx_file = fs::File::open(&idx_path_work)?; } + let csv_rdr = self.from_reader(csv_file); Ok(Some((csv_rdr, idx_file))) } diff --git a/tests/test_index.rs b/tests/test_index.rs index db54b26115..423a13d0ea 100644 --- a/tests/test_index.rs +++ b/tests/test_index.rs @@ -1,4 +1,4 @@ -use std::fs; +use std::{fs, io::Write}; use filetime::{set_file_times, FileTime}; @@ -55,15 +55,37 @@ fn index_outdated_stats() { ) .unwrap(); - // stats should fail if the index is stale + // even if the index is stale, stats should succeed + // as the index is automatically updated let mut cmd = wrk.command("stats"); - cmd.env_clear().arg("in.csv"); + cmd.arg("in.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec![ + "field", + "type", + "sum", + "min", + "max", + "range", + "min_length", + "max_length", + "mean", + "stddev", + "variance", + "nullcount", + "sparsity" + ], + svec!["letter", "String", "", "a", "c", "", "1", "1", "", "", "", "0", "0"], + svec!["number", "Integer", "6", "1", "3", "2", "1", "1", "2", "0.8165", "0.6667", "0", "0"], + ]; - wrk.assert_err(&mut cmd); + assert_eq!(got, expected); } #[test] -fn index_outdated_index_autoindex() { +fn index_outdated_index() { let wrk = Workdir::new("index_outdated_index"); wrk.create_indexed( @@ -84,17 +106,68 @@ fn index_outdated_index_autoindex() { ) .unwrap(); - // slice should NOT fail if the index is stale and - // QSV_AUTOINDEX is set - std::env::set_var("QSV_AUTOINDEX", "1"); + // slice should NOT fail if the index is stale + // as stale indexes are automatically updated + let mut cmd = wrk.command("slice"); + cmd.arg("-i").arg("2").arg("in.csv"); + + wrk.assert_success(&mut cmd); +} + +#[test] +fn index_autoindex_threshold_reached() { + let wrk = Workdir::new("index_autoindex_threshold_reached"); + + wrk.create( + "in.csv", + vec![ + svec!["letter", "number"], + svec!["a", "1"], + svec!["b", "2"], + svec!["c", "3"], + svec!["d", "4"], + ], + ); + 
+ // slice should automatically create an index + // as the file size is greater than the QSV_AUTOINDEX_SIZE threshold let mut cmd = wrk.command("slice"); - cmd.env("QSV_AUTOINDEX", "1") + cmd.env("QSV_AUTOINDEX_SIZE", "1") .arg("-i") .arg("2") .arg("in.csv"); - std::env::remove_var("QSV_AUTOINDEX"); + wrk.assert_success(&mut cmd); + + // index should be created + assert!(wrk.path("in.csv.idx").exists()); +} + +#[test] +fn index_autoindex_threshold_not_reached() { + let wrk = Workdir::new("index_autoindex_threshold_not_reached"); + wrk.create( + "in.csv", + vec![ + svec!["letter", "number"], + svec!["a", "1"], + svec!["b", "2"], + svec!["c", "3"], + svec!["d", "4"], + ], + ); + + // slice will NOT automatically create an index + // as the file size is less than the QSV_AUTOINDEX_SIZE threshold + let mut cmd = wrk.command("slice"); + cmd.env("QSV_AUTOINDEX_SIZE", "10000000") + .arg("-i") + .arg("2") + .arg("in.csv"); wrk.assert_success(&mut cmd); + + // index should NOT be created + assert!(!wrk.path("in.csv.idx").exists()); } fn future_time(ft: FileTime) -> FileTime {