Skip to content

Commit

Permalink
Merge pull request #1301 from jqnatividad/1300-autoindex-threshold
Browse files Browse the repository at this point in the history
Replace `QSV_AUTOINDEX` env var with `QSV_AUTOINDEX_SIZE`
  • Loading branch information
jqnatividad authored Sep 14, 2023
2 parents 0871655 + e8df28a commit 4de4bcb
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 55 deletions.
2 changes: 1 addition & 1 deletion docs/ENVIRONMENT_VARIABLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
| `QSV_SNIFF_DELIMITER` | if set, the delimiter is automatically detected. Overrides `QSV_DEFAULT_DELIMITER` & `--delimiter` option. Note that this does not work with stdin. |
| `QSV_NO_HEADERS` | if set, the first row will **NOT** be interpreted as headers. Supersedes `QSV_TOGGLE_HEADERS`. |
| `QSV_TOGGLE_HEADERS` | if set to `1`, toggles header setting - i.e. inverts qsv header behavior, with no headers being the default, & setting `--no-headers` will actually mean headers will not be ignored. |
| `QSV_AUTOINDEX` | if set, automatically create an index when none is detected. Also automatically updates stale indices. |
| `QSV_AUTOINDEX_SIZE` | if set, specifies the minimum file size (in bytes) a CSV file must reach before an index is automatically created. If unset, `0`, or not a valid non-negative integer, no index is created automatically. Note that stale indices are automatically updated regardless of this setting. |
| `QSV_CACHE_DIR` | The directory to use for caching downloaded lookup_table resources using the `luau` qsv_register_lookup() helper function. |
| `QSV_CKAN_API` | The CKAN Action API endpoint to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. |
| `QSV_CKAN_TOKEN`| The CKAN token to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. Only required to access private resources. |
Expand Down
7 changes: 4 additions & 3 deletions dotenv.template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ QSV_NO_HEADERS = False
# mean headers will not be ignored.
# QSV_TOGGLE_HEADERS = 1

# if true, automatically create an index when none is detected.
# Also automatically updates stale indices.
QSV_AUTOINDEX = False
# if set, specifies the minimum file size (in bytes) of a CSV file before an
# index is automatically created. Note that stale indices are automatically
# updated regardless of this setting.
# QSV_AUTOINDEX_SIZE = 1000000

# The directory to use for caching various qsv files.
# Used by the `geocode` command for downloaded geocoding resources.
Expand Down
88 changes: 47 additions & 41 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ pub struct Config {
quoting: bool,
pub preamble_rows: u64,
trim: csv::Trim,
autoindex: bool,
autoindex_size: u64,
prefer_dmy: bool,
comment: Option<u8>,
snappy: bool, // flag to enable snappy compression/decompression
Expand Down Expand Up @@ -171,7 +171,10 @@ impl Config {
quoting: true,
preamble_rows: preamble,
trim: csv::Trim::None,
autoindex: util::get_envvar_flag("QSV_AUTOINDEX"),
autoindex_size: std::env::var("QSV_AUTOINDEX_SIZE")
.unwrap_or_else(|_| "0".to_owned())
.parse()
.unwrap_or(0),
prefer_dmy: util::get_envvar_flag("QSV_PREFER_DMY"),
comment: None,
snappy,
Expand Down Expand Up @@ -359,69 +362,72 @@ impl Config {
}
}

/// Check if the index file exists and is newer than the CSV file.
/// If so, return the CSV reader and the index file handle. If no index exists, return None,
/// unless QSV_AUTOINDEX_SIZE is set and the CSV's file size >= QSV_AUTOINDEX_SIZE,
/// in which case an index is created automatically.
/// Stale indices (i.e. the CSV is newer than the index) are also automatically updated.
pub fn index_files(&self) -> io::Result<Option<(csv::Reader<fs::File>, fs::File)>> {
let (csv_file, idx_file) = match (&self.path, &self.idx_path) {
let mut data_modified = 0_u64;
let data_fsize;
let mut idx_path_work = PathBuf::new();
let (csv_file, mut idx_file) = match (&self.path, &self.idx_path) {
(&None, &None) => return Ok(None),
(&None, &Some(_)) => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Cannot use <stdin> with indexes",
));
},
(Some(p), Some(ip)) => (fs::File::open(p)?, fs::File::open(ip)?),
(Some(p), &None) => {
// We generally don't want to report an error here, since we're
// passively trying to find an index, so we just log the warning...
let idx_file = match fs::File::open(util::idx_path(p)) {
Err(e) => {
if self.autoindex && !self.snappy {
// however, if QSV_AUTOINDEX is set, we create the index automatically
// passively trying to find an index.

(data_modified, data_fsize) = util::file_metadata(&p.metadata()?);
idx_path_work = util::idx_path(p);
let idx_file = match fs::File::open(&idx_path_work) {
Err(_) => {
// the index file doesn't exist
if self.snappy {
// cannot index snappy compressed files
return Ok(None);
} else if self.autoindex_size > 0 && data_fsize >= self.autoindex_size {
// if CSV file size >= QSV_AUTOINDEX_SIZE, and
// it's not a snappy file, create an index automatically
self.autoindex_file();
fs::File::open(util::idx_path(p))?
fs::File::open(&idx_path_work)?
} else if data_fsize >= NO_INDEX_WARNING_FILESIZE {
// warn user that the CSV file is large and not indexed
use thousands::Separable;

warn!(
"The {} MB CSV file is larger than the {} MB \
NO_INDEX_WARNING_FILESIZE threshold. Consider creating an index \
file as it will make qsv commands much faster.",
(data_fsize * 100).separate_with_commas(),
(NO_INDEX_WARNING_FILESIZE * 100).separate_with_commas()
);
return Ok(None);
} else {
warn!("No index file found - {p:?}: {e}");

let (_, data_fsize) = util::file_metadata(&p.metadata()?);

// If the CSV file is larger than NO_INDEX_WARNING_FILESIZE,
// log a warning that the user should consider creating an index file
// for faster access.
if data_fsize > NO_INDEX_WARNING_FILESIZE {
use thousands::Separable;

warn!(
"The {} MB CSV file is larger than the {} MB \
NO_INDEX_WARNING_FILESIZE threshold. Consider creating an \
index file for faster access.",
(data_fsize * 100).separate_with_commas(),
(NO_INDEX_WARNING_FILESIZE * 100).separate_with_commas()
);
}
// CSV is smaller than QSV_AUTOINDEX_SIZE (or autoindexing is disabled) and smaller
// than NO_INDEX_WARNING_FILESIZE, so we don't create an index
return Ok(None);
}
},
Ok(f) => f,
};
(fs::File::open(p)?, idx_file)
},
(Some(p), Some(ip)) => (fs::File::open(p)?, fs::File::open(ip)?),
};
// If the CSV data was last modified after the index file was last
// modified, then return an error and demand the user regenerate the index.
// Unless QSV_AUTOINDEX is set, in which case, we'll recreate the
// stale index automatically
let (data_modified, _) = util::file_metadata(&csv_file.metadata()?);
// modified, recreate the stale index automatically
let (idx_modified, _) = util::file_metadata(&idx_file.metadata()?);
if data_modified > idx_modified {
if self.autoindex && !self.snappy {
info!("index stale... autoindexing...");
self.autoindex_file();
} else {
return Err(io::Error::new(
io::ErrorKind::Other,
"The CSV file was modified after the index file. Please re-create the index.",
));
}
info!("index stale... autoindexing...");
self.autoindex_file();
idx_file = fs::File::open(&idx_path_work)?;
}

let csv_rdr = self.from_reader(csv_file);
Ok(Some((csv_rdr, idx_file)))
}
Expand Down
93 changes: 83 additions & 10 deletions tests/test_index.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::fs;
use std::{fs, io::Write};

use filetime::{set_file_times, FileTime};

Expand Down Expand Up @@ -55,15 +55,37 @@ fn index_outdated_stats() {
)
.unwrap();

// stats should fail if the index is stale
// even if the index is stale, stats should succeed
// as the index is automatically updated
let mut cmd = wrk.command("stats");
cmd.env_clear().arg("in.csv");
cmd.arg("in.csv");

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
svec![
"field",
"type",
"sum",
"min",
"max",
"range",
"min_length",
"max_length",
"mean",
"stddev",
"variance",
"nullcount",
"sparsity"
],
svec!["letter", "String", "", "a", "c", "", "1", "1", "", "", "", "0", "0"],
svec!["number", "Integer", "6", "1", "3", "2", "1", "1", "2", "0.8165", "0.6667", "0", "0"],
];

wrk.assert_err(&mut cmd);
assert_eq!(got, expected);
}

#[test]
fn index_outdated_index_autoindex() {
fn index_outdated_index() {
let wrk = Workdir::new("index_outdated_index");

wrk.create_indexed(
Expand All @@ -84,17 +106,68 @@ fn index_outdated_index_autoindex() {
)
.unwrap();

// slice should NOT fail if the index is stale and
// QSV_AUTOINDEX is set
std::env::set_var("QSV_AUTOINDEX", "1");
// slice should NOT fail if the index is stale
// as stale indexes are automatically updated
let mut cmd = wrk.command("slice");
cmd.arg("-i").arg("2").arg("in.csv");

wrk.assert_success(&mut cmd);
}

#[test]
fn index_autoindex_threshold_reached() {
let wrk = Workdir::new("index_autoindex_threshold_reached");

wrk.create(
"in.csv",
vec![
svec!["letter", "number"],
svec!["a", "1"],
svec!["b", "2"],
svec!["c", "3"],
svec!["d", "4"],
],
);

// slice should automatically create an index
// as the file size is greater than the QSV_AUTOINDEX_SIZE threshold
let mut cmd = wrk.command("slice");
cmd.env("QSV_AUTOINDEX", "1")
cmd.env("QSV_AUTOINDEX_SIZE", "1")
.arg("-i")
.arg("2")
.arg("in.csv");
std::env::remove_var("QSV_AUTOINDEX");
wrk.assert_success(&mut cmd);

// index should be created
assert!(wrk.path("in.csv.idx").exists());
}

/// Verifies that `slice` does not auto-create an index when the CSV is smaller
/// than the `QSV_AUTOINDEX_SIZE` threshold: the command must still succeed,
/// and no `in.csv.idx` file may exist afterwards.
#[test]
fn index_autoindex_threshold_not_reached() {
let wrk = Workdir::new("index_autoindex_threshold_not_reached");

wrk.create(
"in.csv",
vec![
svec!["letter", "number"],
svec!["a", "1"],
svec!["b", "2"],
svec!["c", "3"],
svec!["d", "4"],
],
);

// slice will NOT automatically create an index
// as the file size is less than the QSV_AUTOINDEX_SIZE threshold
let mut cmd = wrk.command("slice");
cmd.env("QSV_AUTOINDEX_SIZE", "10000000")
.arg("-i")
.arg("2")
.arg("in.csv");
wrk.assert_success(&mut cmd);

// index should NOT be created
assert!(!wrk.path("in.csv.idx").exists());
}

fn future_time(ft: FileTime) -> FileTime {
Expand Down

0 comments on commit 4de4bcb

Please sign in to comment.