Skip to content

Commit

Permalink
Merge pull request #2309 from jqnatividad/2308-skip_format_check-simp…
Browse files Browse the repository at this point in the history
…lification

simplify input format check
  • Loading branch information
jqnatividad authored Nov 23, 2024
2 parents e788aff + 9273eca commit 5a0fb3e
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 30 deletions.
3 changes: 2 additions & 1 deletion src/cmd/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let rconfig = Config::new(args.arg_input.as_ref())
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers)
.flexible(true);
.flexible(true)
.skip_format_check(true);

let mut sample_size = args.arg_sample_size;

Expand Down
2 changes: 1 addition & 1 deletion src/cmd/sniff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,7 @@ async fn sniff_main(mut args: Args) -> CliResult<()> {
sfile_info.downloaded_records
};

let rdr = conf.reader_file()?;
let rdr = conf.clone().skip_format_check(true).reader_file()?;

let dt_preference = if args.flag_prefer_dmy || conf.get_dmy_preference() {
DatePreference::DmyFormat
Expand Down
14 changes: 12 additions & 2 deletions src/cmd/validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,12 @@ fn dyn_enum_validator_factory<'a>(
// Read the first column into a HashSet
let mut enum_set = HashSet::with_capacity(50);
let rconfig = Config::new(Some(lookup_result.filepath).as_ref());
let mut rdr = match rconfig.flexible(true).comment(Some(b'#')).reader() {
let mut rdr = match rconfig
.flexible(true)
.comment(Some(b'#'))
.skip_format_check(true)
.reader()
{
Ok(reader) => reader,
Err(e) => return fail_validation_error!("Error opening dynamicEnum file: {e}"),
};
Expand Down Expand Up @@ -531,7 +536,12 @@ fn dyn_enum_validator_factory<'a>(
// read the first column into a HashSet
let mut enum_set = HashSet::with_capacity(50);
let rconfig = Config::new(Some(dynenum_path).as_ref());
let mut rdr = match rconfig.flexible(true).reader() {
let mut rdr = match rconfig
.flexible(true)
.comment(Some(b'#'))
.skip_format_check(true)
.reader()
{
Ok(reader) => reader,
Err(e) => return fail_validation_error!("Error opening dynamicEnum file: {e}"),
};
Expand Down
42 changes: 18 additions & 24 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,14 @@ impl Config {
/// - `QSV_PREFER_DMY`: Sets date format preference.
/// - `QSV_RDR_BUFFER_CAPACITY`: Sets read buffer capacity.
/// - `QSV_WTR_BUFFER_CAPACITY`: Sets write buffer capacity.
/// - `QSV_SKIP_FORMAT_CHECK`: Set to skip mime-type checking.
/// - `QSV_SKIP_FORMAT_CHECK`: Set to skip file extension checking.
pub fn new(path: Option<&String>) -> Config {
let default_delim = match env::var("QSV_DEFAULT_DELIMITER") {
Ok(delim) => Delimiter::decode_delimiter(&delim).unwrap().as_byte(),
_ => b',',
};
let sniff = util::get_envvar_flag("QSV_SNIFF_DELIMITER")
|| util::get_envvar_flag("QSV_SNIFF_PREAMBLE");
let mut skip_format_check = true;
let mut format_error = None;
let (path, mut delim, snappy) = match path {
Expand All @@ -168,31 +170,18 @@ impl Config {
Some(s) if s == "-" => (None, default_delim, false),
Some(ref s) => {
let path = PathBuf::from(s);
skip_format_check = util::get_envvar_flag("QSV_SKIP_FORMAT_CHECK");
let (file_extension, delim, snappy) = get_delim_by_extension(&path, default_delim);
skip_format_check = sniff || util::get_envvar_flag("QSV_SKIP_FORMAT_CHECK");
if !skip_format_check {
if let Ok(file_format) = file_format::FileFormat::from_file(&path) {
let detected_mime = file_format.media_type();
// determine the file type by scanning the file
// we support the following mime-types:
// x-empty: empty file
// octet-stream: the file-format crate falls back to this when it cannot
// figure the mime-type, so its not actually binary data
// x-snappy-framed: for snappy compressed files
// text/*: its a text file type of some sort that is a possible CSV
// candidate that we will trap later on with the csv crate
if !(detected_mime == "application/x-empty"
|| detected_mime == "application/octet-stream"
|| detected_mime == "application/x-snappy-framed"
|| detected_mime.starts_with("text/"))
{
format_error = Some(format!(
"{} is using an unsupported file format: {detected_mime}",
path.display()
));
}
}
format_error = match file_extension.as_str() {
"csv" | "tsv" | "tab" | "ssv" => None,
ext => Some(format!(
"{} is using an unsupported file format: {ext}. Set \
QSV_SKIP_FORMAT_CHECK to skip input format checking.",
path.display()
)),
};
}
let (file_extension, delim, snappy) = get_delim_by_extension(&path, default_delim);
(Some(path), delim, snappy || file_extension.ends_with("sz"))
},
};
Expand Down Expand Up @@ -295,6 +284,11 @@ impl Config {
self
}

/// Builder-style setter for the `skip_format_check` flag.
///
/// Stores `yes` in `self.skip_format_check` and returns the updated `Config`
/// by value, so it can be chained with the other builder methods
/// (e.g. `.flexible(true).skip_format_check(true)`).
///
/// NOTE(review): presumably passing `true` makes readers created from this
/// `Config` bypass the input format check that `Config::new` performs on the
/// input path -- confirm against the reader code, which is not visible here.
pub const fn skip_format_check(mut self, yes: bool) -> Config {
self.skip_format_check = yes;
self
}

#[cfg(any(feature = "feature_capable", feature = "lite"))]
pub const fn crlf(mut self, yes: bool) -> Config {
if yes {
Expand Down
4 changes: 2 additions & 2 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ pub fn count_rows(conf: &Config) -> Result<u64, CliError> {

#[cfg(not(feature = "polars"))]
let count_opt = ROW_COUNT.get_or_init(|| {
if let Ok(mut rdr) = conf.reader() {
if let Ok(mut rdr) = conf.clone().skip_format_check(true).reader() {
let mut count = 0_u64;
let mut _record = csv::ByteRecord::new();
#[allow(clippy::used_underscore_binding)]
Expand Down Expand Up @@ -419,7 +419,7 @@ pub fn count_rows_regular(conf: &Config) -> Result<u64, CliError> {
} else {
// index does not exist or is stale,
let count_opt = ROW_COUNT.get_or_init(|| {
if let Ok(mut rdr) = conf.reader() {
if let Ok(mut rdr) = conf.clone().skip_format_check(true).reader() {
let mut count = 0_u64;
let mut _record = csv::ByteRecord::new();
#[allow(clippy::used_underscore_binding)]
Expand Down
2 changes: 2 additions & 0 deletions tests/test_sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ fn sample_seed_url() {
.arg("5")
.arg("https://github.com/jqnatividad/qsv/raw/master/resources/test/aliases.csv");

wrk.assert_success(&mut cmd);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = vec![
["position", "title"],
Expand Down
2 changes: 2 additions & 0 deletions tests/test_sniff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ fn sniff_url_snappy() {
let mut cmd = wrk.command("sniff");
cmd.arg("https://github.com/jqnatividad/qsv/raw/master/resources/test/boston311-100.csv.sz");

wrk.assert_success(&mut cmd);

let got: String = wrk.stdout(&mut cmd);

let expected_end = r#"Sampled Records: 100
Expand Down
4 changes: 4 additions & 0 deletions tests/test_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ fn table_default() {

let mut cmd = wrk.command("table");
cmd.env("QSV_DEFAULT_DELIMITER", "\t");
cmd.env("QSV_SKIP_FORMAT_CHECK", "1");
cmd.arg("in.file");

let got: String = wrk.stdout(&mut cmd);
Expand All @@ -71,6 +72,7 @@ fn table_pipe_delimiter_env() {
wrk.create_with_delim("in.file", data(), b'|');

let mut cmd = wrk.command("table");
cmd.env("QSV_SKIP_FORMAT_CHECK", "1");
cmd.env("QSV_DEFAULT_DELIMITER", "|");
cmd.arg("in.file");

Expand All @@ -84,6 +86,7 @@ fn table_pipe_delimiter() {
wrk.create_with_delim("in.file", data(), b'|');

let mut cmd = wrk.command("table");
cmd.env("QSV_SKIP_FORMAT_CHECK", "1");
cmd.arg("--delimiter").arg("|").arg("in.file");

let got: String = wrk.stdout(&mut cmd);
Expand All @@ -96,6 +99,7 @@ fn invalid_delimiter_len() {
wrk.create_with_delim("in.file", data(), b'|');

let mut cmd = wrk.command("table");
cmd.env("QSV_SKIP_FORMAT_CHECK", "1");
cmd.arg("--delimiter").arg("||").arg("in.file");

let got: String = wrk.output_stderr(&mut cmd);
Expand Down

0 comments on commit 5a0fb3e

Please sign in to comment.