From 201dd152da542e321743d7e778ddc4cba6069104 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 28 Jan 2024 09:00:30 -0500 Subject: [PATCH] `sqlp`: refactor automatic delimiter setting for TSV and TAB files - now autosets delimiting to tab when an input file has a TSV or TAB extension - streamlined how output TSV/TAB files are processed --- src/cmd/sqlp.rs | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index 06bdbd21e..f93733213 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -145,7 +145,7 @@ sqlp options: Only use this when you get query errors. --truncate-ragged-lines Truncate ragged lines when parsing CSVs. If set, rows with more columns than the header will be truncated. If not set, the query - will fail. + will fail. Use this only when you get an error about ragged lines. --ignore-errors Ignore errors when parsing CSVs. If set, rows with errors will be skipped. If not set, the query will fail. Only use this when debugging queries, as polars does batched @@ -280,13 +280,12 @@ impl OutputMode { Some(x) => { let path = Path::new(&x); // if the output file ends with ".tsv" or ".tab", we use tab as the delimiter + let output_extension = path + .extension() + .map_or("", |ext| ext.to_str().unwrap_or_default()); delim = if *self == OutputMode::Csv - && path - .extension() - .map_or(false, |ext| ext.eq_ignore_ascii_case("tsv")) - || path - .extension() - .map_or(false, |ext| ext.eq_ignore_ascii_case("tab")) + && output_extension.eq_ignore_ascii_case("tsv") + || output_extension.eq_ignore_ascii_case("tab") { b'\t' } else { @@ -421,13 +420,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let output_mode: OutputMode = args.flag_format.parse().unwrap_or(OutputMode::Csv); let no_output: OutputMode = OutputMode::None; - let delim = if let Some(delimiter) = args.flag_delimiter { + let mut delim = if let Some(delimiter) = args.flag_delimiter { delimiter.as_byte() } else if let Ok(delim) = env::var("QSV_DEFAULT_DELIMITER") { Delimiter::decode_delimiter(&delim)?.as_byte() } else { b',' }; + let orig_delim = delim; let comment_char = if let Ok(comment_char) = env::var("QSV_COMMENT_CHAR") { Some(comment_char) @@ -477,6 +477,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut table_aliases = HashMap::with_capacity(args.arg_input.len()); let mut lossy_table_name = Cow::default(); let mut table_name; + let mut inputfile_extension; for (idx, table) in args.arg_input.iter().enumerate() { // as we are using the table name as alias, we need to make sure that the table name is a @@ -488,6 +489,21 @@ pub fn run(argv: &[&str]) -> CliResult<()> { lossy_table_name = table.to_string_lossy(); &lossy_table_name }); + + // if the file has a TSV or TAB extension, we automatically use tab as the delimiter + inputfile_extension = Path::new(table) + .extension() + .and_then(std::ffi::OsStr::to_str) + .unwrap_or_default(); + + if inputfile_extension.eq_ignore_ascii_case("tsv") + || inputfile_extension.eq_ignore_ascii_case("tab") + { + delim = b'\t'; + } else { + delim = orig_delim; + } + table_aliases.insert(table_name.to_string(), format!("_t_{}", idx + 1)); if debuglog_flag {