From 141c0ae35d70363d0b36d9a6eb9df41846bde0e8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 28 Jan 2024 09:50:29 -0500 Subject: [PATCH] `joinp`: automatically set tab delimiter when reading TSV/TAB input files --- src/cmd/joinp.rs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/cmd/joinp.rs b/src/cmd/joinp.rs index 46efa0b20..b2891f0ed 100644 --- a/src/cmd/joinp.rs +++ b/src/cmd/joinp.rs @@ -162,7 +162,7 @@ joinp options: Common options: -h, --help Display this message -o, --output <file> Write output to <file> instead of stdout. - -d, --delimiter <arg> The field delimiter for reading CSV data. + -d, --delimiter <arg> The field delimiter for reading/writing CSV data. Must be a single character. (default: ,) -Q, --quiet Do not return join shape to stderr. "#; @@ -520,7 +520,7 @@ impl Args { .has_header(true) .with_missing_is_null(self.flag_nulls) .with_comment_prefix(comment_char.as_deref()) - .with_separator(delim) + .with_separator(tsvtab_delim(&self.arg_input1, delim)) .with_infer_schema_length(num_rows) .with_try_parse_dates(try_parsedates) .low_memory(low_memory) @@ -545,7 +545,7 @@ impl Args { .has_header(true) .with_missing_is_null(self.flag_nulls) .with_comment_prefix(comment_char.as_deref()) - .with_separator(delim) + .with_separator(tsvtab_delim(&self.arg_input2, delim)) .with_infer_schema_length(num_rows) .with_try_parse_dates(try_parsedates) .low_memory(low_memory) @@ -580,3 +580,19 @@ impl Args { }) } } + +fn tsvtab_delim(file: &str, orig_delim: u8) -> u8 { + // if the file has a TSV or TAB extension, we automatically use tab as the delimiter + let inputfile_extension = Path::new(file) + .extension() + .and_then(std::ffi::OsStr::to_str) + .unwrap_or_default(); + + if inputfile_extension.eq_ignore_ascii_case("tsv") + || inputfile_extension.eq_ignore_ascii_case("tab") + { + b'\t' + } else { + orig_delim + } +}