Merge pull request #1424 from jqnatividad/QSV_OUTPUT_BOM-envvar

Add Byte Order Mark (BOM) output support
dathere · Nov 18, 2023 · f3b7ea0 · f3b7ea0
2 parents 402cb98 + 1758770
commit f3b7ea0
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -285,7 +285,7 @@ Should you need to re-encode CSV/TSV files, you can use the `input` command to "
 
 Alternatively, if you want to truly transcode to UTF-8, there are several utilities like [`iconv`](https://en.wikipedia.org/wiki/Iconv) that you can use to do so on [Linux/macOS](https://stackoverflow.com/questions/805418/how-can-i-find-encoding-of-a-file-via-a-script-on-linux) & [Windows](https://superuser.com/questions/1163753/converting-text-file-to-utf-8-on-windows-command-prompt).
 
-### Windows Usage Note
+### Windows Excel Usage Note
 
 Unlike other modern operating systems, Microsoft Windows' [default encoding is UTF16-LE](https://stackoverflow.com/questions/66072117/why-does-windows-use-utf-16le). This will cause problems when redirecting qsv's output to a CSV file & trying to open it with Excel (which ignores the comma delimiter & puts everything in the first column if the file is UTF16-LE encoded):
 
@@ -304,6 +304,12 @@ qsv stats wcp.csv > wcpstats.csv
 qsv stats wcp.csv --output wcpstats.csv
 ```
 
+Alternatively, qsv can add a [Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark) (BOM) to the beginning of a CSV to indicate it's UTF-8 encoded. You can do this by setting the `QSV_OUTPUT_BOM` environment variable to `1`.
+
+This will allow Excel on Windows to properly recognize the CSV file as UTF-8 encoded.
+
+Note that this problem does not occur with Excel on macOS, as macOS uses UTF-8 as its default encoding.
+
 ## Interpreters
 For complex data-wrangling tasks, you can use Luau and Python scripts.
 

diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md
@@ -10,10 +10,12 @@
 | `QSV_CACHE_DIR` | The directory to use for caching downloaded lookup_table resources using the `luau` qsv_register_lookup() helper function. |
 | `QSV_CKAN_API` | The CKAN Action API endpoint to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. |
 | `QSV_CKAN_TOKEN`| The CKAN token to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. Only required to access private resources. |
-| `QSV_OPENAI_KEY` | The OpenAI API key to use with the `describegpt` command. |
 | `QSV_COMMENT_CHAR` | set to an ASCII character. If set, any lines (including the header) that start with this character are ignored. |
 | `QSV_MAX_JOBS` | number of jobs to use for multithreaded commands (currently `apply`, `applydp`, `dedup`, `diff`, `extsort`, `frequency`, `joinp`, `schema`, `snappy`, `sort`, `split`, `stats`, `to`, `tojsonl` & `validate`). If not set, max_jobs is set to the detected number of logical processors.  See [Multithreading](docs/PERFORMANCE.md#multithreading) for more info. |
 | `QSV_NO_UPDATE` | if set, prohibit self-update version check for the latest qsv release published on GitHub. |
+| `QSV_OPENAI_KEY` | The OpenAI API key to use with the `describegpt` command. |
+| `QSV_OUTPUT_BOM` | if set, the output will have a Byte Order Mark (BOM) at the beginning. This is used to generate Excel-friendly CSVs on Windows. |
 | `QSV_PREFER_DMY` | if set, date parsing will use DMY format. Otherwise, use MDY format (used with `apply datefmt`, `schema`, `sniff` & `stats` commands). |
 | `QSV_REGEX_UNICODE` | if set, makes `search`, `searchset` & `replace` commands unicode-aware. For increased performance, these commands are not unicode-aware by default & will ignore unicode values when matching & will abort when unicode characters are used in the regex. Note that the `apply operations regex_replace` operation is always unicode-aware. |
 | `QSV_RDR_BUFFER_CAPACITY` | reader buffer size (default (bytes): 16384) |

diff --git a/dotenv.template.yaml b/dotenv.template.yaml
@@ -45,6 +45,11 @@ QSV_NO_HEADERS = False
 # updated regardless of this setting.
 # QSV_AUTOINDEX_SIZE = 1000000
 
+# if set, add a BOM (Byte Order Mark) to the beginning of the output.
+# Note that this will also set the BOM for qsv's output to stdout.
+# This is useful when generating CSV files for Excel on Windows.
+# QSV_OUTPUT_BOM = 1
+
 # The directory to use for caching various qsv files.
 # Used by the `geocode` command for downloaded geocoding resources.
 # Used by the `luau` command for downloaded lookup_table resources using

diff --git a/src/config.rs b/src/config.rs
@@ -528,11 +528,15 @@ impl Config {
     }
 
     #[allow(clippy::wrong_self_convention)]
-    pub fn from_writer<W: io::Write>(&self, wtr: W) -> csv::Writer<W> {
+    pub fn from_writer<W: io::Write>(&self, mut wtr: W) -> csv::Writer<W> {
         let wtr_capacitys = env::var("QSV_WTR_BUFFER_CAPACITY")
             .unwrap_or_else(|_| DEFAULT_WTR_BUFFER_CAPACITY.to_string());
         let wtr_buffer: usize = wtr_capacitys.parse().unwrap_or(DEFAULT_WTR_BUFFER_CAPACITY);
 
+        if util::get_envvar_flag("QSV_OUTPUT_BOM") {
+            wtr.write_all("\u{FEFF}".as_bytes()).unwrap();
+        }
+
         csv::WriterBuilder::new()
             .flexible(self.flexible)
             .delimiter(self.delimiter)