From fd3a60cd1137f49b36214593b9f5ab9058fd1ed3 Mon Sep 17 00:00:00 2001 From: Dan Gealow Date: Thu, 15 Feb 2024 17:22:48 -0500 Subject: [PATCH] Support writing linear-tsv-style with escape="sep" --- R/vroom_write.R | 26 +++++++++++++++++++++++--- src/vroom_write.cc | 44 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/R/vroom_write.R b/R/vroom_write.R index 453640fd..453e9be4 100644 --- a/R/vroom_write.R +++ b/R/vroom_write.R @@ -5,6 +5,7 @@ #' @param escape The type of escape to use when quotes are in the data. #' - `double` - quotes are escaped by doubling them. #' - `backslash` - quotes are escaped by a preceding backslash. +#' - `sep` - tabs, newlines, carriage returns, and backslashes are escaped as `\t`, `\n`, `\r`, and `\\` #' - `none` - quotes are not escaped. #' @param quote How to handle fields which contain characters that need to be #' quoted. @@ -37,7 +38,7 @@ #' # vroom_write(mtcars, "mtcars.tsv.xz") vroom_write <- function(x, file, delim = '\t', eol = "\n", na = "NA", col_names = !append, append = FALSE, quote = c("needed", "all", "none"), escape = - c("double", "backslash", "none"), bom = FALSE, num_threads = + c("double", "backslash", "sep", "none"), bom = FALSE, num_threads = vroom_threads(), progress = vroom_progress(), path = deprecated()) { if (lifecycle::is_present(path)) { @@ -53,6 +54,15 @@ vroom_write <- function(x, file, delim = '\t', eol = "\n", na = "NA", col_names quote <- match.arg(quote) escape <- match.arg(escape) + if (escape == "sep") { + if (!all(c(delim, eol) %in% c("\t", "\n", "\r", "\r\n"))) { + stop("Can only escape separators `\\t`, `\\n`, and `\\r`") + } + if (quote != "none") { + warning("quotes in data will not be escaped with `escape = sep`") + } + } + opts <- get_vroom_write_opts(quote, escape, bom) # Standardise path returns a list, but we will only ever have 1 output file. 
@@ -109,7 +119,8 @@ vroom_write_opts <- function() c( "quote_all" = 2L, "escape_double" = 4L, "escape_backslash" = 8L, - "bom" = 16L + "bom" = 16L, + "escape_sep" = 32L ) #' Convert a data frame to a delimited string @@ -121,7 +132,7 @@ vroom_write_opts <- function() c( #' @inheritParams vroom_write #' @export vroom_format <- function(x, delim = "\t", eol = "\n", na = "NA", col_names = TRUE, - escape = c("double", "backslash", "none"), + escape = c("double", "backslash", "sep", "none"), quote = c("needed", "all", "none"), bom = FALSE, num_threads = vroom_threads()) { @@ -135,6 +146,15 @@ vroom_format <- function(x, delim = "\t", eol = "\n", na = "NA", col_names = TRU quote <- match.arg(quote) escape <- match.arg(escape) + if (escape == "sep") { + if (!all(c(delim, eol) %in% c("\t", "\n", "\r", "\r\n"))) { + stop("Can only escape separators `\\t`, `\\n`, and `\\r`") + } + if (quote != "none") { + warning("quotes in data will not be escaped with `escape = sep`") + } + } + opts <- get_vroom_write_opts(quote, escape, bom) # This seems to work ok in practice diff --git a/src/vroom_write.cc b/src/vroom_write.cc index b3624710..7709c525 100644 --- a/src/vroom_write.cc +++ b/src/vroom_write.cc @@ -20,7 +20,8 @@ typedef enum { quote_all = 2, escape_double = 4, escape_backslash = 8, - bom = 16 + bom = 16, + escape_sep = 32 } vroom_write_opt_t; size_t get_buffer_size( @@ -135,16 +136,41 @@ void str_to_buf( } auto end = str_p + len; - bool should_escape = options & (escape_double | escape_backslash); - auto escape = - options & escape_double ? '"' : options & escape_backslash ? 
'\\' : '\0'; - buf.reserve(buf.size() + len); - while (str_p < end) { - if (should_escape && *str_p == '"') { - buf.push_back(escape); + + if (options & escape_sep) { + while (str_p < end) { + if (*str_p == '\t') { + buf.push_back('\\'); + buf.push_back('t'); + ++str_p; + } else if (*str_p == '\n') { + buf.push_back('\\'); + buf.push_back('n'); + ++str_p; + } else if (*str_p == '\r') { + buf.push_back('\\'); + buf.push_back('r'); + ++str_p; + } else if (*str_p == '\\') { + buf.push_back('\\'); + buf.push_back('\\'); + ++str_p; + } else { + buf.push_back(*str_p++); + } + } + } else { + bool should_escape = options & (escape_double | escape_backslash); + auto escape = + options & escape_double ? '"' : options & escape_backslash ? '\\' : '\0'; + + while (str_p < end) { + if (should_escape && *str_p == '"') { + buf.push_back(escape); + } + buf.push_back(*str_p++); } - buf.push_back(*str_p++); } if (should_quote) {