Skip to content

Commit

Permalink
Improve write_xpt() functionality (#659)
Browse files Browse the repository at this point in the history
Closes #650
* Allow user to specify output variable width using the "width" attribute.
* Set minimum length of 1 for string variables to avoid bugs when writing blank vectors
* Add xpt format roundtrip tests
* Bump dev version
  • Loading branch information
gorcha authored Mar 1, 2022
1 parent acbb771 commit 455e206
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 5 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: haven
Title: Import and Export 'SPSS', 'Stata' and 'SAS' Files
Version: 2.4.3.9000
Version: 2.4.3.9001
Authors@R: c(
person("Hadley", "Wickham", , "[email protected]", role = c("aut", "cre")),
person("Evan", "Miller", role = c("aut", "cph"),
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

* Updated to ReadStat 1.1.8 RC (#650).

* All `write_` functions can now write custom variable widths by setting the
`width` attribute (#650).

* When writing files, the minimum width for character variables is now 1. This
fixes issues with statistical software reading blank character variables with
width 0 (#650).

* `write_sav()` now checks for case-insensitive duplicate variable names
(@juansebastianl, #641) and verifies that variable names are valid SPSS
variables.
Expand Down
27 changes: 23 additions & 4 deletions src/DfWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ inline int displayWidth(cpp11::sexp x) {
return 0;
}

inline int userWidth(cpp11::sexp x) {
cpp11::sexp user_width_obj(x.attr("width"));
switch(TYPEOF(user_width_obj)) {
case INTSXP:
return INTEGER(user_width_obj)[0];
case REALSXP:
return REAL(user_width_obj)[0];
}
return 0;
}

class Writer {
FileExt ext_;
Expand Down Expand Up @@ -260,7 +270,7 @@ class Writer {
}

readstat_variable_t* var =
readstat_add_variable(writer_, name, READSTAT_TYPE_INT32, 0);
readstat_add_variable(writer_, name, READSTAT_TYPE_INT32, userWidth(x));
readstat_variable_set_format(var, format);
readstat_variable_set_label(var, var_label(x));
readstat_variable_set_label_set(var, labelSet);
Expand Down Expand Up @@ -305,7 +315,7 @@ class Writer {
}

readstat_variable_t* var =
readstat_add_variable(writer_, name, READSTAT_TYPE_DOUBLE, 0);
readstat_add_variable(writer_, name, READSTAT_TYPE_DOUBLE, userWidth(x));

readstat_variable_set_format(var, format);
readstat_variable_set_label(var, var_label(x));
Expand Down Expand Up @@ -343,15 +353,24 @@ class Writer {
for (int i = 0; i < values.size(); ++i)
readstat_label_string_value(labelSet, string_utf8(values, i), string_utf8(labels, i));
}
int max_length = 0;

int user_width = userWidth(x);
int max_length = 1;
for (int i = 0; i < x.size(); ++i) {
int length = strlen(string_utf8(x, i));
if (length > max_length)
max_length = length;
}
if (max_length > user_width) {
if (user_width > 0) {
cpp11::warning("Column `%s` contains string values longer than user width %d. Width set to %d to accommodate.", name, user_width, max_length);
}
user_width = max_length;
}


readstat_variable_t* var =
readstat_add_variable(writer_, name, READSTAT_TYPE_STRING, max_length);
readstat_add_variable(writer_, name, READSTAT_TYPE_STRING, user_width);
readstat_variable_set_format(var, format);
readstat_variable_set_label(var, var_label(x));
readstat_variable_set_label_set(var, labelSet);
Expand Down
20 changes: 20 additions & 0 deletions tests/testthat/test-haven-sas.R
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,23 @@ test_that("can roundtrip file labels", {
expect_equal(attr(roundtrip_xpt(df, label = "abcd"), "label"), "abcd")
expect_null(attr(roundtrip_xpt(df, label = NULL), "label"))
})

test_that("can roundtrip format attribute", {
df <- tibble(
char_var = structure("Hello!", format.sas = "$CHAR"),
long_char = structure("111111111111111", format.sas = "$CHAR10"),
date_var = structure(Sys.Date(), format.sas = "DATE9"),
a = structure(100.12345, format.sas = "10.3"),
b = structure(100.12345, format.sas = "10"),
c = structure(100.12345, format.sas = "F10.3"),
d = structure(100.12345, format.sas = "F10"),
e = structure(100.12345, format.sas = "COMMA10.3")
)

path <- tempfile()

write_xpt(df, path)
out <- read_xpt(path)

expect_identical(df, out)
})

0 comments on commit 455e206

Please sign in to comment.