Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Implement tbl_subassign_col() in C #1368

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 5 additions & 27 deletions R/subsetting.R
Original file line number Diff line number Diff line change
Expand Up @@ -636,35 +636,13 @@ is_tight_sequence_at_end <- function(i_new, n) {
}

tbl_subassign_col <- function(x, j, value) {
nrow <- fast_nrow(x)

# Grow, assign new names
new <- attr(j, "new")
if (!is.null(new)) {
length(x) <- max(j[new])
names(x)[j[new]] <- names2(j)[new]
if (length(j) > 1) {
order_j <- order(j)
value <- value[order_j]
j <- j[order_j]
}

# Update
to_remove <- integer()
for (jj in seq_along(value)) {
ji <- j[[jj]]
value_jj <- value[[jj]]
if (!is.null(value_jj)) {
x[[ji]] <- value_jj
} else {
to_remove <- c(to_remove, ji)
}
}

# Remove
if (length(to_remove) > 0) {
x <- x[-to_remove]
}

# Can be destroyed by setting length
attr(x, "row.names") <- .set_row_names(nrow)
x
.Call(`tibble_tbl_subassign_col`, x, j, value)
}

tbl_expand_to_nrow <- function(x, i) {
Expand Down
1 change: 1 addition & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ static const R_CallMethodDef CallEntries[] = {
{"tibble_string_to_indices", (DL_FUNC) &tibble_string_to_indices, 1},
{"tibble_restore_impl", (DL_FUNC) &tibble_restore_impl, 2},
{"tibble_need_coerce", (DL_FUNC) &tibble_need_coerce, 1},
{"tibble_tbl_subassign_col", (DL_FUNC) &tbl_subassign_col, 3},

{NULL, NULL, 0}
};
Expand Down
131 changes: 131 additions & 0 deletions src/tbl_subassign_col.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#include "tibble.h"

R_xlen_t fast_nrow(SEXP x);

// Performs a variant of x[j] <- value, growing x as necessary
// NULL values are supported
// First, creates the storage for the resulting x
// Then, populates that storage by copying the correct values exactly once
//
// Requires j to be sorted on input
SEXP tbl_subassign_col(SEXP x, SEXP j_, SEXP value) {
// Naming conventions:
// n_x: length of x
// x_idx: index into vector x
// x_names: names of x
// n_old: number of elements with some property
// j_max: maximum value of j
// j_/j: SEXP and native version of bare vector
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use j_ffi vs j? This way it's immediately clear which does what.

// xo: output vector
R_xlen_t nrow = fast_nrow(x);
R_xlen_t n_x = Rf_xlength(x);

// Compute j from j_, converting 1-based j_ to zero-based j
R_xlen_t n_j = Rf_xlength(j_);
R_xlen_t* j = (R_xlen_t*)R_alloc(n_j + 1, sizeof(R_xlen_t));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason not to use a raw vector here? Just asking because we usually renounce to manage memory only in very specific cases.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same remark for allocs below. Another reason to manage memory manually is that it makes it clearer when things are not needed anymore, so this has a self-documenting value.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you have used vec_as_location() upstream to avoid having to deal with double vectors? I imagine tibble doesn't need support for long vectors in the column dimension.

if (TYPEOF(j_) == INTSXP) {
const int* j_int = INTEGER(j_);
for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) {
j[j_idx] = j_int[j_idx] - 1;
}
} else if (TYPEOF(j_) == REALSXP) {
const double* j_real = REAL(j_);
for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) {
j[j_idx] = (R_xlen_t)j_real[j_idx] - 1;
}
} else {
Rf_error("Internal: tbl_subassign_col: invalid type for j_.");
}

// Add sentinel value
j[n_j] = -1;

// Compute j_max and n_old
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is n_old?

R_xlen_t j_max = n_x;
R_xlen_t n_old = 0;
for (R_xlen_t j_idx = 0; j_idx < n_j; ++j_idx) {
if (j_max <= j[j_idx]) {
j_max = j[j_idx] + 1;
}
if (VECTOR_ELT(value, j_idx) == R_NilValue) {
++n_old;
}
}

R_xlen_t n_xo = j_max - n_old;

// For each target element, position of source element in the x or value vectors
R_xlen_t* xo_src_idx = (R_xlen_t*)R_alloc(n_xo, sizeof(R_xlen_t));
bool* xo_is_value = (bool*)R_alloc(n_xo, sizeof(bool));

for (R_xlen_t j_idx = 0, xo_idx = 0, x_idx = 0; xo_idx < n_xo; ++x_idx) {
// Is the next xo element taken from x or from value?
bool is_value = (x_idx == j[j_idx]);
xo_is_value[xo_idx] = is_value;

if (is_value) {
// If the next value is not NULL, use it, otherwise skip it
if (VECTOR_ELT(value, j_idx) != R_NilValue) {
xo_src_idx[xo_idx++] = j_idx;
}
j_idx++;
} else {
// Use next x
xo_src_idx[xo_idx++] = x_idx;
}
}

// Allocate output vector and output names with final size
SEXP xo = Rf_allocVector(VECSXP, n_xo);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PROTECT

Rf_copyMostAttrib(x, xo);

SEXP xo_names = Rf_allocVector(STRSXP, n_xo);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PROTECT

SEXP j_names = Rf_getAttrib(j_, R_NamesSymbol);
SEXP x_names = Rf_getAttrib(x, R_NamesSymbol);

// Populate xo, copying from x or value as planned
for (R_xlen_t xo_idx = 0; xo_idx < n_xo; ++xo_idx) {
R_xlen_t x_idx = xo_src_idx[xo_idx];
if (xo_is_value[xo_idx]) {
SET_VECTOR_ELT(xo, xo_idx, VECTOR_ELT(value, x_idx));
SET_STRING_ELT(xo_names, xo_idx, STRING_ELT(j_names, x_idx));
} else {
if (x_idx >= n_x) {
Rf_error("Internal: tbl_subassign_col: x_j >= n_x, %d >= %d", x_idx, n_x);
}
SET_VECTOR_ELT(xo, xo_idx, VECTOR_ELT(x, x_idx));
SET_STRING_ELT(xo_names, xo_idx, STRING_ELT(x_names, x_idx));
}
}

// Set output names
Rf_setAttrib(xo, R_NamesSymbol, xo_names);

// Set output row names
// FIXME: Reuse original vector?
SEXP new_row_names_ = Rf_allocVector(INTSXP, 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PROTECT

int* new_row_names = INTEGER(new_row_names_);
new_row_names[0] = NA_INTEGER;
new_row_names[1] = -nrow;
Rf_setAttrib(xo, R_RowNamesSymbol, new_row_names_);

return xo;
}

R_xlen_t fast_nrow(SEXP x) {
SEXP row_names = Rf_getAttrib(x, R_RowNamesSymbol);

if (Rf_xlength(row_names) == 0) {
return 0;
}

if (TYPEOF(row_names) == INTSXP && INTEGER_ELT(row_names, 0) == NA_INTEGER) {
int out = INTEGER_ELT(row_names, 1);
if (out < 0) {
out = -out;
}
return out;
} else {
return Rf_xlength(row_names);
}
}
3 changes: 3 additions & 0 deletions src/tibble.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
#define R_NO_REMAP
#include <Rinternals.h>

#include "stdbool.h"

SEXP tibble_matrixToDataFrame(SEXP xSEXP);
SEXP tibble_string_to_indices(SEXP x);
SEXP tibble_need_coerce(SEXP x);
SEXP tibble_update_attrs(SEXP x, SEXP dots);
SEXP tibble_restore_impl(SEXP xo, SEXP x);
SEXP tbl_subassign_col(SEXP x, SEXP j, SEXP value);

#endif /* TIBBLE_H */