From faafe918c03c9dfe81d54fa4d6cc4676c6deffc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Llu=C3=ADs?= Date: Wed, 6 Dec 2023 12:48:52 +0100 Subject: [PATCH] Fixes in easy cases the spatial distribution of samples --- R/indexing.R | 30 +++++++++++++++++++++++------- R/spatial.R | 22 +++++++--------------- tests/testthat/test-indexing.R | 33 +++++++++++++++++++++++++-------- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/R/indexing.R b/R/indexing.R index b18f5df..ce90cca 100644 --- a/R/indexing.R +++ b/R/indexing.R @@ -33,22 +33,38 @@ create_subset <- function(size_data, size_subset = NULL, n = NULL, name = "SubSe } # The workhorse function without any check +# size_batches is a vector with the number of elements in each batch. create_index <- function(size_data, size_batches, n, name = "SubSet") { # The size of each batch + stopifnot("Batches match the length" = length(size_batches) == n) i <- distribute_samples(size_data, size_batches) names(i) <- id2batch_names(name, n) i } # Shuffle sample within index to improve positioning -create_index4index <- function(index, n, name) { - i <- vector("list", length = n) - names(i) <- id2batch_names(name, n) - for (id in seq_along(index)){ - s <- sample(index[[id]]) - names(i)[s] +create_index4index <- function(index, size_subset, n, name) { + index_out <- vector("list", n) + names(index_out) <- id2batch_names(name, n) + for (batch in seq_along(index)) { + pos <- index[[batch]] + # Pick a batch from the new index to place the previous position + # Which hasn't been picked within the batch + i_lengths <- lengths(index_out) + + # Pick from the ones that are almost filled: to ensure that the batches are fully used. + i_lengths <- i_lengths[order(size_subset - i_lengths, decreasing = TRUE)] + batch_w_space <- i_lengths < size_subset + possible_positions <- which(batch_w_space)[seq_len(length(pos))] + + # Sample the positions of the new index for each current position in the index + index_i <- sample(possible_positions, length(pos)) + for (position in seq_along(pos)) { + index_out[[index_i[position]]] <- c(index_out[[index_i[position]]], + pos[position]) + } } - i + index_out } id2batch_names <- function(name, n) { diff --git a/R/spatial.R b/R/spatial.R index 3a9e96e..d36b472 100644 --- a/R/spatial.R +++ b/R/spatial.R @@ -67,22 +67,13 @@ spatial <- function(index, pheno, omit = NULL, remove_positions = NULL, rows = L # Use index to duplicate samples in case the index comes from replicates. pheno_o <- pheno_o[unlist(index), ] - old_rows <- round(as.numeric(rownames(pheno_o))) rownames(pheno_o) <- NULL - new_rows <- as.numeric(rownames(pheno_o)) batches <- length(position) - # size_batches <- internal_batches(size_data, size_subset, batches) - i0 <- 0L - while (iterations > 0L) { - i <- create_index4index(index, batches, position) - # i <- create_index(size_data, size_batches, batches, name = position) - i0 <- i0 + 1L - message("Try ", i0, " iterations ", iterations) - if (!any(table(spatial = batch_names(i), batch = batch_names(index)) > 1L)) { - iterations <- iterations - 1 - } else { - next - } + size_subset <- optimum_batches(sum(lengths(index)), batches) + for (j in seq_len(iterations)) { + + i <- create_index4index(index, size_subset, name = position, n = batches) + meanDiff <- .check_index(i, pheno_o, num, eval_n, original_pheno) # Minimize the value optimize <- sum(rowMeans(abs(meanDiff))) @@ -93,5 +84,6 @@ spatial <- function(index, pheno, omit = NULL, remove_positions = NULL, rows = L val <- i } } - translate_index(val, old_rows, new_rows) + + val } diff --git a/tests/testthat/test-indexing.R b/tests/testthat/test-indexing.R index 8e125e8..2dd8f6b 100644 --- a/tests/testthat/test-indexing.R +++ b/tests/testthat/test-indexing.R @@ -1,8 +1,25 @@ - -test_that("sizes_batches works", { - out <- sizes_batches(size_data = 237, size_subset = 60, batches = 4) - expect_equal(out, c(60, 59, 59, 59)) - - expect_error(sizes_batches(size_data = 237, size_subset = 59, batches = 4), - "batches or size_subset is too small to fit all the samples.") -}) +test_that("sizes_batches works", { + out <- sizes_batches(size_data = 237, size_subset = 60, batches = 4) + expect_equal(out, c(60, 59, 59, 59)) + + expect_error(sizes_batches(size_data = 237, size_subset = 59, batches = 4), + "batches or size_subset is too small to fit all the samples.") +}) + +test_that("create_index4index works", { + i1 <- create_index(15, rep.int(5, 3), 3) + i2 <- create_index4index(i1, size_subset = 3, n = 5, name = "spatial") + bn1 <- batch_names(i1) + bn2 <- batch_names(i2) + expect_true(all(table(bn1, bn2) == 1)) +}) + + +test_that("translate_index works", { + index <- create_index(45, rep.int(9, 5), 5) + old_rows <- seq_len(47)[-c(2, 7)] + new_rows <- seq_len(45) + ti <- translate_index(index, old_rows, new_rows) + expect_true(all(c(2, 7) %in% unlist(index, FALSE, FALSE))) + expect_false(all(c(2, 7) %in% unlist(ti, FALSE, FALSE))) +})