From faafe918c03c9dfe81d54fa4d6cc4676c6deffc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Llu=C3=ADs?= <lluis.revilla@gmail.com>
Date: Wed, 6 Dec 2023 12:48:52 +0100
Subject: [PATCH] Fixes in easy cases the spatial distribution of samples

---
 R/indexing.R                   | 30 +++++++++++++++++++++++-------
 R/spatial.R                    | 22 +++++++---------------
 tests/testthat/test-indexing.R | 33 +++++++++++++++++++++++++--------
 3 files changed, 55 insertions(+), 30 deletions(-)

diff --git a/R/indexing.R b/R/indexing.R
index b18f5df..ce90cca 100644
--- a/R/indexing.R
+++ b/R/indexing.R
@@ -33,22 +33,38 @@ create_subset <- function(size_data, size_subset = NULL, n = NULL, name = "SubSe
 }
 
 # The workhorse function without any check
+# size_batches is a vector with the number of elements in each batch.
 create_index <- function(size_data, size_batches, n, name = "SubSet") {
   # The size of each batch
+  stopifnot("Batches match the length" = length(size_batches) == n)
   i <- distribute_samples(size_data, size_batches)
   names(i) <- id2batch_names(name, n)
   i
 }
 
 # Shuffle sample within index to improve positioning
-create_index4index <- function(index, n, name) {
-  i <- vector("list", length = n)
-  names(i) <- id2batch_names(name, n)
-  for (id in seq_along(index)){
-    s <- sample(index[[id]])
-    names(i)[s]
+create_index4index <- function(index, size_subset, n, name) {
+  index_out <- vector("list", n)
+  names(index_out) <- id2batch_names(name, n)
+  for (batch in seq_along(index)) {
+    pos <- index[[batch]]
+    # Pick a batch from the new index to place the previous position
+    # Which hasn't been picked within the batch
+    i_lengths <- lengths(index_out)
+
+    # Pick from the ones that are almost filled: to ensure that the batches are fully used.
+    i_lengths <- i_lengths[order(size_subset - i_lengths, decreasing = TRUE)]
+    batch_w_space <- i_lengths < size_subset
+    possible_positions <- which(batch_w_space)[seq_len(length(pos))]
+
+    # Sample the positions of the new index for each current position in the index
+    index_i <- sample(possible_positions, length(pos))
+    for (position in seq_along(pos)) {
+      index_out[[index_i[position]]] <- c(index_out[[index_i[position]]],
+                                          pos[position])
+    }
   }
-  i
+  index_out
 }
 
 id2batch_names <- function(name, n) {
diff --git a/R/spatial.R b/R/spatial.R
index 3a9e96e..d36b472 100644
--- a/R/spatial.R
+++ b/R/spatial.R
@@ -67,22 +67,13 @@ spatial <- function(index, pheno, omit = NULL, remove_positions = NULL, rows = L
 
   # Use index to duplicate samples in case the index comes from replicates.
   pheno_o <- pheno_o[unlist(index), ]
-  old_rows <- round(as.numeric(rownames(pheno_o)))
   rownames(pheno_o) <- NULL
-  new_rows <- as.numeric(rownames(pheno_o))
   batches <- length(position)
-  # size_batches <- internal_batches(size_data, size_subset, batches)
-  i0 <- 0L
-  while (iterations > 0L) {
-    i <- create_index4index(index, batches, position)
-    # i <- create_index(size_data, size_batches, batches, name = position)
-    i0 <- i0 + 1L
-    message("Try ", i0, " iterations ", iterations)
-    if (!any(table(spatial = batch_names(i), batch = batch_names(index)) > 1L)) {
-      iterations <- iterations - 1
-    } else {
-      next
-    }
+  size_subset <- optimum_batches(sum(lengths(index)), batches)
+  for (j in seq_len(iterations)) {
+
+    i <- create_index4index(index, size_subset, name = position, n = batches)
+
     meanDiff <- .check_index(i, pheno_o, num, eval_n, original_pheno)
     # Minimize the value
     optimize <- sum(rowMeans(abs(meanDiff)))
@@ -93,5 +84,6 @@ spatial <- function(index, pheno, omit = NULL, remove_positions = NULL, rows = L
       val <- i
     }
   }
-  translate_index(val, old_rows, new_rows)
+
+  val
 }
diff --git a/tests/testthat/test-indexing.R b/tests/testthat/test-indexing.R
index 8e125e8..2dd8f6b 100644
--- a/tests/testthat/test-indexing.R
+++ b/tests/testthat/test-indexing.R
@@ -1,8 +1,25 @@
-
-test_that("sizes_batches works", {
-  out <- sizes_batches(size_data = 237, size_subset = 60, batches = 4)
-  expect_equal(out, c(60, 59, 59, 59))
-
-  expect_error(sizes_batches(size_data = 237, size_subset = 59, batches = 4),
-               "batches or size_subset is too small to fit all the samples.")
-})
+test_that("sizes_batches works", {
+  out <- sizes_batches(size_data = 237, size_subset = 60, batches = 4)
+  expect_equal(out, c(60, 59, 59, 59))
+
+  expect_error(sizes_batches(size_data = 237, size_subset = 59, batches = 4),
+               "batches or size_subset is too small to fit all the samples.")
+})
+
+test_that("create_index4index works", {
+  i1 <- create_index(15, rep.int(5, 3), 3)
+  i2 <- create_index4index(i1, size_subset = 3, n = 5, name = "spatial")
+  bn1 <- batch_names(i1)
+  bn2 <- batch_names(i2)
+  expect_true(all(table(bn1, bn2) == 1))
+})
+
+
+test_that("translate_index works", {
+  index <- create_index(45, rep.int(9, 5), 5)
+  old_rows <- seq_len(47)[-c(2, 7)]
+  new_rows <- seq_len(45)
+  ti <- translate_index(index, old_rows, new_rows)
+  expect_true(all(c(2, 7) %in% unlist(index, FALSE, FALSE)))
+  expect_false(all(c(2, 7) %in% unlist(ti, FALSE, FALSE)))
+})