From 5413cc49e1ff75425d3c0c2db6ab6ac661c6256f Mon Sep 17 00:00:00 2001
From: Barbara Vreede <b.vreede@esciencecenter.nl>
Date: Thu, 22 Aug 2024 11:46:30 +0200
Subject: [PATCH] allow for source = NULL in a talkr init (#100)

* allow for source = NULL in a talkr init

* check that existing columns aren't overwritten when source = null
---
 R/init.R                   |  9 ++++++++-
 man/init.Rd                |  4 +++-
 tests/testthat/test-init.R | 27 +++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/R/init.R b/R/init.R
index 17463d9..7d317d2 100644
--- a/R/init.R
+++ b/R/init.R
@@ -6,7 +6,9 @@
 #' Initializing a talkr dataset is the first step in the talkr workflow.
 #'
 #' @param data A dataframe object
-#' @param source The column name identifying the conversation source (e.g. a filename; is used as unique conversation ID)
+#' @param source The column name identifying the conversation source
+#'  (e.g. a filename; is used as unique conversation ID). If there are no different
+#'  sources in the data, set this parameter to `NULL`.
 #' @param begin The column name with the begin time of the utterance (in milliseconds)
 #' @param end The column name with the end time of the utterance (in milliseconds)
 #' @param participant The column name with the participant who produced the utterance
@@ -43,6 +45,11 @@ init <- function(data,
     data$end <- as.numeric(data$end)
   }
 
+  # ensure a `source` column exists; if it does not exist, create one
+  if(!"source" %in% names(data)){
+    data$source <- "talkr"
+  }
+
   # generate UIDs
   if("uid" %in% names(data)){
     warning("Column 'uid' already exists in the dataset. This column will be renamed to `original_uid`.")
diff --git a/man/init.Rd b/man/init.Rd
index f6e64d0..65bc034 100644
--- a/man/init.Rd
+++ b/man/init.Rd
@@ -17,7 +17,9 @@ init(
 \arguments{
 \item{data}{A dataframe object}
 
-\item{source}{The column name identifying the conversation source (e.g. a filename; is used as unique conversation ID)}
+\item{source}{The column name identifying the conversation source
+(e.g. a filename; is used as unique conversation ID). If there are no different
+sources in the data, set this parameter to `NULL`.}
 
 \item{begin}{The column name with the begin time of the utterance (in milliseconds)}
 
diff --git a/tests/testthat/test-init.R b/tests/testthat/test-init.R
index aea9e07..6edbe86 100644
--- a/tests/testthat/test-init.R
+++ b/tests/testthat/test-init.R
@@ -113,3 +113,30 @@ test_that("Warning is generated with existing UID column", {
   expect_true("original_uid" %in% names(talkr_dataset))
 
 })
+
+test_that("init works with source = NULL", {  # case: no source column name is supplied
+  expect_no_error(talkr_dataset <- init(dummy_data,  # result is kept for the checks below
+                        source = NULL,
+                        begin = "col1",
+                        end = "col2",
+                        participant = "x",
+                        utterance = "y"))
+  expect_false("source" %in% names(dummy_data))  # fixture has no `source` column of its own
+  expect_true("source" %in% names(talkr_dataset))  # init() must have created one
+  expected_UIDs <- c("talkr-0001-1",  # presumably prefixed with the "talkr" placeholder source
+                     "talkr-0002-2",
+                     "talkr-0003-3",
+                     "talkr-0004-4",
+                     "talkr-0005-5")
+  expect_equal(talkr_dataset$uid, expected_UIDs)
+})
+
+test_that("init does not overwrite existing columns when source = NULL", {
+  data <- data.frame(begin = 1:4,  # names presumably match init()'s defaults -- TODO confirm
+                     end = 5:8,
+                     participant = "Person1",
+                     utterance = "HelloWorld",
+                     source = "A.txt")  # pre-existing `source` column that must survive init()
+  talkr_dataset <- init(data, source = NULL)
+  expect_equal(talkr_dataset$source, data$source)  # not replaced by the "talkr" placeholder
+})