cornball-ai · TroyHernandez · Jun 17, 2026 · Jun 17, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chatterbox
 Title: Text-to-Speech Using Chatterbox TTS Engine
-Version: 0.1.0.15
+Version: 0.1.0.16
 Authors@R:
     c(person("Troy", "Hernandez", role = c("aut", "cre"),
              email = "troy@cornball.ai",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# chatterbox 0.1.0.16 (development)
+
+- `chatterbox()` gains a `tune_gc` argument (default TRUE) to opt out of
+  the CUDA GC tuning added in 0.1.0.15. The tuning is a deliberate,
+  persistent `options()` side effect (torch reads the allocator rates
+  later, at CUDA init), documented in `?chatterbox`; pass
+  `tune_gc = FALSE` to skip it. No behavior change at the default.
+
 # chatterbox 0.1.0.15 (development)
 
 - `chatterbox()` now tunes torch's CUDA garbage-collection rates before the

diff --git a/R/tts.R b/R/tts.R
@@ -97,16 +97,32 @@ normalize_tts_text <- function(text, caps = TRUE, punctuation = TRUE) {
 #' the not-loaded error paths), then load later with
 #' \code{\link{load_chatterbox}}.
 #'
+#' @details
+#' When \code{tune_gc = TRUE} (the default) and \code{device} is CUDA, this
+#' raises torch's allocator GC floors before the first CUDA op. torch otherwise
+#' runs \code{gc()} on nearly every allocation once a model occupies more than
+#' 20\% of VRAM, which dominates inference. It sets session-global
+#' \code{torch.cuda_allocator_reserved_rate} (the model footprint over VRAM) and
+#' \code{torch.threshold_call_gc}, only when they are unset, so an explicit
+#' setting always wins. This is a deliberate, persistent side effect (torch
+#' reads the rates later, at CUDA init); pass \code{tune_gc = FALSE} to skip it.
+#'
 #' @param device Device to use ("cpu", "cuda", "mps", etc.)
 #' @param turbo Use turbo model (GPT-2 backbone, MeanFlow decoder). Default FALSE.
 #' @param load Load pretrained weights before returning. Default TRUE.
 #'   Requires a prior download (\code{\link{download_chatterbox_models}}).
+#' @param tune_gc Tune torch's CUDA GC rates for faster inference (CUDA only,
+#'   and only when unset). Persistent session side effect; default TRUE. See
+#'   Details.
 #' @return Chatterbox TTS model object, loaded unless \code{load = FALSE}
 #' @export
-chatterbox <- function(device = "cpu", turbo = FALSE, load = TRUE) {
-    # Must run before the first CUDA op (cuda_is_available below): torch reads
-    # its allocator GC rates once, at lazy CUDA init.
-    .set_cuda_gc_options(device, turbo)
+chatterbox <- function(device = "cpu", turbo = FALSE, load = TRUE,
+                       tune_gc = TRUE) {
+    # GC tuning must run before the first CUDA op (cuda_is_available below):
+    # torch reads its allocator GC rates once, at lazy CUDA init.
+    if (isTRUE(tune_gc)) {
+        .set_cuda_gc_options(device, turbo)
+    }
     # Fall back to CPU when the requested accelerator is absent
     # (Python from_pretrained does the same for MPS)
     if (grepl("^cuda", device) && !torch::cuda_is_available()) {

diff --git a/man/chatterbox.Rd b/man/chatterbox.Rd
@@ -3,7 +3,7 @@
 \alias{chatterbox}
 \title{Create (and load) a Chatterbox TTS model}
 \usage{
-chatterbox(device = "cpu", turbo = FALSE, load = TRUE)
+chatterbox(device = "cpu", turbo = FALSE, load = TRUE, tune_gc = TRUE)
 }
 \arguments{
 \item{device}{Device to use ("cpu", "cuda", "mps", etc.)}
@@ -12,6 +12,10 @@ chatterbox(device = "cpu", turbo = FALSE, load = TRUE)
 
 \item{load}{Load pretrained weights before returning. Default TRUE.
 Requires a prior download (\code{\link{download_chatterbox_models}}).}
+
+\item{tune_gc}{Tune torch's CUDA GC rates for faster inference (CUDA only,
+and only when unset). Persistent session side effect; default TRUE. See
+Details.}
 }
 \value{
 Chatterbox TTS model object, loaded unless \code{load = FALSE}
@@ -24,3 +28,13 @@ weights in the same call - the Python reference's
 the not-loaded error paths), then load later with
 \code{\link{load_chatterbox}}.
 }
+\details{
+When \code{tune_gc = TRUE} (the default) and \code{device} is CUDA, this
+raises torch's allocator GC floors before the first CUDA op. torch otherwise
+runs \code{gc()} on nearly every allocation once a model occupies more than
+20\% of VRAM, which dominates inference. It sets session-global
+\code{torch.cuda_allocator_reserved_rate} (the model footprint over VRAM) and
+\code{torch.threshold_call_gc}, only when they are unset, so an explicit
+setting always wins. This is a deliberate, persistent side effect (torch
+reads the rates later, at CUDA init); pass \code{tune_gc = FALSE} to skip it.
+}