Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: whisper
Title: Native R 'torch' Implementation of 'OpenAI' 'Whisper'
Version: 0.2.0
Version: 0.3.0
Authors@R: c(
person("Troy", "Hernandez", role = c("aut", "cre"),
email = "troy@cornball.ai"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# tinyrox says don't edit this manually, but it can't stop you!

export(audio_to_mel)
export(detect_language)
export(download_whisper_model)
export(list_downloaded_models)
export(list_whisper_models)
Expand Down
16 changes: 16 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# whisper 0.3.0

* Language auto-detection: `transcribe()` now defaults to `language = NULL`,
which detects the spoken language from the audio before decoding. New
exported function `detect_language()` for standalone language identification.
**Breaking**: previous default was `language = "en"`. Code relying on the
default now auto-detects instead of assuming English. Pass `language = "en"`
explicitly to restore old behavior.
* Segment-level and word-level timestamps via DTW alignment
* Beam search decoding with temperature sampling and fallback
* SDPA attention (FlashAttention on GPU)
* `whisper_pipeline()` for cached model reuse across multiple transcriptions
* Hardcoded special token table (eliminates `added_tokens.json` download)
* Fixed invalid multibyte string crash in BPE decoder
* Fixed DTW boundary guards and seek loop in `transcribe_chunk()`

# whisper 0.1.0

* Initial CRAN submission
Expand Down
38 changes: 30 additions & 8 deletions R/config.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,36 @@ whisper_lang_token <- function(
lang = "en",
model = "tiny"
) {
# Language tokens start at 50259 for all models
langs <- c(
langs <- whisper_language_table()

if (!lang %in% names(langs)) {
stop("Unknown language: ", lang)
}

50259L + langs[[lang]]
}

#' Get Language Code from Token ID
#'
#' Reverse lookup: convert a language token ID back to a two-letter code.
#' Inverse of `whisper_lang_token()`.
#'
#' @param token_id Integer token ID (e.g., 50259 for English)
#' @return Two-letter language code
whisper_lang_from_id <- function(token_id) {
  lang_tbl <- whisper_language_table()
  # Table values are offsets relative to the first language token (50259)
  pos <- match(token_id - 50259L, lang_tbl)
  if (is.na(pos)) {
    stop("Unknown language token ID: ", token_id)
  }
  names(lang_tbl)[pos]
}

#' Whisper Language Table
#'
#' Returns the named integer vector mapping language codes to offsets.
#'
#' @return Named integer vector (language code -> offset from 50259)
whisper_language_table <- function() {
c(
en = 0L, zh = 1L, de = 2L, es = 3L, ru = 4L, ko = 5L, fr = 6L,
ja = 7L, pt = 8L, tr = 9L, pl = 10L, ca = 11L, nl = 12L, ar = 13L,
sv = 14L, it = 15L, id = 16L, hi = 17L, fi = 18L, vi = 19L,
Expand All @@ -165,11 +193,5 @@ whisper_lang_token <- function(
tt = 92L, haw = 93L, ln = 94L, ha = 95L, ba = 96L, jw = 97L,
su = 98L
)

if (!lang %in% names(langs)) {
stop("Unknown language: ", lang)
}

50259L + langs[[lang]]
}

116 changes: 116 additions & 0 deletions R/language.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Language detection ---------------------------------------------------------
#
# Detect the spoken language in an audio file using Whisper.

#' Detect Language
#'
#' Identify the spoken language in an audio file. Uses Whisper's decoder
#' to predict the most likely language token from the first 30 seconds
#' of audio.
#'
#' @param file Path to audio file (WAV, MP3, etc.)
#' @param model Model name: "tiny", "base", "small", "medium", "large-v3"
#' @param device Device: "auto", "cpu", "cuda"
#' @param dtype Data type: "auto", "float16", "float32"
#' @param top_k Number of top language probabilities to return (default: 5)
#' @param download If TRUE and model not present, prompt to download.
#' @param verbose Print loading messages.
#' @return List with \code{language} (two-letter code) and
#'   \code{probabilities} (named numeric vector of top-k language probs).
#' @export
#' @examples
#' \donttest{
#' if (model_exists("tiny")) {
#'   audio_file <- system.file("audio", "jfk.mp3", package = "whisper")
#'   result <- detect_language(audio_file)
#'   result$language
#'   result$probabilities
#' }
#' }
detect_language <- function(
  file,
  model = "tiny",
  device = "auto",
  dtype = "auto",
  top_k = 5L,
  download = TRUE,
  verbose = TRUE
) {
  # Build (or reuse cached) pipeline, then delegate to the internal helper
  wp <- whisper_pipeline(
    model,
    device = device,
    dtype = dtype,
    download = download,
    verbose = verbose
  )
  detect_language_from_pipeline(wp, file, top_k = top_k)
}

#' Detect Language from Pipeline
#'
#' Internal function that runs language detection using a pre-loaded
#' pipeline, avoiding a fresh model load per call.
#'
#' @param pipe A whisper_pipeline object
#' @param file Path to audio file, or numeric vector of audio samples
#' @param top_k Number of top probabilities to return
#' @return List with language code and probabilities
detect_language_from_pipeline <- function(pipe, file, top_k = 5L) {
  config <- pipe$config
  model <- pipe$model
  device <- pipe$device
  dtype <- pipe$dtype

  # Compute mel spectrogram (presumably only the leading 30 s window is
  # consumed downstream — see detect_language_from_mel)
  mel <- audio_to_mel(file, n_mels = config$n_mels, device = device,
                      dtype = dtype)

  # Forward top_k so the caller's setting is honored; previously it was
  # dropped here, silently pinning the result to the default of 5.
  detect_language_from_mel(model, mel, config, device, top_k = top_k)
}

#' Detect Language from Mel Spectrogram
#'
#' Core detection logic. Encodes the mel spectrogram, feeds a single SOT
#' (start-of-transcript) token to the decoder, and reads the logits of the
#' language tokens from the first decoding step.
#'
#' @param model WhisperModel
#' @param mel Mel spectrogram tensor
#' @param config Model config
#' @param device torch device
#' @param top_k Number of top probabilities to return
#' @return List with language code and probabilities
detect_language_from_mel <- function(model, mel, config, device, top_k = 5L) {
  special <- whisper_special_tokens(config$model_name)
  langs <- whisper_language_table()
  n_langs <- length(langs)

  # Language token IDs occupy a contiguous range: 50259 .. 50259 + n_langs - 1
  lang_start <- 50259L
  lang_end <- lang_start + n_langs - 1L

  # No gradients needed for inference; note probs_r is assigned inside this
  # block but remains visible afterwards (with_no_grad evaluates the
  # expression in the calling environment)
  torch::with_no_grad({
    # Encode audio
    encoder_output <- model$encode(mel)

    # Feed just the SOT token to the decoder (batch of 1, sequence of 1)
    sot <- torch::torch_tensor(matrix(special$sot, nrow = 1L),
                               dtype = torch::torch_long(), device = device)

    result <- model$decode(sot, encoder_output)
    # logits shape: (1, 1, n_vocab) — take the distribution for the one step
    logits <- result$logits[1, 1, ]

    # Extract language token logits (R is 1-indexed, token IDs are 0-indexed in vocab)
    # Token ID 50259 is at position 50260 in 1-indexed logits
    lang_logits <- logits[(lang_start + 1L):(lang_end + 1L)]

    # Softmax over language logits only (not the full vocabulary), so the
    # probabilities are conditional on the token being a language token
    probs <- torch::nnf_softmax(lang_logits, dim = 1L)
    probs_r <- as.numeric(probs$cpu())
  })

  # Assumes whisper_language_table() is ordered by token offset (en = 0 first)
  # so positions in probs_r line up with table names — holds for the table in
  # config.R; verify if the table is ever reordered
  names(probs_r) <- names(langs)

  # Top-k languages by probability, capped at the number of languages
  top_idx <- order(probs_r, decreasing = TRUE)[seq_len(min(top_k, n_langs))]
  top_probs <- probs_r[top_idx]

  list(
    language = names(probs_r)[top_idx[1]],
    probabilities = top_probs
  )
}
42 changes: 36 additions & 6 deletions R/transcribe.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ whisper_pipeline <- function(

pipe$transcribe <- function(
file,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -101,7 +101,7 @@ print.whisper_pipeline <- function(x, ...) {
pipeline_transcribe <- function(
pipe,
file,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -159,7 +159,8 @@ pipeline_transcribe <- function(
#'
#' @param file Path to audio file (WAV, MP3, etc.)
#' @param model Model name: "tiny", "base", "small", "medium", "large-v3"
#' @param language Language code (e.g., "en", "es"). NULL for auto-detection.
#' @param language Language code (e.g., "en", "es"), or NULL (default) for
#' auto-detection from the audio.
#' @param task "transcribe" or "translate" (translate to English)
#' @param timestamps If TRUE, return segment-level timestamps
#' @param word_timestamps If TRUE, return word-level timestamps (implies timestamps)
Expand All @@ -181,12 +182,17 @@ pipeline_transcribe <- function(
#' @export
#' @examples
#' \donttest{
#' # Transcribe included sample (JFK "ask not" speech)
#' if (model_exists("tiny")) {
#' audio_file <- system.file("audio", "jfk.mp3", package = "whisper")
#'
#' # Auto-detect language (default)
#' result <- transcribe(audio_file, model = "tiny")
#' result$language # "en"
#' result$text
#'
#' # Explicit language
#' result <- transcribe(audio_file, model = "tiny", language = "en")
#'
#' # With timestamps
#' result <- transcribe(audio_file, model = "tiny", timestamps = TRUE)
#' result$segments
Expand All @@ -201,7 +207,7 @@ pipeline_transcribe <- function(
transcribe <- function(
file,
model = "tiny",
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -255,7 +261,7 @@ transcribe_chunk <- function(
model,
tokenizer,
config,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand All @@ -282,6 +288,17 @@ transcribe_chunk <- function(

special <- whisper_special_tokens(config$model_name)

# Auto-detect language if not specified
if (is.null(language)) {
detection <- detect_language_from_mel(model, full_mel, config, device)
language <- detection$language
if (verbose) {
top <- detection$probabilities[1]
message("Detected language: ", language,
" (", round(top * 100, 1), "%)")
}
}

# Seek loop: decode repeatedly, advancing through the mel spectrogram
seek <- 0L # current frame position
all_generated <- integer(0)
Expand Down Expand Up @@ -687,6 +704,19 @@ transcribe_long <- function(
dtype,
verbose
) {
# Auto-detect language from first 30s if not specified
if (is.null(language)) {
mel <- audio_to_mel(file, n_mels = config$n_mels, device = device,
dtype = dtype)
detection <- detect_language_from_mel(model, mel, config, device)
language <- detection$language
if (verbose) {
top <- detection$probabilities[1]
message("Detected language: ", language,
" (", round(top * 100, 1), "%)")
}
}

# Split into chunks
chunk_length <- 30
overlap <- 1
Expand Down
35 changes: 35 additions & 0 deletions inst/tinytest/test_language.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Tests for language detection

# --- Unit tests (no model download required) ---
lang_tbl <- whisper:::whisper_language_table()
expect_equal(length(lang_tbl), 99L)
expect_equal(lang_tbl[["en"]], 0L)
expect_equal(lang_tbl[["su"]], 98L)

# Token ID -> language code reverse lookup
expect_equal(whisper:::whisper_lang_from_id(50259L), "en")
expect_equal(whisper:::whisper_lang_from_id(50260L), "zh")
expect_equal(whisper:::whisper_lang_from_id(50357L), "su")

# Every code survives a round trip: code -> token ID -> code
for (lang_code in names(lang_tbl)) {
  id <- whisper:::whisper_lang_token(lang_code)
  expect_equal(whisper:::whisper_lang_from_id(id), lang_code)
}

# --- Integration tests (require the "tiny" model on disk) ---
if (at_home() && whisper::model_exists("tiny")) {
  audio_file <- system.file("audio", "jfk.mp3", package = "whisper")

  # detect_language should return English for JFK speech
  res <- whisper::detect_language(audio_file, model = "tiny", verbose = FALSE)
  expect_true(is.list(res))
  expect_equal(res$language, "en")
  expect_true(res$probabilities[["en"]] > 0.5)
  expect_true(length(res$probabilities) == 5L)

  # transcribe with the default language = NULL should auto-detect
  res <- whisper::transcribe(audio_file, model = "tiny", verbose = FALSE)
  expect_equal(res$language, "en")
  expect_true(nchar(res$text) > 0)
}
17 changes: 4 additions & 13 deletions man/beam_search_decode.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,10 @@
\alias{beam_search_decode}
\title{Beam Search Decode}
\usage{
beam_search_decode(
model,
encoder_output,
initial_tokens,
tokenizer,
beam_size = 5L,
max_length = 448L,
timestamps = FALSE,
word_timestamps = FALSE,
length_penalty = 1,
patience = Inf,
device
)
beam_search_decode(model, encoder_output, initial_tokens, tokenizer,
beam_size = 5L, max_length = 448L, timestamps = FALSE,
word_timestamps = FALSE, length_penalty = 1, patience = Inf,
device)
}
\arguments{
\item{model}{WhisperModel}
Expand Down
10 changes: 2 additions & 8 deletions man/compute_word_timestamps.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@
\alias{compute_word_timestamps}
\title{Word-Level Timestamp Alignment}
\usage{
compute_word_timestamps(
tokens,
cross_attn_weights,
tokenizer,
config,
time_offset = 0,
sample_begin = 4L
)
compute_word_timestamps(tokens, cross_attn_weights, tokenizer, config,
time_offset = 0, sample_begin = 4L)
}
\arguments{
\item{tokens}{Integer vector of generated token IDs}
Expand Down
7 changes: 2 additions & 5 deletions man/create_mel_filterbank_fallback.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@
\alias{create_mel_filterbank_fallback}
\title{Create Mel Filterbank (Fallback)}
\usage{
create_mel_filterbank_fallback(
n_fft = WHISPER_N_FFT,
n_mels = 80L,
sample_rate = WHISPER_SAMPLE_RATE
)
create_mel_filterbank_fallback(n_fft = WHISPER_N_FFT, n_mels = 80L,
sample_rate = WHISPER_SAMPLE_RATE)
}
\arguments{
\item{n_fft}{FFT size}
Expand Down
Loading