diff --git a/DESCRIPTION b/DESCRIPTION index 5188d10..a0cb168 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: whisper Title: Native R 'torch' Implementation of 'OpenAI' 'Whisper' -Version: 0.2.0 +Version: 0.3.0 Authors@R: c( person("Troy", "Hernandez", role = c("aut", "cre"), email = "troy@cornball.ai"), diff --git a/NAMESPACE b/NAMESPACE index f3861cf..70c9564 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # tinyrox says don't edit this manually, but it can't stop you! export(audio_to_mel) +export(detect_language) export(download_whisper_model) export(list_downloaded_models) export(list_whisper_models) diff --git a/NEWS.md b/NEWS.md index eaa95eb..8205c24 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# whisper 0.3.0 + +* Language auto-detection: `transcribe()` now defaults to `language = NULL`, + which detects the spoken language from the audio before decoding. New + exported function `detect_language()` for standalone language identification. + **Breaking**: previous default was `language = "en"`. Code relying on the + default now auto-detects instead of assuming English. Pass `language = "en"` + explicitly to restore old behavior. 
+* Segment-level and word-level timestamps via DTW alignment +* Beam search decoding with temperature sampling and fallback +* SDPA attention (FlashAttention on GPU) +* `whisper_pipeline()` for cached model reuse across multiple transcriptions +* Hardcoded special token table (eliminates `added_tokens.json` download) +* Fixed invalid multibyte string crash in BPE decoder +* Fixed DTW boundary guards and seek loop in `transcribe_chunk()` + # whisper 0.1.0 * Initial CRAN submission diff --git a/R/config.R b/R/config.R index 52f18fa..4b9fcd9 100644 --- a/R/config.R +++ b/R/config.R @@ -145,8 +145,36 @@ whisper_lang_token <- function( lang = "en", model = "tiny" ) { - # Language tokens start at 50259 for all models - langs <- c( + langs <- whisper_language_table() + + if (!lang %in% names(langs)) { + stop("Unknown language: ", lang) + } + + 50259L + langs[[lang]] +} + +#' Get Language Code from Token ID +#' +#' Reverse lookup: convert a language token ID back to a two-letter code. +#' +#' @param token_id Integer token ID (e.g., 50259 for English) +#' @return Two-letter language code +whisper_lang_from_id <- function(token_id) { + offset <- token_id - 50259L + langs <- whisper_language_table() + idx <- match(offset, langs) + if (is.na(idx)) stop("Unknown language token ID: ", token_id) + names(langs)[idx] +} + +#' Whisper Language Table +#' +#' Returns the named integer vector mapping language codes to offsets. 
+#' +#' @return Named integer vector (language code -> offset from 50259) +whisper_language_table <- function() { + c( en = 0L, zh = 1L, de = 2L, es = 3L, ru = 4L, ko = 5L, fr = 6L, ja = 7L, pt = 8L, tr = 9L, pl = 10L, ca = 11L, nl = 12L, ar = 13L, sv = 14L, it = 15L, id = 16L, hi = 17L, fi = 18L, vi = 19L, @@ -165,11 +193,5 @@ whisper_lang_token <- function( tt = 92L, haw = 93L, ln = 94L, ha = 95L, ba = 96L, jw = 97L, su = 98L ) - - if (!lang %in% names(langs)) { - stop("Unknown language: ", lang) - } - - 50259L + langs[[lang]] } diff --git a/R/language.R b/R/language.R new file mode 100644 index 0000000..d18798b --- /dev/null +++ b/R/language.R @@ -0,0 +1,116 @@ +#' Language Detection +#' +#' Detect the spoken language in an audio file using Whisper. + +#' Detect Language +#' +#' Identify the spoken language in an audio file. Uses Whisper's decoder +#' to predict the most likely language token from the first 30 seconds +#' of audio. +#' +#' @param file Path to audio file (WAV, MP3, etc.) +#' @param model Model name: "tiny", "base", "small", "medium", "large-v3" +#' @param device Device: "auto", "cpu", "cuda" +#' @param dtype Data type: "auto", "float16", "float32" +#' @param top_k Number of top language probabilities to return (default: 5) +#' @param download If TRUE and model not present, prompt to download. +#' @param verbose Print loading messages. +#' @return List with \code{language} (two-letter code) and +#' \code{probabilities} (named numeric vector of top-k language probs). 
+#' @export +#' @examples +#' \donttest{ +#' if (model_exists("tiny")) { +#' audio_file <- system.file("audio", "jfk.mp3", package = "whisper") +#' result <- detect_language(audio_file) +#' result$language +#' result$probabilities +#' } +#' } +detect_language <- function( + file, + model = "tiny", + device = "auto", + dtype = "auto", + top_k = 5L, + download = TRUE, + verbose = TRUE +) { + pipe <- whisper_pipeline(model, device = device, dtype = dtype, + download = download, verbose = verbose) + + detect_language_from_pipeline(pipe, file, top_k = top_k) +} + +#' Detect Language from Pipeline +#' +#' Internal function that runs language detection using a pre-loaded pipeline. +#' +#' @param pipe A whisper_pipeline object +#' @param file Path to audio file, or numeric vector of audio samples +#' @param top_k Number of top probabilities to return +#' @return List with language code and probabilities +detect_language_from_pipeline <- function(pipe, file, top_k = 5L) { + config <- pipe$config + model <- pipe$model + device <- pipe$device + dtype <- pipe$dtype + + # Compute mel spectrogram from first 30s + mel <- audio_to_mel(file, n_mels = config$n_mels, device = device, + dtype = dtype) + + detect_language_from_mel(model, mel, config, device, top_k = top_k) +} + +#' Detect Language from Mel Spectrogram +#' +#' Core detection logic. Feed SOT token to decoder, read language logits. 
+#' +#' @param model WhisperModel +#' @param mel Mel spectrogram tensor +#' @param config Model config +#' @param device torch device +#' @param top_k Number of top probabilities to return +#' @return List with language code and probabilities +detect_language_from_mel <- function(model, mel, config, device, top_k = 5L) { + special <- whisper_special_tokens(config$model_name) + langs <- whisper_language_table() + n_langs <- length(langs) + + # Language token IDs: 50259 to 50259 + n_langs - 1 + lang_start <- 50259L + lang_end <- lang_start + n_langs - 1L + + torch::with_no_grad({ + # Encode audio + encoder_output <- model$encode(mel) + + # Feed just the SOT token to the decoder + sot <- torch::torch_tensor(matrix(special$sot, nrow = 1L), + dtype = torch::torch_long(), device = device) + + result <- model$decode(sot, encoder_output) + # logits shape: (1, 1, n_vocab) + logits <- result$logits[1, 1, ] + + # Extract language token logits (R is 1-indexed, token IDs are 0-indexed in vocab) + # Token ID 50259 is at position 50260 in 1-indexed logits + lang_logits <- logits[(lang_start + 1L):(lang_end + 1L)] + + # Softmax over language logits only + probs <- torch::nnf_softmax(lang_logits, dim = 1L) + probs_r <- as.numeric(probs$cpu()) + }) + + names(probs_r) <- names(langs) + + # Find top-k + top_idx <- order(probs_r, decreasing = TRUE)[seq_len(min(top_k, n_langs))] + top_probs <- probs_r[top_idx] + + list( + language = names(probs_r)[top_idx[1]], + probabilities = top_probs + ) +} diff --git a/R/transcribe.R b/R/transcribe.R index 0c2a118..8b58286 100644 --- a/R/transcribe.R +++ b/R/transcribe.R @@ -46,7 +46,7 @@ whisper_pipeline <- function( pipe$transcribe <- function( file, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -101,7 +101,7 @@ print.whisper_pipeline <- function(x, ...) 
{ pipeline_transcribe <- function( pipe, file, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -159,7 +159,8 @@ pipeline_transcribe <- function( #' #' @param file Path to audio file (WAV, MP3, etc.) #' @param model Model name: "tiny", "base", "small", "medium", "large-v3" -#' @param language Language code (e.g., "en", "es"). NULL for auto-detection. +#' @param language Language code (e.g., "en", "es"), or NULL (default) for +#' auto-detection from the audio. #' @param task "transcribe" or "translate" (translate to English) #' @param timestamps If TRUE, return segment-level timestamps #' @param word_timestamps If TRUE, return word-level timestamps (implies timestamps) @@ -181,12 +182,17 @@ pipeline_transcribe <- function( #' @export #' @examples #' \donttest{ -#' # Transcribe included sample (JFK "ask not" speech) #' if (model_exists("tiny")) { #' audio_file <- system.file("audio", "jfk.mp3", package = "whisper") +#' +#' # Auto-detect language (default) #' result <- transcribe(audio_file, model = "tiny") +#' result$language # "en" #' result$text #' +#' # Explicit language +#' result <- transcribe(audio_file, model = "tiny", language = "en") +#' #' # With timestamps #' result <- transcribe(audio_file, model = "tiny", timestamps = TRUE) #' result$segments @@ -201,7 +207,7 @@ pipeline_transcribe <- function( transcribe <- function( file, model = "tiny", - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -255,7 +261,7 @@ transcribe_chunk <- function( model, tokenizer, config, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -282,6 +288,17 @@ transcribe_chunk <- function( special <- whisper_special_tokens(config$model_name) + # Auto-detect language if not specified + if (is.null(language)) { + detection <- detect_language_from_mel(model, full_mel, config, device) + language <- detection$language + if 
(verbose) { + top <- detection$probabilities[1] + message("Detected language: ", language, + " (", round(top * 100, 1), "%)") + } + } + # Seek loop: decode repeatedly, advancing through the mel spectrogram seek <- 0L # current frame position all_generated <- integer(0) @@ -687,6 +704,19 @@ transcribe_long <- function( dtype, verbose ) { + # Auto-detect language from first 30s if not specified + if (is.null(language)) { + mel <- audio_to_mel(file, n_mels = config$n_mels, device = device, + dtype = dtype) + detection <- detect_language_from_mel(model, mel, config, device) + language <- detection$language + if (verbose) { + top <- detection$probabilities[1] + message("Detected language: ", language, + " (", round(top * 100, 1), "%)") + } + } + # Split into chunks chunk_length <- 30 overlap <- 1 diff --git a/inst/tinytest/test_language.R b/inst/tinytest/test_language.R new file mode 100644 index 0000000..3d5e27d --- /dev/null +++ b/inst/tinytest/test_language.R @@ -0,0 +1,35 @@ +# Test language detection + +# Unit tests (no model needed) +langs <- whisper:::whisper_language_table() +expect_equal(length(langs), 99L) +expect_equal(langs[["en"]], 0L) +expect_equal(langs[["su"]], 98L) + +# Reverse lookup +expect_equal(whisper:::whisper_lang_from_id(50259L), "en") +expect_equal(whisper:::whisper_lang_from_id(50260L), "zh") +expect_equal(whisper:::whisper_lang_from_id(50357L), "su") + +# Round-trip: lang code -> token ID -> lang code +for (code in names(langs)) { + token_id <- whisper:::whisper_lang_token(code) + expect_equal(whisper:::whisper_lang_from_id(token_id), code) +} + +# Integration tests (need model) +if (at_home() && whisper::model_exists("tiny")) { + audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + + # detect_language should return English for JFK speech + result <- whisper::detect_language(audio_file, model = "tiny", verbose = FALSE) + expect_true(is.list(result)) + expect_equal(result$language, "en") + 
expect_true(result$probabilities[["en"]] > 0.5) + expect_true(length(result$probabilities) == 5L) + + # transcribe with language = NULL should auto-detect + result <- whisper::transcribe(audio_file, model = "tiny", verbose = FALSE) + expect_equal(result$language, "en") + expect_true(nchar(result$text) > 0) +} diff --git a/man/beam_search_decode.Rd b/man/beam_search_decode.Rd index 082cff5..061a8fc 100644 --- a/man/beam_search_decode.Rd +++ b/man/beam_search_decode.Rd @@ -3,19 +3,10 @@ \alias{beam_search_decode} \title{Beam Search Decode} \usage{ -beam_search_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - beam_size = 5L, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - length_penalty = 1, - patience = Inf, - device -) +beam_search_decode(model, encoder_output, initial_tokens, tokenizer, + beam_size = 5L, max_length = 448L, timestamps = FALSE, + word_timestamps = FALSE, length_penalty = 1, patience = Inf, + device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/compute_word_timestamps.Rd b/man/compute_word_timestamps.Rd index 17198ff..f963c4a 100644 --- a/man/compute_word_timestamps.Rd +++ b/man/compute_word_timestamps.Rd @@ -3,14 +3,8 @@ \alias{compute_word_timestamps} \title{Word-Level Timestamp Alignment} \usage{ -compute_word_timestamps( - tokens, - cross_attn_weights, - tokenizer, - config, - time_offset = 0, - sample_begin = 4L -) +compute_word_timestamps(tokens, cross_attn_weights, tokenizer, config, + time_offset = 0, sample_begin = 4L) } \arguments{ \item{tokens}{Integer vector of generated token IDs} diff --git a/man/create_mel_filterbank_fallback.Rd b/man/create_mel_filterbank_fallback.Rd index 6d01f8d..565a00e 100644 --- a/man/create_mel_filterbank_fallback.Rd +++ b/man/create_mel_filterbank_fallback.Rd @@ -3,11 +3,8 @@ \alias{create_mel_filterbank_fallback} \title{Create Mel Filterbank (Fallback)} \usage{ -create_mel_filterbank_fallback( - n_fft = WHISPER_N_FFT, - n_mels = 80L, - sample_rate = 
WHISPER_SAMPLE_RATE -) +create_mel_filterbank_fallback(n_fft = WHISPER_N_FFT, n_mels = 80L, + sample_rate = WHISPER_SAMPLE_RATE) } \arguments{ \item{n_fft}{FFT size} diff --git a/man/decode_with_fallback.Rd b/man/decode_with_fallback.Rd index 425f26e..b1a6db4 100644 --- a/man/decode_with_fallback.Rd +++ b/man/decode_with_fallback.Rd @@ -3,23 +3,12 @@ \alias{decode_with_fallback} \title{Decode with Temperature Fallback} \usage{ -decode_with_fallback( - model, - encoder_output, - initial_tokens, - tokenizer, - temperatures = c(0, 0.2, 0.4, 0.6, 0.8, 1), - beam_size = 5L, - best_of = 5L, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - device -) +decode_with_fallback(model, encoder_output, initial_tokens, tokenizer, + temperatures = c(0, 0.2, 0.4, 0.6, 0.8, 1), + beam_size = 5L, best_of = 5L, max_length = 448L, + timestamps = FALSE, word_timestamps = FALSE, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/detect_language.Rd b/man/detect_language.Rd new file mode 100644 index 0000000..f580f97 --- /dev/null +++ b/man/detect_language.Rd @@ -0,0 +1,44 @@ +% tinyrox says don't edit this manually, but it can't stop you! 
+\name{detect_language} +\alias{detect_language} +\title{Language Detection} +\usage{ +detect_language(file, model = "tiny", device = "auto", dtype = "auto", + top_k = 5L, download = TRUE, verbose = TRUE) +} +\arguments{ +\item{file}{Path to audio file (WAV, MP3, etc.)} + +\item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} + +\item{device}{Device: "auto", "cpu", "cuda"} + +\item{dtype}{Data type: "auto", "float16", "float32"} + +\item{top_k}{Number of top language probabilities to return (default: 5)} + +\item{download}{If TRUE and model not present, prompt to download.} + +\item{verbose}{Print loading messages.} +} +\value{ +List with \code{language} (two-letter code) and + \code{probabilities} (named numeric vector of top-k language probs). +} +\description{ +Detect the spoken language in an audio file using Whisper. + +Identify the spoken language in an audio file. Uses Whisper's decoder +to predict the most likely language token from the first 30 seconds +of audio. +} +\examples{ +\donttest{ +if (model_exists("tiny")) { + audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + result <- detect_language(audio_file) + result$language + result$probabilities +} +} +} diff --git a/man/detect_language_from_mel.Rd b/man/detect_language_from_mel.Rd new file mode 100644 index 0000000..428d07e --- /dev/null +++ b/man/detect_language_from_mel.Rd @@ -0,0 +1,24 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{detect_language_from_mel} +\alias{detect_language_from_mel} +\title{Detect Language from Mel Spectrogram} +\usage{ +detect_language_from_mel(model, mel, config, device, top_k = 5L) +} +\arguments{ +\item{model}{WhisperModel} + +\item{mel}{Mel spectrogram tensor} + +\item{config}{Model config} + +\item{device}{torch device} + +\item{top_k}{Number of top probabilities to return} +} +\value{ +List with language code and probabilities +} +\description{ +Core detection logic. 
Feed SOT token to decoder, read language logits. +} diff --git a/man/detect_language_from_pipeline.Rd b/man/detect_language_from_pipeline.Rd new file mode 100644 index 0000000..0dce67b --- /dev/null +++ b/man/detect_language_from_pipeline.Rd @@ -0,0 +1,20 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{detect_language_from_pipeline} +\alias{detect_language_from_pipeline} +\title{Detect Language from Pipeline} +\usage{ +detect_language_from_pipeline(pipe, file, top_k = 5L) +} +\arguments{ +\item{pipe}{A whisper_pipeline object} + +\item{file}{Path to audio file, or numeric vector of audio samples} + +\item{top_k}{Number of top probabilities to return} +} +\value{ +List with language code and probabilities +} +\description{ +Internal function that runs language detection using a pre-loaded pipeline. +} diff --git a/man/get_initial_tokens.Rd b/man/get_initial_tokens.Rd index 99f85db..fb7ebfe 100644 --- a/man/get_initial_tokens.Rd +++ b/man/get_initial_tokens.Rd @@ -3,12 +3,8 @@ \alias{get_initial_tokens} \title{Get Initial Decoder Tokens} \usage{ -get_initial_tokens( - language = "en", - task = "transcribe", - model = "tiny", - timestamps = FALSE -) +get_initial_tokens(language = "en", task = "transcribe", model = "tiny", + timestamps = FALSE) } \arguments{ \item{language}{Two-letter language code or NULL for auto} diff --git a/man/greedy_decode.Rd b/man/greedy_decode.Rd index 8f18b70..6ec3740 100644 --- a/man/greedy_decode.Rd +++ b/man/greedy_decode.Rd @@ -3,16 +3,9 @@ \alias{greedy_decode} \title{Greedy Decoding} \usage{ -greedy_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - device -) +greedy_decode(model, encoder_output, initial_tokens, tokenizer, + max_length = 448L, timestamps = FALSE, word_timestamps = FALSE, + device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/load_whisper_model.Rd b/man/load_whisper_model.Rd index 
219cb37..c62cdc1 100644 --- a/man/load_whisper_model.Rd +++ b/man/load_whisper_model.Rd @@ -3,13 +3,8 @@ \alias{load_whisper_model} \title{Load Whisper Model} \usage{ -load_whisper_model( - model = "tiny", - device = "auto", - dtype = "auto", - download = FALSE, - verbose = TRUE -) +load_whisper_model(model = "tiny", device = "auto", dtype = "auto", + download = FALSE, verbose = TRUE) } \arguments{ \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} diff --git a/man/pipeline_transcribe.Rd b/man/pipeline_transcribe.Rd index e1ea4fe..00621cf 100644 --- a/man/pipeline_transcribe.Rd +++ b/man/pipeline_transcribe.Rd @@ -3,22 +3,11 @@ \alias{pipeline_transcribe} \title{Pipeline Transcribe} \usage{ -pipeline_transcribe( - pipe, - file, - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - verbose = TRUE -) +pipeline_transcribe(pipe, file, language = NULL, task = "transcribe", + timestamps = FALSE, word_timestamps = FALSE, + beam_size = 1L, temperatures = 0, best_of = 1L, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, verbose = TRUE) } \arguments{ \item{pipe}{A whisper_pipeline object.} diff --git a/man/sample_decode.Rd b/man/sample_decode.Rd index 6a85306..b7c76f5 100644 --- a/man/sample_decode.Rd +++ b/man/sample_decode.Rd @@ -3,17 +3,9 @@ \alias{sample_decode} \title{Sample Decode} \usage{ -sample_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - temperature = 0.6, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - device -) +sample_decode(model, encoder_output, initial_tokens, tokenizer, + temperature = 0.6, max_length = 448L, timestamps = FALSE, + word_timestamps = FALSE, device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/transcribe.Rd 
b/man/transcribe.Rd index 3244dc9..e386e69 100644 --- a/man/transcribe.Rd +++ b/man/transcribe.Rd @@ -3,31 +3,19 @@ \alias{transcribe} \title{Transcribe Audio} \usage{ -transcribe( - file, - model = "tiny", - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - device = "auto", - dtype = "auto", - verbose = TRUE -) +transcribe(file, model = "tiny", language = NULL, task = "transcribe", + timestamps = FALSE, word_timestamps = FALSE, beam_size = 1L, + temperatures = 0, best_of = 1L, compression_ratio_threshold = 2.4, + logprob_threshold = -1, length_penalty = 1, patience = Inf, + device = "auto", dtype = "auto", verbose = TRUE) } \arguments{ \item{file}{Path to audio file (WAV, MP3, etc.)} \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} -\item{language}{Language code (e.g., "en", "es"). NULL for auto-detection.} +\item{language}{Language code (e.g., "en", "es"), or NULL (default) for +auto-detection from the audio.} \item{task}{"transcribe" or "translate" (translate to English)} @@ -69,12 +57,17 @@ load the model once. 
} \examples{ \donttest{ -# Transcribe included sample (JFK "ask not" speech) if (model_exists("tiny")) { audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + + # Auto-detect language (default) result <- transcribe(audio_file, model = "tiny") + result$language # "en" result$text + # Explicit language + result <- transcribe(audio_file, model = "tiny", language = "en") + # With timestamps result <- transcribe(audio_file, model = "tiny", timestamps = TRUE) result$segments diff --git a/man/transcribe_chunk.Rd b/man/transcribe_chunk.Rd index 0892538..e209202 100644 --- a/man/transcribe_chunk.Rd +++ b/man/transcribe_chunk.Rd @@ -3,27 +3,12 @@ \alias{transcribe_chunk} \title{Transcribe Single Chunk} \usage{ -transcribe_chunk( - file, - model, - tokenizer, - config, - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - time_offset = 0, - device, - dtype, - verbose = TRUE -) +transcribe_chunk(file, model, tokenizer, config, language = NULL, + task = "transcribe", timestamps = FALSE, + word_timestamps = FALSE, beam_size = 1L, temperatures = 0, + best_of = 1L, compression_ratio_threshold = 2.4, + logprob_threshold = -1, length_penalty = 1, patience = Inf, + time_offset = 0, device, dtype, verbose = TRUE) } \arguments{ \item{file}{Audio file or mel spectrogram} diff --git a/man/transcribe_long.Rd b/man/transcribe_long.Rd index 8afd1ba..ed7e6f3 100644 --- a/man/transcribe_long.Rd +++ b/man/transcribe_long.Rd @@ -3,26 +3,11 @@ \alias{transcribe_long} \title{Transcribe Long Audio} \usage{ -transcribe_long( - file, - model, - tokenizer, - config, - language, - task, - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - 
device, - dtype, - verbose -) +transcribe_long(file, model, tokenizer, config, language, task, + timestamps = FALSE, word_timestamps = FALSE, beam_size = 1L, + temperatures = 0, best_of = 1L, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, device, dtype, verbose) } \arguments{ \item{file}{Audio file} diff --git a/man/whisper_lang_from_id.Rd b/man/whisper_lang_from_id.Rd new file mode 100644 index 0000000..4195496 --- /dev/null +++ b/man/whisper_lang_from_id.Rd @@ -0,0 +1,16 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{whisper_lang_from_id} +\alias{whisper_lang_from_id} +\title{Get Language Code from Token ID} +\usage{ +whisper_lang_from_id(token_id) +} +\arguments{ +\item{token_id}{Integer token ID (e.g., 50259 for English)} +} +\value{ +Two-letter language code +} +\description{ +Reverse lookup: convert a language token ID back to a two-letter code. +} diff --git a/man/whisper_language_table.Rd b/man/whisper_language_table.Rd new file mode 100644 index 0000000..e04674f --- /dev/null +++ b/man/whisper_language_table.Rd @@ -0,0 +1,13 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{whisper_language_table} +\alias{whisper_language_table} +\title{Whisper Language Table} +\usage{ +whisper_language_table() +} +\value{ +Named integer vector (language code -> offset from 50259) +} +\description{ +Returns the named integer vector mapping language codes to offsets. 
+} diff --git a/man/whisper_pipeline.Rd b/man/whisper_pipeline.Rd index df75dd7..8ef9efe 100644 --- a/man/whisper_pipeline.Rd +++ b/man/whisper_pipeline.Rd @@ -3,13 +3,8 @@ \alias{whisper_pipeline} \title{Whisper Transcription} \usage{ -whisper_pipeline( - model = "tiny", - device = "auto", - dtype = "auto", - download = TRUE, - verbose = TRUE -) +whisper_pipeline(model = "tiny", device = "auto", dtype = "auto", + download = TRUE, verbose = TRUE) } \arguments{ \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"}