Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: whisper
Title: Native R 'torch' Implementation of 'OpenAI' 'Whisper'
Version: 0.2.0
Version: 0.3.0
Authors@R: c(
person("Troy", "Hernandez", role = c("aut", "cre"),
email = "troy@cornball.ai"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# tinyrox says don't edit this manually, but it can't stop you!

export(audio_to_mel)
export(detect_language)
export(download_whisper_model)
export(list_downloaded_models)
export(list_whisper_models)
Expand Down
16 changes: 16 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# whisper 0.3.0

* Language auto-detection: `transcribe()` now defaults to `language = NULL`,
which detects the spoken language from the audio before decoding. New
exported function `detect_language()` for standalone language identification.
**Breaking**: previous default was `language = "en"`. Code relying on the
default now auto-detects instead of assuming English. Pass `language = "en"`
explicitly to restore old behavior.
* Segment-level and word-level timestamps via DTW alignment
* Beam search decoding with temperature sampling and fallback
* SDPA attention (FlashAttention on GPU)
* `whisper_pipeline()` for cached model reuse across multiple transcriptions
* Hardcoded special token table (eliminates `added_tokens.json` download)
* Fixed invalid multibyte string crash in BPE decoder
* Fixed DTW boundary guards and seek loop in `transcribe_chunk()`

# whisper 0.1.0

* Initial CRAN submission
Expand Down
38 changes: 30 additions & 8 deletions R/config.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,36 @@ whisper_lang_token <- function(
lang = "en",
model = "tiny"
) {
# Language tokens start at 50259 for all models
langs <- c(
langs <- whisper_language_table()

if (!lang %in% names(langs)) {
stop("Unknown language: ", lang)
}

50259L + langs[[lang]]
}

#' Get Language Code from Token ID
#'
#' Reverse lookup: convert a language token ID back to a two-letter code.
#' Inverse of `whisper_lang_token()`.
#'
#' @param token_id Integer token ID (e.g., 50259 for English)
#' @return Two-letter language code
whisper_lang_from_id <- function(token_id) {
  lang_tbl <- whisper_language_table()
  # Table values are offsets relative to the first language token (50259)
  pos <- match(token_id - 50259L, lang_tbl)
  if (is.na(pos)) {
    stop("Unknown language token ID: ", token_id)
  }
  names(lang_tbl)[pos]
}

#' Whisper Language Table
#'
#' Returns the named integer vector mapping language codes to offsets.
#'
#' @return Named integer vector (language code -> offset from 50259)
whisper_language_table <- function() {
c(
en = 0L, zh = 1L, de = 2L, es = 3L, ru = 4L, ko = 5L, fr = 6L,
ja = 7L, pt = 8L, tr = 9L, pl = 10L, ca = 11L, nl = 12L, ar = 13L,
sv = 14L, it = 15L, id = 16L, hi = 17L, fi = 18L, vi = 19L,
Expand All @@ -165,11 +193,5 @@ whisper_lang_token <- function(
tt = 92L, haw = 93L, ln = 94L, ha = 95L, ba = 96L, jw = 97L,
su = 98L
)

if (!lang %in% names(langs)) {
stop("Unknown language: ", lang)
}

50259L + langs[[lang]]
}

116 changes: 116 additions & 0 deletions R/language.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Language detection ---------------------------------------------------------
#
# Detect the spoken language in an audio file using Whisper.

#' Detect Language
#'
#' Identify the spoken language in an audio file. Uses Whisper's decoder
#' to predict the most likely language token from the first 30 seconds
#' of audio.
#'
#' @param file Path to audio file (WAV, MP3, etc.)
#' @param model Model name: "tiny", "base", "small", "medium", "large-v3"
#' @param device Device: "auto", "cpu", "cuda"
#' @param dtype Data type: "auto", "float16", "float32"
#' @param top_k Number of top language probabilities to return (default: 5)
#' @param download If TRUE and model not present, prompt to download.
#' @param verbose Print loading messages.
#' @return List with \code{language} (two-letter code) and
#'   \code{probabilities} (named numeric vector of top-k language probs).
#' @export
#' @examples
#' \donttest{
#' if (model_exists("tiny")) {
#'   audio_file <- system.file("audio", "jfk.mp3", package = "whisper")
#'   result <- detect_language(audio_file)
#'   result$language
#'   result$probabilities
#' }
#' }
detect_language <- function(
  file,
  model = "tiny",
  device = "auto",
  dtype = "auto",
  top_k = 5L,
  download = TRUE,
  verbose = TRUE
) {
  # Build (or reuse cached) pipeline, then delegate to the internal helper
  wp <- whisper_pipeline(
    model,
    device = device,
    dtype = dtype,
    download = download,
    verbose = verbose
  )
  detect_language_from_pipeline(wp, file, top_k = top_k)
}

#' Detect Language from Pipeline
#'
#' Internal function that runs language detection using a pre-loaded
#' pipeline, avoiding a fresh model load per call.
#'
#' @param pipe A whisper_pipeline object
#' @param file Path to audio file, or numeric vector of audio samples
#' @param top_k Number of top probabilities to return
#' @return List with language code and probabilities
detect_language_from_pipeline <- function(pipe, file, top_k = 5L) {
  config <- pipe$config
  model <- pipe$model
  device <- pipe$device
  dtype <- pipe$dtype

  # Compute mel spectrogram (presumably only the leading 30 s window is
  # consumed downstream — see detect_language_from_mel)
  mel <- audio_to_mel(file, n_mels = config$n_mels, device = device,
                      dtype = dtype)

  # Forward top_k so the caller's setting is honored; previously it was
  # dropped here, silently pinning the result to the default of 5.
  detect_language_from_mel(model, mel, config, device, top_k = top_k)
}

#' Detect Language from Mel Spectrogram
#'
#' Core detection logic. Encodes the mel spectrogram, feeds a single SOT
#' (start-of-transcript) token to the decoder, and reads the logits of the
#' language tokens from the first decoding step.
#'
#' @param model WhisperModel
#' @param mel Mel spectrogram tensor
#' @param config Model config
#' @param device torch device
#' @param top_k Number of top probabilities to return
#' @return List with language code and probabilities
detect_language_from_mel <- function(model, mel, config, device, top_k = 5L) {
  special <- whisper_special_tokens(config$model_name)
  langs <- whisper_language_table()
  n_langs <- length(langs)

  # Language token IDs occupy a contiguous range: 50259 .. 50259 + n_langs - 1
  lang_start <- 50259L
  lang_end <- lang_start + n_langs - 1L

  # No gradients needed for inference; note probs_r is assigned inside this
  # block but remains visible afterwards (with_no_grad evaluates the
  # expression in the calling environment)
  torch::with_no_grad({
    # Encode audio
    encoder_output <- model$encode(mel)

    # Feed just the SOT token to the decoder (batch of 1, sequence of 1)
    sot <- torch::torch_tensor(matrix(special$sot, nrow = 1L),
                               dtype = torch::torch_long(), device = device)

    result <- model$decode(sot, encoder_output)
    # logits shape: (1, 1, n_vocab) — take the distribution for the one step
    logits <- result$logits[1, 1, ]

    # Extract language token logits (R is 1-indexed, token IDs are 0-indexed in vocab)
    # Token ID 50259 is at position 50260 in 1-indexed logits
    lang_logits <- logits[(lang_start + 1L):(lang_end + 1L)]

    # Softmax over language logits only (not the full vocabulary), so the
    # probabilities are conditional on the token being a language token
    probs <- torch::nnf_softmax(lang_logits, dim = 1L)
    probs_r <- as.numeric(probs$cpu())
  })

  # Assumes whisper_language_table() is ordered by token offset (en = 0 first)
  # so positions in probs_r line up with table names — holds for the table in
  # config.R; verify if the table is ever reordered
  names(probs_r) <- names(langs)

  # Top-k languages by probability, capped at the number of languages
  top_idx <- order(probs_r, decreasing = TRUE)[seq_len(min(top_k, n_langs))]
  top_probs <- probs_r[top_idx]

  list(
    language = names(probs_r)[top_idx[1]],
    probabilities = top_probs
  )
}
42 changes: 36 additions & 6 deletions R/transcribe.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ whisper_pipeline <- function(

pipe$transcribe <- function(
file,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -101,7 +101,7 @@ print.whisper_pipeline <- function(x, ...) {
pipeline_transcribe <- function(
pipe,
file,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -159,7 +159,8 @@ pipeline_transcribe <- function(
#'
#' @param file Path to audio file (WAV, MP3, etc.)
#' @param model Model name: "tiny", "base", "small", "medium", "large-v3"
#' @param language Language code (e.g., "en", "es"). NULL for auto-detection.
#' @param language Language code (e.g., "en", "es"), or NULL (default) for
#' auto-detection from the audio.
#' @param task "transcribe" or "translate" (translate to English)
#' @param timestamps If TRUE, return segment-level timestamps
#' @param word_timestamps If TRUE, return word-level timestamps (implies timestamps)
Expand All @@ -181,12 +182,17 @@ pipeline_transcribe <- function(
#' @export
#' @examples
#' \donttest{
#' # Transcribe included sample (JFK "ask not" speech)
#' if (model_exists("tiny")) {
#' audio_file <- system.file("audio", "jfk.mp3", package = "whisper")
#'
#' # Auto-detect language (default)
#' result <- transcribe(audio_file, model = "tiny")
#' result$language # "en"
#' result$text
#'
#' # Explicit language
#' result <- transcribe(audio_file, model = "tiny", language = "en")
#'
#' # With timestamps
#' result <- transcribe(audio_file, model = "tiny", timestamps = TRUE)
#' result$segments
Expand All @@ -201,7 +207,7 @@ pipeline_transcribe <- function(
transcribe <- function(
file,
model = "tiny",
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand Down Expand Up @@ -255,7 +261,7 @@ transcribe_chunk <- function(
model,
tokenizer,
config,
language = "en",
language = NULL,
task = "transcribe",
timestamps = FALSE,
word_timestamps = FALSE,
Expand All @@ -282,6 +288,17 @@ transcribe_chunk <- function(

special <- whisper_special_tokens(config$model_name)

# Auto-detect language if not specified
if (is.null(language)) {
detection <- detect_language_from_mel(model, full_mel, config, device)
language <- detection$language
if (verbose) {
top <- detection$probabilities[1]
message("Detected language: ", language,
" (", round(top * 100, 1), "%)")
}
}

# Seek loop: decode repeatedly, advancing through the mel spectrogram
seek <- 0L # current frame position
all_generated <- integer(0)
Expand Down Expand Up @@ -687,6 +704,19 @@ transcribe_long <- function(
dtype,
verbose
) {
# Auto-detect language from first 30s if not specified
if (is.null(language)) {
mel <- audio_to_mel(file, n_mels = config$n_mels, device = device,
dtype = dtype)
detection <- detect_language_from_mel(model, mel, config, device)
language <- detection$language
if (verbose) {
top <- detection$probabilities[1]
message("Detected language: ", language,
" (", round(top * 100, 1), "%)")
}
}

# Split into chunks
chunk_length <- 30
overlap <- 1
Expand Down
35 changes: 35 additions & 0 deletions inst/tinytest/test_language.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Tests for language detection

# --- Unit tests (no model download required) ---
lang_tbl <- whisper:::whisper_language_table()
expect_equal(length(lang_tbl), 99L)
expect_equal(lang_tbl[["en"]], 0L)
expect_equal(lang_tbl[["su"]], 98L)

# Token ID -> language code reverse lookup
expect_equal(whisper:::whisper_lang_from_id(50259L), "en")
expect_equal(whisper:::whisper_lang_from_id(50260L), "zh")
expect_equal(whisper:::whisper_lang_from_id(50357L), "su")

# Every code survives a round trip: code -> token ID -> code
for (lang_code in names(lang_tbl)) {
  id <- whisper:::whisper_lang_token(lang_code)
  expect_equal(whisper:::whisper_lang_from_id(id), lang_code)
}

# --- Integration tests (require the "tiny" model on disk) ---
if (at_home() && whisper::model_exists("tiny")) {
  audio_file <- system.file("audio", "jfk.mp3", package = "whisper")

  # detect_language should return English for JFK speech
  res <- whisper::detect_language(audio_file, model = "tiny", verbose = FALSE)
  expect_true(is.list(res))
  expect_equal(res$language, "en")
  expect_true(res$probabilities[["en"]] > 0.5)
  expect_true(length(res$probabilities) == 5L)

  # transcribe with the default language = NULL should auto-detect
  res <- whisper::transcribe(audio_file, model = "tiny", verbose = FALSE)
  expect_equal(res$language, "en")
  expect_true(nchar(res$text) > 0)
}
17 changes: 4 additions & 13 deletions man/beam_search_decode.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,10 @@
\alias{beam_search_decode}
\title{Beam Search Decode}
\usage{
beam_search_decode(
model,
encoder_output,
initial_tokens,
tokenizer,
beam_size = 5L,
max_length = 448L,
timestamps = FALSE,
word_timestamps = FALSE,
length_penalty = 1,
patience = Inf,
device
)
beam_search_decode(model, encoder_output, initial_tokens, tokenizer,
beam_size = 5L, max_length = 448L, timestamps = FALSE,
word_timestamps = FALSE, length_penalty = 1, patience = Inf,
device)
}
\arguments{
\item{model}{WhisperModel}
Expand Down
10 changes: 2 additions & 8 deletions man/compute_word_timestamps.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@
\alias{compute_word_timestamps}
\title{Word-Level Timestamp Alignment}
\usage{
compute_word_timestamps(
tokens,
cross_attn_weights,
tokenizer,
config,
time_offset = 0,
sample_begin = 4L
)
compute_word_timestamps(tokens, cross_attn_weights, tokenizer, config,
time_offset = 0, sample_begin = 4L)
}
\arguments{
\item{tokens}{Integer vector of generated token IDs}
Expand Down
7 changes: 2 additions & 5 deletions man/create_mel_filterbank_fallback.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@
\alias{create_mel_filterbank_fallback}
\title{Create Mel Filterbank (Fallback)}
\usage{
create_mel_filterbank_fallback(
n_fft = WHISPER_N_FFT,
n_mels = 80L,
sample_rate = WHISPER_SAMPLE_RATE
)
create_mel_filterbank_fallback(n_fft = WHISPER_N_FFT, n_mels = 80L,
sample_rate = WHISPER_SAMPLE_RATE)
}
\arguments{
\item{n_fft}{FFT size}
Expand Down
Loading