diff --git a/DESCRIPTION b/DESCRIPTION index 5188d10..a0cb168 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: whisper Title: Native R 'torch' Implementation of 'OpenAI' 'Whisper' -Version: 0.2.0 +Version: 0.3.0 Authors@R: c( person("Troy", "Hernandez", role = c("aut", "cre"), email = "troy@cornball.ai"), diff --git a/NAMESPACE b/NAMESPACE index f3861cf..70c9564 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # tinyrox says don't edit this manually, but it can't stop you! export(audio_to_mel) +export(detect_language) export(download_whisper_model) export(list_downloaded_models) export(list_whisper_models) diff --git a/NEWS.md b/NEWS.md index eaa95eb..8205c24 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# whisper 0.3.0 + +* Language auto-detection: `transcribe()` now defaults to `language = NULL`, + which detects the spoken language from the audio before decoding. New + exported function `detect_language()` for standalone language identification. + **Breaking**: previous default was `language = "en"`. Code relying on the + default now auto-detects instead of assuming English. Pass `language = "en"` + explicitly to restore old behavior. 
+* Segment-level and word-level timestamps via DTW alignment +* Beam search decoding with temperature sampling and fallback +* SDPA attention (FlashAttention on GPU) +* `whisper_pipeline()` for cached model reuse across multiple transcriptions +* Hardcoded special token table (eliminates `added_tokens.json` download) +* Fixed invalid multibyte string crash in BPE decoder +* Fixed DTW boundary guards and seek loop in `transcribe_chunk()` + # whisper 0.1.0 * Initial CRAN submission diff --git a/R/config.R b/R/config.R index 52f18fa..4b9fcd9 100644 --- a/R/config.R +++ b/R/config.R @@ -145,8 +145,36 @@ whisper_lang_token <- function( lang = "en", model = "tiny" ) { - # Language tokens start at 50259 for all models - langs <- c( + langs <- whisper_language_table() + + if (!lang %in% names(langs)) { + stop("Unknown language: ", lang) + } + + 50259L + langs[[lang]] +} + +#' Get Language Code from Token ID +#' +#' Reverse lookup: convert a language token ID back to a two-letter code. +#' +#' @param token_id Integer token ID (e.g., 50259 for English) +#' @return Two-letter language code +whisper_lang_from_id <- function(token_id) { + offset <- token_id - 50259L + langs <- whisper_language_table() + idx <- match(offset, langs) + if (is.na(idx)) stop("Unknown language token ID: ", token_id) + names(langs)[idx] +} + +#' Whisper Language Table +#' +#' Returns the named integer vector mapping language codes to offsets. 
+#' +#' @return Named integer vector (language code -> offset from 50259) +whisper_language_table <- function() { + c( en = 0L, zh = 1L, de = 2L, es = 3L, ru = 4L, ko = 5L, fr = 6L, ja = 7L, pt = 8L, tr = 9L, pl = 10L, ca = 11L, nl = 12L, ar = 13L, sv = 14L, it = 15L, id = 16L, hi = 17L, fi = 18L, vi = 19L, @@ -165,11 +193,5 @@ whisper_lang_token <- function( tt = 92L, haw = 93L, ln = 94L, ha = 95L, ba = 96L, jw = 97L, su = 98L ) - - if (!lang %in% names(langs)) { - stop("Unknown language: ", lang) - } - - 50259L + langs[[lang]] } diff --git a/R/language.R b/R/language.R new file mode 100644 index 0000000..d18798b --- /dev/null +++ b/R/language.R @@ -0,0 +1,116 @@ +#' Language Detection +#' +#' Detect the spoken language in an audio file using Whisper. + +#' Detect Language +#' +#' Identify the spoken language in an audio file. Uses Whisper's decoder +#' to predict the most likely language token from the first 30 seconds +#' of audio. +#' +#' @param file Path to audio file (WAV, MP3, etc.) +#' @param model Model name: "tiny", "base", "small", "medium", "large-v3" +#' @param device Device: "auto", "cpu", "cuda" +#' @param dtype Data type: "auto", "float16", "float32" +#' @param top_k Number of top language probabilities to return (default: 5) +#' @param download If TRUE and model not present, prompt to download. +#' @param verbose Print loading messages. +#' @return List with \code{language} (two-letter code) and +#' \code{probabilities} (named numeric vector of top-k language probs). 
+#' @export +#' @examples +#' \donttest{ +#' if (model_exists("tiny")) { +#' audio_file <- system.file("audio", "jfk.mp3", package = "whisper") +#' result <- detect_language(audio_file) +#' result$language +#' result$probabilities +#' } +#' } +detect_language <- function( + file, + model = "tiny", + device = "auto", + dtype = "auto", + top_k = 5L, + download = TRUE, + verbose = TRUE +) { + pipe <- whisper_pipeline(model, device = device, dtype = dtype, + download = download, verbose = verbose) + + detect_language_from_pipeline(pipe, file, top_k = top_k) +} + +#' Detect Language from Pipeline +#' +#' Internal function that runs language detection using a pre-loaded pipeline. +#' +#' @param pipe A whisper_pipeline object +#' @param file Path to audio file, or numeric vector of audio samples +#' @param top_k Number of top probabilities to return +#' @return List with language code and probabilities +detect_language_from_pipeline <- function(pipe, file, top_k = 5L) { + config <- pipe$config + model <- pipe$model + device <- pipe$device + dtype <- pipe$dtype + + # Compute mel spectrogram from first 30s + mel <- audio_to_mel(file, n_mels = config$n_mels, device = device, + dtype = dtype) + + detect_language_from_mel(model, mel, config, device, top_k = top_k) +} + +#' Detect Language from Mel Spectrogram +#' +#' Core detection logic. Feed SOT token to decoder, read language logits. 
+#' +#' @param model WhisperModel +#' @param mel Mel spectrogram tensor +#' @param config Model config +#' @param device torch device +#' @param top_k Number of top probabilities to return +#' @return List with language code and probabilities +detect_language_from_mel <- function(model, mel, config, device, top_k = 5L) { + special <- whisper_special_tokens(config$model_name) + langs <- whisper_language_table() + n_langs <- length(langs) + + # Language token IDs: 50259 to 50259 + n_langs - 1 + lang_start <- 50259L + lang_end <- lang_start + n_langs - 1L + + torch::with_no_grad({ + # Encode audio + encoder_output <- model$encode(mel) + + # Feed just the SOT token to the decoder + sot <- torch::torch_tensor(matrix(special$sot, nrow = 1L), + dtype = torch::torch_long(), device = device) + + result <- model$decode(sot, encoder_output) + # logits shape: (1, 1, n_vocab) + logits <- result$logits[1, 1, ] + + # Extract language token logits (R is 1-indexed, token IDs are 0-indexed in vocab) + # Token ID 50259 is at position 50260 in 1-indexed logits + lang_logits <- logits[(lang_start + 1L):(lang_end + 1L)] + + # Softmax over language logits only + probs <- torch::nnf_softmax(lang_logits, dim = 1L) + probs_r <- as.numeric(probs$cpu()) + }) + + names(probs_r) <- names(langs) + + # Find top-k + top_idx <- order(probs_r, decreasing = TRUE)[seq_len(min(top_k, n_langs))] + top_probs <- probs_r[top_idx] + + list( + language = names(probs_r)[top_idx[1]], + probabilities = top_probs + ) +} diff --git a/R/transcribe.R b/R/transcribe.R index 0c2a118..8b58286 100644 --- a/R/transcribe.R +++ b/R/transcribe.R @@ -46,7 +46,7 @@ whisper_pipeline <- function( pipe$transcribe <- function( file, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -101,7 +101,7 @@ print.whisper_pipeline <- function(x, ...) 
{ pipeline_transcribe <- function( pipe, file, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -159,7 +159,8 @@ pipeline_transcribe <- function( #' #' @param file Path to audio file (WAV, MP3, etc.) #' @param model Model name: "tiny", "base", "small", "medium", "large-v3" -#' @param language Language code (e.g., "en", "es"). NULL for auto-detection. +#' @param language Language code (e.g., "en", "es"), or NULL (default) for +#' auto-detection from the audio. #' @param task "transcribe" or "translate" (translate to English) #' @param timestamps If TRUE, return segment-level timestamps #' @param word_timestamps If TRUE, return word-level timestamps (implies timestamps) @@ -181,12 +182,17 @@ pipeline_transcribe <- function( #' @export #' @examples #' \donttest{ -#' # Transcribe included sample (JFK "ask not" speech) #' if (model_exists("tiny")) { #' audio_file <- system.file("audio", "jfk.mp3", package = "whisper") +#' +#' # Auto-detect language (default) #' result <- transcribe(audio_file, model = "tiny") +#' result$language # "en" #' result$text #' +#' # Explicit language +#' result <- transcribe(audio_file, model = "tiny", language = "en") +#' #' # With timestamps #' result <- transcribe(audio_file, model = "tiny", timestamps = TRUE) #' result$segments @@ -201,7 +207,7 @@ pipeline_transcribe <- function( transcribe <- function( file, model = "tiny", - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -255,7 +261,7 @@ transcribe_chunk <- function( model, tokenizer, config, - language = "en", + language = NULL, task = "transcribe", timestamps = FALSE, word_timestamps = FALSE, @@ -282,6 +288,17 @@ transcribe_chunk <- function( special <- whisper_special_tokens(config$model_name) + # Auto-detect language if not specified + if (is.null(language)) { + detection <- detect_language_from_mel(model, full_mel, config, device) + language <- detection$language + if 
(verbose) { + top <- detection$probabilities[1] + message("Detected language: ", language, + " (", round(top * 100, 1), "%)") + } + } + # Seek loop: decode repeatedly, advancing through the mel spectrogram seek <- 0L # current frame position all_generated <- integer(0) @@ -687,6 +704,19 @@ transcribe_long <- function( dtype, verbose ) { + # Auto-detect language from first 30s if not specified + if (is.null(language)) { + mel <- audio_to_mel(file, n_mels = config$n_mels, device = device, + dtype = dtype) + detection <- detect_language_from_mel(model, mel, config, device) + language <- detection$language + if (verbose) { + top <- detection$probabilities[1] + message("Detected language: ", language, + " (", round(top * 100, 1), "%)") + } + } + # Split into chunks chunk_length <- 30 overlap <- 1 diff --git a/inst/tinytest/test_language.R b/inst/tinytest/test_language.R new file mode 100644 index 0000000..3d5e27d --- /dev/null +++ b/inst/tinytest/test_language.R @@ -0,0 +1,35 @@ +# Test language detection + +# Unit tests (no model needed) +langs <- whisper:::whisper_language_table() +expect_equal(length(langs), 99L) +expect_equal(langs[["en"]], 0L) +expect_equal(langs[["su"]], 98L) + +# Reverse lookup +expect_equal(whisper:::whisper_lang_from_id(50259L), "en") +expect_equal(whisper:::whisper_lang_from_id(50260L), "zh") +expect_equal(whisper:::whisper_lang_from_id(50357L), "su") + +# Round-trip: lang code -> token ID -> lang code +for (code in names(langs)) { + token_id <- whisper:::whisper_lang_token(code) + expect_equal(whisper:::whisper_lang_from_id(token_id), code) +} + +# Integration tests (need model) +if (at_home() && whisper::model_exists("tiny")) { + audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + + # detect_language should return English for JFK speech + result <- whisper::detect_language(audio_file, model = "tiny", verbose = FALSE) + expect_true(is.list(result)) + expect_equal(result$language, "en") + 
expect_true(result$probabilities[["en"]] > 0.5) + expect_true(length(result$probabilities) == 5L) + + # transcribe with language = NULL should auto-detect + result <- whisper::transcribe(audio_file, model = "tiny", verbose = FALSE) + expect_equal(result$language, "en") + expect_true(nchar(result$text) > 0) +} diff --git a/man/beam_search_decode.Rd b/man/beam_search_decode.Rd index 082cff5..061a8fc 100644 --- a/man/beam_search_decode.Rd +++ b/man/beam_search_decode.Rd @@ -3,19 +3,10 @@ \alias{beam_search_decode} \title{Beam Search Decode} \usage{ -beam_search_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - beam_size = 5L, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - length_penalty = 1, - patience = Inf, - device -) +beam_search_decode(model, encoder_output, initial_tokens, tokenizer, + beam_size = 5L, max_length = 448L, timestamps = FALSE, + word_timestamps = FALSE, length_penalty = 1, patience = Inf, + device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/compute_word_timestamps.Rd b/man/compute_word_timestamps.Rd index 17198ff..f963c4a 100644 --- a/man/compute_word_timestamps.Rd +++ b/man/compute_word_timestamps.Rd @@ -3,14 +3,8 @@ \alias{compute_word_timestamps} \title{Word-Level Timestamp Alignment} \usage{ -compute_word_timestamps( - tokens, - cross_attn_weights, - tokenizer, - config, - time_offset = 0, - sample_begin = 4L -) +compute_word_timestamps(tokens, cross_attn_weights, tokenizer, config, + time_offset = 0, sample_begin = 4L) } \arguments{ \item{tokens}{Integer vector of generated token IDs} diff --git a/man/create_mel_filterbank_fallback.Rd b/man/create_mel_filterbank_fallback.Rd index 6d01f8d..565a00e 100644 --- a/man/create_mel_filterbank_fallback.Rd +++ b/man/create_mel_filterbank_fallback.Rd @@ -3,11 +3,8 @@ \alias{create_mel_filterbank_fallback} \title{Create Mel Filterbank (Fallback)} \usage{ -create_mel_filterbank_fallback( - n_fft = WHISPER_N_FFT, - n_mels = 80L, - sample_rate = 
WHISPER_SAMPLE_RATE -) +create_mel_filterbank_fallback(n_fft = WHISPER_N_FFT, n_mels = 80L, + sample_rate = WHISPER_SAMPLE_RATE) } \arguments{ \item{n_fft}{FFT size} diff --git a/man/decode_with_fallback.Rd b/man/decode_with_fallback.Rd index 425f26e..b1a6db4 100644 --- a/man/decode_with_fallback.Rd +++ b/man/decode_with_fallback.Rd @@ -3,23 +3,12 @@ \alias{decode_with_fallback} \title{Decode with Temperature Fallback} \usage{ -decode_with_fallback( - model, - encoder_output, - initial_tokens, - tokenizer, - temperatures = c(0, 0.2, 0.4, 0.6, 0.8, 1), - beam_size = 5L, - best_of = 5L, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - device -) +decode_with_fallback(model, encoder_output, initial_tokens, tokenizer, + temperatures = c(0, 0.2, 0.4, 0.6, 0.8, 1), + beam_size = 5L, best_of = 5L, max_length = 448L, + timestamps = FALSE, word_timestamps = FALSE, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/detect_language.Rd b/man/detect_language.Rd new file mode 100644 index 0000000..f580f97 --- /dev/null +++ b/man/detect_language.Rd @@ -0,0 +1,44 @@ +% tinyrox says don't edit this manually, but it can't stop you! 
+\name{detect_language} +\alias{detect_language} +\title{Language Detection} +\usage{ +detect_language(file, model = "tiny", device = "auto", dtype = "auto", + top_k = 5L, download = TRUE, verbose = TRUE) +} +\arguments{ +\item{file}{Path to audio file (WAV, MP3, etc.)} + +\item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} + +\item{device}{Device: "auto", "cpu", "cuda"} + +\item{dtype}{Data type: "auto", "float16", "float32"} + +\item{top_k}{Number of top language probabilities to return (default: 5)} + +\item{download}{If TRUE and model not present, prompt to download.} + +\item{verbose}{Print loading messages.} +} +\value{ +List with \code{language} (two-letter code) and + \code{probabilities} (named numeric vector of top-k language probs). +} +\description{ +Detect the spoken language in an audio file using Whisper. + +Identify the spoken language in an audio file. Uses Whisper's decoder +to predict the most likely language token from the first 30 seconds +of audio. +} +\examples{ +\donttest{ +if (model_exists("tiny")) { + audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + result <- detect_language(audio_file) + result$language + result$probabilities +} +} +} diff --git a/man/detect_language_from_mel.Rd b/man/detect_language_from_mel.Rd new file mode 100644 index 0000000..428d07e --- /dev/null +++ b/man/detect_language_from_mel.Rd @@ -0,0 +1,24 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{detect_language_from_mel} +\alias{detect_language_from_mel} +\title{Detect Language from Mel Spectrogram} +\usage{ +detect_language_from_mel(model, mel, config, device, top_k = 5L) +} +\arguments{ +\item{model}{WhisperModel} + +\item{mel}{Mel spectrogram tensor} + +\item{config}{Model config} + +\item{device}{torch device} + +\item{top_k}{Number of top probabilities to return} +} +\value{ +List with language code and probabilities +} +\description{ +Core detection logic. 
Feed SOT token to decoder, read language logits. +} diff --git a/man/detect_language_from_pipeline.Rd b/man/detect_language_from_pipeline.Rd new file mode 100644 index 0000000..0dce67b --- /dev/null +++ b/man/detect_language_from_pipeline.Rd @@ -0,0 +1,20 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{detect_language_from_pipeline} +\alias{detect_language_from_pipeline} +\title{Detect Language from Pipeline} +\usage{ +detect_language_from_pipeline(pipe, file, top_k = 5L) +} +\arguments{ +\item{pipe}{A whisper_pipeline object} + +\item{file}{Path to audio file, or numeric vector of audio samples} + +\item{top_k}{Number of top probabilities to return} +} +\value{ +List with language code and probabilities +} +\description{ +Internal function that runs language detection using a pre-loaded pipeline. +} diff --git a/man/get_initial_tokens.Rd b/man/get_initial_tokens.Rd index 99f85db..fb7ebfe 100644 --- a/man/get_initial_tokens.Rd +++ b/man/get_initial_tokens.Rd @@ -3,12 +3,8 @@ \alias{get_initial_tokens} \title{Get Initial Decoder Tokens} \usage{ -get_initial_tokens( - language = "en", - task = "transcribe", - model = "tiny", - timestamps = FALSE -) +get_initial_tokens(language = "en", task = "transcribe", model = "tiny", + timestamps = FALSE) } \arguments{ \item{language}{Two-letter language code or NULL for auto} diff --git a/man/greedy_decode.Rd b/man/greedy_decode.Rd index 8f18b70..6ec3740 100644 --- a/man/greedy_decode.Rd +++ b/man/greedy_decode.Rd @@ -3,16 +3,9 @@ \alias{greedy_decode} \title{Greedy Decoding} \usage{ -greedy_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - device -) +greedy_decode(model, encoder_output, initial_tokens, tokenizer, + max_length = 448L, timestamps = FALSE, word_timestamps = FALSE, + device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/load_whisper_model.Rd b/man/load_whisper_model.Rd index 
219cb37..c62cdc1 100644 --- a/man/load_whisper_model.Rd +++ b/man/load_whisper_model.Rd @@ -3,13 +3,8 @@ \alias{load_whisper_model} \title{Load Whisper Model} \usage{ -load_whisper_model( - model = "tiny", - device = "auto", - dtype = "auto", - download = FALSE, - verbose = TRUE -) +load_whisper_model(model = "tiny", device = "auto", dtype = "auto", + download = FALSE, verbose = TRUE) } \arguments{ \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} diff --git a/man/pipeline_transcribe.Rd b/man/pipeline_transcribe.Rd index e1ea4fe..00621cf 100644 --- a/man/pipeline_transcribe.Rd +++ b/man/pipeline_transcribe.Rd @@ -3,22 +3,11 @@ \alias{pipeline_transcribe} \title{Pipeline Transcribe} \usage{ -pipeline_transcribe( - pipe, - file, - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - verbose = TRUE -) +pipeline_transcribe(pipe, file, language = NULL, task = "transcribe", + timestamps = FALSE, word_timestamps = FALSE, + beam_size = 1L, temperatures = 0, best_of = 1L, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, verbose = TRUE) } \arguments{ \item{pipe}{A whisper_pipeline object.} diff --git a/man/sample_decode.Rd b/man/sample_decode.Rd index 6a85306..b7c76f5 100644 --- a/man/sample_decode.Rd +++ b/man/sample_decode.Rd @@ -3,17 +3,9 @@ \alias{sample_decode} \title{Sample Decode} \usage{ -sample_decode( - model, - encoder_output, - initial_tokens, - tokenizer, - temperature = 0.6, - max_length = 448L, - timestamps = FALSE, - word_timestamps = FALSE, - device -) +sample_decode(model, encoder_output, initial_tokens, tokenizer, + temperature = 0.6, max_length = 448L, timestamps = FALSE, + word_timestamps = FALSE, device) } \arguments{ \item{model}{WhisperModel} diff --git a/man/transcribe.Rd 
b/man/transcribe.Rd index 3244dc9..e386e69 100644 --- a/man/transcribe.Rd +++ b/man/transcribe.Rd @@ -3,31 +3,19 @@ \alias{transcribe} \title{Transcribe Audio} \usage{ -transcribe( - file, - model = "tiny", - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - device = "auto", - dtype = "auto", - verbose = TRUE -) +transcribe(file, model = "tiny", language = NULL, task = "transcribe", + timestamps = FALSE, word_timestamps = FALSE, beam_size = 1L, + temperatures = 0, best_of = 1L, compression_ratio_threshold = 2.4, + logprob_threshold = -1, length_penalty = 1, patience = Inf, + device = "auto", dtype = "auto", verbose = TRUE) } \arguments{ \item{file}{Path to audio file (WAV, MP3, etc.)} \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"} -\item{language}{Language code (e.g., "en", "es"). NULL for auto-detection.} +\item{language}{Language code (e.g., "en", "es"), or NULL (default) for +auto-detection from the audio.} \item{task}{"transcribe" or "translate" (translate to English)} @@ -69,12 +57,17 @@ load the model once. 
} \examples{ \donttest{ -# Transcribe included sample (JFK "ask not" speech) if (model_exists("tiny")) { audio_file <- system.file("audio", "jfk.mp3", package = "whisper") + + # Auto-detect language (default) result <- transcribe(audio_file, model = "tiny") + result$language # "en" result$text + # Explicit language + result <- transcribe(audio_file, model = "tiny", language = "en") + # With timestamps result <- transcribe(audio_file, model = "tiny", timestamps = TRUE) result$segments diff --git a/man/transcribe_chunk.Rd b/man/transcribe_chunk.Rd index 0892538..e209202 100644 --- a/man/transcribe_chunk.Rd +++ b/man/transcribe_chunk.Rd @@ -3,27 +3,12 @@ \alias{transcribe_chunk} \title{Transcribe Single Chunk} \usage{ -transcribe_chunk( - file, - model, - tokenizer, - config, - language = "en", - task = "transcribe", - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - time_offset = 0, - device, - dtype, - verbose = TRUE -) +transcribe_chunk(file, model, tokenizer, config, language = NULL, + task = "transcribe", timestamps = FALSE, + word_timestamps = FALSE, beam_size = 1L, temperatures = 0, + best_of = 1L, compression_ratio_threshold = 2.4, + logprob_threshold = -1, length_penalty = 1, patience = Inf, + time_offset = 0, device, dtype, verbose = TRUE) } \arguments{ \item{file}{Audio file or mel spectrogram} diff --git a/man/transcribe_long.Rd b/man/transcribe_long.Rd index 8afd1ba..ed7e6f3 100644 --- a/man/transcribe_long.Rd +++ b/man/transcribe_long.Rd @@ -3,26 +3,11 @@ \alias{transcribe_long} \title{Transcribe Long Audio} \usage{ -transcribe_long( - file, - model, - tokenizer, - config, - language, - task, - timestamps = FALSE, - word_timestamps = FALSE, - beam_size = 1L, - temperatures = 0, - best_of = 1L, - compression_ratio_threshold = 2.4, - logprob_threshold = -1, - length_penalty = 1, - patience = Inf, - 
device, - dtype, - verbose -) +transcribe_long(file, model, tokenizer, config, language, task, + timestamps = FALSE, word_timestamps = FALSE, beam_size = 1L, + temperatures = 0, best_of = 1L, + compression_ratio_threshold = 2.4, logprob_threshold = -1, + length_penalty = 1, patience = Inf, device, dtype, verbose) } \arguments{ \item{file}{Audio file} diff --git a/man/whisper_lang_from_id.Rd b/man/whisper_lang_from_id.Rd new file mode 100644 index 0000000..4195496 --- /dev/null +++ b/man/whisper_lang_from_id.Rd @@ -0,0 +1,16 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{whisper_lang_from_id} +\alias{whisper_lang_from_id} +\title{Get Language Code from Token ID} +\usage{ +whisper_lang_from_id(token_id) +} +\arguments{ +\item{token_id}{Integer token ID (e.g., 50259 for English)} +} +\value{ +Two-letter language code +} +\description{ +Reverse lookup: convert a language token ID back to a two-letter code. +} diff --git a/man/whisper_language_table.Rd b/man/whisper_language_table.Rd new file mode 100644 index 0000000..e04674f --- /dev/null +++ b/man/whisper_language_table.Rd @@ -0,0 +1,13 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{whisper_language_table} +\alias{whisper_language_table} +\title{Whisper Language Table} +\usage{ +whisper_language_table() +} +\value{ +Named integer vector (language code -> offset from 50259) +} +\description{ +Returns the named integer vector mapping language codes to offsets. 
+} diff --git a/man/whisper_pipeline.Rd b/man/whisper_pipeline.Rd index df75dd7..8ef9efe 100644 --- a/man/whisper_pipeline.Rd +++ b/man/whisper_pipeline.Rd @@ -3,13 +3,8 @@ \alias{whisper_pipeline} \title{Whisper Transcription} \usage{ -whisper_pipeline( - model = "tiny", - device = "auto", - dtype = "auto", - download = TRUE, - verbose = TRUE -) +whisper_pipeline(model = "tiny", device = "auto", dtype = "auto", + download = TRUE, verbose = TRUE) } \arguments{ \item{model}{Model name: "tiny", "base", "small", "medium", "large-v3"}