diff --git a/R/encoder.R b/R/encoder.R
index 284356a..258f687 100644
--- a/R/encoder.R
+++ b/R/encoder.R
@@ -86,8 +86,11 @@ whisper_attention <- torch::nn_module(
       attn_output <- torch::torch_matmul(attn_weights, v)
     } else {
       # Scaled dot-product attention (dispatches to FlashAttention on GPU)
-      attn_output <- torch:::torch_scaled_dot_product_attention(
-        q, k, v, is_causal = !is.null(mask))
+      # torch_scaled_dot_product_attention is not yet exported from torch
+      # (will be in next CRAN release). Use get() to avoid R CMD check NOTE.
+      sdpa <- get("torch_scaled_dot_product_attention",
+                  envir = asNamespace("torch"))
+      attn_output <- sdpa(q, k, v, is_causal = !is.null(mask))
     }
 
     # Reshape back: (batch, n_head, seq_len, head_dim) -> (batch, seq_len, n_state)
diff --git a/cran-comments.md b/cran-comments.md
index ee63bf8..7081ec9 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,35 +1,33 @@
 ## R CMD check results
 
-0 errors | 0 warnings | 1 note
-
-* This is a new submission.
+0 errors | 0 warnings | 0 notes
 
 ## Test environments
 
-* local Ubuntu 24.04, R 4.4.x
+* local Ubuntu 24.04, R 4.5.2
 * GitHub Actions (r-ci): ubuntu-latest, macos-latest
+* win-builder R-devel
 
-## Package dependencies
-
-This package requires:
-- torch: Neural network operations
-- av: Audio file loading (FFmpeg bindings)
-- jsonlite: JSON parsing for tokenizer files
-- hfhub: HuggingFace model downloads
-- safetensors: Model weight loading
+## Changes since last CRAN release (0.1.0)
 
-## Resubmission changes
-
-- Added cornball.ai as copyright holder (`cph`) in `Authors@R` to match
-  the LICENSE file.
-- Added OpenAI as copyright holder (`cph`) in `Authors@R`. The bundled mel
-  filterbank data (`inst/assets/mel_80.csv`, `mel_128.csv`) is from OpenAI's
-  MIT-licensed Whisper repository, and the model architecture is derived from
-  their specifications.
-- Changed `\dontrun{}` to `\donttest{}` in `download_whisper_model()` examples.
+- `whisper_pipeline()` for cached model reuse across multiple transcriptions
+- SDPA attention (FlashAttention on GPU)
+- Segment-level and word-level timestamps via DTW alignment
+- Beam search decoding with temperature sampling and fallback
+- Automatic language detection (`detect_language()`, `language = NULL` default)
+- Hardcoded special token table (eliminates `added_tokens.json` download)
+- Fixed invalid multibyte string crash in BPE decoder
+- Fixed DTW boundary guards and seek loop in `transcribe_chunk()`
+- Fixed win-builder test crash: added `torch_is_installed()` guard
 
 ## Notes
 
+This package uses `torch_scaled_dot_product_attention()` from torch,
+which is not yet exported in the current CRAN release. We access it via
+`get("torch_scaled_dot_product_attention", envir = asNamespace("torch"))`.
+The function will be exported in the next torch release
+(PR: https://github.com/mlverse/torch/pull/1404).
+
 This package provides a native R implementation of OpenAI's Whisper
 speech-to-text model. Model weights are downloaded from HuggingFace on
 first use (ranging from 145MB for tiny to 3GB for large-v3).