From 7b15089013d9c67f5abf5f844ea8052b41f40a1d Mon Sep 17 00:00:00 2001 From: Friedas ShitMac Date: Sat, 24 Jan 2026 16:30:21 +0100 Subject: [PATCH 1/2] Switch sag to MiniMax TTS --- README.md | 68 +++-- cmd/api_key.go | 6 +- cmd/api_key_test.go | 36 +-- cmd/http_testutil.go | 20 ++ cmd/prompting.go | 2 +- cmd/prompting_guide.md | 82 ++---- cmd/root.go | 10 +- cmd/speak.go | 359 +++++++++++++------------- cmd/speak_integration_test.go | 103 +++++--- cmd/speak_request_test.go | 122 +++++---- cmd/speak_test.go | 256 +++++++++---------- cmd/voices.go | 18 +- cmd/voices_test.go | 23 +- docs/spec.md | 39 ++- internal/elevenlabs/client_test.go | 60 +++-- internal/minimax/client.go | 389 +++++++++++++++++++++++++++++ internal/minimax/doc.go | 2 + 17 files changed, 992 insertions(+), 603 deletions(-) create mode 100644 cmd/http_testutil.go create mode 100644 internal/minimax/client.go create mode 100644 internal/minimax/doc.go diff --git a/README.md b/README.md index 3184546..4d880ec 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# sag 🗣️ — “Mac-style speech with ElevenLabs” +# sag 🗣️ — “Mac-style speech with MiniMax” One-liner TTS that works like `say`: stream to speakers by default, list voices, or save audio files. @@ -15,9 +15,10 @@ go install ./cmd/sag Requires Go 1.24+. ## Configuration -- `ELEVENLABS_API_KEY` (required) -- `--api-key-file` or `ELEVENLABS_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file -- Optional defaults: `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` +- `MINIMAX_API_KEY` (required; fallback `SAG_API_KEY`) +- `--api-key-file` or `MINIMAX_API_KEY_FILE`/`SAG_API_KEY_FILE` to load the key from a file +- Optional defaults: `MINIMAX_VOICE_ID` or `SAG_VOICE_ID` +- `--base-url` to override the API host (default `https://api.minimax.io`; some regions use `https://api-uw.minimax.io`) ## Usage @@ -25,12 +26,12 @@ Features: - macOS `say`-style default: `sag "Hello"` routes to `speak` automatically. - Streaming playback to speakers with optional file output. - Voice discovery via `sag voices` and `-v ?`. -- Speed/rate controls, latency tiers, and format inference from output extension. -- Model selection via `--model-id` (defaults to `eleven_v3`; use `eleven_multilingual_v2` for a stable baseline). +- Speed/rate controls and format inference from output extension. +- Model selection via `--model-id` (defaults to `speech-01`). Speak (streams audio): ```bash -sag speak -v Roger "Hello world" +sag speak -v "Your Voice" "Hello world" ``` Call it like macOS `say`: omitting the subcommand pipes text to `speak` by default. @@ -40,36 +41,33 @@ sag "Hello world" macOS `say` compatibility shortcuts (subcommand optional): ```bash -sag -v Roger -r 200 "Faster speech" +sag -v "Your Voice" -r 200 "Faster speech" sag -o out.mp3 "Save to file" sag -v ? # list voices ``` More examples: ```bash -echo "piped input" | sag speak -v Roger -sag speak -v Roger --stream --latency-tier 3 "Faster start" -sag speak -v Roger --speed 1.2 "Talk a bit faster" -sag speak -v Roger --model-id eleven_multilingual_v2 "Use stable v2 baseline" -sag speak -v Roger --output out.wav --format pcm_44100 "Wave output" +echo "piped input" | sag speak -v "Your Voice" +sag speak -v "Your Voice" --speed 1.2 "Talk a bit faster" +sag speak -v "Your Voice" --emotion happy "Great news, everyone!" +sag speak -v "Your Voice" --output out.wav --format wav "Wave output" ``` Key flags (subset): - `-v, --voice` voice name or ID (`?` to list) - `--api-key-file` read API key from a file -- `-r, --rate` words per minute (maps to ElevenLabs speed; default 175) +- `-r, --rate` words per minute (maps to speed; default 175) - `-f, --input-file` read text from file (`-` for stdin) -- `-o, --output` write audio file; format inferred by extension (`.wav` -> PCM, `.mp3` -> MP3) +- `-o, --output` write audio file; format inferred by extension (`.wav` -> WAV, `.mp3` -> MP3) - `--speed` explicit speed multiplier (0.5–2.0) -- `--stability` v3: `0|0.5|1` (Creative/Natural/Robust); v2/v2.5: 0..1 (higher = more consistent, less expressive) -- `--similarity` / `--similarity-boost` 0..1 (higher = closer to the reference voice) -- `--style` 0..1 (higher = more stylized delivery; model/voice dependent) -- `--speaker-boost` / `--no-speaker-boost` toggle clarity boost (model dependent) -- `--seed` 0..4294967295 best-effort repeatability across runs -- `--normalize` `auto|on|off` numbers/units/URLs normalization (when set) -- `--lang` `en|de|fr|...` 2-letter ISO 639-1 language code (when set) +- `--emotion` (model dependent) +- `--pitch` (model dependent) +- `--volume` (model dependent) +- `--normalize` `auto|on|off` (text normalization; when set) +- `--lang` language boost hint (e.g. `en`, `zh`, `auto`) +- `--format` output format (e.g. `mp3_44100_128`, `mp3`, `wav`) - `--stream/--no-stream` stream while generating (default on) -- `--latency-tier` 0–4 lower latency tiers - `--play/--no-play` control speaker playback - `--metrics` print basic stats to stderr @@ -85,26 +83,17 @@ sag prompting ``` Highlights: -- v2/v2.5: SSML pauses via `` (v3 does not support SSML breaks). -- v3: use audio tags like `[whispers]` and pause tags like `[short pause]`. -- Use the voice knobs: `--stability`, `--similarity`, `--style`, `--speaker-boost`, plus request controls `--seed`, `--normalize`, `--lang`. +- Keep scripts short and readable; punctuation drives timing. +- Use `--emotion`, `--pitch`, and `--volume` for tone shaping (model dependent). +- `--normalize` and `--lang` help with numbers/units and multilingual output. ## Models / engines -`sag` supports any ElevenLabs `model_id` via `--model-id` (we pass it through). Practical defaults + common IDs: - -| Engine | `--model-id` | Prompting style | Best for | -|---|---|---|---| -| v3 (alpha) | `eleven_v3` (default) | Audio tags like `[whispers]`, `[short pause]` (no SSML ``) | Most expressive / “acting” | -| v2 (stable) | `eleven_multilingual_v2` | SSML `` supported | Reliable baseline, simple prompts | -| v2.5 Flash | `eleven_flash_v2_5` | SSML `` supported | Ultra-low latency (~75ms) + 50% lower price per character | -| v2.5 Turbo | `eleven_turbo_v2_5` | SSML `` supported | Low latency (~250–300ms) + 50% lower price per character | +`sag` supports any MiniMax model ID via `--model-id` (we pass it through). Default is `speech-01`. Notes: -- SSML `` works on v2/v2.5, not v3. Use pause tags on v3 instead. -- Input limits differ by engine (v3: 5,000 chars; v2: 10,000 chars; v2.5 Turbo/Flash: 40,000 chars). If you hit limits, chunk text and stitch audio. -- `--normalize on` may not be available for v2.5 Turbo/Flash (higher latency); prefer `auto`/`off` if it errors. -- Source of truth: ElevenLabs “Models” docs. +- Model availability varies by account/region; see MiniMax docs for the current list. +- If you hit length limits, chunk text and stitch audio. ## Development - With pnpm: @@ -120,6 +109,7 @@ Notes: - Build: `go build ./cmd/sag` ## Limitations -- ElevenLabs account and API key required. +- MiniMax account and API key required. - Voice defaults to first available if not provided. - Non-mac platforms: playback still works via `go-mp3` + `oto`, but device selection flags are no-ops. +- Playback only supports MP3 output; use `--no-play` for WAV/FLAC/PCM. diff --git a/cmd/api_key.go b/cmd/api_key.go index 8b22232..c1ad2ba 100644 --- a/cmd/api_key.go +++ b/cmd/api_key.go @@ -15,13 +15,13 @@ func ensureAPIKey() error { cfg.APIKey = key } if cfg.APIKey == "" { - cfg.APIKey = os.Getenv("ELEVENLABS_API_KEY") + cfg.APIKey = os.Getenv("MINIMAX_API_KEY") } if cfg.APIKey == "" { cfg.APIKey = os.Getenv("SAG_API_KEY") } if cfg.APIKey == "" { - return fmt.Errorf("missing ElevenLabs API key (set --api-key, --api-key-file, or ELEVENLABS_API_KEY)") + return fmt.Errorf("missing MiniMax API key (set --api-key, --api-key-file, MINIMAX_API_KEY, or SAG_API_KEY)") } return nil } @@ -29,7 +29,7 @@ func ensureAPIKey() error { func resolveAPIKeyFromFile() (string, error) { path := cfg.APIKeyFile if path == "" { - path = os.Getenv("ELEVENLABS_API_KEY_FILE") + path = os.Getenv("MINIMAX_API_KEY_FILE") } if path == "" { path = os.Getenv("SAG_API_KEY_FILE") diff --git a/cmd/api_key_test.go b/cmd/api_key_test.go index 2dc052c..c8a5012 100644 --- a/cmd/api_key_test.go +++ b/cmd/api_key_test.go @@ -9,15 +9,15 @@ import ( func keepEnv(t *testing.T) func() { t.Helper() orig := map[string]string{ - "ELEVENLABS_API_KEY": os.Getenv("ELEVENLABS_API_KEY"), + "MINIMAX_API_KEY": os.Getenv("MINIMAX_API_KEY"), "SAG_API_KEY": os.Getenv("SAG_API_KEY"), - "ELEVENLABS_API_KEY_FILE": os.Getenv("ELEVENLABS_API_KEY_FILE"), + "MINIMAX_API_KEY_FILE": os.Getenv("MINIMAX_API_KEY_FILE"), "SAG_API_KEY_FILE": os.Getenv("SAG_API_KEY_FILE"), } return func() { - _ = os.Setenv("ELEVENLABS_API_KEY", orig["ELEVENLABS_API_KEY"]) + _ = os.Setenv("MINIMAX_API_KEY", orig["MINIMAX_API_KEY"]) _ = os.Setenv("SAG_API_KEY", orig["SAG_API_KEY"]) - _ = os.Setenv("ELEVENLABS_API_KEY_FILE", orig["ELEVENLABS_API_KEY_FILE"]) + _ = os.Setenv("MINIMAX_API_KEY_FILE", orig["MINIMAX_API_KEY_FILE"]) _ = os.Setenv("SAG_API_KEY_FILE", orig["SAG_API_KEY_FILE"]) } } @@ -26,9 +26,9 @@ func TestEnsureAPIKeyPrefersCLIValue(t *testing.T) { defer keepEnv(t)() cfg.APIKey = "cli-key" cfg.APIKeyFile = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") _ = os.Unsetenv("SAG_API_KEY") - _ = os.Unsetenv("ELEVENLABS_API_KEY_FILE") + _ = os.Unsetenv("MINIMAX_API_KEY_FILE") _ = os.Unsetenv("SAG_API_KEY_FILE") if err := ensureAPIKey(); err != nil { @@ -43,9 +43,9 @@ func TestEnsureAPIKeyFromFileFlag(t *testing.T) { defer keepEnv(t)() cfg.APIKey = "" cfg.APIKeyFile = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") _ = os.Unsetenv("SAG_API_KEY") - _ = os.Unsetenv("ELEVENLABS_API_KEY_FILE") + _ = os.Unsetenv("MINIMAX_API_KEY_FILE") _ = os.Unsetenv("SAG_API_KEY_FILE") tmp, err := os.CreateTemp("", "sag_api_key") @@ -73,7 +73,7 @@ func TestEnsureAPIKeyFromEnvFileOrder(t *testing.T) { defer keepEnv(t)() cfg.APIKey = "" cfg.APIKeyFile = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") _ = os.Unsetenv("SAG_API_KEY") tmpPrimary, err := os.CreateTemp("", "sag_api_key_primary") @@ -100,17 +100,17 @@ func TestEnsureAPIKeyFromEnvFileOrder(t *testing.T) { t.Fatalf("close temp: %v", err) } - _ = os.Setenv("ELEVENLABS_API_KEY_FILE", tmpPrimary.Name()) + _ = os.Setenv("MINIMAX_API_KEY_FILE", tmpPrimary.Name()) _ = os.Setenv("SAG_API_KEY_FILE", tmpFallback.Name()) if err := ensureAPIKey(); err != nil { t.Fatalf("ensureAPIKey error: %v", err) } if cfg.APIKey != "primary-key" { - t.Fatalf("expected ELEVENLABS_API_KEY_FILE to be used, got %q", cfg.APIKey) + t.Fatalf("expected MINIMAX_API_KEY_FILE to be used, got %q", cfg.APIKey) } cfg.APIKey = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY_FILE") + _ = os.Unsetenv("MINIMAX_API_KEY_FILE") if err := ensureAPIKey(); err != nil { t.Fatalf("ensureAPIKey error: %v", err) } @@ -123,21 +123,21 @@ func TestEnsureAPIKeyFallsBackToEnvOrder(t *testing.T) { defer keepEnv(t)() cfg.APIKey = "" cfg.APIKeyFile = "" - _ = os.Setenv("ELEVENLABS_API_KEY", "env-key") + _ = os.Setenv("MINIMAX_API_KEY", "env-key") _ = os.Setenv("SAG_API_KEY", "sag-key") - _ = os.Unsetenv("ELEVENLABS_API_KEY_FILE") + _ = os.Unsetenv("MINIMAX_API_KEY_FILE") _ = os.Unsetenv("SAG_API_KEY_FILE") if err := ensureAPIKey(); err != nil { t.Fatalf("ensureAPIKey error: %v", err) } if cfg.APIKey != "env-key" { - t.Fatalf("expected ELEVENLABS_API_KEY to be used, got %q", cfg.APIKey) + t.Fatalf("expected MINIMAX_API_KEY to be used, got %q", cfg.APIKey) } // Clear primary env to ensure SAG_API_KEY is used next. cfg.APIKey = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") if err := ensureAPIKey(); err != nil { t.Fatalf("ensureAPIKey error: %v", err) } @@ -150,9 +150,9 @@ func TestEnsureAPIKeyMissing(t *testing.T) { defer keepEnv(t)() cfg.APIKey = "" cfg.APIKeyFile = "" - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") _ = os.Unsetenv("SAG_API_KEY") - _ = os.Unsetenv("ELEVENLABS_API_KEY_FILE") + _ = os.Unsetenv("MINIMAX_API_KEY_FILE") _ = os.Unsetenv("SAG_API_KEY_FILE") if err := ensureAPIKey(); err == nil { diff --git a/cmd/http_testutil.go b/cmd/http_testutil.go new file mode 100644 index 0000000..c40434b --- /dev/null +++ b/cmd/http_testutil.go @@ -0,0 +1,20 @@ +package cmd + +import ( + "net/http" + "net/http/httptest" +) + +// handlerRoundTripper adapts an http.Handler to an http.RoundTripper without binding sockets. +type handlerRoundTripper struct { + handler http.Handler +} + +// RoundTrip satisfies http.RoundTripper by invoking the handler directly. +func (rt handlerRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + rr := httptest.NewRecorder() + rt.handler.ServeHTTP(rr, req) + res := rr.Result() + res.Request = req + return res, nil +} diff --git a/cmd/prompting.go b/cmd/prompting.go index df3eccb..e895881 100644 --- a/cmd/prompting.go +++ b/cmd/prompting.go @@ -15,7 +15,7 @@ func init() { cmd := &cobra.Command{ Use: "prompting", Aliases: []string{"prompt", "guide", "tips"}, - Short: "Prompting guide for better ElevenLabs speech", + Short: "Prompting guide for better MiniMax speech", Long: "Prints a practical prompting guide (model-specific tips, tags, and knobs) to improve voice quality and control.", RunE: func(cmd *cobra.Command, _ []string) error { out := strings.TrimSpace(promptingGuide) diff --git a/cmd/prompting_guide.md b/cmd/prompting_guide.md index 7f67e3b..7dd1e4e 100644 --- a/cmd/prompting_guide.md +++ b/cmd/prompting_guide.md @@ -3,81 +3,45 @@ Goal: “more natural” output + controllable delivery. ## Choose model (matters) - -### v3 (alpha) (default in `sag`) -- Model ID: `eleven_v3` -- Uses inline audio tags: lowercase `[square brackets]` inside your text. -- SSML `` is *not* supported; use v3 pause tags instead: `[pause]`, `[short pause]`, `[long pause]`. -- Short prompts can be unstable; longer scripts tend to behave better (aim 250+ chars). - -### v2 / v2.5 (stable baseline) -- Model ID: `eleven_multilingual_v2` -- Best baseline for “just speak this”. -- Supports SSML `` for precise pauses (up to ~3s). -- Some English-only models support SSML `` for pronunciation (not yet exposed in `sag`). - -### v2.5 speed/cost options -- Flash: `eleven_flash_v2_5` (ultra-low latency; up to 40,000 chars per request; 50% lower price per character) -- Turbo: `eleven_turbo_v2_5` (low latency; up to 40,000 chars per request; 50% lower price per character) -- Prompting looks like v2 (plain text + SSML ``). If numbers/units sound off, try `--normalize auto` and/or respell. -- If `--normalize on` errors on v2.5, use `auto` or `off`. +- Default model: `speech-01`. +- MiniMax model IDs vary by account/region; check the MiniMax docs for the full list. +- If you have multiple tiers, pick higher-quality models for narration and lower-latency models for realtime. ## Universal “make it sound good” rules - Write like a script: short sentences; newlines for beats. -- Punctuation is control: commas + em-dashes slow; ellipses add weight; `!` adds energy. -- Put emphasis in words, not instructions: “quietly” usually beats “whisper please”. -- If you need exact pronunciation: respell (e.g. “key-note”); otherwise enable normalization. - -## v3 audio tags (examples) - -Voice-related: -- `[whispers]`, `[shouts]` -- `[laughs]`, `[starts laughing]`, `[wheezing]` -- `[sighs]`, `[exhales]`, `[clears throat]` -- `[sarcastic]`, `[curious]`, `[excited]`, `[crying]`, `[mischievously]` - -Sound effects: -- `[applause]`, `[clapping]`, `[gunshot]`, `[explosion]` -- `[swallows]`, `[gulps]` - -Experimental: -- `[strong X accent]` (replace X, e.g. “French”) -- `[sings]` - -Notes: -- Tag effectiveness depends on the voice + training samples; not every voice reacts well. -- Combine tags sparingly; more tags ≠ better audio. +- Punctuation is control: commas slow; em-dashes add breath; ellipses add weight; `!` adds energy. +- Use words for intent: “quietly” often works better than meta-instructions. +- If pronunciation is off: respell, add hyphens, or split syllables. -## Knobs in `sag` (0.2.1) +## Knobs in `sag` -Voice sliders: -- `--stability` (v3 presets: 0.0=Creative, 0.5=Natural, 1.0=Robust; v2/v2.5: 0..1) -- `--similarity 0..1` (higher = closer to reference voice, less flexible) -- `--style 0..1` (higher = more “styled” delivery; voice/model dependent) -- `--speaker-boost` (can add clarity; model dependent) +Voice controls (model dependent): +- `--speed` or `--rate` to control delivery pace. +- `--emotion` to hint the tone (e.g. `neutral`, `happy`, `sad`). +- `--pitch` and `--volume` for finer adjustments. Request controls: -- `--seed 0..4294967295` (best-effort repeatability; not perfect determinism) -- `--normalize auto|on|off` (numbers/units/URLs normalization) -- `--lang en|de|fr|...` (2-letter ISO 639-1; influences normalization) -- `--metrics` (prints chars/bytes/duration so you can iterate faster) +- `--normalize auto|on|off` (auto uses server defaults; on/off force text normalization). +- `--lang` language boost hint (e.g. `en`, `zh`, `auto`). +- `--format` output audio format (e.g. `mp3`, `mp3_44100_128`, `wav`). +- `--metrics` prints chars/bytes/duration so you can iterate faster. ## Quick recipes -Natural narrator (v2 baseline): +Natural narrator: ``` -sag speak -v Roger --stability 0.5 --similarity 0.75 --style 0.0 --normalize auto --lang en \ +sag speak -v "Your Voice" --normalize auto --lang en \ "We shipped today. It was close… but it worked." ``` -Fast + cheap (v2.5 Flash): +Expressive delivery: ``` -sag speak -v Roger --model-id eleven_flash_v2_5 --stability 0.5 --normalize auto --lang en \ - "Short. Crisp. Low latency." +sag speak -v "Your Voice" --emotion happy --speed 1.05 \ + "We did it! I can’t believe it actually worked." ``` -Expressive (v3): +Calm + slower: ``` -sag speak -v Roger --model-id eleven_v3 --stability 0.35 --normalize off --lang en \ - "[whispers] Don’t move. [short pause] Something’s in the hallway…" +sag speak -v "Your Voice" --emotion neutral --rate 140 \ + "Take a breath. Slow down. Let the point land." ``` diff --git a/cmd/root.go b/cmd/root.go index 4915e2b..544efcb 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -19,8 +19,8 @@ var ( versionFlag bool rootCmd = &cobra.Command{ Use: "sag", - Short: "🗣️ ElevenLabs speech, mac-style ease", - Long: "Command-line ElevenLabs TTS with macOS playback. Call it like macOS 'say': if you skip the subcommand, text args are passed to 'speak' (e.g. `sag \"Hello\"`).\n\nTip: run `sag prompting` for model-specific prompting tips.\nModels: `eleven_v3` (default), `eleven_multilingual_v2` (stable), `eleven_flash_v2_5` (fast/cheap), `eleven_turbo_v2_5` (balanced).", + Short: "🗣️ MiniMax speech, mac-style ease", + Long: "Command-line MiniMax TTS with macOS playback. Call it like macOS 'say': if you skip the subcommand, text args are passed to 'speak' (e.g. `sag \"Hello\"`).\n\nTip: run `sag prompting` for model-specific prompting tips.\nModels: `speech-01` (default; adjust per MiniMax docs).", Example: " sag \"Hi Peter\"\n echo 'piped input' | sag\n sag speak -v Roger --rate 200 \"Faster speech\"\n sag prompting", Version: "0.2.2", PersistentPreRunE: func(cmd *cobra.Command, _ []string) error { @@ -43,9 +43,9 @@ func Execute() { } func init() { - rootCmd.PersistentFlags().StringVar(&cfg.APIKey, "api-key", "", "ElevenLabs API key (or ELEVENLABS_API_KEY)") - rootCmd.PersistentFlags().StringVar(&cfg.APIKeyFile, "api-key-file", "", "Read ElevenLabs API key from file (or ELEVENLABS_API_KEY_FILE)") - rootCmd.PersistentFlags().StringVar(&cfg.BaseURL, "base-url", "https://api.elevenlabs.io", "Override ElevenLabs API base URL") + rootCmd.PersistentFlags().StringVar(&cfg.APIKey, "api-key", "", "MiniMax API key (or MINIMAX_API_KEY)") + rootCmd.PersistentFlags().StringVar(&cfg.APIKeyFile, "api-key-file", "", "Read MiniMax API key from file (or MINIMAX_API_KEY_FILE)") + rootCmd.PersistentFlags().StringVar(&cfg.BaseURL, "base-url", "https://api.minimax.io", "Override MiniMax API base URL") rootCmd.PersistentFlags().BoolVarP(&versionFlag, "version", "V", false, "Print version and exit") } diff --git a/cmd/speak.go b/cmd/speak.go index e043407..563fdbc 100644 --- a/cmd/speak.go +++ b/cmd/speak.go @@ -7,37 +7,34 @@ import ( "io" "os" "path/filepath" + "strconv" "strings" "text/tabwriter" "time" "github.com/steipete/sag/internal/audio" - "github.com/steipete/sag/internal/elevenlabs" + "github.com/steipete/sag/internal/minimax" "github.com/spf13/cobra" ) type speakOptions struct { - voiceID string - modelID string - outputPath string - outputFmt string - stream bool - play bool - latencyTier int - speed float64 - rateWPM int - inputFile string - stability float64 - similarity float64 - style float64 - seed uint64 - normalize string - lang string - metrics bool - - speakerBoost bool - noSpeakerBoost bool + voiceID string + modelID string + outputPath string + outputFmt string + stream bool + play bool + speed float64 + rateWPM int + inputFile string + normalize string + lang string + metrics bool + + emotion string + pitch int + volume float64 } const defaultWPM = 175 // matches macOS `say` default rate @@ -46,7 +43,7 @@ var playToSpeakers = audio.StreamToSpeakers func init() { opts := speakOptions{ - modelID: "eleven_v3", + modelID: "speech-01", outputFmt: "mp3_44100_128", stream: true, play: true, @@ -55,7 +52,7 @@ func init() { cmd := &cobra.Command{ Use: "speak [text]", - Short: "Speak the provided text using ElevenLabs TTS (default: stream to speakers)", + Short: "Speak the provided text using MiniMax TTS (default: stream to speakers)", Long: "If no text argument is provided, the command reads from stdin.\n\nTip: run `sag prompting` for model-specific prompting tips and recommended flag combinations.", Args: cobra.ArbitraryArgs, PreRunE: func(_ *cobra.Command, _ []string) error { @@ -69,7 +66,7 @@ func init() { forceVoiceID := cmd.Flags().Changed("voice-id") voiceInput := opts.voiceID if voiceInput == "" { - if env := os.Getenv("ELEVENLABS_VOICE_ID"); env != "" { + if env := os.Getenv("MINIMAX_VOICE_ID"); env != "" { voiceInput = env forceVoiceID = true } else if env := os.Getenv("SAG_VOICE_ID"); env != "" { @@ -77,7 +74,7 @@ func init() { forceVoiceID = true } } - client := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL) + client := minimax.NewClient(cfg.APIKey, cfg.BaseURL) voiceID, err := resolveVoice(cmd.Context(), client, voiceInput, forceVoiceID) if err != nil { @@ -113,6 +110,16 @@ func init() { return err } + if opts.play { + formatKind := outputFormatKind(opts.outputFmt) + if formatKind != "" && formatKind != "mp3" { + return fmt.Errorf("playback requires mp3 output; use --format mp3 or disable --play") + } + if formatKind == "" && opts.outputFmt != "" { + return fmt.Errorf("playback requires mp3 output; unknown format %q", opts.outputFmt) + } + } + start := time.Now() var bytes int64 if opts.stream { @@ -129,32 +136,27 @@ func init() { } } if opts.metrics { - fmt.Fprintf(os.Stderr, "metrics: chars=%d bytes=%d model=%s voice=%s stream=%t latencyTier=%d dur=%s\n", - len([]rune(text)), bytes, opts.modelID, opts.voiceID, opts.stream, opts.latencyTier, time.Since(start).Truncate(time.Millisecond)) + fmt.Fprintf(os.Stderr, "metrics: chars=%d bytes=%d model=%s voice=%s stream=%t dur=%s\n", + len([]rune(text)), bytes, opts.modelID, opts.voiceID, opts.stream, time.Since(start).Truncate(time.Millisecond)) } return nil }, } - cmd.Flags().StringVar(&opts.voiceID, "voice-id", "", "Voice ID to use (ELEVENLABS_VOICE_ID)") + cmd.Flags().StringVar(&opts.voiceID, "voice-id", "", "Voice ID to use (MINIMAX_VOICE_ID)") cmd.Flags().StringVarP(&opts.voiceID, "voice", "v", "", "Alias for --voice-id; accepts name or ID; use '?' to list voices") - cmd.Flags().StringVar(&opts.modelID, "model-id", opts.modelID, "Model ID (default: eleven_v3). Common: eleven_multilingual_v2 (stable), eleven_flash_v2_5 (fast/cheap), eleven_turbo_v2_5 (balanced).") + cmd.Flags().StringVar(&opts.modelID, "model-id", opts.modelID, "Model ID (default: speech-01). See MiniMax docs for available models.") cmd.Flags().StringVarP(&opts.outputPath, "output", "o", "", "Write audio to file (disables playback unless --play is also set)") cmd.Flags().StringVar(&opts.outputFmt, "format", opts.outputFmt, "Output format (e.g. mp3_44100_128)") cmd.Flags().BoolVar(&opts.stream, "stream", opts.stream, "Stream audio while generating") cmd.Flags().BoolVar(&opts.play, "play", opts.play, "Play audio through speakers") - cmd.Flags().IntVar(&opts.latencyTier, "latency-tier", 0, "Streaming latency tier (0=default,1-4 lower latency may cost more)") cmd.Flags().Float64Var(&opts.speed, "speed", opts.speed, "Speech speed multiplier (e.g. 1.1 faster, 0.9 slower)") cmd.Flags().IntVarP(&opts.rateWPM, "rate", "r", 0, "macOS say-style words-per-minute; overrides --speed when set (default 175 wpm)") - cmd.Flags().Float64Var(&opts.stability, "stability", 0, "Voice stability (0..1; higher = more consistent, less expressive)") - cmd.Flags().Float64Var(&opts.similarity, "similarity", 0, "Voice similarity boost (0..1; higher = closer to reference voice)") - cmd.Flags().Float64Var(&opts.similarity, "similarity-boost", 0, "Alias for --similarity") - cmd.Flags().Float64Var(&opts.style, "style", 0, "Voice style exaggeration (0..1; higher = more stylized delivery)") - cmd.Flags().BoolVar(&opts.speakerBoost, "speaker-boost", false, "Enable speaker boost (can improve clarity; model dependent)") - cmd.Flags().BoolVar(&opts.noSpeakerBoost, "no-speaker-boost", false, "Disable speaker boost") - cmd.Flags().Uint64Var(&opts.seed, "seed", 0, "Best-effort deterministic seed (0..4294967295; helps repeatability across runs)") cmd.Flags().StringVar(&opts.normalize, "normalize", "", "Text normalization: auto|on|off (numbers/units/URLs; when set)") - cmd.Flags().StringVar(&opts.lang, "lang", "", "Language code (2-letter ISO 639-1; influences normalization; when set)") + cmd.Flags().StringVar(&opts.lang, "lang", "", "Language boost hint (e.g. en, zh, auto; when set)") + cmd.Flags().StringVar(&opts.emotion, "emotion", "", "Emotion hint (model dependent; e.g. neutral, happy, sad)") + cmd.Flags().IntVar(&opts.pitch, "pitch", 0, "Pitch adjustment (model dependent; when set)") + cmd.Flags().Float64Var(&opts.volume, "volume", 0, "Volume adjustment (model dependent; when set)") cmd.Flags().BoolVar(&opts.metrics, "metrics", false, "Print request metrics to stderr (chars, bytes, duration, etc.)") cmd.Flags().StringVarP(&opts.inputFile, "input-file", "f", "", "Read text from file (use '-' for stdin), matching macOS say -f") cmd.Flags().Bool("progress", false, "Accepted for macOS say compatibility (no-op)") @@ -172,7 +174,7 @@ func init() { func applyRateAndSpeed(opts *speakOptions) error { if opts.rateWPM > 0 { - // Map macOS `say` rate (words per minute) to ElevenLabs speed multiplier. + // Map macOS `say` rate (words per minute) to speed multiplier. opts.speed = float64(opts.rateWPM) / float64(defaultWPM) if opts.speed <= 0.5 || opts.speed >= 2.0 { return fmt.Errorf("rate %d wpm maps to speed %.2f, which is outside the allowed 0.5–2.0 range", opts.rateWPM, opts.speed) @@ -185,114 +187,70 @@ func applyRateAndSpeed(opts *speakOptions) error { return nil } -func buildTTSRequest(cmd *cobra.Command, opts speakOptions, text string) (elevenlabs.TTSRequest, error) { +func buildTTSRequest(cmd *cobra.Command, opts speakOptions, text string) (minimax.TTSRequest, error) { flags := cmd.Flags() - var stabilityPtr *float64 - if flags.Changed("stability") { - if opts.stability < 0 || opts.stability > 1 { - return elevenlabs.TTSRequest{}, errors.New("stability must be between 0 and 1") - } - if opts.modelID == "eleven_v3" { - if !floatEqualsOneOf(opts.stability, []float64{0, 0.5, 1}) { - return elevenlabs.TTSRequest{}, errors.New("for eleven_v3, stability must be one of 0.0, 0.5, 1.0 (Creative/Natural/Robust)") - } - } - stabilityPtr = &opts.stability - } - - var similarityPtr *float64 - if flags.Changed("similarity") || flags.Changed("similarity-boost") { - if opts.similarity < 0 || opts.similarity > 1 { - return elevenlabs.TTSRequest{}, errors.New("similarity must be between 0 and 1") - } - similarityPtr = &opts.similarity - } - - var stylePtr *float64 - if flags.Changed("style") { - if opts.style < 0 || opts.style > 1 { - return elevenlabs.TTSRequest{}, errors.New("style must be between 0 and 1") - } - stylePtr = &opts.style - } - - if flags.Changed("speaker-boost") && flags.Changed("no-speaker-boost") { - return elevenlabs.TTSRequest{}, errors.New("choose only one of --speaker-boost or --no-speaker-boost") - } - var speakerBoostPtr *bool - if flags.Changed("speaker-boost") { - v := true - speakerBoostPtr = &v - } else if flags.Changed("no-speaker-boost") { - v := false - speakerBoostPtr = &v - } - - var seedPtr *uint32 - if flags.Changed("seed") { - if opts.seed > 4294967295 { - return elevenlabs.TTSRequest{}, errors.New("seed must be between 0 and 4294967295") - } - v := uint32(opts.seed) - seedPtr = &v - } - normalize := strings.ToLower(strings.TrimSpace(opts.normalize)) + var textNormalization *bool if flags.Changed("normalize") { switch normalize { - case "auto", "on", "off": + case "auto": + case "on": + v := true + textNormalization = &v + case "off": + v := false + textNormalization = &v default: - return elevenlabs.TTSRequest{}, errors.New("normalize must be one of: auto, on, off") + return minimax.TTSRequest{}, errors.New("normalize must be one of: auto, on, off") } - } else { - normalize = "" } - lang := strings.ToLower(strings.TrimSpace(opts.lang)) + lang := strings.TrimSpace(opts.lang) if flags.Changed("lang") { - if len(lang) != 2 { - return elevenlabs.TTSRequest{}, errors.New("lang must be a 2-letter ISO 639-1 code (e.g. en, de, fr)") - } - for _, r := range lang { - if r < 'a' || r > 'z' { - return elevenlabs.TTSRequest{}, errors.New("lang must be a 2-letter ISO 639-1 code (e.g. en, de, fr)") - } + if lang == "" { + return minimax.TTSRequest{}, errors.New("lang must be non-empty when set") } } else { lang = "" } + outputFormat, audioSetting, err := parseOutputFormat(opts.outputFmt) + if err != nil { + return minimax.TTSRequest{}, err + } + speed := opts.speed - return elevenlabs.TTSRequest{ - Text: text, - ModelID: opts.modelID, - OutputFormat: opts.outputFmt, - Seed: seedPtr, - ApplyTextNormalization: normalize, - LanguageCode: lang, - VoiceSettings: &elevenlabs.VoiceSettings{ - Speed: &speed, - Stability: stabilityPtr, - SimilarityBoost: similarityPtr, - Style: stylePtr, - UseSpeakerBoost: speakerBoostPtr, - }, - }, nil -} + voice := &minimax.VoiceSetting{ + VoiceID: opts.voiceID, + Speed: &speed, + TextNormalization: textNormalization, + } + if flags.Changed("volume") { + v := opts.volume + voice.Vol = &v + } + if flags.Changed("pitch") { + v := opts.pitch + voice.Pitch = &v + } + if flags.Changed("emotion") { + voice.Emotion = strings.TrimSpace(opts.emotion) + } -func floatEqualsOneOf(v float64, allowed []float64) bool { - const eps = 1e-9 - for _, a := range allowed { - d := v - a - if d < 0 { - d = -d - } - if d <= eps { - return true - } + req := minimax.TTSRequest{ + Model: opts.modelID, + Text: text, + Stream: opts.stream, + VoiceSetting: voice, + AudioSetting: audioSetting, + OutputFormat: outputFormat, + LanguageBoost: lang, + } + if opts.stream { + req.StreamOptions = &minimax.StreamOptions{ExcludeAggregatedAudio: true} } - return false + return req, nil } func resolveText(args []string, inputFile string) (string, error) { @@ -340,8 +298,8 @@ func isStdinTTY() bool { return (stat.Mode() & os.ModeCharDevice) != 0 } -func streamAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOptions, payload elevenlabs.TTSRequest) (int64, error) { - resp, err := client.StreamTTS(ctx, opts.voiceID, payload, opts.latencyTier) +func streamAndPlay(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) { + resp, err := client.StreamTTS(ctx, payload) if err != nil { return 0, err } @@ -397,8 +355,8 @@ func streamAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOpt return n, err } -func convertAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOptions, payload elevenlabs.TTSRequest) (int64, error) { - data, err := client.ConvertTTS(ctx, opts.voiceID, payload) +func convertAndPlay(ctx context.Context, client *minimax.Client, opts speakOptions, payload minimax.TTSRequest) (int64, error) { + data, err := client.ConvertTTS(ctx, payload) if err != nil { return 0, err } @@ -427,25 +385,25 @@ func convertAndPlay(ctx context.Context, client *elevenlabs.Client, opts speakOp return n, nil } -func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput string, forceID bool) (string, error) { +func resolveVoice(ctx context.Context, client *minimax.Client, voiceInput string, forceID bool) (string, error) { voiceInput = strings.TrimSpace(voiceInput) if voiceInput == "" { ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx) + voices, err := client.ListVoices(ctx, "") if err != nil { return "", fmt.Errorf("voice not specified and failed to fetch voices: %w", err) } if len(voices) == 0 { - return "", errors.New("no voices available; specify --voice or set ELEVENLABS_VOICE_ID") + return "", errors.New("no voices available; specify --voice or set MINIMAX_VOICE_ID") } - fmt.Fprintf(os.Stderr, "defaulting to voice %s (%s)\n", voices[0].Name, voices[0].VoiceID) + fmt.Fprintf(os.Stderr, "defaulting to voice %s (%s)\n", voiceLabel(voices[0]), voices[0].VoiceID) return voices[0].VoiceID, nil } if voiceInput == "?" { ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx) + voices, err := client.ListVoices(ctx, "") if err != nil { return "", err } @@ -454,7 +412,7 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str return "", err } for _, v := range voices { - if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil { + if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, voiceLabel(v), v.Category); err != nil { return "", err } } @@ -468,29 +426,9 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str return voiceInput, nil } - if looksLikeVoiceID(voiceInput) { - if containsDigit(voiceInput) { - return voiceInput, nil - } - ctx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - voices, err := client.ListVoices(ctx) - if err != nil { - return "", err - } - voiceInputLower := strings.ToLower(voiceInput) - for _, v := range voices { - if strings.ToLower(v.Name) == voiceInputLower { - fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID) - return v.VoiceID, nil - } - } - return voiceInput, nil - } - ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx) + voices, err := client.ListVoices(ctx, "") if err != nil { return "", err } @@ -498,16 +436,20 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str // First, check for exact match (case-insensitive) for _, v := range voices { - if strings.ToLower(v.Name) == voiceInputLower { - fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID) + if strings.ToLower(v.VoiceID) == voiceInputLower { + fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", voiceLabel(v), v.VoiceID) + return v.VoiceID, nil + } + if strings.ToLower(v.VoiceName) == voiceInputLower { + fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", voiceLabel(v), v.VoiceID) return v.VoiceID, nil } } // Then, check for substring match (case-insensitive) for _, v := range voices { - if strings.Contains(strings.ToLower(v.Name), voiceInputLower) { - fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", v.Name, v.VoiceID) + if strings.Contains(strings.ToLower(voiceLabel(v)), voiceInputLower) || strings.Contains(strings.ToLower(v.VoiceID), voiceInputLower) { + fmt.Fprintf(os.Stderr, "using voice %s (%s)\n", voiceLabel(v), v.VoiceID) return v.VoiceID, nil } } @@ -515,17 +457,14 @@ func resolveVoice(ctx context.Context, client *elevenlabs.Client, voiceInput str return "", fmt.Errorf("voice %q not found; try 'sag voices' or -v '?'", voiceInput) } -func looksLikeVoiceID(voiceInput string) bool { - return len(voiceInput) >= 15 && !strings.ContainsRune(voiceInput, ' ') -} - -func containsDigit(s string) bool { - for _, r := range s { - if r >= '0' && r <= '9' { - return true - } +func voiceLabel(voice minimax.Voice) string { + if strings.TrimSpace(voice.VoiceName) != "" { + return voice.VoiceName + } + if len(voice.Description) > 0 { + return strings.Join(voice.Description, ", ") } - return false + return voice.VoiceID } func inferFormatFromExt(path string) string { @@ -534,8 +473,78 @@ func inferFormatFromExt(path string) string { case ".mp3": return "mp3_44100_128" case ".wav", ".wave": - return "pcm_44100" + return "wav" + case ".flac": + return "flac" + case ".pcm": + return "pcm" + default: + return "" + } +} + +func outputFormatKind(value string) string { + lower := strings.ToLower(strings.TrimSpace(value)) + if lower == "" { + return "" + } + if idx := strings.Index(lower, "_"); idx >= 0 { + lower = lower[:idx] + } + switch lower { + case "mp3", "wav", "flac", "pcm": + return lower default: return "" } } + +func parseOutputFormat(value string) (string, *minimax.AudioSetting, error) { + value = strings.TrimSpace(value) + if value == "" { + return "", nil, nil + } + lower := strings.ToLower(value) + parts := strings.Split(lower, "_") + format := parts[0] + + if !isSupportedFormat(format) { + return lower, nil, nil + } + + setting := &minimax.AudioSetting{Format: format} + if len(parts) == 1 { + return format, setting, nil + } + if len(parts) > 3 { + return "", nil, fmt.Errorf("invalid format %q (expected format[_sampleRate[_bitrate]])", value) + } + + if len(parts) >= 2 { + sampleRate, err := strconv.Atoi(parts[1]) + if err != nil || sampleRate <= 0 { + return "", nil, fmt.Errorf("invalid sample rate in format %q", value) + } + setting.SampleRate = &sampleRate + } + if len(parts) == 3 { + bitrate, err := strconv.Atoi(parts[2]) + if err != nil || bitrate <= 0 { + return "", nil, fmt.Errorf("invalid bitrate in format %q", value) + } + if bitrate <= 1000 { + bitrate *= 1000 + } + setting.Bitrate = &bitrate + } + return format, setting, nil +} + +func isSupportedFormat(format string) bool { + switch format { + case "mp3", "wav", "flac", "pcm": + return true + default: + return false + } +} diff --git a/cmd/speak_integration_test.go b/cmd/speak_integration_test.go index 455545a..a8f0880 100644 --- a/cmd/speak_integration_test.go +++ b/cmd/speak_integration_test.go @@ -1,70 +1,97 @@ package cmd import ( + "encoding/hex" "encoding/json" "net/http" - "net/http/httptest" "os" - "path" "strings" "testing" + + "github.com/steipete/sag/internal/minimax" ) func TestSpeakCommand_FlagsBuildRequestAndMetrics(t *testing.T) { t.Helper() - const voiceID = "abc1234567890123" + const voiceID = "voice-123" - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/v1/text-to-speech/") { + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/t2a_v2" { t.Fatalf("unexpected path: %s", r.URL.Path) } - if path.Base(r.URL.Path) != voiceID { - t.Fatalf("expected voice ID %q, got %q", voiceID, path.Base(r.URL.Path)) - } var got map[string]any if err := json.NewDecoder(r.Body).Decode(&got); err != nil { t.Fatalf("decode body: %v", err) } - if got["model_id"] != "eleven_v3" { - t.Fatalf("expected model_id eleven_v3, got %v", got["model_id"]) + if got["model"] != "speech-01" { + t.Fatalf("expected model speech-01, got %v", got["model"]) } - if got["output_format"] != "mp3_44100_128" { - t.Fatalf("expected output_format mp3_44100_128, got %v", got["output_format"]) + if got["text"] != "Hello world" { + t.Fatalf("expected text Hello world, got %v", got["text"]) } - if got["seed"] != float64(42) { - t.Fatalf("expected seed 42, got %v", got["seed"]) + if v, ok := got["stream"]; ok && v != false { + t.Fatalf("expected stream false, got %v", v) } - if got["apply_text_normalization"] != "auto" { - t.Fatalf("expected apply_text_normalization auto, got %v", got["apply_text_normalization"]) + if got["language_boost"] != "en" { + t.Fatalf("expected language_boost en, got %v", got["language_boost"]) } - if got["language_code"] != "en" { - t.Fatalf("expected language_code en, got %v", got["language_code"]) + if got["output_format"] != "mp3" { + t.Fatalf("expected output_format mp3, got %v", got["output_format"]) } - vs, ok := got["voice_settings"].(map[string]any) + voiceSettings, ok := got["voice_setting"].(map[string]any) if !ok { - t.Fatalf("expected voice_settings object, got %T", got["voice_settings"]) + t.Fatalf("expected voice_setting object, got %T", got["voice_setting"]) + } + if voiceSettings["voice_id"] != voiceID { + t.Fatalf("expected voice_id %q, got %v", voiceID, voiceSettings["voice_id"]) + } + if voiceSettings["text_normalization"] != true { + t.Fatalf("expected text_normalization true, got %v", voiceSettings["text_normalization"]) + } + if voiceSettings["emotion"] != "happy" { + t.Fatalf("expected emotion happy, got %v", voiceSettings["emotion"]) + } + if voiceSettings["pitch"] != float64(2) { + t.Fatalf("expected pitch 2, got %v", voiceSettings["pitch"]) } - if vs["stability"] != 0.5 { - t.Fatalf("expected stability 0.5, got %v", vs["stability"]) + if voiceSettings["vol"] != 1.5 { + t.Fatalf("expected vol 1.5, got %v", voiceSettings["vol"]) } - if vs["similarity_boost"] != 0.8 { - t.Fatalf("expected similarity_boost 0.8, got %v", vs["similarity_boost"]) + + audioSetting, ok := got["audio_setting"].(map[string]any) + if !ok { + t.Fatalf("expected audio_setting object, got %T", got["audio_setting"]) + } + if audioSetting["format"] != "mp3" { + t.Fatalf("expected audio_setting.format mp3, got %v", audioSetting["format"]) } - if vs["style"] != 0.1 { - t.Fatalf("expected style 0.1, got %v", vs["style"]) + if audioSetting["sample_rate"] != float64(44100) { + t.Fatalf("expected audio_setting.sample_rate 44100, got %v", audioSetting["sample_rate"]) } - if vs["use_speaker_boost"] != true { - t.Fatalf("expected use_speaker_boost true, got %v", vs["use_speaker_boost"]) + if audioSetting["bitrate"] != float64(128000) { + t.Fatalf("expected audio_setting.bitrate 128000, got %v", audioSetting["bitrate"]) } + resp := map[string]any{ + "data": map[string]any{ + "audio": hex.EncodeToString([]byte("audio-bytes")), + "status": 2, + }, + "base_resp": map[string]any{ + "status_code": 0, + "status_msg": "success", + }, + } w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte("audio-bytes")) - })) - defer srv.Close() + _ = json.NewEncoder(w).Encode(resp) + }) + + minimax.SetHTTPClient(&http.Client{Transport: handlerRoundTripper{handler: handler}}) + defer minimax.SetHTTPClient(nil) tmp := t.TempDir() outPath := tmp + "/out.mp3" @@ -74,20 +101,18 @@ func TestSpeakCommand_FlagsBuildRequestAndMetrics(t *testing.T) { rootCmd.SetArgs([]string{ "--api-key", "testkey", - "--base-url", srv.URL, + "--base-url", "http://minimax.test", "speak", "--voice-id", voiceID, "--stream=false", "--play=false", "--output", outPath, "--metrics", - "--stability", "0.5", - "--similarity-boost", "0.8", - "--style", "0.1", - "--speaker-boost", - "--seed", "42", - "--normalize", "auto", - "--lang", "EN", + "--emotion", "happy", + "--pitch", "2", + "--volume", "1.5", + "--normalize", "on", + "--lang", "en", "Hello world", }) diff --git a/cmd/speak_request_test.go b/cmd/speak_request_test.go index e1b0cec..5a9300d 100644 --- a/cmd/speak_request_test.go +++ b/cmd/speak_request_test.go @@ -11,20 +11,19 @@ import ( func newSpeakTestCommand(t *testing.T) (*cobra.Command, *speakOptions) { t.Helper() opts := &speakOptions{ - modelID: "eleven_multilingual_v2", + modelID: "speech-01", outputFmt: "mp3_44100_128", speed: 1.0, + voiceID: "voice-1", + stream: false, } cmd := &cobra.Command{Use: "speak"} - cmd.Flags().Float64Var(&opts.stability, "stability", 0, "") - cmd.Flags().Float64Var(&opts.similarity, "similarity", 0, "") - cmd.Flags().Float64Var(&opts.similarity, "similarity-boost", 0, "") - cmd.Flags().Float64Var(&opts.style, "style", 0, "") - cmd.Flags().BoolVar(&opts.speakerBoost, "speaker-boost", false, "") - cmd.Flags().BoolVar(&opts.noSpeakerBoost, "no-speaker-boost", false, "") - cmd.Flags().Uint64Var(&opts.seed, "seed", 0, "") cmd.Flags().StringVar(&opts.normalize, "normalize", "", "") cmd.Flags().StringVar(&opts.lang, "lang", "", "") + cmd.Flags().StringVar(&opts.emotion, "emotion", "", "") + cmd.Flags().IntVar(&opts.pitch, "pitch", 0, "") + cmd.Flags().Float64Var(&opts.volume, "volume", 0, "") + cmd.Flags().StringVar(&opts.outputFmt, "format", opts.outputFmt, "") return cmd, opts } @@ -36,20 +35,23 @@ func TestBuildTTSRequest_DefaultsOmitOptionalFields(t *testing.T) { t.Fatalf("buildTTSRequest error: %v", err) } - if req.Seed != nil { - t.Fatalf("expected seed to be nil") + if req.OutputFormat != "mp3" { + t.Fatalf("expected output format mp3, got %q", req.OutputFormat) } - if req.ApplyTextNormalization != "" { - t.Fatalf("expected apply_text_normalization to be empty, got %q", req.ApplyTextNormalization) + if req.AudioSetting == nil || req.AudioSetting.SampleRate == nil || req.AudioSetting.Bitrate == nil { + t.Fatalf("expected audio settings populated") } - if req.LanguageCode != "" { - t.Fatalf("expected language_code to be empty, got %q", req.LanguageCode) + if *req.AudioSetting.SampleRate != 44100 || *req.AudioSetting.Bitrate != 128000 { + t.Fatalf("unexpected audio settings: %+v", req.AudioSetting) } - if req.VoiceSettings == nil || req.VoiceSettings.Speed == nil { - t.Fatalf("expected voice_settings.speed to be set") + if req.LanguageBoost != "" { + t.Fatalf("expected language_boost to be empty, got %q", req.LanguageBoost) } - if req.VoiceSettings.Stability != nil || req.VoiceSettings.SimilarityBoost != nil || req.VoiceSettings.Style != nil || req.VoiceSettings.UseSpeakerBoost != nil { - t.Fatalf("expected optional voice settings to be nil") + if req.VoiceSetting == nil || req.VoiceSetting.Speed == nil { + t.Fatalf("expected voice_setting.speed to be set") + } + if req.VoiceSetting.TextNormalization != nil || req.VoiceSetting.Vol != nil || req.VoiceSetting.Pitch != nil || req.VoiceSetting.Emotion != "" { + t.Fatalf("expected optional voice settings to be omitted") } b, err := json.Marshal(req) @@ -57,14 +59,14 @@ func TestBuildTTSRequest_DefaultsOmitOptionalFields(t *testing.T) { t.Fatalf("marshal: %v", err) } s := string(b) - if strings.Contains(s, "stability") || strings.Contains(s, "similarity_boost") || strings.Contains(s, "style") || strings.Contains(s, "use_speaker_boost") { + if strings.Contains(s, "text_normalization") || strings.Contains(s, "vol") || strings.Contains(s, "pitch") || strings.Contains(s, "emotion") { t.Fatalf("expected optional fields to be omitted, got %s", s) } } -func TestBuildTTSRequest_SimilarityBoostAlias(t *testing.T) { +func TestBuildTTSRequest_NormalizeOnSetsBool(t *testing.T) { cmd, opts := newSpeakTestCommand(t) - if err := cmd.Flags().Parse([]string{"--similarity-boost", "0.9"}); err != nil { + if err := cmd.Flags().Parse([]string{"--normalize", "on"}); err != nil { t.Fatalf("parse flags: %v", err) } @@ -72,14 +74,14 @@ func TestBuildTTSRequest_SimilarityBoostAlias(t *testing.T) { if err != nil { t.Fatalf("buildTTSRequest error: %v", err) } - if req.VoiceSettings.SimilarityBoost == nil || *req.VoiceSettings.SimilarityBoost != 0.9 { - t.Fatalf("expected similarity_boost 0.9, got %#v", req.VoiceSettings.SimilarityBoost) + if req.VoiceSetting.TextNormalization == nil || *req.VoiceSetting.TextNormalization != true { + t.Fatalf("expected text_normalization true, got %#v", req.VoiceSetting.TextNormalization) } } -func TestBuildTTSRequest_SpeakerBoostSetsJSONKey(t *testing.T) { +func TestBuildTTSRequest_NormalizeAutoOmits(t *testing.T) { cmd, opts := newSpeakTestCommand(t) - if err := cmd.Flags().Parse([]string{"--speaker-boost"}); err != nil { + if err := cmd.Flags().Parse([]string{"--normalize", "auto"}); err != nil { t.Fatalf("parse flags: %v", err) } @@ -87,16 +89,8 @@ func TestBuildTTSRequest_SpeakerBoostSetsJSONKey(t *testing.T) { if err != nil { t.Fatalf("buildTTSRequest error: %v", err) } - if req.VoiceSettings.UseSpeakerBoost == nil || *req.VoiceSettings.UseSpeakerBoost != true { - t.Fatalf("expected use_speaker_boost true, got %#v", req.VoiceSettings.UseSpeakerBoost) - } - - b, err := json.Marshal(req) - if err != nil { - t.Fatalf("marshal: %v", err) - } - if !strings.Contains(string(b), "use_speaker_boost") { - t.Fatalf("expected JSON to contain use_speaker_boost, got %s", string(b)) + if req.VoiceSetting.TextNormalization != nil { + t.Fatalf("expected text_normalization omitted, got %#v", req.VoiceSetting.TextNormalization) } } @@ -111,47 +105,49 @@ func TestBuildTTSRequest_InvalidNormalize(t *testing.T) { } } -func TestBuildTTSRequest_InvalidLang(t *testing.T) { +func TestBuildTTSRequest_LangEmptyError(t *testing.T) { cmd, opts := newSpeakTestCommand(t) - if err := cmd.Flags().Parse([]string{"--lang", "eng"}); err != nil { - t.Fatalf("parse flags: %v", err) + if err := cmd.Flags().Set("lang", " "); err != nil { + t.Fatalf("set flag: %v", err) } _, err := buildTTSRequest(cmd, *opts, "hello") - if err == nil || !strings.Contains(err.Error(), "lang must be a 2-letter") { + if err == nil || !strings.Contains(err.Error(), "lang must be non-empty") { t.Fatalf("expected lang error, got %v", err) } } -func TestBuildTTSRequest_InvalidSeed(t *testing.T) { +func TestBuildTTSRequest_VoiceSettingFields(t *testing.T) { cmd, opts := newSpeakTestCommand(t) - if err := cmd.Flags().Parse([]string{"--seed", "4294967296"}); err != nil { + if err := cmd.Flags().Parse([]string{ + "--emotion", "happy", + "--pitch", "2", + "--volume", "1.5", + "--lang", "en", + "--format", "mp3_48000_96", + }); err != nil { t.Fatalf("parse flags: %v", err) } - _, err := buildTTSRequest(cmd, *opts, "hello") - if err == nil || !strings.Contains(err.Error(), "seed must be between") { - t.Fatalf("expected seed error, got %v", err) - } -} -func TestBuildTTSRequest_SpeakerBoostConflict(t *testing.T) { - cmd, opts := newSpeakTestCommand(t) - if err := cmd.Flags().Parse([]string{"--speaker-boost", "--no-speaker-boost"}); err != nil { - t.Fatalf("parse flags: %v", err) + req, err := buildTTSRequest(cmd, *opts, "hello") + if err != nil { + t.Fatalf("buildTTSRequest error: %v", err) } - _, err := buildTTSRequest(cmd, *opts, "hello") - if err == nil || !strings.Contains(err.Error(), "choose only one") { - t.Fatalf("expected conflict error, got %v", err) + if req.LanguageBoost != "en" { + t.Fatalf("expected language_boost en, got %q", req.LanguageBoost) } -} - -func TestBuildTTSRequest_V3StabilityPresetsOnly(t *testing.T) { - cmd, opts := newSpeakTestCommand(t) - opts.modelID = "eleven_v3" - if err := cmd.Flags().Parse([]string{"--stability", "0.55"}); err != nil { - t.Fatalf("parse flags: %v", err) + if req.VoiceSetting.Emotion != "happy" { + t.Fatalf("expected emotion happy, got %q", req.VoiceSetting.Emotion) } - _, err := buildTTSRequest(cmd, *opts, "hello") - if err == nil || !strings.Contains(err.Error(), "for eleven_v3, stability must be one of") { - t.Fatalf("expected v3 stability preset error, got %v", err) + if req.VoiceSetting.Pitch == nil || *req.VoiceSetting.Pitch != 2 { + t.Fatalf("expected pitch 2, got %#v", req.VoiceSetting.Pitch) + } + if req.VoiceSetting.Vol == nil || *req.VoiceSetting.Vol != 1.5 { + t.Fatalf("expected volume 1.5, got %#v", req.VoiceSetting.Vol) + } + if req.AudioSetting == nil || req.AudioSetting.SampleRate == nil || req.AudioSetting.Bitrate == nil { + t.Fatalf("expected audio settings") + } + if *req.AudioSetting.SampleRate != 48000 || *req.AudioSetting.Bitrate != 96000 { + t.Fatalf("unexpected audio settings: %+v", req.AudioSetting) } } diff --git a/cmd/speak_test.go b/cmd/speak_test.go index c6bff15..e65abc8 100644 --- a/cmd/speak_test.go +++ b/cmd/speak_test.go @@ -2,15 +2,17 @@ package cmd import ( "context" + "encoding/hex" + "encoding/json" + "fmt" "io" "math" "net/http" - "net/http/httptest" "os" "strings" "testing" - "github.com/steipete/sag/internal/elevenlabs" + "github.com/steipete/sag/internal/minimax" ) func TestInferFormatFromExt(t *testing.T) { @@ -20,8 +22,9 @@ func TestInferFormatFromExt(t *testing.T) { }{ {"out.mp3", "mp3_44100_128"}, {"out.MP3", "mp3_44100_128"}, - {"audio.wav", "pcm_44100"}, - {"audio.WAVE", "pcm_44100"}, + {"audio.wav", "wav"}, + {"audio.WAVE", "wav"}, + {"audio.flac", "flac"}, {"audio.unknown", ""}, } for _, tt := range tests { @@ -136,14 +139,12 @@ func TestApplyRateAndSpeedInvalidSpeed(t *testing.T) { } func TestResolveVoiceDefaultsToFirst(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Alpha","category":"premade"},{"voice_id":"id2","name":"Beta","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "id1", VoiceName: "Alpha"}, {VoiceID: "id2", VoiceName: "Beta"}}) })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") id, err := resolveVoice(context.Background(), client, "", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) @@ -153,32 +154,14 @@ func TestResolveVoiceDefaultsToFirst(t *testing.T) { } } -func TestResolveVoicePassThroughIDWithDigits(t *testing.T) { - // Should short-circuit without hitting the server when input looks like an ID with digits. - srv := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { - t.Fatalf("server should not be called for ID pass-through") - })) - defer srv.Close() - - client := elevenlabs.NewClient("key", srv.URL) - id, err := resolveVoice(context.Background(), client, "abc1234567890123", false) - if err != nil { - t.Fatalf("resolveVoice error: %v", err) - } - if id != "abc1234567890123" { - t.Fatalf("expected ID to pass through, got %q", id) - } -} - func TestResolveVoiceForceIDPassThrough(t *testing.T) { - // Should short-circuit without hitting the server when --voice-id is set. - srv := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { t.Fatalf("server should not be called for forced ID pass-through") })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) - input := "OnlyLettersVoiceID" + client := minimax.NewClient("key", "http://minimax.test") + input := "custom-voice-id" id, err := resolveVoice(context.Background(), client, input, true) if err != nil { t.Fatalf("resolveVoice error: %v", err) @@ -188,83 +171,48 @@ func TestResolveVoiceForceIDPassThrough(t *testing.T) { } } -func TestResolveVoiceLongNameExactMatch(t *testing.T) { - var called bool - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - called = true - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id-long","name":"LongVoiceNameAlpha","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } +func TestResolveVoiceExactIDMatch(t *testing.T) { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "voice-123", VoiceName: "Alpha"}}) })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) - id, err := resolveVoice(context.Background(), client, "LongVoiceNameAlpha", false) + client := minimax.NewClient("key", "http://minimax.test") + id, err := resolveVoice(context.Background(), client, "voice-123", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } - if !called { - t.Fatalf("expected voice lookup for long name") - } - if id != "id-long" { - t.Fatalf("expected id-long, got %q", id) + if id != "voice-123" { + t.Fatalf("expected voice-123, got %q", id) } } -func TestResolveVoiceLooksLikeIDNoMatchPassesThrough(t *testing.T) { - var called bool - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - called = true - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Other","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } +func TestResolveVoiceNameMatch(t *testing.T) { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "id-sarah", VoiceName: "Sarah"}, {VoiceID: "id-roger", VoiceName: "Roger"}}) })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) - input := "LongVoiceNameAlpha" - id, err := resolveVoice(context.Background(), client, input, false) + client := minimax.NewClient("key", "http://minimax.test") + id, err := resolveVoice(context.Background(), client, "roger", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } - if !called { - t.Fatalf("expected voice lookup for ambiguous input") - } - if id != input { - t.Fatalf("expected %q to pass through, got %q", input, id) - } -} - -func TestResolveVoiceNoMatch(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Near","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } - })) - defer srv.Close() - - client := elevenlabs.NewClient("key", srv.URL) - _, err := resolveVoice(context.Background(), client, "nothing-match", false) - if err == nil { - t.Fatalf("expected error for non-matching voice") - } - if !strings.Contains(err.Error(), "not found") { - t.Fatalf("expected 'not found' error, got %q", err.Error()) + if id != "id-roger" { + t.Fatalf("resolveVoice by name = %q, want id-roger", id) } } func TestResolveVoicePartialMatch(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Sarah","category":"premade"},{"voice_id":"id2","name":"Roger - Casual","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "id1", VoiceName: "Sarah"}, {VoiceID: "id2", VoiceName: "Roger - Casual"}}) })) - defer srv.Close() + defer restoreHTTP() restore, read := captureStderr(t) defer restore() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") id, err := resolveVoice(context.Background(), client, "roger", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) @@ -277,18 +225,32 @@ func TestResolveVoicePartialMatch(t *testing.T) { } } +func TestResolveVoiceNoMatch(t *testing.T) { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "id1", VoiceName: "Near"}}) + })) + defer restoreHTTP() + + client := minimax.NewClient("key", "http://minimax.test") + _, err := resolveVoice(context.Background(), client, "nothing-match", false) + if err == nil { + t.Fatalf("expected error for non-matching voice") + } + if !strings.Contains(err.Error(), "not found") { + t.Fatalf("expected 'not found' error, got %q", err.Error()) + } +} + func TestResolveVoiceListOutputsTable(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Alpha","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeVoiceList(w, []minimax.Voice{{VoiceID: "id1", VoiceName: "Alpha"}}) })) - defer srv.Close() + defer restoreHTTP() restore, read := captureStdout(t) defer restore() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") id, err := resolveVoice(context.Background(), client, "?", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) @@ -302,19 +264,19 @@ func TestResolveVoiceListOutputsTable(t *testing.T) { } func TestStreamAndPlayWritesOutput(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/stream") { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/t2a_v2" { t.Fatalf("unexpected path: %s", r.URL.Path) } - _, _ = w.Write([]byte("stream-bytes")) + writeStreamResponse(w, "stream-bytes") })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") tmp := t.TempDir() out := tmp + "/out.mp3" opts := speakOptions{voiceID: "v1", outputPath: out, stream: true, play: false} - payload := elevenlabs.TTSRequest{Text: "hi"} + payload := minimax.TTSRequest{Text: "hi", Stream: true, VoiceSetting: &minimax.VoiceSetting{VoiceID: "v1"}} if _, err := streamAndPlay(context.Background(), client, opts, payload); err != nil { t.Fatalf("streamAndPlay error: %v", err) @@ -329,19 +291,19 @@ func TestStreamAndPlayWritesOutput(t *testing.T) { } func TestConvertAndPlayWritesOutput(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/text-to-speech/") { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/t2a_v2" { t.Fatalf("unexpected path: %s", r.URL.Path) } - _, _ = w.Write([]byte("convert-bytes")) + writeJSONResponse(w, "convert-bytes") })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") tmp := t.TempDir() out := tmp + "/out.mp3" opts := speakOptions{voiceID: "v1", outputPath: out, play: false} - payload := elevenlabs.TTSRequest{Text: "hi"} + payload := minimax.TTSRequest{Text: "hi", VoiceSetting: &minimax.VoiceSetting{VoiceID: "v1"}} if _, err := convertAndPlay(context.Background(), client, opts, payload); err != nil { t.Fatalf("convertAndPlay error: %v", err) @@ -356,9 +318,14 @@ func TestConvertAndPlayWritesOutput(t *testing.T) { } func TestStreamAndPlayRequiresWork(t *testing.T) { - client := elevenlabs.NewClient("key", "http://invalid") + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeStreamResponse(w, "") + })) + defer restoreHTTP() + + client := minimax.NewClient("key", "http://invalid") opts := speakOptions{voiceID: "v1", play: false, stream: true} - payload := elevenlabs.TTSRequest{Text: "hi"} + payload := minimax.TTSRequest{Text: "hi", Stream: true, VoiceSetting: &minimax.VoiceSetting{VoiceID: "v1"}} _, err := streamAndPlay(context.Background(), client, opts, payload) if err == nil { @@ -376,14 +343,14 @@ func TestStreamAndPlayWithPlayback(t *testing.T) { }) defer restore() - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - _, _ = w.Write([]byte("stream-play")) + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeStreamResponse(w, "stream-play") })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") opts := speakOptions{voiceID: "v1", play: true, stream: true} - payload := elevenlabs.TTSRequest{Text: "hi"} + payload := minimax.TTSRequest{Text: "hi", Stream: true, VoiceSetting: &minimax.VoiceSetting{VoiceID: "v1"}} if _, err := streamAndPlay(context.Background(), client, opts, payload); err != nil { t.Fatalf("streamAndPlay error: %v", err) @@ -403,14 +370,14 @@ func TestConvertAndPlayWithPlayback(t *testing.T) { }) defer restore() - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - _, _ = w.Write([]byte("convert-play")) + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + writeJSONResponse(w, "convert-play") })) - defer srv.Close() + defer restoreHTTP() - client := elevenlabs.NewClient("key", srv.URL) + client := minimax.NewClient("key", "http://minimax.test") opts := speakOptions{voiceID: "v1", play: true, outputPath: "", stream: false} - payload := elevenlabs.TTSRequest{Text: "hi"} + payload := minimax.TTSRequest{Text: "hi", VoiceSetting: &minimax.VoiceSetting{VoiceID: "v1"}} if _, err := convertAndPlay(context.Background(), client, opts, payload); err != nil { t.Fatalf("convertAndPlay error: %v", err) @@ -456,24 +423,6 @@ func captureStderr(t *testing.T) (restore func(), read func() string) { } } -func TestResolveVoiceByName(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if _, err := w.Write([]byte(`{"voices":[{"voice_id":"id-sarah","name":"Sarah","category":"premade"},{"voice_id":"id-roger","name":"Roger","category":"premade"}]}`)); err != nil { - t.Fatalf("write response: %v", err) - } - })) - defer srv.Close() - - client := elevenlabs.NewClient("key", srv.URL) - id, err := resolveVoice(context.Background(), client, "roger", false) - if err != nil { - t.Fatalf("resolveVoice error: %v", err) - } - if id != "id-roger" { - t.Fatalf("resolveVoice by name = %q, want id-roger", id) - } -} - func stubPlay(t *testing.T, fn func([]byte)) func() { t.Helper() orig := playToSpeakers @@ -484,3 +433,40 @@ func stubPlay(t *testing.T, fn func([]byte)) func() { } return func() { playToSpeakers = orig } } + +func withMinimaxHandler(t *testing.T, handler http.Handler) func() { + t.Helper() + minimax.SetHTTPClient(&http.Client{Transport: handlerRoundTripper{handler: handler}}) + return func() { minimax.SetHTTPClient(nil) } +} + +func writeVoiceList(w http.ResponseWriter, system []minimax.Voice) { + resp := struct { + SystemVoice []minimax.Voice `json:"system_voice"` + VoiceCloning []minimax.Voice `json:"voice_cloning"` + VoiceGeneration []minimax.Voice `json:"voice_generation"` + BaseResp *minimax.BaseResp `json:"base_resp"` + }{ + SystemVoice: system, + BaseResp: &minimax.BaseResp{StatusCode: 0, StatusMsg: "success"}, + } + _ = json.NewEncoder(w).Encode(resp) +} + +func writeStreamResponse(w http.ResponseWriter, payload string) { + w.Header().Set("Content-Type", "text/event-stream") + item := minimax.TTSResponse{ + Data: &minimax.TTSResponseData{Audio: hex.EncodeToString([]byte(payload)), Status: 1}, + BaseResp: &minimax.BaseResp{StatusCode: 0, StatusMsg: "success"}, + } + b, _ := json.Marshal(item) + _, _ = fmt.Fprintf(w, "data: %s\n\n", string(b)) +} + +func writeJSONResponse(w http.ResponseWriter, payload string) { + item := minimax.TTSResponse{ + Data: &minimax.TTSResponseData{Audio: hex.EncodeToString([]byte(payload)), Status: 2}, + BaseResp: &minimax.BaseResp{StatusCode: 0, StatusMsg: "success"}, + } + _ = json.NewEncoder(w).Encode(item) +} diff --git a/cmd/voices.go b/cmd/voices.go index 984292e..29499a0 100644 --- a/cmd/voices.go +++ b/cmd/voices.go @@ -8,7 +8,7 @@ import ( "text/tabwriter" "time" - "github.com/steipete/sag/internal/elevenlabs" + "github.com/steipete/sag/internal/minimax" "github.com/spf13/cobra" ) @@ -25,16 +25,16 @@ func init() { cmd := &cobra.Command{ Use: "voices", - Short: "List available ElevenLabs voices", + Short: "List available MiniMax voices", PreRunE: func(_ *cobra.Command, _ []string) error { return ensureAPIKey() }, RunE: func(cmd *cobra.Command, _ []string) error { - client := elevenlabs.NewClient(cfg.APIKey, cfg.BaseURL) + client := minimax.NewClient(cfg.APIKey, cfg.BaseURL) ctx, cancel := context.WithTimeout(cmd.Context(), 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx) + voices, err := client.ListVoices(ctx, "") if err != nil { return err } @@ -51,7 +51,7 @@ func init() { return err } for _, v := range voices { - if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, v.Name, v.Category); err != nil { + if _, err := fmt.Fprintf(w, "%s\t%s\t%s\n", v.VoiceID, voiceLabel(v), v.Category); err != nil { return err } } @@ -59,16 +59,16 @@ func init() { }, } - cmd.Flags().StringVar(&opts.search, "search", "", "Filter voices by name (client-side)") + cmd.Flags().StringVar(&opts.search, "search", "", "Filter voices by name or ID (client-side)") cmd.Flags().IntVar(&opts.limit, "limit", opts.limit, "Maximum rows to display (0 = all)") rootCmd.AddCommand(cmd) } -func filterVoicesByName(voices []elevenlabs.Voice, search string) []elevenlabs.Voice { +func filterVoicesByName(voices []minimax.Voice, search string) []minimax.Voice { searchLower := strings.ToLower(search) - filtered := make([]elevenlabs.Voice, 0, len(voices)) + filtered := make([]minimax.Voice, 0, len(voices)) for _, v := range voices { - if strings.Contains(strings.ToLower(v.Name), searchLower) { + if strings.Contains(strings.ToLower(voiceLabel(v)), searchLower) || strings.Contains(strings.ToLower(v.VoiceID), searchLower) { filtered = append(filtered, v) } } diff --git a/cmd/voices_test.go b/cmd/voices_test.go index ab45d75..431ab80 100644 --- a/cmd/voices_test.go +++ b/cmd/voices_test.go @@ -4,24 +4,23 @@ import ( "bytes" "io" "net/http" - "net/http/httptest" "os" "testing" - "github.com/steipete/sag/internal/elevenlabs" + "github.com/steipete/sag/internal/minimax" ) func TestVoicesCommand(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/v1/voices" { + restoreHTTP := withMinimaxHandler(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/get_voice" { t.Fatalf("unexpected path: %s", r.URL.Path) } - _, _ = w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Alpha","category":"premade"}]}`)) + _, _ = w.Write([]byte(`{"system_voice":[{"voice_id":"id1","voice_name":"Alpha"}],"voice_cloning":[],"voice_generation":[],"base_resp":{"status_code":0,"status_msg":"success"}}`)) })) - defer srv.Close() + defer restoreHTTP() cfg.APIKey = "key" - cfg.BaseURL = srv.URL + cfg.BaseURL = "http://minimax.test" restore, readOut := captureStdoutVoices(t) defer restore() @@ -42,14 +41,14 @@ func TestVoicesCommand(t *testing.T) { // reset args to avoid polluting other tests rootCmd.SetArgs(nil) - _ = os.Unsetenv("ELEVENLABS_API_KEY") + _ = os.Unsetenv("MINIMAX_API_KEY") } func TestFilterVoicesByName(t *testing.T) { - voices := []elevenlabs.Voice{ - {VoiceID: "id1", Name: "Sarah"}, - {VoiceID: "id2", Name: "Roger - Casual"}, - {VoiceID: "id3", Name: "ROGUE"}, + voices := []minimax.Voice{ + {VoiceID: "id1", VoiceName: "Sarah"}, + {VoiceID: "id2", VoiceName: "Roger - Casual"}, + {VoiceID: "id3", VoiceName: "ROGUE"}, } filtered := filterVoicesByName(voices, "rog") diff --git a/docs/spec.md b/docs/spec.md index a8d9636..730ed9a 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -1,11 +1,11 @@ # sag specification -CLI that mirrors macOS `say` but uses ElevenLabs for synthesis. Defaults to streaming directly to speakers and can also write audio files. +CLI that mirrors macOS `say` but uses MiniMax for synthesis. Defaults to streaming directly to speakers and can also write audio files. ## Runtime & deps - Go 1.24+ - Playback uses built-in Go audio (go-mp3 + oto) and should work on macOS/Linux/Windows with a default output device. -- Auth via `ELEVENLABS_API_KEY` (or `--api-key` flag). +- Auth via `MINIMAX_API_KEY` (or `--api-key` flag). ## Commands @@ -13,30 +13,28 @@ CLI that mirrors macOS `say` but uses ElevenLabs for synthesis. Defaults to stre - Text input: pass as args, `-f/--input-file` (use `-` for stdin), or pipe stdin. - macOS `say` compatibility: - `-v/--voice` accepts voice **name** or ID; `?` lists voices. - - `-r/--rate` words-per-minute (default 175) maps to ElevenLabs speed. + - `-r/--rate` words-per-minute (default 175) maps to speed. - `-o/--output` same meaning; format inferred by extension when possible. - Accepts but ignores `--progress`, `--audio-device`, `--network-send`, `--interactive`, `--file-format`, `--data-format`, `--channels`, `--bit-rate`, `--quality`. -- Required: voice (via `-v/--voice` or `ELEVENLABS_VOICE_ID`/`SAG_VOICE_ID`). +- Required: voice (via `-v/--voice` or `MINIMAX_VOICE_ID`/`SAG_VOICE_ID`). - Flags: - - `--model-id` (default `eleven_v3`; common: `eleven_multilingual_v2`, `eleven_flash_v2_5`, `eleven_turbo_v2_5`) - - `--format` (default `mp3_44100_128`; `.wav` infers `pcm_44100`) + - `--model-id` (default `speech-01`) + - `--format` (default `mp3_44100_128`; `.wav` infers `wav`) - `--stream/--no-stream` (default stream) - - `--latency-tier` (0-4, default 0) - `--play/--no-play` (default play) - `--speed` (0.5–2.0, default 1.0; >1.0 speaks faster) - - `--stability` (0..1; when set) - - `--similarity` / `--similarity-boost` (0..1; when set) - - `--style` (0..1; when set) - - `--speaker-boost` / `--no-speaker-boost` - - `--seed` (0..4294967295; when set) + - `--emotion` (model dependent) + - `--pitch` (model dependent) + - `--volume` (model dependent) - `--normalize` (`auto|on|off`; when set) - - `--lang` (2-letter ISO 639-1; when set) + - `--lang` (language boost hint; when set) - `--metrics` print basic stats to stderr - `--output ` save audio while optionally playing - Behavior: - - Streaming path calls `POST /v1/text-to-speech/{voice_id}/stream` with JSON body. - - Non-streaming path calls `POST /v1/text-to-speech/{voice_id}` and then plays/saves. + - Streaming path calls `POST /v1/t2a_v2` with `stream=true`. + - Non-streaming path calls `POST /v1/t2a_v2` with `stream=false` and then plays/saves. - Errors if neither playback nor output is selected. + - Playback supports MP3 output only. Usage examples: ``` @@ -44,14 +42,13 @@ sag speak --voice-id VOICE_ID "Hello world" echo "piped input" | sag speak --voice-id VOICE_ID sag speak --voice-id VOICE_ID --output out.mp3 --no-play sag speak --voice-id VOICE_ID --speed 1.15 "Talk a bit faster" -sag speak --voice-id VOICE_ID --stream --latency-tier 3 "Faster start" sag speak -v "Roger" -r 200 "mac say style flags" ``` ### `sag voices` -- Lists voices via `GET /v1/voices` (server-side search when supported). +- Lists voices via `POST /v1/get_voice`. - Flags: - - `--search `: filter by name + - `--search `: filter by name/id - `--limit `: truncate output (default 100) Sample: @@ -64,9 +61,9 @@ sag voices --search "english" - Does not require an API key. ## Config sources -- `ELEVENLABS_API_KEY` for auth (required). -- Default voice env: `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID`. -- `--base-url` flag for alternate API host (defaults to `https://api.elevenlabs.io`). +- `MINIMAX_API_KEY` for auth (required; fallback `SAG_API_KEY`). +- Default voice env: `MINIMAX_VOICE_ID` or `SAG_VOICE_ID`. +- `--base-url` flag for alternate API host (defaults to `https://api.minimax.io`). ## Notes & future polish - Add cross-platform playback backends. diff --git a/internal/elevenlabs/client_test.go b/internal/elevenlabs/client_test.go index 7bd310e..6533e7a 100644 --- a/internal/elevenlabs/client_test.go +++ b/internal/elevenlabs/client_test.go @@ -19,16 +19,15 @@ func TestNewClientDefaultsBase(t *testing.T) { } func TestListVoices(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.URL.Path != "/v1/voices" { t.Fatalf("unexpected path: %s", r.URL.Path) } w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"voices":[{"voice_id":"id1","name":"Sarah","category":"premade"},{"voice_id":"id2","name":"Roger","category":"premade"}]}`)) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) voices, err := c.ListVoices(context.Background()) if err != nil { t.Fatalf("ListVoices error: %v", err) @@ -39,7 +38,7 @@ func TestListVoices(t *testing.T) { } func TestStreamTTS(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if !strings.Contains(r.URL.Path, "/v1/text-to-speech/voice123/stream") { t.Fatalf("unexpected path: %s", r.URL.Path) } @@ -47,10 +46,9 @@ func TestStreamTTS(t *testing.T) { t.Fatalf("missing Accept header") } _, _ = w.Write([]byte("audio-data")) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) rc, err := c.StreamTTS(context.Background(), "voice123", TTSRequest{Text: "hi"}, 0) if err != nil { t.Fatalf("StreamTTS error: %v", err) @@ -70,7 +68,7 @@ func TestStreamTTS_PayloadFields(t *testing.T) { speed := 1.1 seed := uint32(0) - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { var got map[string]any if err := json.NewDecoder(r.Body).Decode(&got); err != nil { t.Fatalf("decode body: %v", err) @@ -107,10 +105,9 @@ func TestStreamTTS_PayloadFields(t *testing.T) { } _, _ = w.Write([]byte("ok")) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) _, err := c.StreamTTS(context.Background(), "voice123", TTSRequest{ Text: "hi", ModelID: "eleven_multilingual_v2", @@ -132,12 +129,11 @@ func TestStreamTTS_PayloadFields(t *testing.T) { } func TestStreamTTS_Error(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { http.Error(w, "nope", http.StatusBadRequest) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) _, err := c.StreamTTS(context.Background(), "voice123", TTSRequest{Text: "hi"}, 0) if err == nil || !strings.Contains(err.Error(), "400") { t.Fatalf("expected 400 error, got %v", err) @@ -145,15 +141,14 @@ func TestStreamTTS_Error(t *testing.T) { } func TestConvertTTS(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if path.Base(r.URL.Path) != "voice123" { t.Fatalf("unexpected path: %s", r.URL.Path) } _, _ = w.Write([]byte("full-audio")) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) data, err := c.ConvertTTS(context.Background(), "voice123", TTSRequest{Text: "hello"}) if err != nil { t.Fatalf("ConvertTTS error: %v", err) @@ -164,14 +159,31 @@ func TestConvertTTS(t *testing.T) { } func TestConvertTTS_Error(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { http.Error(w, "fail", http.StatusInternalServerError) - })) - defer srv.Close() + }) - c := NewClient("key", srv.URL) + c := newClientWithHandler(handler) _, err := c.ConvertTTS(context.Background(), "voice123", TTSRequest{Text: "hello"}) if err == nil || !strings.Contains(err.Error(), "500") { t.Fatalf("expected 500 error, got %v", err) } } + +type handlerRoundTripper struct { + handler http.Handler +} + +func (rt handlerRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + rr := httptest.NewRecorder() + rt.handler.ServeHTTP(rr, req) + res := rr.Result() + res.Request = req + return res, nil +} + +func newClientWithHandler(handler http.Handler) *Client { + c := NewClient("key", "http://eleven.test") + c.httpClient = &http.Client{Transport: handlerRoundTripper{handler: handler}} + return c +} diff --git a/internal/minimax/client.go b/internal/minimax/client.go new file mode 100644 index 0000000..edcc1e3 --- /dev/null +++ b/internal/minimax/client.go @@ -0,0 +1,389 @@ +package minimax + +import ( + "bufio" + "bytes" + "context" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "path" + "strings" + "time" +) + +const defaultBaseURL = "https://api.minimax.io" + +// Client talks to the MiniMax HTTP API. +type Client struct { + baseURL string + apiKey string + httpClient *http.Client +} + +// NewClient returns a Client configured with the given API key and base URL. +func NewClient(apiKey, baseURL string) *Client { + if baseURL == "" { + baseURL = defaultBaseURL + } + hc := defaultHTTPClient + if hc == nil { + hc = &http.Client{ + Timeout: 60 * time.Second, + } + } + return &Client{ + baseURL: baseURL, + apiKey: apiKey, + httpClient: hc, + } +} + +// defaultHTTPClient allows tests to override the HTTP transport. +var defaultHTTPClient *http.Client + +// SetHTTPClient overrides the HTTP client used by new MiniMax clients. +func SetHTTPClient(c *http.Client) { + defaultHTTPClient = c +} + +// BaseResp contains MiniMax API status info. +type BaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` +} + +// Voice represents a voice entry returned by MiniMax. +type Voice struct { + VoiceID string `json:"voice_id"` + VoiceName string `json:"voice_name,omitempty"` + Description []string `json:"description,omitempty"` + CreatedTime string `json:"created_time,omitempty"` + Category string `json:"-"` +} + +type getVoiceRequest struct { + VoiceType string `json:"voice_type"` +} + +type getVoiceResponse struct { + SystemVoice []Voice `json:"system_voice"` + VoiceCloning []Voice `json:"voice_cloning"` + VoiceGeneration []Voice `json:"voice_generation"` + BaseResp *BaseResp `json:"base_resp,omitempty"` +} + +// ListVoices fetches available voices. voiceType should be one of: +// system, voice_cloning, voice_generation, all. +func (c *Client) ListVoices(ctx context.Context, voiceType string) ([]Voice, error) { + if voiceType == "" { + voiceType = "all" + } + reqBody := getVoiceRequest{VoiceType: voiceType} + resp, err := c.doJSON(ctx, http.MethodPost, "/v1/get_voice", reqBody, "application/json") + if err != nil { + return nil, err + } + defer func() { + _ = resp.Body.Close() + }() + + if resp.StatusCode >= 400 { + return nil, fmt.Errorf("list voices failed: %s", resp.Status) + } + + var body getVoiceResponse + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return nil, err + } + if err := baseRespError(body.BaseResp); err != nil { + return nil, err + } + + voices := make([]Voice, 0, len(body.SystemVoice)+len(body.VoiceCloning)+len(body.VoiceGeneration)) + appendCategory := func(list []Voice, category string) { + for _, v := range list { + v.Category = category + voices = append(voices, v) + } + } + appendCategory(body.SystemVoice, "system") + appendCategory(body.VoiceCloning, "voice_cloning") + appendCategory(body.VoiceGeneration, "voice_generation") + return voices, nil +} + +// TTSRequest configures a text-to-audio request payload. +type TTSRequest struct { + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream,omitempty"` + StreamOptions *StreamOptions `json:"stream_options,omitempty"` + VoiceSetting *VoiceSetting `json:"voice_setting,omitempty"` + AudioSetting *AudioSetting `json:"audio_setting,omitempty"` + OutputFormat string `json:"output_format,omitempty"` + LanguageBoost string `json:"language_boost,omitempty"` + SubtitleEnable *bool `json:"subtitle_enable,omitempty"` + ContinuousSound *bool `json:"continuous_sound,omitempty"` +} + +// StreamOptions tunes streaming output behavior. +type StreamOptions struct { + ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"` +} + +// VoiceSetting controls synthesis parameters. +type VoiceSetting struct { + VoiceID string `json:"voice_id"` + Speed *float64 `json:"speed,omitempty"` + Vol *float64 `json:"vol,omitempty"` + Pitch *int `json:"pitch,omitempty"` + Emotion string `json:"emotion,omitempty"` + TextNormalization *bool `json:"text_normalization,omitempty"` +} + +// AudioSetting configures the generated audio format. +type AudioSetting struct { + SampleRate *int `json:"sample_rate,omitempty"` + Bitrate *int `json:"bitrate,omitempty"` + Format string `json:"format,omitempty"` + Channel *int `json:"channel,omitempty"` + ForceCBR *bool `json:"force_cbr,omitempty"` +} + +// TTSResponse contains audio data or stream chunks. +type TTSResponse struct { + Data *TTSResponseData `json:"data"` + TraceID string `json:"trace_id"` + ExtraInfo *TTSExtraInfo `json:"extra_info,omitempty"` + BaseResp *BaseResp `json:"base_resp"` +} + +// TTSResponseData holds audio chunk data. +type TTSResponseData struct { + Audio string `json:"audio"` + SubtitleFile string `json:"subtitle_file,omitempty"` + Status int `json:"status"` +} + +// TTSExtraInfo contains optional metadata about synthesized audio. +type TTSExtraInfo struct { + AudioLength int `json:"audio_length,omitempty"` + AudioSampleRate int `json:"audio_sample_rate,omitempty"` + AudioSize int `json:"audio_size,omitempty"` + Bitrate int `json:"bitrate,omitempty"` + AudioFormat string `json:"audio_format,omitempty"` + AudioChannel int `json:"audio_channel,omitempty"` + InvisibleCharacterRatio float64 `json:"invisible_character_ratio,omitempty"` + UsageCharacters int `json:"usage_characters,omitempty"` + WordCount int `json:"word_count,omitempty"` +} + +// StreamTTS requests streaming audio for text-to-audio. +func (c *Client) StreamTTS(ctx context.Context, payload TTSRequest) (io.ReadCloser, error) { + resp, err := c.doJSON(ctx, http.MethodPost, "/v1/t2a_v2", payload, "text/event-stream") + if err != nil { + return nil, err + } + if resp.StatusCode >= 400 { + defer func() { + _ = resp.Body.Close() + }() + b, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("stream TTS failed: %s: %s", resp.Status, string(b)) + } + + pr, pw := io.Pipe() + stream := &ttsStream{PipeReader: pr, respBody: resp.Body} + contentType := resp.Header.Get("Content-Type") + go func() { + defer func() { + _ = resp.Body.Close() + }() + if strings.Contains(contentType, "text/event-stream") { + pipeErr := streamFromSSE(resp.Body, pw) + _ = pw.CloseWithError(pipeErr) + return + } + pipeErr := streamFromJSON(resp.Body, pw) + _ = pw.CloseWithError(pipeErr) + }() + return stream, nil +} + +// ConvertTTS downloads the full audio before returning. +func (c *Client) ConvertTTS(ctx context.Context, payload TTSRequest) ([]byte, error) { + resp, err := c.doJSON(ctx, http.MethodPost, "/v1/t2a_v2", payload, "application/json") + if err != nil { + return nil, err + } + defer func() { + _ = resp.Body.Close() + }() + + if resp.StatusCode >= 400 { + b, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("convert TTS failed: %s: %s", resp.Status, string(b)) + } + + var body TTSResponse + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return nil, err + } + if err := baseRespError(body.BaseResp); err != nil { + return nil, err + } + if body.Data == nil || body.Data.Audio == "" { + return nil, errors.New("TTS response missing audio data") + } + return decodeHexAudio(body.Data.Audio) +} + +type ttsStream struct { + *io.PipeReader + respBody io.ReadCloser +} + +func (s *ttsStream) Close() error { + _ = s.respBody.Close() + return s.PipeReader.Close() +} + +func (c *Client) doJSON(ctx context.Context, method, endpoint string, payload any, accept string) (*http.Response, error) { + u, err := url.Parse(c.baseURL) + if err != nil { + return nil, err + } + u.Path = path.Join(u.Path, endpoint) + + var body io.Reader + if payload != nil { + b, err := json.Marshal(payload) + if err != nil { + return nil, err + } + body = bytes.NewReader(b) + } + + req, err := http.NewRequestWithContext(ctx, method, u.String(), body) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/json") + if accept != "" { + req.Header.Set("Accept", accept) + } + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", c.apiKey)) + + return c.httpClient.Do(req) +} + +func baseRespError(base *BaseResp) error { + if base == nil { + return nil + } + if base.StatusCode != 0 { + return fmt.Errorf("minimax error %d: %s", base.StatusCode, base.StatusMsg) + } + return nil +} + +func streamFromSSE(r io.Reader, w io.Writer) error { + reader := bufio.NewReader(r) + var dataLines []string + flush := func() error { + if len(dataLines) == 0 { + return nil + } + payload := strings.Join(dataLines, "\n") + dataLines = dataLines[:0] + return handleStreamPayload(payload, w) + } + for { + line, err := reader.ReadString('\n') + if err != nil { + if err == io.EOF { + return flush() + } + return err + } + line = strings.TrimRight(line, "\r\n") + if line == "" { + if err := flush(); err != nil { + return err + } + continue + } + if strings.HasPrefix(line, "data:") { + dataLines = append(dataLines, strings.TrimSpace(strings.TrimPrefix(line, "data:"))) + } + } +} + +func streamFromJSON(r io.Reader, w io.Writer) error { + payload, err := io.ReadAll(r) + if err != nil { + return err + } + payload = bytes.TrimSpace(payload) + if len(payload) == 0 { + return nil + } + + var items []TTSResponse + if err := json.Unmarshal(payload, &items); err == nil { + for _, item := range items { + if err := handleStreamResponse(item, w); err != nil { + return err + } + } + return nil + } + + var item TTSResponse + if err := json.Unmarshal(payload, &item); err != nil { + return err + } + return handleStreamResponse(item, w) +} + +func handleStreamPayload(payload string, w io.Writer) error { + if payload == "" || payload == "[DONE]" { + return nil + } + var item TTSResponse + if err := json.Unmarshal([]byte(payload), &item); err != nil { + return err + } + return handleStreamResponse(item, w) +} + +func handleStreamResponse(item TTSResponse, w io.Writer) error { + if err := baseRespError(item.BaseResp); err != nil { + return err + } + if item.Data == nil || item.Data.Audio == "" { + return nil + } + audio, err := decodeHexAudio(item.Data.Audio) + if err != nil { + return err + } + _, err = w.Write(audio) + return err +} + +func decodeHexAudio(value string) ([]byte, error) { + if value == "" { + return nil, nil + } + data, err := hex.DecodeString(value) + if err != nil { + return nil, fmt.Errorf("decode audio hex: %w", err) + } + return data, nil +} diff --git a/internal/minimax/doc.go b/internal/minimax/doc.go new file mode 100644 index 0000000..37389d7 --- /dev/null +++ b/internal/minimax/doc.go @@ -0,0 +1,2 @@ +// Package minimax provides a small client for the MiniMax HTTP API. +package minimax From 14555c344861be2b0eb2266249dbfa2de97fd858 Mon Sep 17 00:00:00 2001 From: Friedas ShitMac Date: Sat, 24 Jan 2026 20:06:46 +0100 Subject: [PATCH 2/2] Add MiniMax voice category filter; make test helper test-only --- README.md | 5 +- ...http_testutil.go => http_testutil_test.go} | 0 cmd/speak.go | 66 ++++++++++++------- cmd/speak_test.go | 14 ++-- cmd/voices.go | 16 +++-- docs/spec.md | 4 +- 6 files changed, 68 insertions(+), 37 deletions(-) rename cmd/{http_testutil.go => http_testutil_test.go} (100%) diff --git a/README.md b/README.md index 4d880ec..1e45ba0 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ sag speak -v "Your Voice" --output out.wav --format wav "Wave output" Key flags (subset): - `-v, --voice` voice name or ID (`?` to list) +- `--voice-category` MiniMax voice category (`system|voice_cloning|voice_generation|all`) - `--api-key-file` read API key from a file - `-r, --rate` words per minute (maps to speed; default 175) - `-f, --input-file` read text from file (`-` for stdin) @@ -64,7 +65,7 @@ Key flags (subset): - `--emotion` (model dependent) - `--pitch` (model dependent) - `--volume` (model dependent) -- `--normalize` `auto|on|off` (text normalization; when set) +- `--normalize` `auto|on|off` (auto uses server default text normalization) - `--lang` language boost hint (e.g. `en`, `zh`, `auto`) - `--format` output format (e.g. `mp3_44100_128`, `mp3`, `wav`) - `--stream/--no-stream` stream while generating (default on) @@ -73,7 +74,7 @@ Key flags (subset): Voices: ```bash -sag voices --search english --limit 20 +sag voices --category system --search english --limit 20 ``` ## Prompting (make it sound better) diff --git a/cmd/http_testutil.go b/cmd/http_testutil_test.go similarity index 100% rename from cmd/http_testutil.go rename to cmd/http_testutil_test.go diff --git a/cmd/speak.go b/cmd/speak.go index 563fdbc..e36983f 100644 --- a/cmd/speak.go +++ b/cmd/speak.go @@ -19,18 +19,19 @@ import ( ) type speakOptions struct { - voiceID string - modelID string - outputPath string - outputFmt string - stream bool - play bool - speed float64 - rateWPM int - inputFile string - normalize string - lang string - metrics bool + voiceID string + voiceCategory string + modelID string + outputPath string + outputFmt string + stream bool + play bool + speed float64 + rateWPM int + inputFile string + normalize string + lang string + metrics bool emotion string pitch int @@ -43,11 +44,12 @@ var playToSpeakers = audio.StreamToSpeakers func init() { opts := speakOptions{ - modelID: "speech-01", - outputFmt: "mp3_44100_128", - stream: true, - play: true, - speed: 1.0, + modelID: "speech-01", + outputFmt: "mp3_44100_128", + stream: true, + play: true, + speed: 1.0, + voiceCategory: "all", } cmd := &cobra.Command{ @@ -76,7 +78,12 @@ func init() { } client := minimax.NewClient(cfg.APIKey, cfg.BaseURL) - voiceID, err := resolveVoice(cmd.Context(), client, voiceInput, forceVoiceID) + category, err := normalizeVoiceCategory(opts.voiceCategory) + if err != nil { + return err + } + + voiceID, err := resolveVoice(cmd.Context(), client, voiceInput, category, forceVoiceID) if err != nil { return err } @@ -145,6 +152,7 @@ func init() { cmd.Flags().StringVar(&opts.voiceID, "voice-id", "", "Voice ID to use (MINIMAX_VOICE_ID)") cmd.Flags().StringVarP(&opts.voiceID, "voice", "v", "", "Alias for --voice-id; accepts name or ID; use '?' to list voices") + cmd.Flags().StringVar(&opts.voiceCategory, "voice-category", opts.voiceCategory, "Voice category to query (system|voice_cloning|voice_generation|all)") cmd.Flags().StringVar(&opts.modelID, "model-id", opts.modelID, "Model ID (default: speech-01). See MiniMax docs for available models.") cmd.Flags().StringVarP(&opts.outputPath, "output", "o", "", "Write audio to file (disables playback unless --play is also set)") cmd.Flags().StringVar(&opts.outputFmt, "format", opts.outputFmt, "Output format (e.g. mp3_44100_128)") @@ -152,7 +160,7 @@ func init() { cmd.Flags().BoolVar(&opts.play, "play", opts.play, "Play audio through speakers") cmd.Flags().Float64Var(&opts.speed, "speed", opts.speed, "Speech speed multiplier (e.g. 1.1 faster, 0.9 slower)") cmd.Flags().IntVarP(&opts.rateWPM, "rate", "r", 0, "macOS say-style words-per-minute; overrides --speed when set (default 175 wpm)") - cmd.Flags().StringVar(&opts.normalize, "normalize", "", "Text normalization: auto|on|off (numbers/units/URLs; when set)") + cmd.Flags().StringVar(&opts.normalize, "normalize", "", "Text normalization: auto|on|off (auto = server default; when set)") cmd.Flags().StringVar(&opts.lang, "lang", "", "Language boost hint (e.g. en, zh, auto; when set)") cmd.Flags().StringVar(&opts.emotion, "emotion", "", "Emotion hint (model dependent; e.g. neutral, happy, sad)") cmd.Flags().IntVar(&opts.pitch, "pitch", 0, "Pitch adjustment (model dependent; when set)") @@ -385,12 +393,12 @@ func convertAndPlay(ctx context.Context, client *minimax.Client, opts speakOptio return n, nil } -func resolveVoice(ctx context.Context, client *minimax.Client, voiceInput string, forceID bool) (string, error) { +func resolveVoice(ctx context.Context, client *minimax.Client, voiceInput, category string, forceID bool) (string, error) { voiceInput = strings.TrimSpace(voiceInput) if voiceInput == "" { ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx, "") + voices, err := client.ListVoices(ctx, category) if err != nil { return "", fmt.Errorf("voice not specified and failed to fetch voices: %w", err) } @@ -403,7 +411,7 @@ func resolveVoice(ctx context.Context, client *minimax.Client, voiceInput string if voiceInput == "?" { ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx, "") + voices, err := client.ListVoices(ctx, category) if err != nil { return "", err } @@ -428,7 +436,7 @@ func resolveVoice(ctx context.Context, client *minimax.Client, voiceInput string ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx, "") + voices, err := client.ListVoices(ctx, category) if err != nil { return "", err } @@ -548,3 +556,15 @@ func isSupportedFormat(format string) bool { return false } } + +func normalizeVoiceCategory(value string) (string, error) { + v := strings.ToLower(strings.TrimSpace(value)) + switch v { + case "", "all": + return "all", nil + case "system", "voice_cloning", "voice_generation": + return v, nil + default: + return "", fmt.Errorf("invalid voice category %q (expected system|voice_cloning|voice_generation|all)", value) + } +} diff --git a/cmd/speak_test.go b/cmd/speak_test.go index e65abc8..7c6758e 100644 --- a/cmd/speak_test.go +++ b/cmd/speak_test.go @@ -145,7 +145,7 @@ func TestResolveVoiceDefaultsToFirst(t *testing.T) { defer restoreHTTP() client := minimax.NewClient("key", "http://minimax.test") - id, err := resolveVoice(context.Background(), client, "", false) + id, err := resolveVoice(context.Background(), client, "", "all", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } @@ -162,7 +162,7 @@ func TestResolveVoiceForceIDPassThrough(t *testing.T) { client := minimax.NewClient("key", "http://minimax.test") input := "custom-voice-id" - id, err := resolveVoice(context.Background(), client, input, true) + id, err := resolveVoice(context.Background(), client, input, "all", true) if err != nil { t.Fatalf("resolveVoice error: %v", err) } @@ -178,7 +178,7 @@ func TestResolveVoiceExactIDMatch(t *testing.T) { defer restoreHTTP() client := minimax.NewClient("key", "http://minimax.test") - id, err := resolveVoice(context.Background(), client, "voice-123", false) + id, err := resolveVoice(context.Background(), client, "voice-123", "all", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } @@ -194,7 +194,7 @@ func TestResolveVoiceNameMatch(t *testing.T) { defer restoreHTTP() client := minimax.NewClient("key", "http://minimax.test") - id, err := resolveVoice(context.Background(), client, "roger", false) + id, err := resolveVoice(context.Background(), client, "roger", "all", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } @@ -213,7 +213,7 @@ func TestResolveVoicePartialMatch(t *testing.T) { defer restore() client := minimax.NewClient("key", "http://minimax.test") - id, err := resolveVoice(context.Background(), client, "roger", false) + id, err := resolveVoice(context.Background(), client, "roger", "all", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } @@ -232,7 +232,7 @@ func TestResolveVoiceNoMatch(t *testing.T) { defer restoreHTTP() client := minimax.NewClient("key", "http://minimax.test") - _, err := resolveVoice(context.Background(), client, "nothing-match", false) + _, err := resolveVoice(context.Background(), client, "nothing-match", "all", false) if err == nil { t.Fatalf("expected error for non-matching voice") } @@ -251,7 +251,7 @@ func TestResolveVoiceListOutputsTable(t *testing.T) { defer restore() client := minimax.NewClient("key", "http://minimax.test") - id, err := resolveVoice(context.Background(), client, "?", false) + id, err := resolveVoice(context.Background(), client, "?", "all", false) if err != nil { t.Fatalf("resolveVoice error: %v", err) } diff --git a/cmd/voices.go b/cmd/voices.go index 29499a0..72ae316 100644 --- a/cmd/voices.go +++ b/cmd/voices.go @@ -14,13 +14,15 @@ import ( ) type voicesOptions struct { - search string - limit int + search string + limit int + category string } func init() { opts := voicesOptions{ - limit: 100, + limit: 100, + category: "all", } cmd := &cobra.Command{ @@ -34,7 +36,12 @@ func init() { ctx, cancel := context.WithTimeout(cmd.Context(), 30*time.Second) defer cancel() - voices, err := client.ListVoices(ctx, "") + category, err := normalizeVoiceCategory(opts.category) + if err != nil { + return err + } + + voices, err := client.ListVoices(ctx, category) if err != nil { return err } @@ -61,6 +68,7 @@ func init() { cmd.Flags().StringVar(&opts.search, "search", "", "Filter voices by name or ID (client-side)") cmd.Flags().IntVar(&opts.limit, "limit", opts.limit, "Maximum rows to display (0 = all)") + cmd.Flags().StringVar(&opts.category, "category", opts.category, "Server-side voice category: system|voice_cloning|voice_generation|all") rootCmd.AddCommand(cmd) } diff --git a/docs/spec.md b/docs/spec.md index 730ed9a..a1e7b73 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -13,6 +13,7 @@ CLI that mirrors macOS `say` but uses MiniMax for synthesis. Defaults to streami - Text input: pass as args, `-f/--input-file` (use `-` for stdin), or pipe stdin. - macOS `say` compatibility: - `-v/--voice` accepts voice **name** or ID; `?` lists voices. + - `--voice-category` restricts voice lookup to a MiniMax category (`system|voice_cloning|voice_generation|all`). - `-r/--rate` words-per-minute (default 175) maps to speed. - `-o/--output` same meaning; format inferred by extension when possible. - Accepts but ignores `--progress`, `--audio-device`, `--network-send`, `--interactive`, `--file-format`, `--data-format`, `--channels`, `--bit-rate`, `--quality`. @@ -26,7 +27,7 @@ CLI that mirrors macOS `say` but uses MiniMax for synthesis. Defaults to streami - `--emotion` (model dependent) - `--pitch` (model dependent) - `--volume` (model dependent) - - `--normalize` (`auto|on|off`; when set) + - `--normalize` (`auto|on|off`; auto = server default) - `--lang` (language boost hint; when set) - `--metrics` print basic stats to stderr - `--output ` save audio while optionally playing @@ -50,6 +51,7 @@ sag speak -v "Roger" -r 200 "mac say style flags" - Flags: - `--search `: filter by name/id - `--limit `: truncate output (default 100) + - `--category`: server-side voice category (`system|voice_cloning|voice_generation|all`; default all) Sample: ```