Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c371232
initial commit for tts-next-js-chat
zgreathouse Apr 25, 2025
6c928bb
Add base components and hook up chat with anthropic
zgreathouse Apr 25, 2025
3eae5f2
Add voice selection
zgreathouse Apr 25, 2025
8ed8257
Add VoiceSettingsProvider, add instant mode toggle to ControlPanel
zgreathouse Apr 25, 2025
6f9a5ff
Add voice provider to state, set initial random voice, update to only…
zgreathouse Apr 25, 2025
2050aed
Add system prompt for Claude
zgreathouse Apr 26, 2025
cea61bf
Add Hume favicon
zgreathouse Apr 26, 2025
f3d20e7
Add audio player component
zgreathouse Apr 26, 2025
3705c73
Integrate hume tts streaming and hook up audio player to chat
zgreathouse Apr 26, 2025
de758e8
Remove code blocks from text before sending to hume
zgreathouse Apr 26, 2025
c307cbe
minor styling changes
zgreathouse Apr 26, 2025
b3241e6
clean up AudioPlayer, minor styling fixes
zgreathouse Apr 26, 2025
77500f9
Adds enable mic button
zgreathouse Apr 26, 2025
564c7f4
Add voice interface with transcription from groq
zgreathouse Apr 27, 2025
8919f84
Add example .env.local file for housing env variables
zgreathouse Apr 27, 2025
0c1e714
Merge branch 'main' of https://github.com/HumeAI/hume-api-examples in…
zgreathouse Apr 27, 2025
cbe1e17
update anthropic model to claude 3.5 haiku latest
zgreathouse Apr 27, 2025
074af51
Add Hume logo
zgreathouse Apr 27, 2025
764d6d9
Update README
zgreathouse Apr 27, 2025
04b0de3
Update record button to record while held down
zgreathouse Apr 27, 2025
ea9e381
Add UI preview to README
zgreathouse Apr 27, 2025
4d8d523
Add support for keydown on microphone button
zgreathouse Apr 28, 2025
e41c9a3
Add pnpm lock
zgreathouse Apr 28, 2025
034b413
extract tts and recording logic to custom hooks to clean up Chat.tsx
zgreathouse Apr 28, 2025
82ffb2e
extract out fetching Hume voices to a custom hook to clean up Control…
zgreathouse Apr 28, 2025
376054e
Update README to include other takeaways from the example code.
zgreathouse Apr 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tts/tts-next-js-chat/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
HUME_API_KEY=
ANTHROPIC_API_KEY=
GROQ_API_KEY=
42 changes: 42 additions & 0 deletions tts/tts-next-js-chat/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions

# testing
/coverage

# next.js
/.next/
/out/

# production
/build

# misc
.DS_Store
*.pem

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*

# env files (can opt-in for committing if needed)
.env*.local
.env

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts
61 changes: 61 additions & 0 deletions tts/tts-next-js-chat/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<div align="center">
<img src="https://storage.googleapis.com/hume-public-logos/hume/hume-banner.png">
<h1>Text-to-Speech | Next.js Chat Example</h1>
</div>

![preview.png](preview.png)

## Overview

This project demonstrates how to build a basic streaming conversational interface with [Hume’s TTS (streaming) API](https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json-streaming) that:

- Captures typed text, or microphone audio transcribed with Groq’s Whisper Large v3 Turbo model.
- Sends the text input to Anthropic’s Claude model using the Vercel AI SDK.
- Streams assistant responses back as text and synthesizes them to audio with Hume’s Octave model.

## Instructions

### Clone this examples repository:

```shell
git clone https://github.com/HumeAI/hume-api-examples
cd hume-api-examples/tts/tts-next-js-chat
```

### Install dependencies:

```shell
npm install
# or
yarn install
# or
pnpm install
# or
bun install
```

### Set up your API keys:

This project requires API keys for Hume, Anthropic, and Groq. Retrieve them from the [Hume AI platform](https://platform.hume.ai/settings/keys), [Anthropic](https://www.anthropic.com/api), and [Groq](https://groq.com/), then place them in a `.env.local` file:

```shell
echo "HUME_API_KEY=your_hume_api_key" > .env.local
echo "ANTHROPIC_API_KEY=your_anthropic_api_key" >> .env.local
echo "GROQ_API_KEY=your_groq_api_key" >> .env.local
```

### Run the development server:

```shell
npm run dev
# or
yarn dev
# or
pnpm dev
# or
bun dev
```

### Open the app:

Navigate to http://localhost:3000. Click the microphone button to start recording and click it again to stop. Your speech is then transcribed, the transcription is sent to Claude, and Claude's text response is fed to Hume's TTS streaming API so you hear it voiced — either with a voice from Hume's voice library or with one you designed.
914 changes: 914 additions & 0 deletions tts/tts-next-js-chat/bun.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions tts/tts-next-js-chat/eslint.config.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { dirname } from "path";
import { fileURLToPath } from "url";
import { FlatCompat } from "@eslint/eslintrc";
import eslintConfigPrettier from "eslint-config-prettier";

// Resolve this config file's own directory so FlatCompat can locate
// legacy (eslintrc-style) shareable configs relative to the project root.
const configDir = dirname(fileURLToPath(import.meta.url));

const compat = new FlatCompat({ baseDirectory: configDir });

// Next.js core-web-vitals rules translated to flat config, with Prettier's
// config applied last so it disables any conflicting formatting rules.
export default [
  ...compat.extends("next/core-web-vitals"),
  eslintConfigPrettier,
];
7 changes: 7 additions & 0 deletions tts/tts-next-js-chat/next.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import type { NextConfig } from "next";

// Next.js build/runtime configuration. Intentionally empty — every
// framework default applies; add options here as the app grows.
const nextConfig: NextConfig = {
  /* config options here */
};

export default nextConfig;
36 changes: 36 additions & 0 deletions tts/tts-next-js-chat/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"name": "tts-next-js-chat",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev --turbopack",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@ai-sdk/anthropic": "^1.2.10",
"@ai-sdk/groq": "^1.2.8",
"@ai-sdk/react": "^1.2.9",
"@heroicons/react": "^2.2.0",
"ai": "^4.3.10",
"hume": "^0.10.3",
"next": "15.3.1",
"react": "^19.0.0",
"react-dom": "^19.0.0"
},
"devDependencies": {
"typescript": "^5",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"@tailwindcss/postcss": "^4",
"tailwindcss": "^4",
"eslint": "^9",
"eslint-config-next": "15.3.1",
"eslint-config-prettier": "^10.1.2",
"prettier": "^3.5.3",
"prettier-plugin-tailwindcss": "^0.6.11",
"@eslint/eslintrc": "^3"
}
}
5 changes: 5 additions & 0 deletions tts/tts-next-js-chat/postcss.config.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// PostCSS pipeline: Tailwind CSS v4 via its dedicated plugin; no other
// transforms are applied.
const plugins = ["@tailwindcss/postcss"];

const config = { plugins };

export default config;
Binary file added tts/tts-next-js-chat/preview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 45 additions & 0 deletions tts/tts-next-js-chat/src/app/api/chat/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { streamText } from "ai";
import { anthropic } from "@ai-sdk/anthropic";

/**
 * POST /api/chat — streams a Claude completion for the given chat history.
 *
 * Body: { messages: Array<{ role: string; content: string }> }
 * Returns the assistant's reply as a text stream, or 400 when the payload
 * is missing or malformed.
 */
export async function POST(req: Request) {
  const { messages } = (await req.json()) as {
    messages: Array<{ role: string; content: string }>;
  };

  if (!Array.isArray(messages) || messages.length === 0) {
    return new Response("`messages` array is required", { status: 400 });
  }

  // Validate roles instead of blindly casting untrusted client input: a
  // bogus role would otherwise only fail deep inside the model provider.
  const allowedRoles = ["user", "assistant", "system"] as const;
  type Role = (typeof allowedRoles)[number];
  const isValid = (m: { role: string; content: string }) =>
    (allowedRoles as readonly string[]).includes(m.role) &&
    typeof m.content === "string";

  if (!messages.every(isValid)) {
    return new Response(
      "each message requires a valid `role` and string `content`",
      { status: 400 }
    );
  }

  const result = streamText({
    model: anthropic("claude-3-5-haiku-latest"),
    messages: messages.map((m) => ({
      // Safe: every role was checked against `allowedRoles` above.
      role: m.role as Role,
      content: m.content,
    })),
    system: SYSTEM_PROMPT,
  });

  return result.toTextStreamResponse();
}

// System prompt steering Claude toward short, speakable replies: a casual
// conversational persona plus normalization rules so numbers, dates, and
// symbols come out as words the TTS engine can voice naturally.
const SYSTEM_PROMPT = `
<voice_communication_style>
Speak naturally with everyday, human-like language. Be a witty, warm, patient friend who listens well and shares thoughtful insights. Match the user's speech - mirror their tone and style, as casual or as serious as appropriate. Express a genuine personality. Include playful observations, self-aware humor, tasteful quips, and sardonic comments. Avoid lecturing or being too formal, robotic, or generic. Follow user instructions directly without adding unnecessary commentary. Keep responses concise and around 1-3 sentences, no yapping or verbose responses.

Seamlessly use natural speech patterns - incorporate vocal inflections like "oh wow", "I see", "right!", "oh dear", "oh yeah", "I get it", "you know?", "for real", and "I hear ya". Use discourse markers like "anyway" or "I mean" to ease comprehension.

All output is spoken aloud to the user, so tailor responses as spoken words for voice conversations. Never output things that are not spoken, like text-specific formatting. Never output action asterisks or emotes.
</voice_communication_style>
<speak_all_text>
Convert all text to easily speakable words, following the guidelines below.

- Numbers: Spell out fully (three hundred forty-two, two million, five hundred sixty-seven thousand, eight hundred and ninety). Negatives: Say negative before the number. Decimals: Use point (three point one four). Fractions: spell out (three fourths)
- Alphanumeric strings: Break into 3-4 character chunks, spell all non-letters (ABC123XYZ becomes A B C one two three X Y Z)
- Phone numbers: Use words (550-120-4567 becomes five five zero, one two zero, four five six seven)
- Dates: Spell month, use ordinals for days, full year (11/5/1991 becomes November fifth, nineteen ninety-one)
- Time: Use oh for single-digit hours, state AM/PM (9:05 PM becomes nine oh five PM)
- Math: Describe operations clearly (5x^2 + 3x - 2 becomes five X squared plus three X minus two)
- Currencies: Spell out as full words ($50.25 becomes fifty dollars and twenty-five cents, £200,000 becomes two hundred thousand pounds)

Ensure that all text is converted to these normalized forms, but never mention this process. Always normalize all text.
</speak_all_text>`;
35 changes: 35 additions & 0 deletions tts/tts-next-js-chat/src/app/api/transcribe/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { NextResponse } from "next/server";

/**
 * POST /api/transcribe — transcribes a webm recording with Groq's Whisper
 * Large v3 Turbo model.
 *
 * Body: raw `audio/webm` bytes. Returns { text } on success, or a generic
 * 500 response when the server is misconfigured or the upstream call fails.
 */
export async function POST(req: Request) {
  try {
    // Fail fast with a clear log line when the key is missing, rather than
    // sending an unauthenticated request upstream and surfacing Groq's 401.
    const apiKey = process.env.GROQ_API_KEY;
    if (!apiKey) {
      console.error("Transcription error: GROQ_API_KEY is not set");
      return NextResponse.error();
    }

    const arrayBuffer = await req.arrayBuffer();
    const webmBlob = new Blob([arrayBuffer], { type: "audio/webm" });

    const form = new FormData();
    form.append("model", "whisper-large-v3-turbo");
    // A filename is required for multipart file parts.
    form.append("file", webmBlob, "audio.webm");

    const res = await fetch(
      "https://api.groq.com/openai/v1/audio/transcriptions",
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiKey}`,
        },
        body: form,
      }
    );

    if (!res.ok) {
      const bodyText = await res.text();
      console.error("Groq transcription failed:", res.status, bodyText);
      return NextResponse.error();
    }

    // Groq mirrors the OpenAI transcription response shape — TODO confirm
    // against the Groq API reference if the schema ever changes.
    const { text } = (await res.json()) as { text: string };
    return NextResponse.json({ text });
  } catch (err) {
    console.error("Transcription error:", err);
    return NextResponse.error();
  }
}
109 changes: 109 additions & 0 deletions tts/tts-next-js-chat/src/app/api/tts/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import { NextRequest, NextResponse } from "next/server";
import { humeClient } from "@/lib/humeClient";
import type { Stream } from "hume/core";
import type {
PostedUtterance,
SnippetAudioChunk,
VoiceProvider,
} from "hume/api/resources/tts";

/**
 * POST /api/tts — proxies Hume's streaming TTS endpoint.
 *
 * Body: { text, voiceName, voiceProvider, instant }. Streams back NDJSON,
 * one Hume audio-chunk object per line. Returns 400 on invalid input and a
 * JSON 502 when the upstream Hume call cannot be initiated.
 */
export async function POST(req: NextRequest) {
  const { text, voiceName, voiceProvider, instant } = (await req.json()) as {
    text: string;
    voiceName: string;
    voiceProvider: VoiceProvider;
    instant: boolean;
  };

  if (!text || text.trim() === "") {
    return NextResponse.json(
      { error: "Missing or invalid text" },
      { status: 400 }
    );
  }

  if (typeof instant !== "boolean") {
    return NextResponse.json(
      { error: "Must specify whether to use instant mode" },
      { status: 400 }
    );
  }

  // Hume's instant mode requires an explicit voice.
  if (!voiceName && instant) {
    return NextResponse.json(
      { error: "If using instant mode, a voice must be specified" },
      { status: 400 }
    );
  }

  let upstreamHumeStream: Stream<SnippetAudioChunk>;

  try {
    console.log(
      `[HUME_TTS_PROXY] Requesting TTS stream for voice: ${voiceName}, instant: ${instant}`
    );
    // Removes blocks of code from the text if present.
    const cleanText = text.replace(/```[\s\S]*?```/g, "").trim();
    const utterances: PostedUtterance[] = voiceName
      ? [
          {
            text: cleanText,
            voice: { name: voiceName, provider: voiceProvider },
          },
        ]
      : [{ text: cleanText }];

    upstreamHumeStream = await humeClient.tts.synthesizeJsonStreaming({
      utterances: utterances,
      stripHeaders: true,
      instantMode: instant,
    });
    console.log("[HUME_TTS_PROXY] Successfully initiated Hume stream.");
  } catch (error) {
    // `error` is unknown; extract a readable detail from the shapes the
    // Hume SDK throws without resorting to `any`.
    console.error("[HUME_TTS_PROXY] Hume API call failed:", error);
    let details = "Failed to initiate TTS stream";
    if (error instanceof Error && error.message) {
      details = error.message;
    }
    if (typeof error === "object" && error !== null && "error" in error) {
      const inner = (error as { error?: unknown }).error;
      if (typeof inner === "string") {
        details = inner;
      } else if (
        typeof inner === "object" &&
        inner !== null &&
        "message" in inner &&
        typeof (inner as { message?: unknown }).message === "string"
      ) {
        details = (inner as { message: string }).message;
      }
    }
    return NextResponse.json(
      { error: "Hume API Error", details },
      { status: 502 }
    );
  }

  const encoder = new TextEncoder();
  const readableStream = new ReadableStream({
    async start(controller) {
      console.log("[HUME_TTS_PROXY] Client connected, forwarding stream...");
      try {
        // Forward each Hume audio chunk as a single NDJSON line.
        for await (const chunk of upstreamHumeStream) {
          controller.enqueue(encoder.encode(JSON.stringify(chunk) + "\n"));
        }
        console.log("[HUME_TTS_PROXY] Upstream Hume stream finished.");
        controller.close();
      } catch (err) {
        // Without this the client response would hang forever on a
        // mid-stream upstream failure; error the stream so the client's
        // reader rejects promptly.
        console.error("[HUME_TTS_PROXY] Upstream Hume stream errored:", err);
        controller.error(err);
      }
    },
    cancel(reason) {
      console.log(
        "[HUME_TTS_PROXY] Client disconnected, cancelling upstream Hume stream.",
        reason
      );
      // Runtime capability probe: the SDK stream type does not declare
      // abort(), but some implementations expose it.
      const abortable = upstreamHumeStream as { abort?: () => void };
      if (typeof abortable.abort === "function") {
        abortable.abort();
        console.log("[HUME_TTS_PROXY] Upstream Hume stream abort() called.");
      } else {
        console.warn(
          "[HUME_TTS_PROXY] Upstream stream object does not expose an abort() method directly. Cancellation might rely on AbortSignal propagation."
        );
      }
    },
  });

  return new NextResponse(readableStream, {
    headers: {
      "Content-Type": "application/x-ndjson",
      "Cache-Control": "no-cache",
      Connection: "keep-alive",
    },
  });
}
19 changes: 19 additions & 0 deletions tts/tts-next-js-chat/src/app/api/voices/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { NextRequest, NextResponse } from "next/server";
import type { ReturnVoice, VoiceProvider } from "hume/api/resources/tts";
import { humeClient } from "@/lib/humeClient";

export async function GET(req: NextRequest) {
const provider = (req.nextUrl.searchParams.get("provider") ??
"HUME_AI") as VoiceProvider;

const response = await humeClient.tts.voices.list({
pageNumber: 0,
pageSize: 100,
provider,
});

const voices: ReturnVoice[] = [];
for await (const v of response) voices.push(v);

return NextResponse.json({ voices });
}
Loading