chore: Add Docker configuration for SenseVoice OpenAI server #26

Open · wants to merge 1 commit into base: main
26 changes: 26 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,26 @@
# Use the official Python 3.12 base image
FROM python:3.12-slim

# Set the working directory
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .

# Install the dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code into the container
COPY . .

# Expose the port FastAPI will run on
EXPOSE 8000

# Define the temporary directory environment variable
ENV TMP_DIR=/app/tmp

# Create the temporary directory
RUN mkdir -p $TMP_DIR

# Set the command to run the FastAPI server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
49 changes: 49 additions & 0 deletions docker/README.md
@@ -0,0 +1,49 @@
Provides an OpenAI-compatible audio transcription API server for SenseVoice.


## Build Docker Image

```bash
docker-compose build
```

or

```bash
docker build -t sensevoice-openai-server .
```

## Start Server

### Docker Compose
Change the `volumes` mapping in docker-compose.yml to the local directory where your models are cached.

```bash
docker-compose up -d
```

### Docker
```bash
docker run -d -p 8000:8000 -v "/your/cache/dir:/root/.cache" sensevoice-openai-server
```

## Usage

```bash
curl http://127.0.0.1:8000/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F model="iic/SenseVoiceSmall" \
-F file="@/path/to/file/openai.mp3"
```

```python
from openai import OpenAI
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="anything")

audio_file = open("/path/to/file/audio.mp3", "rb")
transcription = client.audio.transcriptions.create(
    model="iic/SenseVoiceSmall",
    file=audio_file
)
print(transcription.text)
```
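
The endpoint in `main.py` also reads an optional `language` form field (default `auto`). A minimal sketch of passing it through the OpenAI client, assuming the client forwards `language` as multipart form data:

```python
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="anything")

# SenseVoice language codes include "auto", "zh", "en", "yue", "ja" and "ko".
with open("/path/to/file/audio.mp3", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="iic/SenseVoiceSmall",
        file=audio_file,
        language="zh",
    )
print(transcription.text)
```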
16 changes: 16 additions & 0 deletions docker/docker-compose.yml
@@ -0,0 +1,16 @@
# @Author: Bi Ying
# @Date: 2024-07-10 21:10:22
# @Last Modified by: Bi Ying
# @Last Modified time: 2024-07-10 21:20:22
version: '3.8'

services:
  sensevoice-openai-server:
    image: sensevoice-openai-server
    build: .
    ports:
      - "8000:8000"
    volumes:
      - ~/.cache:/root/.cache
    environment:
      TMP_DIR: /app/tmp
212 changes: 212 additions & 0 deletions docker/main.py
@@ -0,0 +1,212 @@
# @Author: Bi Ying
# @Date: 2024-07-10 17:22:55
import os
import shutil
from pathlib import Path
from typing import Union

import torch
import torchaudio
import numpy as np
from funasr import AutoModel
from fastapi import FastAPI, Form, UploadFile, File, HTTPException, status


app = FastAPI()

TMP_DIR = os.environ.get("TMP_DIR", "./tmp")

# Initialize SenseVoiceSmall with an FSMN VAD model (single segments capped at 30 s)
model = "iic/SenseVoiceSmall"
model = AutoModel(
model=model,
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_kwargs={"max_single_segment_time": 30000},
trust_remote_code=True,
)

emo_dict = {
"<|HAPPY|>": "😊",
"<|SAD|>": "😔",
"<|ANGRY|>": "😡",
"<|NEUTRAL|>": "",
"<|FEARFUL|>": "😰",
"<|DISGUSTED|>": "🤢",
"<|SURPRISED|>": "😮",
}

event_dict = {
"<|BGM|>": "🎼",
"<|Speech|>": "",
"<|Applause|>": "👏",
"<|Laughter|>": "😀",
"<|Cry|>": "😭",
"<|Sneeze|>": "🤧",
"<|Breath|>": "",
"<|Cough|>": "🤧",
}

emoji_dict = {
"<|nospeech|><|Event_UNK|>": "❓",
"<|zh|>": "",
"<|en|>": "",
"<|yue|>": "",
"<|ja|>": "",
"<|ko|>": "",
"<|nospeech|>": "",
"<|HAPPY|>": "😊",
"<|SAD|>": "😔",
"<|ANGRY|>": "😡",
"<|NEUTRAL|>": "",
"<|BGM|>": "🎼",
"<|Speech|>": "",
"<|Applause|>": "👏",
"<|Laughter|>": "😀",
"<|FEARFUL|>": "😰",
"<|DISGUSTED|>": "🤢",
"<|SURPRISED|>": "😮",
"<|Cry|>": "😭",
"<|EMO_UNKNOWN|>": "",
"<|Sneeze|>": "🤧",
"<|Breath|>": "",
"<|Cough|>": "😷",
"<|Sing|>": "",
"<|Speech_Noise|>": "",
"<|withitn|>": "",
"<|woitn|>": "",
"<|GBG|>": "",
"<|Event_UNK|>": "",
}

lang_dict = {
"<|zh|>": "<|lang|>",
"<|en|>": "<|lang|>",
"<|yue|>": "<|lang|>",
"<|ja|>": "<|lang|>",
"<|ko|>": "<|lang|>",
"<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {
"🎼",
"👏",
"😀",
"😭",
"🤧",
"😷",
}


def format_str_v2(text: str, show_emo=True, show_event=True):
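    """Replace SenseVoice special tokens in `text` with emoji: append the most
    frequent emotion tag, prefix any detected event tags, and strip the
    remaining tokens and stray spaces around the emoji."""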
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = text.count(sptk)
        text = text.replace(sptk, "")

    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    if show_emo:
        text = text + emo_dict[emo]

    for e in event_dict:
        if sptk_dict[e] > 0 and show_event:
            text = event_dict[e] + text

    for emoji in emo_set.union(event_set):
        text = text.replace(" " + emoji, emoji)
        text = text.replace(emoji + " ", emoji)

    return text.strip()


def format_str_v3(text: str, show_emo=True, show_event=True):
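    """Normalize a full SenseVoice transcript: split on language tags, clean each
    segment with format_str_v2, and merge the segments while dropping event and
    emotion emoji repeated across segment boundaries."""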
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        text = text.replace(lang, "<|lang|>")
    parts = [format_str_v2(part, show_emo, show_event).strip(" ") for part in text.split("<|lang|>")]
    new_s = " " + parts[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(parts)):
        if len(parts[i]) == 0:
            continue
        if get_event(parts[i]) == cur_ent_event and get_event(parts[i]) is not None:
            parts[i] = parts[i][1:]
        cur_ent_event = get_event(parts[i])
        if get_emo(parts[i]) is not None and get_emo(parts[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += parts[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()


def model_inference(input_wav, language, fs=16000, show_emo=True, show_event=True):
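    """Run SenseVoice on `input_wav` and return the cleaned transcript.

    When `input_wav` is an `(fs, samples)` tuple of integer samples, it is scaled
    to [-1, 1], down-mixed to mono and resampled to 16 kHz before calling
    model.generate; the raw output is post-processed with format_str_v3.
    """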
    language = "auto" if len(language) < 1 else language

    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    if len(input_wav) == 0:
        raise ValueError("The provided audio is empty.")

    merge_vad = True
    text = model.generate(
        input=input_wav,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=0,
        merge_vad=merge_vad,
    )

    text = text[0]["text"]
    text = format_str_v3(text, show_emo, show_event)

    return text


@app.post("/v1/audio/transcriptions")
async def transcriptions(file: Union[UploadFile, None] = File(default=None), language: str = Form(default="auto")):
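    """OpenAI-compatible transcription endpoint: persist the upload under
    TMP_DIR, load it with torchaudio, run model_inference and return the
    transcript as {"text": ...}."""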
    if file is None:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Bad Request, no file provided")

    filename = file.filename
    fileobj = file.file
    tmp_file = Path(TMP_DIR) / filename

    with open(tmp_file, "wb+") as upload_file:
        shutil.copyfileobj(fileobj, upload_file)

    # Scale the float waveform to int16-range samples (model_inference normalizes
    # by the int16 max) and flatten it to a one-dimensional array
    waveform, sample_rate = torchaudio.load(tmp_file)
    waveform = (waveform * np.iinfo(np.int16).max).to(dtype=torch.int16).squeeze()
    if len(waveform.shape) > 1:
        waveform = waveform.float().mean(axis=0)  # Down-mix multi-channel audio to mono
    input_wav = (sample_rate, waveform.numpy())

    result = model_inference(input_wav=input_wav, language=language, show_emo=False)

    # Delete the temporary file
    tmp_file.unlink()

    return {"text": result}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)