Skip to content

Commit

Permalink
chore: Add Docker configuration for SenseVoice OpenAI server
Browse files Browse the repository at this point in the history
  • Loading branch information
AndersonBY committed Jul 10, 2024
1 parent 831cd02 commit b7fe250
Show file tree
Hide file tree
Showing 6 changed files with 1,172 additions and 0 deletions.
26 changes: 26 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Base image: official slim Python 3.12
FROM python:3.12-slim

# All following relative paths (and the server's working directory at runtime)
# resolve against /app
WORKDIR /app

# Copy only the requirements file first so the dependency layer below can be
# reused from the build cache when just the application code changes
COPY requirements.txt .

# Install the dependencies; --no-cache-dir keeps pip's download cache out of the image
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context (the application code) into /app
COPY . .

# Document the port uvicorn listens on (see CMD below)
EXPOSE 8000

# Directory for temporary upload files; /app/tmp is also where the relative
# path ./tmp resolves given the WORKDIR above
ENV TMP_DIR=/app/tmp

# Create the temporary directory at build time so it exists before the first request
RUN mkdir -p $TMP_DIR

# Serve the `app` object from main.py; bind to 0.0.0.0 so the published port
# is reachable from outside the container
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
49 changes: 49 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
Provides an OpenAI-compatible audio transcription API server backed by SenseVoice.


## Build Docker Image

```bash
docker-compose build
```

or

```bash
docker build -t sensevoice-openai-server .
```

## Start Server

### Docker Compose
Edit the `volumes` entry in `docker-compose.yml` so that the host side of the mapping points to the cache directory that holds (or should hold) the model you want to use.

```bash
docker-compose up -d
```

### Docker
```bash
docker run -d -p 8000:8000 -v "/your/cache/dir:/root/.cache" sensevoice-openai-server
```

## Usage

```bash
curl http://127.0.0.1:8000/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F model="iic/SenseVoiceSmall" \
-F file="@/path/to/file/openai.mp3"
```

```python
from openai import OpenAI
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="anything")

audio_file= open("/path/to/file/audio.mp3", "rb")
transcription = client.audio.transcriptions.create(
model="iic/SenseVoiceSmall",
file=audio_file
)
print(transcription.text)
```
16 changes: 16 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# @Author: Bi Ying
# @Date: 2024-07-10 21:10:22
# @Last Modified by: Bi Ying
# @Last Modified time: 2024-07-10 21:20:22
#
# Compose definition for the SenseVoice transcription server.
version: '3.8'

services:
  sensevoice-openai-server:
    # Tag given to the image built from the Dockerfile in this directory
    image: sensevoice-openai-server
    build: .
    ports:
      # host:container - must match the port uvicorn listens on in the image
      - "8000:8000"
    volumes:
      # Mounts the host user's cache directory at /root/.cache in the container;
      # presumably this is where downloaded model files are kept - adjust the
      # host side to your own cache location
      - ~/.cache:/root/.cache
    environment:
      # Same value the Dockerfile sets via ENV
      TMP_DIR: /app/tmp
212 changes: 212 additions & 0 deletions docker/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# @Author: Bi Ying
# @Date: 2024-07-10 17:22:55
import os
import shutil
import tempfile
from pathlib import Path
from typing import Union

import torch
import torchaudio
import numpy as np
from funasr import AutoModel
from fastapi import FastAPI, Form, UploadFile, File, HTTPException, status


app = FastAPI()

# Directory for temporary upload files. The Docker image and the compose file
# both export TMP_DIR, so honor it; fall back to the previous hard-coded
# location when the variable is not set.
TMP_DIR = os.environ.get("TMP_DIR", "./tmp")

# Initialize the model once at import time; every request reuses this instance.
MODEL_ID = "iic/SenseVoiceSmall"
model = AutoModel(
    model=MODEL_ID,
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    # NOTE(review): presumably the maximum VAD segment length in milliseconds - confirm
    vad_kwargs={"max_single_segment_time": 30000},
    # NOTE(review): this flag presumably lets the model repository supply code
    # that is executed locally; only use model sources you trust
    trust_remote_code=True,
)

# Emotion tag -> emoji appended to a segment. An empty string means the tag is
# recognized but rendered as nothing.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Audio-event tag -> emoji prepended to a segment.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    # Was "🤧" (identical to Sneeze), which disagreed with emoji_dict and left
    # the "😷" entry of event_set impossible to produce.
    "<|Cough|>": "😷",
}

# Every special token that gets counted and stripped from the text.
# Order matters: the combined "<|nospeech|><|Event_UNK|>" token must come
# before its two components, because tokens are removed in iteration order.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Language tags; each one is rewritten to the generic "<|lang|>" separator
# before the text is split into segments.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# The non-empty emoji of emo_dict / event_dict, used to recognize an emoji at
# the end / start of an already formatted segment.
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}


def format_str_v2(text: str, show_emo=True, show_event=True):
    """Strip the special tokens from one segment and decorate it with emoji.

    Args:
        text: Raw segment text that may contain ``<|...|>`` tokens.
        show_emo: Append the emoji of the most frequent emotion tag.
        show_event: Prepend the emoji of every audio event that occurred.

    Returns:
        The cleaned segment, stripped of surrounding whitespace.
    """
    # Count each token and remove it. This has to stay sequential: removing a
    # composite token first changes the counts of the tokens it contains.
    counts = {}
    for token in emoji_dict:
        counts[token] = text.count(token)
        text = text.replace(token, "")

    # Most frequent emotion; NEUTRAL is listed first so it wins ties, and
    # otherwise the earliest tag in emo_dict wins.
    dominant = max(["<|NEUTRAL|>", *emo_dict], key=lambda token: counts[token])
    if show_emo:
        text += emo_dict[dominant]

    if show_event:
        for token in event_dict:
            if counts[token] > 0:
                text = event_dict[token] + text

    # Remove the single space that may sit directly next to an emoji.
    for symbol in emo_set.union(event_set):
        text = text.replace(" " + symbol, symbol)
        text = text.replace(symbol + " ", symbol)

    return text.strip()


def format_str_v3(text: str, show_emo=True, show_event=True):
    """Format a full multi-segment model output into display text.

    The text is split at its language tags, each segment is formatted with
    ``format_str_v2``, and the segments are concatenated. A leading event emoji
    that repeats the previous segment's event is dropped, and so is a trailing
    emotion emoji that the following segment ends with as well.

    Args:
        text: Raw model output containing ``<|...|>`` tokens.
        show_emo: Forwarded to ``format_str_v2``.
        show_event: Forwarded to ``format_str_v2``.

    Returns:
        The formatted text, stripped of surrounding whitespace.
    """

    def get_emo(s):
        # Guard against empty strings: a segment can become empty once its
        # duplicate event emoji has been removed.
        return s[-1] if s and s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s and s[0] in event_set else None

    text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        text = text.replace(lang, "<|lang|>")
    parts = [format_str_v2(part, show_emo, show_event).strip(" ") for part in text.split("<|lang|>")]

    new_s = " " + parts[0]
    # NOTE(review): new_s starts with a space, so this is always None and the
    # first segment's event is never used for de-duplication - kept as is.
    cur_ent_event = get_event(new_s)
    for part in parts[1:]:
        if not part:
            continue
        event = get_event(part)
        if event is not None and event == cur_ent_event:
            part = part[1:]
        # Re-read the event from the (possibly shortened) segment.
        cur_ent_event = get_event(part)
        emo = get_emo(part)
        if emo is not None and emo == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += part.strip()

    # NOTE(review): drops the literal "The." - presumably a known spurious
    # model output; confirm before changing.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()


def model_inference(input_wav, language, fs=16000, show_emo=True, show_event=True):
    """Transcribe audio with the module-level model and format the result.

    Args:
        input_wav: Either a ``(sample_rate, samples)`` tuple whose samples are
            a NumPy array in int16 scale, or audio that is handed to the model
            unchanged.
        language: Language code for the model; an empty string means "auto".
        fs: Sample rate; replaced by the tuple's rate when a tuple is passed.
        show_emo: Forwarded to ``format_str_v3``.
        show_event: Forwarded to ``format_str_v3``.

    Returns:
        The formatted transcription text.

    Raises:
        ValueError: If the audio contains no samples.
    """
    if len(language) < 1:
        language = "auto"

    if isinstance(input_wav, tuple):
        fs, samples = input_wav
        # Bring int16-scaled samples into the [-1, 1] float range.
        samples = samples.astype(np.float32) / np.iinfo(np.int16).max
        if samples.ndim > 1:
            # Average over the last axis to get a single channel.
            samples = samples.mean(-1)
        if fs != 16000:
            to_16k = torchaudio.transforms.Resample(fs, 16000)
            batch = torch.from_numpy(samples).to(torch.float32)[None, :]
            samples = to_16k(batch)[0, :].numpy()
        input_wav = samples

    if len(input_wav) == 0:
        raise ValueError("The provided audio is empty.")

    outputs = model.generate(
        input=input_wav,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=0,
        merge_vad=True,
    )

    return format_str_v3(outputs[0]["text"], show_emo, show_event)


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: Union[UploadFile, None] = File(default=None),
    language: str = Form(default="auto"),
):
    """Transcribe an uploaded audio file.

    Args:
        file: The uploaded audio file (multipart form field ``file``).
        language: Language code passed to the model; defaults to "auto".

    Returns:
        A dict of the form ``{"text": <transcription>}``.

    Raises:
        HTTPException: With status 400 when no file was uploaded.
    """
    if file is None:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Bad Request, no file provided")

    tmp_dir = Path(TMP_DIR)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Only the extension of the client-supplied name is used. Joining the name
    # itself into a path would let "../x" or an absolute path escape TMP_DIR,
    # and concurrent uploads with the same name would overwrite each other.
    suffix = Path(file.filename or "").suffix

    tmp_file = None
    try:
        with tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=suffix, delete=False) as upload_file:
            tmp_file = Path(upload_file.name)
            shutil.copyfileobj(file.file, upload_file)

        waveform, sample_rate = torchaudio.load(tmp_file)
        # Mix the (channels, time) tensor down to a single channel.
        mono = waveform.mean(dim=0)
        # model_inference divides tuple input by the int16 maximum, so hand the
        # samples over in int16 scale. Staying in floating point avoids both
        # the integer overflow of a full-scale sample and quantization loss.
        input_wav = (sample_rate, (mono * np.iinfo(np.int16).max).numpy())

        result = model_inference(input_wav=input_wav, language=language, show_emo=False)
    finally:
        # Delete the temporary file even when decoding or inference fails.
        if tmp_file is not None:
            tmp_file.unlink(missing_ok=True)

    return {"text": result}


if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces, port 8000.
    # uvicorn is imported here so that importing this module does not need it.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)
Loading

0 comments on commit b7fe250

Please sign in to comment.