diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 22133d71b2..4929672bcb 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -437,7 +437,7 @@ The following is a list of built-in LLM in Xinference:
      - Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.
 
    * - :ref:`qwen2-audio <models_llm_qwen2-audio>`
-     - chat, audio
+     - generate, audio
      - 32768
      - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst
index 2973390c44..28d71c78a4 100644
--- a/doc/source/models/builtin/llm/qwen2-audio.rst
+++ b/doc/source/models/builtin/llm/qwen2-audio.rst
@@ -7,7 +7,7 @@ qwen2-audio
 - **Context Length:** 32768
 - **Model Name:** qwen2-audio
 - **Languages:** en, zh
-- **Abilities:** chat, audio
+- **Abilities:** generate, audio
 - **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
 
 Specifications
diff --git a/doc/source/models/model_abilities/multimodal.rst b/doc/source/models/model_abilities/multimodal.rst
index 069e04de8b..6ac810aaab 100644
--- a/doc/source/models/model_abilities/multimodal.rst
+++ b/doc/source/models/model_abilities/multimodal.rst
@@ -1,13 +1,13 @@
 .. _multimodal:
 
 =====================
-Vision
+Multimodal
 =====================
 
 Learn how to process images and audio with LLMs.
 
 
-Introduction
+Vision
 ============
 
 With the ``vision`` ability you can have your model take in images and answer questions about them.
@@ -37,13 +37,13 @@ The ``vision`` ability is supported with the following models in Xinference:
 
 
 Quickstart
-====================
+----------------------
 
 Images are made available to the model in two main ways: by passing a link to the image or by passing the
 base64 encoded image directly in the request.
 
 Example using OpenAI Client
--------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
@@ -74,7 +74,7 @@ Example using OpenAI Client
 
 
 Uploading base 64 encoded images
------------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
@@ -125,4 +125,66 @@ You can find more examples of ``vision`` ability in the tutorial notebook:
 
    Learn vision ability from a example using qwen-vl-chat
+
+Audio
+============
+
+With the ``audio`` ability you can have your model take in audio and perform audio analysis or give direct
+textual responses with regard to speech instructions.
+Within Xinference, this means that certain models can accept audio inputs when conducting dialogues via the
+Chat API.
+
+Supported models
+----------------------
+
+The ``audio`` ability is supported with the following models in Xinference:
+
+* :ref:`qwen2-audio-instruct <models_llm_qwen2-audio-instruct>`
+
+Quickstart
+----------------------
+
+Audio is made available to the model by passing a URL to the audio file directly in the request.
+
+
+Example using OpenAI Client
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    import openai
+
+    client = openai.Client(
+        api_key="cannot be empty",
+        base_url=f"http://<XINFERENCE_HOST>:<XINFERENCE_PORT>/v1"
+    )
+    response = client.chat.completions.create(
+        model="<MODEL_UID>",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+                    },
+                    {"type": "text", "text": "What's that sound?"},
+                ],
+            }
+        ],
+    )
+    print(response.choices[0])
diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py
index 510ddcd6d3..cf9071286d 100644
--- a/xinference/core/tests/test_restful_api.py
+++ b/xinference/core/tests/test_restful_api.py
@@ -1332,77 +1332,3 @@ def test_launch_model_by_version(setup):
     # delete again
     url = f"{endpoint}/v1/models/test_qwen15"
     requests.delete(url)
-
-
-@pytest.mark.skip(reason="Cost too many resources.")
-def test_restful_api_for_qwen_audio(setup):
-    model_name = "qwen2-audio-instruct"
-
-    endpoint, _ = setup
-    url = f"{endpoint}/v1/models"
-
-    # list
-    response = requests.get(url)
-    response_data = response.json()
-    assert len(response_data["data"]) == 0
-
-    # launch
-    payload = {
-        "model_uid": "test_audio",
-        "model_name": model_name,
-        "model_engine": "transformers",
-        "model_size_in_billions": 7,
-        "model_format": "pytorch",
-        "quantization": "none",
-    }
-
-    response = requests.post(url, json=payload)
-    response_data = response.json()
-    model_uid_res = response_data["model_uid"]
-    assert model_uid_res == "test_audio"
-
-    response = requests.get(url)
-    response_data = response.json()
-    assert len(response_data["data"]) == 1
-
-    url = f"{endpoint}/v1/chat/completions"
-    payload = {
-        "model": model_uid_res,
-        "messages": [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
-                    },
-                    {"type": "text", "text": "What's that sound?"},
-                ],
-            },
-            {"role": "assistant", "content": "It is the sound of glass shattering."},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What can you do when you hear that?"},
-                ],
-            },
-            {
-                "role": "assistant",
-                "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
-                    },
-                    {"type": "text", "text": "What does the person say?"},
-                ],
-            },
-        ],
-    }
-    response = requests.post(url, json=payload)
-    completion = response.json()
-    assert len(completion["choices"][0]["message"]) > 0
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 504a0fc293..fe416a42c9 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -7212,7 +7212,7 @@
          "zh"
       ],
       "model_ability":[
-         "chat",
+         "generate",
          "audio"
       ],
       "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
diff --git a/xinference/model/llm/tests/test_multimodal.py b/xinference/model/llm/tests/test_multimodal.py
index 7bd3e78a15..e5c0531b97 100644
--- a/xinference/model/llm/tests/test_multimodal.py
+++ b/xinference/model/llm/tests/test_multimodal.py
@@ -318,3 +318,77 @@ def test_restful_api_for_deepseek_vl(setup, model_format, quantization):
         ],
     )
     assert any(count in completion.choices[0].message.content for count in ["两条", "四条"])
+
+
+@pytest.mark.skip(reason="Cost too many resources.")
+def test_restful_api_for_qwen_audio(setup):
+    model_name = "qwen2-audio-instruct"
+
+    endpoint, _ = setup
+    url = f"{endpoint}/v1/models"
+
+    # list
+    response = requests.get(url)
+    response_data = response.json()
+    assert len(response_data["data"]) == 0
+
+    # launch
+    payload = {
+        "model_uid": "test_audio",
+        "model_name": model_name,
+        "model_engine": "transformers",
+        "model_size_in_billions": 7,
+        "model_format": "pytorch",
+        "quantization": "none",
+    }
+
+    response = requests.post(url, json=payload)
+    response_data = response.json()
+    model_uid_res = response_data["model_uid"]
+    assert model_uid_res == "test_audio"
+
+    response = requests.get(url)
+    response_data = response.json()
+    assert len(response_data["data"]) == 1
+
+    url = f"{endpoint}/v1/chat/completions"
+    payload = {
+        "model": model_uid_res,
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+                    },
+                    {"type": "text", "text": "What's that sound?"},
+                ],
+            },
+            {"role": "assistant", "content": "It is the sound of glass shattering."},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What can you do when you hear that?"},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
+                    },
+                    {"type": "text", "text": "What does the person say?"},
+                ],
+            },
+        ],
+    }
+    response = requests.post(url, json=payload)
+    completion = response.json()
+    assert len(completion["choices"][0]["message"]) > 0
diff --git a/xinference/model/llm/transformers/qwen2_audio.py b/xinference/model/llm/transformers/qwen2_audio.py
index e5ea0da981..192f516a5a 100644
--- a/xinference/model/llm/transformers/qwen2_audio.py
+++ b/xinference/model/llm/transformers/qwen2_audio.py
@@ -105,6 +105,8 @@ def chat(
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
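The last hunk above addresses a device mismatch: the Hugging Face processor builds its tensors on the CPU (input_ids, attention_mask and, when audio is supplied, input_features), while the model may be placed on a GPU, so moving only input_ids before calling generate() is not enough. The following is a minimal sketch of the same pattern outside Xinference; the checkpoint id, prompt and generation settings are illustrative assumptions rather than values taken from this PR.

.. code-block:: python

    import torch
    from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

    # Illustrative checkpoint; Xinference resolves the real model path and device itself.
    model_id = "Qwen/Qwen2-Audio-7B-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_id)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(model_id).to(device)

    # The processor returns CPU tensors; move every entry (not just input_ids)
    # to the model's device, mirroring the dict comprehension added in the diff.
    inputs = processor(
        text="Describe the sound of glass breaking.",
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output_ids = model.generate(**inputs, max_new_tokens=64)
    print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])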