diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 22133d71b2..4929672bcb 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -437,7 +437,7 @@ The following is a list of built-in LLM in Xinference:
      - Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.
 
    * - :ref:`qwen2-audio <models_llm_qwen2-audio>`
-     - chat, audio
+     - generate, audio
      - 32768
      - Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
diff --git a/doc/source/models/builtin/llm/qwen2-audio.rst b/doc/source/models/builtin/llm/qwen2-audio.rst
index 2973390c44..28d71c78a4 100644
--- a/doc/source/models/builtin/llm/qwen2-audio.rst
+++ b/doc/source/models/builtin/llm/qwen2-audio.rst
@@ -7,7 +7,7 @@ qwen2-audio
 - **Context Length:** 32768
 - **Model Name:** qwen2-audio
 - **Languages:** en, zh
-- **Abilities:** chat, audio
+- **Abilities:** generate, audio
 - **Description:** Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.
 
 Specifications
diff --git a/doc/source/models/model_abilities/multimodal.rst b/doc/source/models/model_abilities/multimodal.rst
index 069e04de8b..6ac810aaab 100644
--- a/doc/source/models/model_abilities/multimodal.rst
+++ b/doc/source/models/model_abilities/multimodal.rst
@@ -1,13 +1,13 @@
 .. _multimodal:
 
 =====================
-Vision
+Multimodal
 =====================
 
 Learn how to process images and audio with LLMs.
 
 
-Introduction
+Vision
 ============
 
 With the ``vision`` ability you can have your model take in images and answer questions about them.
@@ -37,13 +37,13 @@ The ``vision`` ability is supported with the following models in Xinference:
 
 
 Quickstart
-====================
+----------------------
 
 Images are made available to the model in two main ways: by passing a link to the image or by passing the
 base64 encoded image directly in the request.
 
 Example using OpenAI Client
--------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
@@ -74,7 +74,7 @@ Example using OpenAI Client
 
 
 Uploading base 64 encoded images
------------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python
@@ -125,4 +125,66 @@ You can find more examples of ``vision`` ability in the tutorial notebook:
 
    Learn vision ability from a example using qwen-vl-chat
+
+Audio
+============
+
+With the ``audio`` ability you can have your model take in audio and perform audio analysis or give direct
+textual responses with regard to speech instructions.
+Within Xinference, this means that certain models can accept audio inputs when conducting dialogues via the
+Chat API.
+
+Supported models
+----------------------
+
+The ``audio`` ability is supported with the following models in Xinference:
+
+* :ref:`qwen2-audio-instruct <models_llm_qwen2-audio-instruct>`
+
+Quickstart
+----------------------
+
+Audio is made available to the model by passing a URL to the audio file directly in the request.
+
+
+Example using OpenAI Client
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    import openai
+
+    client = openai.Client(
+        api_key="cannot be empty",
+        base_url=f"http://<XINFERENCE_HOST>:<XINFERENCE_PORT>/v1"
+    )
+    response = client.chat.completions.create(
+        model="<MODEL_UID>",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+                    },
+                    {"type": "text", "text": "What's that sound?"},
+                ],
+            }
+        ],
+    )
+    print(response.choices[0])
diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py
index 510ddcd6d3..cf9071286d 100644
--- a/xinference/core/tests/test_restful_api.py
+++ b/xinference/core/tests/test_restful_api.py
@@ -1332,77 +1332,3 @@ def test_launch_model_by_version(setup):
     # delete again
     url = f"{endpoint}/v1/models/test_qwen15"
     requests.delete(url)
-
-
-@pytest.mark.skip(reason="Cost too many resources.")
-def test_restful_api_for_qwen_audio(setup):
-    model_name = "qwen2-audio-instruct"
-
-    endpoint, _ = setup
-    url = f"{endpoint}/v1/models"
-
-    # list
-    response = requests.get(url)
-    response_data = response.json()
-    assert len(response_data["data"]) == 0
-
-    # launch
-    payload = {
-        "model_uid": "test_audio",
-        "model_name": model_name,
-        "model_engine": "transformers",
-        "model_size_in_billions": 7,
-        "model_format": "pytorch",
-        "quantization": "none",
-    }
-
-    response = requests.post(url, json=payload)
-    response_data = response.json()
-    model_uid_res = response_data["model_uid"]
-    assert model_uid_res == "test_audio"
-
-    response = requests.get(url)
-    response_data = response.json()
-    assert len(response_data["data"]) == 1
-
-    url = f"{endpoint}/v1/chat/completions"
-    payload = {
-        "model": model_uid_res,
-        "messages": [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
-                    },
-                    {"type": "text", "text": "What's that sound?"},
-                ],
-            },
-            {"role": "assistant", "content": "It is the sound of glass shattering."},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What can you do when you hear that?"},
-                ],
-            },
-            {
-                "role": "assistant",
-                "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
-                    },
-                    {"type": "text", "text": "What does the person say?"},
-                ],
-            },
-        ],
-    }
-    response = requests.post(url, json=payload)
-    completion = response.json()
-    assert len(completion["choices"][0]["message"]) > 0
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 504a0fc293..fe416a42c9 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -7212,7 +7212,7 @@
          "zh"
       ],
       "model_ability":[
-         "chat",
+         "generate",
          "audio"
       ],
       "model_description":"Qwen2-Audio: A large-scale audio-language model which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions.",
diff --git a/xinference/model/llm/tests/test_multimodal.py b/xinference/model/llm/tests/test_multimodal.py
index 7bd3e78a15..e5c0531b97 100644
--- a/xinference/model/llm/tests/test_multimodal.py
+++ b/xinference/model/llm/tests/test_multimodal.py
@@ -318,3 +318,77 @@ def test_restful_api_for_deepseek_vl(setup, model_format, quantization):
         ],
     )
     assert any(count in completion.choices[0].message.content for count in ["两条", "四条"])
+
+
+@pytest.mark.skip(reason="Cost too many resources.")
+def test_restful_api_for_qwen_audio(setup):
+    model_name = "qwen2-audio-instruct"
+
+    endpoint, _ = setup
+    url = f"{endpoint}/v1/models"
+
+    # list
+    response = requests.get(url)
+    response_data = response.json()
+    assert len(response_data["data"]) == 0
+
+    # launch
+    payload = {
+        "model_uid": "test_audio",
+        "model_name": model_name,
+        "model_engine": "transformers",
+        "model_size_in_billions": 7,
+        "model_format": "pytorch",
+        "quantization": "none",
+    }
+
+    response = requests.post(url, json=payload)
+    response_data = response.json()
+    model_uid_res = response_data["model_uid"]
+    assert model_uid_res == "test_audio"
+
+    response = requests.get(url)
+    response_data = response.json()
+    assert len(response_data["data"]) == 1
+
+    url = f"{endpoint}/v1/chat/completions"
+    payload = {
+        "model": model_uid_res,
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
+                    },
+                    {"type": "text", "text": "What's that sound?"},
+                ],
+            },
+            {"role": "assistant", "content": "It is the sound of glass shattering."},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What can you do when you hear that?"},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
+                    },
+                    {"type": "text", "text": "What does the person say?"},
+                ],
+            },
+        ],
+    }
+    response = requests.post(url, json=payload)
+    completion = response.json()
+    assert len(completion["choices"][0]["message"]) > 0
diff --git a/xinference/model/llm/transformers/qwen2_audio.py b/xinference/model/llm/transformers/qwen2_audio.py
index e5ea0da981..192f516a5a 100644
--- a/xinference/model/llm/transformers/qwen2_audio.py
+++ b/xinference/model/llm/transformers/qwen2_audio.py
@@ -105,6 +105,8 @@ def chat(
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
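The last hunk above addresses a device mismatch: the Hugging Face processor builds its tensors on the CPU (input_ids, attention_mask and, when audio is supplied, input_features), while the model may be placed on a GPU, so moving only input_ids before calling generate() is not enough. The following is a minimal sketch of the same pattern outside Xinference; the checkpoint id, prompt and generation settings are illustrative assumptions rather than values taken from this PR.

.. code-block:: python

    import torch
    from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

    # Illustrative checkpoint; Xinference resolves the real model path and device itself.
    model_id = "Qwen/Qwen2-Audio-7B-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_id)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(model_id).to(device)

    # The processor returns CPU tensors; move every entry (not just input_ids)
    # to the model's device, mirroring the dict comprehension added in the diff.
    inputs = processor(
        text="Describe the sound of glass breaking.",
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output_ids = model.generate(**inputs, max_new_tokens=64)
    print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])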