From e6b54490effcf8c81a8069b3be9bc40e0d13d864 Mon Sep 17 00:00:00 2001
From: Xuye Qin <qinxuye@gmail.com>
Date: Fri, 31 Jan 2025 17:48:17 +0800
Subject: [PATCH] DOC: update model docs (#2792)

---
 README.md                                     |  4 +-
 README_zh_CN.md                               |  4 +-
 doc/source/getting_started/installation.rst   |  1 +
 doc/source/models/builtin/llm/index.rst       |  7 ++
 .../models/builtin/llm/internlm3-instruct.rst | 95 +++++++++++++++++++
 .../builtin/llm/qwen2.5-vl-instruct.rst       | 48 ++++++++++
 doc/source/user_guide/backends.rst            |  1 +
 xinference/model/llm/llm_family.json          | 44 +++++++++
 .../model/llm/llm_family_modelscope.json      | 51 +++++++++-
 9 files changed, 250 insertions(+), 5 deletions(-)
 create mode 100644 doc/source/models/builtin/llm/internlm3-instruct.rst

diff --git a/README.md b/README.md
index 8f701eaf1c..02ad970b64 100644
--- a/README.md
+++ b/README.md
@@ -48,13 +48,13 @@ potential of cutting-edge AI models.
 - Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906)
 ### New Models
 - Built-in support for [DeepSeek-R1-Distill-Qwen](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2781](https://github.com/xorbitsai/inference/pull/2781)
+- Built-in support for [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788)
+- Built-in support for [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789)
 - Built-in support for [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760)
 - Built-in support for [CogAgent](https://github.com/THUDM/CogAgent): [#2740](https://github.com/xorbitsai/inference/pull/2740)
 - Built-in support for [HunyuanVideo](https://github.com/Tencent/HunyuanVideo): [#2721](https://github.com/xorbitsai/inference/pull/2721)
 - Built-in support for [HunyuanDiT](https://github.com/Tencent/HunyuanDiT): [#2727](https://github.com/xorbitsai/inference/pull/2727)
 - Built-in support for [Macro-o1](https://github.com/AIDC-AI/Marco-o1): [#2749](https://github.com/xorbitsai/inference/pull/2749)
-- Built-in support for [Stable Diffusion 3.5](https://huggingface.co/collections/stabilityai/stable-diffusion-35-671785cca799084f71fa2838): [#2706](https://github.com/xorbitsai/inference/pull/2706)
-- Built-in support for [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B): [#2684](https://github.com/xorbitsai/inference/pull/2684)
 ### Integrations
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
 - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 239ca20bd0..07a9465965 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -44,13 +44,13 @@ Xorbits Inference（Xinference）是一个性能强大且功能全面的分布
 - 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906)
 ### 新模型
 - 内置 [DeepSeek-R1-Distill-Qwen](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2781](https://github.com/xorbitsai/inference/pull/2781)
+- 内置 [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788)
+- 内置 [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789)
 - 内置 [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760)
 - 内置 [CogAgent](https://github.com/THUDM/CogAgent): [#2740](https://github.com/xorbitsai/inference/pull/2740)
 - 内置 [HunyuanVideo](https://github.com/Tencent/HunyuanVideo): [#2721](https://github.com/xorbitsai/inference/pull/2721)
 - 内置 [HunyuanDiT](https://github.com/Tencent/HunyuanDiT): [#2727](https://github.com/xorbitsai/inference/pull/2727)
 - 内置 [Macro-o1](https://github.com/AIDC-AI/Marco-o1): [#2749](https://github.com/xorbitsai/inference/pull/2749)
-- 内置 [Stable Diffusion 3.5](https://huggingface.co/collections/stabilityai/stable-diffusion-35-671785cca799084f71fa2838): [#2706](https://github.com/xorbitsai/inference/pull/2706)
-- 内置 [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B): [#2684](https://github.com/xorbitsai/inference/pull/2684)
 ### 集成
 - [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/)：一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力，帮助您轻松实现复杂的问答场景。
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。
diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index 84fc13e153..e4cd4925c8 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -64,6 +64,7 @@ Currently, supported models include:
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
 - ``minicpm3-4b``
+- ``internlm3-instruct``
 .. vllm_end
 
 To install Xinference and vLLM::
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index c82b504154..4e035f37c3 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -221,6 +221,11 @@ The following is a list of built-in LLM in Xinference:
      - 262144
      - InternLM2.5 series of the InternLM model supports 1M long-context
 
+   * - :ref:`internlm3-instruct <models_llm_internlm3-instruct>`
+     - chat, tools
+     - 32768
+     - InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning.
+
    * - :ref:`internvl-chat <models_llm_internvl-chat>`
      - chat, vision
      - 32768
@@ -678,6 +683,8 @@ The following is a list of built-in LLM in Xinference:
   
    internlm2.5-chat-1m
   
+   internlm3-instruct
+  
    internvl-chat
   
    internvl2
diff --git a/doc/source/models/builtin/llm/internlm3-instruct.rst b/doc/source/models/builtin/llm/internlm3-instruct.rst
new file mode 100644
index 0000000000..83f62b4584
--- /dev/null
+++ b/doc/source/models/builtin/llm/internlm3-instruct.rst
@@ -0,0 +1,95 @@
+.. _models_llm_internlm3-instruct:
+
+========================================
+internlm3-instruct
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** internlm3-instruct
+- **Languages:** en, zh
+- **Abilities:** chat, tools
+- **Description:** InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 8
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers (vLLM only available for quantization none)
+- **Model ID:** internlm/internlm3-8b-instruct
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (gptq, 8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 8
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** internlm/internlm3-8b-instruct-gptq-int4
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-gptq-int4>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-gptq-int4>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 3 (awq, 8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 8
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** internlm/internlm3-8b-instruct-awq
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-awq>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-awq>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format awq --quantization ${quantization}
+
+
+Model Spec 4 (ggufv2, 8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 8
+- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
+- **Engines**: llama.cpp
+- **Model ID:** internlm/internlm3-8b-instruct-gguf
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-gguf>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 5 (mlx, 8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 8
+- **Quantizations:** 4bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/internlm3-8b-instruct-{quantization}
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/internlm3-8b-instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/internlm3-8b-instruct-{quantization}>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format mlx --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst
index 05fab7e969..3451a8f4cc 100644
--- a/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst
+++ b/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst
@@ -61,3 +61,51 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization}
 
+
+Model Spec 4 (mlx, 3 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 3
+- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16
+- **Engines**: Transformers, MLX
+- **Model ID:** mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 3 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 5 (mlx, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 7
+- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16
+- **Engines**: Transformers, MLX
+- **Model ID:** mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 6 (mlx, 72 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 72
+- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16
+- **Engines**: Transformers, MLX
+- **Model ID:** mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` with one of the engines
+listed above and ``${quantization}`` with your chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format mlx --quantization ${quantization}
+
diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst
index 9bdfb21b25..b54b832be0 100644
--- a/doc/source/user_guide/backends.rst
+++ b/doc/source/user_guide/backends.rst
@@ -71,6 +71,7 @@ Currently, supported model includes:
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
 - ``minicpm3-4b``
+- ``internlm3-instruct``
 .. vllm_end
 
 SGLang
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 7defdd7db7..d00081f7bd 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -7162,6 +7162,42 @@
           "none"
         ],
         "model_id":"Qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}"
       }
     ],
     "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
@@ -9411,6 +9447,14 @@
         ],
         "model_id": "internlm/internlm3-8b-instruct-gguf",
         "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "4bit"
+        ],
+        "model_id":"mlx-community/internlm3-8b-instruct-{quantization}"
       }
     ],
     "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index e0a660f358..4d084ed8b4 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -4769,10 +4769,11 @@
         "model_format":"mlx",
         "model_size_in_billions":2,
         "quantizations":[
+          "4bit",
           "8bit"
         ],
         "model_hub": "modelscope",
-        "model_id":"okwinds/Qwen2-VL-2B-Instruct-MLX-8bit",
+        "model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}",
         "model_revision":"master"
       },
       {
@@ -4865,6 +4866,45 @@
         ],
         "model_hub": "modelscope",
         "model_id":"qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}"
       }
     ],
     "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
@@ -7121,6 +7161,15 @@
         "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf",
         "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "4bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/internlm3-8b-instruct-{quantization}"
       }
     ],
     "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",