From e6b54490effcf8c81a8069b3be9bc40e0d13d864 Mon Sep 17 00:00:00 2001 From: Xuye Qin Date: Fri, 31 Jan 2025 17:48:17 +0800 Subject: [PATCH] DOC: update model docs (#2792) --- README.md | 4 +- README_zh_CN.md | 4 +- doc/source/getting_started/installation.rst | 1 + doc/source/models/builtin/llm/index.rst | 7 ++ .../models/builtin/llm/internlm3-instruct.rst | 95 +++++++++++++++++++ .../builtin/llm/qwen2.5-vl-instruct.rst | 48 ++++++++++ doc/source/user_guide/backends.rst | 1 + xinference/model/llm/llm_family.json | 44 +++++++++ .../model/llm/llm_family_modelscope.json | 51 +++++++++- 9 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 doc/source/models/builtin/llm/internlm3-instruct.rst diff --git a/README.md b/README.md index 8f701eaf1c..02ad970b64 100644 --- a/README.md +++ b/README.md @@ -48,13 +48,13 @@ potential of cutting-edge AI models. - Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906) ### New Models - Built-in support for [DeepSeek-R1-Distill-Qwen](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2781](https://github.com/xorbitsai/inference/pull/2781) +- Built-in support for [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788) +- Built-in support for [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789) - Built-in support for [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760) - Built-in support for [CogAgent](https://github.com/THUDM/CogAgent): [#2740](https://github.com/xorbitsai/inference/pull/2740) - Built-in support for [HunyuanVideo](https://github.com/Tencent/HunyuanVideo): [#2721](https://github.com/xorbitsai/inference/pull/2721) - Built-in support for [HunyuanDiT](https://github.com/Tencent/HunyuanDiT): [#2727](https://github.com/xorbitsai/inference/pull/2727) - Built-in support for 
[Macro-o1](https://github.com/AIDC-AI/Marco-o1): [#2749](https://github.com/xorbitsai/inference/pull/2749) -- Built-in support for [Stable Diffusion 3.5](https://huggingface.co/collections/stabilityai/stable-diffusion-35-671785cca799084f71fa2838): [#2706](https://github.com/xorbitsai/inference/pull/2706) -- Built-in support for [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B): [#2684](https://github.com/xorbitsai/inference/pull/2684) ### Integrations - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable. - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization. diff --git a/README_zh_CN.md b/README_zh_CN.md index 239ca20bd0..07a9465965 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -44,13 +44,13 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布 - 增加 Metrics 统计信息: [#906](https://github.com/xorbitsai/inference/pull/906) ### 新模型 - 内置 [DeepSeek-R1-Distill-Qwen](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2781](https://github.com/xorbitsai/inference/pull/2781) +- 内置 [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788) +- 内置 [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789) - 内置 [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760) - 内置 [CogAgent](https://github.com/THUDM/CogAgent): [#2740](https://github.com/xorbitsai/inference/pull/2740) - 内置 [HunyuanVideo](https://github.com/Tencent/HunyuanVideo): 
[#2721](https://github.com/xorbitsai/inference/pull/2721) - 内置 [HunyuanDiT](https://github.com/Tencent/HunyuanDiT): [#2727](https://github.com/xorbitsai/inference/pull/2727) - 内置 [Macro-o1](https://github.com/AIDC-AI/Marco-o1): [#2749](https://github.com/xorbitsai/inference/pull/2749) -- 内置 [Stable Diffusion 3.5](https://huggingface.co/collections/stabilityai/stable-diffusion-35-671785cca799084f71fa2838): [#2706](https://github.com/xorbitsai/inference/pull/2706) -- 内置 [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B): [#2684](https://github.com/xorbitsai/inference/pull/2684) ### 集成 - [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/):一个基于 LLM 大模型的开源 AI 知识库构建平台。提供了开箱即用的数据处理、模型调用、RAG 检索、可视化 AI 工作流编排等能力,帮助您轻松实现复杂的问答场景。 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。 diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst index 84fc13e153..e4cd4925c8 100644 --- a/doc/source/getting_started/installation.rst +++ b/doc/source/getting_started/installation.rst @@ -64,6 +64,7 @@ Currently, supported models include: - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` - ``minicpm3-4b`` +- ``internlm3-instruct`` .. vllm_end To install Xinference and vLLM:: diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index c82b504154..4e035f37c3 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -221,6 +221,11 @@ The following is a list of built-in LLM in Xinference: - 262144 - InternLM2.5 series of the InternLM model supports 1M long-context + * - :ref:`internlm3-instruct <models_llm_internlm3-instruct>` + - chat, tools + - 32768 + - InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning. 
+ * - :ref:`internvl-chat <models_llm_internvl-chat>` - chat, vision - 32768 @@ -678,6 +683,8 @@ The following is a list of built-in LLM in Xinference: internlm2.5-chat-1m + internlm3-instruct + internvl-chat internvl2 diff --git a/doc/source/models/builtin/llm/internlm3-instruct.rst b/doc/source/models/builtin/llm/internlm3-instruct.rst new file mode 100644 index 0000000000..83f62b4584 --- /dev/null +++ b/doc/source/models/builtin/llm/internlm3-instruct.rst @@ -0,0 +1,95 @@ +.. _models_llm_internlm3-instruct: + +======================================== +internlm3-instruct +======================================== + +- **Context Length:** 32768 +- **Model Name:** internlm3-instruct +- **Languages:** en, zh +- **Abilities:** chat, tools +- **Description:** InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 8 +- **Quantizations:** 4-bit, 8-bit, none +- **Engines**: vLLM, Transformers (vLLM only available for quantization none) +- **Model ID:** internlm/internlm3-8b-instruct +- **Model Hubs**: `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (gptq, 8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 8 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** internlm/internlm3-8b-instruct-gptq-int4 +- **Model Hubs**: `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-gptq-int4>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-gptq-int4>`__ + +Execute the following command to launch the model, remember to 
replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format gptq --quantization ${quantization} + + +Model Spec 3 (awq, 8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** awq +- **Model Size (in billions):** 8 +- **Quantizations:** Int4 +- **Engines**: vLLM, Transformers +- **Model ID:** internlm/internlm3-8b-instruct-awq +- **Model Hubs**: `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-awq>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-awq>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format awq --quantization ${quantization} + + +Model Spec 4 (ggufv2, 8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 8 +- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0 +- **Engines**: llama.cpp +- **Model ID:** internlm/internlm3-8b-instruct-gguf +- **Model Hubs**: `Hugging Face <https://huggingface.co/internlm/internlm3-8b-instruct-gguf>`__, `ModelScope <https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (mlx, 8 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 8 +- **Quantizations:** 4bit +- **Engines**: MLX +- **Model ID:** mlx-community/internlm3-8b-instruct-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/internlm3-8b-instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/internlm3-8b-instruct-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options 
listed above:: + + xinference launch --model-engine ${engine} --model-name internlm3-instruct --size-in-billions 8 --model-format mlx --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst b/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst index 05fab7e969..3451a8f4cc 100644 --- a/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst +++ b/doc/source/models/builtin/llm/qwen2.5-vl-instruct.rst @@ -61,3 +61,51 @@ chosen quantization method from the options listed above:: xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format pytorch --quantization ${quantization} + +Model Spec 4 (mlx, 3 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 3 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: Transformers, MLX +- **Model ID:** mlx-community/Qwen2.5-VL-3B-Instruct-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 3 --model-format mlx --quantization ${quantization} + + +Model Spec 5 (mlx, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 7 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: Transformers, MLX +- **Model ID:** mlx-community/Qwen2.5-VL-7B-Instruct-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 7 --model-format mlx --quantization ${quantization} + 
+ +Model Spec 6 (mlx, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** mlx +- **Model Size (in billions):** 72 +- **Quantizations:** 3bit, 4bit, 6bit, 8bit, bf16 +- **Engines**: Transformers, MLX +- **Model ID:** mlx-community/Qwen2.5-VL-72B-Instruct-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}>`__, `ModelScope <https://modelscope.cn/models/mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name qwen2.5-vl-instruct --size-in-billions 72 --model-format mlx --quantization ${quantization} + diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst index 9bdfb21b25..b54b832be0 100644 --- a/doc/source/user_guide/backends.rst +++ b/doc/source/user_guide/backends.rst @@ -71,6 +71,7 @@ Currently, supported model includes: - ``orion-chat``, ``orion-chat-rag`` - ``c4ai-command-r-v01`` - ``minicpm3-4b`` +- ``internlm3-instruct`` .. 
vllm_end SGLang diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 7defdd7db7..d00081f7bd 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -7162,6 +7162,42 @@ "none" ], "model_id":"Qwen/Qwen2.5-VL-72B-Instruct" + }, + { + "model_format":"mlx", + "model_size_in_billions":3, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}" + }, + { + "model_format":"mlx", + "model_size_in_billions":7, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}" + }, + { + "model_format":"mlx", + "model_size_in_billions":72, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}" } ], "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", @@ -9411,6 +9447,14 @@ ], 
"model_id": "internlm/internlm3-8b-instruct-gguf", "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf" + }, + { + "model_format":"mlx", + "model_size_in_billions":8, + "quantizations":[ + "4bit" + ], + "model_id":"mlx-community/internlm3-8b-instruct-{quantization}" } ], "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index e0a660f358..4d084ed8b4 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -4769,10 +4769,11 @@ "model_format":"mlx", "model_size_in_billions":2, "quantizations":[ + "4bit", "8bit" ], "model_hub": "modelscope", - "model_id":"okwinds/Qwen2-VL-2B-Instruct-MLX-8bit", + "model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}", "model_revision":"master" }, { @@ -4865,6 +4866,45 @@ ], "model_hub": "modelscope", "model_id":"qwen/Qwen2.5-VL-72B-Instruct" + }, + { + "model_format":"mlx", + "model_size_in_billions":3, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_hub": "modelscope", + "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}" + }, + { + "model_format":"mlx", + "model_size_in_billions":7, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_hub": "modelscope", + "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}" + }, + { + "model_format":"mlx", + "model_size_in_billions":72, + "quantizations":[ + "3bit", + "4bit", + "6bit", + "8bit", + "bf16" + ], + "model_hub": "modelscope", + "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}" } ], "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in 
messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", @@ -7121,6 +7161,15 @@ "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf", "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf", "model_hub": "modelscope" + }, + { + "model_format":"mlx", + "model_size_in_billions":8, + "quantizations":[ + "4bit" + ], + "model_hub": "modelscope", + "model_id":"mlx-community/internlm3-8b-instruct-{quantization}" } ], "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",