diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f8022cf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+#FROM python:3.8-slim-buster
+FROM python:3.10
+
+# Copy the current directory into the image
+ADD . /workspace/code-repo
+# Use /workspace/code-repo as the working directory for the following instructions
+WORKDIR /workspace/code-repo
+
+RUN pip install fastapi uvicorn
+RUN pip3 install requests
+
+ENV PYTHONPATH /workspace/code-repo
+
+# Install required tools such as curl (if not already present)
+RUN apt-get update && apt-get install -y curl
+
+# Run the Ollama install script
+RUN curl -fsSL https://ollama.com/install.sh | sh
+
+# After installation, the Ollama service is started in CMD below
+#RUN ollama serve
+#RUN ollama run qwen2:0.5b
+
+# Legacy Flask setting; the service itself runs on FastAPI/uvicorn
+ENV FLASK_RUN_HOST=0.0.0.0
+
+# Expose the API port
+EXPOSE 8000
+
+# ENTRYPOINT [ "python3", "./app_stream.py" ]
+# CMD ["flask", "run"]
+
+# Start the Ollama server in the background, load the llama3.2 model, then launch the API with uvicorn.
+# A single CMD is used because only the last CMD in a Dockerfile takes effect.
+CMD sh -c "ollama serve & sleep 5 && ollama run llama3.2 & uvicorn app:app --host 0.0.0.0 --port 8000"
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c90c21f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,14 @@
+.PHONY: build
+app=rongliang_algorithm_serving_applet_cloud
+legacy_image=$(shell docker images --filter=reference="*rongliang_algorithm_serving_applet_cloud*" -q)
+version=$(shell date '+%Y%m%d%H%M')
+build:
+ifeq ($(strip $(legacy_image)),)
+	@echo "nope"
+else
+	docker rmi -f ${legacy_image}
+endif
+	docker buildx build --platform linux/amd64 -t ${app} .
+# docker login --username=trsopenapi@1219654161317312 registry.cn-beijing.aliyuncs.com/saasalpha/tmaster --password=AlipaySaas22
+# docker tag ${app} registry.cn-beijing.aliyuncs.com/saasalpha/tmaster:ats_${app}_$(version)
+# docker push registry.cn-beijing.aliyuncs.com/saasalpha/tmaster:ats_${app}_$(version)
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7015661
--- /dev/null
+++ b/README.md
@@ -0,0 +1,213 @@
+<div align="center">
+ModelCache
+</div>
+<div align="center">
+中文 | English
+</div>
+
+## Contents
+- [news](#news)
+- [Introduction](#Introduction)
+- [Quick-Deployment](#Quick-Deployment)
+- [Service-Access](#Service-Access)
+- [Modules](#Modules)
+- [Core-Features](#Core-Features)
+- [Acknowledgements](#Acknowledgements)
+- [Contributing](#Contributing)
+
+### news
+- [2024.11.06] EasyDeploy was released, built on a Docker- and Ollama-based architecture.
+
+## Introduction
+EasyDeploy provides users with end-to-end deployment capabilities for large models. By packaging the deployment and inference logic of large models inside Docker, EasyDeploy streamlines the overall deployment process and significantly improves the user experience. EasyDeploy currently supports multiple engines, including Ollama, and plans to extend support to additional engines such as vLLM in the future.
+
+Through EasyDeploy, users can rapidly deploy and launch large models across cloud environments and local devices, eliminating technical barriers so they can focus on applying and optimizing the models themselves. Whether running in a local environment or on a cloud platform, EasyDeploy offers an efficient and reliable solution, helping artificial intelligence move quickly from development to practical use.
+
+## Quick-Deployment
+### Dependencies
++ Python version: 3.10
++ Package installation
+```shell
+pip install -r requirements.txt
+```
+### Service Startup
+Download the Docker image
+
+Download link: to be updated after upload
+
+```shell
+docker run -p 8000:8000 easydeploy_llama3.2_3b
+```
+
+## Service-Access
+The service exposes both streaming and blocking access through RESTful APIs. Example requests are shown below.
+
+### Chat Page
+[http://127.0.0.1:8000/chat](http://127.0.0.1:8000/chat)
+
+### API Interface
+#### Blocking Access
+**Request Method**:
+```python
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+url = 'http://127.0.0.1:8000/chat/completions'
+prompt = 'hello'
+model = 'llama3.2'
+messages = [{"role": "user", "content": prompt}]
+data = {'model': model, 'messages': messages}
+headers = {"Content-Type": "application/json"}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+if response.status_code == 200:
+    ans_dict = json.loads(response.text)
+    print('data: {}'.format(ans_dict))
+```
+
+**Return Format**:
+
+```json
+{
+  "id": "ollama-123",
+  "object": "chat.completion",
+  "created": 1731847899,
+  "model": "llama3.2",
+  "system_fingerprint": "",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "hi! How can I assist you today?"
+      },
+      "logprobs": null,
+      "finish_reason": "stop"
+    }
+  ],
+  "usage": {}
+}
+```
+
+#### Streaming Access
+**Request Method**:
+
+```python
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+url = 'http://127.0.0.1:8000/chat/completions'
+prompt = 'hello'
+model = 'llama3.2'
+messages = [{"role": "user", "content": prompt}]
+data = {'model': model, 'messages': messages, 'stream': True}
+headers = {"Content-Type": "application/json"}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+```
+
+**Return Format**:
+```json
+{
+  "id": "ollama-123",
+  "object": "chat.completion.chunk",
+  "created": 1731848401,
+  "model": "llama3.2",
+  "system_fingerprint": "",
+  "choices": [
+    {
+      "index": 0,
+      "delta": {
+        "role": "assistant",
+        "content": "hi"
+      },
+      "logprobs": null,
+      "finish_reason": null
+    }
+  ]
+}
+```
+
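Each non-empty line of the streaming response carries one JSON chunk in the format above (this is how `example/ollama_app_stream.py` in this repository consumes it). As a minimal sketch, assuming `stream=True` is also passed to `requests.post` so the body is not buffered, the chunks can be read line by line and the `delta.content` pieces concatenated:

```python
# -*- coding: utf-8 -*-
# Sketch: consume the streaming /chat/completions response chunk by chunk.
import json
import requests

url = 'http://127.0.0.1:8000/chat/completions'
data = {'model': 'llama3.2',
        'messages': [{"role": "user", "content": "hello"}],
        'stream': True}
headers = {"Content-Type": "application/json"}

answer = ''
with requests.post(url, headers=headers, data=json.dumps(data), stream=True) as response:
    for line in response.iter_lines():
        if not line:                 # skip keep-alive blank lines
            continue
        chunk = json.loads(line.decode('utf-8'))
        delta = chunk['choices'][-1]['delta']
        answer += delta.get('content', '')   # the final chunk may carry no content
print(answer)
```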
+## Modules
+![easydeploy modules](docs/easydeploy_modules_20241125.png)
+
+## Core-Features
+
+| Category | Function | Status | Description |
+| --- | --- | --- | --- |
+| API Service | OpenAI Standard API | | The service interface complies with the OpenAI standard, minimizing integration costs through standardized APIs. Users can integrate and maintain the system easily, respond quickly to business requirements, and concentrate on core development. |
+| | Blocking access capabilities | | Suitable for tasks that require completeness and coherence, or whose results are validated and processed as a whole; the full output is obtained in a single response. The caller waits until all output content has been generated. |
+| | Streaming access capabilities | | Suitable for real-time applications with strict response-time requirements, such as code completion, real-time translation, or dynamically loaded content. The model transmits content incrementally as it is generated, so callers can receive and process partial output immediately without waiting for completion, improving interactivity. |
+| | High-performance gateway | | The gateway handles high-concurrency requests, reduces latency, and improves response times by optimizing data transmission, applying advanced load-balancing algorithms, and managing resources efficiently. |
+| Multi-engine Support | Ollama | | Ollama is known for its ease of use and light weight, focusing on efficient and stable large-model inference. Its friendly API and streamlined workflow let developers integrate it and deploy model-backed applications quickly. |
+| | vLLM | | vLLM offers significant advantages in memory management and throughput. By optimizing memory usage and parallel computation, it substantially improves inference speed and resource efficiency while remaining compatible with a variety of hardware environments. Rich configuration options let users tune inference strategies to their needs, and its scalable architecture suits both research and enterprise-level applications. |
+| | TensorRT-LLM | | TensorRT-LLM (TensorRT for Large Language Models) is a high-performance, scalable deep-learning inference optimization library developed by NVIDIA, designed specifically for large language models (LLMs). |
+| Docker Deployment Capability | Docker images built with Python 3.10 | | Packages the model and its dependencies into an image to guarantee a consistent runtime environment and simplify deployment and configuration. Docker-based versioning and automated deployment speed up model updates and iteration, shortening the path from development to production. |
+| Web UI Integration | OpenUI protocol | | The open UI protocol makes it easy to integrate diverse components, improving product customizability and extensibility. |
+| More Core Features | ModelCache semantic caching | | By caching generated QA pairs, similar requests can be served with millisecond-level responses, improving inference performance and efficiency (see the sketch below this table). |
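For illustration, the semantic-caching idea in the last row can be sketched as follows. This is a minimal sketch and not ModelCache's actual API: it assumes a `sentence-transformers` embedding model and a simple in-memory store, and returns a cached answer whenever a new query is similar enough to a previously answered one.

```python
# Minimal illustration of semantic caching (not ModelCache's real implementation).
# Assumes: pip install sentence-transformers numpy
import numpy as np
from sentence_transformers import SentenceTransformer


class SemanticCache:
    def __init__(self, threshold: float = 0.9):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
        self.threshold = threshold
        self.keys: list[np.ndarray] = []   # normalized query embeddings
        self.values: list[str] = []        # cached answers

    def _embed(self, text: str) -> np.ndarray:
        vec = self.model.encode(text)
        return vec / np.linalg.norm(vec)

    def get(self, query: str) -> str | None:
        if not self.keys:
            return None
        q = self._embed(query)
        sims = [float(np.dot(q, k)) for k in self.keys]
        best = int(np.argmax(sims))
        # Only return the cached answer if the query is semantically close enough.
        return self.values[best] if sims[best] >= self.threshold else None

    def put(self, query: str, answer: str) -> None:
        self.keys.append(self._embed(query))
        self.values.append(answer)


# Usage: consult the cache before calling the model, store the answer afterwards.
cache = SemanticCache()
cache.put("What is EasyDeploy?", "EasyDeploy packages LLM serving into Docker images.")
print(cache.get("Can you explain what EasyDeploy is?"))  # cache hit if similarity >= threshold
```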
+
+## Acknowledgements
+This project draws on the following open-source projects, and we thank the relevant projects and researchers for their contributions.
+[Ollama](https://github.com/ollama/ollama), [vLLM](https://github.com/vllm-project/vllm)
+
+## Contributing
+EasyDeploy is an interesting and valuable project that we believe holds significant potential. We welcome contributions from seasoned developers and newcomers alike, including but not limited to submitting issues and suggestions, contributing code, and improving the documentation and examples.
diff --git a/README_CN.md b/README_CN.md
new file mode 100644
index 0000000..8282536
--- /dev/null
+++ b/README_CN.md
@@ -0,0 +1,216 @@
+<div align="center">
+ModelCache
+</div>
+<div align="center">
+中文 | English
+</div>
+
+## Contents
+- [新闻](#新闻)
+- [项目简介](#项目简介)
+- [快速部署](#快速部署)
+- [服务访问](#服务访问)
+- [架构图](#架构图)
+- [核心功能](#核心功能)
+- [致谢](#致谢)
+- [Contributing](#Contributing)
+
+## 新闻
+- [2024.11.06] EasyDeploy 发布,采用 Docker + Ollama 的架构。
+
+## 项目简介
+EasyDeploy 旨在为用户提供端云一体的大模型部署能力,我们将大模型的部署和推理逻辑集成到 Docker 中,简化整体部署流程,全面提升用户体验。EasyDeploy 支持多种引擎,目前已支持 Ollama,未来将支持 vLLM 等其它引擎,进一步丰富用户的选择和应用场景。
+
+通过 EasyDeploy,用户能够快速在云端与端设备之间部署和启动大模型,消除技术壁垒,专注于模型本身的应用和优化。无论是在本地开发环境、云端平台还是端设备中,EasyDeploy 都将为用户提供高效、可靠的解决方案,助力人工智能的快速发展与应用落地。
+
+## 快速部署
+### 环境依赖
++ Python 版本:3.10
++ 依赖包安装:
+
+```shell
+pip install -r requirements.txt
+```
+### 服务启动
+Docker 镜像下载:
+
+下载地址:上传后更新
+
+```shell
+docker run -p 8000:8000 easydeploy_llama3.2_3b
+```
+
+## 服务访问
+当前服务以 RESTful API 方式提供流式与阻塞两种访问能力,请求示例如下:
+
+### chat 页面
+[http://127.0.0.1:8000/chat](http://127.0.0.1:8000/chat)
+
+### API 接口
+#### 阻塞访问
+请求方式:
+
+```python
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+url = 'http://127.0.0.1:8000/chat/completions'
+prompt = '你好'
+model = 'llama3.2'
+messages = [{"role": "user", "content": prompt}]
+data = {'model': model, 'messages': messages}
+headers = {"Content-Type": "application/json"}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+if response.status_code == 200:
+    ans_dict = json.loads(response.text)
+    print('data: {}'.format(ans_dict))
+```
+
+返回格式:
+
+```json
+{
+  "id": "ollama-123",
+  "object": "chat.completion",
+  "created": 1731847899,
+  "model": "llama3.2",
+  "system_fingerprint": "",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "你好,我是大语言模型,主要任务是帮助用户解决和解答问题,例如技术、编程、知识问答等方面的问题。"
+      },
+      "logprobs": null,
+      "finish_reason": "stop"
+    }
+  ],
+  "usage": {}
+}
+```
+
+#### 流式访问
+请求方式:
+
+```python
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+url = 'http://127.0.0.1:8000/chat/completions'
+prompt = '你好'
+model = 'llama3.2'
+messages = [{"role": "user", "content": prompt}]
+data = {'model': model, 'messages': messages, 'stream': True}
+headers = {"Content-Type": "application/json"}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+```
+
+返回格式:
+
+```json
+{
+  "id": "ollama-123",
+  "object": "chat.completion.chunk",
+  "created": 1731848401,
+  "model": "llama3.2",
+  "system_fingerprint": "",
+  "choices": [
+    {
+      "index": 0,
+      "delta": {
+        "role": "assistant",
+        "content": "你"
+      },
+      "logprobs": null,
+      "finish_reason": null
+    }
+  ]
+}
+```
+
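流式返回为逐行的 JSON 块(格式如上,可参考仓库中的 example/ollama_app_stream.py)。下面是一个简要示例,逐行读取流式响应并拼接 `delta.content`;其中向 `requests.post` 传入 `stream=True` 属于本示例的假设写法,用于避免一次性缓冲整个响应:

```python
# -*- coding: utf-8 -*-
# 示例:逐块消费 /chat/completions 的流式返回
import json
import requests

url = 'http://127.0.0.1:8000/chat/completions'
data = {'model': 'llama3.2',
        'messages': [{"role": "user", "content": "你好"}],
        'stream': True}
headers = {"Content-Type": "application/json"}

answer = ''
with requests.post(url, headers=headers, data=json.dumps(data), stream=True) as response:
    for line in response.iter_lines():
        if not line:  # 跳过空行
            continue
        chunk = json.loads(line.decode('utf-8'))
        answer += chunk['choices'][-1]['delta'].get('content', '')  # 末尾块可能没有 content
print(answer)
```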
+## 架构图
+![easydeploy modules](docs/easydeploy_modules_20241125.png)
+
+## 核心功能
+
+| 分类 | 功能名称 | 状态 | 描述 |
+| --- | --- | --- | --- |
+| API Service | 基于 OpenAI 的标准 API 规范 | | 服务接口遵循 OpenAI 规范,通过标准化 API 降低接入成本,用户可轻松集成和维护,快速响应业务需求,专注于核心开发。 |
+| | 阻塞式访问能力 | | 适用于需要完整性和连贯性、或需对结果进行整体校验和处理的任务,一次性获取完整输出。在整个过程中,用户需等待所有输出内容完全生成。 |
+| | 流式访问能力 | | 适用于对响应时间要求较高的实时应用,如代码补全、实时翻译或动态内容加载等场景。模型在生成过程中分段逐步传输内容,用户无需等待全部完成即可接收和处理部分输出,从而提升交互性。 |
+| | 高性能网关 | | 高性能网关通过优化数据传输、采用先进的负载均衡算法以及高效的资源管理,能够有效应对高并发请求、降低延迟并提升响应速度。 |
+| 多引擎支持 | Ollama | | Ollama 以易用和轻量著称,专注于高效稳定的大模型推理服务。其友好的 API 和简洁流畅的流程,使开发者能够轻松集成并快速部署大模型应用。 |
+| | vLLM | | vLLM 在内存管理和吞吐量上有显著优势,通过优化显存使用和并行计算,显著提升推理速度和资源利用率,并兼容多种硬件环境。vLLM 提供丰富的配置选项,用户可根据需求调整推理策略,其可扩展架构适用于研究和企业级应用。 |
+| | TensorRT-LLM | | TensorRT-LLM(TensorRT for Large Language Models)是 NVIDIA 开发的高性能、可扩展的深度学习推理优化库,专为大型语言模型(LLM)设计。 |
+| Docker 部署能力 | 基于 Python 3.10 构建 Docker 镜像 | | 将大模型及其依赖打包为镜像,确保运行环境一致,简化部署与配置;利用 Docker 的版本管理和自动化部署,提高模型更新与迭代效率,加快从开发到生产落地的转化。 |
+| Web UI 接入 | OpenUI 协议 | | 丰富的 UI 开源协议便于用户整合多种组件,提升产品的定制性和扩展性。 |
+| 更多核心功能 | ModelCache 语义缓存 | | 通过缓存已生成的 QA 对,使相似请求获得毫秒级响应,提高模型推理的性能与效率。 |
+
+## 致谢
+本项目参考了以下开源项目,在此对相关项目和研究开发人员表示感谢。
+[Ollama](https://github.com/ollama/ollama)、[vLLM](https://github.com/vllm-project/vllm)
+
+## Contributing
+EasyDeploy 是一个非常有趣且有用的项目,我们相信它有很大的潜力。无论你是经验丰富的开发者,还是刚刚入门的新手,都欢迎为这个项目做出贡献,包括但不限于:提交问题和建议、参与代码编写、完善文档和示例。
diff --git a/docs/easydeploy_modules_20241125.png b/docs/easydeploy_modules_20241125.png
new file mode 100644
index 0000000..68cfe15
Binary files /dev/null and b/docs/easydeploy_modules_20241125.png differ
diff --git a/example/__init__.py b/example/__init__.py
new file mode 100644
index 0000000..40a96af
--- /dev/null
+++ b/example/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/example/hello_word.py b/example/hello_word.py
new file mode 100644
index 0000000..22ff936
--- /dev/null
+++ b/example/hello_word.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+
+def run():
+    url = 'http://127.0.0.1:5000/'
+    headers = {"Content-Type": "application/json"}
+    res = requests.post(url, headers=headers)
+    res_text = res.text
+    print('res_text: {}'.format(res_text))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/example/ollama_app.py b/example/ollama_app.py
new file mode 100644
index 0000000..34d0af3
--- /dev/null
+++ b/example/ollama_app.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+
+def run():
+    url = 'http://127.0.0.1:8000/chat/completions'
+    prompt = '你是谁?'
+    model = 'qwen2:0.5b'
+    messages = [{"role": "user", "content": prompt}]
+    data = {'model': model, 'messages': messages}
+    headers = {"Content-Type": "application/json"}
+    response = requests.post(url, headers=headers, data=json.dumps(data))
+
+    if response.status_code == 200:
+        ans_dict = json.loads(response.text)
+        print('data: {}'.format(ans_dict))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/example/ollama_app_stream.py b/example/ollama_app_stream.py
new file mode 100644
index 0000000..4fe661e
--- /dev/null
+++ b/example/ollama_app_stream.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+import json
+import requests
+
+
+# Send the request with 'stream': True in the payload to receive streaming output
+url = 'http://127.0.0.1:8000/chat/completions'
+prompt = '你是谁?'
+model = 'qwen2:0.5b'
+messages = [{"role": "user", "content": prompt}]
+data = {'model': model, 'messages': messages, 'stream': True}
+headers = {"Content-Type": "application/json"}
+
+# stream=True keeps requests from buffering the whole response before iteration
+response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)
+
+resp = ''
+for line in response.iter_lines():
+    if not line:
+        # skip keep-alive blank lines
+        continue
+    chunk = json.loads(line.decode('utf-8'))
+    print('chunk: {}'.format(chunk))
+    # the final chunk may carry an empty delta, hence .get()
+    resp += chunk['choices'][-1]['delta'].get('content', '')
+print('resp: {}'.format(resp))
diff --git a/server/__init__.py b/server/__init__.py
new file mode 100644
index 0000000..40a96af
--- /dev/null
+++ b/server/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/server/ollama_server.py b/server/ollama_server.py
new file mode 100644
index 0000000..2da2a01
--- /dev/null
+++ b/server/ollama_server.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+import requests
+import json
+
+# Ollama's native (non-OpenAI) generate endpoint
+url_generate = "http://127.0.0.1:11434/api/generate"
+
+
+def get_response(url, data):
+    response = requests.post(url, json=data)
+    response_dict = json.loads(response.text)
+    response_content = response_dict["response"]
+    return response_content
+
+
+data = {
+    "model": "qwen2:0.5b",
+    "prompt": "你是谁?",
+    "stream": False
+}
+
+res = get_response(url_generate, data)
+print(res)
diff --git a/templates/chat_page.html b/templates/chat_page.html
new file mode 100644
index 0000000..6bd7291
--- /dev/null
+++ b/templates/chat_page.html
@@ -0,0 +1,131 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>LLMs Chat Page</title>
+</head>
+<body>
+    <h1>LLMs Chat</h1>
+    <!-- chat UI: conversation area, message input and send controls -->
+</body>
+</html>
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000..96a35b5
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Welcome to Flask</title>
+</head>
+<body>
+    <h1>Hello, Flask!</h1>
+    <p>This is a simple Flask web page.</p>
+</body>
+</html>