From ef8531f993fc16854cfc4fc70ef7cd3996fdfef0 Mon Sep 17 00:00:00 2001
From: Thomas Kosmas
Date: Fri, 27 Dec 2024 04:17:13 +0200
Subject: [PATCH] add serverless

---
Review notes (this section is not part of the commit message):
* Line breaks and indentation were rebuilt from a whitespace-collapsed copy
  of this patch; confirm the image.yml context lines against the target file
  before applying.
* Dockerfile.serverless: POETRY_VERSION is redeclared inside the build stage
  (an ARG placed before FROM is not visible to later instructions), ENV uses
  the key=value form throughout, and CMD starts the worker as a module
  because the file lives at app/serverless.py and uses a relative import.
* app/serverless.py: the caller-supplied filename is reduced to its final
  path component, the per-job scratch directory is removed on every exit
  path, and the file now ends with a newline.
* The "index" blob ids on the two new files predate these edits.

 .github/workflows/image.yml | 26 ++++++++++++-
 Dockerfile.serverless       | 45 ++++++++++++++++++++++
 app/serverless.py           | 92 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile.serverless
 create mode 100644 app/serverless.py

diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index f367802..f8036f2 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -26,9 +26,33 @@ jobs:
           username: ${{github.actor}}
           password: ${{secrets.GITHUB_TOKEN}}
 
-      - name: 'Build Inventory Image'
+      - name: 'Build API Image'
         run: |
           docker build . --tag ghcr.io/mendableai/mineru-api:latest
           docker image ls ghcr.io/mendableai/mineru-api:latest
           docker history ghcr.io/mendableai/mineru-api:latest
           docker push ghcr.io/mendableai/mineru-api:latest
+
+  push-serverless:
+    runs-on: ubuntu-latest-m
+    defaults:
+      run:
+        working-directory: './'
+    steps:
+      - name: 'Checkout GitHub Action'
+        uses: actions/checkout@main
+
+      - name: 'Login to GitHub Container Registry'
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{github.actor}}
+          password: ${{secrets.GITHUB_TOKEN}}
+
+      - name: 'Build Serverless Image'
+        run: |
+          docker build . -f Dockerfile.serverless --tag ghcr.io/mendableai/mineru-api-serverless:latest
+          docker image ls ghcr.io/mendableai/mineru-api-serverless:latest
+          docker history ghcr.io/mendableai/mineru-api-serverless:latest
+          docker push ghcr.io/mendableai/mineru-api-serverless:latest
+
diff --git a/Dockerfile.serverless b/Dockerfile.serverless
new file mode 100644
index 0000000..33ef9d6
--- /dev/null
+++ b/Dockerfile.serverless
@@ -0,0 +1,45 @@
+# Image for the RunPod serverless worker (entry point: app/serverless.py).
+ARG POETRY_VERSION=1.6.1
+
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+
+# An ARG declared before FROM is out of scope inside the build stage;
+# redeclare it so the Poetry installer below receives the pinned version.
+ARG POETRY_VERSION
+
+# Allow statements and log messages to immediately appear in the logs
+ENV PYTHONUNBUFFERED=True
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y tzdata
+# ENV TZ Asia/Tokyo
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends curl g++ libopencv-dev python3.10 python3-pip && \
+    rm -rf /var/lib/apt/lists/*
+RUN curl -sSL https://install.python-poetry.org | POETRY_VERSION=${POETRY_VERSION} python3.10 -
+
+ENV APP_HOME=/app
+WORKDIR $APP_HOME
+
+COPY pyproject.toml poetry.lock ./
+
+ENV PATH="/root/.local/bin:$PATH"
+RUN poetry config virtualenvs.create false && \
+    poetry install --no-interaction --no-root && \
+    rm -rf /root/.cache/pypoetry && \
+    rm -rf /root/.cache/pip
+
+# Use the GPU build of PaddlePaddle
+RUN pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+
+RUN pip install runpod
+
+COPY . ./
+COPY magic-pdf.gpu.json /root/magic-pdf.json
+
+RUN python3.10 download_models.py
+
+# serverless.py lives in the app/ package and uses a relative import, so it
+# must be started as a module from $APP_HOME rather than as a bare script.
+CMD ["python3.10", "-m", "app.serverless"]
diff --git a/app/serverless.py b/app/serverless.py
new file mode 100644
index 0000000..ddeaa48
--- /dev/null
+++ b/app/serverless.py
@@ -0,0 +1,92 @@
+"""RunPod serverless worker: convert a base64-encoded document to Markdown."""
+
+import base64
+import os
+import shutil
+from pathlib import Path
+from uuid import uuid4
+
+import magic_pdf.model as model_config
+import runpod
+from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+
+from .office_converter import OfficeConverter, OfficeExts
+
+# Configure model settings
+model_config.__use_inside_model__ = True
+model_config.__model_mode__ = "full"
+
+_tmp_dir = "/tmp/{uuid}"
+_local_image_dir = "/tmp/{uuid}/images"
+
+
+def handler(event):
+    """Convert the document carried by a job event to Markdown.
+
+    Args:
+        event: Job payload. ``event["input"]`` must carry ``file_content``
+            (the base64-encoded file) and ``filename`` (its extension
+            selects the conversion path).
+
+    Returns:
+        ``{"markdown": ...}`` on success, otherwise ``{"error": message}``.
+        Failures are reported in the result rather than raised.
+    """
+    tmp_dir = None
+    try:
+        # Extract base64 encoded file and filename from the event
+        input_data = event.get("input", {})
+        base64_content = input_data.get("file_content")
+        filename = input_data.get("filename")
+
+        if not base64_content or not filename:
+            return {"error": "Missing file_content or filename"}
+
+        # The filename is caller-supplied: keep only its last path component
+        # so the upload cannot be written outside the scratch directory.
+        filename = Path(filename).name
+
+        # Decode base64 content
+        pdf_bytes = base64.b64decode(base64_content)
+
+        # Set up temporary directories
+        uuid_str = str(uuid4())
+        tmp_dir = _tmp_dir.format(uuid=uuid_str)
+        local_image_dir = _local_image_dir.format(uuid=uuid_str)
+        os.makedirs(tmp_dir, exist_ok=True)
+        os.makedirs(local_image_dir, exist_ok=True)
+
+        # Handle office documents conversion
+        if filename.endswith(OfficeExts.__args__):
+            input_file: Path = Path(tmp_dir) / filename
+            input_file.write_bytes(pdf_bytes)
+            output_file: Path = Path(tmp_dir) / f"{Path(filename).stem}.pdf"
+            office_converter = OfficeConverter()
+            office_converter.convert(input_file, output_file)
+            pdf_bytes = output_file.read_bytes()
+        elif not filename.endswith(".pdf"):
+            return {"error": "Unsupported file type"}
+
+        # Process PDF
+        image_writer = DiskReaderWriter(local_image_dir)
+        jso_useful_key = {"_pdf_type": "", "model_list": []}
+        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
+        pipe.pipe_classify()
+        pipe.pipe_analyze()
+        pipe.pipe_parse()
+        md_content = pipe.pipe_mk_markdown(local_image_dir, drop_mode="none")
+
+        return {"markdown": md_content}
+
+    except Exception as e:
+        return {"error": str(e)}
+
+    finally:
+        # A worker may handle more than one job, so remove this job's scratch
+        # directory on every exit path to keep /tmp from filling up.
+        if tmp_dir is not None:
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+runpod.serverless.start({"handler": handler})