Skip to content

Commit

Permalink
Add microservice to add OCR to PDF [minor] (#50)
Browse files Browse the repository at this point in the history
  • Loading branch information
joecorall authored Oct 18, 2024
1 parent 4c4c259 commit e31c347
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint-test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
- name: Find docker files
id: images
run: |
dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage|ocrpdf)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
echo "dockerFiles=$dockerFiles" >> $GITHUB_OUTPUT
env:
GITHUB_REF: ${{ github.ref }}
Expand All @@ -93,7 +93,7 @@ jobs:
needs: [build-push]
strategy:
matrix:
dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile"]
# Images excluded from the generic build matrix above; built after build-push.
dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile", "examples/ocrpdf/Dockerfile"]
uses: ./.github/workflows/build-push.yml
with:
dockerFile: ${{ matrix.dockerFile }}
Expand Down
7 changes: 7 additions & 0 deletions ci/k8s/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,10 @@ spec:
name: islandora-mergepdf
port:
number: 8080
# Route /ocrpdf traffic to the ocrpdf microservice.
# NOTE(review): the path uses regex capture groups with pathType Prefix; some
# ingress controllers expect ImplementationSpecific for regex paths - confirm.
- path: /ocrpdf(/|$)(.*)
  pathType: Prefix
  backend:
    service:
      name: islandora-ocrpdf
      port:
        # Must be the Service's own port (8886), not the container's
        # targetPort (8080): the islandora-ocrpdf Service only exposes 8886.
        number: 8886
46 changes: 46 additions & 0 deletions ci/k8s/ocrpdf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
# Service: cluster clients reach the OCR microservice at islandora-ocrpdf:8886;
# traffic is forwarded to port 8080 on the selected pods.
apiVersion: v1
kind: Service
metadata:
  name: islandora-ocrpdf
spec:
  selector:
    app: islandora-ocrpdf
  ports:
    - protocol: TCP
      port: 8886
      targetPort: 8080
---
# Deployment: three replicas of the scyllaridae ocrpdf container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: islandora-ocrpdf
spec:
  replicas: 3
  selector:
    matchLabels:
      app: islandora-ocrpdf
  template:
    metadata:
      labels:
        app: islandora-ocrpdf
    spec:
      containers:
        - name: scyllaridae-ocrpdf
          # NOTE(review): "main" is a moving tag; with IfNotPresent a node that
          # already cached an older "main" will keep running it - confirm.
          image: lehighlts/scyllaridae-ocrpdf:main
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              memory: "128Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
          ports:
            # No hostPort: binding a host port allows only one replica per
            # node and can block rolling updates when every node already runs
            # a pod. The Service above is the supported way in.
            - containerPort: 8080
          readinessProbe:
            httpGet:
              path: /healthcheck
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
2 changes: 1 addition & 1 deletion examples/mergepdf/cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ I=0
# iterate over all images in the IIIF manifest
curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | while read -r URL; do
# resize image to max 1000px width
curl -s "$URL" | convert -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1
curl -s "$URL" | magick -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1

# make an OCR'd PDF from the image
tesseract "$TMP_DIR/img_$I" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
Expand Down
24 changes: 24 additions & 0 deletions examples/ocrpdf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image for the ocrpdf microservice: the scyllaridae ImageMagick base image
# plus tesseract (with language data), ghostscript, jq and poppler-utils.
ARG TAG=main
ARG DOCKER_REPOSITORY=local
FROM ${DOCKER_REPOSITORY}/scyllaridae-imagemagick:${TAG} AS scyllaridae

# `apk add --no-cache` fetches the package index on the fly and leaves no
# cache behind, so a preceding `apk update` is unnecessary (it would only
# write an index cache into the layer).
RUN apk add --no-cache \
    ghostscript==10.04.0-r0 \
    jq==1.7.1-r0 \
    leptonica-dev==1.84.1-r0 \
    tesseract-ocr==5.3.4-r0 \
    tesseract-ocr-data-eng==5.3.4-r0 \
    tesseract-ocr-data-fra==5.3.4-r0 \
    tesseract-ocr-data-spa==5.3.4-r0 \
    tesseract-ocr-data-ita==5.3.4-r0 \
    tesseract-ocr-data-por==5.3.4-r0 \
    tesseract-ocr-data-hin==5.3.4-r0 \
    tesseract-ocr-data-deu==5.3.4-r0 \
    tesseract-ocr-data-jpn==5.3.4-r0 \
    tesseract-ocr-data-rus==5.3.4-r0 \
    poppler-utils==24.02.0-r1

# Add this example's files (cmd.sh, scyllaridae.yml) to /app.
COPY . /app

ENTRYPOINT ["/app/docker-entrypoint.sh"]
49 changes: 49 additions & 0 deletions examples/ocrpdf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# ocrpdf

Adds a searchable OCR text layer to a PDF that does not already have one.

## Install

### Deploy microservice


#### docker-compose

Add the microservice to your `docker-compose.yml`:

```
ocrpdf-dev: &ocrpdf
<<: [*dev, *common]
image: lehighlts/scyllaridae-ocrpdf:main
networks:
default:
aliases:
- ocrpdf
ocrpdf-prod:
<<: [*prod, *ocrpdf]
```

#### kubernetes

See [service/deployment manifest in scyllaridae repo](https://github.com/lehigh-university-libraries/scyllaridae/blob/main/ci/k8s/ocrpdf.yaml)


### Configure alpaca

You'll also need to add `ocrpdf` to `derivative.systems.installed` in your `alpaca.properties` by adding that string to the `ALPACA_DERIVATIVE_SYSTEMS` environment variable in your alpaca service.

```
ALPACA_DERIVATIVE_SYSTEMS=ocrpdf
```

You'll also need to define the service in `alpaca.properties.tmpl`:

```
derivative.ocrpdf.enabled=true
derivative.ocrpdf.in.stream=queue:islandora-connector-ocrpdf
# this url may be different if deploying via kubernetes
derivative.ocrpdf.service.url=http://ocrpdf:8080
derivative.ocrpdf.concurrent-consumers=1
derivative.ocrpdf.max-concurrent-consumers=-1
derivative.ocrpdf.async-consumer=true
```
28 changes: 28 additions & 0 deletions examples/ocrpdf/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
#
# Add an OCR text layer to a PDF.
#
# Reads the source PDF on stdin and writes the OCR'd PDF to stdout.
# Exits non-zero if any step fails or the assembled PDF does not pass pdfinfo.

set -eou pipefail

TMP_DIR=$(mktemp -d)

# Remove the scratch directory on every exit path. With `set -e` a failing
# step aborts the script immediately, so cleanup placed at the end of the
# script would be skipped and the directory would be left behind.
cleanup() {
  cd /
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT

cd "$TMP_DIR"

# split pdf into PNG files
# Page numbers are zero-padded so the shell globs below expand in page order;
# unpadded names sort lexicographically (page-10 before page-2) and would
# scramble any document longer than ten pages.
# NOTE(review): magick rasterizes PDFs at its default density while tesseract
# is told the images are 300 DPI below - confirm whether `-density 300` is
# wanted here.
magick - page-%05d.png > /dev/null 2>&1

# add OCR to each PNG (writes page-NNNNN.pdf next to each page-NNNNN.png)
for page in page-*.png; do
  tesseract "$page" "${page%.png}" --dpi 300 pdf > /dev/null 2>&1
done

# put the PDF back together, in page order
pdfunite page-*.pdf output.pdf > /dev/null 2>&1

# make sure the PDF is legit
pdfinfo output.pdf > /dev/null || exit 1

# print the results to stdout
cat output.pdf
5 changes: 5 additions & 0 deletions examples/ocrpdf/scyllaridae.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# MIME types listed as allowed for this service: PDF only.
allowedMimeTypes:
  - "application/pdf"
# Command configured per MIME type; only a default entry is defined,
# pointing at the OCR script.
cmdByMimeType:
  default:
    cmd: /app/cmd.sh

0 comments on commit e31c347

Please sign in to comment.