Merge pull request #34 from allenai/pbeukema/fastapi

yawenzzzz · web-flow · commit bc424fd87d85 · 2024-10-22T10:55:59.000-07:00
FastAPI for Landsat vessel detection
diff --git a/.github/workflows/build_test.yaml b/.github/workflows/build_test.yaml
@@ -91,10 +91,20 @@ jobs:
         run: |
           COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -f docker-compose.yaml build
 
+      - name: Authenticate into gcp
+        uses: "google-github-actions/auth@v2"
+        with:
+          credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }}
 
       - name: Run tests with Docker Compose
+        # Hand the AWS secrets to the shell through the step environment rather
+        # than templating them into the script text, so unusual characters in a
+        # secret value cannot change how the command line is parsed.
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |
-          docker compose -f docker-compose.yaml run test pytest tests/
+          docker compose -f docker-compose.yaml run \
+            -e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
+            -e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
+            -v "${{env.GOOGLE_GHA_CREDS_PATH}}:/tmp/gcp-credentials.json:ro" \
+            -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-credentials.json \
+            -e RSLP_BUCKET=rslearn-eai \
+            test pytest tests/ --ignore tests/integration_slow/
 
       - name: Clean up
         if: always()
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,8 @@
 FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime@sha256:58a28ab734f23561aa146fbaf777fb319a953ca1e188832863ed57d510c9f197
 
-# TEMPORARY Until RSLEARN Is Public
-ARG GIT_USERNAME
-ARG GIT_TOKEN
-
 RUN apt update
 RUN apt install -y libpq-dev ffmpeg libsm6 libxext6 git
-RUN git clone https://${GIT_USERNAME}:${GIT_TOKEN}@github.com/allenai/rslearn.git /opt/rslearn_projects/rslearn
+RUN git clone https://github.com/allenai/rslearn.git /opt/rslearn_projects/rslearn
 RUN pip install -r /opt/rslearn_projects/rslearn/requirements.txt
 RUN pip install -r /opt/rslearn_projects/rslearn/extra_requirements.txt
 COPY requirements.txt /opt/rslearn_projects/requirements.txt
diff --git a/landsat/recheck_landsat_labels/phase123_config.yaml b/landsat/recheck_landsat_labels/phase123_config.yaml
@@ -52,10 +52,12 @@ data:
               allow_invalid: true
               skip_unknown_categories: true
               prob_property: "prob"
+              positive_class: "correct"
+              positive_class_threshold: 0.85
         input_mapping:
           class:
             label: "targets"
-    batch_size: 64
+    batch_size: 32
     num_workers: 32
     default_config:
       transforms:
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,9 @@
 beaker-py
+fastapi
 interrogate
+pydantic
 pytest
 python-dotenv
+ruff
+typing-extensions
+uvicorn
diff --git a/rslp/landsat_vessels/Dockerfile b/rslp/landsat_vessels/Dockerfile
@@ -0,0 +1,12 @@
+# Base image: built from the repository-root Dockerfile and tagged
+# base-image:latest by the base-image service in docker-compose.yaml.
+FROM base-image:latest
+
+# Environment variables
+# Prepend the project root to PYTHONPATH. The ${VAR:+...} form appends the
+# previous value (with its ":" separator) only when PYTHONPATH is already set,
+# so an unset variable no longer leaves a trailing ":" (an empty search-path
+# entry) behind.
+ENV PYTHONPATH="/opt/rslearn_projects${PYTHONPATH:+:${PYTHONPATH}}"
+ENV LANDSAT_PORT=5555
+
+# Document the port the API listens on. EXPOSE does not publish the port by
+# itself; docker-compose.yaml maps it with "5555:5555".
+EXPOSE $LANDSAT_PORT
+
+# Run the FastAPI service (api_main.py) when the container launches.
+# NOTE(review): the script path is relative, so it relies on the working
+# directory inherited from the base image - confirm it is the project root.
+CMD ["python3", "rslp/landsat_vessels/api_main.py"]
diff --git a/rslp/landsat_vessels/api_main.py b/rslp/landsat_vessels/api_main.py
@@ -0,0 +1,98 @@
+"""Landsat Vessel Detection Service."""
+
+from __future__ import annotations
+
+import logging
+import multiprocessing
+import os
+
+import uvicorn
+from fastapi import FastAPI, Response
+from pydantic import BaseModel
+
+from rslp.landsat_vessels.predict_pipeline import FormattedPrediction, predict_pipeline
+
+# ASGI application object; the startup hook and routes below register on it.
+app = FastAPI()
+
+# Set up the logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Default bind address and port for the service.
+LANDSAT_HOST = "0.0.0.0"
+LANDSAT_PORT = 5555
+
+
+class LandsatResponse(BaseModel):
+    """Response object for vessel detections."""
+
+    # Outcome of the request; the /detections handler in this module sets it to
+    # ["success"] or ["error"].
+    status: list[str]
+    # The predictions returned by predict_pipeline; an empty list on error.
+    predictions: list[FormattedPrediction]
+
+
+class LandsatRequest(BaseModel):
+    """Request object for vessel detections."""
+
+    # Every field is forwarded unchanged to predict_pipeline as the keyword
+    # argument of the same name. The /detections handler does not accept a
+    # request in which both scene_id and image_files are missing.
+
+    # Landsat scene ID.
+    scene_id: str | None = None
+    # Alternative to scene_id.
+    # NOTE(review): presumably maps band name to image file path - confirm
+    # against predict_pipeline.
+    image_files: dict[str, str] | None = None
+    # Where crop images are written; no crops are written when omitted.
+    crop_path: str | None = None
+    # Working directory for the pipeline; a temporary directory is used when
+    # omitted.
+    scratch_path: str | None = None
+    # Where the predictions are additionally written as JSON; skipped when
+    # omitted.
+    json_path: str | None = None
+
+
+# Registered as a startup hook, so it runs when the application starts.
+@app.on_event("startup")
+async def rslp_init() -> None:
+    """Landsat Vessel Service Initialization."""
+    logger.info("Initializing")
+    # Select the "forkserver" start method for multiprocessing; force=True
+    # replaces a start method that has already been chosen.
+    multiprocessing.set_start_method("forkserver", force=True)
+    # Ask the fork server to import these modules up front so that processes
+    # forked from it inherit them already loaded.
+    multiprocessing.set_forkserver_preload(
+        [
+            "rslp.utils.rslearn.materialize_dataset",
+            "rslp.utils.rslearn.run_model_predict",
+        ]
+    )
+
+
+@app.get("/")
+async def home() -> dict:
+    """Reply with a fixed message showing that the service is up."""
+    banner = dict(message="Landsat Detections App")
+    return banner
+
+
+@app.post("/detections", response_model=LandsatResponse)
+async def get_detections(info: LandsatRequest, response: Response) -> LandsatResponse:
+    """Run the vessel prediction pipeline for a request and return its detections.
+
+    Args:
+        info: the request; at least one of scene_id or image_files must be set.
+        response: the response FastAPI will send, used here only to set the HTTP
+            status code when the request is rejected.
+
+    Returns:
+        A LandsatResponse with status ["success"] and the predictions, or with
+        status ["error"] and no predictions when the request is invalid or the
+        pipeline raises.
+    """
+    # Reject an unusable request with a 400 and the usual error payload. Raising
+    # here would escape the handler and reach the client as an unhandled 500.
+    if info.scene_id is None and info.image_files is None:
+        logger.error("Either scene_id or image_files must be specified.")
+        response.status_code = 400
+        return LandsatResponse(status=["error"], predictions=[])
+
+    if info.scene_id is not None:
+        logger.info(f"Received request with scene_id: {info.scene_id}")
+    else:
+        logger.info("Received request with image_files")
+
+    try:
+        # predict_pipeline is a plain (non-awaited) call, so this handler does
+        # not yield to the event loop while the pipeline runs.
+        json_data = predict_pipeline(
+            crop_path=info.crop_path,
+            scene_id=info.scene_id,
+            image_files=info.image_files,
+            scratch_path=info.scratch_path,
+            json_path=info.json_path,
+        )
+        return LandsatResponse(status=["success"], predictions=list(json_data))
+    except ValueError as e:
+        # logger.exception logs at ERROR level and also records the traceback.
+        logger.exception(f"Value error during prediction pipeline: {e}")
+        return LandsatResponse(status=["error"], predictions=[])
+    except Exception as e:
+        logger.exception(f"Unexpected error during prediction pipeline: {e}")
+        return LandsatResponse(status=["error"], predictions=[])
+
+
+if __name__ == "__main__":
+    # Bind address and port come from the LANDSAT_HOST / LANDSAT_PORT environment
+    # variables, falling back to the module-level constants of the same names so
+    # the defaults are defined in one place.
+    uvicorn.run(
+        "api_main:app",
+        host=os.getenv("LANDSAT_HOST", default=LANDSAT_HOST),
+        # os.getenv yields a string whenever the variable is set, so the
+        # fallback is passed as a string too before converting to int.
+        port=int(os.getenv("LANDSAT_PORT", default=str(LANDSAT_PORT))),
+        proxy_headers=True,
+    )
diff --git a/rslp/landsat_vessels/docker-compose.yaml b/rslp/landsat_vessels/docker-compose.yaml
@@ -0,0 +1,33 @@
+version: "3.9"
+
+services:
+  # Builds the Dockerfile two directories up (context ../..) and tags the
+  # result base-image:latest, the image named in the FROM line of the
+  # Dockerfile next to this file.
+  base-image:
+    build:
+      context: ../..
+      dockerfile: Dockerfile
+    image: base-image:latest   # Tag it as "base-image"
+
+  # The Landsat vessel detection API, built from the Dockerfile in this
+  # directory.
+  landsat-vessels:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    shm_size: '10G'  # Size of the shared memory (/dev/shm) given to the container
+    # NOTE(review): depends_on orders service start-up - confirm it also
+    # guarantees that base-image is built before this service's image.
+    depends_on:
+      - base-image
+    # Publish container port 5555 on host port 5555.
+    ports:
+      - "5555:5555"
+    # Entries without a value take their value from the environment in which
+    # Compose is run.
+    environment:
+      - RSLP_BUCKET
+      - S3_ACCESS_KEY_ID
+      - S3_SECRET_ACCESS_KEY
+      - AWS_ACCESS_KEY_ID
+      - AWS_SECRET_ACCESS_KEY
+      - NVIDIA_VISIBLE_DEVICES=all  # Make all GPUs visible
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]  # Reserve devices that offer the gpu capability
+    runtime: nvidia  # Use the NVIDIA runtime
diff --git a/rslp/landsat_vessels/predict_pipeline.py b/rslp/landsat_vessels/predict_pipeline.py
@@ -1,6 +1,8 @@
 """Landsat vessel prediction pipeline."""
 
 import json
+import tempfile
+import time
 from datetime import datetime, timedelta
 
 import numpy as np
@@ -14,6 +16,7 @@
 from rslearn.dataset import Dataset, Window
 from rslearn.utils import Projection, STGeometry
 from rslearn.utils.get_utm_ups_crs import get_utm_ups_projection
+from typing_extensions import TypedDict
 from upath import UPath
 
 from rslp.utils.rslearn import materialize_dataset, run_model_predict
@@ -54,6 +57,16 @@ def __init__(
         self.crop_window_dir = crop_window_dir
 
 
+class FormattedPrediction(TypedDict):
+    """Formatted prediction for a single vessel detection."""
+
+    # Latitude and longitude computed for the detection in predict_pipeline.
+    latitude: float
+    longitude: float
+    # The detection's score, copied from detection.score.
+    score: float
+    # Locations of the RGB and B8 crop PNGs written under crop_path; both are
+    # "" when predict_pipeline is called without a crop_path.
+    rgb_fname: str
+    b8_fname: str
+
+
 def get_vessel_detections(
     ds_path: UPath,
     projection: Projection,
@@ -180,12 +193,12 @@ def run_classifier(
 
 
 def predict_pipeline(
-    scratch_path: str,
-    json_path: str,
-    crop_path: str,
+    crop_path: str | None = None,
+    scratch_path: str | None = None,
+    json_path: str | None = None,
     image_files: dict[str, str] | None = None,
     scene_id: str | None = None,
-) -> None:
+) -> list[FormattedPrediction]:
     """Run the Landsat vessel prediction pipeline.
 
     This inputs a Landsat scene (consisting of per-band GeoTIFFs) and produces the
@@ -201,6 +214,15 @@ def predict_pipeline(
         scene_id: Landsat scene ID. Exactly one of image_files or scene_id should be
             specified.
     """
+    start_time = time.time()  # Start the timer
+    time_profile = {}
+
+    if scratch_path is None:
+        tmp_dir = tempfile.TemporaryDirectory()
+        scratch_path = tmp_dir.name
+    else:
+        tmp_dir = None
+
     ds_path = UPath(scratch_path)
     ds_path.mkdir(parents=True, exist_ok=True)
 
@@ -259,18 +281,29 @@ def predict_pipeline(
             dst_geom.time_range[1] + timedelta(minutes=30),
         )
 
+    time_profile["setup"] = time.time() - start_time
+
     # Run pipeline.
+    step_start_time = time.time()
+    print("run detector")
     detections = get_vessel_detections(
         ds_path,
         projection,
         scene_bounds,  # type: ignore
         time_range=time_range,
     )
+    time_profile["get_vessel_detections"] = time.time() - step_start_time
+
+    step_start_time = time.time()
+    print("run classifier")
     detections = run_classifier(ds_path, detections, time_range=time_range)
+    time_profile["run_classifier"] = time.time() - step_start_time
 
     # Write JSON and crops.
-    json_upath = UPath(json_path)
-    crop_upath = UPath(crop_path)
+    step_start_time = time.time()
+    if crop_path:
+        crop_upath = UPath(crop_path)
+        crop_upath.mkdir(parents=True, exist_ok=True)
 
     json_data = []
     for idx, detection in enumerate(detections):
@@ -304,13 +337,17 @@ def predict_pipeline(
             [images["B4_sharp"], images["B3_sharp"], images["B2_sharp"]], axis=2
         )
 
-        rgb_fname = crop_upath / f"{idx}_rgb.png"
-        with rgb_fname.open("wb") as f:
-            Image.fromarray(rgb).save(f, format="PNG")
+        if crop_path:
+            rgb_fname = crop_upath / f"{idx}_rgb.png"
+            with rgb_fname.open("wb") as f:
+                Image.fromarray(rgb).save(f, format="PNG")
 
-        b8_fname = crop_upath / f"{idx}_b8.png"
-        with b8_fname.open("wb") as f:
-            Image.fromarray(images["B8"]).save(f, format="PNG")
+            b8_fname = crop_upath / f"{idx}_b8.png"
+            with b8_fname.open("wb") as f:
+                Image.fromarray(images["B8"]).save(f, format="PNG")
+        else:
+            rgb_fname = ""
+            b8_fname = ""
 
         # Get longitude/latitude.
         src_geom = STGeometry(
@@ -321,14 +358,31 @@ def predict_pipeline(
         lat = dst_geom.shp.y
 
         json_data.append(
-            dict(
+            FormattedPrediction(
                 longitude=lon,
                 latitude=lat,
                 score=detection.score,
+                # rgb_fname / b8_fname are UPath objects when crops were written
+                # and "" otherwise; str() keeps them matching the declared str
+                # fields and serializable by the json.dump call further down.
                 rgb_fname=str(rgb_fname),
                 b8_fname=str(b8_fname),
-            )
+            ),
         )
 
-    with json_upath.open("w") as f:
-        json.dump(json_data, f)
+    time_profile["write_json_and_crops"] = time.time() - step_start_time
+
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
+    time_profile["total"] = elapsed_time
+
+    # Clean up any temporary directories.
+    if tmp_dir:
+        tmp_dir.cleanup()
+
+    if json_path:
+        json_upath = UPath(json_path)
+        with json_upath.open("w") as f:
+            json.dump(json_data, f)
+
+    print(f"Prediction pipeline completed in {elapsed_time:.2f} seconds")
+    for step, duration in time_profile.items():
+        print(f"{step} took {duration:.2f} seconds")
+
+    return json_data
diff --git a/rslp/utils/rslearn.py b/rslp/utils/rslearn.py
@@ -36,12 +36,14 @@ def materialize_dataset(
         dataset,
         workers=workers,
         group=group,
+        use_initial_job=False,
     )
     apply_on_windows(
         MaterializeHandler(),
         dataset,
         workers=workers,
         group=group,
+        use_initial_job=False,
     )
 
 
diff --git a/tests/integration_slow/landsat_vessels/test_fastapi.py b/tests/integration_slow/landsat_vessels/test_fastapi.py
@@ -0,0 +1,17 @@
+from fastapi.testclient import TestClient
+
+from rslp.landsat_vessels.api_main import app
+
+client = TestClient(app)
+
+
+def test_singapore_dense_scene() -> None:
+    """The dense Singapore scene should yield at least 100 predictions."""
+    # LC08_L1TP_125059_20240913_20240920_02_T1 covers the southeast coast of
+    # Singapore, where there are hundreds of vessels.
+    scene_id = "LC08_L1TP_125059_20240913_20240920_02_T1"
+    result = client.post("/detections", json={"scene_id": scene_id})
+    assert result.status_code == 200
+    # Many correct vessels are expected in this scene.
+    vessel_predictions = result.json()["predictions"]
+    assert len(vessel_predictions) >= 100

Original file line number	Diff line number	Diff line change
`@@ -36,12 +36,14 @@ def materialize_dataset(`
`36`	`36`	`dataset,`
`37`	`37`	`workers=workers,`
`38`	`38`	`group=group,`
	`39`	`+ use_initial_job=False,`
`39`	`40`	`)`
`40`	`41`	`apply_on_windows(`
`41`	`42`	`MaterializeHandler(),`
`42`	`43`	`dataset,`
`43`	`44`	`workers=workers,`
`44`	`45`	`group=group,`
	`46`	`+ use_initial_job=False,`
`45`	`47`	`)`
`46`	`48`
`47`	`49`