add s3 bit

redis-developer · Jul 25, 2024 · ec4368e · ec4368e
1 parent d7f309b
commit ec4368e
Show file tree

Hide file tree

Showing 5 changed files with 3,001 additions and 13 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,10 @@
-# FROM node:18.8-alpine AS ReactImage
 FROM node:22.0.0 AS ReactImage
 
 WORKDIR /app/frontend
 
 ENV NODE_PATH=/app/frontend/node_modules
 ENV PATH=$PATH:/app/frontend/node_modules/.bin
 
-# test that removing yarn works in this case
 COPY ./frontend/package.json ./
 RUN npm install
 
@@ -21,7 +19,6 @@ ENV PYTHONDONTWRITEBYTECODE 1
 
 WORKDIR /app/
 VOLUME [ "/data" ]
-# COPY ./data/ ./data
 
 RUN apt-get update && \
     apt-get install -y curl && \
@@ -47,7 +44,4 @@ COPY --from=ReactImage /app/frontend/build /app/backend/arxivsearch/templates/bu
 
 LABEL org.opencontainers.image.source https://github.com/RedisVentures/redis-arxiv-search
 
-# WORKDIR /app/backend/arxivsearch
-
-CMD ["poetry", "run", "start-app"]
-# CMD ["sh", "./entrypoint.sh"]
+CMD ["poetry", "run", "start-app"]
diff --git a/backend/arxivsearch/config.py b/backend/arxivsearch/config.py
@@ -8,6 +8,7 @@
 
 # Configuration
 DEFAULT_DATASET = os.environ.get("DEFAULT_DATASET", "arxiv-papers-1000.json")
+S3_DATA_URL = "https://arxiv-search.s3.us-east-2.amazonaws.com/arxiv-papers-1000.json"
 DATA_LOCATION = os.environ.get("DATA_LOCATION", "../data")
 DEPLOYMENT_ENV = os.environ.get("DEPLOYMENT", "dev")
 WRITE_CONCURRENCY = os.environ.get("WRITE_CONCURRENCY", 150)
@@ -22,7 +23,7 @@
 
 # Redis
+# Host is taken from the environment only; a hard-coded override here would ignore REDIS_HOST.
 REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
-# REDIS_HOST = "localhost"
 REDIS_PORT = os.environ.get("REDIS_PORT", 6379)
 REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD")
 if REDIS_PASSWORD:

diff --git a/backend/arxivsearch/db/load.py b/backend/arxivsearch/db/load.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, List
 
 import numpy as np
+import requests
 from redisvl.index import AsyncSearchIndex
 
 from arxivsearch import config
@@ -20,9 +21,23 @@ def read_paper_json() -> List[Dict[str, Any]]:
     """
     logger.info("Loading papers dataset from disk")
     path = os.path.join(config.DATA_LOCATION, config.DEFAULT_DATASET)
-    with open(path, "r") as f:
-        df = json.load(f)
-    return df
+    try:
+        with open(path, "r") as f:
+            data = json.load(f)
+    except:
+        logger.info(f"Failed to read {path} => getting from s3")
+        res = requests.get(config.S3_DATA_URL)
+        data = res.json()
+
+        if os.path.isdir(config.DATA_LOCATION):
+            logger.info(f"Writing s3 file to {path}")
+            with open(path, "w") as f:
+                json.dump(data, f)
+        else:
+            logger.warning(
+                f"Data directory {config.DATA_LOCATION} not found. Skipping write of S3 data"
+            )
+    return data
 
 
 async def write_async(index: AsyncSearchIndex, papers: list):
@@ -58,9 +73,9 @@ async def load_data():
     # Load dataset and create index
     try:
         # Check if index exists
-        if await index.exists():
+        if await index.exists() and len((await index.search("*")).docs) > 0:
             # if running local and not seeing logger logs make sure index isn't already created
-            logger.info("Index already exists, skipping data load")
+            logger.info("Index and data already exists, skipping load")
         else:
             logger.info("Creating new index")
             await index.create(overwrite=True)

diff --git a/backend/arxivsearch/tests/db/test_load.py b/backend/arxivsearch/tests/db/test_load.py
@@ -0,0 +1,47 @@
+# import pytest
+from unittest.mock import mock_open, patch
+
+from arxivsearch.db.load import read_paper_json
+
+
+# Local-file case: open() and json.load are mocked, so the dataset is read "from disk".
+@patch("arxivsearch.db.load.os.path.join")
+@patch(
+    "arxivsearch.db.load.open",
+    new_callable=mock_open,
+    read_data='[{"id": "1234", "title": "Test Paper"}]',
+)
+@patch("arxivsearch.db.load.json.load")
+def test_read_paper_json_local(mock_json_load, mock_file_open, mock_path_join):
+    expected_papers = [{"id": "1234", "title": "Test Paper"}]
+    mock_json_load.return_value = expected_papers
+    mock_path_join.return_value = "dummy_path"
+
+    papers = read_paper_json()
+    assert papers == expected_papers
+    mock_json_load.assert_called_once()
+    mock_file_open.assert_called_once_with("dummy_path", "r")
+
+
+# S3 fallback case: json.load raising drives read_paper_json into its download branch.
+@patch("arxivsearch.db.load.os.path.isdir", new=lambda _path: True)
+@patch("arxivsearch.db.load.os.path.join")
+@patch("arxivsearch.db.load.requests.get")
+@patch("arxivsearch.db.load.open", new_callable=mock_open)
+@patch("arxivsearch.db.load.json.dump")
+@patch("arxivsearch.db.load.json.load", side_effect=Exception("File not found"))
+def test_read_paper_json_s3(
+    mock_json_load, mock_json_dump, mock_file_open, mock_requests_get, mock_path_join
+):
+    # isdir is forced to True (passing new= adds no extra mock argument) so the
+    # write-back assertions do not depend on config.DATA_LOCATION existing locally.
+    s3_papers = [{"id": "5678", "title": "Test Paper from S3"}]
+    mock_path_join.return_value = "dummy_path"
+    mock_requests_get.return_value.json.return_value = s3_papers
+
+    result = read_paper_json()
+
+    mock_requests_get.assert_called_once()
+    mock_file_open.assert_called_with("dummy_path", "w")
+    mock_json_dump.assert_called_once_with(s3_papers, mock_file_open())
+    assert result == s3_papers