Skip to content

Commit

Permalink
add s3 bit
Browse files Browse the repository at this point in the history
  • Loading branch information
rbs333 committed Jul 25, 2024
1 parent d7f309b commit ec4368e
Show file tree
Hide file tree
Showing 5 changed files with 3,001 additions and 13 deletions.
8 changes: 1 addition & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
# FROM node:18.8-alpine AS ReactImage
FROM node:22.0.0 AS ReactImage

WORKDIR /app/frontend

ENV NODE_PATH=/app/frontend/node_modules
ENV PATH=$PATH:/app/frontend/node_modules/.bin

# test that removing yarn works in this case
COPY ./frontend/package.json ./
RUN npm install

Expand All @@ -21,7 +19,6 @@ ENV PYTHONDONTWRITEBYTECODE 1

WORKDIR /app/
VOLUME [ "/data" ]
# COPY ./data/ ./data

RUN apt-get update && \
apt-get install -y curl && \
Expand All @@ -47,7 +44,4 @@ COPY --from=ReactImage /app/frontend/build /app/backend/arxivsearch/templates/bu

LABEL org.opencontainers.image.source https://github.com/RedisVentures/redis-arxiv-search

# WORKDIR /app/backend/arxivsearch

CMD ["poetry", "run", "start-app"]
# CMD ["sh", "./entrypoint.sh"]
CMD ["poetry", "run", "start-app"]
3 changes: 2 additions & 1 deletion backend/arxivsearch/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

# Configuration
# File name of the papers dataset; it is joined onto DATA_LOCATION when loading.
DEFAULT_DATASET = os.environ.get("DEFAULT_DATASET", "arxiv-papers-1000.json")
# Remote copy of the dataset used as a fallback download source.
# NOTE(review): this URL always points at the 1000-paper file, even when
# DEFAULT_DATASET is overridden -- confirm that is intended.
S3_DATA_URL = "https://arxiv-search.s3.us-east-2.amazonaws.com/arxiv-papers-1000.json"
DATA_LOCATION = os.environ.get("DATA_LOCATION", "../data")
DEPLOYMENT_ENV = os.environ.get("DEPLOYMENT", "dev")
# Environment variables are always strings, so coerce here: the setting is an
# int whether it comes from the environment or from the built-in default.
WRITE_CONCURRENCY = int(os.environ.get("WRITE_CONCURRENCY", 150))
Expand All @@ -22,7 +23,7 @@

# Redis
# Honor REDIS_HOST from the environment and fall back to "localhost" only when
# it is unset. Do not re-assign REDIS_HOST below this line: a hard-coded value
# would silently override whatever the environment configured.
REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
REDIS_PORT = os.environ.get("REDIS_PORT", 6379)
# No default: REDIS_PASSWORD is None when the variable is not set.
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD")
if REDIS_PASSWORD:
Expand Down
25 changes: 20 additions & 5 deletions backend/arxivsearch/db/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any, Dict, List

import numpy as np
import requests
from redisvl.index import AsyncSearchIndex

from arxivsearch import config
Expand All @@ -20,9 +21,23 @@ def read_paper_json() -> List[Dict[str, Any]]:
"""
logger.info("Loading papers dataset from disk")
path = os.path.join(config.DATA_LOCATION, config.DEFAULT_DATASET)
with open(path, "r") as f:
df = json.load(f)
return df
try:
with open(path, "r") as f:
data = json.load(f)
except:
logger.info(f"Failed to read {path} => getting from s3")
res = requests.get(config.S3_DATA_URL)
data = res.json()

if os.path.isdir(config.DATA_LOCATION):
logger.info(f"Writing s3 file to {path}")
with open(path, "w") as f:
json.dump(data, f)
else:
logger.warning(
f"Data directory {config.DATA_LOCATION} not found. Skipping write of S3 data"
)
return data


async def write_async(index: AsyncSearchIndex, papers: list):
Expand Down Expand Up @@ -58,9 +73,9 @@ async def load_data():
# Load dataset and create index
try:
# Check if index exists
if await index.exists():
if await index.exists() and len((await index.search("*")).docs) > 0:
# if running local and not seeing logger logs make sure index isn't already created
logger.info("Index already exists, skipping data load")
logger.info("Index and data already exists, skipping load")
else:
logger.info("Creating new index")
await index.create(overwrite=True)
Expand Down
47 changes: 47 additions & 0 deletions backend/arxivsearch/tests/db/test_load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# import pytest
from unittest.mock import mock_open, patch

from arxivsearch.db.load import read_paper_json


# Happy path: the dataset file can be opened and parsed from local disk.
@patch("arxivsearch.db.load.os.path.join")
@patch(
    "arxivsearch.db.load.open",
    new_callable=mock_open,
    read_data='[{"id": "1234", "title": "Test Paper"}]',
)
@patch("arxivsearch.db.load.json.load")
def test_read_paper_json_local(mock_json_load, mock_file_open, mock_path_join):
    """read_paper_json returns whatever json.load yields for the local file."""
    expected_papers = [{"id": "1234", "title": "Test Paper"}]
    mock_json_load.return_value = expected_papers
    # Pin the resolved path so the open() assertion does not depend on config.
    mock_path_join.return_value = "dummy_path"

    papers = read_paper_json()

    assert papers == expected_papers
    mock_json_load.assert_called_once()
    mock_file_open.assert_called_once_with("dummy_path", "r")


# Test when the file needs to be fetched from S3
@patch("arxivsearch.db.load.os.path.isdir", return_value=True)
@patch("arxivsearch.db.load.os.path.join")
@patch("arxivsearch.db.load.requests.get")
@patch("arxivsearch.db.load.open", new_callable=mock_open)
@patch("arxivsearch.db.load.json.dump")
@patch("arxivsearch.db.load.json.load", side_effect=Exception("File not found"))
def test_read_paper_json_s3(
    mock_json_load,
    mock_json_dump,
    mock_file_open,
    mock_requests_get,
    mock_path_join,
    mock_isdir,
):
    """read_paper_json falls back to S3 and caches the payload on disk.

    json.load is forced to raise so the fallback branch runs. os.path.isdir is
    patched to True because read_paper_json only writes the downloaded data
    when the data directory exists; without the patch the write assertions
    below would depend on the machine running the test.
    """
    s3_papers = [{"id": "5678", "title": "Test Paper from S3"}]
    mock_path_join.return_value = "dummy_path"
    mock_requests_get.return_value.json.return_value = s3_papers

    result = read_paper_json()

    mock_requests_get.assert_called_once()
    mock_isdir.assert_called_once()
    # open() is called twice ("r" for the failed read, then "w"); the last
    # call must be the write of the downloaded payload.
    mock_file_open.assert_called_with("dummy_path", "w")
    # mock_open's return value is the file handle yielded by the with-block.
    mock_json_dump.assert_called_once_with(s3_papers, mock_file_open.return_value)
    assert result == s3_papers
Loading

0 comments on commit ec4368e

Please sign in to comment.