Skip to content

Commit

Permalink
First draft of ETL GUI.
Browse files Browse the repository at this point in the history
  • Loading branch information
milo-hyben committed Sep 26, 2023
1 parent f128b70 commit 8e22771
Show file tree
Hide file tree
Showing 11 changed files with 425 additions and 0 deletions.
1 change: 1 addition & 0 deletions api/routes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from api.routes.web import router as web_router
from api.routes.enum import router as enum_router
from api.routes.sequencing_groups import router as sequencing_groups_router
from api.routes.etl import router as etl_router
67 changes: 67 additions & 0 deletions api/routes/etl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
ETL routes
"""
from typing import Any, Optional

from api.utils.db import (
BqConnection,
PubSubConnection,
get_projectless_bq_connection,
get_projectless_pubsub_connection
)
from db.python.layers.etl import EtlLayer, EtlPubSub
from fastapi import APIRouter, Request
from models.models.etl import EtlSummary
from models.models.search import SearchItem

# Router for the ETL endpoints; every route registered on it is served under
# the '/etl' prefix and grouped under the 'etl' tag.
router = APIRouter(prefix='/etl', tags=['etl'])


@router.post(
    '/summary',
    response_model=list[EtlSummary],
    operation_id='getEtlSummary',
)
async def get_etl_summary(
    request: Request,
    grid_filter: list[SearchItem],
    limit: int = 20,
    token: Optional[int] = 0,
    connection: BqConnection = get_projectless_bq_connection,
) -> list[EtlSummary]:
    """
    Return the ETL summary records.

    Args:
        request: the incoming request (currently unused by the handler).
        grid_filter: filter items sent in the request body; forwarded to
            the ETL layer.
        limit: page size; forwarded to the ETL layer.
        token: paging token; forwarded to the ETL layer.
        connection: BigQuery connection injected by FastAPI.

    Returns:
        Whatever EtlLayer.get_etl_summary produces, serialised as a list of
        EtlSummary (see response_model).
    """
    etl_layer = EtlLayer(connection)
    return await etl_layer.get_etl_summary(
        token=token, limit=limit, grid_filter=grid_filter
    )


@router.post(
    '/reload',
    response_model=str,
    operation_id='etlReload',
)
async def etl_reload(
    request: Request,
    request_id: str = None,
    connection: PubSubConnection = get_projectless_pubsub_connection,
) -> str:
    """
    Resubmit an ETL request to the Pub/Sub topic.

    The published message carries only the request id, e.g.:
        {"request_id": "640e8f2e-4e20-4959-8620-7b7741265895"}

    Returns the status string produced by EtlPubSub.publish.
    """
    publisher = EtlPubSub(connection)
    return await publisher.publish(msg={'request_id': request_id})
1 change: 1 addition & 0 deletions api/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
get_project_readonly_connection,
get_project_write_connection,
get_projectless_db_connection,
get_projectless_bq_connection,
)

T = TypeVar('T')
Expand Down
13 changes: 13 additions & 0 deletions api/utils/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from api.settings import get_default_user
from api.utils.gcp import email_from_id_token
from db.python.connect import SMConnections, Connection
from db.python.gcp_connect import BqConnection, PubSubConnection


EXPECTED_AUDIENCE = getenv('SM_OAUTHAUDIENCE')
Expand Down Expand Up @@ -76,6 +77,16 @@ async def dependable_get_connection(author: str = Depends(authenticate)):
return await SMConnections.get_connection_no_project(author)


async def dependable_get_bq_connection(author: str = Depends(authenticate)):
    """FastAPI dependency: build a project-less BigQuery connection for the
    authenticated author."""
    bq_connection = await BqConnection.get_connection_no_project(author)
    return bq_connection


async def dependable_get_pubsub_connection(author: str = Depends(authenticate)):
    """FastAPI dependency: build a project-less Pub/Sub connection for the
    authenticated author."""
    pubsub_connection = await PubSubConnection.get_connection_no_project(author)
    return pubsub_connection


def validate_iap_jwt_and_get_email(iap_jwt, audience):
"""
Validate an IAP JWT and return email
Expand All @@ -102,3 +113,5 @@ def validate_iap_jwt_and_get_email(iap_jwt, audience):
# Pre-built FastAPI dependencies; routes use these as the default value of
# their `connection` parameter so the connection is injected per request.
get_project_readonly_connection = Depends(dependable_get_readonly_project_connection)
get_project_write_connection = Depends(dependable_get_write_project_connection)
get_projectless_db_connection = Depends(dependable_get_connection)
# BigQuery and Pub/Sub variants of the project-less connection dependency.
get_projectless_bq_connection = Depends(dependable_get_bq_connection)
get_projectless_pubsub_connection = Depends(dependable_get_pubsub_connection)
76 changes: 76 additions & 0 deletions db/python/gcp_connect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
Code for connecting to Big Query and Pub/Sub
"""
import logging
import os
from typing import Optional

import google.cloud.bigquery as bq
from db.python.utils import InternalError
import google.cloud.pubsub_v1 as pubsub_v1

# NOTE(review): basicConfig runs at import time and configures the root
# logger at DEBUG level - confirm this is intended for a library module
# rather than being left to the application entry point.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the connection classes below.
logger = logging.getLogger(__name__)


class BqConnection:
    """Holds a BigQuery client together with its GCP project and the author."""

    def __init__(
        self,
        author: str,
    ):
        # The project comes from the environment; the same value is handed
        # to the BigQuery client and kept for building table names.
        project_id = os.getenv('METAMIST_INFRA_GCP_PROJECT')
        self.gcp_project = project_id
        self.connection: bq.Client = bq.Client(project=project_id)
        self.author: str = author

    @staticmethod
    async def get_connection_no_project(author: str):
        """Create a project-less BigQuery connection for the given author.

        No permission check is performed here; endpoints using this
        connection are relied upon to validate access to their resources.
        """
        logger.debug(f'Authenticate no-project connection with {author!r}')
        return BqConnection(author=author)


class BqDbBase:
    """Base class for big query database subclasses"""

    def __init__(self, connection: BqConnection):
        """Store the BigQuery connection used by the subclass queries.

        Raises:
            InternalError: if no connection is given, or if it is not a
                BqConnection instance.
        """
        if connection is None:
            raise InternalError(
                f'No connection was provided to the table {self.__class__.__name__!r}'
            )
        if not isinstance(connection, BqConnection):
            # Name the type that is actually required so the error is actionable.
            raise InternalError(
                f'Expected connection type BqConnection, received {type(connection)}, '
                f'did you mean to call self._connection?'
            )

        self._connection = connection


class PubSubConnection:
    """Stores a PubSub publisher client, the target topic and the author"""

    def __init__(
        self,
        author: str,
    ):
        self.client: pubsub_v1.PublisherClient = pubsub_v1.PublisherClient()
        self.author: str = author
        # os.getenv returns None when the variable is not set, so the topic
        # is optional as far as the type checker is concerned.
        self.topic: Optional[str] = os.getenv('METAMIST_INFRA_PUBSUB_TOPIC')

    @staticmethod
    async def get_connection_no_project(author: str):
        """Create a project-less pubsub connection for the given author.

        No permission check is performed here; endpoints using this
        connection are relied upon to validate access to their resources.
        """
        logger.debug(f'Authenticate no-project connection with {author!r}')
        return PubSubConnection(author=author)
13 changes: 13 additions & 0 deletions db/python/layers/bq_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from db.python.gcp_connect import BqConnection


class BqBaseLayer:
    """Common base for layers that work on top of a Big Query connection."""

    def __init__(self, connection: BqConnection):
        # Kept public so subclasses can hand it to their Db classes.
        self.connection = connection

    @property
    def author(self):
        """Author recorded on the underlying connection."""
        connection = self.connection
        return connection.author
132 changes: 132 additions & 0 deletions db/python/layers/etl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import datetime
import json
import logging
from typing import Any

from db.python.gcp_connect import BqDbBase, PubSubConnection
from db.python.layers.bq_base import BqBaseLayer
from models.models import SearchItem, EtlSummary


def reformat_record(record):
    """Convert one raw report row (a dict) into an EtlSummary.

    The JSON-encoded 'details' and 'sample_record' columns are decoded in
    place on `record` before the summary is built.
    """
    # Decode in the same order as the columns are listed: details first.
    for json_column in ('details', 'sample_record'):
        record[json_column] = json.loads(record[json_column])

    details = record['details']
    return EtlSummary(
        request_id=record['request_id'],
        last_run_at=record['last_run_at'],
        status=record['status'],
        source_type=details['source_type'],
        submitting_user=details['submitting_user'],
        parser_result=details['result'],
    )


class EtlLayer(BqBaseLayer):
    """Layer exposing ETL summary data read from Big Query."""

    async def get_etl_summary(
        self,
        grid_filter: list[SearchItem],
        token: int = 0,
        limit: int = 20,
    ) -> Any:
        """
        Fetch the ETL summary through EtlDb.

        grid_filter, token and limit are accepted but not applied yet: the
        underlying EtlDb call takes no arguments.
        """
        etl_db = EtlDb(self.connection)
        summary = await etl_db.get_etl_summary()
        return summary


class EtlDb(BqDbBase):
    """Db layer reading the ETL log and data tables from Big Query."""

    async def get_etl_summary(
        self,
    ) -> list[EtlSummary]:
        """Return the unfiltered ETL report (last run of every request)."""
        return await self.get_report()

    async def get_report(
        self, source_type: str = None, etl_status: str = None, start_dt: str = None
    ) -> list[EtlSummary]:
        """Get ETL report from BQ.

        For every request_id only the most recent log row is reported,
        joined with the matching row of the etl-data table.

        Args:
            source_type: if given, keep only log rows whose details.source_type
                matches the LIKE pattern built from it.
            etl_status: if given, keep only requests whose last run has this
                status (compared upper-cased).
            start_dt: if given, keep only log rows with a later timestamp.

        Returns:
            One EtlSummary per row returned by the query.
        """
        # NOTE(review): the filter values are interpolated directly into the
        # SQL text. If they ever come from request input, switch to BigQuery
        # query parameters.

        # build query filter
        query_filters = []

        if source_type:
            # '\\/' emits the same '\/' text as before without relying on an
            # invalid Python escape sequence.
            # NOTE(review): confirm BigQuery accepts '\/' inside a string
            # literal, or whether the backslashes should be dropped.
            query_filters.append(
                f'JSON_VALUE(details.source_type) LIKE "\\/{source_type}\\/%"'
            )

        if start_dt:
            query_filters.append(f'timestamp > "{start_dt}"')

        # construct query filter; the separator needs a space on both sides,
        # otherwise two conditions are glued together as "... ANDtimestamp ..."
        query_filter = ' AND '.join(query_filters)
        if query_filter:
            query_filter = 'WHERE ' + query_filter

        # Status filter applied after grouping by request_id
        # One request can have multiple runs, we are interested only in the last run
        if etl_status:
            status_filter = f'WHERE status = "{etl_status.upper()}"'
        else:
            status_filter = ''

        # query BQ table
        _query = f"""
            WITH l AS (
                SELECT request_id, max(timestamp) as last_time
                FROM `{self._connection.gcp_project}.metamist.etl-logs`
                {query_filter}
                group by request_id
            )
            select logs.request_id, logs.timestamp as last_run_at,
                logs.status, logs.details, d.body as sample_record
            from l
            inner join `{self._connection.gcp_project}.metamist.etl-logs` logs on
                l.request_id = logs.request_id
                and logs.timestamp = l.last_time
            INNER JOIN `{self._connection.gcp_project}.metamist.etl-data` d on
                d.request_id = logs.request_id
            {status_filter}
        """

        query_job_result = self._connection.connection.query(_query).result()
        return [reformat_record(dict(row)) for row in query_job_result]


class EtlPubSub:
    """Etl Pub Sub wrapper"""

    def __init__(
        self,
        connection: PubSubConnection,
    ):
        self.connection = connection

    async def publish(
        self,
        msg: dict,
    ) -> str:
        """
        Publish `msg` to the connection's topic, with the submitting user
        and a UTC timestamp added to the payload.

        The caller's dict is left untouched; the extra fields are added to
        a copy.

        Returns:
            'submitted' if the message was handed to the publisher client,
            'failed' if serialising or publishing raised.
        """
        # Build the payload on a copy so the caller's dict is not mutated.
        payload = {
            **msg,
            'timestamp': datetime.datetime.utcnow().isoformat(),
            'submitting_user': self.connection.author,
        }

        try:
            # NOTE(review): PublisherClient.publish returns a future that is
            # not awaited here, so 'submitted' does not confirm delivery.
            self.connection.client.publish(
                self.connection.topic, json.dumps(payload).encode()
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            logging.error(f'Failed to publish to pubsub: {e}')
            return 'failed'

        return 'submitted'

3 changes: 3 additions & 0 deletions models/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,6 @@
ProjectSummaryInternal,
WebProject,
)
from models.models.etl import (
EtlSummary,
)
19 changes: 19 additions & 0 deletions models/models/etl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import datetime
from models.base import SMBase


class EtlSummary(SMBase):
    """Return class for the ETL summary endpoint"""

    # Identifier of the ETL request this summary describes.
    request_id: str
    # Timestamp of the run being reported; may be absent.
    last_run_at: datetime.datetime | None
    # Status of that run.
    status: str
    # Source type of the submitted record.
    source_type: str

    # User who submitted the request.
    submitting_user: str
    # Result reported by the parser for this run.
    parser_result: str

    class Config:
        """Config for EtlSummary Response"""
        # Allow the model to be populated from attribute-bearing objects.
        orm_mode = True

3 changes: 3 additions & 0 deletions web/src/Routes.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import ProjectSummaryView from './pages/project/ProjectSummary'
import ProjectsAdmin from './pages/admin/ProjectsAdmin'
import ErrorBoundary from './shared/utilities/errorBoundary'
import AnalysisRunnerSummary from './pages/project/AnalysisRunnerView/AnalysisRunnerSummary'
import EtlAdmin from './pages/etl/EtlAdmin'

const Routes: React.FunctionComponent = () => (
<Switch>
Expand Down Expand Up @@ -36,6 +37,8 @@ const Routes: React.FunctionComponent = () => (

<Route path="admin" element={<ProjectsAdmin />} />

<Route path="etl" element={<EtlAdmin />} />

<Route path="/" element={<DocumentationArticle articleid="index" />} />

<Route
Expand Down
Loading

0 comments on commit 8e22771

Please sign in to comment.