Skip to content

Commit 8e22771

Browse files
committed
First draft of ETL GUI.
1 parent f128b70 commit 8e22771

File tree

11 files changed

+425
-0
lines changed

11 files changed

+425
-0
lines changed

api/routes/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from api.routes.web import router as web_router
99
from api.routes.enum import router as enum_router
1010
from api.routes.sequencing_groups import router as sequencing_groups_router
11+
from api.routes.etl import router as etl_router

api/routes/etl.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
Web routes
3+
"""
4+
from typing import Any, Optional
5+
6+
from api.utils.db import (
7+
BqConnection,
8+
PubSubConnection,
9+
get_projectless_bq_connection,
10+
get_projectless_pubsub_connection
11+
)
12+
from db.python.layers.etl import EtlLayer, EtlPubSub
13+
from fastapi import APIRouter, Request
14+
from models.models.etl import EtlSummary
15+
from models.models.search import SearchItem
16+
17+
router = APIRouter(prefix='/etl', tags=['etl'])
18+
19+
20+
@router.post(
    '/summary',
    response_model=list[EtlSummary],
    operation_id='getEtlSummary',
)
async def get_etl_summary(
    request: Request,
    grid_filter: list[SearchItem],
    limit: int = 20,
    token: Optional[int] = 0,
    connection: BqConnection = get_projectless_bq_connection,
) -> list[EtlSummary]:
    """Return the ETL request summaries.

    The filter, limit and token are handed to EtlLayer.get_etl_summary
    unchanged, and whatever the layer returns is sent back to the client
    (serialised as a list of EtlSummary, per response_model).

    Args:
        request: incoming request (not read by this handler).
        grid_filter: search items passed through to the layer.
        limit: passed through to the layer.
        token: passed through to the layer.
        connection: project-less BigQuery connection resolved by FastAPI.
    """
    etl_layer = EtlLayer(connection)
    return await etl_layer.get_etl_summary(
        token=token, limit=limit, grid_filter=grid_filter
    )
39+
40+
41+
@router.post(
    '/reload',
    response_model=str,
    operation_id='etlReload',
)
async def etl_reload(
    request: Request,
    request_id: Optional[str] = None,
    connection: PubSubConnection = get_projectless_pubsub_connection,
) -> str:
    """Resubmit a request to the ETL topic.

    Publishes a record of this shape through EtlPubSub.publish:

        {"request_id": "640e8f2e-4e20-4959-8620-7b7741265895"}

    Args:
        request: incoming request (not read by this handler).
        request_id: id of the request to resubmit. It is optional in the
            signature, so when omitted the published record carries a
            null request_id.
        connection: project-less Pub/Sub connection resolved by FastAPI.

    Returns:
        The string returned by EtlPubSub.publish.
    """
    pubsub = EtlPubSub(connection)
    return await pubsub.publish(msg={'request_id': request_id})

api/utils/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
get_project_readonly_connection,
88
get_project_write_connection,
99
get_projectless_db_connection,
10+
get_projectless_bq_connection,
1011
)
1112

1213
T = TypeVar('T')

api/utils/db.py

+13
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from api.settings import get_default_user
1111
from api.utils.gcp import email_from_id_token
1212
from db.python.connect import SMConnections, Connection
13+
from db.python.gcp_connect import BqConnection, PubSubConnection
1314

1415

1516
EXPECTED_AUDIENCE = getenv('SM_OAUTHAUDIENCE')
@@ -76,6 +77,16 @@ async def dependable_get_connection(author: str = Depends(authenticate)):
7677
return await SMConnections.get_connection_no_project(author)
7778

7879

80+
async def dependable_get_bq_connection(author: str = Depends(authenticate)):
    """FastAPI dependency: BigQuery connection for the authenticated author, withOUT project"""
    bq_connection = await BqConnection.get_connection_no_project(author)
    return bq_connection
83+
84+
85+
async def dependable_get_pubsub_connection(author: str = Depends(authenticate)):
    """FastAPI dependency: Pub/Sub connection for the authenticated author, withOUT project"""
    pubsub_connection = await PubSubConnection.get_connection_no_project(author)
    return pubsub_connection
88+
89+
7990
def validate_iap_jwt_and_get_email(iap_jwt, audience):
8091
"""
8192
Validate an IAP JWT and return email
@@ -102,3 +113,5 @@ def validate_iap_jwt_and_get_email(iap_jwt, audience):
102113
get_project_readonly_connection = Depends(dependable_get_readonly_project_connection)
103114
get_project_write_connection = Depends(dependable_get_write_project_connection)
104115
get_projectless_db_connection = Depends(dependable_get_connection)
116+
get_projectless_bq_connection = Depends(dependable_get_bq_connection)
117+
get_projectless_pubsub_connection = Depends(dependable_get_pubsub_connection)

db/python/gcp_connect.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
Code for connecting to Big Query database
3+
"""
4+
import logging
5+
import os
6+
from typing import Optional
7+
8+
import google.cloud.bigquery as bq
9+
from db.python.utils import InternalError
10+
import google.cloud.pubsub_v1 as pubsub_v1
11+
12+
logging.basicConfig(level=logging.DEBUG)
13+
logger = logging.getLogger(__name__)
14+
15+
16+
class BqConnection:
    """Holds a Big Query client together with its GCP project and the author"""

    def __init__(
        self,
        author: str,
    ):
        self.author: str = author
        # os.getenv returns None when the variable is not set
        self.gcp_project: Optional[str] = os.getenv('METAMIST_INFRA_GCP_PROJECT')
        self.connection: bq.Client = bq.Client(project=self.gcp_project)

    @staticmethod
    async def get_connection_no_project(author: str):
        """Build a BqConnection for the given author, without any project"""
        # maybe it makes sense to perform permission checks here too
        logger.debug(f'Authenticate no-project connection with {author!r}')

        # project-less connections are not authenticated here; the endpoint
        # is relied on to validate the resources it touches
        return BqConnection(author=author)
37+
38+
39+
class BqDbBase:
    """Base class for big query database subclasses"""

    def __init__(self, connection: BqConnection):
        """Validate and store the connection.

        Raises:
            InternalError: when connection is None or is not a BqConnection.
        """
        if connection is None:
            raise InternalError(
                f'No connection was provided to the table {self.__class__.__name__!r}'
            )
        if not isinstance(connection, BqConnection):
            # the check is against BqConnection, so the message names that type
            raise InternalError(
                f'Expected connection type BqConnection, received {type(connection)}, '
                f'did you mean to call self._connection?'
            )

        self._connection = connection
54+
55+
56+
class PubSubConnection:
    """Holds a PubSub publisher client, the target topic and the author"""

    def __init__(
        self,
        author: str,
    ):
        self.author: str = author
        # os.getenv returns None when the variable is not set
        self.topic: Optional[str] = os.getenv('METAMIST_INFRA_PUBSUB_TOPIC')
        self.client: pubsub_v1.PublisherClient = pubsub_v1.PublisherClient()

    @staticmethod
    async def get_connection_no_project(author: str):
        """Build a PubSubConnection for the given author, without any project"""
        # maybe it makes sense to perform permission checks here too
        logger.debug(f'Authenticate no-project connection with {author!r}')

        # project-less connections are not authenticated here; the endpoint
        # is relied on to validate the resources it touches
        return PubSubConnection(author=author)

db/python/layers/bq_base.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from db.python.gcp_connect import BqConnection
2+
3+
4+
class BqBaseLayer:
    """Common parent for layers that work against Big Query"""

    def __init__(self, connection: BqConnection):
        # kept public: subclasses pass it on to their Db classes
        self.connection = connection

    @property
    def author(self):
        """Author recorded on the underlying connection"""
        conn = self.connection
        return conn.author

db/python/layers/etl.py

+132
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import datetime
2+
import json
3+
import logging
4+
from typing import Any
5+
6+
from db.python.gcp_connect import BqDbBase, PubSubConnection
7+
from db.python.layers.bq_base import BqBaseLayer
8+
from models.models import SearchItem, EtlSummary
9+
10+
11+
def reformat_record(record):
    """Build an EtlSummary from one row of the ETL report query.

    Args:
        record: mapping with 'request_id', 'last_run_at', 'status' and
            'details', where 'details' is a JSON string containing
            'source_type', 'submitting_user' and 'result'.

    Returns:
        EtlSummary populated from the record and its decoded details.

    The input mapping is left untouched. 'sample_record' is not decoded
    because none of its content is part of EtlSummary.
    """
    details = json.loads(record['details'])

    return EtlSummary(
        request_id=record['request_id'],
        last_run_at=record['last_run_at'],
        status=record['status'],
        source_type=details['source_type'],
        submitting_user=details['submitting_user'],
        parser_result=details['result'],
    )
25+
26+
27+
class EtlLayer(BqBaseLayer):
    """Layer behind the ETL routes"""

    async def get_etl_summary(
        self,
        grid_filter: list[SearchItem],
        token: int = 0,
        limit: int = 20,
    ) -> Any:
        """
        Fetch the ETL summary through EtlDb.

        TODO: grid_filter, token and limit are accepted but not yet
        forwarded to the Db call.
        """
        return await EtlDb(self.connection).get_etl_summary()
41+
42+
43+
class EtlDb(BqDbBase):
    """Db layer for ETL routes, reading the etl-logs and etl-data tables"""

    async def get_etl_summary(
        self,
    ) -> list[EtlSummary]:
        """Return the unfiltered ETL report (see get_report)."""
        return await self.get_report()

    async def get_report(
        self,
        source_type: str | None = None,
        etl_status: str | None = None,
        start_dt: str | None = None,
    ) -> list[EtlSummary]:
        """Get ETL report from BQ.

        For every request_id only the most recent log row is reported,
        joined with the matching etl-data row.

        Args:
            source_type: when set, keep log rows whose details.source_type
                matches the LIKE pattern built from this value.
            etl_status: when set, keep requests whose latest run has this
                status (compared upper-cased).
            start_dt: when set, keep log rows with a later timestamp.

        Returns:
            One EtlSummary per reported request.
        """
        # NOTE(review): the filter values are interpolated straight into
        # the SQL text. Every caller visible here passes no filters, but
        # before wiring these to request input switch to BigQuery query
        # parameters.
        query_filters = []

        if source_type:
            # '\\/' keeps the backslash-slash text that is sent to BigQuery
            # while avoiding an invalid Python escape sequence
            query_filters.append(
                f'JSON_VALUE(details.source_type) LIKE "\\/{source_type}\\/%"'
            )

        if start_dt:
            query_filters.append(f'timestamp > "{start_dt}"')

        # the separator needs spaces on both sides, otherwise two filters
        # are glued together as "<a> AND<b>"
        query_filter = ' AND '.join(query_filters)
        if query_filter:
            query_filter = 'WHERE ' + query_filter

        # Status filter applied after grouping by request_id
        # One request can have multiple runs, we are interested only in the last run
        if etl_status:
            status_filter = f'WHERE status = "{etl_status.upper()}"'
        else:
            status_filter = ''

        gcp_project = self._connection.gcp_project
        _query = f"""
            WITH l AS (
                SELECT request_id, max(timestamp) as last_time
                FROM `{gcp_project}.metamist.etl-logs`
                {query_filter}
                group by request_id
            )
            select logs.request_id, logs.timestamp as last_run_at,
                logs.status, logs.details, d.body as sample_record
            from l
            inner join `{gcp_project}.metamist.etl-logs` logs on
                l.request_id = logs.request_id
                and logs.timestamp = l.last_time
            INNER JOIN `{gcp_project}.metamist.etl-data` d on
                d.request_id = logs.request_id
            {status_filter}
        """

        # NOTE(review): query(...).result() is called synchronously inside
        # this coroutine - confirm that is acceptable for the event loop
        query_job_result = self._connection.connection.query(_query).result()
        return [reformat_record(dict(row)) for row in query_job_result]
104+
105+
106+
class EtlPubSub:
    """Etl Pub Sub wrapper"""

    def __init__(
        self,
        connection: PubSubConnection,
    ):
        self.connection = connection

    async def publish(
        self,
        msg: dict,
    ) -> str:
        """
        Publish msg to the connection's topic, with the current UTC
        timestamp and the submitting user added.

        The caller's dict is not modified; the extra keys are added to a
        copy.

        Returns:
            'submitted' when the publish call returned without raising,
            'failed' when it (or the JSON encoding) raised.
        """
        payload = {
            **msg,
            'timestamp': datetime.datetime.utcnow().isoformat(),
            'submitting_user': self.connection.author,
        }

        try:
            # NOTE(review): the value returned by client.publish is
            # ignored, so only errors raised by the call itself are seen
            # here - confirm whether delivery should be awaited.
            self.connection.client.publish(
                self.connection.topic, json.dumps(payload).encode()
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            logging.error(f'Failed to publish to pubsub: {e}')
            return 'failed'

        return 'submitted'
132+

models/models/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,6 @@
5959
ProjectSummaryInternal,
6060
WebProject,
6161
)
62+
from models.models.etl import (
63+
EtlSummary,
64+
)

models/models/etl.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import datetime
2+
from models.base import SMBase
3+
4+
5+
class EtlSummary(SMBase):
    """Response model returned by the ETL summary endpoint"""

    request_id: str
    # None is accepted when no run time is available
    last_run_at: datetime.datetime | None
    status: str
    source_type: str

    submitting_user: str
    parser_result: str

    class Config:
        """Config for EtlSummary Response"""

        orm_mode = True
19+

web/src/Routes.tsx

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import ProjectSummaryView from './pages/project/ProjectSummary'
99
import ProjectsAdmin from './pages/admin/ProjectsAdmin'
1010
import ErrorBoundary from './shared/utilities/errorBoundary'
1111
import AnalysisRunnerSummary from './pages/project/AnalysisRunnerView/AnalysisRunnerSummary'
12+
import EtlAdmin from './pages/etl/EtlAdmin'
1213

1314
const Routes: React.FunctionComponent = () => (
1415
<Switch>
@@ -36,6 +37,8 @@ const Routes: React.FunctionComponent = () => (
3637

3738
<Route path="admin" element={<ProjectsAdmin />} />
3839

40+
<Route path="etl" element={<EtlAdmin />} />
41+
3942
<Route path="/" element={<DocumentationArticle articleid="index" />} />
4043

4144
<Route

0 commit comments

Comments
 (0)