-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f128b70
commit 8e22771
Showing
11 changed files
with
425 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
""" | ||
ETL routes | ||
""" | ||
from typing import Any, Optional | ||
|
||
from api.utils.db import ( | ||
BqConnection, | ||
PubSubConnection, | ||
get_projectless_bq_connection, | ||
get_projectless_pubsub_connection | ||
) | ||
from db.python.layers.etl import EtlLayer, EtlPubSub | ||
from fastapi import APIRouter, Request | ||
from models.models.etl import EtlSummary | ||
from models.models.search import SearchItem | ||
|
||
# Router collecting the ETL endpoints under the `/etl` prefix and `etl` tag
router = APIRouter(
    prefix='/etl',
    tags=['etl'],
)
|
||
|
||
@router.post(
    '/summary',
    response_model=list[EtlSummary],
    operation_id='getEtlSummary',
)
async def get_etl_summary(
    request: Request,
    grid_filter: list[SearchItem],
    limit: int = 20,
    token: Optional[int] = 0,
    connection: BqConnection = get_projectless_bq_connection,
) -> list[EtlSummary]:
    """Return the ETL summary records obtained from ``EtlLayer``.

    ``grid_filter``, ``limit`` and ``token`` are forwarded unchanged to
    ``EtlLayer.get_etl_summary``; ``request`` is accepted but not read by
    this handler.
    """
    etl_layer = EtlLayer(connection)
    return await etl_layer.get_etl_summary(
        token=token, limit=limit, grid_filter=grid_filter
    )
|
||
|
||
@router.post(
    '/reload',
    response_model=str,
    operation_id='etlReload',
)
async def etl_reload(
    request: Request,
    request_id: str = None,
    connection: PubSubConnection = get_projectless_pubsub_connection,
) -> str:
    """Resubmit a request to the topic.

    The record handed to the publisher has the shape:
    {"request_id": "640e8f2e-4e20-4959-8620-7b7741265895"}

    Returns whatever ``EtlPubSub.publish`` returns.
    """
    publisher = EtlPubSub(connection)
    return await publisher.publish(msg={'request_id': request_id})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
""" | ||
Code for connecting to Big Query database | ||
""" | ||
import logging | ||
import os | ||
from typing import Optional | ||
|
||
import google.cloud.bigquery as bq | ||
from db.python.utils import InternalError | ||
import google.cloud.pubsub_v1 as pubsub_v1 | ||
|
||
# Root logging is configured at DEBUG on import; `logger` is this module's logger
logging.basicConfig(
    level=logging.DEBUG,
)
logger = logging.getLogger(name=__name__)
|
||
|
||
class BqConnection:
    """Holds a Big Query client together with its GCP project and the author"""

    def __init__(
        self,
        author: str,
    ):
        # project id comes from the environment (None when the variable is unset)
        project_id = os.getenv('METAMIST_INFRA_GCP_PROJECT')
        self.gcp_project = project_id
        self.connection: bq.Client = bq.Client(project=project_id)
        self.author: str = author

    @staticmethod
    async def get_connection_no_project(author: str):
        """Build a BqConnection for `author` without any project checks"""
        # A project-less connection is not authenticated here; the endpoint is
        # relied upon to validate the resources (permission checks could
        # arguably be performed at this point as well).
        logger.debug(f'Authenticate no-project connection with {author!r}')
        return BqConnection(author=author)
|
||
|
||
class BqDbBase:
    """Base class for big query database subclasses"""

    def __init__(self, connection: BqConnection):
        """Keep `connection` on the instance as ``self._connection``.

        Raises:
            InternalError: if `connection` is None or is not a BqConnection.
        """
        if connection is None:
            raise InternalError(
                f'No connection was provided to the table {self.__class__.__name__!r}'
            )
        if not isinstance(connection, BqConnection):
            # report the type that is actually required by the check above
            raise InternalError(
                f'Expected connection type BqConnection, received {type(connection)}, '
                'did you mean to call self._connection?'
            )

        self._connection = connection
|
||
|
||
class PubSubConnection:
    """Holds a PubSub publisher client together with its topic and the author"""

    def __init__(
        self,
        author: str,
    ):
        publisher: pubsub_v1.PublisherClient = pubsub_v1.PublisherClient()
        self.client = publisher
        self.author: str = author
        # topic comes from the environment (None when the variable is unset)
        self.topic: str = os.getenv('METAMIST_INFRA_PUBSUB_TOPIC')

    @staticmethod
    async def get_connection_no_project(author: str):
        """Build a PubSubConnection for `author` without any project checks"""
        # A project-less connection is not authenticated here; the endpoint is
        # relied upon to validate the resources (permission checks could
        # arguably be performed at this point as well).
        logger.debug(f'Authenticate no-project connection with {author!r}')
        return PubSubConnection(author=author)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from db.python.gcp_connect import BqConnection | ||
|
||
|
||
class BqBaseLayer:
    """Common parent of the Big Query DB layers"""

    def __init__(self, connection: BqConnection):
        self.connection = connection

    @property
    def author(self):
        """Author recorded on the wrapped connection"""
        conn = self.connection
        return conn.author
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
import datetime | ||
import json | ||
import logging | ||
from typing import Any | ||
|
||
from db.python.gcp_connect import BqDbBase, PubSubConnection | ||
from db.python.layers.bq_base import BqBaseLayer | ||
from models.models import SearchItem, EtlSummary | ||
|
||
|
||
def reformat_record(record):
    """Convert a raw ETL log row into an EtlSummary.

    `record` is a mapping with `request_id`, `last_run_at`, `status` and a
    JSON-encoded `details` string that holds `source_type`,
    `submitting_user` and `result`. The mapping itself is not modified.

    Raises:
        KeyError: if one of the fields above is missing.
        json.JSONDecodeError: if `details` is not valid JSON.
    """
    # Only `details` feeds the summary, so decode just that field and leave
    # the caller's record (including `sample_record`) untouched.
    details = json.loads(record['details'])

    return EtlSummary(
        request_id=record['request_id'],
        last_run_at=record['last_run_at'],
        status=record['status'],
        source_type=details['source_type'],
        submitting_user=details['submitting_user'],
        parser_result=details['result'],
    )
|
||
|
||
class EtlLayer(BqBaseLayer):
    """Layer exposing the ETL summary to the routes"""

    async def get_etl_summary(
        self,
        grid_filter: list[SearchItem],
        token: int = 0,
        limit: int = 20,
    ) -> Any:
        """
        Fetch the ETL summary through EtlDb.

        NOTE: grid_filter, token and limit are accepted but are not passed
        on to the db layer.
        """
        db = EtlDb(self.connection)
        summary = await db.get_etl_summary()
        return summary
|
||
|
||
class EtlDb(BqDbBase):
    """Db layer for ETL related routes"""

    async def get_etl_summary(
        self,
    ) -> list[EtlSummary]:
        """Return the unfiltered ETL report (delegates to `get_report`)"""
        return await self.get_report()

    async def get_report(
        self, source_type: str = None, etl_status: str = None, start_dt: str = None
    ):
        """Get ETL report from BQ

        For every request_id the most recent log row is selected and joined
        with its record from the etl-data table.

        Args:
            source_type: when given, only log rows whose details.source_type
                matches the LIKE pattern built from it are considered.
            etl_status: when given, upper-cased and compared with the status
                of the most recent run of each request.
            start_dt: when given, only log rows with a later timestamp are
                considered.

        Returns:
            A list with one `reformat_record` result per returned row.
        """
        # NOTE(security): the filter values are interpolated into the SQL text
        # as-is; only pass trusted values until this uses query parameters.

        # build query filter
        query_filters = []

        if source_type:
            # the doubled backslashes emit the same `\/` characters as before,
            # without relying on an invalid Python escape sequence
            query_filters.append(
                f'JSON_VALUE(details.source_type) LIKE "\\/{source_type}\\/%"'
            )

        if start_dt:
            query_filters.append(f'timestamp > "{start_dt}"')

        # construct query filter; AND needs a space on both sides, otherwise
        # two filters fuse into `... ANDtimestamp ...`
        query_filter = ' AND '.join(query_filters)
        if query_filter:
            query_filter = 'WHERE ' + query_filter

        # Status filter applied after grouping by request_id
        # One request can have multiple runs, we are interested only in the last run
        if etl_status:
            status_filter = f'WHERE status = "{etl_status.upper()}"'
        else:
            status_filter = ''

        # query BQ table
        _query = f"""
            WITH l AS (
                SELECT request_id, max(timestamp) as last_time
                FROM `{self._connection.gcp_project}.metamist.etl-logs`
                {query_filter}
                group by request_id
            )
            select logs.request_id, logs.timestamp as last_run_at,
            logs.status, logs.details, d.body as sample_record
            from l
            inner join `{self._connection.gcp_project}.metamist.etl-logs` logs on
            l.request_id = logs.request_id
            and logs.timestamp = l.last_time
            INNER JOIN `{self._connection.gcp_project}.metamist.etl-data` d on
            d.request_id = logs.request_id
            {status_filter}
        """

        query_job_result = self._connection.connection.query(_query).result()
        return [reformat_record(dict(row)) for row in query_job_result]
|
||
|
||
class EtlPubSub:
    """Etl Pub Sub wrapper"""

    def __init__(
        self,
        connection: PubSubConnection,
    ):
        self.connection = connection

    async def publish(
        self,
        msg: dict,
    ) -> str:
        """
        Publish `msg` to the connection's topic, with the submitting user and
        a UTC timestamp added to the payload.

        The caller's dict is left untouched; the extra fields are added to a
        copy. Returns 'submitted' when building and handing over the payload
        does not raise, and 'failed' (after logging the error) otherwise.
        """
        # naive UTC ISO string, the same format `utcnow().isoformat()` produced
        timestamp = (
            datetime.datetime.now(datetime.timezone.utc)
            .replace(tzinfo=None)
            .isoformat()
        )
        payload = {
            **msg,
            'timestamp': timestamp,
            'submitting_user': self.connection.author,
        }

        try:
            # serialisation stays inside the try so that an unserialisable
            # message is reported as 'failed' rather than raised
            self.connection.client.publish(
                self.connection.topic, json.dumps(payload).encode()
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            logging.error(f'Failed to publish to pubsub: {e}')
            return 'failed'

        return 'submitted'
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,3 +59,6 @@ | |
ProjectSummaryInternal, | ||
WebProject, | ||
) | ||
from models.models.etl import ( | ||
EtlSummary, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import datetime | ||
from models.base import SMBase | ||
|
||
|
||
class EtlSummary(SMBase):
    """One entry of the ETL summary endpoint response"""

    # request identity and run state
    request_id: str
    last_run_at: datetime.datetime | None
    status: str
    source_type: str

    # who submitted the request and what the parser reported
    submitting_user: str
    parser_result: str

    class Config:
        """Config for the EtlSummary response model"""

        orm_mode = True
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.