First draft of ETL GUI.

milo-hyben · milo-hyben · commit 8e2277156584 · 2023-09-26T16:36:29.000+10:00
diff --git a/api/routes/__init__.py b/api/routes/__init__.py
@@ -8,3 +8,4 @@
 from api.routes.web import router as web_router
 from api.routes.enum import router as enum_router
 from api.routes.sequencing_groups import router as sequencing_groups_router
+from api.routes.etl import router as etl_router
diff --git a/api/routes/etl.py b/api/routes/etl.py
@@ -0,0 +1,67 @@
+"""
+Web routes
+"""
+from typing import Any, Optional
+
+from api.utils.db import (
+    BqConnection,
+    PubSubConnection,
+    get_projectless_bq_connection,
+    get_projectless_pubsub_connection
+)
+from db.python.layers.etl import EtlLayer, EtlPubSub
+from fastapi import APIRouter, Request
+from models.models.etl import EtlSummary
+from models.models.search import SearchItem
+
+router = APIRouter(prefix='/etl', tags=['etl'])
+
+
+@router.post(
+    '/summary',
+    response_model=list[EtlSummary],
+    operation_id='getEtlSummary',
+)
+async def get_etl_summary(
+    request: Request,
+    grid_filter: list[SearchItem],
+    limit: int = 20,
+    token: Optional[int] = 0,
+    connection: BqConnection = get_projectless_bq_connection,
+) -> list[EtlSummary]:
+    """
+    Return the ETL summary records produced by EtlLayer.get_etl_summary.
+
+    grid_filter, limit and token are forwarded to the layer unchanged;
+    request is accepted but not read by this handler.
+    """
+    etl_layer = EtlLayer(connection)
+    return await etl_layer.get_etl_summary(
+        token=token, limit=limit, grid_filter=grid_filter
+    )
+
+
+@router.post(
+    '/reload',
+    response_model=str,
+    operation_id='etlReload',
+)
+async def etl_reload(
+    request: Request,
+    request_id: Optional[str] = None,
+    connection: PubSubConnection = get_projectless_pubsub_connection,
+) -> str:
+    """
+    Resubmit an ETL request to the Pub/Sub topic.
+
+    The message handed to EtlPubSub.publish has the form:
+
+    {"request_id": "640e8f2e-4e20-4959-8620-7b7741265895"}
+
+    Returns whatever EtlPubSub.publish returns.
+    """
+    # NOTE(review): request_id defaults to None and is published as-is;
+    # confirm whether a missing ID should be rejected instead.
+    msg = {
+        'request_id': request_id,
+    }
+
+    pubsub = EtlPubSub(connection)
+    return await pubsub.publish(
+        msg=msg,
+    )
diff --git a/api/utils/__init__.py b/api/utils/__init__.py
@@ -7,6 +7,7 @@
     get_project_readonly_connection,
     get_project_write_connection,
     get_projectless_db_connection,
+    get_projectless_bq_connection,
 )
 
 T = TypeVar('T')
diff --git a/api/utils/db.py b/api/utils/db.py
@@ -10,6 +10,7 @@
 from api.settings import get_default_user
 from api.utils.gcp import email_from_id_token
 from db.python.connect import SMConnections, Connection
+from db.python.gcp_connect import BqConnection, PubSubConnection
 
 
 EXPECTED_AUDIENCE = getenv('SM_OAUTHAUDIENCE')
@@ -76,6 +77,16 @@ async def dependable_get_connection(author: str = Depends(authenticate)):
     return await SMConnections.get_connection_no_project(author)
 
 
+async def dependable_get_bq_connection(author: str = Depends(authenticate)):
+    """FastAPI dependency: project-less BigQuery connection for the authenticated author"""
+    bq_connection = await BqConnection.get_connection_no_project(author)
+    return bq_connection
+
+
+async def dependable_get_pubsub_connection(author: str = Depends(authenticate)):
+    """FastAPI dependency: project-less Pub/Sub connection for the authenticated author"""
+    pubsub_connection = await PubSubConnection.get_connection_no_project(author)
+    return pubsub_connection
+
+
 def validate_iap_jwt_and_get_email(iap_jwt, audience):
     """
     Validate an IAP JWT and return email
@@ -102,3 +113,5 @@ def validate_iap_jwt_and_get_email(iap_jwt, audience):
 get_project_readonly_connection = Depends(dependable_get_readonly_project_connection)
 get_project_write_connection = Depends(dependable_get_write_project_connection)
 get_projectless_db_connection = Depends(dependable_get_connection)
+# Project-less dependencies wrapping the BigQuery and Pub/Sub connection
+# factories, for use as default values of route parameters.
+get_projectless_bq_connection = Depends(dependable_get_bq_connection)
+get_projectless_pubsub_connection = Depends(dependable_get_pubsub_connection)
diff --git a/db/python/gcp_connect.py b/db/python/gcp_connect.py
@@ -0,0 +1,76 @@
+"""
+Code for connecting to Big Query database
+"""
+import logging
+import os
+from typing import Optional
+
+import google.cloud.bigquery as bq
+from db.python.utils import InternalError
+import google.cloud.pubsub_v1 as pubsub_v1
+
+# NOTE(review): basicConfig runs at import time and targets the root logger,
+# so importing this module can switch the whole process to DEBUG output
+# (unless the root logger already has handlers) - confirm this is intended
+# outside of local development.
+logging.basicConfig(level=logging.DEBUG)
+# Module-level logger used by the connection factories below.
+logger = logging.getLogger(__name__)
+
+
+class BqConnection:
+    """Bundles a BigQuery client with its GCP project and the requesting author"""
+
+    def __init__(
+        self,
+        author: str,
+    ):
+        self.author: str = author
+        # The project is read from the environment; it is None when the
+        # variable is unset.
+        project_id = os.getenv('METAMIST_INFRA_GCP_PROJECT')
+        self.gcp_project = project_id
+        self.connection: bq.Client = bq.Client(project=project_id)
+
+    @staticmethod
+    async def get_connection_no_project(author: str):
+        """Build a BqConnection for the author without any project-level check"""
+        logger.debug(f'Authenticate no-project connection with {author!r}')
+
+        # Project-less connections are not authenticated here; the calling
+        # endpoint is relied on to validate access to the resources.
+        # (Permission checks could be added at this point later.)
+        new_connection = BqConnection(author=author)
+        return new_connection
+
+
+class BqDbBase:
+    """Base class for BigQuery database subclasses"""
+
+    def __init__(self, connection: BqConnection):
+        """
+        Store the BigQuery connection on self._connection.
+
+        Raises InternalError when connection is None or is not a BqConnection.
+        """
+        if connection is None:
+            raise InternalError(
+                f'No connection was provided to the table {self.__class__.__name__!r}'
+            )
+        if not isinstance(connection, BqConnection):
+            # Name the type that is actually required so the error is actionable.
+            raise InternalError(
+                f'Expected connection type BqConnection, received {type(connection)}, '
+                f'did you mean to call self._connection?'
+            )
+
+        self._connection = connection
+
+
+class PubSubConnection:
+    """Bundles a Pub/Sub publisher client with its target topic and the requesting author"""
+
+    def __init__(
+        self,
+        author: str,
+    ):
+        self.author: str = author
+        # The topic is read from the environment; it is None when the
+        # variable is unset.
+        self.topic: str = os.getenv('METAMIST_INFRA_PUBSUB_TOPIC')
+        self.client: pubsub_v1.PublisherClient = pubsub_v1.PublisherClient()
+
+    @staticmethod
+    async def get_connection_no_project(author: str):
+        """Build a PubSubConnection for the author without any project-level check"""
+        logger.debug(f'Authenticate no-project connection with {author!r}')
+
+        # Project-less connections are not authenticated here; the calling
+        # endpoint is relied on to validate access to the resources.
+        # (Permission checks could be added at this point later.)
+        new_connection = PubSubConnection(author=author)
+        return new_connection
diff --git a/db/python/layers/bq_base.py b/db/python/layers/bq_base.py
@@ -0,0 +1,13 @@
+from db.python.gcp_connect import BqConnection
+
+
+class BqBaseLayer:
+    """Common parent for layers that work on a BqConnection"""
+
+    def __init__(self, connection: BqConnection):
+        # Kept public: subclasses hand this connection on to their Db classes.
+        self.connection = connection
+
+    @property
+    def author(self):
+        """Author recorded on the underlying connection"""
+        conn = self.connection
+        return conn.author
diff --git a/db/python/layers/etl.py b/db/python/layers/etl.py
@@ -0,0 +1,132 @@
+import datetime
+import json
+import logging
+from typing import Any
+
+from db.python.gcp_connect import BqDbBase, PubSubConnection
+from db.python.layers.bq_base import BqBaseLayer
+from models.models import SearchItem, EtlSummary
+
+
+def reformat_record(record):
+    """
+    Convert one query result row (a dict) into an EtlSummary.
+
+    record must provide 'request_id', 'last_run_at', 'status' and 'details';
+    'details' is a JSON document holding 'source_type', 'submitting_user'
+    and 'result'.
+
+    The input dict is left untouched, and 'sample_record' is not decoded
+    because no field of EtlSummary is built from it.
+    """
+    details = json.loads(record['details'])
+
+    return EtlSummary(
+        request_id=record['request_id'],
+        last_run_at=record['last_run_at'],
+        status=record['status'],
+        source_type=details['source_type'],
+        submitting_user=details['submitting_user'],
+        parser_result=details['result'],
+    )
+    
+
+class EtlLayer(BqBaseLayer):
+    """Layer exposing ETL summaries read through EtlDb"""
+
+    async def get_etl_summary(
+        self,
+        grid_filter: list[SearchItem],
+        token: int = 0,
+        limit: int = 20,
+    ) -> Any:
+        """
+        Fetch the ETL summary records via EtlDb.get_etl_summary.
+
+        NOTE: grid_filter, token and limit are accepted but not applied yet;
+        the Db call is made without them.
+        """
+        etl_db = EtlDb(self.connection)
+        summary = await etl_db.get_etl_summary()
+        return summary
+
+
+class EtlDb(BqDbBase):
+    """BigQuery access for the ETL summary routes"""
+
+    async def get_etl_summary(
+        self,
+    ) -> list[EtlSummary]:
+        """Return the ETL report with no filters applied"""
+        return await self.get_report()
+
+    async def get_report(
+        self, source_type: str = None, etl_status: str = None, start_dt: str = None
+    ):
+        """
+        Get the ETL report from BigQuery: the latest log row of each request_id,
+        joined with the request body from the etl-data table.
+
+        source_type: keep log rows whose details.source_type matches
+            '/<source_type>/%' (LIKE wildcards inside the value still apply).
+        etl_status: keep requests whose latest run has this status
+            (compared upper-cased).
+        start_dt: keep log rows with a timestamp later than this value.
+
+        Returns a list of EtlSummary built by reformat_record.
+        """
+        # Local import: this module does not import the BigQuery client itself.
+        import google.cloud.bigquery as bq  # pylint: disable=import-outside-toplevel
+
+        # Caller-supplied values are bound as query parameters instead of being
+        # interpolated into the SQL text.
+        query_params = []
+        log_filters = []
+
+        if source_type:
+            log_filters.append('JSON_VALUE(details.source_type) LIKE @source_type')
+            query_params.append(
+                bq.ScalarQueryParameter('source_type', 'STRING', f'/{source_type}/%')
+            )
+
+        if start_dt:
+            # A STRING parameter is not coerced implicitly, hence TIMESTAMP().
+            log_filters.append('timestamp > TIMESTAMP(@start_dt)')
+            query_params.append(
+                bq.ScalarQueryParameter('start_dt', 'STRING', start_dt)
+            )
+
+        log_filter = ''
+        if log_filters:
+            log_filter = 'WHERE ' + ' AND '.join(log_filters)
+
+        # Status filter applied after grouping by request_id:
+        # one request can have multiple runs, only the last run is of interest.
+        status_filter = ''
+        if etl_status:
+            status_filter = 'WHERE status = @etl_status'
+            query_params.append(
+                bq.ScalarQueryParameter('etl_status', 'STRING', etl_status.upper())
+            )
+
+        project = self._connection.gcp_project
+        _query = f"""
+            WITH l AS (
+            SELECT request_id, max(timestamp) as last_time
+            FROM `{project}.metamist.etl-logs`
+            {log_filter}
+            group by request_id
+            )
+            select logs.request_id, logs.timestamp as last_run_at,
+            logs.status, logs.details, d.body as sample_record
+            from l
+            inner join `{project}.metamist.etl-logs` logs on
+            l.request_id = logs.request_id
+            and logs.timestamp = l.last_time
+            INNER JOIN `{project}.metamist.etl-data` d on
+            d.request_id = logs.request_id
+            {status_filter}
+        """
+
+        job_config = bq.QueryJobConfig(query_parameters=query_params)
+        # NOTE(review): result() waits synchronously inside an async method.
+        rows = self._connection.connection.query(
+            _query, job_config=job_config
+        ).result()
+        return [reformat_record(dict(row)) for row in rows]
+
+
+class EtlPubSub:
+    """Wrapper that publishes ETL messages through a PubSubConnection"""
+
+    def __init__(
+        self,
+        connection: PubSubConnection,
+    ):
+        self.connection = connection
+
+    async def publish(
+        self,
+        msg: dict,
+    ) -> str:
+        """
+        Publish msg to the connection's topic as UTF-8 encoded JSON.
+
+        A 'timestamp' (UTC, ISO format) and the connection author as
+        'submitting_user' are added to a copy of msg; the caller's dict is
+        not modified.
+
+        Returns 'submitted' once the publish has completed, or 'failed' when
+        publishing raised (the error is logged).
+        """
+        payload = {
+            **msg,
+            'timestamp': datetime.datetime.utcnow().isoformat(),
+            'submitting_user': self.connection.author,
+        }
+
+        try:
+            future = self.connection.client.publish(
+                self.connection.topic, json.dumps(payload).encode()
+            )
+            # publish() returns a future; delivery errors only surface when its
+            # result is requested, so wait for it before reporting success.
+            # NOTE(review): this waits synchronously inside an async method.
+            future.result()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logging.error(f'Failed to publish to pubsub: {e}')
+            return 'failed'
+
+        return 'submitted'
+        
diff --git a/models/models/__init__.py b/models/models/__init__.py
@@ -59,3 +59,6 @@
     ProjectSummaryInternal,
     WebProject,
 )
+from models.models.etl import (
+    EtlSummary,
+)
diff --git a/models/models/etl.py b/models/models/etl.py
@@ -0,0 +1,19 @@
+import datetime
+from models.base import SMBase
+    
+
+class EtlSummary(SMBase):
+    """Return class for the ETL summary endpoint"""
+
+    # Top-level columns of the report row (see reformat_record in
+    # db/python/layers/etl.py)
+    request_id: str
+    last_run_at: datetime.datetime | None
+    status: str
+    source_type: str
+    
+    # Values taken from the row's 'details' JSON
+    # ('submitting_user' and 'result' keys)
+    submitting_user: str
+    parser_result: str
+    
+    class Config:
+        """Config for EtlSummary Response"""
+        orm_mode = True
+    
diff --git a/web/src/Routes.tsx b/web/src/Routes.tsx
@@ -9,6 +9,7 @@ import ProjectSummaryView from './pages/project/ProjectSummary'
 import ProjectsAdmin from './pages/admin/ProjectsAdmin'
 import ErrorBoundary from './shared/utilities/errorBoundary'
 import AnalysisRunnerSummary from './pages/project/AnalysisRunnerView/AnalysisRunnerSummary'
+import EtlAdmin from './pages/etl/EtlAdmin'
 
 const Routes: React.FunctionComponent = () => (
     <Switch>
@@ -36,6 +37,8 @@ const Routes: React.FunctionComponent = () => (
 
         <Route path="admin" element={<ProjectsAdmin />} />
 
+        <Route path="etl" element={<EtlAdmin />} />
+
         <Route path="/" element={<DocumentationArticle articleid="index" />} />
 
         <Route
diff --git a/web/src/pages/etl/EtlAdmin.tsx b/web/src/pages/etl/EtlAdmin.tsx

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`	`get_project_readonly_connection,`
`8`	`8`	`get_project_write_connection,`
`9`	`9`	`get_projectless_db_connection,`
	`10`	`+ get_projectless_bq_connection,`
`10`	`11`	`)`
`11`	`12`
`12`	`13`	`T = TypeVar('T')`
Original file line number	Diff line number	Diff line change
`@@ -59,3 +59,6 @@`
`59`	`59`	`ProjectSummaryInternal,`
`60`	`60`	`WebProject,`
`61`	`61`	`)`
	`62`	`+from models.models.etl import (`
	`63`	`+ EtlSummary,`
	`64`	`+)`