Update es-index types for seqr sync API and UI (#704)

EddieLF · web-flow · commit a0faae560dd8 · 2024-03-15T10:32:55.000+11:00
* Update es-index types for seqr sync API and UI

* Sync multiple ES-Index types in one call

* Refactor from feedback

* Add seqr enums and css for sync UI

* Use asyncio gather to execute all requests

* Uncomment sync_saved_variants update call

* Sort analyses when calculating sg diff. Fix disastrous merge commit

* Remove css in favour of inline styling in typescript
diff --git a/api/routes/web.py b/api/routes/web.py
@@ -16,6 +16,7 @@
 from db.python.layers.seqr import SeqrLayer
 from db.python.layers.web import SearchItem, WebLayer
 from db.python.tables.project import ProjectPermissionsTable
+from models.enums.web import SeqrDatasetType
 from models.models.search import SearchResponse
 from models.models.web import PagingLinks, ProjectSummary
 
@@ -102,6 +103,7 @@ async def search_by_keyword(keyword: str, connection=get_projectless_db_connecti
 )
 async def sync_seqr_project(
     sequencing_type: str,
+    es_index_types: list[SeqrDatasetType],
     sync_families: bool = True,
     sync_individual_metadata: bool = True,
     sync_individuals: bool = True,
@@ -113,6 +115,7 @@ async def sync_seqr_project(
 ):
     """
     Sync a metamist project with its seqr project (for a specific sequence type)
+    es_index_types: list of any of 'SNV_INDEL', 'SV', 'SV_WES', 'MITO' (seqr UI: Haplotypecaller, SV Caller for WGS, SV Caller for WES, Mitochondria Caller)
     """
     seqr = SeqrLayer(connection)
     try:
@@ -122,6 +125,7 @@ async def sync_seqr_project(
             sync_individual_metadata=sync_individual_metadata,
             sync_individuals=sync_individuals,
             sync_es_index=sync_es_index,
+            es_index_types=es_index_types,
             sync_saved_variants=sync_saved_variants,
             sync_cram_map=sync_cram_map,
             post_slack_notification=post_slack_notification,
diff --git a/db/python/layers/seqr.py b/db/python/layers/seqr.py
@@ -24,7 +24,7 @@
 )
 from db.python.connect import Connection
 from db.python.enum_tables import SequencingTypeTable
-from db.python.layers.analysis import AnalysisLayer
+from db.python.layers.analysis import AnalysisInternal, AnalysisLayer
 from db.python.layers.base import BaseLayer
 from db.python.layers.family import FamilyLayer
 from db.python.layers.participant import ParticipantLayer
@@ -33,6 +33,7 @@
 from db.python.tables.project import Project
 from db.python.utils import GenericFilter
 from models.enums import AnalysisStatus
+from models.enums.web import SeqrDatasetType
 
 # literally the most temporary thing ever, but for complete
 # automation need to have sample inclusion / exclusion
@@ -43,6 +44,14 @@
 
 SEQUENCING_GROUPS_TO_IGNORE = {22735, 22739}
 
+# production-pipelines stage name for each dataset type; matched against an es-index analysis's meta['stage']
+ES_INDEX_STAGES = {
+    SeqrDatasetType.SNV_INDEL: 'MtToEs',
+    SeqrDatasetType.SV: 'MtToEsSv',
+    SeqrDatasetType.GCNV: 'MtToEsCNV',
+    SeqrDatasetType.MITO: 'MtToEsMito',
+}
+
 _url_individuals_sync = '/api/project/sa/{projectGuid}/individuals/sync'
 _url_individual_meta_sync = '/api/project/sa/{projectGuid}/individuals_metadata/sync'
 _url_family_sync = '/api/project/sa/{projectGuid}/families/sync'
@@ -114,6 +123,7 @@ async def sync_dataset(
         sync_individual_metadata: bool = True,
         sync_individuals: bool = True,
         sync_es_index: bool = True,
+        es_index_types: list[SeqrDatasetType] = None,
         sync_saved_variants: bool = True,
         sync_cram_map: bool = True,
         post_slack_notification: bool = True,
@@ -206,6 +216,7 @@ async def sync_dataset(
                     self.update_es_index(
                         sequencing_type=sequencing_type,
                         sequencing_group_ids=sequencing_group_ids,
+                        es_index_types=es_index_types,
                         **params,
                     )
                 )
@@ -356,9 +367,53 @@ async def sync_individual_metadata(
             f'Uploaded individual metadata for {len(processed_records)} individuals'
         ]
 
+    def check_updated_sequencing_group_ids(self, sequencing_group_ids: set[int], es_index_analyses: list[AnalysisInternal]):
+        """Return messages listing sequencing groups added since the previous index and requested IDs missing from the newest index"""
+        messages = []
+        if sequencing_group_ids:  # every check is skipped (empty result) when no IDs are supplied
+            es_index_analyses = sorted(
+                es_index_analyses, key=lambda el: el.timestamp_completed
+            )  # oldest first, so [-1] is the newest index and [-2] the one before it
+            sequencing_groups_in_new_index = set(
+                es_index_analyses[-1].sequencing_group_ids  # NOTE(review): IndexError if es_index_analyses is empty
+            )
+
+            if len(es_index_analyses) > 1:  # a diff is only possible when a previous index exists
+                sequencing_groups_in_old_index = set(
+                    es_index_analyses[-2].sequencing_group_ids
+                )
+                sequencing_groups_diff = sequencing_group_id_format_list(
+                    sequencing_groups_in_new_index - sequencing_groups_in_old_index
+                )
+                if sequencing_groups_diff:
+                    messages.append(
+                        'Sequencing groups added to index: ' + ', '.join(sequencing_groups_diff),
+                    )
+
+            sg_ids_missing_from_index = sequencing_group_id_format_list(
+                sequencing_group_ids - sequencing_groups_in_new_index  # requested IDs absent from the newest index
+            )
+            if sg_ids_missing_from_index:
+                messages.append(
+                    f'Sequencing groups missing from {es_index_analyses[-1].output}: '
+                    + ', '.join(sg_ids_missing_from_index),
+                )
+        return messages
+
+    async def post_es_index_update(self, session: aiohttp.ClientSession, url: str, post_json: dict, headers: dict[str, str]):
+        """POST post_json as a JSON body to url with the given headers and return the response body as text"""
+        resp = await session.post(
+            url=url,
+            json=post_json,
+            headers=headers,
+        )
+        resp.raise_for_status()  # raise on an HTTP error status instead of returning its body
+        return await resp.text()
+
     async def update_es_index(
         self,
         session: aiohttp.ClientSession,
+        es_index_types: list[SeqrDatasetType],
         sequencing_type: str,
         project_guid,
         headers,
@@ -392,72 +447,56 @@ async def update_es_index(
         fn_path = os.path.join(SEQR_MAP_LOCATION, filename)
         # pylint: disable=no-member
 
+        # Only need to write this once, as the POST request will ignore extra samples not in each index synced
+        with AnyPath(fn_path).open('w+') as f:  # type: ignore
+            f.write('\n'.join(rows_to_write))
+
         alayer = AnalysisLayer(connection=self.connection)
         es_index_analyses = await alayer.query(
             AnalysisFilter(
                 project=GenericFilter(eq=self.connection.project),
                 type=GenericFilter(eq='es-index'),
                 status=GenericFilter(eq=AnalysisStatus.COMPLETED),
-                meta={'sequencing_type': GenericFilter(eq=sequencing_type)},
+                meta={
+                    'sequencing_type': GenericFilter(eq=sequencing_type),
+                },
             )
         )
-
-        es_index_analyses = sorted(
-            es_index_analyses,
-            key=lambda el: el.timestamp_completed,
-        )
-
         if len(es_index_analyses) == 0:
             return ['No ES index to synchronise']
 
-        with AnyPath(fn_path).open('w+') as f:  # type: ignore
-            f.write('\n'.join(rows_to_write))
-
-        es_index = es_index_analyses[-1].output
-
         messages = []
+        requests = []  # for POST requests to gather
+        for es_index_type in es_index_types:
+            es_indexes_filtered_by_type: list[AnalysisInternal] = [
+                a
+                for a in es_index_analyses
+                if a.meta.get('stage') == ES_INDEX_STAGES[es_index_type]
+            ]
+            if not es_indexes_filtered_by_type:
+                messages.append(f'No ES index to synchronise for {es_index_type}')
+                continue
 
-        if sequencing_group_ids:
-            sequencing_groups_in_new_index = set(
-                es_index_analyses[-1].sequencing_group_ids
+            es_indexes_filtered_by_type = sorted(
+                es_indexes_filtered_by_type,
+                key=lambda el: el.timestamp_completed,
             )
 
-            if len(es_index_analyses) > 1:
-                sequencing_groups_in_old_index = set(
-                    es_index_analyses[-2].sequencing_group_ids
-                )
-                sequencing_groups_diff = sequencing_group_id_format_list(
-                    sequencing_groups_in_new_index - sequencing_groups_in_old_index
-                )
-                if sequencing_groups_diff:
-                    messages.append(
-                        'Samples added to index: ' + ', '.join(sequencing_groups_diff),
-                    )
+            es_index = es_indexes_filtered_by_type[-1].output
 
-            sg_ids_missing_from_index = sequencing_group_id_format_list(
-                sequencing_group_ids - sequencing_groups_in_new_index
-            )
-            if sg_ids_missing_from_index:
-                messages.append(
-                    'Sequencing groups missing from index: '
-                    + ', '.join(sg_ids_missing_from_index),
-                )
+            messages.extend(self.check_updated_sequencing_group_ids(sequencing_group_ids, es_indexes_filtered_by_type))
 
-        req1_url = SEQR_URL + _url_update_es_index.format(projectGuid=project_guid)
-        resp_1 = await session.post(
-            req1_url,
-            json={
+            req1_url = SEQR_URL + _url_update_es_index.format(projectGuid=project_guid)
+            post_json = {
                 'elasticsearchIndex': es_index,
-                'datasetType': 'VARIANTS',
+                'datasetType': es_index_type.value,
                 'mappingFilePath': fn_path,
                 'ignoreExtraSamplesInCallset': True,
-            },
-            headers=headers,
-        )
-        resp_1.raise_for_status()
-
-        messages.append(f'Updated ES index {es_index}')
+            }
+            requests.append(self.post_es_index_update(session, req1_url, post_json, headers))
+            messages.append(f'Updated ES index {es_index}')
 
+        messages.extend(await asyncio.gather(*requests))
         return messages
 
     async def update_saved_variants(
diff --git a/models/enums/web.py b/models/enums/web.py
@@ -9,3 +9,12 @@ class MetaSearchEntityPrefix(Enum):
     ASSAY = 'a'
     FAMILY = 'f'
     SEQUENCING_GROUP = 'sg'
+
+
+class SeqrDatasetType(Enum):
+    """Type of dataset (es-index) that can be POSTed to Seqr; each member's value is sent as the request's 'datasetType'"""
+
+    SNV_INDEL = 'SNV_INDEL'  # Haplotypecaller in seqr UI
+    SV = 'SV'                # SV Caller in seqr UI (WGS projects)
+    GCNV = 'SV_WES'          # SV Caller in seqr UI (WES projects)
+    MITO = 'MITO'            # Mitochondria Caller in seqr UI
diff --git a/web/src/pages/project/SeqrSync.tsx b/web/src/pages/project/SeqrSync.tsx