diff --git a/requirements-dev.txt b/requirements-dev.txt
index c8cf3c9d8..96c6876a8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.4.0
     # via aiohttp
-aiohttp==3.10.5
+aiohttp==3.10.*
     # via -r requirements.in
 aiomysql==0.2.0
     # via databases
@@ -110,7 +110,7 @@ docker==7.1.0
     # via testcontainers
 email-validator==2.2.0
     # via fastapi
-fastapi[all]==0.112.2
+fastapi[all]==0.115.5
     # via
     #   -r requirements.in
     #   strawberry-graphql
@@ -413,7 +413,7 @@ sqlalchemy==2.0.34
     # via
     #   databases
     #   testcontainers
-starlette==0.38.4
+starlette==0.41.2
     # via
     #   fastapi
     #   strawberry-graphql
@@ -471,7 +471,7 @@ watchfiles==0.24.0
     # via uvicorn
 websockets==13.0.1
     # via uvicorn
-werkzeug==3.0.4
+werkzeug==3.0.6
     # via
     #   flask
     #   functions-framework
@@ -479,5 +479,5 @@ wrapt==1.16.0
     # via
     #   deprecated
     #   testcontainers
-yarl==1.9.10
+yarl==1.12.*
     # via aiohttp
diff --git a/requirements.in b/requirements.in
index adc163145..932253c4e 100644
--- a/requirements.in
+++ b/requirements.in
@@ -14,7 +14,7 @@ google-cloud-logging==2.7.0
 google-cloud-pubsub==2.18.3
 google-cloud-storage==1.43.0
 uvicorn==0.29.0
-fastapi[all]==0.112.2
+fastapi[all]==0.115.5
 strawberry-graphql[fastapi]==0.243.0
 databases[mysql]==0.9.0
 cryptography>=41.0.0
diff --git a/requirements.txt b/requirements.txt
index 5929d5b0c..4f3e1ee70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.4.0
     # via aiohttp
-aiohttp==3.10.5
+aiohttp==3.10.*
     # via -r requirements.in
 aiomysql==0.2.0
     # via databases
@@ -79,7 +79,7 @@ dnspython==2.6.1
     # via email-validator
 email-validator==2.2.0
     # via fastapi
-fastapi[all]==0.112.2
+fastapi[all]==0.115.5
     # via
     #   -r requirements.in
     #   strawberry-graphql
@@ -298,7 +298,7 @@ sniffio==1.3.1
     # via
     #   httpx
 sqlalchemy==2.0.34
     # via databases
-starlette==0.38.4
+starlette==0.41.2
     # via fastapi
 strawberry-graphql[fastapi]==0.243.0
     # via -r requirements.in
@@ -338,5 +338,5 @@ websockets==13.0.1
     # via uvicorn
 wrapt==1.16.0
     # via deprecated
-yarl==1.9.10
+yarl==1.12.*
     # via aiohttp
diff --git a/scripts/back_populate_library_type.py b/scripts/back_populate_library_type.py
index 5754ed6d2..9458347a8 100644
--- a/scripts/back_populate_library_type.py
+++ b/scripts/back_populate_library_type.py
@@ -44,8 +44,8 @@
 # VCGS fastq
 # This is the current name format
 vcgs_fastq_regex = (
-    r'(?P<date>\d{6})_(?P<instrument>[A-Z\d]+)_(?P<run_number>\d{4})_'
-    r'(?P<flow_cell>[A-Z]{2}\d+)_(?P<sample_id>[\w\d-]+)_(?P<library_id>[A-Z\d-]+)_'
+    r'(?P<date>\d{6,8})_(?P<instrument>[A-Z\d]+)_(?P<run_number>\d{4})_'
+    r'(?P<flow_cell>[A-Z]{2}\d+)_(?P<sample_id>[\w\d-]+)_(?P<library_id>[A-Z\d-]+\D?)_'
     r'(?P<library_type>[\w\d]+)_(?P<lane>L\d+)_(?P<read>R[12])\.fastq\.gz'
 )
 # Pre mid 2018 the library id was not included:
@@ -115,6 +115,12 @@ def check_assay_meta_fields(assays: list[dict], update_sequencing_groups: bool):
                 'design_description'
             )
 
+        # Some fastq files are not named in the standard format, but contain the word 'TWIST',
+        # which we can use to label them with the generic 'TWIST' library type from VCGS
+        elif 'TWIST' in fastq_filename:
+            assay_meta_fields_to_update['facility'] = 'vcgs'
+            assay_meta_fields_to_update['library_type'] = 'TWIST'
+
         else:
             logging.warning(
                 f'No file name match found for assay {assay_id}. Skipping {fastq_filename}.'
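A quick sanity check of the widened VCGS pattern: the date field now accepts six to eight digits, the library id tolerates a trailing non-digit, and non-standard names containing 'TWIST' fall through to the new elif branch. The named groups were mangled in the diff above and have been reconstructed with plausible names; those names and the filename below are illustrative, not real VCGS data.

import re

vcgs_fastq_regex = (
    r'(?P<date>\d{6,8})_(?P<instrument>[A-Z\d]+)_(?P<run_number>\d{4})_'
    r'(?P<flow_cell>[A-Z]{2}\d+)_(?P<sample_id>[\w\d-]+)_(?P<library_id>[A-Z\d-]+\D?)_'
    r'(?P<library_type>[\w\d]+)_(?P<lane>L\d+)_(?P<read>R[12])\.fastq\.gz'
)

# Hypothetical filename: an 8-digit date and a library id ending in a
# non-digit, neither of which the previous pattern accepted.
filename = '20240101_A00123_0001_AB12345_SAMPLE-1_LIB123a_TSqNano_L001_R1.fastq.gz'
match = re.fullmatch(vcgs_fastq_regex, filename)
print(match.group('library_type') if match else 'no match')  # -> TSqNano

# Names that miss the pattern but contain 'TWIST' are caught by the new branch.
print('TWIST' in 'SAMPLE-2_TWIST_exome_R1.fastq.gz')  # -> True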
diff --git a/scripts/upsert_participants.py b/scripts/upsert_participants.py
new file mode 100644
index 000000000..556fa6afa
--- /dev/null
+++ b/scripts/upsert_participants.py
@@ -0,0 +1,184 @@
+import csv
+import logging
+from collections import defaultdict
+from io import StringIO
+from typing import Dict, List
+
+import click
+from google.cloud import storage
+
+from metamist.apis import ParticipantApi
+from metamist.graphql import gql, query
+from metamist.models import (
+    ParticipantUpsert,
+    SampleUpsert,
+)
+
+SAMPLE_QUERY = gql(
+    """
+    query getSampleData($project: String!) {
+        project(name: $project) {
+            id
+        }
+        sample(project: {eq: $project}) {
+            id
+            active
+            externalId
+            sequencingGroups {
+                id
+                assays {
+                    id
+                    meta
+                }
+                analyses {
+                    id
+                    output
+                    outputs
+                    meta
+                }
+            }
+        }
+    }
+    """
+)
+
+PARTICIPANT_QUERY = gql(
+    """
+    query getParticipantData($project: String!) {
+        project(name: $project) {
+            participants {
+                id
+                externalId
+            }
+        }
+    }
+    """
+)
+
+papi = ParticipantApi()
+
+
+def map_sample_eid_to_sample_data(data: Dict[str, list]):
+    """Map sample external ID to sample data"""
+    return {sample['externalId']: sample for sample in data['sample']}
+
+
+def get_sample_data(project: str):
+    """Get sample data from metamist"""
+    response = query(SAMPLE_QUERY, {'project': project})
+    if not isinstance(response, dict):
+        raise ValueError('Expected a dictionary response from query')
+    return response
+
+
+def create_participant(sample_data: list, peid: str, sex: int, external_org_name: str):
+    """Create participant upsert object"""
+    return ParticipantUpsert(
+        id=None,
+        external_ids={external_org_name: peid},
+        reported_sex=sex,
+        samples=sample_data,
+    )
+
+
+def create_sample(project_id: int, seid: str, siid: str, external_org_name: str):
+    """Create sample upsert object"""
+    return SampleUpsert(
+        id=siid,
+        external_ids={external_org_name: seid},
+        project=project_id,
+    )
+
+
+def read_files_from_gcs(bucket_name: str, file_path: str):
+    """Read a file from GCS and return its contents as a text stream"""
+    client = storage.Client()
+    bucket = client.get_bucket(bucket_name)
+    blob = bucket.blob(file_path)
+    content = blob.download_as_text(encoding='utf-8')
+    return StringIO(content)
+
+
+@click.command()
+@click.option('--bucket', help='Bucket containing files. e.g. cpg-bucket-test-upload')
+@click.option('--participant-meta', help='Path to participant metadata CSV.')
+@click.option('--sample-meta', help='Path to sample metadata CSV.')
+@click.option('--sample-external-id-column', help='Column name for sample external ID.')
+@click.option('--external-org-name', help='External organization name.')
+def main(
+    bucket: str,
+    participant_meta: str,
+    sample_meta: str,
+    sample_external_id_column: str,
+    external_org_name: str,
+):  # pylint: disable=too-many-locals
+    """Upsert participants to metamist"""
+    # Query metamist
+    project = bucket.split('-')[1]  # project name derived from bucket name
+    data_response = get_sample_data(project)
+    project_id = data_response.get('project')['id']
+
+    sample_eid_mapping = map_sample_eid_to_sample_data(data_response)
+
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Read in csv
+    participant_data_io = read_files_from_gcs(bucket, participant_meta)
+    sample_data_io = read_files_from_gcs(bucket, sample_meta)
+
+    participant_data_list = list(csv.DictReader(participant_data_io))
+    sample_data_list = list(csv.DictReader(sample_data_io, delimiter='\t'))
+
+    # Count of samples for each participant
+    participant_sample_count: Dict[str, int] = defaultdict(int)
+    metadata_map = {}
+    for meta_row in participant_data_list:
+        metadata_map[meta_row['sampleID']] = meta_row
+        peid = meta_row['externalID']
+        participant_sample_count[peid] += 1
+
+    # Map peid to a list of sample upsert objects
+    participant_sex_map = {}
+    participant_sample_map: Dict[str, List[SampleUpsert]] = defaultdict(list)
+
+    for name_row in sample_data_list:
+        # Get the sample external ID from the CRAM file name
+        seid = '.'.join(name_row['CRAM'].split('.')[:2])
+
+        meta_row = metadata_map.get(name_row[sample_external_id_column])
+        if meta_row is None:
+            logging.warning(f'No metadata row for {name_row[sample_external_id_column]}. Skipping.')
+            continue
+
+        # Get participant external ID
+        peid = meta_row['externalID']
+
+        # Code sex
+        sex = 2 if meta_row['isFemale'].upper() == 'TRUE' else 1
+
+        # Get the sample internal ID
+        if seid not in sample_eid_mapping:
+            logging.warning(f'Sample {seid} not found in metamist. Skipping.')
+            continue
+        siid = sample_eid_mapping[seid]['id']
+
+        # Record the reported sex the first time we see this participant
+        if peid not in participant_sample_map:
+            participant_sex_map[peid] = sex
+        participant_sample_map[peid].append(
+            create_sample(project_id, seid, siid, external_org_name)
+        )
+
+    participant_upserts = []
+    for peid, sample_upserts in participant_sample_map.items():
+        participant_upserts.append(
+            create_participant(
+                sample_upserts, peid, participant_sex_map[peid], external_org_name
+            )
+        )
+
+    # Upsert participants
+    api_response = papi.upsert_participants(project, participant_upserts)
+    print(api_response)
+
+
+if __name__ == '__main__':
+    # pylint: disable=no-value-for-parameter
+    main()
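For reference, a minimal sketch of the upsert payload the new script assembles for one participant owning one existing sample, followed by an illustrative invocation. The IDs, org name, project, bucket, and file paths here are all made up.

from metamist.apis import ParticipantApi
from metamist.models import ParticipantUpsert, SampleUpsert

# id=None creates the participant; passing an existing internal sample ID
# attaches (rather than re-creates) the sample.
participant = ParticipantUpsert(
    id=None,
    external_ids={'example-org': 'PART01'},
    reported_sex=2,
    samples=[
        SampleUpsert(
            id='CPGAAAA',  # internal sample ID returned by the GraphQL query
            external_ids={'example-org': 'SAMPLE01'},
            project=1,  # internal project ID
        )
    ],
)
ParticipantApi().upsert_participants('myproject', [participant])

# Equivalent run of the script itself (illustrative arguments):
#   python scripts/upsert_participants.py \
#       --bucket cpg-myproject-test-upload \
#       --participant-meta metadata/participants.csv \
#       --sample-meta metadata/samples.tsv \
#       --sample-external-id-column sampleID \
#       --external-org-name example-org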