Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Populate tob-wgs-test metamist with nagim gvcf files #653

Draft
wants to merge 50 commits into
base: dev
Choose a base branch
from
Draft
Changes from 47 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
a1b5be9
Merge pull request #574 from populationgenomics/dev
illusional Oct 12, 2023
4cb1744
Merge pull request #579 from populationgenomics/dev
illusional Oct 19, 2023
6d77327
Merge pull request #581 from populationgenomics/dev
illusional Oct 21, 2023
cb53231
Merge pull request #584 from populationgenomics/dev
illusional Oct 23, 2023
24ae802
Merge pull request #587 from populationgenomics/dev
illusional Oct 25, 2023
37618f1
Merge pull request #595 from populationgenomics/dev
illusional Oct 31, 2023
f222398
Merge pull request #599 from populationgenomics/dev
illusional Nov 1, 2023
e8025df
Merge pull request #613 from populationgenomics/dev
vivbak Nov 15, 2023
f8acb63
Billing pre-release (#605) (#617)
milo-hyben Nov 20, 2023
a684651
Merge pull request #618 from populationgenomics/dev
vivbak Nov 21, 2023
7db6896
Merge pull request #623 from populationgenomics/dev
vivbak Nov 23, 2023
5beb48d
Merge pull request #632 from populationgenomics/dev
michael-harper Dec 11, 2023
2a49cd9
Merge pull request #639 from populationgenomics/dev
illusional Jan 3, 2024
570809a
Merge pull request #641 from populationgenomics/dev
illusional Jan 5, 2024
3d2df80
Merge pull request #644 from populationgenomics/dev
illusional Jan 9, 2024
1a2f22a
Merge pull request #649 from populationgenomics/dev
nevoodoo Jan 11, 2024
6f619d0
script to create samples into tob-wgs metamist
michael-harper Jan 15, 2024
2645935
changed how to read path
michael-harper Jan 15, 2024
b97ecd2
changed project id to be an integer
michael-harper Jan 15, 2024
7f963b7
making assays not None
michael-harper Jan 15, 2024
db607f2
assays has to be a list
michael-harper Jan 15, 2024
c896d30
adding assay meta fields
michael-harper Jan 15, 2024
beb62f1
added script to add gvcf analysis entries metamist
michael-harper Jan 15, 2024
57e8e83
fixed variable naming
michael-harper Jan 15, 2024
ce4f7b9
changed analysis status from COMPLETED to completed
michael-harper Jan 15, 2024
fef13dc
removed unnecessary fields from analysis upsert
michael-harper Jan 15, 2024
c1e1b7e
now adding analyses to correct sequencingGroups
michael-harper Jan 15, 2024
b7ba499
generating participants
michael-harper Jan 16, 2024
3c131d8
importing and global variable changes
michael-harper Jan 16, 2024
7ee6d51
adding necessary assay upsert fields
michael-harper Jan 16, 2024
fb2c690
adding more required fields to assay upsert
michael-harper Jan 16, 2024
225f6fd
editing assays so that their "type" is unique
michael-harper Jan 16, 2024
8196100
adding new participants to tob-wgs-test metamist
michael-harper Jan 17, 2024
2bf2f65
removed hard coding of mapping file
michael-harper Jan 17, 2024
b46e8f8
remove unwanted files
michael-harper Jan 17, 2024
76eb16f
making suggested changes
michael-harper Jan 17, 2024
cf8b604
adding suffix to append to external ids as a parameter
michael-harper Jan 17, 2024
52d7c3c
removing external id suffix properly
michael-harper Jan 17, 2024
1037931
using dataclass to retriev gvcf path from dictionary
michael-harper Jan 17, 2024
3295307
not using sg= inside a list as this is not allowed
michael-harper Jan 17, 2024
ebe2689
removing project_id as parameter input
michael-harper Jan 17, 2024
5239ebd
adding print to sg for debugging
michael-harper Jan 17, 2024
6c07839
was iterating over "test" project to fetch data instead of "main" pro…
michael-harper Jan 17, 2024
a3839ad
Merge branch 'dev' into populate-tob-test-with-nagim
michael-harper Jan 17, 2024
315bc0a
correct assay["type"] assignment
michael-harper Jan 17, 2024
0299f7e
changed naming of project-id-to-add-to parameter. Shortened it to pro…
michael-harper Jan 17, 2024
94c7cc5
linting issues!
michael-harper Jan 17, 2024
64310cf
Update scripts/add_nagim_gvcfs_to_tob_wgs_test_metamist.py
michael-harper Jan 18, 2024
4df08e1
Update scripts/add_nagim_gvcfs_to_tob_wgs_test_metamist.py
michael-harper Jan 18, 2024
2f66f18
fixing referencing of sequencing group attributes when upserting assay
michael-harper Jan 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions scripts/add_nagim_gvcfs_to_tob_wgs_test_metamist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import csv
from dataclasses import dataclass
from pprint import pprint

import click
from cpg_utils import to_path

from metamist.apis import ParticipantApi, AnalysisApi
from metamist.graphql import gql, query
from metamist.models import (
ParticipantUpsert,
SampleUpsert,
SequencingGroupUpsert,
AssayUpsert,
AnalysisStatus,
Analysis,
)


@dataclass
class RowData:
    """
    One record of the sample/path mappings CSV.

    The fields are declared in the same order as the CSV columns, so a
    parsed row can be unpacked positionally, e.g. ``RowData(*row[:4])``.

    Attributes
    ----------
    sgid : str
        identifier of the sequencing group
    ext_id : str
        external identifier associated with the row
    gvcf : str
        path to the gvcf file
    gvcf_idx : str
        path to the index file of the gvcf
    """

    sgid: str
    ext_id: str
    gvcf: str
    gvcf_idx: str


# GraphQL document passed to `query()` with a single `project` variable
# (the project name). For that project it requests the project id and, for
# every participant, its id and externalId plus the nested samples, each
# sample's sequencing groups (ids, meta, platform, technology, type) and each
# sequencing group's assays (id and meta).
PARTICIPANT_QUERY = gql(
"""
query ($project: String!) {
project(name: $project) {
id
participants {
externalId
id
samples {
id
externalId
sequencingGroups {
externalIds
id
meta
platform
technology
type
assays {
meta
id
}
}
}
}
}
}
"""
)


def _load_row_data(sample_path_mappings: str) -> dict[str, RowData]:
    """
    Read the sample/path mappings CSV into a dict keyed by external ID.

    The first line is treated as a header and skipped. Only the first four
    columns of each row (sgid, ext_id, gvcf, gvcf_idx) are used. Blank lines
    are ignored. If an external ID occurs more than once, the last row wins.

    Raises
    ------
    ValueError
        if a non-blank row has fewer than four columns
    """
    ext_id_to_row: dict[str, RowData] = {}
    with to_path(sample_path_mappings).open() as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            if len(row) < 4:
                raise ValueError(
                    f'{sample_path_mappings}, line {reader.line_num}: expected at '
                    f'least 4 columns (sgid, ext_id, gvcf, gvcf_idx), got {len(row)}'
                )
            data = RowData(*row[:4])
            ext_id_to_row[data.ext_id] = data
    return ext_id_to_row


def _build_sequencing_group_upsert(sg: dict) -> SequencingGroupUpsert:
    """
    Build the upsert for one sequencing group returned by PARTICIPANT_QUERY.

    The new sequencing group copies type, technology and platform, and gets a
    single assay whose meta is taken from the FIRST assay of the source
    sequencing group (so the source must have at least one assay).
    """
    first_assay_meta = sg['assays'][0]['meta']
    return SequencingGroupUpsert(
        type=sg['type'],
        technology=sg['technology'],
        platform=sg['platform'],
        meta=None,
        sample_id=None,
        external_ids=None,
        assays=[
            AssayUpsert(
                type=sg['type'],
                meta={
                    'sequencing_type': first_assay_meta['sequencing_type'],
                    'sequencing_platform': first_assay_meta['sequencing_platform'],
                    'sequencing_technology': first_assay_meta[
                        'sequencing_technology'
                    ],
                },
            ),
        ],
    )


@click.command()
@click.option(
    '--project',
    required=True,
    help='The name of the project to add samples to.',
)
@click.option(
    '--sample-path-mappings',
    required=True,
    help="""The path to a CSV file containing mappings of `main` CPG ID's,
the `external_id` and `gvcf` paths.
The file should have at least four columns: sgid, ext_id, gvcf, and gvcf_idx.
Here's an example of what the first couple of lines might look like:

sgid,ext_id,gvcf,gvcf_idx
sg1,ext1,gvcf1,gvcf_idx1
sg2,ext2,gvcf2,gvcf_idx2
""",
)
@click.option(
    '--project-id',
    required=True,
    type=int,
    help="""The ID of the project to add samples to.
For example: iterate over `main` project (identified by the --project flag) to get data for each sample, then create a new participant
with the same data, but with a new external ID that has the suffix (--suffix) specified by the user.
Then upsert these into the `test` project.
""",
)
@click.option(
    '--suffix',
    required=True,
    help="""The suffix to add to the external ID's of the participants.
For example, if the suffix is `test`, then the external ID's of the participants
will be `ext_id1-test`, `ext_id2-test`, etc.
""",
)
def main(project: str, project_id: int, sample_path_mappings: str, suffix: str):
    """
    Iterate over `main` project to get data for each sample, then create a new participant
    with the same data, but with a new external ID that has the suffix specified by the user.
    Then upsert these into the `test` project.

    Steps:
      1. Load the CSV mappings (external ID -> sgid / gvcf paths).
      2. Query `project` for its participants and keep only those whose
         external ID appears in the CSV.
      3. Build a participant/sample/sequencing-group/assay upsert for each,
         using `<external id>-<suffix>` as the new external ID.
      4. Upsert the participants, then register one completed `gvcf` analysis
         per upserted sample, attached to that sample's first sequencing group.
    """
    ext_id_to_row = _load_row_data(sample_path_mappings)

    query_response = query(PARTICIPANT_QUERY, {'project': project})
    p_upserts = []
    # pylint: disable=unsubscriptable-object
    for participant in query_response['project']['participants']:
        if participant['externalId'] not in ext_id_to_row:
            continue
        ext_id = f"{participant['externalId']}-{suffix}"
        p = ParticipantUpsert(
            external_id=ext_id,
            active=None,
            samples=[],
        )
        for sample in participant['samples']:
            # Each new sample reuses the participant's suffixed external ID.
            s = SampleUpsert(
                external_id=ext_id,
                project=project_id,
                sequencing_groups=[],
            )
            for sg in sample['sequencingGroups']:
                pprint(sg)  # debugging aid: show the source sequencing group
                s.sequencing_groups.append(_build_sequencing_group_upsert(sg))
            p.samples.append(s)
        p_upserts.append(p)
    # pylint: enable=unsubscriptable-object

    # NOTE(review): the upsert and the analyses below target `project`, the
    # same project the data was queried from, while the samples carry
    # `project_id`. Confirm this is the intended destination.
    upserted_participants = ParticipantApi().upsert_participants(project, p_upserts)

    analysis_api = AnalysisApi()  # one client reused for every analysis
    for participant in upserted_participants:
        for sample in participant['samples']:
            # Strip the suffix to get back to the CSV's original external ID.
            old_ext_id = sample['externalId'].removesuffix(f'-{suffix}')
            gvcf_path = ext_id_to_row[old_ext_id].gvcf
            analysis_api.create_analysis(
                project,
                Analysis(
                    type='gvcf',
                    status=AnalysisStatus('completed'),
                    output=gvcf_path,
                    # Key fixed: it previously had a trailing space
                    # ('sequencingGroups '), unlike every other use.
                    sequencing_group_id=sample['sequencingGroups'][0]['id'],
                    active=True,
                ),
            )


if __name__ == '__main__':
    # click fills in main()'s parameters from the command line, so the call
    # deliberately passes no arguments; silence pylint for this line only.
    main()  # pylint: disable=no-value-for-parameter
michael-harper marked this conversation as resolved.
Show resolved Hide resolved
Loading