-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Mutational load function (SHM) #536
base: main
Are you sure you want to change the base?
Conversation
…nce and germline alignment
for more information, see https://pre-commit.ci
@@ -7,5 +7,6 @@ | |||
from ._diversity import alpha_diversity | |||
from ._group_abundance import group_abundance | |||
from ._ir_query import ir_query, ir_query_annotate, ir_query_annotate_df | |||
from ._mutational_load import mutational_load |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please make sure to also add the tool to the API documentation here:
https://github.com/scverse/scirpy/blob/main/docs/api.rst#tools-tl
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added it and I think it looks quite good :)
src/scirpy/tl/_mutational_load.py
Outdated
mutation_cdr1 = [] | ||
mutation_cdr2 = [] | ||
mutation_cdr3 = [] | ||
|
||
for row in range(len(airr_df)): | ||
fwr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:78] | ||
cdr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][78:114] | ||
fwr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][114:165] | ||
cdr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][165:195] | ||
fwr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][195:312] | ||
cdr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][ | ||
312 : (312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) | ||
] | ||
fwr4_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][ | ||
(312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) : | ||
] | ||
|
||
if frequency: | ||
fwr1_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr1"], fwr1_germline, frequency=True | ||
) | ||
cdr1_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr1"], cdr1_germline, frequency=True | ||
) | ||
fwr2_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr2"], fwr2_germline, frequency=True | ||
) | ||
cdr2_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr2"], cdr2_germline, frequency=True | ||
) | ||
fwr3_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr3"], fwr3_germline, frequency=True | ||
) | ||
cdr3_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr3"], cdr3_germline, frequency=True | ||
) | ||
fwr4_mu_rel = simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr4"], fwr4_germline, frequency=True | ||
) | ||
|
||
mutation_fwr1.append(fwr1_mu_rel) | ||
mutation_fwr2.append(fwr2_mu_rel) | ||
mutation_fwr3.append(fwr3_mu_rel) | ||
mutation_fwr4.append(fwr4_mu_rel) | ||
mutation_cdr1.append(cdr1_mu_rel) | ||
mutation_cdr2.append(cdr2_mu_rel) | ||
mutation_cdr3.append(cdr3_mu_rel) | ||
|
||
else: | ||
fwr1_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr1"], fwr1_germline) | ||
cdr1_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr1"], cdr1_germline) | ||
fwr2_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr2"], fwr2_germline) | ||
cdr2_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr2"], cdr2_germline) | ||
fwr3_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr3"], fwr3_germline) | ||
cdr3_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_cdr3"], cdr3_germline) | ||
fwr4_mu_count = simple_hamming_distance(subregion_df.iloc[row].loc[f"{chain}_fwr4"], fwr4_germline) | ||
|
||
mutation_fwr1.append(fwr1_mu_count) | ||
mutation_fwr2.append(fwr2_mu_count) | ||
mutation_fwr3.append(fwr3_mu_count) | ||
mutation_fwr4.append(fwr4_mu_count) | ||
mutation_cdr1.append(cdr1_mu_count) | ||
mutation_cdr2.append(cdr2_mu_count) | ||
mutation_cdr3.append(cdr3_mu_count) | ||
|
||
if not inplace and frequency: | ||
mutation_df[f"{chain}_fwr1_mu_freq"] = mutation_fwr1 | ||
mutation_df[f"{chain}_cdr1_mu_freq"] = mutation_cdr1 | ||
mutation_df[f"{chain}_fwr2_mu_freq"] = mutation_fwr2 | ||
mutation_df[f"{chain}_cdr2_mu_freq"] = mutation_cdr2 | ||
mutation_df[f"{chain}_fwr3_mu_freq"] = mutation_fwr3 | ||
mutation_df[f"{chain}_cdr3_mu_freq"] = mutation_cdr3 | ||
mutation_df[f"{chain}_fwr4_mu_freq"] = mutation_fwr4 | ||
|
||
if inplace and frequency: | ||
params.set_obs(f"{chain}_fwr1_mu_freq", mutation_fwr1) | ||
params.set_obs(f"{chain}_cdr1_mu_freq", mutation_cdr1) | ||
params.set_obs(f"{chain}_fwr2_mu_freq", mutation_fwr2) | ||
params.set_obs(f"{chain}_cdr2_mu_freq", mutation_cdr2) | ||
params.set_obs(f"{chain}_fwr3_mu_freq", mutation_fwr3) | ||
params.set_obs(f"{chain}_cdr3_mu_freq", mutation_cdr3) | ||
params.set_obs(f"{chain}_fwr4_mu_freq", mutation_fwr4) | ||
|
||
if inplace and not frequency: | ||
params.set_obs(f"{chain}_fwr1_mu_count", mutation_fwr1) | ||
params.set_obs(f"{chain}_cdr1_mu_count", mutation_cdr1) | ||
params.set_obs(f"{chain}_fwr2_mu_count", mutation_fwr2) | ||
params.set_obs(f"{chain}_cdr2_mu_count", mutation_cdr2) | ||
params.set_obs(f"{chain}_fwr3_mu_count", mutation_fwr3) | ||
params.set_obs(f"{chain}_cdr3_mu_count", mutation_cdr3) | ||
params.set_obs(f"{chain}_fwr4_mu_count", mutation_fwr4) | ||
|
||
if not inplace and not frequency: | ||
mutation_df[f"{chain}_fwr1_mu_count"] = mutation_fwr1 | ||
mutation_df[f"{chain}_cdr1_mu_count"] = mutation_cdr1 | ||
mutation_df[f"{chain}_fwr2_mu_count"] = mutation_fwr2 | ||
mutation_df[f"{chain}_cdr2_mu_count"] = mutation_cdr2 | ||
mutation_df[f"{chain}_fwr3_mu_count"] = mutation_fwr3 | ||
mutation_df[f"{chain}_cdr3_mu_count"] = mutation_cdr3 | ||
mutation_df[f"{chain}_fwr4_mu_count"] = mutation_fwr4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could surely be written more compactly by using a bunch of for loops...
Ideally try to extract the functionality you apply to one sequence into a smaller function and then apply it to each sequence.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I rewrote the function based on this feedback and it now looks tidier! Please let me know if I should change or adapt it further.
… function to api.rst
for more information, see https://pre-commit.ci
…cirpy into mutational_load
for more information, see https://pre-commit.ci
src/scirpy/tl/_mutational_load.py
Outdated
mutation_dict = {"fwr1": [], "fwr2": [], "fwr3": [], "fwr4": [], "cdr1": [], "cdr2": [], "cdr3": []} | ||
|
||
for row in range(len(airr_df)): | ||
fwr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:78] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where do the numbers of the indices come from? Can we be sure they will remain stable?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These indices come from the IMGT unique numbering scheme (https://pubmed.ncbi.nlm.nih.gov/12477501/). This scheme is a standard approach that ensures we can compare the V-regions of different cells. The neat thing is that sequences are aligned in such a way that fwr1-3 and cdr1-2 are always at the same positions in the germline and sequence alignments, which is why these fixed indices work. cdr3 and fwr4 can be inferred from the junction length and the total sequence length, as is done in my code.
src/scirpy/tl/_mutational_load.py
Outdated
fwr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][:78] | ||
cdr1_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][78:114] | ||
fwr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][114:165] | ||
cdr2_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][165:195] | ||
fwr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][195:312] | ||
cdr3_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][ | ||
312 : (312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) | ||
] | ||
fwr4_germline = airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][ | ||
(312 + airr_df.iloc[row].loc[f"{chain}_junction_len"] - 6) : | ||
] | ||
|
||
mutation_dict["fwr1"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr1"], | ||
fwr1_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["cdr1"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr1"], | ||
cdr1_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["fwr2"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr2"], | ||
fwr2_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["cdr2"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr2"], | ||
cdr2_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["fwr3"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr3"], | ||
fwr3_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["cdr3"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_cdr3"], | ||
cdr3_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) | ||
mutation_dict["fwr4"].append( | ||
simple_hamming_distance( | ||
subregion_df.iloc[row].loc[f"{chain}_fwr4"], | ||
fwr4_germline, | ||
frequency=frequency, | ||
ignore_chars=ignore_chars, | ||
) | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this could be further simplified by
(1)
defining a dict of regions
regions = {
"fwr1": (0, 78),
"cdr1": (78, 114),
...
}
and then looping through it; somewhat like
mutation_dict = {}
for region, coordinates in regions.items():
mutation_dict[region] = simple_hamming_distance(
subregion_df.iloc[row].loc[f"{chain}_fwr3"],
airr_df.iloc[row].loc[f"{chain}_{germline_alignment}"][slice(*coordinates)]
...
)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see :D
Thanks for the suggestions! I will try to simplify it further!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the hint. I just adapted the code based on this suggestion and now it looks so much better :D
What's still pending is a robust test case. I had a look at how other tests are written and I get the idea. However, I still don't know the best way to get/generate test data. Should I manually generate a small (e.g. 10 sequences) dataset inside the test function, which is then used to test mutational_load, or what is the best practice here?
The dataset needs to be IMGT numbered so I don't think that I could load any of scirpy's native datasets here...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should I manually generate a small (e.g. 10 sequences) dataset inside the test-function
Yes, this is common practice. You can also put a small data file in `src/scirpy/tests/data`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I'll come up with something for my next push here :)
for more information, see https://pre-commit.ci
src/scirpy/tl/_mutational_load.py
Outdated
), | ||
} | ||
|
||
for v, coordinates in regions.items(): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for v, coordinates in regions.items(): | |
for region, coordinates in regions.items(): |
One-letter loop variables should only be used if they follow certain conventions, e.g. `i`/`j`/`k` for counters in for loops, or `k, v` for key, value pairs from `dict.items()`.
Since you use `v` for the dict key, this can be confusing, and I suggest using a "proper" variable name like `region` here.
src/scirpy/tl/_mutational_load.py
Outdated
for chain in chains: | ||
airr_df[f"{chain}_junction_len"] = [len(a) for a in airr_df[f"{chain}_junction"]] | ||
|
||
mutation_dict = {"fwr1": [], "fwr2": [], "fwr3": [], "fwr4": [], "cdr1": [], "cdr2": [], "cdr3": []} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mutation_dict = {"fwr1": [], "fwr2": [], "fwr3": [], "fwr4": [], "cdr1": [], "cdr2": [], "cdr3": []} | |
mutation_dict = defaultdict(list) |
In terms of implementation, I think we're getting there :) |
…cirpy into mutational_load
for more information, see https://pre-commit.ci
…cirpy into mutational_load
Hi Gregor, for some reason pushing these changes seems to have broken something with MuData, but I have no idea why or what I could possibly have done to cause this 😢 The error message seems to be the same everywhere: |
Breaking mudata is not your fault. It was caused by an anndata release and should be fixed by now. Just rerun the tests :) |
Codecov ReportAttention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #536 +/- ##
==========================================
+ Coverage 80.19% 81.70% +1.51%
==========================================
Files 49 50 +1
Lines 4079 4297 +218
==========================================
+ Hits 3271 3511 +240
+ Misses 808 786 -22 ☔ View full report in Codecov by Sentry. |
Check out this pull request on See visual diffs & provide feedback on Jupyter Notebooks. Powered by ReviewNB |
Added mutational_load function to calculate differences between sequence and germline alignment. This is especially useful/insightful for BCR data due to SHM and helps to understand how much mutation actually occurred. However, this is a rather simple approach!
Closes #...