forked from Arcadia-Science/ProteinCartography
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_source_of_hits.py
144 lines (120 loc) · 4.94 KB
/
get_source_of_hits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
import argparse
import os
import pandas as pd
# only import these functions when using import *
__all__ = ["get_source"]
# parse command line arguments
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--input",
required=True,
help="Input filepath containing list of protids as index. Usually pivoted distance matrix.",
)
parser.add_argument(
"-f",
"--hit-files",
nargs="+",
required=True,
help="List of filepaths to parse source info.",
)
parser.add_argument("-o", "--output", required=True, help="Destination filepath.")
parser.add_argument(
"-k",
"--keyids",
nargs="+",
help=(
"Key ids (usually protids of input proteins) used to determine the source of a protid."
"\nUseful when there are multiple input proteins."
),
)
args = parser.parse_args()
return args
def get_source(
input_file: str,
hit_files: list,
savefile=None,
groupby=("method", "keyid"),
methods=("blast", "foldseek"),
keyids=None,
) -> pd.DataFrame:
"""
Aggregates a list of hit files to determine the source of each protid for a features matrix.
Args:
input_file (str): path to input pivoted results file.
hit_files (list): list of filepaths to the results files
(usually blast_hits and foldseek_hits)
savefile (str): path to destination file.
groupby (tuple): how to group summary metrics. Defaults to 'method' and 'keyid'.
methods (tuple): what methods to look for Defaults to 'blast' and 'foldseek'.
keyids (list): which keyids to identify as the source data for blast/foldseek.
Returns:
a pandas.DataFrame containing the results.
"""
if keyids is None:
keyids = []
# Read the input file
df = pd.read_csv(input_file, sep="\t", index_col="protid")
# Get the index to get a list of protids
df_indexes = pd.DataFrame(df.index)
# Read the entries of each source file as a list
for file in hit_files:
# Get the name of the file
sourcename = os.path.basename(file)
# Get the column name (usually protid.method)
sourcecol = sourcename.partition("hits")[0]
# Read the file
with open(file) as f:
sourceitems = [i.rstrip("\n") for i in f.readlines()]
# For each protid, give it a 1 if it's a hit to that source file and a 0 if not.
df_indexes[sourcecol] = pd.Series([1 if i in sourceitems else 0 for i in df.index])
# Create a summary column by method.
# For example, if there are multiple input proteins,
# and you want to know if any of those produced a blast hit for each protid,
# this will do that for you.
if "method" in groupby:
# Go across the methods
for method in methods:
# Identify the columns that have the method and see if any are true
df_indexes[method] = df_indexes[df_indexes.filter(like=method).columns].any(axis=1)
# convert the True / False to 1 / 0 encoding.
df_indexes[method] = df_indexes[method].apply(lambda x: 1 if x is True else 0)
# If both 'blast' and 'foldseek' are in the methods,
# generate a combined column for those hits
# that were found via both methods.
if "blast" in methods and "foldseek" in methods:
df_indexes["blast+foldseek"] = df_indexes[["blast", "foldseek"]].all(axis=1)
df_indexes["blast+foldseek"] = df_indexes["blast+foldseek"].apply(
lambda x: 2 if x is True else 0
)
# Summarize across source columns to assign an "origin" for each protid.
# If a protein came from both blast and foldseek,
# because the blast+foldseek column has a max value of 2,
# that assignment overrides the other two.
df_indexes["source.method"] = df_indexes[["blast", "foldseek", "blast+foldseek"]].idxmax(
axis=1
)
# Create a summary column by keyid (usually the input proteins used to make the search).
# This tells you for each protid whether it was a hit to one of the keyid proteins.
if "keyid" in groupby and len(keyids):
for keyid in keyids:
keyidcol = keyid + ".hit"
df_indexes[keyidcol] = df_indexes[df_indexes.filter(like=keyid).columns].any(axis=1)
df_indexes[keyidcol] = df_indexes[keyidcol].apply(lambda x: 1 if x is True else 0)
# Save to file
if savefile is not None:
df_indexes.to_csv(savefile, sep="\t", index=None)
return df_indexes
# run this if called from the interpreter
def main():
args = parse_args()
input_file = args.input
hit_files = args.hit_files
output_file = args.output
keyids = args.keyids
get_source(input_file, hit_files, savefile=output_file, keyids=keyids)
# check if called from interpreter
if __name__ == "__main__":
main()