Skip to content

Commit

Permalink
migrate leontis-westhof
Browse files Browse the repository at this point in the history
  • Loading branch information
t0mdavid-m committed Apr 6, 2024
1 parent 9295f6c commit c88f306
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import pandas as pd
import argparse
import numpy as np
import json

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():

nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get sugar orientation from string (`c` = cis, `t` = trans)
sugar_orientation = lw_string[0]

# The residue ids of the nucleotides
res_ids = [None]*2

for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

if sugar_orientation == 'c':
sugar_orientation = 1
elif sugar_orientation == 't':
sugar_orientation = 2

this_output = sorted((int(res_ids[0]), int(res_ids[1])))
this_output.append(int(sugar_orientation))
output_list.append(this_output)
output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the glycosidic bond orientation annotations in the "
"NAKB-database for a specific chain. The annotations can be "
"downloaded in the section 'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
import argparse
import json
import numpy as np

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():
nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get edge annotations from string
edges = [lw_string[-2], lw_string[-1]]

# Dont allow unspecified edges in test data
if '.' in edges:
continue

res_ids = [None]*2
for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

for i, edge in enumerate(edges):
if edge == 'W':
edges[i] = 1
if edge == 'H':
edges[i] = 2
if edge == 'S':
edges[i] = 3

# Lower residue id on the left, higher residue id on the right
res_ids = np.array(res_ids, dtype=int)
edges = np.array(edges, dtype=int)
sorter = np.argsort(res_ids)
res_ids = res_ids[sorter]
edges = edges[sorter]

output_list.append(
(int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1]))
)

output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the edge type annotations in the NAKB-database for "
"a specific chain. The annotations can be downloaded in the section "
"'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

0 comments on commit c88f306

Please sign in to comment.