Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve OBO contribution script #156

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 37 additions & 19 deletions src/biomappings/contribute/obo.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from biomappings.contribute.utils import get_curated_mappings


def update_obo(*, prefix: str, path: Union[str, Path]) -> None:
def update_obo(*, prefix: str, path: Union[str, Path], upper: bool = False) -> None:
"""Update an OBO flat file.

:param prefix: Prefix for the ontology
Expand All @@ -34,13 +34,13 @@ def update_obo(*, prefix: str, path: Union[str, Path]) -> None:
with path.open("r") as file:
lines = file.readlines()
mappings = get_curated_mappings(prefix)
lines = update_obo_lines(lines=lines, mappings=mappings)
lines = update_obo_lines(lines=lines, mappings=mappings, upper=upper)
with path.open("w") as file:
file.writelines(lines)


def update_obo_lines(
*, lines: List[str], mappings: List[Dict[str, Any]], progress: bool = True
*, lines: List[str], mappings: List[Dict[str, Any]], progress: bool = True, upper: bool = False
) -> List[str]:
"""Update the lines of an OBO file.

Expand All @@ -51,13 +51,20 @@ def update_obo_lines(
"""
lines = deepcopy(lines)

for mapping in tqdm(mappings, unit="mapping", unit_scale=True, disable=not progress):
for mapping in tqdm(
mappings, unit="mapping", unit_scale=True, disable=not progress, desc="Adding mappings"
):
target_prefix = mapping["target prefix"]
target_prefix = bioregistry.get_preferred_prefix(target_prefix) or target_prefix
preterred_target_prefix = bioregistry.get_preferred_prefix(target_prefix)
if preterred_target_prefix:
target_prefix = preterred_target_prefix
elif upper:
target_prefix = target_prefix.upper()
target_identifier = standardize_identifier(target_prefix, mapping["target identifier"])

source_curie = mapping["source"]
if not source_curie.startswith("orcid:"):
tqdm.write("missing ORCID")
continue

# FIXME be careful about assumption about identifier. currently
Expand All @@ -73,7 +80,7 @@ def update_obo_lines(


def add_xref(
lines: List[str], node: str, xref: str, xref_name: str, author_orcid: str
lines: List[str], node: str, xref_curie: str, xref_name: str, author_orcid: str
) -> List[str]:
"""Add xref to OBO file lines in the appropriate place."""
look_for_xref = False
Expand All @@ -84,9 +91,9 @@ def add_xref(
#: The 0-indexed line number on which the definition appears in a given
def_idx = None
xref_entries = []
xref_values = set()
existing_xref_curies = set()
for idx, line in enumerate(lines):
if line == f"id: {node}":
if line.strip() == f"id: {node}":
id_idx = idx
look_for_xref = True
if look_for_xref and line.startswith("def"):
Expand All @@ -96,18 +103,18 @@ def add_xref(
if not start_xref_idx:
start_xref_idx = idx
xref_line: str = line[len("xref:") :].strip()
xref_values.add(_extract_ref(xref_line))
existing_xref_curies.add(_extract_ref(xref_line))
xref_entries.append(xref_line)
if start_xref_idx and not line.startswith("xref"):
break
if look_for_xref and not line.strip():
start_xref_idx = def_idx

if id_idx is None:
# term not found
tqdm.write(f"term not found for {node}")
return lines

if xref in xref_values:
if _standardize_curie(xref_curie) in existing_xref_curies:
# term already has this xref, don't modify it
return lines

Expand All @@ -118,15 +125,17 @@ def add_xref(
# there were no existing xrefs, so let's just stick them directly after the definition
start_xref_idx = id_idx + 1

xref_entries.append(xref)
xref_entries = sorted(xref_entries)
xr_idx = xref_entries.index(xref)
line = f'xref: {xref} {{dcterms:contributor="https://orcid.org/{author_orcid}"}} ! {xref_name}'
xref_entries.append(xref_curie)
xref_entries = sorted(xref_entries, key=str.casefold)
xr_idx = xref_entries.index(xref_curie)
# see https://github.com/obophenotype/uberon/pull/2950/files#r1302892427 for explanation of correct
# format for xref contributor information. Names aren't added since they get stripped by protege
line = f'xref: {xref_curie} {{http://purl.org/dc/terms/contributor="https://orcid.org/{author_orcid}"}}\n'
lines.insert(start_xref_idx + xr_idx, line)
return lines


def _extract_ref(xref_line):
def _extract_ref(xref_line: str) -> str:
# remove trailing comments
if "!" in xref_line:
xref_line = xref_line[: xref_line.find("!")].strip()
Expand All @@ -136,7 +145,15 @@ def _extract_ref(xref_line):
# if there are qualifiers, only keep everything up until them
if "{" in xref_line:
return xref_line[: xref_line.find("{")].strip()
return xref_line
return _standardize_curie(xref_line)


def _standardize_curie(curie: str) -> str:
p, i = bioregistry.parse_curie(curie)
if not p or not i:
tqdm.write(f"could not parse existing xref: {curie}")
return curie
return bioregistry.curie_to_str(p, i)


@click.command()
Expand All @@ -148,9 +165,10 @@ def _extract_ref(xref_line):
help="After forking and cloning the version controlled repository for an ontology locally, "
"give the path inside the directory to the ontology's edit file.",
)
def main(prefix: str, path: Path):
@click.option("--upper", is_flag=True)
def main(prefix: str, path: Path, upper: bool):
"""Contribute to an OBO file."""
update_obo(prefix=prefix, path=path)
update_obo(prefix=prefix, path=path, upper=upper)


if __name__ == "__main__":
Expand Down
Loading