Skip to content

Commit

Permalink
Merge pull request #14 from ontodev/extract
Browse files Browse the repository at this point in the history
Add extract.py
  • Loading branch information
jamesaoverton committed Aug 25, 2020
2 parents 5e37105 + 840cc93 commit cd5586a
Show file tree
Hide file tree
Showing 5 changed files with 375 additions and 1 deletion.
274 changes: 274 additions & 0 deletions gizmos/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
import logging
import sqlite3
import sys

from argparse import ArgumentParser

"""
Usage: python3 extract.py -d <sqlite-database> -t <curie> > <ttl-file>
Creates a TTL file containing the term, annotations, and ancestors. TTL is written to stdout.
You can include more than one `-t <curie>`/`--term <curie>`.
You may also specify multiple CURIEs to extract with `-T <file>`/`--terms <file>`
where the file contains a list of CURIEs to extract.
You may also specify which annotations you would like to include with
`-a <curie>`/`--annotation <curie>` or `-A <file>`/`--annotations <file>`
where the file contains a list of annotation property CURIEs.
Finally, if you don't wish to include the ancestors of the term/terms,
include the `-n`/`--no-hierarchy` flag.
The sqlite-database must be created by RDFTab (https://github.com/ontodev/rdftab.rs)
and include 'statements' and 'prefix' tables.
The CURIEs must use a prefix from the 'prefix' table.
"""

# Module-level record of the term IDs that add_ancestors() has already declared as
# owl:Class in the 'extract' table. add_ancestors() consults and appends to this
# list so that each class declaration is inserted only once.
added = []


def _read_curies(path):
    """Return the non-blank, whitespace-stripped lines of the file at `path`."""
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        return [curie for curie in stripped if curie]


def main():
    """Parse the command line, run the extraction, and write Turtle to stdout.

    Terms are collected from every `-t/--term` option plus the lines of the
    `-T/--terms` file; at least one term is required, otherwise a critical
    message is logged and the process exits with status 1.

    Annotation properties are collected the same way from `-a/--annotation`
    and `-A/--annotations`. When none are given, no annotation filter is
    applied (all annotations are included).
    """
    p = ArgumentParser()
    p.add_argument("-d", "--database", required=True, help="SQLite database")
    p.add_argument("-t", "--term", action="append", help="CURIE of term to extract")
    p.add_argument(
        "-T", "--terms", help="File containing CURIES of terms to extract",
    )
    p.add_argument(
        "-a",
        "--annotation",
        action="append",
        help="CURIE of annotation property to include",
    )
    p.add_argument(
        "-A",
        "--annotations",
        help="File containing CURIEs of annotation properties to include",
    )
    p.add_argument(
        "-n",
        "--no-hierarchy",
        action="store_true",
        help="If provided, do not create any rdfs:subClassOf statements",
    )
    args = p.parse_args()

    # Get required terms (copy so the parsed namespace is left untouched)
    terms = list(args.term or [])
    if args.terms:
        # Blank lines are skipped: an empty CURIE would produce invalid Turtle
        terms.extend(_read_curies(args.terms))

    if not terms:
        logging.critical("One or more term(s) must be specified with --term or --terms")
        sys.exit(1)

    # Get optional annotations (otherwise, all annotations are included).
    # Command-line and file-based annotations are combined, mirroring the terms.
    annotations = list(args.annotation or [])
    if args.annotations:
        annotations.extend(_read_curies(args.annotations))

    ttl = "\n".join(
        extract(
            args.database,
            terms,
            annotations or None,
            no_hierarchy=args.no_hierarchy,
        )
    )
    sys.stdout.write(ttl)


def add_annotations(cur, annotations=None):
    """Add annotations from the 'statements' table on all subjects in the 'extract' table.

    For every distinct subject currently in 'extract', the rows of 'statements'
    with that subject and a non-NULL value are copied into 'extract'.

    :param cur: database cursor whose rows can be indexed by column name
                (the subject is read as ``row["subject"]``)
    :param annotations: optional list of predicate CURIEs; when given and
                        non-empty, only statements with one of these predicates
                        are copied, otherwise all of them are
    """
    # Values are bound as parameters rather than interpolated, so subjects and
    # predicates containing quotes can neither break nor alter the query.
    query = """INSERT INTO extract (stanza, subject, predicate, value, language, datatype)
               SELECT DISTINCT
                   subject AS stanza,
                   subject,
                   predicate,
                   value,
                   language,
                   datatype
               FROM statements WHERE subject = ? AND value NOT NULL"""
    predicate_params = []
    if annotations:
        predicate_params = list(annotations)
        placeholders = ", ".join("?" for _ in predicate_params)
        query += f" AND predicate IN ({placeholders})"

    # Materialise the subjects first: the cursor is reused for the inserts below
    cur.execute("SELECT DISTINCT subject FROM extract;")
    for row in cur.fetchall():
        cur.execute(query, [row["subject"], *predicate_params])


def add_ancestors(cur, term_id):
    """Add the hierarchy for a term ID starting with that term up to the top-level, assuming that
    term ID exists in the database.

    The recursive query is seeded with the term itself and with any
    rdfs:subClassOf statements whose object is the term, then repeatedly follows
    rdfs:subClassOf statements from each parent's stanza, skipping blank-node
    objects. For every (parent, child) row found, an rdf:type owl:Class
    declaration is inserted for each not-yet-declared term, plus the
    rdfs:subClassOf statement linking child to parent.

    :param cur: database cursor whose rows can be indexed by column name
    :param term_id: CURIE of the term to start from
    """
    global added
    # The term is bound as a parameter (never interpolated) so quotes in a CURIE
    # cannot break the query. Blank nodes are filtered with substr() rather than
    # LIKE '_:%', because '_' is a single-character wildcard in LIKE and would
    # also exclude CURIEs with a one-letter prefix.
    cur.execute(
        """
        WITH RECURSIVE ancestors(parent, child) AS (
            VALUES (?, NULL)
            UNION
            SELECT object AS parent, subject AS child
            FROM statements
            WHERE predicate = 'rdfs:subClassOf'
              AND object = ?
            UNION
            SELECT object AS parent, subject AS child
            FROM statements, ancestors
            WHERE ancestors.parent = statements.stanza
              AND statements.predicate = 'rdfs:subClassOf'
              AND substr(statements.object, 1, 2) != '_:'
        )
        SELECT * FROM ancestors;""",
        (term_id, term_id),
    )

    declare_class = """INSERT INTO extract (stanza, subject, predicate, object)
                       VALUES (?, ?, 'rdf:type', 'owl:Class');"""
    declare_subclass = """INSERT INTO extract (stanza, subject, predicate, object)
                          VALUES (?, ?, 'rdfs:subClassOf', ?);"""

    for row in cur.fetchall():
        parent = row["parent"]
        child = row["child"]

        for term in (parent, child):
            if term and term not in added:
                # Only add rdf:type if it hasn't been added
                added.append(term)
                cur.execute(declare_class, (term, term))

        if child and parent:
            # Row has child & parent, add subclass statement
            cur.execute(declare_subclass, (child, child, parent))


def add_term(cur, term_id):
    """Add the class assertion for a term ID, assuming that term ID exists in the database.

    If no row of 'statements' has the term as its subject, nothing is inserted.

    :param cur: database cursor
    :param term_id: CURIE of the term to declare as an owl:Class in 'extract'
    """
    # Bind the CURIE as a parameter so quotes in it cannot break the query;
    # only existence matters, so a single constant column is enough.
    cur.execute("SELECT 1 FROM statements WHERE subject = ? LIMIT 1;", (term_id,))
    if cur.fetchone():
        cur.execute(
            """INSERT INTO extract (stanza, subject, predicate, object)
               VALUES (?, ?, 'rdf:type', 'owl:Class');""",
            (term_id, term_id),
        )


def dict_factory(cursor, row):
    """Row factory for sqlite cursors: map each column name to its value in `row`."""
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }


def extract(database, terms, annotations, no_hierarchy=False):
    """Extract terms from the ontology database and return the module as lines of Turtle.

    A scratch table named 'extract' is created in the database, filled with the
    triples of interest, rendered to Turtle, and always dropped again.

    :param database: path to the SQLite database
    :param terms: list of term CURIEs to extract
    :param annotations: list of annotation property CURIEs to include, or None
                        to include all annotations
    :param no_hierarchy: if True, only the terms themselves are added (via
                         add_term); otherwise each term's ancestors are added too
    :return: list of Turtle lines
    """
    # add_ancestors() remembers which classes it has declared in the module-level
    # 'added' list. The 'extract' table is rebuilt on every call, so that record
    # must start empty or repeated calls would omit the rdf:type declarations.
    added.clear()

    # Create a new table (extract) and copy the triples we care about
    # Then write the triples from that table to the output file
    conn = sqlite3.connect(database)
    try:
        # The connection context manager only commits or rolls back the
        # transaction; closing is done explicitly in the outer 'finally'.
        with conn:
            conn.row_factory = dict_factory
            cur = conn.cursor()
            try:
                # Create the extract table
                cur.execute("DROP TABLE IF EXISTS extract;")
                cur.execute(
                    """CREATE TABLE extract(stanza TEXT,
                                            subject TEXT,
                                            predicate TEXT,
                                            object TEXT,
                                            value TEXT,
                                            datatype TEXT,
                                            language TEXT);"""
                )

                # Get each term up to the top-level (unless no_hierarchy)
                if not no_hierarchy:
                    for t in terms:
                        add_ancestors(cur, t)
                else:
                    # Only add the terms themselves (as long as they exist)
                    for t in terms:
                        add_term(cur, t)

                # Add declarations for any annotations used in 'extract'
                # NOTE(review): this runs before add_annotations(), when the rows
                # inserted so far only populate 'object', so it appears to select
                # nothing - confirm whether it was meant to run afterwards.
                cur.execute(
                    """INSERT INTO extract (stanza, subject, predicate, object)
                       SELECT DISTINCT
                           predicate AS stanza,
                           predicate AS subject,
                           'rdf:type',
                           'owl:AnnotationProperty'
                       FROM extract WHERE value NOT NULL;"""
                )

                # Add annotations for all subjects
                add_annotations(cur, annotations=annotations)

                # Reset row factory (get_ttl reads rows by position)
                conn.row_factory = sqlite3.Row
                cur = conn.cursor()
                return get_ttl(cur)
            finally:
                # Always drop the extract table
                cur.execute("DROP TABLE IF EXISTS extract;")
    finally:
        conn.close()


def get_ttl(cur):
    """Get the 'extract' table as lines of Turtle (the lines are returned as a list).

    The result contains an ``@prefix`` line for each row of the 'prefix' table
    and one line per distinct triple in 'extract'. A triple's object is taken
    from the 'object' column when present; otherwise the 'value' column is
    rendered as a quoted literal with its datatype, else its language tag, else
    plain. Rows that cannot be rendered (the concatenation is NULL) are skipped.

    :param cur: database cursor whose rows can be indexed by position
    :return: list of Turtle lines
    """
    # Backslashes are escaped first so that the backslashes introduced by the
    # later replacements are not doubled. Only standard single-quoted SQL string
    # literals are used, so the query does not depend on SQLite's optional
    # double-quoted-string compatibility mode.
    cur.execute(
        r"""WITH literal(value, escaped) AS (
              SELECT DISTINCT
                value,
                replace(
                  replace(
                    replace(
                      replace(value, '\', '\\'),
                      '"', '\"'),
                    char(10), '\n'),
                  char(13), '\r') AS escaped
              FROM extract
            )
            SELECT
              '@prefix ' || prefix || ': <' || base || '> .'
            FROM prefix
            UNION ALL
            SELECT DISTINCT
              subject
              || ' '
              || predicate
              || ' '
              || coalesce(
                   object,
                   '"' || escaped || '"^^' || datatype,
                   '"' || escaped || '"@' || language,
                   '"' || escaped || '"'
                 )
              || ' .'
            FROM extract LEFT JOIN literal ON extract.value = literal.value;"""
    )
    lines = []
    for row in cur.fetchall():
        line = row[0]
        if not line:
            continue
        # Defensive: keep every statement on one physical line even if a
        # non-literal column contains a line break
        lines.append(line.replace("\n", "\\n").replace("\r", "\\r"))

    return lines


# Run the command-line interface when this file is executed as a script
if __name__ == "__main__":
    main()
50 changes: 50 additions & 0 deletions tests/resources/obi-extract.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
@prefix : <http://purl.obolibrary.org/obo/obi.owl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@base <http://purl.obolibrary.org/obo/obi.owl> .


#################################################################
# Classes
#################################################################

### http://purl.obolibrary.org/obo/BFO_0000001
<http://purl.obolibrary.org/obo/BFO_0000001> rdf:type owl:Class ;
rdfs:subClassOf owl:Thing ;
rdfs:label "entity"@en .


### http://purl.obolibrary.org/obo/BFO_0000002
<http://purl.obolibrary.org/obo/BFO_0000002> rdf:type owl:Class ;
rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000001> ;
rdfs:label "continuant"@en .


### http://purl.obolibrary.org/obo/BFO_0000004
<http://purl.obolibrary.org/obo/BFO_0000004> rdf:type owl:Class ;
rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000002> ;
rdfs:label "independent continuant"@en .


### http://purl.obolibrary.org/obo/BFO_0000040
<http://purl.obolibrary.org/obo/BFO_0000040> rdf:type owl:Class ;
rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000004> ;
rdfs:label "material entity"@en .


### http://purl.obolibrary.org/obo/OBI_0100046
<http://purl.obolibrary.org/obo/OBI_0100046> rdf:type owl:Class ;
rdfs:subClassOf <http://purl.obolibrary.org/obo/OBI_0302729> ;
rdfs:label "phosphate buffered saline solution"@en .


### http://purl.obolibrary.org/obo/OBI_0302729
<http://purl.obolibrary.org/obo/OBI_0302729> rdf:type owl:Class ;
rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000040> ;
rdfs:label "chemical solution"@en .


### Generated by the OWL API (version 4.5.9.2019-02-01T07:24:44Z) https://github.com/owlcs/owlapi
File renamed without changes.
50 changes: 50 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import gizmos.extract
import sys

from rdflib import Graph, Literal, URIRef


def test_extract():
    """Extract OBI:0100046 with its rdfs:label annotations and compare with the fixture.

    The extracted Turtle is parsed into a graph and checked in both directions
    against tests/resources/obi-extract.ttl. Every discrepancy is printed, and
    the process exits with status 1 if any was found.
    """

    def _absent(target, subject, p, o):
        """Return True when `target` has the triple neither as an English literal nor as an IRI."""
        if (subject, URIRef(p), Literal(str(o), lang="en")) in target:
            return False
        return (subject, URIRef(p), URIRef(o)) not in target

    # Build the actual graph from the extracted Turtle lines
    turtle_lines = gizmos.extract.extract(
        "tests/resources/obi.db", ["OBI:0100046"], ["rdfs:label"]
    )
    graph = Graph()
    graph.parse(data="\n".join(turtle_lines), format="turtle")

    # Load the expected graph from the fixture
    expected_graph = Graph()
    expected_graph.parse("tests/resources/obi-extract.ttl", format="turtle")

    success = True

    # Check that no triples are missing
    for subject in expected_graph.subjects():
        for p, o in expected_graph.predicate_objects(subject):
            if _absent(graph, subject, p, o):
                success = False
                print(f"Missing '{subject} {p} {o}'")

    # Check that no triples have been added
    for subject in graph.subjects():
        if str(subject) == "http://www.w3.org/2002/07/owl#Thing":
            continue
        for p, o in graph.predicate_objects(subject):
            if _absent(expected_graph, subject, p, o):
                success = False
                print(f"Added '{subject} {p} {o}'")

    if success:
        return
    sys.exit(1)


# Allow running this test directly as a script, outside of a test runner
if __name__ == '__main__':
    test_extract()
2 changes: 1 addition & 1 deletion tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_tree():
# Read in the expected output to compare
success = True
expected_graph = Graph()
expected_graph.parse("tests/resources/obi.ttl", format="turtle")
expected_graph.parse("tests/resources/obi-tree.ttl", format="turtle")

# Check that no triples are missing
subjects = expected_graph.subjects()
Expand Down

0 comments on commit cd5586a

Please sign in to comment.