Merge pull request #14 from ontodev/extract

Add extract.py
ontodev · Aug 25, 2020 · cd5586a · cd5586a
2 parents 5e37105 + 840cc93
commit cd5586a
Show file tree

Hide file tree

Showing 5 changed files with 375 additions and 1 deletion.
diff --git a/gizmos/extract.py b/gizmos/extract.py
@@ -0,0 +1,274 @@
+import logging
+import sqlite3
+import sys
+
+from argparse import ArgumentParser
+
+"""
+Usage: python3 extract.py -d <sqlite-database> -t <curie> > <ttl-file>
+
+Creates a TTL file containing the term, annotations, and ancestors. TTL is written to stdout.
+You can include more than one `-t <curie>`/`--term <curie>`.
+
+You may also specify multiple CURIEs to extract with `-T <file>`/`--terms <file>`
+where the file contains a list of CURIEs to extract.
+
+You may also specify which annotations you would like to include with
+`-a <curie>`/`--annotation <curie>` or `-A <file>`/`--annotations <file>`
+where the file contains a list of annotation property CURIEs.
+
+Finally, if you don't wish to include the ancestors of the term/terms,
+include the `-n`/`--no-hierarchy` flag.
+
+The sqlite-database must be created by RDFTab (https://github.com/ontodev/rdftab.rs)
+and include the 'statements' and 'prefix' tables.
+
+The CURIEs must use a prefix from the 'prefix' table.
+"""
+
+# Track terms already added to database: add_ancestors() appends every term for which it
+# has inserted an rdf:type statement into the 'extract' table, so that each term is only
+# declared once. This is module-level state shared by all add_ancestors() calls.
+added = []
+
+
+def main():
+    """Parse the command-line arguments and write the extracted module to stdout as Turtle.
+
+    Terms are collected from `-t/--term` (repeatable) and from `-T/--terms <file>` (one
+    CURIE per line); both sources are combined. Annotation properties are collected the
+    same way from `-a/--annotation` and `-A/--annotations <file>`. When no annotation
+    property is given, all annotations are included.
+
+    Exits with status 1 when no term was specified.
+    """
+    p = ArgumentParser()
+    p.add_argument("-d", "--database", required=True, help="SQLite database")
+    p.add_argument("-t", "--term", action="append", help="CURIE of term to extract")
+    p.add_argument(
+        "-T", "--terms", help="File containing CURIES of terms to extract",
+    )
+    p.add_argument(
+        "-a",
+        "--annotation",
+        action="append",
+        help="CURIE of annotation property to include",
+    )
+    p.add_argument(
+        "-A",
+        "--annotations",
+        help="File containing CURIEs of annotation properties to include",
+    )
+    p.add_argument(
+        "-n",
+        "--no-hierarchy",
+        action="store_true",
+        help="If provided, do not create any rdfs:subClassOf statements",
+    )
+    args = p.parse_args()
+
+    # Get required terms (copy, so the list owned by the parsed args is not mutated)
+    terms = list(args.term or [])
+    if args.terms:
+        with open(args.terms, "r") as f:
+            # Blank lines are skipped so they are not treated as (empty) CURIEs
+            terms.extend(line.strip() for line in f if line.strip())
+
+    if not terms:
+        logging.critical("One or more term(s) must be specified with --term or --terms")
+        sys.exit(1)
+
+    # Get optional annotations (otherwise, all annotations are included)
+    annotations = None
+    if args.annotation:
+        # One or more annotations to add
+        annotations = list(args.annotation)
+    if args.annotations:
+        with open(args.annotations, "r") as f:
+            annotations_from_file = [line.strip() for line in f if line.strip()]
+        # Combine with any -a values instead of silently discarding them
+        annotations = (annotations or []) + annotations_from_file
+
+    ttl = "\n".join(
+        extract(args.database, terms, annotations, no_hierarchy=args.no_hierarchy)
+    )
+    sys.stdout.write(ttl)
+
+
+def add_annotations(cur, annotations=None):
+    """Add annotations from the 'statements' table on all subjects in the 'extract' table.
+
+    For every distinct subject currently in 'extract', the statements with a non-NULL
+    value (i.e. literal-valued statements) are copied from 'statements' into 'extract'.
+
+    :param cur: database cursor whose rows can be indexed by column name
+        (e.g. created with dict_factory or sqlite3.Row as the row factory)
+    :param annotations: optional list of predicate CURIEs; when given and non-empty, only
+        statements using one of these predicates are copied
+    """
+    # Values are bound as parameters rather than interpolated into the SQL text, so a
+    # subject or predicate containing a quote cannot break or alter the statement.
+    query = """INSERT INTO extract (stanza, subject, predicate, value, language, datatype)
+                    SELECT DISTINCT
+                      subject AS stanza,
+                      subject,
+                      predicate,
+                      value,
+                      language,
+                      datatype
+                    FROM statements WHERE subject = ? AND value IS NOT NULL"""
+    predicate_params = []
+    if annotations:
+        placeholders = ", ".join("?" for _ in annotations)
+        query += f" AND predicate IN ({placeholders})"
+        predicate_params = list(annotations)
+
+    cur.execute("SELECT DISTINCT subject FROM extract;")
+    # fetchall() before looping: the same cursor is reused for the inserts below
+    for row in cur.fetchall():
+        cur.execute(query, [row["subject"]] + predicate_params)
+
+
+def add_ancestors(cur, term_id):
+    """Add the hierarchy for a term ID starting with that term up to the top-level, assuming
+    that term ID exists in the database.
+
+    The recursive query collects (parent, child) pairs: it is seeded with (term_id, NULL),
+    then follows the rdfs:subClassOf statements found in the stanza of each parent,
+    ignoring blank-node parents (objects starting with '_:'). It also selects the
+    rdfs:subClassOf statements whose object is term_id, i.e. the term's direct subclasses.
+    NOTE(review): pulling in the direct subclasses looks unintended for an "ancestors"
+    query - confirm before changing it.
+
+    Every parent/child not yet recorded in the module-level 'added' list is declared as an
+    owl:Class in 'extract'; each row with both a child and a parent yields one
+    rdfs:subClassOf statement. Because of the seed row, term_id itself is declared even if
+    it has no statements in the database.
+
+    :param cur: database cursor whose rows can be indexed by column name
+        (e.g. created with dict_factory or sqlite3.Row as the row factory)
+    :param term_id: CURIE of the term to start from
+    """
+    global added
+    # The term is bound as a named parameter instead of being interpolated into the SQL
+    # text, so a CURIE containing a quote can neither break nor alter the statement.
+    cur.execute(
+        """
+          WITH RECURSIVE ancestors(parent, child) AS (
+            VALUES (:term, NULL)
+            UNION
+            SELECT object AS parent, subject AS child
+            FROM statements
+            WHERE predicate = 'rdfs:subClassOf'
+              AND object = :term
+            UNION
+            SELECT object AS parent, subject AS child
+            FROM statements, ancestors
+            WHERE ancestors.parent = statements.stanza
+              AND statements.predicate = 'rdfs:subClassOf'
+              AND statements.object NOT LIKE '_:%'
+          )
+          SELECT * FROM ancestors;""",
+        {"term": term_id},
+    )
+
+    declare_class = """INSERT INTO extract (stanza, subject, predicate, object)
+                        VALUES (?, ?, 'rdf:type', 'owl:Class');"""
+    declare_subclass = """INSERT INTO extract (stanza, subject, predicate, object)
+                        VALUES (?, ?, 'rdfs:subClassOf', ?);"""
+
+    # fetchall() before looping: the same cursor is reused for the inserts below
+    for row in cur.fetchall():
+        parent = row["parent"]
+        child = row["child"]
+
+        # Only add rdf:type if it hasn't been added (parent first, then child)
+        for term in (parent, child):
+            if term and term not in added:
+                added.append(term)
+                cur.execute(declare_class, (term, term))
+
+        if child and parent:
+            # Row has child & parent, add subclass statement
+            cur.execute(declare_subclass, (child, child, parent))
+
+
+def add_term(cur, term_id):
+    """Add the class assertion for a term ID, assuming that term ID exists in the database.
+
+    The rdf:type owl:Class statement is only inserted into 'extract' when the 'statements'
+    table has at least one row with the term as its subject; otherwise nothing is added.
+
+    :param cur: database cursor
+    :param term_id: CURIE of the term to declare
+    """
+    # The term is bound as a parameter instead of being interpolated into the SQL text, so
+    # a CURIE containing a quote can neither break nor alter the statements.
+    cur.execute("SELECT 1 FROM statements WHERE subject = ? LIMIT 1;", (term_id,))
+    if cur.fetchone() is not None:
+        cur.execute(
+            """INSERT INTO extract (stanza, subject, predicate, object)
+                    VALUES (?, ?, 'rdf:type', 'owl:Class');""",
+            (term_id, term_id),
+        )
+
+
+def dict_factory(cursor, row):
+    """Row factory for sqlite cursors: return the row as a {column name: value} dict."""
+    # The first item of each cursor.description entry is the column name
+    column_names = (description[0] for description in cursor.description)
+    return dict(zip(column_names, row))
+
+
+def extract(database, terms, annotations, no_hierarchy=False):
+    """Extract terms from the ontology database and return the module as lines of Turtle.
+
+    A temporary 'extract' table is created in the database, filled with the triples we
+    care about, rendered with get_ttl() and finally dropped again (also on error).
+
+    :param database: path to the SQLite database
+    :param terms: list of CURIEs of the terms to extract
+    :param annotations: list of annotation property CURIEs to include; when empty or None,
+        all annotations are included
+    :param no_hierarchy: when True, only the terms themselves are added (if they exist)
+        and no rdfs:subClassOf statements are created
+    :return: list of Turtle lines
+    """
+    # Forget the terms recorded by earlier calls: the 'extract' table is rebuilt from
+    # scratch below, so stale entries would make add_ancestors() skip rdf:type statements.
+    added.clear()
+
+    conn = sqlite3.connect(database)
+    try:
+        # Used as a context manager, the connection commits (or rolls back on error) but
+        # does not close itself - hence the explicit close() in the outer finally.
+        with conn:
+            conn.row_factory = dict_factory
+            cur = conn.cursor()
+            try:
+                # Create the extract table
+                cur.execute("DROP TABLE IF EXISTS extract;")
+                cur.execute(
+                    """CREATE TABLE extract(stanza TEXT,
+                                  subject TEXT,
+                                  predicate TEXT,
+                                  object TEXT,
+                                  value TEXT,
+                                  datatype TEXT,
+                                  language TEXT);"""
+                )
+
+                # Get each term up to the top-level (unless no_hierarchy)
+                if not no_hierarchy:
+                    for t in terms:
+                        add_ancestors(cur, t)
+                else:
+                    # Only add the terms themselves (as long as they exist)
+                    for t in terms:
+                        add_term(cur, t)
+
+                # Add declarations for any annotations used in 'extract'
+                # NOTE(review): at this point only object-valued rows have been inserted
+                # (value is NULL everywhere), so this statement selects nothing. It would
+                # have to run after add_annotations() to emit declarations - confirm the
+                # intended output before moving it.
+                cur.execute(
+                    """INSERT INTO extract (stanza, subject, predicate, object)
+                    SELECT DISTINCT
+                      predicate AS stanza,
+                      predicate AS subject,
+                      'rdf:type',
+                      'owl:AnnotationProperty'
+                    FROM extract WHERE value NOT NULL;"""
+                )
+
+                # Add annotations for all subjects
+                add_annotations(cur, annotations=annotations)
+
+                # Reset row factory (get_ttl reads its rows by position)
+                conn.row_factory = sqlite3.Row
+                cur = conn.cursor()
+                return get_ttl(cur)
+            finally:
+                # Always drop the extract table
+                cur.execute("DROP TABLE IF EXISTS extract;")
+    finally:
+        conn.close()
+
+
+def get_ttl(cur):
+    """Get the 'extract' table as lines of Turtle (the lines are returned as a list).
+
+    The first lines are the @prefix declarations built from the 'prefix' table, followed
+    by one "subject predicate object ." line per distinct row of 'extract'. The object is
+    the row's object if it has one; otherwise it is the escaped literal value with its
+    datatype, else with its language tag, else as a plain string.
+
+    :param cur: database cursor; only the first column of each row is read
+    :return: list of Turtle lines
+    """
+    # Notes on the query:
+    # - String literals are single-quoted. Double-quoted strings are only accepted by
+    #   SQLite as a compatibility quirk that can be disabled.
+    # - Line breaks are matched with char(10)/char(13). A literal newline inside the SQL
+    #   text would also swallow the indentation that follows it in this source file.
+    # - Python turns each '\\' below into a single backslash before SQLite sees it.
+    cur.execute(
+        """WITH literal(value, escaped) AS (
+              SELECT DISTINCT
+                value,
+                replace(
+                  replace(
+                    replace(
+                      replace(value, '\\', '\\\\'),
+                      '"', '\\"'),
+                    char(10), '\\n'),
+                  char(13), '\\r') AS escaped
+              FROM extract
+            )
+            SELECT
+              '@prefix ' || prefix || ': <' || base || '> .'
+            FROM prefix
+            UNION ALL
+            SELECT DISTINCT
+               subject
+            || ' '
+            || predicate
+            || ' '
+            || coalesce(
+                 object,
+                 '"' || escaped || '"^^' || datatype,
+                 '"' || escaped || '"@' || language,
+                 '"' || escaped || '"'
+               )
+            || ' .'
+            FROM extract LEFT JOIN literal ON extract.value = literal.value;"""
+    )
+    lines = []
+    for row in cur.fetchall():
+        line = row[0]
+        if not line:
+            # A NULL component makes the whole concatenation NULL; skip such rows
+            continue
+        # Replace any newline left outside of the escaped literal values
+        line = line.replace("\n", "\\n")
+        lines.append(line)
+
+    return lines
+
+
+# Run the command-line interface only when this file is executed as a script
+if __name__ == "__main__":
+    main()
diff --git a/tests/resources/obi-extract.ttl b/tests/resources/obi-extract.ttl
@@ -0,0 +1,50 @@
+@prefix : <http://purl.obolibrary.org/obo/obi.owl#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@base <http://purl.obolibrary.org/obo/obi.owl> .
+
+
+#################################################################
+#    Classes
+#################################################################
+
+###  http://purl.obolibrary.org/obo/BFO_0000001
+<http://purl.obolibrary.org/obo/BFO_0000001> rdf:type owl:Class ;
+                                             rdfs:subClassOf owl:Thing ;
+                                             rdfs:label "entity"@en .
+
+
+###  http://purl.obolibrary.org/obo/BFO_0000002
+<http://purl.obolibrary.org/obo/BFO_0000002> rdf:type owl:Class ;
+                                             rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000001> ;
+                                             rdfs:label "continuant"@en .
+
+
+###  http://purl.obolibrary.org/obo/BFO_0000004
+<http://purl.obolibrary.org/obo/BFO_0000004> rdf:type owl:Class ;
+                                             rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000002> ;
+                                             rdfs:label "independent continuant"@en .
+
+
+###  http://purl.obolibrary.org/obo/BFO_0000040
+<http://purl.obolibrary.org/obo/BFO_0000040> rdf:type owl:Class ;
+                                             rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000004> ;
+                                             rdfs:label "material entity"@en .
+
+
+###  http://purl.obolibrary.org/obo/OBI_0100046
+<http://purl.obolibrary.org/obo/OBI_0100046> rdf:type owl:Class ;
+                                             rdfs:subClassOf <http://purl.obolibrary.org/obo/OBI_0302729> ;
+                                             rdfs:label "phosphate buffered saline solution"@en .
+
+
+###  http://purl.obolibrary.org/obo/OBI_0302729
+<http://purl.obolibrary.org/obo/OBI_0302729> rdf:type owl:Class ;
+                                             rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000040> ;
+                                             rdfs:label "chemical solution"@en .
+
+
+###  Generated by the OWL API (version 4.5.9.2019-02-01T07:24:44Z) https://github.com/owlcs/owlapi
diff --git a/tests/resources/obi.ttl → tests/resources/obi-tree.ttl b/tests/resources/obi.ttl → tests/resources/obi-tree.ttl
diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -0,0 +1,50 @@
+import gizmos.extract
+import sys
+
+from rdflib import Graph, Literal, URIRef
+
+
+def test_extract():
+    """Extract OBI:0100046 (with rdfs:label only) from the test database and compare the
+    result, in both directions, with tests/resources/obi-extract.ttl.
+
+    Every difference is printed; the process exits with status 1 if there was any.
+    """
+    db = "tests/resources/obi.db"
+    ttl_lines = gizmos.extract.extract(db, ["OBI:0100046"], ["rdfs:label"])
+
+    actual = Graph()
+    actual.parse(data="\n".join(ttl_lines), format="turtle")
+
+    expected = Graph()
+    expected.parse("tests/resources/obi-extract.ttl", format="turtle")
+
+    def contains(g, s, p, o):
+        """True if g has the triple with o taken as an @en literal or, failing that, an IRI."""
+        if (s, URIRef(p), Literal(str(o), lang="en")) in g:
+            return True
+        return (s, URIRef(p), URIRef(o)) in g
+
+    ok = True
+
+    # Check that no triples are missing
+    for s in expected.subjects():
+        for p, o in expected.predicate_objects(s):
+            if not contains(actual, s, p, o):
+                ok = False
+                print(f"Missing '{s} {p} {o}'")
+
+    # Check that no triples have been added
+    for s in actual.subjects():
+        # Statements about owl:Thing are not compared
+        if str(s) == "http://www.w3.org/2002/07/owl#Thing":
+            continue
+        for p, o in actual.predicate_objects(s):
+            if not contains(expected, s, p, o):
+                ok = False
+                print(f"Added '{s} {p} {o}'")
+
+    if not ok:
+        sys.exit(1)
+
+
+# Allow running this test directly as a script, without a test runner
+if __name__ == '__main__':
+    test_extract()
diff --git a/tests/test_tree.py b/tests/test_tree.py
@@ -56,7 +56,7 @@ def test_tree():
     # Read in the expected output to compare
     success = True
     expected_graph = Graph()
-    expected_graph.parse("tests/resources/obi.ttl", format="turtle")
+    expected_graph.parse("tests/resources/obi-tree.ttl", format="turtle")
 
     # Check that no triples are missing
     subjects = expected_graph.subjects()