diff --git a/making_annotator_modules.html b/making_annotator_modules.html index 795b4ed..345049f 100644 --- a/making_annotator_modules.html +++ b/making_annotator_modules.html @@ -106,7 +106,9 @@

Annotator Basic

Creating an annotator module requires the following:

  1. Creating a new annotator skeleton using oc new annotator <modulename>
  2. -
  3. Loading an annotator file into a SQLite database using
  4. +
  5. Loading an annotator file into a SQLite database (<modulename>.sqlite) using sqlite3
  6. +
  7. Mapping the annotator sqlite file in the <modulename>.py file
  8. +
  9. Customizing the output using the <modulename>.yml file
@@ -210,28 +212,28 @@

Creating our Table

Before we load our data in, we need to create our table.

create table "sift" ('chrom' TEXT, 'pos' INT, 
CREATE TABLE "sift" ('chrom' TEXT, 'pos' INT, 
                      'ref' TEXT, 'alt' TEXT, 
                      'score' REAL, 'nseq' INT);
 CREATE INDEX main_index on sift (chrom, pos, ref, alt);

Now that the table is created, we can load our sift.csv file. We need to change the mode to csv.

.mode csv

Now we can import our data using the .import dot command. Because our sift.csv


Now we can import our data using the .import dot command. Because our sift.csv has a header row, we need to skip it, so we use the --skip 1 argument.

.import --skip 1 sift.csv sift

We can check that we loaded in our data correctly by using the .schema command


We can check that we loaded in our data correctly by using the .schema command and a SELECT * query:

CREATE TABLE sift (chrom text, pos int, ref text, alt text, score real, nseq int);
 CREATE INDEX main_index on sift (chrom, pos, ref, alt);


.mode box
-select * from sift limit 5;
 │ chrom │   pos    │ ref │ alt │ score │ nseq │
@@ -241,17 +243,18 @@ 

Creating our Table

│ chr17 │ 43045682 │ T │ C │ 0.0 │ 7 │ │ chr17 │ 43045682 │ T │ G │ 0.0 │ 7 │ └───────┴──────────┴─────┴─────┴───────┴──────┘
.mode tabs
-create table "hpo" ('ncbi_gene_id' int, "gene_symbol" TEXT, 
-                    "hpo_id" TEXT, "hpo_name" TEXT, "frequency" TEXT, 
-                    "disease_id" TEXT);
-.import --skip 1 genes_to_phenotype.txt hpo
-.mode box
-select * from hpo limit 10;

When we’re done, we can

+ +
+Loading VCF Files as Annotations +
.mode tabs
 create table "vcf" ("chrom" TEXT, "pos" INT, "id" TEXT, 
                     "ref" TEXT, "alt" TEXT, "qual" INT, 
@@ -261,10 +264,13 @@ 

Creating our Table

.mode box
.schema
select * from vcf limit 10;

Fill out sift_annotator.py


Fill out sift.py


Now that our data is loaded into our .sqlite file, we need to set up our mapping. If we look in sift.py, we’ll see there are stubs for three methods: setup(), annotate(), and cleanup():

@@ -280,9 +286,70 @@

Fill out

cat /Users/Shared/open-cravat/modules/annotators/sift/sift.py

annotate() function


This is what they look like:

cat /Users/Shared/open-cravat/modules/annotators/sift/sift.py
import sys
+from cravat import BaseAnnotator
+from cravat import InvalidData
+import sqlite3
+import os
+class CravatAnnotator(BaseAnnotator):
+    def setup(self): 
+        """
+        Set up data sources. 
+        Cravat will automatically make a connection to 
+        data/example_annotator.sqlite using the sqlite3 python module. The 
+        sqlite3.Connection object is stored as self.dbconn, and the 
+        sqlite3.Cursor object is stored as self.cursor.
+        """
+        pass
+    def annotate(self, input_data, secondary_data=None):
+        """
+        The annotator parent class will call annotate for each line of the 
+        input file. It takes one positional argument, input_data, and one
+        keyword argument, secondary_data.
+        input_data is a dictionary containing the data from the current input 
+        line. The keys depend on what what file is used as the input, which can 
+        be changed in the module_name.yml file. 
+        Variant level includes the following keys: 
+            ('uid', 'chrom', 'pos', 'ref_base', 'alt_base')
+        Variant level crx files expand the key set to include:
+            ('hugo', 'transcript','so','all_mappings')
+        Gene level files include
+            ('hugo', 'num_variants', 'so', 'all_so')
+        secondary_data is used to allow an annotator to access the output of
+        other annotators. It is described in more detail in the CRAVAT 
+        documentation.
+        annotate should return a dictionary with keys matching the column names
+        defined in example_annotator.yml. Extra column names will be ignored, 
+        and absent column names will be filled with None. Check your output
+        carefully to ensure that your data is ending up where you intend.
+        """
+        out = {}
+        out['placeholder_annotation'] = 'placeholder value'
+        return out
+    def cleanup(self):
+        """
+        cleanup is called after every input line has been processed. Use it to
+        close database connections and file handlers. Automatically opened
+        database connections are also automatically closed.
+        """
+        pass
+if __name__ == '__main__':
+    annotator = CravatAnnotator(sys.argv)
+    annotator.run()

We will focus on the annotate() method first.


annotate() method


Our annotate() method is where we

@@ -312,41 +379,42 @@


chrom = input_data["chrom"]
-pos = input_data["pos"]
-query = (f'select score, nseq from sift' \
-          'where chrom="{chrom}"'\
-          'and pos="{pos}"')
-result = self.cursor.fetchone()
chrom = input_data["chrom"]
+pos = input_data["pos"]
+query = (f'select score, nseq from sift' \
+          'where chrom="{chrom}"'\
+          'and pos="{pos}"')
+result = self.cursor.fetchone()
def annotate(self, input_data, secondary_data=None):
-    chrom = input_data['chrom']
-    pos = input_data['pos']
-    ref_base = input_data['ref_base']
-    alt_base = input_data['alt_base']
-    query = f'select score, nseq from sift where chrom="{chrom}" and pos={pos} and ref="{ref_base}" and alt="{alt_base}";'
-    self.cursor.execute(query)
-    result = self.cursor.fetchone()
-    if result is not None:
-        score = result[0]
-        num_seq = result[1]
-        if score <= 0.05:
-            prediction = 'Damaging'
-        else:
-            prediction = 'Tolerated'
-        return {
-            'score': score,
-            'seq_count': num_seq,
-            'prediction': prediction,
-        }
-    else:
-        return None
def annotate(self, input_data, secondary_data=None):
+    chrom = input_data['chrom']
+    pos = input_data['pos']
+    ref_base = input_data['ref_base']
+    alt_base = input_data['alt_base']
+    query = f'select score, nseq from sift where chrom="{chrom}" and pos={pos} and ref="{ref_base}" and alt="{alt_base}";'
+    self.cursor.execute(query)
+    result = self.cursor.fetchone()
+    if result is not None:
+        score = result[0]
+        num_seq = result[1]
+        if score <= 0.05:
+            prediction = 'Damaging'
+        else:
+            prediction = 'Tolerated'
+        return {
+            'score': score,
+            'seq_count': num_seq,
+            'prediction': prediction,
+        }
+    else:
+        return None

Configure sift_annotator.yml


Now that our annotate() method is filled in, we need to configure how our annotations will be displayed.

#| eval: false
 cat /Users/Shared/open-cravat/modules/annotators/sift_annotator/sift_annotator.yml
# 'title' is the name of the module that will be displayed to the user
diff --git a/making_annotator_modules.qmd b/making_annotator_modules.qmd
index 46afb13..b06ad10 100644
--- a/making_annotator_modules.qmd
+++ b/making_annotator_modules.qmd
@@ -10,7 +10,9 @@ format: html
 Creating an annotator module requires the following:
 1. Creating a new annotator skeleton using `oc new annotator <modulename>`
-2. Loading an annotator file into a SQLite database using 
+2. Loading an annotator file into a SQLite database (`<modulename>.sqlite`) using `sqlite3`
+3. Mapping the annotator sqlite file in the `<modulename>.py` file
+4. Customizing the output using the `<modulename>.yml` file
 flowchart LR