greenelab · danich1 · Aug 6, 2020 · Aug 3, 2020 · Aug 3, 2020 · Aug 3, 2020
diff --git a/README.md b/README.md
@@ -4,25 +4,45 @@
 
 [PubTator](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/) and its 2.0 version ([PubTator Central](https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTatorCentral/)) uses text mining to tag PubMed abstracts/artciles with standardized concepts. This repository retrieves and processes PubTator annotations for use in [`greenelab/snorkeling`](https://github.com/greenelab/snorkeling) and elsewhere.
 
-## Environment
+# Get Started
 
-Install the [conda](https://conda.io) environment specified in [`environment.yml`](environment.yml) by running:
+## Set-up Environment
+
+### Conda
+
+1. Install [conda](https://conda.io).
+2. Create the pubtator environment by running:
+
+```sh
+conda create --name pubtator python=3.8
+```
+3. With the environment activated (see step 4), install packages via pip by running the following:
+
+```sh
+pip install -r requirements.txt
+```
+
+4. Activate with `conda activate pubtator`.
+
+### Pip
+
+1. Make sure you have Python version **3.8** installed.
+2. Install packages by running the following:
 
 ```sh
-conda env create --file environment.yml
+pip install -r requirements.txt
 ```
 
-Activate with `conda activate pubtator`.
 
 ## Execution
 
-To download and extract Pubator Central's data (default) run the following:
+To start processing Pubtator or Pubtator Central data, run the following command:
 
 ```sh
-bash execute.sh {email address here}
+python execute.py --config config_files/pubtator_central_config.json
 ```
 
-If the original Pubtator is desired run the above command with the following flag: --pubtator. You do not need to provide your email address when running the first version of Pubtator.
+If the original Pubtator is desired, replace `pubtator_central_config.json` with `pubtator_config.json`. The JSON file contains all the parameters needed to run the pipeline. More information about the JSON file can be found [here](config_files).
 
 ## License
 

diff --git a/config_files/README.md b/config_files/README.md
@@ -0,0 +1,38 @@
+# Configuration Files
+
+## File Description
+
+| File | Description |
+| --- | --- | 
+| [pubtator central config](pubtator_central_config.json) | This is a configuration file for parsing Pubtator Central. |
+| [pubtator config](pubtator_config.json) | This is a configuration file for parsing Pubtator (older version of Pubtator Central). |
+| [tests config](tests_config.json) | This is a configuration file for testing the pubtator system. Feel free to ignore this file. |
+
+## Usage
+
+Each configuration file is in JSON format and contains the parameters for each step of the pubtator pipeline.
+Steps are listed in order of execution: the first step appears at the top of the file and each subsequent step follows directly after it.
+Every step can be skipped, which allows one to resume the pipeline at any step they choose.
+To skip a step, set its `skip` field to `true` instead of `false`.
+**Note: make sure `true` is lowercase, as JSON requires boolean values to be lowercase.**
+
+Example:
+```json
+{
+  "pipeline step 1":{
+    "param1":"param1_value",
+    "param2":"param2_value",
+    "skip":false
+  },
+  "pipeline step 2":{
+    "param1":"param1_value",
+    "param2":"param2_value",
+    "skip":false
+  },
+  "pipeline step 3":{
+    "param1":"param1_value",
+    "param2":"param2_value",
+    "skip":false
+  }
+}
+```
diff --git a/config_files/pubtator_central_config.json b/config_files/pubtator_central_config.json
@@ -0,0 +1,54 @@
+{
+    "repository_download":{
+        "url":"ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTatorCentral/bioconcepts2pubtatorcentral.offset.gz",
+        "download_folder":"download",
+        "skip":false
+    },
+
+    "pubtator_to_xml": {
+        "documents":"download/bioconcepts2pubtatorcentral.offset.gz",
+        "output":"data/pubtator-central-docs.xml.xz",
+        "skip":false
+    },
+
+    "extract_tags":{
+        "input":"data/pubtator-central-docs.xml.xz",
+        "output":"data/pubtator-central-tags.tsv.xz",
+        "skip":false
+    },
+
+    "hetnet_id_extractor":{
+        "input":"data/pubtator-central-tags.tsv.xz",
+        "output":"data/pubtator-central-hetnet-tags.tsv.xz",
+        "skip":false
+    },
+
+    "map_pmid_to_pmcids":{
+        "input":"data/pubtator-central-tags.tsv.xz",
+        "output":"data/pubtator-pmids-to-pmcids.tsv",
+        "debug":false,
+        "skip":false
+    },
+
+    "download_full_text":{
+        "input":"data/pubtator-pmids-to-pmcids.tsv",
+        "document_batch":100,
+        "output":"data/pubtator-central-full-text.xml",
+        "temp_dir":"data/temp",
+        "log_file":"batch_log.tsv",
+        "skip":false
+    },
+
+    "extract_full_text_tags":{
+        "input":"data/pubtator-central-full-text.xml",
+        "output":"data/pubtator-central-full-text-tags.tsv.xz",
+        "skip":false
+    },
+
+    "hetnet_id_extractor_full_text":{
+        "input":"data/pubtator-central-full-text-tags.tsv.xz",
+        "output":"data/pubtator-central-full-hetnet-tags.tsv.xz",
+        "skip":false
+    }
+
+}
diff --git a/config_files/pubtator_config.json b/config_files/pubtator_config.json
@@ -0,0 +1,25 @@
+{
+    "repository_download":{
+        "url":"ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator/bioconcepts2pubtator_offsets.gz",
+        "download_folder":"download",
+        "skip":false
+    },
+
+    "pubtator_to_xml": {
+        "documents":"download/bioconcepts2pubtator_offsets.gz",
+        "output":"data/pubtator-docs.xml.xz",
+        "skip":false
+    },
+
+    "extract_tags":{
+        "input":"data/pubtator-docs.xml.xz",
+        "output":"data/pubtator-tags.tsv.xz",
+        "skip":false
+    },
+
+    "hetnet_id_extractor":{
+        "input":"data/pubtator-tags.tsv.xz",
+        "output":"data/pubtator-hetnet-tags.tsv.xz",
+        "skip":false
+    }
+}
diff --git a/config_files/tests_config.json b/config_files/tests_config.json
@@ -0,0 +1,54 @@
+{
+    "repository_download":{
+        "url":"ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTatorCentral/bioconcepts2pubtatorcentral.offset.gz",
+        "download_folder":"download",
+        "skip":true
+    },
+
+    "pubtator_to_xml": {
+        "documents":"data/example/1-sample-annotations.txt",
+        "output":"data/example/2-sample-docs.xml",
+        "skip":false
+    },
+
+    "extract_tags":{
+        "input":"data/example/2-sample-docs.xml",
+        "output":"data/example/3-sample-tags.tsv",
+        "skip":false
+    },
+
+    "hetnet_id_extractor":{
+        "input":"data/example/3-sample-tags.tsv",
+        "output":"data/example/4-hetnet-tags.tsv",
+        "skip":false
+    },
+
+    "map_pmid_to_pmcids":{
+        "input":"data/example/3-sample-tags.tsv",
+        "output":"data/example/5-sample-pmids-to-pmcids.tsv",
+        "debug":true,
+        "skip":false
+    },
+
+    "download_full_text":{
+        "input":"data/example/5-sample-pmids-to-pmcids.tsv",
+        "document_batch":100,
+        "output":"data/example/6-sample-full-text.xml",
+        "temp_dir":"data/temp",
+        "log_file":"batch_log.tsv",
+        "skip":false
+    },
+
+    "extract_full_text_tags":{
+        "input":"data/example/6-sample-full-text.xml",
+        "output":"data/example/7-sample-full-text-tags.tsv",
+        "skip":false
+    },
+
+    "hetnet_id_extractor_full_text":{
+        "input":"data/example/7-sample-full-text-tags.tsv",
+        "output":"data/example/8-hetnet-full-text-tags.tsv",
+        "skip":false
+    }
+
+}
diff --git a/data/example/2-sample-docs.xml b/data/example/2-sample-docs.xml
@@ -1,7 +1,7 @@
 <?xml version='1.0' encoding='UTF-8'?><!DOCTYPE collection SYSTEM 'BioC.dtd'>
 <collection>
   <source>Pubtator</source>
-  <date>2020/03/02</date>
+  <date>2020/08/03</date>
   <key>Pubtator.key</key>
 <document>
   <id>1560033</id>