From 97fe0a58b740870d50d358dd2b0284e90295958b Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Thu, 19 May 2022 01:30:29 +0800
Subject: [PATCH 1/6] n2c2_ commit

---
 biodatasets/n2c2_2012/n2c2_2012.py | 490 +++++++++++++++++++++++++++++
 1 file changed, 490 insertions(+)
 create mode 100644 biodatasets/n2c2_2012/n2c2_2012.py

diff --git a/biodatasets/n2c2_2012/n2c2_2012.py b/biodatasets/n2c2_2012/n2c2_2012.py
new file mode 100644
index 00000000..1248574a
--- /dev/null
+++ b/biodatasets/n2c2_2012/n2c2_2012.py
@@ -0,0 +1,490 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 2012 temporal relation dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of one training archive file and one annotated test archive file:
+
+* 2012-07-15.original-annotation.release.tar.gz (complete training dataset)
+* 2012-08-23.test-data.groundtruth.tar.gz (annotated, complete test dataset)
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script reads the archive files
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+NOTE: The following XML files are not well formed and have been excluded from
+the dataset: "23.xml", "53.xml", "143.xml", "152.xml", "272.xml", "382.xml", "397.xml",
+"422.xml", "527.xml", "547.xml", "627.xml", "687.xml", "802.xml", "807.xml".
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = kb
+"""
+
+import os
+import tarfile
+from collections import defaultdict, OrderedDict
+import xmltodict
+import json
+from typing import List, Tuple, Dict
+
+import datasets
+from datasets import Features, Value, Sequence, ClassLabel
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{sun2013evaluating,
+    author = {
+        Sun, Weiyi and
+        Rumshisky, Anna and
+        Uzuner, Ozlem},
+    title = {Evaluating temporal relations in clinical text: 2012 i2b2 Challenge},
+    journal = {Journal of the American Medical Informatics Association},
+    volume = {20},
+    number = {5},
+    pages = {806-813},
+    year = {2013},
+    month = {09},
+    url = {https://doi.org/10.1136/amiajnl-2013-001628},
+    doi = {10.1136/amiajnl-2013-001628},
+    eprint = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3756273/pdf/amiajnl-2013-001628.pdf}
+}
+"""
+
+_DATASETNAME = "n2c2_2012"
+
+_DESCRIPTION = """\
+This dataset is designed for the 2012 i2b2 temporal relations challenge task.
+
+The text annotated for this challenge comes from de-identified discharge summaries.
The goal of +the annotation is to mark up temporal information present in clinical text in order to enable +reasoning and queries over the timeline of clinically relevant events for each patient. + +This annotation involves marking up three kinds of information: +1) events, +2) temporal expressions, and +3) temporal relations between events and temporal expressions. + +The latter would involve: +1) anchoring events to available temporal expressions, and +2) identifying temporal relations between events. + +The first task is to identify all clinically relevant events and situations, including symptoms, +tests, procedures, and other occurrences. The second task is to identify temporal expressions, +which include all expressions related to time, such as dates, times, frequencies, and durations. +Events and temporal expressions have a number of attributes (such as type of event or calendar +value of the temporal expression) that need to be annotated. The final task is to record the +temporal relations (e.g. before, after, simultaneous, etc.) that hold between different events or +between events and temporal expressions. + +""" + +_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" + +_LICENSE = "External Data User Agreement" + +_SUPPORTED_TASKS = [Tasks.EVENT_EXTRACTION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +def _read_tar_gz_train_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: # corrputed XML files + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(),key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + +def _read_tar_gz_test_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + if member.name.startswith("ground_truth/merged_xml"): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: #corrupted XML files + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + +def _get_events_from_sample(sample_id, sample): + events = [] + for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): + + evs = { + "id": event.get("@id",""), + "type": event.get("@type",""), + "trigger": { + "text": [event.get("@text","")], + "offsets": [(int(event.get("@start","")), int(event.get("@end","")))], + }, + "arguments": [ 
+ { + "role": [], + "ref_id": [], + }, + ], + } + events.append(evs) + return events + +def _get_entities_from_sample(sample_id, sample): + entities = [] + for idx, timex3 in enumerate(sample.get("TAGS","").get("TIMEX3","")): + + entity = { + "id": timex3.get("@id",""), + "type": timex3.get("@type",""), + "offsets": [(int(timex3.get("@start","")), int(timex3.get("@end","")))], + "text": [timex3.get("@text","")], + "normalized": [], + } + + entities.append(entity) + + return entities + +def _get_relations_from_sample(sample_id, sample): + + relations = [] + for idx, tlink in enumerate(sample.get("TAGS").get("TLINK")): + + rel = { + "id": tlink.get("@id"), + "type": tlink.get("@type"), + "arg1_id": tlink.get("@fromID"), + "arg2_id": tlink.get("@toID"), + "normalized": [], + } + + relations.append(rel) + + return relations + +def _get_admission_from_sample(sample_id, sample): + + admission = {} + + # When admission information was missing, an empty placeholder was added with id S0 + if sample.get("TAGS","").get("SECTIME","") == "": + admission = { + "id": "S0", + "type": "ADMISSION", + "text": [], + "offsets": [], + } + + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "ADMISSION": + admission = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "ADMISSION": + admission = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + return admission + +def _get_discharge_from_sample(sample_id, sample): + + discharge = {} + + # When discharge information was missing, an empty placeholder was added with id S1 + if sample.get("TAGS","").get("SECTIME","") == "": + discharge = { + "id": "S1", + "type": "DISCHARGE", + "text": [], + "offsets": [], + } + + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "DISCHARGE": + discharge = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "DISCHARGE": + discharge = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + return discharge + + +class N2C22012TempRelDataset(datasets.GeneratorBasedBuilder): + """n2c2 2012 temporal relations challenge""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', 
data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + BUILDER_CONFIGS = [ + BigBioConfig( + name="n2c2_2012_source", + version=SOURCE_VERSION, + description="n2c2_2012 source schema", + schema="source", + subset_id="n2c2_2012", + ), + BigBioConfig( + name="n2c2_2012_bigbio_kb", + version=BIGBIO_VERSION, + description="n2c2_2012 BigBio schema", + schema="bigbio_kb", + subset_id="n2c2_2012", + ), + ] + + DEFAULT_CONFIG_NAME = "n2c2_2012_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + features = Features( + { + "sample_id": Value("string"), + "text": Value("string"), + "tags":{ + "EVENT": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@modality": Value("string"), + "@polarity": Value("string"), + "@type": Value("string"), + }), + "TIMEX3": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@type": Value("string"), + "@val": Value("string"), + "@mod": Value("string"), + }), + "TLINK": Sequence({"@id": Value("string"), + "@fromID": Value("string"), + "@fromText": Value("string"), + "@toID": Value("string"), + "@toText": Value("string"), + "@type": Value("string"), + }), + "SECTIME": Sequence({"@id": Value("string"), + "@start": Value("string"), + "@end": Value("string"), + "@text": Value("string"), + "@type": Value("string"), + "@dvalue": Value("string"), + }), + } + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. 
+ + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "data_dir": data_dir, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_dir": data_dir, + "split": "test", + }, + ), + ] + + @staticmethod + def _get_source_sample(sample_id, sample): + if sample.get("TAGS","").get("SECTIME","") == "": + return { + "sample_id": sample_id, + "text": sample.get("TEXT",""), + "tags":{ + "EVENT": sample.get("TAGS","").get("EVENT",""), + "TIMEX3": sample.get("TAGS","").get("TIMEX3",""), + "TLINK": sample.get("TAGS","").get("TLINK",""), + "SECTIME": [], + } + } + else: + return { + "sample_id": sample_id, + "text": sample.get("TEXT",""), + "tags":{ + "EVENT": sample.get("TAGS","").get("EVENT",""), + "TIMEX3": sample.get("TAGS","").get("TIMEX3",""), + "TLINK": sample.get("TAGS","").get("TLINK",""), + "SECTIME": sample.get("TAGS","").get("SECTIME",""), + } + } + + @staticmethod + def _get_bigbio_sample(sample_id, sample): + + passage_text = sample.get("TEXT","") + events = _get_events_from_sample(sample_id, sample) + entities = _get_entities_from_sample(sample_id, sample) + relations = _get_relations_from_sample(sample_id, sample) + admission = _get_admission_from_sample(sample_id, sample) + discharge = _get_discharge_from_sample(sample_id, sample) + + return { + "id": sample_id, + "document_id": sample_id, + "passages": [ + { + "id": f"{sample_id}-full-passage", + "type": "Clinical Narrative Temporal Annotation", + "text": [passage_text], + "offsets": [(0, len(passage_text))], + }, + admission, + discharge, + ], + "events": events, + "entities": entities, + "relations": relations, + "coreferences": [], + } + + + def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if split == "train": + _id = 0 + + file_path = os.path.join(data_dir, "2012-07-15.original-annotation.release.tar.gz") + samples = _read_tar_gz_train_(file_path) + for sample_id, sample in samples.items(): + if self.config.schema == "source": + yield _id, self._get_source_sample(sample_id, sample) + elif self.config.schema == "bigbio_kb": + yield _id, self._get_bigbio_sample(sample_id, sample) + _id += 1 + + elif split == "test": + _id = 0 + + file_path = os.path.join(data_dir, "2012-08-23.test-data.groundtruth.tar.gz") + samples = _read_tar_gz_test_(file_path) + for sample_id, sample in samples.items(): + if self.config.schema == "source": + yield _id, self._get_source_sample(sample_id, sample) + elif self.config.schema == "bigbio_kb": + yield _id, self._get_bigbio_sample(sample_id, sample) + _id += 1 + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py \ No newline at end of file From 7c88585059573fac85c4e67e697ebdf006c89f12 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:33:53 +0800 Subject: [PATCH 2/6] Closes #57 --- biodatasets/emrQA/emrQA.py | 271 +++++++++++++++++++++++ biodatasets/emrQA/test.py | 7 + biodatasets/n2c2_2012/test_tmp.py | 353 ++++++++++++++++++++++++++++++ biodatasets/why_qa/test.py | 27 +++ biodatasets/why_qa/why_qa.py | 217 ++++++++++++++++++ tmp_TEST.py | 4 + 6 files changed, 879 insertions(+) create mode 100644 biodatasets/emrQA/emrQA.py create mode 100644 biodatasets/emrQA/test.py create mode 
100644 biodatasets/n2c2_2012/test_tmp.py create mode 100644 biodatasets/why_qa/test.py create mode 100644 biodatasets/why_qa/why_qa.py create mode 100644 tmp_TEST.py diff --git a/biodatasets/emrQA/emrQA.py b/biodatasets/emrQA/emrQA.py new file mode 100644 index 00000000..4f5c0166 --- /dev/null +++ b/biodatasets/emrQA/emrQA.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import os +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer +_DATASETNAME = "[dataset_name]" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and bigbio config. +# However, if you need to access different files for each config you can have multiple entries in this dict. 
+# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or ... ", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "" + +_BIGBIO_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and BigBio; + # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] + # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|bigbio_[bigbio_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) + # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="[dataset_name]_source", + version=SOURCE_VERSION, + description="[dataset_name] source schema", + schema="source", + subset_id="[dataset_name]", + ), + BigBioConfig( + name="[dataset_name]_bigbio_[bigbio_schema_name]", + version=BIGBIO_VERSION, + description="[dataset_name] BigBio schema", + schema="bigbio_[bigbio_schema_name]", + subset_id="[dataset_name]", + ), + ] + + DEFAULT_CONFIG_NAME = "[dataset_name]_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + raise NotImplementedError() + + # EX: Arbitrary NER type dataset + # features = datasets.Features( + # { + # "doc_id": datasets.Value("string"), + # "text": datasets.Value("string"), + # "entities": [ + # { + # "offsets": [datasets.Value("int64")], + # "text": datasets.Value("string"), + # "type": datasets.Value("string"), + # "entity_id": datasets.Value("string"), + # } + # ], + # } + # ) + + # Choose the appropriate bigbio schema for your task and copy it here. 
You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. + + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # e.g. features = schemas.kb_features + # TODO: Choose your big-bio schema here + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # TODO: KEEP if your dataset is LOCAL; remove if NOT + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
+ + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "source": + # TODO: yield (key, example) tuples in the original dataset schema + for key, example in thing: + yield key, example + + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # TODO: yield (key, example) tuples in the bigbio schema + for key, example in thing: + yield key, example + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) diff --git a/biodatasets/emrQA/test.py b/biodatasets/emrQA/test.py new file mode 100644 index 00000000..c37aa75b --- /dev/null +++ b/biodatasets/emrQA/test.py @@ -0,0 +1,7 @@ +import json + +with open('C:/Users/franc/Desktop/dataset/data.json') as json_file: + data = json.load(json_file) + + +print(data['data'][0]) \ No newline at end of file diff --git a/biodatasets/n2c2_2012/test_tmp.py b/biodatasets/n2c2_2012/test_tmp.py new file mode 100644 index 00000000..44f092ed --- /dev/null +++ b/biodatasets/n2c2_2012/test_tmp.py @@ -0,0 +1,353 @@ +import tarfile +from collections import defaultdict, OrderedDict +import os +from unittest import skip +from lxml import etree +import xmltodict +import json + +""""" +def _read_tar_gz_old_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + root = etree.XML(content) + text, tags = root.getchildren() + samples[sample_id]["txt"] = text.text + samples[sample_id]["tags"] = {} + + for child in tags: + + + if child.tag == "EVENT": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["modality"] = child.get("modality") + samples[sample_id]["tags"][child.tag]["polarity"] = child.get("polarity") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + if child.tag == "TIMEx3": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + samples[sample_id]["tags"][child.tag]["val"] = child.get("val") + samples[sample_id]["tags"][child.tag]["mod"] = child.get("mod") + if child.tag == "TLINK": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["fromID"] = child.get("fromID") + samples[sample_id]["tags"][child.tag]["fromText"] = child.get("fromTExt") + 
samples[sample_id]["tags"][child.tag]["toID"] = child.get("toID") + samples[sample_id]["tags"][child.tag]["toText"] = child.get("toText") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + if child.tag == "SECTIME": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + samples[sample_id]["tags"][child.tag]["dvalue"] = child.get("dvalue") + +""""" + +def _read_tar_gz_train_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + + """ + with open('C:/Users/franc/Desktop/result.json', 'w') as fp: + json.dump(samples, fp) + + + for i, event in enumerate(samples["1"]["TAGS"]["EVENT"]): + print(event["@id"]) + #print(samples["1"]["TAGS"]["EVENT"][event]) + + admission = {} + discharge = {} + for idx, sectime in enumerate(samples["1"]["TAGS"]["SECTIME"]): + if sectime["@type"] == "ADMISSION": + admission = { + "id": sectime["@id"], + "type": sectime["@type"], + "text": sectime["@text"], + "offsets": [(sectime["@start"], sectime["@end"])], + } + elif sectime["@type"] == "DISCHARGE": + discharge = { + "id": sectime["@id"], + "type": sectime["@type"], + "text": sectime["@text"], + "offsets": [(sectime["@start"], sectime["@end"])], + } + print(admission) + + + sample = samples["1"] + x = {"id": 1, "tags": { + "EVENT": sample["TAGS"]["EVENT"]} + } + print(x) + + for sample_id, sample in samples.items(): + print(sample) + print("/////") + print(sample_id) + print("-----------------------------------------------------------------------------") + + """ +######################################################################################################################## +def _read_tar_gz_test_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + if member.name.startswith("ground_truth/merged_xml"): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + + return samples + + 
#with open('C:/Users/franc/Desktop/result_test.json', 'w') as fp: + #json.dump(samples, fp) + +############################################################################################################################################################################ +def _get_events_from_sample(sample_id, sample): + events = [] + for idx, event in enumerate(sample["TAGS"]["EVENT"]): + + evs = { + "id": event["@id"], + "type": event["@type"], + "offsets": [(event["@start"], event["@end"])], + "text": event["@text"], + } + + events.append(evs) + print(events) + +def _get_source_sample(sample_id, sample): + output = { + "id": sample_id, + "text": sample["TEXT"], + "tags": sample["TAGS"], + } + return output + + +#_read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") +#_read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") + + +def test(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + #samples = _read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") + + #print(samples) + _id = 0 + for sample_id, sample in samples.items(): + if sample.get("TAGS","").get("SECTIME","") == "": + print("empty") + else: + print(sample.get("TAGS","").get("SECTIME","")) + print(_id) + print(sample_id) + print("-----------------------------------------------------------------------------") + _id += 1 + + +def test_2_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + events = [] + + for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): + + evs = { + "id": event.get("@id",""), + "type": event.get("@type",""), + "trigger": { + "text": event.get("@text",""), + "offests": [(int(event.get("@start","")), int(event.get("@end","")))], + }, + "arguments": [ + { + "role": "NA", + "ref_id": "NA", + }, + ], + } + events.append(evs) + + print(events) + print("############################################################") + print("############################################################") + +def test_3_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + print(sample_id) + print(len(sample.get("TAGS","").get("SECTIME",""))) + + admission = [] + + if sample.get("TAGS","").get("SECTIME","") == "": + pass + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "ADMISSION": + adm = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + admission.append(adm) + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "ADMISSION": + adm = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + admission.append(adm) + + print(admission) + print("############################################################") + + +def test_4_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + print(sample_id) + print(len(sample.get("TAGS","").get("SECTIME",""))) + + discharge = [] + 
+ if sample.get("TAGS","").get("SECTIME","") == "": + pass + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "DISCHARGE": + dis = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], + } + discharge.append(dis) + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "DISCHARGE": + dis = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], + } + discharge.append(dis) + + print(discharge) + print("############################################################") + + + +test_3_() + +############################################################################################################################################################################################## +""" + if self.config.schema == "source": + features = Features( + { + "doc_id": Value("string"), + "text": Value("string"), + "entities":{ + "EVENT": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "modality": ClassLabel(names=["FACTUAL", "CONDITIONAL","POSSIBLE","PROPOSED"]), + "polarity": ClassLabel(names=["POS", "NEG"]), + "type": ClassLabel(names=["TEST","PROBLEM","TREATMENT","CLINICAL_DEPT","EVIDENTIAL","OCCURRENCE"]), + }), + "TIMEX3": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "type": ClassLabel(names=["DATE", "TIME","DURATION","FREQUENCY"]), + "val": Value("string"), + "mod": ClassLabel(names=["NA","MORE","LESS","APPROX","START","END","MIDDLE"]), + }), + "TLINK": Sequence({"id": Value("string"), + "fromID": Value("string"), + "fromText": Value("string"), + "toID": Value("string"), + "toText": Value("string"), + "type": ClassLabel(names=["BEFORE","AFTER","SIMULTANEOUS","OVERLAP","BEGUN_BY","DURING","BEFORE_OVERLAP"]), + }), + "SECTIME": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "type": ClassLabel(names=["ADMISSION","DISCHARGE"]), + "dvalue": Value("string"), + }), + } + } + ) + +""" \ No newline at end of file diff --git a/biodatasets/why_qa/test.py b/biodatasets/why_qa/test.py new file mode 100644 index 00000000..95a9fef6 --- /dev/null +++ b/biodatasets/why_qa/test.py @@ -0,0 +1,27 @@ +import zipfile +import json + +def read_zip_file(file_path): + with zipfile.ZipFile(file_path) as zf: + with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: + dataset = json.load(f) + return dataset + + +dataset = read_zip_file("C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") + +samples = dataset['data'][0]['paragraphs'] + +for sample in samples: + print(sample["qas"]) + print("######################################") + + + + + +# for sample in samples: +# for qa in sample['qas']: +# print(qa['id']) +# print(sample['note_id']) +# print("######################################") diff --git a/biodatasets/why_qa/why_qa.py b/biodatasets/why_qa/why_qa.py new file mode 100644 index 00000000..4c33d50a --- /dev/null +++ b/biodatasets/why_qa/why_qa.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 community-annotated Why Questions dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of a single archive (no predefined splits), and the
+annotations are available as a JSON file and as an XLSX file:
+
+    - relations_whyqa_ann-v7-share.json (in SQuAD 2.0 format)
+    - relations_whyqa_ann-v7-share.xlsx
+
+The dataset also includes TXT files with the full texts of the
+clinical notes.
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script reads the archive file
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = qa
+"""
+
+import os
+import zipfile
+import json
+from collections import defaultdict
+from typing import List, Tuple, Dict
+
+import datasets
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@inproceedings{fan-2019-annotating,
+    author = {Fan, Jungwei},
+    title = {Annotating and Characterizing Clinical Sentences with Explicit Why-{QA} Cues},
+    booktitle = {Proceedings of the 2nd Clinical Natural Language Processing Workshop},
+    month = {jun},
+    year = {2019},
+    address = {Minneapolis, Minnesota, USA},
+    publisher = {Association for Computational Linguistics},
+    url = {https://aclanthology.org/W19-1913},
+    doi = {10.18653/v1/W19-1913}
+}
+"""
+
+_DATASETNAME = "why_qa"
+
+_DESCRIPTION = """\
+This dataset is a collection of why-questions and their answers generated
+from a corpus of clinical notes. The corpus is the 2010 i2b2/VA NLP
+challenge and consists of 426 discharge summaries from Partners
+Healthcare and Beth Israel Deaconess Medical Center.
+ +""" +_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" + +_LICENSE = "External Data User Agreement" + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +def read_zip_file(file_path): + with zipfile.ZipFile(file_path) as zf: + with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: + dataset = json.load(f) + return dataset + +def _get_samples(dataset): + samples = dataset['data'][0]['paragraphs'] + return samples + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class WhyQaDataset(datasets.GeneratorBasedBuilder): + """n2c2 community-annotated Why Questions dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + + BUILDER_CONFIGS = [ + BigBioConfig( + name="why_qa_source", + version=SOURCE_VERSION, + description="why_qa source schema", + schema="source", + subset_id="why_qa", + ), + BigBioConfig( + name="why_qa_bigbio_qa", + version=BIGBIO_VERSION, + description="why_wa BigBio schema", + schema="bigbio_qa", + subset_id="why_qa", + ), + ] + + DEFAULT_CONFIG_NAME = "why_qa_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + + { + "note_id": datasets.Value("string"), + "qas": [ + {"question_template": datasets.Value("string"), + "question": datasets.Value("string"), + "id": datasets.Value("string"), + "answers": [ + {"text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + }, + ], + "is_impossible": datasets.Value("bool"), + }, + ], + "context": datasets.Value("string"), + }, + ) + + elif self.config.schema == "bigbio_qa": + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "data_dir": data_dir, + "split": "train", + }, + ), + ] + + def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + dataset = read_zip_file(data_dir) + samples = _get_samples(dataset) + + if self.config.schema == "source": + _id = 0 + for sample in samples: + yield _id, sample + _id += 1 + + elif self.config.schema == "bigbio_[bigbio_schema_name]": + _id = 0 + for sample in samples: + for qa in sample['qas']: + ans_list = [] + for answer in qa["answer"]: + ans = answer["text"] + ans_list.append(ans) + bigbio_sample = { + "id" : qa["note_id"], + "question_id" : qa["id"], + "document_id" : sample["note_id"], + "question" : qa["question"], + "type" : qa["question_template"], + "choices" : [], + "context" : sample["context"], + "answer" : ans_list, + } + yield _id, bigbio_sample + _id += 1 + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py diff --git a/tmp_TEST.py b/tmp_TEST.py new file mode 100644 index 00000000..2e65fb4d --- /dev/null +++ b/tmp_TEST.py @@ -0,0 +1,4 @@ +from datasets import load_dataset + +data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_bigbio_qa", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") +#data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_source", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") \ No newline at end of file From eaea00dece7ee4e5960f1d1af6d5c602ceaaf355 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:38:21 +0800 Subject: [PATCH 3/6] Delete test files --- biodatasets/emrQA/test.py | 7 - biodatasets/n2c2_2012/test_tmp.py | 353 ------------------------------ biodatasets/why_qa/test.py | 27 --- 3 files changed, 387 deletions(-) delete mode 100644 biodatasets/emrQA/test.py delete mode 100644 biodatasets/n2c2_2012/test_tmp.py delete mode 100644 biodatasets/why_qa/test.py diff --git a/biodatasets/emrQA/test.py b/biodatasets/emrQA/test.py deleted file mode 100644 index c37aa75b..00000000 --- a/biodatasets/emrQA/test.py +++ /dev/null @@ -1,7 +0,0 @@ -import json - -with open('C:/Users/franc/Desktop/dataset/data.json') as json_file: - data = json.load(json_file) - - -print(data['data'][0]) \ No newline at end of file diff --git a/biodatasets/n2c2_2012/test_tmp.py b/biodatasets/n2c2_2012/test_tmp.py deleted file mode 100644 index 44f092ed..00000000 --- a/biodatasets/n2c2_2012/test_tmp.py +++ /dev/null @@ -1,353 +0,0 @@ -import tarfile -from collections import defaultdict, OrderedDict -import os -from unittest import skip -from lxml import etree -import xmltodict -import json - -""""" -def _read_tar_gz_old_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: 
- with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - root = etree.XML(content) - text, tags = root.getchildren() - samples[sample_id]["txt"] = text.text - samples[sample_id]["tags"] = {} - - for child in tags: - - - if child.tag == "EVENT": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["modality"] = child.get("modality") - samples[sample_id]["tags"][child.tag]["polarity"] = child.get("polarity") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - if child.tag == "TIMEx3": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - samples[sample_id]["tags"][child.tag]["val"] = child.get("val") - samples[sample_id]["tags"][child.tag]["mod"] = child.get("mod") - if child.tag == "TLINK": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["fromID"] = child.get("fromID") - samples[sample_id]["tags"][child.tag]["fromText"] = child.get("fromTExt") - samples[sample_id]["tags"][child.tag]["toID"] = child.get("toID") - samples[sample_id]["tags"][child.tag]["toText"] = child.get("toText") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - if child.tag == "SECTIME": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - samples[sample_id]["tags"][child.tag]["dvalue"] = child.get("dvalue") - -""""" - -def _read_tar_gz_train_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: - with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - values = xmltodict.parse(content) - samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] - - samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) - samples = samples_sorted - samples = json.loads(json.dumps(samples)) - - return samples - - """ - with open('C:/Users/franc/Desktop/result.json', 'w') as fp: - json.dump(samples, fp) - - - for i, event in enumerate(samples["1"]["TAGS"]["EVENT"]): - print(event["@id"]) - #print(samples["1"]["TAGS"]["EVENT"][event]) - - admission = {} - discharge = {} - for idx, sectime in enumerate(samples["1"]["TAGS"]["SECTIME"]): - if sectime["@type"] == "ADMISSION": - admission = { - "id": sectime["@id"], - "type": sectime["@type"], - "text": sectime["@text"], - "offsets": 
[(sectime["@start"], sectime["@end"])], - } - elif sectime["@type"] == "DISCHARGE": - discharge = { - "id": sectime["@id"], - "type": sectime["@type"], - "text": sectime["@text"], - "offsets": [(sectime["@start"], sectime["@end"])], - } - print(admission) - - - sample = samples["1"] - x = {"id": 1, "tags": { - "EVENT": sample["TAGS"]["EVENT"]} - } - print(x) - - for sample_id, sample in samples.items(): - print(sample) - print("/////") - print(sample_id) - print("-----------------------------------------------------------------------------") - - """ -######################################################################################################################## -def _read_tar_gz_test_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - if member.name.startswith("ground_truth/merged_xml"): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: - with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - values = xmltodict.parse(content) - samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] - - samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) - samples = samples_sorted - - return samples - - #with open('C:/Users/franc/Desktop/result_test.json', 'w') as fp: - #json.dump(samples, fp) - -############################################################################################################################################################################ -def _get_events_from_sample(sample_id, sample): - events = [] - for idx, event in enumerate(sample["TAGS"]["EVENT"]): - - evs = { - "id": event["@id"], - "type": event["@type"], - "offsets": [(event["@start"], event["@end"])], - "text": event["@text"], - } - - events.append(evs) - print(events) - -def _get_source_sample(sample_id, sample): - output = { - "id": sample_id, - "text": sample["TEXT"], - "tags": sample["TAGS"], - } - return output - - -#_read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") -#_read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") - - -def test(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - #samples = _read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") - - #print(samples) - _id = 0 - for sample_id, sample in samples.items(): - if sample.get("TAGS","").get("SECTIME","") == "": - print("empty") - else: - print(sample.get("TAGS","").get("SECTIME","")) - print(_id) - print(sample_id) - print("-----------------------------------------------------------------------------") - _id += 1 - - -def test_2_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - events = [] - - for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): - - evs = { - "id": event.get("@id",""), - "type": event.get("@type",""), - "trigger": { - "text": event.get("@text",""), - "offests": [(int(event.get("@start","")), int(event.get("@end","")))], - }, - "arguments": [ - { - "role": "NA", - "ref_id": "NA", - }, - ], - } - events.append(evs) - - 
print(events) - print("############################################################") - print("############################################################") - -def test_3_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - print(sample_id) - print(len(sample.get("TAGS","").get("SECTIME",""))) - - admission = [] - - if sample.get("TAGS","").get("SECTIME","") == "": - pass - elif len(sample.get("TAGS","").get("SECTIME","")) == 2: - for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): - if sectime.get("@type","") == "ADMISSION": - adm = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], - } - admission.append(adm) - else: - sectime = sample.get("TAGS","").get("SECTIME","") - if sectime.get("@type","") == "ADMISSION": - adm = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], - } - admission.append(adm) - - print(admission) - print("############################################################") - - -def test_4_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - print(sample_id) - print(len(sample.get("TAGS","").get("SECTIME",""))) - - discharge = [] - - if sample.get("TAGS","").get("SECTIME","") == "": - pass - elif len(sample.get("TAGS","").get("SECTIME","")) == 2: - for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): - if sectime.get("@type","") == "DISCHARGE": - dis = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], - } - discharge.append(dis) - else: - sectime = sample.get("TAGS","").get("SECTIME","") - if sectime.get("@type","") == "DISCHARGE": - dis = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], - } - discharge.append(dis) - - print(discharge) - print("############################################################") - - - -test_3_() - -############################################################################################################################################################################################## -""" - if self.config.schema == "source": - features = Features( - { - "doc_id": Value("string"), - "text": Value("string"), - "entities":{ - "EVENT": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "modality": ClassLabel(names=["FACTUAL", "CONDITIONAL","POSSIBLE","PROPOSED"]), - "polarity": ClassLabel(names=["POS", "NEG"]), - "type": ClassLabel(names=["TEST","PROBLEM","TREATMENT","CLINICAL_DEPT","EVIDENTIAL","OCCURRENCE"]), - }), - "TIMEX3": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "type": ClassLabel(names=["DATE", "TIME","DURATION","FREQUENCY"]), - "val": Value("string"), - "mod": ClassLabel(names=["NA","MORE","LESS","APPROX","START","END","MIDDLE"]), - }), - "TLINK": Sequence({"id": Value("string"), - "fromID": Value("string"), - "fromText": Value("string"), - "toID": Value("string"), - 
"toText": Value("string"), - "type": ClassLabel(names=["BEFORE","AFTER","SIMULTANEOUS","OVERLAP","BEGUN_BY","DURING","BEFORE_OVERLAP"]), - }), - "SECTIME": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "type": ClassLabel(names=["ADMISSION","DISCHARGE"]), - "dvalue": Value("string"), - }), - } - } - ) - -""" \ No newline at end of file diff --git a/biodatasets/why_qa/test.py b/biodatasets/why_qa/test.py deleted file mode 100644 index 95a9fef6..00000000 --- a/biodatasets/why_qa/test.py +++ /dev/null @@ -1,27 +0,0 @@ -import zipfile -import json - -def read_zip_file(file_path): - with zipfile.ZipFile(file_path) as zf: - with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: - dataset = json.load(f) - return dataset - - -dataset = read_zip_file("C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") - -samples = dataset['data'][0]['paragraphs'] - -for sample in samples: - print(sample["qas"]) - print("######################################") - - - - - -# for sample in samples: -# for qa in sample['qas']: -# print(qa['id']) -# print(sample['note_id']) -# print("######################################") From 0df92942e5079907f08767678248c87559dbf3c7 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:43:21 +0800 Subject: [PATCH 4/6] Delete tmp files --- biodatasets/emrQA/emrQA.py | 271 ------------------------------------- tmp_TEST.py | 4 - 2 files changed, 275 deletions(-) delete mode 100644 biodatasets/emrQA/emrQA.py delete mode 100644 tmp_TEST.py diff --git a/biodatasets/emrQA/emrQA.py b/biodatasets/emrQA/emrQA.py deleted file mode 100644 index 4f5c0166..00000000 --- a/biodatasets/emrQA/emrQA.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. 
- -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) -""" - -import os -from typing import List, Tuple, Dict - -import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@article{, - author = {}, - title = {}, - journal = {}, - volume = {}, - year = {}, - url = {}, - doi = {}, - biburl = {}, - bibsource = {} -} -""" - -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer -_DATASETNAME = "[dataset_name]" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This dataset is designed for XXX NLP task. -""" - -# TODO: Add a link to an official homepage for the dataset here (if possible) -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here (if possible) -# Note that this doesn't have to be a common open source license. -# Some datasets have custom licenses. In this case, simply put the full license terms -# into `_LICENSE` -_LICENSE = "" - -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. - -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and bigbio config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) -_URLS = { - _DATASETNAME: "url or list of urls or ... ", -} - -# TODO: add supported task by dataset. One dataset may support multiple tasks -_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] - -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. -_SOURCE_VERSION = "" - -_BIGBIO_VERSION = "1.0.0" - - -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - # You will be able to load the "source" or "bigbio" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and BigBio; - # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. 
bioasq10b_(source|bigbio)_[bigbio_schema_name] - # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|bigbio_[bigbio_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) - # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) - - BUILDER_CONFIGS = [ - BigBioConfig( - name="[dataset_name]_source", - version=SOURCE_VERSION, - description="[dataset_name] source schema", - schema="source", - subset_id="[dataset_name]", - ), - BigBioConfig( - name="[dataset_name]_bigbio_[bigbio_schema_name]", - version=BIGBIO_VERSION, - description="[dataset_name] BigBio schema", - schema="bigbio_[bigbio_schema_name]", - subset_id="[dataset_name]", - ), - ] - - DEFAULT_CONFIG_NAME = "[dataset_name]_source" - - def _info(self) -> datasets.DatasetInfo: - - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. - # For iterables, use lists over tuples or `datasets.Sequence` - - if self.config.schema == "source": - # TODO: Create your source schema here - raise NotImplementedError() - - # EX: Arbitrary NER type dataset - # features = datasets.Features( - # { - # "doc_id": datasets.Value("string"), - # "text": datasets.Value("string"), - # "entities": [ - # { - # "offsets": [datasets.Value("int64")], - # "text": datasets.Value("string"), - # "type": datasets.Value("string"), - # "entity_id": datasets.Value("string"), - # } - # ], - # } - # ) - - # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. - - # For example bigbio_kb, bigbio_t2t - elif self.config.schema == "bigbio_[bigbio_schema_name]": - # e.g. features = schemas.kb_features - # TODO: Choose your big-bio schema here - raise NotImplementedError() - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name - - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. 
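To make the comment above concrete, a small sketch of the nested-URL behaviour inside _split_generators; the URLs are placeholders, not real dataset locations:

    # download_and_extract mirrors whatever structure it receives, so a dict
    # of per-split URLs comes back as a dict of per-split local paths.
    urls = {
        "train": "https://example.org/my_dataset/train.jsonl",
        "dev": "https://example.org/my_dataset/dev.jsonl",
        "test": "https://example.org/my_dataset/test.jsonl",
    }
    paths = dl_manager.download_and_extract(urls)
    # paths["train"], paths["dev"] and paths["test"] are now local file paths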
- - # TODO: KEEP if your dataset is PUBLIC; remove if not - urls = _URLS[_DATASETNAME] - data_dir = dl_manager.download_and_extract(urls) - - # TODO: KEEP if your dataset is LOCAL; remove if NOT - if self.config.data_dir is None: - raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") - else: - data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": os.path.join(data_dir, "train.jsonl"), - "split": "train", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "test.jsonl"), - "split": "test", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "dev.jsonl"), - "split": "dev", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. - - def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - - # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. - - # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files - - if self.config.schema == "source": - # TODO: yield (key, example) tuples in the original dataset schema - for key, example in thing: - yield key, example - - elif self.config.schema == "bigbio_[bigbio_schema_name]": - # TODO: yield (key, example) tuples in the bigbio schema - for key, example in thing: - yield key, example - - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR -if __name__ == "__main__": - datasets.load_dataset(__file__) diff --git a/tmp_TEST.py b/tmp_TEST.py deleted file mode 100644 index 2e65fb4d..00000000 --- a/tmp_TEST.py +++ /dev/null @@ -1,4 +0,0 @@ -from datasets import load_dataset - -data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_bigbio_qa", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") -#data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_source", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") \ No newline at end of file From 945dfda0995260dba28ca0f14c238b76a4ffeb9c Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 02:01:16 +0800 Subject: [PATCH 5/6] Delete why_qa.py --- biodatasets/why_qa/why_qa.py | 217 ----------------------------------- 1 file changed, 217 deletions(-) delete mode 100644 biodatasets/why_qa/why_qa.py diff --git a/biodatasets/why_qa/why_qa.py b/biodatasets/why_qa/why_qa.py deleted file mode 100644 index 4c33d50a..00000000 --- a/biodatasets/why_qa/why_qa.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 
-# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A dataset loader for the n2c2 community-annotated Why Questions dataset. - -https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/ - -The dataset consists of a single archive (no splits) and it is available -as a JSON file and as an XLSX file: - - - relations_whyqa_ann-v7-share.json (in SQUAD 2.0 format) - - relations_whyqa_ann-v7-share.xlsx - -The dataset also includes TXT files with the full texts of the -clinical notes. - -The files comprising this dataset must be on the users local machine -in a single directory that is passed to `datasets.load_dataset` via -the `data_dir` kwarg. This loader script will read the archive files -directly (i.e. the user should not uncompress, untar or unzip any of -the files). - -Registration AND submission of DUA is required to access the dataset. - -[bigbio_schema_name] = qa -""" - -import os -import zipfile -import json -from collections import defaultdict -from typing import List, Tuple, Dict - -import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@inproceedings{, - author = {Annotating and Characterizing Clinical Sentences with Explicit Why-{QA} Cues}, - title = {Fan, Jungwei}, - booktitle = {Proceedings of the 2nd Clinical Natural Language Processing Workshop}, - month = {jun}, - year = {2019}, - address = {Minneapolis, Minnesota, USA}, - publisher = {Association for Computational Linguistics}, - url = {https://aclanthology.org/W19-1913}, - doi = {10.18653/v1/W19-1913} - -} - -} -""" - -_DATASETNAME = "[why_qa]" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ - -This dataset is a collection of why-questions and their answers generated -from a corpus of clincal notes. The corpus is the 2010 i2b2/VA NLP -challenge and consists of 426 discharge summaries from Partners -Healthcare and Beth Israel Deaconess Medical Center. 
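For orientation, the JSON this loader reads is SQuAD-2.0-shaped. Reconstructed from the source schema and _generate_examples below, one paragraph record looks roughly like the following; every value is an invented placeholder:

    # Approximate shape of one entry in dataset["data"][0]["paragraphs"].
    example_paragraph = {
        "note_id": "note-0001",                                    # placeholder id
        "context": "...full text of the discharge summary...",
        "qas": [
            {
                "id": "note-0001-q1",
                "question_template": "why was <treatment> given",  # placeholder
                "question": "why was the medication given",        # placeholder
                "answers": [{"text": "answer span", "answer_start": 123}],
                "is_impossible": False,
            }
        ],
    }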
- -""" -_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" - -_LICENSE = "External Data User Agreement" - -_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] - -_SOURCE_VERSION = "1.0.0" - -_BIGBIO_VERSION = "1.0.0" - -def read_zip_file(file_path): - with zipfile.ZipFile(file_path) as zf: - with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: - dataset = json.load(f) - return dataset - -def _get_samples(dataset): - samples = dataset['data'][0]['paragraphs'] - return samples - -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset -class WhyQaDataset(datasets.GeneratorBasedBuilder): - """n2c2 community-annotated Why Questions dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - - BUILDER_CONFIGS = [ - BigBioConfig( - name="why_qa_source", - version=SOURCE_VERSION, - description="why_qa source schema", - schema="source", - subset_id="why_qa", - ), - BigBioConfig( - name="why_qa_bigbio_qa", - version=BIGBIO_VERSION, - description="why_wa BigBio schema", - schema="bigbio_qa", - subset_id="why_qa", - ), - ] - - DEFAULT_CONFIG_NAME = "why_qa_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features( - - { - "note_id": datasets.Value("string"), - "qas": [ - {"question_template": datasets.Value("string"), - "question": datasets.Value("string"), - "id": datasets.Value("string"), - "answers": [ - {"text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - }, - ], - "is_impossible": datasets.Value("bool"), - }, - ], - "context": datasets.Value("string"), - }, - ) - - elif self.config.schema == "bigbio_qa": - features = schemas.qa_features - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - - if self.config.data_dir is None: - raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") - else: - data_dir = self.config.data_dir - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={ - "data_dir": data_dir, - "split": "train", - }, - ), - ] - - def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - dataset = read_zip_file(data_dir) - samples = _get_samples(dataset) - - if self.config.schema == "source": - _id = 0 - for sample in samples: - yield _id, sample - _id += 1 - - elif self.config.schema == "bigbio_[bigbio_schema_name]": - _id = 0 - for sample in samples: - for qa in sample['qas']: - ans_list = [] - for answer in qa["answer"]: - ans = answer["text"] - ans_list.append(ans) - bigbio_sample = { - "id" : qa["note_id"], - "question_id" : qa["id"], - "document_id" : sample["note_id"], - "question" : qa["question"], - "type" : qa["question_template"], - "choices" : [], - "context" : sample["context"], - "answer" : ans_list, - } - yield _id, bigbio_sample - _id += 1 - - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py From cceaaf951e4b5496669f4cfd58c5c45bcc32163a Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 09:51:54 +0800 Subject: [PATCH 6/6] Update n2c2_2012.py --- biodatasets/n2c2_2012/n2c2_2012.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/biodatasets/n2c2_2012/n2c2_2012.py b/biodatasets/n2c2_2012/n2c2_2012.py index 1248574a..76b38f17 100644 --- a/biodatasets/n2c2_2012/n2c2_2012.py +++ b/biodatasets/n2c2_2012/n2c2_2012.py @@ -52,6 +52,8 @@ from utils.configs import BigBioConfig from utils.constants import Tasks +_LOCAL = True + _CITATION = """\ @article{, author = {
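With _LOCAL = True added, n2c2_2012 is loaded the same way the deleted tmp_TEST.py loaded why_qa: by pointing data_dir at the locally stored archives. A sketch, assuming the builder configs follow the <dataset_name>_source naming convention used elsewhere in this PR; the path is a placeholder:

    from datasets import load_dataset

    ds = load_dataset(
        "biodatasets/n2c2_2012/n2c2_2012.py",
        name="n2c2_2012_source",                  # assumed config name
        data_dir="/path/to/n2c2_2012_archives",   # placeholder local directory
    )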