From 97fe0a58b740870d50d358dd2b0284e90295958b Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Thu, 19 May 2022 01:30:29 +0800
Subject: [PATCH 1/6] n2c2_ commit

---
 biodatasets/n2c2_2012/n2c2_2012.py | 490 +++++++++++++++++++++++++++++
 1 file changed, 490 insertions(+)
 create mode 100644 biodatasets/n2c2_2012/n2c2_2012.py

diff --git a/biodatasets/n2c2_2012/n2c2_2012.py b/biodatasets/n2c2_2012/n2c2_2012.py
new file mode 100644
index 00000000..1248574a
--- /dev/null
+++ b/biodatasets/n2c2_2012/n2c2_2012.py
@@ -0,0 +1,490 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 2012 temporal relation dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of one training archive file and one annotated test archive file:
+
+* 2012-07-15.original-annotation.release.tar.gz (complete training dataset)
+* 2012-08-23.test-data.groundtruth.tar.gz (annotated, complete test dataset)
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script reads the archive files
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+NOTE: The following XML files are not well formed and have been excluded from
+the dataset: "23.xml", "53.xml", "143.xml", "152.xml", "272.xml", "382.xml", "397.xml",
+"422.xml", "527.xml", "547.xml", "627.xml", "687.xml", "802.xml", "807.xml".
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = kb
+"""
+
+import os
+import tarfile
+from collections import defaultdict, OrderedDict
+import xmltodict
+import json
+from typing import List, Tuple, Dict
+
+import datasets
+from datasets import Features, Value, Sequence, ClassLabel
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{sun2013evaluating,
+    author = {
+        Sun, Weiyi and
+        Rumshisky, Anna and
+        Uzuner, Ozlem},
+    title = {Evaluating temporal relations in clinical text: 2012 i2b2 Challenge},
+    journal = {Journal of the American Medical Informatics Association},
+    volume = {20},
+    number = {5},
+    pages = {806-813},
+    year = {2013},
+    month = {09},
+    url = {https://doi.org/10.1136/amiajnl-2013-001628},
+    doi = {10.1136/amiajnl-2013-001628},
+    eprint = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3756273/pdf/amiajnl-2013-001628.pdf}
+}
+"""
+
+_DATASETNAME = "n2c2_2012"
+
+_DESCRIPTION = """\
+This dataset is designed for the 2012 i2b2 temporal relations challenge task.
+
+The text annotated for this challenge comes from de-identified discharge summaries.
The goal of +the annotation is to mark up temporal information present in clinical text in order to enable +reasoning and queries over the timeline of clinically relevant events for each patient. + +This annotation involves marking up three kinds of information: +1) events, +2) temporal expressions, and +3) temporal relations between events and temporal expressions. + +The latter would involve: +1) anchoring events to available temporal expressions, and +2) identifying temporal relations between events. + +The first task is to identify all clinically relevant events and situations, including symptoms, +tests, procedures, and other occurrences. The second task is to identify temporal expressions, +which include all expressions related to time, such as dates, times, frequencies, and durations. +Events and temporal expressions have a number of attributes (such as type of event or calendar +value of the temporal expression) that need to be annotated. The final task is to record the +temporal relations (e.g. before, after, simultaneous, etc.) that hold between different events or +between events and temporal expressions. + +""" + +_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" + +_LICENSE = "External Data User Agreement" + +_SUPPORTED_TASKS = [Tasks.EVENT_EXTRACTION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +def _read_tar_gz_train_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: # corrputed XML files + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(),key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + +def _read_tar_gz_test_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + if member.name.startswith("ground_truth/merged_xml"): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: #corrupted XML files + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + +def _get_events_from_sample(sample_id, sample): + events = [] + for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): + + evs = { + "id": event.get("@id",""), + "type": event.get("@type",""), + "trigger": { + "text": [event.get("@text","")], + "offsets": [(int(event.get("@start","")), int(event.get("@end","")))], + }, + "arguments": [ 
+ { + "role": [], + "ref_id": [], + }, + ], + } + events.append(evs) + return events + +def _get_entities_from_sample(sample_id, sample): + entities = [] + for idx, timex3 in enumerate(sample.get("TAGS","").get("TIMEX3","")): + + entity = { + "id": timex3.get("@id",""), + "type": timex3.get("@type",""), + "offsets": [(int(timex3.get("@start","")), int(timex3.get("@end","")))], + "text": [timex3.get("@text","")], + "normalized": [], + } + + entities.append(entity) + + return entities + +def _get_relations_from_sample(sample_id, sample): + + relations = [] + for idx, tlink in enumerate(sample.get("TAGS").get("TLINK")): + + rel = { + "id": tlink.get("@id"), + "type": tlink.get("@type"), + "arg1_id": tlink.get("@fromID"), + "arg2_id": tlink.get("@toID"), + "normalized": [], + } + + relations.append(rel) + + return relations + +def _get_admission_from_sample(sample_id, sample): + + admission = {} + + # When admission information was missing, an empty placeholder was added with id S0 + if sample.get("TAGS","").get("SECTIME","") == "": + admission = { + "id": "S0", + "type": "ADMISSION", + "text": [], + "offsets": [], + } + + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "ADMISSION": + admission = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "ADMISSION": + admission = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + return admission + +def _get_discharge_from_sample(sample_id, sample): + + discharge = {} + + # When discharge information was missing, an empty placeholder was added with id S1 + if sample.get("TAGS","").get("SECTIME","") == "": + discharge = { + "id": "S1", + "type": "DISCHARGE", + "text": [], + "offsets": [], + } + + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "DISCHARGE": + discharge = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "DISCHARGE": + discharge = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + + return discharge + + +class N2C22012TempRelDataset(datasets.GeneratorBasedBuilder): + """n2c2 2012 temporal relations challenge""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', 
data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + BUILDER_CONFIGS = [ + BigBioConfig( + name="n2c2_2012_source", + version=SOURCE_VERSION, + description="n2c2_2012 source schema", + schema="source", + subset_id="n2c2_2012", + ), + BigBioConfig( + name="n2c2_2012_bigbio_kb", + version=BIGBIO_VERSION, + description="n2c2_2012 BigBio schema", + schema="bigbio_kb", + subset_id="n2c2_2012", + ), + ] + + DEFAULT_CONFIG_NAME = "n2c2_2012_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + features = Features( + { + "sample_id": Value("string"), + "text": Value("string"), + "tags":{ + "EVENT": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@modality": Value("string"), + "@polarity": Value("string"), + "@type": Value("string"), + }), + "TIMEX3": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@type": Value("string"), + "@val": Value("string"), + "@mod": Value("string"), + }), + "TLINK": Sequence({"@id": Value("string"), + "@fromID": Value("string"), + "@fromText": Value("string"), + "@toID": Value("string"), + "@toText": Value("string"), + "@type": Value("string"), + }), + "SECTIME": Sequence({"@id": Value("string"), + "@start": Value("string"), + "@end": Value("string"), + "@text": Value("string"), + "@type": Value("string"), + "@dvalue": Value("string"), + }), + } + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. 
+ + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "data_dir": data_dir, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_dir": data_dir, + "split": "test", + }, + ), + ] + + @staticmethod + def _get_source_sample(sample_id, sample): + if sample.get("TAGS","").get("SECTIME","") == "": + return { + "sample_id": sample_id, + "text": sample.get("TEXT",""), + "tags":{ + "EVENT": sample.get("TAGS","").get("EVENT",""), + "TIMEX3": sample.get("TAGS","").get("TIMEX3",""), + "TLINK": sample.get("TAGS","").get("TLINK",""), + "SECTIME": [], + } + } + else: + return { + "sample_id": sample_id, + "text": sample.get("TEXT",""), + "tags":{ + "EVENT": sample.get("TAGS","").get("EVENT",""), + "TIMEX3": sample.get("TAGS","").get("TIMEX3",""), + "TLINK": sample.get("TAGS","").get("TLINK",""), + "SECTIME": sample.get("TAGS","").get("SECTIME",""), + } + } + + @staticmethod + def _get_bigbio_sample(sample_id, sample): + + passage_text = sample.get("TEXT","") + events = _get_events_from_sample(sample_id, sample) + entities = _get_entities_from_sample(sample_id, sample) + relations = _get_relations_from_sample(sample_id, sample) + admission = _get_admission_from_sample(sample_id, sample) + discharge = _get_discharge_from_sample(sample_id, sample) + + return { + "id": sample_id, + "document_id": sample_id, + "passages": [ + { + "id": f"{sample_id}-full-passage", + "type": "Clinical Narrative Temporal Annotation", + "text": [passage_text], + "offsets": [(0, len(passage_text))], + }, + admission, + discharge, + ], + "events": events, + "entities": entities, + "relations": relations, + "coreferences": [], + } + + + def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if split == "train": + _id = 0 + + file_path = os.path.join(data_dir, "2012-07-15.original-annotation.release.tar.gz") + samples = _read_tar_gz_train_(file_path) + for sample_id, sample in samples.items(): + if self.config.schema == "source": + yield _id, self._get_source_sample(sample_id, sample) + elif self.config.schema == "bigbio_kb": + yield _id, self._get_bigbio_sample(sample_id, sample) + _id += 1 + + elif split == "test": + _id = 0 + + file_path = os.path.join(data_dir, "2012-08-23.test-data.groundtruth.tar.gz") + samples = _read_tar_gz_test_(file_path) + for sample_id, sample in samples.items(): + if self.config.schema == "source": + yield _id, self._get_source_sample(sample_id, sample) + elif self.config.schema == "bigbio_kb": + yield _id, self._get_bigbio_sample(sample_id, sample) + _id += 1 + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py \ No newline at end of file From 7c88585059573fac85c4e67e697ebdf006c89f12 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:33:53 +0800 Subject: [PATCH 2/6] Closes #57 --- biodatasets/emrQA/emrQA.py | 271 +++++++++++++++++++++++ biodatasets/emrQA/test.py | 7 + biodatasets/n2c2_2012/test_tmp.py | 353 ++++++++++++++++++++++++++++++ biodatasets/why_qa/test.py | 27 +++ biodatasets/why_qa/why_qa.py | 217 ++++++++++++++++++ tmp_TEST.py | 4 + 6 files changed, 879 insertions(+) create mode 100644 biodatasets/emrQA/emrQA.py create mode 100644 biodatasets/emrQA/test.py create mode 
100644 biodatasets/n2c2_2012/test_tmp.py create mode 100644 biodatasets/why_qa/test.py create mode 100644 biodatasets/why_qa/why_qa.py create mode 100644 tmp_TEST.py diff --git a/biodatasets/emrQA/emrQA.py b/biodatasets/emrQA/emrQA.py new file mode 100644 index 00000000..4f5c0166 --- /dev/null +++ b/biodatasets/emrQA/emrQA.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import os +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer +_DATASETNAME = "[dataset_name]" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and bigbio config. +# However, if you need to access different files for each config you can have multiple entries in this dict. 
+# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or ... ", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "" + +_BIGBIO_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + # You will be able to load the "source" or "bigbio" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and BigBio; + # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name] + # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|bigbio_[bigbio_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) + # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="[dataset_name]_source", + version=SOURCE_VERSION, + description="[dataset_name] source schema", + schema="source", + subset_id="[dataset_name]", + ), + BigBioConfig( + name="[dataset_name]_bigbio_[bigbio_schema_name]", + version=BIGBIO_VERSION, + description="[dataset_name] BigBio schema", + schema="bigbio_[bigbio_schema_name]", + subset_id="[dataset_name]", + ), + ] + + DEFAULT_CONFIG_NAME = "[dataset_name]_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + raise NotImplementedError() + + # EX: Arbitrary NER type dataset + # features = datasets.Features( + # { + # "doc_id": datasets.Value("string"), + # "text": datasets.Value("string"), + # "entities": [ + # { + # "offsets": [datasets.Value("int64")], + # "text": datasets.Value("string"), + # "type": datasets.Value("string"), + # "entity_id": datasets.Value("string"), + # } + # ], + # } + # ) + + # Choose the appropriate bigbio schema for your task and copy it here. 
You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. + + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # e.g. features = schemas.kb_features + # TODO: Choose your big-bio schema here + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # TODO: KEEP if your dataset is LOCAL; remove if NOT + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
+ + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "source": + # TODO: yield (key, example) tuples in the original dataset schema + for key, example in thing: + yield key, example + + elif self.config.schema == "bigbio_[bigbio_schema_name]": + # TODO: yield (key, example) tuples in the bigbio schema + for key, example in thing: + yield key, example + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) diff --git a/biodatasets/emrQA/test.py b/biodatasets/emrQA/test.py new file mode 100644 index 00000000..c37aa75b --- /dev/null +++ b/biodatasets/emrQA/test.py @@ -0,0 +1,7 @@ +import json + +with open('C:/Users/franc/Desktop/dataset/data.json') as json_file: + data = json.load(json_file) + + +print(data['data'][0]) \ No newline at end of file diff --git a/biodatasets/n2c2_2012/test_tmp.py b/biodatasets/n2c2_2012/test_tmp.py new file mode 100644 index 00000000..44f092ed --- /dev/null +++ b/biodatasets/n2c2_2012/test_tmp.py @@ -0,0 +1,353 @@ +import tarfile +from collections import defaultdict, OrderedDict +import os +from unittest import skip +from lxml import etree +import xmltodict +import json + +""""" +def _read_tar_gz_old_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + root = etree.XML(content) + text, tags = root.getchildren() + samples[sample_id]["txt"] = text.text + samples[sample_id]["tags"] = {} + + for child in tags: + + + if child.tag == "EVENT": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["modality"] = child.get("modality") + samples[sample_id]["tags"][child.tag]["polarity"] = child.get("polarity") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + if child.tag == "TIMEx3": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + samples[sample_id]["tags"][child.tag]["val"] = child.get("val") + samples[sample_id]["tags"][child.tag]["mod"] = child.get("mod") + if child.tag == "TLINK": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["fromID"] = child.get("fromID") + samples[sample_id]["tags"][child.tag]["fromText"] = child.get("fromTExt") + 
samples[sample_id]["tags"][child.tag]["toID"] = child.get("toID") + samples[sample_id]["tags"][child.tag]["toText"] = child.get("toText") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + if child.tag == "SECTIME": + samples[sample_id]["tags"][child.tag]["id"] = child.get("id") + samples[sample_id]["tags"][child.tag]["start"] = child.get("start") + samples[sample_id]["tags"][child.tag]["end"] = child.get("end") + samples[sample_id]["tags"][child.tag]["text"] = child.get("text") + samples[sample_id]["tags"][child.tag]["type"] = child.get("type") + samples[sample_id]["tags"][child.tag]["dvalue"] = child.get("dvalue") + +""""" + +def _read_tar_gz_train_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + samples = json.loads(json.dumps(samples)) + + return samples + + """ + with open('C:/Users/franc/Desktop/result.json', 'w') as fp: + json.dump(samples, fp) + + + for i, event in enumerate(samples["1"]["TAGS"]["EVENT"]): + print(event["@id"]) + #print(samples["1"]["TAGS"]["EVENT"][event]) + + admission = {} + discharge = {} + for idx, sectime in enumerate(samples["1"]["TAGS"]["SECTIME"]): + if sectime["@type"] == "ADMISSION": + admission = { + "id": sectime["@id"], + "type": sectime["@type"], + "text": sectime["@text"], + "offsets": [(sectime["@start"], sectime["@end"])], + } + elif sectime["@type"] == "DISCHARGE": + discharge = { + "id": sectime["@id"], + "type": sectime["@type"], + "text": sectime["@text"], + "offsets": [(sectime["@start"], sectime["@end"])], + } + print(admission) + + + sample = samples["1"] + x = {"id": 1, "tags": { + "EVENT": sample["TAGS"]["EVENT"]} + } + print(x) + + for sample_id, sample in samples.items(): + print(sample) + print("/////") + print(sample_id) + print("-----------------------------------------------------------------------------") + + """ +######################################################################################################################## +def _read_tar_gz_test_(file_path, samples=None): + if samples is None: + samples = defaultdict(dict) + print(samples) + with tarfile.open(file_path, "r:gz") as tf: + for member in tf.getmembers(): + if member.name.startswith("ground_truth/merged_xml"): + + base, filename = os.path.split(member.name) + _, ext = os.path.splitext(filename) + ext = ext[1:] # get rid of dot + sample_id = filename.split(".")[0] + + if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: + with tf.extractfile(member) as fp: + content_bytes = fp.read() + content = content_bytes.decode("utf-8").encode() + values = xmltodict.parse(content) + samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] + + samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) + samples = samples_sorted + + return samples + + 
#with open('C:/Users/franc/Desktop/result_test.json', 'w') as fp: + #json.dump(samples, fp) + +############################################################################################################################################################################ +def _get_events_from_sample(sample_id, sample): + events = [] + for idx, event in enumerate(sample["TAGS"]["EVENT"]): + + evs = { + "id": event["@id"], + "type": event["@type"], + "offsets": [(event["@start"], event["@end"])], + "text": event["@text"], + } + + events.append(evs) + print(events) + +def _get_source_sample(sample_id, sample): + output = { + "id": sample_id, + "text": sample["TEXT"], + "tags": sample["TAGS"], + } + return output + + +#_read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") +#_read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") + + +def test(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + #samples = _read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") + + #print(samples) + _id = 0 + for sample_id, sample in samples.items(): + if sample.get("TAGS","").get("SECTIME","") == "": + print("empty") + else: + print(sample.get("TAGS","").get("SECTIME","")) + print(_id) + print(sample_id) + print("-----------------------------------------------------------------------------") + _id += 1 + + +def test_2_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + events = [] + + for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): + + evs = { + "id": event.get("@id",""), + "type": event.get("@type",""), + "trigger": { + "text": event.get("@text",""), + "offests": [(int(event.get("@start","")), int(event.get("@end","")))], + }, + "arguments": [ + { + "role": "NA", + "ref_id": "NA", + }, + ], + } + events.append(evs) + + print(events) + print("############################################################") + print("############################################################") + +def test_3_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + print(sample_id) + print(len(sample.get("TAGS","").get("SECTIME",""))) + + admission = [] + + if sample.get("TAGS","").get("SECTIME","") == "": + pass + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "ADMISSION": + adm = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + admission.append(adm) + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "ADMISSION": + adm = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], + } + admission.append(adm) + + print(admission) + print("############################################################") + + +def test_4_(): + samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") + + for sample_id, sample in samples.items(): + print(sample_id) + print(len(sample.get("TAGS","").get("SECTIME",""))) + + discharge = [] + 
+ if sample.get("TAGS","").get("SECTIME","") == "": + pass + elif len(sample.get("TAGS","").get("SECTIME","")) == 2: + for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): + if sectime.get("@type","") == "DISCHARGE": + dis = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], + } + discharge.append(dis) + else: + sectime = sample.get("TAGS","").get("SECTIME","") + if sectime.get("@type","") == "DISCHARGE": + dis = { + "id": sectime.get("@id",""), + "type": sectime.get("@type",""), + "text": [sectime.get("@text","")], + "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], + } + discharge.append(dis) + + print(discharge) + print("############################################################") + + + +test_3_() + +############################################################################################################################################################################################## +""" + if self.config.schema == "source": + features = Features( + { + "doc_id": Value("string"), + "text": Value("string"), + "entities":{ + "EVENT": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "modality": ClassLabel(names=["FACTUAL", "CONDITIONAL","POSSIBLE","PROPOSED"]), + "polarity": ClassLabel(names=["POS", "NEG"]), + "type": ClassLabel(names=["TEST","PROBLEM","TREATMENT","CLINICAL_DEPT","EVIDENTIAL","OCCURRENCE"]), + }), + "TIMEX3": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "type": ClassLabel(names=["DATE", "TIME","DURATION","FREQUENCY"]), + "val": Value("string"), + "mod": ClassLabel(names=["NA","MORE","LESS","APPROX","START","END","MIDDLE"]), + }), + "TLINK": Sequence({"id": Value("string"), + "fromID": Value("string"), + "fromText": Value("string"), + "toID": Value("string"), + "toText": Value("string"), + "type": ClassLabel(names=["BEFORE","AFTER","SIMULTANEOUS","OVERLAP","BEGUN_BY","DURING","BEFORE_OVERLAP"]), + }), + "SECTIME": Sequence({"id": Value("string"), + "start": Value("int64"), + "end": Value("int64"), + "text": Value("string"), + "type": ClassLabel(names=["ADMISSION","DISCHARGE"]), + "dvalue": Value("string"), + }), + } + } + ) + +""" \ No newline at end of file diff --git a/biodatasets/why_qa/test.py b/biodatasets/why_qa/test.py new file mode 100644 index 00000000..95a9fef6 --- /dev/null +++ b/biodatasets/why_qa/test.py @@ -0,0 +1,27 @@ +import zipfile +import json + +def read_zip_file(file_path): + with zipfile.ZipFile(file_path) as zf: + with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: + dataset = json.load(f) + return dataset + + +dataset = read_zip_file("C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") + +samples = dataset['data'][0]['paragraphs'] + +for sample in samples: + print(sample["qas"]) + print("######################################") + + + + + +# for sample in samples: +# for qa in sample['qas']: +# print(qa['id']) +# print(sample['note_id']) +# print("######################################") diff --git a/biodatasets/why_qa/why_qa.py b/biodatasets/why_qa/why_qa.py new file mode 100644 index 00000000..4c33d50a --- /dev/null +++ b/biodatasets/why_qa/why_qa.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 community-annotated Why Questions dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of a single archive (no predefined splits), and the
+annotations are available as a JSON file and as an XLSX file:
+
+    - relations_whyqa_ann-v7-share.json (in SQuAD 2.0 format)
+    - relations_whyqa_ann-v7-share.xlsx
+
+The dataset also includes TXT files with the full texts of the
+clinical notes.
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script reads the archive file
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = qa
+"""
+
+import os
+import zipfile
+import json
+from collections import defaultdict
+from typing import List, Tuple, Dict
+
+import datasets
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@inproceedings{fan-2019-annotating,
+    author = {Fan, Jungwei},
+    title = {Annotating and Characterizing Clinical Sentences with Explicit Why-{QA} Cues},
+    booktitle = {Proceedings of the 2nd Clinical Natural Language Processing Workshop},
+    month = {jun},
+    year = {2019},
+    address = {Minneapolis, Minnesota, USA},
+    publisher = {Association for Computational Linguistics},
+    url = {https://aclanthology.org/W19-1913},
+    doi = {10.18653/v1/W19-1913}
+}
+"""
+
+_DATASETNAME = "why_qa"
+
+_DESCRIPTION = """\
+This dataset is a collection of why-questions and their answers generated
+from a corpus of clinical notes. The corpus is the 2010 i2b2/VA NLP
+challenge and consists of 426 discharge summaries from Partners
+Healthcare and Beth Israel Deaconess Medical Center.
+ +""" +_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" + +_LICENSE = "External Data User Agreement" + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +def read_zip_file(file_path): + with zipfile.ZipFile(file_path) as zf: + with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: + dataset = json.load(f) + return dataset + +def _get_samples(dataset): + samples = dataset['data'][0]['paragraphs'] + return samples + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: BioASQ --> BioasqDataset +class WhyQaDataset(datasets.GeneratorBasedBuilder): + """n2c2 community-annotated Why Questions dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + + BUILDER_CONFIGS = [ + BigBioConfig( + name="why_qa_source", + version=SOURCE_VERSION, + description="why_qa source schema", + schema="source", + subset_id="why_qa", + ), + BigBioConfig( + name="why_qa_bigbio_qa", + version=BIGBIO_VERSION, + description="why_wa BigBio schema", + schema="bigbio_qa", + subset_id="why_qa", + ), + ] + + DEFAULT_CONFIG_NAME = "why_qa_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + + { + "note_id": datasets.Value("string"), + "qas": [ + {"question_template": datasets.Value("string"), + "question": datasets.Value("string"), + "id": datasets.Value("string"), + "answers": [ + {"text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + }, + ], + "is_impossible": datasets.Value("bool"), + }, + ], + "context": datasets.Value("string"), + }, + ) + + elif self.config.schema == "bigbio_qa": + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "data_dir": data_dir, + "split": "train", + }, + ), + ] + + def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + dataset = read_zip_file(data_dir) + samples = _get_samples(dataset) + + if self.config.schema == "source": + _id = 0 + for sample in samples: + yield _id, sample + _id += 1 + + elif self.config.schema == "bigbio_[bigbio_schema_name]": + _id = 0 + for sample in samples: + for qa in sample['qas']: + ans_list = [] + for answer in qa["answer"]: + ans = answer["text"] + ans_list.append(ans) + bigbio_sample = { + "id" : qa["note_id"], + "question_id" : qa["id"], + "document_id" : sample["note_id"], + "question" : qa["question"], + "type" : qa["question_template"], + "choices" : [], + "context" : sample["context"], + "answer" : ans_list, + } + yield _id, bigbio_sample + _id += 1 + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py diff --git a/tmp_TEST.py b/tmp_TEST.py new file mode 100644 index 00000000..2e65fb4d --- /dev/null +++ b/tmp_TEST.py @@ -0,0 +1,4 @@ +from datasets import load_dataset + +data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_bigbio_qa", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") +#data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_source", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") \ No newline at end of file From eaea00dece7ee4e5960f1d1af6d5c602ceaaf355 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:38:21 +0800 Subject: [PATCH 3/6] Delete test files --- biodatasets/emrQA/test.py | 7 - biodatasets/n2c2_2012/test_tmp.py | 353 ------------------------------ biodatasets/why_qa/test.py | 27 --- 3 files changed, 387 deletions(-) delete mode 100644 biodatasets/emrQA/test.py delete mode 100644 biodatasets/n2c2_2012/test_tmp.py delete mode 100644 biodatasets/why_qa/test.py diff --git a/biodatasets/emrQA/test.py b/biodatasets/emrQA/test.py deleted file mode 100644 index c37aa75b..00000000 --- a/biodatasets/emrQA/test.py +++ /dev/null @@ -1,7 +0,0 @@ -import json - -with open('C:/Users/franc/Desktop/dataset/data.json') as json_file: - data = json.load(json_file) - - -print(data['data'][0]) \ No newline at end of file diff --git a/biodatasets/n2c2_2012/test_tmp.py b/biodatasets/n2c2_2012/test_tmp.py deleted file mode 100644 index 44f092ed..00000000 --- a/biodatasets/n2c2_2012/test_tmp.py +++ /dev/null @@ -1,353 +0,0 @@ -import tarfile -from collections import defaultdict, OrderedDict -import os -from unittest import skip -from lxml import etree -import xmltodict -import json - -""""" -def _read_tar_gz_old_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: 
- with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - root = etree.XML(content) - text, tags = root.getchildren() - samples[sample_id]["txt"] = text.text - samples[sample_id]["tags"] = {} - - for child in tags: - - - if child.tag == "EVENT": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["modality"] = child.get("modality") - samples[sample_id]["tags"][child.tag]["polarity"] = child.get("polarity") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - if child.tag == "TIMEx3": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - samples[sample_id]["tags"][child.tag]["val"] = child.get("val") - samples[sample_id]["tags"][child.tag]["mod"] = child.get("mod") - if child.tag == "TLINK": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["fromID"] = child.get("fromID") - samples[sample_id]["tags"][child.tag]["fromText"] = child.get("fromTExt") - samples[sample_id]["tags"][child.tag]["toID"] = child.get("toID") - samples[sample_id]["tags"][child.tag]["toText"] = child.get("toText") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - if child.tag == "SECTIME": - samples[sample_id]["tags"][child.tag]["id"] = child.get("id") - samples[sample_id]["tags"][child.tag]["start"] = child.get("start") - samples[sample_id]["tags"][child.tag]["end"] = child.get("end") - samples[sample_id]["tags"][child.tag]["text"] = child.get("text") - samples[sample_id]["tags"][child.tag]["type"] = child.get("type") - samples[sample_id]["tags"][child.tag]["dvalue"] = child.get("dvalue") - -""""" - -def _read_tar_gz_train_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["23.xml", "143.xml", "152.xml", "272.xml","382.xml","422.xml","547.xml","807.xml"]: - with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - values = xmltodict.parse(content) - samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] - - samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) - samples = samples_sorted - samples = json.loads(json.dumps(samples)) - - return samples - - """ - with open('C:/Users/franc/Desktop/result.json', 'w') as fp: - json.dump(samples, fp) - - - for i, event in enumerate(samples["1"]["TAGS"]["EVENT"]): - print(event["@id"]) - #print(samples["1"]["TAGS"]["EVENT"][event]) - - admission = {} - discharge = {} - for idx, sectime in enumerate(samples["1"]["TAGS"]["SECTIME"]): - if sectime["@type"] == "ADMISSION": - admission = { - "id": sectime["@id"], - "type": sectime["@type"], - "text": sectime["@text"], - "offsets": 
[(sectime["@start"], sectime["@end"])], - } - elif sectime["@type"] == "DISCHARGE": - discharge = { - "id": sectime["@id"], - "type": sectime["@type"], - "text": sectime["@text"], - "offsets": [(sectime["@start"], sectime["@end"])], - } - print(admission) - - - sample = samples["1"] - x = {"id": 1, "tags": { - "EVENT": sample["TAGS"]["EVENT"]} - } - print(x) - - for sample_id, sample in samples.items(): - print(sample) - print("/////") - print(sample_id) - print("-----------------------------------------------------------------------------") - - """ -######################################################################################################################## -def _read_tar_gz_test_(file_path, samples=None): - if samples is None: - samples = defaultdict(dict) - print(samples) - with tarfile.open(file_path, "r:gz") as tf: - for member in tf.getmembers(): - if member.name.startswith("ground_truth/merged_xml"): - - base, filename = os.path.split(member.name) - _, ext = os.path.splitext(filename) - ext = ext[1:] # get rid of dot - sample_id = filename.split(".")[0] - - if ext == "xml" and not filename in ["53.xml", "397.xml","527.xml","627.xml","687.xml","802.xml"]: - with tf.extractfile(member) as fp: - content_bytes = fp.read() - content = content_bytes.decode("utf-8").encode() - values = xmltodict.parse(content) - samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"] - - samples_sorted = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0]))) - samples = samples_sorted - - return samples - - #with open('C:/Users/franc/Desktop/result_test.json', 'w') as fp: - #json.dump(samples, fp) - -############################################################################################################################################################################ -def _get_events_from_sample(sample_id, sample): - events = [] - for idx, event in enumerate(sample["TAGS"]["EVENT"]): - - evs = { - "id": event["@id"], - "type": event["@type"], - "offsets": [(event["@start"], event["@end"])], - "text": event["@text"], - } - - events.append(evs) - print(events) - -def _get_source_sample(sample_id, sample): - output = { - "id": sample_id, - "text": sample["TEXT"], - "tags": sample["TAGS"], - } - return output - - -#_read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") -#_read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") - - -def test(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - #samples = _read_tar_gz_test_("C:/Users/franc/Desktop/2012-08-23.test-data.groundtruth.tar.gz") - - #print(samples) - _id = 0 - for sample_id, sample in samples.items(): - if sample.get("TAGS","").get("SECTIME","") == "": - print("empty") - else: - print(sample.get("TAGS","").get("SECTIME","")) - print(_id) - print(sample_id) - print("-----------------------------------------------------------------------------") - _id += 1 - - -def test_2_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - events = [] - - for idx, event in enumerate(sample.get("TAGS","").get("EVENT","")): - - evs = { - "id": event.get("@id",""), - "type": event.get("@type",""), - "trigger": { - "text": event.get("@text",""), - "offests": [(int(event.get("@start","")), int(event.get("@end","")))], - }, - "arguments": [ - { - "role": "NA", - "ref_id": "NA", - }, - ], - } - events.append(evs) - - 
print(events) - print("############################################################") - print("############################################################") - -def test_3_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - print(sample_id) - print(len(sample.get("TAGS","").get("SECTIME",""))) - - admission = [] - - if sample.get("TAGS","").get("SECTIME","") == "": - pass - elif len(sample.get("TAGS","").get("SECTIME","")) == 2: - for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): - if sectime.get("@type","") == "ADMISSION": - adm = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], - } - admission.append(adm) - else: - sectime = sample.get("TAGS","").get("SECTIME","") - if sectime.get("@type","") == "ADMISSION": - adm = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(int(sectime.get("@start","")), int(sectime.get("@end","")))], - } - admission.append(adm) - - print(admission) - print("############################################################") - - -def test_4_(): - samples = _read_tar_gz_train_("C:/Users/franc/Desktop/2012-07-15.original-annotation.release.tar.gz") - - for sample_id, sample in samples.items(): - print(sample_id) - print(len(sample.get("TAGS","").get("SECTIME",""))) - - discharge = [] - - if sample.get("TAGS","").get("SECTIME","") == "": - pass - elif len(sample.get("TAGS","").get("SECTIME","")) == 2: - for idx, sectime in enumerate(sample.get("TAGS","").get("SECTIME","")): - if sectime.get("@type","") == "DISCHARGE": - dis = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], - } - discharge.append(dis) - else: - sectime = sample.get("TAGS","").get("SECTIME","") - if sectime.get("@type","") == "DISCHARGE": - dis = { - "id": sectime.get("@id",""), - "type": sectime.get("@type",""), - "text": [sectime.get("@text","")], - "offsets": [(sectime.get("@start",""), sectime.get("@end",""))], - } - discharge.append(dis) - - print(discharge) - print("############################################################") - - - -test_3_() - -############################################################################################################################################################################################## -""" - if self.config.schema == "source": - features = Features( - { - "doc_id": Value("string"), - "text": Value("string"), - "entities":{ - "EVENT": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "modality": ClassLabel(names=["FACTUAL", "CONDITIONAL","POSSIBLE","PROPOSED"]), - "polarity": ClassLabel(names=["POS", "NEG"]), - "type": ClassLabel(names=["TEST","PROBLEM","TREATMENT","CLINICAL_DEPT","EVIDENTIAL","OCCURRENCE"]), - }), - "TIMEX3": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "type": ClassLabel(names=["DATE", "TIME","DURATION","FREQUENCY"]), - "val": Value("string"), - "mod": ClassLabel(names=["NA","MORE","LESS","APPROX","START","END","MIDDLE"]), - }), - "TLINK": Sequence({"id": Value("string"), - "fromID": Value("string"), - "fromText": Value("string"), - "toID": Value("string"), - 
"toText": Value("string"), - "type": ClassLabel(names=["BEFORE","AFTER","SIMULTANEOUS","OVERLAP","BEGUN_BY","DURING","BEFORE_OVERLAP"]), - }), - "SECTIME": Sequence({"id": Value("string"), - "start": Value("int64"), - "end": Value("int64"), - "text": Value("string"), - "type": ClassLabel(names=["ADMISSION","DISCHARGE"]), - "dvalue": Value("string"), - }), - } - } - ) - -""" \ No newline at end of file diff --git a/biodatasets/why_qa/test.py b/biodatasets/why_qa/test.py deleted file mode 100644 index 95a9fef6..00000000 --- a/biodatasets/why_qa/test.py +++ /dev/null @@ -1,27 +0,0 @@ -import zipfile -import json - -def read_zip_file(file_path): - with zipfile.ZipFile(file_path) as zf: - with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: - dataset = json.load(f) - return dataset - - -dataset = read_zip_file("C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") - -samples = dataset['data'][0]['paragraphs'] - -for sample in samples: - print(sample["qas"]) - print("######################################") - - - - - -# for sample in samples: -# for qa in sample['qas']: -# print(qa['id']) -# print(sample['note_id']) -# print("######################################") From 0df92942e5079907f08767678248c87559dbf3c7 Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 01:43:21 +0800 Subject: [PATCH 4/6] Delete tmp files --- biodatasets/emrQA/emrQA.py | 271 ------------------------------------- tmp_TEST.py | 4 - 2 files changed, 275 deletions(-) delete mode 100644 biodatasets/emrQA/emrQA.py delete mode 100644 tmp_TEST.py diff --git a/biodatasets/emrQA/emrQA.py b/biodatasets/emrQA/emrQA.py deleted file mode 100644 index 4f5c0166..00000000 --- a/biodatasets/emrQA/emrQA.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. 
- -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) -""" - -import os -from typing import List, Tuple, Dict - -import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@article{, - author = {}, - title = {}, - journal = {}, - volume = {}, - year = {}, - url = {}, - doi = {}, - biburl = {}, - bibsource = {} -} -""" - -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer -_DATASETNAME = "[dataset_name]" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This dataset is designed for XXX NLP task. -""" - -# TODO: Add a link to an official homepage for the dataset here (if possible) -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here (if possible) -# Note that this doesn't have to be a common open source license. -# Some datasets have custom licenses. In this case, simply put the full license terms -# into `_LICENSE` -_LICENSE = "" - -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. - -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and bigbio config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) -_URLS = { - _DATASETNAME: "url or list of urls or ... ", -} - -# TODO: add supported task by dataset. One dataset may support multiple tasks -_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] - -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. -_SOURCE_VERSION = "" - -_BIGBIO_VERSION = "1.0.0" - - -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - # You will be able to load the "source" or "bigbio" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and BigBio; - # If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. 
bioasq10b_(source|bigbio)_[bigbio_schema_name] - # - version: option = (SOURCE_VERSION|BIGBIO_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|bigbio_[bigbio_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b) - # where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) - - BUILDER_CONFIGS = [ - BigBioConfig( - name="[dataset_name]_source", - version=SOURCE_VERSION, - description="[dataset_name] source schema", - schema="source", - subset_id="[dataset_name]", - ), - BigBioConfig( - name="[dataset_name]_bigbio_[bigbio_schema_name]", - version=BIGBIO_VERSION, - description="[dataset_name] BigBio schema", - schema="bigbio_[bigbio_schema_name]", - subset_id="[dataset_name]", - ), - ] - - DEFAULT_CONFIG_NAME = "[dataset_name]_source" - - def _info(self) -> datasets.DatasetInfo: - - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. - # For iterables, use lists over tuples or `datasets.Sequence` - - if self.config.schema == "source": - # TODO: Create your source schema here - raise NotImplementedError() - - # EX: Arbitrary NER type dataset - # features = datasets.Features( - # { - # "doc_id": datasets.Value("string"), - # "text": datasets.Value("string"), - # "entities": [ - # { - # "offsets": [datasets.Value("int64")], - # "text": datasets.Value("string"), - # "type": datasets.Value("string"), - # "entity_id": datasets.Value("string"), - # } - # ], - # } - # ) - - # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. - - # For example bigbio_kb, bigbio_t2t - elif self.config.schema == "bigbio_[bigbio_schema_name]": - # e.g. features = schemas.kb_features - # TODO: Choose your big-bio schema here - raise NotImplementedError() - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name - - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. 
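To make the comment above concrete, a small sketch of the nested-URL behaviour inside _split_generators; the URLs are placeholders, not real dataset locations:

    # download_and_extract mirrors whatever structure it receives, so a dict
    # of per-split URLs comes back as a dict of per-split local paths.
    urls = {
        "train": "https://example.org/my_dataset/train.jsonl",
        "dev": "https://example.org/my_dataset/dev.jsonl",
        "test": "https://example.org/my_dataset/test.jsonl",
    }
    paths = dl_manager.download_and_extract(urls)
    # paths["train"], paths["dev"] and paths["test"] are now local file paths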
- - # TODO: KEEP if your dataset is PUBLIC; remove if not - urls = _URLS[_DATASETNAME] - data_dir = dl_manager.download_and_extract(urls) - - # TODO: KEEP if your dataset is LOCAL; remove if NOT - if self.config.data_dir is None: - raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") - else: - data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": os.path.join(data_dir, "train.jsonl"), - "split": "train", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "test.jsonl"), - "split": "test", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "dev.jsonl"), - "split": "dev", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. - - def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - - # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. - - # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files - - if self.config.schema == "source": - # TODO: yield (key, example) tuples in the original dataset schema - for key, example in thing: - yield key, example - - elif self.config.schema == "bigbio_[bigbio_schema_name]": - # TODO: yield (key, example) tuples in the bigbio schema - for key, example in thing: - yield key, example - - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR -if __name__ == "__main__": - datasets.load_dataset(__file__) diff --git a/tmp_TEST.py b/tmp_TEST.py deleted file mode 100644 index 2e65fb4d..00000000 --- a/tmp_TEST.py +++ /dev/null @@ -1,4 +0,0 @@ -from datasets import load_dataset - -data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_bigbio_qa", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") -#data = load_dataset("biodatasets/why_qa/why_qa.py", name="why_qa_source", data_dir = "C:/Users/franc/Desktop/n2c2-community-annotations_2010-fan-why-QA.zip") \ No newline at end of file From 945dfda0995260dba28ca0f14c238b76a4ffeb9c Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 02:01:16 +0800 Subject: [PATCH 5/6] Delete why_qa.py --- biodatasets/why_qa/why_qa.py | 217 ----------------------------------- 1 file changed, 217 deletions(-) delete mode 100644 biodatasets/why_qa/why_qa.py diff --git a/biodatasets/why_qa/why_qa.py b/biodatasets/why_qa/why_qa.py deleted file mode 100644 index 4c33d50a..00000000 --- a/biodatasets/why_qa/why_qa.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 
-# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A dataset loader for the n2c2 community-annotated Why Questions dataset. - -https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/ - -The dataset consists of a single archive (no splits) and it is available -as a JSON file and as an XLSX file: - - - relations_whyqa_ann-v7-share.json (in SQUAD 2.0 format) - - relations_whyqa_ann-v7-share.xlsx - -The dataset also includes TXT files with the full texts of the -clinical notes. - -The files comprising this dataset must be on the users local machine -in a single directory that is passed to `datasets.load_dataset` via -the `data_dir` kwarg. This loader script will read the archive files -directly (i.e. the user should not uncompress, untar or unzip any of -the files). - -Registration AND submission of DUA is required to access the dataset. - -[bigbio_schema_name] = qa -""" - -import os -import zipfile -import json -from collections import defaultdict -from typing import List, Tuple, Dict - -import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@inproceedings{, - author = {Annotating and Characterizing Clinical Sentences with Explicit Why-{QA} Cues}, - title = {Fan, Jungwei}, - booktitle = {Proceedings of the 2nd Clinical Natural Language Processing Workshop}, - month = {jun}, - year = {2019}, - address = {Minneapolis, Minnesota, USA}, - publisher = {Association for Computational Linguistics}, - url = {https://aclanthology.org/W19-1913}, - doi = {10.18653/v1/W19-1913} - -} - -} -""" - -_DATASETNAME = "[why_qa]" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ - -This dataset is a collection of why-questions and their answers generated -from a corpus of clincal notes. The corpus is the 2010 i2b2/VA NLP -challenge and consists of 426 discharge summaries from Partners -Healthcare and Beth Israel Deaconess Medical Center. 
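For orientation, the JSON this loader reads is SQuAD-2.0-shaped. Reconstructed from the source schema and _generate_examples below, one paragraph record looks roughly like the following; every value is an invented placeholder:

    # Approximate shape of one entry in dataset["data"][0]["paragraphs"].
    example_paragraph = {
        "note_id": "note-0001",                                    # placeholder id
        "context": "...full text of the discharge summary...",
        "qas": [
            {
                "id": "note-0001-q1",
                "question_template": "why was <treatment> given",  # placeholder
                "question": "why was the medication given",        # placeholder
                "answers": [{"text": "answer span", "answer_start": 123}],
                "is_impossible": False,
            }
        ],
    }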
- -""" -_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/" - -_LICENSE = "External Data User Agreement" - -_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] - -_SOURCE_VERSION = "1.0.0" - -_BIGBIO_VERSION = "1.0.0" - -def read_zip_file(file_path): - with zipfile.ZipFile(file_path) as zf: - with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f: - dataset = json.load(f) - return dataset - -def _get_samples(dataset): - samples = dataset['data'][0]['paragraphs'] - return samples - -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case -# Append "Dataset" to the class name: BioASQ --> BioasqDataset -class WhyQaDataset(datasets.GeneratorBasedBuilder): - """n2c2 community-annotated Why Questions dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - - BUILDER_CONFIGS = [ - BigBioConfig( - name="why_qa_source", - version=SOURCE_VERSION, - description="why_qa source schema", - schema="source", - subset_id="why_qa", - ), - BigBioConfig( - name="why_qa_bigbio_qa", - version=BIGBIO_VERSION, - description="why_wa BigBio schema", - schema="bigbio_qa", - subset_id="why_qa", - ), - ] - - DEFAULT_CONFIG_NAME = "why_qa_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features( - - { - "note_id": datasets.Value("string"), - "qas": [ - {"question_template": datasets.Value("string"), - "question": datasets.Value("string"), - "id": datasets.Value("string"), - "answers": [ - {"text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - }, - ], - "is_impossible": datasets.Value("bool"), - }, - ], - "context": datasets.Value("string"), - }, - ) - - elif self.config.schema == "bigbio_qa": - features = schemas.qa_features - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - - if self.config.data_dir is None: - raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") - else: - data_dir = self.config.data_dir - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={ - "data_dir": data_dir, - "split": "train", - }, - ), - ] - - def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - dataset = read_zip_file(data_dir) - samples = _get_samples(dataset) - - if self.config.schema == "source": - _id = 0 - for sample in samples: - yield _id, sample - _id += 1 - - elif self.config.schema == "bigbio_[bigbio_schema_name]": - _id = 0 - for sample in samples: - for qa in sample['qas']: - ans_list = [] - for answer in qa["answer"]: - ans = answer["text"] - ans_list.append(ans) - bigbio_sample = { - "id" : qa["note_id"], - "question_id" : qa["id"], - "document_id" : sample["note_id"], - "question" : qa["question"], - "type" : qa["question_template"], - "choices" : [], - "context" : sample["context"], - "answer" : ans_list, - } - yield _id, bigbio_sample - _id += 1 - - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py From cceaaf951e4b5496669f4cfd58c5c45bcc32163a Mon Sep 17 00:00:00 2001 From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com> Date: Mon, 6 Jun 2022 09:51:54 +0800 Subject: [PATCH 6/6] Update n2c2_2012.py --- biodatasets/n2c2_2012/n2c2_2012.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/biodatasets/n2c2_2012/n2c2_2012.py b/biodatasets/n2c2_2012/n2c2_2012.py index 1248574a..76b38f17 100644 --- a/biodatasets/n2c2_2012/n2c2_2012.py +++ b/biodatasets/n2c2_2012/n2c2_2012.py @@ -52,6 +52,8 @@ from utils.configs import BigBioConfig from utils.constants import Tasks +_LOCAL = True + _CITATION = """\ @article{, author = {
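With _LOCAL = True added, n2c2_2012 is loaded the same way the deleted tmp_TEST.py loaded why_qa: by pointing data_dir at the locally stored archives. A sketch, assuming the builder configs follow the <dataset_name>_source naming convention used elsewhere in this PR; the path is a placeholder:

    from datasets import load_dataset

    ds = load_dataset(
        "biodatasets/n2c2_2012/n2c2_2012.py",
        name="n2c2_2012_source",                  # assumed config name
        data_dir="/path/to/n2c2_2012_archives",   # placeholder local directory
    )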