diff --git a/biodatasets/n2c2_2012/n2c2_2012.py b/biodatasets/n2c2_2012/n2c2_2012.py
new file mode 100644
index 00000000..76b38f17
--- /dev/null
+++ b/biodatasets/n2c2_2012/n2c2_2012.py
@@ -0,0 +1,492 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 2012 temporal relation dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of one training archive file and one annotated test archive file:
+
+* 2012-07-15.original-annotation.release.tar.gz (complete training dataset)
+* 2012-08-23.test-data.groundtruth.tar.gz (annotated, complete test dataset)
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script will read the archive files
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+NOTE: The following XML files are not well formed and have been excluded from
+the dataset: "23.xml", "53.xml", "143.xml", "152.xml", "272.xml", "382.xml",
+"397.xml", "422.xml", "527.xml", "547.xml", "627.xml", "687.xml", "802.xml",
+"807.xml".
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = kb
+"""
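+# A minimal usage sketch (illustrative only, not executed by the loader; the
+# data_dir value is an assumption and must point at the directory holding the
+# two archives listed above):
+#
+#     from datasets import load_dataset
+#
+#     ds = load_dataset(
+#         "biodatasets/n2c2_2012/n2c2_2012.py",
+#         name="n2c2_2012_bigbio_kb",
+#         data_dir="/path/to/archives",
+#     )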
+
+import json
+import os
+import tarfile
+from collections import OrderedDict, defaultdict
+from typing import Dict, List, Tuple
+
+import datasets
+import xmltodict
+from datasets import Features, Sequence, Value
+
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_LOCAL = True
+
+_CITATION = """\
+@article{sun2013evaluating,
+  author = {
+    Sun, Weiyi and
+    Rumshisky, Anna and
+    Uzuner, Ozlem},
+  title = {Evaluating temporal relations in clinical text: 2012 i2b2 Challenge},
+  journal = {Journal of the American Medical Informatics Association},
+  volume = {20},
+  number = {5},
+  pages = {806-813},
+  year = {2013},
+  month = {09},
+  url = {https://doi.org/10.1136/amiajnl-2013-001628},
+  doi = {10.1136/amiajnl-2013-001628},
+  eprint = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3756273/pdf/amiajnl-2013-001628.pdf}
+}
+"""
+
+_DATASETNAME = "n2c2_2012"
+
+_DESCRIPTION = """\
+This dataset is designed for the 2012 i2b2 temporal relations challenge task.
+
+The text annotated for this challenge comes from de-identified discharge summaries. The goal of
+the annotation is to mark up temporal information present in clinical text in order to enable
+reasoning and queries over the timeline of clinically relevant events for each patient.
+
+This annotation involves marking up three kinds of information:
+1) events,
+2) temporal expressions, and
+3) temporal relations between events and temporal expressions.
+
+The latter would involve:
+1) anchoring events to available temporal expressions, and
+2) identifying temporal relations between events.
+
+The first task is to identify all clinically relevant events and situations, including symptoms,
+tests, procedures, and other occurrences. The second task is to identify temporal expressions,
+which include all expressions related to time, such as dates, times, frequencies, and durations.
+Events and temporal expressions have a number of attributes (such as the type of event or the
+calendar value of the temporal expression) that need to be annotated. The final task is to record
+the temporal relations (e.g. before, after, simultaneous, etc.) that hold between different events
+or between events and temporal expressions.
+"""
+
+_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/"
+
+_LICENSE = "External Data User Agreement"
+
+_SUPPORTED_TASKS = [Tasks.EVENT_EXTRACTION, Tasks.RELATION_EXTRACTION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_BIGBIO_VERSION = "1.0.0"
+
+
+def _read_tar_gz_train_(file_path, samples=None):
+    if samples is None:
+        samples = defaultdict(dict)
+    with tarfile.open(file_path, "r:gz") as tf:
+        for member in tf.getmembers():
+
+            _, filename = os.path.split(member.name)
+            _, ext = os.path.splitext(filename)
+            ext = ext[1:]  # get rid of the dot
+            sample_id = filename.split(".")[0]
+
+            # skip XML files that are not well formed (see module docstring)
+            if ext == "xml" and filename not in ["23.xml", "143.xml", "152.xml", "272.xml", "382.xml", "422.xml", "547.xml", "807.xml"]:
+                with tf.extractfile(member) as fp:
+                    content = fp.read().decode("utf-8")
+                values = xmltodict.parse(content)
+                samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"]
+
+    samples = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0])))
+    # round-trip through JSON to turn xmltodict's nested OrderedDicts into plain dicts
+    samples = json.loads(json.dumps(samples))
+
+    return samples
+
+
+def _read_tar_gz_test_(file_path, samples=None):
+    if samples is None:
+        samples = defaultdict(dict)
+    with tarfile.open(file_path, "r:gz") as tf:
+        for member in tf.getmembers():
+            if member.name.startswith("ground_truth/merged_xml"):
+
+                _, filename = os.path.split(member.name)
+                _, ext = os.path.splitext(filename)
+                ext = ext[1:]  # get rid of the dot
+                sample_id = filename.split(".")[0]
+
+                # skip XML files that are not well formed (see module docstring)
+                if ext == "xml" and filename not in ["53.xml", "397.xml", "527.xml", "627.xml", "687.xml", "802.xml"]:
+                    with tf.extractfile(member) as fp:
+                        content = fp.read().decode("utf-8")
+                    values = xmltodict.parse(content)
+                    samples[sample_id] = values["ClinicalNarrativeTemporalAnnotation"]
+
+    samples = OrderedDict(sorted(samples.items(), key=lambda x: int(x[0])))
+    # round-trip through JSON to turn xmltodict's nested OrderedDicts into plain dicts
+    samples = json.loads(json.dumps(samples))
+
+    return samples
+
+
+def _get_events_from_sample(sample_id, sample):
+    events = []
+    for event in sample.get("TAGS", {}).get("EVENT", []):
+
+        evs = {
+            "id": event.get("@id", ""),
+            "type": event.get("@type", ""),
+            "trigger": {
+                "text": [event.get("@text", "")],
+                "offsets": [(int(event.get("@start", "")), int(event.get("@end", "")))],
+            },
+            "arguments": [
+                {
+                    "role": [],
+                    "ref_id": [],
+                },
+            ],
+        }
+        events.append(evs)
+    return events
+
+
+def _get_entities_from_sample(sample_id, sample):
+    entities = []
+    for timex3 in sample.get("TAGS", {}).get("TIMEX3", []):
+
+        entity = {
+            "id": timex3.get("@id", ""),
+            "type": timex3.get("@type", ""),
+            "offsets": [(int(timex3.get("@start", "")), int(timex3.get("@end", "")))],
+            "text": [timex3.get("@text", "")],
+            "normalized": [],
+        }
+
+        entities.append(entity)
+
+    return entities
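+
+# The helpers above and below consume dicts produced by xmltodict. For
+# reference, an illustrative (made-up, not taken from the corpus) shape of
+# one parsed EVENT entry:
+#
+#     {"@id": "E0", "@start": "1", "@end": "10", "@text": "Admission",
+#      "@modality": "FACTUAL", "@polarity": "POS", "@type": "OCCURRENCE"}
+#
+# TIMEX3 and TLINK entries follow the same @-attribute convention, with the
+# fields declared in the source schema in _info below.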
+
+
+def _get_relations_from_sample(sample_id, sample):
+
+    relations = []
+    for tlink in sample.get("TAGS", {}).get("TLINK", []):
+
+        rel = {
+            "id": tlink.get("@id"),
+            "type": tlink.get("@type"),
+            "arg1_id": tlink.get("@fromID"),
+            "arg2_id": tlink.get("@toID"),
+            "normalized": [],
+        }
+
+        relations.append(rel)
+
+    return relations
+
+
+def _get_admission_from_sample(sample_id, sample):
+
+    admission = {}
+    sectimes = sample.get("TAGS", {}).get("SECTIME", "")
+
+    # When admission information was missing, an empty placeholder was added with id S0
+    if sectimes == "":
+        admission = {
+            "id": "S0",
+            "type": "ADMISSION",
+            "text": [],
+            "offsets": [],
+        }
+
+    # repeated SECTIME tags parse to a list of dicts ...
+    elif isinstance(sectimes, list):
+        for sectime in sectimes:
+            if sectime.get("@type", "") == "ADMISSION":
+                admission = {
+                    "id": sectime.get("@id", ""),
+                    "type": sectime.get("@type", ""),
+                    "text": [sectime.get("@text", "")],
+                    "offsets": [(int(sectime.get("@start", "")), int(sectime.get("@end", "")))],
+                }
+
+    # ... and a single SECTIME tag to a bare dict
+    else:
+        sectime = sectimes
+        if sectime.get("@type", "") == "ADMISSION":
+            admission = {
+                "id": sectime.get("@id", ""),
+                "type": sectime.get("@type", ""),
+                "text": [sectime.get("@text", "")],
+                "offsets": [(int(sectime.get("@start", "")), int(sectime.get("@end", "")))],
+            }
+
+    return admission
+
+
+def _get_discharge_from_sample(sample_id, sample):
+
+    discharge = {}
+    sectimes = sample.get("TAGS", {}).get("SECTIME", "")
+
+    # When discharge information was missing, an empty placeholder was added with id S1
+    if sectimes == "":
+        discharge = {
+            "id": "S1",
+            "type": "DISCHARGE",
+            "text": [],
+            "offsets": [],
+        }
+
+    # repeated SECTIME tags parse to a list of dicts ...
+    elif isinstance(sectimes, list):
+        for sectime in sectimes:
+            if sectime.get("@type", "") == "DISCHARGE":
+                discharge = {
+                    "id": sectime.get("@id", ""),
+                    "type": sectime.get("@type", ""),
+                    "text": [sectime.get("@text", "")],
+                    "offsets": [(int(sectime.get("@start", "")), int(sectime.get("@end", "")))],
+                }
+
+    # ... and a single SECTIME tag to a bare dict
+    else:
+        sectime = sectimes
+        if sectime.get("@type", "") == "DISCHARGE":
+            discharge = {
+                "id": sectime.get("@id", ""),
+                "type": sectime.get("@type", ""),
+                "text": [sectime.get("@text", "")],
+                "offsets": [(int(sectime.get("@start", "")), int(sectime.get("@end", "")))],
+            }
+
+    return discharge
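+
+
+# Illustrative (made-up) shape of one parsed SECTIME tag as consumed by the
+# two helpers above:
+#
+#     {"@id": "S0", "@start": "18", "@end": "28", "@text": "2012-07-15",
+#      "@type": "ADMISSION", "@dvalue": "2012-07-15"}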
"n2c2_2012_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + features = Features( + { + "sample_id": Value("string"), + "text": Value("string"), + "tags":{ + "EVENT": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@modality": Value("string"), + "@polarity": Value("string"), + "@type": Value("string"), + }), + "TIMEX3": Sequence({"@id": Value("string"), + "@start": Value("int64"), + "@end": Value("int64"), + "@text": Value("string"), + "@type": Value("string"), + "@val": Value("string"), + "@mod": Value("string"), + }), + "TLINK": Sequence({"@id": Value("string"), + "@fromID": Value("string"), + "@fromText": Value("string"), + "@toID": Value("string"), + "@toText": Value("string"), + "@type": Value("string"), + }), + "SECTIME": Sequence({"@id": Value("string"), + "@start": Value("string"), + "@end": Value("string"), + "@text": Value("string"), + "@type": Value("string"), + "@dvalue": Value("string"), + }), + } + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. 
+
+    @staticmethod
+    def _get_source_sample(sample_id, sample):
+        tags = sample.get("TAGS", {})
+        sectime = tags.get("SECTIME", "")
+        return {
+            "sample_id": sample_id,
+            "text": sample.get("TEXT", ""),
+            "tags": {
+                "EVENT": tags.get("EVENT", ""),
+                "TIMEX3": tags.get("TIMEX3", ""),
+                "TLINK": tags.get("TLINK", ""),
+                # documents without section-time information get an empty list
+                "SECTIME": [] if sectime == "" else sectime,
+            },
+        }
+
+    @staticmethod
+    def _get_bigbio_sample(sample_id, sample):
+
+        passage_text = sample.get("TEXT", "")
+        events = _get_events_from_sample(sample_id, sample)
+        entities = _get_entities_from_sample(sample_id, sample)
+        relations = _get_relations_from_sample(sample_id, sample)
+        admission = _get_admission_from_sample(sample_id, sample)
+        discharge = _get_discharge_from_sample(sample_id, sample)
+
+        return {
+            "id": sample_id,
+            "document_id": sample_id,
+            "passages": [
+                {
+                    "id": f"{sample_id}-full-passage",
+                    "type": "Clinical Narrative Temporal Annotation",
+                    "text": [passage_text],
+                    "offsets": [(0, len(passage_text))],
+                },
+                admission,
+                discharge,
+            ],
+            "events": events,
+            "entities": entities,
+            "relations": relations,
+            "coreferences": [],
+        }
+
+    def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        if split == "train":
+            file_path = os.path.join(data_dir, "2012-07-15.original-annotation.release.tar.gz")
+            samples = _read_tar_gz_train_(file_path)
+        elif split == "test":
+            file_path = os.path.join(data_dir, "2012-08-23.test-data.groundtruth.tar.gz")
+            samples = _read_tar_gz_test_(file_path)
+
+        _id = 0
+        for sample_id, sample in samples.items():
+            if self.config.schema == "source":
+                yield _id, self._get_source_sample(sample_id, sample)
+            elif self.config.schema == "bigbio_kb":
+                yield _id, self._get_bigbio_sample(sample_id, sample)
+            _id += 1
+
+
+# This template is based on the following template from the datasets package:
+# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
\ No newline at end of file