From ae433ec8078bc47d8f2cfca1251d054253afe450 Mon Sep 17 00:00:00 2001
From: Terazus
Date: Tue, 19 Mar 2024 11:49:32 +0000
Subject: [PATCH 1/7] Refactor isatab loader

---
 isatools/isatab/load/core.py | 656 +++++++++++++++++++++++++++++++-
 isatools/isatab/load/mapping.py | 62 +++
 isatools/isatab/load/read.py | 8 +-
 3 files changed, 707 insertions(+), 19 deletions(-)
 create mode 100644 isatools/isatab/load/mapping.py

diff --git a/isatools/isatab/load/core.py b/isatools/isatab/load/core.py
index d953c385..74f9bf87 100644
--- a/isatools/isatab/load/core.py
+++ b/isatools/isatab/load/core.py
@@ -1,8 +1,14 @@
+from __future__ import annotations
+from typing import TextIO
+from io import StringIO
+
+from abc import ABCMeta, abstractmethod
+
 from os import path
 from glob import glob
 from re import compile

-from pandas import merge, read_csv
+from pandas import merge, read_csv, DataFrame, Series
 from numpy import nan

 from isatools.utils import utf8_text_file_open
@@ -20,9 +26,629 @@
 Study,
 StudyFactor,
 Protocol,
+ Process,
 ProtocolParameter,
 Assay
 )
+from .mapping import investigation_sections_mapping, get_investigation_base_output, study_sections_mapping
+
+
+class ISATabReader:
+ """ A class to read an ISA-Tab investigation file into a dictionary of DataFrames
+
+ :param fp: A file-like buffer object of the investigation file
+ """
+
+ def __init__(self, fp: TextIO) -> None:
+ """ Constructor for the ISATabReader class """
+ self.memory_file: TextIO = fp
+ self.dataframe_dict: dict[str, DataFrame | str | list[DataFrame]] = {}
+
+ def __del__(self) -> None:
+ """ Destructor for the ISATabReader class """
+ self.memory_file.close()
+
+ @property
+ def memory_file(self) -> TextIO:
+ """ A file-like buffer object
+
+ :return: A file-like buffer object
+ """
+ return self.__memory_file
+
+ @memory_file.setter
+ def memory_file(self, fp: TextIO) -> None:
+ """ Reads the input file into memory, stripping out comments and sets the memory_file property
+
+ :param fp: A file-like buffer object
+ """
+ memory_file: StringIO = StringIO()
+ line: bool | str = True
+ while line:
+ line = fp.readline()
+ if not line.lstrip().startswith('#'):
+ memory_file.write(line)
+ memory_file.seek(0)
+ self.__memory_file = memory_file
+
+ def __peek(self) -> str:
+ """Peek at the next line without moving to the next line. This function
+ get the position of the next line, reads the next line, then resets the
+ file pointer to the original position
+
+ :return: The next line past the current line
+ """
+ position: int = self.memory_file.tell()
+ line: str = self.memory_file.readline()
+ self.memory_file.seek(position)
+ return line
+
+ def __read_tab_section(self, sec_key: str, next_sec_key: str) -> StringIO:
+ """Slices a file by section delimited by section keys
+
+ :param sec_key: Delimiter key of beginning of section
+ :param next_sec_key: Delimiter key of end of section
+ :return: A memory file of the section slice, as a string buffer object
+ """
+ fileline: str = self.memory_file.readline()
+ normed_line: str = fileline.rstrip().strip('"')
+ memory_file: StringIO = StringIO()
+
+ if not normed_line == sec_key:
+ raise IOError(f"Expected: {sec_key} section, but got: {normed_line}")
+ while not self.__peek().rstrip() == next_sec_key:
+ fileline = self.memory_file.readline()
+ if not fileline:
+ break
+ memory_file.write(fileline.rstrip() + '\n')
+ memory_file.seek(0)
+ return memory_file
+
+ def __build_section_df(self, current_section_key: str, next_section_key: str) -> DataFrame:
+ """Reads a file section into a DataFrame
+
+ :param current_section_key: Name of the current section
+ :param next_section_key: Name of the next section
+ :return: A DataFrame corresponding to the file section
+ """
+ file_handler: StringIO = self.__read_tab_section(sec_key=current_section_key, next_sec_key=next_section_key)
+ df: DataFrame = read_csv(
+ filepath_or_buffer=file_handler,
+ sep='\t',
+ engine='python',
+ encoding='utf-8'
+ ).dropna(axis=1, how='all').T
+ # Strip out the nan entries
+ df.replace(nan, '', regex=True, inplace=True)
+ # Reset study_index so it is accessible as column
+ df.reset_index(inplace=True)
+ # If all was OK, promote this row to the column headers
+ df.columns = df.iloc[0]
+ # Return the re-indexed DataFrame
+ return df.reindex(df.index.drop(0))
+
+ def run(self) -> dict[str, DataFrame | str | list[DataFrame]]:
+ """ Main method to run the ISATabReader and return the dictionary of DataFrames
+
+ :return: A dictionary holding a set of DataFrames for each section of the investigation file
+ """
+ # Make a copy of the base output to avoid modifying the original
+ output: dict[str, DataFrame | str | list] = {**get_investigation_base_output()}
+ for section, section_keys in investigation_sections_mapping.items():
+ output[section] = self.__build_section_df(**section_keys)
+ while self.__peek():
+ for section, section_keys in study_sections_mapping.items():
+ output[section].append(self.__build_section_df(**section_keys))
+ return output
+
+
+class ISATabLoaderMixin(metaclass=ABCMeta):
+ """ A mixin to provide modeling for the ISATab loaders. 
Provides shared methods and attributes and implementations + """ + + ontology_source_map: dict + skip_load_tables: bool = False + filepath: str + + def __get_ontology_source(self, term_source_ref) -> OntologySource | None: + """ Small wrapper to return an ontology source from the map or None if not found + + :param term_source_ref: The term source reference + :return: An OntologySource object or None + """ + return self.ontology_source_map[term_source_ref] if term_source_ref else None + + def get_contacts(self, contact_dataframe: DataFrame) -> list[Person]: + """Get a list of Person objects from the relevant investigation file + section + + :param contact_dataframe: A CONTACTS section DataFrame + :return: A list of Person objects + """ + contacts: list[Person] = [] + prefix: str + + if 'Investigation Person Last Name' in contact_dataframe.columns: + prefix = 'Investigation ' + elif 'Study Person Last Name' in contact_dataframe.columns: + prefix = 'Study ' + else: + raise KeyError + + for current_row in contact_dataframe.to_dict(orient='records'): + person: Person = Person( + last_name=current_row[prefix + 'Person Last Name'], + first_name=current_row[prefix + 'Person First Name'], + mid_initials=current_row[prefix + 'Person Mid Initials'], + email=current_row[prefix + 'Person Email'], + phone=current_row[prefix + 'Person Phone'], + fax=current_row[prefix + 'Person Fax'], + address=current_row[prefix + 'Person Address'], + affiliation=current_row[prefix + 'Person Affiliation'] + ) + person.roles = self.get_ontology_annotations( + vals=current_row[prefix + 'Person Roles'], + accessions=current_row[prefix + 'Person Roles Term Accession Number'], + ts_refs=current_row[prefix + 'Person Roles Term Source REF'] + ) + person.comments = self.get_comments_row(contact_dataframe.columns, current_row) + contacts.append(person) + + return contacts + + @staticmethod + def get_comments(section_df: DataFrame) -> list[Comment]: + """Get Comments from a section DataFrame + + :param section_df: A section DataFrame + :return: A list of Comment objects as found in the section + """ + comments: list[Comment] = [] + for col in [x for x in section_df.columns if _RX_COMMENT.match(str(x))]: + for _, current_row in section_df.iterrows(): + comments.append(Comment(name=next(iter(_RX_COMMENT.findall(col))), value=current_row[col])) + return comments + + @staticmethod + def get_comments_row(cols, row) -> list[Comment]: + """Get Comments in a given DataFrame row + + :param cols: List of DataFrame columns + :param row: DataFrame row as a Series object + :return: A list of Comment objects + """ + comments: list[Comment] = [] + for col in [x for x in cols if _RX_COMMENT.match(str(x))]: + comments.append(Comment(name=next(iter(_RX_COMMENT.findall(col))), value=row[col])) + return comments + + def get_ontology_annotation(self, val, accession, ts_ref) -> OntologyAnnotation | None: + """Gets a OntologyAnnotation for a give value, accession and + term source REF + + :param val: Value of the OA + :param accession: Term Accession Number of the OA + :param ts_ref: Term Source REF of the OA + :return: An OntologyAnnotation object + """ + if val == '' and accession == '': + return None + return OntologyAnnotation( + term=val, + term_accession=accession, + term_source=self.__get_ontology_source(ts_ref) + ) + + def get_ontology_annotations(self, vals, accessions, ts_refs) -> list[OntologyAnnotation]: + """ Gets a list of OntologyAnnotations from semicolon delimited lists + + :param vals: A list of values, separated by semi-colons + 
:param accessions: A list of accessions, separated by semi-colons
+ :param ts_refs: A list of term source REFs, separated by semi-colons
+ :return: A list of OntologyAnnotation objects
+ """
+ ontology_annotations: list[OntologyAnnotation] = []
+ accession_split: list[str] = accessions.split(';')
+ ts_refs_split: list[str] = ts_refs.split(';')
+
+ # if no acc or ts_refs
+ if accession_split == [''] and ts_refs_split == ['']:
+ for val in vals.split(';'):
+ ontology_annotations.append(OntologyAnnotation(term=val))
+ else:
+ for index, val in enumerate(vals.split(';')):
+ ontology_annotation: OntologyAnnotation | None = self.get_ontology_annotation(
+ val=val, accession=accession_split[index], ts_ref=ts_refs_split[index]
+ )
+ if ontology_annotation:
+ ontology_annotations.append(ontology_annotation)
+ return ontology_annotations
+
+ def get_publications(self, section_df) -> list[Publication]:
+ publications: list[Publication] = []
+ prefix: str
+
+ if 'Investigation PubMed ID' in section_df.columns:
+ prefix = 'Investigation '
+ elif 'Study PubMed ID' in section_df.columns:
+ prefix = 'Study '
+ else:
+ raise KeyError
+
+ for _, current_row in section_df.iterrows():
+ publication: Publication = Publication(
+ pubmed_id=current_row[prefix + 'PubMed ID'],
+ doi=current_row[prefix + 'Publication DOI'],
+ author_list=current_row[prefix + 'Publication Author List'],
+ title=current_row[prefix + 'Publication Title']
+ )
+ publication.status = self.get_ontology_annotation(
+ current_row[prefix + 'Publication Status'],
+ current_row[prefix + 'Publication Status Term Accession Number'],
+ current_row[prefix + 'Publication Status Term Source REF'])
+ publication.comments = self.get_comments_row(section_df.columns, current_row)
+ publications.append(publication)
+ return publications
+
+ @abstractmethod
+ def load(self, **kwargs):
+ raise NotImplementedError
+
+
+class ISATabLoaderStudyAssayMixin(metaclass=ABCMeta):
+ """ A mixin for the Study and Assay loaders. Provides shared abstract methods to prevent code duplication """
+
+ unknown_protocol_description: str = "This protocol was auto-generated where a protocol could not be determined."
+
+ def update_protocols(self, process: Process, study: Study, protocol_map) -> None:
+ """ Update the protocols in the process with the protocol map and binds it to the study in case of an
+ unknown protocol
+
+ :param process: The process to update
+ :param study: The study to bind the protocol to
+ :param protocol_map: A dictionary of Protocol objects references
+ """
+ if process.executes_protocol in protocol_map:
+ protocol_name: str | Protocol = process.executes_protocol
+ process.executes_protocol = protocol_map[protocol_name]
+ return
+ if 'unknown' in protocol_map:
+ process.executes_protocol = protocol_map['unknown']
+ return
+ protocol: Protocol = Protocol(name="unknown protocol", description=self.unknown_protocol_description)
+ protocol_map['unknown'] = protocol
+ study.protocols.append(protocol)
+ process.executes_protocol = protocol
+
+ @staticmethod
+ def load_misc(
+ target: Study | Assay,
+ samples: dict,
+ processes: dict,
+ characteristic_categories: dict,
+ unit_categories: dict
+ ) -> Study | Assay:
+ """ Loads misc data and update the target object with the given data. The data to be loaded includes:
+ - samples
+ - process_sequence
+ - characteristic_categories
+ - units in the study or assay
+
+ :param target: The study or assay to updated
+ :param samples: A dictionary of Sample objects
+ :param processes: A dictionary of Process objects
+ :param characteristic_categories: A dictionary of characteristic categories
+ :param unit_categories: A dictionary of unit categories
+ :return: The updated study or assay
+ """
+ target.samples = sorted(list(samples.values()), key=lambda x: x.name)
+ target.process_sequence = list(processes.values())
+ target.characteristic_categories = sorted(list(characteristic_categories.values()), key=lambda x: x.term)
+ target.units = sorted(list(unit_categories.values()), key=lambda x: x.term)
+ return target
+
+
+class ISATabInvestigationLoader(ISATabLoaderMixin):
+ """ A class to load an ISA-Tab investigation file into an Investigation object
+
+ :param file: A file-like buffer object or a string representing a file path / directory containing the ISA-Tab
+ :param run: Whether to run the load method in the constructor
+ :param skip_load_table: Whether to skip loading the table files
+ """
+
+ def __init__(self, file: TextIO | str, run: bool = True, skip_load_table: bool = False) -> None:
+ """ Constructor for the ISATabInvestigationLoader class
+
+ """
+ ISATabLoaderMixin.skip_load_tables = skip_load_table
+ self.__df_dict: dict = {}
+ self.file: TextIO = file
+ self.__investigation: Investigation = Investigation()
+ if run:
+ self.load()
+
+ def __del__(self, **kwargs) -> None:
+ """ Destructor hook for the ISATabInvestigationLoader class. Called by the garbage collector """
+ self.file.close()
+
+ @property
+ def investigation(self) -> Investigation:
+ """ The getter for the investigation object
+
+ :return: An Investigation object
+ """
+ return self.__investigation
+
+ @property
+ def file(self) -> TextIO:
+ """ Getter for the in memory file-like buffer object
+
+ :return: A file-like buffer object
+ """
+ return self.__file
+
+ @file.setter
+ def file(self, file: str | TextIO) -> None:
+ """ Setter for the file property. Also sets the __df_dict property
+
+ :param file: A file-like buffer object or a string representing a file path / directory containing the ISA-Tab
+ """
+ file_content: TextIO | None = None
+ if isinstance(file, str):
+ if path.isdir(file):
+ fnames: list = glob(path.join(file, "i_*.txt"))
+ assert len(fnames) == 1
+ file_content = utf8_text_file_open(fnames[0])
+ elif hasattr(file, 'read'):
+ file_content = file
+ else:
+ raise IOError("Cannot resolve input file")
+ self.__file = file_content
+ isatab_reader: ISATabReader = ISATabReader(file_content)
+ self.__df_dict = isatab_reader.run()
+ ISATabLoaderMixin.filepath = self.file.name
+
+ def __get_ontology_sources(self, row: Series) -> None:
+ """ Get an ontology source from the given row at the top of the investigation file
+
+ :param row: A row from the investigation file
+ """
+ ontology_source: OntologySource = OntologySource(
+ name=row['Term Source Name'],
+ file=row['Term Source File'],
+ version=row['Term Source Version'],
+ description=row['Term Source Description'])
+ ontology_source.comments = self.get_comments_row(self.__df_dict['ontology_sources'].columns, row)
+ self.__investigation.ontology_source_references.append(ontology_source)
+
+ def __load_investigation(self) -> None:
+ """ Loads all data regarding the investigation into the Investigation object. Studies and assays are
+ loaded in a separate private method. 
+ """ + self.__df_dict['ontology_sources'].apply(lambda r: self.__get_ontology_sources(r), axis=1) + ISATabLoaderMixin.ontology_source_map = dict( + map(lambda x: (x.name, x), self.investigation.ontology_source_references) + ) + + if not self.__df_dict['investigation'].empty: + row = self.__df_dict['investigation'].iloc[0] + self.investigation.identifier = str(row['Investigation Identifier']) + self.investigation.title = row['Investigation Title'] + self.investigation.description = row['Investigation Description'] + self.investigation.submission_date = row['Investigation Submission Date'] + self.investigation.public_release_date = row['Investigation Public Release Date'] + self.investigation.publications = self.get_publications(self.__df_dict['i_publications']) + self.investigation.contacts = self.get_contacts(self.__df_dict['i_contacts']) + self.investigation.comments = self.get_comments(self.__df_dict['investigation']) + + def __load_studies(self) -> None: + """ Loads all the studies inside the investigation object """ + for i, row in enumerate(self.__df_dict['studies']): + row = row.iloc[0] + study_loader: ISATabStudyLoader = ISATabStudyLoader(row, self.__df_dict, i) + study_loader.load() + self.__investigation.studies.append(study_loader.study) + + def load(self): + """ Public wrapper to load the investigation file into the Investigation object. """ + self.__load_investigation() + self.__load_studies() + + +class ISATabStudyLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): + """ A class to load an ISA-Tab study file into a Study object + + :param row: A row from the study file + :param df_dict: A dictionary of DataFrames containing the data extracted from the investigation file + :param index: The study_index of this study in this investigation + """ + + def __init__(self, row: DataFrame, df_dict: dict, index: int) -> None: + """ Constructor for the ISATabStudyLoader class """ + self.__study_index: int = index + self.__row: DataFrame = row + self.__protocol_map: dict[str, Protocol] = {} + + self.__publications: list[DataFrame] = df_dict['s_publications'] + self.__contacts: list[DataFrame] = df_dict['s_contacts'] + self.__comments: DataFrame = df_dict['studies'] + self.__design_descriptors: list[DataFrame] = df_dict['s_design_descriptors'] + self.__factors: list[DataFrame] = df_dict['s_factors'] + self.__protocols: list[DataFrame] = df_dict['s_protocols'] + self.__assays: list[DataFrame] = df_dict['s_assays'] + + self.study: Study | None = None + + def __load_design_descriptors(self) -> list[OntologyAnnotation]: + """ Load the design descriptors from the study file into the Study object + + :return: A list of OntologyAnnotation describing design descriptors + """ + design_descriptors: list[OntologyAnnotation] = [] + for _, row in self.__design_descriptors[self.__study_index].iterrows(): + design_descriptor = self.get_ontology_annotation( + row['Study Design Type'], + row['Study Design Type Term Accession Number'], + row['Study Design Type Term Source REF']) + design_descriptor.comments = self.get_comments_row( + self.__design_descriptors[self.__study_index].columns, row + ) + design_descriptors.append(design_descriptor) + return design_descriptors + + def __load_factors(self) -> list[StudyFactor]: + """ Load the factors from the study file into the Study object + + :return: A list of StudyFactor + """ + factors: list[StudyFactor] = [] + for _, row in self.__factors[self.__study_index].iterrows(): + factor = StudyFactor(name=row['Study Factor Name']) + factor.factor_type = 
self.get_ontology_annotation( + row['Study Factor Type'], + row['Study Factor Type Term Accession Number'], + row['Study Factor Type Term Source REF']) + factor.comments = self.get_comments_row(self.__factors[self.__study_index].columns, row) + factors.append(factor) + return factors + + def __load_protocols(self) -> list[Protocol]: + """ Load the protocols from the study file into the Study object + + :return: A list of Protocol + """ + protocols: list[Protocol] = [] + for _, row in self.__protocols[self.__study_index].iterrows(): + protocol = Protocol() + protocol.name = row['Study Protocol Name'] + protocol.description = row['Study Protocol Description'] + protocol.uri = row['Study Protocol URI'] + protocol.version = row['Study Protocol Version'] + protocol.protocol_type = self.get_ontology_annotation( + row['Study Protocol Type'], + row['Study Protocol Type Term Accession Number'], + row['Study Protocol Type Term Source REF']) + params = self.get_ontology_annotations( + row['Study Protocol Parameters Name'], + row['Study Protocol Parameters Name Term Accession Number'], + row['Study Protocol Parameters Name Term Source REF']) + for param in params: + protocol_param = ProtocolParameter(parameter_name=param) + protocol.parameters.append(protocol_param) + protocol.comments = self.get_comments_row(self.__protocols[self.__study_index].columns, row) + protocols.append(protocol) + self.__protocol_map[protocol.name] = protocol + return protocols + + def __load_tables(self, filename: str) -> None: + """ Load the study table file into the Study object. + + :param filename: The filename of the study file + """ + process_sequence_factory: ProcessSequenceFactory = ProcessSequenceFactory( + ontology_sources=self.ontology_source_map.values(), + study_protocols=self.study.protocols, + study_factors=self.study.factors + ) + sources, samples, _, __, processes, characteristic_categories, unit_categories = \ + process_sequence_factory.create_from_df(read_tfile(path.join(path.dirname(self.filepath), filename))) + self.study.sources = sorted(list(sources.values()), key=lambda x: x.name) + self.study = self.load_misc(self.study, samples, processes, characteristic_categories, unit_categories) + + for process in self.study.process_sequence: + self.update_protocols(process, self.study, self.__protocol_map) + + def __load_assays(self): + """ Load the assays in the Study object """ + for _, row in self.__assays[self.__study_index].iterrows(): + assay_loader: ISATabAssayLoader = ISATabAssayLoader( + row, self.__assays, self.__study_index, self.study, self.__protocol_map + ) + assay_loader.load() + self.study.assays.append(assay_loader.assay) + + def __load_study(self) -> None: + """ Create the Study object from the dataframes """ + self.study = Study( + identifier=str(self.__row['Study Identifier']), + title=self.__row['Study Title'], + description=self.__row['Study Description'], + submission_date=self.__row['Study Submission Date'], + public_release_date=self.__row['Study Public Release Date'], + filename=self.__row['Study File Name'], + publications=self.get_publications(self.__publications[self.__study_index]), + contacts=self.get_contacts(self.__contacts[self.__study_index]), + comments=self.get_comments(self.__comments[self.__study_index]) + ) + self.study.design_descriptors = self.__load_design_descriptors() + self.study.factors = self.__load_factors() + self.study.protocols = self.__load_protocols() + + if not self.skip_load_tables: + self.__load_tables(filename=self.study.filename) + + def load(self): + 
""" Public wrapper to load the study file into the Study object """ + self.__load_study() + self.__load_assays() + + +class ISATabAssayLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): + """ A class to load an ISA-Tab assay file into an Assay object + + :param row: A row from the assay file + :param assays: A list of DataFrames containing the assays data + :param study_index: The index of this study in this investigation + :param study: The Study object to which this assay belongs (required to add protocols to the study) + :param protocols: A dictionary of Protocol objects + """ + + def __init__( + self, row: Series, assays: list[DataFrame], study_index: int, study: Study, protocols: dict[str, Protocol] + ) -> None: + """ Constructor for the ISATabAssayLoader class """ + self.__row: Series = row + self.__assays: list[DataFrame] = assays + self.__study_index: int = study_index + self.__study: Study = study + self.__protocol_map: dict[str, Protocol] = protocols + self.assay: Assay | None = None + + def load(self): + """ Create the assay object from the dataframes """ + self.assay = Assay(**{ + "filename": self.__row['Study Assay File Name'], + "measurement_type": self.get_ontology_annotation( + self.__row['Study Assay Measurement Type'], + self.__row['Study Assay Measurement Type Term Accession Number'], + self.__row['Study Assay Measurement Type Term Source REF'] + ), + "technology_type": self.get_ontology_annotation( + self.__row['Study Assay Technology Type'], + self.__row['Study Assay Technology Type Term Accession Number'], + self.__row['Study Assay Technology Type Term Source REF'] + ), + "technology_platform": self.__row['Study Assay Technology Platform'], + "comments": self.get_comments_row(self.__assays[self.__study_index].columns, self.__row) + }) + if not self.skip_load_tables: + self.__load_tables() + + def __load_tables(self): + """ Load the assay table file into the Assay object """ + assay_table_file = read_tfile(path.join(path.dirname(self.filepath), self.assay.filename)) + _, samples, other, data, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( + ontology_sources=self.ontology_source_map.values(), + study_samples=self.__study.samples, + study_protocols=self.__study.protocols, + study_factors=self.__study.factors + ).create_from_df(assay_table_file) + self.assay.other_material = sorted(list(other.values()), key=lambda x: x.name) + self.assay.data_files = sorted(list(data.values()), key=lambda x: x.filename) + self.assay = self.load_misc(self.assay, samples, processes, characteristic_categories, unit_categories) + for process in self.assay.process_sequence: + self.update_protocols(process, self.__study, self.__protocol_map) def load(isatab_path_or_ifile: object, skip_load_tables: object = False) -> object: @@ -179,6 +805,15 @@ def get_comments_row(cols, row): comments.append(comment) return comments + def get_ontology_sources(r): + ontology_source = OntologySource( + name=r['Term Source Name'], + file=r['Term Source File'], + version=r['Term Source Version'], + description=r['Term Source Description']) + ontology_source.comments = get_comments_row(df_dict['ontology_sources'].columns, r) + investigation.ontology_source_references.append(ontology_source) + FP = None if isinstance(isatab_path_or_ifile, str): @@ -195,16 +830,9 @@ def get_comments_row(cols, row): df_dict = read_investigation_file(FP) investigation = Investigation() - for _, row in df_dict['ontology_sources'].iterrows(): - ontology_source = OntologySource( - name=row['Term 
Source Name'], - file=row['Term Source File'], - version=row['Term Source Version'], - description=row['Term Source Description']) - ontology_source.comments = get_comments_row(df_dict['ontology_sources'].columns, row) - investigation.ontology_source_references.append(ontology_source) - + df_dict['ontology_sources'].apply(lambda x: get_ontology_sources(x), axis=1) ontology_source_map = dict(map(lambda x: (x.name, x), investigation.ontology_source_references)) + if not df_dict['investigation'].empty: row = df_dict['investigation'].iloc[0] investigation.identifier = str(row['Investigation Identifier']) @@ -212,9 +840,9 @@ def get_comments_row(cols, row): investigation.description = row['Investigation Description'] investigation.submission_date = row['Investigation Submission Date'] investigation.public_release_date = row['Investigation Public Release Date'] - investigation.publications = get_publications(df_dict['i_publications']) - investigation.contacts = get_contacts(df_dict['i_contacts']) - investigation.comments = get_comments(df_dict['investigation']) + investigation.publications = get_publications(df_dict['i_publications']) + investigation.contacts = get_contacts(df_dict['i_contacts']) + investigation.comments = get_comments(df_dict['investigation']) for i in range(0, len(df_dict['studies'])): row = df_dict['studies'][i].iloc[0] @@ -350,7 +978,6 @@ def get_comments_row(cols, row): list(unit_categories.values()), key=lambda x: x.term, reverse=False) - description = "This protocol was auto-generated where a protocol could not be determined." for process in assay.process_sequence: try: process.executes_protocol = protocol_map[process.executes_protocol] @@ -358,6 +985,7 @@ def get_comments_row(cols, row): try: unknown_protocol = protocol_map['unknown'] except KeyError: + description = "This protocol was auto-generated where a protocol could not be determined." 
 protocol_map['unknown'] = Protocol(name="unknown protocol", description=description)
 unknown_protocol = protocol_map['unknown']
 study.protocols.append(unknown_protocol)
diff --git a/isatools/isatab/load/mapping.py b/isatools/isatab/load/mapping.py
new file mode 100644
index 00000000..5cd43988
--- /dev/null
+++ b/isatools/isatab/load/mapping.py
@@ -0,0 +1,62 @@
+investigation_sections_mapping: dict[str, dict[str, str]] = {
+ 'ontology_sources': {
+ 'current_section_key': 'ONTOLOGY SOURCE REFERENCE',
+ 'next_section_key': 'INVESTIGATION'
+ },
+ 'investigation': {
+ 'current_section_key': 'INVESTIGATION',
+ 'next_section_key': 'INVESTIGATION PUBLICATIONS'
+ },
+ 'i_publications': {
+ 'current_section_key': 'INVESTIGATION PUBLICATIONS',
+ 'next_section_key': 'INVESTIGATION CONTACTS'
+ },
+ 'i_contacts': {
+ 'current_section_key': 'INVESTIGATION CONTACTS',
+ 'next_section_key': 'STUDY'
+ }
+}
+
+
+def get_investigation_base_output() -> dict[str, list]:
+ return {
+ 'studies': [],
+ 's_design_descriptors': [],
+ 's_publications': [],
+ 's_factors': [],
+ 's_assays': [],
+ 's_protocols': [],
+ 's_contacts': [],
+ }
+
+
+study_sections_mapping: dict[str, dict[str, str]] = {
+ 'studies': {
+ 'current_section_key': 'STUDY',
+ 'next_section_key': 'STUDY DESIGN DESCRIPTORS'
+ },
+ 's_design_descriptors': {
+ 'current_section_key': 'STUDY DESIGN DESCRIPTORS',
+ 'next_section_key': 'STUDY PUBLICATIONS'
+ },
+ 's_publications': {
+ 'current_section_key': 'STUDY PUBLICATIONS',
+ 'next_section_key': 'STUDY FACTORS'
+ },
+ 's_factors': {
+ 'current_section_key': 'STUDY FACTORS',
+ 'next_section_key': 'STUDY ASSAYS'
+ },
+ 's_assays': {
+ 'current_section_key': 'STUDY ASSAYS',
+ 'next_section_key': 'STUDY PROTOCOLS'
+ },
+ 's_protocols': {
+ 'current_section_key': 'STUDY PROTOCOLS',
+ 'next_section_key': 'STUDY CONTACTS'
+ },
+ 's_contacts': {
+ 'current_section_key': 'STUDY CONTACTS',
+ 'next_section_key': 'STUDY'
+ }
+}
\ No newline at end of file
diff --git a/isatools/isatab/load/read.py b/isatools/isatab/load/read.py
index 830915ac..b454b82c 100644
--- a/isatools/isatab/load/read.py
+++ b/isatools/isatab/load/read.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 from io import StringIO

 from pandas import read_csv
@@ -158,16 +159,13 @@ def read_tfile(tfile_path, index_col=None, factor_filter=None) -> IsaTabDataFram
 """Read a table file into a DataFrame

 :param tfile_path: Path to a table file to load
 :param index_col: The column to use as index
 :param factor_filter: Factor filter tuple, e.g. 
('Gender', 'Male') will filter on FactorValue[Gender] == Male :return: A table file DataFrame """ - log.debug("Opening %s", tfile_path) with utf8_text_file_open(tfile_path) as tfile_fp: - log.debug("Reading file header") tfile_fp.seek(0) - log.debug("Reading file into DataFrame") tfile_fp = strip_comments(tfile_fp) csv = read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col, encoding='utf-8').fillna('') tfile_df = IsaTabDataFrame(csv) From 6f3fbfd2945429557a66e006399b6cf66f6f2f5c Mon Sep 17 00:00:00 2001 From: Terazus Date: Tue, 19 Mar 2024 11:53:46 +0000 Subject: [PATCH 2/7] update signature of mapping --- isatools/isatab/load/mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isatools/isatab/load/mapping.py b/isatools/isatab/load/mapping.py index 5cd43988..b80e6fc2 100644 --- a/isatools/isatab/load/mapping.py +++ b/isatools/isatab/load/mapping.py @@ -30,7 +30,7 @@ def get_investigation_base_output() -> dict[str, list]: } -study_sections_mapping: dict[str, dict[str, str]] = { +study_sections_mapping: dict = { 'studies': { 'current_section_key': 'STUDY', 'next_section_key': 'STUDY DESIGN DESCRIPTORS' @@ -59,4 +59,4 @@ def get_investigation_base_output() -> dict[str, list]: 'current_section_key': 'STUDY CONTACTS', 'next_section_key': 'STUDY' } -} \ No newline at end of file +} From b51a38576186351fd5e06bcc3649ae26e21cb9eb Mon Sep 17 00:00:00 2001 From: Terazus Date: Tue, 19 Mar 2024 11:56:22 +0000 Subject: [PATCH 3/7] update signature of mapping --- isatools/isatab/load/mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/isatools/isatab/load/mapping.py b/isatools/isatab/load/mapping.py index b80e6fc2..2e39acf5 100644 --- a/isatools/isatab/load/mapping.py +++ b/isatools/isatab/load/mapping.py @@ -1,4 +1,4 @@ -investigation_sections_mapping: dict[str, dict[str, str]] = { +investigation_sections_mapping: dict = { 'ontology_sources': { 'current_section_key': 'ONTOLOGY SOURCE REFERENCE', 'next_section_key': 'INVESTIGATION' @@ -18,7 +18,7 @@ } -def get_investigation_base_output() -> dict[str, list]: +def get_investigation_base_output() -> dict: return { 'studies': [], 's_design_descriptors': [], From b59b6aecd7a5388882eae9c71fd97b7301d347d1 Mon Sep 17 00:00:00 2001 From: Terazus Date: Wed, 20 Mar 2024 12:06:44 +0000 Subject: [PATCH 4/7] simplified ISATabAssayLoader constructor parameters --- isatools/isatab/load/core.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/isatools/isatab/load/core.py b/isatools/isatab/load/core.py index 74f9bf87..1981b702 100644 --- a/isatools/isatab/load/core.py +++ b/isatools/isatab/load/core.py @@ -299,6 +299,7 @@ class ISATabLoaderStudyAssayMixin(metaclass=ABCMeta): """ A mixin for the Study and Assay loaders. Provides shared abstract methods to prevent code duplication """ unknown_protocol_description: str = "This protocol was auto-generated where a protocol could not be determined." + protocol_map: dict[str, Protocol] = {} def update_protocols(self, process: Process, study: Study, protocol_map) -> None: """ Update the protocols in the process with the protocol map and binds it to the study in case of an @@ -329,13 +330,13 @@ def load_misc( characteristic_categories: dict, unit_categories: dict ) -> Study | Assay: - """ Loads misc data and update the target object with the given data. The data to be loaded includes: + """ Bind misc data to the target object (Study or Assay). 
The data to be loaded includes: - samples - process_sequence - characteristic_categories - - units in the study or assay + - units - :param target: The study or assay to updated + :param target: The study or assay to update :param samples: A dictionary of Sample objects :param processes: A dictionary of Process objects :param characteristic_categories: A dictionary of characteristic categories @@ -466,10 +467,10 @@ class ISATabStudyLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): def __init__(self, row: DataFrame, df_dict: dict, index: int) -> None: """ Constructor for the ISATabStudyLoader class """ + ISATabLoaderStudyAssayMixin.protocol_map = {} + self.__study_index: int = index self.__row: DataFrame = row - self.__protocol_map: dict[str, Protocol] = {} - self.__publications: list[DataFrame] = df_dict['s_publications'] self.__contacts: list[DataFrame] = df_dict['s_contacts'] self.__comments: DataFrame = df_dict['studies'] @@ -477,7 +478,6 @@ def __init__(self, row: DataFrame, df_dict: dict, index: int) -> None: self.__factors: list[DataFrame] = df_dict['s_factors'] self.__protocols: list[DataFrame] = df_dict['s_protocols'] self.__assays: list[DataFrame] = df_dict['s_assays'] - self.study: Study | None = None def __load_design_descriptors(self) -> list[OntologyAnnotation]: @@ -538,7 +538,7 @@ def __load_protocols(self) -> list[Protocol]: protocol.parameters.append(protocol_param) protocol.comments = self.get_comments_row(self.__protocols[self.__study_index].columns, row) protocols.append(protocol) - self.__protocol_map[protocol.name] = protocol + ISATabLoaderStudyAssayMixin.protocol_map[protocol.name] = protocol return protocols def __load_tables(self, filename: str) -> None: @@ -557,13 +557,13 @@ def __load_tables(self, filename: str) -> None: self.study = self.load_misc(self.study, samples, processes, characteristic_categories, unit_categories) for process in self.study.process_sequence: - self.update_protocols(process, self.study, self.__protocol_map) + self.update_protocols(process, self.study, self.protocol_map) def __load_assays(self): """ Load the assays in the Study object """ for _, row in self.__assays[self.__study_index].iterrows(): assay_loader: ISATabAssayLoader = ISATabAssayLoader( - row, self.__assays, self.__study_index, self.study, self.__protocol_map + row, self.__assays[self.__study_index].columns, self.study ) assay_loader.load() self.study.assays.append(assay_loader.assay) @@ -598,21 +598,14 @@ class ISATabAssayLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): """ A class to load an ISA-Tab assay file into an Assay object :param row: A row from the assay file - :param assays: A list of DataFrames containing the assays data - :param study_index: The index of this study in this investigation :param study: The Study object to which this assay belongs (required to add protocols to the study) - :param protocols: A dictionary of Protocol objects """ - def __init__( - self, row: Series, assays: list[DataFrame], study_index: int, study: Study, protocols: dict[str, Protocol] - ) -> None: + def __init__(self, row: Series, columns: list[str], study: Study) -> None: """ Constructor for the ISATabAssayLoader class """ self.__row: Series = row - self.__assays: list[DataFrame] = assays - self.__study_index: int = study_index + self.__columns: list[str] = columns self.__study: Study = study - self.__protocol_map: dict[str, Protocol] = protocols self.assay: Assay | None = None def load(self): @@ -630,7 +623,7 @@ def load(self): self.__row['Study Assay Technology Type Term 
Source REF']
 ),
 "technology_platform": self.__row['Study Assay Technology Platform'],
- "comments": self.get_comments_row(self.__assays[self.__study_index].columns, self.__row)
+ "comments": self.get_comments_row(self.__columns, self.__row)
 })
 if not self.skip_load_tables:
 self.__load_tables()
@@ -648,7 +641,7 @@ def load(self):
 def __load_tables(self):
 """ Load the assay table file into the Assay object """
 assay_table_file = read_tfile(path.join(path.dirname(self.filepath), self.assay.filename))
 _, samples, other, data, processes, characteristic_categories, unit_categories = ProcessSequenceFactory(
 ontology_sources=self.ontology_source_map.values(),
 study_samples=self.__study.samples,
 study_protocols=self.__study.protocols,
 study_factors=self.__study.factors
 ).create_from_df(assay_table_file)
 self.assay.other_material = sorted(list(other.values()), key=lambda x: x.name)
 self.assay.data_files = sorted(list(data.values()), key=lambda x: x.filename)
 self.assay = self.load_misc(self.assay, samples, processes, characteristic_categories, unit_categories)
 for process in self.assay.process_sequence:
- self.update_protocols(process, self.__study, self.__protocol_map)
+ self.update_protocols(process, self.__study, self.protocol_map)

From 1fee42b578a611a1683072ef59a2fc41a47f26c2 Mon Sep 17 00:00:00 2001
From: Terazus
Date: Wed, 20 Mar 2024 13:23:25 +0000
Subject: [PATCH 5/7] renamed a lot of methods for clarification

---
 isatools/isatab/load/core.py | 194 ++++++++++++++++++++---------------
 1 file changed, 111 insertions(+), 83 deletions(-)

diff --git a/isatools/isatab/load/core.py b/isatools/isatab/load/core.py
index 1981b702..984e3079 100644
--- a/isatools/isatab/load/core.py
+++ b/isatools/isatab/load/core.py
@@ -45,12 +45,14 @@ def __init__(self, fp: TextIO) -> None:
 self.dataframe_dict: dict[str, DataFrame | str | list[DataFrame]] = {}

 def __del__(self) -> None:
- """ Destructor for the ISATabReader class """
+ """ Destructor hook for the ISATabReader class. Called by the garbage collector. Makes sure the file-like
+ buffer object is closed even if the program crashes.
+ """
 self.memory_file.close()

 @property
 def memory_file(self) -> TextIO:
- """ A file-like buffer object
+ """ Getter for the in memory file-like buffer object

 :return: A file-like buffer object
 """
 return self.__memory_file

 @memory_file.setter
 def memory_file(self, fp: TextIO) -> None:
- """ Reads the input file into memory, stripping out comments and sets the memory_file property
+ """ Setter for the memory_file property. Reads the input file into memory, stripping out comments and
+ sets the memory_file property

 :param fp: A file-like buffer object
 """
@@ -72,9 +75,8 @@ def memory_file(self, fp: TextIO) -> None:
 def __peek(self) -> str:
- """Peek at the next line without moving to the next line. This function
- get the position of the next line, reads the next line, then resets the
- file pointer to the original position
+ """Peek at the next line without moving to the next line. This function gets the position of the next line,
+ reads the next line, then resets the file pointer to the original position

 :return: The next line past the current line
 """
@@ -94,9 +96,9 @@ def __read_tab_section(self, sec_key: str, next_sec_key: str) -> StringIO:
 normed_line: str = fileline.rstrip().strip('"')
 memory_file: StringIO = StringIO()

- if not normed_line == sec_key:
+ if normed_line != sec_key:
 raise IOError(f"Expected: {sec_key} section, but got: {normed_line}")
- while not self.__peek().rstrip() == next_sec_key:
+ while self.__peek().rstrip() != next_sec_key:
 fileline = self.memory_file.readline()
 if not fileline:
 break
@@ -114,18 +116,15 @@ def __build_section_df(self, current_section_key: str, next_section_key: str) ->
 file_handler: StringIO = self.__read_tab_section(sec_key=current_section_key, next_sec_key=next_section_key)
 df: DataFrame = read_csv(
 filepath_or_buffer=file_handler,
+ names=range(0, 128),
 sep='\t',
 engine='python',
 encoding='utf-8'
 ).dropna(axis=1, how='all').T
- # Strip out the nan entries
- df.replace(nan, '', regex=True, inplace=True)
- # Reset study_index so it is accessible as column
- df.reset_index(inplace=True)
- # If all was OK, promote this row to the column headers
- df.columns = df.iloc[0]
- # Return the re-indexed DataFrame
- return df.reindex(df.index.drop(0))
+ df.replace(nan, '', regex=True, inplace=True) # Strip out the nan entries
+ df.reset_index(inplace=True) # Reset index so it is accessible as column
+ df.columns = df.iloc[0] # If all was OK, promote this row to the column headers
+ return df.reindex(df.index.drop(0)) # Return the re-indexed DataFrame

 def run(self) -> dict[str, DataFrame | str | list[DataFrame]]:
 """ Main method to run the ISATabReader and return the dictionary of DataFrames

@@ -143,7 +142,23 @@ class ISATabLoaderMixin(metaclass=ABCMeta):

 class ISATabLoaderMixin(metaclass=ABCMeta):
- """ A mixin to provide modeling for the ISATab loaders. Provides shared methods and attributes and implementations
+ """ A mixin to provide modeling for the ISATab loaders. 
Provides shared methods, attributes and implementations + + - Properties: + - ontology_source_map: A dictionary of OntologySource objects references + - skip_load_tables: A boolean to skip loading the studies and assays table files + - filepath: The filepath of the investigation file + + - Methods: + - get_contacts: Get a list of Person objects from the relevant investigation file section + - get_comments: Get Comments from a section DataFrame + - get_comments_row: Get Comments in a given DataFrame row + - get_ontology_annotation: Gets an OntologyAnnotation for a given value, accession and term source REF + - get_ontology_annotations: Gets a list of OntologyAnnotations from semicolon delimited lists + - get_publications: Get a list of Publication objects from the relevant investigation file section + + - Abstract Methods: + - load: Load the investigation file into the Investigation object """ ontology_source_map: dict @@ -156,7 +171,7 @@ def __get_ontology_source(self, term_source_ref) -> OntologySource | None: :param term_source_ref: The term source reference :return: An OntologySource object or None """ - return self.ontology_source_map[term_source_ref] if term_source_ref else None + return None if term_source_ref not in self.ontology_source_map else self.ontology_source_map[term_source_ref] def get_contacts(self, contact_dataframe: DataFrame) -> list[Person]: """Get a list of Person objects from the relevant investigation file @@ -223,21 +238,16 @@ def get_comments_row(cols, row) -> list[Comment]: return comments def get_ontology_annotation(self, val, accession, ts_ref) -> OntologyAnnotation | None: - """Gets a OntologyAnnotation for a give value, accession and - term source REF + """Gets an OntologyAnnotation for a given value, accession and term source REF - :param val: Value of the OA - :param accession: Term Accession Number of the OA - :param ts_ref: Term Source REF of the OA + :param val: Value of the OntologyAnnotation + :param accession: Term Accession Number of the OntologyAnnotation + :param ts_ref: Term Source REF of the OntologyAnnotation :return: An OntologyAnnotation object """ if val == '' and accession == '': return None - return OntologyAnnotation( - term=val, - term_accession=accession, - term_source=self.__get_ontology_source(ts_ref) - ) + return OntologyAnnotation(val, self.__get_ontology_source(ts_ref), accession) def get_ontology_annotations(self, vals, accessions, ts_refs) -> list[OntologyAnnotation]: """ Gets a list of OntologyAnnotations from semicolon delimited lists @@ -296,7 +306,19 @@ def load(self, **kwargs): class ISATabLoaderStudyAssayMixin(metaclass=ABCMeta): - """ A mixin for the Study and Assay loaders. Provides shared abstract methods to prevent code duplication """ + """ A mixin for the Study and Assay loaders. Provides shared abstract methods to prevent code duplication + + - Properties: + - unknown_protocol_description: A description for an unknown protocol + - protocol_map: A dictionary of Protocol objects references + + - Methods: + - update_protocols: Update the protocols in the process with the protocol map + - set_misc: Bind misc data to the target object (Study or Assay) + + - Abstract Methods: + - load_tables: Load the study or assay table file + """ unknown_protocol_description: str = "This protocol was auto-generated where a protocol could not be determined." 
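+ # Class-level shared state: reset for each study by ISATabStudyLoader.__init__ and read by ISATabAssayLoader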
 protocol_map: dict[str, Protocol] = {}

 def update_protocols(self, process: Process, study: Study, protocol_map) -> None:
 """ Update the protocols in the process with the protocol map and binds it to the study in case of an
@@ -323,7 +345,7 @@ def update_protocols(self, process: Process, study: Study, protocol_map) -> None:
 process.executes_protocol = protocol

 @staticmethod
- def load_misc(
+ def set_misc(
 target: Study | Assay,
 samples: dict,
 processes: dict,
 characteristic_categories: dict,
 unit_categories: dict
@@ -349,6 +371,10 @@ def set_misc(
 target.units = sorted(list(unit_categories.values()), key=lambda x: x.term)
 return target

+ @abstractmethod
+ def load_tables(self, **kwargs):
+ raise NotImplementedError
+

 class ISATabInvestigationLoader(ISATabLoaderMixin):
@@ -370,12 +396,14 @@ def __init__(self, file: TextIO | str, run: bool = True, skip_load_table: bool =
 self.load()

 def __del__(self, **kwargs) -> None:
- """ Destructor hook for the ISATabInvestigationLoader class. Called by the garbage collector """
+ """ Destructor hook for the ISATabInvestigationLoader class. Called by the garbage collector. Makes sure
+ the file-like buffer object is closed even if the program crashes.
+ """
 self.file.close()

 @property
 def investigation(self) -> Investigation:
- """ The getter for the investigation object
+ """ Getter for the ISA Investigation object. Setter is not allowed

 :return: An Investigation object
 """
@@ -410,8 +438,8 @@ def file(self, file: str | TextIO) -> None:
 self.__df_dict = isatab_reader.run()
 ISATabLoaderMixin.filepath = self.file.name

- def __get_ontology_sources(self, row: Series) -> None:
- """ Get an ontology source from the given row at the top of the investigation file
+ def __set_ontology_source(self, row: Series) -> None:
+ """Sets an ontology source, built from the given row at the top of the investigation file, on the investigation object

 :param row: A row from the investigation file
 """
 ontology_source: OntologySource = OntologySource(
@@ -423,27 +451,27 @@ def __get_ontology_sources(self, row: Series) -> None:
 ontology_source.comments = self.get_comments_row(self.__df_dict['ontology_sources'].columns, row)
 self.__investigation.ontology_source_references.append(ontology_source)

- def __load_investigation(self) -> None:
+ def __set_investigation(self) -> None:
 """ Loads all data regarding the investigation into the Investigation object. Studies and assays are
 loaded in a separate private method. 
""" - self.__df_dict['ontology_sources'].apply(lambda r: self.__get_ontology_sources(r), axis=1) + self.__df_dict['ontology_sources'].apply(lambda r: self.__set_ontology_source(r), axis=1) ISATabLoaderMixin.ontology_source_map = dict( - map(lambda x: (x.name, x), self.investigation.ontology_source_references) + map(lambda x: (x.name, x), self.__investigation.ontology_source_references) ) if not self.__df_dict['investigation'].empty: row = self.__df_dict['investigation'].iloc[0] - self.investigation.identifier = str(row['Investigation Identifier']) - self.investigation.title = row['Investigation Title'] - self.investigation.description = row['Investigation Description'] - self.investigation.submission_date = row['Investigation Submission Date'] - self.investigation.public_release_date = row['Investigation Public Release Date'] - self.investigation.publications = self.get_publications(self.__df_dict['i_publications']) - self.investigation.contacts = self.get_contacts(self.__df_dict['i_contacts']) - self.investigation.comments = self.get_comments(self.__df_dict['investigation']) - - def __load_studies(self) -> None: + self.__investigation.identifier = str(row['Investigation Identifier']) + self.__investigation.title = row['Investigation Title'] + self.__investigation.description = row['Investigation Description'] + self.__investigation.submission_date = row['Investigation Submission Date'] + self.__investigation.public_release_date = row['Investigation Public Release Date'] + self.__investigation.publications = self.get_publications(self.__df_dict['i_publications']) + self.__investigation.contacts = self.get_contacts(self.__df_dict['i_contacts']) + self.__investigation.comments = self.get_comments(self.__df_dict['investigation']) + + def __create_studies(self) -> None: """ Loads all the studies inside the investigation object """ for i, row in enumerate(self.__df_dict['studies']): row = row.iloc[0] @@ -453,8 +481,8 @@ def __load_studies(self) -> None: def load(self): """ Public wrapper to load the investigation file into the Investigation object. 
""" - self.__load_investigation() - self.__load_studies() + self.__set_investigation() + self.__create_studies() class ISATabStudyLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): @@ -462,7 +490,7 @@ class ISATabStudyLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): :param row: A row from the study file :param df_dict: A dictionary of DataFrames containing the data extracted from the investigation file - :param index: The study_index of this study in this investigation + :param index: The study index of this study in this investigation """ def __init__(self, row: DataFrame, df_dict: dict, index: int) -> None: @@ -480,7 +508,7 @@ def __init__(self, row: DataFrame, df_dict: dict, index: int) -> None: self.__assays: list[DataFrame] = df_dict['s_assays'] self.study: Study | None = None - def __load_design_descriptors(self) -> list[OntologyAnnotation]: + def __get_design_descriptors(self) -> list[OntologyAnnotation]: """ Load the design descriptors from the study file into the Study object :return: A list of OntologyAnnotation describing design descriptors @@ -497,7 +525,7 @@ def __load_design_descriptors(self) -> list[OntologyAnnotation]: design_descriptors.append(design_descriptor) return design_descriptors - def __load_factors(self) -> list[StudyFactor]: + def __get_factors(self) -> list[StudyFactor]: """ Load the factors from the study file into the Study object :return: A list of StudyFactor @@ -513,7 +541,7 @@ def __load_factors(self) -> list[StudyFactor]: factors.append(factor) return factors - def __load_protocols(self) -> list[Protocol]: + def __get_protocols(self) -> list[Protocol]: """ Load the protocols from the study file into the Study object :return: A list of Protocol @@ -541,26 +569,8 @@ def __load_protocols(self) -> list[Protocol]: ISATabLoaderStudyAssayMixin.protocol_map[protocol.name] = protocol return protocols - def __load_tables(self, filename: str) -> None: - """ Load the study table file into the Study object. 
- - :param filename: The filename of the study file - """ - process_sequence_factory: ProcessSequenceFactory = ProcessSequenceFactory( - ontology_sources=self.ontology_source_map.values(), - study_protocols=self.study.protocols, - study_factors=self.study.factors - ) - sources, samples, _, __, processes, characteristic_categories, unit_categories = \ - process_sequence_factory.create_from_df(read_tfile(path.join(path.dirname(self.filepath), filename))) - self.study.sources = sorted(list(sources.values()), key=lambda x: x.name) - self.study = self.load_misc(self.study, samples, processes, characteristic_categories, unit_categories) - - for process in self.study.process_sequence: - self.update_protocols(process, self.study, self.protocol_map) - - def __load_assays(self): - """ Load the assays in the Study object """ + def __create_assays(self): + """ Create the assays and bind them to the study object """ for _, row in self.__assays[self.__study_index].iterrows(): assay_loader: ISATabAssayLoader = ISATabAssayLoader( row, self.__assays[self.__study_index].columns, self.study @@ -568,7 +578,7 @@ def __load_assays(self): assay_loader.load() self.study.assays.append(assay_loader.assay) - def __load_study(self) -> None: + def __create_study(self) -> None: """ Create the Study object from the dataframes """ self.study = Study( identifier=str(self.__row['Study Identifier']), @@ -581,17 +591,35 @@ def __load_study(self) -> None: contacts=self.get_contacts(self.__contacts[self.__study_index]), comments=self.get_comments(self.__comments[self.__study_index]) ) - self.study.design_descriptors = self.__load_design_descriptors() - self.study.factors = self.__load_factors() - self.study.protocols = self.__load_protocols() + self.study.design_descriptors = self.__get_design_descriptors() + self.study.factors = self.__get_factors() + self.study.protocols = self.__get_protocols() if not self.skip_load_tables: - self.__load_tables(filename=self.study.filename) + self.load_tables(filename=self.study.filename) def load(self): """ Public wrapper to load the study file into the Study object """ - self.__load_study() - self.__load_assays() + self.__create_study() + self.__create_assays() + + def load_tables(self, filename: str) -> None: + """ Load the study table file into the Study object. 
+ + :param filename: The filename of the study file + """ + process_sequence_factory: ProcessSequenceFactory = ProcessSequenceFactory( + ontology_sources=self.ontology_source_map.values(), + study_protocols=self.study.protocols, + study_factors=self.study.factors + ) + sources, samples, _, __, processes, characteristic_categories, unit_categories = \ + process_sequence_factory.create_from_df(read_tfile(path.join(path.dirname(self.filepath), filename))) + self.study.sources = sorted(list(sources.values()), key=lambda x: x.name) + self.study = self.set_misc(self.study, samples, processes, characteristic_categories, unit_categories) + + for process in self.study.process_sequence: + self.update_protocols(process, self.study, self.protocol_map) class ISATabAssayLoader(ISATabLoaderMixin, ISATabLoaderStudyAssayMixin): @@ -626,9 +654,9 @@ def load(self): "comments": self.get_comments_row(self.__columns, self.__row) }) if not self.skip_load_tables: - self.__load_tables() + self.load_tables() - def __load_tables(self): + def load_tables(self): """ Load the assay table file into the Assay object """ assay_table_file = read_tfile(path.join(path.dirname(self.filepath), self.assay.filename)) _, samples, other, data, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( @@ -639,7 +667,7 @@ def __load_tables(self): ).create_from_df(assay_table_file) self.assay.other_material = sorted(list(other.values()), key=lambda x: x.name) self.assay.data_files = sorted(list(data.values()), key=lambda x: x.filename) - self.assay = self.load_misc(self.assay, samples, processes, characteristic_categories, unit_categories) + self.assay = self.set_misc(self.assay, samples, processes, characteristic_categories, unit_categories) for process in self.assay.process_sequence: self.update_protocols(process, self.__study, self.protocol_map) From fce86a15e430854d267e1eead74d5404eb7bcac5 Mon Sep 17 00:00:00 2001 From: Terazus Date: Wed, 20 Mar 2024 13:26:38 +0000 Subject: [PATCH 6/7] renamed __set_investigation to __create_investigation --- isatools/isatab/load/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/isatools/isatab/load/core.py b/isatools/isatab/load/core.py index 984e3079..9eaeb3c5 100644 --- a/isatools/isatab/load/core.py +++ b/isatools/isatab/load/core.py @@ -389,9 +389,9 @@ def __init__(self, file: TextIO | str, run: bool = True, skip_load_table: bool = """ ISATabLoaderMixin.skip_load_tables = skip_load_table + self.__investigation: Investigation self.__df_dict: dict = {} self.file: TextIO = file - self.__investigation: Investigation = Investigation() if run: self.load() @@ -451,10 +451,11 @@ def __set_ontology_source(self, row: Series) -> None: ontology_source.comments = self.get_comments(self.__df_dict['ontology_sources']) self.__investigation.ontology_source_references.append(ontology_source) - def __set_investigation(self) -> None: + def __create_investigation(self) -> None: """ Loads all data regarding the investigation into the Investigation object. Studies and assays are loaded in a separate private method. """ + self.__investigation = Investigation() self.__df_dict['ontology_sources'].apply(lambda r: self.__set_ontology_source(r), axis=1) ISATabLoaderMixin.ontology_source_map = dict( map(lambda x: (x.name, x), self.__investigation.ontology_source_references) @@ -481,7 +482,7 @@ def __create_studies(self) -> None: def load(self): """ Public wrapper to load the investigation file into the Investigation object. 
""" - self.__set_investigation() + self.__create_investigation() self.__create_studies() From 83cab6d0c46f116cf5bc51cfc35fa8aef3bb2584 Mon Sep 17 00:00:00 2001 From: Terazus Date: Wed, 20 Mar 2024 14:04:09 +0000 Subject: [PATCH 7/7] removed old code --- isatools/isatab/load/__init__.py | 9 +- isatools/isatab/load/core.py | 391 ++++--------------------------- isatools/isatab/load/read.py | 175 -------------- 3 files changed, 52 insertions(+), 523 deletions(-) delete mode 100644 isatools/isatab/load/read.py diff --git a/isatools/isatab/load/__init__.py b/isatools/isatab/load/__init__.py index 28e0d128..c35bb98e 100644 --- a/isatools/isatab/load/__init__.py +++ b/isatools/isatab/load/__init__.py @@ -1,3 +1,8 @@ -from isatools.isatab.load.read import read_investigation_file, read_tfile from isatools.isatab.load.ProcessSequenceFactory import ProcessSequenceFactory, preprocess -from isatools.isatab.load.core import load, merge_study_with_assay_tables, load_table +from isatools.isatab.load.core import ( + load, + merge_study_with_assay_tables, + load_table, + read_investigation_file, + read_tfile +) diff --git a/isatools/isatab/load/core.py b/isatools/isatab/load/core.py index 9eaeb3c5..1619d285 100644 --- a/isatools/isatab/load/core.py +++ b/isatools/isatab/load/core.py @@ -12,10 +12,9 @@ from numpy import nan from isatools.utils import utf8_text_file_open -from isatools.isatab.load.read import read_tfile, read_investigation_file from isatools.isatab.load.ProcessSequenceFactory import ProcessSequenceFactory from isatools.isatab.defaults import _RX_COMMENT, log -from isatools.isatab.utils import strip_comments +from isatools.isatab.utils import strip_comments, IsaTabDataFrame from isatools.model import ( OntologyAnnotation, Publication, @@ -673,354 +672,21 @@ def load_tables(self): self.update_protocols(process, self.__study, self.protocol_map) -def load(isatab_path_or_ifile: object, skip_load_tables: object = False) -> object: +def load(isatab_path_or_ifile: TextIO, skip_load_tables: bool = False) -> Investigation: """Load an ISA-Tab into ISA Data Model objects - :rtype: object :param isatab_path_or_ifile: Full path to an ISA-Tab directory or file-like buffer object pointing to an investigation file - :param skip_load_tables: Whether or not to skip loading the table files + :param skip_load_tables: Whether to skip loading the table files :return: Investigation objects """ + investigation_loader: ISATabInvestigationLoader = ISATabInvestigationLoader( + file=isatab_path_or_ifile, skip_load_table=skip_load_tables + ) + return investigation_loader.investigation - # from DF of investigation file - def get_ontology_source(term_source_ref): - try: - current_onto_source = ontology_source_map[term_source_ref] - except KeyError: - current_onto_source = None - return current_onto_source - - def get_oa(val, accession, ts_ref): - """Gets a OntologyAnnotation for a give value, accession and - term source REF - - :param val: Value of the OA - :param accession: Term Accession Number of the OA - :param ts_ref: Term Source REF of the OA - :return: An OntologyAnnotation object - """ - if val == '' and accession == '': - return None - else: - return OntologyAnnotation( - term=val, - term_accession=accession, - term_source=get_ontology_source(ts_ref) - ) - - def get_oa_list_from_semi_c_list(vals, accessions, ts_refs): - """Gets a list of OntologyAnnotations from semi-colon delimited lists - - :param vals: A list of values, separated by semi-colons - :param accessions: A list of accessions, separated by semi-colons - 
:param ts_refs: A list of term source REFs, separated by semi-colons - :return: A list of OntologyAnnotation objects - """ - oa_list = [] - accession_split = accessions.split(';') - ts_refs_split = ts_refs.split(';') - # if no acc or ts_refs - if accession_split == [''] and ts_refs_split == ['']: - for val in vals.split(';'): - oa_list.append(OntologyAnnotation(term=val, )) - else: # try parse all three sections - for _, val in enumerate(vals.split(';')): - oa = get_oa(val, accessions.split(';')[_], ts_refs.split(';')[_]) - if oa is not None: - oa_list.append(oa) - return oa_list - - def get_publications(section_df): - """Get a list of Publications from the relevant investigation file - section - - :param section_df: A PUBLICATIONS section DataFrame - :return: A list of Publication objects - """ - if 'Investigation PubMed ID' in section_df.columns: - prefix = 'Investigation ' - elif 'Study PubMed ID' in section_df.columns: - prefix = 'Study ' - else: - raise KeyError - - publications = [] - - for _, current_row in section_df.iterrows(): - publication = Publication(pubmed_id=current_row[prefix + 'PubMed ID'], - doi=current_row[prefix + 'Publication DOI'], - author_list=current_row[ - prefix + 'Publication Author List'], - title=current_row[prefix + 'Publication Title']) - - publication.status = get_oa( - current_row[prefix + 'Publication Status'], - current_row[prefix + 'Publication Status Term Accession Number'], - current_row[prefix + 'Publication Status Term Source REF']) - publication.comments = get_comments_row(section_df.columns, current_row) - publications.append(publication) - - return publications - - def get_contacts(section_df): - """Get a list of Person objects from the relevant investigation file - section - - :param section_df: A CONTACTS section DataFrame - :return: A list of Person objects - """ - if 'Investigation Person Last Name' in section_df.columns: - prefix = 'Investigation ' - elif 'Study Person Last Name' in section_df.columns: - prefix = 'Study ' - else: - raise KeyError - - contacts = [] - - for _, current_row in section_df.iterrows(): - person = Person(last_name=current_row[prefix + 'Person Last Name'], - first_name=current_row[prefix + 'Person First Name'], - mid_initials=current_row[prefix + 'Person Mid Initials'], - email=current_row[prefix + 'Person Email'], - phone=current_row[prefix + 'Person Phone'], - fax=current_row[prefix + 'Person Fax'], - address=current_row[prefix + 'Person Address'], - affiliation=current_row[prefix + 'Person Affiliation']) - - person.roles = get_oa_list_from_semi_c_list( - current_row[prefix + 'Person Roles'], - current_row[prefix + 'Person Roles Term Accession Number'], - current_row[prefix + 'Person Roles Term Source REF']) - person.comments = get_comments_row(section_df.columns, current_row) - contacts.append(person) - - return contacts - - def get_comments(section_df): - """Get Comments from a section DataFrame - - :param section_df: A section DataFrame - :return: A list of Comment objects as found in the section - """ - comments = [] - for col in [x for x in section_df.columns if _RX_COMMENT.match(str(x))]: - for _, current_row in section_df.iterrows(): - comment = Comment( - name=next(iter(_RX_COMMENT.findall(col))), value=current_row[col]) - comments.append(comment) - return comments - - def get_comments_row(cols, row): - """Get Comments in a given DataFrame row - - :param cols: List of DataFrame columns - :param row: DataFrame row as a Series object - :return: A list of Comment objects - """ - comments = [] - for col in [x 
for x in cols if _RX_COMMENT.match(str(x))]: - comment = Comment( - name=next(iter(_RX_COMMENT.findall(col))), value=row[col]) - comments.append(comment) - return comments - - def get_ontology_sources(r): - ontology_source = OntologySource( - name=r['Term Source Name'], - file=r['Term Source File'], - version=r['Term Source Version'], - description=r['Term Source Description']) - ontology_source.comments = get_comments_row(df_dict['ontology_sources'].columns, r) - investigation.ontology_source_references.append(ontology_source) - - FP = None - - if isinstance(isatab_path_or_ifile, str): - if path.isdir(isatab_path_or_ifile): - fnames = glob(path.join(isatab_path_or_ifile, "i_*.txt")) - assert len(fnames) == 1 - FP = utf8_text_file_open(fnames[0]) - elif hasattr(isatab_path_or_ifile, 'read'): - FP = isatab_path_or_ifile - else: - raise IOError("Cannot resolve input file") - - try: - df_dict = read_investigation_file(FP) - investigation = Investigation() - - df_dict['ontology_sources'].apply(lambda x: get_ontology_sources(x), axis=1) - ontology_source_map = dict(map(lambda x: (x.name, x), investigation.ontology_source_references)) - - if not df_dict['investigation'].empty: - row = df_dict['investigation'].iloc[0] - investigation.identifier = str(row['Investigation Identifier']) - investigation.title = row['Investigation Title'] - investigation.description = row['Investigation Description'] - investigation.submission_date = row['Investigation Submission Date'] - investigation.public_release_date = row['Investigation Public Release Date'] - investigation.publications = get_publications(df_dict['i_publications']) - investigation.contacts = get_contacts(df_dict['i_contacts']) - investigation.comments = get_comments(df_dict['investigation']) - - for i in range(0, len(df_dict['studies'])): - row = df_dict['studies'][i].iloc[0] - study = Study() - study.identifier = str(row['Study Identifier']) - study.title = row['Study Title'] - study.description = row['Study Description'] - study.submission_date = row['Study Submission Date'] - study.public_release_date = row['Study Public Release Date'] - study.filename = row['Study File Name'] - - study.publications = get_publications(df_dict['s_publications'][i]) - study.contacts = get_contacts(df_dict['s_contacts'][i]) - study.comments = get_comments(df_dict['studies'][i]) - - for _, row in df_dict['s_design_descriptors'][i].iterrows(): - design_descriptor = get_oa( - row['Study Design Type'], - row['Study Design Type Term Accession Number'], - row['Study Design Type Term Source REF']) - these_comments = get_comments_row(df_dict['s_design_descriptors'][i].columns, row) - design_descriptor.comments = these_comments - study.design_descriptors.append(design_descriptor) - - for _, row in df_dict['s_factors'][i].iterrows(): - factor = StudyFactor(name=row['Study Factor Name']) - factor.factor_type = get_oa( - row['Study Factor Type'], - row['Study Factor Type Term Accession Number'], - row['Study Factor Type Term Source REF']) - factor.comments = get_comments_row(df_dict['s_factors'][i].columns, row) - study.factors.append(factor) - - protocol_map = {} - for _, row in df_dict['s_protocols'][i].iterrows(): - protocol = Protocol() - protocol.name = row['Study Protocol Name'] - protocol.description = row['Study Protocol Description'] - protocol.uri = row['Study Protocol URI'] - protocol.version = row['Study Protocol Version'] - protocol.protocol_type = get_oa( - row['Study Protocol Type'], - row['Study Protocol Type Term Accession Number'], - row['Study Protocol 
Type Term Source REF']) - params = get_oa_list_from_semi_c_list( - row['Study Protocol Parameters Name'], - row['Study Protocol Parameters Name Term Accession Number'], - row['Study Protocol Parameters Name Term Source REF']) - for param in params: - protocol_param = ProtocolParameter(parameter_name=param) - protocol.parameters.append(protocol_param) - protocol.comments = get_comments_row(df_dict['s_protocols'][i].columns, row) - study.protocols.append(protocol) - protocol_map[protocol.name] = protocol - study.protocols = list(protocol_map.values()) - if skip_load_tables: - pass - else: - study_tfile_df = read_tfile(path.join(path.dirname(FP.name), study.filename)) - iosrs = investigation.ontology_source_references - sources, samples, _, __, processes, characteristic_categories, unit_categories = \ - ProcessSequenceFactory( - ontology_sources=iosrs, - study_protocols=study.protocols, - study_factors=study.factors - ).create_from_df(study_tfile_df) - study.sources = sorted(list(sources.values()), key=lambda x: x.name, reverse=False) - study.samples = sorted(list(samples.values()), key=lambda x: x.name, reverse=False) - study.process_sequence = list(processes.values()) - study.characteristic_categories = sorted( - list(characteristic_categories.values()), - key=lambda x: x.term, - reverse=False) - study.units = sorted(list(unit_categories.values()), key=lambda x: x.term, reverse=False) - - for process in study.process_sequence: - try: - process.executes_protocol = protocol_map[process.executes_protocol] - except KeyError: - try: - unknown_protocol = protocol_map['unknown'] - except KeyError: - description = "This protocol was auto-generated where a protocol could not be determined." - protocol_map['unknown'] = Protocol(name="unknown protocol", description=description) - unknown_protocol = protocol_map['unknown'] - study.protocols.append(unknown_protocol) - process.executes_protocol = unknown_protocol - - for _, row in df_dict['s_assays'][i].iterrows(): - assay_dict = { - "filename": row['Study Assay File Name'], - "measurement_type": get_oa( - row['Study Assay Measurement Type'], - row['Study Assay Measurement Type Term Accession Number'], - row['Study Assay Measurement Type Term Source REF'] - ), - "technology_type": get_oa( - row['Study Assay Technology Type'], - row['Study Assay Technology Type Term Accession Number'], - row['Study Assay Technology Type Term Source REF'] - ), - "technology_platform": row['Study Assay Technology Platform'], - "comments": get_comments_row(df_dict['s_assays'][i].columns, row) - } - assay = Assay(**assay_dict) - - if skip_load_tables: - pass - else: - iosrs = investigation.ontology_source_references - assay_tfile_df = read_tfile(path.join(path.dirname(FP.name), assay.filename)) - _, samples, other, data, processes, characteristic_categories, unit_categories = \ - ProcessSequenceFactory( - ontology_sources=iosrs, - study_samples=study.samples, - study_protocols=study.protocols, - study_factors=study.factors).create_from_df( - assay_tfile_df) - assay.samples = sorted( - list(samples.values()), key=lambda x: x.name, - reverse=False) - assay.other_material = sorted( - list(other.values()), key=lambda x: x.name, - reverse=False) - assay.data_files = sorted( - list(data.values()), key=lambda x: x.filename, - reverse=False) - assay.process_sequence = list(processes.values()) - assay.characteristic_categories = sorted( - list(characteristic_categories.values()), - key=lambda x: x.term, reverse=False) - assay.units = sorted( - list(unit_categories.values()), 
key=lambda x: x.term,
-                        reverse=False)
-
-                    for process in assay.process_sequence:
-                        try:
-                            process.executes_protocol = protocol_map[process.executes_protocol]
-                        except KeyError:
-                            try:
-                                unknown_protocol = protocol_map['unknown']
-                            except KeyError:
-                                description = "This protocol was auto-generated where a protocol could not be determined."
-                                protocol_map['unknown'] = Protocol(name="unknown protocol", description=description)
-                                unknown_protocol = protocol_map['unknown']
-                                study.protocols.append(unknown_protocol)
-                            process.executes_protocol = unknown_protocol
-
-                study.assays.append(assay)
-            investigation.studies.append(study)
-    finally:
-        FP.close()
-    return investigation
-
-
-def merge_study_with_assay_tables(study_file_path, assay_file_path, target_file_path):
+def merge_study_with_assay_tables(study_file_path: str, assay_file_path: str, target_file_path: str):
     """
     Utility function to merge a study table file with an assay table file. The merge uses the Sample Name as the
     merge key, so only samples present in both files appear in the merged output.
 
     Example usage:
         merge_study_with_assay_tables('/path/to/study.txt',
             '/path/to/assay.txt', '/path/to/merged.txt')
     """
     log.info("Reading study file %s into DataFrame", study_file_path)
-    study_DF = read_tfile(study_file_path)
+    study_dataframe = read_tfile(study_file_path)
     log.info("Reading assay file %s into DataFrame", assay_file_path)
-    assay_DF = read_tfile(assay_file_path)
+    assay_dataframe = read_tfile(assay_file_path)
     log.info("Merging DataFrames...")
-    merged_DF = merge(study_DF, assay_DF, on='Sample Name')
+    merged_dataframe = merge(study_dataframe, assay_dataframe, on='Sample Name')
     log.info("Writing merged DataFrame to file %s", target_file_path)
+    headers = study_dataframe.isatab_header + assay_dataframe.isatab_header[1:]
     with open(target_file_path, 'w', encoding='utf-8') as fp:
-        merged_DF.to_csv(fp, sep='\t', index=False, header=study_DF.isatab_header + assay_DF.isatab_header[1:])
+        merged_dataframe.to_csv(fp, sep='\t', index=False, header=headers)
 
 
 def load_table(fp):
@@ -1081,3 +748,35 @@ def load_table(fp):
         new_labels.append(label)
     df.columns = new_labels
     return df
+
+
+def read_tfile(tfile_path: str, index_col=None, factor_filter=None) -> IsaTabDataFrame:
+    """Read a table file into a DataFrame
+
+    :param tfile_path: Path to a table file to load
+    :param index_col: The column to use as the index
+    :param factor_filter: Factor filter tuple, e.g. ('Gender', 'Male') will
+        filter on FactorValue[Gender] == Male
+    :return: A table file DataFrame
+    """
+    with utf8_text_file_open(tfile_path) as tfile_fp:
+        tfile_fp.seek(0)
+        tfile_fp = strip_comments(tfile_fp)
+        csv = read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col, encoding='utf-8').fillna('')
+        tfile_df = IsaTabDataFrame(csv)
+        if factor_filter:
+            log.debug("Filtering DataFrame contents on Factor Value %s", factor_filter)
+            return tfile_df[tfile_df['Factor Value[{}]'.format(factor_filter[0])] == factor_filter[1]]
+        return tfile_df
+
+
+def read_investigation_file(fp):
+    """Reads an investigation file into a dictionary of DataFrames, each
+    DataFrame being each section of the investigation file, e.g. one DataFrame
+    for the INVESTIGATION PUBLICATIONS section
+
+    :param fp: A file-like buffer object of the investigation file
+    :return: A dictionary holding a set of DataFrames for each section of the
+        investigation file. 
Parsing is delegated to the ISATabReader class.
+    """
+    return ISATabReader(fp).run()
\ No newline at end of file
diff --git a/isatools/isatab/load/read.py b/isatools/isatab/load/read.py
deleted file mode 100644
index b454b82c..00000000
--- a/isatools/isatab/load/read.py
+++ /dev/null
@@ -1,175 +0,0 @@
-from __future__ import annotations
-from io import StringIO
-
-from pandas import read_csv
-from numpy import nan
-
-from isatools.utils import utf8_text_file_open
-from isatools.isatab.defaults import log
-from isatools.isatab.utils import strip_comments, IsaTabDataFrame
-
-
-def read_investigation_file(fp):
-    """Reads an investigation file into a dictionary of DataFrames, each
-    DataFrame being each section of the investigation file. e.g. One DataFrame
-    for the INVESTIGATION PUBLICATIONS section
-
-    :param fp: A file-like buffer object of the investigation file
-    :return: A dictionary holding a set of DataFrames for each section of the
-        investigation file. See below implementation for detail
-    """
-
-    def _peek(f):
-        """Peek at the next line without moving to the next line. This function
-        get the position of the next line, reads the next line, then resets the
-        file pointer to the original position
-
-        :param f: A file-like buffer object
-        :return: The next line past the current line
-        """
-        position = f.tell()
-        line = f.readline()
-        f.seek(position)
-        return line
-
-    def _read_tab_section(f, sec_key, next_sec_key=None):
-        """Slices a file by section delimited by section keys
-
-        :param f: A file-like buffer object
-        :param sec_key: Delimiter key of beginning of section
-        :param next_sec_key: Delimiter key of end of section
-        :return: A memory file of the section slice, as a string buffer object
-        """
-        fileline = f.readline()
-        normed_line = fileline.rstrip()
-        if normed_line[0] == '"':
-            normed_line = normed_line[1:]
-        if normed_line[len(normed_line) - 1] == '"':
-            normed_line = normed_line[:len(normed_line) - 1]
-        if not normed_line == sec_key:
-            raise IOError("Expected: " + sec_key + " section, but got: "
-                          + normed_line)
-        memf = StringIO()
-        while not _peek(f=f).rstrip() == next_sec_key:
-            fileline = f.readline()
-            if not fileline:
-                break
-            memf.write(fileline.rstrip() + '\n')
-        memf.seek(0)
-        return memf
-
-    def _build_section_df(f: StringIO):
-        """Reads a file section into a DataFrame
-
-        :param f: A file-like buffer object
-        :return: A DataFrame corresponding to the file section
-        """
-        df = read_csv(f, names=range(0, 128), sep='\t', engine='python',
-                      encoding='utf-8').dropna(axis=1, how='all')
-        df = df.T
-        df.replace(nan, '', regex=True, inplace=True)
-        # Strip out the nan entries
-        df.reset_index(inplace=True)
-        # Reset study_index so it is accessible as column
-        df.columns = df.iloc[0]
-        # If all was OK, promote this row to the column headers
-        df = df.reindex(df.index.drop(0))
-        # Reindex the DataFrame
-        return df
-
-    memory_file = StringIO()
-    line = True
-    while line:
-        line = fp.readline()
-        if not line.lstrip().startswith('#'):
-            memory_file.write(line)
-    memory_file.seek(0)
-
-    df_dict = dict()
-
-    # Read in investigation file into DataFrames first
-    df_dict['ontology_sources'] = _build_section_df(_read_tab_section(
-        f=memory_file,
-        sec_key='ONTOLOGY SOURCE REFERENCE',
-        next_sec_key='INVESTIGATION'
-    ))
-    df_dict['investigation'] = _build_section_df(_read_tab_section(
-        f=memory_file,
-        sec_key='INVESTIGATION',
-        next_sec_key='INVESTIGATION PUBLICATIONS'
-    ))
-    df_dict['i_publications'] = _build_section_df(_read_tab_section(
-        f=memory_file,
-        sec_key='INVESTIGATION 
PUBLICATIONS', - next_sec_key='INVESTIGATION CONTACTS' - )) - df_dict['i_contacts'] = _build_section_df(_read_tab_section( - f=memory_file, - sec_key='INVESTIGATION CONTACTS', - next_sec_key='STUDY' - )) - df_dict['studies'] = list() - df_dict['s_design_descriptors'] = list() - df_dict['s_publications'] = list() - df_dict['s_factors'] = list() - df_dict['s_assays'] = list() - df_dict['s_protocols'] = list() - df_dict['s_contacts'] = list() - while _peek(memory_file): # Iterate through STUDY blocks until end of file - df_dict['studies'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY', - next_sec_key='STUDY DESIGN DESCRIPTORS' - ))) - df_dict['s_design_descriptors'].append( - _build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY DESIGN DESCRIPTORS', - next_sec_key='STUDY PUBLICATIONS' - ))) - df_dict['s_publications'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY PUBLICATIONS', - next_sec_key='STUDY FACTORS' - ))) - df_dict['s_factors'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY FACTORS', - next_sec_key='STUDY ASSAYS' - ))) - df_dict['s_assays'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY ASSAYS', - next_sec_key='STUDY PROTOCOLS' - ))) - df_dict['s_protocols'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY PROTOCOLS', - next_sec_key='STUDY CONTACTS' - ))) - df_dict['s_contacts'].append(_build_section_df(_read_tab_section( - f=memory_file, - sec_key='STUDY CONTACTS', - next_sec_key='STUDY' - ))) - return df_dict - - -def read_tfile(tfile_path, index_col=None, factor_filter=None) -> IsaTabDataFrame: - """Read a table file into a DataFrame - - :param tfile_path: Path to a table file to load - :param index_col: The column to use as study_index - :param factor_filter: Factor filter tuple, e.g. ('Gender', 'Male') will - filter on FactorValue[Gender] == Male - :return: A table file DataFrame - """ - with utf8_text_file_open(tfile_path) as tfile_fp: - tfile_fp.seek(0) - tfile_fp = strip_comments(tfile_fp) - csv = read_csv(tfile_fp, dtype=str, sep='\t', index_col=index_col, encoding='utf-8').fillna('') - tfile_df = IsaTabDataFrame(csv) - if factor_filter: - log.debug("Filtering DataFrame contents on Factor Value %s", factor_filter) - return tfile_df[tfile_df['Factor Value[{}]'.format(factor_filter[0])] == factor_filter[1]] - return tfile_df
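# The legacy loader removed above zipped parallel semicolon-delimited columns
# into OntologyAnnotation objects; a standalone sketch of that pattern, with
# resolution of Term Source REFs to OntologySource objects elided:
from isatools.model import OntologyAnnotation

def split_semicolon_annotations(terms: str, accessions: str) -> list:
    return [
        OntologyAnnotation(term=term, term_accession=accession)
        for term, accession in zip(terms.split(';'), accessions.split(';'))
    ]

roles = split_semicolon_annotations('investigator;author', 'C25936;C42781')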
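# The removed loader also replaced the string protocol reference on each
# Process with a Protocol object, falling back to a shared "unknown protocol";
# a condensed sketch of that fallback, assuming protocol_map maps protocol
# names to Protocol objects:
from isatools.model import Process, Protocol

def resolve_protocol(process: Process, protocol_map: dict) -> None:
    name = process.executes_protocol
    if name not in protocol_map:
        description = "This protocol was auto-generated where a protocol could not be determined."
        protocol_map.setdefault('unknown', Protocol(name="unknown protocol", description=description))
        name = 'unknown'
    process.executes_protocol = protocol_map[name]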
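# Finally, a usage sketch for the relocated readers, assuming ISA-Tab files on
# disk (both paths are hypothetical); the section keys ('investigation',
# 'studies', 's_assays', ...) follow the dictionary built by ISATabReader.
from isatools.isatab.load import read_tfile, read_investigation_file
from isatools.utils import utf8_text_file_open

males_only = read_tfile('/path/to/s_study.txt', factor_filter=('Gender', 'Male'))
with utf8_text_file_open('/path/to/i_investigation.txt') as fp:
    sections = read_investigation_file(fp)
first_study_df = sections['studies'][0]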