Merge pull request #529 from ptth222/tab-validate-idf-rename
Rename i_df and investigation_df.
terazus authored Mar 11, 2024
2 parents 6d30dc2 + 33dae30 commit 01dffc7
Showing 8 changed files with 102 additions and 104 deletions.
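For orientation: the value being renamed throughout is the dictionary returned by load_investigation. A minimal sketch of its assumed shape, inferred only from the keys used in the diffs below (the loader may return additional keys, and the column sets here are illustrative):

    from pandas import DataFrame

    # Hypothetical i_df_dict; the keys 'studies', 's_assays', 's_protocols' and
    # 's_factors' are the ones exercised by the validators below, one entry per study.
    i_df_dict = {
        'studies': [DataFrame({'Study File Name': ['s_study.txt']})],
        's_assays': [DataFrame({'Study Assay File Name': ['a_assay.txt']})],
        's_protocols': [DataFrame({'Study Protocol Name': ['sample collection'],
                                   'Study Protocol Type': ['sample collection'],
                                   'Study Protocol Parameters Name': ['']})],
        's_factors': [DataFrame({'Study Factor Name': ['dose']})],
    }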
8 changes: 4 additions & 4 deletions isatools/isatab/validate/core.py
@@ -186,21 +186,21 @@ def validate(fp: TextIO,

built_rules = build_rules(rules)
try:
- i_df = load_investigation(fp=fp)
+ i_df_dict = load_investigation(fp=fp)
params = {
- "investigation_df": i_df,
+ "investigation_df_dict": i_df_dict,
"dir_context": path.dirname(fp.name),
"configs": config_dir,
}
investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation'])

- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
study_validator = ISAStudyValidator(validator=investigation_validator, study_index=i,
study_filename=study_filename, study_df=study_df,
**built_rules['studies'])
assay_tables = list()
- assay_df = study_validator.params['investigation_df']['s_assays'][i]
+ assay_df = study_validator.params['investigation_df_dict']['s_assays'][i]
for x, assay_filename in enumerate(assay_df['Study Assay File Name'].tolist()):
ISAAssayValidator(assay_tables=assay_tables, validator=study_validator, assay_index=x,
assay_df=assay_df, assay_filename=assay_filename, **built_rules['assays'])
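A hedged usage sketch for the function changed above: parameters other than fp are assumed to keep their defaults, and the exact structure of the returned report is not shown in this diff.

    from isatools.isatab.validate.core import validate

    with open('i_investigation.txt', encoding='utf-8') as fp:
        # internally: load_investigation() -> i_df_dict -> investigation/study/assay validators
        report = validate(fp)
    print(report)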

10 changes: 5 additions & 5 deletions isatools/isatab/validate/rules/core.py
@@ -108,14 +108,14 @@ def validate_rules(self, validator):

class ISAInvestigationValidator:
def __init__(self,
- investigation_df: DataFrame,
+ investigation_df_dict: dict,
dir_context: str,
configs: str,
available_rules: list = INVESTIGATION_RULES_MAPPING,
rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES):
""" The ISA investigation validator class
- :param investigation_df: the investigation dataframe
+ :param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: the directory of the investigation
:param configs: directory of the XML config files
:param available_rules: a customizable list of all available rules for investigation objects
@@ -124,7 +124,7 @@ def __init__(self,
self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
self.has_validated = False
self.params = {
- 'investigation_df': investigation_df,
+ 'investigation_df_dict': investigation_df_dict,
'dir_context': dir_context,
'configs': configs,
'term_source_refs': None
@@ -162,8 +162,8 @@ def __init__(self,
self.params['study_sample_table'] = load_table(s_fp)
self.params['study_sample_table'].filename = study_filename

- protocol_names = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Name'].tolist()
- protocol_types = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Type'].tolist()
+ protocol_names = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Name'].tolist()
+ protocol_types = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Type'].tolist()
self.params['protocol_names_and_types'] = dict(zip(protocol_names, protocol_types))

self.params['study_group_size_in_comment'] = None
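For callers that build the investigation validator directly, only the keyword changes. A sketch under the assumption that i_df_dict comes from load_investigation as in core.py; the paths are placeholders and the rule arguments keep their defaults.

    from isatools.isatab.validate.rules.core import ISAInvestigationValidator

    validator = ISAInvestigationValidator(
        investigation_df_dict=i_df_dict,   # was investigation_df before this commit
        dir_context='/path/to/isatab',     # directory containing the investigation file
        configs='/path/to/xml-configs',    # directory of the XML configuration files
    )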

36 changes: 18 additions & 18 deletions isatools/isatab/validate/rules/defaults.py
@@ -30,30 +30,30 @@


INVESTIGATION_RULES_MAPPING = [
- {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0006'},
+ {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0006'},

- {'rule': sample_not_declared, 'params': ['investigation_df', 'dir_context'], 'identifier': '1003'},
- {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1007'},
- {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1008'},
- {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1009'},
- {'rule': check_protocol_names, 'params': ['investigation_df'], 'identifier': '1010'},
- {'rule': check_protocol_parameter_names, 'params': ['investigation_df'], 'identifier': '1011'},
- {'rule': check_study_factor_names, 'params': ['investigation_df'], 'identifier': '1012'},
+ {'rule': sample_not_declared, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1003'},
+ {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1007'},
+ {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1008'},
+ {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1009'},
+ {'rule': check_protocol_names, 'params': ['investigation_df_dict'], 'identifier': '1010'},
+ {'rule': check_protocol_parameter_names, 'params': ['investigation_df_dict'], 'identifier': '1011'},
+ {'rule': check_study_factor_names, 'params': ['investigation_df_dict'], 'identifier': '1012'},

- {'rule': check_date_formats, 'params': ['investigation_df'], 'identifier': '3001'},
- {'rule': check_dois, 'params': ['investigation_df'], 'identifier': '3002'},
- {'rule': check_pubmed_ids_format, 'params': ['investigation_df'], 'identifier': '3003'},
- {'rule': check_ontology_sources, 'params': ['investigation_df'], 'identifier': '3008'},
+ {'rule': check_date_formats, 'params': ['investigation_df_dict'], 'identifier': '3001'},
+ {'rule': check_dois, 'params': ['investigation_df_dict'], 'identifier': '3002'},
+ {'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'},
+ {'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'},

{'rule': load_config, 'params': ['configs'], 'identifier': '4001'},
- {'rule': check_measurement_technology_types, 'params': ['investigation_df', 'configs'], 'identifier': '4002'},
- {'rule': check_investigation_against_config, 'params': ['investigation_df', 'configs'], 'identifier': '4003'},
+ {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'},
+ {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'},

# copies
- {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0008'},
- {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1019'},
- {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1020'},
- {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1021'},
+ {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'},
+ {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1019'},
+ {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1020'},
+ {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1021'},
]
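Each mapping entry pairs a rule function with the names of the parameters it needs and a rule identifier. A plausible dispatch sketch, not the library's actual implementation, assuming the names are looked up in the validator's params dict:

    def run_rule(entry, params):
        # e.g. entry['params'] == ['investigation_df_dict', 'dir_context']
        args = [params[name] for name in entry['params']]
        return entry['rule'](*args)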

STUDY_RULES_MAPPING = [

8 changes: 4 additions & 4 deletions isatools/isatab/validate/rules/rules_00xx.py
@@ -5,14 +5,14 @@
from isatools.isatab.defaults import log


- def check_table_files_read(i_df, dir_context):
+ def check_table_files_read(i_df_dict, dir_context):
"""Used for rules 0006 and 0008
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
try:
@@ -22,7 +22,7 @@ def check_table_files_read(i_df, dir_context):
spl = "Study File {} does not appear to exist".format(study_filename)
validator.add_error(message="Missing study tab file(s)", supplemental=spl, code=6)
log.error("(E) Study File {} does not appear to exist".format(study_filename))
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)):

62 changes: 31 additions & 31 deletions isatools/isatab/validate/rules/rules_10xx.py
@@ -9,14 +9,14 @@
from isatools.isatab.utils import cell_has_value


- def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
+ def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
"""Checks if samples found in assay tables are found in the study-sample table
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
try:
@@ -25,7 +25,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
study_samples = set(study_df['Sample Name'])
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -40,15 +40,15 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
pass


- def check_study_factor_usage(i_df, dir_context):
+ def check_study_factor_usage(i_df_dict, dir_context):
"""Used for rules 1008 and 1021
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
- study_factors_declared = set(i_df['s_factors'][i]['Study Factor Name'].tolist())
+ for i, study_df in enumerate(i_df_dict['studies']):
+ study_factors_declared = set(i_df_dict['s_factors'][i]['Study Factor Name'].tolist())
study_filename = study_df.iloc[0]['Study File Name']
error_spl = "Some factors used in an study file {} are not declared in the investigation file: {}"
error_msg = "Some factors are not declared in the investigation"
@@ -66,7 +66,7 @@ def check_study_factor_usage(i_df, dir_context):
validator.add_error(message=error_msg, supplemental=spl, code=1008)
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
study_factors_used = set()
@@ -92,7 +92,7 @@ def check_study_factor_usage(i_df, dir_context):
study_factors_used = study_factors_used.union(set(fv))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -109,15 +109,15 @@ def check_study_factor_usage(i_df, dir_context):
.format(list(study_factors_declared - study_factors_used)))


- def check_protocol_usage(i_df, dir_context):
+ def check_protocol_usage(i_df_dict, dir_context):
"""Used for rules 1007 and 1019
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
- protocols_declared = set(i_df['s_protocols'][i]['Study Protocol Name'].tolist())
+ for i, study_df in enumerate(i_df_dict['studies']):
+ protocols_declared = set(i_df_dict['s_protocols'][i]['Study Protocol Name'].tolist())
protocols_declared.add('')
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
@@ -136,7 +136,7 @@ def check_protocol_usage(i_df, dir_context):
log.error("(E) {}".format(spl))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
protocol_refs_used = set()
@@ -165,7 +165,7 @@ def check_protocol_usage(i_df, dir_context):
except FileNotFoundError:
pass
for j, assay_filename in enumerate(
- i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -183,16 +183,16 @@ def check_protocol_usage(i_df, dir_context):
log.warning(warning)


- def check_protocol_parameter_usage(i_df, dir_context):
+ def check_protocol_parameter_usage(i_df_dict, dir_context):
"""Used for rules 1009 and 1020
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
protocol_parameters_declared = set()
- protocol_parameters_per_protocol = set(i_df['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
+ protocol_parameters_per_protocol = set(i_df_dict['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
for protocol_parameters in protocol_parameters_per_protocol:
parameters_list = protocol_parameters.split(';')
protocol_parameters_declared = protocol_parameters_declared.union(set(parameters_list))
@@ -216,7 +216,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
log.error(error)
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
protocol_parameters_used = set()
@@ -246,7 +246,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
protocol_parameters_used = protocol_parameters_used.union(set(pv))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -263,13 +263,13 @@ def check_protocol_parameter_usage(i_df, dir_context):
log.warning(warning)


- def check_protocol_names(i_df):
+ def check_protocol_names(i_df_dict):
"""Used for rule 1010
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_protocols_df in i_df['s_protocols']:
+ for study_protocols_df in i_df_dict['s_protocols']:
for i, protocol_name in enumerate(study_protocols_df['Study Protocol Name'].tolist()):
# DataFrames labels empty cells as 'Unnamed: n'
if protocol_name == '' or 'Unnamed: ' in protocol_name:
@@ -279,13 +279,13 @@ def check_protocol_names(i_df):
log.warning(warning)


- def check_protocol_parameter_names(i_df):
+ def check_protocol_parameter_names(i_df_dict):
"""Used for rule 1011
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_protocols_df in i_df['s_protocols']:
+ for study_protocols_df in i_df_dict['s_protocols']:
for i, protocol_parameters_names in enumerate(study_protocols_df['Study Protocol Parameters Name'].tolist()):
# There's an empty cell if no protocols
if len(protocol_parameters_names.split(sep=';')) > 1:
@@ -298,13 +298,13 @@ def check_protocol_parameter_names(i_df):
log.warning(warning)


- def check_study_factor_names(i_df):
+ def check_study_factor_names(i_df_dict):
"""Used for rule 1012
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_factors_df in i_df['s_factors']:
+ for study_factors_df in i_df_dict['s_factors']:
for i, factor_name in enumerate(study_factors_df['Study Factor Name'].tolist()):
# DataFrames labels empty cells as 'Unnamed: n'
if factor_name == '' or 'Unnamed: ' in factor_name:
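For code that calls these rule functions directly, the rename only affects keyword use; positional calls are unchanged. A minimal, hedged example (the near-empty dictionary is a placeholder for the value normally returned by load_investigation):

    from isatools.isatab.validate.rules.rules_10xx import check_protocol_names

    i_df_dict = {'s_protocols': []}   # placeholder; normally from load_investigation
    check_protocol_names(i_df_dict)   # before this commit the parameter was named i_df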