Merge pull request #529 from ptth222/tab-validate-idf-rename
Rename i_df and investigation_df.
terazus authored Mar 11, 2024
2 parents 6d30dc2 + 33dae30 commit 01dffc7
Showing 8 changed files with 102 additions and 104 deletions.
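For orientation: the value being renamed throughout is the dictionary returned by load_investigation. A minimal sketch of its assumed shape, inferred only from the keys used in the diffs below (the loader may return additional keys, and the column sets here are illustrative):

    from pandas import DataFrame

    # Hypothetical i_df_dict; the keys 'studies', 's_assays', 's_protocols' and
    # 's_factors' are the ones exercised by the validators below, one entry per study.
    i_df_dict = {
        'studies': [DataFrame({'Study File Name': ['s_study.txt']})],
        's_assays': [DataFrame({'Study Assay File Name': ['a_assay.txt']})],
        's_protocols': [DataFrame({'Study Protocol Name': ['sample collection'],
                                   'Study Protocol Type': ['sample collection'],
                                   'Study Protocol Parameters Name': ['']})],
        's_factors': [DataFrame({'Study Factor Name': ['dose']})],
    }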
8 changes: 4 additions & 4 deletions isatools/isatab/validate/core.py
@@ -186,21 +186,21 @@ def validate(fp: TextIO,

built_rules = build_rules(rules)
try:
- i_df = load_investigation(fp=fp)
+ i_df_dict = load_investigation(fp=fp)
params = {
- "investigation_df": i_df,
+ "investigation_df_dict": i_df_dict,
"dir_context": path.dirname(fp.name),
"configs": config_dir,
}
investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation'])

- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
study_validator = ISAStudyValidator(validator=investigation_validator, study_index=i,
study_filename=study_filename, study_df=study_df,
**built_rules['studies'])
assay_tables = list()
- assay_df = study_validator.params['investigation_df']['s_assays'][i]
+ assay_df = study_validator.params['investigation_df_dict']['s_assays'][i]
for x, assay_filename in enumerate(assay_df['Study Assay File Name'].tolist()):
ISAAssayValidator(assay_tables=assay_tables, validator=study_validator, assay_index=x,
assay_df=assay_df, assay_filename=assay_filename, **built_rules['assays'])
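A hedged usage sketch for the function changed above: parameters other than fp are assumed to keep their defaults, and the exact structure of the returned report is not shown in this diff.

    from isatools.isatab.validate.core import validate

    with open('i_investigation.txt', encoding='utf-8') as fp:
        # internally: load_investigation() -> i_df_dict -> investigation/study/assay validators
        report = validate(fp)
    print(report)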

10 changes: 5 additions & 5 deletions isatools/isatab/validate/rules/core.py
@@ -108,14 +108,14 @@ def validate_rules(self, validator):

class ISAInvestigationValidator:
def __init__(self,
- investigation_df: DataFrame,
+ investigation_df_dict: dict,
dir_context: str,
configs: str,
available_rules: list = INVESTIGATION_RULES_MAPPING,
rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES):
""" The ISA investigation validator class
- :param investigation_df: the investigation dataframe
+ :param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: the directory of the investigation
:param configs: directory of the XML config files
:param available_rules: a customizable list of all available rules for investigation objects
@@ -124,7 +124,7 @@ def __init__(self,
self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
self.has_validated = False
self.params = {
- 'investigation_df': investigation_df,
+ 'investigation_df_dict': investigation_df_dict,
'dir_context': dir_context,
'configs': configs,
'term_source_refs': None
@@ -162,8 +162,8 @@ def __init__(self,
self.params['study_sample_table'] = load_table(s_fp)
self.params['study_sample_table'].filename = study_filename

- protocol_names = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Name'].tolist()
- protocol_types = self.params['investigation_df']['s_protocols'][study_index]['Study Protocol Type'].tolist()
+ protocol_names = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Name'].tolist()
+ protocol_types = self.params['investigation_df_dict']['s_protocols'][study_index]['Study Protocol Type'].tolist()
self.params['protocol_names_and_types'] = dict(zip(protocol_names, protocol_types))

self.params['study_group_size_in_comment'] = None
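For callers that build the investigation validator directly, only the keyword changes. A sketch under the assumption that i_df_dict comes from load_investigation as in core.py; the paths are placeholders and the rule arguments keep their defaults.

    from isatools.isatab.validate.rules.core import ISAInvestigationValidator

    validator = ISAInvestigationValidator(
        investigation_df_dict=i_df_dict,   # was investigation_df before this commit
        dir_context='/path/to/isatab',     # directory containing the investigation file
        configs='/path/to/xml-configs',    # directory of the XML configuration files
    )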

36 changes: 18 additions & 18 deletions isatools/isatab/validate/rules/defaults.py
@@ -30,30 +30,30 @@


INVESTIGATION_RULES_MAPPING = [
- {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0006'},
+ {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0006'},

- {'rule': sample_not_declared, 'params': ['investigation_df', 'dir_context'], 'identifier': '1003'},
- {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1007'},
- {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1008'},
- {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1009'},
- {'rule': check_protocol_names, 'params': ['investigation_df'], 'identifier': '1010'},
- {'rule': check_protocol_parameter_names, 'params': ['investigation_df'], 'identifier': '1011'},
- {'rule': check_study_factor_names, 'params': ['investigation_df'], 'identifier': '1012'},
+ {'rule': sample_not_declared, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1003'},
+ {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1007'},
+ {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1008'},
+ {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1009'},
+ {'rule': check_protocol_names, 'params': ['investigation_df_dict'], 'identifier': '1010'},
+ {'rule': check_protocol_parameter_names, 'params': ['investigation_df_dict'], 'identifier': '1011'},
+ {'rule': check_study_factor_names, 'params': ['investigation_df_dict'], 'identifier': '1012'},

- {'rule': check_date_formats, 'params': ['investigation_df'], 'identifier': '3001'},
- {'rule': check_dois, 'params': ['investigation_df'], 'identifier': '3002'},
- {'rule': check_pubmed_ids_format, 'params': ['investigation_df'], 'identifier': '3003'},
- {'rule': check_ontology_sources, 'params': ['investigation_df'], 'identifier': '3008'},
+ {'rule': check_date_formats, 'params': ['investigation_df_dict'], 'identifier': '3001'},
+ {'rule': check_dois, 'params': ['investigation_df_dict'], 'identifier': '3002'},
+ {'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'},
+ {'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'},

{'rule': load_config, 'params': ['configs'], 'identifier': '4001'},
- {'rule': check_measurement_technology_types, 'params': ['investigation_df', 'configs'], 'identifier': '4002'},
- {'rule': check_investigation_against_config, 'params': ['investigation_df', 'configs'], 'identifier': '4003'},
+ {'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'},
+ {'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'},

# copies
- {'rule': check_table_files_read, 'params': ['investigation_df', 'dir_context'], 'identifier': '0008'},
- {'rule': check_protocol_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1019'},
- {'rule': check_protocol_parameter_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1020'},
- {'rule': check_study_factor_usage, 'params': ['investigation_df', 'dir_context'], 'identifier': '1021'},
+ {'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'},
+ {'rule': check_protocol_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1019'},
+ {'rule': check_protocol_parameter_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1020'},
+ {'rule': check_study_factor_usage, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '1021'},
]
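Each mapping entry pairs a rule function with the names of the parameters it needs and a rule identifier. A plausible dispatch sketch, not the library's actual implementation, assuming the names are looked up in the validator's params dict:

    def run_rule(entry, params):
        # e.g. entry['params'] == ['investigation_df_dict', 'dir_context']
        args = [params[name] for name in entry['params']]
        return entry['rule'](*args)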

STUDY_RULES_MAPPING = [

8 changes: 4 additions & 4 deletions isatools/isatab/validate/rules/rules_00xx.py
@@ -5,14 +5,14 @@
from isatools.isatab.defaults import log


- def check_table_files_read(i_df, dir_context):
+ def check_table_files_read(i_df_dict, dir_context):
"""Used for rules 0006 and 0008
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
try:
@@ -22,7 +22,7 @@ def check_table_files_read(i_df, dir_context):
spl = "Study File {} does not appear to exist".format(study_filename)
validator.add_error(message="Missing study tab file(s)", supplemental=spl, code=6)
log.error("(E) Study File {} does not appear to exist".format(study_filename))
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)):

62 changes: 31 additions & 31 deletions isatools/isatab/validate/rules/rules_10xx.py
@@ -9,14 +9,14 @@
from isatools.isatab.utils import cell_has_value


- def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
+ def check_samples_not_declared_in_study_used_in_assay(i_df_dict, dir_context):
"""Checks if samples found in assay tables are found in the study-sample table
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
try:
@@ -25,7 +25,7 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
study_samples = set(study_df['Sample Name'])
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -40,15 +40,15 @@ def check_samples_not_declared_in_study_used_in_assay(i_df, dir_context):
pass


- def check_study_factor_usage(i_df, dir_context):
+ def check_study_factor_usage(i_df_dict, dir_context):
"""Used for rules 1008 and 1021
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
- study_factors_declared = set(i_df['s_factors'][i]['Study Factor Name'].tolist())
+ for i, study_df in enumerate(i_df_dict['studies']):
+ study_factors_declared = set(i_df_dict['s_factors'][i]['Study Factor Name'].tolist())
study_filename = study_df.iloc[0]['Study File Name']
error_spl = "Some factors used in an study file {} are not declared in the investigation file: {}"
error_msg = "Some factors are not declared in the investigation"
@@ -66,7 +66,7 @@ def check_study_factor_usage(i_df, dir_context):
validator.add_error(message=error_msg, supplemental=spl, code=1008)
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
study_factors_used = set()
@@ -92,7 +92,7 @@ def check_study_factor_usage(i_df, dir_context):
study_factors_used = study_factors_used.union(set(fv))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -109,15 +109,15 @@ def check_study_factor_usage(i_df, dir_context):
.format(list(study_factors_declared - study_factors_used)))


- def check_protocol_usage(i_df, dir_context):
+ def check_protocol_usage(i_df_dict, dir_context):
"""Used for rules 1007 and 1019
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
- protocols_declared = set(i_df['s_protocols'][i]['Study Protocol Name'].tolist())
+ for i, study_df in enumerate(i_df_dict['studies']):
+ protocols_declared = set(i_df_dict['s_protocols'][i]['Study Protocol Name'].tolist())
protocols_declared.add('')
study_filename = study_df.iloc[0]['Study File Name']
if study_filename != '':
@@ -136,7 +136,7 @@ def check_protocol_usage(i_df, dir_context):
log.error("(E) {}".format(spl))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
protocol_refs_used = set()
@@ -165,7 +165,7 @@ def check_protocol_usage(i_df, dir_context):
except FileNotFoundError:
pass
for j, assay_filename in enumerate(
- i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -183,16 +183,16 @@ def check_protocol_usage(i_df, dir_context):
log.warning(warning)


- def check_protocol_parameter_usage(i_df, dir_context):
+ def check_protocol_parameter_usage(i_df_dict, dir_context):
"""Used for rules 1009 and 1020
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: Path to where the investigation file is found
:return: None
"""
- for i, study_df in enumerate(i_df['studies']):
+ for i, study_df in enumerate(i_df_dict['studies']):
protocol_parameters_declared = set()
- protocol_parameters_per_protocol = set(i_df['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
+ protocol_parameters_per_protocol = set(i_df_dict['s_protocols'][i]['Study Protocol Parameters Name'].tolist())
for protocol_parameters in protocol_parameters_per_protocol:
parameters_list = protocol_parameters.split(';')
protocol_parameters_declared = protocol_parameters_declared.union(set(parameters_list))
@@ -216,7 +216,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
log.error(error)
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
protocol_parameters_used = set()
@@ -246,7 +246,7 @@ def check_protocol_parameter_usage(i_df, dir_context):
protocol_parameters_used = protocol_parameters_used.union(set(pv))
except FileNotFoundError:
pass
- for j, assay_filename in enumerate(i_df['s_assays'][i]['Study Assay File Name'].tolist()):
+ for j, assay_filename in enumerate(i_df_dict['s_assays'][i]['Study Assay File Name'].tolist()):
if assay_filename != '':
try:
with utf8_text_file_open(path.join(dir_context, assay_filename)) as a_fp:
@@ -263,13 +263,13 @@ def check_protocol_parameter_usage(i_df, dir_context):
log.warning(warning)


- def check_protocol_names(i_df):
+ def check_protocol_names(i_df_dict):
"""Used for rule 1010
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_protocols_df in i_df['s_protocols']:
+ for study_protocols_df in i_df_dict['s_protocols']:
for i, protocol_name in enumerate(study_protocols_df['Study Protocol Name'].tolist()):
# DataFrames labels empty cells as 'Unnamed: n'
if protocol_name == '' or 'Unnamed: ' in protocol_name:
@@ -279,13 +279,13 @@ def check_protocol_names(i_df):
log.warning(warning)


- def check_protocol_parameter_names(i_df):
+ def check_protocol_parameter_names(i_df_dict):
"""Used for rule 1011
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_protocols_df in i_df['s_protocols']:
+ for study_protocols_df in i_df_dict['s_protocols']:
for i, protocol_parameters_names in enumerate(study_protocols_df['Study Protocol Parameters Name'].tolist()):
# There's an empty cell if no protocols
if len(protocol_parameters_names.split(sep=';')) > 1:
@@ -298,13 +298,13 @@ def check_protocol_parameter_names(i_df):
log.warning(warning)


- def check_study_factor_names(i_df):
+ def check_study_factor_names(i_df_dict):
"""Used for rule 1012
- :param i_df: An investigation DataFrame
+ :param i_df_dict: A dictionary of DataFrames and lists of DataFrames representing the investigation file
:return: None
"""
- for study_factors_df in i_df['s_factors']:
+ for study_factors_df in i_df_dict['s_factors']:
for i, factor_name in enumerate(study_factors_df['Study Factor Name'].tolist()):
# DataFrames labels empty cells as 'Unnamed: n'
if factor_name == '' or 'Unnamed: ' in factor_name:
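For code that calls these rule functions directly, the rename only affects keyword use; positional calls are unchanged. A minimal, hedged example (the near-empty dictionary is a placeholder for the value normally returned by load_investigation):

    from isatools.isatab.validate.rules.rules_10xx import check_protocol_names

    i_df_dict = {'s_protocols': []}   # placeholder; normally from load_investigation
    check_protocol_names(i_df_dict)   # before this commit the parameter was named i_df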