From 7332c4a3c5867cdcc400de82958ccef80bf5ceae Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Tue, 3 Sep 2024 11:36:44 -0700
Subject: [PATCH 01/13] feat: adding a func to see if metadata json is valid

---
 pyproject.toml                            |  1 +
 src/aind_metadata_viz/metadata_helpers.py | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 6fe70a5..160976a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     'altair',
     'aind-data-access-api[docdb]',
     'aind-data-schema-models',
+    'aind-data-schema',
     'flask',
 ]

diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py
index dbeceb9..6f29c58 100644
--- a/src/aind_metadata_viz/metadata_helpers.py
+++ b/src/aind_metadata_viz/metadata_helpers.py
@@ -1,3 +1,5 @@
+from aind_data_schema.core.metadata import Metadata
+
 def check_present(key: str, object: dict, check_present: bool = True):
     """Return true if the value of a key exists and is not None, or
     any of '' [] {} in a JSON object
@@ -20,6 +22,17 @@ def check_present(key: str, object: dict, check_present: bool = True):
     return present if check_present else not present


+def check_valid_metadata(json: str):
+    """Return true if the string is a valid aind metadata object
+
+    Parameters
+    ----------
+    json : str
+        json string generated from a Metadata.dump
+    """
+    return Metadata.model_validate_json(json)
+
+
 def process_present_dict(data: dict, expected_fields: list):
     return {field: check_present(field, data) for field in expected_fields}

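A note on the helper added in patch 01: despite the docstring, check_valid_metadata does not return a boolean. In pydantic v2, model_validate_json returns the validated model instance and raises on failure, so callers get a truthy Metadata object or an exception. A minimal sketch of that behavior with a toy model (not the app's code):

    from pydantic import BaseModel, ValidationError

    class Toy(BaseModel):
        name: str

    Toy.model_validate_json('{"name": "ok"}')  # returns Toy(name='ok'), which is truthy
    try:
        Toy.model_validate_json('{}')          # missing required field
    except ValidationError:
        pass                                   # raises rather than returning False

Later patches in this series wrap the call accordingly (first with `is not None`, then with try/except).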
From 90075a834db66424b7425659838e183357a09c72 Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Fri, 4 Oct 2024 12:23:57 -0700
Subject: [PATCH 02/13] refactor: improved color options (but broke the plots)

---
 pyproject.toml                            |  1 +
 src/aind_metadata_viz/app.py              | 70 +++++++++++++++++------
 src/aind_metadata_viz/docdb.py            | 20 +++----
 src/aind_metadata_viz/metadata_helpers.py |  1 +
 4 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 160976a..79d61df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     'pydantic',
     'panel',
     'altair',
+    'aind-data-schema',
     'aind-data-access-api[docdb]',
     'aind-data-schema-models',
     'aind-data-schema',
     'flask',
 ]

diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py
index 74e082c..9bfff2f 100644
--- a/src/aind_metadata_viz/app.py
+++ b/src/aind_metadata_viz/app.py
@@ -1,18 +1,33 @@
 import panel as pn
 import altair as alt
 from aind_metadata_viz import docdb
+from aind_data_schema import __version__ as ads_version

 pn.extension(design="material")
 pn.extension("vega")
 alt.themes.enable("ggplot2")

-color_options = {"default": ["grey", "red"], "lemonade": ["#FFEF00", "pink"]}
+color_options = {
+    "default": {
+        "valid": "green",
+        "present": "grey",
+        "missing": "red",
+        "excluded": "black",
+    },
+    "lemonade": {
+        "valid": "#9FF2F5",
+        "present": "#F49FD7",
+        "missing": "#F49FD7",
+        "excluded": "#9FF2F5",
+    },
+}

 colors = (
     color_options[pn.state.location.query_params["color"]]
     if "color" in pn.state.location.query_params
     else color_options["default"]
 )
+color_list = list(colors.values())

 db = docdb.Database()

@@ -52,8 +67,11 @@ def file_present_chart():
             axis=alt.Axis(grid=False),
         ),
         color=alt.Color(
-            "status:N",
-            scale=alt.Scale(domain=["present", "absent"], range=colors),
+            "category:N",
+            scale=alt.Scale(
+                domain=["valid", "present", "missing", "excluded"],
+                range=color_list,
+            ),
             legend=None,
         ),
     )
@@ -79,8 +97,11 @@ def notfile_present_chart():
             axis=alt.Axis(grid=False),
         ),
         color=alt.Color(
-            "status:N",
-            scale=alt.Scale(domain=["present", "absent"], range=colors),
+            "category:N",
+            scale=alt.Scale(
+                domain=["valid", "present", "missing", "excluded"],
+                range=color_list,
+            ),
             legend=None,
         ),
     )
@@ -159,8 +180,11 @@ def build_mid(selected_file, **args):
             "sum:Q", title="Metadata assets (n)", axis=alt.Axis(grid=False)
         ),
         color=alt.Color(
-            "status:N",
-            scale=alt.Scale(domain=["present", "absent"], range=colors),
+            "category:N",
+            scale=alt.Scale(
+                domain=["valid", "present", "missing", "excluded"],
+                range=color_list,
+            ),
             legend=None,
         ),
     )
@@ -178,11 +202,21 @@ def build_mid(selected_file, **args):
     return pn.pane.Vega(chart)


-header = f"""
-# Missing metadata viewer
+def hd_style(text):
+    return (
+        f"<span style='color:{colors[text]}'>{text}</span>"
+    )

-This app steps through all of the metadata stored in DocDB and checks whether every dictionary key's value is present or missing
-"""
+
+header = (
+    f"# Metadata Portal\n\n"
+    "This app steps through all of the metadata stored in DocDB and determines whether every record's fields "
+    "(and subfields) are "
+    f"{hd_style('valid')} for aind-data-schema v{ads_version}, "
+    f"{hd_style('present')} but invalid, "
+    f"{hd_style('missing')}, or "
+    f"{hd_style('excluded')} for the record's modality."
+)

 download_md = """
 **Download options**
@@ -213,18 +247,22 @@ def build_row(selected_modality, derived_filter):
     return pn.Row(file_present_chart, notfile_present_chart)


-top_row = pn.bind(build_row,
-                  selected_modality=modality_selector,
-                  derived_filter=derived_switch)
+top_row = pn.bind(
+    build_row,
+    selected_modality=modality_selector,
+    derived_filter=derived_switch,
+)

 mid_plot = pn.bind(
     build_mid,
     selected_file=top_selector,
     selected_modality=modality_selector,
-    derived_filter=derived_switch
+    derived_filter=derived_switch,
 )

 # Put everything in a column and buffer it
 main_col = pn.Column(top_row, mid_plot, sizing_mode="stretch_width")

-pn.Row(left_col, main_col, pn.layout.HSpacer()).servable(title="Metadata Viz")
+pn.Row(left_col, main_col, pn.layout.HSpacer()).servable(
+    title="Metadata Portal"
+)
diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py
index 655fd56..96cefc7 100644
--- a/src/aind_metadata_viz/docdb.py
+++ b/src/aind_metadata_viz/docdb.py
@@ -50,6 +50,7 @@ class Database(param.Parameterized):
     """Local representation of aind-data-schema metadata stored in a
     DocDB MongoDB instance
     """
+
     derived_filter = param.Boolean(default=False)
     modality_filter = param.String(default="all")

@@ -80,18 +81,17 @@ def data_filtered(self):
             include: bool = True

             if mod_filter and not (
-                data["data_description"]
-                and "modality" in data["data_description"]
-                and isinstance(data["data_description"]["modality"],
-                               list)
-                and any(
-                    mod["abbreviation"] == self.modality_filter
-                    for mod in data["data_description"]["modality"]
-                )
+                data["data_description"]
+                and "modality" in data["data_description"]
+                and isinstance(data["data_description"]["modality"], list)
+                and any(
+                    mod["abbreviation"] == self.modality_filter
+                    for mod in data["data_description"]["modality"]
+                )
             ):
                 include = False
-
-            if derived_filter and data["name"].count('_') <= 3:
+
+            if derived_filter and data["name"].count("_") <= 3:
                 include = False

             if include:
diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py
index 6f29c58..fa57cab 100644
--- a/src/aind_metadata_viz/metadata_helpers.py
+++ b/src/aind_metadata_viz/metadata_helpers.py
@@ -1,5 +1,6 @@
 from aind_data_schema.core.metadata import Metadata

+
 def check_present(key: str, object: dict, check_present: bool = True):
     """Return true if the value of a key exists and is not None, or
     any of '' [] {} in a JSON object
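The derived_filter heuristic in patch 02 leans on the AIND asset-naming convention: a raw asset name has exactly three underscore-separated parts after the platform (platform_subject_date_time, where dates and times use dashes internally), and derived assets append further underscore-separated suffixes. A sketch of the rule on made-up names (illustrative only, not records from DocDB):

    # Raw asset names carry exactly 3 underscores; derived names carry more.
    raw = "ecephys_625463_2022-09-29_16-10-36"
    derived = "ecephys_625463_2022-09-29_16-10-36_sorted_2022-10-01_10-00-00"

    assert raw.count("_") == 3
    assert derived.count("_") > 3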
From d9a734e18038780af11b367fba3830b6b1cbf42e Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Fri, 4 Oct 2024 14:17:17 -0700
Subject: [PATCH 03/13] feat: first pass adding classes to attempt validation
 against

---
 src/aind_metadata_viz/app.py              |  3 ++-
 src/aind_metadata_viz/metadata_helpers.py | 30 +++++++++++++++++++----
 src/aind_metadata_viz/utils.py            |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py
index 9bfff2f..2066d33 100644
--- a/src/aind_metadata_viz/app.py
+++ b/src/aind_metadata_viz/app.py
@@ -1,5 +1,6 @@
 import panel as pn
 import altair as alt
+import pandas as pd
 from aind_metadata_viz import docdb
 from aind_data_schema import __version__ as ads_version

@@ -67,7 +68,7 @@ def file_present_chart():
             axis=alt.Axis(grid=False),
         ),
         color=alt.Color(
-            "category:N",
+            "status:N",
             scale=alt.Scale(
                 domain=["valid", "present", "missing", "excluded"],
                 range=color_list,
diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py
index fa57cab..ad94c4e 100644
--- a/src/aind_metadata_viz/metadata_helpers.py
+++ b/src/aind_metadata_viz/metadata_helpers.py
@@ -1,4 +1,24 @@
-from aind_data_schema.core.metadata import Metadata
+from aind_data_schema.core.acquisition import Acquisition
+from aind_data_schema.core.data_description import DataDescription
+from aind_data_schema.core.instrument import Instrument
+from aind_data_schema.core.processing import Processing
+from aind_data_schema.core.procedures import Procedures
+from aind_data_schema.core.quality_control import QualityControl
+from aind_data_schema.core.rig import Rig
+from aind_data_schema.core.session import Session
+from aind_data_schema.core.subject import Subject
+
+field_mapping = {
+    "data_description": DataDescription,
+    "acquisition": Acquisition,
+    "procedures": Procedures,
+    "subject": Subject,
+    "instrument": Instrument,
+    "processing": Processing,
+    "rig": Rig,
+    "session": Session,
+    "quality_control": QualityControl,
+}

 def check_present(key: str, object: dict, check_present: bool = True):
     """Return true if the value of a key exists and is not None, or
     any of '' [] {} in a JSON object
@@ -20,6 +22,17 @@ def check_present(key: str, object: dict, check_present: bool = True):
     return present if check_present else not present


-def check_valid_metadata(json: str):
-    """Return true if the string is a valid aind metadata object
+def check_valid_metadata(field:str, json: str):
+    """Return true if the json data is a valid object of the particular field class

     Parameters
     ----------
     json : str
-        json string generated from a Metadata.dump
+        json string generated from a AindCoreModel dump
     """
-    return Metadata.model_validate_json(json)
+    return field_mapping[field].model_validate_json(json) is not None


 def process_present_dict(data: dict, expected_fields: list):
diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py
index 62eccc5..307b2de 100644
--- a/src/aind_metadata_viz/utils.py
+++ b/src/aind_metadata_viz/utils.py
@@ -8,7 +8,7 @@ def compute_count_true(df):
         Dataframe of False/True values
     """
     sum_df = df.sum().to_frame(name="present")
-    sum_df["absent"] = df.shape[0] - sum_df["present"]
+    sum_df["missing"] = df.shape[0] - sum_df["present"]

     sum_longform_df = sum_df.reset_index().melt(
         id_vars="index", var_name="status", value_name="sum"

From 5d4e34cf7363f8bbbc2c7d0b2f9ccfb8fd8e4ab3 Mon Sep 17 00:00:00 2001
From: Mae 
Moninghoff Date: Fri, 4 Oct 2024 16:28:03 -0700 Subject: [PATCH 04/13] Adding Secondary field mapping layer --- src/aind_metadata_viz/metadata_helpers.py | 202 +++++++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py index ad94c4e..2891558 100644 --- a/src/aind_metadata_viz/metadata_helpers.py +++ b/src/aind_metadata_viz/metadata_helpers.py @@ -1,3 +1,6 @@ +### class that does xxxx for something with QC + +# First level metadata models from aind_data_schema.core.acquisition import Acquisition from aind_data_schema.core.data_description import DataDescription from aind_data_schema.core.instrument import Instrument @@ -8,7 +11,97 @@ from aind_data_schema.core.session import Session from aind_data_schema.core.subject import Subject -field_mapping = { +# General Models +from typing import List, Optional, Dict, Union, Set +from datetime import date, datetime + +# Acquisition Models +from aind_data_schema.components.devices import Calibration, Maintenance, Software +from aind_data_schema.components.tile import AcquisitionTile +from aind_data_schema.components.coordinates import ImageAxis +from aind_data_schema.core.acquisition import Immersion, ProcessingSteps + +# Data Description Models +from aind_data_schema_models.modalities import Modality +from aind_data_schema_models.organizations import Organization +from aind_data_schema_models.pid_names import PIDName +from aind_data_schema_models.platforms import Platform +from aind_data_schema_models.data_name_patterns import ( + DataLevel, + Group, +) +from aind_data_schema.core.data_description import RelatedData, Funding + +# Instrument Models +from aind_data_schema.components.devices import ( + LIGHT_SOURCES, + AdditionalImagingDevice, + DAQDevice, + Detector, + Enclosure, + Filter, + ImagingInstrumentType, + Lens, + MotorizedStage, + Objective, + OpticalTable, + ScanningStage, +) +from aind_data_schema.core.instrument import Com + +# Metadata Models +from aind_data_schema.core.metadata import MetadataStatus, ExternalPlatforms + +# Procedures Models +from aind_data_schema.core.procedures import ( + Surgery, + TrainingProtocol, + WaterRestriction, + OtherSubjectProcedure +) + +# Processing Models +from aind_data_schema.core.processing import AnalysisProcess, PipelineProcess + +# Quality Control Models +from aind_data_schema.core.quality_control import QCStatus, QCEvaluation + +# Rig Models +from aind_data_schema.core.rig import MOUSE_PLATFORMS, STIMULUS_DEVICES, RIG_DAQ_DEVICES +from aind_data_schema.components.coordinates import Axis, Origin +from aind_data_schema.components.devices import ( + LIGHT_SOURCES, + Calibration, + CameraAssembly, + DAQDevice, + Detector, + Device, + DigitalMicromirrorDevice, + Enclosure, + EphysAssembly, + FiberAssembly, + Filter, + LaserAssembly, + Lens, + Objective, + Patch, + PolygonalScanner, +) + +# Session Models +from aind_data_schema_models.units import ( + MassUnit, + VolumeUnit, +) +from aind_data_schema.core.procedures import Anaesthetic +from aind_data_schema.core.session import Stream, StimulusEpoch, RewardDeliveryConfig +from aind_data_schema.components.coordinates import Affine3dTransform + +# Subject Models +from aind_data_schema.core.subject import BackgroundStrain, BreedingInfo, WellnessReport, Housing +from aind_data_schema_models.species import Species + +first_layer_field_mapping = { "data_description": DataDescription, "acquisition": Acquisition, "procedures": Procedures, @@ -19,6 
+112,113 @@ "session": Session, "quality_control": QualityControl, } +second_layer_field_mappings = { + "acquisition": { + "calibrations": List[Calibration], + "maintenance": List[Maintenance], + "tiles": List[AcquisitionTile], + "axes": List[ImageAxis], + "chamber_immersion": Immersion, + "sample_immersion": Optional[Immersion], + "processing_steps": List[ProcessingSteps], + "software": Optional[List[Software]], + }, + "data_description": { + "data_level": DataLevel, + "group": Optional[Group], + "investigators": List[PIDName], + "modality": List[Modality], + "related_data": List[RelatedData], + "platform": Platform.ONE_OF, + "funding_source": List[Funding], + "institution": Organization.RESEARCH_INSTITUTIONS, + }, + "instrument": { + "instrument_type": ImagingInstrumentType, + "manufacturer": Organization.ONE_OF, + "optical_tables": List[OpticalTable], + "enclosure": Optional[Enclosure], + "objectives": List[Objective], + "detectors": List[Detector], + "light_sources": List[LIGHT_SOURCES], + "lenses": List[Lens], + "fluorescence_filters": List[Filter], + "motorized_stages": List[MotorizedStage], + "scanning_stages": List[ScanningStage], + "additional_devices": List[AdditionalImagingDevice], + "calibration_date": Optional[date], + "com_ports": List[Com], + "daqs": List[DAQDevice], + }, + "metadata": { + **first_layer_field_mapping, + "created": datetime, + "last_modified": datetime, + "metadata_status": MetadataStatus, + "external_links": Dict[ExternalPlatforms, List[str]], + }, + "procedures": { + "subject_procedures": List[ # This one is really weird, not sure how to go about converting it. All of the procedures schema will be difficult to do this with, since some fields can have a range of 12+ models input into them. + Union[Surgery, TrainingProtocol, WaterRestriction, OtherSubjectProcedure], + ], + }, + "processing": { + "processing_pipeline": PipelineProcess, + "analyses": List[AnalysisProcess], + }, + "quality_control": { + "overall_status": List[QCStatus], + "evaluations": List[QCEvaluation], + }, + "rig": { + "modification_date": date, + "mouse_platform": MOUSE_PLATFORMS, + "stimulus_devices": List[STIMULUS_DEVICES], + "cameras": List[CameraAssembly], + "enclosure": Optional[Enclosure], + "ephys_assemblies": List[EphysAssembly], + "fiber_assemblies": List[FiberAssembly], + "stick_microscopes": List[CameraAssembly], + "laser_assemblies": List[LaserAssembly], + "patch_cords": List[Patch], + "light_sources": List[LIGHT_SOURCES], + "detectors": List[Detector], + "objectives": List[Objective], + "filters": List[Filter], + "lenses": List[Lens], + "digital_micromirror_devices": List[DigitalMicromirrorDevice], + "polygonal_scanners": List[PolygonalScanner], + "additional_devices": List[Device], + "daqs": List[RIG_DAQ_DEVICES], + "calibrations": List[Calibration], + "origin": Optional[Origin], + "rig_axes": Optional[List[Axis]], + "modalities": Set[Modality.ONE_OF] + }, + "session": { + "calibrations": List[Calibration], + "maintenance": List[Maintenance], + "weight_unit": MassUnit, + "anaesthesia": Optional[Anaesthetic], + "data_streams": List[Stream], + "stimulus_epochs": List[StimulusEpoch], + "headframe_registration": Optional[Affine3dTransform], + "reward_delivery": Optional[RewardDeliveryConfig], + "reward_consumed_unit": VolumeUnit + }, + "subject": { + "date_of_birth": date, + "species": Species.ONE_OF, + "alleles": List[PIDName], + "background_strain": Optional[BackgroundStrain], + "breeding_info": Optional[BreedingInfo], + "source": Organization.SUBJECT_SOURCES, + "rrid": 
Optional[PIDName], + "wellness_reports": List[WellnessReport], + "housing": Optional[Housing], + + }, +} def check_present(key: str, object: dict, check_present: bool = True): From b15fea4809373e5bed10b216d732dd4a3c15d564 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Sun, 6 Oct 2024 10:59:59 -0700 Subject: [PATCH 05/13] chore: lint and move mappings to separate file --- src/aind_metadata_viz/metadata_class_map.py | 242 ++++++++++++++++++++ src/aind_metadata_viz/metadata_helpers.py | 43 +++- 2 files changed, 274 insertions(+), 11 deletions(-) create mode 100644 src/aind_metadata_viz/metadata_class_map.py diff --git a/src/aind_metadata_viz/metadata_class_map.py b/src/aind_metadata_viz/metadata_class_map.py new file mode 100644 index 0000000..57436ea --- /dev/null +++ b/src/aind_metadata_viz/metadata_class_map.py @@ -0,0 +1,242 @@ +### class that does xxxx for something with QC + +# First level metadata models +from aind_data_schema.core.acquisition import Acquisition +from aind_data_schema.core.data_description import DataDescription +from aind_data_schema.core.instrument import Instrument +from aind_data_schema.core.processing import Processing +from aind_data_schema.core.procedures import Procedures +from aind_data_schema.core.quality_control import QualityControl +from aind_data_schema.core.rig import Rig +from aind_data_schema.core.session import Session +from aind_data_schema.core.subject import Subject + +# General Models +from typing import List, Optional, Dict, Union, Set +from datetime import date, datetime + +# Acquisition Models +from aind_data_schema.components.devices import ( + Calibration, + Maintenance, + Software, +) +from aind_data_schema.components.tile import AcquisitionTile +from aind_data_schema.components.coordinates import ImageAxis +from aind_data_schema.core.acquisition import Immersion, ProcessingSteps + +# Data Description Models +from aind_data_schema_models.modalities import Modality +from aind_data_schema_models.organizations import Organization +from aind_data_schema_models.pid_names import PIDName +from aind_data_schema_models.platforms import Platform +from aind_data_schema_models.data_name_patterns import ( + DataLevel, + Group, +) +from aind_data_schema.core.data_description import RelatedData, Funding + +# Instrument Models +from aind_data_schema.components.devices import ( + LIGHT_SOURCES, + AdditionalImagingDevice, + DAQDevice, + Detector, + Enclosure, + Filter, + ImagingInstrumentType, + Lens, + MotorizedStage, + Objective, + OpticalTable, + ScanningStage, +) +from aind_data_schema.core.instrument import Com + +# Metadata Models +from aind_data_schema.core.metadata import MetadataStatus, ExternalPlatforms + +# Procedures Models +from aind_data_schema.core.procedures import ( + Surgery, + TrainingProtocol, + WaterRestriction, + OtherSubjectProcedure, +) + +# Processing Models +from aind_data_schema.core.processing import AnalysisProcess, PipelineProcess + +# Quality Control Models +from aind_data_schema.core.quality_control import QCStatus, QCEvaluation + +# Rig Models +from aind_data_schema.core.rig import ( + MOUSE_PLATFORMS, + STIMULUS_DEVICES, + RIG_DAQ_DEVICES, +) +from aind_data_schema.components.coordinates import Axis, Origin +from aind_data_schema.components.devices import ( + LIGHT_SOURCES, + Calibration, + CameraAssembly, + DAQDevice, + Detector, + Device, + DigitalMicromirrorDevice, + Enclosure, + EphysAssembly, + FiberAssembly, + Filter, + LaserAssembly, + Lens, + Objective, + Patch, + PolygonalScanner, +) + +# Session Models +from 
aind_data_schema_models.units import ( + MassUnit, + VolumeUnit, +) +from aind_data_schema.core.procedures import Anaesthetic +from aind_data_schema.core.session import ( + Stream, + StimulusEpoch, + RewardDeliveryConfig, +) +from aind_data_schema.components.coordinates import Affine3dTransform + +# Subject Models +from aind_data_schema.core.subject import ( + BackgroundStrain, + BreedingInfo, + WellnessReport, + Housing, +) +from aind_data_schema_models.species import Species + +first_layer_field_mapping = { + "data_description": DataDescription, + "acquisition": Acquisition, + "procedures": Procedures, + "subject": Subject, + "instrument": Instrument, + "processing": Processing, + "rig": Rig, + "session": Session, + "quality_control": QualityControl, +} +second_layer_field_mappings = { + "acquisition": { + "calibrations": List[Calibration], + "maintenance": List[Maintenance], + "tiles": List[AcquisitionTile], + "axes": List[ImageAxis], + "chamber_immersion": Immersion, + "sample_immersion": Optional[Immersion], + "processing_steps": List[ProcessingSteps], + "software": Optional[List[Software]], + }, + "data_description": { + "data_level": DataLevel, + "group": Optional[Group], + "investigators": List[PIDName], + "modality": List[Modality], + "related_data": List[RelatedData], + "platform": Platform.ONE_OF, + "funding_source": List[Funding], + "institution": Organization.RESEARCH_INSTITUTIONS, + }, + "instrument": { + "instrument_type": ImagingInstrumentType, + "manufacturer": Organization.ONE_OF, + "optical_tables": List[OpticalTable], + "enclosure": Optional[Enclosure], + "objectives": List[Objective], + "detectors": List[Detector], + "light_sources": List[LIGHT_SOURCES], + "lenses": List[Lens], + "fluorescence_filters": List[Filter], + "motorized_stages": List[MotorizedStage], + "scanning_stages": List[ScanningStage], + "additional_devices": List[AdditionalImagingDevice], + "calibration_date": Optional[date], + "com_ports": List[Com], + "daqs": List[DAQDevice], + }, + "metadata": { + **first_layer_field_mapping, + "created": datetime, + "last_modified": datetime, + "metadata_status": MetadataStatus, + "external_links": Dict[ExternalPlatforms, List[str]], + }, + "procedures": { + "subject_procedures": List[ # This one is really weird, not sure how to go about converting it. All of the procedures schema will be difficult to do this with, since some fields can have a range of 12+ models input into them. 
+ Union[ + Surgery, + TrainingProtocol, + WaterRestriction, + OtherSubjectProcedure, + ], + ], + }, + "processing": { + "processing_pipeline": PipelineProcess, + "analyses": List[AnalysisProcess], + }, + "quality_control": { + "overall_status": List[QCStatus], + "evaluations": List[QCEvaluation], + }, + "rig": { + "modification_date": date, + "mouse_platform": MOUSE_PLATFORMS, + "stimulus_devices": List[STIMULUS_DEVICES], + "cameras": List[CameraAssembly], + "enclosure": Optional[Enclosure], + "ephys_assemblies": List[EphysAssembly], + "fiber_assemblies": List[FiberAssembly], + "stick_microscopes": List[CameraAssembly], + "laser_assemblies": List[LaserAssembly], + "patch_cords": List[Patch], + "light_sources": List[LIGHT_SOURCES], + "detectors": List[Detector], + "objectives": List[Objective], + "filters": List[Filter], + "lenses": List[Lens], + "digital_micromirror_devices": List[DigitalMicromirrorDevice], + "polygonal_scanners": List[PolygonalScanner], + "additional_devices": List[Device], + "daqs": List[RIG_DAQ_DEVICES], + "calibrations": List[Calibration], + "origin": Optional[Origin], + "rig_axes": Optional[List[Axis]], + "modalities": Set[Modality.ONE_OF], + }, + "session": { + "calibrations": List[Calibration], + "maintenance": List[Maintenance], + "weight_unit": MassUnit, + "anaesthesia": Optional[Anaesthetic], + "data_streams": List[Stream], + "stimulus_epochs": List[StimulusEpoch], + "headframe_registration": Optional[Affine3dTransform], + "reward_delivery": Optional[RewardDeliveryConfig], + "reward_consumed_unit": VolumeUnit, + }, + "subject": { + "date_of_birth": date, + "species": Species.ONE_OF, + "alleles": List[PIDName], + "background_strain": Optional[BackgroundStrain], + "breeding_info": Optional[BreedingInfo], + "source": Organization.SUBJECT_SOURCES, + "rrid": Optional[PIDName], + "wellness_reports": List[WellnessReport], + "housing": Optional[Housing], + }, +} diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py index 2891558..ccaf65b 100644 --- a/src/aind_metadata_viz/metadata_helpers.py +++ b/src/aind_metadata_viz/metadata_helpers.py @@ -16,7 +16,11 @@ from datetime import date, datetime # Acquisition Models -from aind_data_schema.components.devices import Calibration, Maintenance, Software +from aind_data_schema.components.devices import ( + Calibration, + Maintenance, + Software, +) from aind_data_schema.components.tile import AcquisitionTile from aind_data_schema.components.coordinates import ImageAxis from aind_data_schema.core.acquisition import Immersion, ProcessingSteps @@ -57,7 +61,7 @@ Surgery, TrainingProtocol, WaterRestriction, - OtherSubjectProcedure + OtherSubjectProcedure, ) # Processing Models @@ -67,7 +71,11 @@ from aind_data_schema.core.quality_control import QCStatus, QCEvaluation # Rig Models -from aind_data_schema.core.rig import MOUSE_PLATFORMS, STIMULUS_DEVICES, RIG_DAQ_DEVICES +from aind_data_schema.core.rig import ( + MOUSE_PLATFORMS, + STIMULUS_DEVICES, + RIG_DAQ_DEVICES, +) from aind_data_schema.components.coordinates import Axis, Origin from aind_data_schema.components.devices import ( LIGHT_SOURCES, @@ -94,11 +102,20 @@ VolumeUnit, ) from aind_data_schema.core.procedures import Anaesthetic -from aind_data_schema.core.session import Stream, StimulusEpoch, RewardDeliveryConfig +from aind_data_schema.core.session import ( + Stream, + StimulusEpoch, + RewardDeliveryConfig, +) from aind_data_schema.components.coordinates import Affine3dTransform # Subject Models -from aind_data_schema.core.subject 
import BackgroundStrain, BreedingInfo, WellnessReport, Housing +from aind_data_schema.core.subject import ( + BackgroundStrain, + BreedingInfo, + WellnessReport, + Housing, +) from aind_data_schema_models.species import Species first_layer_field_mapping = { @@ -158,8 +175,13 @@ "external_links": Dict[ExternalPlatforms, List[str]], }, "procedures": { - "subject_procedures": List[ # This one is really weird, not sure how to go about converting it. All of the procedures schema will be difficult to do this with, since some fields can have a range of 12+ models input into them. - Union[Surgery, TrainingProtocol, WaterRestriction, OtherSubjectProcedure], + "subject_procedures": List[ # This one is really weird, not sure how to go about converting it. All of the procedures schema will be difficult to do this with, since some fields can have a range of 12+ models input into them. + Union[ + Surgery, + TrainingProtocol, + WaterRestriction, + OtherSubjectProcedure, + ], ], }, "processing": { @@ -193,7 +215,7 @@ "calibrations": List[Calibration], "origin": Optional[Origin], "rig_axes": Optional[List[Axis]], - "modalities": Set[Modality.ONE_OF] + "modalities": Set[Modality.ONE_OF], }, "session": { "calibrations": List[Calibration], @@ -204,7 +226,7 @@ "stimulus_epochs": List[StimulusEpoch], "headframe_registration": Optional[Affine3dTransform], "reward_delivery": Optional[RewardDeliveryConfig], - "reward_consumed_unit": VolumeUnit + "reward_consumed_unit": VolumeUnit, }, "subject": { "date_of_birth": date, @@ -216,7 +238,6 @@ "rrid": Optional[PIDName], "wellness_reports": List[WellnessReport], "housing": Optional[Housing], - }, } @@ -243,7 +264,7 @@ def check_present(key: str, object: dict, check_present: bool = True): return present if check_present else not present -def check_valid_metadata(field:str, json: str): +def check_valid_metadata(field: str, json: str): """Return true if the json data is a valid object of the particular field class Parameters From e9d7637c85b2d46dd3aa12b94fbfdaf1fea58c6e Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Sun, 6 Oct 2024 11:33:24 -0700 Subject: [PATCH 06/13] refactor: replacing core present/missing check with valid/present/missing/excluded check --- src/aind_metadata_viz/docdb.py | 12 +- src/aind_metadata_viz/metadata_helpers.py | 328 +++++----------------- src/aind_metadata_viz/utils.py | 10 + tests/test_metadata_helpers.py | 24 +- 4 files changed, 94 insertions(+), 280 deletions(-) diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py index 96cefc7..ce98a2e 100644 --- a/src/aind_metadata_viz/docdb.py +++ b/src/aind_metadata_viz/docdb.py @@ -8,8 +8,8 @@ from aind_data_schema_models.modalities import Modality from aind_metadata_viz.metadata_helpers import ( - process_present_list, - check_present, + process_record_list, + _metadata_present_helper, ) from aind_metadata_viz.utils import compute_count_true @@ -108,7 +108,7 @@ def get_file_presence(self, files: list = EXPECTED_FILES): files : list[str], optional List of expected metadata filenames, by default EXPECTED_FILES """ - processed = process_present_list(self.data_filtered, files) + processed = process_record_list(self.data_filtered, files) df = pd.DataFrame(processed, columns=files) return compute_count_true(df) @@ -138,7 +138,7 @@ def set_file(self, file: str = EXPECTED_FILES[0]): self.mid_list = [] for data in self.data_filtered: - if check_present(self.file, data): + if _metadata_present_helper(self.file, data): self.mid_list.append(data[self.file]) def 
get_file_field_presence(self): @@ -157,7 +157,7 @@ def get_file_field_presence(self): expected_fields = ( self.mid_list[0].keys() if len(self.mid_list) > 0 else [] ) - processed = process_present_list(self.mid_list, expected_fields) + processed = process_record_list(self.mid_list, expected_fields) df = pd.DataFrame(processed, columns=expected_fields) return compute_count_true(df) @@ -185,7 +185,7 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): df_data = [] for data in self.data_filtered: if not data[file] is None: - if field == " " or check_present( + if field == " " or _metadata_present_helper( field, data[file], check_present=get_present ): # This file/field combo is present/missing, get all the id diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py index ccaf65b..4b09b3f 100644 --- a/src/aind_metadata_viz/metadata_helpers.py +++ b/src/aind_metadata_viz/metadata_helpers.py @@ -1,248 +1,9 @@ -### class that does xxxx for something with QC +from aind_metadata_viz.metadata_class_map import first_layer_field_mapping, second_layer_field_mappings +from aind_metadata_viz.utils import MetaState +from pydantic import ValidationError -# First level metadata models -from aind_data_schema.core.acquisition import Acquisition -from aind_data_schema.core.data_description import DataDescription -from aind_data_schema.core.instrument import Instrument -from aind_data_schema.core.processing import Processing -from aind_data_schema.core.procedures import Procedures -from aind_data_schema.core.quality_control import QualityControl -from aind_data_schema.core.rig import Rig -from aind_data_schema.core.session import Session -from aind_data_schema.core.subject import Subject -# General Models -from typing import List, Optional, Dict, Union, Set -from datetime import date, datetime - -# Acquisition Models -from aind_data_schema.components.devices import ( - Calibration, - Maintenance, - Software, -) -from aind_data_schema.components.tile import AcquisitionTile -from aind_data_schema.components.coordinates import ImageAxis -from aind_data_schema.core.acquisition import Immersion, ProcessingSteps - -# Data Description Models -from aind_data_schema_models.modalities import Modality -from aind_data_schema_models.organizations import Organization -from aind_data_schema_models.pid_names import PIDName -from aind_data_schema_models.platforms import Platform -from aind_data_schema_models.data_name_patterns import ( - DataLevel, - Group, -) -from aind_data_schema.core.data_description import RelatedData, Funding - -# Instrument Models -from aind_data_schema.components.devices import ( - LIGHT_SOURCES, - AdditionalImagingDevice, - DAQDevice, - Detector, - Enclosure, - Filter, - ImagingInstrumentType, - Lens, - MotorizedStage, - Objective, - OpticalTable, - ScanningStage, -) -from aind_data_schema.core.instrument import Com - -# Metadata Models -from aind_data_schema.core.metadata import MetadataStatus, ExternalPlatforms - -# Procedures Models -from aind_data_schema.core.procedures import ( - Surgery, - TrainingProtocol, - WaterRestriction, - OtherSubjectProcedure, -) - -# Processing Models -from aind_data_schema.core.processing import AnalysisProcess, PipelineProcess - -# Quality Control Models -from aind_data_schema.core.quality_control import QCStatus, QCEvaluation - -# Rig Models -from aind_data_schema.core.rig import ( - MOUSE_PLATFORMS, - STIMULUS_DEVICES, - RIG_DAQ_DEVICES, -) -from aind_data_schema.components.coordinates import Axis, 
Origin -from aind_data_schema.components.devices import ( - LIGHT_SOURCES, - Calibration, - CameraAssembly, - DAQDevice, - Detector, - Device, - DigitalMicromirrorDevice, - Enclosure, - EphysAssembly, - FiberAssembly, - Filter, - LaserAssembly, - Lens, - Objective, - Patch, - PolygonalScanner, -) - -# Session Models -from aind_data_schema_models.units import ( - MassUnit, - VolumeUnit, -) -from aind_data_schema.core.procedures import Anaesthetic -from aind_data_schema.core.session import ( - Stream, - StimulusEpoch, - RewardDeliveryConfig, -) -from aind_data_schema.components.coordinates import Affine3dTransform - -# Subject Models -from aind_data_schema.core.subject import ( - BackgroundStrain, - BreedingInfo, - WellnessReport, - Housing, -) -from aind_data_schema_models.species import Species - -first_layer_field_mapping = { - "data_description": DataDescription, - "acquisition": Acquisition, - "procedures": Procedures, - "subject": Subject, - "instrument": Instrument, - "processing": Processing, - "rig": Rig, - "session": Session, - "quality_control": QualityControl, -} -second_layer_field_mappings = { - "acquisition": { - "calibrations": List[Calibration], - "maintenance": List[Maintenance], - "tiles": List[AcquisitionTile], - "axes": List[ImageAxis], - "chamber_immersion": Immersion, - "sample_immersion": Optional[Immersion], - "processing_steps": List[ProcessingSteps], - "software": Optional[List[Software]], - }, - "data_description": { - "data_level": DataLevel, - "group": Optional[Group], - "investigators": List[PIDName], - "modality": List[Modality], - "related_data": List[RelatedData], - "platform": Platform.ONE_OF, - "funding_source": List[Funding], - "institution": Organization.RESEARCH_INSTITUTIONS, - }, - "instrument": { - "instrument_type": ImagingInstrumentType, - "manufacturer": Organization.ONE_OF, - "optical_tables": List[OpticalTable], - "enclosure": Optional[Enclosure], - "objectives": List[Objective], - "detectors": List[Detector], - "light_sources": List[LIGHT_SOURCES], - "lenses": List[Lens], - "fluorescence_filters": List[Filter], - "motorized_stages": List[MotorizedStage], - "scanning_stages": List[ScanningStage], - "additional_devices": List[AdditionalImagingDevice], - "calibration_date": Optional[date], - "com_ports": List[Com], - "daqs": List[DAQDevice], - }, - "metadata": { - **first_layer_field_mapping, - "created": datetime, - "last_modified": datetime, - "metadata_status": MetadataStatus, - "external_links": Dict[ExternalPlatforms, List[str]], - }, - "procedures": { - "subject_procedures": List[ # This one is really weird, not sure how to go about converting it. All of the procedures schema will be difficult to do this with, since some fields can have a range of 12+ models input into them. 
-            Union[
-                Surgery,
-                TrainingProtocol,
-                WaterRestriction,
-                OtherSubjectProcedure,
-            ],
-        ],
-    },
-    "processing": {
-        "processing_pipeline": PipelineProcess,
-        "analyses": List[AnalysisProcess],
-    },
-    "quality_control": {
-        "overall_status": List[QCStatus],
-        "evaluations": List[QCEvaluation],
-    },
-    "rig": {
-        "modification_date": date,
-        "mouse_platform": MOUSE_PLATFORMS,
-        "stimulus_devices": List[STIMULUS_DEVICES],
-        "cameras": List[CameraAssembly],
-        "enclosure": Optional[Enclosure],
-        "ephys_assemblies": List[EphysAssembly],
-        "fiber_assemblies": List[FiberAssembly],
-        "stick_microscopes": List[CameraAssembly],
-        "laser_assemblies": List[LaserAssembly],
-        "patch_cords": List[Patch],
-        "light_sources": List[LIGHT_SOURCES],
-        "detectors": List[Detector],
-        "objectives": List[Objective],
-        "filters": List[Filter],
-        "lenses": List[Lens],
-        "digital_micromirror_devices": List[DigitalMicromirrorDevice],
-        "polygonal_scanners": List[PolygonalScanner],
-        "additional_devices": List[Device],
-        "daqs": List[RIG_DAQ_DEVICES],
-        "calibrations": List[Calibration],
-        "origin": Optional[Origin],
-        "rig_axes": Optional[List[Axis]],
-        "modalities": Set[Modality.ONE_OF],
-    },
-    "session": {
-        "calibrations": List[Calibration],
-        "maintenance": List[Maintenance],
-        "weight_unit": MassUnit,
-        "anaesthesia": Optional[Anaesthetic],
-        "data_streams": List[Stream],
-        "stimulus_epochs": List[StimulusEpoch],
-        "headframe_registration": Optional[Affine3dTransform],
-        "reward_delivery": Optional[RewardDeliveryConfig],
-        "reward_consumed_unit": VolumeUnit,
-    },
-    "subject": {
-        "date_of_birth": date,
-        "species": Species.ONE_OF,
-        "alleles": List[PIDName],
-        "background_strain": Optional[BackgroundStrain],
-        "breeding_info": Optional[BreedingInfo],
-        "source": Organization.SUBJECT_SOURCES,
-        "rrid": Optional[PIDName],
-        "wellness_reports": List[WellnessReport],
-        "housing": Optional[Housing],
-    },
-}
-
-
-def check_present(key: str, object: dict, check_present: bool = True):
+def _metadata_present_helper(json: str, check_present: bool = True):
     """Return true if the value of a key exists and is not None, or
     any of '' [] {} in a JSON object

@@ -254,17 +15,15 @@ def check_valid_metadata(field: str, json: str):
         Dictionary
     """
     present = (
-        object[key] is not None
-        and object[key] != ""
-        and object[key] != []
-        and object[key] != {}
-        if key in object
-        else False
+        json is not None
+        and json != ""
+        and json != []
+        and json != {}
     )
     return present if check_present else not present


-def check_valid_metadata(field: str, json: str):
+def _metadata_valid_helper(field: str, json: str, mapping: dict, ):
     """Return true if the json data is a valid object of the particular field class

     Parameters
     ----------
     json : str
         json string generated from a AindCoreModel dump
     """
-    return field_mapping[field].model_validate_json(json) is not None
+    if field in mapping:
+        try:
+            return mapping[field].model_validate_json(json) is not None
+        except ValidationError as e:
+            print(e)
+            return False


-def process_present_dict(data: dict, expected_fields: list):
-    return {field: check_present(field, data) for field in expected_fields}
+def check_metadata_state(field: str, object: dict, parent: str = None, excluded_fields: list = []) -> MetaState:
+    """Get the MetaState for a specific key in a dictionary
+
+    Parameters
+    ----------
+    field : str
+        Field to check
+    object : dict
+        {field: value}
+
+    Returns
+    -------
+    MetaState
+        Whether the field is valid, present, missing, or excluded
+    """
+    # if excluded, just return that
+    if field in excluded_fields:
+        return MetaState.EXCLUDED
+
+    # if you're looking at a parent file's data then you need a different mapping
+    if parent:
+        print('not implemented')
+    # we're at the top level, just check the first layer mappings
+    else:
+        class_map = first_layer_field_mapping
+
+    # First check that the key exists at all
+    if field in object:
+        value = object[field]
+    else:
+        return MetaState.MISSING
+
+    # attempt validation
+    if _metadata_valid_helper(field, value, class_map):
+        return MetaState.VALID
+
+    # check missing
+    if _metadata_present_helper(value):
+        return MetaState.PRESENT
+
+    return MetaState.MISSING


-def process_present_list(data_list: list, expected_fields: list):
-    """Process a data JSON
+def process_record_list(record_list: list, expected_fields: list):
+    """Process a list of Metadata JSON records from DocDB
+
+    For each record, check each of the expected fields and see if they are valid/present/missing/excluded

     Parameters
     ----------
-    data_list : _type_
-        _description_
-    expected_files : _type_
-        _description_
+    record_list : list[dict]
+        List of metadata json records as dicts
+    expected_fields : list[str]
+        List of key fields to check

     Returns
     -------
-    _type_
-        _description_
+    list[{field: MetaState}]
     """
-    return [process_present_dict(data, expected_fields) for data in data_list]
+    return [{field: check_metadata_state(field, data) for field in expected_fields} for data in record_list]
diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py
index 307b2de..ea06ccf 100644
--- a/src/aind_metadata_viz/utils.py
+++ b/src/aind_metadata_viz/utils.py
@@ -1,3 +1,13 @@
+from enum import Enum
+
+
+class MetaState(str, Enum):
+    VALID = "valid"
+    PRESENT = "present"
+    MISSING = "missing"
+    EXCLUDED = "excluded"
+
+
 def compute_count_true(df):
     """For each column, compute the count of true values and return as
     a longform dataframe
diff --git a/tests/test_metadata_helpers.py b/tests/test_metadata_helpers.py
index 33306b9..33a9c33 100644
--- a/tests/test_metadata_helpers.py
+++ b/tests/test_metadata_helpers.py
@@ -2,9 +2,9 @@
 import unittest

 from aind_metadata_viz.metadata_helpers import (
-    check_present,
+    _metadata_present_helper,
     process_present_dict,
-    process_present_list,
+    process_record_list,
 )


@@ -49,17 +49,17 @@ def setUp(self) -> None:

     def test_check_present(self):
         """Test the check_present function"""
-        self.assertFalse(check_present("test1", self.dict))
-        self.assertFalse(check_present("test2", self.dict))
-        self.assertFalse(check_present("test3", self.dict))
-        self.assertFalse(check_present("test4", self.dict))
+        self.assertFalse(_metadata_present_helper("test1", self.dict))
+        self.assertFalse(_metadata_present_helper("test2", self.dict))
+        self.assertFalse(_metadata_present_helper("test3", self.dict))
+        self.assertFalse(_metadata_present_helper("test4", self.dict))

-        self.assertTrue(check_present("test5", self.dict))
-        self.assertTrue(check_present("test6", self.dict))
-        self.assertTrue(check_present("test7", self.dict))
-        self.assertTrue(check_present("test8", self.dict))
+        self.assertTrue(_metadata_present_helper("test5", self.dict))
+        self.assertTrue(_metadata_present_helper("test6", self.dict))
+        self.assertTrue(_metadata_present_helper("test7", self.dict))
+        self.assertTrue(_metadata_present_helper("test8", self.dict))

-        self.assertFalse(check_present("test8", self.dict, check_present=False))
+        self.assertFalse(_metadata_present_helper("test8", self.dict, check_present=False))

     def test_process_present_dict(self):
         """Test the process_present_dict function"""

@@ -71,7 +71,7 @@ def test_process_present(self):
         """Test that process runs properly on a list"""
         data_list = [self.dict, self.dict]

-        processed_list = process_present_list(data_list, self.expected_fields)
+        processed_list = process_record_list(data_list, self.expected_fields)
        out_list = [self.expected_out, self.expected_out]

         self.assertEqual(processed_list, out_list)
From f6e5dea45be791d4695443c6dfd4d4d0b2e636d7 Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Sun, 6 Oct 2024 12:06:07 -0700
Subject: [PATCH 07/13] refactor: upgrade to allow for valid/excluded file
 types, if they exist

---
 src/aind_metadata_viz/app.py              | 10 +--
 src/aind_metadata_viz/docdb.py            | 87 ++++++-----------------
 src/aind_metadata_viz/metadata_helpers.py | 10 +--
 tests/test_docdb.py                       | 10 +--
 4 files changed, 35 insertions(+), 82 deletions(-)

diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py
index 2066d33..0cd1974 100644
--- a/src/aind_metadata_viz/app.py
+++ b/src/aind_metadata_viz/app.py
@@ -56,22 +56,24 @@ def file_present_chart():

     sum_longform_df = db.get_file_presence()

+    local_states = sum_longform_df["state"].unique()
+    local_color_list = [colors[state] for state in local_states]
+
     chart = (
         alt.Chart(sum_longform_df)
         .mark_bar()
         .encode(
-            x=alt.X("index:N", title=None, axis=alt.Axis(grid=False)),
+            x=alt.X("file:N", title=None, axis=alt.Axis(grid=False)),
             y=alt.Y(
                 "sum:Q",
                 title="Metadata assets (n)",
                 axis=alt.Axis(grid=False),
             ),
             color=alt.Color(
-                "status:N",
+                "state:N",
                 scale=alt.Scale(
-                    domain=["valid", "present", "missing", "excluded"],
-                    range=color_list,
+                    domain=local_states,
+                    range=local_color_list,
                 ),
                 legend=None,
             ),
         )
diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py
index ce98a2e..12cfdb7 100644
--- a/src/aind_metadata_viz/docdb.py
+++ b/src/aind_metadata_viz/docdb.py
@@ -11,7 +11,7 @@ from aind_metadata_viz.metadata_helpers import (
     process_record_list,
     _metadata_present_helper,
 )
-from aind_metadata_viz.utils import compute_count_true
+from aind_metadata_viz.utils import compute_count_true, MetaState

 API_GATEWAY_HOST = "api.allenneuraldynamics.org"
 DATABASE = "metadata_index"
@@ -108,10 +108,16 @@ def get_file_presence(self, files: list = EXPECTED_FILES):
         files : list[str], optional
             List of expected metadata filenames, by default EXPECTED_FILES
         """
+        # Get the short form df, each row is a record and each column is its file:MetaState
         processed = process_record_list(self.data_filtered, files)
         df = pd.DataFrame(processed, columns=files)

-        return compute_count_true(df)
+        # Melt to long form
+        df_melted = df.melt(var_name='file', value_name='state')
+        # Get sum
+        df_summary = df_melted.groupby(["file", "state"]).size().reset_index(name="sum")
+
+        return df_summary

     def get_field_presence(self):
         """Get the presence of fields at the top-level"""
@@ -138,7 +144,7 @@ def set_file(self, file: str = EXPECTED_FILES[0]):
         self.mid_list = []
         for data in self.data_filtered:
-            if _metadata_present_helper(self.file, data):
+            if _metadata_present_helper(data[self.file]):
                 self.mid_list.append(data[self.file])

     def get_file_field_presence(self):
@@ -157,13 +163,17 @@ def get_file_field_presence(self):
         _type_
             _description_
         """
-        expected_fields = (
-            self.mid_list[0].keys() if len(self.mid_list) > 0 else []
-        )
-        processed = process_record_list(self.mid_list, expected_fields)
-        df = pd.DataFrame(processed, columns=expected_fields)
+        return pd.DataFrame()
+        # expected_fields = (
+        #     self.mid_list[0].keys() if len(self.mid_list) > 0 else []
+        # )
+        # processed = process_record_list(self.mid_list, expected_fields)

-        return compute_count_true(df)
+        # print(processed)
+        # df = pd.DataFrame()
+        # df = pd.DataFrame(processed, columns=expected_fields)
+
+        # return compute_count_true(df)

     def get_csv(self, file: str, field: str = " ", missing: str = "Missing"):
         """Build a CSV file of export data based on the selected file and field
@@ -185,7 +195,7 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"):
         df_data = []
         for data in self.data_filtered:
             if not data[file] is None:
-                if field == " " or check_present(
+                if field == " " or _metadata_present_helper(
                     field, data[file], check_present=get_present
                 ):
                     # This file/field combo is present/missing, get all the id
@@ -208,64 +218,13 @@

 @pn.cache(ttl=CACHE_RESET_SEC)
 def get_all(test_mode=False):
     filter = {}
-    limit = 0 if not test_mode else 10
-    paginate_batch_size = 1000
+    # limit = 0 if not test_mode else 10
+    limit = 10
+    paginate_batch_size = 500
     response = docdb_api_client.retrieve_docdb_records(
         filter_query=filter,
         limit=limit,
         paginate_batch_size=paginate_batch_size,
     )

-    return response
-
-
-@pn.cache(ttl=CACHE_RESET_SEC)
-def get_subjects():
-    filter = {
-        "subject.subject_id": {"$exists": True},
-        "session": {"$ne": None},
-    }
-    limit = 1000
-    paginate_batch_size = 100
-    response = docdb_api_client.retrieve_docdb_records(
-        filter_query=filter,
-        projection={"_id": 0, "subject.subject_id": 1},
-        limit=limit,
-        paginate_batch_size=paginate_batch_size,
-    )
-
-    # turn this into a list instead of a nested list
-    subjects = []
-    for data in response:
-        subjects.append(np.int32(data["subject"]["subject_id"]))
-
-    return np.unique(subjects).tolist()
-
-
-@pn.cache(ttl=CACHE_RESET_SEC)
-def get_sessions(subject_id):
-    """Get the raw JSON sessions list for a subject
-
-    Parameters
-    ----------
-    subject_id : string or int
-        _description_
-
-    Returns
-    -------
-    _type_
-        _description_
-    """
-    filter = {
-        "subject.subject_id": str(subject_id),
-        "session": {"$ne": "null"},
-    }
-    response = docdb_api_client.retrieve_docdb_records(
-        filter_query=filter, projection={"_id": 0, "session": 1}
-    )
-
-    sessions = []
-    for data in response:
-        sessions.append(data["session"])
-
-    return sessions
+    return response
\ No newline at end of file
diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py
index 4b09b3f..f717f5f 100644
--- a/src/aind_metadata_viz/metadata_helpers.py
+++ b/src/aind_metadata_viz/metadata_helpers.py
@@ -56,7 +56,7 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_
     """
     # if excluded, just return that
     if field in excluded_fields:
-        return MetaState.EXCLUDED
+        return MetaState.EXCLUDED.value

     # if you're looking at a parent file's data then you need a different mapping
     if parent:
@@ -69,17 +69,17 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_
     if field in object:
         value = object[field]
     else:
-        return MetaState.MISSING
+        return MetaState.MISSING.value

     # attempt validation
     if _metadata_valid_helper(field, value, class_map):
-        return MetaState.VALID
+        return MetaState.VALID.value

     # check missing
     if _metadata_present_helper(value):
-        return MetaState.PRESENT
+        return MetaState.PRESENT.value

-    return MetaState.MISSING
+    return MetaState.MISSING.value
diff --git a/tests/test_docdb.py b/tests/test_docdb.py
index 7ea9fb5..368c79f 100644
--- a/tests/test_docdb.py
+++ b/tests/test_docdb.py
@@ -2,7 +2,7 @@

 import unittest

-from aind_metadata_viz.docdb import get_subjects, get_sessions, get_all
+from aind_metadata_viz.docdb import get_subjects


 class DocDBTest(unittest.TestCase):

     # def setUp(self):

-    def test_get_subjects(self):
-        """Get the subjects list, check that some known subjects are in it"""
-        self.assertIn(596930, get_subjects())
-
-    def test_get_sessions(self):
-        """Get data from the test subject's sessions"""
-        self.assertEqual(1, len(get_sessions(596930)))
-
     def test_get_all(self):
         """Test all sessions"""
         data = get_all(test_mode=True)
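Patch 07's get_file_presence reshapes the per-record state table into a long-form (file, state, sum) frame for Altair. A small self-contained illustration of that melt/groupby reshape on toy data (not DocDB output):

    import pandas as pd

    # Each row is a record; each column holds that record's state for one file.
    df = pd.DataFrame(
        {
            "subject": ["valid", "present", "missing"],
            "rig": ["missing", "missing", "excluded"],
        }
    )

    df_melted = df.melt(var_name="file", value_name="state")
    df_summary = df_melted.groupby(["file", "state"]).size().reset_index(name="sum")
    #    file     state  sum
    # 0   rig  excluded    1
    # 1   rig   missing    2
    # 2  subject  missing  1
    # ...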
From 742ef6e5d35e48d3c81b5d9969e896bdba4fbde4 Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Sun, 6 Oct 2024 12:58:32 -0700
Subject: [PATCH 08/13] chore: longer line lengths

---
 .flake8 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.flake8 b/.flake8
index 6d5ce4f..07bd2e2 100644
--- a/.flake8
+++ b/.flake8
@@ -4,3 +4,4 @@ exclude =
     __pycache__,
     build
 max-complexity = 10
+max-line-length = 120
\ No newline at end of file

From 208046ca0d5ce16b85df9a548e66d66728028707 Mon Sep 17 00:00:00 2001
From: Dan Birman
Date: Sun, 6 Oct 2024 13:06:25 -0700
Subject: [PATCH 09/13] feat: working on adding all files back, with excluded
 files in the excluded section

---
 src/aind_metadata_viz/app.py              | 34 +++++++++++++---------
 src/aind_metadata_viz/docdb.py            | 30 +++++++++++--------
 src/aind_metadata_viz/metadata_helpers.py |  4 +--
 src/aind_metadata_viz/utils.py            |  2 +-
 4 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py
index 0cd1974..3688b34 100644
--- a/src/aind_metadata_viz/app.py
+++ b/src/aind_metadata_viz/app.py
@@ -46,12 +46,17 @@
     name="Value state", options=["Missing", "Present"]
 )

-derived_switch = pn.widgets.Switch.from_param(db.param.derived_filter)
+derived_selector = pn.widgets.Select(
+    name="Filter for:",
+    options=["All assets", "Raw", "Derived"],
+)
+derived_selector.value = "All assets"

 pn.state.location.sync(modality_selector, {"value": "modality"})
 pn.state.location.sync(top_selector, {"value": "file"})
 pn.state.location.sync(mid_selector, {"value": "field"})
 pn.state.location.sync(missing_selector, {"value": "missing"})
+pn.state.location.sync(derived_selector, {"value": "derived"})


 def file_present_chart():
@@ -65,7 +70,7 @@ def file_present_chart():
         .encode(
             x=alt.X("file:N", title=None, axis=alt.Axis(grid=False)),
             y=alt.Y(
-                "sum:Q",
+                "count:Q",
                 title="Metadata assets (n)",
                 axis=alt.Axis(grid=False),
             ),
@@ -93,9 +98,9 @@ def notfile_present_chart():
         alt.Chart(sum_longform_df)
         .mark_bar()
         .encode(
-            x=alt.X("index:N", title=None, axis=alt.Axis(grid=False)),
+            x=alt.X("column:N", title=None, axis=alt.Axis(grid=False)),
             y=alt.Y(
-                "sum:Q",
+                "count:Q",
                 title=None,
                 axis=alt.Axis(grid=False),
             ),
@@ -168,9 +173,10 @@ def build_csv_jscode(event):
 download_button.on_click(build_csv_jscode)


-def build_mid(selected_file, **args):
+def build_mid(selected_file, derived_filter, **args):
     """ """
     db.set_file(selected_file)
+    db.derived_filter = derived_filter

     sum_longform_df = db.get_file_field_presence()

@@ -184,9 +190,9 @@ def build_mid(selected_file, derived_filter, **args):
         alt.Chart(sum_longform_df)
         .mark_bar()
         .encode(
-            x=alt.X("index:N", title=None, axis=alt.Axis(grid=False)),
+            x=alt.X("column:N", title=None, axis=alt.Axis(grid=False)),
             y=alt.Y(
-                "sum:Q", title="Metadata assets (n)", axis=alt.Axis(grid=False)
+                "count:Q", title="Metadata assets (n)", axis=alt.Axis(grid=False)
             ),
             color=alt.Color(
                 "category:N",
@@ -202,7 +208,10 @@ def build_mid(selected_file, derived_filter, **args):

     # Also update the selected list
     if len(db.mid_list) > 0:
+        if len(db.mid_list) > 0 and db.mid_list[0]:
             option_list = [" "] + list(db.mid_list[0].keys())
+        else:
+            option_list = [" "]
     else:
         option_list = []

@@ -233,7 +242,7 @@ def hd_style(text):
     header_pane,
     modality_selector,
     top_selector,
-    pn.Row("Filter for derived assets:", derived_switch),
+    derived_selector,
     download_pane,
     mid_selector,
     missing_selector,
@@ -244,8 +253,7 @@

 def build_row(selected_modality, derived_filter):
     db.modality_filter = selected_modality
-
-    print(derived_filter)
+    db.derived_filter = derived_filter

     return pn.Row(file_present_chart, notfile_present_chart)


 top_row = pn.bind(
     build_row,
     selected_modality=modality_selector,
-    derived_filter=derived_switch,
+    derived_filter=derived_selector,
 )

 mid_plot = pn.bind(
     build_mid,
     selected_file=top_selector,
     selected_modality=modality_selector,
-    derived_filter=derived_switch,
+    derived_filter=derived_selector,
 )
diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py
index 6923fd3..41bad41 100644
--- a/src/aind_metadata_viz/docdb.py
+++ b/src/aind_metadata_viz/docdb.py
@@ -11,7 +11,7 @@ from aind_metadata_viz.metadata_helpers import (
     process_record_list,
     _metadata_present_helper,
 )
-from aind_metadata_viz.utils import compute_count_true, MetaState
+from aind_metadata_viz.utils import MetaState

 API_GATEWAY_HOST = "api.allenneuraldynamics.org"
 DATABASE = "metadata_index"
@@ -124,7 +124,7 @@ def get_expected_files(self) -> tuple[list[str], list[str]]:
         return (expected_files_by_modality, excluded_files_by_modality)

     def get_file_presence(
-        self, files: list[str], excluded_files: list[str] = []
+        self
     ):
         """Get the presence of a list of files

         Parameters
         ----------
         files : list[str], optional
             List of expected metadata filenames, by default EXPECTED_FILES
         """
+        (expected_files, excluded_files) = self.get_expected_files()
+        files = expected_files + excluded_files
+
         # Get the short form df, each row is a record and each column is its file:MetaState
-        processed = process_record_list(self.data_filtered, files)
+        processed = process_record_list(self.data_filtered, expected_files)
         df = pd.DataFrame(processed, columns=files)

         # Melt to long form
         df_melted = df.melt(var_name='file', value_name='state')
         # Get sum
         df_summary = df_melted.groupby(["file", "state"]).size().reset_index(name="sum")

         return df_summary

     def get_field_presence(self):
         """Get the presence of fields at the top-level"""
-        if len(self.data_filtered) > 0:
-            fields = [
-                item
-                for item in list(self.data_filtered[0].keys())
-                if item not in EXPECTED_FILES
-            ]
-        else:
-            fields = []
-
-        return self.get_file_presence(files=fields)
+        return pd.DataFrame()
+        # if len(self.data_filtered) > 0:
+        #     fields = [
+        #         item
+        #         for item in list(self.data_filtered[0].keys())
+        #         if item not in EXPECTED_FILES
+        #     ]
+        # else:
+        #     fields = []

+        # return self.get_file_presence(files=fields)

     def set_file(self, file: str):
         """Set the active file
diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py
index 4931967..2462165 100644
--- a/src/aind_metadata_viz/metadata_helpers.py
+++ b/src/aind_metadata_viz/metadata_helpers.py
@@ -86,7 +86,7 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_
     return MetaState.MISSING.value


-def process_record_list(record_list: list, expected_fields: list):
+def process_record_list(record_list: list, expected_fields: list, excluded_fields: list = []):
     """Process a list of Metadata JSON records from DocDB

     For each record, check each of the expected fields and see if they are valid/present/missing/excluded

     Parameters
     ----------
     record_list : list[dict]
         List of metadata json records as dicts
     expected_fields : list[str]
         List of key fields to check

     Returns
     -------
     list[{field: MetaState}]
     """
-    return [{field: check_metadata_state(field, data) for field in expected_fields} for data in record_list]
+    return [{field: check_metadata_state(field, data, excluded_fields=excluded_fields) for field in expected_fields} for data in record_list]
diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py
index aed8ee3..f0a762b 100644
--- a/src/aind_metadata_viz/utils.py
+++ b/src/aind_metadata_viz/utils.py
@@ -5,4 +5,4 @@ class MetaState(str, Enum):
     VALID = "valid"
     PRESENT = "present"
     MISSING = "missing"
-    EXCLUDED = "excluded"
\ No newline at end of file
+    EXCLUDED = "excluded"
list): ------- list[{field: MetaState}] """ - return [{field: check_metadata_state(field, data) for field in expected_fields} for data in record_list] + return [{field: check_metadata_state(field, data, excluded_fields) for field in expected_fields} for data in record_list] diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py index aed8ee3..f0a762b 100644 --- a/src/aind_metadata_viz/utils.py +++ b/src/aind_metadata_viz/utils.py @@ -5,4 +5,4 @@ class MetaState(str, Enum): VALID = "valid" PRESENT = "present" MISSING = "missing" - EXCLUDED = "excluded" \ No newline at end of file + EXCLUDED = "excluded" From 173f00d6273e3c8d6ed7a20be559e16095d3554b Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Sun, 6 Oct 2024 22:16:24 -0700 Subject: [PATCH 10/13] fix: getting validation functional - Needed to set schema_version to the current schema's version by pulling that information from the schema - Typo in df --- src/aind_metadata_viz/app.py | 5 +++-- src/aind_metadata_viz/docdb.py | 23 +++++++++++++++++---- src/aind_metadata_viz/metadata_class_map.py | 12 +++++++++++ src/aind_metadata_viz/metadata_helpers.py | 15 +++++++++----- 4 files changed, 44 insertions(+), 11 deletions(-) diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index 3688b34..eae4308 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -19,7 +19,7 @@ "valid": "#9FF2F5", "present": "#F49FD7", "missing": "#F49FD7", - "excluded": "#9FF2F5", + "excluded": "black", }, } @@ -61,6 +61,7 @@ def file_present_chart(): sum_longform_df = db.get_file_presence() + # print(sum_longform_df) local_states = sum_longform_df["state"].unique() local_color_list = [colors[state] for state in local_states] @@ -70,7 +71,7 @@ def file_present_chart(): .encode( x=alt.X("file:N", title=None, axis=alt.Axis(grid=False)), y=alt.Y( - "count:Q", + "sum:Q", title="Metadata assets (n)", axis=alt.Axis(grid=False), ), diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py index 41bad41..10f6076 100644 --- a/src/aind_metadata_viz/docdb.py +++ b/src/aind_metadata_viz/docdb.py @@ -61,12 +61,14 @@ def __init__( ): """Initialize""" # get data - self._data = get_all(test_mode=test_mode) + self._data = _get_all(test_mode=test_mode) # setup (expected_files, _) = self.get_expected_files() self.set_file(expected_files[0]) + # run validation + @property def data_filtered(self): mod_filter = not (self.modality_filter == "all") @@ -255,11 +257,24 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): return sio.getvalue() +# @pn.cache(ttl=CACHE_RESET_DAY) +# def _get_all_df(test_mode=False): +# """Get all and convert to data frame format + +# Parameters +# ---------- +# test_mode : bool, optional +# _description_, by default False +# """ +# record_list = _get_all(test_mode=test_mode) + + + + @pn.cache(ttl=CACHE_RESET_DAY) -def get_all(test_mode=False): +def _get_all(test_mode=False): filter = {} - # limit = 0 if not test_mode else 10 - limit = 10 + limit = 250 if not test_mode else 10 paginate_batch_size = 500 response = docdb_api_client.retrieve_docdb_records( filter_query=filter, diff --git a/src/aind_metadata_viz/metadata_class_map.py b/src/aind_metadata_viz/metadata_class_map.py index 57436ea..00fad00 100644 --- a/src/aind_metadata_viz/metadata_class_map.py +++ b/src/aind_metadata_viz/metadata_class_map.py @@ -118,6 +118,18 @@ ) from aind_data_schema_models.species import Species +first_layer_versions = { + "data_description": 
DataDescription.model_construct().schema_version, + "acquisition": Acquisition.model_construct().schema_version, + "procedures": Procedures.model_construct().schema_version, + "subject": Subject.model_construct().schema_version, + "instrument": Instrument.model_construct().schema_version, + "processing": Processing.model_construct().schema_version, + "rig": Rig.model_construct().schema_version, + "session": Session.model_construct().schema_version, + "quality_control": QualityControl.model_construct().schema_version, +} + first_layer_field_mapping = { "data_description": DataDescription, "acquisition": Acquisition, diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py index 2462165..4d1300a 100644 --- a/src/aind_metadata_viz/metadata_helpers.py +++ b/src/aind_metadata_viz/metadata_helpers.py @@ -1,6 +1,7 @@ -from aind_metadata_viz.metadata_class_map import first_layer_field_mapping, second_layer_field_mappings +from aind_metadata_viz.metadata_class_map import first_layer_field_mapping, second_layer_field_mappings, first_layer_versions from aind_metadata_viz.utils import MetaState from pydantic import ValidationError +from typing import Literal def _metadata_present_helper(json: str, check_present: bool = True): @@ -35,10 +36,14 @@ def _metadata_valid_helper(field: str, json: str, mapping: dict, ): json : str json string generated from a AindCoreModel dump """ + if "schema_version" in json: + # force the schema version to match the current one + json["schema_version"] = first_layer_versions[field] + if field in mapping: try: - return mapping[field].model_validate_json(json) is not None - except ValidationError as e: + return mapping[field](**json) is not None + except Exception as e: print(e) return False @@ -69,8 +74,8 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_ else: class_map = first_layer_field_mapping - # First check that the key exists at all - if field in object: + # First check that the key exists at all and is not None + if field in object and object[field]: value = object[field] else: return MetaState.MISSING.value From 8fce863003d6231fba6657955948c6fe65a92b45 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Mon, 7 Oct 2024 09:28:39 -0700 Subject: [PATCH 11/13] chore: duplicate dependency --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 79d61df..9a0db6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ dependencies = [ 'aind-data-schema', 'aind-data-access-api[docdb]', 'aind-data-schema-models', - 'aind-data-schema', 'flask', ] From 162a6f49848d3b4a4709a55bd43e57acba48fd01 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Mon, 7 Oct 2024 14:11:19 -0700 Subject: [PATCH 12/13] feat: complete and functional --- src/aind_metadata_viz/app.py | 42 ++-- src/aind_metadata_viz/docdb.py | 221 +++++++++++++--------- src/aind_metadata_viz/metadata_helpers.py | 67 +++++-- src/aind_metadata_viz/utils.py | 79 ++++++++ tests/test_docdb_database.py | 6 +- tests/test_metadata_helpers.py | 4 +- 6 files changed, 280 insertions(+), 139 deletions(-) diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index eae4308..d1ba74d 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -1,9 +1,11 @@ import panel as pn import altair as alt -import pandas as pd from aind_metadata_viz import docdb +from aind_metadata_viz.docdb import _get_all from aind_data_schema import __version__ as ads_version 
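+# NOTE: pn.cache memoizes _get_all per argument, so this import-time call only
+# warms the test_mode=True cache entry; the full DocDB query still runs the
+# first time Database() loads with test_mode=False.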
+_get_all(test_mode=True) + pn.extension(design="material") pn.extension("vega") alt.themes.enable("ggplot2") @@ -12,14 +14,16 @@ "default": { "valid": "green", "present": "grey", + "optional": "grey", "missing": "red", - "excluded": "black", + "excluded": "white", }, "lemonade": { "valid": "#9FF2F5", - "present": "#F49FD7", + "optional": "#F49FD7", + "optional": "grey", "missing": "#F49FD7", - "excluded": "black", + "excluded": "white", }, } @@ -37,10 +41,10 @@ ) top_selector = pn.widgets.Select( - name="Select metadata file:", options=docdb.EXPECTED_FILES + name="Select metadata file:", options=docdb.ALL_FILES ) -mid_selector = pn.widgets.Select(name="Sub-select for field:", options=[]) +field_selector = pn.widgets.Select(name="Sub-select for field:", options=[]) missing_selector = pn.widgets.Select( name="Value state", options=["Missing", "Present"] @@ -54,7 +58,7 @@ pn.state.location.sync(modality_selector, {"value": "modality"}) pn.state.location.sync(top_selector, {"value": "file"}) -pn.state.location.sync(mid_selector, {"value": "field"}) +pn.state.location.sync(field_selector, {"value": "field"}) pn.state.location.sync(missing_selector, {"value": "missing"}) pn.state.location.sync(derived_selector, {"value": "derived"}) @@ -130,7 +134,7 @@ def build_csv_jscode(event): Create the javascript code and append it to the page. """ csv = db.get_csv( - top_selector.value, mid_selector.value, missing_selector.value + top_selector.value, field_selector.value, missing_selector.value ) csv_escaped = csv.replace("\n", "\\n").replace( '"', '\\"' @@ -139,9 +143,9 @@ def build_csv_jscode(event): get_missing = missing_selector.value == "Missing" missing_text = "missing" if get_missing else "present" - if not mid_selector.value == " ": + if not field_selector.value == " ": filename = ( - f"{top_selector.value}-{mid_selector.value}-{missing_text}.csv" + f"{top_selector.value}-{field_selector.value}-{missing_text}.csv" ) else: filename = f"{top_selector.value}-{missing_text}.csv" @@ -187,7 +191,9 @@ def build_mid(selected_file, derived_filter, **args): .encode( x=alt.X("column:N", title=None, axis=alt.Axis(grid=False)), y=alt.Y( - "count:Q", title="Metadata assets (n)", axis=alt.Axis(grid=False) + "count:Q", + title="Metadata assets (n)", + axis=alt.Axis(grid=False), ), color=alt.Color( "category:N", @@ -202,15 +208,9 @@ def build_mid(selected_file, derived_filter, **args): ) # Also update the selected list - if len(db.mid_list) > 0: - if len(db.mid_list) > 0 and db.mid_list[0]: - option_list = [" "] + list(db.mid_list[0].keys()) - else: - option_list = [" "] - else: - option_list = [] + option_list = [" "] + db.field_list - mid_selector.options = option_list + field_selector.options = option_list return pn.pane.Vega(chart) @@ -226,7 +226,7 @@ def hd_style(text): "This app steps through all of the metadata stored in DocDB and determines whether every record's fields " "(and subfields) are " f"{hd_style('valid')} for aind-data-schema v{ads_version}, " - f"{hd_style('present')} but invalid, " + f"{hd_style('present')} but invalid or {hd_style('optional')}, " f"{hd_style('missing')}, or " f"{hd_style('excluded')} for the record's modality." 
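+    # hd_style wraps each state name in styled text, so the header doubles as
+    # a color key for the charts below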
) @@ -245,7 +245,7 @@ def hd_style(text): top_selector, derived_selector, download_pane, - mid_selector, + field_selector, missing_selector, download_button, width=400, diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py index 10f6076..59406af 100644 --- a/src/aind_metadata_viz/docdb.py +++ b/src/aind_metadata_viz/docdb.py @@ -1,17 +1,22 @@ from aind_data_access_api.document_db import MetadataDbClient -import numpy as np import panel as pn import pandas as pd import param from io import StringIO -from aind_data_schema_models.modalities import Modality, ExpectedFiles, FileRequirement +from aind_data_schema_models.modalities import ( + Modality, + ExpectedFiles, + FileRequirement, +) from aind_metadata_viz.metadata_helpers import ( process_record_list, - _metadata_present_helper, ) -from aind_metadata_viz.utils import MetaState +from aind_metadata_viz.metadata_class_map import ( + first_layer_field_mapping, + second_layer_field_mappings, +) API_GATEWAY_HOST = "api.allenneuraldynamics.org" DATABASE = "metadata_index" @@ -23,7 +28,7 @@ collection=COLLECTION, ) -EXPECTED_FILES = sorted( +ALL_FILES = sorted( [ "data_description", "acquisition", @@ -36,6 +41,8 @@ "quality_control", ] ) +# These are the fields that need to be dropped from that data frame when building charts +EXTRA_FIELDS = ["modalities", "derived", "name", "_id", "location", "created"] # reset cache every 24 hours CACHE_RESET_DAY = 24 * 60 * 60 @@ -61,61 +68,43 @@ def __init__( ): """Initialize""" # get data - self._data = _get_all(test_mode=test_mode) + self._data = _get_file_presence() # setup (expected_files, _) = self.get_expected_files() self.set_file(expected_files[0]) - # run validation - @property def data_filtered(self): mod_filter = not (self.modality_filter == "all") - # Check if the data needs to be filtered by either modality or derived state - if mod_filter or not (self.derived_filter == "All assets"): - # filter data - filtered_list = [] - - for data in self._data: - include: bool = True - - if mod_filter and not ( - data["data_description"] - and "modality" in data["data_description"] - and isinstance(data["data_description"]["modality"], list) - and any( - mod["abbreviation"] == self.modality_filter - for mod in data["data_description"]["modality"] - ) - ): - include = False - - if ( - self.derived_filter == "Raw" - and data["name"].count("_") > 3 - ) or ( - self.derived_filter == "Derived" - and data["name"].count("_") <= 3 - ): - include = False - - if include: - filtered_list.append(data) - return filtered_list - else: - return self._data + filtered_df = self._data.copy() + + # Filter by modality + if mod_filter: + filtered_df = filtered_df[ + filtered_df["modalities"].str.contains(self.modality_filter) + ] + + if not (self.derived_filter == "All assets"): + if self.derived_filter == "Raw": + filtered_df = filtered_df[filtered_df["derived"] == False] + elif self.derived_filter == "Derived": + filtered_df = filtered_df[filtered_df["derived"] == True] + + return filtered_df def get_expected_files(self) -> tuple[list[str], list[str]]: if self.modality_filter == "all": - return (EXPECTED_FILES, []) + return (ALL_FILES, []) - expected_files_by_modality = EXPECTED_FILES.copy() + expected_files_by_modality = ALL_FILES.copy() excluded_files_by_modality = [] # get the ExpectedFiles object for this modality - expected_files = getattr(ExpectedFiles, str(self.modality_filter).upper()) + expected_files = getattr( + ExpectedFiles, str(self.modality_filter).upper() + ) # loop through the actual files 
and remove any that are not expected for file in expected_files_by_modality: @@ -124,10 +113,8 @@ def get_expected_files(self) -> tuple[list[str], list[str]]: excluded_files_by_modality.append(file) return (expected_files_by_modality, excluded_files_by_modality) - - def get_file_presence( - self - ): + + def get_file_presence(self): """Get the presence of a list of files Parameters @@ -135,18 +122,17 @@ def get_file_presence( files : list[str], optional List of expected metadata filenames, by default EXPECTED_FILES """ - (expected_files, excluded_files) = self.get_expected_files() - files = expected_files + excluded_files - - # Get the short form df, each row is a record and each column is it's file:MetaState - processed = process_record_list(self.data_filtered, expected_files) - df = pd.DataFrame(processed, columns=files) - # Melt to long form - df_melted = df.melt(var_name='file', value_name='state') + df = self._data.copy() + df.drop(EXTRA_FIELDS, axis=1, inplace=True) + + df_melted = df.melt(var_name="file", value_name="state") # Get sum - df_summary = df_melted.groupby(["file", "state"]).size().reset_index(name="sum") + df_summary = ( + df_melted.groupby(["file", "state"]).size().reset_index(name="sum") + ) + print(df_summary) return df_summary def get_field_presence(self): @@ -173,10 +159,7 @@ def set_file(self, file: str): """ self.file = file - self.mid_list = [] - for data in self.data_filtered: - if _metadata_present_helper(data[self.file]): - self.mid_list.append(data[self.file]) + self.field_list = list(second_layer_field_mappings[file].keys()) def get_file_field_presence(self): """Get the presence of fields in a specific file @@ -193,9 +176,9 @@ def get_file_field_presence(self): """ return pd.DataFrame() # expected_fields = ( - # self.mid_list[0].keys() if len(self.mid_list) > 0 else [] + # self.field_list[0].keys() if len(self.field_list) > 0 else [] # ) - # processed = process_record_list(self.mid_list, expected_fields) + # processed = process_record_list(self.field_list, expected_fields) # print(processed) # df = pd.DataFrame() @@ -223,42 +206,90 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): get_present = missing == "Present" - df_data = [] - for data in self.data_filtered: - if not data[file] is None: - if field == " " or _metadata_present_helper( - field, data[file], check_present=get_present - ): - # This file/field combo is present/missing, get all the id - # information - id_data = {} - for id_field in id_fields: - if id_field in data: - id_data[id_field] = data[id_field] - else: - id_data[id_field] = None - - # Get subject if available - if ( - "subject" in data - and data["subject"] - and "subject_id" in data["subject"] - ): - id_data["subject_id"] = data["subject"]["subject_id"] - else: - id_data["subject_id"] = "" - - df_data.append(id_data) - - df = pd.DataFrame(df_data) + # df_data = [] + # for data in self.data_filtered: + # if not data[file] is None: + # if field == " " or _metadata_present_helper( + # field, data[file], check_present=get_present + # ): + # # This file/field combo is present/missing, get all the id + # # information + # id_data = {} + # for id_field in id_fields: + # if id_field in data: + # id_data[id_field] = data[id_field] + # else: + # id_data[id_field] = None + + # # Get subject if available + # if ( + # "subject" in data + # and data["subject"] + # and "subject_id" in data["subject"] + # ): + # id_data["subject_id"] = data["subject"]["subject_id"] + # else: + # id_data["subject_id"] = "" + + # 
df_data.append(id_data) + + df = pd.DataFrame() sio = StringIO() df.to_csv(sio, index=False) return sio.getvalue() +@pn.cache(ttl=CACHE_RESET_DAY) +def _get_file_presence() -> pd.DataFrame: + """Get all and convert to data frame format + + Parameters + ---------- + test_mode : bool, optional + _description_, by default False + """ + record_list = _get_all() + files = list(first_layer_field_mapping.keys()) + + processed = process_record_list(record_list, files) + + # Now add some information about the records, i.e. modality, derived state, etc. + for i, record in enumerate(record_list): + if ( + "data_description" in record + and record["data_description"] + and "modality" in record["data_description"] + ): + if isinstance(record["data_description"]["modality"], list): + modalities = [ + mod["abbreviation"] + for mod in record["data_description"]["modality"] + ] + else: + modalities = [] + derived = True if record["name"].count("_") <= 3 else False + + info_data = { + "modalities": ",".join(modalities), + "derived": derived, + "name": record["name"], + "_id": record["_id"], + "location": record["location"], + "created": record["created"], + } + + processed[i] = {**processed[i], **info_data} + + return pd.DataFrame( + processed, + columns=files + + ["modalities", "derived", "name", "_id", "location", "created"], + ) + + # @pn.cache(ttl=CACHE_RESET_DAY) -# def _get_all_df(test_mode=False): +# def _get_field_presence(file: str): # """Get all and convert to data frame format # Parameters @@ -266,15 +297,17 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): # test_mode : bool, optional # _description_, by default False # """ -# record_list = _get_all(test_mode=test_mode) +# record_list = _get_all() +# # filter by file +# # get field presence @pn.cache(ttl=CACHE_RESET_DAY) def _get_all(test_mode=False): filter = {} - limit = 250 if not test_mode else 10 + limit = 0 if not test_mode else 10 paginate_batch_size = 500 response = docdb_api_client.retrieve_docdb_records( filter_query=filter, diff --git a/src/aind_metadata_viz/metadata_helpers.py b/src/aind_metadata_viz/metadata_helpers.py index 4d1300a..bbe1bc3 100644 --- a/src/aind_metadata_viz/metadata_helpers.py +++ b/src/aind_metadata_viz/metadata_helpers.py @@ -1,5 +1,10 @@ -from aind_metadata_viz.metadata_class_map import first_layer_field_mapping, second_layer_field_mappings, first_layer_versions -from aind_metadata_viz.utils import MetaState +from aind_metadata_viz.metadata_class_map import ( + first_layer_field_mapping, + second_layer_field_mappings, + first_layer_versions, +) +from aind_metadata_viz.utils import MetaState, expected_files_from_modalities +from aind_data_schema_models.modalities import FileRequirement from pydantic import ValidationError from typing import Literal @@ -15,12 +20,7 @@ def _metadata_present_helper(json: str, check_present: bool = True): object : dict Dictionary """ - present = ( - json is not None - and json != "" - and json != [] - and json != {} - ) + present = json is not None and json != "" and json != [] and json != {} if check_present: return "present" if present else "absent" @@ -28,7 +28,11 @@ def _metadata_present_helper(json: str, check_present: bool = True): return "absent" if present else "present" -def _metadata_valid_helper(field: str, json: str, mapping: dict, ): +def _metadata_valid_helper( + field: str, + json: str, + mapping: dict, +): """Return true if the json data is a valid object of the particular field class Parameters @@ -44,11 +48,11 @@ def 
_metadata_valid_helper(field: str, json: str, mapping: dict, ): try: return mapping[field](**json) is not None except Exception as e: - print(e) + # print(e) return False -def check_metadata_state(field: str, object: dict, parent: str = None, excluded_fields: list = []) -> MetaState: +def check_metadata_state(field: str, object: dict, parent: str = None) -> str: """Get the MetaState for a specific key in a dictinoary Parameters @@ -64,12 +68,29 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_ _description_ """ # if excluded, just return that - if field in excluded_fields: - return MetaState.EXCLUDED.value + # get the excluded fields from the class map + if ( + "data_description" in object + and object["data_description"] + and "modality" in object["data_description"] + ): + modality_map = expected_files_from_modalities( + modalities=object["data_description"]["modality"] + ) + + if field in modality_map: + file_req = modality_map[field] + if modality_map[field] == FileRequirement.EXCLUDED: + return MetaState.EXCLUDED.value + else: + print( + f"Warning: field {field} had incorrect modalities, so no file requirement is defined" + ) + file_req = FileRequirement.REQUIRED # if you're looking at a parent file's data then you need a different mapping if parent: - print('not implemented') + print("not implemented") # we're at the top level, just check the first layer mappings else: class_map = first_layer_field_mapping @@ -83,15 +104,18 @@ def check_metadata_state(field: str, object: dict, parent: str = None, excluded_ # attempt validation if _metadata_valid_helper(field, value, class_map): return MetaState.VALID.value - - # check missing + + # check missing if _metadata_present_helper(value): return MetaState.PRESENT.value - - return MetaState.MISSING.value + + if file_req == FileRequirement.OPTIONAL: + return MetaState.OPTIONAL.value + else: + return MetaState.MISSING.value -def process_record_list(record_list: list, expected_fields: list, excluded_fields:list = []): +def process_record_list(record_list: list, expected_fields: list): """Process a list of Metadata JSON records from DocDB For each record, check each of the expected fields and see if they are valid/present/missing/excluded @@ -107,4 +131,7 @@ def process_record_list(record_list: list, expected_fields: list, excluded_field ------- list[{field: MetaState}] """ - return [{field: check_metadata_state(field, data, excluded_fields) for field in expected_fields} for data in record_list] + return [ + {field: check_metadata_state(field, data) for field in expected_fields} + for data in record_list + ] diff --git a/src/aind_metadata_viz/utils.py b/src/aind_metadata_viz/utils.py index f0a762b..9a655d0 100644 --- a/src/aind_metadata_viz/utils.py +++ b/src/aind_metadata_viz/utils.py @@ -1,8 +1,87 @@ from enum import Enum +from aind_data_schema_models.modalities import ExpectedFiles, FileRequirement + +# from aind_data_schema.core.metadata import CORE_FILES # todo: import instead of declaring + +CORE_FILES = [ + "subject", + "data_description", + "procedures", + "session", + "rig", + "processing", + "acquisition", + "instrument", + "quality_control", +] class MetaState(str, Enum): VALID = "valid" PRESENT = "present" + OPTIONAL = "optional" MISSING = "missing" EXCLUDED = "excluded" + + +REMAPS = { + "OPHYS": "POPHYS", + "EPHYS": "ECEPHYS", + "TRAINED_BEHAVIOR": "BEHAVIOR", + "HSFP": "FIB", + "DISPIM": "SPIM", + "MULTIPLANE_OPHYS": "POPHYS", +} + + +def expected_files_from_modalities( + modalities: list[str], +) 
-> dict[str, FileRequirement]: + """Get the expected files for a list of modalities + + Parameters + ---------- + modalities : list[str] + List of modalities to get expected files for + + Returns + ------- + list[str] + List of expected files + """ + requirement_dict = {} + + # I can't believe I have to do this + if not isinstance(modalities, list): + modalities = [modalities] + + for modality in modalities: + if "abbreviation" not in modality: + continue + + for file in CORE_FILES: + # For each field, check if this is a required/excluded file + + # remap + abbreviation = str(modality["abbreviation"]).replace("-", "_").upper() + if abbreviation in REMAPS: + abbreviation = REMAPS[abbreviation] + + file_requirement = getattr( + getattr( + ExpectedFiles, + abbreviation, + ), + file, + ) + + if file not in requirement_dict: + requirement_dict[file] = file_requirement + elif (file_requirement == FileRequirement.REQUIRED) or ( + file_requirement == FileRequirement.OPTIONAL + and requirement_dict[file] == FileRequirement.EXCLUDED + ): + # override, required wins over all else, and optional wins over excluded + requirement_dict[file] = file_requirement + + return requirement_dict diff --git a/tests/test_docdb_database.py b/tests/test_docdb_database.py index b8f26d6..c9538ec 100644 --- a/tests/test_docdb_database.py +++ b/tests/test_docdb_database.py @@ -2,7 +2,7 @@ import unittest -from aind_metadata_viz.docdb import Database, EXPECTED_FILES +from aind_metadata_viz.docdb import Database, ALL_FILES class DocDBDatabaseTest(unittest.TestCase): @@ -45,10 +45,10 @@ def test_filtered_files(self): """Test filtering by file""" mid_len = [0, 10, 0, 10, 10, 0, 10, 2, 10] - for i, file in enumerate(EXPECTED_FILES): + for i, file in enumerate(ALL_FILES): self.db.set_file(file) - self.assertEqual(len(self.db.mid_list), mid_len[i]) + self.assertEqual(len(self.db.field_list), mid_len[i]) def test_derived(self): """Test filtering for derived""" diff --git a/tests/test_metadata_helpers.py b/tests/test_metadata_helpers.py index 33a9c33..0858b3c 100644 --- a/tests/test_metadata_helpers.py +++ b/tests/test_metadata_helpers.py @@ -59,7 +59,9 @@ def test_check_present(self): self.assertTrue(_metadata_present_helper("test7", self.dict)) self.assertTrue(_metadata_present_helper("test8", self.dict)) - self.assertFalse(_metadata_present_helper("test8", self.dict, check_present=False)) + self.assertFalse( + _metadata_present_helper("test8", self.dict, check_present=False) + ) def test_process_present_dict(self): """Test the process_present_dict function""" From df86c00d89db4c8ad46936d4cb8ed4ba9d522ede Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Mon, 7 Oct 2024 14:30:30 -0700 Subject: [PATCH 13/13] feat: fixing download issues for files --- src/aind_metadata_viz/app.py | 45 +++---------------- src/aind_metadata_viz/docdb.py | 79 +++++++++------------------------- 2 files changed, 28 insertions(+), 96 deletions(-) diff --git a/src/aind_metadata_viz/app.py b/src/aind_metadata_viz/app.py index d1ba74d..8a39be2 100644 --- a/src/aind_metadata_viz/app.py +++ b/src/aind_metadata_viz/app.py @@ -20,7 +20,7 @@ }, "lemonade": { "valid": "#9FF2F5", - "optional": "#F49FD7", + "present": "#F49FD7", "optional": "grey", "missing": "#F49FD7", "excluded": "white", @@ -47,8 +47,9 @@ field_selector = pn.widgets.Select(name="Sub-select for field:", options=[]) missing_selector = pn.widgets.Select( - name="Value state", options=["Missing", "Present"] + name="Value state", options=["Not Valid/Present", "Valid/Present"] ) 
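+# Default to the triage view: "Not Valid/Present" exports records whose
+# selected file is missing or merely optional (see Database.get_csv)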
+missing_selector.value = "Not Valid/Present" derived_selector = pn.widgets.Select( name="Filter for:", @@ -96,36 +97,6 @@ def file_present_chart(): return pane -def notfile_present_chart(): - sum_longform_df = db.get_field_presence() - - chart = ( - alt.Chart(sum_longform_df) - .mark_bar() - .encode( - x=alt.X("column:N", title=None, axis=alt.Axis(grid=False)), - y=alt.Y( - "count:Q", - title=None, - axis=alt.Axis(grid=False), - ), - color=alt.Color( - "category:N", - scale=alt.Scale( - domain=["valid", "present", "missing", "excluded"], - range=color_list, - ), - legend=None, - ), - ) - .properties(title="Other fields") - ) - - pane = pn.pane.Vega(chart) - - return pane - - js_pane = pn.pane.HTML("", height=0, width=0).servable() @@ -133,15 +104,13 @@ def build_csv_jscode(event): """ Create the javascript code and append it to the page. """ - csv = db.get_csv( - top_selector.value, field_selector.value, missing_selector.value - ) + csv = db.get_csv(missing_selector.value) csv_escaped = csv.replace("\n", "\\n").replace( '"', '\\"' ) # Escape newlines and double quotes - get_missing = missing_selector.value == "Missing" - missing_text = "missing" if get_missing else "present" + get_missing = missing_selector.value == "Not Valid/Present" + missing_text = "bad" if get_missing else "good" if not field_selector.value == " ": filename = ( @@ -256,7 +225,7 @@ def build_row(selected_modality, derived_filter): db.modality_filter = selected_modality db.derived_filter = derived_filter - return pn.Row(file_present_chart, notfile_present_chart) + return file_present_chart top_row = pn.bind( diff --git a/src/aind_metadata_viz/docdb.py b/src/aind_metadata_viz/docdb.py index 59406af..b9007c6 100644 --- a/src/aind_metadata_viz/docdb.py +++ b/src/aind_metadata_viz/docdb.py @@ -68,7 +68,7 @@ def __init__( ): """Initialize""" # get data - self._data = _get_file_presence() + self._data = _get_file_presence(test_mode=test_mode) # setup (expected_files, _) = self.get_expected_files() @@ -88,9 +88,9 @@ def data_filtered(self): if not (self.derived_filter == "All assets"): if self.derived_filter == "Raw": - filtered_df = filtered_df[filtered_df["derived"] == False] + filtered_df = filtered_df[filtered_df["derived"]==False] elif self.derived_filter == "Derived": - filtered_df = filtered_df[filtered_df["derived"] == True] + filtered_df = filtered_df[filtered_df["derived"]==True] return filtered_df @@ -123,32 +123,15 @@ def get_file_presence(self): List of expected metadata filenames, by default EXPECTED_FILES """ # Melt to long form - df = self._data.copy() + df = self.data_filtered.copy() df.drop(EXTRA_FIELDS, axis=1, inplace=True) df_melted = df.melt(var_name="file", value_name="state") # Get sum - df_summary = ( - df_melted.groupby(["file", "state"]).size().reset_index(name="sum") - ) + df_summary = df_melted.groupby(["file", "state"]).size().reset_index(name="sum") - print(df_summary) return df_summary - def get_field_presence(self): - """Get the presence of fields at the top-level""" - return pd.DataFrame() - # if len(self.data_filtered) > 0: - # fields = [ - # item - # for item in list(self.data_filtered[0].keys()) - # if item not in EXPECTED_FILES - # ] - # else: - # fields = [] - - # return self.get_file_presence(files=fields) - def set_file(self, file: str): """Set the active file @@ -186,7 +169,7 @@ def get_file_field_presence(self): # return compute_count_true(df) - def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): + def get_csv(self, vp_state: str = "Not Valid/Present"): """Build 
a CSV file of export data based on the selected file and field Parameters @@ -199,41 +182,21 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): Returns ------- csv - CSV file with name, _id, location, creation date, and subject_id (if available) + CSV file with name, _id, location, created date, and subject_id (if available) """ # For everybody who is missing the currently active file/field - id_fields = ["name", "_id", "location", "creation"] - - get_present = missing == "Present" - - # df_data = [] - # for data in self.data_filtered: - # if not data[file] is None: - # if field == " " or _metadata_present_helper( - # field, data[file], check_present=get_present - # ): - # # This file/field combo is present/missing, get all the id - # # information - # id_data = {} - # for id_field in id_fields: - # if id_field in data: - # id_data[id_field] = data[id_field] - # else: - # id_data[id_field] = None - - # # Get subject if available - # if ( - # "subject" in data - # and data["subject"] - # and "subject_id" in data["subject"] - # ): - # id_data["subject_id"] = data["subject"]["subject_id"] - # else: - # id_data["subject_id"] = "" - - # df_data.append(id_data) - - df = pd.DataFrame() + df = self.data_filtered + + df = df[["name", "_id", "location", "created"]] + + print(self.file) + + if vp_state == "Not Valid/Present": + df = df[(self.data_filtered[self.file] == "missing") | (self.data_filtered[self.file] == "optional")] + elif vp_state == "Valid/Present": + df = df[(self.data_filtered[self.file] == "present") | (self.data_filtered[self.file] == "valid")] + + # [TODO] add back in filtering by field and not just file sio = StringIO() df.to_csv(sio, index=False) @@ -241,7 +204,7 @@ def get_csv(self, file: str, field: str = " ", missing: str = "Missing"): @pn.cache(ttl=CACHE_RESET_DAY) -def _get_file_presence() -> pd.DataFrame: +def _get_file_presence(test_mode=False) -> pd.DataFrame: """Get all and convert to data frame format Parameters @@ -249,7 +212,7 @@ def _get_file_presence() -> pd.DataFrame: test_mode : bool, optional _description_, by default False """ - record_list = _get_all() + record_list = _get_all(test_mode=test_mode) files = list(first_layer_field_mapping.keys()) processed = process_record_list(record_list, files)
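
For reference, the presence pipeline assembled above can be reproduced in isolation: process_record_list yields one {file: state} dict per DocDB record, and get_file_presence melts that frame into the long form the Altair charts stack. A minimal sketch with toy records (the states are hard-coded here purely for illustration; the real values come from check_metadata_state):

    import pandas as pd

    # One dict per record, as produced by process_record_list (toy states)
    processed = [
        {"subject": "valid", "procedures": "present"},
        {"subject": "valid", "procedures": "missing"},
        {"subject": "missing", "procedures": "excluded"},
    ]

    df = pd.DataFrame(processed, columns=["subject", "procedures"])

    # Wide -> long: one (file, state) row per record/column pair
    df_melted = df.melt(var_name="file", value_name="state")

    # Count records in each (file, state) bucket
    df_summary = (
        df_melted.groupby(["file", "state"]).size().reset_index(name="sum")
    )
    print(df_summary)

The summary column is named "sum" to match the chart encodings (y="sum:Q"), so each (file, state) count renders as one stacked bar segment colored by the MetaState palette.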