From ec9d5eb973850a861cf73466c30bb2bd8201e74a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 17:45:59 -0400 Subject: [PATCH 001/184] Add src/vak/prep/unit_dataset/ with unit_dataset.py --- src/vak/prep/unit_dataset/__init__.py | 2 + src/vak/prep/unit_dataset/unit_dataset.py | 331 ++++++++++++++++++++++ 2 files changed, 333 insertions(+) create mode 100644 src/vak/prep/unit_dataset/__init__.py create mode 100644 src/vak/prep/unit_dataset/unit_dataset.py diff --git a/src/vak/prep/unit_dataset/__init__.py b/src/vak/prep/unit_dataset/__init__.py new file mode 100644 index 000000000..d2e934e28 --- /dev/null +++ b/src/vak/prep/unit_dataset/__init__.py @@ -0,0 +1,2 @@ +from . import unit_dataset +from .unit_dataset import prep_unit_dataset diff --git a/src/vak/prep/unit_dataset/unit_dataset.py b/src/vak/prep/unit_dataset/unit_dataset.py new file mode 100644 index 000000000..d175ef13f --- /dev/null +++ b/src/vak/prep/unit_dataset/unit_dataset.py @@ -0,0 +1,331 @@ +"""Functions for making a dataset of units from sequences, +as used to train dimensionality reduction models.""" +from __future__ import annotations + +import logging +import os +import pathlib + +import attrs +import crowsetta +import dask +import dask.delayed +from dask.diagnostics import ProgressBar +import numpy as np +import numpy.typing as npt +import pandas as pd + +from ...common import annotation, constants +from ...common.converters import expanded_user_path, labelset_to_set +from ..spectrogram_dataset.audio_helper import files_from_dir +from ..spectrogram_dataset.spect import spectrogram + + +logger = logging.getLogger(__name__) + + +@attrs.define +class Segment: + """Dataclass that represents a segment + from segmented audio or spectrogram. + + The attributes are metadata used to track + the origin of this segment in a csv file + representing a dataset of such segments. + """ + data: npt.NDArray + samplerate: int + onset_s: float + offset_s: float + label: str + sample_dur: float + segment_dur: float + audio_path: str + annot_path: str + + +@dask.delayed +def get_segment_list(audio_path, annot, audio_format, context_s=0.005): + """Get a list of :class:`Segment` instances, given + the path to an audio file and an annotation that indicates + where segments occur in that audio file. + + Function used by + :func:`vak.prep.dimensionality_reduction.unit_dataset.prep_unit_dataset`. + + Parameters + ---------- + audio_path : str + + annot : crowsetta.Annotation + + audio_format : str + + context_s : float + + + Returns + ------- + segments : list + A :class:`list` of :class:`Segment` instances. + """ + data, samplerate = constants.AUDIO_FORMAT_FUNC_MAP[audio_format](audio_path) + sample_dur = 1. / samplerate + + segments = [] + for onset_s, offset_s, label in zip(annot.seq.onsets_s, annot.seq.offsets_s, annot.seq.labels): + onset_s -= context_s + offset_s += context_s + onset_ind = int(np.floor(onset_s * samplerate)) + offset_ind = int(np.ceil(offset_s * samplerate)) + segment_data = data[onset_ind : offset_ind + 1] + segment_dur = segment_data.shape[-1] * sample_dur + segment = Segment( + segment_data, samplerate, onset_s, offset_s, label, sample_dur, segment_dur, audio_path, annot.annot_path + ) + segments.append(segment) + + return segments + + +def spectrogram_from_segment(segment, spect_params): + """Compute a spectrogram given a :class:`Segment` instance. 
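+
+    This is a thin wrapper around
+    :func:`vak.prep.spectrogram_dataset.spect.spectrogram`:
+    it unpacks the needed parameters from ``spect_params``,
+    computes the spectrogram for ``segment.data``,
+    and returns only the spectrogram matrix,
+    discarding the other two arrays that function returns.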
+ + Parameters + ---------- + segment : Segment + spect_params : dict + + Returns + ------- + spect : numpy.ndarray + """ + data, samplerate = np.array(segment.data), segment.samplerate + s, _, _ = spectrogram( + data, + samplerate, + spect_params.fft_size, + spect_params.step_size, + spect_params.thresh, + spect_params.transform_type, + spect_params.freq_cutoffs, + ) + return s + + +@attrs.define +class SpectToSave: + """A spectrogram to be saved. + + Used by :func:`save_spect`. + """ + spect: npt.NDArray + ind: int + audio_path: str + + +def save_spect(spect_to_save: SpectToSave, output_dir: str | pathlib.Path) -> str: + """Save a spectrogram array to an npy file. + + The filename is build from the attributes of ``spect_to_save``, + saved in output dir, and the full path is returned as a string. + + Parameters + ---------- + spect_to_save : SpectToSave + output_dir : str, pathlib.Path + + Returns + ------- + npy_path : str + Path to npy file containing spectrogram inside ``output_dir`` + """ + basename = os.path.basename(spect_to_save.audio_path) + f"-segment-{spect_to_save.ind}" + npy_path = os.path.join(os.path.normpath(output_dir), basename + ".spect.npy") + np.save(npy_path, spect_to_save.spect) + return npy_path + + +@dask.delayed +def pad_spectrogram(record: tuple, pad_length: float): + """Pads a spectrogram to being a certain length + + Parameters + ---------- + record : tuple + pad_length : + + Returns + ------- + spect_padded : numpy.ndarray + With padding of length ``pad_length`` added on left and right sides. + """ + spect_path = record[0] # 'spect_path' + spect = np.load(spect_path) + + excess_needed = pad_length - spect.shape[-1] + pad_left = np.floor(float(excess_needed) / 2).astype("int") + pad_right = np.ceil(float(excess_needed) / 2).astype("int") + spect_padded = np.pad( + spect, [(0, 0), (pad_left, pad_right)], "constant", constant_values=0 + ) + + +def abspath(a_path): + """Convert a path to an absolute path""" + if isinstance(a_path, str) or isinstance(a_path, pathlib.Path): + return str(pathlib.Path(a_path).absolute()) + elif np.isnan(a_path): + return a_path + + +# ---- make spectrograms + records for dataframe ----------------------------------------------------------------------- +def make_spect_return_record(segment, ind, spect_params, output_dir): + """helper function that enables parallelized creation of "records", + i.e. rows for dataframe, from . + Accepts a two-element tuple containing (1) a dictionary that represents a spectrogram + and (2) annotation for that file""" + + spect = spectrogram_from_segment(segment, spect_params) + # FIXME: Add parameters for these functions to config and use + # mask_spec(spect) + # log_resize_spec(spect) + n_timebins = spect.shape[-1] + + spect_to_save = SpectToSave(spect, ind, segment.audio_path) + spect_path = save_spect(spect_to_save, output_dir) + record = tuple( + [ + abspath(spect_path), + abspath(segment.audio_path), + abspath(segment.annot_path), + segment.onset_s, + segment.offset_s, + segment.label, + segment.samplerate, + segment.sample_dur, + segment.segment_dur, + ] + ) + + return record, n_timebins + + +# constant, used for names of columns in DataFrame below +# this is analogous to ``syllable_df`` that ``avgn`` uses. 
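+# note: the order of these column names must match the order of fields in the
+# ``record`` tuple built by ``make_spect_return_record`` above; in particular,
+# ``segment.segment_dur`` is stored under the column name "duration"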
+DF_COLUMNS = [ + "spect_path", + "audio_path", + "annot_path", + "onset_s", + "offset_s", + "label", + "samplerate", + "sample_dur", + "duration", +] + + +def prep_unit_dataset( + audio_format: str, + output_dir: str, + spect_params: dict | config.spect_params.SpectParamsConfig, + data_dir: list | None = None, + annot_format: str | None = None, + annot_file: str | pathlib.Path | None = None, + labelset: set | None = None, + context_s: float = 0.005, +) -> pd.DataFrame: + # pre-conditions --------------------------------------------------------------------------------------------------- + if audio_format not in constants.VALID_AUDIO_FORMATS: + raise ValueError( + f"audio format must be one of '{constants.VALID_AUDIO_FORMATS}'; " + f"format '{audio_format}' not recognized." + ) + + if labelset is not None: + labelset = labelset_to_set(labelset) + + data_dir = expanded_user_path(data_dir) + if not data_dir.is_dir(): + raise NotADirectoryError(f"data_dir not found: {data_dir}") + + audio_files = files_from_dir(data_dir, audio_format) + + if annot_format is not None: + if annot_file is None: + annot_files = annotation.files_from_dir( + annot_dir=data_dir, annot_format=annot_format + ) + scribe = crowsetta.Transcriber(format=annot_format) + annot_list = [scribe.from_file(annot_file).to_annot() for annot_file in annot_files] + else: + scribe = crowsetta.Transcriber(format=annot_format) + annot_list = scribe.from_file(annot_file).to_annot() + if isinstance(annot_list, crowsetta.Annotation): + # if e.g. only one annotated audio file in directory, wrap in a list to make iterable + # fixes https://github.com/NickleDave/vak/issues/467 + annot_list = [annot_list] + else: # if annot_format not specified + annot_list = None + + if annot_list: + audio_annot_map = annotation.map_annotated_to_annot(audio_files, annot_list, annot_format) + else: + # no annotation, so map spectrogram files to None + audio_annot_map = dict((audio_path, None) for audio_path in audio_files) + + # use mapping (if generated/supplied) with labelset, if supplied, to filter + if labelset: # then remove annotations with labels not in labelset + for audio_file, annot in list(audio_annot_map.items()): + # loop in a verbose way (i.e. not a comprehension) + # so we can give user warning when we skip files + annot_labelset = set(annot.seq.labels) + # below, set(labels_mapping) is a set of that dict's keys + if not annot_labelset.issubset(set(labelset)): + # because there's some label in labels that's not in labelset + audio_annot_map.pop(audio_file) + extra_labels = annot_labelset - labelset + logger.info( + f"Found labels, {extra_labels}, in {pathlib.Path(audio_file).name}, " + "that are not in labels_mapping. 
Skipping file.", + ) + + segments = [] + for audio_path, annot in audio_annot_map.items(): + segment_list = dask.delayed(get_segment_list)(audio_path, annot, audio_format, context_s) + segments.append(segment_list) + + logger.info( + "Loading audio for all segments in all files", + ) + with ProgressBar(): + segments = dask.compute(*segments) + segments = [segment for segment_list in segments for segment in segment_list] + + records_n_timebins_tuples = [] + for ind, segment in enumerate(segments): + records_n_timebins_tuple = dask.delayed(make_spect_return_record)(segment, ind, spect_params, output_dir) + records_n_timebins_tuples.append(records_n_timebins_tuple) + with ProgressBar(): + records_n_timebins_tuples = dask.compute(*records_n_timebins_tuples) + + records, n_timebins_list = [], [] + for records_n_timebins_tuple in records_n_timebins_tuples: + record, n_timebins = records_n_timebins_tuple + records.append(record) + n_timebins_list.append(n_timebins) + + pad_length = max(n_timebins_list) + + padded = [] + for record in records: + padded.append( + pad_spectrogram(record, pad_length) + ) + with ProgressBar(): + _ = dask.compute(*padded) + + unit_df = pd.DataFrame.from_records(records, columns=DF_COLUMNS) + + return unit_df From 7b337562a27ab4d40452ea6f171a7dff9e4e8ba2 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 17:46:32 -0400 Subject: [PATCH 002/184] Add vak/prep/dimensionality_reduction/ with prep_dimensionality_reduction_dataset function --- .../prep/dimensionality_reduction/__init__.py | 1 + .../dimensionality_reduction.py | 290 ++++++++++++++++++ 2 files changed, 291 insertions(+) create mode 100644 src/vak/prep/dimensionality_reduction/__init__.py create mode 100644 src/vak/prep/dimensionality_reduction/dimensionality_reduction.py diff --git a/src/vak/prep/dimensionality_reduction/__init__.py b/src/vak/prep/dimensionality_reduction/__init__.py new file mode 100644 index 000000000..e04bea42d --- /dev/null +++ b/src/vak/prep/dimensionality_reduction/__init__.py @@ -0,0 +1 @@ +from .dimensionality_reduction import prep_dimensionality_reduction_dataset diff --git a/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py b/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py new file mode 100644 index 000000000..b9c2f1e5d --- /dev/null +++ b/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py @@ -0,0 +1,290 @@ +import json +import logging +import pathlib +import warnings + +from .. import dataset_df_helper, sequence_dataset, split +from ..unit_dataset import prep_unit_dataset + +from ... import datasets +from ...common import labels +from ...common.converters import expanded_user_path, labelset_to_set +from ...common.logging import config_logging_for_cli, log_version +from ...common.timenow import get_timenow_as_str + + +logger = logging.getLogger(__name__) + + +def prep_dimensionality_reduction_dataset( + data_dir: str | pathlib.Path, + purpose: str, + output_dir: str | pathlib.Path | None = None, + audio_format: str | None = None, + spect_params: dict | None = None, + annot_format: str | None = None, + labelset: set | None = None, + context_s: float = 0.015, + train_dur: int | None = None, + val_dur: int | None = None, + test_dur: int | None = None, + train_set_durs: list[float] | None = None, + num_replicates: int | None = None, + spect_key: str = "s", + timebins_key: str = "t", +): + """Prepare datasets for neural network models + that perform a dimensionality reduction task. 
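+
+    At a high level, this function calls
+    :func:`vak.prep.unit_dataset.prep_unit_dataset`
+    to make one spectrogram per annotated unit (e.g., syllable)
+    found in the audio files in ``data_dir``,
+    optionally splits the resulting dataframe into training,
+    validation, and test subsets,
+    and then saves a csv file and metadata
+    that together represent the dataset.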
+ + For general information on dataset preparation, + see the docstring for :func:`vak.prep.prep`. + + Parameters + ---------- + data_dir : str, Path + Path to directory with files from which to make dataset. + purpose : str + Purpose of the dataset. + One of {'train', 'eval', 'predict', 'learncurve'}. + These correspond to commands of the vak command-line interface. + output_dir : str + Path to location where data sets should be saved. + Default is ``None``, in which case it defaults to ``data_dir``. + audio_format : str + Format of audio files. One of {'wav', 'cbin'}. + Default is ``None``, but either ``audio_format`` or ``spect_format`` + must be specified. + spect_params : dict, vak.config.SpectParams + Parameters for creating spectrograms. Default is ``None``. + annot_format : str + Format of annotations. Any format that can be used with the + :module:`crowsetta` library is valid. Default is ``None``. + labelset : str, list, set + Set of unique labels for vocalizations. Strings or integers. + Default is ``None``. If not ``None``, then files will be skipped + where the associated annotation + contains labels not found in ``labelset``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.converters.labelset_to_set`. + See help for that function for details on how to specify ``labelset``. + train_dur : float + Total duration of training set, in seconds. + When creating a learning curve, + training subsets of shorter duration + will be drawn from this set. Default is None. + val_dur : float + Total duration of validation set, in seconds. + Default is None. + test_dur : float + Total duration of test set, in seconds. + Default is None. + train_set_durs : list + of int, durations in seconds of subsets taken from training data + to create a learning curve, e.g. [5, 10, 15, 20]. + num_replicates : int + number of times to replicate training for each training set duration + to better estimate metrics for a training set of that size. + Each replicate uses a different randomly drawn subset of the training + data (but of the same duration). + spect_key : str + key for accessing spectrogram in files. Default is 's'. + timebins_key : str + key for accessing vector of time bins in files. Default is 't'. + + Returns + ------- + dataset_df : pandas.DataFrame + That represents a dataset. + dataset_path : pathlib.Path + Path to csv saved from ``dataset_df``. + """ + from .. import constants # avoid circular import + + # pre-conditions --------------------------------------------------------------------------------------------------- + if purpose not in constants.VALID_PURPOSES: + raise ValueError( + f"purpose must be one of: {constants.VALID_PURPOSES}\n" + f"Value for purpose was: {purpose}" + ) + + if labelset is not None: + labelset = labelset_to_set(labelset) + + data_dir = expanded_user_path(data_dir) + if not data_dir.is_dir(): + raise NotADirectoryError(f"Path specified for ``data_dir`` not found: {data_dir}") + + if output_dir: + output_dir = expanded_user_path(output_dir) + else: + output_dir = data_dir + + if not output_dir.is_dir(): + raise NotADirectoryError(f"Path specified for ``output_dir`` not found: {output_dir}") + + if purpose == "predict": + if labelset is not None: + warnings.warn( + "The ``purpose`` argument was set to 'predict`, but a ``labelset`` was provided." 
+ "This would cause an error because the ``prep_spectrogram_dataset`` section will attempt to " + f"check whether the files in the ``data_dir`` have labels in " + "``labelset``, even though those files don't have annotation.\n" + "Setting ``labelset`` to None." + ) + labelset = None + else: # if purpose is not predict + if labelset is None: + raise ValueError( + f"The ``purpose`` argument was set to '{purpose}', but no ``labelset`` was provided." + "This will cause an error when trying to split the dataset, " + "e.g. into training and test splits, " + "or a silent error, e.g. when calculating metrics with an evaluation set. " + "Please specify a ``labelset`` when calling ``vak.prep.frame_classification.prep`` " + f"with ``purpose='{purpose}'." + ) + + logger.info(f"Purpose for frame classification dataset: {purpose}") + # ---- set up directory that will contain dataset, and csv file name ----------------------------------------------- + data_dir_name = data_dir.name + timenow = get_timenow_as_str() + dataset_path = output_dir / f'{data_dir_name}-vak-dimensionality-reduction-dataset-generated-{timenow}' + dataset_path.mkdir() + + # NOTE we set up logging here (instead of cli) so the prep log is included in the dataset + config_logging_for_cli( + log_dst=dataset_path, + log_stem="prep", + level="INFO", + force=True + ) + log_version(logger) + + dataset_csv_path = dataset_df_helper.get_dataset_csv_path(dataset_path, data_dir_name, timenow) + logger.info( + f"Will prepare dataset as directory: {dataset_path}" + ) + + # ---- actually make the dataset ----------------------------------------------------------------------------------- + dataset_df = prep_unit_dataset( + audio_format=audio_format, + output_dir=output_dir, + spect_params=spect_params, + data_dir=data_dir, + annot_format=annot_format, + context_s=context_s, + ) + + if dataset_df.empty: + raise ValueError( + "Calling `vak.prep.unit_dataset.prep_unit_dataset` " + "with arguments passed to `vak.core.prep.prep_dimensionality_reduction_dataset` " + "returned an empty dataframe.\n" + "Please double-check arguments to `vak.core.prep` function." + ) + + # save before (possibly) splitting, just in case duration args are not valid + # (we can't know until we make dataset) + dataset_df.to_csv(dataset_csv_path) + + # ---- (possibly) split into train / val / test sets --------------------------------------------- + # catch case where user specified duration for just training set, raise a helpful error instead of failing silently + if (purpose == "train" or purpose == "learncurve") and ( + (train_dur is not None and train_dur > 0) + and (val_dur is None or val_dur == 0) + and (test_dur is None or val_dur == 0) + ): + raise ValueError( + "A duration specified for just training set, but prep function does not currently support creating a " + "single split of a specified duration. 
Either remove the train_dur option from the prep section and " + "rerun, in which case all data will be included in the training set, or specify values greater than " + "zero for test_dur (and val_dur, if a validation set will be used)" + ) + + if all([dur is None for dur in (train_dur, val_dur, test_dur)]) or purpose in ( + "eval", + "predict", + ): + # then we're not going to split + logger.info("Will not split dataset.") + do_split = False + else: + if val_dur is not None and train_dur is None and test_dur is None: + raise ValueError( + "cannot specify only val_dur, unclear how to split dataset into training and test sets" + ) + else: + logger.info("Will split dataset.") + do_split = True + + if do_split: + dataset_df = split.dataframe( + dataset_df, + dataset_path, + labelset=labelset, + train_dur=train_dur, + val_dur=val_dur, + test_dur=test_dur, + ) + + elif ( + do_split is False + ): # add a split column, but assign everything to the same 'split' + # ideally we would just say split=purpose in call to add_split_col, but + # we have to special case, because "eval" looks for a 'test' split (not an "eval" split) + if purpose == "eval": + split_name = "test" # 'split_name' to avoid name clash with split package + elif purpose == "predict": + split_name = "predict" + + dataset_df = dataset_df_helper.add_split_col(dataset_df, split=split_name) + + # ---- create and save labelmap ------------------------------------------------------------------------------------ + # we do this before creating array files since we need to load the labelmap to make frame label vectors + if purpose != 'predict': + # TODO: add option to generate predict using existing dataset, so we can get labelmap from it + map_unlabeled_segments = sequence_dataset.has_unlabeled_segments(dataset_df) + labelmap = labels.to_map(labelset, map_unlabeled=map_unlabeled_segments) + logger.info( + f"Number of classes in labelmap: {len(labelmap)}", + ) + # save labelmap in case we need it later + with (dataset_path / "labelmap.json").open("w") as fp: + json.dump(labelmap, fp) + else: + labelmap = None + + # ---- if purpose is learncurve, additionally prep splits for that ------------------------------------------------- + # if purpose == 'learncurve': + # dataset_df = make_learncurve_splits_from_dataset_df( + # dataset_df, + # input_type, + # train_set_durs, + # num_replicates, + # dataset_path, + # labelmap, + # audio_format, + # spect_key, + # timebins_key, + # ) + + # ---- save csv file that captures provenance of source data ------------------------------------------------------- + logger.info( + f"Saving dataset csv file: {dataset_csv_path}" + ) + dataset_df.to_csv( + dataset_csv_path, index=False + ) # index is False to avoid having "Unnamed: 0" column when loading + + # ---- save metadata ----------------------------------------------------------------------------------------------- + frame_dur = validators.validate_and_get_frame_dur(dataset_df, input_type) + + metadata = datasets.frame_classification.Metadata( + dataset_csv_filename=str(dataset_csv_path.name), + frame_dur=frame_dur, + input_type=input_type, + audio_format=audio_format, + spect_format=spect_format, + ) + metadata.to_json(dataset_path) + + return dataset_df, dataset_path From 4452620559d8673dfc3189f15bf99fb370c0e557 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 17:46:48 -0400 Subject: [PATCH 003/184] Import new modules in vak/prep/__init__.py --- src/vak/prep/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/src/vak/prep/__init__.py b/src/vak/prep/__init__.py index c93eb43f6..6795fee53 100644 --- a/src/vak/prep/__init__.py +++ b/src/vak/prep/__init__.py @@ -2,8 +2,10 @@ audio_dataset, constants, dataset_df_helper, + dimensionality_reduction, frame_classification, spectrogram_dataset, + unit_dataset, ) from .prep import prep @@ -12,7 +14,9 @@ 'audio_dataset', 'constants', 'dataset_df_helper', + 'dimensionality_reduction', 'frame_classification', 'prep', 'spectrogram_dataset', + 'unit_dataset', ] From ece533de3e2dcc35dacc9ddbf9ead0c403ded858 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 17:47:10 -0400 Subject: [PATCH 004/184] Remove parameter from prep_frame_classification dataset docstring, not a parameter of this function --- src/vak/prep/frame_classification/frame_classification.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/vak/prep/frame_classification/frame_classification.py b/src/vak/prep/frame_classification/frame_classification.py index 4760589a3..7fac7a300 100644 --- a/src/vak/prep/frame_classification/frame_classification.py +++ b/src/vak/prep/frame_classification/frame_classification.py @@ -58,12 +58,6 @@ def prep_frame_classification_dataset( Purpose of the dataset. One of {'train', 'eval', 'predict', 'learncurve'}. These correspond to commands of the vak command-line interface. - dataset_type : str - String name of the type of dataset, e.g., - 'frame_classification'. Dataset types are - defined by machine learning tasks, e.g., - a 'frame_classification' dataset would be used - a :class:`vak.models.FrameClassificationModel` model. input_type : str The type of input to the neural network model. One of {'audio', 'spect'}. From 7b2ef8120b7d18c50418a7ca7c1a702ea9c81aa6 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 18:08:58 -0400 Subject: [PATCH 005/184] Rename 'vak.prep.split.dataframe' -> 'vak.prep.split.frame_classification_dataframe', and add function 'vak.prep.split.unit_dataframe' --- src/vak/prep/split/__init__.py | 2 +- src/vak/prep/split/split.py | 94 +++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/src/vak/prep/split/__init__.py b/src/vak/prep/split/__init__.py index 5a2e95b8f..aeea84a09 100644 --- a/src/vak/prep/split/__init__.py +++ b/src/vak/prep/split/__init__.py @@ -1,3 +1,3 @@ from . import algorithms -from .split import dataframe +from .split import frame_classification_dataframe, unit_dataframe diff --git a/src/vak/prep/split/split.py b/src/vak/prep/split/split.py index d455022d8..1dbd798a9 100644 --- a/src/vak/prep/split/split.py +++ b/src/vak/prep/split/split.py @@ -27,7 +27,6 @@ def train_test_dur_split_inds( ): """Return indices to split a dataset into training, test, and validation sets of specified durations. - Given the durations of a set of vocalizations, and labels from the annotations for those vocalizations, this function returns arrays of indices for splitting up the set into training, test, and validation sets. @@ -80,7 +79,7 @@ def train_test_dur_split_inds( f"training, test, and (if specified) validation sets: {total_target_dur}" ) - logger.info( + logger.info(, f"Total target duration of splits: {total_target_dur} seconds. 
" f"Will be drawn from dataset with total duration: {total_dur:.3f}.", ) @@ -95,15 +94,18 @@ def train_test_dur_split_inds( return train_inds, val_inds, test_inds -def dataframe( +def frame_classification_dataframe( dataset_df: pd.DataFrame, dataset_path: str | pathlib.Path, labelset: set, train_dur: float | None = None, test_dur: float | None = None, val_dur: float | None = None ): - """Split a dataset of vocalizations into training, test, and (optionally) validation subsets, + """Create datasets splits from a dataframe + representing a frame classification dataset. + + Splits dataset into training, test, and (optionally) validation subsets, specified by their duration. - Takes dataset represented as a pandas DataFrame and adds a 'split' column that assigns each - row to 'train', 'val', 'test', or 'None'. + Additionally adds a 'split' column to the dataframe, + that assigns each row to 'train', 'val', 'test', or 'None'. Parameters ---------- @@ -167,3 +169,83 @@ def dataframe( dataset_df["split"] = split_col return dataset_df + + +def unit_dataframe( + dataset_df: pd.DataFrame, dataset_path: str | pathlib.Path, labelset: set, + train_dur: float | None = None, test_dur: float | None = None, val_dur: float | None = None +): + """Create datasets splits from a dataframe + representing a unit dataset. + + Splits dataset into training, test, and (optionally) validation subsets, + specified by their duration. + + Additionally adds a 'split' column to the dataframe, + that assigns each row to 'train', 'val', 'test', or 'None'. + + Parameters + ---------- + dataset_df : pandas.Dataframe + A pandas DataFrame representing the samples in a dataset, + generated by ``vak prep``. + dataset_path : str + Path to dataset, a directory generated by running ``vak prep``. + labelset : set, list + The set of label classes for vocalizations in dataset. + train_dur : float + Total duration of training set, in seconds. Default is None + test_dur : float + Total duration of test set, in seconds. Default is None. + val_dur : float + Total duration of validation set, in seconds. Default is None. + + Returns + ------- + dataset_df : pandas.Dataframe + A copy of the input dataset with a 'split' column added, + that assigns each vocalization (row) to a subset, + i.e., train, validation, or test. + If the vocalization was not added to one of the subsets, + its value for 'split' will be 'None'. + + Notes + ----- + Uses the function :func:`vak.dataset.split.train_test_dur_split_inds` + to find indices for each subset. 
+ """ + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f"`dataset_path` not found or not recognized as a directory: {dataset_path}" + ) + + dataset_df = ( + dataset_df.copy() + ) # don't want this function to have unexpected side effects, so return a copy + labels = [ + np.array([label]) for label in dataset_df.label.values + ] + + durs = dataset_df["duration"].values + train_inds, val_inds, test_inds = train_test_dur_split_inds( + durs=durs, + labels=labels, + labelset=labelset, + train_dur=train_dur, + test_dur=test_dur, + val_dur=val_dur, + ) + + # start off with all elements set to 'None' + # so we don't have to change any that are not assigned to one of the subsets to 'None' after + split_col = np.asarray(["None" for _ in range(len(dataset_df))], dtype="object") + split_zip = zip(["train", "val", "test"], [train_inds, val_inds, test_inds]) + for split_name, split_inds in split_zip: + if split_inds is not None: + split_col[split_inds] = split_name + + # add split column to dataframe + dataset_df["split"] = split_col + + return dataset_df \ No newline at end of file From 79f52441f93cbe45d7230953a58d755a491c9597 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 18:09:34 -0400 Subject: [PATCH 006/184] Use renamed 'split.frame_classification_dataframe' in vak.prep.frame_classification.prep_frame_classification_dataset --- src/vak/prep/frame_classification/frame_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/prep/frame_classification/frame_classification.py b/src/vak/prep/frame_classification/frame_classification.py index 7fac7a300..2ddaf0e39 100644 --- a/src/vak/prep/frame_classification/frame_classification.py +++ b/src/vak/prep/frame_classification/frame_classification.py @@ -314,7 +314,7 @@ def prep_frame_classification_dataset( do_split = True if do_split: - dataset_df = split.dataframe( + dataset_df = split.frame_classification_dataframe( dataset_df, dataset_path, labelset=labelset, From 41776a43c9b0bea42b3fa4fd6fb15d23eeedba64 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 18:46:59 -0400 Subject: [PATCH 007/184] Fix typo in src/vak/prep/split/split.py --- src/vak/prep/split/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/prep/split/split.py b/src/vak/prep/split/split.py index 1dbd798a9..4a2407c42 100644 --- a/src/vak/prep/split/split.py +++ b/src/vak/prep/split/split.py @@ -79,7 +79,7 @@ def train_test_dur_split_inds( f"training, test, and (if specified) validation sets: {total_target_dur}" ) - logger.info(, + logger.info( f"Total target duration of splits: {total_target_dur} seconds. 
" f"Will be drawn from dataset with total duration: {total_dur:.3f}.", ) From 2c7967eb0e94862891ce91315ec8b678ad73aa6f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 18:47:17 -0400 Subject: [PATCH 008/184] Remove wrong type hint in src/vak/prep/unit_dataset/unit_dataset.py --- src/vak/prep/unit_dataset/unit_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/prep/unit_dataset/unit_dataset.py b/src/vak/prep/unit_dataset/unit_dataset.py index d175ef13f..a8542a6a7 100644 --- a/src/vak/prep/unit_dataset/unit_dataset.py +++ b/src/vak/prep/unit_dataset/unit_dataset.py @@ -229,7 +229,7 @@ def make_spect_return_record(segment, ind, spect_params, output_dir): def prep_unit_dataset( audio_format: str, output_dir: str, - spect_params: dict | config.spect_params.SpectParamsConfig, + spect_params: dict, data_dir: list | None = None, annot_format: str | None = None, annot_file: str | pathlib.Path | None = None, From a918181f0a87309823109c7146c609269b13dbdf Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:16:43 -0400 Subject: [PATCH 009/184] Add vak/prep/dimensionality_reduction/dataset_arrays.py with function 'move_files_into_split_subdirs' --- .../dataset_arrays.py | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/vak/prep/dimensionality_reduction/dataset_arrays.py diff --git a/src/vak/prep/dimensionality_reduction/dataset_arrays.py b/src/vak/prep/dimensionality_reduction/dataset_arrays.py new file mode 100644 index 000000000..84ce2b108 --- /dev/null +++ b/src/vak/prep/dimensionality_reduction/dataset_arrays.py @@ -0,0 +1,116 @@ +"""Helper functions for `vak.prep.dimensionality_reduction` module +that handle array files. +""" +from __future__ import annotations + +import logging +import pathlib +import shutil + +import pandas as pd + + +logger = logging.getLogger(__name__) + + +def move_files_into_split_subdirs( + dataset_df: pd.DataFrame, + dataset_path: pathlib.Path, + purpose: str) -> None: + """Move npy files in dataset into sub-directories, one for each split in the dataset. + + This is run *after* calling :func:`vak.prep.unit_dataset.prep_unit_dataset` + to generate ``dataset_df``. + + Parameters + ---------- + dataset_df : pandas.DataFrame + A ``pandas.DataFrame`` returned by + :func:`vak.prep.unit_dataset.prep_unit_dataset` + with a ``'split'`` column added, as a result of calling + :func:`vak.prep.split.unit_dataframe` or because it was added "manually" + by calling :func:`vak.core.prep.prep_helper.add_split_col` (as is done + for 'predict' when the entire ``DataFrame`` belongs to this + "split"). + dataset_path : pathlib.Path + Path to directory that represents dataset. + purpose: str + A string indicating what the dataset will be used for. + One of {'train', 'eval', 'predict', 'learncurve'}. + Determined by :func:`vak.core.prep.prep` + using the TOML configuration file. + + Returns + ------- + None + + The ``DataFrame`` is modified in place + as the files are moved, so nothing is returned. 
+ """ + moved_spect_paths = [] # to clean up after moving -- may be empty if we copy all spects (e.g., user generated) + # ---- copy/move files into split sub-directories inside dataset directory + # Next line, note we drop any na rows in the split column, since they don't belong to a split anyway + split_names = sorted(dataset_df.split.dropna().unique()) + + for split_name in split_names: + if split_name == 'None': + # these are files that didn't get assigned to a split + continue + split_subdir = dataset_path / split_name + split_subdir.mkdir() + + split_df = dataset_df[dataset_df.split == split_name].copy() + split_spect_paths = [ + # this just converts from string to pathlib.Path + pathlib.Path(spect_path) + for spect_path in split_df['spect_path'].values + ] + is_in_dataset_dir = [ + # if dataset_path is one of the parents of spect_path, we can move; otherwise, we copy + dataset_path.resolve() in list(spect_path.parents) + for spect_path in split_spect_paths + ] + if all(is_in_dataset_dir): + move_spects = True + elif all([not is_in_dir for is_in_dir in is_in_dataset_dir]): + move_spects = False + else: + raise ValueError( + "Expected to find either all spectrograms were in dataset directory, " + "or all were in some other directory, but found a mixture. " + f"Spectrogram paths for split being moved within dataset directory:\n{split_spect_paths}" + ) + + new_spect_paths = [] # to fix DataFrame + for spect_path in split_spect_paths: + spect_path = pathlib.Path(spect_path) + if move_spects: # because it's within dataset_path already + new_spect_path = spect_path.rename( + split_subdir / spect_path.name + ) + moved_spect_paths.append( + spect_path + ) + else: # copy instead of moving + new_spect_path = shutil.copy( + src=spect_path, dst=split_subdir + ) + + new_spect_paths.append( + # rewrite paths relative to dataset directory's root, so dataset is portable + pathlib.Path(new_spect_path).relative_to(dataset_path) + ) + + # cast to str before rewrite so that type doesn't silently change for some rows + new_spect_paths = [str(new_spect_path) for new_spect_path in new_spect_paths] + dataset_df.loc[split_df.index, 'spect_path'] = new_spect_paths + + # ---- clean up after moving/copying ------------------------------------------------------------------------------- + # remove any directories that we just emptied + if moved_spect_paths: + unique_parents = set([ + moved_spect.parent for moved_spect in moved_spect_paths + ]) + for parent in unique_parents: + if len(list(parent.iterdir())) < 1: + shutil.rmtree(parent) From 4ae6fb02b2bec3a616128d48feb1ea3ef61a2ab5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:17:12 -0400 Subject: [PATCH 010/184] Add src/vak/datasets/dimensionality_reduction/ with unit_dataset.py and metadata.py --- .../dimensionality_reduction/__init__.py | 0 .../dimensionality_reduction/metadata.py | 134 ++++++++++++++++++ .../dimensionality_reduction/unit_dataset.py | 0 3 files changed, 134 insertions(+) create mode 100644 src/vak/datasets/dimensionality_reduction/__init__.py create mode 100644 src/vak/datasets/dimensionality_reduction/metadata.py create mode 100644 src/vak/datasets/dimensionality_reduction/unit_dataset.py diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/vak/datasets/dimensionality_reduction/metadata.py b/src/vak/datasets/dimensionality_reduction/metadata.py new file mode 100644 index 
000000000..518de94e2 --- /dev/null +++ b/src/vak/datasets/dimensionality_reduction/metadata.py @@ -0,0 +1,134 @@ +"""A dataclass that represents metadata +associated with a dimensionality reduction dataset, +as generated by +:func:`vak.core.prep.frame_classification.prep_dimensionality_reduction_dataset`""" +from __future__ import annotations + +import json +import pathlib +from typing import ClassVar + +import attr + + +def is_valid_dataset_csv_filename(instance, attribute, value): + valid = '_prep_' in value and value.endswith('.csv') + if not valid: + raise ValueError( + f'Invalid dataset csv filename: {value}.' + f'Filename should contain the string "_prep_" ' + f'and end with the extension .csv.' + f'Valid filenames are generated by ' + f'vak.core.prep.generate_dataset_csv_filename' + ) + + +def is_valid_audio_format(instance, attribute, value): + import vak.common.constants + if value not in vak.common.constants.VALID_AUDIO_FORMATS: + raise ValueError( + f"Not a valid audio format: {value}. Valid audio formats are: {vak.common.constants.VALID_AUDIO_FORMATS}" + ) + + +def is_valid_spect_format(instance, attribute, value): + import vak.common.constants + if value not in vak.common.constants.VALID_SPECT_FORMATS: + raise ValueError( + f"Not a valid spectrogram format: {value}. " + f"Valid spectrogram formats are: {vak.common.constants.VALID_SPECT_FORMATS}" + ) + + +@attr.define +class Metadata: + """A dataclass that represents metadata + associated with a dataset that was + generated by :func:`vak.core.prep.prep`. + + Attributes + ---------- + dataset_csv_filename : str + Name of csv file representing the source files in the dataset. + Csv file will be located in root of directory representing dataset, + so only the filename is given. + """ + # declare this as a constant to avoid + # needing to remember this in multiple places, and to use in unit tests + METADATA_JSON_FILENAME: ClassVar = 'metadata.json' + + dataset_csv_filename: str = attr.field(converter=str, validator=is_valid_dataset_csv_filename) + + audio_format: str = attr.field( + converter=attr.converters.optional(str), + validator=attr.validators.optional(is_valid_audio_format), + default=None + ) + + @classmethod + def from_path(cls, json_path: str | pathlib.Path): + """Load dataset metadata from a json file. + + Class method that returns an instance of + :class:`~vak.datasets.frame_classification.FrameClassificationDatatsetMetadata`. + + Parameters + ---------- + json_path : string, pathlib.Path + Path to a 'metadata.json' file created by + :func:`vak.core.prep.prep` when generating + a dataset. + + Returns + ------- + metadata : vak.datasets.frame_classification.FrameClassificationDatatsetMetadata + Instance of :class:`~vak.datasets.frame_classification.FrameClassificationDatatsetMetadata` + with metadata loaded from json file. 
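+
+        Examples
+        --------
+        A minimal, hypothetical example; the path and filename shown
+        are illustrative only.
+
+        >>> metadata = Metadata.from_path('dataset_dir/metadata.json')  # doctest: +SKIP
+        >>> metadata.dataset_csv_filename  # doctest: +SKIP
+        'llb3_data_prep_230707_1200.csv'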
+ """ + json_path = pathlib.Path(json_path) + with json_path.open('r') as fp: + metadata_json = json.load(fp) + return cls(**metadata_json) + + @classmethod + def from_dataset_path(cls, dataset_path: str | pathlib.Path): + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f"`dataset_path` not found or not recognized as a directory: {dataset_path}" + ) + + metadata_json_path = dataset_path / cls.METADATA_JSON_FILENAME + if not metadata_json_path.exists(): + raise FileNotFoundError( + f"Metadata file not found: {metadata_json_path}" + ) + + return cls.from_path(metadata_json_path) + + def to_json(self, dataset_path: str | pathlib.Path) -> None: + """Dump dataset metadata to a json file. + + This method is called by :func:`vak.core.prep.prep` + after it generates a dataset and then creates an + instance of :class:`~vak.datasets.frame_classification.FrameClassificationDatatsetMetadata` + with metadata about that dataset. + + Parameters + ---------- + dataset_path : string, pathlib.Path + Path to root of a directory representing a dataset + generated by :func:`vak.core.prep.prep`. + where 'metadata.json' file + should be saved. + """ + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f'dataset_path not recognized as a directory: {dataset_path}' + ) + + json_dict = attr.asdict(self) + json_path = dataset_path / self.METADATA_JSON_FILENAME + with json_path.open('w') as fp: + json.dump(json_dict, fp, indent=4) diff --git a/src/vak/datasets/dimensionality_reduction/unit_dataset.py b/src/vak/datasets/dimensionality_reduction/unit_dataset.py new file mode 100644 index 000000000..e69de29bb From 702a9e25544aa20c80c6c7f07c8d5d2e36688335 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:18:52 -0400 Subject: [PATCH 011/184] Add initial UnitDataset class, fix imports in datasets/dimensionality_reduction/__init__.py --- .../datasets/dimensionality_reduction/__init__.py | 10 ++++++++++ .../dimensionality_reduction/unit_dataset.py | 15 +++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py index e69de29bb..51fa87ea7 100644 --- a/src/vak/datasets/dimensionality_reduction/__init__.py +++ b/src/vak/datasets/dimensionality_reduction/__init__.py @@ -0,0 +1,10 @@ +from .metadata import Metadata +from .unit_dataset import UnitDataset + + +__all__ = [ + "constants", + "Metadata", + "FramesDataset", + "WindowDataset" +] diff --git a/src/vak/datasets/dimensionality_reduction/unit_dataset.py b/src/vak/datasets/dimensionality_reduction/unit_dataset.py index e69de29bb..432d51488 100644 --- a/src/vak/datasets/dimensionality_reduction/unit_dataset.py +++ b/src/vak/datasets/dimensionality_reduction/unit_dataset.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import pathlib +from typing import Callable + +import numpy as np +import numpy.typing as npt +import pandas as pd + +from . 
import constants +from .metadata import Metadata + + +class UnitDataset: + pass From 1ed130349c599503fe802bf8807e7ed00ab2e4bf Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:19:08 -0400 Subject: [PATCH 012/184] Import dataset_arrays module in src/vak/prep/dimensionality_reduction/__init__.py --- src/vak/prep/dimensionality_reduction/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/prep/dimensionality_reduction/__init__.py b/src/vak/prep/dimensionality_reduction/__init__.py index e04bea42d..71d3feca5 100644 --- a/src/vak/prep/dimensionality_reduction/__init__.py +++ b/src/vak/prep/dimensionality_reduction/__init__.py @@ -1 +1,2 @@ +from . import dataset_arrays from .dimensionality_reduction import prep_dimensionality_reduction_dataset From a733332f7c2242c59068a21f504fd9a4b8fdddac Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:25:15 -0400 Subject: [PATCH 013/184] Import dimensionality_reduction in vak/datasets/__init__.py --- src/vak/datasets/__init__.py | 2 ++ src/vak/datasets/dimensionality_reduction/__init__.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vak/datasets/__init__.py b/src/vak/datasets/__init__.py index 68acbd722..8a8618ce7 100644 --- a/src/vak/datasets/__init__.py +++ b/src/vak/datasets/__init__.py @@ -1,8 +1,10 @@ from . import ( + dimensionality_reduction, frame_classification, ) __all__ = [ + "dimensionality_reduction", "frame_classification", ] diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py index 51fa87ea7..98341d56f 100644 --- a/src/vak/datasets/dimensionality_reduction/__init__.py +++ b/src/vak/datasets/dimensionality_reduction/__init__.py @@ -2,9 +2,7 @@ from .unit_dataset import UnitDataset -__all__ = [ - "constants", +__all__ = [, "Metadata", - "FramesDataset", - "WindowDataset" + "UnitDataset", ] From 77a7cc4f67b4331cf22fcffcf5d11a89cb5bb4b9 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:25:58 -0400 Subject: [PATCH 014/184] Fix typo in src/vak/datasets/dimensionality_reduction/__init__.py --- src/vak/datasets/dimensionality_reduction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py index 98341d56f..f82402171 100644 --- a/src/vak/datasets/dimensionality_reduction/__init__.py +++ b/src/vak/datasets/dimensionality_reduction/__init__.py @@ -2,7 +2,7 @@ from .unit_dataset import UnitDataset -__all__ = [, +__all__ = [ "Metadata", "UnitDataset", ] From 7e5d6b9035f3210e7dcec68cf000e4540af1adab Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:37:27 -0400 Subject: [PATCH 015/184] Fix `vak.prep.dimensionality_reduction.prep_dimensionality_reduction_dataset` to use `dataset_arrays.move_array_files_into_split_dirs` and `Metadata` --- .../dimensionality_reduction.py | 62 +++++++++++++++---- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py b/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py index b9c2f1e5d..847e21ca7 100644 --- a/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py +++ b/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py @@ -3,7 +3,10 @@ import pathlib import warnings -from .. import dataset_df_helper, sequence_dataset, split +import crowsetta + +from . import dataset_arrays +from .. 
import dataset_df_helper, split from ..unit_dataset import prep_unit_dataset from ... import datasets @@ -23,6 +26,7 @@ def prep_dimensionality_reduction_dataset( audio_format: str | None = None, spect_params: dict | None = None, annot_format: str | None = None, + annot_file: str | pathlib.Path | None = None, labelset: set | None = None, context_s: float = 0.015, train_dur: int | None = None, @@ -122,6 +126,13 @@ def prep_dimensionality_reduction_dataset( if not output_dir.is_dir(): raise NotADirectoryError(f"Path specified for ``output_dir`` not found: {output_dir}") + if annot_file is not None: + annot_file = expanded_user_path(annot_file) + if not annot_file.exists(): + raise FileNotFoundError( + f'Path specified for ``annot_file`` not found: {annot_file}' + ) + if purpose == "predict": if labelset is not None: warnings.warn( @@ -150,6 +161,29 @@ def prep_dimensionality_reduction_dataset( dataset_path = output_dir / f'{data_dir_name}-vak-dimensionality-reduction-dataset-generated-{timenow}' dataset_path.mkdir() + if annot_file and annot_format == 'birdsong-recognition-dataset': + # we do this normalization / canonicalization after we make dataset_path + # so that we can put the new annot_file inside of dataset_path, instead of + # making new files elsewhere on a user's system + logger.info("The ``annot_format`` argument was set to 'birdsong-recognition-format'; " + "this format requires the audio files for their sampling rate " + "to convert onset and offset times of birdsong syllables to seconds." + "Converting this format to 'generic-seq' now with the times in seconds, " + "so that the dataset prepared by vak will not require the audio files.") + birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_file) + annots = birdsongrec.to_annot() + # note we point `annot_file` at a new file we're about to make + annot_file = dataset_path / f'{annot_file.stem}.converted-to-generic-seq.csv' + # and we remake Annotations here so that annot_path points to this new file, not the birdsong-rec Annotation.xml + annots = [ + crowsetta.Annotation(seq=annot.seq, annot_path=annot_file, notated_path=annot.notated_path) + for annot in annots + ] + generic_seq = crowsetta.formats.seq.GenericSeq(annots=annots) + generic_seq.to_file(annot_file) + # and we now change `annot_format` as well. 
Both these will get passed to io.prep_spectrogram_dataset + annot_format = 'generic-seq' + # NOTE we set up logging here (instead of cli) so the prep log is included in the dataset config_logging_for_cli( log_dst=dataset_path, @@ -167,10 +201,12 @@ def prep_dimensionality_reduction_dataset( # ---- actually make the dataset ----------------------------------------------------------------------------------- dataset_df = prep_unit_dataset( audio_format=audio_format, - output_dir=output_dir, + output_dir=dataset_path, spect_params=spect_params, data_dir=data_dir, annot_format=annot_format, + annot_file=annot_file, + labelset=labelset, context_s=context_s, ) @@ -217,7 +253,7 @@ def prep_dimensionality_reduction_dataset( do_split = True if do_split: - dataset_df = split.dataframe( + dataset_df = split.unit_dataframe( dataset_df, dataset_path, labelset=labelset, @@ -242,8 +278,7 @@ def prep_dimensionality_reduction_dataset( # we do this before creating array files since we need to load the labelmap to make frame label vectors if purpose != 'predict': # TODO: add option to generate predict using existing dataset, so we can get labelmap from it - map_unlabeled_segments = sequence_dataset.has_unlabeled_segments(dataset_df) - labelmap = labels.to_map(labelset, map_unlabeled=map_unlabeled_segments) + labelmap = labels.to_map(labelset, map_unlabeled=False) logger.info( f"Number of classes in labelmap: {len(labelmap)}", ) @@ -253,11 +288,17 @@ def prep_dimensionality_reduction_dataset( else: labelmap = None - # ---- if purpose is learncurve, additionally prep splits for that ------------------------------------------------- + # ---- make arrays that represent final dataset -------------------------------------------------------------------- + dataset_arrays.move_files_into_split_subdirs( + dataset_df, + dataset_path, + purpose, + ) + # + # # ---- if purpose is learncurve, additionally prep splits for that ------------------------------------------------- # if purpose == 'learncurve': # dataset_df = make_learncurve_splits_from_dataset_df( # dataset_df, - # input_type, # train_set_durs, # num_replicates, # dataset_path, @@ -276,14 +317,9 @@ def prep_dimensionality_reduction_dataset( ) # index is False to avoid having "Unnamed: 0" column when loading # ---- save metadata ----------------------------------------------------------------------------------------------- - frame_dur = validators.validate_and_get_frame_dur(dataset_df, input_type) - - metadata = datasets.frame_classification.Metadata( + metadata = datasets.dimensionality_reduction.Metadata( dataset_csv_filename=str(dataset_csv_path.name), - frame_dur=frame_dur, - input_type=input_type, audio_format=audio_format, - spect_format=spect_format, ) metadata.to_json(dataset_path) From 32759b30edfc4655417d5bde88619351da4051d1 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 7 Jul 2023 19:37:40 -0400 Subject: [PATCH 016/184] Remove wrong import from src/vak/datasets/dimensionality_reduction/unit_dataset.py --- src/vak/datasets/dimensionality_reduction/unit_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/datasets/dimensionality_reduction/unit_dataset.py b/src/vak/datasets/dimensionality_reduction/unit_dataset.py index 432d51488..b43b2ff63 100644 --- a/src/vak/datasets/dimensionality_reduction/unit_dataset.py +++ b/src/vak/datasets/dimensionality_reduction/unit_dataset.py @@ -7,7 +7,6 @@ import numpy.typing as npt import pandas as pd -from . 
import constants from .metadata import Metadata From d732fd53e7c6d3e51ef7b5b893e0e97a69c85dbe Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 07:52:05 -0400 Subject: [PATCH 017/184] Fix pad_spectrogram to re-save file after padding --- src/vak/prep/unit_dataset/unit_dataset.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/vak/prep/unit_dataset/unit_dataset.py b/src/vak/prep/unit_dataset/unit_dataset.py index a8542a6a7..83db3e50e 100644 --- a/src/vak/prep/unit_dataset/unit_dataset.py +++ b/src/vak/prep/unit_dataset/unit_dataset.py @@ -147,18 +147,14 @@ def save_spect(spect_to_save: SpectToSave, output_dir: str | pathlib.Path) -> st @dask.delayed -def pad_spectrogram(record: tuple, pad_length: float): - """Pads a spectrogram to being a certain length +def pad_spectrogram(record: tuple, pad_length: float) -> None: + """Pads a spectrogram to a specified length on the left and right sides. + Spectrogram is saved again after padding. Parameters ---------- record : tuple - pad_length : - - Returns - ------- - spect_padded : numpy.ndarray - With padding of length ``pad_length`` added on left and right sides. + pad_length : int """ spect_path = record[0] # 'spect_path' spect = np.load(spect_path) @@ -169,6 +165,7 @@ def pad_spectrogram(record: tuple, pad_length: float): spect_padded = np.pad( spect, [(0, 0), (pad_left, pad_right)], "constant", constant_values=0 ) + np.save(spect_path, spect_padded) def abspath(a_path): From dceb44a2b00d62912f9dacd4b5157e936a7471dc Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 07:52:27 -0400 Subject: [PATCH 018/184] Add vak/nn/loss/umap.py --- src/vak/nn/loss/umap.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 src/vak/nn/loss/umap.py diff --git a/src/vak/nn/loss/umap.py b/src/vak/nn/loss/umap.py new file mode 100644 index 000000000..750358938 --- /dev/null +++ b/src/vak/nn/loss/umap.py @@ -0,0 +1,68 @@ +import torch + + +def convert_distance_to_probability(distances, a=1.0, b=1.0): + """Convert distances to probability. + + Computes equation (2.6) of Sainburg McInnes Gentner 2021, + :math:`q_{ij} = (1 + a \abs{z_i - z_j}^{2b} )^{-1}`. + + The function uses torch.log1p to avoid floating point error: + ``-torch.log1p(a * distances ** (2 * b))``. + See https://en.wikipedia.org/wiki/Natural_logarithm#lnp1 + """ + # next line, equivalent to 1.0 / (1.0 + a * distances ** (2 * b)) + # but avoids floating point error + return -torch.log1p(a * distances ** (2 * b)) + + +def compute_cross_entropy( + probabilities_graph, probabilities_distance, EPS=1e-4, repulsion_strength=1.0 +): + """Computes cross entropy as used for UMAP cost function""" + # cross entropy + attraction_term = -probabilities_graph * torch.nn.functional.logsigmoid( + probabilities_distance + ) + repulsion_term = ( + -(1.0 - probabilities_graph) * (torch.nn.functional.logsigmoid(probabilities_distance) - probabilities_distance) * repulsion_strength + ) + + # balance the expected losses between attraction and repulsion + CE = attraction_term + repulsion_term + return attraction_term, repulsion_term, CE + + +def umap_loss(embedding_to, embedding_from, a, b, batch_size, negative_sample_rate=5): + """UMAP loss function + + Converts distances to probabilities, + and then computes cross entropy. 
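+
+    Negative samples are drawn by repeating the batch ``negative_sample_rate``
+    times and shuffling ``embedding_from``, so the first ``batch_size`` pairs
+    are treated as edges of the UMAP graph (target probability 1) and the
+    remaining ``batch_size * negative_sample_rate`` pairs as non-edges
+    (target probability 0). Distances between pairs are converted to
+    low-dimensional similarities
+    :math:`q_{ij} = (1 + a \|z_i - z_j\|^{2b})^{-1}`
+    by :func:`convert_distance_to_probability`, and the loss returned is the
+    mean of the cross entropy (attraction plus repulsion terms) computed by
+    :func:`compute_cross_entropy`.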
+ """ + # get negative samples by randomly shuffling the batch + embedding_neg_to = embedding_to.repeat(negative_sample_rate, 1) + repeat_neg = embedding_from.repeat(negative_sample_rate, 1) + embedding_neg_from = repeat_neg[torch.randperm(repeat_neg.shape[0])] + distance_embedding = torch.cat(( + (embedding_to - embedding_from).norm(dim=1), + (embedding_neg_to - embedding_neg_from).norm(dim=1) + # ``to`` method in next line to avoid error `Expected all tensors to be on the same device` + ), dim=0).to(embedding_to.device) + + # convert probabilities to distances + probabilities_distance = convert_distance_to_probability( + distance_embedding, a, b + ) + # set true probabilities based on negative sampling + probabilities_graph = torch.cat( + (torch.ones(batch_size), torch.zeros(batch_size * negative_sample_rate)), dim=0, + # ``to`` method in next line to avoid error `Expected all tensors to be on the same device` + ).to(embedding_to.device) + + # compute cross entropy + (attraction_loss, repellant_loss, ce_loss) = compute_cross_entropy( + probabilities_graph, + probabilities_distance, + ) + loss = torch.mean(ce_loss) + return loss From d87cfadb169d96b0d25452a7187e3182216f9a23 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 07:57:32 -0400 Subject: [PATCH 019/184] Add src/vak/datasets/dimensionality_reduction/parametric_umap/ --- .../parametric_umap/__init__.py | 0 .../parametric_umap/parametric_umap.py | 194 ++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py create mode 100644 src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py new file mode 100644 index 000000000..c88f6ccf4 --- /dev/null +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py @@ -0,0 +1,194 @@ +import pathlib + +import numpy as np +import pandas as pd +from pynndescent import NNDescent +from sklearn.utils import check_random_state +from umap.umap_ import fuzzy_simplicial_set +from torch.utils.data import Dataset + + +def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state: int | None = None, max_candidates=60, verbose=True): + random_state = check_random_state(None) if random_state == None else random_state + + # number of trees in random projection forest + n_trees = 5 + int(round((X.shape[0]) ** 0.5 / 20.0)) + + # max number of nearest neighbor iters to perform + n_iters = max(5, int(round(np.log2(X.shape[0])))) + # distance metric + + # get nearest neighbors + nnd = NNDescent( + X.reshape((len(X), np.product(np.shape(X)[1:]))), + n_neighbors=n_neighbors, + metric=metric, + n_trees=n_trees, + n_iters=n_iters, + max_candidates=max_candidates, + verbose=verbose + ) + + # get indices and distances + knn_indices, knn_dists = nnd.neighbor_graph + + # build fuzzy_simplicial_set + umap_graph, sigmas, rhos = fuzzy_simplicial_set( + X=X, + n_neighbors=n_neighbors, + metric=metric, + random_state=random_state, + knn_indices=knn_indices, + knn_dists=knn_dists, + ) + + return umap_graph + + +def get_graph_elements(graph, n_epochs): + """Get graph elements for UMAP Dataset""" + + 
graph = graph.tocoo() + + # eliminate duplicate entries by summing them together + graph.sum_duplicates() + + # number of vertices in dataset + n_vertices = graph.shape[1] + + # get the number of epochs based on the size of the dataset + if n_epochs is None: + # For smaller datasets we can use more epochs + if graph.shape[0] <= 10000: + n_epochs = 500 + else: + n_epochs = 200 + + # remove elements with very low probability + graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0 + graph.eliminate_zeros() + + # get epochs per sample based upon edge probability + epochs_per_sample = n_epochs * graph.data + + head = graph.row + tail = graph.col + weight = graph.data + + return graph, epochs_per_sample, head, tail, weight, n_vertices + + +class UMAPDataset(Dataset): + def __init__(self, data, graph, n_epochs=200, transform=None): + graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) + + self.edges_to_exp, self.edges_from_exp = ( + np.repeat(head, epochs_per_sample.astype("int")), + np.repeat(tail, epochs_per_sample.astype("int")), + ) + shuffle_mask = np.random.permutation(np.arange(len(self.edges_to_exp))) + self.edges_to_exp = self.edges_to_exp[shuffle_mask].astype(np.int64) + self.edges_from_exp = self.edges_from_exp[shuffle_mask].astype(np.int64) + self.data = data + self.transform = transform + + def __len__(self): + return int(self.data.shape[0]) + + def __getitem__(self, index): + edges_to_exp = self.data[self.edges_to_exp[index]] + edges_from_exp = self.data[self.edges_from_exp[index]] + if self.transform: + edges_to_exp = self.transform(edges_to_exp) + edges_from_exp = self.transform(edges_from_exp) + return (edges_to_exp, edges_from_exp) + + @classmethod + def from_dataset_path(cls, + dataset_path, + split, + n_neighbors=10, + metric='Euclidean', + random_state=None, + n_epochs=200, + transform=None): + dataset_path = pathlib.Path(dataset_path) + metadata = vak.datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) + + dataset_csv_path = dataset_path / metadata.dataset_csv_filename + dataset_df = pd.read_csv(dataset_csv_path) + split_df = dataset_df[dataset_df.split == split] + + split_path = dataset_path / split + + data = np.stack( + [ + np.load(dataset_path / spect_path) for spect_path in split_df.spect_path.values + ] + ) + graph = get_umap_graph(data, n_neighbors=n_neighbors, metric=metric, random_state=random_state) + + return cls( + data, + graph, + n_epochs, + transform=transform, + ) + + +class UMAPDataset(Dataset): + def __init__(self, data, graph, n_epochs=200, transform=None): + graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) + + self.edges_to_exp, self.edges_from_exp = ( + np.repeat(head, epochs_per_sample.astype("int")), + np.repeat(tail, epochs_per_sample.astype("int")), + ) + shuffle_mask = np.random.permutation(np.arange(len(self.edges_to_exp))) + self.edges_to_exp = self.edges_to_exp[shuffle_mask].astype(np.int64) + self.edges_from_exp = self.edges_from_exp[shuffle_mask].astype(np.int64) + self.data = data + self.transform = transform + + def __len__(self): + return int(self.data.shape[0]) + + def __getitem__(self, index): + edges_to_exp = self.data[self.edges_to_exp[index]] + edges_from_exp = self.data[self.edges_from_exp[index]] + if self.transform: + edges_to_exp = self.transform(edges_to_exp) + edges_from_exp = self.transform(edges_from_exp) + return (edges_to_exp, edges_from_exp) + + @classmethod + def from_dataset_path(cls, + dataset_path, 
+ split, + n_neighbors=10, + metric='Euclidean', + random_state=None, + n_epochs=200, + transform=None): + import vak.datasets # import here just to make classmethod more explicit + + dataset_path = pathlib.Path(dataset_path) + metadata = vak.datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) + + dataset_csv_path = dataset_path / metadata.dataset_csv_filename + dataset_df = pd.read_csv(dataset_csv_path) + split_df = dataset_df[dataset_df.split == split] + + data = np.stack( + [ + np.load(dataset_path / spect_path) for spect_path in split_df.spect_path.values + ] + ) + graph = get_umap_graph(data, n_neighbors=n_neighbors, metric=metric, random_state=random_state) + + return cls( + data, + graph, + n_epochs, + transform=transform, + ) From 3fdd57333780d331c44a509a1a3809a38d9de20d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 08:20:57 -0400 Subject: [PATCH 020/184] Add vak/nets/conv_encoder.py --- src/vak/nets/conv_encoder.py | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/vak/nets/conv_encoder.py diff --git a/src/vak/nets/conv_encoder.py b/src/vak/nets/conv_encoder.py new file mode 100644 index 000000000..16527b0cd --- /dev/null +++ b/src/vak/nets/conv_encoder.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import torch +import torch.nn as nn + + +class ConvEncoder(nn.Module): + """Convolutional encoder, + used by Parametric UMAP model. + """ + def __init__(self, + input_shape: tuple[int], + conv1_filters: int = 64, + conv2_filters: int = 128, + conv_kernel_size: int = 3, + conv_stride: int = 2, + conv_padding: int = 1, + n_features_linear: int = 512, + n_components: int = 2): + """Initialize a ConvEncoder instance. + + Parameters + ---------- + input_shape : tuple + with 3 elements corresponding to dimensions of spectrogram: + (channels, frequency bins, time bins). + I.e., we assume input is a spectrogram and treat it like an image, + typically with one channel; the rows are frequency bins, + and the columns are time bins. + conv1_filters : int + Number of filters in first convolutional layer. Default is 64. + conv2_filters : int + Number of filters in second convolutional layer. Default is 128. + conv_kernel_size : tuple + Size of kernels, i.e. filters, in convolutional layers. Default is 3. + conv_padding : int + Amount of padding for convolutional layers. Default is 1. + n_components : int + Number of components of latent space that encoder maps to. Default is 2. + """ + super().__init__() + + if len(input_shape) != 3: + raise ValueError( + "Expected input_shape with length 3, (channels, height, width), " + f"but input shape was length {len(input_shape)}. 
" + f"Input shape was: {input_shape}" + ) + + self.num_input_channels = input_shape[0] + + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=self.num_input_channels, out_channels=conv1_filters, + kernel_size=conv_kernel_size, stride=conv_stride, padding=conv_padding, + ), + nn.Conv2d( + in_channels=conv1_filters, out_channels=conv2_filters, + kernel_size=conv_kernel_size, stride=conv_stride, padding=conv_padding, + ), + nn.Flatten() + ) + mock_input = torch.rand((1, *input_shape)) + mock_conv_out = self.conv(mock_input) + in_features = mock_conv_out.shape[-1] + + self.encoder = nn.Sequential( + nn.Linear(in_features, n_features_linear), + nn.ReLU(), + nn.Linear(n_features_linear, n_features_linear), + nn.ReLU(), + nn.Linear(n_features_linear, n_components) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + x = self.encoder(x) + return x From 44a50946f329fe2726079cab5573deb2246e9ede Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:15:24 -0400 Subject: [PATCH 021/184] Fix dataset class in vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py --- .../parametric_umap/parametric_umap.py | 60 +------------------ 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py index c88f6ccf4..5822aa825 100644 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py @@ -78,65 +78,7 @@ def get_graph_elements(graph, n_epochs): return graph, epochs_per_sample, head, tail, weight, n_vertices -class UMAPDataset(Dataset): - def __init__(self, data, graph, n_epochs=200, transform=None): - graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) - - self.edges_to_exp, self.edges_from_exp = ( - np.repeat(head, epochs_per_sample.astype("int")), - np.repeat(tail, epochs_per_sample.astype("int")), - ) - shuffle_mask = np.random.permutation(np.arange(len(self.edges_to_exp))) - self.edges_to_exp = self.edges_to_exp[shuffle_mask].astype(np.int64) - self.edges_from_exp = self.edges_from_exp[shuffle_mask].astype(np.int64) - self.data = data - self.transform = transform - - def __len__(self): - return int(self.data.shape[0]) - - def __getitem__(self, index): - edges_to_exp = self.data[self.edges_to_exp[index]] - edges_from_exp = self.data[self.edges_from_exp[index]] - if self.transform: - edges_to_exp = self.transform(edges_to_exp) - edges_from_exp = self.transform(edges_from_exp) - return (edges_to_exp, edges_from_exp) - - @classmethod - def from_dataset_path(cls, - dataset_path, - split, - n_neighbors=10, - metric='Euclidean', - random_state=None, - n_epochs=200, - transform=None): - dataset_path = pathlib.Path(dataset_path) - metadata = vak.datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) - - dataset_csv_path = dataset_path / metadata.dataset_csv_filename - dataset_df = pd.read_csv(dataset_csv_path) - split_df = dataset_df[dataset_df.split == split] - - split_path = dataset_path / split - - data = np.stack( - [ - np.load(dataset_path / spect_path) for spect_path in split_df.spect_path.values - ] - ) - graph = get_umap_graph(data, n_neighbors=n_neighbors, metric=metric, random_state=random_state) - - return cls( - data, - graph, - n_epochs, - transform=transform, - ) - - -class UMAPDataset(Dataset): +class 
ParametricUMAPDataset(Dataset): def __init__(self, data, graph, n_epochs=200, transform=None): graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) From e796a8d58dcb7506699b7ab56e29d336b9880162 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:15:47 -0400 Subject: [PATCH 022/184] Import ParametricUMAPDataset in vak.datasets --- src/vak/datasets/dimensionality_reduction/__init__.py | 2 ++ .../dimensionality_reduction/parametric_umap/__init__.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py index f82402171..3ab658efe 100644 --- a/src/vak/datasets/dimensionality_reduction/__init__.py +++ b/src/vak/datasets/dimensionality_reduction/__init__.py @@ -1,8 +1,10 @@ from .metadata import Metadata from .unit_dataset import UnitDataset +from .parametric_umap import ParametricUMAPDataset __all__ = [ "Metadata", + "ParametricUMAPDataset", "UnitDataset", ] diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py index e69de29bb..05a3dc5ca 100644 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py @@ -0,0 +1 @@ +from .parametric_umap import ParametricUMAPDataset \ No newline at end of file From 0b1de15ceb274621a9fa594c3cfe1962c3e6c5d0 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:16:02 -0400 Subject: [PATCH 023/184] Import umap_loss in src/vak/nn/loss/__init__.py --- src/vak/nn/loss/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/nn/loss/__init__.py b/src/vak/nn/loss/__init__.py index d0f8eae55..422a24ce1 100644 --- a/src/vak/nn/loss/__init__.py +++ b/src/vak/nn/loss/__init__.py @@ -1 +1,2 @@ from .dice import * +from .umap import umap_loss From f31864fcc3bf0b3ddb1766fb516e5cc84b4b959b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:16:11 -0400 Subject: [PATCH 024/184] Add src/vak/models/parametric_umap_model.py --- src/vak/models/parametric_umap_model.py | 122 ++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 src/vak/models/parametric_umap_model.py diff --git a/src/vak/models/parametric_umap_model.py b/src/vak/models/parametric_umap_model.py new file mode 100644 index 000000000..f16871abd --- /dev/null +++ b/src/vak/models/parametric_umap_model.py @@ -0,0 +1,122 @@ +"""Parametric UMAP model, as described in [1]_. + +Code adapted from implementation by @elyxlz +https://github.com/elyxlz/umap_pytorch +with changes made by Tim Sainburg: +https://github.com/lmcinnes/umap/issues/580#issuecomment-1368649550. +""" +from __future__ import annotations + +from typing import Callable, ClassVar, Type + +import torch +from torch.nn.functional import mse_loss + +from umap.umap_ import find_ab_params + +from . import base +from .definition import ModelDefinition +from .registry import model_family + + +@model_family +class ParametricUMAPModel(base.Model): + """Parametric UMAP model, as described in [1]_. + + Notes + ----- + Code adapted from implementation by @elyxlz + https://github.com/elyxlz/umap_pytorch + with changes made by Tim Sainburg: + https://github.com/lmcinnes/umap/issues/580#issuecomment-1368649550. + + References + ---------- + .. [1] Sainburg, T., McInnes, L., & Gentner, T. Q. (2021). 
+ Parametric UMAP embeddings for representation and semisupervised learning. + Neural Computation, 33(11), 2881-2907. + https://direct.mit.edu/neco/article/33/11/2881/107068. + """ + definition: ClassVar[ModelDefinition] + + def __init__( + self, + network: torch.nn.Module | dict[str: torch.nn.Module] | None = None, + loss: torch.nn.Module | Callable | None = None, + optimizer: torch.optim.Optimizer | None = None, + metrics: dict[str: Type] | None = None, + beta: float = 1.0, + min_dist: float = 0.1, + negative_sample_rate: int = 5, + ): + super().__init__(network=network, loss=loss, + optimizer=optimizer, metrics=metrics) + self.encoder = network['encoder'] + self.decoder = network['decoder'] + self.beta = beta # weight for reconstruction loss + self._a, self._b = find_ab_params(1.0, min_dist) + self.negative_sample_rate = negative_sample_rate + + def configure_optimizers(self): + return self.optimizer + + def training_step(self, batch, batch_idx): + (edges_to_exp, edges_from_exp) = batch + embedding_to, embedding_from = self.encoder(edges_to_exp), self.encoder(edges_from_exp) + encoder_loss = self.loss(embedding_to, embedding_from, self._a, self._b, + edges_to_exp.shape[0], negative_sample_rate=self.negative_sample_rate) + self.log("train_umap_loss", encoder_loss) + + if self.decoder is not None: + recon = self.decoder(embedding_to) + recon_loss = mse_loss(recon, edges_to_exp) + self.log("train_recon_loss", recon_loss) + return encoder_loss + self.beta * recon_loss + else: + return encoder_loss + + def validation_step(self, batch, batch_idx): + (edges_to_exp, edges_from_exp) = batch + embedding_to, embedding_from = self.encoder(edges_to_exp), self.encoder(edges_from_exp) + encoder_loss = self.loss(embedding_to, embedding_from, self._a, self._b, + edges_to_exp.shape[0], negative_sample_rate=self.negative_sample_rate) + self.log("val_umap_loss", encoder_loss, on_step=True) + + if self.decoder is not None: + recon = self.decoder(embedding_to) + recon_loss = mse_loss(recon, edges_to_exp) + self.log("val_recon_loss", recon_loss, on_step=True) + return encoder_loss + self.beta * recon_loss + else: + return encoder_loss + + @classmethod + def from_config(cls, + config: dict, + beta: float = 1.0, + min_dist: float = 0.1, + negative_sample_rate: int = 5, + ): + """Return an initialized model instance from a config ``dict`` + + Parameters + ---------- + config : dict + Returned by calling :func:`vak.config.models.map_from_path` + or :func:`vak.config.models.map_from_config_dict`. + + Returns + ------- + cls : vak.models.base.Model + An instance of the model with its attributes + initialized using parameters from ``config``. 
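+
+        Notes
+        -----
+        The keyword arguments ``beta``, ``min_dist``, and
+        ``negative_sample_rate`` are passed through to ``__init__``:
+        ``beta`` weights the reconstruction loss when a decoder is present,
+        ``min_dist`` is passed to :func:`umap.umap_.find_ab_params`
+        (with a spread of 1.0) to set the ``a`` and ``b`` parameters
+        used by the UMAP loss, and ``negative_sample_rate`` sets how many
+        negative samples are drawn per positive edge in each batch.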
+ """ + network, loss, optimizer, metrics = cls.attributes_from_config(config) + return cls(network=network, + optimizer=optimizer, + loss=loss, + metrics=metrics, + beta=beta, + min_dist=min_dist, + negative_sample_rate=negative_sample_rate, + ) From e9a425122b46530b93b0aae4e265f033a1c9b675 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:16:19 -0400 Subject: [PATCH 025/184] Add src/vak/models/convencoder_parametric_umap.py --- src/vak/models/convencoder_parametric_umap.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 src/vak/models/convencoder_parametric_umap.py diff --git a/src/vak/models/convencoder_parametric_umap.py b/src/vak/models/convencoder_parametric_umap.py new file mode 100644 index 000000000..e1c4aea47 --- /dev/null +++ b/src/vak/models/convencoder_parametric_umap.py @@ -0,0 +1,67 @@ +"""Parametric UMAP model, as described in [1]_, with a convolutional network as the encoder. + +Code adapted from implementation by @elyxlz +https://github.com/elyxlz/umap_pytorch +with changes made by Tim Sainburg: +https://github.com/lmcinnes/umap/issues/580#issuecomment-1368649550. +""" +from __future__ import annotations + +import torch + +from .. import ( + metrics, + nets, + nn, +) +from .parametric_umap_model import ParametricUMAPModel +from .decorator import model + + +@model(family=ParametricUMAPModel) +class ConvEncoderParametricUMAP: + """Parametric UMAP model, as described in [1]_, + with a convolutional network as the encoder. + + Attributes + ---------- + network : dict + A dict with two keys, 'encoder' and 'decoder'. + The 'encoder is vak.nets.ConvEncoder, + an encoder with convolutional layers. + The 'decoder' defaults to None. + loss: torch.nn.CrossEntropyLoss + Standard cross-entropy loss + optimizer: torch.optim.Adam + Adam optimizer. + metrics: dict + Mapping string names to the following metrics: + ``vak.metrics.Accuracy``, ``vak.metrics.Levenshtein``, + ``vak.metrics.SegmentErrorRate``, ``torch.nn.CrossEntropyLoss``. + + Notes + ----- + Code adapted from implementation by @elyxlz + https://github.com/elyxlz/umap_pytorch + with changes made by Tim Sainburg: + https://github.com/lmcinnes/umap/issues/580#issuecomment-1368649550. + + References + ---------- + .. [1] Sainburg, T., McInnes, L., & Gentner, T. Q. (2021). + Parametric UMAP embeddings for representation and semisupervised learning. + Neural Computation, 33(11), 2881-2907. + https://direct.mit.edu/neco/article/33/11/2881/107068. 
+ + """ + network = {'encoder': nets.ConvEncoder, 'decoder': None} + loss = nn.loss.umap_loss + optimizer = torch.optim.AdamW + metrics = {'acc': metrics.Accuracy, + 'levenshtein': metrics.Levenshtein, + 'segment_error_rate': metrics.SegmentErrorRate, + 'loss': torch.nn.CrossEntropyLoss} + default_config = { + 'optimizer': + {'lr': 1e-3}, + } From 7e81e14479cccc952357f88b330b720aa6e860e8 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:16:34 -0400 Subject: [PATCH 026/184] Import ParametricUMAPModel and ConvEncoderParametricUMAP in src/vak/models/__init__.py --- src/vak/models/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/vak/models/__init__.py b/src/vak/models/__init__.py index cdac1f848..18c89804b 100644 --- a/src/vak/models/__init__.py +++ b/src/vak/models/__init__.py @@ -4,21 +4,25 @@ definition, ) from .base import Model +from .convencoder_parametric_umap import ConvEncoderParametricUMAP from .get import get from .ed_tcn import ED_TCN from .teenytweetynet import TeenyTweetyNet from .tweetynet import TweetyNet from .frame_classification_model import FrameClassificationModel +from .parametric_umap_model import ParametricUMAPModel __all__ = [ "base", + "ConvEncoderParametricUMAP", "decorator", "definition", "ED_TCN", + "FrameClassificationModel", "get", "Model", + "ParametricUMAPModel", "TeenyTweetyNet", "TweetyNet", - "FrameClassificationModel" ] From 8b25e8e727f5062622fb34e3331a51d593a75d94 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:33:51 -0400 Subject: [PATCH 027/184] Import conv_encoder and ConvEncoder in vak/nets/__init__.py --- src/vak/nets/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vak/nets/__init__.py b/src/vak/nets/__init__.py index 7e3f3121a..ab0202af2 100644 --- a/src/vak/nets/__init__.py +++ b/src/vak/nets/__init__.py @@ -1,14 +1,19 @@ from . 
import ( + conv_encoder, ed_tcn, teenytweetynet, tweetynet, ) +from .conv_encoder import ConvEncoder from .ed_tcn import ED_TCN from .teenytweetynet import TeenyTweetyNet from .tweetynet import TweetyNet + __all__ = [ + 'conv_encoder', + 'ConvEncoder', 'ed_tcn', 'ED_TCN', 'teenytweetynet', From 0256d0dc81c11517c8ee3da636c8e2d3bc024141 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:54:38 -0400 Subject: [PATCH 028/184] Add shape property to ParametricUMAPDataset --- .../parametric_umap/parametric_umap.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py index 5822aa825..4b33c7e4c 100644 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py @@ -95,6 +95,12 @@ def __init__(self, data, graph, n_epochs=200, transform=None): def __len__(self): return int(self.data.shape[0]) + @property + def shape(self): + tmp_x_ind = 0 + tmp_item = self.__getitem__(tmp_x_ind) + return tmp_item[0].shape + def __getitem__(self, index): edges_to_exp = self.data[self.edges_to_exp[index]] edges_from_exp = self.data[self.edges_from_exp[index]] From 291cef8b0df7509acaac91572184a6081595efdb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:54:54 -0400 Subject: [PATCH 029/184] Add UmapLoss class to vak/nn/loss/umap.py --- src/vak/nn/loss/umap.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/vak/nn/loss/umap.py b/src/vak/nn/loss/umap.py index 750358938..5bcafe65f 100644 --- a/src/vak/nn/loss/umap.py +++ b/src/vak/nn/loss/umap.py @@ -66,3 +66,12 @@ def umap_loss(embedding_to, embedding_from, a, b, batch_size, negative_sample_ra ) loss = torch.mean(ce_loss) return loss + + +class UmapLoss(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, embedding_to, embedding_from, a, b, batch_size, negative_sample_rate): + return umap_loss(embedding_to, embedding_from, a, b, + batch_size, negative_sample_rate) From d8f17e7ebd8cf8aabcc8a025eb8a8c1276e83025 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:55:26 -0400 Subject: [PATCH 030/184] Import functional and import * from .loss and .modules in vak/nn/__init__.py --- src/vak/nn/__init__.py | 2 ++ src/vak/nn/loss/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vak/nn/__init__.py b/src/vak/nn/__init__.py index 4e309f46c..00f6ea247 100644 --- a/src/vak/nn/__init__.py +++ b/src/vak/nn/__init__.py @@ -1 +1,3 @@ from .loss import * +from .modules import * +from . 
import functional diff --git a/src/vak/nn/loss/__init__.py b/src/vak/nn/loss/__init__.py index 422a24ce1..5e6951100 100644 --- a/src/vak/nn/loss/__init__.py +++ b/src/vak/nn/loss/__init__.py @@ -1,2 +1,2 @@ from .dice import * -from .umap import umap_loss +from .umap import umap_loss, UmapLoss From 72438e83ddb81fa4ff7cbc5558a0448c344c928c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:55:45 -0400 Subject: [PATCH 031/184] Fix how we get decoder from network dict in ParametricUMAPModel: use get, default to None --- src/vak/models/parametric_umap_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/models/parametric_umap_model.py b/src/vak/models/parametric_umap_model.py index f16871abd..dbc29e365 100644 --- a/src/vak/models/parametric_umap_model.py +++ b/src/vak/models/parametric_umap_model.py @@ -52,7 +52,7 @@ def __init__( super().__init__(network=network, loss=loss, optimizer=optimizer, metrics=metrics) self.encoder = network['encoder'] - self.decoder = network['decoder'] + self.decoder = network.get('decoder', None) self.beta = beta # weight for reconstruction loss self._a, self._b = find_ab_params(1.0, min_dist) self.negative_sample_rate = negative_sample_rate From eeef543e095a08a5ea232dc86a415cf6e7ffbaee Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 10:56:16 -0400 Subject: [PATCH 032/184] Fix ConvEncoderParametricUMAP to only specify 'encoder' in network dict, and to use UmapLoss class for loss --- src/vak/models/convencoder_parametric_umap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/models/convencoder_parametric_umap.py b/src/vak/models/convencoder_parametric_umap.py index e1c4aea47..d331a834c 100644 --- a/src/vak/models/convencoder_parametric_umap.py +++ b/src/vak/models/convencoder_parametric_umap.py @@ -54,8 +54,8 @@ class ConvEncoderParametricUMAP: https://direct.mit.edu/neco/article/33/11/2881/107068. 
""" - network = {'encoder': nets.ConvEncoder, 'decoder': None} - loss = nn.loss.umap_loss + network = {'encoder': nets.ConvEncoder} + loss = nn.UmapLoss optimizer = torch.optim.AdamW metrics = {'acc': metrics.Accuracy, 'levenshtein': metrics.Levenshtein, From abb51e8287e0a9499a07bbde47d2d28329264062 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 15:58:05 -0400 Subject: [PATCH 033/184] Add umap-learn and pynndescent as dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 859f30adc..338f07096 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "pytorch-lightning >=1.8.4.post0, <2.0", "matplotlib >=3.3.3", "numpy >=1.18.1", + "pynndescent >=0.5.10", "scipy >=1.4.1", "SoundFile >=0.10.3", "pandas >=1.0.1", @@ -39,6 +40,7 @@ dependencies = [ "torch >=1.7.1, <2.0.0", "torchvision >=0.5.0", "tqdm >=4.42.1", + "umap-learn >=0.5.3", ] [project.optional-dependencies] From ff8e8ee73404b6e0a5bdff8bac6759929441c22d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:18:16 -0400 Subject: [PATCH 034/184] Remove out-dated parameter from docstring in train/frame_classification.py --- src/vak/train/frame_classification.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index f60c014e8..ca4f0157e 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -124,23 +124,6 @@ def train_frame_classification_model( Normalization is done by subtracting off the mean for each frequency bin of the training set and then dividing by the std for that frequency bin. This same normalization is then applied to validation + test data. - source_ids : numpy.ndarray - Parameter for WindowDataset. Represents the 'id' of any spectrogram, - i.e., the index into spect_paths that will let us load it. - Default is None. - source_inds : numpy.ndarray - Parameter for WindowDataset. Same length as source_ids - but values represent indices within each spectrogram. - Default is None. - window_inds : numpy.ndarray - Parameter for WindowDataset. - Indices of each window in the dataset. The value at x[0] - represents the start index of the first window; using that - value, we can index into source_ids to get the path - of the spectrogram file to load, and we can index into - source_inds to index into the spectrogram itself - and get the window. - Default is None. val_step : int Step on which to estimate accuracy using validation set. If val_step is n, then validation is carried out every time From 1d4d17a4502a0cc1bfc7c7d34d6005e5a2ed0bbb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:18:34 -0400 Subject: [PATCH 035/184] Add vak/train/parametric_umap.py --- src/vak/train/parametric_umap.py | 269 +++++++++++++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 src/vak/train/parametric_umap.py diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py new file mode 100644 index 000000000..f3e0cf143 --- /dev/null +++ b/src/vak/train/parametric_umap.py @@ -0,0 +1,269 @@ +"""Function that trains models in the Parametric UMAP family.""" +from __future__ import annotations + +import logging +import pathlib +import datetime + +import pandas as pd +import torch.utils.data + +from .. 
import ( + datasets, + models, + transforms, +) +from ..common import validators +from ..datasets.dimensionality_reduction import ParametricUMAPDataset +from ..common.device import get_default as get_default_device +from ..common.paths import generate_results_dir_name_as_path +from ..common.trainer import get_default_trainer + + +logger = logging.getLogger(__name__) + + +def get_split_dur(df: pd.DataFrame, split: str) -> float: + """Get duration of a split in a dataset from a pandas DataFrame representing the dataset.""" + return df[df["split"] == split]["duration"].sum() + + +def train_parametric_umap_model( + model_name: str, + model_config: dict, + dataset_path: str | pathlib.Path, + batch_size: int, + num_epochs: int, + num_workers: int, + checkpoint_path: str | pathlib.Path | None = None, + root_results_dir: str | pathlib.Path | None = None, + results_path: str | pathlib.Path | None = None, + shuffle: bool = True, + val_step: int | None = None, + ckpt_step: int | None = None, + patience: int | None = None, + device: str | None = None, + split: str = 'train', +) -> None: + """Train a model from the parametric UMAP family + and save results. + + Saves checkpoint files for model, + label map, and spectrogram scaler. + These are saved either in ``results_path`` + if specified, or a new directory + made inside ``root_results_dir``. + + Parameters + ---------- + model_name : str + Model name, must be one of vak.models.registry.MODEL_NAMES. + model_config : dict + Model configuration in a ``dict``, + as loaded from a .toml file, + and used by the model method ``from_config``. + dataset_path : str + Path to dataset, a directory generated by running ``vak prep``. + batch_size : int + number of samples per batch presented to models during training. + num_epochs : int + number of training epochs. One epoch = one iteration through the entire + training set. + num_workers : int + Number of processes to use for parallel loading of data. + Argument to torch.DataLoader. + dataset_csv_path + Path to csv file representing splits of dataset, + e.g., such a file generated by running ``vak prep``. + This parameter is used by :func:`vak.core.learncurve` to specify + different splits to use, when generating results for a learning curve. + If this argument is specified, the csv file must be inside the directory + ``dataset_path``. + checkpoint_path : str, pathlib.Path + path to a checkpoint file, + e.g., one generated by a previous run of ``vak.core.train``. + If specified, this checkpoint will be loaded into model. + Used when continuing training. + Default is None, in which case a new model is initialized. + root_results_dir : str, pathlib.Path + Root directory in which a new directory will be created + where results will be saved. + results_path : str, pathlib.Path + Directory where results will be saved. + If specified, this parameter overrides ``root_results_dir``. + spect_key : str + key for accessing spectrogram in files. Default is 's'. + timebins_key : str + key for accessing vector of time bins in files. Default is 't'. + device : str + Device on which to work with model + data. + Default is None. If None, then a device will be selected with vak.split.get_default. + That function defaults to 'cuda' if torch.cuda.is_available is True. + shuffle: bool + if True, shuffle training data before each epoch. Default is True. + val_step : int + Step on which to estimate accuracy using validation set. 
+ If val_step is n, then validation is carried out every time + the global step / n is a whole number, i.e., when val_step modulo the global step is 0. + Default is None, in which case no validation is done. + ckpt_step : int + Step on which to save to checkpoint file. + If ckpt_step is n, then a checkpoint is saved every time + the global step / n is a whole number, i.e., when ckpt_step modulo the global step is 0. + Default is None, in which case checkpoint is only saved at the last epoch. + patience : int + number of validation steps to wait without performance on the + validation set improving before stopping the training. + Default is None, in which case training only stops after the specified number of epochs. + split : str + Name of split from dataset found at ``dataset_path`` to use + when training model. Default is 'train'. This parameter is used by + `vak.learncurve.learncurve` to specify specific subsets of the + training set to use when training models for a learning curve. + """ + for path, path_name in zip( + (checkpoint_path,), + ('checkpoint_path',), + ): + if path is not None: + if not validators.is_a_file(path): + raise FileNotFoundError( + f"value for ``{path_name}`` not recognized as a file: {path}" + ) + + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f"`dataset_path` not found or not recognized as a directory: {dataset_path}" + ) + + logger.info( + f"Loading dataset from path: {dataset_path}", + ) + metadata = datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) + dataset_csv_path = dataset_path / metadata.dataset_csv_filename + dataset_df = pd.read_csv(dataset_csv_path) + # ---------------- pre-conditions ---------------------------------------------------------------------------------- + if val_step and not dataset_df["split"].str.contains("val").any(): + raise ValueError( + f"val_step set to {val_step} but dataset does not contain a validation set; " + f"please run `vak prep` with a config.toml file that specifies a duration for the validation set." 
+ ) + + # ---- set up directory to save output ----------------------------------------------------------------------------- + if results_path: + results_path = pathlib.Path(results_path).expanduser().resolve() + if not results_path.is_dir(): + raise NotADirectoryError( + f"results_path not recognized as a directory: {results_path}" + ) + else: + results_path = generate_results_dir_name_as_path(root_results_dir) + results_path.mkdir() + + # ---------------- load training data ----------------------------------------------------------------------------- + logger.info(f"using training dataset from {dataset_path}") + # below, if we're going to train network to predict unlabeled segments, then + # we need to include a class for those unlabeled segments in labelmap, + # the mapping from labelset provided by user to a set of consecutive + # integers that the network learns to predict + train_dur = get_split_dur(dataset_df, "train") + logger.info( + f"Total duration of training split from dataset (in s): {train_dur}", + ) + + transform = transforms.get_defaults("train") + + train_dataset = ParametricUMAPDataset.from_dataset_path( + dataset_path=dataset_path, + split=split, + transform=transform, + ) + logger.info( + f"Duration of WindowDataset used for training, in seconds: {train_dataset.duration}", + ) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, + shuffle=shuffle, + batch_size=batch_size, + num_workers=num_workers, + ) + + # ---------------- load validation set (if there is one) ----------------------------------------------------------- + if val_step: + transform = transforms.get_defaults( + "eval", + ) + val_dataset = ParametricUMAPDataset.from_dataset_path( + dataset_path=dataset_path, + split=split, + transform=transform, + ) + val_loader = torch.utils.data.DataLoader( + dataset=val_dataset, + shuffle=False, + # batch size 1 because each spectrogram reshaped into a batch of windows + batch_size=1, + num_workers=num_workers, + ) + val_dur = get_split_dur(dataset_df, "val") + logger.info( + f"Total duration of validation split from dataset (in s): {val_dur}", + ) + + logger.info( + f"will measure loss on validation set every {val_step} steps of training", + ) + else: + val_loader = None + + if device is None: + device = get_default_device() + + model = models.get( + model_name, + model_config, + input_shape=train_dataset.shape, + ) + + if checkpoint_path is not None: + logger.info( + f"loading checkpoint for {model_name} from path: {checkpoint_path}", + ) + model.load_state_dict_from_path(checkpoint_path) + + results_model_root = results_path.joinpath(model_name) + results_model_root.mkdir() + ckpt_root = results_model_root.joinpath("checkpoints") + ckpt_root.mkdir() + logger.info(f"training {model_name}") + max_steps = num_epochs * len(train_loader) + default_callback_kwargs = { + 'ckpt_root': ckpt_root, + 'ckpt_step': ckpt_step, + 'patience': patience, + } + trainer = get_default_trainer( + max_steps=max_steps, + log_save_dir=results_model_root, + val_step=val_step, + default_callback_kwargs=default_callback_kwargs, + device=device, + ) + train_time_start = datetime.datetime.now() + logger.info( + f"Training start time: {train_time_start.isoformat()}" + ) + trainer.fit( + model=model, + train_dataloaders=train_loader, + val_dataloaders=val_loader, + ) + train_time_stop = datetime.datetime.now() + logger.info( + f"Training stop time: {train_time_stop.isoformat()}" + ) + elapsed = train_time_stop - train_time_start + logger.info( + f"Elapsed training time: 
{elapsed}" + ) From 019d5a51f788846284b45204f34ffe59c157ceac Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:19:07 -0400 Subject: [PATCH 036/184] Fix vak/train/train.py to call train_parametric_umap_model when model family is ParametricUMAPModel --- src/vak/train/train.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 0223b5625..737481ba5 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -5,6 +5,7 @@ import pathlib from .frame_classification import train_frame_classification_model +from .parametric_umap import train_parametric_umap_model from .. import ( models, ) @@ -181,6 +182,23 @@ def train( device=device, split=split, ) + elif model_family == "ParametricUMAPModel": + train_parametric_umap_model( + model_name=model_name, + model_config=model_config, + dataset_path=dataset_path, + batch_size=batch_size, + num_epochs=num_epochs, + num_workers=num_workers, + checkpoint_path=checkpoint_path, + results_path=results_path, + shuffle=shuffle, + val_step=val_step, + ckpt_step=ckpt_step, + patience=patience, + device=device, + split=split, + ) else: raise ValueError( f"Model family not recognized: {model_family}" From 7d7c92259fb8333a123735dfb15738e6f2893096 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:30:34 -0400 Subject: [PATCH 037/184] Add 'dimensionality reduction' to DATASET_TYPE_FUNCTION_MAP in vak.prep.constants --- src/vak/prep/constants.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/vak/prep/constants.py b/src/vak/prep/constants.py index ed1e1a51d..06e78f2c1 100644 --- a/src/vak/prep/constants.py +++ b/src/vak/prep/constants.py @@ -2,7 +2,10 @@ Defined in a separate module to minimize circular imports. """ -from . import frame_classification +from . import ( + dimensionality_reduction, + frame_classification +) VALID_PURPOSES = frozenset( @@ -18,6 +21,7 @@ DATASET_TYPE_FUNCTION_MAP = { 'frame classification': frame_classification.prep_frame_classification_dataset, + 'dimensionality reduction': dimensionality_reduction.prep_dimensionality_reduction_dataset, } DATASET_TYPES = tuple(DATASET_TYPE_FUNCTION_MAP.keys()) From d5a8da54a15c89fb24814a0cc328a13d4ed1b8ea Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:31:11 -0400 Subject: [PATCH 038/184] Fix vak/prep/prep.py so it will call prep_dimensionality_reduction_dataset appropriately --- src/vak/prep/prep.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/vak/prep/prep.py b/src/vak/prep/prep.py index f8cb98f59..7556c5cab 100644 --- a/src/vak/prep/prep.py +++ b/src/vak/prep/prep.py @@ -4,6 +4,7 @@ from . import ( constants, ) +from .dimensionality_reduction import prep_dimensionality_reduction_dataset from .frame_classification import prep_frame_classification_dataset @@ -30,6 +31,7 @@ def prep( num_replicates: int | None = None, spect_key: str = "s", timebins_key: str = "t", + context_s: float = 0.015, ): """Prepare datasets for use with neural network models. 
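For reference, the DATASET_TYPE_FUNCTION_MAP added to src/vak/prep/constants.py above exposes the same routing that the next hunk adds to prep(); a minimal sketch of the lookup, assuming vak is installed with these changes:

    from vak.prep.constants import DATASET_TYPE_FUNCTION_MAP

    # Keys are dataset type names; values are the prep functions they dispatch to.
    prep_func = DATASET_TYPE_FUNCTION_MAP["dimensionality reduction"]
    # prep_func is vak.prep.dimensionality_reduction.prep_dimensionality_reduction_dataset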
@@ -211,6 +213,26 @@ def prep( timebins_key, ) return dataset_df, dataset_path + elif dataset_type == "dimensionality reduction": + dataset_df, dataset_path = prep_dimensionality_reduction_dataset( + data_dir, + purpose, + output_dir, + audio_format, + spect_params, + annot_format, + annot_file, + labelset, + context_s, + train_dur, + val_dur, + test_dur, + train_set_durs, + num_replicates, + spect_key=spect_key, + timebins_key=timebins_key, + ) + return dataset_df, dataset_path else: # this is in case a dataset type is written wrong # in the if-else statements above, we want to error loudly From 3f99fabb4a4e0f4a220199ea94f12688e54709e1 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:35:30 -0400 Subject: [PATCH 039/184] Fix parameter of parametric umap dataset: 'Euclidean' -> 'euclidean' --- .../dimensionality_reduction/parametric_umap/parametric_umap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py index 4b33c7e4c..efc89951d 100644 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py @@ -114,7 +114,7 @@ def from_dataset_path(cls, dataset_path, split, n_neighbors=10, - metric='Euclidean', + metric='euclidean', random_state=None, n_epochs=200, transform=None): From 2bacd21ca09e0b102439e0b6f6f219e55f9b348f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sat, 8 Jul 2023 16:50:29 -0400 Subject: [PATCH 040/184] Add duration property to ParametricUMAPDataset --- .../parametric_umap/parametric_umap.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py index efc89951d..c83456d40 100644 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py @@ -79,7 +79,7 @@ def get_graph_elements(graph, n_epochs): class ParametricUMAPDataset(Dataset): - def __init__(self, data, graph, n_epochs=200, transform=None): + def __init__(self, data, graph, dataset_df, n_epochs=200, transform=None): graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) self.edges_to_exp, self.edges_from_exp = ( @@ -89,9 +89,15 @@ def __init__(self, data, graph, n_epochs=200, transform=None): shuffle_mask = np.random.permutation(np.arange(len(self.edges_to_exp))) self.edges_to_exp = self.edges_to_exp[shuffle_mask].astype(np.int64) self.edges_from_exp = self.edges_from_exp[shuffle_mask].astype(np.int64) + self.data = data + self.dataset_df = dataset_df self.transform = transform + @property + def duration(self): + return self.dataset_df['duration'].sum() + def __len__(self): return int(self.data.shape[0]) @@ -137,6 +143,7 @@ def from_dataset_path(cls, return cls( data, graph, + split_df, n_epochs, transform=transform, ) From a7585c24974571c1ee6179a8644899db5c55adeb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:55:11 -0400 Subject: [PATCH 041/184] Rename vak/transforms/defaults.py -> frame_classification.py, add 'get_default_frame_classification_transforms' helper function --- .../frame_classification.py} | 65 +++++++------------ 1 file changed, 24 insertions(+), 41 deletions(-) 
rename src/vak/transforms/{defaults.py => defaults/frame_classification.py} (77%) diff --git a/src/vak/transforms/defaults.py b/src/vak/transforms/defaults/frame_classification.py similarity index 77% rename from src/vak/transforms/defaults.py rename to src/vak/transforms/defaults/frame_classification.py index 21335119d..fd614ad4d 100644 --- a/src/vak/transforms/defaults.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -1,6 +1,6 @@ -"""default item transforms used with the different command-line interface commands +"""Default transforms for frame classification models. -"item" transforms because they apply transforms to input parameters +These are "item" transforms because they apply transforms to input parameters and then return them in an "item" (dictionary) that is turn returned by the __getitem__ method of a vak.FramesDataset. Having the transform return a dictionary makes it possible to avoid @@ -8,13 +8,17 @@ needed for specific neural network models, e.g., whether the returned output includes a mask to crop off padding that was added. """ +from __future__ import annotations + +from typing import Callable + import torchvision.transforms -from . import transforms as vak_transforms +from .. import transforms as vak_transforms class TrainItemTransform: - """default transform used when training models""" + """Default transform used when training frame classification models""" def __init__( self, @@ -55,7 +59,7 @@ def __call__(self, source, annot, spect_path=None): class EvalItemTransform: - """default transform used when evaluating models + """Default transform used when evaluating frame classification models. Returned item includes "source" spectrogram reshaped into a stack of windows, with padded added to make reshaping possible, and annotation also padded and @@ -123,7 +127,8 @@ def __call__(self, frames, frame_labels, source_path=None): class PredictItemTransform: - """default transform used when using trained models to make predictions. + """Default transform used when using trained frame classification models + to make predictions. Returned item includes "source" spectrogram reshaped into a stack of windows, with padded added to make reshaping possible. @@ -185,42 +190,21 @@ def __call__(self, frames, source_path=None): return item -def get_defaults( - mode, - spect_standardizer=None, - window_size=None, - padval=0.0, - return_padding_mask=False, -): - """get default transforms +def get_default_frame_classification_transform( + mode: str, transform_kwargs: dict +) -> tuple[Callable, Callable] | Callable: + """Get default transform for frame classification model. Parameters ---------- mode : str - one of {'train', 'eval', 'predict'}. Determines set of transforms. - spect_standardizer : vak.transforms.StandardizeSpect - instance that has already been fit to dataset, using fit_df method. - Default is None, in which case no standardization transform is applied. - window_size : int - width of window in number of elements. Argument to PadToWindow transform. - padval : float - value to pad with. Added to end of array, the "right side" if 2-dimensional. - Argument to PadToWindow transform. Default is 0. - return_padding_mask : bool - if True, the dictionary returned by ItemTransform classes will include - a boolean vector to use for cropping back down to size before padding. - padding_mask has size equal to width of padded array, i.e. 
original size - plus padding at the end, and has values of 1 where - columns in padded are from the original array, - and values of 0 where columns were added for padding. + transform_kwargs : dict Returns ------- - transform, target_transform : callable - one or more vak transforms to be applied to inputs x and, during training, the target y. - If more than one transform, they are combined into an instance of torchvision.transforms.Compose. - Note that when mode is 'predict', the target transform is None. + """ + spect_standardizer = transform_kwargs.get('spect_standardizer', None) # regardless of mode, transform always starts with StandardizeSpect, if used if spect_standardizer is not None: if not isinstance(spect_standardizer, vak_transforms.StandardizeSpect): @@ -249,20 +233,19 @@ def get_defaults( elif mode == "predict": item_transform = PredictItemTransform( spect_standardizer=spect_standardizer, - window_size=window_size, - padval=padval, - return_padding_mask=return_padding_mask, + window_size=transform_kwargs['window_size'], + padval=transform_kwargs['padval'], + return_padding_mask=transform_kwargs['return_padding_mask'], ) return item_transform elif mode == "eval": item_transform = EvalItemTransform( spect_standardizer=spect_standardizer, - window_size=window_size, - padval=padval, - return_padding_mask=return_padding_mask, + window_size=transform_kwargs['window_size'], + padval=transform_kwargs['padval'], + return_padding_mask=transform_kwargs['return_padding_mask'], ) return item_transform - else: raise ValueError(f"invalid mode: {mode}") From 347503ef52993a8221634ae7d67c17444eb15110 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:55:29 -0400 Subject: [PATCH 042/184] Import registry in models/__init__.py, add to __all__ there --- src/vak/models/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vak/models/__init__.py b/src/vak/models/__init__.py index 18c89804b..cfbf1c3ec 100644 --- a/src/vak/models/__init__.py +++ b/src/vak/models/__init__.py @@ -2,6 +2,7 @@ base, decorator, definition, + registry, ) from .base import Model from .convencoder_parametric_umap import ConvEncoderParametricUMAP @@ -23,6 +24,7 @@ "get", "Model", "ParametricUMAPModel", + "registry", "TeenyTweetyNet", "TweetyNet", ] From 635f0f79109857e97a37c93748bee89b364cbc38 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:55:50 -0400 Subject: [PATCH 043/184] Add transforms/defaults/parametric_umap.py --- .../transforms/defaults/parametric_umap.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/vak/transforms/defaults/parametric_umap.py diff --git a/src/vak/transforms/defaults/parametric_umap.py b/src/vak/transforms/defaults/parametric_umap.py new file mode 100644 index 000000000..e6eef0ad5 --- /dev/null +++ b/src/vak/transforms/defaults/parametric_umap.py @@ -0,0 +1,28 @@ +"""Default transforms for Parametric UMAP models.""" +from __future__ import annotations + +from typing import Callable + +import torchvision.transforms + +from .. import transforms as vak_transforms + + +def get_default_parametric_umap_transform(transform_kwargs) -> Callable: + """Get default transform for frame classification model. 
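+    The returned transform is a :class:`torchvision.transforms.Compose` of
+    ``ToFloatTensor``, ``AddChannel``, and a :class:`torchvision.transforms.Resize`
+    built from ``transform_kwargs['resize']``, which resizes every input
+    spectrogram to a single fixed size for the Parametric UMAP encoder.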
+ + Parameters + ---------- + transform_kwargs : dict + + Returns + ------- + transform : Callable + """ + return torchvision.transforms.Compose( + [ + vak_transforms.ToFloatTensor(), + vak_transforms.AddChannel(), + torchvision.transforms.Resize(transform_kwargs['resize']) + ] + ) From 89d033ec3bd5f081c2a85bf271401cb04d9e15f6 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:58:03 -0400 Subject: [PATCH 044/184] Add valid transform_kwarg key-value pairs to docstring of get_default_frame_classification transforms --- .../transforms/defaults/frame_classification.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/vak/transforms/defaults/frame_classification.py b/src/vak/transforms/defaults/frame_classification.py index fd614ad4d..77b64b425 100644 --- a/src/vak/transforms/defaults/frame_classification.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -199,6 +199,22 @@ def get_default_frame_classification_transform( ---------- mode : str transform_kwargs : dict + A dict with the following key-value pairs: + spect_standardizer : vak.transforms.StandardizeSpect + instance that has already been fit to dataset, using fit_df method. + Default is None, in which case no standardization transform is applied. + window_size : int + width of window in number of elements. Argument to PadToWindow transform. + padval : float + value to pad with. Added to end of array, the "right side" if 2-dimensional. + Argument to PadToWindow transform. Default is 0. + return_padding_mask : bool + if True, the dictionary returned by ItemTransform classes will include + a boolean vector to use for cropping back down to size before padding. + padding_mask has size equal to width of padded array, i.e. original size + plus padding at the end, and has values of 1 where + columns in padded are from the original array, + and values of 0 where columns were added for padding. Returns ------- From 8c313142faed8cc94180b2f656da25681c5dee84 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:58:24 -0400 Subject: [PATCH 045/184] Add transforms/defaults/get.py --- src/vak/transforms/defaults/get.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/vak/transforms/defaults/get.py diff --git a/src/vak/transforms/defaults/get.py b/src/vak/transforms/defaults/get.py new file mode 100644 index 000000000..0e31b183f --- /dev/null +++ b/src/vak/transforms/defaults/get.py @@ -0,0 +1,47 @@ +"""Helper function that gets default transforms for a model.""" +from __future__ import annotations + +from . import ( + frame_classification, + parametric_umap, +) +from ... import models + + +def get_defaults( + model_name: str, + mode: str, + transform_kwargs: dict, +): + """Get default transforms for a model, + according to its family and what mode + the model is being used in. + + Parameters + ---------- + model_name : str + Name of model. + mode : str + one of {'train', 'eval', 'predict'}. Determines set of transforms. + + Returns + ------- + transform, target_transform : callable + one or more vak transforms to be applied to inputs x and, during training, the target y. + If more than one transform, they are combined into an instance of torchvision.transforms.Compose. + Note that when mode is 'predict', the target transform is None. 
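+
+    Examples
+    --------
+    An illustrative call for an evaluation transform (the model name and
+    keyword values below are examples, not recommended settings):
+
+    >>> item_transform = get_defaults(
+    ...     "TweetyNet",
+    ...     "eval",
+    ...     transform_kwargs=dict(
+    ...         spect_standardizer=None,
+    ...         window_size=176,
+    ...         padval=0.0,
+    ...         return_padding_mask=True,
+    ...     ),
+    ... )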
+ """ + try: + model_family = models.registry.MODEL_FAMILY_FROM_NAME[model_name] + except KeyError as e: + raise ValueError( + f"No model family found for the model name specified: {model_name}" + ) from e + + if model_family == "FrameClassificationModel": + return frame_classification.get_default_frame_classification_transform( + mode, transform_kwargs + ) + + elif model_family == "ParametricUMAPModel": + return parametric_umap.get_default_parametric_umap_transform(transform_kwargs) From 698a7fa8b9a6354e4edbf4bc0a3a194e2f0a2f3b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 12:59:49 -0400 Subject: [PATCH 046/184] Rename transforms.defaults.get.get_defaults -> get_default_transform --- src/vak/transforms/defaults/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/transforms/defaults/get.py b/src/vak/transforms/defaults/get.py index 0e31b183f..d86736e7e 100644 --- a/src/vak/transforms/defaults/get.py +++ b/src/vak/transforms/defaults/get.py @@ -8,7 +8,7 @@ from ... import models -def get_defaults( +def get_default_transform( model_name: str, mode: str, transform_kwargs: dict, From 44de24933d32a9a9c85aad320f6caf5fe0843eb0 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:02:41 -0400 Subject: [PATCH 047/184] Add transforms/defaults/__init__.py with imports --- src/vak/transforms/defaults/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 src/vak/transforms/defaults/__init__.py diff --git a/src/vak/transforms/defaults/__init__.py b/src/vak/transforms/defaults/__init__.py new file mode 100644 index 000000000..f5f585b69 --- /dev/null +++ b/src/vak/transforms/defaults/__init__.py @@ -0,0 +1,12 @@ +from . import ( + frame_classification, + parametric_umap, +) +from .get import get_default_transform + + +__all__ = [ + "get_default_transform", + "frame_classification", + "parametric_umap" +] From 1feb336f08d7e57c3b31001047f27adc74c94ae9 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:09:46 -0400 Subject: [PATCH 048/184] Fix train/frame_classification.py to use transforms.defaults.get_default_transform --- src/vak/train/frame_classification.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index ca4f0157e..a0ea7466f 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -235,7 +235,9 @@ def train_frame_classification_model( "will not standardize spectrograms", ) spect_standardizer = None - transform, target_transform = transforms.get_defaults("train", spect_standardizer) + transform, target_transform = transforms.defaults.get_default_transform( + "train", transform_kwargs={'spect_standardizer': spect_standardizer} + ) train_dataset = WindowDataset.from_dataset_path( dataset_path=dataset_path, @@ -256,11 +258,13 @@ def train_frame_classification_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: - item_transform = transforms.get_defaults( + item_transform = transforms.defaults.get_default_transform( "eval", - spect_standardizer, - window_size=window_size, - return_padding_mask=True, + transform_kwargs=dict( + spect_standardizer=spect_standardizer, + window_size=window_size, + return_padding_mask=True, + ) ) val_dataset = FramesDataset.from_dataset_path( dataset_path=dataset_path, From 9ea20daa594ba49e572c7b2ca967f4cd43236ff3 Mon Sep 17 
00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:11:57 -0400 Subject: [PATCH 049/184] Fix eval/frame_classification.py to use transforms.defaults.get_default_transform --- src/vak/eval/frame_classification.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/vak/eval/frame_classification.py b/src/vak/eval/frame_classification.py index cd2f6dda5..8db6a5a46 100644 --- a/src/vak/eval/frame_classification.py +++ b/src/vak/eval/frame_classification.py @@ -143,11 +143,13 @@ def eval_frame_classification_model( with labelmap_path.open("r") as f: labelmap = json.load(f) - item_transform = transforms.get_defaults( + item_transform = transforms.defaults.get_default_transform( "eval", - spect_standardizer, - window_size=window_size, - return_padding_mask=True, + transform_kwargs=dict( + spect_standardizer=spect_standardizer, + window_size=window_size, + return_padding_mask=True, + ) ) val_dataset = FramesDataset.from_dataset_path( From 5ce8e476ca00f40e65ddbf2de8c4c507f9b71cdd Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:13:08 -0400 Subject: [PATCH 050/184] Fix predict/frame_classification.py to use transforms.defaults.get_default_transform --- src/vak/predict/frame_classification.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index de501d2ce..9451161ed 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -149,11 +149,13 @@ def predict_with_frame_classification_model( logger.info(f"Not loading SpectScaler, no path was specified") spect_standardizer = None - item_transform = transforms.get_defaults( + item_transform = transforms.defaults.get_default_transform( "predict", - spect_standardizer, - window_size=window_size, - return_padding_mask=True, + transform_kwargs=dict( + spect_standardizer=spect_standardizer, + window_size=window_size, + return_padding_mask=True, + ) ) logger.info(f"loading labelmap from path: {labelmap_path}") From 6241e47621d2c4ed0551972f1c49eb8e621066f2 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:14:03 -0400 Subject: [PATCH 051/184] Fix imports in transforms/__init__.py, just import defaults not get_defaults from defaults --- src/vak/transforms/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vak/transforms/__init__.py b/src/vak/transforms/__init__.py index 2a49ed777..859f020b9 100644 --- a/src/vak/transforms/__init__.py +++ b/src/vak/transforms/__init__.py @@ -1,3 +1,5 @@ -from . import frame_labels -from .defaults import get_defaults +from . 
import ( + defaults, + frame_labels +) from .transforms import * From 8af11ec838b4cfd743c1825dfc1a060b5f566ce7 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:25:29 -0400 Subject: [PATCH 052/184] Fixup src/vak/eval/frame_classification.py --- src/vak/eval/frame_classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/eval/frame_classification.py b/src/vak/eval/frame_classification.py index 8db6a5a46..a2133d055 100644 --- a/src/vak/eval/frame_classification.py +++ b/src/vak/eval/frame_classification.py @@ -144,6 +144,7 @@ def eval_frame_classification_model( labelmap = json.load(f) item_transform = transforms.defaults.get_default_transform( + model_name, "eval", transform_kwargs=dict( spect_standardizer=spect_standardizer, From 23ed923de0070671ab3633aafb2ca878573353d8 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:25:35 -0400 Subject: [PATCH 053/184] Fixup src/vak/predict/frame_classification.py --- src/vak/predict/frame_classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index 9451161ed..20689b342 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -150,6 +150,7 @@ def predict_with_frame_classification_model( spect_standardizer = None item_transform = transforms.defaults.get_default_transform( + model_name, "predict", transform_kwargs=dict( spect_standardizer=spect_standardizer, From 3246c48edb117283eaa7cab7544b5d1e3d1a9e62 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:25:43 -0400 Subject: [PATCH 054/184] Fixup src/vak/train/frame_classification.py --- src/vak/train/frame_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index a0ea7466f..d62cfa1fa 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -236,7 +236,7 @@ def train_frame_classification_model( ) spect_standardizer = None transform, target_transform = transforms.defaults.get_default_transform( - "train", transform_kwargs={'spect_standardizer': spect_standardizer} + model_name, "train", transform_kwargs={'spect_standardizer': spect_standardizer} ) train_dataset = WindowDataset.from_dataset_path( @@ -259,6 +259,7 @@ def train_frame_classification_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: item_transform = transforms.defaults.get_default_transform( + model_name, "eval", transform_kwargs=dict( spect_standardizer=spect_standardizer, From b724eb5e777e7c1a029ce43e9312b6a5d4c09c41 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:26:40 -0400 Subject: [PATCH 055/184] Fix 'get_default_frame_classification_transform' to access transform_kwargs with get and have a default --- src/vak/transforms/defaults/frame_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/transforms/defaults/frame_classification.py b/src/vak/transforms/defaults/frame_classification.py index 77b64b425..b7cf13360 100644 --- a/src/vak/transforms/defaults/frame_classification.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -259,8 +259,8 @@ def get_default_frame_classification_transform( item_transform = EvalItemTransform( spect_standardizer=spect_standardizer, window_size=transform_kwargs['window_size'], - 
padval=transform_kwargs['padval'], - return_padding_mask=transform_kwargs['return_padding_mask'], + padval=transform_kwargs.get('padval', 0.0), + return_padding_mask=transform_kwargs.get('return_padding_mask', True), ) return item_transform else: From bb8f6495bff35bc53c92d420818bab4c80b65f0b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:29:48 -0400 Subject: [PATCH 056/184] Fix function name in 'make_learncurve_splits_from_dataset_df': 'split.dataframe' -> 'split.frame_classification_dataframe' --- src/vak/prep/frame_classification/learncurve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/prep/frame_classification/learncurve.py b/src/vak/prep/frame_classification/learncurve.py index c20fb67c9..1424642cb 100644 --- a/src/vak/prep/frame_classification/learncurve.py +++ b/src/vak/prep/frame_classification/learncurve.py @@ -32,7 +32,7 @@ def make_learncurve_splits_from_dataset_df( """Make splits for a learning curve from a dataframe representing the entire dataset. - Uses :func:`vak.prep.split.dataframe` to make + Uses :func:`vak.prep.split.frame_classification_dataframe` to make splits/subsets of the training data from ``dataset_df``, and then uses :func:`vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split` @@ -101,7 +101,7 @@ def make_learncurve_splits_from_dataset_df( train_dur, replicate_num ) - train_dur_replicate_df = split.dataframe( + train_dur_replicate_df = split.frame_classification_dataframe( # copy to avoid mutating original train_split_df train_split_df.copy(), dataset_path, train_dur=train_dur, labelset=labelset ) From f3c4932ee71c8873231b0789a986179b250cb619 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 9 Jul 2023 13:36:00 -0400 Subject: [PATCH 057/184] Remove argument 'spect_key' in cli/predict that was removed from vak.predict.predict --- src/vak/cli/predict.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/cli/predict.py b/src/vak/cli/predict.py index 5b51fee28..8ab363b2d 100644 --- a/src/vak/cli/predict.py +++ b/src/vak/cli/predict.py @@ -54,7 +54,6 @@ def predict(toml_path): labelmap_path=cfg.predict.labelmap_path, window_size=cfg.dataloader.window_size, num_workers=cfg.predict.num_workers, - spect_key=cfg.spect_params.spect_key, timebins_key=cfg.spect_params.timebins_key, spect_scaler_path=cfg.predict.spect_scaler_path, device=cfg.predict.device, From eee7896293feff208153437924e29dee768f9045 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 14:59:29 -0500 Subject: [PATCH 058/184] Refactor/fix script that generates the 'generated' test data so we are not always running the prep step, that takes a really long time, and so that we actually run eval/predict/train_continue configs - add a CLI with argparse - and add option to run either prep step, results step, or all - and add option to specify commands to run after prep - and add option to be require only one results directory per train config or to instead just use the most recent one --- tests/scripts/generate_data_for_tests.py | 266 +++++++++++++++-------- 1 file changed, 177 insertions(+), 89 deletions(-) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index 55a0c304c..2f1aa580a 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -1,21 +1,23 @@ -"""script run by Makefile test-data-generate command +"""This is the script run by nox session "test-data-generate". 
-makes all the 'generated' test data, i.e. files created by vak, +It makes all the 'generated' test data, i.e. files created by vak, It's called 'generated' test data to distinguish it from the -'source' test data, i.e., files **not** created by vak, that is, -the input data used when vak does create files (csv files, logs, -neural network checkpoints, etc.) - -This script generates: +'source' test data, i.e., files *not* created by vak: +the input data such as audio and annotation files that are used +when vak *does* create files (csv files, logs, +neural network checkpoints, etc.). +This script generates those files for use by unit and integration tests. +Specifically, it generates: * temporary config.toml files used when generating results * `prep`d (prepared) datasets, and results created with those datasets, both of which were generated using the temporary config.toml files -all the setup configs send output to one of two places: -for any prep command, the output goes to some child directory of ./tests/data_for_tests/generated/prep -for any command run with a `prep`d dataset, the output goes to some child dir of ./tests/data_for_tests/generated/results +All the setup configs send output to one of two places: +- for any prep command, the output goes to some child directory of ./tests/data_for_tests/generated/prep +- for any command run with a `prep`d dataset, the output goes to some child directory + of ./tests/data_for_tests/generated/results -examples: +Examples: when we run `vak prep tests/data_for_tests/generated/configs/test_train_audio_wav_annot_birdsongrec.toml` the `prep`d dataset will be in a new directory created in `./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec` @@ -51,6 +53,7 @@ The directories will have names with timestamps like `prep_20201015_1115`. Those are the generated directories we want to remove. """ +import argparse from pathlib import Path import shutil @@ -157,15 +160,23 @@ def run_prep(config_paths): vak.cli.prep.prep(toml_path=config_path) -def fix_options_in_configs(config_paths, command): - """fix values assigned to options in predict and eval configs +def fix_options_in_configs(config_paths, model, command, single_train_result=True): + """Fix values assigned to options in predict and eval configs. Need to do this because both predict and eval configs have options - that can only be assigned *after* running the corresponding `train` config + that can only be assigned *after* running the corresponding `train` config. 
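As an illustration of the filename matching this relies on (a sketch using config names from this test suite): an 'eval', 'predict', or 'train_continue' config is paired with the 'train' config that shares its model prefix and the suffix following the command name, e.g.:

    eval_config = "tweetynet_eval_audio_cbin_annot_notmat.toml"
    train_config = "tweetynet_train_audio_cbin_annot_notmat.toml"
    # both names reduce to the same suffix, '_audio_cbin_annot_notmat.toml',
    # so the eval config takes checkpoint_path, labelmap_path, and (optionally)
    # spect_scaler_path from this train config's results directory
    assert eval_config.split("eval")[-1] == train_config.split("train")[-1]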
""" - # split configs into train and predict or eval configs - configs_to_fix = [config for config in config_paths if command in config.name] - train_configs = [config for config in config_paths if "train" in config.name] + if command not in ('eval', 'predict', 'train_continue'): + raise ValueError( + f'invalid command to fix config options: {command}' + ) + configs_to_fix, train_configs = [], [] + # split configs into predict/eval/"train_continue" configs and other configs + for config_path in config_paths: + if command in config_path.name: + configs_to_fix.append(config_path) + elif 'train' in config_path.name and 'continue' not in config_path.name: + train_configs.append(config_path) for config_to_fix in configs_to_fix: # figure out which 'train' config corresponds to this 'predict' or 'eval' config @@ -174,12 +185,12 @@ def fix_options_in_configs(config_paths, command): train_config_to_use = [] for train_config in train_configs: train_prefix, train_suffix = train_config.name.split("train") - if train_suffix == suffix: + if train_prefix.startswith(model) and train_suffix == suffix: train_config_to_use.append(train_config) - if len(train_config_to_use) != 1: + if len(train_config_to_use) > 1: raise ValueError( - f"did not find just a single train config that matches with predict config:\n" - f"{config_to_fix}" + f"Did not find just a single train config that matches with '{command}' config:\n" + f"{config_to_fix}\n" f"Matches were: {train_config_to_use}" ) train_config_to_use = train_config_to_use[0] @@ -190,14 +201,26 @@ def fix_options_in_configs(config_paths, command): train_config_toml = toml.load(fp) root_results_dir = Path(train_config_toml["TRAIN"]["root_results_dir"]) results_dir = sorted(root_results_dir.glob("results_*")) - if len(results_dir) != 1: + if len(results_dir) > 1: + if single_train_result: + raise ValueError( + f"Did not find just a single results directory in root_results_dir from train_config:\n" + f"{train_config_to_use}" + f"root_results_dir was: {root_results_dir}" + f'Matches for "results_*" were: {results_dir}' + ) + else: + results_dir = results_dir[-1] + elif len(results_dir) == 1: + results_dir = results_dir[0] + else: raise ValueError( - f"did not find just a single results directory in root_results_dir from train_config:\n" + f"Did not find a results directory in root_results_dir from train_config:\n" f"{train_config_to_use}" - f"root_results_dir was: {root_results_dir}" - f'Matches for "results_*" were: {results_dir}' + f"root_results_dir was:\n{root_results_dir}" + f'Matches for "results_*" were:\n{results_dir}' ) - results_dir = results_dir[0] + # these are the only options whose values we need to change # and they are the same for both predict and eval checkpoint_path = sorted(results_dir.glob("**/checkpoints/checkpoint.pt"))[0] @@ -238,72 +261,137 @@ def fix_options_in_configs(config_paths, command): ) -def generate_test_data(): - print( - "copying config files run to generate test data from ./tests/data_for_tests/configs to " - "./tests/data_for_tests/generated/configs" - ) - GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) - config_paths = copy_config_files() +def generate_test_data( + step: str = 'all', + commands=COMMANDS, + single_train_result:bool = True, +): + """Main function that generates all the test data. + + Parameters + ---------- + step : str + The step of generating test data to carry out. + One of ('prep', 'results', or 'all'). 
+ This option is provided through the command-line + because generating all the data sets is time-consuming + and may not be needed every time test data is re-generated. + Running the 'prep' step will run just `vak prep` + for every configuration file. + Running the 'results' step assumes that the 'prep' step + has already been run, and it will run the appropriate command + for each configuration file to generate results with the prep'd + dataset, e.g. `vak train`. + Running 'both' will first run 'prep' and then 'results'--this is + the default. + commands : list + List of commands to run for 'results' step. + """ + # need to run `prep` before we run other commands + if step in ('prep', 'all'): + print( + "copying config files run to generate test data from ./tests/data_for_tests/configs to " + "./tests/data_for_tests/generated/configs" + ) + GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) + config_paths = copy_config_files() - print(f"will generate test data from these config files: {config_paths}") + print(f"will generate test data from these config files: {config_paths}") - print( - "making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" - ) - make_subdirs_in_generated(config_paths) + print( + "making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" + ) + make_subdirs_in_generated(config_paths) + + for model in MODELS: + config_paths_this_model = [ + config_path + for config_path in config_paths + if config_path.name.startswith(model) + ] + run_prep(config_paths=config_paths_this_model) + else: + config_paths = sorted(GENERATED_TEST_CONFIGS_ROOT.glob('*.toml')) + + if step in ('results', 'all'): + for model in MODELS: + for command in commands: + if command == "prep": + continue # we don't run prep in this code block + print(f"running configs for command: {command}") + + # print(f"using the following configs:\n{command_config_paths}") + if command == 'train' or command == 'learncurve': + command_config_paths = [ + config_path + for config_path in config_paths + if config_path.name.startswith(model) and command in config_path.name + ] + if command == "train": + # need to remove 'train_continue' configs + command_config_paths = [ + config_path for config_path in command_config_paths + if 'continue' not in config_path.name + ] + # we run `train` to get results needed for `eval', 'predict' and continuing 'train'; + # we run `learncurve` so there's a `previous_run_path` to test; + # skip all other commands + for config_path in command_config_paths: + vak.cli.cli.cli(command, config_path) + + elif command in ("predict", "eval", "train_continue"): + # Fix values for required options in predict / eval / train_continue configs + # using results from running the corresponding train configs. 
+ # this only works if we ran the train configs already, + # which we should have because of ordering of COMMANDS constant above + fix_options_in_configs(config_paths, model, command, single_train_result) + command_config_paths = [ + config_path + for config_path in config_paths + if config_path.name.startswith(model) and command in config_path.name + ] + for config_path in command_config_paths: + vak.cli.cli.cli(command, config_path) + + +GENERATE_TEST_DATA_STEPS = ( + 'prep', + 'results', + 'all', +) - # need to run `prep` before we run other commands - for model in MODELS: - config_paths_this_model = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) - ] - run_prep(config_paths=config_paths_this_model) - - for model in MODELS: - for command in COMMANDS: - if command == "prep": - continue # already ran 'prep' - print(f"running configs for command: {command}") - command_config_paths = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) and command in config_path.name - ] - if command == "train": - # need to remove 'train_continue' configs - command_config_paths = [ - config_path for config_path in command_config_paths - if 'continue' not in config_path.name - ] - elif command == "train_continue": - # only keep 'train_continue' configs - command_config_paths = [ - config_path for config_path in command_config_paths - if 'continue' in config_path.name - ] - print(f"using the following configs:\n{command_config_paths}") - if command in ("predict", "eval", "train_continue"): - # fix values for required options in predict / eval configs - # using results from running the corresponding train configs. - # this only works if we ran the train configs already, - # which we should have because of ordering of COMMANDS constant above - copied_config_paths_this_model = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) - ] - fix_options_in_configs(copied_config_paths_this_model, command) - - if command == 'train' or command == 'learncurve': - # we run `train` to get results needed for `eval', 'predict' and continuing 'train'; - # we run `learncurve` so there's a `previous_run_path` to test; - # skip all other commands - for config_path in command_config_paths: - vak.cli.cli.cli(command, config_path) + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--step', + choices=GENERATE_TEST_DATA_STEPS, + help=f"Which step of generating test data to perform, one of: {GENERATE_TEST_DATA_STEPS}", + default='all' + ) + parser.add_argument( + '--commands', + choices=('train', 'learncurve', 'eval', 'predict', 'train_continue'), + help=f"Space-separated list of commands to run for 'results' step", + nargs="+", + ) + parser.add_argument( + '--single-train-result', + action = argparse.BooleanOptionalAction, + help=( + "If --single-train-result, require there be a single results directory " + "from any training config when looking for them to use in toher configs. " + "If --no-single-train-result, allow multiple and use the most recent." 
+ ) + ) + return parser if __name__ == "__main__": - generate_test_data() + parser = get_parser() + args = parser.parse_args() + generate_test_data( + step=args.step, + commands=args.commands, + single_train_result=args.single_train_result, + ) From 1b99d8db145d86027be48eeae2e9559118fa53be Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 15:02:05 -0500 Subject: [PATCH 059/184] Fix name in predict/frame_classificaton.py: -> 'datasets.frame_classification.metadata.Metadata' --- src/vak/predict/frame_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index 20689b342..79929f05b 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -187,7 +187,7 @@ def predict_with_frame_classification_model( annot_csv_path = pathlib.Path(output_dir).joinpath(annot_csv_filename) logger.info(f"will save annotations in .csv file: {annot_csv_path}") - metadata = datasets.metadata.Metadata.from_dataset_path(dataset_path) + metadata = datasets.frame_classification.metadata.Metadata.from_dataset_path(dataset_path) frame_dur = metadata.frame_dur logger.info( f"Duration of a frame in dataset, in seconds: {frame_dur}", From fd2d5f4b9700182616c1ee215dc4e09d905dbe3c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 15:11:07 -0500 Subject: [PATCH 060/184] Fix frames dataset so frames_labels_paths is None when split is 'predict' --- src/vak/datasets/frame_classification/frames_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vak/datasets/frame_classification/frames_dataset.py b/src/vak/datasets/frame_classification/frames_dataset.py index 24b043970..49d6e8c64 100644 --- a/src/vak/datasets/frame_classification/frames_dataset.py +++ b/src/vak/datasets/frame_classification/frames_dataset.py @@ -44,7 +44,10 @@ def __init__( dataset_df = dataset_df[dataset_df.split == split].copy() self.dataset_df = dataset_df self.frames_paths = self.dataset_df[constants.FRAMES_NPY_PATH_COL_NAME].values - self.frame_labels_paths = self.dataset_df[constants.FRAME_LABELS_NPY_PATH_COL_NAME].values + if split != 'predict': + self.frame_labels_paths = self.dataset_df[constants.FRAME_LABELS_NPY_PATH_COL_NAME].values + else: + self.frame_labels_paths = None if input_type == 'audio': self.source_paths = self.dataset_df['audio_path'].values From 5a3b46e2b1496895039712a71919bdec5337bb39 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 15:12:26 -0500 Subject: [PATCH 061/184] Use dict get method with transform_kwargs for PredictItemTransform in transforms/defaults/frame_classification.py --- src/vak/transforms/defaults/frame_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/transforms/defaults/frame_classification.py b/src/vak/transforms/defaults/frame_classification.py index b7cf13360..40f508516 100644 --- a/src/vak/transforms/defaults/frame_classification.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -250,8 +250,8 @@ def get_default_frame_classification_transform( item_transform = PredictItemTransform( spect_standardizer=spect_standardizer, window_size=transform_kwargs['window_size'], - padval=transform_kwargs['padval'], - return_padding_mask=transform_kwargs['return_padding_mask'], + padval=transform_kwargs.get('padval', 0.0), + return_padding_mask=transform_kwargs.get('return_padding_mask', True), ) return item_transform From 
6d99d5551a8a30f062d3b9585abd6ad5dd9a9fd7 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 16:32:08 -0500 Subject: [PATCH 062/184] Fix how we add spect_format to metadata in prep/frame_classification/frame_classification.py --- src/vak/prep/frame_classification/frame_classification.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vak/prep/frame_classification/frame_classification.py b/src/vak/prep/frame_classification/frame_classification.py index 2ddaf0e39..a58c83d4f 100644 --- a/src/vak/prep/frame_classification/frame_classification.py +++ b/src/vak/prep/frame_classification/frame_classification.py @@ -387,6 +387,11 @@ def prep_frame_classification_dataset( # ---- save metadata ----------------------------------------------------------------------------------------------- frame_dur = validators.validate_and_get_frame_dur(dataset_df, input_type) + if input_type == 'spect' and spect_format != 'npz': + # then change to npz since we canonicalize data so it's always npz arrays + # We need this to be correct for other functions, e.g. predict when it loads spectrogram files + spect_format = 'npz' + metadata = datasets.frame_classification.Metadata( dataset_csv_filename=str(dataset_csv_path.name), frame_dur=frame_dur, From 22777ded4da8e1d7cced1874e585da4421dcf83d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 16:34:53 -0500 Subject: [PATCH 063/184] Fix how we determine source_paths for input_type 'spect' in FramesDataset --- src/vak/datasets/frame_classification/frames_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/frame_classification/frames_dataset.py b/src/vak/datasets/frame_classification/frames_dataset.py index 49d6e8c64..a58654c30 100644 --- a/src/vak/datasets/frame_classification/frames_dataset.py +++ b/src/vak/datasets/frame_classification/frames_dataset.py @@ -52,7 +52,7 @@ def __init__( if input_type == 'audio': self.source_paths = self.dataset_df['audio_path'].values elif input_type == 'spect': - self.source_paths = self.dataset_df['audio_path'].values + self.source_paths = self.dataset_df['spect_path'].values else: raise ValueError( f"Invalid `input_type`: {input_type}. Must be one of {{'audio', 'spect'}}." 
From b9319c012df0334199de94c3f1a10e7748442b1a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 16:35:53 -0500 Subject: [PATCH 064/184] Fix arg name in predict/frame_classification.py --- src/vak/predict/frame_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index 79929f05b..3276cda55 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -283,7 +283,7 @@ def predict_with_frame_classification_model( labels, onsets_s, offsets_s = transforms.frame_labels.to_segments( y_pred, labelmap=labelmap, - t=frame_times, + frame_times=frame_times, ) if labels is None and onsets_s is None and offsets_s is None: # handle the case when all time bins are predicted to be unlabeled From 2b64aa596bec3b4121dd1c92adc94946f3f96403 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 16 Jul 2023 16:37:33 -0500 Subject: [PATCH 065/184] Add print statements to tests/scripts/generate_data_for_tests.py so we can more easily troubleshoot --- tests/scripts/generate_data_for_tests.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index 2f1aa580a..b28f44273 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -337,6 +337,9 @@ def generate_test_data( # we run `learncurve` so there's a `previous_run_path` to test; # skip all other commands for config_path in command_config_paths: + print( + f"n\Running 'vak {command}' with model '{model}', using config: {config_path.name}" + ) vak.cli.cli.cli(command, config_path) elif command in ("predict", "eval", "train_continue"): @@ -351,6 +354,10 @@ def generate_test_data( if config_path.name.startswith(model) and command in config_path.name ] for config_path in command_config_paths: + for config_path in command_config_paths: + print( + f"\nRunning 'vak {command}' with model '{model}', using config: {config_path.name}" + ) vak.cli.cli.cli(command, config_path) From 0df0bf97611ed8d825502df40f3d3b297d278b64 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 09:54:52 -0500 Subject: [PATCH 066/184] Modify generate_data_for_tests script so it only preps datasets once --- tests/scripts/generate_data_for_tests.py | 79 ++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index b28f44273..53ed25086 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -56,10 +56,15 @@ import argparse from pathlib import Path import shutil +import warnings + +from numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) import toml import vak + HERE = Path(__file__).parent TEST_DATA_ROOT = HERE / ".." 
/ "data_for_tests" GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" @@ -75,8 +80,17 @@ # like 'invalid_option_config.toml` TEST_CONFIGS_ROOT = TEST_DATA_ROOT.joinpath("configs") CONFIGS_TO_RUN = [] -MODELS = ("teenytweetynet", "tweetynet") -for model in MODELS: + +MODELS_PREP = ("tweetynet") +MODELS_REUSE_PREP = { + "teenytweetynet": "tweetynet" +} + +MODELS_RESULTS = ( + "teenytweetynet", + "tweetynet" +) +for model in MODELS_RESULTS: CONFIGS_TO_RUN.extend(sorted(TEST_CONFIGS_ROOT.glob(f"{model}*.toml"))) # the sub-directories that will get made inside `./tests/data_for_tests/generated` @@ -122,7 +136,7 @@ def make_subdirs_in_generated(config_paths): ): continue # no need to make this dir - for model in MODELS: + for model in MODELS_RESULTS: subdir_to_make = ( GENERATED_TEST_DATA / top_level_dir / command_dir / data_dir / model ) @@ -155,11 +169,44 @@ def run_prep(config_paths): if not config_path.exists(): raise FileNotFoundError(f"{config_path} not found") print( - f"running vak prep to generate data for tests test, using config: {config_path.name}" + f"running vak prep to generate data for tests, using config: {config_path.name}" ) vak.cli.prep.prep(toml_path=config_path) +def add_dataset_path_from_prepped_configs(target_configs, target_model, source_configs, source_model): + for target_config_path in target_configs: + suffix_to_match = config_path.name.replace(target_model, '') # remove model name at start of config name + source_config_path = [ + source_config_path + for source_config_path in source_configs + if source_config_path.name.replace(source_model, '') == suffix_to_match + ] + source_config_path = source_config_path[0] + command = [ + command + for command in COMMANDS + if command in source_config_path.name + ][0] + if command == 'train_continue': + section = 'TRAIN' + else: + section = command.upper() + print( + f"Re-using prepped dataset from model '{source_model}' config:\n{source_config_path}\n" + f"Will use for model '{target_model}' config:\n{target_config_path}" + ) + + with source_config_path.open("r") as fp: + source_config_toml = toml.load(fp) + dataset_path = source_config_toml[section]['dataset_path'] + with target_config_path.open("r") as fp: + target_config_toml = toml.load(fp) + target_config_toml[section]['dataset_path'] = dataset_path + with target_config_path.open("w") as fp: + toml.dump(target_config_toml, fp) + + def fix_options_in_configs(config_paths, model, command, single_train_result=True): """Fix values assigned to options in predict and eval configs. @@ -303,18 +350,38 @@ def generate_test_data( ) make_subdirs_in_generated(config_paths) - for model in MODELS: + # run prep for some models + for model in MODELS_PREP: config_paths_this_model = [ config_path for config_path in config_paths if config_path.name.startswith(model) ] run_prep(config_paths=config_paths_this_model) + # re-use some of the prepped datasets for other models + # this makes time to prep all datasets shorter + for target_model, source_model in MODELS_REUSE_PREP.items(): + print( + f"Re-using prepped datasets from model '{source_model}' for model '{target_model}'." 
+ ) + config_paths_source_model = [ + config_path + for config_path in config_paths + if config_path.name.startswith(source_model_prep) + ] + config_paths_target_model = [ + config_path + for config_path in config_paths + if config_path.name.startswith(target_model_prep) + ] + add_dataset_path_from_prepped_configs(config_paths_target_model, target_model, + config_paths_source_model, source_model) + else: config_paths = sorted(GENERATED_TEST_CONFIGS_ROOT.glob('*.toml')) if step in ('results', 'all'): - for model in MODELS: + for model in MODELS_RESULTS: for command in commands: if command == "prep": continue # we don't run prep in this code block From e7b68753054e288e6d4a86134f67aa22e42adb72 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 11:41:22 -0500 Subject: [PATCH 067/184] Rename configs in test data so model name is capitalized --- ...tmat.toml => TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml} | 0 ...oml => TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml} | 0 ...t.toml => TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml} | 0 ...ml => TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml} | 0 ...mat.toml => TeenyTweetyNet_train_audio_cbin_annot_notmat.toml} | 0 ...toml => TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml} | 0 ...=> TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml} | 0 ...eenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml} | 0 ... => TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml} | 0 ...rden.toml => TeenyTweetyNet_train_spect_mat_annot_yarden.toml} | 0 ...ot_notmat.toml => TweetyNet_eval_audio_cbin_annot_notmat.toml} | 0 ...mat.toml => TweetyNet_learncurve_audio_cbin_annot_notmat.toml} | 0 ...notmat.toml => TweetyNet_predict_audio_cbin_annot_notmat.toml} | 0 ...ec.toml => TweetyNet_predict_audio_wav_annot_birdsongrec.toml} | 0 ...t_notmat.toml => TweetyNet_train_audio_cbin_annot_notmat.toml} | 0 ...grec.toml => TweetyNet_train_audio_wav_annot_birdsongrec.toml} | 0 ...toml => TweetyNet_train_continue_audio_cbin_annot_notmat.toml} | 0 ... 
=> TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml} | 0 ....toml => TweetyNet_train_continue_spect_mat_annot_yarden.toml} | 0 ...ot_yarden.toml => TweetyNet_train_spect_mat_annot_yarden.toml} | 0 20 files changed, 0 insertions(+), 0 deletions(-) rename tests/data_for_tests/configs/{teenytweetynet_eval_audio_cbin_annot_notmat.toml => TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_learncurve_audio_cbin_annot_notmat.toml => TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_predict_audio_cbin_annot_notmat.toml => TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_predict_audio_wav_annot_birdsongrec.toml => TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_audio_cbin_annot_notmat.toml => TeenyTweetyNet_train_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_audio_wav_annot_birdsongrec.toml => TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_continue_audio_cbin_annot_notmat.toml => TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_continue_audio_wav_annot_birdsongrec.toml => TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_continue_spect_mat_annot_yarden.toml => TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml} (100%) rename tests/data_for_tests/configs/{teenytweetynet_train_spect_mat_annot_yarden.toml => TeenyTweetyNet_train_spect_mat_annot_yarden.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_eval_audio_cbin_annot_notmat.toml => TweetyNet_eval_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_learncurve_audio_cbin_annot_notmat.toml => TweetyNet_learncurve_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_predict_audio_cbin_annot_notmat.toml => TweetyNet_predict_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_predict_audio_wav_annot_birdsongrec.toml => TweetyNet_predict_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_audio_cbin_annot_notmat.toml => TweetyNet_train_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_audio_wav_annot_birdsongrec.toml => TweetyNet_train_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_continue_audio_cbin_annot_notmat.toml => TweetyNet_train_continue_audio_cbin_annot_notmat.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_continue_audio_wav_annot_birdsongrec.toml => TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_continue_spect_mat_annot_yarden.toml => TweetyNet_train_continue_spect_mat_annot_yarden.toml} (100%) rename tests/data_for_tests/configs/{tweetynet_train_spect_mat_annot_yarden.toml => TweetyNet_train_spect_mat_annot_yarden.toml} (100%) diff --git a/tests/data_for_tests/configs/teenytweetynet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml similarity index 100% rename from 
tests/data_for_tests/configs/teenytweetynet_eval_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_learncurve_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_predict_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_predict_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_train_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_train_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_train_continue_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_train_continue_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml similarity index 100% rename from tests/data_for_tests/configs/teenytweetynet_train_continue_spect_mat_annot_yarden.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml diff --git a/tests/data_for_tests/configs/teenytweetynet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml similarity index 100% rename 
from tests/data_for_tests/configs/teenytweetynet_train_spect_mat_annot_yarden.toml rename to tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml diff --git a/tests/data_for_tests/configs/tweetynet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_eval_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/tweetynet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_learncurve_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/tweetynet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_predict_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/tweetynet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_predict_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/tweetynet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/tweetynet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/tweetynet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_continue_audio_cbin_annot_notmat.toml rename to tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/tweetynet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_continue_audio_wav_annot_birdsongrec.toml rename to tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml diff --git a/tests/data_for_tests/configs/tweetynet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_continue_spect_mat_annot_yarden.toml rename to tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml diff 
--git a/tests/data_for_tests/configs/tweetynet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml similarity index 100% rename from tests/data_for_tests/configs/tweetynet_train_spect_mat_annot_yarden.toml rename to tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml From 9ce324f303803793ca8aa690683e55fd40544af1 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 11:57:13 -0500 Subject: [PATCH 068/184] Update tests/data_for_tests/configs/configs.json - Capitalize model names - Add field "use_dataset_from_config" that points to another config from which the dataset path should be used. This avoids doing a bunch of filtering logic inside the script for generating test data in favor of a declarative approach --- tests/data_for_tests/configs/configs.json | 137 ++++++++++++---------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index 1986ed90c..07ef3f9ce 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -1,160 +1,179 @@ { "configs": [ { - "filename": "tweetynet_eval_audio_cbin_annot_notmat.toml", - "model": "tweetynet", + "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", + "model": "TweetyNet", "config_type": "eval", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": null }, { - "filename": "tweetynet_learncurve_audio_cbin_annot_notmat.toml", - "model": "tweetynet", + "filename": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml", + "model": "TweetyNet", "config_type": "learncurve", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": null }, { - "filename": "tweetynet_predict_audio_cbin_annot_notmat.toml", - "model": "tweetynet", + "filename": "TweetyNet_predict_audio_cbin_annot_notmat.toml", + "model": "TweetyNet", "config_type": "predict", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": null }, { - "filename": "tweetynet_predict_audio_wav_annot_birdsongrec.toml", - "model": "tweetynet", + "filename": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", + "model": "TweetyNet", "config_type": "predict", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": null }, { - "filename": "tweetynet_train_audio_cbin_annot_notmat.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "model": "TweetyNet", "config_type": "train", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": null }, { - "filename": "tweetynet_train_audio_wav_annot_birdsongrec.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", + "model": "TweetyNet", "config_type": "train", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": null }, { - "filename": "tweetynet_train_spect_mat_annot_yarden.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_spect_mat_annot_yarden.toml", + "model": "TweetyNet", "config_type": "train", "audio_format": 
null, "spect_format": "mat", - "annot_format": "yarden" + "annot_format": "yarden", + "use_dataset_from_config": null }, { - "filename": "tweetynet_train_continue_audio_cbin_annot_notmat.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_continue_audio_cbin_annot_notmat.toml", + "model": "TweetyNet", "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "tweetynet_train_continue_audio_wav_annot_birdsongrec.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", + "model": "TweetyNet", "config_type": "train_continue", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" }, { - "filename": "tweetynet_train_continue_spect_mat_annot_yarden.toml", - "model": "tweetynet", + "filename": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", + "model": "TweetyNet", "config_type": "train_continue", "audio_format": null, "spect_format": "mat", - "annot_format": "yarden" + "annot_format": "yarden", + "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { - "filename": "teenytweetynet_eval_audio_cbin_annot_notmat.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml", + "model": "TeenyTweetyNet", "config_type": "eval", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_eval_audio_cbin_annot_notmat.toml" }, { - "filename": "teenytweetynet_learncurve_audio_cbin_annot_notmat.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml", + "model": "TeenyTweetyNet", "config_type": "learncurve", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml" }, { - "filename": "teenytweetynet_predict_audio_cbin_annot_notmat.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml", + "model": "TeenyTweetyNet", "config_type": "predict", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_predict_audio_cbin_annot_notmat.toml", }, { - "filename": "teenytweetynet_predict_audio_wav_annot_birdsongrec.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml", + "model": "TeenyTweetyNet", "config_type": "predict", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml" }, { - "filename": "teenytweetynet_train_audio_cbin_annot_notmat.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml", + "model": "TeenyTweetyNet", "config_type": "train", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "teenytweetynet_train_audio_wav_annot_birdsongrec.toml", - "model": 
"teenytweetynet", + "filename": "TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml", + "model": "TeenyTweetyNet", "config_type": "train", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" }, { - "filename": "teenytweetynet_train_spect_mat_annot_yarden.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_train_spect_mat_annot_yarden.toml", + "model": "TeenyTweetyNet", "config_type": "train", "audio_format": null, "spect_format": "mat", - "annot_format": "yarden" + "annot_format": "yarden", + "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { - "filename": "teenytweetynet_train_continue_audio_cbin_annot_notmat.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml", + "model": "TeenyTweetyNet", "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, - "annot_format": "notmat" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", }, { - "filename": "teenytweetynet_train_continue_audio_wav_annot_birdsongrec.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", + "model": "TeenyTweetyNet", "config_type": "train_continue", "audio_format": "wav", "spect_format": null, - "annot_format": "birdsong-recognition-dataset" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" }, { - "filename": "teenytweetynet_train_continue_spect_mat_annot_yarden.toml", - "model": "teenytweetynet", + "filename": "TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml", + "model": "TeenyTweetyNet", "config_type": "train_continue", "audio_format": null, "spect_format": "mat", From 4ab7a33119d7ae69cf364afd05622c7bb5c5fe22 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:29:02 -0500 Subject: [PATCH 069/184] Add package tests/scripts/vaktestdata, refactor giant script --- tests/scripts/vaktestdata/__init__.py | 8 + tests/scripts/vaktestdata/config_metadata.py | 8 + tests/scripts/vaktestdata/configs.py | 151 +++++++++++++++++++ tests/scripts/vaktestdata/constants.py | 68 +++++++++ tests/scripts/vaktestdata/dirs.py | 30 ++++ tests/scripts/vaktestdata/parser.py | 30 ++++ tests/scripts/vaktestdata/prep.py | 15 ++ 7 files changed, 310 insertions(+) create mode 100644 tests/scripts/vaktestdata/__init__.py create mode 100644 tests/scripts/vaktestdata/config_metadata.py create mode 100644 tests/scripts/vaktestdata/configs.py create mode 100644 tests/scripts/vaktestdata/constants.py create mode 100644 tests/scripts/vaktestdata/dirs.py create mode 100644 tests/scripts/vaktestdata/parser.py create mode 100644 tests/scripts/vaktestdata/prep.py diff --git a/tests/scripts/vaktestdata/__init__.py b/tests/scripts/vaktestdata/__init__.py new file mode 100644 index 000000000..f12f6b06c --- /dev/null +++ b/tests/scripts/vaktestdata/__init__.py @@ -0,0 +1,8 @@ +from . 
import ( + config_metadata, + configs, + constants, + dirs, + parser, + prep, +) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py new file mode 100644 index 000000000..02a8eec7d --- /dev/null +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -0,0 +1,8 @@ +import attrs + + +@attrs.define +class ConfigMetadata: + config_path: str + audio_format: str + spect_format: str \ No newline at end of file diff --git a/tests/scripts/vaktestdata/configs.py b/tests/scripts/vaktestdata/configs.py new file mode 100644 index 000000000..b50bbdfc3 --- /dev/null +++ b/tests/scripts/vaktestdata/configs.py @@ -0,0 +1,151 @@ +"""Helper functions for moving and modifying configs""" +import shutil + +# TODO: use tomli +import toml + +from . import constants + + + +def copy_config_files(): + """copy config files from setup to data_for_tests/configs + + the copied files are the ones that get modified when this setup script runs, + while the originals in this directory remain unchanged. + """ + copied_configs = [] + + for toml_path in constants.CONFIGS_TO_RUN: + if not toml_path.exists(): + raise FileNotFoundError(f"{toml_path} not found") + + dst = constants.GENERATED_TEST_CONFIGS_ROOT.joinpath(toml_path.name) + print(f"\tcopying to {dst}") + shutil.copy(src=toml_path, dst=dst) + copied_configs.append(dst) + + return copied_configs + + +def add_dataset_path_from_prepped_configs(target_configs, target_model, source_configs, source_model): + for target_config_path in target_configs: + suffix_to_match = target_config_path.name.replace(target_model, '') # remove model name at start of config name + source_config_path = [ + source_config_path + for source_config_path in source_configs + if source_config_path.name.replace(source_model, '') == suffix_to_match + ] + source_config_path = source_config_path[0] + command = [ + command + for command in COMMANDS + if command in source_config_path.name + ][0] + if command == 'train_continue': + section = 'TRAIN' + else: + section = command.upper() + print( + f"Re-using prepped dataset from model '{source_model}' config:\n{source_config_path}\n" + f"Will use for model '{target_model}' config:\n{target_config_path}" + ) + + with source_config_path.open("r") as fp: + source_config_toml = toml.load(fp) + dataset_path = source_config_toml[section]['dataset_path'] + with target_config_path.open("r") as fp: + target_config_toml = toml.load(fp) + target_config_toml[section]['dataset_path'] = dataset_path + with target_config_path.open("w") as fp: + toml.dump(target_config_toml, fp) + + +def fix_options_in_configs(config_paths, model, command, single_train_result=True): + """Fix values assigned to options in predict and eval configs. + + Need to do this because both predict and eval configs have options + that can only be assigned *after* running the corresponding `train` config. + """ + if command not in ('eval', 'predict', 'train_continue'): + raise ValueError( + f'invalid command to fix config options: {command}' + ) + configs_to_fix, train_configs = [], [] + # split configs into predict/eval/"train_continue" configs and other configs + for config_path in config_paths: + if command in config_path.name: + configs_to_fix.append(config_path) + elif 'train' in config_path.name and 'continue' not in config_path.name: + train_configs.append(config_path) + + for config_to_fix in configs_to_fix: + # figure out which 'train' config corresponds to this 'predict' or 'eval' config + # by using 'suffix' of config file names. 
`train` suffix will match `predict`/'eval' suffix + prefix, suffix = config_to_fix.name.split(command) + train_config_to_use = [] + for train_config in train_configs: + train_prefix, train_suffix = train_config.name.split("train") + if train_prefix.startswith(model) and train_suffix == suffix: + train_config_to_use.append(train_config) + if len(train_config_to_use) > 1: + raise ValueError( + f"Did not find just a single train config that matches with '{command}' config:\n" + f"{config_to_fix}\n" + f"Matches were: {train_config_to_use}" + ) + train_config_to_use = train_config_to_use[0] + + # now use the config to find the results dir and get the values for the options we need to set + # which are checkpoint_path, spect_scaler_path, and labelmap_path + with train_config_to_use.open("r") as fp: + train_config_toml = toml.load(fp) + root_results_dir = Path(train_config_toml["TRAIN"]["root_results_dir"]) + results_dir = sorted(root_results_dir.glob("results_*")) + if len(results_dir) > 1: + if single_train_result: + raise ValueError( + f"Did not find just a single results directory in root_results_dir from train_config:\n" + f"{train_config_to_use}" + f"root_results_dir was: {root_results_dir}" + f'Matches for "results_*" were: {results_dir}' + ) + else: + results_dir = results_dir[-1] + elif len(results_dir) == 1: + results_dir = results_dir[0] + else: + raise ValueError( + f"Did not find a results directory in root_results_dir from train_config:\n" + f"{train_config_to_use}" + f"root_results_dir was:\n{root_results_dir}" + f'Matches for "results_*" were:\n{results_dir}' + ) + + # these are the only options whose values we need to change + # and they are the same for both predict and eval + checkpoint_path = sorted(results_dir.glob("**/checkpoints/checkpoint.pt"))[0] + if train_config_toml['TRAIN']['normalize_spectrograms']: + spect_scaler_path = sorted(results_dir.glob("StandardizeSpect"))[0] + else: + spect_scaler_path = None + labelmap_path = sorted(results_dir.glob("labelmap.json"))[0] + + # now add these values to corresponding options in predict / eval config + with config_to_fix.open("r") as fp: + config_toml = toml.load(fp) + if command == 'train_continue': + section = 'TRAIN' + else: + section = command.upper() + config_toml[section]["checkpoint_path"] = str(checkpoint_path) + if spect_scaler_path: + config_toml[section]["spect_scaler_path"] = str(spect_scaler_path) + else: + if 'spect_scaler_path' in config_toml[section]: + # remove any existing 'spect_scaler_path' option + del config_toml[section]["spect_scaler_path"] + if command != 'train_continue': # train always gets labelmap from dataset dir, not from a config option + config_toml[section]["labelmap_path"] = str(labelmap_path) + with config_to_fix.open("w") as fp: + toml.dump(config_toml, fp) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py new file mode 100644 index 000000000..15b66e308 --- /dev/null +++ b/tests/scripts/vaktestdata/constants.py @@ -0,0 +1,68 @@ +"""Constants used by vaktestdata and scripts that rely on it.""" +import pathlib + + +HERE = pathlib.Path(__file__).parent +TEST_DATA_ROOT = HERE / ".." / ".." 
/ "data_for_tests" +GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" +GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA / "configs" + +# convention is that all the config.toml files in tests/data_for_tests/configs +# that should be run when generating test data +# have filenames of the form `{MODEL}_{COMMAND}_audio_{FORMAT}_annot_{FORMAT}.toml' +# **or** `{MODEL}_{COMMAND}_spect_{FORMAT}_annot_{FORMAT}_config.ini' +# e.g., 'TweetyNet_learncurve_audio_cbin_annot_notmat.toml'. +# Below, we iterate over model names +# so glob doesn't pick up static configs that are just used for testing, +# like 'invalid_option_config.toml` +TEST_CONFIGS_ROOT = TEST_DATA_ROOT.joinpath("configs") +CONFIGS_TO_RUN = [] + +MODELS_PREP = ("TweetyNet",) +MODELS_REUSE_PREP = { + "TeenyTweetyNet": "TweetyNet" +} + +MODELS_RESULTS = ( + "TeenyTweetyNet", + "TweetyNet", +) +for model in MODELS_RESULTS: + CONFIGS_TO_RUN.extend(sorted(TEST_CONFIGS_ROOT.glob(f"{model}*.toml"))) + +# the sub-directories that will get made inside `./tests/data_for_tests/generated` +TOP_LEVEL_DIRS = [ + "prep", + "results", +] + +# these sub-dirs get made in each of the TOP_LEVEL_DIRS (except for 'configs') +COMMAND_DIRS = [ + "eval", + "learncurve", + "predict", + "train", +] + +# these sub-dirs get made in each of the COMMAND_DIRS (except for 'configs') +DATA_DIRS = [ + "audio_cbin_annot_notmat", + "audio_wav_annot_birdsongrec", + "spect_mat_annot_yarden", +] + +# need to run 'train' config before we run 'predict' +# so we can add checkpoints, etc., from training to predict +COMMANDS = ( + "train", + "learncurve", + "eval", + "predict", + "train_continue", +) + +GENERATE_TEST_DATA_STEPS = ( + 'prep', + 'results', + 'all', +) diff --git a/tests/scripts/vaktestdata/dirs.py b/tests/scripts/vaktestdata/dirs.py new file mode 100644 index 000000000..5c425a49c --- /dev/null +++ b/tests/scripts/vaktestdata/dirs.py @@ -0,0 +1,30 @@ +"""Helper functions for setting up directories.""" +from . import constants + + +def make_subdirs_in_generated(config_paths): + """make sub-directories inside ./tests/data_for_tests/generated + + do this after copying configs, + before using those configs to generate results. + We use configs to decide which dirs we need to make + + makes three directories in data_for_tests/generated: + configs, prep, and results. + prep has one sub-directory for every data "type". + results does also, but in addition will have sub-directories + within those for models. + """ + for top_level_dir in constants.TOP_LEVEL_DIRS: + for command_dir in constants.COMMAND_DIRS: + for data_dir in constants.DATA_DIRS: + if not any( + [f'{command_dir}_{data_dir}' in str(config_path) for config_path in config_paths] + ): + continue # no need to make this dir + + for model in constants.MODELS_RESULTS: + subdir_to_make = ( + constants.GENERATED_TEST_DATA / top_level_dir / command_dir / data_dir / model + ) + subdir_to_make.mkdir(parents=True) diff --git a/tests/scripts/vaktestdata/parser.py b/tests/scripts/vaktestdata/parser.py new file mode 100644 index 000000000..4e88e0dff --- /dev/null +++ b/tests/scripts/vaktestdata/parser.py @@ -0,0 +1,30 @@ +"""CLI parser for generate test data script""" +import argparse + +from . 
import constants + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--step', + choices=constants.GENERATE_TEST_DATA_STEPS, + help=f"Which step of generating test data to perform, one of: {constants.GENERATE_TEST_DATA_STEPS}", + default='all' + ) + parser.add_argument( + '--commands', + choices=('train', 'learncurve', 'eval', 'predict', 'train_continue'), + help=f"Space-separated list of commands to run for 'results' step", + nargs="+", + ) + parser.add_argument( + '--single-train-result', + action = argparse.BooleanOptionalAction, + help=( + "If --single-train-result, require there be a single results directory " + "from any training config when looking for them to use in other configs. " + "If --no-single-train-result, allow multiple and use the most recent." + ) + ) + return parser diff --git a/tests/scripts/vaktestdata/prep.py b/tests/scripts/vaktestdata/prep.py new file mode 100644 index 000000000..376e95e4b --- /dev/null +++ b/tests/scripts/vaktestdata/prep.py @@ -0,0 +1,15 @@ +# Do this here to suppress warnings before we import vak +import warnings +from numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) +import vak + +def run_prep(config_paths): + """run ``vak prep`` to generate data for testing""" + for config_path in config_paths: + if not config_path.exists(): + raise FileNotFoundError(f"{config_path} not found") + print( + f"\nRunning vak prep to generate data for tests, using config:\n{config_path.name}" + ) + vak.cli.prep.prep(toml_path=config_path) From 25adc7ccb4783749f0edc904918dfd6cb5b434d8 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:29:40 -0500 Subject: [PATCH 070/184] In configs used for test data, capitalize model names in directory paths --- .../TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml | 4 ++-- ...nyTweetyNet_learncurve_audio_cbin_annot_notmat.toml | 4 ++-- ...TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml | 4 ++-- ...yTweetyNet_predict_audio_wav_annot_birdsongrec.toml | 4 ++-- .../TeenyTweetyNet_train_audio_cbin_annot_notmat.toml | 4 ++-- ...enyTweetyNet_train_audio_wav_annot_birdsongrec.toml | 4 ++-- ...eetyNet_train_continue_audio_cbin_annot_notmat.toml | 8 ++++---- ...Net_train_continue_audio_wav_annot_birdsongrec.toml | 8 ++++---- ...weetyNet_train_continue_spect_mat_annot_yarden.toml | 6 +++--- .../TeenyTweetyNet_train_spect_mat_annot_yarden.toml | 4 ++-- .../TweetyNet_eval_audio_cbin_annot_notmat.toml | 10 +++++----- .../TweetyNet_learncurve_audio_cbin_annot_notmat.toml | 4 ++-- .../TweetyNet_predict_audio_cbin_annot_notmat.toml | 8 ++++---- .../TweetyNet_predict_audio_wav_annot_birdsongrec.toml | 8 ++++---- .../TweetyNet_train_audio_cbin_annot_notmat.toml | 4 ++-- .../TweetyNet_train_audio_wav_annot_birdsongrec.toml | 4 ++-- ...eetyNet_train_continue_audio_cbin_annot_notmat.toml | 8 ++++---- ...Net_train_continue_audio_wav_annot_birdsongrec.toml | 8 ++++---- ...weetyNet_train_continue_spect_mat_annot_yarden.toml | 6 +++--- .../TweetyNet_train_spect_mat_annot_yarden.toml | 4 ++-- 20 files changed, 57 insertions(+), 57 deletions(-) diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml index 1f78e7d84..aba64b569 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ 
dataset_type = "frame classification" input_type = "spect" labelset = "iabcdefghjk" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" -output_dir = "./tests/data_for_tests/generated/prep/eval/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/eval/audio_cbin_annot_notmat/TeenyTweetyNet" audio_format = "cbin" annot_format = "notmat" @@ -25,7 +25,7 @@ batch_size = 4 num_workers = 2 device = "cuda" spect_scaler_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" -output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/TeenyTweetyNet" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml index 3fc6826c9..56204818d 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/TeenyTweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -32,7 +32,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/teenytweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/TeenyTweetyNet" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml index 6e48cf251..e93951be6 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/TeenyTweetyNet" audio_format = "cbin" [SPECT_PARAMS] @@ -23,7 +23,7 @@ model = "TeenyTweetyNet" batch_size = 4 num_workers = 2 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/TeenyTweetyNet" annot_csv_filename = "bl26lb16.041912.annot.csv" [TeenyTweetyNet.optimizer] diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml index 0172e305b..248eb22b0 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ 
b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/TeenyTweetyNet" audio_format = "wav" [SPECT_PARAMS] @@ -23,7 +23,7 @@ model = "TeenyTweetyNet" batch_size = 4 num_workers = 2 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TeenyTweetyNet" annot_csv_filename = "Bird0.annot.csv" [TeenyTweetyNet.optimizer] diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml index dc0f34685..d7326cb8c 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TeenyTweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -30,7 +30,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/teenytweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TeenyTweetyNet" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml index d3518874e..4281ae76b 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -31,7 +31,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/teenytweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml index 5b0874168..c66287e40 100644 --- 
a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TeenyTweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -30,9 +30,9 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/teenytweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TeenyTweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 8804385c2..26dcb9856 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -31,9 +31,9 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/teenytweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git 
a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml index 323e9d679..4f70f0bad 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/spect" -output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/TeenyTweetyNet" spect_format = "mat" annot_format = "yarden" annot_file = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/llb3_annot_subset.mat" @@ -30,8 +30,8 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/teenytweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TeenyTweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml index 4fbd981c7..4b4d36204 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/spect" -output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/teenytweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/TeenyTweetyNet" spect_format = "mat" annot_format = "yarden" annot_file = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/llb3_annot_subset.mat" @@ -30,7 +30,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/teenytweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TeenyTweetyNet" [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml index c70498192..e174115f5 100644 --- a/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "iabcdefghjk" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" -output_dir = "./tests/data_for_tests/generated/prep/eval/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/eval/audio_cbin_annot_notmat/TweetyNet" audio_format = "cbin" annot_format = "notmat" @@ -18,14 +18,14 @@ transform_type = 
"log_spect" window_size = 88 [EVAL] -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -labelmap_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/labelmap.json" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/labelmap.json" model = "TweetyNet" batch_size = 11 num_workers = 4 device = "cuda" -spect_scaler_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" -output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/tweetynet" +spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/TweetyNet" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml index 98cefd569..c0ab2e446 100644 --- a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/TweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -32,7 +32,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/tweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/TweetyNet" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml index c770ed8b7..52a3dfe50 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/TweetyNet" audio_format = "cbin" [SPECT_PARAMS] @@ -17,13 +17,13 @@ window_size = 88 [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -labelmap_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" +checkpoint_path = 
"~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" model = "TweetyNet" batch_size = 11 num_workers = 4 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/TweetyNet" annot_csv_filename = "bl26lb16.041912.annot.csv" [TweetyNet.optimizer] diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml index c5c8df947..17464a7c1 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/TweetyNet" audio_format = "wav" [SPECT_PARAMS] @@ -17,13 +17,13 @@ window_size = 88 [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -labelmap_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" model = "TweetyNet" batch_size = 11 num_workers = 4 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/tweetynet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TweetyNet" annot_csv_filename = "Bird0.annot.csv" [TweetyNet.optimizer] diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml index 7e23a1c49..539cfc3d4 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -30,7 +30,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/tweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TweetyNet" [TweetyNet.optimizer] lr = 0.001 diff --git 
a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml index 48575c5a5..7c315ffca 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -31,7 +31,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/tweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TweetyNet" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml index ad8d34c6a..c5c1ad9d6 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TweetyNet" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" @@ -30,9 +30,9 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/tweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 0cefb9400..39d08cc18 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = 
"./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -31,9 +31,9 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/tweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml index fa5bb1d91..8231e1400 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/spect" -output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/TweetyNet" spect_format = "mat" annot_format = "yarden" annot_file = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/llb3_annot_subset.mat" @@ -30,8 +30,8 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/tweetynet" -checkpoint_path = "~/Documents/repos/coding/birdsong/tweetynet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TweetyNet" +checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml index 2f316900b..02997d808 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/spect" -output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/tweetynet" +output_dir = "./tests/data_for_tests/generated/prep/train/spect_mat_annot_yarden/TweetyNet" spect_format = "mat" annot_format = "yarden" annot_file = 
"./tests/data_for_tests/source/spect_mat_annot_yarden/llb3/llb3_annot_subset.mat" @@ -30,7 +30,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/tweetynet" +root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TweetyNet" [TweetyNet.optimizer] lr = 0.001 From c2fcfaa383fa72fba3352cb6fe47c33abe8b4737 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:48:26 -0500 Subject: [PATCH 071/184] Rename top-level field -> 'config_metadata' in metadata.json, add missing field 'use_dataset_from_config' for one entry --- tests/data_for_tests/configs/configs.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index 07ef3f9ce..f1fc90fcb 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -1,5 +1,5 @@ { - "configs": [ + "config_metadata": [ { "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", "model": "TweetyNet", @@ -177,7 +177,8 @@ "config_type": "train_continue", "audio_format": null, "spect_format": "mat", - "annot_format": "yarden" + "annot_format": "yarden", + "use_dataset_from_config": "TweetyNet_train_continue_spect_mat_annot_yarden.toml" } ] } \ No newline at end of file From af6e7d3b89fab7cd666f6d4b33f7888f331339f8 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:48:50 -0500 Subject: [PATCH 072/184] Add ConfigMetadata dataclass in vaktestdata/config_metadata.py --- tests/scripts/vaktestdata/config_metadata.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py index 02a8eec7d..e6aeac4fb 100644 --- a/tests/scripts/vaktestdata/config_metadata.py +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -3,6 +3,9 @@ @attrs.define class ConfigMetadata: - config_path: str - audio_format: str - spect_format: str \ No newline at end of file + filename: str = attrs.field() + config_type: str = attrs.field() + audio_format: str = attrs.field() + spect_format: str = attrs.field() + annot_format: str = attrs.field() + use_dataset_from_config = attrs.field(default=None) From 6a81273541c3b7ccf50c8f7461fd945bbf18f3ab Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:49:04 -0500 Subject: [PATCH 073/184] Add log message to tests/scripts/vaktestdata/dirs.py --- tests/scripts/vaktestdata/dirs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/scripts/vaktestdata/dirs.py b/tests/scripts/vaktestdata/dirs.py index 5c425a49c..7f5ad4e73 100644 --- a/tests/scripts/vaktestdata/dirs.py +++ b/tests/scripts/vaktestdata/dirs.py @@ -15,6 +15,10 @@ def make_subdirs_in_generated(config_paths): results does also, but in addition will have sub-directories within those for models. 
""" + print( + "Making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" + ) + for top_level_dir in constants.TOP_LEVEL_DIRS: for command_dir in constants.COMMAND_DIRS: for data_dir in constants.DATA_DIRS: From 16e6aa4bb73b866a5eef6f3ee3b1f7248df6639c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:49:52 -0500 Subject: [PATCH 074/184] Modify vaktestdata.configs.copy_config_files to make GENERATED_TEST_CONFIGS_ROOT and to print a log message --- tests/scripts/vaktestdata/configs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/scripts/vaktestdata/configs.py b/tests/scripts/vaktestdata/configs.py index b50bbdfc3..3d7e9a660 100644 --- a/tests/scripts/vaktestdata/configs.py +++ b/tests/scripts/vaktestdata/configs.py @@ -14,6 +14,13 @@ def copy_config_files(): the copied files are the ones that get modified when this setup script runs, while the originals in this directory remain unchanged. """ + print( + "Copying config files run to generate test data from ./tests/data_for_tests/configs to " + "./tests/data_for_tests/generated/configs" + ) + + constants.GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) + copied_configs = [] for toml_path in constants.CONFIGS_TO_RUN: From 12c9c9d3ce025a81f841389debd7281b967ea6b7 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:51:45 -0500 Subject: [PATCH 075/184] Modify constants so it has a list of ConfigMetadata instances made from configs.json --- tests/scripts/vaktestdata/constants.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py index 15b66e308..b5c7dabf3 100644 --- a/tests/scripts/vaktestdata/constants.py +++ b/tests/scripts/vaktestdata/constants.py @@ -1,9 +1,18 @@ """Constants used by vaktestdata and scripts that rely on it.""" +import json import pathlib +from .config_metadata import ConfigMetadata HERE = pathlib.Path(__file__).parent TEST_DATA_ROOT = HERE / ".." / ".." / "data_for_tests" +CONFIG_METADATA_JSON_PATH = TEST_DATA_ROOT / "configs" / "configs.json" +with CONFIG_METADATA_JSON_PATH.open('r') as fp: + CONFIG_METADATA_LIST = json.load(fp)['config_metadata'] +CONFIG_METADATA = [ + ConfigMetadata(**config_metadata_dict) + for config_metadata_dict in CONFIG_METADATA_LIST +] GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA / "configs" From 32330058d616d4ebcbd422e68da4863c32b49bd4 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:54:24 -0500 Subject: [PATCH 076/184] Modify vaktestdata.prep.run_prep to use CONFIG_METADATA to only run prep for configs that have a null 'use_dataset_from_config' field --- tests/scripts/vaktestdata/prep.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/scripts/vaktestdata/prep.py b/tests/scripts/vaktestdata/prep.py index 376e95e4b..7b58a74cf 100644 --- a/tests/scripts/vaktestdata/prep.py +++ b/tests/scripts/vaktestdata/prep.py @@ -4,9 +4,24 @@ warnings.simplefilter('ignore', category=NumbaDeprecationWarning) import vak -def run_prep(config_paths): - """run ``vak prep`` to generate data for testing""" - for config_path in config_paths: +from . import constants + +def run_prep(): + """Run ``vak prep`` to prepare datasets used with tests. + + This function runs ``prep`` for **only** the configuration files + in configs.json that have ``null`` for the field in their metadata + ``'use_dataset_from_config'``. 
The ``null`` indicates that these + configs do **not** re-use a dataset prepared from another config. + """ + configs_to_prep = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.use_dataset_from_config is None + ] + + for config_metadata in configs_to_prep: + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename if not config_path.exists(): raise FileNotFoundError(f"{config_path} not found") print( From 35be4213df012c9385091694f5e2d8d5ec495e18 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:57:18 -0500 Subject: [PATCH 077/184] Fix formatting errors in tests/data_for_tests/configs/configs.json --- tests/data_for_tests/configs/configs.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index f1fc90fcb..c42c3bcae 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -115,7 +115,7 @@ "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_predict_audio_cbin_annot_notmat.toml", + "use_dataset_from_config": "TweetyNet_predict_audio_cbin_annot_notmat.toml" }, { "filename": "TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml", @@ -160,7 +160,7 @@ "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", From 46f328c282207c1d21284797ac817175bd803b81 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 12:57:31 -0500 Subject: [PATCH 078/184] Add missing field/attribute 'model' to ConfigMetadata --- tests/scripts/vaktestdata/config_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py index e6aeac4fb..a8e332408 100644 --- a/tests/scripts/vaktestdata/config_metadata.py +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -4,6 +4,7 @@ @attrs.define class ConfigMetadata: filename: str = attrs.field() + model: str = attrs.field() config_type: str = attrs.field() audio_format: str = attrs.field() spect_format: str = attrs.field() From 9144c1ebd0a3bcd400f5246079159eb6fcacc699 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 13:18:30 -0500 Subject: [PATCH 079/184] Rewrite `vaktestdata.configs.add_dataset_path_from_prepped_configs` to use CONFIG_METADATA --- tests/scripts/vaktestdata/configs.py | 60 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/tests/scripts/vaktestdata/configs.py b/tests/scripts/vaktestdata/configs.py index 3d7e9a660..2aea60514 100644 --- a/tests/scripts/vaktestdata/configs.py +++ b/tests/scripts/vaktestdata/configs.py @@ -35,37 +35,35 @@ def copy_config_files(): return copied_configs -def add_dataset_path_from_prepped_configs(target_configs, target_model, source_configs, source_model): - for target_config_path in target_configs: - suffix_to_match = target_config_path.name.replace(target_model, '') # remove model name at start of config name - source_config_path = [ - source_config_path - for source_config_path in source_configs - if source_config_path.name.replace(source_model, '') == suffix_to_match - ] - source_config_path = source_config_path[0] - 
command = [ - command - for command in COMMANDS - if command in source_config_path.name - ][0] - if command == 'train_continue': - section = 'TRAIN' - else: - section = command.upper() - print( - f"Re-using prepped dataset from model '{source_model}' config:\n{source_config_path}\n" - f"Will use for model '{target_model}' config:\n{target_config_path}" - ) - - with source_config_path.open("r") as fp: - source_config_toml = toml.load(fp) - dataset_path = source_config_toml[section]['dataset_path'] - with target_config_path.open("r") as fp: - target_config_toml = toml.load(fp) - target_config_toml[section]['dataset_path'] = dataset_path - with target_config_path.open("w") as fp: - toml.dump(target_config_toml, fp) +def add_dataset_path_from_prepped_configs(): + """This helper function goes through all configs in + :data:`vaktestdata.constants.CONFIG_METADATA` + and for any that have a filename for the attribute + "use_dataset_from_config", it sets the option 'dataset_path' + in the config file that the metadata corresponds to + to the same option from the file specified + by the attribute. + """ + configs_to_change = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.use_dataset_from_config is not None + ] + + for config_metadata in configs_to_change: + config_to_change_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + section = config_metadata.config_type + + config_dataset_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.use_dataset_from_config + + with config_dataset_path.open("r") as fp: + dataset_config_toml = toml.load(fp) + dataset_path = dataset_config_toml[section]['dataset_path'] + with config_to_change_path.open("r") as fp: + config_to_change_toml = toml.load(fp) + config_to_change_toml[section]['dataset_path'] = dataset_path + with config_to_change_path.open("w") as fp: + toml.dump(config_to_change_toml, fp) def fix_options_in_configs(config_paths, model, command, single_train_result=True): From 5fbc0c5ff9d581ef264a751c50cffb5340bbcfbf Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 18:57:21 -0400 Subject: [PATCH 080/184] Use logger in tests/scripts/vaktestdata/prep.py --- tests/scripts/vaktestdata/prep.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/scripts/vaktestdata/prep.py b/tests/scripts/vaktestdata/prep.py index 7b58a74cf..d722233f3 100644 --- a/tests/scripts/vaktestdata/prep.py +++ b/tests/scripts/vaktestdata/prep.py @@ -1,11 +1,18 @@ # Do this here to suppress warnings before we import vak +import logging import warnings + from numba.core.errors import NumbaDeprecationWarning warnings.simplefilter('ignore', category=NumbaDeprecationWarning) + import vak from . import constants + +logger = logging.getLogger(__name__) + + def run_prep(): """Run ``vak prep`` to prepare datasets used with tests. 
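For illustration, a minimal sketch of the TOML round-trip that ``add_dataset_path_from_prepped_configs`` performs for each config it changes. The file names below are hypothetical examples and the ``TRAIN`` section assumes a 'train' config; the real function takes both the filenames and the section from ``CONFIG_METADATA``.

    import toml

    # hypothetical example paths; the real values come from CONFIG_METADATA
    source_config = "TweetyNet_train_audio_cbin_annot_notmat.toml"
    target_config = "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml"

    with open(source_config) as fp:
        dataset_path = toml.load(fp)["TRAIN"]["dataset_path"]

    with open(target_config) as fp:
        target_toml = toml.load(fp)
    target_toml["TRAIN"]["dataset_path"] = dataset_path
    with open(target_config, "w") as fp:
        toml.dump(target_toml, fp)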
@@ -24,7 +31,7 @@ def run_prep(): config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename if not config_path.exists(): raise FileNotFoundError(f"{config_path} not found") - print( + logger.info( f"\nRunning vak prep to generate data for tests, using config:\n{config_path.name}" ) vak.cli.prep.prep(toml_path=config_path) From 04d6ef1302ee9409101c20afa6c3187abd3290be Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 18:57:36 -0400 Subject: [PATCH 081/184] Use logger in tests/scripts/vaktestdata/dirs.py --- tests/scripts/vaktestdata/dirs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/scripts/vaktestdata/dirs.py b/tests/scripts/vaktestdata/dirs.py index 7f5ad4e73..2018b5ce1 100644 --- a/tests/scripts/vaktestdata/dirs.py +++ b/tests/scripts/vaktestdata/dirs.py @@ -1,7 +1,12 @@ """Helper functions for setting up directories.""" +import logging + from . import constants +logger = logging.getLogger(__name__) + + def make_subdirs_in_generated(config_paths): """make sub-directories inside ./tests/data_for_tests/generated @@ -15,7 +20,7 @@ def make_subdirs_in_generated(config_paths): results does also, but in addition will have sub-directories within those for models. """ - print( + logger.info( "Making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" ) @@ -31,4 +36,7 @@ def make_subdirs_in_generated(config_paths): subdir_to_make = ( constants.GENERATED_TEST_DATA / top_level_dir / command_dir / data_dir / model ) + logger.info( + f"Making sub-directory: {subdir_to_make}" + ) subdir_to_make.mkdir(parents=True) From 51173766be392e2fe20c2831ac1115bfe35edeef Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 18:58:41 -0400 Subject: [PATCH 082/184] Use logger, get name of config section correctly, and import/use pathlib where needed in tests/scripts/vaktestdata/configs.py --- tests/scripts/vaktestdata/configs.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/scripts/vaktestdata/configs.py b/tests/scripts/vaktestdata/configs.py index 2aea60514..85c001061 100644 --- a/tests/scripts/vaktestdata/configs.py +++ b/tests/scripts/vaktestdata/configs.py @@ -1,4 +1,6 @@ """Helper functions for moving and modifying configs""" +import logging +import pathlib import shutil # TODO: use tomli @@ -7,6 +9,8 @@ from . import constants +logger = logging.getLogger(__name__) + def copy_config_files(): """copy config files from setup to data_for_tests/configs @@ -14,12 +18,16 @@ def copy_config_files(): the copied files are the ones that get modified when this setup script runs, while the originals in this directory remain unchanged. 
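One caveat about the switch from ``print`` to module-level loggers in these helper modules: the ``logger.info`` calls only produce output once the entry point configures logging. A minimal sketch of such configuration, offered as an assumption; the script that drives these helpers may set this up differently or at a different level.

    import logging

    # emit INFO-level messages from the vaktestdata helpers to stderr
    logging.basicConfig(level=logging.INFO, format="%(message)s")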
""" - print( + logger.info( + f"Making directory to copy config files:\n{constants.GENERATED_TEST_CONFIGS_ROOT}" + ) + constants.GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) + + logger.info( "Copying config files run to generate test data from ./tests/data_for_tests/configs to " - "./tests/data_for_tests/generated/configs" + f"{constants.GENERATED_TEST_CONFIGS_ROOT}" ) - constants.GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) copied_configs = [] @@ -28,7 +36,7 @@ def copy_config_files(): raise FileNotFoundError(f"{toml_path} not found") dst = constants.GENERATED_TEST_CONFIGS_ROOT.joinpath(toml_path.name) - print(f"\tcopying to {dst}") + logger.info(f"\tCopying '{toml_path.name}'") shutil.copy(src=toml_path, dst=dst) copied_configs.append(dst) @@ -52,7 +60,10 @@ def add_dataset_path_from_prepped_configs(): for config_metadata in configs_to_change: config_to_change_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename - section = config_metadata.config_type + if config_metadata.config_type == 'train_continue': + section = 'TRAIN' + else: + section = config_metadata.config_type.upper() config_dataset_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.use_dataset_from_config @@ -105,7 +116,7 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru # which are checkpoint_path, spect_scaler_path, and labelmap_path with train_config_to_use.open("r") as fp: train_config_toml = toml.load(fp) - root_results_dir = Path(train_config_toml["TRAIN"]["root_results_dir"]) + root_results_dir = pathlib.Path(train_config_toml["TRAIN"]["root_results_dir"]) results_dir = sorted(root_results_dir.glob("results_*")) if len(results_dir) > 1: if single_train_result: From 68a781478bef2d381a102187c584f767e3955350 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 18:59:07 -0400 Subject: [PATCH 083/184] Add default for parser arg '--commands' in tests/scripts/vaktestdata/parser.py --- tests/scripts/vaktestdata/parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scripts/vaktestdata/parser.py b/tests/scripts/vaktestdata/parser.py index 4e88e0dff..19e272bad 100644 --- a/tests/scripts/vaktestdata/parser.py +++ b/tests/scripts/vaktestdata/parser.py @@ -17,6 +17,7 @@ def get_parser(): choices=('train', 'learncurve', 'eval', 'predict', 'train_continue'), help=f"Space-separated list of commands to run for 'results' step", nargs="+", + default=('train', 'learncurve', 'eval', 'predict', 'train_continue') ) parser.add_argument( '--single-train-result', From a61c77bc4872d84650b215fbd9a0fb18797e76d5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 18:59:31 -0400 Subject: [PATCH 084/184] Rewrite tests/scripts/generate_data_for_tests.py to use vaktestdata package --- tests/scripts/generate_data_for_tests.py | 339 ++--------------------- 1 file changed, 19 insertions(+), 320 deletions(-) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index 53ed25086..88bb819b3 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -53,264 +53,29 @@ The directories will have names with timestamps like `prep_20201015_1115`. Those are the generated directories we want to remove. 
""" -import argparse -from pathlib import Path -import shutil +import logging +import sys import warnings +# Do this here to suppress warnings before we import vak from numba.core.errors import NumbaDeprecationWarning warnings.simplefilter('ignore', category=NumbaDeprecationWarning) - -import toml import vak - -HERE = Path(__file__).parent -TEST_DATA_ROOT = HERE / ".." / "data_for_tests" -GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" -GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA / "configs" - -# convention is that all the config.toml files in tests/data_for_tests/configs -# that should be run when generating test data -# have filenames of the form `{MODEL}_{COMMAND}_audio_{FORMAT}_annot_{FORMAT}.toml' -# **or** `{MODEL}_{COMMAND}_spect_{FORMAT}_annot_{FORMAT}_config.ini' -# e.g., 'tweetynet_learncurve_audio_cbin_annot_notmat.toml'. -# Below, we iterate over model names -# so glob doesn't pick up static configs that are just used for testing, -# like 'invalid_option_config.toml` -TEST_CONFIGS_ROOT = TEST_DATA_ROOT.joinpath("configs") -CONFIGS_TO_RUN = [] - -MODELS_PREP = ("tweetynet") -MODELS_REUSE_PREP = { - "teenytweetynet": "tweetynet" -} - -MODELS_RESULTS = ( - "teenytweetynet", - "tweetynet" -) -for model in MODELS_RESULTS: - CONFIGS_TO_RUN.extend(sorted(TEST_CONFIGS_ROOT.glob(f"{model}*.toml"))) - -# the sub-directories that will get made inside `./tests/data_for_tests/generated` -TOP_LEVEL_DIRS = [ - "prep", - "results", -] - -# these sub-dirs get made in each of the TOP_LEVEL_DIRS (except for 'configs') -COMMAND_DIRS = [ - "eval", - "learncurve", - "predict", - "train", -] - -# these sub-dirs get made in each of the COMMAND_DIRS (except for 'configs') -DATA_DIRS = [ - "audio_cbin_annot_notmat", - "audio_wav_annot_birdsongrec", - "spect_mat_annot_yarden", -] - - -def make_subdirs_in_generated(config_paths): - """make sub-directories inside ./tests/data_for_tests/generated - - do this after copying configs, - before using those configs to generate results. - We use configs to decide which dirs we need to make - - makes three directories in data_for_tests/generated: - configs, prep, and results. - prep has one sub-directory for every data "type". - results does also, but in addition will have sub-directories - within those for models. - """ - for top_level_dir in TOP_LEVEL_DIRS: - for command_dir in COMMAND_DIRS: - for data_dir in DATA_DIRS: - if not any( - [f'{command_dir}_{data_dir}' in str(config_path) for config_path in config_paths] - ): - continue # no need to make this dir - - for model in MODELS_RESULTS: - subdir_to_make = ( - GENERATED_TEST_DATA / top_level_dir / command_dir / data_dir / model - ) - subdir_to_make.mkdir(parents=True) +import vaktestdata -def copy_config_files(): - """copy config files from setup to data_for_tests/configs - - the copied files are the ones that get modified when this setup script runs, - while the originals in this directory remain unchanged. 
- """ - copied_configs = [] - - for toml_path in CONFIGS_TO_RUN: - if not toml_path.exists(): - raise FileNotFoundError(f"{toml_path} not found") - - dst = GENERATED_TEST_CONFIGS_ROOT.joinpath(toml_path.name) - print(f"\tcopying to {dst}") - shutil.copy(src=toml_path, dst=dst) - copied_configs.append(dst) - - return copied_configs - - -def run_prep(config_paths): - """run ``vak prep`` to generate data for testing""" - for config_path in config_paths: - if not config_path.exists(): - raise FileNotFoundError(f"{config_path} not found") - print( - f"running vak prep to generate data for tests, using config: {config_path.name}" - ) - vak.cli.prep.prep(toml_path=config_path) - - -def add_dataset_path_from_prepped_configs(target_configs, target_model, source_configs, source_model): - for target_config_path in target_configs: - suffix_to_match = config_path.name.replace(target_model, '') # remove model name at start of config name - source_config_path = [ - source_config_path - for source_config_path in source_configs - if source_config_path.name.replace(source_model, '') == suffix_to_match - ] - source_config_path = source_config_path[0] - command = [ - command - for command in COMMANDS - if command in source_config_path.name - ][0] - if command == 'train_continue': - section = 'TRAIN' - else: - section = command.upper() - print( - f"Re-using prepped dataset from model '{source_model}' config:\n{source_config_path}\n" - f"Will use for model '{target_model}' config:\n{target_config_path}" - ) - - with source_config_path.open("r") as fp: - source_config_toml = toml.load(fp) - dataset_path = source_config_toml[section]['dataset_path'] - with target_config_path.open("r") as fp: - target_config_toml = toml.load(fp) - target_config_toml[section]['dataset_path'] = dataset_path - with target_config_path.open("w") as fp: - toml.dump(target_config_toml, fp) - - -def fix_options_in_configs(config_paths, model, command, single_train_result=True): - """Fix values assigned to options in predict and eval configs. - - Need to do this because both predict and eval configs have options - that can only be assigned *after* running the corresponding `train` config. - """ - if command not in ('eval', 'predict', 'train_continue'): - raise ValueError( - f'invalid command to fix config options: {command}' - ) - configs_to_fix, train_configs = [], [] - # split configs into predict/eval/"train_continue" configs and other configs - for config_path in config_paths: - if command in config_path.name: - configs_to_fix.append(config_path) - elif 'train' in config_path.name and 'continue' not in config_path.name: - train_configs.append(config_path) - - for config_to_fix in configs_to_fix: - # figure out which 'train' config corresponds to this 'predict' or 'eval' config - # by using 'suffix' of config file names. 
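The suffix-matching described in the comment above pairs an 'eval' or 'predict' config with its 'train' config by comparing what is left of the filename once the command is removed. A self-contained sketch of that idea (the helper name is hypothetical):

def find_matching_train_config(config_name, command, train_config_names):
    # e.g. 'tweetynet_eval_audio_cbin_annot_notmat.toml' splits into
    # prefix 'tweetynet_' and suffix '_audio_cbin_annot_notmat.toml'
    prefix, suffix = config_name.split(command)
    matches = [
        train_name
        for train_name in train_config_names
        if train_name.startswith(prefix) and train_name.split("train")[-1] == suffix
    ]
    if len(matches) != 1:
        raise ValueError(f"expected exactly one matching train config, got: {matches}")
    return matches[0]


train_names = [
    "tweetynet_train_audio_cbin_annot_notmat.toml",
    "tweetynet_train_spect_mat_annot_yarden.toml",
]
assert find_matching_train_config(
    "tweetynet_eval_audio_cbin_annot_notmat.toml", "eval", train_names
) == "tweetynet_train_audio_cbin_annot_notmat.toml"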
`train` suffix will match `predict`/'eval' suffix - prefix, suffix = config_to_fix.name.split(command) - train_config_to_use = [] - for train_config in train_configs: - train_prefix, train_suffix = train_config.name.split("train") - if train_prefix.startswith(model) and train_suffix == suffix: - train_config_to_use.append(train_config) - if len(train_config_to_use) > 1: - raise ValueError( - f"Did not find just a single train config that matches with '{command}' config:\n" - f"{config_to_fix}\n" - f"Matches were: {train_config_to_use}" - ) - train_config_to_use = train_config_to_use[0] - - # now use the config to find the results dir and get the values for the options we need to set - # which are checkpoint_path, spect_scaler_path, and labelmap_path - with train_config_to_use.open("r") as fp: - train_config_toml = toml.load(fp) - root_results_dir = Path(train_config_toml["TRAIN"]["root_results_dir"]) - results_dir = sorted(root_results_dir.glob("results_*")) - if len(results_dir) > 1: - if single_train_result: - raise ValueError( - f"Did not find just a single results directory in root_results_dir from train_config:\n" - f"{train_config_to_use}" - f"root_results_dir was: {root_results_dir}" - f'Matches for "results_*" were: {results_dir}' - ) - else: - results_dir = results_dir[-1] - elif len(results_dir) == 1: - results_dir = results_dir[0] - else: - raise ValueError( - f"Did not find a results directory in root_results_dir from train_config:\n" - f"{train_config_to_use}" - f"root_results_dir was:\n{root_results_dir}" - f'Matches for "results_*" were:\n{results_dir}' - ) - - # these are the only options whose values we need to change - # and they are the same for both predict and eval - checkpoint_path = sorted(results_dir.glob("**/checkpoints/checkpoint.pt"))[0] - if train_config_toml['TRAIN']['normalize_spectrograms']: - spect_scaler_path = sorted(results_dir.glob("StandardizeSpect"))[0] - else: - spect_scaler_path = None - labelmap_path = sorted(results_dir.glob("labelmap.json"))[0] - - # now add these values to corresponding options in predict / eval config - with config_to_fix.open("r") as fp: - config_toml = toml.load(fp) - if command == 'train_continue': - section = 'TRAIN' - else: - section = command.upper() - config_toml[section]["checkpoint_path"] = str(checkpoint_path) - if spect_scaler_path: - config_toml[section]["spect_scaler_path"] = str(spect_scaler_path) - else: - if 'spect_scaler_path' in config_toml[section]: - # remove any existing 'spect_scaler_path' option - del config_toml[section]["spect_scaler_path"] - if command != 'train_continue': # train always gets labelmap from dataset dir, not from a config option - config_toml[section]["labelmap_path"] = str(labelmap_path) - with config_to_fix.open("w") as fp: - toml.dump(config_toml, fp) - - -# need to run 'train' config before we run 'predict' -# so we can add checkpoints, etc., from training to predict -COMMANDS = ( - "train", - "learncurve", - "eval", - "predict", - "train_continue", -) +logger = logging.getLogger() # 'base' logger +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +stream_handler = logging.StreamHandler(sys.stdout) +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) +logger.setLevel('INFO') def generate_test_data( step: str = 'all', - commands=COMMANDS, + commands=vaktestdata.constants.COMMANDS, single_train_result:bool = True, ): """Main function that generates all the test data. 
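The deleted helper above, like its replacement in vaktestdata.configs, has to locate the newest timestamped results directory and the checkpoint saved beneath it. A minimal sketch of that lookup; the function name is hypothetical:

import pathlib


def newest_checkpoint(root_results_dir):
    # 'results_*' directory names embed a timestamp, so a lexical sort is
    # also a chronological sort; take the most recent one.
    results_dirs = sorted(pathlib.Path(root_results_dir).glob("results_*"))
    if not results_dirs:
        raise FileNotFoundError(f"no 'results_*' directory in {root_results_dir}")
    newest = results_dirs[-1]
    checkpoints = sorted(newest.glob("**/checkpoints/checkpoint.pt"))
    if not checkpoints:
        raise FileNotFoundError(f"no checkpoint.pt found under {newest}")
    return checkpoints[0]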
@@ -336,52 +101,19 @@ def generate_test_data( """ # need to run `prep` before we run other commands if step in ('prep', 'all'): - print( - "copying config files run to generate test data from ./tests/data_for_tests/configs to " - "./tests/data_for_tests/generated/configs" - ) - GENERATED_TEST_CONFIGS_ROOT.mkdir(parents=True) - config_paths = copy_config_files() - - print(f"will generate test data from these config files: {config_paths}") - - print( - "making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" - ) - make_subdirs_in_generated(config_paths) - + config_paths = vaktestdata.configs.copy_config_files() + vaktestdata.dirs.make_subdirs_in_generated(config_paths) # run prep for some models - for model in MODELS_PREP: - config_paths_this_model = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) - ] - run_prep(config_paths=config_paths_this_model) + vaktestdata.prep.run_prep() # re-use some of the prepped datasets for other models # this makes time to prep all datasets shorter - for target_model, source_model in MODELS_REUSE_PREP.items(): - print( - f"Re-using prepped datasets from model '{source_model}' for model '{target_model}'." - ) - config_paths_source_model = [ - config_path - for config_path in config_paths - if config_path.name.startswith(source_model_prep) - ] - config_paths_target_model = [ - config_path - for config_path in config_paths - if config_path.name.startswith(target_model_prep) - ] - add_dataset_path_from_prepped_configs(config_paths_target_model, target_model, - config_paths_source_model, source_model) + vaktestdata.configs.add_dataset_path_from_prepped_configs() else: - config_paths = sorted(GENERATED_TEST_CONFIGS_ROOT.glob('*.toml')) + config_paths = sorted(vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT.glob('*.toml')) if step in ('results', 'all'): - for model in MODELS_RESULTS: + for model in vaktestdata.constants.MODELS_RESULTS: for command in commands: if command == "prep": continue # we don't run prep in this code block @@ -414,7 +146,7 @@ def generate_test_data( # using results from running the corresponding train configs. # this only works if we ran the train configs already, # which we should have because of ordering of COMMANDS constant above - fix_options_in_configs(config_paths, model, command, single_train_result) + vaktestdata.configs.fix_options_in_configs(config_paths, model, command, single_train_result) command_config_paths = [ config_path for config_path in config_paths @@ -428,41 +160,8 @@ def generate_test_data( vak.cli.cli.cli(command, config_path) -GENERATE_TEST_DATA_STEPS = ( - 'prep', - 'results', - 'all', -) - - -def get_parser(): - parser = argparse.ArgumentParser() - parser.add_argument( - '--step', - choices=GENERATE_TEST_DATA_STEPS, - help=f"Which step of generating test data to perform, one of: {GENERATE_TEST_DATA_STEPS}", - default='all' - ) - parser.add_argument( - '--commands', - choices=('train', 'learncurve', 'eval', 'predict', 'train_continue'), - help=f"Space-separated list of commands to run for 'results' step", - nargs="+", - ) - parser.add_argument( - '--single-train-result', - action = argparse.BooleanOptionalAction, - help=( - "If --single-train-result, require there be a single results directory " - "from any training config when looking for them to use in toher configs. " - "If --no-single-train-result, allow multiple and use the most recent." 
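The '--single-train-result' option above leans on argparse.BooleanOptionalAction (Python 3.9+), which generates the paired '--no-...' flag automatically; a small self-contained sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--single-train-result",
    action=argparse.BooleanOptionalAction,
    help="Require a single results_* directory per train config.",
)
# Both spellings set the same destination, 'single_train_result'.
assert parser.parse_args(["--single-train-result"]).single_train_result is True
assert parser.parse_args(["--no-single-train-result"]).single_train_result is False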
- ) - ) - return parser - - if __name__ == "__main__": - parser = get_parser() + parser = vaktestdata.parser.get_parser() args = parser.parse_args() generate_test_data( step=args.step, From 880b6468589b50b06d5ea01946c9850386ec3a88 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 19:07:33 -0400 Subject: [PATCH 085/184] Rename ConvEnconderParametricUMAP -> ConvEncoderUMAP (the fact that it's parametric is implied) --- src/vak/models/__init__.py | 4 ++-- .../{convencoder_parametric_umap.py => convencoder_umap.py} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename src/vak/models/{convencoder_parametric_umap.py => convencoder_umap.py} (98%) diff --git a/src/vak/models/__init__.py b/src/vak/models/__init__.py index cfbf1c3ec..81ce02a4c 100644 --- a/src/vak/models/__init__.py +++ b/src/vak/models/__init__.py @@ -5,7 +5,7 @@ registry, ) from .base import Model -from .convencoder_parametric_umap import ConvEncoderParametricUMAP +from .convencoder_umap import ConvEncoderUMAP from .get import get from .ed_tcn import ED_TCN from .teenytweetynet import TeenyTweetyNet @@ -16,7 +16,7 @@ __all__ = [ "base", - "ConvEncoderParametricUMAP", + "ConvEncoderUMAP", "decorator", "definition", "ED_TCN", diff --git a/src/vak/models/convencoder_parametric_umap.py b/src/vak/models/convencoder_umap.py similarity index 98% rename from src/vak/models/convencoder_parametric_umap.py rename to src/vak/models/convencoder_umap.py index d331a834c..2d5cbe704 100644 --- a/src/vak/models/convencoder_parametric_umap.py +++ b/src/vak/models/convencoder_umap.py @@ -19,7 +19,7 @@ @model(family=ParametricUMAPModel) -class ConvEncoderParametricUMAP: +class ConvEncoderUMAP: """Parametric UMAP model, as described in [1]_, with a convolutional network as the encoder. 
From 533198220b95940f6fac813ba0abece53f7a2095 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 19:24:14 -0400 Subject: [PATCH 086/184] Add: tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml --- ...derUMAP_train_audio_cbin_annot_notmat.toml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml new file mode 100644 index 000000000..9c37fffd6 --- /dev/null +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -0,0 +1,33 @@ +[PREP] +dataset_type = "dimensionality reduction" +input_type = "spect" +data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP" +audio_format = "cbin" +annot_format = "notmat" +labelset = "iabcdefghjk" +train_dur = 40 +val_dur = 15 + +[SPECT_PARAMS] +fft_size = 512 +step_size = 32 +transform_type = "log_spect_plus_one" + +[DATALOADER] +window_size = 44 + +[TRAIN] +model = "ConvEncoderUMAP" +normalize_spectrograms = true +batch_size = 4 +num_epochs = 2 +val_step = 50 +ckpt_step = 200 +patience = 3 +num_workers = 2 +device = "cuda" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/ConvEncoderUMAP" + +[ConvEncoderUMAP.optimizer] +lr = 0.001 From c0667e6fb24f46b3c21b68569ce11ea0fe202bd7 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Mon, 17 Jul 2023 19:24:27 -0400 Subject: [PATCH 087/184] Add ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml to configs.json --- tests/data_for_tests/configs/configs.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index c42c3bcae..e61632a1d 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -179,6 +179,15 @@ "spect_format": "mat", "annot_format": "yarden", "use_dataset_from_config": "TweetyNet_train_continue_spect_mat_annot_yarden.toml" + }, + { + "filename": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", + "model": "ConvEncoderUMAP", + "config_type": "train", + "audio_format": "cbin", + "spect_format": null, + "annot_format": "notmat", + "use_dataset_from_config": null } ] } \ No newline at end of file From ebd25c62fa2feb65a2f9dc0194c85f08b5756826 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 18 Jul 2023 18:01:44 -0400 Subject: [PATCH 088/184] Add args + attributes to UMAPLoss class --- src/vak/nn/loss/umap.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/vak/nn/loss/umap.py b/src/vak/nn/loss/umap.py index 5bcafe65f..72748d25b 100644 --- a/src/vak/nn/loss/umap.py +++ b/src/vak/nn/loss/umap.py @@ -1,4 +1,9 @@ +"""Parametric UMAP loss function.""" +from __future__ import annotations + import torch +from torch.nn.functional import mse_loss +from umap.umap_ import find_ab_params def convert_distance_to_probability(distances, a=1.0, b=1.0): @@ -33,7 +38,8 @@ def compute_cross_entropy( return attraction_term, repulsion_term, CE -def umap_loss(embedding_to, embedding_from, a, b, batch_size, negative_sample_rate=5): +def umap_loss(embedding_to: torch.Tensor, embedding_from: torch.Tensor, + a, 
b, negative_sample_rate: int = 5): """UMAP loss function Converts distances to probabilities, @@ -54,6 +60,7 @@ def umap_loss(embedding_to, embedding_from, a, b, batch_size, negative_sample_ra distance_embedding, a, b ) # set true probabilities based on negative sampling + batch_size = embedding_to.shape[0] probabilities_graph = torch.cat( (torch.ones(batch_size), torch.zeros(batch_size * negative_sample_rate)), dim=0, # ``to`` method in next line to avoid error `Expected all tensors to be on the same device` @@ -69,9 +76,27 @@ def umap_loss(embedding_to, embedding_from, a, b, batch_size, negative_sample_ra class UmapLoss(torch.nn.Module): - def __init__(self): + """""" + def __init__(self, + spread: float = 1.0, + min_dist: float = 0.1, + negative_sample_rate: int = 5, + beta: float = 1.0, + ): super().__init__() + self.min_dist = min_dist + self.a, self.b = find_ab_params(spread, min_dist) + self.negative_sample_rate = negative_sample_rate + self.beta = beta + - def forward(self, embedding_to, embedding_from, a, b, batch_size, negative_sample_rate): - return umap_loss(embedding_to, embedding_from, a, b, - batch_size, negative_sample_rate) + def forward(self, embedding_to: torch.Tensor, embedding_from: torch.Tensor, + reconstruction: torch.Tensor | None = None, before_encoding: torch.Tensor | None = None): + loss_umap = umap_loss(embedding_to, embedding_from, self.a, self.b, self.negative_sample_rate) + if reconstruction is not None: + loss_reconstruction = mse_loss(reconstruction, before_encoding) + loss = loss_umap + self.beta * loss_reconstruction + else: + loss_reconstruction = None + loss = loss_umap + return loss_umap, loss_reconstruction, loss From a52b9c5b4638a4ec5f44ead41901f4a6c4048653 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 18 Jul 2023 18:02:07 -0400 Subject: [PATCH 089/184] Revise vak.models.parametric_umap_model - Remove loss function kwargs from model __init__ now that they are args/attributes of the loss class itself, will pass in with config - rewrite train/val steps of model to use the rewritten loss class - Add ParametricUMAP class with fit/transform methods, mirroring the UMAP-learn API, plan to just use this class inside `vak.train.parametric_umap`, `vak.eval.parametric_umap` etc. - Also add a DataModule like in Tim's code, not sure yet if I need this --- src/vak/models/parametric_umap_model.py | 129 +++++++++++++++++------- 1 file changed, 94 insertions(+), 35 deletions(-) diff --git a/src/vak/models/parametric_umap_model.py b/src/vak/models/parametric_umap_model.py index dbc29e365..a65dfb32b 100644 --- a/src/vak/models/parametric_umap_model.py +++ b/src/vak/models/parametric_umap_model.py @@ -7,12 +7,12 @@ """ from __future__ import annotations +import pathlib from typing import Callable, ClassVar, Type +import pytorch_lightning as lightning import torch -from torch.nn.functional import mse_loss - -from umap.umap_ import find_ab_params +import torch.utils.data from . 
import base from .definition import ModelDefinition @@ -45,17 +45,11 @@ def __init__( loss: torch.nn.Module | Callable | None = None, optimizer: torch.optim.Optimizer | None = None, metrics: dict[str: Type] | None = None, - beta: float = 1.0, - min_dist: float = 0.1, - negative_sample_rate: int = 5, ): super().__init__(network=network, loss=loss, optimizer=optimizer, metrics=metrics) self.encoder = network['encoder'] self.decoder = network.get('decoder', None) - self.beta = beta # weight for reconstruction loss - self._a, self._b = find_ab_params(1.0, min_dist) - self.negative_sample_rate = negative_sample_rate def configure_optimizers(self): return self.optimizer @@ -63,40 +57,41 @@ def configure_optimizers(self): def training_step(self, batch, batch_idx): (edges_to_exp, edges_from_exp) = batch embedding_to, embedding_from = self.encoder(edges_to_exp), self.encoder(edges_from_exp) - encoder_loss = self.loss(embedding_to, embedding_from, self._a, self._b, - edges_to_exp.shape[0], negative_sample_rate=self.negative_sample_rate) - self.log("train_umap_loss", encoder_loss) if self.decoder is not None: - recon = self.decoder(embedding_to) - recon_loss = mse_loss(recon, edges_to_exp) - self.log("train_recon_loss", recon_loss) - return encoder_loss + self.beta * recon_loss + reconstruction = self.decoder(embedding_to) + before_encoding = edges_to_exp else: - return encoder_loss + reconstruction = None + before_encoding = None + loss_umap, loss_reconstruction, loss = self.loss(embedding_to, embedding_from, reconstruction, before_encoding) + self.log("train_umap_loss", loss_umap) + if loss_reconstruction: + self.log("train_reconstruction_loss", loss_reconstruction) + # note if there's no ``loss_reconstruction``, then ``loss`` == ``loss_umap`` + self.log("train_loss", loss) + return loss def validation_step(self, batch, batch_idx): (edges_to_exp, edges_from_exp) = batch embedding_to, embedding_from = self.encoder(edges_to_exp), self.encoder(edges_from_exp) - encoder_loss = self.loss(embedding_to, embedding_from, self._a, self._b, - edges_to_exp.shape[0], negative_sample_rate=self.negative_sample_rate) - self.log("val_umap_loss", encoder_loss, on_step=True) if self.decoder is not None: - recon = self.decoder(embedding_to) - recon_loss = mse_loss(recon, edges_to_exp) - self.log("val_recon_loss", recon_loss, on_step=True) - return encoder_loss + self.beta * recon_loss + reconstruction = self.decoder(embedding_to) + before_encoding = edges_to_exp else: - return encoder_loss + reconstruction = None + before_encoding = None + loss_umap, loss_reconstruction, loss = self.loss(embedding_to, embedding_from, reconstruction, before_encoding) + self.log("val_umap_loss", loss_umap, on_step=True) + if loss_reconstruction: + self.log("val_reconstruction_loss", loss_reconstruction, on_step=True) + # note if there's no ``loss_reconstruction``, then ``loss`` == ``loss_umap`` + self.log("val_loss", loss, on_step=True) @classmethod def from_config(cls, - config: dict, - beta: float = 1.0, - min_dist: float = 0.1, - negative_sample_rate: int = 5, - ): + config: dict): """Return an initialized model instance from a config ``dict`` Parameters @@ -115,8 +110,72 @@ def from_config(cls, return cls(network=network, optimizer=optimizer, loss=loss, - metrics=metrics, - beta=beta, - min_dist=min_dist, - negative_sample_rate=negative_sample_rate, - ) + metrics=metrics) + + +class ParametricUMAPDatamodule(lightning.LightningDataModule): + def __init__( + self, + dataset, + batch_size, + num_workers, + ): + super().__init__() + 
self.dataset = dataset + self.batch_size = batch_size + self.num_workers = num_workers + + def train_dataloader(self) -> torch.utils.data.DataLoader: + return torch.utils.data.DataLoader( + dataset=self.dataset, + batch_size=self.batch_size, + num_workers=self.num_workers, + shuffle=True, + ) + + +class ParametricUMAP: + def __init__( + self, + encoder: torch.nn.Module, + decoder: torch.nn.Module | None = None, + n_neighbors: int = 10, + min_dist: float = 0.1, + metric: str = "euclidean", + lr: float = 1e-3, + batch_size: int = 64, + num_workers: int = 16, + random_state: int | None = None, + ): + self.encoder = encoder + self.decoder = decoder + self.n_neighbors = n_neighbors + self.min_dist = min_dist + self.metric = metric + + self.lr = lr + self.num_epochs = num_epochs + + self.batch_size = batch_size + self.num_workers = num_workers + self.random_state = random_state + + self.model = ParametricUMAPModel(self.encoder, min_dist=self.min_dist) + + def fit(self, trainer: lightning.Trainer, dataset_path: str | pathlib.Path, transform=None): + import vak.datasets + dataset = vak.datasets.UMAPDataset.from_dataset_path(dataset_path, 'train', self.n_neighbors, self.metric, + self.random_state, self.num_epochs, transform) + trainer.fit( + model=self.model, + datamodule=ParametricUMAPDatamodule(dataset, self.batch_size, self.num_workers) + ) + + @torch.no_grad() + def transform(self, X): + self.embedding_ = self.model.encoder(X).detach().cpu().numpy() + return self.embedding_ + + @torch.no_grad() + def inverse_transform(self, Z): + return self.model.decoder(Z).detach().cpu().numpy() From de42b797d98bdc72cf8fb25aadffed0203edd321 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 09:50:47 -0400 Subject: [PATCH 090/184] Move parametric UMAP dataset up into vak.datasets, get rid of dimensionality_reduction sub-package --- src/vak/datasets/__init__.py | 2 +- .../datasets/dimensionality_reduction/__init__.py | 10 ---------- .../parametric_umap/__init__.py | 1 - .../dimensionality_reduction/unit_dataset.py | 14 -------------- src/vak/datasets/parametric_umap/__init__.py | 8 ++++++++ .../metadata.py | 0 .../parametric_umap/parametric_umap.py | 0 7 files changed, 9 insertions(+), 26 deletions(-) delete mode 100644 src/vak/datasets/dimensionality_reduction/__init__.py delete mode 100644 src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py delete mode 100644 src/vak/datasets/dimensionality_reduction/unit_dataset.py create mode 100644 src/vak/datasets/parametric_umap/__init__.py rename src/vak/datasets/{dimensionality_reduction => parametric_umap}/metadata.py (100%) rename src/vak/datasets/{dimensionality_reduction => }/parametric_umap/parametric_umap.py (100%) diff --git a/src/vak/datasets/__init__.py b/src/vak/datasets/__init__.py index 8a8618ce7..55dc89254 100644 --- a/src/vak/datasets/__init__.py +++ b/src/vak/datasets/__init__.py @@ -1,6 +1,6 @@ from . 
import ( - dimensionality_reduction, frame_classification, + parametric_umap ) diff --git a/src/vak/datasets/dimensionality_reduction/__init__.py b/src/vak/datasets/dimensionality_reduction/__init__.py deleted file mode 100644 index 3ab658efe..000000000 --- a/src/vak/datasets/dimensionality_reduction/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .metadata import Metadata -from .unit_dataset import UnitDataset -from .parametric_umap import ParametricUMAPDataset - - -__all__ = [ - "Metadata", - "ParametricUMAPDataset", - "UnitDataset", -] diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py b/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py deleted file mode 100644 index 05a3dc5ca..000000000 --- a/src/vak/datasets/dimensionality_reduction/parametric_umap/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .parametric_umap import ParametricUMAPDataset \ No newline at end of file diff --git a/src/vak/datasets/dimensionality_reduction/unit_dataset.py b/src/vak/datasets/dimensionality_reduction/unit_dataset.py deleted file mode 100644 index b43b2ff63..000000000 --- a/src/vak/datasets/dimensionality_reduction/unit_dataset.py +++ /dev/null @@ -1,14 +0,0 @@ -from __future__ import annotations - -import pathlib -from typing import Callable - -import numpy as np -import numpy.typing as npt -import pandas as pd - -from .metadata import Metadata - - -class UnitDataset: - pass diff --git a/src/vak/datasets/parametric_umap/__init__.py b/src/vak/datasets/parametric_umap/__init__.py new file mode 100644 index 000000000..90caf061f --- /dev/null +++ b/src/vak/datasets/parametric_umap/__init__.py @@ -0,0 +1,8 @@ +from .metadata import Metadata +from .parametric_umap import ParametricUMAPDataset + + +__all__ = [ + 'Metadata', + 'ParametricUMAPDataset' +] diff --git a/src/vak/datasets/dimensionality_reduction/metadata.py b/src/vak/datasets/parametric_umap/metadata.py similarity index 100% rename from src/vak/datasets/dimensionality_reduction/metadata.py rename to src/vak/datasets/parametric_umap/metadata.py diff --git a/src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py similarity index 100% rename from src/vak/datasets/dimensionality_reduction/parametric_umap/parametric_umap.py rename to src/vak/datasets/parametric_umap/parametric_umap.py From b69705f80a802c315677e6e8c6ed86c3e35748b0 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 10:53:21 -0400 Subject: [PATCH 091/184] Make minor fixes to ParametricUMAP class --- src/vak/models/parametric_umap_model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/vak/models/parametric_umap_model.py b/src/vak/models/parametric_umap_model.py index a65dfb32b..1e74a2ae3 100644 --- a/src/vak/models/parametric_umap_model.py +++ b/src/vak/models/parametric_umap_model.py @@ -142,6 +142,7 @@ def __init__( n_neighbors: int = 10, min_dist: float = 0.1, metric: str = "euclidean", + num_epochs: int = 200, lr: float = 1e-3, batch_size: int = 64, num_workers: int = 16, @@ -163,9 +164,9 @@ def __init__( self.model = ParametricUMAPModel(self.encoder, min_dist=self.min_dist) def fit(self, trainer: lightning.Trainer, dataset_path: str | pathlib.Path, transform=None): - import vak.datasets - dataset = vak.datasets.UMAPDataset.from_dataset_path(dataset_path, 'train', self.n_neighbors, self.metric, - self.random_state, self.num_epochs, transform) + from vak.datasets.parametric_umap import ParametricUMAPDataset + 
dataset = ParametricUMAPDataset.from_dataset_path(dataset_path, 'train', self.n_neighbors, self.metric, + self.random_state, self.num_epochs, transform) trainer.fit( model=self.model, datamodule=ParametricUMAPDatamodule(dataset, self.batch_size, self.num_workers) @@ -173,8 +174,8 @@ def fit(self, trainer: lightning.Trainer, dataset_path: str | pathlib.Path, tran @torch.no_grad() def transform(self, X): - self.embedding_ = self.model.encoder(X).detach().cpu().numpy() - return self.embedding_ + embedding = self.model.encoder(X).detach().cpu().numpy() + return embedding @torch.no_grad() def inverse_transform(self, Z): From 67d979dff281b4650046bb75125a6f8161efac76 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 10:53:49 -0400 Subject: [PATCH 092/184] WIP: Fix vak/train/parametric_umap.py so it actually works --- src/vak/train/parametric_umap.py | 45 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index f3e0cf143..532443417 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -14,7 +14,7 @@ transforms, ) from ..common import validators -from ..datasets.dimensionality_reduction import ParametricUMAPDataset +from ..datasets.parametric_umap import ParametricUMAPDataset from ..common.device import get_default as get_default_device from ..common.paths import generate_results_dir_name_as_path from ..common.trainer import get_default_trainer @@ -35,6 +35,8 @@ def train_parametric_umap_model( batch_size: int, num_epochs: int, num_workers: int, + train_dataset_params: dict | None = None, + val_dataset_params: dict | None = None, checkpoint_path: str | pathlib.Path | None = None, root_results_dir: str | pathlib.Path | None = None, results_path: str | pathlib.Path | None = None, @@ -72,29 +74,28 @@ def train_parametric_umap_model( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. - dataset_csv_path - Path to csv file representing splits of dataset, - e.g., such a file generated by running ``vak prep``. - This parameter is used by :func:`vak.core.learncurve` to specify - different splits to use, when generating results for a learning curve. - If this argument is specified, the csv file must be inside the directory - ``dataset_path``. - checkpoint_path : str, pathlib.Path + train_dataset_params: dict, optional + Parameters for training dataset. + Passed as keyword arguments to + :class:`vak.datasets.parametric_umap.ParametricUMAP`. + Optional, default is None. + val_dataset_params: dict, optional + Parameters for validation dataset. + Passed as keyword arguments to + :class:`vak.datasets.parametric_umap.ParametricUMAP`. + Optional, default is None. + checkpoint_path : str, pathlib.Path, optional path to a checkpoint file, e.g., one generated by a previous run of ``vak.core.train``. If specified, this checkpoint will be loaded into model. Used when continuing training. Default is None, in which case a new model is initialized. - root_results_dir : str, pathlib.Path + root_results_dir : str, pathlib.Path, optional Root directory in which a new directory will be created where results will be saved. - results_path : str, pathlib.Path + results_path : str, pathlib.Path, optional Directory where results will be saved. If specified, this parameter overrides ``root_results_dir``. - spect_key : str - key for accessing spectrogram in files. Default is 's'. 
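The transform and inverse_transform methods added above follow the usual 'inference without autograd' pattern; a tiny self-contained sketch with a stand-in encoder:

import torch

encoder = torch.nn.Linear(10, 2)  # stand-in for a trained encoder network


@torch.no_grad()
def transform(X):
    # No graph is built, and the result comes back as a NumPy array on CPU.
    return encoder(X).detach().cpu().numpy()


embedding = transform(torch.rand(5, 10))  # -> array with shape (5, 2)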
- timebins_key : str - key for accessing vector of time bins in files. Default is 't'. device : str Device on which to work with model + data. Default is None. If None, then a device will be selected with vak.split.get_default. @@ -140,7 +141,7 @@ def train_parametric_umap_model( logger.info( f"Loading dataset from path: {dataset_path}", ) - metadata = datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) + metadata = datasets.parametric_umap.Metadata.from_dataset_path(dataset_path) dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) # ---------------- pre-conditions ---------------------------------------------------------------------------------- @@ -172,12 +173,15 @@ def train_parametric_umap_model( f"Total duration of training split from dataset (in s): {train_dur}", ) - transform = transforms.get_defaults("train") + transform = transforms.defaults.get_default_transform(model_name, "train") + if train_dataset_params is None: + train_dataset_params = {} train_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, split=split, transform=transform, + **train_dataset_params, ) logger.info( f"Duration of WindowDataset used for training, in seconds: {train_dataset.duration}", @@ -191,13 +195,14 @@ def train_parametric_umap_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: - transform = transforms.get_defaults( - "eval", - ) + transform = transforms.defaults.get_default_transform(model_name, "eval") + if val_dataset_params is None: + val_dataset_params = {} val_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, split=split, transform=transform, + **val_dataset_params, ) val_loader = torch.utils.data.DataLoader( dataset=val_dataset, From eea3814273db999a307724ec4d9132329198a87a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:09:36 -0400 Subject: [PATCH 093/184] Add 'train_dataset_params' and 'val_dataset_params' as attributes to TrainConfig, and add in valid.toml --- src/vak/config/train.py | 11 +++++++++++ src/vak/config/valid.toml | 2 ++ 2 files changed, 13 insertions(+) diff --git a/src/vak/config/train.py b/src/vak/config/train.py index dd93ae7d1..9149ed198 100644 --- a/src/vak/config/train.py +++ b/src/vak/config/train.py @@ -118,3 +118,14 @@ class TrainConfig: converter=converters.optional(expanded_user_path), default=None, ) + + train_dataset_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) + val_dataset_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) diff --git a/src/vak/config/valid.toml b/src/vak/config/valid.toml index d96090840..7ae082e24 100644 --- a/src/vak/config/valid.toml +++ b/src/vak/config/valid.toml @@ -52,6 +52,8 @@ patience = 4 results_dir_made_by_main_script = '/some/path/to/learncurve/' checkpoint_path = '/home/user/results_181014_194418/TweetyNet/checkpoints/' spect_scaler_path = '/home/user/results_181014_194418/spect_scaler' +train_dataset_params = {'window_size' = 80} +val_dataset_params = {'window_size' = 80} [EVAL] dataset_path = 'tests/test_data/prep/learncurve/032312_prep_191224_225910.csv' From 96adb56ce4675cb742cafd5bc7c4b1a9cf6a629b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:26:27 -0400 Subject: [PATCH 094/184] Add 
'train_transform_params' and 'val_transform_params' as attributes to TrainConfig, and add in valid.toml --- src/vak/config/train.py | 13 +++++++++++++ src/vak/config/valid.toml | 2 ++ 2 files changed, 15 insertions(+) diff --git a/src/vak/config/train.py b/src/vak/config/train.py index 9149ed198..df3033324 100644 --- a/src/vak/config/train.py +++ b/src/vak/config/train.py @@ -119,11 +119,24 @@ class TrainConfig: default=None, ) + train_transform_kwargs = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) + train_dataset_params = attr.ib( converter=converters.optional(dict), validator=validators.optional(instance_of(dict)), default=None, ) + + val_transform_kwargs = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) + val_dataset_params = attr.ib( converter=converters.optional(dict), validator=validators.optional(instance_of(dict)), diff --git a/src/vak/config/valid.toml b/src/vak/config/valid.toml index 7ae082e24..50ca6178d 100644 --- a/src/vak/config/valid.toml +++ b/src/vak/config/valid.toml @@ -52,7 +52,9 @@ patience = 4 results_dir_made_by_main_script = '/some/path/to/learncurve/' checkpoint_path = '/home/user/results_181014_194418/TweetyNet/checkpoints/' spect_scaler_path = '/home/user/results_181014_194418/spect_scaler' +train_transform_params = {'resize' = 128} train_dataset_params = {'window_size' = 80} +val_transform_params = {'resize' = 128} val_dataset_params = {'window_size' = 80} [EVAL] From d15aa445b1c335173126221301633e5e59d75d2d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:30:14 -0400 Subject: [PATCH 095/184] Remove `root_results_path` arg from vak.train.train, no longer used --- src/vak/train/train.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 737481ba5..297c4c664 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -25,7 +25,6 @@ def train( num_workers: int, checkpoint_path: str | pathlib.Path | None = None, spect_scaler_path: str | pathlib.Path | None = None, - root_results_dir: str | pathlib.Path | None = None, results_path: str | pathlib.Path | None = None, normalize_spectrograms: bool = True, shuffle: bool = True, @@ -39,9 +38,7 @@ def train( Saves checkpoint files for model, label map, and spectrogram scaler. - These are saved either in ``results_path`` - if specified, or a new directory - made inside ``root_results_dir``. + These are saved in ``results_path``. Parameters ---------- @@ -82,12 +79,8 @@ def train( e.g., one generated by a previous run of ``vak.core.train``. Used when continuing training, for example on the same dataset. Default is None. - root_results_dir : str, pathlib.Path - Root directory in which a new directory will be created - where results will be saved. results_path : str, pathlib.Path Directory where results will be saved. - If specified, this parameter overrides ``root_results_dir``. spect_key : str key for accessing spectrogram in files. Default is 's'. 
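The valid.toml entries above use TOML inline tables, which parse straight into Python dicts, which is what lets these options be passed on later as keyword arguments. A small sketch of that round trip, assuming the same third-party toml package the config code imports:

import toml

TRAIN_SNIPPET = """
[TRAIN]
train_transform_params = {'resize' = 128}
train_dataset_params = {'window_size' = 80}
"""

config = toml.loads(TRAIN_SNIPPET)
assert config["TRAIN"]["train_dataset_params"] == {"window_size": 80}
assert config["TRAIN"]["train_transform_params"] == {"resize": 128}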
timebins_key : str From 3bb75d198e1168a9a055854d31df091b49e6ceb4 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:34:51 -0400 Subject: [PATCH 096/184] Add args for train/val transform params and train/val dataset params to vak.train.train, pass into vak.train.train_parametric_umap_model --- src/vak/train/train.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 297c4c664..1360f7023 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -19,7 +19,10 @@ def train( model_name: str, model_config: dict, dataset_path: str | pathlib.Path, - window_size: int, + train_transform_params: dict | None = None, + train_dataset_params: dict | None = None, + val_transform_params: dict | None = None, + val_dataset_params: dict | None = None, batch_size: int, num_epochs: int, num_workers: int, @@ -160,7 +163,6 @@ def train( model_name=model_name, model_config=model_config, dataset_path=dataset_path, - window_size=window_size, batch_size=batch_size, num_epochs=num_epochs, num_workers=num_workers, @@ -180,6 +182,10 @@ def train( model_name=model_name, model_config=model_config, dataset_path=dataset_path, + train_transform_params=train_transform_params, + train_dataset_params=train_dataset_params, + val_transform_params=val_transform_params, + val_dataset_params=val_dataset_params, batch_size=batch_size, num_epochs=num_epochs, num_workers=num_workers, From 43caa36b4f78e757c708eb915bb8ba846d7e382a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:35:18 -0400 Subject: [PATCH 097/184] Pass args for train/val transform params and train/val dataset params into vak.train.train from vak.cli.train.train --- src/vak/cli/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vak/cli/train.py b/src/vak/cli/train.py index 6ea264e65..9f11b1a95 100644 --- a/src/vak/cli/train.py +++ b/src/vak/cli/train.py @@ -61,7 +61,10 @@ def train(toml_path): model_name=model_name, model_config=model_config, dataset_path=cfg.train.dataset_path, - window_size=cfg.dataloader.window_size, + train_transform_params=cfg.train.train_transform_params, + train_dataset_params=cfg.train.train_dataset_params, + val_transform_params=cfg.train.val_transform_params, + val_dataset_params=cfg.train.val_dataset_params, batch_size=cfg.train.batch_size, num_epochs=cfg.train.num_epochs, num_workers=cfg.train.num_workers, From 6f4781f8f9004d888d880b07beb02e9cb6acc7ed Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 14:35:52 -0400 Subject: [PATCH 098/184] Add and use args train_transform_params and val_transform_params in vak.train.train_parametric_umap_model --- src/vak/train/parametric_umap.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index 532443417..d5aec7ea1 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -35,7 +35,9 @@ def train_parametric_umap_model( batch_size: int, num_epochs: int, num_workers: int, + train_transform_params: dict | None = None, train_dataset_params: dict | None = None, + val_transform_params: dict | None = None, val_dataset_params: dict | None = None, checkpoint_path: str | pathlib.Path | None = None, root_results_dir: str | pathlib.Path | None = None, @@ -173,7 +175,7 @@ def train_parametric_umap_model( f"Total duration of training split from dataset (in s): {train_dur}", ) - transform = 
transforms.defaults.get_default_transform(model_name, "train") + transform = transforms.defaults.get_default_transform(model_name, "train", train_transform_params) if train_dataset_params is None: train_dataset_params = {} @@ -195,7 +197,7 @@ def train_parametric_umap_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: - transform = transforms.defaults.get_default_transform(model_name, "eval") + transform = transforms.defaults.get_default_transform(model_name, "eval", val_transform_params) if val_dataset_params is None: val_dataset_params = {} val_dataset = ParametricUMAPDataset.from_dataset_path( From af851ccd5b4af0c13b79a46e6111061ef5cd0e2b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:22:03 -0400 Subject: [PATCH 099/184] Put args with defaults in correct place in vak.train.train --- src/vak/train/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 1360f7023..7f7a8c1a5 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -19,13 +19,13 @@ def train( model_name: str, model_config: dict, dataset_path: str | pathlib.Path, + batch_size: int, + num_epochs: int, + num_workers: int, train_transform_params: dict | None = None, train_dataset_params: dict | None = None, val_transform_params: dict | None = None, val_dataset_params: dict | None = None, - batch_size: int, - num_epochs: int, - num_workers: int, checkpoint_path: str | pathlib.Path | None = None, spect_scaler_path: str | pathlib.Path | None = None, results_path: str | pathlib.Path | None = None, From 23e74b5b7143e87b51ca4ab13c62831f758074d6 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:22:27 -0400 Subject: [PATCH 100/184] Fix TrainConfig attribute names: train/val transform kwargs -> transform params --- src/vak/config/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/config/train.py b/src/vak/config/train.py index df3033324..6bb9d4468 100644 --- a/src/vak/config/train.py +++ b/src/vak/config/train.py @@ -119,7 +119,7 @@ class TrainConfig: default=None, ) - train_transform_kwargs = attr.ib( + train_transform_params = attr.ib( converter=converters.optional(dict), validator=validators.optional(instance_of(dict)), default=None, @@ -131,7 +131,7 @@ class TrainConfig: default=None, ) - val_transform_kwargs = attr.ib( + val_transform_params = attr.ib( converter=converters.optional(dict), validator=validators.optional(instance_of(dict)), default=None, From 9249d229e6925f24d72f4695e000b252e50dd6fb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:22:59 -0400 Subject: [PATCH 101/184] Change type annotation for ParametricUMAPModel parameter network to indicate it must be a dict --- src/vak/models/parametric_umap_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/models/parametric_umap_model.py b/src/vak/models/parametric_umap_model.py index 1e74a2ae3..540c6ad58 100644 --- a/src/vak/models/parametric_umap_model.py +++ b/src/vak/models/parametric_umap_model.py @@ -41,7 +41,7 @@ class ParametricUMAPModel(base.Model): def __init__( self, - network: torch.nn.Module | dict[str: torch.nn.Module] | None = None, + network: dict | None = None, loss: torch.nn.Module | Callable | None = None, optimizer: torch.optim.Optimizer | None = None, metrics: dict[str: Type] | None = None, From ef3034371beaae135d3b2d0b506d559d4eef0267 Mon Sep 17 
00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:23:32 -0400 Subject: [PATCH 102/184] Fix where we get Metadata from in ParametricUMAPDatasets.from_dataset_path method --- src/vak/datasets/parametric_umap/parametric_umap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index c83456d40..b6e09696e 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -127,7 +127,7 @@ def from_dataset_path(cls, import vak.datasets # import here just to make classmethod more explicit dataset_path = pathlib.Path(dataset_path) - metadata = vak.datasets.dimensionality_reduction.Metadata.from_dataset_path(dataset_path) + metadata = vak.datasets.parametric_umap.Metadata.from_dataset_path(dataset_path) dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) From 7a1288235966eaa0483501b97e56eaae54b419f5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:24:06 -0400 Subject: [PATCH 103/184] Fix how we get default kwargs for a network definition that's a dict inside Model class --- src/vak/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/models/base.py b/src/vak/models/base.py index 5f784b1c2..163ad4275 100644 --- a/src/vak/models/base.py +++ b/src/vak/models/base.py @@ -329,7 +329,7 @@ class variables elif isinstance(cls.definition.network, dict): network = {} for net_name, net_class in cls.definition.network.items(): - net_class_kwargs = network_kwargs[net_name] + net_class_kwargs = network_kwargs.get(net_name, {}) network[net_name] = net_class(**net_class_kwargs) if isinstance(cls.definition.network, dict): From e699799cbe3ea02e7b39e500f70cfd37a39958e6 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 15:32:45 -0400 Subject: [PATCH 104/184] Fix how we get ParametricUMAPModel in vak.models.get --- src/vak/models/get.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/vak/models/get.py b/src/vak/models/get.py index b9a137cf8..a749cdbde 100644 --- a/src/vak/models/get.py +++ b/src/vak/models/get.py @@ -1,6 +1,7 @@ """Function that gets an instance of a model, given its name and a configuration as a dict.""" from __future__ import annotations + import inspect from typing import Callable @@ -9,9 +10,9 @@ def get(name: str, config: dict, - num_classes: int, input_shape: tuple[int, int, int], - labelmap: dict, + num_classes: int | None = None, + labelmap: dict | None = None, post_tfm: Callable | None = None): """Get a model instance, given its name and a configuration as a :class:`dict`. @@ -76,7 +77,23 @@ def get(name: str, f"unable to determine network init arguments for model. 
Currently all models " f"in this family must have networks with parameters ``num_input_channels`` and ``num_freqbins``" ) + model = model_class.from_config(config=config, labelmap=labelmap, post_tfm=post_tfm) + elif model_family == 'ParametricUMAPModel': + encoder_init_params = list( + inspect.signature( + model_class.definition.network['encoder'].__init__ + ).parameters.keys() + ) + if ('input_shape' in encoder_init_params): + if "encoder" in config["network"]: + config["network"]["encoder"].update(input_shape=input_shape) + else: + config["network"]["encoder"] = dict(input_shape=input_shape) - model = model_class.from_config(config=config, labelmap=labelmap, post_tfm=post_tfm) + model = model_class.from_config(config=config) + else: + raise ValueError( + f"Value for ``model_family`` not recognized: {model_family}" + ) return model From 07e1b15b998c98dcdb819d702fadeedb79eaa501 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 16:04:36 -0400 Subject: [PATCH 105/184] Remove ckpt_step and patience args from call to train_parametric_umap model in vak/train/train.py --- src/vak/train/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 7f7a8c1a5..0bc4a65a3 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -193,8 +193,6 @@ def train( results_path=results_path, shuffle=shuffle, val_step=val_step, - ckpt_step=ckpt_step, - patience=patience, device=device, split=split, ) From fc49d47bbf639f0683ad6794057086f360684db4 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 19 Jul 2023 16:04:49 -0400 Subject: [PATCH 106/184] Make more fixes to train_parametric_umap_model - Add a get_trainer function instead of re-using one for frame classification models - Fix logging messages about length of dataset --- src/vak/train/parametric_umap.py | 65 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index d5aec7ea1..da0eccd48 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -7,6 +7,7 @@ import pandas as pd import torch.utils.data +import pytorch_lightning as lightning from .. import ( datasets, @@ -28,6 +29,30 @@ def get_split_dur(df: pd.DataFrame, split: str) -> float: return df[df["split"] == split]["duration"].sum() +def get_trainer(max_epochs: int, + log_save_dir: str | pathlib.Path, + device: str = 'cuda', + ) -> lightning.Trainer: + """Returns an instance of ``lightning.Trainer`` + with a default set of callbacks. + Used by ``vak.core`` functions.""" + if device == 'cuda': + accelerator = 'gpu' + else: + accelerator = None + + logger = lightning.loggers.TensorBoardLogger( + save_dir=log_save_dir + ) + + trainer = lightning.Trainer( + max_epochs=max_epochs, + accelerator=accelerator, + logger=logger, + ) + return trainer + + def train_parametric_umap_model( model_name: str, model_config: dict, @@ -44,8 +69,6 @@ def train_parametric_umap_model( results_path: str | pathlib.Path | None = None, shuffle: bool = True, val_step: int | None = None, - ckpt_step: int | None = None, - patience: int | None = None, device: str | None = None, split: str = 'train', ) -> None: @@ -104,20 +127,6 @@ def train_parametric_umap_model( That function defaults to 'cuda' if torch.cuda.is_available is True. shuffle: bool if True, shuffle training data before each epoch. Default is True. - val_step : int - Step on which to estimate accuracy using validation set. 
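The encoder_init_params check in vak.models.get above uses inspect.signature to decide whether a network accepts input_shape before injecting it; a minimal self-contained sketch of that check:

import inspect


class Encoder:
    # Stand-in for a network class whose __init__ accepts input_shape.
    def __init__(self, input_shape=(1, 128, 128)):
        self.input_shape = input_shape


init_params = list(inspect.signature(Encoder.__init__).parameters.keys())
assert "input_shape" in init_params  # so a config for this class would get input_shape injected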
- If val_step is n, then validation is carried out every time - the global step / n is a whole number, i.e., when val_step modulo the global step is 0. - Default is None, in which case no validation is done. - ckpt_step : int - Step on which to save to checkpoint file. - If ckpt_step is n, then a checkpoint is saved every time - the global step / n is a whole number, i.e., when ckpt_step modulo the global step is 0. - Default is None, in which case checkpoint is only saved at the last epoch. - patience : int - number of validation steps to wait without performance on the - validation set improving before stopping the training. - Default is None, in which case training only stops after the specified number of epochs. split : str Name of split from dataset found at ``dataset_path`` to use when training model. Default is 'train'. This parameter is used by @@ -186,7 +195,7 @@ def train_parametric_umap_model( **train_dataset_params, ) logger.info( - f"Duration of WindowDataset used for training, in seconds: {train_dataset.duration}", + f"Duration of ParametricUMAPDataset used for training, in seconds: {train_dataset.duration}", ) train_loader = torch.utils.data.DataLoader( dataset=train_dataset, @@ -206,6 +215,9 @@ def train_parametric_umap_model( transform=transform, **val_dataset_params, ) + logger.info( + f"Duration of ParametricUMAPDataset used for validation, in seconds: {val_dataset.duration}", + ) val_loader = torch.utils.data.DataLoader( dataset=val_dataset, shuffle=False, @@ -213,14 +225,6 @@ def train_parametric_umap_model( batch_size=1, num_workers=num_workers, ) - val_dur = get_split_dur(dataset_df, "val") - logger.info( - f"Total duration of validation split from dataset (in s): {val_dur}", - ) - - logger.info( - f"will measure loss on validation set every {val_step} steps of training", - ) else: val_loader = None @@ -245,16 +249,9 @@ def train_parametric_umap_model( ckpt_root.mkdir() logger.info(f"training {model_name}") max_steps = num_epochs * len(train_loader) - default_callback_kwargs = { - 'ckpt_root': ckpt_root, - 'ckpt_step': ckpt_step, - 'patience': patience, - } - trainer = get_default_trainer( - max_steps=max_steps, + trainer = get_trainer( + max_epochs=num_epochs, log_save_dir=results_model_root, - val_step=val_step, - default_callback_kwargs=default_callback_kwargs, device=device, ) train_time_start = datetime.datetime.now() From b31ef6a134369ae32f7ba060ed9f5c528ea16e0f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 08:43:26 -0400 Subject: [PATCH 107/184] Rewrite train_frame_classification_model to use train/val_transform_params and train/val_dataset_params args --- src/vak/train/frame_classification.py | 89 +++++++++++++++++---------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index d62cfa1fa..5519e3b5b 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -39,13 +39,15 @@ def train_frame_classification_model( model_name: str, model_config: dict, dataset_path: str | pathlib.Path, - window_size: int, batch_size: int, num_epochs: int, num_workers: int, + train_transform_params: dict | None = None, + train_dataset_params: dict | None = None, + val_transform_params: dict | None = None, + val_dataset_params: dict | None = None, checkpoint_path: str | pathlib.Path | None = None, spect_scaler_path: str | pathlib.Path | None = None, - root_results_dir: str | pathlib.Path | None = None, results_path: str | 
pathlib.Path | None = None, normalize_spectrograms: bool = True, shuffle: bool = True, @@ -74,9 +76,6 @@ def train_frame_classification_model( and used by the model method ``from_config``. dataset_path : str Path to dataset, a directory generated by running ``vak prep``. - window_size : int - size of windows taken from spectrograms, in number of time bins, - shown to neural networks batch_size : int number of samples per batch presented to models during training. num_epochs : int @@ -85,6 +84,24 @@ def train_frame_classification_model( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. + train_transform_params: dict, optional + Parameters for training data transform. + Passed as keyword arguments. + Optional, default is None. + train_dataset_params: dict, optional + Parameters for training dataset. + Passed as keyword arguments to + :class:`vak.datasets.frame_classification.WindowDataset`. + Optional, default is None. + val_transform_params: dict, optional + Parameters for validation data transform. + Passed as keyword arguments. + Optional, default is None. + val_dataset_params: dict, optional + Parameters for validation dataset. + Passed as keyword arguments to + :class:`vak.datasets.parametric_umap.ParametricUMAP`. + Optional, default is None. dataset_csv_path Path to csv file representing splits of dataset, e.g., such a file generated by running ``vak prep``. @@ -109,10 +126,6 @@ def train_frame_classification_model( results_path : str, pathlib.Path Directory where results will be saved. If specified, this parameter overrides ``root_results_dir``. - spect_key : str - key for accessing spectrogram in files. Default is 's'. - timebins_key : str - key for accessing vector of time bins in files. Default is 't'. device : str Device on which to work with model + data. Default is None. If None, then a device will be selected with vak.split.get_default. 
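With this rework, ``window_size`` is no longer a separate argument; it travels inside the new parameter dictionaries. A call sketch, where the keyword names follow the signature above but the dataset path, model config, and values are placeholders:

# Call sketch only; paths and the minimal model_config are assumptions, not vak defaults.
from vak.train.frame_classification import train_frame_classification_model

train_frame_classification_model(
    model_name="TweetyNet",
    model_config={"optimizer": {"lr": 0.001}},       # assumed minimal config
    dataset_path="./prep/frame-classification-dataset",
    batch_size=8,
    num_epochs=2,
    num_workers=2,
    train_dataset_params={"window_size": 88},        # was the separate ``window_size`` argument
    val_transform_params={"window_size": 88},
    results_path="./results/results_230721_120000",  # must already exist, per the change below
)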
@@ -174,15 +187,11 @@ def train_frame_classification_model( ) # ---- set up directory to save output ----------------------------------------------------------------------------- - if results_path: - results_path = pathlib.Path(results_path).expanduser().resolve() - if not results_path.is_dir(): - raise NotADirectoryError( - f"results_path not recognized as a directory: {results_path}" - ) - else: - results_path = generate_results_dir_name_as_path(root_results_dir) - results_path.mkdir() + results_path = pathlib.Path(results_path).expanduser().resolve() + if not results_path.is_dir(): + raise NotADirectoryError( + f"results_path not recognized as a directory: {results_path}" + ) frame_dur = metadata.frame_dur logger.info( @@ -190,7 +199,7 @@ def train_frame_classification_model( ) # ---------------- load training data ----------------------------------------------------------------------------- - logger.info(f"using training dataset from {dataset_path}") + logger.info(f"Using training split from dataset: {dataset_path}") # below, if we're going to train network to predict unlabeled segments, then # we need to include a class for those unlabeled segments in labelmap, # the mapping from labelset provided by user to a set of consecutive @@ -235,16 +244,22 @@ def train_frame_classification_model( "will not standardize spectrograms", ) spect_standardizer = None + + if train_transform_params is None: + train_transform_params = {} + train_transform_params.update({'spect_standardizer': spect_standardizer}) transform, target_transform = transforms.defaults.get_default_transform( - model_name, "train", transform_kwargs={'spect_standardizer': spect_standardizer} + model_name, "train", transform_kwargs=train_transform_params ) + if train_dataset_params is None: + train_dataset_params = {} train_dataset = WindowDataset.from_dataset_path( dataset_path=dataset_path, - window_size=window_size, split=split, transform=transform, target_transform=target_transform, + **train_dataset_params, ) logger.info( f"Duration of WindowDataset used for training, in seconds: {train_dataset.duration}", @@ -258,19 +273,33 @@ def train_frame_classification_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: + logger.info( + f"Will measure error on validation set every {val_step} steps of training", + ) + logger.info(f"Using validation split from dataset:\n{dataset_path}") + val_dur = get_split_dur(dataset_df, "val") + logger.info( + f"Total duration of validation split from dataset (in s): {val_dur}", + ) + + if val_transform_params is None: + val_transform_params = {} + val_transform_params.update({'spect_standardizer': spect_standardizer}) item_transform = transforms.defaults.get_default_transform( model_name, "eval", - transform_kwargs=dict( - spect_standardizer=spect_standardizer, - window_size=window_size, - return_padding_mask=True, - ) + val_transform_params ) + if val_dataset_params is None: + val_dataset_params = {} val_dataset = FramesDataset.from_dataset_path( dataset_path=dataset_path, split="val", item_transform=item_transform, + **val_dataset_params, + ) + logger.info( + f"Duration of FramesDataset used for evaluation, in seconds: {val_dataset.duration}", ) val_loader = torch.utils.data.DataLoader( dataset=val_dataset, @@ -279,14 +308,6 @@ def train_frame_classification_model( batch_size=1, num_workers=num_workers, ) - val_dur = get_split_dur(dataset_df, "val") - logger.info( - f"Total duration of validation split from dataset 
(in s): {val_dur}", - ) - - logger.info( - f"will measure error on validation set every {val_step} steps of training", - ) else: val_loader = None From 37e8923415940d7149c41ebd6a5c02ddff2d063c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 08:43:57 -0400 Subject: [PATCH 108/184] Fix train_parametric_umap_model to use train/val_transform_params --- src/vak/train/parametric_umap.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index da0eccd48..e249d6e2d 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -184,6 +184,8 @@ def train_parametric_umap_model( f"Total duration of training split from dataset (in s): {train_dur}", ) + if train_transform_params is None: + train_transform_params = {} transform = transforms.defaults.get_default_transform(model_name, "train", train_transform_params) if train_dataset_params is None: @@ -206,6 +208,8 @@ def train_parametric_umap_model( # ---------------- load validation set (if there is one) ----------------------------------------------------------- if val_step: + if val_transform_params is None: + val_transform_params = {} transform = transforms.defaults.get_default_transform(model_name, "eval", val_transform_params) if val_dataset_params is None: val_dataset_params = {} From 4ba566bba9bd644ff40cd9c5177bd77a5b67d32f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 08:44:30 -0400 Subject: [PATCH 109/184] In vak.train.train, pass train/val_transform/dataset_params into train_frame_classification_model --- src/vak/train/train.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 0bc4a65a3..20ea48e95 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -166,6 +166,10 @@ def train( batch_size=batch_size, num_epochs=num_epochs, num_workers=num_workers, + train_transform_params=train_transform_params, + train_dataset_params=train_dataset_params, + val_transform_params=val_transform_params, + val_dataset_params=val_dataset_params, checkpoint_path=checkpoint_path, spect_scaler_path=spect_scaler_path, results_path=results_path, @@ -182,13 +186,13 @@ def train( model_name=model_name, model_config=model_config, dataset_path=dataset_path, + batch_size=batch_size, + num_epochs=num_epochs, + num_workers=num_workers, train_transform_params=train_transform_params, train_dataset_params=train_dataset_params, val_transform_params=val_transform_params, val_dataset_params=val_dataset_params, - batch_size=batch_size, - num_epochs=num_epochs, - num_workers=num_workers, checkpoint_path=checkpoint_path, results_path=results_path, shuffle=shuffle, From 80b0a25d1794716e1ab000f1f91b3bb57babea0a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 09:57:30 -0400 Subject: [PATCH 110/184] Add train_dataset_params and val_transform_params to frame classification train configs in tests/data_for_tests/configs --- .../TeenyTweetyNet_train_audio_cbin_annot_notmat.toml | 9 ++++++--- ...TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml | 9 ++++++--- ...TweetyNet_train_continue_audio_cbin_annot_notmat.toml | 9 ++++++--- ...tyNet_train_continue_audio_wav_annot_birdsongrec.toml | 9 ++++++--- ...yTweetyNet_train_continue_spect_mat_annot_yarden.toml | 9 ++++++--- .../TeenyTweetyNet_train_spect_mat_annot_yarden.toml | 9 ++++++--- .../configs/TweetyNet_train_audio_cbin_annot_notmat.toml | 9 ++++++--- 
.../TweetyNet_train_audio_wav_annot_birdsongrec.toml | 9 ++++++--- ...TweetyNet_train_continue_audio_cbin_annot_notmat.toml | 9 ++++++--- ...tyNet_train_continue_audio_wav_annot_birdsongrec.toml | 9 ++++++--- .../TweetyNet_train_continue_spect_mat_annot_yarden.toml | 9 ++++++--- .../configs/TweetyNet_train_spect_mat_annot_yarden.toml | 9 ++++++--- 12 files changed, 72 insertions(+), 36 deletions(-) diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml index d7326cb8c..1d6e54c08 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_cbin_annot_notmat.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = true @@ -32,5 +29,11 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TeenyTweetyNet" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml index 4281ae76b..a27313896 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -18,9 +18,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = true @@ -33,5 +30,11 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml index c66287e40..334b5135c 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = true @@ -34,5 +31,11 @@ root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_an checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 26dcb9856..6383f99a9 100644 --- 
a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -18,9 +18,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = true @@ -35,5 +32,11 @@ root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_ann checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml index 4f70f0bad..0881927d6 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = false @@ -33,5 +30,11 @@ device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TeenyTweetyNet" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml index 4b4d36204..3a3859ec5 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_spect_mat_annot_yarden.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [TRAIN] model = "TeenyTweetyNet" normalize_spectrograms = false @@ -32,5 +29,11 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TeenyTweetyNet" +[TRAIN.train_dataset_params] +window_size = 44 + +[TRAIN.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml index 539cfc3d4..9923e48ef 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_audio_cbin_annot_notmat.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = true @@ -32,5 +29,11 @@ num_workers = 4 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/TweetyNet" +[TRAIN.train_dataset_params] 
+window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml index 7c315ffca..fbfd2269e 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -18,9 +18,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = true @@ -33,5 +30,11 @@ num_workers = 4 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TweetyNet" +[TRAIN.train_dataset_params] +window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml index c5c1ad9d6..41b78a358 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = true @@ -34,5 +31,11 @@ root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_an checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +[TRAIN.train_dataset_params] +window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 39d08cc18..480ab0230 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -18,9 +18,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = true @@ -35,5 +32,11 @@ root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_ann checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" +[TRAIN.train_dataset_params] +window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml index 8231e1400..24a362cea 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml +++ 
b/tests/data_for_tests/configs/TweetyNet_train_continue_spect_mat_annot_yarden.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = false @@ -33,5 +30,11 @@ device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TweetyNet" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" +[TRAIN.train_dataset_params] +window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml b/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml index 02997d808..6ae2cc439 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_spect_mat_annot_yarden.toml @@ -17,9 +17,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [TRAIN] model = "TweetyNet" normalize_spectrograms = false @@ -32,5 +29,11 @@ num_workers = 4 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/spect_mat_annot_yarden/TweetyNet" +[TRAIN.train_dataset_params] +window_size = 88 + +[TRAIN.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 From 32b719bb4a3fc7171c956fee3ebf802933e0dc90 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 09:59:12 -0400 Subject: [PATCH 111/184] Add train/val_transform_params to tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml --- .../ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml index 9c37fffd6..ca23fc5cd 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -29,5 +29,11 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/ConvEncoderUMAP" +[TRAIN.train_transform_params] +resize = 128 + +[TRAIN.val_transform_params] +resize = 128 + [ConvEncoderUMAP.optimizer] lr = 0.001 From b6b05ec4c01b340913b1219f19f605216aa32cec Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:15:52 -0400 Subject: [PATCH 112/184] Add train/val_transform_params adn train/val_dataset_params to vak.train.train docstring --- src/vak/train/train.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 20ea48e95..732652ed5 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -64,13 +64,24 @@ def train( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. - dataset_csv_path - Path to csv file representing splits of dataset, - e.g., such a file generated by running ``vak prep``. - This parameter is used by :func:`vak.core.learncurve` to specify - different splits to use, when generating results for a learning curve. 
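For reference, a consolidated sketch of how a user-facing [TRAIN] section reads after these config changes; option values and paths are placeholders. In TOML the sub-table form shown here is interchangeable with an inline table such as ``train_dataset_params = {'window_size' = 88}``.

# Sketch only; values are placeholders.
[TRAIN]
model = "TweetyNet"
normalize_spectrograms = true
batch_size = 8
num_epochs = 2
num_workers = 4
device = "cuda"
root_results_dir = "./results"

# window_size moves out of the removed [DATALOADER] section
# into per-split parameter tables:
[TRAIN.train_dataset_params]
window_size = 88

[TRAIN.val_transform_params]
window_size = 88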
- If this argument is specified, the csv file must be inside the directory - ``dataset_path``. + train_transform_params: dict, optional + Parameters for training data transform. + Passed as keyword arguments. + Optional, default is None. + train_dataset_params: dict, optional + Parameters for training dataset. + Passed as keyword arguments to + :class:`vak.datasets.frame_classification.WindowDataset`. + Optional, default is None. + val_transform_params: dict, optional + Parameters for validation data transform. + Passed as keyword arguments. + Optional, default is None. + val_dataset_params: dict, optional + Parameters for validation dataset. + Passed as keyword arguments to + :class:`vak.datasets.parametric_umap.ParametricUMAP`. + Optional, default is None. checkpoint_path : str, pathlib.Path path to a checkpoint file, e.g., one generated by a previous run of ``vak.core.train``. From 454b75c50fa9ac7ca2ad047f0f07171d650921fd Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:18:27 -0400 Subject: [PATCH 113/184] fixup Add train/val_transform_params adn train/val_dataset_params to vak.train.train docstring --- src/vak/train/train.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index 732652ed5..b9ae8173d 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -70,17 +70,15 @@ def train( Optional, default is None. train_dataset_params: dict, optional Parameters for training dataset. - Passed as keyword arguments to - :class:`vak.datasets.frame_classification.WindowDataset`. + Passed as keyword arguments. Optional, default is None. - val_transform_params: dict, optional - Parameters for validation data transform. + transform_params: dict, optional + Parameters for data transform. Passed as keyword arguments. Optional, default is None. - val_dataset_params: dict, optional - Parameters for validation dataset. - Passed as keyword arguments to - :class:`vak.datasets.parametric_umap.ParametricUMAP`. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. Optional, default is None. checkpoint_path : str, pathlib.Path path to a checkpoint file, From 0019b5ff3b446a3a7fbbce8415068471c7c47208 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:19:43 -0400 Subject: [PATCH 114/184] Fix definitioin in train_frame_classification_model docstring --- src/vak/train/frame_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index 5519e3b5b..645236a3a 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -100,7 +100,7 @@ def train_frame_classification_model( val_dataset_params: dict, optional Parameters for validation dataset. Passed as keyword arguments to - :class:`vak.datasets.parametric_umap.ParametricUMAP`. + :class:`vak.datasets.frame_classification.FramesDataset`. Optional, default is None. 
dataset_csv_path Path to csv file representing splits of dataset, From 0e3fa9520f515f62ee9e56f3a46b24e3fb9d18c2 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:40:48 -0400 Subject: [PATCH 115/184] Add transform/dataset_param options to EVAL and PREDICT sections in valid.toml --- src/vak/config/valid.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/vak/config/valid.toml b/src/vak/config/valid.toml index 50ca6178d..5c8ce5902 100644 --- a/src/vak/config/valid.toml +++ b/src/vak/config/valid.toml @@ -68,6 +68,8 @@ num_workers = 4 device = 'cuda' spect_scaler_path = '/home/user/results_181014_194418/spect_scaler' post_tfm_kwargs = {'majority_vote' = true, 'min_segment_dur' = 0.01} +transform_params = {'resize' = 128} +dataset_params = {'window_size' = 80} [LEARNCURVE] model = 'TweetyNet' @@ -100,3 +102,5 @@ spect_scaler_path = '/home/user/results_181014_194418/spect_scaler' min_segment_dur = 0.004 majority_vote = false save_net_outputs = false +transform_params = {'resize' = 128} +dataset_params = {'window_size' = 80} From 5c44de2d4716d9c8eac844874ca7b5b2a970cdaf Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:46:25 -0400 Subject: [PATCH 116/184] Add transform/dataset_params to EvalConfig --- src/vak/config/eval.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/vak/config/eval.py b/src/vak/config/eval.py index e752bce70..98f8125b6 100644 --- a/src/vak/config/eval.py +++ b/src/vak/config/eval.py @@ -99,6 +99,14 @@ class EvalConfig: a float value for ``min_segment_dur``. See the docstring of the transform for more details on these arguments and how they work. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. """ # required, external files checkpoint_path = attr.ib(converter=expanded_user_path) @@ -133,3 +141,15 @@ class EvalConfig: # optional, data loader num_workers = attr.ib(validator=instance_of(int), default=2) device = attr.ib(validator=instance_of(str), default=device.get_default()) + + transform_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) + + dataset_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) From b1aae259411d6283e681b562113eca29772119a5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:46:39 -0400 Subject: [PATCH 117/184] Add transform/dataset_params to PredictConfig --- src/vak/config/predict.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/vak/config/predict.py b/src/vak/config/predict.py index d301ff07d..fb221c82e 100644 --- a/src/vak/config/predict.py +++ b/src/vak/config/predict.py @@ -68,6 +68,14 @@ class PredictConfig: spectrogram with `spect_path` filename `gy6or6_032312_081416.npz`, and the network is `TweetyNet`, then the net output file will be `gy6or6_032312_081416.tweetynet.output.npz`. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. 
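The new ``transform_params`` and ``dataset_params`` attributes on ``EvalConfig`` and ``PredictConfig`` use the same attrs idiom: an optional ``dict`` that defaults to ``None``. A self-contained illustration (``ExampleConfig`` is made up for this sketch, not part of vak):

import attr
from attr import converters, validators
from attr.validators import instance_of


@attr.s
class ExampleConfig:
    transform_params = attr.ib(
        converter=converters.optional(dict),             # leaves None alone, otherwise coerces to dict
        validator=validators.optional(instance_of(dict)),
        default=None,
    )


print(ExampleConfig())                                      # ExampleConfig(transform_params=None)
print(ExampleConfig(transform_params={"window_size": 88}))  # accepted as a dict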
""" # required, external files @@ -109,3 +117,15 @@ class PredictConfig: ) majority_vote = attr.ib(validator=instance_of(bool), default=True) save_net_outputs = attr.ib(validator=instance_of(bool), default=False) + + transform_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) + + dataset_params = attr.ib( + converter=converters.optional(dict), + validator=validators.optional(instance_of(dict)), + default=None, + ) From 8764b1cd7f9e0f5eb1e04e99ed477e583efa0ce2 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:48:56 -0400 Subject: [PATCH 118/184] Add/use transform/dataset_params in eval_frame_classification_model function --- src/vak/eval/frame_classification.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/vak/eval/frame_classification.py b/src/vak/eval/frame_classification.py index a2133d055..0315f6905 100644 --- a/src/vak/eval/frame_classification.py +++ b/src/vak/eval/frame_classification.py @@ -31,8 +31,9 @@ def eval_frame_classification_model( checkpoint_path: str | pathlib.Path, labelmap_path: str | pathlib.Path, output_dir: str | pathlib.Path, - window_size: int, num_workers: int, + transform_params: dict | None = None, + dataset_params: dict | None = None, split: str = "test", spect_scaler_path: str | pathlib.Path = None, post_tfm_kwargs: dict | None = None, @@ -54,9 +55,6 @@ def eval_frame_classification_model( path to directory with checkpoint files saved by Torch, to reload model output_dir : str, pathlib.Path Path to location where .csv files with evaluation metrics should be saved. - window_size : int - size of windows taken from spectrograms, in number of time bins, - shown to neural networks labelmap_path : str, pathlib.Path path to 'labelmap.json' file. models : list @@ -66,6 +64,14 @@ def eval_frame_classification_model( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. split : str split of dataset on which model should be evaluated. One of {'train', 'val', 'test'}. Default is 'test'. 
@@ -142,21 +148,21 @@ def eval_frame_classification_model( logger.info(f"loading labelmap from path: {labelmap_path}") with labelmap_path.open("r") as f: labelmap = json.load(f) - + if transform_params is None: + transform_params = {} + transform_params.update({'spect_standardizer': spect_standardizer}) item_transform = transforms.defaults.get_default_transform( model_name, "eval", - transform_kwargs=dict( - spect_standardizer=spect_standardizer, - window_size=window_size, - return_padding_mask=True, - ) + transform_params ) - + if dataset_params is None: + dataset_params = {} val_dataset = FramesDataset.from_dataset_path( dataset_path=dataset_path, split=split, item_transform=item_transform, + **dataset_params, ) val_loader = torch.utils.data.DataLoader( dataset=val_dataset, From 63e7e3a4edb5bfb262c5faaa40e885c6e6d287ac Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 10:49:17 -0400 Subject: [PATCH 119/184] Add/use transform/dataset_params in vak.eval.eval function -- pass into eval_frame_classification_model --- src/vak/eval/eval.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/vak/eval/eval.py b/src/vak/eval/eval.py index a14b3ac30..3f253e8d3 100644 --- a/src/vak/eval/eval.py +++ b/src/vak/eval/eval.py @@ -21,8 +21,9 @@ def eval( checkpoint_path: str | pathlib.Path, labelmap_path: str | pathlib.Path, output_dir: str | pathlib.Path, - window_size: int, num_workers: int, + transform_params: dict | None = None, + dataset_params: dict | None = None, split: str = "test", spect_scaler_path: str | pathlib.Path = None, post_tfm_kwargs: dict | None = None, @@ -44,9 +45,6 @@ def eval( path to directory with checkpoint files saved by Torch, to reload model output_dir : str, pathlib.Path Path to location where .csv files with evaluation metrics should be saved. - window_size : int - size of windows taken from spectrograms, in number of time bins, - shown to neural networks labelmap_path : str, pathlib.Path path to 'labelmap.json' file. models : list @@ -56,6 +54,14 @@ def eval( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. split : str split of dataset on which model should be evaluated. One of {'train', 'val', 'test'}. Default is 'test'. 
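As with training, evaluation now takes the window size (and any other transform arguments) through a dictionary. A call sketch for the updated ``eval_frame_classification_model``; every path and value is a placeholder that must point at real ``prep``/``train`` outputs:

# Call sketch only; paths and the minimal model_config are assumptions.
from vak.eval.frame_classification import eval_frame_classification_model

eval_frame_classification_model(
    model_name="TweetyNet",
    model_config={"optimizer": {"lr": 0.001}},   # assumed minimal config
    dataset_path="./prep/eval-dataset",
    checkpoint_path="./results/TweetyNet/checkpoints/max-val-acc-checkpoint.pt",
    labelmap_path="./results/labelmap.json",
    output_dir="./eval_output",
    num_workers=2,
    transform_params={"window_size": 88},        # replaces the old ``window_size`` argument
    spect_scaler_path="./results/StandardizeSpect",
)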
@@ -122,6 +128,8 @@ def eval( output_dir=output_dir, window_size=window_size, num_workers=num_workers, + transform_params=transform_params, + dataset_params=dataset_params, split=split, spect_scaler_path=spect_scaler_path, device=device, From d665154d76e5a1996efbd240c9882b29102212b5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 11:03:43 -0400 Subject: [PATCH 120/184] Add/use transform/dataset_params in predict_frame_classification_model function --- src/vak/predict/frame_classification.py | 26 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index 3276cda55..5c4a7f63c 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -36,8 +36,9 @@ def predict_with_frame_classification_model( dataset_path, checkpoint_path, labelmap_path, - window_size, num_workers=2, + transform_params: dict | None = None, + dataset_params: dict | None = None, timebins_key="t", spect_scaler_path=None, device=None, @@ -63,12 +64,17 @@ def predict_with_frame_classification_model( path to directory with checkpoint files saved by Torch, to reload model labelmap_path : str path to 'labelmap.json' file. - window_size : int - size of windows taken from spectrograms, in number of time bins, - shown to neural networks num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. spect_key : str key for accessing spectrogram in files. Default is 's'. 
timebins_key : str @@ -149,14 +155,13 @@ def predict_with_frame_classification_model( logger.info(f"Not loading SpectScaler, no path was specified") spect_standardizer = None + if transform_params is None: + transform_params = {} + transform_params.update({'spect_standardizer': spect_standardizer}) item_transform = transforms.defaults.get_default_transform( model_name, "predict", - transform_kwargs=dict( - spect_standardizer=spect_standardizer, - window_size=window_size, - return_padding_mask=True, - ) + transform_params ) logger.info(f"loading labelmap from path: {labelmap_path}") @@ -167,10 +172,13 @@ def predict_with_frame_classification_model( dataset_csv_path = dataset_path / metadata.dataset_csv_filename logger.info(f"loading dataset to predict from csv path: {dataset_csv_path}") + if dataset_params is None: + dataset_params = {} pred_dataset = FramesDataset.from_dataset_path( dataset_path=dataset_path, split="predict", item_transform=item_transform, + **dataset_params ) pred_loader = torch.utils.data.DataLoader( From a76a229992c75db128c762e0edd0a52a47502303 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 11:05:15 -0400 Subject: [PATCH 121/184] Add/use transform/dataset_params in vak.predict.predict function -- pass into predict_frame_classification_model --- src/vak/predict/predict.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/vak/predict/predict.py b/src/vak/predict/predict.py index afe419418..755600aed 100644 --- a/src/vak/predict/predict.py +++ b/src/vak/predict/predict.py @@ -24,8 +24,9 @@ def predict( dataset_path: str | pathlib.Path, checkpoint_path: str | pathlib.Path, labelmap_path: str | pathlib.Path, - window_size: int, num_workers: int = 2, + transform_params: dict | None = None, + dataset_params: dict | None = None, timebins_key: str = "t", spect_scaler_path: str | pathlib.Path | None = None, device: str | None = None, @@ -57,8 +58,14 @@ def predict( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. - spect_key : str - key for accessing spectrogram in files. Default is 's'. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. timebins_key : str key for accessing vector of time bins in files. Default is 't'. device : str @@ -142,8 +149,9 @@ def predict( dataset_path=dataset_path, checkpoint_path=checkpoint_path, labelmap_path=labelmap_path, - window_size=window_size, num_workers=num_workers, + transform_params=transform_params, + dataset_params=dataset_params, timebins_key=timebins_key, spect_scaler_path=spect_scaler_path, device=device, From 322c2ff7df5582f316685c46c1be7fe3edb68a53 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 11:05:35 -0400 Subject: [PATCH 122/184] Fix definition in vak/train/train.py docstring --- src/vak/train/train.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index b9ae8173d..fbef9daa3 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -72,16 +72,16 @@ def train( Parameters for training dataset. Passed as keyword arguments. Optional, default is None. - transform_params: dict, optional - Parameters for data transform. + val_transform_params: dict, optional + Parameters for validation data transform. 
Passed as keyword arguments. Optional, default is None. - dataset_params: dict, optional - Parameters for dataset. + val_dataset_params: dict, optional + Parameters for validation dataset. Passed as keyword arguments. Optional, default is None. checkpoint_path : str, pathlib.Path - path to a checkpoint file, + Path to a checkpoint file, e.g., one generated by a previous run of ``vak.core.train``. If specified, this checkpoint will be loaded into model. Used when continuing training. From 4791e7f738d241162140a1275595455324832c4c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 21 Jul 2023 11:07:45 -0400 Subject: [PATCH 123/184] Remove src/vak/config/dataloader.py --- src/vak/config/dataloader.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 src/vak/config/dataloader.py diff --git a/src/vak/config/dataloader.py b/src/vak/config/dataloader.py deleted file mode 100644 index 65570ce0f..000000000 --- a/src/vak/config/dataloader.py +++ /dev/null @@ -1,16 +0,0 @@ -import attr -from attr.validators import instance_of - - -@attr.s -class DataLoaderConfig: - """represents options for DataLoaders specified in config.toml file - - Attributes - ---------- - window_size : int - size of windows taken from spectrograms, in number of time bins, - shonw to neural networks - """ - - window_size = attr.ib(converter=int, validator=instance_of(int), default=88) From 61d940384992521e38e91d24510fb9638aec0fff Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 18:44:01 -0400 Subject: [PATCH 124/184] Remove use of Dataloader in vak/config --- src/vak/config/config.py | 7 ------- src/vak/config/parse.py | 2 -- src/vak/config/valid.toml | 3 --- 3 files changed, 12 deletions(-) diff --git a/src/vak/config/config.py b/src/vak/config/config.py index 9a2dbcca0..733163c8c 100644 --- a/src/vak/config/config.py +++ b/src/vak/config/config.py @@ -20,8 +20,6 @@ class Config: represents ``[PREP]`` section of config.toml file spect_params : vak.config.spect_params.SpectParamsConfig represents ``[SPECT_PARAMS]`` section of config.toml file - dataloader : vak.config.dataloader.DataLoaderConfig - represents ``[DATALOADER]`` section of config.toml file train : vak.config.train.TrainConfig represents ``[TRAIN]`` section of config.toml file eval : vak.config.eval.EvalConfig @@ -31,14 +29,9 @@ class Config: learncurve : vak.config.learncurve.LearncurveConfig represents ``[LEARNCURVE]`` section of config.toml file """ - spect_params = attr.ib( validator=instance_of(SpectParamsConfig), default=SpectParamsConfig() ) - dataloader = attr.ib( - validator=instance_of(DataLoaderConfig), default=DataLoaderConfig() - ) - prep = attr.ib(validator=optional(instance_of(PrepConfig)), default=None) train = attr.ib(validator=optional(instance_of(TrainConfig)), default=None) eval = attr.ib(validator=optional(instance_of(EvalConfig)), default=None) diff --git a/src/vak/config/parse.py b/src/vak/config/parse.py index da50be076..71e8b28c1 100644 --- a/src/vak/config/parse.py +++ b/src/vak/config/parse.py @@ -15,7 +15,6 @@ SECTION_CLASSES = { - "DATALOADER": DataLoaderConfig, "EVAL": EvalConfig, "LEARNCURVE": LearncurveConfig, "PREDICT": PredictConfig, @@ -25,7 +24,6 @@ } REQUIRED_OPTIONS = { - "DATALOADER": None, "EVAL": [ "checkpoint_path", "labelmap_path", diff --git a/src/vak/config/valid.toml b/src/vak/config/valid.toml index 5c8ce5902..de782a74b 100644 --- a/src/vak/config/valid.toml +++ b/src/vak/config/valid.toml @@ -33,9 +33,6 @@ freqbins_key = 'f' timebins_key = 't' audio_path_key = 
'audio_path' -[DATALOADER] -window_size = 88 - [TRAIN] model = 'TweetyNet' root_results_dir = './tests/test_data/results/train' From fe3370d2822716414a1f64a481aff3e2d3478d02 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 19:20:40 -0400 Subject: [PATCH 125/184] Add transform_params options and remove DATALOADER sections in eval/predict configs --- .../TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml | 6 +++--- .../TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml | 3 --- .../TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml | 6 +++--- .../TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml | 6 +++--- .../configs/TweetyNet_eval_audio_cbin_annot_notmat.toml | 6 +++--- .../configs/TweetyNet_predict_audio_cbin_annot_notmat.toml | 6 +++--- .../TweetyNet_predict_audio_wav_annot_birdsongrec.toml | 6 +++--- 7 files changed, 18 insertions(+), 21 deletions(-) diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml index aba64b569..c523b7068 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml @@ -14,9 +14,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [EVAL] checkpoint_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TeenyTweetyNet/checkpoints/max-val-acc-checkpoint.pt" labelmap_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/labelmap.json" @@ -27,5 +24,8 @@ device = "cuda" spect_scaler_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/TeenyTweetyNet" +[EVAL.transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml index 56204818d..d8421754a 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -19,9 +19,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [LEARNCURVE] model = "TeenyTweetyNet" normalize_spectrograms = true diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml index e93951be6..4489c8960 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml @@ -12,9 +12,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" checkpoint_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TeenyTweetyNet/checkpoints/max-val-acc-checkpoint.pt" @@ -26,5 +23,8 @@ device = "cuda" output_dir = 
"./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/TeenyTweetyNet" annot_csv_filename = "bl26lb16.041912.annot.csv" +[PREDICT.transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml index 248eb22b0..776802c55 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -12,9 +12,6 @@ freq_cutoffs = [ 1000, 8000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 44 - [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" checkpoint_path = "~/Documents/repos/coding/birdsong/TeenyTweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TeenyTweetyNet/checkpoints/max-val-acc-checkpoint.pt" @@ -26,5 +23,8 @@ device = "cuda" output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TeenyTweetyNet" annot_csv_filename = "Bird0.annot.csv" +[PREDICT.transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml index e174115f5..295c159f4 100644 --- a/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_eval_audio_cbin_annot_notmat.toml @@ -14,9 +14,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [EVAL] checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/labelmap.json" @@ -27,5 +24,8 @@ device = "cuda" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/TweetyNet" +[EVAL.transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml index 52a3dfe50..5206bc579 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml @@ -12,9 +12,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" @@ -26,5 +23,8 @@ device = "cuda" output_dir = "./tests/data_for_tests/generated/results/predict/audio_cbin_annot_notmat/TweetyNet" annot_csv_filename = "bl26lb16.041912.annot.csv" +[PREDICT.transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml 
b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml index 17464a7c1..cbc6358d5 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -12,9 +12,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [PREDICT] spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" @@ -26,5 +23,8 @@ device = "cuda" output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TweetyNet" annot_csv_filename = "Bird0.annot.csv" +[PREDICT.transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 From a0bf643111532030e96a08db33d96bb2413d1d32 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 19:28:30 -0400 Subject: [PATCH 126/184] Add train/val_transform_params and train/val_dataset_params to vak.learncurve.frame_classification --- src/vak/learncurve/frame_classification.py | 68 ++++++++++++++-------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/src/vak/learncurve/frame_classification.py b/src/vak/learncurve/frame_classification.py index 43d868dab..7e765d69e 100644 --- a/src/vak/learncurve/frame_classification.py +++ b/src/vak/learncurve/frame_classification.py @@ -24,11 +24,13 @@ def learning_curve_for_frame_classification_model( model_name: str, model_config: dict, dataset_path: str | pathlib.Path, - window_size: int, batch_size: int, num_epochs: int, num_workers: int, - root_results_dir: str | pathlib.Path | None = None, + train_transform_params: dict | None = None, + train_dataset_params: dict | None = None, + val_transform_params: dict | None = None, + val_dataset_params: dict | None = None, results_path: str | pathlib.Path = None, post_tfm_kwargs: dict | None =None, normalize_spectrograms: bool = True, @@ -69,12 +71,28 @@ def learning_curve_for_frame_classification_model( num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. - root_results_dir : str, pathlib.Path - Root directory in which a new directory will be created where results will be saved. + train_transform_params: dict, optional + Parameters for training data transform. + Passed as keyword arguments. + Optional, default is None. + train_dataset_params: dict, optional + Parameters for training dataset. + Passed as keyword arguments to + :class:`vak.datasets.frame_classification.WindowDataset`. + Optional, default is None. + val_transform_params: dict, optional + Parameters for validation data transform. + Passed as keyword arguments. + Optional, default is None. + val_dataset_params: dict, optional + Parameters for validation dataset. + Passed as keyword arguments to + :class:`vak.datasets.frame_classification.FramesDataset`. + Optional, default is None. results_path : str, pathlib.Path - Directory where results will be saved. If specified, this parameter overrides root_results_dir. + Directory where results will be saved. previous_run_path : str, Path - path to directory containing dataset .csv files + Path to directory containing dataset .csv files that represent subsets of training set, created by a previous run of ``vak.core.learncurve.learning_curve``. 
Typically directory will have a name like ``results_{timestamp}`` @@ -139,15 +157,11 @@ def learning_curve_for_frame_classification_model( ) # ---- set up directory to save output ----------------------------------------------------------------------------- - if results_path: - results_path = expanded_user_path(results_path) - if not results_path.is_dir(): - raise NotADirectoryError( - f"results_path not recognized as a directory: {results_path}" - ) - else: - results_path = generate_results_dir_name_as_path(root_results_dir) - results_path.mkdir() + results_path = expanded_user_path(results_path) + if not results_path.is_dir(): + raise NotADirectoryError( + f"results_path not recognized as a directory: {results_path}" + ) logger.info(f"Saving results to: {results_path}") @@ -194,10 +208,13 @@ def learning_curve_for_frame_classification_model( model_name, model_config, dataset_path, - window_size, batch_size, num_epochs, num_workers, + train_transform_params, + train_dataset_params, + val_transform_params, + val_dataset_params, results_path=results_path_this_replicate, normalize_spectrograms=normalize_spectrograms, shuffle=shuffle, @@ -258,15 +275,16 @@ def learning_curve_for_frame_classification_model( model_name, model_config, dataset_path, - checkpoint_path=ckpt_path, - labelmap_path=labelmap_path, - output_dir=results_path_this_replicate, - window_size=window_size, - num_workers=num_workers, - split="test", - spect_scaler_path=spect_scaler_path, - post_tfm_kwargs=post_tfm_kwargs, - device=device, + ckpt_path, + labelmap_path, + results_path_this_replicate, + num_workers, + val_transform_params, + val_dataset_params, + "test", + spect_scaler_path, + post_tfm_kwargs, + device, ) # ---- make a csv for analysis ------------------------------------------------------------------------------------- From 1c02beed0ab10f07cb3b43faf2c592feb3e2fab9 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 19:28:43 -0400 Subject: [PATCH 127/184] Add train/val_transform_params and train/val_dataset_params to vak.learncurve.learncurve --- src/vak/learncurve/learncurve.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/vak/learncurve/learncurve.py b/src/vak/learncurve/learncurve.py index 0d98674de..2200593a2 100644 --- a/src/vak/learncurve/learncurve.py +++ b/src/vak/learncurve/learncurve.py @@ -18,10 +18,13 @@ def learning_curve( model_name: str, model_config: dict, dataset_path: str | pathlib.Path, - window_size: int, batch_size: int, num_epochs: int, num_workers: int, + train_transform_params: dict | None = None, + train_dataset_params: dict | None = None, + val_transform_params: dict | None = None, + val_dataset_params: dict | None = None, results_path: str | pathlib.Path = None, post_tfm_kwargs: dict | None =None, normalize_spectrograms: bool = True, @@ -83,10 +86,6 @@ def learning_curve( a float value for ``min_segment_dur``. See the docstring of the transform for more details on these arguments and how they work. - spect_key : str - key for accessing spectrogram in files. Default is 's'. - timebins_key : str - key for accessing vector of time bins in files. Default is 't'. device : str Device on which to work with model + data. Default is None. If None, then a device will be selected with vak.device.get_default. 
@@ -131,10 +130,13 @@ def learning_curve( model_name=model_name, model_config=model_config, dataset_path=dataset_path, - window_size=window_size, batch_size=batch_size, num_epochs=num_epochs, num_workers=num_workers, + train_transform_params=train_transform_params, + train_dataset_params=train_dataset_params, + val_transform_params=val_transform_params, + val_dataset_params=val_dataset_params, results_path=results_path, post_tfm_kwargs=post_tfm_kwargs, normalize_spectrograms=normalize_spectrograms, From bfa77927c132492c6b0f27b353510ea08f8da482 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 19:33:20 -0400 Subject: [PATCH 128/184] Remove import of dataloader in config/config.py --- src/vak/config/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/config/config.py b/src/vak/config/config.py index 733163c8c..07ad130e2 100644 --- a/src/vak/config/config.py +++ b/src/vak/config/config.py @@ -1,7 +1,6 @@ import attr from attr.validators import instance_of, optional -from .dataloader import DataLoaderConfig from .eval import EvalConfig from .learncurve import LearncurveConfig from .predict import PredictConfig From dcddbe2b91bb914a513bf81a6c3774c57c4e011d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 20:50:21 -0400 Subject: [PATCH 129/184] Finish remove dataloader imports from config sub-package --- src/vak/config/__init__.py | 1 - src/vak/config/parse.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/vak/config/__init__.py b/src/vak/config/__init__.py index bad9749fc..33172eabd 100644 --- a/src/vak/config/__init__.py +++ b/src/vak/config/__init__.py @@ -1,7 +1,6 @@ """sub-package that parses config.toml files and returns config object""" from . import ( config, - dataloader, eval, learncurve, model, diff --git a/src/vak/config/parse.py b/src/vak/config/parse.py index 71e8b28c1..0a2316038 100644 --- a/src/vak/config/parse.py +++ b/src/vak/config/parse.py @@ -4,7 +4,6 @@ from toml.decoder import TomlDecodeError from .config import Config -from .dataloader import DataLoaderConfig from .eval import EvalConfig from .learncurve import LearncurveConfig from .predict import PredictConfig From bc1bde9d32cd7e33a4de6a51a46dbad74d92161e Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 20:50:46 -0400 Subject: [PATCH 130/184] Filter out NumbaDeprecationWarnings triggered by umap --- src/vak/datasets/parametric_umap/parametric_umap.py | 11 ++++++++++- src/vak/nn/loss/umap.py | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index b6e09696e..b6b8e74f8 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -1,12 +1,21 @@ +from __future__ import annotations + import pathlib +import warnings import numpy as np import pandas as pd from pynndescent import NNDescent from sklearn.utils import check_random_state -from umap.umap_ import fuzzy_simplicial_set from torch.utils.data import Dataset +# Ignore warnings from Numba deprecation: +# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit +# Numba is required by UMAP. 
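# Note: installing the filter before the `umap.umap_` import that follows means
# warnings raised while `umap` itself is imported are suppressed as well; the
# same filter-then-import pattern is repeated in src/vak/nn/loss/umap.py below.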
+from numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) +from umap.umap_ import fuzzy_simplicial_set + def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state: int | None = None, max_candidates=60, verbose=True): random_state = check_random_state(None) if random_state == None else random_state diff --git a/src/vak/nn/loss/umap.py b/src/vak/nn/loss/umap.py index 72748d25b..7ceaba829 100644 --- a/src/vak/nn/loss/umap.py +++ b/src/vak/nn/loss/umap.py @@ -1,8 +1,16 @@ """Parametric UMAP loss function.""" from __future__ import annotations +import warnings + import torch from torch.nn.functional import mse_loss + +# Ignore warnings from Numba deprecation: +# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit +# Numba is required by UMAP. +from numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) from umap.umap_ import find_ab_params From 2aca82908d1da876835051b6a0e0f6f0e0baa82c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 20:59:22 -0400 Subject: [PATCH 131/184] Remove DATALOADER section from two learncurve configs --- ...eenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml | 6 ++++++ .../TweetyNet_learncurve_audio_cbin_annot_notmat.toml | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml index d8421754a..b8a83600d 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -31,5 +31,11 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/TeenyTweetyNet" +[LEARNCURVE.train_dataset_params] +window_size = 44 + +[LEARNCURVE.val_transform_params] +window_size = 44 + [TeenyTweetyNet.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml index c0ab2e446..01ccff182 100644 --- a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -19,9 +19,6 @@ freq_cutoffs = [ 500, 10000,] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - [LEARNCURVE] model = "TweetyNet" normalize_spectrograms = true @@ -34,5 +31,11 @@ num_workers = 4 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/learncurve/audio_cbin_annot_notmat/TweetyNet" +[LEARNCURVE.train_dataset_params] +window_size = 88 + +[LEARNCURVE.val_transform_params] +window_size = 88 + [TweetyNet.optimizer] lr = 0.001 From 46d5ecaff4c35d0992ed5d28f3bdc37939812724 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 23 Jul 2023 20:59:38 -0400 Subject: [PATCH 132/184] Remove DATALOADER section from 4 other configs in data_for_tests/configs --- .../ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml | 3 --- tests/data_for_tests/configs/invalid_option_config.toml | 6 +++--- tests/data_for_tests/configs/invalid_section_config.toml | 3 --- .../configs/invalid_train_and_learncurve_config.toml | 3 --- 4 files changed, 3 
insertions(+), 12 deletions(-) diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml index ca23fc5cd..be8762317 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -14,9 +14,6 @@ fft_size = 512 step_size = 32 transform_type = "log_spect_plus_one" -[DATALOADER] -window_size = 44 - [TRAIN] model = "ConvEncoderUMAP" normalize_spectrograms = true diff --git a/tests/data_for_tests/configs/invalid_option_config.toml b/tests/data_for_tests/configs/invalid_option_config.toml index c196c3319..5504fbf38 100644 --- a/tests/data_for_tests/configs/invalid_option_config.toml +++ b/tests/data_for_tests/configs/invalid_option_config.toml @@ -20,9 +20,6 @@ freq_cutoffs = [500, 10000] thresh = 6.25 transform_type = 'log_spect' -[DATALOADER] -window_size = 88 - [TRAIN] model = 'TweetyNet' root_results_dir = '/home/user/data/subdir/' @@ -33,5 +30,8 @@ val_error_step = 1 checkpoint_step = 1 save_only_single_checkpoint_file = true +[TRAIN.dataset_params] +window_size = 88 + [TweetyNet.optimizer] learning_rate = 0.001 diff --git a/tests/data_for_tests/configs/invalid_section_config.toml b/tests/data_for_tests/configs/invalid_section_config.toml index c0d3ce80a..f77cde3a3 100644 --- a/tests/data_for_tests/configs/invalid_section_config.toml +++ b/tests/data_for_tests/configs/invalid_section_config.toml @@ -20,9 +20,6 @@ freq_cutoffs = [500, 10000] thresh = 6.25 transform_type = 'log_spect' -[DATALOADER] -window_size = 88 - [TRIAN] # <-- invalid section 'TRIAN' (instead of 'TRAIN') model = 'TweetyNet' root_results_dir = '/home/user/data/subdir/' diff --git a/tests/data_for_tests/configs/invalid_train_and_learncurve_config.toml b/tests/data_for_tests/configs/invalid_train_and_learncurve_config.toml index ad9620642..e2809aca6 100644 --- a/tests/data_for_tests/configs/invalid_train_and_learncurve_config.toml +++ b/tests/data_for_tests/configs/invalid_train_and_learncurve_config.toml @@ -17,9 +17,6 @@ freq_cutoffs = [500, 10000] thresh = 6.25 transform_type = "log_spect" -[DATALOADER] -window_size = 88 - # this .toml file should cause 'vak.config.parse.from_toml' to raise a ValueError # because it defines both a TRAIN and a LEARNCURVE section [TRAIN] From 07736940652ec829cfa5ba84ca33370d4e023821 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 20:48:55 -0400 Subject: [PATCH 133/184] Add name 'use_result_from_config' in configs.json --- tests/data_for_tests/configs/configs.json | 144 ++++++++++++---------- 1 file changed, 82 insertions(+), 62 deletions(-) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index e61632a1d..09a205223 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -1,13 +1,14 @@ { "config_metadata": [ { - "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", + "filename": "TweetyNet_train_audio_cbin_annot_notmat.toml", "model": "TweetyNet", - "config_type": "eval", + "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": null + "use_dataset_from_config": null, + "use_result_from_config": null }, { "filename": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml", @@ -16,34 +17,38 @@ "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - 
"use_dataset_from_config": null + "use_dataset_from_config": null, + "use_result_from_config": null }, { - "filename": "TweetyNet_predict_audio_cbin_annot_notmat.toml", + "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", "model": "TweetyNet", - "config_type": "predict", + "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": null + "use_dataset_from_config": null, + "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", + "filename": "TweetyNet_predict_audio_cbin_annot_notmat.toml", "model": "TweetyNet", "config_type": "predict", - "audio_format": "wav", + "audio_format": "cbin", "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null + "annot_format": "notmat", + "use_dataset_from_config": null, + "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "filename": "TweetyNet_train_continue_audio_cbin_annot_notmat.toml", "model": "TweetyNet", - "config_type": "train", + "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": null + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", @@ -52,25 +57,18 @@ "audio_format": "wav", "spect_format": null, "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null + "use_dataset_from_config": null, + "use_result_from_config": null }, { - "filename": "TweetyNet_train_spect_mat_annot_yarden.toml", - "model": "TweetyNet", - "config_type": "train", - "audio_format": null, - "spect_format": "mat", - "annot_format": "yarden", - "use_dataset_from_config": null - }, - { - "filename": "TweetyNet_train_continue_audio_cbin_annot_notmat.toml", + "filename": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", "model": "TweetyNet", - "config_type": "train_continue", - "audio_format": "cbin", + "config_type": "predict", + "audio_format": "wav", "spect_format": null, - "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" + "annot_format": "birdsong-recognition-dataset", + "use_dataset_from_config": null, + "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" }, { "filename": "TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", @@ -79,7 +77,18 @@ "audio_format": "wav", "spect_format": null, "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", + "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" + }, + { + "filename": "TweetyNet_train_spect_mat_annot_yarden.toml", + "model": "TweetyNet", + "config_type": "train", + "audio_format": null, + "spect_format": "mat", + "annot_format": "yarden", + "use_dataset_from_config": null, + "use_result_from_config": null }, { "filename": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", @@ -91,13 +100,14 @@ "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { - "filename": "TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml", + "filename": 
"TeenyTweetyNet_train_audio_cbin_annot_notmat.toml", "model": "TeenyTweetyNet", - "config_type": "eval", + "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_eval_audio_cbin_annot_notmat.toml" + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "use_result_from_config": null }, { "filename": "TeenyTweetyNet_learncurve_audio_cbin_annot_notmat.toml", @@ -106,34 +116,38 @@ "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml" + "use_dataset_from_config": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml", + "use_result_from_config": null }, { - "filename": "TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml", + "filename": "TeenyTweetyNet_eval_audio_cbin_annot_notmat.toml", "model": "TeenyTweetyNet", - "config_type": "predict", + "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_predict_audio_cbin_annot_notmat.toml" + "use_dataset_from_config": "TweetyNet_eval_audio_cbin_annot_notmat.toml", + "use_result_from_config": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml", + "filename": "TeenyTweetyNet_predict_audio_cbin_annot_notmat.toml", "model": "TeenyTweetyNet", "config_type": "predict", - "audio_format": "wav", + "audio_format": "cbin", "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml" + "annot_format": "notmat", + "use_dataset_from_config": "TweetyNet_predict_audio_cbin_annot_notmat.toml", + "use_result_from_config": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml" }, { - "filename": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml", + "filename": "TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml", "model": "TeenyTweetyNet", - "config_type": "train", + "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" + "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", + "use_result_from_config": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml", @@ -142,25 +156,18 @@ "audio_format": "wav", "spect_format": null, "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" - }, - { - "filename": "TeenyTweetyNet_train_spect_mat_annot_yarden.toml", - "model": "TeenyTweetyNet", - "config_type": "train", - "audio_format": null, - "spect_format": "mat", - "annot_format": "yarden", - "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", + "use_result_from_config": null }, { - "filename": "TeenyTweetyNet_train_continue_audio_cbin_annot_notmat.toml", + "filename": "TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml", "model": "TeenyTweetyNet", - "config_type": "train_continue", - "audio_format": "cbin", + "config_type": "predict", + "audio_format": "wav", "spect_format": null, - "annot_format": "notmat", - "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" + "annot_format": "birdsong-recognition-dataset", + 
"use_dataset_from_config": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", + "use_result_from_config": "TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml" }, { "filename": "TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", @@ -169,7 +176,18 @@ "audio_format": "wav", "spect_format": null, "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" + "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", + "use_result_from_config": "TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml" + }, + { + "filename": "TeenyTweetyNet_train_spect_mat_annot_yarden.toml", + "model": "TeenyTweetyNet", + "config_type": "train", + "audio_format": null, + "spect_format": "mat", + "annot_format": "yarden", + "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml", + "use_result_from_config": null }, { "filename": "TeenyTweetyNet_train_continue_spect_mat_annot_yarden.toml", @@ -178,7 +196,8 @@ "audio_format": null, "spect_format": "mat", "annot_format": "yarden", - "use_dataset_from_config": "TweetyNet_train_continue_spect_mat_annot_yarden.toml" + "use_dataset_from_config": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", + "use_result_from_config": "TeenyTweetyNet_train_spect_mat_annot_yarden.toml" }, { "filename": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", @@ -187,7 +206,8 @@ "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", - "use_dataset_from_config": null + "use_dataset_from_config": null, + "use_result_from_config": null } ] } \ No newline at end of file From 5ae5a29615fc504911069b280e0c28f1d7faa42b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 20:49:31 -0400 Subject: [PATCH 134/184] Add attribute `use_result_from_config` to ConfigMetadata --- tests/scripts/vaktestdata/config_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py index a8e332408..73402c848 100644 --- a/tests/scripts/vaktestdata/config_metadata.py +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -10,3 +10,4 @@ class ConfigMetadata: spect_format: str = attrs.field() annot_format: str = attrs.field() use_dataset_from_config = attrs.field(default=None) + use_result_from_config = attrs.field(default=None) From 824ad805fb5d1a709d7c763270ec0f63e7b0b48b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 20:49:56 -0400 Subject: [PATCH 135/184] Remove constants from tests/scripts/vaktestdata/constants.py that are no longer used --- tests/scripts/vaktestdata/constants.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py index b5c7dabf3..aa1ed7923 100644 --- a/tests/scripts/vaktestdata/constants.py +++ b/tests/scripts/vaktestdata/constants.py @@ -25,19 +25,6 @@ # so glob doesn't pick up static configs that are just used for testing, # like 'invalid_option_config.toml` TEST_CONFIGS_ROOT = TEST_DATA_ROOT.joinpath("configs") -CONFIGS_TO_RUN = [] - -MODELS_PREP = ("TweetyNet",) -MODELS_REUSE_PREP = { - "TeenyTweetyNet": "TweetyNet" -} - -MODELS_RESULTS = ( - "TeenyTweetyNet", - "TweetyNet", -) -for model in MODELS_RESULTS: - CONFIGS_TO_RUN.extend(sorted(TEST_CONFIGS_ROOT.glob(f"{model}*.toml"))) # the sub-directories that will get made inside `./tests/data_for_tests/generated` TOP_LEVEL_DIRS = [ From 5071049c423c7903afe76247934dfae05b6b30a7 Mon 
Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 20:53:50 -0400 Subject: [PATCH 136/184] Rewrite `vaktestdata.configs.fix_options_in_configs` to use declarative config metadata when determining config to use results from --- tests/scripts/vaktestdata/configs.py | 48 ++++++++++------------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/tests/scripts/vaktestdata/configs.py b/tests/scripts/vaktestdata/configs.py index 85c001061..f62d0a834 100644 --- a/tests/scripts/vaktestdata/configs.py +++ b/tests/scripts/vaktestdata/configs.py @@ -31,7 +31,8 @@ def copy_config_files(): copied_configs = [] - for toml_path in constants.CONFIGS_TO_RUN: + for config_metadata in constants.CONFIG_METADATA: + toml_path = constants.TEST_CONFIGS_ROOT / config_metadata.filename if not toml_path.exists(): raise FileNotFoundError(f"{toml_path} not found") @@ -77,7 +78,7 @@ def add_dataset_path_from_prepped_configs(): toml.dump(config_to_change_toml, fp) -def fix_options_in_configs(config_paths, model, command, single_train_result=True): +def fix_options_in_configs(config_metadata_list, command, single_train_result=True): """Fix values assigned to options in predict and eval configs. Need to do this because both predict and eval configs have options @@ -87,42 +88,22 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru raise ValueError( f'invalid command to fix config options: {command}' ) - configs_to_fix, train_configs = [], [] - # split configs into predict/eval/"train_continue" configs and other configs - for config_path in config_paths: - if command in config_path.name: - configs_to_fix.append(config_path) - elif 'train' in config_path.name and 'continue' not in config_path.name: - train_configs.append(config_path) - - for config_to_fix in configs_to_fix: - # figure out which 'train' config corresponds to this 'predict' or 'eval' config - # by using 'suffix' of config file names. 
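Sketch of the declarative lookup that replaces the suffix matching removed here; the file path, filename, and field values below are taken from the configs.json entries added earlier in this series, while the loading code itself is only illustrative:

    import json

    with open("tests/data_for_tests/configs/configs.json") as fp:
        metadata = json.load(fp)["config_metadata"]

    entry = next(
        m for m in metadata
        if m["filename"] == "TweetyNet_eval_audio_cbin_annot_notmat.toml"
    )
    entry["use_result_from_config"]
    # -> "TweetyNet_train_audio_cbin_annot_notmat.toml", i.e. the train config
    #    whose results_* directory supplies checkpoint_path, labelmap_path,
    #    and (when spectrograms were normalized) spect_scaler_path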
`train` suffix will match `predict`/'eval' suffix - prefix, suffix = config_to_fix.name.split(command) - train_config_to_use = [] - for train_config in train_configs: - train_prefix, train_suffix = train_config.name.split("train") - if train_prefix.startswith(model) and train_suffix == suffix: - train_config_to_use.append(train_config) - if len(train_config_to_use) > 1: - raise ValueError( - f"Did not find just a single train config that matches with '{command}' config:\n" - f"{config_to_fix}\n" - f"Matches were: {train_config_to_use}" - ) - train_config_to_use = train_config_to_use[0] + + for config_metadata in config_metadata_list: + config_to_fix = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + config_to_use_result_from = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.use_result_from_config # now use the config to find the results dir and get the values for the options we need to set # which are checkpoint_path, spect_scaler_path, and labelmap_path - with train_config_to_use.open("r") as fp: - train_config_toml = toml.load(fp) - root_results_dir = pathlib.Path(train_config_toml["TRAIN"]["root_results_dir"]) + with config_to_use_result_from.open("r") as fp: + config_toml = toml.load(fp) + root_results_dir = pathlib.Path(config_toml["TRAIN"]["root_results_dir"]) results_dir = sorted(root_results_dir.glob("results_*")) if len(results_dir) > 1: if single_train_result: raise ValueError( f"Did not find just a single results directory in root_results_dir from train_config:\n" - f"{train_config_to_use}" + f"{config_toml}" f"root_results_dir was: {root_results_dir}" f'Matches for "results_*" were: {results_dir}' ) @@ -133,7 +114,7 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru else: raise ValueError( f"Did not find a results directory in root_results_dir from train_config:\n" - f"{train_config_to_use}" + f"{config_toml}" f"root_results_dir was:\n{root_results_dir}" f'Matches for "results_*" were:\n{results_dir}' ) @@ -141,7 +122,7 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru # these are the only options whose values we need to change # and they are the same for both predict and eval checkpoint_path = sorted(results_dir.glob("**/checkpoints/checkpoint.pt"))[0] - if train_config_toml['TRAIN']['normalize_spectrograms']: + if config_toml['TRAIN']['normalize_spectrograms']: spect_scaler_path = sorted(results_dir.glob("StandardizeSpect"))[0] else: spect_scaler_path = None @@ -150,10 +131,12 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru # now add these values to corresponding options in predict / eval config with config_to_fix.open("r") as fp: config_toml = toml.load(fp) + if command == 'train_continue': section = 'TRAIN' else: section = command.upper() + config_toml[section]["checkpoint_path"] = str(checkpoint_path) if spect_scaler_path: config_toml[section]["spect_scaler_path"] = str(spect_scaler_path) @@ -163,5 +146,6 @@ def fix_options_in_configs(config_paths, model, command, single_train_result=Tru del config_toml[section]["spect_scaler_path"] if command != 'train_continue': # train always gets labelmap from dataset dir, not from a config option config_toml[section]["labelmap_path"] = str(labelmap_path) + with config_to_fix.open("w") as fp: toml.dump(config_toml, fp) From c4b6d73dad6600694d54496a50138ccd5d662de5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 20:54:07 -0400 Subject: [PATCH 137/184] Refactor main loop in 
tests/scripts/generate_data_for_tests.py --- tests/scripts/generate_data_for_tests.py | 72 ++++++++---------------- 1 file changed, 24 insertions(+), 48 deletions(-) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index 88bb819b3..a03f30a02 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -109,55 +109,31 @@ def generate_test_data( # this makes time to prep all datasets shorter vaktestdata.configs.add_dataset_path_from_prepped_configs() - else: - config_paths = sorted(vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT.glob('*.toml')) - if step in ('results', 'all'): - for model in vaktestdata.constants.MODELS_RESULTS: - for command in commands: - if command == "prep": - continue # we don't run prep in this code block - print(f"running configs for command: {command}") - - # print(f"using the following configs:\n{command_config_paths}") - if command == 'train' or command == 'learncurve': - command_config_paths = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) and command in config_path.name - ] - if command == "train": - # need to remove 'train_continue' configs - command_config_paths = [ - config_path for config_path in command_config_paths - if 'continue' not in config_path.name - ] - # we run `train` to get results needed for `eval', 'predict' and continuing 'train'; - # we run `learncurve` so there's a `previous_run_path` to test; - # skip all other commands - for config_path in command_config_paths: - print( - f"n\Running 'vak {command}' with model '{model}', using config: {config_path.name}" - ) - vak.cli.cli.cli(command, config_path) - - elif command in ("predict", "eval", "train_continue"): - # Fix values for required options in predict / eval / train_continue configs - # using results from running the corresponding train configs. - # this only works if we ran the train configs already, - # which we should have because of ordering of COMMANDS constant above - vaktestdata.configs.fix_options_in_configs(config_paths, model, command, single_train_result) - command_config_paths = [ - config_path - for config_path in config_paths - if config_path.name.startswith(model) and command in config_path.name - ] - for config_path in command_config_paths: - for config_path in command_config_paths: - print( - f"\nRunning 'vak {command}' with model '{model}', using config: {config_path.name}" - ) - vak.cli.cli.cli(command, config_path) + # Note we need to run `train` first to get results needed for `eval', 'predict' and continuing 'train' + for command in commands: + if command == "prep": + continue # we don't run prep in this code block + print(f"running configs for command: {command}") + command_config_metadata = [ + config_metadata + for config_metadata in vaktestdata.constants.CONFIG_METADATA + if config_metadata.config_type == command + ] + + if command in ("predict", "eval", "train_continue"): + # Fix values for required options in predict / eval / train_continue configs + # using results from running the corresponding train configs. 
+ # this only works if we ran the train configs already, + # which we should have because of ordering of COMMANDS constant above + vaktestdata.configs.fix_options_in_configs(command_config_metadata, command, single_train_result) + + for config_metadata in command_config_metadata: + config_path = vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + print( + f"n\Running 'vak {command}', using config: {config_path.name}" + ) + vak.cli.cli.cli(command, config_path) if __name__ == "__main__": From 502f9d5f36530ac9d42cd5dc45d3fe7593814448 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 21:34:48 -0400 Subject: [PATCH 138/184] Remove other constants from tests/scripts/vaktestdata/constants.py --- tests/scripts/vaktestdata/constants.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py index aa1ed7923..867ebf36b 100644 --- a/tests/scripts/vaktestdata/constants.py +++ b/tests/scripts/vaktestdata/constants.py @@ -32,21 +32,6 @@ "results", ] -# these sub-dirs get made in each of the TOP_LEVEL_DIRS (except for 'configs') -COMMAND_DIRS = [ - "eval", - "learncurve", - "predict", - "train", -] - -# these sub-dirs get made in each of the COMMAND_DIRS (except for 'configs') -DATA_DIRS = [ - "audio_cbin_annot_notmat", - "audio_wav_annot_birdsongrec", - "spect_mat_annot_yarden", -] - # need to run 'train' config before we run 'predict' # so we can add checkpoints, etc., from training to predict COMMANDS = ( From fc9d257642e967558f5a6f162d50bcf4f11d09cf Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 21:35:06 -0400 Subject: [PATCH 139/184] Fix how dirs get made in tests/scripts/vaktestdata/dirs.py --- tests/scripts/vaktestdata/dirs.py | 53 +++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/tests/scripts/vaktestdata/dirs.py b/tests/scripts/vaktestdata/dirs.py index 2018b5ce1..8debd01f1 100644 --- a/tests/scripts/vaktestdata/dirs.py +++ b/tests/scripts/vaktestdata/dirs.py @@ -24,19 +24,40 @@ def make_subdirs_in_generated(config_paths): "Making sub-directories in ./tests/data_for_tests/generated/ where files generated by `vak` will go" ) - for top_level_dir in constants.TOP_LEVEL_DIRS: - for command_dir in constants.COMMAND_DIRS: - for data_dir in constants.DATA_DIRS: - if not any( - [f'{command_dir}_{data_dir}' in str(config_path) for config_path in config_paths] - ): - continue # no need to make this dir - - for model in constants.MODELS_RESULTS: - subdir_to_make = ( - constants.GENERATED_TEST_DATA / top_level_dir / command_dir / data_dir / model - ) - logger.info( - f"Making sub-directory: {subdir_to_make}" - ) - subdir_to_make.mkdir(parents=True) + for top_level_dir in constants.TOP_LEVEL_DIRS: # datasets / results + subdir_to_make = ( + constants.GENERATED_TEST_DATA / top_level_dir + ) + logger.info( + f"Making sub-directory: {subdir_to_make}" + ) + subdir_to_make.mkdir(parents=True) + + for config_metadata in constants.CONFIG_METADATA: + config_type = config_metadata.config_type # train, eval, predict, etc. 
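    # For example (hypothetical metadata): config_type="train", audio_format="cbin",
    # annot_format="notmat", model="TweetyNet" yields data_dir "audio_cbin_annot_notmat"
    # and sub-directories
    #   .../generated/prep/train/audio_cbin_annot_notmat/TweetyNet
    #   .../generated/results/train/audio_cbin_annot_notmat/TweetyNet
    # while a spectrogram-input config uses spect_format instead, e.g.
    # "spect_mat_annot_yarden".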
+ if config_metadata.audio_format: + data_dir = f"audio_{config_metadata.audio_format}_annot_{config_metadata.annot_format}" + elif config_metadata.spect_format: + data_dir = f"spect_{config_metadata.spect_format}_annot_{config_metadata.annot_format}" + else: + raise ValueError( + f'could not determine data dir for config metadata:\n{config_metadata}' + ) + model = config_metadata.model + + if config_metadata.use_dataset_from_config is None: # we need to make dataset dir + subdir_to_make = ( + constants.GENERATED_TEST_DATA / 'prep' / config_type / data_dir / model + ) + logger.info( + f"Making sub-directory: {subdir_to_make}" + ) + subdir_to_make.mkdir(parents=True) + + subdir_to_make = ( + constants.GENERATED_TEST_DATA / 'results' / config_type / data_dir / model + ) + logger.info( + f"Making sub-directory: {subdir_to_make}" + ) + subdir_to_make.mkdir(parents=True) From 71e7414d3003acb87202f7cbca39f2d2e27b1f7f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 21:39:29 -0400 Subject: [PATCH 140/184] Change birdsongrec -> birdsong-recognition-dataset in dir names in test configs --- .../TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml | 4 ++-- .../TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml | 4 ++-- ...yTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml | 4 ++-- .../TweetyNet_predict_audio_wav_annot_birdsongrec.toml | 4 ++-- .../configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml | 4 ++-- .../TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml index 776802c55..9a2110cca 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/TeenyTweetyNet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" audio_format = "wav" [SPECT_PARAMS] @@ -20,7 +20,7 @@ model = "TeenyTweetyNet" batch_size = 4 num_workers = 2 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TeenyTweetyNet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" annot_csv_filename = "Bird0.annot.csv" [PREDICT.transform_params] diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml index a27313896..c9b6cae79 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +output_dir = 
"./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -28,7 +28,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" [TRAIN.train_dataset_params] window_size = 44 diff --git a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 6383f99a9..2b0025c8c 100644 --- a/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TeenyTweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -28,7 +28,7 @@ ckpt_step = 200 patience = 3 num_workers = 2 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TeenyTweetyNet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TeenyTweetyNet" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml index cbc6358d5..f0c827ba4 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml @@ -2,7 +2,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsongrec/TweetyNet" +output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" audio_format = "wav" [SPECT_PARAMS] @@ -20,7 +20,7 @@ model = "TweetyNet" batch_size = 11 num_workers = 4 device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsongrec/TweetyNet" +output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" annot_csv_filename = "Bird0.annot.csv" [PREDICT.transform_params] diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml 
b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml index fbfd2269e..6b7c69cff 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TweetyNet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -28,7 +28,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TweetyNet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" [TRAIN.train_dataset_params] window_size = 88 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml index 480ab0230..e11ce475b 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" labelset = "012345678" data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsongrec/TweetyNet" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" audio_format = "wav" annot_format = "birdsong-recognition-dataset" annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" @@ -28,7 +28,7 @@ ckpt_step = 200 patience = 4 num_workers = 4 device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsongrec/TweetyNet" +root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" From 615cfe80b84f4cb2678043ba1eb750741bac858e Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 21:48:54 -0400 Subject: [PATCH 141/184] Rename prep/dimensionality_reduction -> prep/parametric_umap --- src/vak/prep/__init__.py | 2 +- src/vak/prep/constants.py | 4 ++-- src/vak/prep/dimensionality_reduction/__init__.py | 2 -- src/vak/prep/parametric_umap/__init__.py | 2 ++ .../dataset_arrays.py | 0 .../parametric_umap.py} | 4 ++-- src/vak/prep/prep.py | 6 +++--- .../ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 src/vak/prep/dimensionality_reduction/__init__.py create mode 100644 src/vak/prep/parametric_umap/__init__.py rename src/vak/prep/{dimensionality_reduction => 
parametric_umap}/dataset_arrays.py (100%) rename src/vak/prep/{dimensionality_reduction/dimensionality_reduction.py => parametric_umap/parametric_umap.py} (99%) diff --git a/src/vak/prep/__init__.py b/src/vak/prep/__init__.py index 6795fee53..f22419500 100644 --- a/src/vak/prep/__init__.py +++ b/src/vak/prep/__init__.py @@ -2,8 +2,8 @@ audio_dataset, constants, dataset_df_helper, - dimensionality_reduction, frame_classification, + parametric_umap, spectrogram_dataset, unit_dataset, ) diff --git a/src/vak/prep/constants.py b/src/vak/prep/constants.py index 06e78f2c1..b828ec815 100644 --- a/src/vak/prep/constants.py +++ b/src/vak/prep/constants.py @@ -3,7 +3,7 @@ Defined in a separate module to minimize circular imports. """ from . import ( - dimensionality_reduction, + parametric_umap, frame_classification ) @@ -21,7 +21,7 @@ DATASET_TYPE_FUNCTION_MAP = { 'frame classification': frame_classification.prep_frame_classification_dataset, - 'dimensionality reduction': dimensionality_reduction.prep_dimensionality_reduction_dataset, + 'parametric umap': parametric_umap.prep_parametric_umap_dataset, } DATASET_TYPES = tuple(DATASET_TYPE_FUNCTION_MAP.keys()) diff --git a/src/vak/prep/dimensionality_reduction/__init__.py b/src/vak/prep/dimensionality_reduction/__init__.py deleted file mode 100644 index 71d3feca5..000000000 --- a/src/vak/prep/dimensionality_reduction/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from . import dataset_arrays -from .dimensionality_reduction import prep_dimensionality_reduction_dataset diff --git a/src/vak/prep/parametric_umap/__init__.py b/src/vak/prep/parametric_umap/__init__.py new file mode 100644 index 000000000..af55977fe --- /dev/null +++ b/src/vak/prep/parametric_umap/__init__.py @@ -0,0 +1,2 @@ +from . import dataset_arrays +from .parametric_umap import prep_parametric_umap_dataset diff --git a/src/vak/prep/dimensionality_reduction/dataset_arrays.py b/src/vak/prep/parametric_umap/dataset_arrays.py similarity index 100% rename from src/vak/prep/dimensionality_reduction/dataset_arrays.py rename to src/vak/prep/parametric_umap/dataset_arrays.py diff --git a/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py b/src/vak/prep/parametric_umap/parametric_umap.py similarity index 99% rename from src/vak/prep/dimensionality_reduction/dimensionality_reduction.py rename to src/vak/prep/parametric_umap/parametric_umap.py index 847e21ca7..2c5ceb3e0 100644 --- a/src/vak/prep/dimensionality_reduction/dimensionality_reduction.py +++ b/src/vak/prep/parametric_umap/parametric_umap.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -def prep_dimensionality_reduction_dataset( +def prep_parametric_umap_dataset( data_dir: str | pathlib.Path, purpose: str, output_dir: str | pathlib.Path | None = None, @@ -317,7 +317,7 @@ def prep_dimensionality_reduction_dataset( ) # index is False to avoid having "Unnamed: 0" column when loading # ---- save metadata ----------------------------------------------------------------------------------------------- - metadata = datasets.dimensionality_reduction.Metadata( + metadata = datasets.parametric_umap.Metadata( dataset_csv_filename=str(dataset_csv_path.name), audio_format=audio_format, ) diff --git a/src/vak/prep/prep.py b/src/vak/prep/prep.py index 7556c5cab..277855a34 100644 --- a/src/vak/prep/prep.py +++ b/src/vak/prep/prep.py @@ -4,7 +4,7 @@ from . 
import ( constants, ) -from .dimensionality_reduction import prep_dimensionality_reduction_dataset +from .parametric_umap import prep_parametric_umap_dataset from .frame_classification import prep_frame_classification_dataset @@ -213,8 +213,8 @@ def prep( timebins_key, ) return dataset_df, dataset_path - elif dataset_type == "dimensionality reduction": - dataset_df, dataset_path = prep_dimensionality_reduction_dataset( + elif dataset_type == "parametric umap": + dataset_df, dataset_path = prep_parametric_umap_dataset( data_dir, purpose, output_dir, diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml index be8762317..6769f9f22 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -1,5 +1,5 @@ [PREP] -dataset_type = "dimensionality reduction" +dataset_type = "parametric umap" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP" From af7765b3f5e0cc1bd2fbaf29509d4d2b9f35d1f3 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 22:24:03 -0400 Subject: [PATCH 142/184] Add missing train/val_dataset/transform_params options to LEARNCURVE table in valid.toml --- src/vak/config/valid.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vak/config/valid.toml b/src/vak/config/valid.toml index de782a74b..11cd535f5 100644 --- a/src/vak/config/valid.toml +++ b/src/vak/config/valid.toml @@ -83,7 +83,10 @@ results_dir_made_by_main_script = '/some/path/to/learncurve/' post_tfm_kwargs = {'majority_vote' = true, 'min_segment_dur' = 0.01} num_workers = 4 device = 'cuda' - +train_transform_params = {'resize' = 128} +train_dataset_params = {'window_size' = 80} +val_transform_params = {'resize' = 128} +val_dataset_params = {'window_size' = 80} [PREDICT] dataset_path = 'tests/test_data/prep/learncurve/032312_prep_191224_225910.csv' From 52bc9f071012b0dc67d4ba2a4b69c327b84ccb26 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 22:24:23 -0400 Subject: [PATCH 143/184] Add missing train/val_dataset/transform_params options in call to learncurve in cli.learncurve --- src/vak/cli/learncurve.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vak/cli/learncurve.py b/src/vak/cli/learncurve.py index 7bf099dcb..5bf302f6c 100644 --- a/src/vak/cli/learncurve.py +++ b/src/vak/cli/learncurve.py @@ -61,10 +61,13 @@ def learning_curve(toml_path): model_name=model_name, model_config=model_config, dataset_path=cfg.learncurve.dataset_path, - window_size=cfg.dataloader.window_size, batch_size=cfg.learncurve.batch_size, num_epochs=cfg.learncurve.num_epochs, num_workers=cfg.learncurve.num_workers, + train_transform_params=cfg.learncurve.train_transform_params, + train_dataset_params=cfg.learncurve.train_dataset_params, + val_transform_params=cfg.learncurve.val_transform_params, + val_dataset_params=cfg.learncurve.val_dataset_params, results_path=results_path, post_tfm_kwargs=cfg.learncurve.post_tfm_kwargs, normalize_spectrograms=cfg.learncurve.normalize_spectrograms, From c7dc6b099e9ed9e34bcd354e69c9099c2feb6f76 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 23:09:22 -0400 Subject: [PATCH 144/184] Remove window size, add 
transform/dataset_params in call to eval in cli.eval --- src/vak/cli/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vak/cli/eval.py b/src/vak/cli/eval.py index 39e3fc9ad..63d278066 100644 --- a/src/vak/cli/eval.py +++ b/src/vak/cli/eval.py @@ -58,8 +58,9 @@ def eval(toml_path): checkpoint_path=cfg.eval.checkpoint_path, labelmap_path=cfg.eval.labelmap_path, output_dir=cfg.eval.output_dir, - window_size=cfg.dataloader.window_size, num_workers=cfg.eval.num_workers, + transform_params=cfg.eval.transform_params, + dataset_params=cfg.eval.dataset_params, spect_scaler_path=cfg.eval.spect_scaler_path, device=cfg.eval.device, post_tfm_kwargs=cfg.eval.post_tfm_kwargs, From 02580c76a2f4a06fbe0e124a4bc387746f2b045c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 23:09:34 -0400 Subject: [PATCH 145/184] Remove window size, add transform/dataset_params in call to predict in cli.predict --- src/vak/cli/predict.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vak/cli/predict.py b/src/vak/cli/predict.py index 8ab363b2d..6b462b92e 100644 --- a/src/vak/cli/predict.py +++ b/src/vak/cli/predict.py @@ -52,8 +52,9 @@ def predict(toml_path): dataset_path=cfg.predict.dataset_path, checkpoint_path=cfg.predict.checkpoint_path, labelmap_path=cfg.predict.labelmap_path, - window_size=cfg.dataloader.window_size, num_workers=cfg.predict.num_workers, + transform_params=cfg.eval.transform_params, + dataset_params=cfg.eval.dataset_params, timebins_key=cfg.spect_params.timebins_key, spect_scaler_path=cfg.predict.spect_scaler_path, device=cfg.predict.device, From 7f239525bf42f23411a9cad69bfc932989dcc060 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 25 Jul 2023 23:10:07 -0400 Subject: [PATCH 146/184] Remove window_size arg in call to eval_frame_classification_model --- src/vak/eval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/eval/eval.py b/src/vak/eval/eval.py index 3f253e8d3..b833beb90 100644 --- a/src/vak/eval/eval.py +++ b/src/vak/eval/eval.py @@ -126,7 +126,6 @@ def eval( checkpoint_path=checkpoint_path, labelmap_path=labelmap_path, output_dir=output_dir, - window_size=window_size, num_workers=num_workers, transform_params=transform_params, dataset_params=dataset_params, From 9ec72a364d38bc61102e5559d3f4ac508dbcd54d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 26 Jul 2023 08:27:35 -0400 Subject: [PATCH 147/184] fixup Remove window size, add transform/dataset_params in call to predict in cli.predict --- src/vak/cli/predict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vak/cli/predict.py b/src/vak/cli/predict.py index 6b462b92e..7393aab26 100644 --- a/src/vak/cli/predict.py +++ b/src/vak/cli/predict.py @@ -53,8 +53,8 @@ def predict(toml_path): checkpoint_path=cfg.predict.checkpoint_path, labelmap_path=cfg.predict.labelmap_path, num_workers=cfg.predict.num_workers, - transform_params=cfg.eval.transform_params, - dataset_params=cfg.eval.dataset_params, + transform_params=cfg.predict.transform_params, + dataset_params=cfg.predict.dataset_params, timebins_key=cfg.spect_params.timebins_key, spect_scaler_path=cfg.predict.spect_scaler_path, device=cfg.predict.device, From 46a6d06544a22bcb7b742a9e685be66ec21c0d73 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 07:06:23 -0400 Subject: [PATCH 148/184] Add missing fields to some entries in tests/data_for_tests/configs/config.json --- tests/data_for_tests/configs/configs.json | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index 09a205223..5788fae40 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -97,7 +97,8 @@ "audio_format": null, "spect_format": "mat", "annot_format": "yarden", - "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" + "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml", + "use_result_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { "filename": "TeenyTweetyNet_train_audio_cbin_annot_notmat.toml", @@ -196,7 +197,7 @@ "audio_format": null, "spect_format": "mat", "annot_format": "yarden", - "use_dataset_from_config": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", + "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml", "use_result_from_config": "TeenyTweetyNet_train_spect_mat_annot_yarden.toml" }, { From 4288ce24081402f4c688de1b04756e472a728bb3 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 07:07:01 -0400 Subject: [PATCH 149/184] Fix how we handle 'train_continue' command in generate_data_for_tests.py --- tests/scripts/generate_data_for_tests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index a03f30a02..c4dd83ccd 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -128,6 +128,10 @@ def generate_test_data( # which we should have because of ordering of COMMANDS constant above vaktestdata.configs.fix_options_in_configs(command_config_metadata, command, single_train_result) + if command == "train_continue": + # so we don't get 'command not recognized' error in next code block + command = "train" + for config_metadata in command_config_metadata: config_path = vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename print( From 48789248475289ba25862b23aacf14c5b857a16d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 07:13:30 -0400 Subject: [PATCH 150/184] Make minor fixes to docstring of eval_frame_classification_model --- src/vak/eval/frame_classification.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/vak/eval/frame_classification.py b/src/vak/eval/frame_classification.py index 0315f6905..f3d1804d2 100644 --- a/src/vak/eval/frame_classification.py +++ b/src/vak/eval/frame_classification.py @@ -52,15 +52,13 @@ def eval_frame_classification_model( dataset_path : str, pathlib.Path Path to dataset, e.g., a csv file generated by running ``vak prep``. checkpoint_path : str, pathlib.Path - path to directory with checkpoint files saved by Torch, to reload model + Path to directory with checkpoint files saved by Torch, to reload model output_dir : str, pathlib.Path Path to location where .csv files with evaluation metrics should be saved. labelmap_path : str, pathlib.Path - path to 'labelmap.json' file. - models : list - of model names. e.g., 'models = TweetyNet, GRUNet, ConvNet' + Path to 'labelmap.json' file. batch_size : int - number of samples per batch presented to models during training. + Number of samples per batch presented to models during training. num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. @@ -73,10 +71,10 @@ def eval_frame_classification_model( Passed as keyword arguments. Optional, default is None. 
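A call sketch consistent with the cli wiring earlier in this series (all paths and values are placeholders; only the keyword names come from this signature):

    from vak.eval.frame_classification import eval_frame_classification_model

    # placeholder; normally built from the model's tables in a .toml config
    model_config = {"optimizer": {"lr": 0.001}}

    eval_frame_classification_model(
        model_name="TweetyNet",
        model_config=model_config,
        dataset_path="prep/eval/audio_cbin_annot_notmat/TweetyNet",            # hypothetical
        checkpoint_path="results_.../TweetyNet/checkpoints/checkpoint.pt",     # hypothetical
        labelmap_path="results_.../labelmap.json",                             # hypothetical
        output_dir="results/eval",
        num_workers=4,
        transform_params={"window_size": 88},  # cfg.eval.transform_params
        dataset_params=None,                   # cfg.eval.dataset_params
        split="test",
        spect_scaler_path=None,
    )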
split : str - split of dataset on which model should be evaluated. + Split of dataset on which model should be evaluated. One of {'train', 'val', 'test'}. Default is 'test'. spect_scaler_path : str, pathlib.Path - path to a saved SpectScaler object used to normalize spectrograms. + Path to a saved SpectScaler object used to normalize spectrograms. If spectrograms were normalized and this is not provided, will give incorrect results. Default is None. From 1bde4c6299da4a83f9442bf50309de08462d69ce Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 07:17:38 -0400 Subject: [PATCH 151/184] Remove 'batch_size' from eval_frame_classification_model docstring, not a parameter for this function --- src/vak/eval/frame_classification.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vak/eval/frame_classification.py b/src/vak/eval/frame_classification.py index f3d1804d2..3b017adec 100644 --- a/src/vak/eval/frame_classification.py +++ b/src/vak/eval/frame_classification.py @@ -57,8 +57,6 @@ def eval_frame_classification_model( Path to location where .csv files with evaluation metrics should be saved. labelmap_path : str, pathlib.Path Path to 'labelmap.json' file. - batch_size : int - Number of samples per batch presented to models during training. num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. From b80d01b701a27a24e9a8285be800894cea34fd4c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 07:17:56 -0400 Subject: [PATCH 152/184] Remove unused import from train_parametric_umap_model --- src/vak/train/parametric_umap.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index e249d6e2d..feb10ded9 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -18,7 +18,6 @@ from ..datasets.parametric_umap import ParametricUMAPDataset from ..common.device import get_default as get_default_device from ..common.paths import generate_results_dir_name_as_path -from ..common.trainer import get_default_trainer logger = logging.getLogger(__name__) From f47ef5d10834f1b3b47c24f2e76ef68140aa6ab5 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 08:25:24 -0400 Subject: [PATCH 153/184] Add vak/eval/parametric_umap.py --- src/vak/eval/parametric_umap.py | 183 ++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 src/vak/eval/parametric_umap.py diff --git a/src/vak/eval/parametric_umap.py b/src/vak/eval/parametric_umap.py new file mode 100644 index 000000000..6aba33646 --- /dev/null +++ b/src/vak/eval/parametric_umap.py @@ -0,0 +1,183 @@ +"""Function that evaluates trained models in the parametric UMAP family.""" +from __future__ import annotations + +from collections import OrderedDict +from datetime import datetime +import logging +import pathlib + +import joblib +import pytorch_lightning as lightning +import pandas as pd +import torch.utils.data + +from .. 
import ( + datasets, + models, + transforms, +) +from ..common import validators +from ..datasets.parametric_umap import ParametricUMAPDataset + + +logger = logging.getLogger(__name__) + + +def eval_parametric_umap_model( + model_name: str, + model_config: dict, + dataset_path: str | pathlib.Path, + checkpoint_path: str | pathlib.Path, + output_dir: str | pathlib.Path, + batch_size: int, + num_workers: int, + transform_params: dict | None = None, + dataset_params: dict | None = None, + split: str = "test", + spect_scaler_path: str | pathlib.Path = None, + device: str | None = None, +) -> None: + """Evaluate a trained model. + + Parameters + ---------- + model_name : str + Model name, must be one of vak.models.registry.MODEL_NAMES. + model_config : dict + Model configuration in a ``dict``, + as loaded from a .toml file, + and used by the model method ``from_config``. + dataset_path : str, pathlib.Path + Path to dataset, e.g., a csv file generated by running ``vak prep``. + checkpoint_path : str, pathlib.Path + Path to directory with checkpoint files saved by Torch, to reload model + output_dir : str, pathlib.Path + Path to location where .csv files with evaluation metrics should be saved. + batch_size : int + Number of samples per batch presented to models during training. + num_workers : int + Number of processes to use for parallel loading of data. + Argument to torch.DataLoader. Default is 2. + transform_params: dict, optional + Parameters for data transform. + Passed as keyword arguments. + Optional, default is None. + dataset_params: dict, optional + Parameters for dataset. + Passed as keyword arguments. + Optional, default is None. + split : str + Split of dataset on which model should be evaluated. + One of {'train', 'val', 'test'}. Default is 'test'. + device : str + Device on which to work with model + data. + Defaults to 'cuda' if torch.cuda.is_available is True. 
+ """ + # ---- pre-conditions ---------------------------------------------------------------------------------------------- + for path, path_name in zip( + (checkpoint_path, spect_scaler_path), + ('checkpoint_path', 'spect_scaler_path'), + ): + if path is not None: # because `spect_scaler_path` is optional + if not validators.is_a_file(path): + raise FileNotFoundError( + f"value for ``{path_name}`` not recognized as a file: {path}" + ) + + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f"`dataset_path` not found or not recognized as a directory: {dataset_path}" + ) + + if not validators.is_a_directory(output_dir): + raise NotADirectoryError( + f'value for ``output_dir`` not recognized as a directory: {output_dir}' + ) + + # ---- get time for .csv file -------------------------------------------------------------------------------------- + timenow = datetime.now().strftime("%y%m%d_%H%M%S") + + # ---------------- load data for evaluation ------------------------------------------------------------------------ + if spect_scaler_path: + logger.info(f"loading spect scaler from path: {spect_scaler_path}") + spect_standardizer = joblib.load(spect_scaler_path) + else: + logger.info(f"not using a spect scaler") + spect_standardizer = None + + if transform_params is None: + transform_params = {} + transform_params.update({'spect_standardizer': spect_standardizer}) + item_transform = transforms.defaults.get_default_transform( + model_name, + "eval", + transform_params + ) + if dataset_params is None: + dataset_params = {} + val_dataset = ParametricUMAPDataset.from_dataset_path( + dataset_path=dataset_path, + split=split, + transform=item_transform, + **dataset_params, + ) + val_loader = torch.utils.data.DataLoader( + dataset=val_dataset, + shuffle=False, + batch_size=batch_size, + num_workers=num_workers, + ) + + # ---------------- do the actual evaluating ------------------------------------------------------------------------ + model = models.get( + model_name, + model_config, + input_shape=val_dataset.shape, + ) + + logger.info(f"running evaluation for model: {model_name}") + + model.load_state_dict_from_path(checkpoint_path) + + if device == 'cuda': + accelerator = 'gpu' + else: + accelerator = None + + trainer_logger = lightning.loggers.TensorBoardLogger( + save_dir=output_dir + ) + trainer = lightning.Trainer(accelerator=accelerator, logger=trainer_logger) + # TODO: check for hasattr(model, test_step) and if so run test + # below, [0] because validate returns list of dicts, length of no. of val loaders + metric_vals = trainer.validate(model, dataloaders=val_loader)[0] + for metric_name, metric_val in metric_vals.items(): + logger.info( + f'{metric_name}: {metric_val:0.5f}' + ) + + # create a "DataFrame" with just one row which we will save as a csv; + # the idea is to be able to concatenate csvs from multiple runs of eval + row = OrderedDict( + [ + ("model_name", model_name), + ("checkpoint_path", checkpoint_path), + ("spect_scaler_path", spect_scaler_path), + ("dataset_path", dataset_path), + ] + ) + # TODO: is this still necessary after switching to Lightning? Stop saying "average"? 
+ # order metrics by name to be extra sure they will be consistent across runs + row.update( + sorted([(k, v) for k, v in metric_vals.items()]) + ) + + # pass index into dataframe, needed when using all scalar values (a single row) + # throw away index below when saving to avoid extra column + eval_df = pd.DataFrame(row, index=[0]) + eval_csv_path = output_dir.joinpath(f"eval_{model_name}_{timenow}.csv") + logger.info(f"saving csv with evaluation metrics at: {eval_csv_path}") + eval_df.to_csv( + eval_csv_path, index=False + ) # index is False to avoid having "Unnamed: 0" column when loading From c4fa22dd90b47e1134ae10a14b9bac70354e356f Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 08:25:46 -0400 Subject: [PATCH 154/184] Modify vak/eval/eval.py to call eval_parametric_umap_model when appropriate --- src/vak/eval/eval.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/vak/eval/eval.py b/src/vak/eval/eval.py index b833beb90..3c76788a2 100644 --- a/src/vak/eval/eval.py +++ b/src/vak/eval/eval.py @@ -5,6 +5,7 @@ import pathlib from .frame_classification import eval_frame_classification_model +from .parametric_umap import eval_parametric_umap_model from .. import ( models, ) @@ -134,6 +135,20 @@ def eval( device=device, post_tfm_kwargs=post_tfm_kwargs, ) + elif model_family == "ParametricUMAPModel": + eval_parametric_umap_model( + model_name=model_name, + model_config=model_config, + dataset_path=dataset_path, + checkpoint_path=checkpoint_path, + output_dir=output_dir, + num_workers=num_workers, + transform_params=transform_params, + dataset_params=dataset_params, + split=split, + spect_scaler_path=spect_scaler_path, + device=device, + ) else: raise ValueError( f"Model family not recognized: {model_family}" From 774a2e8268e1972bcbbbe19a5ce5cf2f9a1b22de Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 08:30:29 -0400 Subject: [PATCH 155/184] Remove 'labelmap_path' from REQUIRED_OPTIONS IN vak/config/parse.py, since it's not required for parametric UMAP models --- src/vak/config/parse.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vak/config/parse.py b/src/vak/config/parse.py index 0a2316038..c69183a73 100644 --- a/src/vak/config/parse.py +++ b/src/vak/config/parse.py @@ -25,7 +25,6 @@ REQUIRED_OPTIONS = { "EVAL": [ "checkpoint_path", - "labelmap_path", "output_dir", "model", ], @@ -35,7 +34,6 @@ ], "PREDICT": [ "checkpoint_path", - "labelmap_path", "model", ], "PREP": [ From 103b7c600ea6919511809e0121d9100556d2abff Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 09:28:08 -0400 Subject: [PATCH 156/184] Fix how we train parametric umap so it saves checkpoints, and in the correct location --- src/vak/train/parametric_umap.py | 45 +++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index feb10ded9..088d72d06 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -29,6 +29,8 @@ def get_split_dur(df: pd.DataFrame, split: str) -> float: def get_trainer(max_epochs: int, + ckpt_root: str | pathlib.Path, + ckpt_step: int, log_save_dir: str | pathlib.Path, device: str = 'cuda', ) -> lightning.Trainer: @@ -40,6 +42,33 @@ def get_trainer(max_epochs: int, else: accelerator = None + ckpt_callback = lightning.callbacks.ModelCheckpoint( + dirpath=ckpt_root, + filename='checkpoint', + every_n_train_steps=ckpt_step, + save_last=True, + verbose=True, + ) + 
ckpt_callback.CHECKPOINT_NAME_LAST = 'checkpoint' + ckpt_callback.FILE_EXTENSION = '.pt' + + val_ckpt_callback = lightning.callbacks.ModelCheckpoint( + monitor="val_loss", + dirpath=ckpt_root, + save_top_k=1, + mode='min', + filename='min-val-loss-checkpoint', + auto_insert_metric_name=False, + verbose=True + ) + val_ckpt_callback.FILE_EXTENSION = '.pt' + + callbacks = [ + ckpt_callback, + val_ckpt_callback, + ] + + logger = lightning.loggers.TensorBoardLogger( save_dir=log_save_dir ) @@ -48,6 +77,7 @@ def get_trainer(max_epochs: int, max_epochs=max_epochs, accelerator=accelerator, logger=logger, + callbacks=callbacks, ) return trainer @@ -68,6 +98,7 @@ def train_parametric_umap_model( results_path: str | pathlib.Path | None = None, shuffle: bool = True, val_step: int | None = None, + ckpt_step: int | None = None, device: str | None = None, split: str = 'train', ) -> None: @@ -120,6 +151,14 @@ def train_parametric_umap_model( results_path : str, pathlib.Path, optional Directory where results will be saved. If specified, this parameter overrides ``root_results_dir``. + val_step : int + Computes the loss using validation set every ``val_step`` epochs. + Default is None, in which case no validation is done. + ckpt_step : int + Step on which to save to checkpoint file. + If ckpt_step is n, then a checkpoint is saved every time + the global step / n is a whole number, i.e., when ckpt_step modulo the global step is 0. + Default is None, in which case checkpoint is only saved at the last epoch. device : str Device on which to work with model + data. Default is None. If None, then a device will be selected with vak.split.get_default. @@ -224,8 +263,7 @@ def train_parametric_umap_model( val_loader = torch.utils.data.DataLoader( dataset=val_dataset, shuffle=False, - # batch size 1 because each spectrogram reshaped into a batch of windows - batch_size=1, + batch_size=batch_size, num_workers=num_workers, ) else: @@ -251,11 +289,12 @@ def train_parametric_umap_model( ckpt_root = results_model_root.joinpath("checkpoints") ckpt_root.mkdir() logger.info(f"training {model_name}") - max_steps = num_epochs * len(train_loader) trainer = get_trainer( max_epochs=num_epochs, log_save_dir=results_model_root, device=device, + ckpt_root=ckpt_root, + ckpt_step=ckpt_step, ) train_time_start = datetime.datetime.now() logger.info( From 231b5384a0fd548bd86c0c08814e1b69fdb5865a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 09:28:38 -0400 Subject: [PATCH 157/184] Pass 'ckpt_step' into train_parametric_umap inside vak.train.train --- src/vak/train/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/train/train.py b/src/vak/train/train.py index fbef9daa3..69f38efea 100644 --- a/src/vak/train/train.py +++ b/src/vak/train/train.py @@ -206,6 +206,7 @@ def train( results_path=results_path, shuffle=shuffle, val_step=val_step, + ckpt_step=ckpt_step, device=device, split=split, ) From c33cd833c91d1f80aee8596a88b92604a3bc6eba Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 09:31:10 -0400 Subject: [PATCH 158/184] Add tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml --- ...oderUMAP_eval_audio_cbin_annot_notmat.toml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml new 
file mode 100644 index 000000000..4877b7e58 --- /dev/null +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml @@ -0,0 +1,30 @@ +[PREP] +dataset_type = "parametric umap" +input_type = "spect" +data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" +output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP" +audio_format = "cbin" +annot_format = "notmat" +labelset = "iabcdefghjk" +train_dur = 40 +val_dur = 15 + +[SPECT_PARAMS] +fft_size = 512 +step_size = 32 +transform_type = "log_spect_plus_one" + +[EVAL] +checkpoint_path = "tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/ConvEncoderUMAP/results_230727_210112/ConvEncoderUMAP/checkpoints/checkpoint.pt" +model = "ConvEncoderUMAP" +batch_size = 4 +num_workers = 2 +device = "cuda" +output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/ConvEncoderUMAP" +dataset_path = "tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP/032312-vak-dimensionality-reduction-dataset-generated-230727_205727" + +[EVAL.transform_params] +resize = 128 + +[ConvEncoderUMAP.optimizer] +lr = 0.001 From 584f7952f88fc70166897de1960dcc49cc4da3eb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 28 Jul 2023 09:31:32 -0400 Subject: [PATCH 159/184] Add ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml to tests/data_for_tests/configs/configs.json --- tests/data_for_tests/configs/configs.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index 5788fae40..05b4731a4 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -209,6 +209,16 @@ "annot_format": "notmat", "use_dataset_from_config": null, "use_result_from_config": null + }, + { + "filename": "ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml", + "model": "ConvEncoderUMAP", + "config_type": "eval", + "audio_format": "cbin", + "spect_format": null, + "annot_format": "notmat", + "use_dataset_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", + "use_result_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml" } ] } \ No newline at end of file From 8cb57e483aebdb32d119730d9cca13b57e39342a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 10:23:33 -0400 Subject: [PATCH 160/184] Make labelmap_path optional for EvalConfig, so Parametric UMAP models don't crash --- src/vak/config/eval.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vak/config/eval.py b/src/vak/config/eval.py index 98f8125b6..f6bac1922 100644 --- a/src/vak/config/eval.py +++ b/src/vak/config/eval.py @@ -110,7 +110,6 @@ class EvalConfig: """ # required, external files checkpoint_path = attr.ib(converter=expanded_user_path) - labelmap_path = attr.ib(converter=expanded_user_path) output_dir = attr.ib(converter=expanded_user_path) # required, model / dataloader @@ -126,6 +125,10 @@ class EvalConfig: default=None, ) + # "optional" but actually required for frame classification models + # TODO: check model family in __post_init__ and raise ValueError if labelmap + # TODO: not specified for a frame classification model? 
+ labelmap_path = attr.ib(converter=converters.optional(expanded_user_path), default=None) # optional, transform spect_scaler_path = attr.ib( converter=converters.optional(expanded_user_path), From 9274de87833a79b4a66539496538f66a35b16563 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 19:50:02 -0400 Subject: [PATCH 161/184] Rewrite definition for batch_size in docstring of src/vak/eval/parametric_umap.py --- src/vak/eval/parametric_umap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/eval/parametric_umap.py b/src/vak/eval/parametric_umap.py index 6aba33646..e32d5bf79 100644 --- a/src/vak/eval/parametric_umap.py +++ b/src/vak/eval/parametric_umap.py @@ -54,7 +54,7 @@ def eval_parametric_umap_model( output_dir : str, pathlib.Path Path to location where .csv files with evaluation metrics should be saved. batch_size : int - Number of samples per batch presented to models during training. + Number of samples per batch fed into model. num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. From 787f73865df01b730cd451d3bc693f1c65d83e1d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 19:50:40 -0400 Subject: [PATCH 162/184] Add batch_size parameter to vak.eval.eval, make labelmap_path parameter default to None, fix parameter order in docstring --- src/vak/eval/eval.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/vak/eval/eval.py b/src/vak/eval/eval.py index 3c76788a2..5d8d4a5d8 100644 --- a/src/vak/eval/eval.py +++ b/src/vak/eval/eval.py @@ -20,9 +20,10 @@ def eval( model_config: dict, dataset_path: str | pathlib.Path, checkpoint_path: str | pathlib.Path, - labelmap_path: str | pathlib.Path, output_dir: str | pathlib.Path, num_workers: int, + labelmap_path: str | pathlib.Path | None = None, + batch_size: int | None = None, transform_params: dict | None = None, dataset_params: dict | None = None, split: str = "test", @@ -46,15 +47,15 @@ def eval( path to directory with checkpoint files saved by Torch, to reload model output_dir : str, pathlib.Path Path to location where .csv files with evaluation metrics should be saved. - labelmap_path : str, pathlib.Path - path to 'labelmap.json' file. - models : list - of model names. e.g., 'models = TweetyNet, GRUNet, ConvNet' - batch_size : int - number of samples per batch presented to models during training. num_workers : int Number of processes to use for parallel loading of data. Argument to torch.DataLoader. Default is 2. + labelmap_path : str, pathlib.Path, optional + Path to 'labelmap.json' file. + Optional, default is None. + batch_size : int, optional. + Number of samples per batch fed into model. + Optional, default is None. transform_params: dict, optional Parameters for data transform. Passed as keyword arguments. 
@@ -142,6 +143,7 @@ def eval( dataset_path=dataset_path, checkpoint_path=checkpoint_path, output_dir=output_dir, + batch_size=batch_size, num_workers=num_workers, transform_params=transform_params, dataset_params=dataset_params, From 72822d3edbdf902bfae330c87c7934677785551e Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 20:03:55 -0400 Subject: [PATCH 163/184] Pass batch size from config into vak.eval.eval inside vak.cli.eval --- src/vak/cli/eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vak/cli/eval.py b/src/vak/cli/eval.py index 63d278066..04dd08640 100644 --- a/src/vak/cli/eval.py +++ b/src/vak/cli/eval.py @@ -59,6 +59,7 @@ def eval(toml_path): labelmap_path=cfg.eval.labelmap_path, output_dir=cfg.eval.output_dir, num_workers=cfg.eval.num_workers, + batch_size=cfg.eval.batch_size, transform_params=cfg.eval.transform_params, dataset_params=cfg.eval.dataset_params, spect_scaler_path=cfg.eval.spect_scaler_path, From 8c375f7d424a872ef1f9398566766b203e7ca52c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 20:04:09 -0400 Subject: [PATCH 164/184] Fix prep section of config so it makes a test split: tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml --- .../configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml index 4877b7e58..434885d33 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml @@ -1,13 +1,11 @@ [PREP] dataset_type = "parametric umap" input_type = "spect" -data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" +data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" output_dir = "./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP" audio_format = "cbin" annot_format = "notmat" labelset = "iabcdefghjk" -train_dur = 40 -val_dur = 15 [SPECT_PARAMS] fft_size = 512 From f0cd19ee08b9487e869bdaa33c605a5577d5017e Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Wed, 9 Aug 2023 21:59:49 -0400 Subject: [PATCH 165/184] Fix resize option in ConvEncoderUMAP configs so that unit images are square --- .../configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml | 2 +- .../ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml index 434885d33..0eba4753c 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml @@ -22,7 +22,7 @@ output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_not dataset_path = "tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP/032312-vak-dimensionality-reduction-dataset-generated-230727_205727" [EVAL.transform_params] -resize = 128 +resize = [128, 128] [ConvEncoderUMAP.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml 
index 6769f9f22..db59906a2 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -27,10 +27,10 @@ device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/ConvEncoderUMAP" [TRAIN.train_transform_params] -resize = 128 +resize = [128, 128] [TRAIN.val_transform_params] -resize = 128 +resize = [128, 128] [ConvEncoderUMAP.optimizer] lr = 0.001 From de61aac4b90e2c91b69f8f71167b0be8f6fb4729 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 12:13:18 -0400 Subject: [PATCH 166/184] Add shape attribute to parametric_umap.Metadata --- src/vak/datasets/parametric_umap/metadata.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/vak/datasets/parametric_umap/metadata.py b/src/vak/datasets/parametric_umap/metadata.py index 518de94e2..58b80d8be 100644 --- a/src/vak/datasets/parametric_umap/metadata.py +++ b/src/vak/datasets/parametric_umap/metadata.py @@ -52,6 +52,7 @@ class Metadata: Name of csv file representing the source files in the dataset. Csv file will be located in root of directory representing dataset, so only the filename is given. + audio_format """ # declare this as a constant to avoid # needing to remember this in multiple places, and to use in unit tests @@ -65,6 +66,20 @@ class Metadata: default=None ) + shape: tuple = attr.field() + @shape.validator + def is_valid_shape(self, attribute, value): + if not isinstance(value, tuple): + raise TypeError( + f"`shape` should be a tuple but type was: {type(value)}" + ) + if not all( + [isinstance(val, int) and val > 0 for val in value] + ): + raise ValueError( + f"All values of `shape` should be positive integers but values were: {value}" + ) + @classmethod def from_path(cls, json_path: str | pathlib.Path): """Load dataset metadata from a json file. From d8c95b945b09eff819d90eb76741d43e7117b0d4 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 12:14:12 -0400 Subject: [PATCH 167/184] Revise src/vak/prep/unit_dataset/unit_dataset.py for readability, and make it return the shape of all spectrograms --- src/vak/prep/unit_dataset/unit_dataset.py | 127 ++++++++++++++-------- 1 file changed, 82 insertions(+), 45 deletions(-) diff --git a/src/vak/prep/unit_dataset/unit_dataset.py b/src/vak/prep/unit_dataset/unit_dataset.py index 83db3e50e..29002bdfb 100644 --- a/src/vak/prep/unit_dataset/unit_dataset.py +++ b/src/vak/prep/unit_dataset/unit_dataset.py @@ -30,8 +30,11 @@ class Segment: from segmented audio or spectrogram. The attributes are metadata used to track - the origin of this segment in a csv file - representing a dataset of such segments. + the origin of this segment in a dataset + of such segments. + + The dataset including metadata is saved as a csv file + where these attributes become the columns. """ data: npt.NDArray samplerate: int @@ -45,7 +48,9 @@ class Segment: @dask.delayed -def get_segment_list(audio_path, annot, audio_format, context_s=0.005): +def get_segment_list( + audio_path: str, annot: crowsetta.Annotation, audio_format: str, context_s: float = 0.005 +) -> list[Segment]: """Get a list of :class:`Segment` instances, given the path to an audio file and an annotation that indicates where segments occur in that audio file. @@ -56,13 +61,16 @@ def get_segment_list(audio_path, annot, audio_format, context_s=0.005): Parameters ---------- audio_path : str - + Path to an audio file. 
annot : crowsetta.Annotation - + Annotation for audio file. audio_format : str - + String representing audio file format, e.g. 'wav'. context_s : float - + Number of seconds of "context" around unit to + add, i.e., time before and after the onset + and offset respectively. Default is 0.005s, + 5 milliseconds. Returns ------- @@ -88,7 +96,7 @@ def get_segment_list(audio_path, annot, audio_format, context_s=0.005): return segments -def spectrogram_from_segment(segment, spect_params): +def spectrogram_from_segment(segment: Segment, spect_params: dict) -> npt.NDArray: """Compute a spectrogram given a :class:`Segment` instance. Parameters @@ -146,28 +154,6 @@ def save_spect(spect_to_save: SpectToSave, output_dir: str | pathlib.Path) -> st return npy_path -@dask.delayed -def pad_spectrogram(record: tuple, pad_length: float) -> None: - """Pads a spectrogram to a specified length on the left and right sides. - Spectrogram is saved again after padding. - - Parameters - ---------- - record : tuple - pad_length : int - """ - spect_path = record[0] # 'spect_path' - spect = np.load(spect_path) - - excess_needed = pad_length - spect.shape[-1] - pad_left = np.floor(float(excess_needed) / 2).astype("int") - pad_right = np.ceil(float(excess_needed) / 2).astype("int") - spect_padded = np.pad( - spect, [(0, 0), (pad_left, pad_right)], "constant", constant_values=0 - ) - np.save(spect_path, spect_padded) - - def abspath(a_path): """Convert a path to an absolute path""" if isinstance(a_path, str) or isinstance(a_path, pathlib.Path): @@ -177,16 +163,14 @@ def abspath(a_path): # ---- make spectrograms + records for dataframe ----------------------------------------------------------------------- -def make_spect_return_record(segment, ind, spect_params, output_dir): - """helper function that enables parallelized creation of "records", +@dask.delayed +def make_spect_return_record(segment: Segment, ind: int, spect_params: dict, output_dir: pathlib.Path) -> tuple: + """Helper function that enables parallelized creation of "records", i.e. rows for dataframe, from . Accepts a two-element tuple containing (1) a dictionary that represents a spectrogram and (2) annotation for that file""" spect = spectrogram_from_segment(segment, spect_params) - # FIXME: Add parameters for these functions to config and use - # mask_spec(spect) - # log_resize_spec(spect) n_timebins = spect.shape[-1] spect_to_save = SpectToSave(spect, ind, segment.audio_path) @@ -208,8 +192,29 @@ def make_spect_return_record(segment, ind, spect_params, output_dir): return record, n_timebins +@dask.delayed +def pad_spectrogram(record: tuple, pad_length: float) -> None: + """Pads a spectrogram to a specified length on the left and right sides. + Spectrogram is saved again after padding. + + Parameters + ---------- + record : tuple + pad_length : int + """ + spect_path = record[0] # 'spect_path' + spect = np.load(spect_path) + + excess_needed = pad_length - spect.shape[-1] + pad_left = np.floor(float(excess_needed) / 2).astype("int") + pad_right = np.ceil(float(excess_needed) / 2).astype("int") + spect_padded = np.pad( + spect, [(0, 0), (pad_left, pad_right)], "constant", constant_values=0 + ) + np.save(spect_path, spect_padded) + + # constant, used for names of columns in DataFrame below -# this is analogous to ``syllable_df`` that ``avgn`` uses. 
DF_COLUMNS = [ "spect_path", "audio_path", @@ -233,6 +238,29 @@ def prep_unit_dataset( labelset: set | None = None, context_s: float = 0.005, ) -> pd.DataFrame: + """Prepare a dataset of units from sequences, + e.g., all syllables segmented out of a dataset of birdsong. + + Parameters + ---------- + audio_format + output_dir + spect_params + data_dir + annot_format + annot_file + labelset + context_s + + Returns + ------- + unit_df : pandas.DataFrame + A DataFrame representing all the units in the dataset. + shape: tuple + A tuple representing the shape of all spectograms in the dataset. + The spectrograms of all units are padded so that they are all + as wide as the widest unit (i.e, the one with the longest duration). + """ # pre-conditions --------------------------------------------------------------------------------------------------- if audio_format not in constants.VALID_AUDIO_FORMATS: raise ValueError( @@ -272,8 +300,8 @@ def prep_unit_dataset( # no annotation, so map spectrogram files to None audio_annot_map = dict((audio_path, None) for audio_path in audio_files) - # use mapping (if generated/supplied) with labelset, if supplied, to filter - if labelset: # then remove annotations with labels not in labelset + # use labelset, if supplied, with annotations, if any, to filter; + if labelset and annot_list: # then remove annotations with labels not in labelset for audio_file, annot in list(audio_annot_map.items()): # loop in a verbose way (i.e. not a comprehension) # so we can give user warning when we skip files @@ -297,15 +325,20 @@ def prep_unit_dataset( "Loading audio for all segments in all files", ) with ProgressBar(): - segments = dask.compute(*segments) - segments = [segment for segment_list in segments for segment in segment_list] - + segments: list[list[Segment]] = dask.compute(*segments) + segments: list[Segment] = [segment for segment_list in segments for segment in segment_list] + + # ---- make and save all spectrograms *before* padding + # This is a design choice to avoid keeping all the spectrograms in memory + # but since we want to pad all spectrograms to be the same width, + # it requires us to go back, load each one, and pad it. + # Might be worth looking at how often typical dataset sizes in memory and whether this is really necessary. records_n_timebins_tuples = [] for ind, segment in enumerate(segments): - records_n_timebins_tuple = dask.delayed(make_spect_return_record)(segment, ind, spect_params, output_dir) + records_n_timebins_tuple = make_spect_return_record(segment, ind, spect_params, output_dir) records_n_timebins_tuples.append(records_n_timebins_tuple) with ProgressBar(): - records_n_timebins_tuples = dask.compute(*records_n_timebins_tuples) + records_n_timebins_tuples: list[tuple[tuple, int]] = dask.compute(*records_n_timebins_tuples) records, n_timebins_list = [], [] for records_n_timebins_tuple in records_n_timebins_tuples: @@ -321,8 +354,12 @@ def prep_unit_dataset( pad_spectrogram(record, pad_length) ) with ProgressBar(): - _ = dask.compute(*padded) + shapes:list[tuple[int, int]] = dask.compute(*padded) + + shape = set(shapes) + assert len(shape) == 1, f"Did not find a single unique shape for all spectrograms. 
Instead found: {shape}" + shape = shape[0] unit_df = pd.DataFrame.from_records(records, columns=DF_COLUMNS) - return unit_df + return unit_df, shape From ec67027aa2bc6476274dab67e691c6cbc0f1589c Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 12:14:45 -0400 Subject: [PATCH 168/184] Get shape returned by prep_unit_dataset inside src/vak/prep/parametric_umap/parametric_umap.py and use with Metadata --- src/vak/prep/parametric_umap/parametric_umap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vak/prep/parametric_umap/parametric_umap.py b/src/vak/prep/parametric_umap/parametric_umap.py index 2c5ceb3e0..39350669a 100644 --- a/src/vak/prep/parametric_umap/parametric_umap.py +++ b/src/vak/prep/parametric_umap/parametric_umap.py @@ -199,7 +199,7 @@ def prep_parametric_umap_dataset( ) # ---- actually make the dataset ----------------------------------------------------------------------------------- - dataset_df = prep_unit_dataset( + dataset_df, shape = prep_unit_dataset( audio_format=audio_format, output_dir=dataset_path, spect_params=spect_params, @@ -320,6 +320,7 @@ def prep_parametric_umap_dataset( metadata = datasets.parametric_umap.Metadata( dataset_csv_filename=str(dataset_csv_path.name), audio_format=audio_format, + shape=shape, ) metadata.to_json(dataset_path) From c0a1586ae4b5a928584dd73fa1b096699d07389b Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 12:16:10 -0400 Subject: [PATCH 169/184] Fix parametric_umap.Metadata -- shape attribute is mandatory, needs to come before audio_format --- src/vak/datasets/parametric_umap/metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/vak/datasets/parametric_umap/metadata.py b/src/vak/datasets/parametric_umap/metadata.py index 58b80d8be..3ee9d07e4 100644 --- a/src/vak/datasets/parametric_umap/metadata.py +++ b/src/vak/datasets/parametric_umap/metadata.py @@ -60,12 +60,6 @@ class Metadata: dataset_csv_filename: str = attr.field(converter=str, validator=is_valid_dataset_csv_filename) - audio_format: str = attr.field( - converter=attr.converters.optional(str), - validator=attr.validators.optional(is_valid_audio_format), - default=None - ) - shape: tuple = attr.field() @shape.validator def is_valid_shape(self, attribute, value): @@ -80,6 +74,12 @@ def is_valid_shape(self, attribute, value): f"All values of `shape` should be positive integers but values were: {value}" ) + audio_format: str = attr.field( + converter=attr.converters.optional(str), + validator=attr.validators.optional(is_valid_audio_format), + default=None + ) + @classmethod def from_path(cls, json_path: str | pathlib.Path): """Load dataset metadata from a json file. 
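Note on the attrs behavior behind PATCH 169 above: attrs does not allow a mandatory field (one without a default, like `shape`) to be declared after a field that has a default (like `audio_format`), so `shape` has to move above `audio_format`. The sketch below only illustrates that rule; the `BadMetadata` class is hypothetical and not part of vak.

import attr

try:
    @attr.define
    class BadMetadata:
        audio_format: str = attr.field(default=None)  # field with a default
        shape: tuple = attr.field()  # mandatory field declared after a defaulted one
except ValueError as err:
    # attrs refuses to build the class; declaring `shape` before `audio_format`,
    # as the patch above does, avoids this error
    print(err)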
From 071ac0cb1b9fcfd1d0cdcabfacec6354944b3406 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 12:30:25 -0400 Subject: [PATCH 170/184] Fix prep_unit_dataset so we actually get shape of spectrograms --- src/vak/prep/unit_dataset/unit_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vak/prep/unit_dataset/unit_dataset.py b/src/vak/prep/unit_dataset/unit_dataset.py index 29002bdfb..31464d1a8 100644 --- a/src/vak/prep/unit_dataset/unit_dataset.py +++ b/src/vak/prep/unit_dataset/unit_dataset.py @@ -212,6 +212,7 @@ def pad_spectrogram(record: tuple, pad_length: float) -> None: spect, [(0, 0), (pad_left, pad_right)], "constant", constant_values=0 ) np.save(spect_path, spect_padded) + return spect_padded.shape # constant, used for names of columns in DataFrame below @@ -358,7 +359,7 @@ def prep_unit_dataset( shape = set(shapes) assert len(shape) == 1, f"Did not find a single unique shape for all spectrograms. Instead found: {shape}" - shape = shape[0] + shape = shape.pop() unit_df = pd.DataFrame.from_records(records, columns=DF_COLUMNS) From 6ae3b928156c87faac655e81ed4906bd399ffb77 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:23:20 -0400 Subject: [PATCH 171/184] Add converter to parametric_umap.Metadata.shape attribute to cast list to tuple when we load from json --- src/vak/datasets/parametric_umap/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/parametric_umap/metadata.py b/src/vak/datasets/parametric_umap/metadata.py index 3ee9d07e4..339cab793 100644 --- a/src/vak/datasets/parametric_umap/metadata.py +++ b/src/vak/datasets/parametric_umap/metadata.py @@ -60,7 +60,7 @@ class Metadata: dataset_csv_filename: str = attr.field(converter=str, validator=is_valid_dataset_csv_filename) - shape: tuple = attr.field() + shape: tuple = attr.field(converter=tuple) @shape.validator def is_valid_shape(self, attribute, value): if not isinstance(value, tuple): From c3c9945b77493cb801670423d4ebb88ec62a6943 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:24:36 -0400 Subject: [PATCH 172/184] Add functions for default padding to src/vak/models/convencoder_umap.py, to use in train.parametric_umap, eval.parametric_umap, etc. --- src/vak/models/convencoder_umap.py | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/vak/models/convencoder_umap.py b/src/vak/models/convencoder_umap.py index 2d5cbe704..adbcec837 100644 --- a/src/vak/models/convencoder_umap.py +++ b/src/vak/models/convencoder_umap.py @@ -7,6 +7,8 @@ """ from __future__ import annotations +import math + import torch from .. import ( @@ -65,3 +67,33 @@ class ConvEncoderUMAP: 'optimizer': {'lr': 1e-3}, } + + +def next_power_of_2(x: int | float) -> int: + """Compute the nearest power of 2 to a number. + + Used e.g. to pad an input + to a convolutional neural network. + + Parameters + ---------- + x : int, float + A number :math:`x` for which we would like + to find the nearest power of 2. + + Returns + ------- + pow2 : int + The nearest power of 2 to :math:`x`. + """ + return 1 if x == 0 else 2**math.ceil(math.log2(x)) + + +def get_default_padding(shape): + """Get default padding for input to ConvEncoderUMAP model. + + Pads the input shape so that each dimension is a power of 2. 
+ """ + shape_pow2 = tuple(next_power_of_2(x) for x in shape) + padding = (xpow2 - x for (xpow2, x) in zip(shape_pow2, shape)) + return padding From 6a92e3158b8751321335a0e9a5c5167dd7ce0270 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:25:34 -0400 Subject: [PATCH 173/184] Modify default parametric_umap transform so that it only adds padding transform if 'padding' is in the transform_kwargs --- .../transforms/defaults/parametric_umap.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/vak/transforms/defaults/parametric_umap.py b/src/vak/transforms/defaults/parametric_umap.py index e6eef0ad5..345332838 100644 --- a/src/vak/transforms/defaults/parametric_umap.py +++ b/src/vak/transforms/defaults/parametric_umap.py @@ -1,14 +1,12 @@ """Default transforms for Parametric UMAP models.""" from __future__ import annotations -from typing import Callable - import torchvision.transforms from .. import transforms as vak_transforms -def get_default_parametric_umap_transform(transform_kwargs) -> Callable: +def get_default_parametric_umap_transform(transform_kwargs) -> torchvision.transforms.Compose: """Get default transform for frame classification model. Parameters @@ -19,10 +17,12 @@ def get_default_parametric_umap_transform(transform_kwargs) -> Callable: ------- transform : Callable """ - return torchvision.transforms.Compose( - [ - vak_transforms.ToFloatTensor(), - vak_transforms.AddChannel(), - torchvision.transforms.Resize(transform_kwargs['resize']) - ] - ) + transforms = [ + vak_transforms.ToFloatTensor(), + vak_transforms.AddChannel(), + ] + if 'padding' in transform_kwargs: + transforms.append( + torchvision.transforms.Pad(transform_kwargs['padding']) + ) + return torchvision.transforms.Compose(transforms) From a336373ba4627a7ff33e2881a1288eecbdda34e4 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:25:56 -0400 Subject: [PATCH 174/184] Modify train/parametric_umap to use default padding for ConvEncoderUMAP model --- src/vak/train/parametric_umap.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index 088d72d06..55047f371 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -224,6 +224,9 @@ def train_parametric_umap_model( if train_transform_params is None: train_transform_params = {} + if 'padding' not in train_transform_params and model_name == 'ConvEncoderUMAP': + padding = models.convencoder_umap.get_default_padding(metadata.shape) + train_transform_params['padding'] = padding transform = transforms.defaults.get_default_transform(model_name, "train", train_transform_params) if train_dataset_params is None: @@ -251,6 +254,9 @@ def train_parametric_umap_model( transform = transforms.defaults.get_default_transform(model_name, "eval", val_transform_params) if val_dataset_params is None: val_dataset_params = {} + if 'padding' not in val_transform_params and model_name == 'ConvEncoderUMAP': + padding = models.convencoder_umap.get_default_padding(metadata.shape) + val_transform_params['padding'] = padding val_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, split=split, From d6e459064249657769c89392f34eb0ba025c74f0 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:47:03 -0400 Subject: [PATCH 175/184] Rewrite default padding for convencoder_umap to round to nearest tens place --- src/vak/models/convencoder_umap.py | 26 +++----------------------- 
1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/src/vak/models/convencoder_umap.py b/src/vak/models/convencoder_umap.py index adbcec837..ace0e4193 100644 --- a/src/vak/models/convencoder_umap.py +++ b/src/vak/models/convencoder_umap.py @@ -69,31 +69,11 @@ class ConvEncoderUMAP: } -def next_power_of_2(x: int | float) -> int: - """Compute the nearest power of 2 to a number. - - Used e.g. to pad an input - to a convolutional neural network. - - Parameters - ---------- - x : int, float - A number :math:`x` for which we would like - to find the nearest power of 2. - - Returns - ------- - pow2 : int - The nearest power of 2 to :math:`x`. - """ - return 1 if x == 0 else 2**math.ceil(math.log2(x)) - - def get_default_padding(shape): """Get default padding for input to ConvEncoderUMAP model. - Pads the input shape so that each dimension is a power of 2. + Rounds up to nearest tens place """ - shape_pow2 = tuple(next_power_of_2(x) for x in shape) - padding = (xpow2 - x for (xpow2, x) in zip(shape_pow2, shape)) + rounded_up = tuple(10 * math.ceil(x / 10) for x in shape) + padding = tuple(rounded_up_x - shape_x for (rounded_up_x, shape_x) in zip(rounded_up, shape)) return padding From b6e08f2f405a0541b39ce23778a10369c59f2997 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:48:39 -0400 Subject: [PATCH 176/184] Move code block that gets default padding for ConvEncoderUMAP so it's in the right place, before we get the transforms with it --- src/vak/train/parametric_umap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index 55047f371..22c2a45ea 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -251,12 +251,12 @@ def train_parametric_umap_model( if val_step: if val_transform_params is None: val_transform_params = {} - transform = transforms.defaults.get_default_transform(model_name, "eval", val_transform_params) - if val_dataset_params is None: - val_dataset_params = {} if 'padding' not in val_transform_params and model_name == 'ConvEncoderUMAP': padding = models.convencoder_umap.get_default_padding(metadata.shape) val_transform_params['padding'] = padding + transform = transforms.defaults.get_default_transform(model_name, "eval", val_transform_params) + if val_dataset_params is None: + val_dataset_params = {} val_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, split=split, From 0fd0404b7938112276880b7d571ba6dcc278e59d Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:53:53 -0400 Subject: [PATCH 177/184] Make fixes in eval/parametric_umap -- get default padding for ConvEncoder UMAP, remove spect_scaler_path since it's not used --- src/vak/eval/parametric_umap.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/vak/eval/parametric_umap.py b/src/vak/eval/parametric_umap.py index e32d5bf79..083fef8cb 100644 --- a/src/vak/eval/parametric_umap.py +++ b/src/vak/eval/parametric_umap.py @@ -6,7 +6,6 @@ import logging import pathlib -import joblib import pytorch_lightning as lightning import pandas as pd import torch.utils.data @@ -34,7 +33,6 @@ def eval_parametric_umap_model( transform_params: dict | None = None, dataset_params: dict | None = None, split: str = "test", - spect_scaler_path: str | pathlib.Path = None, device: str | None = None, ) -> None: """Evaluate a trained model. 
@@ -75,8 +73,8 @@ def eval_parametric_umap_model( """ # ---- pre-conditions ---------------------------------------------------------------------------------------------- for path, path_name in zip( - (checkpoint_path, spect_scaler_path), - ('checkpoint_path', 'spect_scaler_path'), + (checkpoint_path,), + ('checkpoint_path',), ): if path is not None: # because `spect_scaler_path` is optional if not validators.is_a_file(path): @@ -89,6 +87,10 @@ def eval_parametric_umap_model( raise NotADirectoryError( f"`dataset_path` not found or not recognized as a directory: {dataset_path}" ) + logger.info( + f"Loading metadata from dataset path: {dataset_path}", + ) + metadata = datasets.parametric_umap.Metadata.from_dataset_path(dataset_path) if not validators.is_a_directory(output_dir): raise NotADirectoryError( @@ -99,16 +101,12 @@ def eval_parametric_umap_model( timenow = datetime.now().strftime("%y%m%d_%H%M%S") # ---------------- load data for evaluation ------------------------------------------------------------------------ - if spect_scaler_path: - logger.info(f"loading spect scaler from path: {spect_scaler_path}") - spect_standardizer = joblib.load(spect_scaler_path) - else: - logger.info(f"not using a spect scaler") - spect_standardizer = None - if transform_params is None: transform_params = {} - transform_params.update({'spect_standardizer': spect_standardizer}) + if 'padding' not in transform_params and model_name == 'ConvEncoderUMAP': + padding = models.convencoder_umap.get_default_padding(metadata.shape) + transform_params['padding'] = padding + item_transform = transforms.defaults.get_default_transform( model_name, "eval", From ea52abb1c4624dd3d6c81ae43f3398bf14454dbb Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Thu, 10 Aug 2023 13:54:38 -0400 Subject: [PATCH 178/184] Remove passing parameter 'spect_scaler_path' into vak.eval.eval_parametric_umap_model' inside vak.cli.eval --- src/vak/eval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vak/eval/eval.py b/src/vak/eval/eval.py index 5d8d4a5d8..b99be8817 100644 --- a/src/vak/eval/eval.py +++ b/src/vak/eval/eval.py @@ -148,7 +148,6 @@ def eval( transform_params=transform_params, dataset_params=dataset_params, split=split, - spect_scaler_path=spect_scaler_path, device=device, ) else: From 23b19d67b111decf948ca3b835cf37abe5d5e737 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 11 Aug 2023 11:17:31 -0400 Subject: [PATCH 179/184] WIP: Add missing docstrings in src/vak/datasets/frame_classification/frames_dataset.py --- .../datasets/frame_classification/frames_dataset.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/vak/datasets/frame_classification/frames_dataset.py b/src/vak/datasets/frame_classification/frames_dataset.py index a58654c30..b66a640c2 100644 --- a/src/vak/datasets/frame_classification/frames_dataset.py +++ b/src/vak/datasets/frame_classification/frames_dataset.py @@ -100,6 +100,18 @@ def from_dataset_path( split: str = "val", item_transform: Callable | None = None, ): + """ + + Parameters + ---------- + dataset_path + split + item_transform + + Returns + ------- + + """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) frame_dur = metadata.frame_dur From c573f07a954d9133f4b192f1b694cfbba8751c49 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 11 Aug 2023 11:17:42 -0400 Subject: [PATCH 180/184] WIP: Add missing docstrings in src/vak/datasets/frame_classification/window_dataset.py --- .../frame_classification/window_dataset.py 
| 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/vak/datasets/frame_classification/window_dataset.py b/src/vak/datasets/frame_classification/window_dataset.py index c66b22b28..d6397d44b 100644 --- a/src/vak/datasets/frame_classification/window_dataset.py +++ b/src/vak/datasets/frame_classification/window_dataset.py @@ -197,6 +197,21 @@ def from_dataset_path( transform: Callable | None = None, target_transform: Callable | None = None ): + """ + + Parameters + ---------- + dataset_path + window_size + stride + split + transform + target_transform + + Returns + ------- + + """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) frame_dur = metadata.frame_dur From 362af10cb29a9ae9a1abc5a06d0db4ab2c60f98a Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Fri, 11 Aug 2023 12:32:57 -0400 Subject: [PATCH 181/184] Revise docstrings in src/vak/datasets/parametric_umap/parametric_umap.py, rename ParametricUMAPDataset -> ParametricUMAPTrainingDataset, and add ParametricUMAPInferenceDataset --- .../parametric_umap/parametric_umap.py | 210 ++++++++++++++++-- 1 file changed, 193 insertions(+), 17 deletions(-) diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index b6b8e74f8..47a273bd2 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -2,10 +2,13 @@ import pathlib import warnings +from typing import Callable import numpy as np +import numpy.typing as npt import pandas as pd from pynndescent import NNDescent +import scipy.sparse._coo from sklearn.utils import check_random_state from torch.utils.data import Dataset @@ -17,7 +20,62 @@ from umap.umap_ import fuzzy_simplicial_set -def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state: int | None = None, max_candidates=60, verbose=True): +def get_umap_graph(X: npt.NDArray, n_neighbors: int = 10, metric: str= "euclidean", + random_state: np.random.RandomState | None = None, + max_candidates: int = 60, verbose: bool = True) -> scipy.sparse._coo.coo_matrix: + r"""Get graph used by UMAP, + the fuzzy topological representation. + + Parameters + ---------- + X : numpy.ndarray + Data from which to build the graph. + n_neighbors : int + Number of nearest neighbors to use + when computing approximate nearest neighbors. + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + metric : str + Distance metric. Default is "euclidean". + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + random_state : numpy.random.RandomState + Either a numpy.random.RandomState instance, + or None. + max_candidates : int + Default is 60. + Parameter passed to :class:`pynndescent.NNDescent`. + verbose : bool + Whether :class:`pynndescent.NNDescent` should log + finding the approximate nearest neighbors. + Default is True. + + Returns + ------- + graph : scipy.sparse.csr_matrix + + Notes + ----- + Adapted from https://github.com/timsainb/ParametricUMAP_paper + + The graph returned is a graph of the probabilities that an edge exists between points. + + Local, one-directional, probabilities (:math:`P^{UMAP}_{i|j}`) + are computed between a point and its neighbors to determine + the probability with which an edge (or simplex) exists, + based upon an assumption that data is uniformly distributed + across a manifold in a warped dataspace.
+ Under this assumption, a local notion of distance + is set by the distance to the :math:`k^{th}` nearest neighbor + and the local probability is scaled by that local notion of distance. + + Where :math:`\rho_{i}` is a local connectivity parameter set + to the distance from :math:`x_i` to its nearest neighbor, + and :math:`\sigma_{i}` is a local connectivity parameter + set to match the local distance around :math:`x_i` upon its :math:`k` nearest neighbors + (where :math:`k` is a hyperparameter). + In the UMAP package, these are calculated using :func:`umap._umap.smooth_knn_dist`. + """ random_state = check_random_state(None) if random_state == None else random_state # number of trees in random projection forest @@ -25,7 +83,6 @@ def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state # max number of nearest neighbor iters to perform n_iters = max(5, int(round(np.log2(X.shape[0])))) - # distance metric # get nearest neighbors nnd = NNDescent( @@ -38,10 +95,10 @@ def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state verbose=verbose ) - # get indices and distances + # get indices and distances for 10 nearest neighbors of every point in dataset knn_indices, knn_dists = nnd.neighbor_graph - # build fuzzy_simplicial_set + # build fuzzy simplicial complex umap_graph, sigmas, rhos = fuzzy_simplicial_set( X=X, n_neighbors=n_neighbors, @@ -54,9 +111,32 @@ def get_umap_graph(X, n_neighbors: int = 10, metric: str= "cosine", random_state return umap_graph -def get_graph_elements(graph, n_epochs): - """Get graph elements for UMAP Dataset""" - +def get_graph_elements( + graph: scipy.sparse._coo.coo_matrix, n_epochs: int +) -> tuple[scipy.sparse._coo.coo_matrix, npt.NDArray, npt.NDArray, npt.NDArray, npt.NDArray, int]: + """Get graph elements for Parametric UMAP Dataset. + + Parameters + ---------- + graph : scipy.sparse.csr_matrix + The graph returned by :func:`get_umap_graph`. + n_epochs : int + Number of epochs model will be trained + + Returns + ------- + graph : scipy.sparse._coo.coo_matrix + The graph, now in COOrdinate format. + epochs_per_sample : int + head : numpy.ndarray + Graph rows. + tail : numpy.ndarray + Graph columns. + weight : numpy.ndarray + Graph data. + n_vertices : int + Number of vertices in dataset. + """ graph = graph.tocoo() # eliminate duplicate entries by summing them together @@ -87,14 +167,20 @@ def get_graph_elements(graph, n_epochs): return graph, epochs_per_sample, head, tail, weight, n_vertices -class ParametricUMAPDataset(Dataset): - def __init__(self, data, graph, dataset_df, n_epochs=200, transform=None): +class ParametricUMAPTrainingDataset(Dataset): + """Dataset used for training Parametric UMAP models + + """ + def __init__(self, data: npt.NDArray, graph, + dataset_df: pd.DataFrame, n_epochs: int = 200, transform: Callable | None = None): graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(graph, n_epochs) + # we repeat each sample in (head, tail) a certain number of times depending on its probability self.edges_to_exp, self.edges_from_exp = ( np.repeat(head, epochs_per_sample.astype("int")), np.repeat(tail, epochs_per_sample.astype("int")), ) + # we then shuffle -- not sure this is necessary if the dataset is shuffled during training? 
         shuffle_mask = np.random.permutation(np.arange(len(self.edges_to_exp)))
         self.edges_to_exp = self.edges_to_exp[shuffle_mask].astype(np.int64)
         self.edges_from_exp = self.edges_from_exp[shuffle_mask].astype(np.int64)
@@ -108,7 +194,7 @@ def duration(self):
         return self.dataset_df['duration'].sum()

     def __len__(self):
-        return int(self.data.shape[0])
+        return self.edges_to_exp.shape[0]

     @property
     def shape(self):
@@ -126,13 +212,30 @@

     @classmethod
     def from_dataset_path(cls,
-                          dataset_path,
-                          split,
-                          n_neighbors=10,
-                          metric='euclidean',
-                          random_state=None,
-                          n_epochs=200,
-                          transform=None):
+                          dataset_path: str | pathlib.Path,
+                          split: str,
+                          n_neighbors: int = 10,
+                          metric: str = 'euclidean',
+                          random_state: int | None = None,
+                          n_epochs: int = 200,
+                          transform: Callable | None = None):
+        """Make a new instance of this dataset class from a path to a directory that represents a dataset.
+
+        Parameters
+        ----------
+        dataset_path : str, pathlib.Path
+            Path to a directory that represents a dataset.
+        split : str
+        n_neighbors : int
+        metric : str
+        random_state : int, optional
+        n_epochs : int
+        transform : callable, optional
+
+        Returns
+        -------
+
+        """
        import vak.datasets  # import here just to make classmethod more explicit

         dataset_path = pathlib.Path(dataset_path)
@@ -156,3 +259,76 @@
             n_epochs,
             transform=transform,
         )
+
+
+class ParametricUMAPInferenceDataset(Dataset):
+    def __init__(self, data: npt.NDArray, dataset_df: pd.DataFrame, transform: Callable | None = None):
+        self.data = data
+        self.dataset_df = dataset_df
+        self.transform = transform
+
+    @property
+    def duration(self):
+        return self.dataset_df['duration'].sum()
+
+    def __len__(self):
+        return self.data.shape[0]
+
+    @property
+    def shape(self):
+        tmp_x_ind = 0
+        tmp_item = self.__getitem__(tmp_x_ind)
+        return tmp_item['x'].shape
+
+    def __getitem__(self, index):
+        x = self.data[index]
+        df_index = self.dataset_df.index[index]
+        if self.transform:
+            x = self.transform(x)
+        return {'x': x, 'df_index': df_index}
+
+    @classmethod
+    def from_dataset_path(cls,
+                          dataset_path: str | pathlib.Path,
+                          split: str,
+                          n_neighbors: int = 10,
+                          metric: str = 'euclidean',
+                          random_state: int | None = None,
+                          n_epochs: int = 200,
+                          transform: Callable | None = None):
+        """Make a new instance of this dataset class from a path to a directory that represents a dataset.
+
+        Parameters
+        ----------
+        dataset_path : str, pathlib.Path
+            Path to a directory that represents a dataset.
+        split : str
+        n_neighbors : int
+        metric : str
+        random_state : int, optional
+        n_epochs : int
+        transform : callable, optional
+
+        Returns
+        -------
+
+        """
+        import vak.datasets  # import here just to make classmethod more explicit
+
+        dataset_path = pathlib.Path(dataset_path)
+        metadata = vak.datasets.parametric_umap.Metadata.from_dataset_path(dataset_path)
+
+        dataset_csv_path = dataset_path / metadata.dataset_csv_filename
+        dataset_df = pd.read_csv(dataset_csv_path)
+        split_df = dataset_df[dataset_df.split == split]
+
+        data = np.stack(
+            [
+                np.load(dataset_path / spect_path) for spect_path in split_df.spect_path.values
+            ]
+        )
+        return cls(
+            data,
+            split_df,
+            transform=transform,
+        )

From fad7b532a195ebf9c7a95db32b39e9687496868e Mon Sep 17 00:00:00 2001
From: David Nicholson
Date: Fri, 11 Aug 2023 12:33:10 -0400
Subject: [PATCH 182/184] WIP: Add src/vak/predict/parametric_umap.py

---
 src/vak/predict/parametric_umap.py | 187 +++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 src/vak/predict/parametric_umap.py

diff --git a/src/vak/predict/parametric_umap.py b/src/vak/predict/parametric_umap.py
new file mode 100644
index 000000000..4b056b289
--- /dev/null
+++ b/src/vak/predict/parametric_umap.py
@@ -0,0 +1,187 @@
+"""Function that generates new inferences from trained models in the parametric UMAP family."""
+from __future__ import annotations
+
+import logging
+import os
+import pathlib
+
+import pytorch_lightning as lightning
+import torch.utils.data
+
+from .. import (
+    datasets,
+    models,
+    transforms
+)
+from ..common import (
+    constants,
+    validators
+)
+from ..datasets.parametric_umap import ParametricUMAPDataset
+from ..common.device import get_default as get_default_device
+
+
+logger = logging.getLogger(__name__)
+
+
+def predict_with_parametric_umap_model(
+    model_name: str,
+    model_config: dict,
+    dataset_path,
+    checkpoint_path,
+    num_workers=2,
+    transform_params: dict | None = None,
+    dataset_params: dict | None = None,
+    timebins_key="t",
+    device=None,
+    annot_csv_filename=None,
+    output_dir=None,
+):
+    """Make predictions on a dataset with a trained model.
+
+    Parameters
+    ----------
+    model_name : str
+        Model name, must be one of vak.models.registry.MODEL_NAMES.
+    model_config : dict
+        Model configuration in a ``dict``,
+        as loaded from a .toml file,
+        and used by the model method ``from_config``.
+    dataset_path : str
+        Path to a directory that represents a dataset, generated by running ``vak prep``.
+    checkpoint_path : str
+        path to checkpoint file saved by torch, used to reload the trained model
+    num_workers : int
+        Number of processes to use for parallel loading of data.
+        Argument to torch.DataLoader. Default is 2.
+    transform_params: dict, optional
+        Parameters for data transform.
+        Passed as keyword arguments.
+        Optional, default is None.
+    dataset_params: dict, optional
+        Parameters for dataset.
+        Passed as keyword arguments.
+        Optional, default is None.
+    timebins_key : str
+        key for accessing vector of time bins in files. Default is 't'.
+    device : str
+        Device on which to work with model + data.
+        Defaults to 'cuda' if torch.cuda.is_available is True.
+    annot_csv_filename : str
+        name of .csv file containing predicted annotations.
+        Default is None, in which case the name of the dataset .csv
+        is used, with '.annot.csv' appended to it.
+    output_dir : str, Path
+        path to location where .csv containing predicted annotation
+        should be saved. Defaults to current working directory.
+    """
+    for path, path_name in zip(
+        (checkpoint_path,),
+        ('checkpoint_path',),
+    ):
+        if path is not None:
+            if not validators.is_a_file(path):
+                raise FileNotFoundError(
+                    f"value for ``{path_name}`` not recognized as a file: {path}"
+                )
+
+    dataset_path = pathlib.Path(dataset_path)
+    if not dataset_path.exists() or not dataset_path.is_dir():
+        raise NotADirectoryError(
+            f"`dataset_path` not found or not recognized as a directory: {dataset_path}"
+        )
+    logger.info(
+        f"Loading metadata from dataset path: {dataset_path}",
+    )
+    metadata = datasets.parametric_umap.Metadata.from_dataset_path(dataset_path)
+
+    if output_dir is None:
+        output_dir = pathlib.Path(os.getcwd())
+    else:
+        output_dir = pathlib.Path(output_dir)
+
+    if not output_dir.is_dir():
+        raise NotADirectoryError(
+            f"value specified for output_dir is not recognized as a directory: {output_dir}"
+        )
+
+    if device is None:
+        device = get_default_device()
+
+    # ---------------- load data for prediction ------------------------------------------------------------------------
+    if transform_params is None:
+        transform_params = {}
+    if 'padding' not in transform_params and model_name == 'ConvEncoderUMAP':
+        padding = models.convencoder_umap.get_default_padding(metadata.shape)
+        transform_params['padding'] = padding
+
+    item_transform = transforms.defaults.get_default_transform(
+        model_name,
+        "predict",
+        transform_params
+    )
+
+    dataset_csv_path = dataset_path / metadata.dataset_csv_filename
+    logger.info(f"loading dataset to predict from csv path: {dataset_csv_path}")
+
+    if dataset_params is None:
+        dataset_params = {}
+    pred_dataset = ParametricUMAPDataset.from_dataset_path(
+        dataset_path=dataset_path,
+        split="predict",
+        transform=item_transform,
+        **dataset_params
+    )
+
+    pred_loader = torch.utils.data.DataLoader(
+        dataset=pred_dataset,
+        shuffle=False,
+        # batch size 1 because we make a prediction for one sample (one spectrogram) at a time
+        batch_size=1,
+        num_workers=num_workers,
+    )
+
+    # ---------------- set up to convert predictions to annotation files -----------------------------------------------
+    if annot_csv_filename is None:
+        annot_csv_filename = pathlib.Path(dataset_path).stem + constants.ANNOT_CSV_SUFFIX
+    annot_csv_path = pathlib.Path(output_dir).joinpath(annot_csv_filename)
+    logger.info(f"will save annotations in .csv file: {annot_csv_path}")
+
+    # ---------------- do the actual predicting + converting to annotations --------------------------------------------
+    input_shape = pred_dataset.shape
+    # if dataset returns spectrogram reshaped into windows,
+    # throw out the window dimension; just want to tell network (channels, height, width) shape
+    if len(input_shape) == 4:
+        input_shape = input_shape[1:]
+    logger.info(f"Shape of input to networks used for predictions: {input_shape}")
+
+    logger.info(f"instantiating model from config:\n{model_name}")
+
+    model = models.get(
+        model_name,
+        model_config,
+        input_shape=input_shape,
+    )
+
+    # ---------------- do the actual predicting --------------------------------------------------------------------
+    logger.info(f"loading checkpoint for {model_name} from path: {checkpoint_path}")
+    model.load_state_dict_from_path(checkpoint_path)
+
+    if device == 'cuda':
+        accelerator = 'gpu'
+    else:
+        accelerator = None
+    trainer_logger = lightning.loggers.TensorBoardLogger(
+        save_dir=output_dir
+    )
+    trainer = lightning.Trainer(accelerator=accelerator, logger=trainer_logger)
+
+    logger.info(f"running predict method of {model_name}")
+    results = trainer.predict(model,
pred_loader) + + eval_df = pd.DataFrame(row, index=[0]) + eval_csv_path = output_dir.joinpath(f"eval_{model_name}_{timenow}.csv") + logger.info(f"saving csv with evaluation metrics at: {eval_csv_path}") + eval_df.to_csv( + eval_csv_path, index=False + ) # index is False to avoid having "Unnamed: 0" column when loading From 84bd98b2b78e5f9808f4f97a29f37452afa65cf0 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 13 Aug 2023 21:25:13 -0400 Subject: [PATCH 183/184] Rename ParametricUMAPTrainingDataset -> ParametricUMAPDataset --- src/vak/datasets/parametric_umap/parametric_umap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index 47a273bd2..9dcbaabca 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -167,7 +167,7 @@ def get_graph_elements( return graph, epochs_per_sample, head, tail, weight, n_vertices -class ParametricUMAPTrainingDataset(Dataset): +class ParametricUMAPDataset(Dataset): """Dataset used for training Parametric UMAP models """ From 96a8d56a036fa9be3969e5050b8a73b6f2543908 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Sun, 13 Aug 2023 21:25:29 -0400 Subject: [PATCH 184/184] Remove transform_params table from ConvEncoderUMAP configs --- .../ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml | 3 --- .../ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml | 6 ------ 2 files changed, 9 deletions(-) diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml index 0eba4753c..b85935d23 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml @@ -21,8 +21,5 @@ device = "cuda" output_dir = "./tests/data_for_tests/generated/results/eval/audio_cbin_annot_notmat/ConvEncoderUMAP" dataset_path = "tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/ConvEncoderUMAP/032312-vak-dimensionality-reduction-dataset-generated-230727_205727" -[EVAL.transform_params] -resize = [128, 128] - [ConvEncoderUMAP.optimizer] lr = 0.001 diff --git a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml index db59906a2..102dd6192 100644 --- a/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml @@ -26,11 +26,5 @@ num_workers = 2 device = "cuda" root_results_dir = "./tests/data_for_tests/generated/results/train/audio_cbin_annot_notmat/ConvEncoderUMAP" -[TRAIN.train_transform_params] -resize = [128, 128] - -[TRAIN.val_transform_params] -resize = [128, 128] - [ConvEncoderUMAP.optimizer] lr = 0.001