diff --git a/.github/workflows/validate_bids-examples.yml b/.github/workflows/validate_bids-examples.yml new file mode 100644 index 0000000000..f85d81cacf --- /dev/null +++ b/.github/workflows/validate_bids-examples.yml @@ -0,0 +1,119 @@ +--- +name: validate_datasets + +on: + push: + branches: ["master"] + pull_request: + branches: ["**"] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + platform: [ubuntu-latest] + bids-validator: [master-deno] + python-version: ["3.11"] + + runs-on: ${{ matrix.platform }} + + env: + TZ: Europe/Berlin + FORCE_COLOR: 1 + + steps: + - uses: actions/checkout@v4 + + # Setup Python with bst + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: "Install build dependencies" + run: pip install --upgrade build twine + - name: "Build source distribution and wheel" + run: python -m build tools/schemacode + - name: "Check distribution metadata" + run: twine check tools/schemacode/dist/* + - name: "Install bst tools from the build" + run: pip install $( ls tools/schemacode/dist/*.whl )[all] + - name: "Produce dump of the schema as schema.json" + run: bst -v export --output src/schema.json + + - uses: denoland/setup-deno@v1 + if: "matrix.bids-validator == 'master-deno'" + with: + deno-version: v1.x + + - name: Install BIDS validator (master deno build) + if: "matrix.bids-validator == 'master-deno'" + run: | + pushd .. + # Let's use specific commit for now + # TODO: progress it once in a while + commit=a7b291b882a8c6184219ccb84faae255ba96203a + git clone --depth 1 https://github.com/bids-standard/bids-validator + cd bids-validator + git fetch --depth 1 origin $commit + echo -e '#!/bin/sh\n'"$PWD/bids-validator/bids-validator-deno \"\$@\"" >| /tmp/bids-validator + chmod a+x /tmp/bids-validator + sudo mv /tmp/bids-validator /usr/local/bin/bids-validator + which -a bids-validator + bids-validator --help + popd + + - name: Display versions and environment information + run: | + echo $TZ + date + echo -n "npm: "; npm --version + echo -n "node: "; node --version + echo -n "bids-validator: "; bids-validator --version + echo -n "python: "; python --version + + # Checkout bids-examples + - uses: actions/checkout@v4 + with: + # For now use the forked repository with support for deno validator + # from https://github.com/bids-standard/bids-examples/pull/435 + repository: yarikoptic/bids-examples + ref: deno-validator + path: bids-examples + + - name: Mark known not yet to be deno-legit BIDS datasets + run: >- + touch + {ds000117,ds000246,ds000247,ds000248,eeg_ds003645s_hed_demo,ieeg_motorMiller2007,ieeg_visual}/.SKIP_VALIDATION + shell: bash + working-directory: bids-examples + + - name: Validate using bids-validator without migration + run: ./run_tests.sh + working-directory: bids-examples + + - name: Migrate all BIDS datasets + run: | + for dataset in */dataset_description.json; do + dataset_dir=$(dirname "$dataset") + echo "Migrating $dataset_dir..." 
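+            # fix_inheritance_overloading and fix_tsv_entity_prefix are check-only
+            # migrations (they report issues without modifying files; see
+            # tools/schemacode/MIGRATE.md), so they are skipped below.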
+ bst migrate all "$dataset_dir" \ + --skip fix_inheritance_overloading \ + --skip fix_tsv_entity_prefix || \ + echo "Migration failed for $dataset_dir" + done + shell: bash + working-directory: bids-examples + + - name: Show migrated datasets diff + run: git diff HEAD + working-directory: bids-examples + + - name: Validate all BIDS datasets using bids-validator after migration + run: >- + VALIDATOR_ARGS="--schema file://$PWD/../src/schema.json" + bash ./run_tests.sh + working-directory: bids-examples diff --git a/tools/schemacode/MIGRATE.md b/tools/schemacode/MIGRATE.md new file mode 100644 index 0000000000..b116c518f5 --- /dev/null +++ b/tools/schemacode/MIGRATE.md @@ -0,0 +1,209 @@ +# BIDS Dataset Migration Tools + +The BIDS Schema Tools package includes a `migrate` command to help dataset maintainers adopt new standardized conventions introduced in BIDS specification updates. + +## Overview + +As the BIDS specification evolves, new conventions and metadata fields are introduced. The migration tools help you upgrade existing datasets to use these new standards without manual editing. + +## Usage + +### List Available Migrations + +To see all available migrations: + +```bash +bst migrate list +``` + +This will display: +- Migration name +- BIDS version when it was introduced +- Description of what the migration does + +### Run a Specific Migration + +To run a single migration on a dataset: + +```bash +bst migrate run MIGRATION_NAME /path/to/dataset +``` + +For example: +```bash +bst migrate run standardize_generatedby /data/my_bids_dataset +``` + +### Dry Run Mode + +To preview what would be changed without modifying files: + +```bash +bst migrate run MIGRATION_NAME /path/to/dataset --dry-run +``` + +### Run All Migrations + +To run all available migrations on a dataset: + +```bash +bst migrate all /path/to/dataset +``` + +Skip specific migrations: + +```bash +bst migrate all /path/to/dataset --skip standardize_generatedby --skip another_migration +``` + +## Available Migrations + +### standardize_generatedby (version 1.10.0) + +**Purpose:** Convert pre-standard provenance metadata to the standardized `GeneratedBy` field introduced in BEP028. + +**What it does:** +- Looks for legacy provenance fields: `Pipeline`, `ProcessingPipeline`, `Software`, `Tool`, `Provenance` +- Converts them to the standard `GeneratedBy` array format +- Removes the legacy fields after migration + +**Example:** + +Before migration: +```json +{ + "Name": "My Dataset", + "BIDSVersion": "1.9.0", + "Pipeline": { + "Name": "fmriprep", + "Version": "1.4.1" + }, + "Software": "SPM12" +} +``` + +After migration: +```json +{ + "Name": "My Dataset", + "BIDSVersion": "1.9.0", + "GeneratedBy": [ + { + "Name": "fmriprep", + "Version": "1.4.1", + "Description": "Migrated from Pipeline field" + }, + { + "Name": "SPM12", + "Description": "Migrated from Software field" + } + ] +} +``` + +### fix_inheritance_overloading (version 1.10.1) + +**Purpose:** Check for deprecated inheritance overloading patterns (PR #1834). + +**What it does:** +- Scans JSON sidecar files across different inheritance scopes +- Identifies fields that have different values at different levels (dataset, subject, session) +- Reports warnings about this deprecated pattern +- Does not modify files (check-only migration) + +**Background:** Using different values for the same metadata field at different inheritance levels will not be supported in BIDS 2.0. This migration helps identify such cases so they can be addressed. 
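+
+**Example issue** (illustrative layout): the check would flag a field such as
+`RepetitionTime` given one value in a dataset-level sidecar and a different value
+in a subject-level sidecar:
+
+`task-rest_bold.json` (dataset level):
+
+```json
+{
+  "RepetitionTime": 2.0
+}
+```
+
+`sub-01/task-rest_bold.json` (subject level):
+
+```json
+{
+  "RepetitionTime": 1.5
+}
+```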
+ +### fix_tsv_entity_prefix (version 1.10.1) + +**Purpose:** Check for missing entity prefixes in TSV column headers (PR #2281). + +**What it does:** +- Checks TSV files that should use entity prefixes (e.g., `samples.tsv`, `participants.tsv`) +- Identifies columns that lack proper entity prefixes +- Provides suggestions for corrections +- Does not modify files (check-only migration) + +**Example issue:** +In `samples.tsv`, a column named `id` should be `sample_id`. + +## Creating New Migrations + +Migrations are registered using a decorator pattern. To add a new migration: + +```python +from bidsschematools.migrate import registry + +@registry.register( + name="my_migration", + version="1.11.0", + description="Description of what this migration does" +) +def my_migration(dataset_path, dry_run=False): + """ + Perform the migration. + + Parameters + ---------- + dataset_path : Path + Path to the BIDS dataset root + dry_run : bool + If True, don't modify files + + Returns + ------- + dict + Result with keys: success, modified_files, message + """ + # Migration implementation + modified_files = [] + + # ... your migration code ... + + return { + "success": True, + "modified_files": modified_files, + "message": "Migration completed" + } +``` + +Add your migration to `bidsschematools/migrations.py` and it will automatically be available through the CLI. + +## Migration Best Practices + +1. **Always use dry-run first:** Preview changes before applying them + ```bash + bst migrate run MIGRATION_NAME /path/to/dataset --dry-run + ``` + +2. **Backup your data:** While migrations are designed to be safe, always backup important datasets before running migrations + +3. **Version control:** If your dataset is in git, commit before running migrations so you can review and revert if needed + +4. **Review output:** Check the list of modified files and review changes to ensure they are correct + +5. **Check-only migrations:** Some migrations only report issues without making changes. Review their output and make manual corrections as needed + +## Exit Codes + +- `0`: Success +- `1`: Migration failed or encountered errors +- `2`: Invalid usage (e.g., no dataset_description.json found) + +## Logging + +Use `-v` for verbose output: +```bash +bst -v migrate run MIGRATION_NAME /path/to/dataset +``` + +Use `-vv` for even more detailed logging: +```bash +bst -vv migrate run MIGRATION_NAME /path/to/dataset +``` + +## Related Resources + +- [BIDS Specification](https://bids-specification.readthedocs.io/) +- [BEP028 (Provenance)](https://bids.neuroimaging.io/bep028) +- [PR #1834 (Inheritance Overloading)](https://github.com/bids-standard/bids-specification/pull/1834) +- [PR #2281 (Entity Prefixes)](https://github.com/bids-standard/bids-specification/pull/2281) diff --git a/tools/schemacode/README.md b/tools/schemacode/README.md index 3fe4fc2705..956df5ab9a 100644 --- a/tools/schemacode/README.md +++ b/tools/schemacode/README.md @@ -12,6 +12,7 @@ Features: * lightweight * reference schema parsing implementation used for schema testing * simple CLI bindings (e.g. `bst export`) +* dataset migration tools to adopt new BIDS conventions (e.g. 
`bst migrate`) If you have questions, you can post them in one of several channels where BIDS members are active: - the [NeuroStars](https://neurostars.org/tags/bids) discourse forum diff --git a/tools/schemacode/src/bidsschematools/__main__.py b/tools/schemacode/src/bidsschematools/__main__.py index b0f8e5f967..3b651bd3c8 100644 --- a/tools/schemacode/src/bidsschematools/__main__.py +++ b/tools/schemacode/src/bidsschematools/__main__.py @@ -4,6 +4,7 @@ import re import sys from itertools import chain +from pathlib import Path import click @@ -180,5 +181,161 @@ def pre_receive_hook(schema, input_, output): sys.exit(rc) +@cli.group() +def migrate(): + """Migrate BIDS datasets to adopt standardized conventions""" + pass + + +def _validate_bids_dataset(dataset_path: Path) -> None: + """Validate that a path contains a BIDS dataset. + + Parameters + ---------- + dataset_path : Path + Path to check for BIDS dataset + + Raises + ------ + SystemExit + If dataset_description.json is not found + """ + if not (dataset_path / "dataset_description.json").exists(): + lgr.error( + f"No dataset_description.json found in {dataset_path}. Is this a valid BIDS dataset?" + ) + sys.exit(1) + + +@migrate.command("list") +def migrate_list(): + """List all available migrations""" + from . import migrations # noqa: F401 - Import to register migrations + from .migrate import registry + + migrations_list = registry.list_migrations() + + if not migrations_list: + lgr.info("No migrations available") + return + + click.echo("Available migrations:\n") + for mig in sorted(migrations_list, key=lambda x: x["version"]): + click.echo(f" {mig['name']} (version {mig['version']})") + click.echo(f" {mig['description']}\n") + + +@migrate.command("run") +@click.argument("migration_name") +@click.argument("dataset_path", type=click.Path(exists=True)) +@click.option( + "--dry-run", + is_flag=True, + help="Show what would be changed without modifying files", +) +def migrate_run(migration_name, dataset_path, dry_run): + """Run a specific migration on a BIDS dataset + + MIGRATION_NAME is the name of the migration to run (use 'migrate list' to see available) + + DATASET_PATH is the path to the BIDS dataset root directory + """ + from . 
import migrations # noqa: F401 - Import to register migrations + from .migrate import registry + + dataset_path = Path(dataset_path).resolve() + _validate_bids_dataset(dataset_path) + + try: + result = registry.run(migration_name, dataset_path, dry_run=dry_run) + except ValueError as e: + lgr.error(str(e)) + lgr.info("Use 'bst migrate list' to see available migrations") + sys.exit(1) + + # Display results + if result.get("modified_files"): + click.echo(f"\nModified files ({len(result['modified_files'])}):") + for filepath in result["modified_files"]: + click.echo(f" - {filepath}") + + if result.get("warnings"): + click.echo(f"\nWarnings ({len(result['warnings'])}):") + for warning in result["warnings"]: + click.echo(f" - {warning}") + + if result.get("suggestions"): + click.echo(f"\nSuggestions ({len(result['suggestions'])}):") + for suggestion in result["suggestions"]: + click.echo(f" - {suggestion}") + + click.echo(f"\n{result['message']}") + + if not result["success"]: + sys.exit(1) + + +@migrate.command("all") +@click.argument("dataset_path", type=click.Path(exists=True)) +@click.option( + "--dry-run", + is_flag=True, + help="Show what would be changed without modifying files", +) +@click.option( + "--skip", + multiple=True, + help="Skip specific migrations (can be used multiple times)", +) +def migrate_all(dataset_path, dry_run, skip): + """Run all available migrations on a BIDS dataset + + DATASET_PATH is the path to the BIDS dataset root directory + """ + from . import migrations # noqa: F401 - Import to register migrations + from .migrate import registry + + dataset_path = Path(dataset_path).resolve() + _validate_bids_dataset(dataset_path) + + migrations_list = registry.list_migrations() + skip_set = set(skip) + + if not migrations_list: + click.echo("No migrations available") + return + + click.echo(f"Running {len(migrations_list)} migration(s) on {dataset_path}") + if dry_run: + click.echo("DRY RUN: No files will be modified\n") + + results = [] + for mig in sorted(migrations_list, key=lambda x: x["version"]): + if mig["name"] in skip_set: + click.echo(f"Skipping: {mig['name']}") + continue + + click.echo(f"\nRunning: {mig['name']} (version {mig['version']})") + result = registry.run(mig["name"], dataset_path, dry_run=dry_run) + results.append((mig["name"], result)) + + if result.get("modified_files"): + click.echo(f" Modified {len(result['modified_files'])} file(s)") + if result.get("warnings"): + click.echo(f" {len(result['warnings'])} warning(s)") + if result.get("suggestions"): + click.echo(f" {len(result['suggestions'])} suggestion(s)") + click.echo(f" {result['message']}") + + # Summary + click.echo("\n" + "=" * 60) + click.echo("Migration Summary:") + click.echo("=" * 60) + + for name, result in results: + status = "✓" if result["success"] else "✗" + click.echo(f"{status} {name}: {result['message']}") + + if __name__ == "__main__": cli() diff --git a/tools/schemacode/src/bidsschematools/migrate.py b/tools/schemacode/src/bidsschematools/migrate.py new file mode 100644 index 0000000000..f988d9739c --- /dev/null +++ b/tools/schemacode/src/bidsschematools/migrate.py @@ -0,0 +1,259 @@ +"""BIDS dataset migration utilities. + +This module provides functionality to migrate BIDS datasets from older versions +to newer standardized forms, helping users adopt new conventions and metadata fields. 
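+
+Example (the dataset path below is illustrative). Importing
+``bidsschematools.migrations`` registers the bundled migrations with the
+module-level ``registry``, which can then run them by name::
+
+    from pathlib import Path
+
+    from bidsschematools import migrations  # noqa: F401 - registers migrations
+    from bidsschematools.migrate import registry
+
+    result = registry.run(
+        "standardize_generatedby", Path("/data/my_bids_dataset"), dry_run=True
+    )
+    print(result["message"])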
+""" + +import json +import logging +from pathlib import Path +from typing import Any, Callable, Optional + +lgr = logging.getLogger(__name__) + + +class Migration: + """Represents a single migration operation. + + Attributes + ---------- + name : str + Human-readable name of the migration + version : str + BIDS version when this migration applies (e.g., "1.10.0") + description : str + Description of what this migration does + func : Callable + The function that performs the migration + """ + + def __init__( + self, + name: str, + version: str, + description: str, + func: Callable[[Path], dict[str, Any]], + ): + """Initialize a migration. + + Parameters + ---------- + name : str + Human-readable name of the migration + version : str + BIDS version when this migration applies + description : str + Description of what this migration does + func : Callable + The function that performs the migration, takes dataset path + and returns a dict with 'success', 'modified_files', and 'message' keys + """ + self.name = name + self.version = version + self.description = description + self.func = func + + def __call__(self, dataset_path: Path, **kwargs) -> dict[str, Any]: + """Execute the migration. + + Parameters + ---------- + dataset_path : Path + Path to the BIDS dataset root + **kwargs + Additional keyword arguments for the migration function + + Returns + ------- + dict + Result of the migration with keys: + - success: bool indicating if migration succeeded + - modified_files: list of modified file paths + - message: str with details about the migration + """ + return self.func(dataset_path, **kwargs) + + +class MigrationRegistry: + """Registry for BIDS dataset migrations.""" + + def __init__(self): + """Initialize the migration registry.""" + self._migrations: dict[str, Migration] = {} + + def register( + self, + name: str, + version: str, + description: str, + ) -> Callable: + """Decorator to register a migration function. + + Parameters + ---------- + name : str + Human-readable name of the migration + version : str + BIDS version when this migration applies + description : str + Description of what this migration does + + Returns + ------- + Callable + Decorator function + + Examples + -------- + >>> registry = MigrationRegistry() + >>> @registry.register( + ... name="standardize_generatedby", + ... version="1.10.0", + ... description="Convert pre-standard provenance to GeneratedBy field" + ... ) + ... def migrate_generatedby(dataset_path): + ... # migration code here + ... return {"success": True, "modified_files": [], "message": "Done"} + """ + + def decorator(func: Callable) -> Migration: + migration = Migration(name, version, description, func) + self._migrations[name] = migration + lgr.debug(f"Registered migration: {name} (version {version})") + return migration + + return decorator + + def get(self, name: str) -> Optional[Migration]: + """Get a migration by name. + + Parameters + ---------- + name : str + Name of the migration + + Returns + ------- + Migration or None + The migration if found, None otherwise + """ + return self._migrations.get(name) + + def list_migrations(self) -> list[dict[str, str]]: + """List all registered migrations. 
+ + Returns + ------- + list of dict + List of migration metadata with name, version, and description + """ + return [ + { + "name": mig.name, + "version": mig.version, + "description": mig.description, + } + for mig in self._migrations.values() + ] + + def run( + self, + name: str, + dataset_path: Path, + dry_run: bool = False, + **kwargs, + ) -> dict[str, Any]: + """Run a specific migration. + + Parameters + ---------- + name : str + Name of the migration to run + dataset_path : Path + Path to the BIDS dataset root + dry_run : bool, optional + If True, don't actually modify files, default False + **kwargs + Additional keyword arguments for the migration + + Returns + ------- + dict + Result of the migration + + Raises + ------ + ValueError + If migration not found + """ + migration = self.get(name) + if migration is None: + raise ValueError(f"Migration '{name}' not found") + + lgr.info(f"Running migration: {migration.name} (version {migration.version})") + lgr.info(f"Description: {migration.description}") + + if dry_run: + lgr.info("DRY RUN: No files will be modified") + kwargs["dry_run"] = True + + result = migration(dataset_path, **kwargs) + + if result.get("success"): + lgr.info(f"Migration completed successfully: {result.get('message', '')}") + else: + lgr.warning(f"Migration failed or had issues: {result.get('message', '')}") + + return result + + +# Global registry instance +registry = MigrationRegistry() + + +def load_json(filepath: Path) -> Optional[dict]: + """Load JSON file safely. + + Parameters + ---------- + filepath : Path + Path to JSON file + + Returns + ------- + dict or None + Loaded JSON data, or None if error + """ + try: + with open(filepath) as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + lgr.error(f"Error loading {filepath}: {e}") + return None + + +def save_json(filepath: Path, data: dict, indent: int = 2) -> bool: + """Save JSON file safely. + + Parameters + ---------- + filepath : Path + Path to JSON file + data : dict + Data to save + indent : int, optional + JSON indentation, default 2 + + Returns + ------- + bool + True if successful, False otherwise + """ + try: + with open(filepath, "w") as f: + json.dump(data, f, indent=indent, ensure_ascii=False) + f.write("\n") # Add newline at end of file + return True + except Exception as e: + lgr.error(f"Error saving {filepath}: {e}") + return False diff --git a/tools/schemacode/src/bidsschematools/migrations.py b/tools/schemacode/src/bidsschematools/migrations.py new file mode 100644 index 0000000000..e32d695bae --- /dev/null +++ b/tools/schemacode/src/bidsschematools/migrations.py @@ -0,0 +1,384 @@ +"""Specific BIDS dataset migrations. + +This module contains individual migration functions for various BIDS version updates. +""" + +import json +import logging +from pathlib import Path +from typing import Any + +from .migrate import load_json, registry, save_json + +lgr = logging.getLogger(__name__) + + +@registry.register( + name="standardize_generatedby", + version="1.10.0", + description=( + "Convert pre-standard provenance metadata to standardized GeneratedBy field. " + "This migration helps adopt BEP028 conventions for dataset provenance tracking." + ), +) +def migrate_generatedby(dataset_path: Path, dry_run: bool = False) -> dict[str, Any]: + """Migrate pre-standard provenance to GeneratedBy field. + + This migration looks for common pre-standard provenance fields and converts + them to the standardized GeneratedBy array format introduced in BEP028. 
+ + Common pre-standard fields that may be migrated: + - Provenance + - Pipeline + - ProcessingPipeline + - Software + + Parameters + ---------- + dataset_path : Path + Path to the BIDS dataset root + dry_run : bool, optional + If True, don't actually modify files, default False + + Returns + ------- + dict + Migration result with success status, modified files, and message + """ + modified_files = [] + issues = [] + + # Look for dataset_description.json files (including in derivatives) + # Use rglob to handle arbitrary nesting depths + desc_files = list(dataset_path.rglob("dataset_description.json")) + + if not desc_files: + return { + "success": True, + "modified_files": [], + "message": "No dataset_description.json files found", + } + + for desc_file in desc_files: + data = load_json(desc_file) + if data is None: + issues.append(f"Could not load {desc_file}") + continue + + # Check if GeneratedBy already exists + if "GeneratedBy" in data: + lgr.debug(f"GeneratedBy already exists in {desc_file}, skipping") + continue + + # Look for pre-standard fields to migrate + generated_by = [] + migrated_fields = [] + + # Check for common pre-standard fields + pre_standard_fields = { + "Provenance": "provenance", + "Pipeline": "pipeline", + "ProcessingPipeline": "processing_pipeline", + "Software": "software", + "Tool": "tool", + } + + for old_field, field_type in pre_standard_fields.items(): + if old_field in data: + value = data[old_field] + migrated_fields.append(old_field) + + # Convert to GeneratedBy format + if isinstance(value, str): + # Simple string, create basic entry + entry = {"Name": value, "Description": f"Migrated from {old_field} field"} + generated_by.append(entry) + elif isinstance(value, dict): + # Already structured, try to map fields + entry = {} + if "Name" in value: + entry["Name"] = value["Name"] + elif "name" in value: + entry["Name"] = value["name"] + else: + entry["Name"] = old_field + + if "Version" in value: + entry["Version"] = value["Version"] + elif "version" in value: + entry["Version"] = value["version"] + + if "Description" in value: + entry["Description"] = value["Description"] + elif "description" in value: + entry["Description"] = value["description"] + else: + entry["Description"] = f"Migrated from {old_field} field" + + # Handle container info if present + if "Container" in value or "container" in value: + entry["Container"] = value.get("Container", value.get("container")) + + generated_by.append(entry) + elif isinstance(value, list): + # List of entries + for item in value: + if isinstance(item, str): + generated_by.append( + {"Name": item, "Description": f"Migrated from {old_field} field"} + ) + elif isinstance(item, dict): + entry = {} + entry["Name"] = item.get("Name", item.get("name", old_field)) + if "Version" in item or "version" in item: + entry["Version"] = item.get("Version", item.get("version")) + entry["Description"] = item.get( + "Description", + item.get("description", f"Migrated from {old_field} field"), + ) + if "Container" in item or "container" in item: + entry["Container"] = item.get("Container", item.get("container")) + generated_by.append(entry) + + if generated_by: + if not dry_run: + # Add GeneratedBy field + data["GeneratedBy"] = generated_by + + # Remove old fields + for field in migrated_fields: + del data[field] + + # Save updated file + if save_json(desc_file, data): + modified_files.append(str(desc_file.relative_to(dataset_path))) + lgr.info( + f"Migrated {len(generated_by)} provenance entries in {desc_file.name}: " + f"removed 
fields {migrated_fields}" + ) + else: + issues.append(f"Failed to save {desc_file}") + else: + lgr.info( + f"[DRY RUN] Would migrate {len(generated_by)} provenance entries in " + f"{desc_file}: remove fields {migrated_fields}" + ) + modified_files.append(f"{desc_file.relative_to(dataset_path)} (dry run)") + + if issues: + message = f"Completed with issues: {'; '.join(issues)}" + success = False + elif modified_files: + message = f"Migrated provenance in {len(modified_files)} file(s)" + success = True + else: + message = "No pre-standard provenance fields found to migrate" + success = True + + return { + "success": success, + "modified_files": modified_files, + "message": message, + } + + +@registry.register( + name="fix_inheritance_overloading", + version="1.10.1", + description=( + "Warn about deprecated inheritance overloading patterns. " + "Per PR #1834, overloading metadata values in the inheritance principle " + "will be deprecated in BIDS 2.0. This migration scans for potential issues." + ), +) +def check_inheritance_overloading(dataset_path: Path, dry_run: bool = False) -> dict[str, Any]: + """Check for inheritance overloading patterns that will be deprecated. + + This migration scans for cases where the inheritance principle is being + used to overload values (e.g., using different values in different scopes + for the same metadata field). This pattern is deprecated per PR #1834. + + Parameters + ---------- + dataset_path : Path + Path to the BIDS dataset root + dry_run : bool, optional + Not used for this check-only migration + + Returns + ------- + dict + Migration result with warnings about overloading patterns + """ + warnings = [] + metadata_by_scope = {} + + # Find all JSON sidecar files + json_files = list(dataset_path.rglob("*.json")) + json_files = [f for f in json_files if f.name != "dataset_description.json"] + + if not json_files: + return { + "success": True, + "modified_files": [], + "message": "No JSON sidecar files found to check", + } + + # Analyze metadata fields across different scopes + for json_file in json_files: + data = load_json(json_file) + if data is None: + continue + + # Determine scope level (dataset, subject, session, etc.) + parts = json_file.relative_to(dataset_path).parts + if len(parts) > 1 and parts[0].startswith("sub-"): + if len(parts) > 2 and parts[1].startswith("ses-"): + scope = f"subject-session ({parts[0]}/{parts[1]})" + else: + scope = f"subject ({parts[0]})" + else: + scope = "dataset" + + # Track metadata by field name + for field, value in data.items(): + if field not in metadata_by_scope: + metadata_by_scope[field] = {} + + if scope not in metadata_by_scope[field]: + metadata_by_scope[field][scope] = [] + + metadata_by_scope[field][scope].append( + { + "file": str(json_file.relative_to(dataset_path)), + "value": value, + } + ) + + # Check for overloading (same field with different values in different scopes) + for field, scopes in metadata_by_scope.items(): + if len(scopes) > 1: + # Get unique values across scopes using a set for O(1) lookup + all_values = set() + for scope_data in scopes.values(): + for item in scope_data: + value_str = json.dumps(item["value"], sort_keys=True) + all_values.add(value_str) + + # If multiple different values, this is potential overloading + if len(all_values) > 1: + scope_summary = [] + for scope, items in scopes.items(): + scope_summary.append(f"{scope}: {len(items)} file(s)") + + warnings.append( + f"Field '{field}' has different values across scopes " + f"({', '.join(scope_summary)}). 
This inheritance overloading pattern " + "is deprecated and will not be supported in BIDS 2.0. " + "Consider using separate metadata fields or entity labels instead." + ) + + if warnings: + message = ( + f"Found {len(warnings)} potential inheritance overloading pattern(s). " + "See warnings for details." + ) + lgr.warning(message) + for warning in warnings: + lgr.warning(warning) + else: + message = "No inheritance overloading patterns detected" + + return { + "success": True, + "modified_files": [], + "message": message, + "warnings": warnings, + } + + +@registry.register( + name="fix_tsv_entity_prefix", + version="1.10.1", + description=( + "Check for missing entity prefixes in TSV column headers. " + "Per PR #2281, certain TSV files should use entity- prefix for their columns." + ), +) +def check_tsv_entity_prefix(dataset_path: Path, dry_run: bool = False) -> dict[str, Any]: + """Check for and optionally fix missing entity prefixes in TSV files. + + Some TSV files are expected to have entity prefixes (e.g., 'sample-' prefix + in samples.tsv for columns that identify samples). This migration helps + identify inconsistencies. + + Parameters + ---------- + dataset_path : Path + Path to the BIDS dataset root + dry_run : bool, optional + If True, don't actually modify files, default False + + Returns + ------- + dict + Migration result with findings about entity prefix consistency + """ + issues = [] + suggestions = [] + + # Files that should use entity prefixes + # Format: {filename_pattern: expected_entity_prefix} + expected_prefixes = { + "samples.tsv": "sample", + "participants.tsv": "participant", + } + + for pattern, entity in expected_prefixes.items(): + tsv_files = list(dataset_path.glob(pattern)) + tsv_files.extend(dataset_path.glob(f"*/{pattern}")) + + for tsv_file in tsv_files: + try: + with open(tsv_file) as f: + header = f.readline().strip() + if not header: + continue + + columns = header.split("\t") + + # Check if first column lacks proper prefix + if columns and not columns[0].startswith(f"{entity}_"): + # Special case: participant_id is standard, don't flag it + if tsv_file.name == "participants.tsv" and columns[0] == "participant_id": + continue + + suggestions.append( + f"{tsv_file.relative_to(dataset_path)}: " + f"Column '{columns[0]}' should likely be '{entity}_{columns[0]}'" + ) + except Exception as e: + issues.append(f"Error reading {tsv_file}: {e}") + + if suggestions: + message = ( + f"Found {len(suggestions)} TSV files with potential entity prefix issues. " + "Manual review recommended." 
+ ) + lgr.info(message) + for suggestion in suggestions: + lgr.info(f" - {suggestion}") + else: + message = "No entity prefix issues detected in TSV files" + + if issues: + for issue in issues: + lgr.warning(issue) + + return { + "success": True, + "modified_files": [], + "message": message, + "suggestions": suggestions, + "issues": issues, + } diff --git a/tools/schemacode/src/bidsschematools/tests/test_migrate.py b/tools/schemacode/src/bidsschematools/tests/test_migrate.py new file mode 100644 index 0000000000..5ba7208727 --- /dev/null +++ b/tools/schemacode/src/bidsschematools/tests/test_migrate.py @@ -0,0 +1,201 @@ +"""Tests for BIDS dataset migration functionality.""" + +import json + +import pytest + +from bidsschematools.migrate import Migration, MigrationRegistry, load_json, save_json + + +class TestMigration: + """Tests for Migration class.""" + + def test_migration_creation(self): + """Test creating a migration.""" + + def dummy_func(dataset_path): + return {"success": True, "modified_files": [], "message": "Done"} + + mig = Migration( + name="test_migration", + version="1.0.0", + description="Test migration", + func=dummy_func, + ) + + assert mig.name == "test_migration" + assert mig.version == "1.0.0" + assert mig.description == "Test migration" + assert callable(mig.func) + + def test_migration_call(self, tmp_path): + """Test calling a migration.""" + + def dummy_func(dataset_path, **kwargs): + return { + "success": True, + "modified_files": ["test.json"], + "message": f"Processed {dataset_path}", + } + + mig = Migration( + name="test_migration", + version="1.0.0", + description="Test migration", + func=dummy_func, + ) + + result = mig(tmp_path) + assert result["success"] is True + assert result["modified_files"] == ["test.json"] + assert str(tmp_path) in result["message"] + + +class TestMigrationRegistry: + """Tests for MigrationRegistry class.""" + + def test_registry_creation(self): + """Test creating a registry.""" + registry = MigrationRegistry() + assert registry.list_migrations() == [] + + def test_register_migration(self): + """Test registering a migration.""" + registry = MigrationRegistry() + + @registry.register( + name="test_migration", + version="1.0.0", + description="Test migration", + ) + def test_func(dataset_path): + return {"success": True, "modified_files": [], "message": "Done"} + + migrations = registry.list_migrations() + assert len(migrations) == 1 + assert migrations[0]["name"] == "test_migration" + assert migrations[0]["version"] == "1.0.0" + + def test_get_migration(self): + """Test getting a migration by name.""" + registry = MigrationRegistry() + + @registry.register( + name="test_migration", + version="1.0.0", + description="Test migration", + ) + def test_func(dataset_path): + return {"success": True, "modified_files": [], "message": "Done"} + + mig = registry.get("test_migration") + assert mig is not None + assert mig.name == "test_migration" + + mig = registry.get("nonexistent") + assert mig is None + + def test_run_migration(self, tmp_path): + """Test running a migration.""" + registry = MigrationRegistry() + + @registry.register( + name="test_migration", + version="1.0.0", + description="Test migration", + ) + def test_func(dataset_path, **kwargs): + return { + "success": True, + "modified_files": [], + "message": "Migration completed", + } + + result = registry.run("test_migration", tmp_path) + assert result["success"] is True + assert result["message"] == "Migration completed" + + def test_run_nonexistent_migration(self, tmp_path): + 
"""Test running a nonexistent migration raises error.""" + registry = MigrationRegistry() + + with pytest.raises(ValueError, match="Migration 'nonexistent' not found"): + registry.run("nonexistent", tmp_path) + + def test_dry_run(self, tmp_path): + """Test dry run mode.""" + registry = MigrationRegistry() + + @registry.register( + name="test_migration", + version="1.0.0", + description="Test migration", + ) + def test_func(dataset_path, dry_run=False): + return { + "success": True, + "modified_files": [] if dry_run else ["test.json"], + "message": "Dry run" if dry_run else "Modified files", + } + + result = registry.run("test_migration", tmp_path, dry_run=True) + assert result["success"] is True + assert result["modified_files"] == [] + assert "Dry run" in result["message"] + + +class TestJsonHelpers: + """Tests for JSON helper functions.""" + + def test_load_json(self, tmp_path): + """Test loading JSON file.""" + test_file = tmp_path / "test.json" + test_data = {"key": "value", "number": 42} + + with open(test_file, "w") as f: + json.dump(test_data, f) + + loaded = load_json(test_file) + assert loaded == test_data + + def test_load_json_nonexistent(self, tmp_path): + """Test loading nonexistent JSON file returns None.""" + test_file = tmp_path / "nonexistent.json" + loaded = load_json(test_file) + assert loaded is None + + def test_load_json_invalid(self, tmp_path): + """Test loading invalid JSON returns None.""" + test_file = tmp_path / "invalid.json" + + with open(test_file, "w") as f: + f.write("not valid json{") + + loaded = load_json(test_file) + assert loaded is None + + def test_save_json(self, tmp_path): + """Test saving JSON file.""" + test_file = tmp_path / "test.json" + test_data = {"key": "value", "number": 42} + + result = save_json(test_file, test_data) + assert result is True + + # Verify file contents + with open(test_file) as f: + loaded = json.load(f) + assert loaded == test_data + + def test_save_json_with_indentation(self, tmp_path): + """Test saving JSON file with custom indentation.""" + test_file = tmp_path / "test.json" + test_data = {"key": "value"} + + save_json(test_file, test_data, indent=4) + + # Check that file is properly indented + with open(test_file) as f: + content = f.read() + assert " " in content # Should have 4-space indent + assert content.endswith("\n") # Should have trailing newline diff --git a/tools/schemacode/src/bidsschematools/tests/test_migrate_cli.py b/tools/schemacode/src/bidsschematools/tests/test_migrate_cli.py new file mode 100644 index 0000000000..b4f236cd91 --- /dev/null +++ b/tools/schemacode/src/bidsschematools/tests/test_migrate_cli.py @@ -0,0 +1,115 @@ +"""Integration tests for the migrate CLI commands.""" + +import json +import subprocess + + +class TestMigrateCLI: + """Test the bst migrate CLI commands.""" + + def test_migrate_list_command(self): + """Test that 'bst migrate list' works.""" + result = subprocess.run( + ["bst", "migrate", "list"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "standardize_generatedby" in result.stdout + assert "fix_inheritance_overloading" in result.stdout + assert "fix_tsv_entity_prefix" in result.stdout + assert "version" in result.stdout + + def test_migrate_run_command(self, tmp_path): + """Test that 'bst migrate run' works on a real dataset.""" + # Create a test dataset + dataset_path = tmp_path / "test_dataset" + dataset_path.mkdir() + + desc_file = dataset_path / "dataset_description.json" + desc_data = {"Name": "Test Dataset", "BIDSVersion": "1.9.0", 
"Pipeline": "my_pipeline"} + + with open(desc_file, "w") as f: + json.dump(desc_data, f) + + # Run the migration + result = subprocess.run( + ["bst", "migrate", "run", "standardize_generatedby", str(dataset_path)], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "Modified files" in result.stdout + assert "dataset_description.json" in result.stdout + + # Verify the file was actually migrated + with open(desc_file) as f: + migrated_data = json.load(f) + + assert "GeneratedBy" in migrated_data + assert "Pipeline" not in migrated_data + + def test_migrate_run_dry_run(self, tmp_path): + """Test that 'bst migrate run --dry-run' doesn't modify files.""" + # Create a test dataset + dataset_path = tmp_path / "test_dataset" + dataset_path.mkdir() + + desc_file = dataset_path / "dataset_description.json" + desc_data = {"Name": "Test Dataset", "BIDSVersion": "1.9.0", "Pipeline": "my_pipeline"} + + with open(desc_file, "w") as f: + json.dump(desc_data, f) + + # Run the migration in dry-run mode + result = subprocess.run( + ["bst", "migrate", "run", "standardize_generatedby", str(dataset_path), "--dry-run"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "dry run" in result.stdout.lower() + + # Verify the file was NOT modified + with open(desc_file) as f: + data = json.load(f) + + assert "Pipeline" in data + assert "GeneratedBy" not in data + + def test_migrate_run_invalid_dataset(self, tmp_path): + """Test that running migration on invalid dataset fails gracefully.""" + # Create empty directory (no dataset_description.json) + dataset_path = tmp_path / "invalid_dataset" + dataset_path.mkdir() + + result = subprocess.run( + ["bst", "migrate", "run", "standardize_generatedby", str(dataset_path)], + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "dataset_description.json" in result.stderr.lower() + + def test_migrate_run_nonexistent_migration(self, tmp_path): + """Test that running nonexistent migration fails gracefully.""" + # Create a valid dataset + dataset_path = tmp_path / "test_dataset" + dataset_path.mkdir() + + desc_file = dataset_path / "dataset_description.json" + with open(desc_file, "w") as f: + json.dump({"Name": "Test", "BIDSVersion": "1.0.0"}, f) + + result = subprocess.run( + ["bst", "migrate", "run", "nonexistent_migration", str(dataset_path)], + capture_output=True, + text=True, + ) + + assert result.returncode == 1 + assert "not found" in result.stderr.lower() diff --git a/tools/schemacode/src/bidsschematools/tests/test_migrations.py b/tools/schemacode/src/bidsschematools/tests/test_migrations.py new file mode 100644 index 0000000000..456b20802f --- /dev/null +++ b/tools/schemacode/src/bidsschematools/tests/test_migrations.py @@ -0,0 +1,249 @@ +"""Tests for specific BIDS dataset migrations.""" + +import json + +import pytest + +from bidsschematools.migrations import ( + check_inheritance_overloading, + check_tsv_entity_prefix, + migrate_generatedby, +) + + +class TestMigrateGeneratedBy: + """Tests for GeneratedBy migration.""" + + @pytest.fixture + def dataset_with_old_provenance(self, tmp_path): + """Create a dataset with pre-standard provenance fields.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + # Create dataset_description.json with old-style provenance + desc_data = { + "Name": "Test Dataset", + "BIDSVersion": "1.9.0", + "Pipeline": { + "Name": "fmriprep", + "Version": "1.4.1", + }, + } + + desc_file = dataset_path / "dataset_description.json" + with 
open(desc_file, "w") as f: + json.dump(desc_data, f, indent=2) + + return dataset_path + + @pytest.fixture + def dataset_with_new_provenance(self, tmp_path): + """Create a dataset with standard GeneratedBy field.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + desc_data = { + "Name": "Test Dataset", + "BIDSVersion": "1.10.0", + "GeneratedBy": [ + { + "Name": "fmriprep", + "Version": "1.4.1", + } + ], + } + + desc_file = dataset_path / "dataset_description.json" + with open(desc_file, "w") as f: + json.dump(desc_data, f, indent=2) + + return dataset_path + + def test_migrate_old_pipeline_field(self, dataset_with_old_provenance): + """Test migrating old Pipeline field to GeneratedBy.""" + result = migrate_generatedby(dataset_with_old_provenance) + + assert result["success"] is True + assert len(result["modified_files"]) == 1 + assert "dataset_description.json" in result["modified_files"][0] + + # Check that file was actually modified + desc_file = dataset_with_old_provenance / "dataset_description.json" + with open(desc_file) as f: + data = json.load(f) + + assert "GeneratedBy" in data + assert "Pipeline" not in data + assert data["GeneratedBy"][0]["Name"] == "fmriprep" + assert data["GeneratedBy"][0]["Version"] == "1.4.1" + + def test_skip_if_generatedby_exists(self, dataset_with_new_provenance): + """Test that migration skips if GeneratedBy already exists.""" + result = migrate_generatedby(dataset_with_new_provenance) + + assert result["success"] is True + assert len(result["modified_files"]) == 0 + assert "already exists" in result["message"] or "No pre-standard" in result["message"] + + def test_dry_run(self, dataset_with_old_provenance): + """Test dry run mode doesn't modify files.""" + result = migrate_generatedby(dataset_with_old_provenance, dry_run=True) + + assert result["success"] is True + assert len(result["modified_files"]) > 0 + assert "dry run" in result["modified_files"][0].lower() + + # Check that file was NOT modified + desc_file = dataset_with_old_provenance / "dataset_description.json" + with open(desc_file) as f: + data = json.load(f) + + assert "Pipeline" in data + assert "GeneratedBy" not in data + + def test_migrate_multiple_fields(self, tmp_path): + """Test migrating multiple pre-standard fields.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + desc_data = { + "Name": "Test Dataset", + "BIDSVersion": "1.9.0", + "Pipeline": "fmriprep", + "Software": "SPM12", + } + + desc_file = dataset_path / "dataset_description.json" + with open(desc_file, "w") as f: + json.dump(desc_data, f, indent=2) + + result = migrate_generatedby(dataset_path) + + assert result["success"] is True + + with open(desc_file) as f: + data = json.load(f) + + assert "GeneratedBy" in data + assert len(data["GeneratedBy"]) == 2 + assert "Pipeline" not in data + assert "Software" not in data + + def test_no_dataset_description(self, tmp_path): + """Test with no dataset_description.json file.""" + result = migrate_generatedby(tmp_path) + + assert result["success"] is True + assert len(result["modified_files"]) == 0 + assert "No dataset_description.json" in result["message"] + + +class TestCheckInheritanceOverloading: + """Tests for inheritance overloading check.""" + + @pytest.fixture + def dataset_with_overloading(self, tmp_path): + """Create a dataset with inheritance overloading.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + # Create dataset-level metadata + (dataset_path / "task-rest_bold.json").write_text(json.dumps({"RepetitionTime": 
2.0})) + + # Create subject-level metadata with different value + sub_dir = dataset_path / "sub-01" + sub_dir.mkdir() + (sub_dir / "task-rest_bold.json").write_text(json.dumps({"RepetitionTime": 1.5})) + + return dataset_path + + @pytest.fixture + def dataset_without_overloading(self, tmp_path): + """Create a dataset without inheritance overloading.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + # Create dataset-level metadata + (dataset_path / "task-rest_bold.json").write_text(json.dumps({"TaskName": "rest"})) + + # Create subject-level metadata with same value + sub_dir = dataset_path / "sub-01" + sub_dir.mkdir() + (sub_dir / "task-rest_bold.json").write_text(json.dumps({"TaskName": "rest"})) + + return dataset_path + + def test_detect_overloading(self, dataset_with_overloading): + """Test detection of inheritance overloading.""" + result = check_inheritance_overloading(dataset_with_overloading) + + assert result["success"] is True + assert len(result["warnings"]) > 0 + assert any("RepetitionTime" in w for w in result["warnings"]) + assert any("deprecated" in w for w in result["warnings"]) + + def test_no_overloading(self, dataset_without_overloading): + """Test dataset without overloading issues.""" + result = check_inheritance_overloading(dataset_without_overloading) + + assert result["success"] is True + assert len(result.get("warnings", [])) == 0 + assert "No inheritance overloading" in result["message"] + + def test_empty_dataset(self, tmp_path): + """Test with empty dataset.""" + result = check_inheritance_overloading(tmp_path) + + assert result["success"] is True + assert "No JSON sidecar files" in result["message"] + + +class TestCheckTsvEntityPrefix: + """Tests for TSV entity prefix check.""" + + @pytest.fixture + def dataset_with_participants(self, tmp_path): + """Create a dataset with participants.tsv.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + # Create participants.tsv with correct prefix + participants_file = dataset_path / "participants.tsv" + participants_file.write_text("participant_id\tage\tsex\n01\t25\tF\n") + + return dataset_path + + @pytest.fixture + def dataset_with_samples(self, tmp_path): + """Create a dataset with samples.tsv without proper prefix.""" + dataset_path = tmp_path / "dataset" + dataset_path.mkdir() + + # Create samples.tsv without proper prefix + samples_file = dataset_path / "samples.tsv" + samples_file.write_text("id\ttype\ttissue\n01\tblood\tWB\n") + + return dataset_path + + def test_correct_prefix(self, dataset_with_participants): + """Test dataset with correct entity prefix.""" + result = check_tsv_entity_prefix(dataset_with_participants) + + assert result["success"] is True + assert len(result.get("suggestions", [])) == 0 + assert "No entity prefix issues" in result["message"] + + def test_missing_prefix(self, dataset_with_samples): + """Test dataset with missing entity prefix.""" + result = check_tsv_entity_prefix(dataset_with_samples) + + assert result["success"] is True + assert len(result.get("suggestions", [])) > 0 + assert any("sample_" in s for s in result["suggestions"]) + + def test_empty_dataset(self, tmp_path): + """Test with empty dataset.""" + result = check_tsv_entity_prefix(tmp_path) + + assert result["success"] is True + assert len(result.get("suggestions", [])) == 0