Skip to content

feat: Add bidsignore implementation #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/build-test-deploy.yml
Original file line number Diff line number Diff line change
@@ -79,7 +79,6 @@ jobs:

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
if: matrix.os != 'ubuntu-latest'
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
129 changes: 129 additions & 0 deletions src/bids_validator/bidsignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""Utilities for working with .bidsignore files."""

import os
import re
from functools import lru_cache
from typing import Protocol, Union

import attrs

from .types.files import FileTree


@lru_cache
def compile_pat(pattern: str) -> Union[re.Pattern, None]:
    """Compile a .gitignore-style ignore line to a regular expression.

    Parameters
    ----------
    pattern
        One line of an ignore file, without its trailing newline.

    Returns
    -------
    A compiled pattern intended for use with ``.match()`` on a relative path,
    or ``None`` when the line is a comment or is blank and so matches nothing.

    Raises
    ------
    ValueError
        If the line is a negated pattern (leading ``!``), which is not supported.

    Notes
    -----
    Only ``*``, ``?``, ``**`` and ``.`` are translated. Every other character,
    including bracketed character ranges and backslash escapes, is passed to
    the regular expression engine unchanged, so such patterns behave correctly
    only when they happen to be valid, equivalent regex syntax.
    """
    # A line starting with # serves as a comment.
    if pattern.startswith('#'):
        return None

    # An optional prefix "!" negates the pattern; negation is not implemented,
    # so fail before doing any translation work.
    if pattern.startswith('!'):
        raise ValueError(f'Inverted patterns not supported: {pattern}')

    # A backslash in front of a leading "#" or "!" makes that character literal.
    if pattern.startswith((r'\#', r'\!')):
        pattern = pattern[1:]  # Unescape

    # Trailing spaces are ignored unless they are quoted with backslash ("\").
    pattern = re.sub(r'(?<!\\) +$', '', pattern)

    # A blank line matches no files, so it can serve as a separator for readability.
    if pattern == '':
        return None

    # If there is a separator at the beginning or middle (or both) of the pattern,
    # then the pattern is relative to the root; otherwise it may match at any depth.
    relative_match = pattern == '/' or '/' in pattern[:-1]
    # If there is a separator at the end of the pattern then the pattern will only
    # match directories, otherwise it can match both files and directories.
    directory_match = pattern.endswith('/')

    segments = pattern.strip('/').split('/')
    last = len(segments) - 1
    pieces = []
    for index, segment in enumerate(segments):
        if segment == '**':
            # A trailing "**" matches everything below the preceding directory.
            # Anywhere else, "**/" stands for zero or more *whole* directories,
            # so the wildcard and its separator must be optional together;
            # otherwise "**/foo" would also match "barfoo".
            pieces.append('.*' if index == last else '(?:.*/)?')
            continue
        # "*" and "?" never cross a directory separator; "." is literal.
        glob = segment.replace('*', '[^/]*').replace('?', '[^/]').replace('.', r'\.')
        pieces.append(glob if index == last else f'{glob}/')

    prefix = '^' if relative_match else '^(.*/|)'
    postfix = r'/\Z' if directory_match else r'/?\Z'
    body = ''.join(pieces)

    return re.compile(f'{prefix}{body}{postfix}')


class HasMatch(Protocol):
    """Structural type for any object offering ``match(relpath) -> bool``."""

    def match(self, relpath: str) -> bool:
        """Report whether ``relpath`` is matched by this object."""
        ...


@attrs.define
class Ignore:
    """Collection of .gitignore-style patterns.

    Tracks successfully matched files for reporting.
    """

    # Raw ignore lines, kept exactly as given (comments and blank lines included).
    patterns: list[str] = attrs.field(factory=list)
    # Relative paths that matched at least one pattern, in the order they were tested.
    history: list[str] = attrs.field(factory=list, init=False)

    @classmethod
    def from_file(cls, pathlike: os.PathLike):
        """Load Ignore contents from file.

        Each line becomes one pattern with its trailing newline removed;
        no other filtering is applied at load time.
        """
        with open(pathlike) as fobj:
            return cls([line.rstrip('\n') for line in fobj])

    def match(self, relpath: str) -> bool:
        """Match a relative path against a collection of ignore patterns.

        Returns ``True`` and records ``relpath`` in ``history`` as soon as one
        pattern matches; returns ``False`` when none do. Errors raised by
        ``compile_pat`` for unsupported patterns are not caught here.
        """
        for pattern in self.patterns:
            regex = compile_pat(pattern)
            # Comment and blank lines compile to None and can never match.
            if regex is not None and regex.match(relpath):
                self.history.append(relpath)
                return True
        return False


@attrs.define
class IgnoreMany:
    """Match against several ignore filters."""

    # Filters consulted in list order.
    ignores: list[Ignore] = attrs.field()

    def match(self, relpath: str) -> bool:
        """Return true if any filters match the given file.

        Filters are tried in order and the search stops at the first hit,
        so ordering is significant for side-effects such as recording
        which filter ignored a particular file.
        """
        for candidate in self.ignores:
            if candidate.match(relpath):
                return True
        return False


def filter_file_tree(filetree: FileTree) -> FileTree:
    """Read .bidsignore and filter file tree.

    When the root of ``filetree`` has no usable ``.bidsignore`` child, the
    tree is returned untouched. Otherwise the patterns from that file are
    applied, followed by a pattern that removes ``.bidsignore`` itself.
    """
    bidsignore = filetree.children.get('.bidsignore')
    if bidsignore:
        from_dataset = Ignore.from_file(bidsignore)
        hide_ignore_file = Ignore(['/.bidsignore'])
        combined = IgnoreMany([from_dataset, hide_ignore_file])
        return _filter(filetree, combined)
    return filetree


def _filter(filetree: FileTree, ignore: HasMatch) -> FileTree:
    """Recursively drop children whose relative path is matched by ``ignore``.

    The same ``filetree`` object is returned when nothing below it was
    removed or replaced; otherwise a copy with the surviving children is
    produced via ``attrs.evolve``.
    """
    kept = {}
    changed = False
    for name, child in filetree.children.items():
        if ignore.match(child.relative_path):
            # Dropping a child means this node must be rebuilt.
            changed = True
            continue
        filtered = _filter(child, ignore)
        kept[name] = filtered
        if filtered is not child:
            # A descendant changed, so the replacement has to propagate upward.
            changed = True

    if changed:
        return attrs.evolve(filetree, children=kept)
    return filetree
89 changes: 89 additions & 0 deletions tests/test_bidsignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Test bids_validator.bidsignore."""

import pytest

from bids_validator.bidsignore import Ignore, compile_pat, filter_file_tree
from bids_validator.types.files import FileTree


@pytest.mark.parametrize(
    ('pattern', 'hits', 'misses'),
    [
        ('/', ['/'], ['dir/', 'file']),
        # Match file or directory named foo
        ('foo', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['bar', 'foobar', 'barfoo', 'barfoo/']),
        # Directories named foo only
        ('foo/', ['foo/', 'bar/foo/'], ['foo', 'bar/foo', 'bar', 'foobar', 'barfoo', 'barfoo/']),
        # Files or directories at the root
        ('/foo', ['foo', 'foo/'], ['bar/foo', 'bar/foo/', 'bar', 'foobar', 'barfoo', 'barfoo/']),
        # doc/frotz/ examples from GITIGNORE(5)
        ('doc/frotz/', ['doc/frotz/'], ['a/doc/frotz/']),
        ('frotz/', ['frotz/', 'doc/frotz/', 'a/doc/frotz/'], []),
        # * matches everything because everything has a basename
        ('*', ['foo', 'foo/', 'foo/bar', 'foo/bar/'], []),
        # *o matches things with basename ending in o
        ('*o', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['foo/bar', 'foo/bar/']),
        # Leading **/ matches in all directories
        ('**/foo', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['foo/bar', 'foo/bar/', 'baz/foobar']),
        ('**/foo/bar', ['foo/bar', 'foo/bar/', 'a/foo/bar'], ['foo/', 'bar/foo', 'bar']),
        # Trailing /** matches everything inside a root-relative directory
        ('foo/**', ['foo/', 'foo/x', 'foo/x/y/z'], ['foo', 'bar/foo/x/y/z']),
        # /**/ matches zero or more directories
        ('a/**/b', ['a/b', 'a/x/b', 'a/x/y/b'], ['x/a/b', 'x/a/y/b']),
        # ** surrounded by something other than slashes acts like a regular *
        ('a/x**/b', ['a/x/b', 'a/xy/b'], ['x/a/b', 'x/a/y/b', 'a/x/y/b']),
        # Escaped special prefixes
        (r'\#*', ['#', '#foo'], ['foo', 'bar#']),
        (r'\!*', ['!', '!foo'], ['foo', 'bar!']),
    ],
)
def test_patterns(pattern, hits, misses):
    """Check that each pattern accepts its hits and rejects its misses."""
    compiled = compile_pat(pattern)
    # Pair every sample path with the outcome it should produce, hits first.
    expectations = [(path, True) for path in hits] + [(path, False) for path in misses]
    for path, should_match in expectations:
        if should_match:
            assert compiled.match(path)
        else:
            assert not compiled.match(path)


def test_skipped_patterns():
    """Check lines that yield no pattern, and the rejection of negated ones."""
    # Blank, comment and whitespace-only lines compile to nothing.
    for line in ('', '# commented line', ' '):
        assert compile_pat(line) is None
    # Negation is reported as an error.
    with pytest.raises(ValueError, match='Inverted patterns not supported'):
        compile_pat('!inverted pattern')


def test_Ignore_ds000117(examples):
    """Load the ds000117 .bidsignore and match paths against it.

    ``examples`` is presumably a fixture providing the example-datasets
    directory; it is only used here as the left operand of ``/``.
    """
    flash_relpath = 'sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz'

    tree = FileTree.read_from_filesystem(examples / 'ds000117')
    ignore = Ignore.from_file(tree.children['.bidsignore'])

    assert 'run-*_echo-*_FLASH.json' in ignore.patterns
    assert flash_relpath in tree
    assert ignore.match(flash_relpath)
    assert not ignore.match('acq-mprage_T1w.json')

    # Walk down to the same file through the tree and match its own relative path.
    node = tree
    for component in flash_relpath.split('/'):
        node = node.children[component]
    assert ignore.match(node.relative_path)


def test_filter_file_tree(examples):
    """Check .bidsignore filtering on a dataset with one and a dataset without."""
    flash_relpath = 'sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz'

    # Dataset with a .bidsignore: both the ignore file and an ignored file go away.
    with_ignore = FileTree.read_from_filesystem(examples / 'ds000117')
    assert '.bidsignore' in with_ignore
    assert flash_relpath in with_ignore

    pruned = filter_file_tree(with_ignore)
    assert '.bidsignore' not in pruned
    assert flash_relpath not in pruned

    # Dataset without a .bidsignore: the very same tree object comes back.
    without_ignore = FileTree.read_from_filesystem(examples / 'ds000247')
    assert '.bidsignore' not in without_ignore

    untouched = filter_file_tree(without_ignore)
    assert untouched is without_ignore