@dataclass(frozen=True)
class GitWorktreeItem:
    """Immutable record describing one item of a Git worktree

    An item is identified by ``relpath``, its location relative to the
    worktree (directory) given by ``tree_path``. Following Git's own
    reporting, ``relpath`` always uses POSIX conventions, regardless of the
    platform.

    ``gitsha`` and ``gittype`` are optional annotations. They are only set
    when the respective information is known, and are ``None`` otherwise.
    """

    tree_path: Path
    """``Path`` of the tree the item is part of"""
    relpath: PurePosixPath
    """Path of the item relative to the ``tree_path``"""
    gitsha: str | None = None
    """Git's SHA identifier for the item, if known"""
    gittype: GitTreeItemType | None = None
    """Item type identifier, if known"""

    @property
    def path(self) -> Path:
        """Platform path of the item

        This is ``relpath`` joined onto ``tree_path``. Consequently, the
        result is absolute or relative depending on what ``tree_path`` is.
        """
        return self.tree_path.joinpath(self.relpath)


# building blocks for the `git ls-files` parameterizations below
_LSFILES_TRACKED = ('--stage', '--cached')
_LSFILES_UNTRACKED = ('--exclude-standard', '--others')
_LSFILES_UNTRACKED_WHOLE_DIR = (*_LSFILES_UNTRACKED, '--directory')
_LSFILES_UNTRACKED_NO_EMPTY_DIR = (
    *_LSFILES_UNTRACKED_WHOLE_DIR,
    '--no-empty-directory',
)

# mapping of supported values of the `untracked` argument to `git ls-files`
# parameters. The modes starting with 'only' use the same parameters as their
# counterparts, minus the ones that request a report on tracked content.
lsfiles_untracked_args = {
    None: _LSFILES_TRACKED,
    'all': _LSFILES_TRACKED + _LSFILES_UNTRACKED,
    'whole-dir': _LSFILES_TRACKED + _LSFILES_UNTRACKED_WHOLE_DIR,
    'no-empty-dir': _LSFILES_TRACKED + _LSFILES_UNTRACKED_NO_EMPTY_DIR,
    'only': _LSFILES_UNTRACKED,
    'only-whole-dir': _LSFILES_UNTRACKED_WHOLE_DIR,
    'only-no-empty-dir': _LSFILES_UNTRACKED_NO_EMPTY_DIR,
}
+ + For any tracked content, yielded items include type information and gitsha + as last known to Git. This means that such reports reflect the last + committed or staged content, not the state of a potential unstaged + modification in the work tree. + + :class:`GitWorktreeItem` are yielded. Their ``gitsha`` and ``gittype`` + properties being ``None`` indicates untracked work tree content. + + .. note:: + The ``gitsha`` is not equivalent to a SHA1 hash of a file's content, + but is the SHA-type blob identifier as reported and used by Git. + + Parameters + ---------- + path: Path + Path of a directory in a Git repository to report on. This directory + need not be the root directory of the repository, but must be part of + the repository's work tree. + untracked: {'all', 'whole-dir', 'no-empty-dir', 'only', 'only-whole-dir', 'only-no-empty-dir'} or None, optional + If not ``None``, also reports on untracked work tree content. + ``all`` reports on any untracked file; ``whole-dir`` yields a single + report for a directory that is entirely untracked, and not individual + untracked files in it; ``no-empty-dir`` skips any reports on + untracked empty directories. The modes starting with 'only' offer the + same untracked content reporting styles, but only untracked and no + tracked content is reported. For example, 'only' is the corresponding + mode to 'all' with no tracked content being reported. + recursive: {'submodules', 'repository', 'no'}, optional + Behavior for recursion into subdirectories of ``path``. By default + (``repository``), all directories within the repository are reported. + This possibly includes untracked ones (see ``untracked``), but not + directories within submodules. With ``submodules``, the full worktree + is reported on with recursion into submodules. With ``no``, + only direct children of ``path`` are reported on. 
+ For any worktree items in subdirectories of ``path`` only a single + record for the containing immediate subdirectory of ``path`` is yielded. + For example, with 'path/subdir/file1' and 'path/subdir/file2' there + will only be a single item with ``relpath=PurePosixPath('subdir')`` and + ``gittype=GitTreeItemType.directory``. + pathspecs: optional + Patterns used to limit results to particular paths. Any pathspecs + supported by Git can be used and are passed to the underlying ``git + ls-files`` queries. Pathspecs are also supported for recursive reporting + on submodules. In such a case, the results match those of individual + queries with analog pathspecs on the respective submodules (Git itself + does not support pathspecs for submodule-recursive operations). For + example, a ``submodules`` recursion with a pathspec ``*.jpg`` will yield + reports on all JPG files in all submodules, even though a submodule path + itself does not match ``*.jpg``. On the other hand, a pathspec + ``submoddir/*.jpg`` will only report on JPG files in the submodule at + ``submoddir/``, but on all JPG files in that submodule. + As of version 1.5, the pathspec support for submodule recursion is + preliminary and results should be carefully investigated. + """ + # we force-convert to Path to prevent delayed crashing when reading from + # the file system. The docs already ask for that, but it is easy to + # forget/ignore and leads to non-obvious errors. 
Running this once is + # a cheap safety net + # https://github.com/datalad/datalad-next/issues/551 + path = Path(path) + _pathspecs = GitPathSpecs(pathspecs) + + subm_to_process: list[GitWorktreeItem] = [] + + # the helper takes care of talking to Git and doing recursion + for item in _iter_gitworktree( + path=path, + untracked=untracked, + # the helper cannot do submodule recursion, we do this outside, + # so limit here + recursive='repository' if recursive == 'submodules' else recursive, + pathspecs=_pathspecs, + ): + # exclude non-submodules, or a submodule that was found at + # the root path -- which would indicate that the submodule + # itself it not around, only its record in the parent + if ( + recursive == 'submodules' + and item.gittype == GitTreeItemType.submodule + and item.relpath != PurePosixPath('.') + ): + # mark for processing at the end + subm_to_process.append(item) + continue + + yield item + + processed_submodules: set[PurePosixPath] = set() + # we may need to loop over the (remaining) submodules for two reasons: + # - with pathspecs there is a chance that a given pathspec set did not + # match a submodule (directly) that could have content that matches a + # pathspec + # - when we are looking for untracked content only, the code above + # (by definition) will not have found the submodules (because they are + # unconditionally tracked) + for subm in chain( + # submodules already found to need reporting on above + subm_to_process, + # and the (potentially) remaining ones, deduplication happening + # via `processed_submodules` + iter_submodules( + path=path, + pathspecs=_pathspecs, + match_containing=True, + ) + if recursive == 'submodules' + and ((untracked and untracked.startswith('only')) or _pathspecs) + else [], + ): + if subm.relpath in processed_submodules: + # we dealt with that above already + continue + yield from _yield_from_submodule( + subm=subm, + untracked=untracked, + recursive=recursive, + pathspecs=_pathspecs, + ) + 
def _yield_from_submodule(
    subm: GitWorktreeItem,
    untracked: str | None,
    recursive: str,
    pathspecs: GitPathSpecs,
) -> Generator[GitWorktreeItem]:
    """Report on the content of a submodule, relative to its parent worktree

    ``subm`` is the item that represents the submodule in the parent
    worktree. The submodule is queried via ``iter_gitworktree()`` and each
    resulting item is re-issued with the ``tree_path`` of ``subm`` and with
    a ``relpath`` that is prefixed with the ``relpath`` of ``subm``.

    Nothing is yielded when the submodule location does not exist, or when
    ``pathspecs`` are given but none of them can be translated into the
    scope of the submodule.
    """
    submodule_root = subm.path
    if not submodule_root.exists():
        # a submodule that is not around cannot be listed
        return

    if pathspecs:
        # the pathspecs are expressed relative to the parent, express them
        # relative to the submodule instead
        try:
            scoped_pathspecs = pathspecs.for_subdir(subm.relpath)
        except ValueError:
            # no pathspec survived the translation, hence nothing inside
            # the submodule could possibly match
            return
    else:
        scoped_pathspecs = pathspecs

    yield from (
        # recode the location to be relative to the parent worktree
        replace(
            found,
            tree_path=subm.tree_path,
            relpath=subm.relpath / found.relpath,
        )
        for found in iter_gitworktree(
            path=submodule_root,
            untracked=untracked,
            recursive=recursive,
            pathspecs=scoped_pathspecs,
        )
    )


def _iter_gitworktree(
    path: Path,
    *,
    untracked: str | None,
    recursive: str,
    pathspecs: GitPathSpecs,
) -> Generator[GitWorktreeItem, None, None]:
    """Internal helper for iter_gitworktree() to support recursion

    Runs a single ``git ls-files`` query at ``path`` and converts its records
    into :class:`GitWorktreeItem` instances. Consecutive records for the
    same path are collapsed into one item, the last record wins. With
    ``recursive='no'`` any item located inside a subdirectory is replaced by
    a single ``directory``-type item for the respective top-level directory.
    Any other ``recursive`` value has no effect here.

    ``untracked`` must be a key of ``lsfiles_untracked_args``. Any other
    value surfaces as a ``KeyError`` once the generator is started.
    """
    # the lookup doubles as the test of whether the `untracked` mode is known
    ls_args = [*lsfiles_untracked_args[untracked]]
    if pathspecs:
        ls_args += pathspecs.arglist()

    # ls-files may issue several records for one path. An item is held back
    # until it is clear that the next record is not an addendum to it.
    held: GitWorktreeItem | None = None
    seen_dirs: set[PurePosixPath] = set()
    top_level_only = recursive == 'no'

    # the trailing `None` is an artificial end-of-input record. It makes the
    # last held item go through the same code path as all others.
    for record in chain(git_ls_files(path, *ls_args), [None]):
        if record is None and held is None:
            # end of input, and nothing is left to be reported
            return

        current = None if record is None else _lsfiles_line2item(path, record)

        if current is not None and not (held and current.relpath != held.relpath):
            # either the very first record, or an addendum on the held item.
            # keep waiting for a possible further report on the same path
            held = current
            continue

        if TYPE_CHECKING:
            # the case of both being None was ruled out above
            assert held is not None

        # the held item is complete. From here on `current` is the one
        # waiting for a possible addendum
        complete, held = held, current

        parts = complete.relpath.parts
        if not (top_level_only and len(parts) > 1):
            yield complete
            continue

        # the item is inside a subdirectory, but only direct children are
        # wanted. Report the containing top-level directory instead, and
        # only once.
        top_dir = PurePosixPath(parts[0])
        if top_dir in seen_dirs:
            continue
        yield GitWorktreeItem(
            tree_path=path,
            relpath=top_dir,
            gittype=GitTreeItemType.directory,
            gitsha=None,
        )
        seen_dirs.add(top_dir)
def _is_stage_prefix(mode: str, gitsha: str, stage: str) -> bool:
    """Tell whether three fields have the shape of ``<mode> <object> <stage>``"""
    return (
        len(mode) == 6  # noqa: PLR2004
        and all(c in '01234567' for c in mode)
        and bool(gitsha)
        and all(c in '0123456789abcdef' for c in gitsha)
        and stage in ('0', '1', '2', '3')
    )


def _lsfiles_line2item(path: Path, line: str) -> GitWorktreeItem:
    """Convert a single ``git ls-files -z`` record into a ``GitWorktreeItem``

    Two kinds of records are distinguished:

    - ``<mode> <object> <stage><TAB><path>``, as issued with ``--stage`` for
      tracked content. The resulting item carries ``gitsha`` and ``gittype``.
    - a bare ``<path>``, as issued for untracked content. The resulting item
      has ``gitsha=None`` and ``gittype=None``.

    With ``-z`` Git does not quote paths, so a bare path may itself contain
    TAB and space characters. A record is therefore only taken as a stage
    record when the text before the first TAB has the shape of a stage
    prefix: a six-digit octal mode, a hexadecimal object name, and a stage
    number 0-3. Otherwise the complete record is taken to be the path.

    Parameters
    ----------
    path: Path
      Becomes the ``tree_path`` of the returned item.
    line: str
      One record, without the terminating zero byte.

    Raises
    ------
    KeyError
      For a stage record whose mode is not in ``git_mode_type_map``.
    """
    prefix, tab, tracked_path = line.partition('\t')
    if tab:
        props = prefix.split(' ')
        if len(props) == 3 and _is_stage_prefix(*props):  # noqa: PLR2004
            mode, gitsha, _stage = props
            return GitWorktreeItem(
                tree_path=path,
                # Git always reports in POSIX conventions
                relpath=PurePosixPath(tracked_path),
                gitsha=gitsha,
                gittype=git_mode_type_map[mode],
            )

    # nothing but a path (untracked); any TAB in it is part of the name
    return GitWorktreeItem(
        tree_path=path,
        # not known to Git, but Git always reports POSIX
        relpath=PurePosixPath(line),
    )
def test_name_starting_with_tab(gitrepo):
    """A tracked file whose name starts with a TAB is reported correctly"""
    target = gitrepo / '\ttab.txt'
    try:
        target.write_text('name of this file starts with a tab')
    except OSError:
        pytest.skip('not applicable on crippled filesystems')

    call_git_addcommit(gitrepo, [target])

    reported = [entry.path for entry in iter_gitworktree(gitrepo)]
    assert target in reported


def test_iter_gitworktree_recursive(gitrepo):
    """Reporting on direct children only, via ``recursive='no'``"""
    # actually, this tests non-recursive, because within-repo
    # recursion is the default.
    # later, we might also test subdataset recursion here

    # some tracked content, at the top-level and in a subdirectory
    for relpath in ('tracked1', 'subdir/tracked2', 'subdir/tracked3'):
        tracked = gitrepo / relpath
        tracked.parent.mkdir(exist_ok=True)
        tracked.write_text(tracked.name)
    call_git_addcommit(gitrepo)

    # an "invisible" directory (no content)
    (gitrepo / 'emptydir').mkdir()
    # untracked file in subdir
    untracked_dir = gitrepo / 'subdir_u'
    untracked_dir.mkdir()
    (untracked_dir / 'untracked').write_text('untracked')

    # matches git report with untracked=all
    expected = {PurePosixPath(name) for name in ('subdir', 'subdir_u', 'tracked1')}

    # without any recursion, we see all top-level content, except for
    # the empty directory with no content
    toplevel = {entry.relpath for entry in iter_gitworktree(gitrepo, recursive='no')}
    assert toplevel == expected

    # now we test a query that gooey would want to make,
    # give me all content in a single directory, and also include any
    # untracked files and even untracked/empty directories
    toplevel_wholedir = {
        entry.relpath
        for entry in iter_gitworktree(gitrepo, recursive='no', untracked='whole-dir')
    }
    assert toplevel_wholedir == expected | {PurePosixPath('emptydir')}


def test_iter_gitworktree_empty(gitrepo):
    """A repository without any content yields no items"""
    assert not list(iter_gitworktree(gitrepo))
def test_iter_gitworktree_untracked_only(modified_dataset):
    """The ``only*`` modes report on untracked content exclusively"""
    root = modified_dataset

    def names(**kwargs):
        # final path component of every reported item
        return [entry.path.name for entry in iter_gitworktree(root, **kwargs)]

    # only untracked files
    untracked_items = list(iter_gitworktree(root, untracked='only'))
    assert all(entry.path.name == 'file_u' for entry in untracked_items)

    # same report, but compressed to immediate directory children
    toplevel_items = list(iter_gitworktree(root, untracked='only', recursive='no'))
    assert {entry.relpath.parts[0] for entry in untracked_items} == {
        entry.relpath.name for entry in toplevel_items
    }

    # no wholly untracked directories in standard report
    assert 'dir_u' not in names(untracked='only')

    # but this can be requested
    wholedir_names = names(untracked='only-whole-dir')
    assert 'dir_u' in wholedir_names

    # smoke test remaining mode, test case doesn't cause difference
    assert 'dirempty_u' in wholedir_names
    assert 'dirempty_u' not in names(untracked='only-no-empty-dir')


def test_iter_gitworktree_pathspec(modified_dataset):
    """Pathspecs limit the report, also across submodule boundaries"""
    root = modified_dataset
    # TODO: bring this back -- for now, this test is running on a plain git repo
    # with no such files
    # # query for any files that are set to go straight to Git. these are just
    # # dotfiles in the default config
    # items = list(iter_gitworktree(
    #     p,
    #     pathspecs=[':(attr:annex.largefiles=nothing)']))
    # assert items
    # assert all(str(i.name).startswith('.') for i in items)

    def n_matches(pathspec):
        return len(list(iter_gitworktree(root, pathspecs=[pathspec])))

    # glob-styles
    n_added_toplevel = 1
    n_added = 2

    # first some that only give a top-level match
    for pathspec in ('file_a', ':(glob)*file_a'):
        assert n_matches(pathspec) == n_added_toplevel

    # now some that match at any depth
    for pathspec in ('*file_a', ':(glob)**/file_a'):
        assert n_matches(pathspec) == n_added

    # some that do not match
    for pathspec in ('*NOOO', ':(glob)**/NOOO'):
        assert not list(iter_gitworktree(root, pathspecs=[pathspec]))

    # some that do not match after translation to a submodule
    in_submodule = iter_gitworktree(
        root,
        recursive='submodules',
        pathspecs=['dir_sm/sm_m/NOOO/*'],
    )
    assert not list(in_submodule)
def git_ls_files(path: Path, *args: str) -> Iterator[str]:
    """Run ``git ls-files`` at a given ``path`` and with ``args``

    ``-z`` is always passed, in addition to ``args``, because the output is
    split into items at zero bytes. A generator is returned that yields
    one ``str`` per item, without the separator.
    """
    command = [
        'ls-files',
        # the item splitting below depends on zero-byte separation
        '-z',
        # anything else is up to the caller
        *args,
    ]
    with iter_git_subproc(command, cwd=path) as proc_output:
        decoded = decode_bytes(proc_output, backslash_replace=True)
        yield from itemize(decoded, sep='\0', keep_ends=False)


# TODO: Could be `StrEnum`, came with PY3.11
class GitTreeItemType(Enum):
    """Item types that can be found in a Git tree

    The value of each member equals its name.
    """

    file = 'file'
    executablefile = 'executablefile'
    symlink = 'symlink'
    directory = 'directory'
    submodule = 'submodule'


# lookup of the item type that corresponds to a mode, with the mode given as
# a six-digit octal string
git_mode_type_map = {
    # regular file
    '100644': GitTreeItemType.file,
    # regular file with executable permission
    '100755': GitTreeItemType.executablefile,
    # (sub)tree
    '040000': GitTreeItemType.directory,
    # symbolic link
    '120000': GitTreeItemType.symlink,
    # gitlink, i.e., a submodule record
    '160000': GitTreeItemType.submodule,
}