Skip to content

Commit baa1c9f

Browse files
feat(dataset): wildcard support when adding data from git (#1128)
* feat(dataset): wildcards when adding from git
1 parent 3421272 commit baa1c9f

File tree

5 files changed

+114
-23
lines changed

5 files changed

+114
-23
lines changed

Pipfile.lock

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

renku/cli/dataset.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,15 @@
118118
my-dataset/
119119
datafile
120120
121+
You can use shell-like wildcards (e.g. *, **, ?) when specifying paths to be
122+
added. Put wildcard patterns in quotes to prevent your shell from expanding
123+
them.
124+
125+
.. code-block:: console
126+
127+
$ renku dataset add my-dataset --source 'path/**/datafile' \
128+
git+ssh://host.io/namespace/project.git
129+
121130
You can use ``--destination`` or ``-d`` flag to change the name of the target
122131
file or directory. The semantics here are similar to the POSIX copy command:
123132
if the destination does not exist or if it is a file then the source will be

renku/core/management/datasets.py

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import time
2727
import uuid
2828
import warnings
29+
from collections import OrderedDict
2930
from configparser import NoSectionError
3031
from contextlib import contextmanager
3132
from pathlib import Path
@@ -36,6 +37,7 @@
3637
import patoolib
3738
import requests
3839
from git import GitCommandError, GitError, Repo
40+
from wcmatch import glob
3941

4042
from renku.core import errors
4143
from renku.core.management.clone import clone
@@ -520,33 +522,37 @@ def _add_from_git(self, dataset, url, sources, destination, ref):
520522

521523
# Get all files from repo that match sources
522524
repo, repo_path = self.prepare_git_repo(url, ref)
523-
copied_sources = set()
524525
files = set()
526+
used_sources = set()
525527
for file in repo.head.commit.tree.traverse():
526528
path = file.path
527529
result = self._get_src_and_dst(
528-
path, repo_path, sources, destination
530+
path, repo_path, sources, destination, used_sources
529531
)
530532

531533
if result:
532534
files.add(result)
533-
source = result[3]
534-
copied_sources.add(source)
535535

536-
uncopied_sources = sources - copied_sources
537-
if uncopied_sources:
538-
uncopied_sources = {str(s) for s in uncopied_sources}
536+
unused_sources = set(sources.keys()) - used_sources
537+
if unused_sources:
538+
unused_sources = {str(s) for s in unused_sources}
539539
raise errors.ParameterError(
540-
'No such file or directory', param_hint=uncopied_sources
540+
'No such file or directory', param_hint=unused_sources
541541
)
542542

543+
if destination.exists() and not destination.is_dir():
544+
if len(files) > 1:
545+
raise errors.ParameterError(
546+
'Cannot copy multiple files or directories to a file'
547+
)
548+
543549
# Create metadata and move files to dataset
544550
results = []
545551
remote_client = LocalClient(repo_path)
546552

547553
# Pull files from LFS
548554
paths = set()
549-
for path, src, _, __ in files:
555+
for path, src, _ in files:
550556
if src.is_dir():
551557
continue
552558
if src.is_symlink():
@@ -561,7 +567,7 @@ def _add_from_git(self, dataset, url, sources, destination, ref):
561567
paths = {f[0] for f in files}
562568
metadata = self._fetch_files_metadata(remote_client, paths)
563569

564-
for path, src, dst, _ in files:
570+
for path, src, dst in files:
565571
if not src.is_dir():
566572
# Use original metadata if it exists
567573
based_on = metadata.get(path)
@@ -605,7 +611,11 @@ def _check_overwrite(self, files, force):
605611

606612
def _resolve_paths(self, root_path, paths):
607613
"""Check if paths are within a root path and resolve them."""
608-
return {self._resolve_path(root_path, p) for p in paths}
614+
result = OrderedDict() # Used as an ordered-set
615+
for path in paths:
616+
r = self._resolve_path(root_path, path)
617+
result[r] = None
618+
return result
609619

610620
def _resolve_path(self, root_path, path):
611621
"""Check if a path is within a root path and resolve it."""
@@ -618,18 +628,27 @@ def _resolve_path(self, root_path, path):
618628
'File {} is not within path {}'.format(path, root_path)
619629
)
620630

621-
def _get_src_and_dst(self, path, repo_path, sources, dst_root):
631+
def _get_src_and_dst(
632+
self, path, repo_path, sources, dst_root, used_sources
633+
):
634+
is_wildcard = False
635+
622636
if not sources:
623637
source = Path('.')
624638
else:
625639
source = None
626-
for s in sources:
640+
for s in sources.keys():
627641
try:
628642
Path(path).relative_to(s)
629643
except ValueError:
630-
pass
644+
if glob.globmatch(path, str(s), flags=glob.GLOBSTAR):
645+
is_wildcard = True
646+
source = path
647+
used_sources.add(s)
648+
break
631649
else:
632650
source = s
651+
used_sources.add(source)
633652
break
634653

635654
if not source:
@@ -639,24 +658,26 @@ def _get_src_and_dst(self, path, repo_path, sources, dst_root):
639658
source_name = Path(source).name
640659
relative_path = Path(path).relative_to(source)
641660

642-
if not dst_root.exists():
643-
if len(sources) == 1:
661+
if src.is_dir() and is_wildcard:
662+
sources[source] = None
663+
used_sources.add(source)
664+
665+
if not dst_root.exists(): # Destination will be a file or directory
666+
if len(sources) == 1 and not is_wildcard:
644667
dst = dst_root / relative_path
645668
else: # Treat destination as a directory
646669
dst = dst_root / source_name / relative_path
647670
elif dst_root.is_dir():
648671
dst = dst_root / source_name / relative_path
649672
else: # Destination is an existing file
650-
if len(sources) == 1 and not src.is_dir():
651-
dst = dst_root
652-
elif not sources:
653-
raise errors.ParameterError('Cannot copy repo to file')
654-
else:
673+
if src.is_dir():
655674
raise errors.ParameterError(
656675
'Cannot copy multiple files or directories to a file'
657676
)
677+
# Later we need to check if we are copying multiple files
678+
dst = dst_root
658679

659-
return (path, src, dst, source)
680+
return (path, src, dst)
660681

661682
def _fetch_lfs_files(self, repo_path, paths):
662683
"""Fetch and checkout paths that are tracked by Git LFS."""

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def run(self):
182182
'tabulate>=0.7.7',
183183
'tqdm>=4.31.1',
184184
'walrus>=0.8.0',
185+
'wcmatch>=6.0.0',
185186
'werkzeug>=0.15.5',
186187
]
187188

tests/cli/test_integration_datasets.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,44 @@ def test_add_data_from_git(runner, client, params, path):
784784
assert Path(path).exists()
785785

786786

787+
@pytest.mark.integration
788+
@pytest.mark.parametrize(
789+
'params,files', [
790+
(['-s', 'docker*'], {'docker'}),
791+
(['-s', 'docker/*'
792+
], {'py3.7', 'cuda10.0-tf1.14', 'cuda9.2', 'r3.6.1', 'bioc3_10'}),
793+
(['-s', 'docker/**'], {
794+
'powerline.config', 'Dockerfile', 'entrypoint.sh',
795+
'requirements.txt', 'fix-permissions.sh',
796+
'LICENSE-fix-permissions', 'powerline.bashrc', 'git-config.bashrc'
797+
}),
798+
(['-s', 'docker/*/*sh'], {'entrypoint.sh', 'fix-permissions.sh'}),
799+
]
800+
)
801+
@flaky(max_runs=10, min_passes=1)
802+
def test_add_data_from_git_with_wildcards(runner, client, params, files):
803+
"""Test add data using wildcards to datasets from a git repository."""
804+
REMOTE = 'https://github.com/SwissDataScienceCenter/renku-jupyter.git'
805+
806+
result = runner.invoke(
807+
cli,
808+
['dataset', 'add', 'remote', '--create', '--ref', '0.5.2', REMOTE] +
809+
params,
810+
catch_exceptions=False,
811+
)
812+
assert 0 == result.exit_code
813+
assert files == set(os.listdir('data/remote'))
814+
815+
result = runner.invoke(
816+
cli,
817+
['dataset', 'add', 'remote', '--ref', '0.5.2', '-d', 'new', REMOTE] +
818+
params,
819+
catch_exceptions=False,
820+
)
821+
assert 0 == result.exit_code
822+
assert files == set(os.listdir('data/remote/new'))
823+
824+
787825
@pytest.mark.integration
788826
@flaky(max_runs=10, min_passes=1)
789827
def test_add_from_git_copies_metadata(runner, client):
@@ -814,11 +852,13 @@ def test_add_from_git_copies_metadata(runner, client):
814852
(['-s', 'file'], 2, 'Cannot add multiple URLs'),
815853
(['-d', 'file'], 2, 'Cannot add multiple URLs'),
816854
(['-s', 'non-existing'], 1, 'No such file or directory'),
855+
(['-s', 'docker/*Dockerfile'], 1, 'No such file or directory'),
817856
(['-s', 'docker', '-d', 'LICENSE'
818857
], 1, 'Cannot copy multiple files or directories to a file'),
819858
(['-s', 'LICENSE', '-s', 'Makefile', '-d', 'LICENSE'
820859
], 1, 'Cannot copy multiple files or directories to a file'),
821-
(['-d', 'LICENSE'], 1, 'Cannot copy repo to file'),
860+
(['-d', 'LICENSE'
861+
], 1, 'Cannot copy multiple files or directories to a file'),
822862
]
823863
)
824864
@flaky(max_runs=10, min_passes=1)

0 commit comments

Comments
 (0)