Skip to content

Commit 71befe0

Browse files
feat(dataset): update local files (#2049)
* feat(dataset): update local files
1 parent 3a0321d commit 71befe0

File tree

6 files changed

+138
-29
lines changed

6 files changed

+138
-29
lines changed

renku/cli/dataset.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,9 @@
217217
with those datasets. Modifying those datasets locally will prevent them from
218218
being updated.
219219
220+
The update command also checks for file changes in the project and updates
221+
datasets' metadata accordingly.
222+
220223
You can limit the scope of updated files by specifying dataset names, using
221224
``--include`` and ``--exclude`` to filter based on file names, or using
222225
``--creators`` to filter based on creators. For example, the following command

renku/core/commands/dataset.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -652,13 +652,16 @@ def _update_datasets(client, names, creators, include, exclude, ref, delete, ext
652652
possible_updates = []
653653
unique_remotes = set()
654654
external_files = []
655+
local_files = []
655656

656657
for file_ in records:
657658
if file_.based_on:
658659
possible_updates.append(file_)
659660
unique_remotes.add(file_.based_on.source)
660661
elif file_.external:
661662
external_files.append(file_)
663+
else:
664+
local_files.append(file_)
662665

663666
if ref and len(unique_remotes) > 1:
664667
raise ParameterError(
@@ -672,17 +675,24 @@ def _update_datasets(client, names, creators, include, exclude, ref, delete, ext
672675
else:
673676
communication.echo("To update external files run update command with '--external' flag.")
674677

675-
if not possible_updates:
676-
return
678+
updated_files = []
679+
deleted_files = []
680+
681+
if possible_updates:
682+
updated_files, deleted_files = client.update_dataset_git_files(files=possible_updates, ref=ref, delete=delete)
677683

678-
updated_files, deleted_files = client.update_dataset_git_files(files=possible_updates, ref=ref, delete=delete)
684+
if local_files:
685+
updated, deleted = client.update_dataset_local_files(records=local_files, delete=delete)
686+
updated_files.extend(updated)
687+
deleted_files.extend(deleted)
679688

680689
if deleted_files and not delete:
681-
communication.echo(
682-
"Some files are deleted from remote. To also delete them locally "
683-
"run update command with '--delete' flag."
684-
)
685-
communication.echo("Updated {} files".format(len(updated_files)))
690+
communication.echo("Some files are deleted. To also delete them from datasets' metadata use '--delete' flag.")
691+
692+
message = f"Updated {len(updated_files)} files"
693+
if delete:
694+
message += f" and deleted {len(deleted_files)} files"
695+
communication.echo(message)
686696

687697

688698
def update_datasets():

renku/core/management/datasets.py

Lines changed: 53 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1049,6 +1049,58 @@ def move_files(self, files, to_dataset, commit):
10491049
if to_dataset:
10501050
to_dataset.to_yaml()
10511051

1052+
def update_dataset_local_files(self, records, delete=False):
    """Find local dataset files that changed or disappeared and refresh metadata.

    Records whose ``based_on`` or ``external`` attribute is truthy are skipped;
    only the remaining (local) records are inspected.

    A record is reported as deleted when its path does not exist or when
    ``find_previous_commit`` raises ``KeyError`` for it. A record is reported
    as updated when the commit sha obtained from its label differs from the
    ``hexsha`` of the commit returned by ``find_previous_commit``.

    :param records: dataset file records to check.
    :param delete: when truthy, deleted files are also passed on for removal
        from the datasets' metadata.
    :return: a ``(updated_files, deleted_files)`` tuple of record lists.
    """
    progress_label = "Checking for local updates"
    changed = []
    missing = []

    try:
        communication.start_progress(progress_label, len(records))
        for record in records:
            communication.update_progress(progress_label, 1)

            # Remote-backed and external files are not handled here.
            if record.based_on or record.external:
                continue

            if not Path(record.path).exists():
                missing.append(record)
                continue

            try:
                latest_commit = self.find_previous_commit(record.path)
            except KeyError:
                missing.append(record)
                continue

            # The label check stays outside the ``try`` so that a ``KeyError``
            # raised by it is not mistaken for a missing file.
            if self._get_commit_sha_from_label(record) != latest_commit.hexsha:
                changed.append(record)
    finally:
        communication.finalize_progress(progress_label)

    # Deleted files alone only require a metadata update when ``delete`` is set.
    if changed or (delete and missing):
        self._update_datasets_metadata(changed, missing, delete)

    return changed, missing
1084+
1085+
def _update_datasets_metadata(self, updated_files, deleted_files, delete):
    """Apply file updates/removals to their datasets and persist the result.

    For every updated record a new ``DatasetFile`` is built with
    ``DatasetFile.from_revision`` and handed to the owning dataset's
    ``update_files``. When ``delete`` is truthy, every deleted record is
    unlinked from its dataset. Each dataset touched this way is then written
    with ``to_yaml`` and passed to ``update_datasets_provenance``.

    :param updated_files: records whose metadata must be regenerated.
    :param deleted_files: records that no longer exist.
    :param delete: whether deleted records are removed from their datasets.
    """
    # Keyed by dataset name so each dataset is persisted once.
    touched = {}

    for record in updated_files:
        refreshed = DatasetFile.from_revision(
            self,
            path=record.path,
            based_on=record.based_on,
            url=record.url,
            source=record.source,
        )
        record.dataset.update_files([refreshed])
        touched[record.dataset.name] = record.dataset

    for record in deleted_files if delete else ():
        record.dataset.unlink_file(record.path)
        touched[record.dataset.name] = record.dataset

    for modified in touched.values():
        modified.to_yaml()
        self.update_datasets_provenance(modified)
1103+
10521104
def update_dataset_git_files(self, files, ref, delete=False):
10531105
"""Update files and dataset metadata according to their remotes.
10541106
@@ -1133,25 +1185,7 @@ def update_dataset_git_files(self, files, ref, delete=False):
11331185
skip_hooks=skip_hooks,
11341186
)
11351187

1136-
# Update datasets' metadata
1137-
1138-
modified_datasets = {}
1139-
1140-
for file_ in updated_files:
1141-
new_file = DatasetFile.from_revision(
1142-
self, path=file_.path, based_on=file_.based_on, url=file_.url, source=file_.source
1143-
)
1144-
file_.dataset.update_files([new_file])
1145-
modified_datasets[file_.dataset.name] = file_.dataset
1146-
1147-
if delete:
1148-
for file_ in deleted_files:
1149-
file_.dataset.unlink_file(file_.path)
1150-
modified_datasets[file_.dataset.name] = file_.dataset
1151-
1152-
for dataset in modified_datasets.values():
1153-
dataset.to_yaml()
1154-
self.update_datasets_provenance(dataset)
1188+
self._update_datasets_metadata(updated_files, deleted_files, delete)
11551189

11561190
return updated_files, deleted_files
11571191

renku/core/management/migrations/models/v8.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ def __init__(self, **kwargs):
4040
if hasattr(self, "path") and (not self._id or self._id.startswith("_:")):
4141
hexsha = "UNCOMMITTED"
4242
if self.client and Path(self.path).exists():
43-
hexsha = self.client.find_previous_commit().hexsha
43+
hexsha = self.client.find_previous_commit(self.path).hexsha
4444

4545
self._id = generate_file_id(client=self.client, hexsha=hexsha, path=self.path)
4646

tests/cli/test_datasets.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1987,3 +1987,65 @@ def test_authorized_import(mock_kg, client, runner):
19871987
assert 1 == result.exit_code
19881988
assert "Unauthorized access to knowledge graph" not in result.output
19891989
assert "Resource not found in knowledge graph" in result.output
1990+
1991+
1992+
def test_update_local_file(runner, client, directory_tree):
    """Check that committed changes to local files are picked up by update."""

    def overwrite_and_commit(path, message):
        # Rewrite the file, commit everything and return the new HEAD sha.
        path.write_text("some updates")
        client.repo.git.add("--all")
        client.repo.index.commit(message)
        return client.repo.head.object.hexsha

    add_result = runner.invoke(cli, ["dataset", "add", "-c", "my-data", str(directory_tree)])
    assert 0 == add_result.exit_code

    tree_in_dataset = Path(DATA_DIR) / "my-data" / directory_tree.name

    file1 = tree_in_dataset / "file1"
    commit_sha_after_file1 = overwrite_and_commit(file1, "file1")

    file2 = tree_in_dataset / "dir1" / "file2"
    commit_sha_after_file2 = overwrite_and_commit(file2, "file2")

    old_dataset = client.load_dataset("my-data")

    result = runner.invoke(cli, ["dataset", "update", "my-data"])

    assert 0 == result.exit_code
    dataset = client.load_dataset("my-data")
    # Each file's label must now reference the commit that last changed it.
    assert commit_sha_after_file1 in dataset.find_file(file1)._label
    assert commit_sha_after_file2 in dataset.find_file(file2)._label
    assert_dataset_is_mutated(old=old_dataset, new=dataset)
2017+
2018+
2019+
def test_update_local_deleted_file(runner, client, directory_tree):
    """Check that update reports deleted local files and removes them with ``--delete``."""

    def run_update(*flags):
        # Invoke ``dataset update`` on ``my-data`` with optional extra flags.
        return runner.invoke(cli, ["dataset", "update", *flags, "my-data"])

    def head_sha():
        return client.repo.head.object.hexsha

    add_result = runner.invoke(cli, ["dataset", "add", "-c", "my-data", str(directory_tree)])
    assert 0 == add_result.exit_code

    file1 = Path(DATA_DIR) / "my-data" / directory_tree.name / "file1"
    file1.unlink()
    client.repo.git.add("--all")
    client.repo.index.commit("deleted file1")
    commit_sha_after_file1_delete = head_sha()

    # NOTE: Update without `--delete` only reports the deletion
    result = run_update()

    assert 0 == result.exit_code
    assert "Some files are deleted." in result.output
    assert "Updated 0 files" in result.output
    assert commit_sha_after_file1_delete == head_sha()
    old_dataset = client.load_dataset("my-data")
    assert old_dataset.find_file(file1)

    # NOTE: Update with `--delete`
    result = run_update("--delete")

    assert 0 == result.exit_code
    assert "Updated 0 files and deleted 1 files" in result.output
    assert commit_sha_after_file1_delete != head_sha()
    dataset = client.load_dataset("my-data")
    assert dataset.find_file(file1) is None
    assert_dataset_is_mutated(old=old_dataset, new=dataset)

    # NOTE: A second `--delete` run has nothing left to remove
    result = run_update("--delete")

    assert 0 == result.exit_code
    assert "Updated 0 files and deleted 0 files" in result.output

tests/cli/test_integration_datasets.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1102,7 +1102,7 @@ def test_dataset_update_remove_file(client, runner):
11021102

11031103
result = runner.invoke(cli, ["dataset", "update", "--ref", "v0.5.0"], catch_exceptions=False)
11041104
assert 0 == result.exit_code, result.output + str(result.stderr_bytes)
1105-
assert "Some files are deleted from remote." in result.output
1105+
assert "Some files are deleted." in result.output
11061106
assert file_path.exists()
11071107

11081108
result = runner.invoke(cli, ["dataset", "update", "--ref", "v0.5.0", "--delete"], catch_exceptions=False)

0 commit comments

Comments
 (0)