Skip to content

Commit 420653f

Browse files
fix(dataset): deserialization error (#1675)
1 parent 2e0e891 commit 420653f

File tree

7 files changed

+101
-36
lines changed

7 files changed

+101
-36
lines changed

conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,25 @@ def dataset(client):
413413
return dataset
414414

415415

416+
@pytest.fixture
def client_with_datasets(client, directory_tree):
    """Yield a client whose project already contains two datasets.

    ``dataset-1`` is created with keywords and a single creator. ``dataset-2``
    gets keywords, two creators and every top-level entry of ``directory_tree``.
    """
    from renku.core.models.provenance.agents import Person

    # NOTE(review): the addresses below look like placeholders left by an
    # e-mail-protection filter rather than real test addresses -- confirm.
    first_creator = Person.from_string("P1 <[email protected]> [IANA]")
    second_creator = Person.from_string("P2 <[email protected]>")

    client.create_dataset(name="dataset-1", keywords=["dataset", "1"], creators=[first_creator])

    with client.with_dataset("dataset-2", create=True) as dataset:
        dataset.keywords = ["dataset", "2"]
        dataset.creators = [first_creator, second_creator]

        # Only the direct children of the tree are passed; glob("*") is not recursive.
        entries = [str(entry) for entry in directory_tree.glob("*")]
        client.add_data_to_dataset(dataset=dataset, urls=entries)

    yield client
433+
434+
416435
@pytest.fixture(params=[".", "some/sub/directory"])
417436
def subdirectory(project, request):
418437
"""Runs tests in root directory and a subdirectory."""

renku/core/management/migrations/models/v3.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,18 @@
2222
from marshmallow import EXCLUDE, post_load, pre_load
2323

2424
from renku.core.models import jsonld
25-
from renku.core.models.calamus import JsonLDSchema, Uri, fields, prov, rdfs, renku, schema, wfprov
25+
from renku.core.models.calamus import (
26+
DateTimeList,
27+
JsonLDSchema,
28+
StringList,
29+
Uri,
30+
fields,
31+
prov,
32+
rdfs,
33+
renku,
34+
schema,
35+
wfprov,
36+
)
2637
from renku.core.models.datasets import generate_dataset_tag_id, generate_url_id
2738
from renku.core.models.git import get_user_info
2839
from renku.core.models.projects import generate_project_id
@@ -36,6 +47,8 @@ def __init__(self, **kwargs):
3647
"""Initialize an instance."""
3748
self.client = None
3849

50+
kwargs.setdefault("_id", None)
51+
3952
for k, v in kwargs.items():
4053
setattr(self, k, v)
4154

@@ -185,11 +198,11 @@ class Meta:
185198
unknown = EXCLUDE
186199

187200
_id = fields.Id()
188-
name = fields.String(schema.name)
201+
name = StringList(schema.name)
189202
email = fields.String(schema.email, missing=None)
190-
label = fields.String(rdfs.label)
191-
affiliation = fields.String(schema.affiliation, missing=None)
192-
alternate_name = fields.String(schema.alternateName, missing=None)
203+
label = StringList(rdfs.label)
204+
affiliation = StringList(schema.affiliation, missing=None)
205+
alternate_name = StringList(schema.alternateName, missing=None)
193206

194207
@post_load
195208
def make_instance(self, data, **kwargs):
@@ -213,8 +226,8 @@ class Meta:
213226
_id = fields.Id(missing=None)
214227
agent_version = fields.String(schema.agent, missing="pre-0.11.0")
215228
name = fields.String(schema.name, missing=None)
216-
created = fields.DateTime(schema.dateCreated, missing=None)
217-
version = fields.String(schema.schemaVersion, missing=1)
229+
created = DateTimeList(schema.dateCreated, missing=None)
230+
version = StringList(schema.schemaVersion, missing="1")
218231
creator = fields.Nested(schema.creator, PersonSchemaV3, missing=None)
219232

220233

renku/core/management/migrations/models/v8.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,33 @@
1818
"""Migration models V8."""
1919

2020
import os
21+
from pathlib import Path
2122

2223
from marshmallow import EXCLUDE, pre_dump
2324

2425
from renku.core.models import jsonld
2526
from renku.core.models.calamus import Uri, fields, schema
27+
from renku.core.models.entities import generate_file_id
2628

2729
from .v3 import CreatorMixinSchemaV3, DatasetTagSchemaV3, EntitySchemaV3, LanguageSchemaV3, PersonSchemaV3, UrlSchemaV3
2830
from .v7 import Base, DatasetFileSchemaV7
2931

3032

33+
class DatasetFile(Base):
    """Migration model for a file that belongs to a dataset."""

    def __init__(self, **kwargs):
        """Set attributes from ``kwargs`` and regenerate the id when it is unusable.

        The id is regenerated only when the instance has a ``path`` and its
        ``_id`` is empty or starts with ``"_:"`` (presumably a JSON-LD blank
        node id -- confirm).
        """
        super().__init__(**kwargs)

        # Without a path there is nothing to derive an id from.
        if not hasattr(self, "path"):
            return

        current_id = self._id
        if current_id and not current_id.startswith("_:"):
            # A proper id is already present; keep it.
            return

        # Fall back to a marker value when there is no client or the file is
        # not present on disk.
        on_disk = self.client and Path(self.path).exists()
        hexsha = self.client.find_previous_commit().hexsha if on_disk else "UNCOMMITTED"

        self._id = generate_file_id(client=self.client, hexsha=hexsha, path=self.path)
46+
47+
3148
class Dataset(Base):
3249
"""Dataset migration model."""
3350

@@ -43,11 +60,25 @@ def to_yaml(self, path=None):
4360
"""Write content to a YAML file."""
4461
from renku.core.management import LocalClient
4562

63+
for file_ in self.files:
64+
file_._project = self._project
65+
4666
data = DatasetSchemaV8(flattened=True).dump(self)
4767
path = path or self._metadata_path or os.path.join(self.path, LocalClient.METADATA)
4868
jsonld.write_yaml(path=path, data=data)
4969

5070

71+
class DatasetFileSchemaV8(DatasetFileSchemaV7):
    """V8 dataset-file schema: the V7 fields, bound to the V8 ``DatasetFile`` model."""

    class Meta:
        """Schema options."""

        # Model class this schema is bound to.
        model = DatasetFile
        rdf_type = schema.DigitalDocument
        # marshmallow's EXCLUDE policy for keys the schema does not declare.
        unknown = EXCLUDE
80+
81+
5182
class DatasetSchemaV8(CreatorMixinSchemaV3, EntitySchemaV3):
5283
"""Dataset schema."""
5384

@@ -62,7 +93,7 @@ class Meta:
6293
date_created = fields.DateTime(schema.dateCreated, missing=None)
6394
date_published = fields.DateTime(schema.datePublished, missing=None)
6495
description = fields.String(schema.description, missing=None)
65-
files = fields.Nested(schema.hasPart, DatasetFileSchemaV7, many=True)
96+
files = fields.Nested(schema.hasPart, DatasetFileSchemaV8, many=True)
6697
identifier = fields.String(schema.identifier)
6798
in_language = fields.Nested(schema.inLanguage, LanguageSchemaV3, missing=None)
6899
keywords = fields.List(schema.keywords, fields.String())

renku/core/models/calamus.py

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -116,34 +116,22 @@ def _deserialize(self, value, attr, data, **kwargs):
116116
raise ValueError("Invalid type for field {}: {}".format(self.name, type(value)))
117117

118118

119-
class StringList(fields.String):
    """A String field that might be a list when deserializing."""

    def __init__(self, *args, return_max_value=True, **kwargs):
        """Create an instance.

        :param return_max_value: when the incoming value is a collection, keep
            its largest element if true (the default), its smallest otherwise.
        """
        super().__init__(*args, **kwargs)
        self.return_max_value = return_max_value

    def _deserialize(self, value, attr, data, **kwargs):
        """Reduce a collection to a single element and deserialize it as a string."""
        candidate = normalize_value(value)

        if isinstance(candidate, (list, tuple, set)):
            if candidate:
                # Elements are compared with their natural ordering, which is
                # lexicographic for strings (so "9" ranks above "10").
                pick = max if self.return_max_value else min
                candidate = pick(candidate)
            else:
                # An empty collection carries no value at all.
                candidate = None

        return super()._deserialize(candidate, attr, data, **kwargs)
147135

148136

149137
class DateTimeList(fields.DateTime):

renku/core/models/projects.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
from renku.core.management.migrate import SUPPORTED_PROJECT_VERSION
2828
from renku.core.models import jsonld
29-
from renku.core.models.calamus import JsonLDSchema, Nested, fields, prov, renku, schema
29+
from renku.core.models.calamus import DateTimeList, JsonLDSchema, Nested, StringList, fields, prov, renku, schema
3030
from renku.core.models.datastructures import Collection
3131
from renku.core.models.provenance.agents import Person, PersonSchema
3232
from renku.core.utils.datetime8601 import parse_date
@@ -184,8 +184,8 @@ class Meta:
184184
unknown = EXCLUDE
185185

186186
name = fields.String(schema.name, missing=None)
187-
created = fields.DateTime(schema.dateCreated, missing=None, format="iso", extra_formats=("%Y-%m-%d",))
188-
version = fields.String(schema.schemaVersion, missing=1)
187+
created = DateTimeList(schema.dateCreated, missing=None, format="iso", extra_formats=("%Y-%m-%d",))
188+
version = StringList(schema.schemaVersion, missing="1")
189189
agent_version = fields.String(schema.agent, missing="pre-0.11.0")
190190
template_source = fields.String(renku.templateSource, missing=None)
191191
template_ref = fields.String(renku.templateReference, missing=None)

renku/core/models/provenance/agents.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@ class Meta:
141141
model = Person
142142
unknown = EXCLUDE
143143

144-
name = StringList(schema.name, fields.String(), missing=None)
144+
name = StringList(schema.name, missing=None)
145145
email = fields.String(schema.email, missing=None)
146-
label = StringList(rdfs.label, fields.String(), missing=None)
147-
affiliation = StringList(schema.affiliation, fields.String(), missing=None)
148-
alternate_name = StringList(schema.alternateName, fields.String(), missing=None)
146+
label = StringList(rdfs.label, missing=None)
147+
affiliation = StringList(schema.affiliation, missing=None)
148+
alternate_name = StringList(schema.alternateName, missing=None)
149149
_id = fields.Id(init_name="id")
150150

151151

tests/core/commands/test_serialization.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,17 @@ def test_calamus(client, dataset_metadata_before_calamus):
8787
file_ = dataset.find_file("data/dataverse/local/result.csv")
8888
assert file_.external is False
8989
assert "file://../../../../tmp/result.csv" == file_.url
90+
91+
92+
def test_dataset_with_multiple_project_version(client_with_datasets):
    """Test deserialization of a dataset whose files carry different project versions."""
    client = client_with_datasets
    max_version = "42000"

    # Bump the project version on one file only, so the stored metadata ends up
    # with more than one version value.
    with client.with_dataset("dataset-2") as dataset:
        changed_file = dataset.find_file(dataset.data_dir / "file1")
        changed_file._project.version = max_version

    reloaded = client.load_dataset("dataset-2")
    loaded_versions = {f._project.version for f in reloaded.files}

    # Every file must report the same, highest version after loading.
    assert {max_version} == loaded_versions

0 commit comments

Comments
 (0)