2828
2929import attr
3030import requests
31- import yaml
3231
3332from renku import errors
3433from renku ._compat import Path
@@ -65,20 +64,26 @@ def datasets_from_commit(self, commit=None):
6564 blob = tree / self .METADATA
6665 except KeyError :
6766 continue
68-
69- yield Dataset .from_jsonld (
70- yaml .safe_load (blob .data_stream .read ()),
71- __reference__ = Path (blob .path ),
67+ dataset = Dataset .from_yaml (
68+ self .path / Path (blob .path ), client = self
7269 )
70+ dataset .commit = commit
71+ yield dataset
7372
@property
def datasets(self):
    """Return a mapping from metadata file path to the dataset it describes.

    Recursively searches ``self.renku_datasets_path`` for files named
    ``self.METADATA`` and loads each match through ``self.get_dataset``.
    """
    metadata_files = self.renku_datasets_path.rglob(self.METADATA)
    return {
        metadata: self.get_dataset(metadata) for metadata in metadata_files
    }
8180
def get_dataset(self, path):
    """Return the dataset loaded from the metadata file at ``path``.

    A relative ``path`` is joined onto ``self.path`` before loading; an
    absolute ``path`` is used unchanged. This client is handed to
    ``Dataset.from_yaml`` as the ``client`` argument.
    """
    metadata_path = path if path.is_absolute() else self.path / path
    return Dataset.from_yaml(metadata_path, client=self)
86+
8287 def dataset_path (self , name ):
8388 """Get dataset path from name."""
8489 from renku .models .refs import LinkReference
@@ -98,7 +103,7 @@ def load_dataset(self, name=None):
98103 if name :
99104 path = self .dataset_path (name )
100105 if path .exists ():
101- dataset = Dataset . from_yaml (path )
106+ dataset = self . get_dataset (path )
102107
103108 return dataset
104109
@@ -116,7 +121,9 @@ def with_dataset(self, name=None):
116121 path .parent .mkdir (parents = True , exist_ok = True )
117122
118123 with with_reference (path ):
119- dataset = Dataset (identifier = identifier , name = name )
124+ dataset = Dataset (
125+ identifier = identifier , name = name , client = self
126+ )
120127
121128 if name :
122129 LinkReference .create (client = self , name = 'datasets/' +
@@ -150,32 +157,38 @@ def add_data_to_dataset(
150157 dataset , dataset_path , url , target , ** kwargs
151158 )
152159 else :
153- files = {}
160+ files = []
154161 for t in target :
155- files .update (
162+ files .extend (
156163 self ._add_from_git (
157164 dataset , dataset_path , url , t , ** kwargs
158165 )
159166 )
160167 else :
161168 files = self ._add_from_url (dataset , dataset_path , url , ** kwargs )
162169
163- ignored = self .find_ignored_paths (
164- * [
165- os .path .relpath (
166- str (self .renku_datasets_path / dataset .uid / key ),
167- start = str (self .path ),
168- ) for key in files .keys ()
169- ]
170- )
170+ ignored = self .find_ignored_paths (* (data ['path' ]
171+ for data in files )) or []
171172
172173 if ignored :
173174 if force :
174175 self .repo .git .add (* ignored , force = True )
175176 else :
176177 raise errors .IgnoredFiles (ignored )
177178
178- dataset .update_files (files .values ())
179+ # commit all new data
180+ file_paths = {str (data ['path' ]) for data in files if str (data ['path' ])}
181+ self .repo .git .add (* (file_paths - set (ignored )))
182+ self .repo .index .commit (
183+ 'renku dataset: commiting {} newly added files' .
184+ format (len (file_paths ) + len (ignored ))
185+ )
186+
187+ # Generate the DatasetFiles
188+ dataset_files = []
189+ for data in files :
190+ dataset_files .append (DatasetFile .from_revision (self , ** data ))
191+ dataset .update_files (dataset_files )
179192
180193 def _add_from_url (self , dataset , path , url , link = False , ** kwargs ):
181194 """Process an add from url and return the location on disk."""
@@ -202,15 +215,16 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
202215
203216 # if we have a directory, recurse
204217 if src .is_dir ():
205- files = {}
218+ files = []
206219 dst .mkdir (parents = True , exist_ok = True )
207220 for f in src .iterdir ():
208- files .update (
221+ files .extend (
209222 self ._add_from_url (
210223 dataset ,
211224 dst ,
212225 f .absolute ().as_posix (),
213226 link = link ,
227+ ** kwargs
214228 )
215229 )
216230 return files
@@ -243,17 +257,14 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
243257 dst .chmod (mode & ~ (stat .S_IXUSR | stat .S_IXGRP | stat .S_IXOTH ))
244258
245259 self .track_paths_in_storage (str (dst .relative_to (self .path )))
246- dataset_path = self .renku_datasets_path / dataset .name
247- result = os .path .relpath (str (dst ), start = str (dataset_path ))
248- return {
249- result :
250- DatasetFile (
251- path = result ,
252- url = url ,
253- creator = dataset .creator ,
254- dataset = dataset .name ,
255- )
256- }
260+
261+ return [{
262+ 'path' : dst .relative_to (self .path ),
263+ 'url' : url ,
264+ 'creator' : dataset .creator ,
265+ 'dataset' : dataset .name ,
266+ 'parent' : self
267+ }]
257268
258269 def _add_from_git (self , dataset , path , url , target , ** kwargs ):
259270 """Process adding resources from another git repository.
@@ -280,21 +291,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
280291 relative_url = None
281292
282293 if relative_url :
283- result = str (
284- os .path .relpath (
285- str (relative_url ),
286- start = str (self .renku_datasets_path / dataset .uid ),
287- )
288- )
289- return {
290- result :
291- DatasetFile (
292- path = result ,
293- url = url ,
294- creator = dataset .creator ,
295- dataset = dataset .name ,
296- )
297- }
294+ return [{
295+ 'path' : url ,
296+ 'url' : url ,
297+ 'creator' : dataset .creator ,
298+ 'dataset' : dataset .name ,
299+ 'parent' : self
300+ }]
298301
299302 warnings .warn ('Importing local git repository, use HTTPS' )
300303 # determine where is the base repo path
@@ -355,12 +358,12 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
355358
356359 # if we have a directory, recurse
357360 if src .is_dir ():
358- files = {}
361+ files = []
359362 dst .mkdir (parents = True , exist_ok = True )
360363 # FIXME get all files from submodule index
361364 for f in src .iterdir ():
362365 try :
363- files .update (
366+ files .extend (
364367 self ._add_from_git (
365368 dataset ,
366369 path ,
@@ -386,23 +389,18 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
386389 if creator not in creators :
387390 creators .append (creator )
388391
389- dataset_path = self .renku_datasets_path / dataset .name
390- result = os .path .relpath (str (dst ), start = str (dataset_path ))
391-
392392 if u .scheme in ('' , 'file' ):
393393 url = None
394394 else :
395395 url = '{}/{}' .format (url , target )
396396
397- return {
398- result :
399- DatasetFile (
400- path = result ,
401- url = url ,
402- creator = creators ,
403- dataset = dataset .name , # TODO detect original dataset
404- )
405- }
397+ return [{
398+ 'path' : dst .relative_to (self .path ),
399+ 'url' : url ,
400+ 'creator' : creators ,
401+ 'dataset' : dataset .name ,
402+ 'parent' : self
403+ }]
406404
407405 def get_relative_url (self , url ):
408406 """Determine if the repo url should be relative."""
0 commit comments