@@ -369,7 +369,7 @@ def export_dataset(name, provider_name, publish, tag, client_dispatcher: IClient
     except KeyError:
         raise errors.ParameterError("Unknown provider.")
 
-    provider.set_parameters(**kwargs)
+    provider.set_export_parameters(**kwargs)
 
     selected_tag = None
     tags = datasets_provenance.get_all_tags(dataset)  # type: ignore
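
Note on the rename above: the single `set_parameters` is split into direction-specific setters, so export-only options cannot leak into imports and vice versa. A minimal sketch of the idea, assuming a provider base class (the attribute names are illustrative, not renku's actual API):

    class ProviderApi:
        """Sketch: direction-specific parameter setters (illustrative only)."""

        def set_export_parameters(self, *, publish=False, tag=None, **kwargs):
            # Assumed export-only knobs, mirroring export_dataset(publish, tag).
            self._publish = publish
            self._tag = tag

        def set_import_parameters(self, **kwargs):
            # Assumed import knobs, forwarded from import_dataset(**kwargs).
            self._import_options = dict(kwargs)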
@@ -424,6 +424,7 @@ def import_dataset(
     previous_dataset=None,
     delete=False,
     gitlab_token=None,
+    **kwargs,
 ):
     """Import data from a 3rd party provider or another renku project.
 
@@ -449,11 +450,13 @@ def import_dataset(
 
     assert provider is not None
 
+    provider.set_import_parameters(**kwargs)
+
     try:
         record = provider.find_record(uri, gitlab_token=gitlab_token)
         provider_dataset: ProviderDataset = record.as_dataset(client)
         files: List[ProviderDatasetFile] = record.files_info
-        total_size = 0
+        total_size = 0.0
 
         if not yes:
             communication.echo(
@@ -477,9 +480,9 @@ def import_dataset(
 
             communication.confirm(text_prompt, abort=True, warning=True)
 
-        for file_ in files:
-            if file_.size_in_mb is not None:
-                total_size += file_.size_in_mb
+        for file in files:
+            if file.size_in_mb is not None:
+                total_size += file.size_in_mb
 
         total_size *= 2 ** 20
 
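
For reference, the loop above sums per-file sizes reported in mebibytes (skipping files whose size is unknown), and `total_size *= 2 ** 20` then converts the total to bytes (1 MiB = 2^20 = 1,048,576 bytes). A worked example:

    sizes_in_mb = [1.5, 0.25, None]  # None: provider did not report a size
    total_size = sum(s for s in sizes_in_mb if s is not None)  # 1.75 (MiB)
    total_size *= 2 ** 20  # 1835008.0 bytes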
@@ -509,7 +512,7 @@ def import_dataset(
             with_metadata=provider_dataset,
             force=True,
             extract=extract,
-            all_at_once=True,
+            is_import=True,
             destination_names=names,
             total_size=total_size,
             overwrite=True,
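
The `all_at_once` → `is_import` rename touches only this keyword argument. For orientation, a hedged sketch of the relevant part of the `add_data_to_dataset` signature, reconstructed from the call sites in this diff (parameter list abridged, defaults assumed):

    def add_data_to_dataset(
        urls,
        dataset_name,
        *,
        sources=None,
        checksums=None,
        with_metadata=None,
        force=False,
        extract=False,
        is_import=False,  # renamed from all_at_once: marks a provider import
        is_renku_import=False,
        destination_names=None,
        total_size=None,
        overwrite=False,
        **kwargs,
    ):
        ...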
@@ -535,39 +538,51 @@ def import_dataset(
         if not provider_dataset.data_dir:
             raise errors.OperationError(f"Data directory for dataset must be set: {provider_dataset.name}")
 
-        sources = []
-
-        if record.datadir_exists:
-            sources = [f"{provider_dataset.data_dir}/*"]
-
-        for file in files:
-            try:
-                Path(file.path).relative_to(provider_dataset.data_dir)
-            except ValueError:  # Files that are not in dataset's data directory
-                sources.append(file.path)
+        if provider_dataset.version:  # NOTE: A tag was specified for import
+            sources, checksums = zip(*[(f.path, f.checksum) for f in files])  # type: ignore
+        else:
+            sources = [f.path for f in files]  # type: ignore
+            checksums = None
 
         new_dataset = add_data_to_dataset(
             urls=[record.project_url],
             dataset_name=name,
             sources=sources,
+            checksums=checksums,
             with_metadata=provider_dataset,
+            is_renku_import=True,
             create=not previous_dataset,
             overwrite=True,
             repository=record.repository,
             clear_files_before=True,
+            dataset_datadir=provider_dataset.data_dir,
+            force=True,  # NOTE: Force-add to include any ignored files
         )
 
         if previous_dataset:
             _update_datasets_metadata(new_dataset, previous_dataset, delete, provider_dataset.same_as)
 
+        if provider_dataset.tag:
+            add_dataset_tag(
+                dataset_name=new_dataset.name,
+                tag=provider_dataset.tag.name,
+                description=provider_dataset.tag.description,
+            )
+        elif provider_dataset.version:
+            add_dataset_tag(
+                dataset_name=new_dataset.name,
+                tag=provider_dataset.version,
+                description=f"Tag {provider_dataset.version} created by renku import",
+            )
+
         record.import_images(new_dataset)
 
     database_dispatcher.current_database.commit()
 
 
 @inject.autoparams()
 def update_datasets(
-    names,
+    names: List[str],
     creators,
     include,
     exclude,
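
The `zip(*...)` in the tagged-import branch above transposes a list of `(path, checksum)` pairs into two parallel tuples. A standalone illustration (note that an empty file list would make the unpacking raise `ValueError`, so the branch assumes at least one file):

    pairs = [("data/a.csv", "abc123"), ("data/b.csv", "def456")]
    sources, checksums = zip(*pairs)
    assert sources == ("data/a.csv", "data/b.csv")
    assert checksums == ("abc123", "def456")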
@@ -594,41 +609,56 @@ def update_datasets(
         client_dispatcher(IClientDispatcher): Injected client dispatcher.
         dataset_gateway(IDatasetGateway): Injected dataset gateway.
     """
+    from renku.core.dataset.providers.renku import RenkuProvider
+
     if not update_all and not names and not include and not exclude and not dry_run:
         raise errors.ParameterError("No update criteria is specified")
 
     client = client_dispatcher.current_client
 
-    imported_datasets: List[Dataset] = []
+    imported_dataset_updates: List[Dataset] = []
 
     all_datasets = dataset_gateway.get_all_active_datasets()
+    imported_datasets = [d for d in all_datasets if d.same_as]
 
     if names and update_all:
         raise errors.ParameterError("Cannot pass dataset names when updating all datasets")
     elif (include or exclude) and update_all:
         raise errors.ParameterError("Cannot specify include and exclude filters when updating all datasets")
-    elif (include or exclude) and names and any(d.same_as for d in all_datasets if d.name in names):
+    elif (include or exclude) and names and any(d for d in imported_datasets if d.name in names):
         raise errors.IncompatibleParametersError(a="--include/--exclude", b="imported datasets")
 
-    names_provided = bool(names)
+    names = names or [d.name for d in all_datasets]
 
     # NOTE: update imported datasets
     if not include and not exclude:
-        for dataset in all_datasets:
-            if names and dataset.name not in names or not dataset.same_as:
+        must_match_records = False
+
+        for dataset in imported_datasets:
+            if dataset.name not in names:
                 continue
 
-            uri = dataset.same_as.url
-            if isinstance(uri, dict):
-                uri = cast(str, uri.get("@id"))
+            uri = dataset.same_as.value  # type: ignore
             provider, _ = ProviderFactory.from_uri(uri)
 
             if not provider:
                 continue
 
             record = provider.find_record(uri)
 
-            if record.is_last_version(uri) and record.version == dataset.version:
+            if isinstance(provider, RenkuProvider) and dataset.version is not None:
+                tags = dataset_gateway.get_all_tags(dataset=dataset)
+                tag = next((t for t in tags if t.name == dataset.version), None)
+                # NOTE: Do not update Renku datasets that are imported from a specific version
+                if tag is not None and tag.dataset_id.value == dataset.id:
+                    communication.echo(
+                        f"Skipped updating imported Renku dataset '{dataset.name}' with tag '{tag.name}'"
+                    )
+                    names.remove(dataset.name)
+                    continue
+
+            if record.is_last_version(uri) and record.is_version_equal_to(dataset):
+                names.remove(dataset.name)
                 continue
 
             if not dry_run:
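
The new guard above skips datasets that were imported at a pinned tag. Reduced to its core, the check looks like this (the `dataset`/`tags` stand-ins are assumptions, not renku's real classes):

    def is_pinned_to_tag(dataset, tags):
        # A dataset imported at a specific version carries that version as a
        # tag name; if that tag still points at this exact dataset id, an
        # update would silently move it off the pinned version, so skip it.
        tag = next((t for t in tags if t.name == dataset.version), None)
        return tag is not None and tag.dataset_id.value == dataset.id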
@@ -651,25 +681,25 @@ def update_datasets(
 
             communication.echo(f"Updated dataset '{dataset.name}' from remote provider")
 
-            if names:
-                names.remove(dataset.name)
-            imported_datasets.append(dataset)
+            names.remove(dataset.name)
+            imported_dataset_updates.append(dataset)
     else:
-        imported_datasets = [d for d in all_datasets if d.same_as]
+        must_match_records = True
 
-    imported_datasets_view_models = [DatasetViewModel.from_dataset(d) for d in imported_datasets]
+    imported_dataset_updates_view_models = [DatasetViewModel.from_dataset(d) for d in imported_dataset_updates]
 
-    if names_provided and not names:
-        return imported_datasets_view_models, []
+    if not names:
+        return imported_dataset_updates_view_models, []
 
+    # NOTE: Exclude all imported datasets from the individual file filter
     records = filter_dataset_files(
         names=names, creators=creators, include=include, exclude=exclude, ignore=[d.name for d in imported_datasets]
     )
 
     if not records:
-        if imported_datasets:
-            return imported_datasets_view_models, []
-        raise errors.ParameterError("No files matched the criteria.")
+        if must_match_records:
+            raise errors.ParameterError("No files matched the criteria.")
+        return imported_dataset_updates_view_models, []
 
     git_files = []
     unique_remotes = set()
@@ -730,7 +760,7 @@ def update_datasets(
     dataset_files_view_models = [
         DatasetFileViewModel.from_dataset_file(cast(DatasetFile, f), f.dataset) for f in updated_files + deleted_files
     ]
-    return imported_datasets_view_models, dataset_files_view_models
+    return imported_dataset_updates_view_models, dataset_files_view_models
 
 
 def show_dataset(name: str, tag: Optional[str] = None):
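
Taken together, the import-side changes plumb extra keyword arguments through `import_dataset` into `provider.set_import_parameters(**kwargs)`, and afterwards record the imported version as a dataset tag. A hedged usage sketch of the service-layer call (the URI and the `tag` keyword are assumptions based on this diff, not a documented API):

    import_dataset(
        uri="https://renkulab.io/projects/user/project/datasets/my-dataset",
        name="my-dataset",
        yes=True,
        tag="v2",  # assumed: forwarded via **kwargs to set_import_parameters()
    )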