@@ -1,5 +1,4 @@
import csv
import datetime
- import json
import logging
import os
@@ -10,9 +9,8 @@
import click
import structlog

- from dsaps import helpers
+ from dsaps import dspace, helpers
from dsaps.s3 import S3Client
- from dsaps.dspace import DSpaceClient, DSpaceCollection


logger = structlog.get_logger()
@@ -28,7 +26,10 @@ def validate_path(ctx, param, value):

@click.group(chain=True)
@click.option(
-     "--config-file", required=True, help="File path to source configuration JSON."
+     "--config-file",
+     envvar="CONFIG_FILE",
+     required=True,
+     help="File path to source configuration JSON with settings for bitstream retrieval and field mappings.",
)
@click.option(
    "--url",
@@ -83,10 +84,11 @@ def main(ctx, config_file, url, email, password):
    logger.info("Running process")
    source_config = helpers.load_source_config(config_file)
    if url:
-         dspace_client = DSpaceClient(url)
+         dspace_client = dspace.DSpaceClient(url)
        dspace_client.authenticate(email, password)
        ctx.obj["dspace_client"] = dspace_client
-     ctx.obj["config"] = source_config
+     ctx.obj["source_config"] = source_config
+     logger.info("Initializing S3 client")
    ctx.obj["s3_client"] = S3Client.get_client()
    ctx.obj["start_time"] = perf_counter()

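Note on the config file: helpers.load_source_config is not shown in this diff, but the keys this module reads elsewhere (settings.bitstream_folders, settings.id_regex, mapping) imply roughly the following shape. A minimal sketch with invented placeholder values:

# Hypothetical shape of the loaded source config; key names are taken from how
# this module reads ctx.obj["source_config"], all values are illustrative only.
source_config = {
    "settings": {
        "bitstream_folders": ["objects"],  # optional: read with .get() in reconcile
        "id_regex": r"\d{6}",  # required: extracts item IDs from S3 file keys
    },
    "mapping": {
        # assumed structure: metadata CSV column -> DSpace metadata field
        "title": "dc.title",
        "author": "dc.contributor.author",
    },
}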
@@ -97,27 +99,14 @@ def main(ctx, config_file, url, email, password):
    "--metadata-csv",
    required=True,
    type=click.Path(exists=True, file_okay=True, dir_okay=False),
-     help="The filepath to a CSV file containing metadata for Dspace uploads.",
- )
- @click.option(
-     "-f",
-     "--field-map",
-     required=True,
-     type=click.Path(exists=True, file_okay=True, dir_okay=False),
-     help="The filepath to a JSON document that maps columns in the metadata CSV file to a DSpace schema.",
+     help="File path to a CSV file describing the metadata and bitstreams for DSpace uploads.",
)
@click.option(
    "-d",
    "--content-directory",
    required=True,
    help="The name of the S3 bucket containing files for DSpace uploads.",
)
- @click.option(
-     "-t",
-     "--file-type",
-     help="The file type for DSpace uploads (i.e., the file extension, excluding the dot).",
-     default="*",
- )
@click.option(
    "-r",
    "--ingest-report",
@@ -134,41 +123,51 @@ def main(ctx, config_file, url, email, password):
def additems(
    ctx,
    metadata_csv,
-     field_map,
    content_directory,
-     file_type,
    ingest_report,
    collection_handle,
):
    """Add items to a DSpace collection.

-     The method relies on a CSV file with metadata for uploads, a JSON document that maps
-     metadata to a DSpace schema, and a directory containing the files to be uploaded.
+     The updated metadata CSV file from running 'reconcile' is used for this process.
+     The method will first add an item to the specified DSpace collection. The bitstreams
+     (i.e., files) associated with the item are read from the metadata CSV file, and
+     uploaded to the newly created item on DSpace.
    """
-     s3_client = ctx.obj["s3_client"]
+     mapping = ctx.obj["source_config"]["mapping"]
    dspace_client = ctx.obj["dspace_client"]

    if "collection_uuid" not in ctx.obj and collection_handle is None:
        raise click.UsageError(
-             "collection_handle option must be used or "
-             "additems must be run after newcollection "
-             "command."
+             "Option '--collection-handle' must be used or "
+             "run 'additems' after 'newcollection' command."
        )
    elif "collection_uuid" in ctx.obj:
        collection_uuid = ctx.obj["collection_uuid"]
    else:
        collection_uuid = dspace_client.get_uuid_from_handle(collection_handle)
-     with open(metadata_csv, "r") as csvfile, open(field_map, "r") as jsonfile:
+
+     if metadata_csv is None:
+         raise click.UsageError("Option '--metadata-csv' must be used.")
+
+     dspace_collection = dspace.Collection(uuid=collection_uuid)
+
+     with open(metadata_csv, "r") as csvfile:
        metadata = csv.DictReader(csvfile)
-         mapping = json.load(jsonfile)
-         collection = DSpaceCollection.create_metadata_for_items_from_csv(
-             metadata, mapping
+         dspace_collection = dspace_collection.add_items(metadata, mapping)
+
+     for item in dspace_collection.items:
+         logger.info(f"Posting item: {item}")
+         item_uuid, item_handle = dspace_client.post_item_to_collection(
+             collection_uuid, item
        )
-     for item in collection.items:
-         item.bitstreams_in_directory(content_directory, s3_client, file_type)
-     collection.uuid = collection_uuid
-     for item in collection.post_items(dspace_client):
-         logger.info(item.file_identifier)
+         item.uuid = item_uuid
+         item.handle = item_handle
+         logger.info(f"Item posted: {item_uuid}")
+         for bitstream in item.bitstreams:
+             logger.info(f"Posting bitstream: {bitstream}")
+             dspace_client.post_bitstream(item.uuid, bitstream)
+
    logger.info(
        "Total elapsed: %s",
        str(timedelta(seconds=perf_counter() - ctx.obj["start_time"])),
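Note on the new upload flow: the loop above creates each DSpace item first and only then attaches its bitstreams, since a bitstream can only be posted to an existing item. A condensed sketch of the same sequence (the helper name post_collection_items is ours; the dspace.Collection and client call signatures are assumed from this diff):

# Sketch only: mirrors the additems flow above under assumed signatures.
def post_collection_items(dspace_client, collection_uuid, metadata_rows, mapping):
    collection = dspace.Collection(uuid=collection_uuid)
    # Build Item objects (with their bitstreams) from the reconciled CSV rows.
    collection = collection.add_items(metadata_rows, mapping)
    for item in collection.items:
        # The item must exist in DSpace before bitstreams can be attached to it.
        item.uuid, item.handle = dspace_client.post_item_to_collection(collection_uuid, item)
        for bitstream in item.bitstreams:
            dspace_client.post_bitstream(item.uuid, bitstream)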
@@ -192,7 +191,9 @@ def additems(
def newcollection(ctx, community_handle, collection_name):
    """Create a new DSpace collection within a community."""
    dspace_client = ctx.obj["dspace_client"]
-     collection_uuid = dspace_client.post_coll_to_comm(community_handle, collection_name)
+     collection_uuid = dspace_client.post_collection_to_community(
+         community_handle, collection_name
+     )
    ctx.obj["collection_uuid"] = collection_uuid


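Note on chaining: because the group is declared with chain=True, newcollection can hand the collection UUID it stores in ctx.obj straight to a following additems in the same run. A rough invocation sketch using click's test runner; the import path dsaps.cli, the --email/--password option names (inferred from main's signature), and the newcollection option names (--community-handle, --collection-name) are assumptions, as they fall outside this diff:

# Hypothetical chained invocation; several option names below are assumed.
from click.testing import CliRunner

from dsaps.cli import main  # assumed import path for this CLI group

result = CliRunner().invoke(
    main,
    [
        "--config-file", "config/source.json",
        "--url", "https://dspace.example.edu/rest",
        "--email", "user@example.edu",
        "--password", "not-a-real-password",
        "newcollection",
        "--community-handle", "1721.1/100000",
        "--collection-name", "Sample Collection",
        "additems",
        "--metadata-csv", "output/updated-metadata.csv",
        "--content-directory", "example-s3-bucket",
    ],
)
print(result.output)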
@@ -235,22 +236,21 @@ def reconcile(ctx, metadata_csv, output_directory, content_directory):
    * updated-<metadata-csv>.csv: Entries from the metadata CSV file with a
      corresponding file in the content directory.
    """
-     source_settings = ctx.obj["config"]["settings"]
-     s3_client = ctx.obj["s3_client"]
-     files_dict = helpers.get_files_from_s3(
+     source_settings = ctx.obj["source_config"]["settings"]
+     bitstreams = helpers.get_files_from_s3(
        s3_path=content_directory,
-         s3_client=s3_client,
+         s3_client=ctx.obj["s3_client"],
        bitstream_folders=source_settings.get("bitstream_folders"),
        id_regex=source_settings["id_regex"],
    )
    metadata_ids = helpers.create_metadata_id_list(metadata_csv)
-     metadata_matches = helpers.match_metadata_to_files(files_dict.keys(), metadata_ids)
-     file_matches = helpers.match_files_to_metadata(files_dict.keys(), metadata_ids)
+     metadata_matches = helpers.match_metadata_to_files(bitstreams.keys(), metadata_ids)
+     file_matches = helpers.match_files_to_metadata(bitstreams.keys(), metadata_ids)
    no_files = set(metadata_ids) - set(metadata_matches)
-     no_metadata = set(files_dict.keys()) - set(file_matches)
+     no_metadata = set(bitstreams.keys()) - set(file_matches)
    helpers.create_csv_from_list(no_metadata, f"{output_directory}no_metadata")
    helpers.create_csv_from_list(no_files, f"{output_directory}no_files")
    helpers.create_csv_from_list(metadata_matches, f"{output_directory}metadata_matches")
    helpers.update_metadata_csv(
-         metadata_csv, output_directory, metadata_matches, files_dict
+         metadata_csv, output_directory, metadata_matches, bitstreams
    )
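Note on the reconcile outputs: a toy walk-through of the set logic above with invented IDs, to make the four output files concrete (the value shape of bitstreams is a guess; only its keys matter here):

# Invented example data: get_files_from_s3 keys bitstreams by extracted item ID.
bitstreams = {"aaa_001": ["s3://bucket/aaa_001.pdf"], "bbb_002": ["s3://bucket/bbb_002.pdf"]}
metadata_ids = ["aaa_001", "ccc_003"]

metadata_matches = ["aaa_001"]  # match_metadata_to_files: IDs in both CSV and S3 -> metadata_matches.csv
file_matches = ["aaa_001"]  # match_files_to_metadata: file keys with a CSV row
no_files = set(metadata_ids) - set(metadata_matches)  # {"ccc_003"}: CSV rows lacking files -> no_files.csv
no_metadata = set(bitstreams.keys()) - set(file_matches)  # {"bbb_002"}: files lacking metadata -> no_metadata.csv
# updated-<metadata-csv>.csv then keeps only the matched rows ("aaa_001").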