
Commit 4aa5182

ehanson8 and hakbailey authored
major refactor (#19)
* major refactor
* Restructure code examples
* PR updates
* Code review fixes

Co-authored-by: Helen Bailey <[email protected]>
1 parent c92cde0 commit 4aa5182

18 files changed: +860 −635 lines changed

.gitignore (+3)

@@ -178,6 +178,9 @@ local/*
 !data/.keep
 .profile
 *.csv
+!tests/fixtures/*.csv
 *.json
+!config/*.json
+!tests/fixtures/*.json
 createItemMetadataFromCSV_*
 *.txt

Pipfile.lock (+122 −121)

Some generated files are not rendered by default.

config/aspace_mapping.json (+17)

@@ -0,0 +1,17 @@
+{
+    "file_identifier": {
+        "csv_field_name": "file_identifier",
+        "language": null,
+        "delimiter": ""
+    },
+    "dc.title": {
+        "csv_field_name": "title",
+        "language": "en_US",
+        "delimiter": ""
+    },
+    "dc.relation.isversionof": {
+        "csv_field_name": "uri",
+        "language": null,
+        "delimiter": ""
+    }
+}
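
Each entry in this field map pairs a DSpace metadata field with the CSV column it is populated from, plus an optional language tag and delimiter for multi-valued fields. The sketch below is illustrative only, not the package's Collection.from_csv implementation; the map_row helper and sample row are hypothetical and exist just to show how one entry could be applied.

```python
# Illustrative sketch: a hypothetical map_row() helper showing how a field map
# like config/aspace_mapping.json could turn one CSV row into DSpace metadata
# entries. The real translation lives in dsaps.models (not shown in this diff).
import json


def map_row(row, mapping):
    """Build DSpace metadata entries from one CSV row using a field map."""
    metadata = []
    for dspace_field, rules in mapping.items():
        value = row.get(rules['csv_field_name'], '')
        if not value:
            continue
        # Split multi-valued fields only when a delimiter is configured.
        values = value.split(rules['delimiter']) if rules['delimiter'] else [value]
        for v in values:
            metadata.append({'key': dspace_field, 'value': v,
                             'language': rules['language']})
    return metadata


with open('config/aspace_mapping.json') as jsonfile:
    mapping = json.load(jsonfile)

# Hypothetical CSV row matching the csv_field_name values above.
row = {'file_identifier': 'thesis_001', 'title': 'A Sample Title',
       'uri': '/repositories/2/archival_objects/1'}
print(map_row(row, mapping))
```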

dsaps/cli.py (+79 −93)

@@ -1,6 +1,5 @@
 import csv
 import datetime
-import glob
 import json
 import logging
 import os
@@ -9,18 +8,18 @@
 import click
 import structlog

-from dsaps import models, workflows
+from dsaps.models import Client, Collection
+from dsaps import helpers

 logger = structlog.get_logger()


-@click.group()
-@click.option('--url', envvar='DSPACE_URL')
-@click.option('-e', '--email', prompt='Enter email',
+@click.group(chain=True)
+@click.option('--url', envvar='DSPACE_URL', required=True,)
+@click.option('-e', '--email', envvar='TEST_EMAIL', required=True,
               help='The email of the user for authentication.')
-@click.option('-p', '--password', prompt='Enter password',
-              envvar='TEST_PASS', hide_input=True,
-              help='The password for authentication.')
+@click.option('-p', '--password', envvar='TEST_PASS', required=True,
+              hide_input=True, help='The password for authentication.')
 @click.pass_context
 def main(ctx, url, email, password):
     ctx.obj = {}
@@ -42,106 +41,93 @@ def main(ctx, url, email, password):
                                                       'w')],
                         level=logging.INFO)
     logger.info('Application start')
-    client = models.Client(url)
+    client = Client(url)
     client.authenticate(email, password)
     start_time = time.time()
     ctx.obj['client'] = client
     ctx.obj['start_time'] = start_time
+    ctx.obj['log_suffix'] = log_suffix


 @main.command()
-@click.option('-c', '--comm_handle', prompt='Enter the community handle',
-              help='The handle of the community in which to create the ,'
-                   'collection.')
-@click.option('-n', '--coll_name', prompt='Enter the name of the collection',
-              help='The name of the collection to be created.')
-@click.option('-m', '--metadata', prompt='Enter the path of the metadata file',
-              help='The path of the JSON file of metadata.')
-@click.option('-f', '--file_path', prompt='Enter the path',
-              help='The path of the content, a URL or local drive path.')
-@click.option('-t', '--file_type', prompt='Enter the file type',
-              help='The file type to be uploaded.')
-@click.option('-i', '--ingest_type', prompt='Enter the type of ingest',
-              help='The type of ingest to perform: local, remote.',
-              type=click.Choice(['local', 'remote']))
+@click.option('-m', '--metadata-csv', required=True,
+              type=click.Path(exists=True),
+              help='The full path to the CSV file of metadata for the items.')
+@click.option('--field-map', required=True, type=click.Path(exists=True),
+              help='Path to JSON field mapping file')
+@click.option('-d', '--directory', required=True,
+              help='The full path to the content, either a directory of files '
+                   'or a URL for the storage location.')
+@click.option('-t', '--file-type',
+              help='The file type to be uploaded, if limited to one file '
                   'type.', default='*')
+@click.option('-r', '--ingest-report', is_flag=True,
+              help='Create ingest report for updating other systems.')
+@click.option('-c', '--collection-handle',
+              help='The handle of the collection to which items are being '
                   'added.', default=None)
 @click.pass_context
-def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type,
-            ingest_type):
+def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report,
+             collection_handle):
+    """Adds items to a specified collection from a metadata CSV, a field
+    mapping file, and a directory of files. May be run in conjunction with the
+    newcollection CLI commands."""
     client = ctx.obj['client']
     start_time = ctx.obj['start_time']
-    with open(metadata, encoding='UTF-8') as fp:
-        coll_metadata = json.load(fp)
-    coll_id = client.post_coll_to_comm(comm_handle, coll_name)
-    file_dict = {}
-    if ingest_type == 'local':
-        files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
-        for file in files:
-            file_name = os.path.splitext(os.path.basename(file))[0]
-            file_dict[file_name] = file
-    elif ingest_type == 'remote':
-        file_dict = models.build_file_dict_remote(file_path, file_type,
-                                                  file_dict)
-    items = client.post_items_to_coll(coll_id, coll_metadata, file_dict,
-                                      ingest_type)
-    for item in items:
-        logger.info(f'Item posted: {item}')
-    models.elapsed_time(start_time, 'Total runtime:')
+    if 'collection_uuid' not in ctx.obj and collection_handle is None:
+        raise click.UsageError('collection_handle option must be used or '
+                               'additems must be run after newcollection '
+                               'command.')
+    elif 'collection_uuid' in ctx.obj:
+        collection_uuid = ctx.obj['collection_uuid']
+    else:
+        collection_uuid = client.get_uuid_from_handle(collection_handle)
+    with open(metadata_csv, 'r') as csvfile, open(field_map, 'r') as jsonfile:
+        metadata = csv.DictReader(csvfile)
+        mapping = json.load(jsonfile)
+        collection = Collection.from_csv(metadata, mapping)
+    for item in collection.items:
+        item.bitstreams_from_directory(directory, file_type)
+    collection.uuid = collection_uuid
+    items = collection.post_items(client)
+    if ingest_report:
+        report_name = metadata_csv.replace('.csv', '-ingest.csv')
+        helpers.create_ingest_report(items, report_name)
+    elapsed_time = datetime.timedelta(seconds=time.time() - start_time)
+    logger.info(f'Total runtime : {elapsed_time}')


 @main.command()
-@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
-              help='The path of the CSV file of metadata.')
-@click.option('-o', '--output_path', prompt='Enter the output path',
-              default='', help='The path of the output files, include '
-              '/ at the end of the path')
-@click.option('-f', '--file_path', prompt='Enter the path',
-              help='The path of the content, a URL or local drive path.'
-              'Include / at the end of a local drive path.')
-@click.option('-t', '--file_type', prompt='Enter the file type',
-              help='The file type to be uploaded.')
-def reconcile(metadata_csv, file_path, file_type, output_path):
-    workflows.reconcile_files_and_metadata(metadata_csv, output_path,
-                                           file_path, file_type)
+@click.option('-c', '--community-handle', required=True,
+              help='The handle of the community in which to create the ,'
+                   'collection.')
+@click.option('-n', '--collection-name', required=True,
+              help='The name of the collection to be created.')
+@click.pass_context
+def newcollection(ctx, community_handle, collection_name):
+    """Posts a new collection to a specified community. Used in conjunction
+    with the additems CLI command to populate the new collection with
+    items."""
+    client = ctx.obj['client']
+    collection_uuid = client.post_coll_to_comm(community_handle,
+                                               collection_name)
+    ctx.obj['collection_uuid'] = collection_uuid


-@main.command()
-@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
-              help='The path of the CSV file of metadata.')
-def metadatajson(metadata_csv):
-    with open(metadata_csv) as csvfile:
-        reader = csv.DictReader(csvfile)
-    metadata_group = []
-    mapping_dict = {'fileIdentifier': ['file_identifier'],
-                    'dc.contributor.author': ['author name - direct'],
-                    'dc.contributor.advisor': ['supervisor(s)'],
-                    'dc.date.issued': ['pub date'],
-                    'dc.description.abstract': ['Abstract', 'en_US'],
-                    'dc.title': ['Title', 'en_US'],
-                    'dc.relation.ispartofseries': ['file_identifier']}
-    for row in reader:
-        metadata_rec = []
-        metadata_rec = models.create_metadata_rec(mapping_dict, row,
-                                                  metadata_rec)
-        metadata_rec.append({'key': 'dc.format.mimetype', 'language':
-                             'en_US', 'value': 'application/pdf'})
-        metadata_rec.append({'key': 'dc.language.iso', 'language':
-                             'en_US', 'value': 'en_US'})
-        metadata_rec.append({'key': 'dc.publisher', 'language': 'en_US',
-                             'value': 'Massachusetts Institute of '
-                                      'Technology. Laboratory for Computer'
-                                      'Science'})
-        metadata_rec.append({'key': 'dc.rights', 'language': 'en_US',
-                             'value': 'Educational use permitted'})
-        metadata_rec.append({'key': 'dc.rights.uri', 'language': 'en_US',
-                             'value': 'http://rightsstatements.org/vocab/'
-                                      'InC-EDU/1.0/'})
-        metadata_rec.append({'key': 'dc.type', 'language': 'en_US',
-                             'value': 'Technical Report'})
-        item = {'metadata': metadata_rec}
-        metadata_group.append(item)
-    file_name = os.path.splitext(os.path.basename(metadata_csv))[0]
-    with open(f'{file_name}.json', 'w') as f:
-        json.dump(metadata_group, f)
+# @main.command()
+# @click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
+#               help='The path of the CSV file of metadata.')
+# @click.option('-o', '--output_path', prompt='Enter the output path',
+#               default='', help='The path of the output files, include '
+#               '/ at the end of the path')
+# @click.option('-f', '--file_path', prompt='Enter the path',
+#               help='The path of the content, a URL or local drive path.'
+#               'Include / at the end of a local drive path.')
+# @click.option('-t', '--file_type', prompt='Enter the file type',
+#               help='The file type to be uploaded.')
+# def reconcile(metadata_csv, file_path, file_type, output_path):
+#     workflows.reconcile_files_and_metadata(metadata_csv, output_path,
+#                                            file_path, file_type)


 if __name__ == '__main__':
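
Because the group is declared with chain=True, newcollection and additems can run in a single invocation, with the new collection's UUID handed from one command to the next via ctx.obj. Below is a minimal sketch of such a chained run, assuming a reachable DSpace REST API, valid credentials, and placeholder handles and file paths.

```python
# Sketch of a chained invocation via click's test runner; the URL, credentials,
# handle, and paths below are placeholders, and a live DSpace instance is assumed.
from click.testing import CliRunner

from dsaps.cli import main

runner = CliRunner()
result = runner.invoke(main, [
    '--url', 'https://dspace.example.edu/rest',
    '-e', 'user@example.edu',
    '-p', 'secret',
    'newcollection',
    '-c', '1721.1/100000',          # community handle (placeholder)
    '-n', 'Test Collection',
    'additems',
    '-m', 'data/items.csv',         # metadata CSV (placeholder)
    '--field-map', 'config/aspace_mapping.json',
    '-d', 'data/files',             # directory of content files (placeholder)
    '-t', 'pdf',
])
print(result.output)
```

Run on its own against an existing collection, additems would instead take -c/--collection-handle, which the command resolves to a UUID with client.get_uuid_from_handle.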

dsaps/workflows.py → dsaps/helpers.py (+31 −29)

@@ -2,30 +2,49 @@
 import glob
 import os

-from dsaps import models
+import structlog
+
+
+logger = structlog.get_logger()
+
+
+def create_csv_from_list(list_name, output):
+    """Creates CSV file from list content."""
+    with open(f'{output}.csv', 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['id'])
+        for item in list_name:
+            writer.writerow([item])


 def create_file_dict(file_path, file_type):
     """Creates a dict of file IDs and file paths."""
-    if file_path.startswith('http'):
-        file_dict = models.build_file_dict_remote(file_path, file_type, {})
-    else:
-        files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
-        file_dict = {}
-        for file in files:
-            file_name = os.path.splitext(os.path.basename(file))[0]
-            file_dict[file_name] = file
+    files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
+    file_dict = {}
+    for file in files:
+        file_name = os.path.splitext(os.path.basename(file))[0]
+        file_dict[file_name] = file
     return file_dict


+def create_ingest_report(items, file_name):
+    """Creates ingest report of other systems' identifiers with a newly created
+    DSpace handle."""
+    with open(f'{file_name}', 'w') as writecsv:
+        writer = csv.writer(writecsv)
+        writer.writerow(['uri', 'link'])
+        for item in items:
+            writer.writerow([item.source_system_identifier]
+                            + [f'https://hdl.handle.net/{item.handle}'])
+
+
 def create_metadata_id_list(metadata_csv):
     """Creates a list of IDs from a metadata CSV"""
     metadata_ids = []
     with open(metadata_csv) as csvfile:
         reader = csv.DictReader(csvfile)
-        for row in reader:
-            value = row['file_identifier']
-            metadata_ids.append(value)
+        for row in [r for r in reader if r['file_identifier'] != '']:
+            metadata_ids.append(row['file_identifier'])
     return metadata_ids


@@ -49,23 +68,6 @@ def match_metadata_to_files(file_dict, metadata_ids):
     return metadata_matches


-def reconcile_files_and_metadata(metadata_csv, output_path, file_path,
-                                 file_type):
-    """Runs a reconciliation of files and metadata."""
-    file_dict = create_file_dict(file_path, file_type)
-    file_ids = file_dict.keys()
-    metadata_ids = create_metadata_id_list(metadata_csv)
-    metadata_matches = match_metadata_to_files(file_dict, metadata_ids)
-    file_matches = match_files_to_metadata(file_dict, metadata_ids)
-    no_files = set(metadata_ids) - set(metadata_matches)
-    no_metadata = set(file_ids) - set(file_matches)
-    models.create_csv_from_list(no_metadata, f'{output_path}no_metadata')
-    models.create_csv_from_list(no_files, f'{output_path}no_files')
-    models.create_csv_from_list(metadata_matches,
-                                f'{output_path}metadata_matches')
-    update_metadata_csv(metadata_csv, output_path, metadata_matches)
-
-
 def update_metadata_csv(metadata_csv, output_path, metadata_matches):
     """Creates an updated CSV of metadata records with matching files."""
     with open(metadata_csv) as csvfile:
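
The reconcile_files_and_metadata workflow was removed, but its pieces survive as standalone helpers. A rough sketch of how a caller could combine them, assuming hypothetical paths and a stand-in object for posted items (dsaps.models.Item is not part of this diff):

```python
# Rough sketch combining the helpers; paths are placeholders, and SimpleNamespace
# stands in for posted item objects, which expose source_system_identifier and
# handle attributes in the real workflow.
from types import SimpleNamespace

from dsaps import helpers

file_dict = helpers.create_file_dict('data/files', 'pdf')          # placeholder path
metadata_ids = helpers.create_metadata_id_list('data/items.csv')   # placeholder CSV
metadata_matches = helpers.match_metadata_to_files(file_dict, metadata_ids)
no_files = set(metadata_ids) - set(metadata_matches)
helpers.create_csv_from_list(no_files, 'no_files')                 # writes no_files.csv

# After posting, build the ingest report from each item's source identifier and
# newly minted DSpace handle.
items = [SimpleNamespace(source_system_identifier='/repositories/2/archival_objects/1',
                         handle='1721.1/123456')]
helpers.create_ingest_report(items, 'items-ingest.csv')
```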
