-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrescarta-to-dspace-saf.pl
272 lines (251 loc) · 12.6 KB
/
rescarta-to-dspace-saf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/perl
use strict;
use warnings;
use File::Copy;
use File::Find;
use File::Path;
if($^O =~ /mswin32/i){
use Win32API::File::Time qw{:win}
}
use XML::XPath;
use XML::XPath::XMLParser;
print '
Rescarta to DSpace conversion.
This script implements a straightforward and feature-incomplete batch
conversion of Rescarta items to DSpace items. It was created to meet
the immediate needs of my library, not to meet all needs of all libraries.
Still, if you have tried this script and found it wanting, please email me
Jason Sherman <[email protected]>
and let me know. If you include a description of the issue and attach the
Rescarta metadata that\'s giving you trouble, I\'ll see what I can do.
';
print "\nEnter absolute path of the collection:\n";
print '
eg.
Windows: C:\Users\Archive\RCDATA01
or
Linux/Unix: /home/Archive/RCDATA01
';
my $inputpath = <STDIN>;
chomp $inputpath;
print "\nEnter name the destination directory:\n";
my $destdir = <STDIN>;
chomp $destdir;
my $inputmetadata = $inputpath . '/metadata.xml';
my @SAFBatches;
print "\nConverting...\n";
# create an object to parse the file and field XPath queries
my $xpath = XML::XPath->new(filename=>$inputmetadata);
# apply the path from the command line and get back a list matches
my $collectionset = $xpath->find('/mets/structMap[@TYPE="LOGICAL"]/div[@TYPE="collections"]/div[@TYPE="collection"]');
## Each collection will get its own DSpace SAF Batch folder.
my $c = -1; # Start a collection array counter
foreach my $collection ( $collectionset->get_nodelist ) {
$c++; # Not just a language...
my $collectionID;
my $titlepath;
$collectionID = $collection->getAttribute("DMDID");
$titlepath = '/mets/dmdSec[@ID="'.$collectionID.'"]/mdWrap/xmlData/mods:mods/mods:titleInfo/mods:title';
my $collectiontitle = $xpath->findvalue($titlepath);
# print each collection node in the list
my $SAFBatchTitle = $collectiontitle. '_' . $collectionID;
#print "$SAFBatchTitle\n";
push @SAFBatches, [$SAFBatchTitle];
## Loop through each item in the collection and scoop up metadata
foreach my $item ($collection->findnodes('div')) {
my $itemID = $item->getAttribute("DMDID");
my $itemType = $item->getAttribute("TYPE");
my $itemPath = '/mets/dmdSec[@ID="'.$itemID.'"]/mdWrap/xmlData/mods:mods';
my $itemTitle = $xpath->findvalue($itemPath.'/mods:titleInfo[@type!=\'alternative\']/mods:title');
my $itemAltTitle = $xpath->findvalue($itemPath.'/mods:titleInfo[@type=\'alternative\']/mods:title');
my $itemAuthor = $xpath->findvalue($itemPath.'/mods:name/mods:role[mods:roleTerm=\'aut\']/../mods:namePart');
my $itemEditor = $xpath->findvalue($itemPath.'/mods:name/mods:role[mods:roleTerm=\'edt\']/../mods:namePart');
my $itemPub = $xpath->findvalue($itemPath.'/mods:originInfo/mods:publisher');
my $itemDate = $xpath->findvalue($itemPath.'/mods:originInfo/mods:dateIssued');
my $itemExtent = $xpath->findvalue($itemPath.'/mods:physicalDescription/mods:extent');
my $itemStartPage = $xpath->findvalue($itemPath.'/mods:part/mods:extent/mods:start');
my $itemEndPage = $xpath->findvalue($itemPath.'/mods:part/mods:extent/mods:end');
my $itemAbstract = $xpath->getNodeText($itemPath.'/mods:abstract');
my $itemURL = $xpath->getNodeText($itemPath.'/mods:location/mods:url');
my @itemSubjectNodes = $xpath->findnodes($itemPath.'/mods:subject/mods:topic');
my @geoSubjectNodes = $xpath->findnodes($itemPath.'/mods:subject/mods:hierarchicalGeographic/*');
## Loop through each subject in the item
my @itemSubjects;
foreach my $itemSubjectNode (@itemSubjectNodes){
my $itemSubject = $itemSubjectNode->findvalue('.');
## If it's a hierarchical subject, split it up.
my @itemSplitSubject;
if (index($itemSubject,' -- ') ne -1) {
@itemSplitSubject = split(' -- ',$itemSubject);
foreach my $itemSplitSubject (@itemSplitSubject) {
#print "\t$itemSplitSubject\n";
push @itemSubjects, $itemSplitSubject;
}
## Otherwise, take it as is.
} else {
#print "\t$itemSubject\n";
push @itemSubjects, $itemSubject;
}
}
## Loop through each hierarchical geographic subject in the item
my @geoSubjects;
foreach my $geoSubjectNode (@geoSubjectNodes){
my $geoSubjectAttr = substr($geoSubjectNode->getName('.'),5);
my $geoSubjectVal = $geoSubjectNode->findvalue('.');
if ($geoSubjectVal) {
my $geoSubject = $geoSubjectVal .' ('. $geoSubjectAttr .')';
push @geoSubjects, $geoSubject;
}
}
## Check these data strutures....
## maybe the best route is to have a hash for each item, with an array for the subjects key?
push @{SAFBatches[$c]},[
[$itemID, $itemType, $itemPath, $itemTitle, $itemAltTitle, $itemAuthor, $itemEditor, $itemPub, $itemDate, $itemExtent, $itemStartPage, $itemEndPage, $itemAbstract, $itemURL],
[@itemSubjects],
[@geoSubjects]
];
}
}
for my $SAFBatch (@SAFBatches) {
my $batchDir = $SAFBatch->[0];
print "$batchDir\n";
for my $i ( 1 .. @{$SAFBatch}) { # iterate over each item in the batch. Index 0 refers to the batch id, so start with 1
## General item info lives at [0]; Subjects live at [1]; geoSubjects live at [2];
if (@$SAFBatch->[$i]) {
my $item = @$SAFBatch->[$i][0];
my @subjectAoA = @$SAFBatch->[$i][1];
my @geoSubjectAoA = @$SAFBatch->[$i][2];
my $itemID = &normalizeTitle($item->[0]);
my $itemType = &normalizeTitle($item->[1]);
my $itemPath = $item->[2];
my $itemTitle = &normalizeTitle($item->[3]);
my $itemAltTitle = &normalizeTitle($item->[4]);
## All of the contributors should probably be broken out into an array in case there are multiples of the same type.
my $itemAuthor = &normalizeTitle($item->[5]);
my $itemEditor = &normalizeTitle($item->[6]);
my $itemPub = &normalizeTitle($item->[7]);
my $itemDate = $item->[8];
my $itemExtent = $item->[9];
my $itemStartPage = $item->[10];
my $itemEndPage = $item->[11];
my $itemAbstract = &normalizeTitle($item->[12]);
my $itemURL = $item->[13];
## setup for the various directories and filenames
my $itemdir = $destdir . '/' . $batchDir . '/' . "item_" . $i . '/';
my $contentsfile = $itemdir.'/contents';
my $dublincorefile = $itemdir.'/dublin_core.xml';
my $perItemMetadata = $inputpath . '/' . $itemURL . '/metadata.xml';
## create the item directories if they aren't there already
if (! -e $itemdir){
mkpath($itemdir) or die "mkdir Failed.";
}
## Each Rescarta item has its own metadata file with the bitstreams listed in the desired order
## We'll check them for each item, and then print them to the contents file
my @contents = &attachBitstreams($perItemMetadata);
my $cmd;
#We'll see if we can send off any images to be put together by ABBY Reader
if($^O =~ /mswin32/i){
chdir($inputpath . '/' . $itemURL . '/') or die "$!";
## On Windows, we use ABBYY FineReader to create OCRed PDF/A documents for each item. This just saves a step.
$cmd = "\"C:/Program Files (x86)/ABBYY FineReader 11/FineCmd.exe\"";
}
open (CNTNTS, ">$contentsfile");
foreach my $bitstream (@contents) {
print CNTNTS "$bitstream\n";
my $oldbitstreampath = $inputpath . '/' . $itemURL . '/' . $bitstream;
my $newbitstreampath = $itemdir . '/' . $bitstream;
copy($oldbitstreampath,$newbitstreampath) or die "Copy of $oldbitstreampath to $itemdir Failed.";
## Copy over original mac times if possible.
if($^O =~ /mswin32/i){
(my $atime,my $mtime,my $ctime) = GetFileTime($oldbitstreampath);
SetFileTime ($newbitstreampath,$atime,$mtime,$ctime);
unless($bitstream eq 'metadata.xml'){
$cmd .= " $bitstream";
}
}
}
close CNTNTS;
## Uncomment the lines below if you actually want to send the items to ABBYYFineReader
## You'll have to manually save each PDF and add it to the DSpace item.
#print "Calling ABBYY FineReader 11...\n";
#system($cmd);
## write a bare-bones metadata file
## I should really be writing this out with one of the many excellent XML modules available for perl,
## but it's just so dang simple.
open (DBLNCRXML, ">$dublincorefile");
print DBLNCRXML "<dublin_core>\n";
print DBLNCRXML '<dcvalue qualifier="none" element="title">'.$itemTitle."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="alternative" element="title">'.$itemAltTitle."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="author" element="contributor">'.$itemAuthor."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="editor" element="contributor">'.$itemEditor."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="issued" element="date">'.$itemDate."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="none" element="publisher">'.$itemPub."</dcvalue>\n";
if ($itemType =~ /Serial|Monograph/i) {
print DBLNCRXML '<dcvalue qualifier="none" element="type">Text</dcvalue>'."\n";
}
print DBLNCRXML '<dcvalue qualifier="none" element="type">'.$itemType."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="extent" element="format">'.$itemExtent."</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="extent" element="format">Pages: '. "$itemStartPage-$itemEndPage</dcvalue>\n";
print DBLNCRXML '<dcvalue qualifier="abstract" element="description">'.$itemAbstract."</dcvalue>\n";
## If we have any Geo subjects, add them to the file.
if (scalar @geoSubjectAoA > 0) {
## Again we're de-duping here.
my %geosubseen = ();
foreach my $geoSubjectArray (@geoSubjectAoA) {
foreach my $geoSubject (@{$geoSubjectArray}) {
print DBLNCRXML '<dcvalue qualifier="spatial" element="coverage">'.$geoSubject."</dcvalue>\n" unless $geosubseen{$geoSubject}++;
}
}
}
## If we have any regular subjects, add them to the file.
if (scalar @subjectAoA > 0) {
## Again we're de-duping here.
my %subseen = ();
foreach my $subjectArray (@subjectAoA) {
foreach my $subject (@{$subjectArray}) {
$subject = &normalizeTitle($subject);
print DBLNCRXML '<dcvalue qualifier="none" element="subject">'.$subject."</dcvalue>\n" unless $subseen{$subject}++;
}
}
}
print DBLNCRXML '</dublin_core>';
close DBLNCRXML;
print "Item $i complete\n";
}
}
}
sub attachBitstreams {
my @contents;
my $perItemMetadata = $_[0];
## Look into each item's metadata.xml file to get the correct order of files.
## Create an object to parse the file and field XPath queries
my $itemXpath = XML::XPath->new(filename=>$perItemMetadata);
## Each filegroup will contain a set of files.
my $fileGrp = $itemXpath->find('/mets:mets/mets:fileSec/mets:fileGrp/mets:file/mets:FLocat');
foreach my $fileNode ( $fileGrp->get_nodelist ) { ## Loop through those
my $xlinkHref = $fileNode->getAttribute('xlink:href'); ## Get the attribute with the path
my $bitstream;
if ($xlinkHref =~ /([^\/]+)$/) { $bitstream = $1; } ## Cut it down to just the filename
push @contents, $bitstream;
}
## We'll even throw in the per-item metadata file into the SAF in case it's useful later.
push @contents, 'metadata.xml';
return @contents;
}
## I pulled this straight out of another script I wrote for scraping fileshares.
## I left out some of the stuff that was grossly inappropriate for this use,
## but it may need to be changed to suit your purpose.
sub normalizeTitle {
# create a natural language title based off the filename
my $title = $_[0];
$title =~ s/&/and/g; # replace ampersand with "and".
$title =~ s/^[ _\.]+|[ _\.]+$//g; # strip leading and trailing underscores, dots, & spaces
$title =~ s/( )(?= *?\1)//g; # collapse double spaces
## Some Capitalization normalization
$title =~ s/^([A-Z ]*)$/\F$1/; # If the title is ALL CAPS, Convert It To Capitalized
$title =~ s/(?<![A-Za-z])(At|In|As|And|Or|The|Towards|To|For|From|On|Of)(?![A-Za-z])/\L$1/g;# De-capitalize any article
$title =~ s/\'([A-Z])/\L\'$1/g; # De-capitalize any letter after an apostrophe
$title =~ s/^([a-z])/\U$1/; # Capitalize anything at the start
$title =~ s/\b(usao|ocla|ocw|oiic)\b/\U$1/gi; # Capitalize our institution abbreviations
return $title;
}