Skip to content

Commit e441af8

Browse files
author
Willem Jan Faber
authored
Merge pull request #2 from xiffy/master
python 3 compatible
2 parents 819e3b6 + bc989fb commit e441af8

File tree

9 files changed

+37
-19
lines changed

9 files changed

+37
-19
lines changed

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,3 @@
11
*.pyc
22
__pycache__
3+
build

example.py

File mode changed: 100755 → 100644
+6-7
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@
1111
oai_handler = oai
1212
oai_handler.current_set = "ANP"
1313

14-
response = sru.search("beatrix AND juliana AND Bernhard AND telegram", "ANP")
15-
print "Number of records: %i" % response.sru.nr_of_records
14+
response = sru.search("karel AND reve AND hooftprijs", "ANP")
15+
print ("Number of records: %i" % response.sru.nr_of_records)
1616

1717
record_nr = 0
1818

@@ -26,10 +26,9 @@
2626

2727

2828
oai_handler.DEBUG = True
29-
record = oai_handler.get(record.identifiers[0])
30-
print(record.alto)
31-
for alto in record.alto:
32-
#print("Fulltext: %s" % alto_to_text(alto))
33-
print(alto)
29+
r = oai_handler.get(record.identifiers[0])
30+
for alto in r.alto:
31+
print("Fulltext: %s" % alto_to_text(alto))
32+
#print(alto)
3433

3534
print("*************************************\n\n")

kb/nl/api/oai.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
OAI_BASEURL = 'http://services.kb.nl/mdo/oai'
88

99

10-
class oai():
10+
class oai:
1111

1212
"""
1313
OAI interface to the National Library of the Netherlands.

kb/nl/api/sru.py

+9-5
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import sys
22
import requests
33
import urllib
4+
import pprint
45

56
try:
67
from urllib import quote # Python 2.X
@@ -10,7 +11,7 @@
1011
from kb.nl.collections import SETS
1112
from kb.nl.helpers import etree
1213

13-
SRU_BASEURL = 'http://jsru.kb.nl/sru/sru'
14+
SRU_BASEURL = 'https://jsru.kb.nl/sru/sru'
1415
SRU_BASEURL += '?version=1.2&maximumRecords=%i'
1516
SRU_BASEURL += '&operation=searchRetrieve'
1617
SRU_BASEURL += '&startRecord=%i'
@@ -36,7 +37,7 @@ def records(self):
3637
# TODO: distinguish by xsi:type
3738
@property
3839
def identifiers(self):
39-
baseurl = 'http://resolver.kb.nl/resolve?urn='
40+
baseurl = 'https://resolver.kb.nl/resolve?urn='
4041
result = [r.text.replace(baseurl, '') for r in self.record_data.iter() if
4142
r.tag.endswith('identifier') and r.text.find(':') > -1]
4243
return result
@@ -108,7 +109,7 @@ def __init__(self, record_data, sru):
108109
def __iter__(self):
109110
return self
110111

111-
def next(self):
112+
def __next__(self):
112113
if self.sru.nr_of_records == 0:
113114
raise StopIteration
114115
if self.sru.startrecord < self.sru.nr_of_records + 1:
@@ -118,9 +119,12 @@ def next(self):
118119
else:
119120
raise StopIteration
120121

122+
def next(self):
123+
return self.__next__()
124+
121125

122126
class sru():
123-
DEBUG = False
127+
DEBUG = True
124128

125129
collection = False
126130
maximumrecords = 50
@@ -166,7 +170,7 @@ def run_query(self):
166170
url = SRU_BASEURL % (self.maximumrecords, self.startrecord,
167171
self.recordschema, self.collection, self.query)
168172
if self.DEBUG:
169-
sys.stdout.write(url)
173+
print("run_query: %s" % url)
170174

171175
r = requests.get(url)
172176

kb/nl/collections/__init__.py

+10-4
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,19 @@
1212
'recordschema': 'ddd',
1313
'setname': 'DPO',
1414
'time_period': [1781, 1800]},
15-
'BYVANCK': {'description_en': 'Medieval Illuminated Manuscripts',
15+
'BYVANCK': {'collection': 'MISC',
16+
'description_en': 'Medieval Illuminated Manuscripts',
1617
'description_nl': 'Middeleeuwse Verluchte Handschriften',
1718
'metadataPrefix': 'dcx',
19+
'recordschema': 'dcx',
1820
'setname': 'BYVANCK',
19-
'time_period': [500, 1500]},
20-
'SGD': {'description_en': 'States General Digital',
21+
'time_period': [500, 1500],
22+
'extra_query' : '(dcterms:isPartOf="ByvanckB" OR dcterms:isPartOf="BYVANCK")'},
23+
'SGD': {'collection': 'SGD',
24+
'description_en': 'States General Digital',
2125
'description_nl': 'Staten-Generaal Digitaal',
2226
'metadataPrefix': 'dcx',
27+
'recordschema': 'dcx',
2328
'setname': 'sgd:register',
2429
'time_period': [1962, 1994]},
2530
'GGC': {'collection': 'GGC',
@@ -28,4 +33,5 @@
2833
'metadataPrefix': 'dcx',
2934
'recordschema': 'dcx',
3035
'setname': 'ggc',
31-
'time_period': [1937, 2016]}}
36+
'time_period': [1937, 2016]}
37+
}

kb/nl/helpers/__init__.py

+10-2
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,19 @@
1818
import elementtree.ElementTree as etree
1919
except ImportError:
2020
raise("Failed to import ElementTree from any known place")
21-
21+
import codecs
2222

2323
def alto_to_text(alto_data):
2424
''' Grab the selected text blocks and write them to disk '''
25-
alto_data = etree.fromstring(alto_data.encode('utf-8'))
25+
try:
26+
# some files have a BOM available which does not get stripped on windows
27+
if not alto_data[0] == '<':
28+
alto_data = alto_data[3:]
29+
alto_data = etree.fromstring(alto_data.encode('utf-8'))
30+
31+
except etree.XMLSyntaxError as e:
32+
print ("error! %s" % alto_str[:40])
33+
return None
2634

2735
alto_text = u""
2836
prev_was_hyp = False

test/alto_to_text_test.py

100755 → 100644
File mode changed.

test/oai_test.py

100755 → 100644
File mode changed.

test/sru_test.py

100755 → 100644
File mode changed.

0 commit comments

Comments
 (0)