Skip to content

Commit 14338e6

Browse files
committed
Revert "Change on folder name"
This reverts commit 074473fd8397903d76893b15dafba4e9c9084e14. # Conflicts: # ExternalSources/IN_TNA/BT43-100-178451 Christopher Dresser wallpaper design, 1864-1874.jpeg # ExternalSources/IN_TNA/BT43-100-201373 Christopher Dresser wallpaper design, 1864-1874.jpeg # ExternalSources/IN_TNA/BT43-100-201374 Christopher Dresser wallpaper design, 1864-1874.jpeg # ExternalSources/MD_out/BT43-100-178451.xml # ExternalSources/MD_out/BT43-100-201373.xml # ExternalSources/MD_out/BT43-100-201374.xml # ExternalSources/crawl_TNA.py # ExternalSources/extractMD_TNA.pl # ExternalSources/extractMD_TNA_noMD.pl # ExternalSources/modeleTNA.xml # ExternalSources/modeleTNA_noMD.xml
1 parent 2b8b3ce commit 14338e6

13 files changed

+11619
-14
lines changed
Loading
Loading
Loading

Diff for: Files/MD-BT43-extract.xml

+10,734
Large diffs are not rendered by default.

Diff for: Files/MD_out/BT43-100-178451.xml

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<analyseAlto iiif="false">
2+
<metad>
3+
<type>I</type>
4+
<ID>BT43-100-178451</ID>
5+
<titre>Board of Trade - BT43-100-178451 - #178451</titre>
6+
7+
<editeur>William Cooke</editeur>
8+
<dateEdition>18640913</dateEdition>
9+
<notice>C12852223</notice>
10+
<parent>C439246</parent>
11+
<nbPage>1</nbPage>
12+
<lang>en</lang>
13+
<fichier>BT43-100-178451</fichier>
14+
<source>TNA</source>
15+
<descr>Registered design number: 178451. -- Proprietor: William Cooke. -- Address: Grove Works, Leeds, Yorkshire. -- Subject: Paper hanging. -- Class 5: Paper hangings--</descr>
16+
<sujet>floral -- Paper hanging</sujet>
17+
<format>Paper hangings</format>
18+
</metad>
19+
<contenus>
20+
<largeurPx>1452</largeurPx>
21+
<hauteurPx>1451</hauteurPx>
22+
<pages>
23+
<page ordre="1">
24+
<ills>
25+
<ill couleur="coul" h="1451" n="1-1" w="1452" x="0" y="0">
26+
<genre source="final">gravure</genre>
27+
<contenuImg CS="1.0" lang="en" source="md">Paper hanging</contenuImg>
28+
<contenuImg CS="1.0" lang="en" source="md">Paper hangings</contenuImg>
29+
<contenuImg CS="1.0" lang="en" source="md">floral</contenuImg>
30+
<titraille>Board of Trade - BT43-100-178451 - #178451</titraille>
31+
</ill>
32+
</ills>
33+
</page>
34+
</pages>
35+
</contenus>
36+
</analyseAlto>

Diff for: Files/MD_out/BT43-100-201373.xml

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<analyseAlto iiif="false">
2+
<metad>
3+
<type>I</type>
4+
<ID>BT43-100-201373</ID>
5+
<titre>Board of Trade - BT43-100-201373 - #201373</titre>
6+
7+
<editeur>William Cooke</editeur>
8+
<dateEdition>18660918</dateEdition>
9+
<notice>C12852288</notice>
10+
<parent>C439246</parent>
11+
<nbPage>1</nbPage>
12+
<lang>en</lang>
13+
<fichier>BT43-100-201373</fichier>
14+
<source>TNA</source>
15+
<descr>Registered design number: 201373. -- Proprietor: William Cooke. -- Address: Grove Works, Leeds, Yorkshire. -- Subject: Paper hanging. -- Class 5: Paper hangings--</descr>
16+
<sujet>floral -- Paper hanging</sujet>
17+
<format>Paper hangings</format>
18+
</metad>
19+
<contenus>
20+
<largeurPx>1256</largeurPx>
21+
<hauteurPx>1282</hauteurPx>
22+
<pages>
23+
<page ordre="1">
24+
<ills>
25+
<ill couleur="coul" h="1282" n="1-1" w="1256" x="0" y="0">
26+
<genre source="final">gravure</genre>
27+
<contenuImg CS="1.0" lang="en" source="md">Paper hanging</contenuImg>
28+
<contenuImg CS="1.0" lang="en" source="md">Paper hangings</contenuImg>
29+
<contenuImg CS="1.0" lang="en" source="md">floral</contenuImg>
30+
<titraille>Board of Trade - BT43-100-201373 - #201373</titraille>
31+
</ill>
32+
</ills>
33+
</page>
34+
</pages>
35+
</contenus>
36+
</analyseAlto>

Diff for: Files/MD_out/BT43-100-201374.xml

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<analyseAlto iiif="false">
2+
<metad>
3+
<type>I</type>
4+
<ID>BT43-100-201374</ID>
5+
<titre>Board of Trade - BT43-100-201374 - #201374</titre>
6+
7+
<editeur>William Cooke</editeur>
8+
<dateEdition>18660918</dateEdition>
9+
<notice>C12852289</notice>
10+
<parent>C439246</parent>
11+
<nbPage>1</nbPage>
12+
<lang>en</lang>
13+
<fichier>BT43-100-201374</fichier>
14+
<source>TNA</source>
15+
<descr>Registered design number: 201374. -- Proprietor: William Cooke. -- Address: Grove Works, Leeds, Yorkshire. -- Subject: Paper hanging. -- Class 5: Paper hangings--</descr>
16+
<sujet>floral -- Paper hanging</sujet>
17+
<format>Paper hangings</format>
18+
</metad>
19+
<contenus>
20+
<largeurPx>1497</largeurPx>
21+
<hauteurPx>1356</hauteurPx>
22+
<pages>
23+
<page ordre="1">
24+
<ills>
25+
<ill couleur="coul" h="1356" n="1-1" w="1497" x="0" y="0">
26+
<genre source="final">gravure</genre>
27+
<contenuImg CS="1.0" lang="en" source="md">Paper hanging</contenuImg>
28+
<contenuImg CS="1.0" lang="en" source="md">Paper hangings</contenuImg>
29+
<contenuImg CS="1.0" lang="en" source="md">floral</contenuImg>
30+
<titraille>Board of Trade - BT43-100-201374 - #201374</titraille>
31+
</ill>
32+
</ills>
33+
</page>
34+
</pages>
35+
</contenus>
36+
</analyseAlto>

Diff for: Files/crawl_TNA.py

+187
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# -*- coding: utf-8 -*-
2+
#!/usr/bin/python
3+
4+
# use case: extract metadata from the TNA catalog
5+
# input is a list of series identifiers (see series_list below)
6+
# output is an XML-like data file (not real XML)
7+
8+
import requests; #version 2.18.4, used for connecting to the API
9+
import sys
10+
from time import sleep
11+
from math import log
12+
import os
13+
import locale;
14+
os.environ["PYTHONIOENCODING"] = "utf-8";
15+
myLocale=locale.setlocale(category=locale.LC_ALL, locale="en_GB.UTF-8");
16+
17+
#print(myText.encode('utf-8', errors='ignore'))
18+
19+
#series_list = [[x,1] for x in ['C426','C392','C394','C416','C415','C350','C428','C374','C422','C152']]
20+
#series_list = [[x,0] for x in ['A13530113']] #Richmond
21+
#series_list = [[x,0] for x in ['A13532926']] #Gloucestershire
22+
#series_list = [[x,0] for x in ['A13531661','A13531878']] #East and West Sussex
23+
#series_list = [[x,0] for x in ['A13530781','A13531184','A13532436']] #Cornwall, Devon, Dorset
24+
#series_list = [[x,0] for x in ['A13532757','A13531853','A13532331','A13530620']] #Kent, Somerset, Worcestershire
25+
#series_list = [[x,0] for x in ['A13531317']] #Cumbria
26+
#series_list = [[x,0] for x in ['A13530620']] #Northumberland
27+
#series_list = [[x,0] for x in ['A13531418']] #Surrey
28+
29+
#series_list = [[x,0] for x in ['C3085']] #TNA Designs BT 42
30+
#series_list = [[x,0] for x in ['C3086']] #TNA Designs BT 43
31+
#series_list = [[x,0] for x in ['C439493']] #TNA Designs BT43-348-394008
32+
#series_list = [[x,0] for x in ['C25263']] #TNA Designs BT43/412/216416
33+
#series_list = [[x,0] for x in ['C25261']] #TNA Designs BT43/356
34+
#series_list = [[x,0] for x in ['C439519']] #TNA Designs BT43/373
35+
#series_list = [[x,0] for x in ['C439525']] #TNA Designs BT43/379
36+
#series_list = [[x,0] for x in ['C439508']] #TNA Designs BT43/363
37+
#series_list = [[x,0] for x in ['C439446']] #TNA Designs BT43/300
38+
##series_list = [[x,0] for x in ['C439473']] #TNA Designs BT43/328
39+
#series_list = [[x,0] for x in ['C439491']] #TNA Designs BT43/346
40+
series_list = [[x,0] for x in ['C439493']] #TNA Designs BT43/348
41+
42+
#series_list = [[x,0] for x in ['C440606']] #TNA Designs BT52/143
43+
#series_list = [[x,0] for x in ['C440606']]
44+
#series_list = [[x,0] for x in ['C439598']] #TNA Designs BT50/20
45+
46+
47+
48+
# https://discovery.nationalarchives.gov.uk/browse/r/h/C3085
49+
#series_list = [[x,0] for x in ['C3093']] #TNA Designs BT 50/1
50+
51+
#series_list = [[x,0] for x in ['C11678861']] #TNA Designs BT 50/1
52+
#series_list = [[x,0] for x in ['C439589']] #TNA Designs BT 50/11
53+
#series_list = [[x,1] for x in ['C96','C246','C43','C4']] #TNA
54+
#series_list = [[x,1] for x in ['C256','C148','C18','C64']] #TNA
55+
56+
# Directory that receives one "<id>_level_<n>.txt" file per crawled series.
# The trailing slash matters: output paths are built by plain string
# concatenation with this value.
target_dir = "CATALOGUE/"
if not os.path.isdir(target_dir):
    os.mkdir(target_dir)
59+
60+
max_level = 3
61+
62+
#series_list = ["A13530124", 0]
63+
#myfile = open("stac_dates.txt","w")
64+
#myfile.close()
65+
PAGE_LIMIT = 200
66+
TOTAL_LIMIT = 10000
67+
68+
# myfile = open(series.replace(" ","_") + ".children.txt","w")
69+
# myfile.write("|".join([str(X) for X in [rj["id"],rj["coveringDates"],rj["coveringFromDate"],rj["coveringToDate"],rj["recordOpeningDate"], \
70+
# str(rj["scopeContent"]["description"]).replace("\n"," ").replace(" ", " ").replace("|","~"),rj["closureType"], \
71+
# rj["citableReference"], str(rj["isParent"]),"\n"]]))
72+
field_list = ["id","coveringDates","coveringFromDate","coveringToDate","recordOpeningDate",["scopeContent","description"],"closureType","citableReference","isParent"]
73+
# myfile.close()
74+
75+
# weird code for unicode export
76+
def stuff2String(myStuff):
    """Convert one API field value to the interpreter's native ``str``.

    The result is always a native ``str`` so that callers can chain
    ``.replace(...)`` on it and concatenate it with string literals.

    Args:
        myStuff: value taken from a decoded JSON record (text, integer,
            boolean, ``None``, ...).

    Returns:
        ``"None"`` for ``None``; the value itself when it is already a
        native ``str``; the UTF-8 encoding of the value when it is a
        Python 2 ``unicode`` object; ``str(value)`` for anything else
        (integers, booleans, floats, ...).
    """
    if myStuff is None:
        return "None"
    if isinstance(myStuff, str):
        # Already native text (Python 3 str / Python 2 byte string).
        return myStuff
    encode = getattr(myStuff, "encode", None)
    if encode is not None:
        # Only reached for Python 2 ``unicode``: encode it so that the
        # result can be written to a file opened in plain "w" mode.
        return encode("utf-8")
    # Numbers and booleans: booleans become "True"/"False".
    return str(myStuff)
84+
85+
86+
def _record_to_fields(record, field_list):
    """Flatten one API record into a list of cleaned strings, in field_list order."""
    out_fields = []
    for f in field_list:
        if isinstance(f, list):
            # Nested field given as [outer_key, inner_key]. Only one level of
            # nesting is supported; deeper nesting would need recursion.
            field_value = record[f[0]][f[1]]
        else:
            field_value = record[f]
        out_fields.append(
            stuff2String(field_value)
            .replace("\n", " ")
            .replace("\r", " ")
            # NOTE(review): as written this replaces a space with a space
            # (no-op); possibly a double space was intended -- confirm.
            .replace(" ", " ")
            # "|" is swapped out so values stay safe in pipe-delimited dumps.
            .replace("|", "~")
        )
    return out_fields


def get_series_children(series, field_list, page_limit=100, total_limit=1000):
    """Fetch the child records of ``series`` from the TNA Discovery API.

    Pages through ``/API/records/children/<series>`` and keeps requesting
    while the response reports ``hasMoreAfterLast`` and fewer than
    ``total_limit`` records have been collected. The limit is only checked
    between pages, so the result may exceed it by up to one page.

    Args:
        series: record identifier appended to the endpoint URL.
        field_list: fields to extract from every record; an entry is either
            a key name or a two-item list ``[outer_key, inner_key]``.
        page_limit: value sent as the ``limit`` request parameter.
        total_limit: stop requesting further pages once this many records
            have been retrieved.

    Returns:
        A list with one entry per record; each entry is a list of strings
        in ``field_list`` order with newlines, carriage returns and ``|``
        replaced.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    print("SERIES:", series)
    myparams = {"limit": page_limit, "batchStartMark": "*"}
    headers = {"Accept": "application/json"}  # ask the API for JSON
    url = "https://discovery.nationalarchives.gov.uk/API/records/children/" + series

    out_children = []
    retrieved = 0

    # One session for all pages of this series; closed on exit.
    with requests.Session() as s:
        while True:
            r = s.get(url, headers=headers, params=myparams)
            # Every page is checked, not only the first one.
            r.raise_for_status()
            rjson = r.json()
            assets = rjson["assets"]

            out_children.extend(_record_to_fields(rj, field_list) for rj in assets)
            retrieved += len(assets)
            print("Total records retrieved:", retrieved)

            if myparams["batchStartMark"] == "*":
                # Only reported for the first page.
                print("More:", rjson["hasMoreAfterLast"])

            # An empty page gives no new mark to continue from, so stop
            # instead of re-requesting the same batch forever.
            if not assets or not rjson["hasMoreAfterLast"] or retrieved >= total_limit:
                break

            sleep(4)  # pause between page requests
            # The next page starts after the sortKey of the last record seen.
            myparams["batchStartMark"] = assets[-1]["sortKey"]
            print("********Params:", myparams, "*************")

    print("Total records retrieved:", retrieved)
    return out_children
141+
142+
143+
# Breadth-first crawl: series_list is used as a FIFO queue of [record_id, level]
# pairs. Each processed record gets its own output file, and children flagged
# as parents are queued for the next level until max_level is reached.
while series_list:
    parent, level = series_list[0]
    series_list = series_list[1:]

    # Fetch before opening the output file so that a failed request does not
    # leave an empty file behind.
    children = get_series_children(parent, field_list, PAGE_LIMIT, TOTAL_LIMIT)

    # "with" guarantees the file is closed even if a write fails.
    with open(target_dir + parent + "_level_" + str(level) + ".txt", "w") as out_file:
        for child in children:
            out_file.write("<parent>" + parent + "</parent>")
            for field, x in zip(field_list, child):
                if field == ["scopeContent", "description"]:
                    # Strip the markup wrapping the description; a closing
                    # paragraph tag becomes a "--" separator.
                    x = (
                        x.replace('<p>', '')
                        .replace('</p>', '--')
                        .replace('<scopecontent>', '')
                        .replace('</scopecontent>', '')
                    )
                    out_file.write("<description>" + x + "</description>")
                else:
                    out_file.write("<" + str(field) + ">" + x + "</" + str(field) + ">")
            out_file.write("\n")

            # The last field is "isParent" (see field_list), exported as the
            # string "True"/"False"; child[0] is the record id.
            if child[-1] == "True" and level + 1 <= max_level:
                series_list.append([child[0], level + 1])
169+
170+
171+
#series_file = open("second_level_series.txt","r")
172+
#out_file = open("third_level_series.txt","w")
173+
#for row in series_file:
174+
# parent = row.split("|")[1]
175+
# top_children = get_series_children(parent,field_list,500,25000)
176+
# print("Top level:",len(top_children))
177+
# for top in top_children:
178+
# #print(top)
179+
# out_file.write("|".join([parent] + [str(x).replace(r'\r',' ') for x in top]))
180+
# out_file.write("\n")
181+
# #second_level = get_series_children(top[0],field_list,100,1000)
182+
#for sl in second_level:
183+
#out_file.write("|".join(top))
184+
# out_file.write("|".join([top[0]]+[str(x) for x in sl]))
185+
# out_file.write("\n")
186+
187+
#out_file.close()

0 commit comments

Comments
 (0)