-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathSonicScrewdriver.py
327 lines (259 loc) · 9.25 KB
/
SonicScrewdriver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# SonicScrewdriver.py
# Version January 1, 2015
def addtodict(word, count, lexicon):
    '''Accumulate count under the key word in the dictionary lexicon.

    A key that is not yet present is created with count as its value;
    an existing key has count added to it in place. Returns None.
    '''
    if word not in lexicon:
        # First sighting of this key: start the tally at count.
        lexicon[word] = count
        return
    lexicon[word] += count
def appendtodict(key, value, dictoflists):
    '''Append value to the list stored under key in dictoflists.

    If key is not yet present, a new one-element list is created for it.
    The dictionary is modified in place; nothing is returned.
    '''
    dictoflists.setdefault(key, []).append(value)
def sortkeysbyvalue(lexicon, whethertoreverse = False):
    '''Return the entries of lexicon as a list of (value, key) tuples,
    sorted by value.

    Entries with equal values are ordered by key, because whole tuples are
    compared. Pass whethertoreverse=True for descending order.
    '''
    pairs = [(value, key) for key, value in lexicon.items()]
    pairs.sort(reverse = whethertoreverse)
    return pairs
def sortvaluesbykey(lexicon):
    '''Return the entries of lexicon as a list of (key, value) tuples
    in ascending key order (for example, a dictionary keyed by page
    number comes back in page order).'''
    return sorted(lexicon.items())
def add_dicts(source, target):
    '''Merge source into target in place.

    For every key in source, its value is added (with +=) to the value
    already stored in target, or copied over if target lacks the key.
    Returns None.
    '''
    for key, increment in source.items():
        if key not in target:
            target[key] = increment
            continue
        target[key] += increment
def clean_pairtree(htid):
    '''Convert a "dirty" volume id into its filename-safe "clean" form.

    The id is split at its first period into prefix and postfix. In the
    postfix every ':' becomes '+' and every '/' becomes '='; the prefix
    is left untouched. For example 'loc.ark:/13960/t1' becomes
    'loc.ark+=13960=t1'.

    Both substitutions are always applied, so a postfix that contains
    '/' but no ':' is cleaned as well. An id with no period has no
    postfix and is returned unchanged.
    '''
    prefix, separator, postfix = htid.partition('.')
    if not separator:
        # No period: nothing to clean. (Slicing on find() == -1 would
        # otherwise chop off the final character of the id.)
        return htid
    postfix = postfix.replace(':', '+').replace('/', '=')
    return prefix + "." + postfix
def dirty_pairtree(htid):
    '''Convert a "clean" (filename-safe) volume id back to its "dirty" form.

    The id is split at its first period into prefix and postfix. In the
    postfix every '+' becomes ':' and every '=' becomes '/'; the prefix
    is left untouched. For example 'loc.ark+=13960=t1' becomes
    'loc.ark:/13960/t1'.

    Both substitutions are always applied, so a postfix that contains
    '+' but no '=' is restored as well. An id with no period has no
    postfix and is returned unchanged.
    '''
    prefix, separator, postfix = htid.partition('.')
    if not separator:
        # No period: nothing to restore. (Slicing on find() == -1 would
        # otherwise chop off the final character of the id.)
        return htid
    postfix = postfix.replace('+', ':').replace('=', '/')
    return prefix + "." + postfix
def pairtreepath(htid,rootpath):
    '''Given a HathiTrust volume id, build the pairtree folder path for
    that volume underneath rootpath.

    The id is split at its first period into prefix and postfix. The
    postfix is made filename-safe (':' -> '+', '/' -> '=', '.' -> ',')
    and then chopped into two-character folder names; an odd-length
    postfix ends with a one-character folder.

    rootpath is concatenated as-is, so it should already end with a path
    separator (or be empty). An id without a period is treated as all
    prefix, with an empty postfix.

    Returns a tuple (path, postfix): path is
    rootpath + prefix + '/pairtree_root/' + the two-character folders,
    each followed by '/'. The cleaned postfix is returned separately
    since it can be a folder/filename in its own right.
    '''
    prefix, _, postfix = htid.partition('.')

    # Apply every substitution unconditionally: each is a no-op when its
    # character is absent, and a '/' must be escaped even without a ':'.
    postfix = postfix.replace(':', '+').replace('/', '=').replace('.', ',')

    # Stepping by two handles odd and even lengths alike: the final slice
    # of an odd-length postfix is simply one character long.
    folders = [postfix[start:start + 2] for start in range(0, len(postfix), 2)]
    path = rootpath + prefix + '/pairtree_root/' + ''.join(
        folder + '/' for folder in folders)
    return path, postfix
## REVISED utility
## that reads my standard tab-separated metadata table,
## and returns three data objects: 1) a list of row indexes
## stored in the first column (e.g. volume ids).
## 2) a list of column names, and
## 3) a dictionary-of-dictionaries called table where
## table[columnname][rowindex] = the value of that cell.
## The difference here is that the first column, containing
## row indexes, is also returned as a column of the table.
## In the original version, it stupidly wasn't.
##
## This is equivalent to FileUtils.readtsv2
def readtsv(filepath):
    '''Read a tab-separated table (UTF-8) whose first line is a header.

    Returns a tuple (indices, fieldnames, table):
      indices    -- list of the first-column value of every data row, in
                    file order;
      fieldnames -- list of column names taken from the header;
      table      -- dict of dicts, table[columnname][rowindex] = cell text.
                    The first (index) column is included as a column too.

    If some non-blank line has fewer tab-separated fields than the header,
    the table is truncated to that smaller number of columns. Blank lines
    are skipped and do not affect the column count. A cell missing from a
    row (for instance because trailing tabs were stripped) is stored as
    the empty string. If a row index occurs more than once, later rows
    overwrite earlier ones in table, but the index is listed each time.

    An empty file yields ([], [], {}).
    '''
    with open(filepath, encoding='utf-8') as file:
        filelines = file.readlines()

    if not filelines:
        # Nothing to parse, not even a header.
        return [], [], {}

    fieldnames = filelines[0].rstrip().split('\t')
    numcolumns = len(fieldnames)

    # Narrowest non-blank line. Blank lines are ignored here because a
    # blank line splits into a single field and would otherwise cut the
    # whole table down to its first column.
    widths = [len(line.split('\t')) for line in filelines if line.rstrip()]
    mincols = min(widths, default=numcolumns)

    if mincols < numcolumns:
        numcolumns = mincols
        fieldnames = fieldnames[0:numcolumns]

    table = {name: dict() for name in fieldnames}
    indices = list()

    for line in filelines[1:]:
        line = line.rstrip()
        if not line:
            continue
        fields = line.split('\t')
        rowindex = fields[0]
        indices.append(rowindex)
        for position, column in enumerate(fieldnames):
            # rstrip() above may have removed trailing empty cells.
            if position < len(fields):
                table[column][rowindex] = fields[position]
            else:
                table[column][rowindex] = ""

    return indices, fieldnames, table
def writetsv(columns, rowindices, table, filepath):
    '''Append rows of table to a tab-separated UTF-8 file.

    columns    -- sequence of column names, in output order;
    rowindices -- iterable of row keys, in output order;
    table      -- dict of dicts, table[columnname][rowindex] = cell value;
    filepath   -- file to append to.

    A header line is written only if filepath does not exist yet, so
    repeated calls keep extending the same file. Cell values and column
    names are converted with str(), so non-string values (e.g. integers)
    can be written.

    Returns the number of lines written by this call, counting the
    header when one was written. Raises KeyError if a column or row
    index is missing from table.
    '''
    import os

    # With no columns at all, each "line" is an empty string.
    terminator = '\n' if len(columns) > 0 else ''
    filebuffer = list()

    ## Only create a header if the file does not yet exist.
    if not os.path.exists(filepath):
        filebuffer.append('\t'.join(str(column) for column in columns) + terminator)

    for rowindex in rowindices:
        cells = [str(table[column][rowindex]) for column in columns]
        filebuffer.append('\t'.join(cells) + terminator)

    with open(filepath, mode='a', encoding = 'utf-8') as file:
        file.writelines(filebuffer)

    return len(filebuffer)
def easywritetsv(columns, rowindices, table, filepath):
    '''Append rows of table to a tab-separated UTF-8 file, filling in the
    index column automatically.

    This version does not assume the table contains a dict for the row
    indices: table[columns[0]] is (re)created here, mapping every row
    index to itself. Note that this modifies the caller's table and
    replaces any existing entry stored under columns[0].

    columns    -- non-empty sequence of column names; the first one names
                  the index column;
    rowindices -- row keys, in output order (iterated twice);
    table      -- dict of dicts, table[columnname][rowindex] = cell value;
    filepath   -- file to append to.

    A header line is written only if filepath does not exist yet. Cell
    values and column names are converted with str(), so non-string
    values and row indices can be written.

    Returns the number of lines written by this call, counting the
    header when one was written.
    '''
    import os

    firstcolumn = columns[0]
    table[firstcolumn] = {idx: idx for idx in rowindices}

    filebuffer = list()

    ## Only create a header if the file does not yet exist.
    if not os.path.exists(filepath):
        filebuffer.append('\t'.join(str(column) for column in columns) + '\n')

    for rowindex in rowindices:
        cells = [str(table[column][rowindex]) for column in columns]
        filebuffer.append('\t'.join(cells) + '\n')

    with open(filepath, mode='a', encoding = 'utf-8') as file:
        file.writelines(filebuffer)

    return len(filebuffer)
def pairtreefile(htid):
    '''Turn a dirty htid into a clean one that is safe to use as a
    filename: every ':' becomes '+' and every '/' becomes '='. An id
    containing neither character comes back unchanged.'''
    return htid.replace(':', '+').replace('/', '=')
def pairtreelabel(htid):
    '''Turn a clean htid back into the dirty form that will match the
    metadata table: every '+' becomes ':' and every '=' becomes '/'. An
    id containing neither character comes back unchanged.'''
    return htid.replace('+', ':').replace('=', '/')
def infer_date(datetype, firstdate, seconddate, textdate):
    '''Receives a date type and three dates, as strings, with no guarantee that any
    of the dates will be numeric. The logic of the data here is defined by
    MARC standards for controlfield 008:

    http://www.loc.gov/marc/bibliographic/concise/bd008a.html

    Returns an integer date that represents either a shaky consensus
    about the earliest attested date for this item, or 0, indicating no
    consensus.

    Rules, in order:
      * a textdate containing both "--" and "estimate" gives 0;
      * firstdate is read as an integer; if that fails, a value ending in
        a single 'u' (e.g. '185u') is read as the start of its decade,
        and anything else (including 'uu' endings) counts as 0;
      * seconddate is read as an integer, or 0 if unreadable;
      * if seconddate is more than 25 years after firstdate, the result
        is 0;
      * if datetype is 't' and seconddate is positive and earlier than
        firstdate, seconddate is returned instead.
    '''
    if "--" in textdate and "estimate" in textdate:
        # Because that's something like <estimate="18--?">
        return 0

    try:
        intdate = int(firstdate)
    except (TypeError, ValueError, OverflowError):
        # No readable date.
        intdate = 0
        # Two missing places ('uu') is too many, but one is okay.
        # A non-string firstdate is treated as unreadable instead of
        # crashing on .endswith.
        if (isinstance(firstdate, str) and firstdate.endswith('u')
                and not firstdate.endswith('uu')):
            try:
                intdate = int(firstdate[0:3]) * 10
            except ValueError:
                # something's weird. fail.
                intdate = 0

    try:
        intsecond = int(seconddate)
    except (TypeError, ValueError, OverflowError):
        intsecond = 0

    if intsecond - intdate > 25:
        # A gap of more than twenty-five years is too much.
        # This is usually an estimated date that could be anywhere within
        # the nineteenth century.
        intdate = 0

    if datetype == 't' and intsecond > 0 and intsecond < intdate:
        # This is a case where we have both a publication date and
        # a copyright date. Accept the copyright date. We're going
        # for 'the earliest attested date for the item.'
        intdate = intsecond

    return intdate
def simple_date(row, table):
    '''Infer a date for one row of a column-major metadata table.

    Looks up table[column][row] for the columns "datetype", "startdate",
    "enddate" and "textdate", and returns whatever infer_date makes of
    those four values.
    '''
    return infer_date(
        table["datetype"][row],
        table["startdate"][row],
        table["enddate"][row],
        table["textdate"][row],
    )
def date_row(row):
    '''Infer a date for a single metadata row.

    row is indexed by the keys "datetype", "startdate", "enddate" and
    "imprintdate"; those four values are handed to infer_date and its
    result is returned.
    '''
    return infer_date(
        row["datetype"],
        row["startdate"],
        row["enddate"],
        row["imprintdate"],
    )