-
Notifications
You must be signed in to change notification settings - Fork 2
/
LanguageAnalysis.py
208 lines (168 loc) · 8.06 KB
/
LanguageAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# coding: utf-8
# # Creating a manuscript catalogue
#
# In this proof of concept, we are building an online catalogue of French manuscripts.
import pandas as pd
from collections import defaultdict
from glob import glob
from pathlib import Path
import chardet
import roman
# Going from folio side ID to an ordinal page number helps calculate the number of pages.
#
# - 1r = 1
# - 1v = 2
# - 2r = 3
# - 2v = 4
#
# etc. So multiply by 2 and subtract 1 if there's an `r` in the folio side ID.
# But these numbers do not account for sides with two or more texts – those sides count as 1 for each text.
#
# Instead, we can calculate how much of each side that is the start or end for a text should count for each language.
# Let's assume all texts on a side take equal parts of the side. Then if a side has two (parts of) English texts
# and one Latin text, the side counts as $1/3$ for each text.
#
# We need to create an index for all sides that contain the start and/or end of a text.
def folio_side_to_ordinal(folio, recto_verso):
"""Calculate the ordinal page number from folio number and recto/verso indication"""
was_roman = False
if type(folio) == str:
try:
# print("Roman!")
folio = roman.fromRoman(folio.upper())
was_roman = True
except:
folio = int(folio)
if folio < 1:
raise NotImplementedError()
if pd.isna(recto_verso):
o = folio
elif recto_verso == 'v':
o = folio * 2
elif recto_verso == 'r':
o = folio * 2 - 1
else:
raise RuntimeError("Recto/Verso must be '', 'r' or 'v'")
if was_roman:
o += 100000
return o
def fs2o(ser: pd.Series):
"""Add fields to series that contain start and end ordinal page numbers"""
ser['ordinal_start'] = folio_side_to_ordinal(ser['start_folio'], ser['start_side'])
ser['ordinal_end'] = folio_side_to_ordinal(ser['end_folio'], ser['end_side'])
return ser
def fix_range_ends(ser: pd.Series):
"""If no end of range is specified, copy the beginning of the range"""
if pd.isna(ser['end_folio']):
ser['end_folio'] = ser['start_folio']
ser['end_side'] = ser['start_side']
return ser
def count_pages_for_text(ser: pd.Series, languages_per_page=None):
if languages_per_page is None:
languages_per_page = {}
values = [1 / len(languages_per_page[p]) for p in range(ser['ordinal_start'], ser['ordinal_end'] + 1)]
ser['total_sides'] = sum(values)
return ser
# These results can be written back to a CSV file.
# To calculate the total sides and percentages for each language we need to group the rows by language.
# We also need the total number of sides.
def get_languages_per_page(mss: pd.DataFrame):
languages_per_page = defaultdict(list)
# print(mss)
for row_index, text in mss.iterrows():
for page in range(text['ordinal_start'], text['ordinal_end']+1):
languages_per_page[page].append(text['language'])
return languages_per_page
def extract_ms_id(filename):
base_name = Path(filename).stem
return base_name.split('_')[1]
def load_contents(filename: str):
with open(filename, 'rb') as input_file:
detected_encoding = chardet.detect(input_file.read())
mss = pd.read_csv(filename, encoding=detected_encoding['encoding'].lower(), index_col=0, usecols=[0, 1, 2, 3, 4, 5, 6], dtype={'item': int, 'title': str, 'language': str, 'start_folio': str, 'start_side': str, 'end_folio': str, 'end_side': str}, na_values=[""], error_bad_lines=False)
# print("Dropping empty rows")
mss = mss.dropna(how='all')
# Remove whitespace from language names, folio sides
mss['language'] = mss['language'].str.strip()
if not pd.isna(mss['start_side']).any():
mss['start_side'] = mss['start_side'].str.strip()
mss['end_side'] = mss['end_side'].str.strip()
# print("Fixing folio numbers")
mss = mss.apply(fix_range_ends, axis=1)
mss['start_folio'] = pd.to_numeric(mss['start_folio'], errors='ignore')
mss['end_folio'] = pd.to_numeric(mss['end_folio'], errors='ignore')
return mss
def merge_analysis_results(all_manuscripts, all_langs_pivot):
# print(all_langs_pivot.columns)
result = all_manuscripts.update(all_langs_pivot)
return result
def process_manuscript(filename: str):
file = Path(filename).name
parent_dir = Path(filename).parent.parent
output_dir = parent_dir / "output"
output_dir.mkdir(exist_ok=True)
# print("Trying to open")
mss = load_contents(filename)
# Convert folio + side to ordinal numbers
# print("before fs2o")
mss = mss.apply(fs2o, axis=1)
# print("before get languages")
sides_languages = get_languages_per_page(mss)
# print("Count!")
mss = mss.apply(count_pages_for_text, axis=1, languages_per_page=sides_languages)
mss.to_csv(output_dir / file, encoding="utf-8")
# print("Save to HTML")
ms_id = extract_ms_id(filename)
# print("Summarise")
grouped_by_language = mss.groupby('language')
total_sides = mss['total_sides'].sum()
# Summarise the use of languages and write the absolute number and ratio of pages per language
# to a new CSV file
sides_per_language = grouped_by_language.agg({'total_sides': sum})
sides_per_language['percentage'] = sides_per_language['total_sides'].apply(lambda x: x / total_sides * 100)
sides_per_language.to_csv(output_dir / file.replace("contents", "languages"), encoding="utf-8")
return mss, sides_per_language, ms_id
def main():
# Read list of manuscripts
ms_descriptions = pd.read_csv("data/input/manuscripts.csv", index_col=False, na_values=[""], error_bad_lines=False)
ms_descriptions.columns = ['MS_ID','Location','Collection','Reference','Headnote','MS_Date','Folia',
'Place_of_production','Produced_for','Notes','MS_Sources','percentage_French',
'percentage_Latin','percentage_English','percentage_Other','total_sides_French',
'total_sides_Latin','total_sides_English','total_sides_Other']
files = list(glob('data/input/contents_*.csv'))
ms_identifiers = []
contents_frames = []
languages_frames = []
for filename in files:
print("Working on", filename)
try:
frames = process_manuscript(filename)
contents_frames.append(frames[0])
languages_frames.append(frames[1])
ms_identifiers.append(frames[2])
print("Done")
except Exception as e:
print("ERROR in {0}: {1}".format(filename, e))
all_mss = pd.concat(contents_frames, keys=ms_identifiers, names=["MS_ID"])
all_mss.to_csv('data/output/all_contents.csv', encoding="utf-8")
all_languages = pd.concat(languages_frames, keys=ms_identifiers, names=["MS_ID"])
all_languages.reset_index(inplace=True)
print(all_languages.head())
all_languages.to_csv("data/output/all_languages.csv", index=False, encoding="utf-8")
all_langs_pivot = all_languages.pivot(index="MS_ID", columns="language")
all_langs_pivot.columns = ['_'.join(col).strip() for col in all_langs_pivot.columns]
print(all_langs_pivot.head())
all_langs_pivot.to_csv("data/output/all_langs_pivot.csv", encoding="utf-8")
# Update the (manually determined) language use with the results of the analysis
# merged_results = merge_analysis_results(ms_descriptions, all_langs_pivot)
merged_results = ms_descriptions.set_index('MS_ID')
merged_results.update(all_langs_pivot)
merged_results.reset_index(inplace=True)
# Make sure empty values are sensible, i.e. empty spaces for strings and 0 for numbers
fill_values = {'Place_of_production': "",'Produced_for': "", 'Notes': "", 'total_sides_English': 0.,
'total_sides_French': 0.,'total_sides_Latin': 0.,'total_sides_Other': 0.,'percentage_English': 0.,
'percentage_French': 0.,'percentage_Latin': 0.,'percentage_Other': 0.}
merged_results.fillna(fill_values, inplace=True)
merged_results.to_csv("data/output/all_manuscripts.csv", index_label="Item", encoding="utf-8")
if __name__ == '__main__':
main()