ingest_data.py

import contextlib
import json
import re
from operator import attrgetter
from pathlib import Path, PurePosixPath

from bs4 import BeautifulSoup
from tqdm import tqdm

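# Walks a tree of saved HTML API-reference pages (the CSS hooks used below,
# classHeaderTable, summaryTableSignatureCol, and so on, appear to be
# ASDoc-style generated docs) and caches each class's parsed signature data
# as JSON in class_parsed.json so repeat runs skip already-ingested files.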
class_cache_file = Path("class_parsed.json")
class_cache = {}
if class_cache_file.exists():
    with open(class_cache_file, "r") as fp:
        class_cache = json.load(fp)

get_text = attrgetter("text")


def strip_get_text(in_tag):
    return get_text(in_tag).strip()


def get_soup(in_path):
    return BeautifulSoup(in_path.read_text(), "html.parser")


def lmap(in_func, *in_iters):
    return list(map(in_func, *in_iters))


def breakup_table(in_table_node):
    return [
        [td.text.strip() for td in tr.find_all("td")]
        for tr in in_table_node.find_all("tr")
    ]


def strip_list(in_val):
    if isinstance(in_val, (list, tuple)) and len(in_val) == 1:
        return in_val[0]
    return in_val

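# The classHeaderTable rows are key/value pairs ("Class", "Package",
# "Inheritance", ...); keys are lower-cased, values split into word lists,
# and the "class" row supplies the name and type (for an illustrative row
# like "public class Sprite", class_name would be "Sprite").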
def parse_class_header_table(soup):
    class_defs = {}
    class_def_table = soup.find(class_="classHeaderTable")
    if class_def_table is not None:
        table_text = breakup_table(class_def_table)
        class_defs = {
            k.strip().lower(): strip_list(
                [xx.strip().replace(",", "") for xx in v.strip().split()]
            )
            for k, v in table_text
        }
        class_defs["class_name"] = class_defs["class"][-1]
        class_defs["class_type"] = class_defs["class"][0]
        class_pkg = class_defs["package"]
        pkg_path = ".".join(class_pkg) if isinstance(class_pkg, list) else class_pkg
        class_defs["class_path"] = f'{pkg_path}.{class_defs["class_name"]}'
    return class_defs

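# this_dir anchors every resolved link; each inheritance entry maps an
# ancestor class name to its doc page as a POSIX path relative to the repo
# root, the same form used for class_cache keys.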
this_dir = Path(".").resolve()


def get_inheritance_from_soup(soup, file_path):
    inherit_list = soup.find(class_="inheritanceList")
    if inherit_list is None:
        return {}
    return {
        a_tag.text.strip(): str(
            PurePosixPath(
                (
                    (file_path.parent / a_tag.attrs["href"].strip()).resolve()
                ).relative_to(this_dir)
            )
        )
        for a_tag in inherit_list.find_all("a")
    }

def get_constructor_from_organized_details(class_name, organized_details):
    with contextlib.suppress(KeyError, IndexError):
        return organized_details["constructor"][class_name][0]
    return ""

def get_props_from_soup(soup):
    soup_props = []
    prop_table = soup.find("table", id="summaryTableProperty")
    if prop_table is None:
        return soup_props
    for t in prop_table.find_all("tr"):
        if "class" in t.attrs and "hideInheritedProperty" in t.attrs["class"]:
            continue
        sig_col = t.find("td", class_="summaryTableSignatureCol")
        if sig_col is None:
            continue
        name_tag = sig_col.find("a", class_="signatureLink")
        desc_tag = " ".join(
            sig_col.select_one(".summaryTableDescription").text.strip().split()
        )
        next_sib = name_tag.find_next_sibling("a")
        next_sib = next_sib.text if next_sib else ""
        soup_props.append(
            [name_tag.text, next_sib, desc_tag, "[read-only]" in desc_tag]
        )
    return soup_props

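# Same pattern for summaryTableMethod: the text before "(" in each
# summarySignature is the method name, kept alongside the full signature
# and a whitespace-normalized description.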
def get_methods_from_soup(soup):
    soup_methods = []
    method_table = soup.find("table", id="summaryTableMethod")
    if method_table is None:
        return soup_methods
    for t in method_table.find_all("tr"):
        if ("class" in t.attrs) and ("hideInheritedMethod" in t.attrs["class"]):
            continue
        sig_col = t.find("td", class_="summaryTableSignatureCol")
        if sig_col is None:
            continue
        meth_sig = sig_col.find(class_="summarySignature").text
        meth_name = meth_sig.split("(")[0].strip()
        soup_methods.append(
            [
                meth_name,
                meth_sig,
                " ".join(
                    sig_col.find(class_="summaryTableDescription").text.strip().split()
                ),
            ]
        )
    return soup_methods

def get_constants_from_soup(soup):
    # Find and parse class constants; the description cell is extract()ed
    # first so the remaining column text is just the constant signature.
    soup_constants = []
    constant_table = soup.find("table", id="summaryTableConstant")
    if constant_table is None:
        return soup_constants
    for t in constant_table.find_all("tr"):
        if ("class" in t.attrs) and ("hideInheritedConstant" in t.attrs["class"]):
            continue
        sig_col = t.find("td", class_="summaryTableSignatureCol")
        if sig_col is None:
            continue
        sum_tab = sig_col.find(class_="summaryTableDescription").extract()
        const_sig = sig_col.text.strip()
        const_name = const_sig.split(":")[0].strip()
        soup_constants.append(
            [const_name, const_sig, " ".join(sum_tab.text.strip().split())]
        )
    return soup_constants

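# Rewrites a parsed page's relative href into a path relative to the repo
# root; links without a cleanly usable href fall back to "".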
def remap_link_to_file(in_link_tag, in_current_file):
    try:
        return str(
            PurePosixPath(
                (in_current_file.parent / in_link_tag.attrs["href"].strip()).resolve()
            ).relative_to(this_dir)
        )
    except AttributeError:
        return ""

def get_details_from_soup(soup, file_path):
    soup_details = []
    for page_filter in soup.select("#pageFilter"):
        headers = page_filter.select(".detailHeader") or page_filter.select(
            "#detailHeader"
        )
        bodies = page_filter.select(".detailBody") or page_filter.select("#detailBody")
        for header, body in zip(headers, bodies):
            air_tag = (header.select(".detailHeaderName > span") or [None]).pop()
            details = {
                "name": strip_list(
                    lmap(strip_get_text, header.select(".detailHeaderName"))
                ),
                "type": strip_list(
                    lmap(strip_get_text, header.select(".detailHeaderType"))
                ),
                "is_air": "" if air_tag is None else air_tag.attrs["alt"],
            }
            code_tag = body.find("code")
            details["types_used"] = (
                {
                    link_tag.text.strip(): remap_link_to_file(link_tag, file_path)
                    for link_tag in code_tag.find_all("a")
                }
                if code_tag
                else {}
            )
            details["code"] = code_tag.text.strip() if code_tag else ""
            soup_details.append(details)
    organized_details = {}
    for details in soup_details:
        detail_type = details["type"].lower()
        if detail_type not in organized_details:
            organized_details[detail_type] = {}
        organized_details[detail_type][details["name"]] = (
            details["code"],
            details["is_air"],
        )
    return soup_details, organized_details

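# Heuristic for the class's string format: take the first <code> block that
# mentions the class and contains a bracketed literal (getter/setter
# declarations are filtered out first), on the assumption that it shows
# something like "[ClassName prop=value]".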
def get_string_format_from_soup(soup, class_name):
    valid_code = [
        code_text
        for code_text in (tag.text.strip() for tag in soup.find_all("code"))
        if " function get " not in code_text
        and " function set " not in code_text
        and " " in code_text
    ]
    return next(
        (
            code_line
            for code_line in valid_code
            if class_name in code_line and "[" in code_line and "]" in code_line
        ),
        None,
    )

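# Pulls (name, type, default) triples out of a constructor signature; for
# an illustrative signature like "Event(type:String, bubbles:Boolean = false)"
# this yields ("type", "String", "") and ("bubbles", "Boolean", "false"),
# with the default capture empty when no "=" is present.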
def get_constructor_args(in_constructor):
    return re.findall(r"(\w+)\s*:\s*(\w+)\s*(?:=\s*(.+?))?(?:,|\))", in_constructor)

def write_to_cache_file():
    with open(class_cache_file, "w") as fp:
        json.dump(class_cache, fp, indent=2)

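# Per-file pipeline: skip files already cached, parse the header table
# (pages without a class header are ignored), then collect inheritance,
# members, details, and constructor data into one cache entry keyed by the
# file's POSIX path.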
def get_file_information(file_path):
    path_key = str(PurePosixPath(file_path))
    if path_key in class_cache:
        return
    file_soup = get_soup(file_path)
    class_defs = parse_class_header_table(file_soup)
    if "class_name" not in class_defs:
        return
    class_defs["inheritance_links"] = get_inheritance_from_soup(file_soup, file_path)
    class_defs["properties"] = get_props_from_soup(file_soup)
    class_defs["methods"] = get_methods_from_soup(file_soup)
    class_defs["constants"] = get_constants_from_soup(file_soup)
    class_defs["details"], class_defs["organized_details"] = get_details_from_soup(
        file_soup, file_path
    )
    class_defs["string_format"] = get_string_format_from_soup(
        file_soup, class_defs["class_name"]
    )
    class_con = get_constructor_from_organized_details(
        class_defs["class_name"], class_defs["organized_details"]
    )
    class_defs["constructor"] = class_con
    class_defs["constructor_args"] = get_constructor_args(class_con)
    class_cache[path_key] = class_defs

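# Walk every HTML page under doc_trunk/doc_pages, flushing the cache to
# disk every 1000 files and once more at the end.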
def main():
    all_files = list(enumerate((Path("doc_trunk") / "doc_pages").rglob("*.html")))
    for i_path, path in tqdm(all_files):
        get_file_information(path)
        if (i_path % 1000) == 0:
            write_to_cache_file()
    write_to_cache_file()

if __name__ == "__main__":
    main()