-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHTMLParser.py
76 lines (63 loc) · 2.06 KB
/
HTMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#Parse HTML -> Poem format
import re
# Load the raw lines of the three source pages, one list per language.
with open('poems/English.txt', 'r') as f:
    ls = f.readlines()
with open('poems/Romaji.txt', 'r') as f:
    lsr = f.readlines()
# The Japanese page is read in binary mode so its lines stay undecoded
# byte strings. The mode must be spelled plain 'rb': extra characters in
# the mode string are not part of any valid mode and are rejected on
# platforms that validate it.
with open('poems/Japanese.UTF8', 'rb') as f:
    lsjd = f.readlines()
# Keep every Japanese line except the ones that are only a newline or a
# single space followed by a newline, so the remaining lines are contiguous.
lsj = [line for line in lsjd if line != b'\n' and line != b' \n']
def parse(ls):
    """Extract the poems from the lines of one scraped HTML page.

    A poem starts on the line that contains ``/images/onna<N>.jpg``, where
    N is the number of the next poem still to be found (counting from 1).
    Relative to that line, the author is read from line +1, the first verse
    from lines +2, +3 and +4, and the second verse from lines +6 and +7
    (line +5 is skipped).

    Args:
        ls: list of text lines, for example the result of ``readlines()``.

    Returns:
        A list with one ``[author, verse1, verse2]`` entry per poem, in the
        order found. Each of the three strings ends with a newline; the
        lines that make up a verse are joined with ``'|'``.

    Raises:
        IndexError: if a poem marker is closer than seven lines to the end
            of ``ls``.
        AttributeError: if one of the expected lines has no
            ``<center>...</center>`` / ``<br>...`` content to extract.
    """
    author_re = re.compile(r'<center>(.+?)</center>')
    # '.' never matches a newline, so a greedy '.+' captures the rest of the
    # line whether or not it ends in '\n'. The last line of a file may lack
    # the terminator, which a pattern requiring '\n' would fail to match.
    line_re = re.compile(r'<br>(.+)')

    def verse(start, offsets):
        # Join the text of the lines at start+offset into one '|'-separated row.
        parts = [line_re.search(ls[start + off]).group(1) for off in offsets]
        return '|'.join(parts) + '\n'

    poems = []
    for i, line in enumerate(ls):
        # Poems are numbered consecutively, so the next marker to look for
        # is always one past the number of poems collected so far.
        marker = '/images/onna' + str(len(poems) + 1) + '.jpg'
        if marker not in line:
            continue
        author = author_re.search(ls[i + 1]).group(1) + '\n'
        verse1 = verse(i, (2, 3, 4))
        verse2 = verse(i, (6, 7))
        poems.append([author, verse1, verse2])
    return poems
# Parsed poems per language: index 0 is the English page, 1 the Romaji
# page and 2 the Japanese page.
poems = [parse(ls), parse(lsr), parse(lsj)]

# Write one .kar file per poem (numbered 1 to 100). Each file holds the
# author, first verse and second verse of the English entry, then the same
# three items for Romaji, then for Japanese.
for number in range(1, 101):
    with open('poems/poem'+str(number)+'.kar', 'w+') as f:
        for language in range(3):
            entry = poems[language][number - 1]
            for field in range(3):
                f.write(entry[field])
# Lines of unique.txt; line i-1 is the one appended to poem i below.
with open('unique.txt', 'r') as f:
    u = f.readlines()

# Read in binary mode so the lines stay undecoded byte strings. The mode
# must be spelled plain 'rb': extra characters in the mode string are not
# part of any valid mode and are rejected on platforms that validate it.
with open('poems/2ndVerse.UTF8', 'rb') as f:
    lines = f.readlines()

# Split the file by line length: lines shorter than 20 go to `un`, all
# others to `v`. The two lists are later paired by position
# (un[k] <-> v[k]).
# NOTE(review): presumably the short lines are the same keys as in
# unique.txt and the long lines are the matching second verses - confirm.
un = []
v = []
for line in lines:
    if len(line) < 20:
        un.append(line)
    else:
        v.append(line)

# Append three more lines to every poem file written earlier.
# NOTE(review): `.decode` on the parsed line and writing the re-encoded
# result to a file opened with 'a' rely on Python 2 byte-string semantics.
for i in range(1, 101):
    key = u[i-1]
    with open('poems/poem'+str(i)+'.kar', 'a') as g:
        # 1) the unique.txt line for this poem, unchanged
        g.write(key)
        # 2) the first `le` characters of the Japanese first verse with the
        #    '|' separators removed, where `le` is the number of
        #    whitespace-separated tokens on the unique.txt line
        le = len(key.split())
        first_verse = poems[2][i-1][1].decode('utf-8').replace('|', '')
        g.write((first_verse[:le] + '\n').encode('utf-8'))
        # 3) the long line sitting at the same position in `v` as this key
        #    has in `un`; raises ValueError when the key is not in `un`
        g.write(v[un.index(key)])