-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_paras.py
executable file
·60 lines (55 loc) · 1.44 KB
/
extract_paras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import csv
import bs4
import re
# A paragraph opens a new numbered item when it starts with an optional "*"
# followed by at least one ASCII digit.
_NUMBERED_START = re.compile(r"^\*?[0-9]+")

# Runs of CR / LF / TAB / space; each run becomes a single space on output.
_WHITESPACE_RUN = re.compile(r"[\r\n\t ]+")


def collapse_whitespace(text):
    """Return *text* with every run of CR, LF, TAB or space replaced by one space."""
    return _WHITESPACE_RUN.sub(" ", text)


def iter_texts(elements):
    """Yield the stripped ``.text`` of each element, skipping those without one.

    Only ``AttributeError`` is swallowed (an element with no usable ``.text``);
    the original bare ``except:`` also hid ``KeyboardInterrupt`` and real bugs.
    """
    for element in elements:
        try:
            text = element.text.strip()
        except AttributeError:
            continue
        yield text


def group_paragraphs(texts):
    """Yield one output line per numbered paragraph found in *texts*.

    *texts* is an iterable of already-stripped paragraph strings.

    * A paragraph matching ``_NUMBERED_START`` first flushes any pending
      group.  If it ends with ":" it opens a new group; otherwise it is
      yielded on its own.
    * Any other paragraph is appended to the open group, or dropped when no
      group is open.
    * A group is yielded as its members joined by single spaces.

    Every yielded line has its whitespace runs collapsed to one space.
    """
    pending = []
    for text in texts:
        if _NUMBERED_START.match(text):
            if pending:
                yield collapse_whitespace(" ".join(pending))
                pending = []
            # NOTE(review): the original tested endswith(":") twice with
            # literals that look identical in this copy; presumably one of
            # them was a different colon character (e.g. full-width) --
            # confirm against the canonical file and restore it if so.
            if text.endswith(":"):
                pending.append(text)
            else:
                yield collapse_whitespace(text)
        elif pending:
            pending.append(text)
    if pending:
        yield collapse_whitespace(" ".join(pending))


def main(argv=None):
    """Print the numbered paragraphs of the HTML file named in ``argv[1]``.

    *argv* defaults to ``sys.argv``.  With no file argument the script exits
    with the same (zero) status as before, now after a usage hint on stderr.
    Output lines are written to stdout as UTF-8.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) <= 1:
        sys.stderr.write("usage: extract_paras.py FILE.html\n")
        sys.exit()

    # Read bytes (what Python 2's "r" mode returned) and close the handle
    # deterministically; the original never closed it.
    with open(argv[1], "rb") as handle:
        raw = handle.read()

    # NOTE(review): as written this replaces a space with a space, i.e. it
    # does nothing.  Presumably the first literal was a non-breaking space
    # (or "&nbsp;") that got normalised in this copy -- confirm and restore.
    raw = raw.replace(b" ", b" ")

    # No parser is named, so bs4 chooses among the installed ones; this is
    # left unchanged because pinning a parser could alter the parsed tree.
    s = bs4.BeautifulSoup(raw)

    # Selector used per source year (alternatives kept for reference):
    #   s.select("p.Eng-text")   # 2007 & 2008 & 2009
    #   s.select("div")          # 2010
    #   s.select(".style1")      # 2011
    #   s.select(".MsoNormal")   # 2012
    # A disabled earlier approach instead walked s.body.contents[0] down
    # through a leading <div> and nested <table> cells to find the container.
    holder = s.select("p")  # 2010 & 2013

    # Encode once at the output boundary.  Python 3 exposes the byte stream
    # as sys.stdout.buffer; Python 2's sys.stdout accepts bytes directly.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for line in group_paragraphs(iter_texts(holder)):
        out.write(line.encode("utf-8") + b"\n")


if __name__ == "__main__":
    main()