-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_trs.py
executable file
·79 lines (72 loc) · 2.08 KB
/
parse_trs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import csv
import datetime
from bs4 import BeautifulSoup
import types
# Runs of any whitespace, collapsed to a single space when normalising text.
_WHITESPACE_RE = re.compile(r"\s+")

# A comma that sits between a digit and a group of three digits, i.e. a
# thousands separator.  Lookarounds are used so that the neighbouring digits
# are not consumed: a consuming pattern such as "([0-9]),([0-9]{3})" only
# removes every other separator and turns "1,234,567" into "1234,567".
_THOUSANDS_SEP_RE = re.compile(r"(?<=[0-9]),(?=[0-9]{3})")

# The Python 2 csv module cannot write non-ASCII ``unicode`` objects, so text
# is encoded to UTF-8 there.  On Python 3 encoding would produce ``bytes`` and
# break the string handling below, so it is skipped.
_PY2 = sys.version_info[0] == 2


def short_name(path):
    """Return the last "/"-separated component of *path* without its final
    ".extension" (everything after the last dot is dropped, if there is one).
    """
    filename = path.rsplit("/", 1)[-1]
    return filename.rsplit(".", 1)[0]


def table_row_selector(classes):
    """Build the CSS selector matching every <tr> inside a <table> that
    carries all of the whitespace-separated class names in *classes*.
    """
    # split() ignores repeated/leading/trailing spaces, which would otherwise
    # yield an invalid selector such as "table.a..b tr".
    return "table" + "".join("." + name for name in classes.split()) + " tr"


def normalize_space(text):
    """Strip newlines/spaces from both ends of *text* and collapse every
    internal run of whitespace to a single space.
    """
    return _WHITESPACE_RE.sub(" ", text.strip("\n "))


def clean_cell(value):
    """Prepare one extracted value for CSV output.

    Thousands separators inside numbers are removed ("1,234,567" becomes
    "1234567"), surrounding whitespace is stripped, and a cell consisting
    only of "-" is returned as None (which csv.writer emits as an empty
    field).  None is passed through unchanged.
    """
    if value is None:
        return None
    if _PY2 and not isinstance(value, str):
        value = value.encode("utf8")
    value = _THOUSANDS_SEP_RE.sub("", value).strip()
    return None if value == "-" else value


def row_values(cells, label):
    """Return the raw values for one table row.

    *cells* are the row's <td> elements and *label* becomes the first value.
    A cell with at most one child contributes its whole text.  A cell with
    several children contributes, for every child, the href (when the child
    is an <a> that has one) followed by the child's ``.string`` -- or "" when
    the child has no single string.
    """
    values = [label]
    for td in cells:
        children = list(td.children)
        if len(children) > 1:
            for child in children:
                if getattr(child, "name", None) == "a":
                    href = child.get("href")
                    if href is not None:
                        values.append(normalize_space(href))
                text = getattr(child, "string", None)
                values.append("" if text is None else normalize_space(text))
        else:
            values.append(normalize_space(td.text))
    return values


def main(argv=None):
    """Write the table rows of an HTML file to stdout as CSV.

    Usage: parse_trs.py FILE [CLASSES]

    FILE is the HTML document to read.  CLASSES, when given, is a
    space-separated list of class names; only rows of tables carrying all of
    them are exported.  Rows without any <td> (e.g. header rows) are skipped.
    Each output row starts with FILE's base name minus its extension.
    Exits silently when no FILE is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 1:
        sys.exit()
    path = argv[1]
    classes = argv[2] if len(argv) > 2 else None

    # Read raw bytes and let BeautifulSoup detect the document encoding; the
    # context manager guarantees the file is closed.
    with open(path, "rb") as handle:
        markup = handle.read()
    # NOTE(review): no parser is named, so bs4 picks the best one installed
    # and results may differ between machines -- consider pinning one.
    soup = BeautifulSoup(markup)

    if classes is None:
        rows = soup.find_all("tr")
    else:
        rows = soup.select(table_row_selector(classes))

    label = short_name(path)
    writer = csv.writer(sys.stdout)
    for tr in rows:
        cells = tr.find_all("td")
        if not cells:
            continue
        writer.writerow([clean_cell(value) for value in row_values(cells, label)])


if __name__ == "__main__":
    main()