-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_budget_establishment.py
executable file
·69 lines (57 loc) · 1.93 KB
/
parse_budget_establishment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import csv
import json
import types
import copy
# Columns of the CSV row written to stdout, in output order.
cols = ["head_no", "establishment_directorate", "establishment_nondirectorate"]
# Not read anywhere in this script; kept so the module-level name still exists.
numcols = ["actual", "original", "revised", "estimate"]

# Head title line; group 1 is the 1-3 digit head number.
_HEAD_RE = re.compile(r"Head (\d{1,3}) — ?(.*)")


def _normalise(parts):
    """Join buffered lines into one string and tidy spaces and dot leaders.

    Runs of spaces are collapsed first, then every run of periods is replaced
    by a single space, and the result is stripped.
    """
    # NOTE(review): `\.+` also replaces single periods (e.g. a decimal point),
    # and because it runs after the space collapse it can leave several
    # consecutive spaces behind. Kept as-is so existing output does not change.
    joined = " ".join(parts)
    return re.sub(r"\.+", " ", re.sub(r"[ ]+", " ", joined)).strip()


def parse_establishment(lines):
    """Extract the head number and establishment paragraphs from text lines.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A dict holding any of the keys ``head_no``,
        ``establishment_nondirectorate`` and ``establishment_directorate``
        that were found; keys that were not found are absent.

    A paragraph starts at a line beginning with "Establishment" or
    "In addition" and ends at the first line that is blank, contains ".." or
    contains "Commitment balance". The terminating line itself is part of the
    captured text. Only the first paragraph of each kind is kept. Text that is
    still buffered when the input ends is discarded.
    """
    out = {}
    head_no = None
    capturing = False
    buffered = []

    for line in lines:
        stripped = line.strip()

        if head_no is None and stripped.startswith("Head"):
            m = _HEAD_RE.match(stripped)
            if m is not None:
                head_no = m.group(1)
                out["head_no"] = head_no
                # NOTE(review): nesting was reconstructed from an unindented
                # copy; confirm that `continue` belongs inside this branch.
                continue

        starts_nondirectorate = (
            stripped.startswith("Establishment")
            and "establishment_nondirectorate" not in out
        )
        # Fixed: this guard used to test the raw line instead of `out`, so it
        # was always true and later "In addition" paragraphs overwrote the
        # directorate text that had already been captured.
        starts_directorate = (
            stripped.startswith("In addition")
            and "establishment_directorate" not in out
        )
        if starts_nondirectorate or starts_directorate:
            capturing = True

        if not capturing:
            continue

        buffered.append(stripped)

        if " 2012 " in line:
            # Original author's note: ignores when error text is encountered.
            capturing = False
            buffered = []

        if "Commitment balance" in line or not stripped or ".." in line:
            capturing = False
            if buffered:
                text = _normalise(buffered)
                if text.startswith("Establishment"):
                    out["establishment_nondirectorate"] = text
                elif text.startswith("In addition"):
                    out["establishment_directorate"] = text
                buffered = []

    return out


def main(argv=None):
    """Command-line entry point.

    With no file argument, write only the CSV header to stdout. Otherwise
    parse the file named by the first argument and write one CSV row if at
    least one establishment paragraph was found.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    cw = csv.DictWriter(sys.stdout, cols)
    if len(argv) <= 1:
        cw.writeheader()
        return

    fn = argv[1]
    try:
        f = open(fn, "r")
    except IOError:
        # Preserved behaviour: an unopenable file ends the run quietly with a
        # zero exit status and no output.
        return

    # Close the file as soon as parsing is done.
    with f:
        out = parse_establishment(f)

    if "establishment_nondirectorate" in out or "establishment_directorate" in out:
        cw.writerow(out)


if __name__ == "__main__":
    main()