-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
53 lines (47 loc) · 1.73 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import io
import os
import pandas
from PyPDF2 import PdfReader
import re
import ssl
from urllib.request import Request, urlopen
# Public EDF "Tarif Bleu" price-grid PDF scraped by this script.
url = "https://particulier.edf.fr/content/dam/2-Actifs/Documents/Offres/Grille_prix_Tarif_Bleu.pdf"

# Substrings that announce a price table, paired with the table they open.
# They are tested in this order on every line read outside a table, so a
# line matching several markers opens the first one listed.
_SECTION_MARKERS = (
    ("(€ TTC/mois ) (cts € TTC/kWh)", "base"),
    ("Creuses", "hchp"),
    ("HP", "tempo"),
)


def parse_tariff_lines(lines):
    """Split extracted PDF text lines into the base, HC/HP and Tempo tables.

    A line containing one of the section markers opens the matching table.
    Every following line whose first character is a digit is a row of that
    table, split on runs of whitespace. The first line that does not start
    with a digit (including an empty line) closes the table; that closing
    line is consumed and is not itself tested against the markers.

    Args:
        lines: Iterable of text lines, e.g. ``extract_text().splitlines()``.

    Returns:
        A ``(base, hchp, tempo)`` tuple of lists. Each list holds one list of
        string fields per table row, in the order the rows were encountered.
    """
    tables = {"base": [], "hchp": [], "tempo": []}
    current = None  # name of the table being filled, or None between tables
    for line in lines:
        if current is not None:
            # Slicing instead of indexing: an empty line must close the
            # table rather than raise IndexError.
            if line[:1].isdigit():
                tables[current].append(re.split(r"\s+", line.strip()))
            else:
                current = None
            continue
        for marker, name in _SECTION_MARKERS:
            if marker in line:
                current = name
                break
    return tables["base"], tables["hchp"], tables["tempo"]


def main():
    """Download the price grid, parse its first page and write three CSVs.

    Writes ``output/base.csv``, ``output/hchp.csv`` and ``output/tempo.csv``
    relative to the current working directory, creating ``output`` if needed.
    """
    # NOTE(review): TLS certificate verification is disabled for this
    # download, as in the original script. The unverified context is now
    # limited to this single request instead of being installed as the
    # process-wide default. Confirm whether verification can be re-enabled.
    insecure_context = ssl._create_unverified_context()
    with urlopen(Request(url), context=insecure_context) as response:
        remote_file = response.read()

    pdf_file = PdfReader(io.BytesIO(remote_file))
    # Only the first page of the document is read.
    pdf_text = pdf_file.pages[0].extract_text().splitlines()
    base, hchp, tempo = parse_tariff_lines(pdf_text)

    os.makedirs('output', exist_ok=True)
    pandas.DataFrame(base, columns=['kVA', 'Abonnement (€)', 'ct€/kWh']).to_csv('output/base.csv', index=False)
    pandas.DataFrame(hchp, columns=['kVA', 'Abonnement (€)', 'HP (ct€/kWh)', 'HC (ct€/kWh)']).to_csv('output/hchp.csv', index=False)
    pandas.DataFrame(tempo, columns=['kVA', 'Abonnement (€)', 'Bleu HP (ct€/kWh)', 'Bleu HC (ct€/kWh)', 'Blanc HP (ct€/kWh)', 'Blanc HC (ct€/kWh)', 'Rouge HP (ct€/kWh)', 'Rouge HC (ct€/kWh)']).to_csv('output/tempo.csv', index=False)


if __name__ == "__main__":
    main()