-
Notifications
You must be signed in to change notification settings - Fork 1
/
current_data.py
121 lines (82 loc) · 3.94 KB
/
current_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 24 14:08:44 2015
@author: Oliver
"""
from ftplib import FTP
import zipfile
import io
import pandas as pd
from FTPData import rename_columns
# Archives of the RECENT database that have to be skipped: these stations are
# dead (they carry no information) and their files break the database format.
CORRUPT_STATIONS = [
    '02494_akt.zip',
    '02532_akt.zip',
    '04878_akt.zip',
]
def _read_station_frame(ftp, zip_path):
    """Download one station archive and return its daily values.

    The archive at ``zip_path`` is fetched into memory over the open
    ``ftp`` connection, the member whose name starts with
    'produkt_klima_Tageswerte' is parsed as a ';'-separated table, the
    'eor' column is dropped and rows containing NaN are removed.

    Returns the resulting DataFrame, or None when the archive has no such
    member or no complete row is left.
    """
    buffer = io.BytesIO()
    ftp.retrbinary('RETR %s' % zip_path, buffer.write)
    buffer.seek(0)  # rewind the in-memory file before zipfile reads it
    with zipfile.ZipFile(buffer) as archive:
        # The txt-file we need starts with 'produkt_klima_Tageswerte...'.
        product_name = next(
            (name for name in archive.namelist()
             if name.startswith('produkt_klima_Tageswerte')),
            None,
        )
        if product_name is None:
            return None
        with archive.open(product_name) as txtfile:
            frame = pd.read_csv(txtfile, sep=';')
    frame = frame.drop('eor', axis=1)
    frame = frame.dropna(axis=0)
    if frame.empty:
        # Nothing to normalise; iloc[0, 0] below would raise IndexError.
        return None
    # Overwrite the whole first column with the integer value of its first
    # entry (presumably the station ID, constant within one file).
    frame.iloc[:, 0] = int(frame.iloc[0, 0])
    return frame


def _align_columns(frame, reference_columns):
    """Rename the columns of ``frame`` positionally to ``reference_columns``.

    Nothing is done when the names already match exactly. A warning is
    printed when the names differ by more than surrounding whitespace and
    letter case.
    """
    current = list(frame.columns)
    reference = list(reference_columns)
    if current == reference:
        return frame
    print('Renaming column!')
    normalized_current = [str(item).strip().upper() for item in current]
    normalized_reference = [str(item).strip().upper() for item in reference]
    if normalized_reference != normalized_current:
        print('WARNING: different column names!!!')
    # zip() stops at the shorter list, so a differing column count cannot
    # raise an IndexError here.
    return frame.rename(columns=dict(zip(current, reference)))


def load_current_data():
    """Load the whole recent daily climate data set from the DWD FTP server.

    Every '.zip' archive listed under
    '/pub/CDC/observations_germany/climate/daily/kl/recent/' on
    'ftp-cdc.dwd.de' is downloaded and parsed, except the archives named
    in CORRUPT_STATIONS. The column names of the first parsed station are
    used as reference; the columns of later stations are renamed
    positionally to match them. Archives without a product file or
    without any complete row are skipped with a printed warning.

    Returns:
        A pandas DataFrame holding the data of all stations, passed
        through ``rename_columns``, sorted by 'Station ID' and 'Date',
        with -999 replaced by NaN, 'Date' converted to a string and the
        string columns 'Year', 'Month' and 'Day' added (characters 0-4,
        4-6 and 6-8 of 'Date').

    Raises:
        ValueError: if no station delivered any usable data.
    """
    path_recent_data = '/pub/CDC/observations_germany/climate/daily/kl/recent/'
    corrupt_suffixes = tuple(CORRUPT_STATIONS)
    frames = []
    reference_columns = None

    # The context manager closes the connection even if a download or the
    # parsing of a file fails.
    with FTP('ftp-cdc.dwd.de') as ftp:
        ftp.login()
        listfiles = ftp.nlst(path_recent_data)
        N = len(listfiles)
        counter = 0
        for zipstring in listfiles:
            if zipstring.endswith(corrupt_suffixes):
                continue
            counter += 1
            print('working on station number', counter, '/', N, '...')
            if not zipstring.endswith('.zip'):
                continue
            frame = _read_station_frame(ftp, zipstring)
            if frame is None:
                print('WARNING: no usable data in', zipstring)
                continue
            if reference_columns is None:
                # The first station defines the reference column names.
                reference_columns = frame.columns
            else:
                frame = _align_columns(frame, reference_columns)
            frames.append(frame)

    if not frames:
        raise ValueError('no station data could be loaded from ' +
                         path_recent_data)

    # Concatenate once instead of appending inside the loop: this avoids
    # repeated copying, and DataFrame.append no longer exists in pandas 2.
    recent_data = pd.concat(frames)
    recent_data = rename_columns(recent_data)
    # NOTE(review): 'Station ID' and 'Date' are presumably the names
    # produced by rename_columns; confirm against FTPData.
    recent_data = recent_data.sort_values(['Station ID', 'Date'])
    # -999 is presumably the marker for missing values in the source data.
    recent_data = recent_data.replace(to_replace=-999, value=float('nan'))
    recent_data['Date'] = recent_data['Date'].astype(int).astype(str)
    # Plain lists are assigned positionally, which stays safe although the
    # concatenated index contains duplicates.
    recent_data['Year'] = [date[0:4] for date in recent_data['Date']]
    recent_data['Month'] = [date[4:6] for date in recent_data['Date']]
    recent_data['Day'] = [date[6:8] for date in recent_data['Date']]
    return recent_data