MyFunctions/sites_obs.py at main · xuyf-web/MyFunctions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np
import xarray as xr
import pandas as pd
import os
from glob import glob

def load_obs(csvpath:str, start:str, end:str) -> pd.DataFrame:
    '''
    load observation data from csv files in csvpath
    csvpath: path to the csv files
    start: start date in the format 'YYYY-MM-DD'
    end: end date in the format 'YYYY-MM-DD'
    '''

    dates = pd.date_range(start, end, freq='D')
    dfdates = {}
    for date in dates:
        yy = date.strftime('%Y')
        mm = date.strftime('%m')
        dd = date.strftime('%d')
        file = csvpath + '/china_sites_' + yy + mm + dd + '.csv'
        try:
            dfdates[str(yy+mm+dd)] = pd.read_csv(file)
        except:
            print(f'Cannot find file: {file}')

    obs = pd.concat(dfdates, ignore_index=True)

    return obs

def load_location(locpath:str, extend:list =None) -> pd.DataFrame:
    '''
    load site locations from the latest file in locpath
    locpath: path to the location file
    extend: [lon_min, lon_max, lat_min, lat_max]
    '''

    try:
        sitelocations = pd.read_excel(locpath + '/sitelocations_from2022.02.13.xlsx')
    except FileNotFoundError:
        xlsx_files = glob(os.path.join(locpath, '站点列表*.xlsx'))
        csv_files = glob(os.path.join(locpath, '站点列表*.csv'))

        if xlsx_files:
            latest_file = max(xlsx_files, key=os.path.getctime)
            sitelocations = pd.read_excel(latest_file)
        elif csv_files:
            latest_file = max(csv_files, key=os.path.getctime)
            sitelocations = pd.read_csv(latest_file)
        else:
            raise FileNotFoundError(f'No suitable file found in {locpath}.')

    # 去掉经纬度不是数字的点
    sitelocations = sitelocations[pd.to_numeric(sitelocations['经度'], errors='coerce').notnull()]
    sitelocations = sitelocations[pd.to_numeric(sitelocations['纬度'], errors='coerce').notnull()]

    if extend:
        lon_min = extend[0]
        lon_max = extend[1]
        lat_min = extend[2]
        lat_max = extend[3]
        sitelocations = sitelocations[(sitelocations['经度']>=lon_min) & (sitelocations['纬度']>=lat_min) &
                                      (sitelocations['经度']<=lon_max) & (sitelocations['纬度']<=lat_max)]

    return sitelocations

def process_data(outpath:str, obs:pd.DataFrame, sitelocations:pd.DataFrame, variables: list=None) -> None:
    '''
    process observation data and save to excel files
    outpath: path to save the excel files
    obs: observation data
    sitelocations: site locations
    variables: variables to save, example: ['O3','PM2.5','PM10']
    '''

    sitelist = sitelocations['监测点编码'].tolist()

    df = obs[['date','hour','type']+sitelist]
    df['datetime'] = pd.to_datetime(df['date'].astype(str) + 'T' + df['hour'].astype(str) + ':00')
    df.set_index('datetime', inplace=True)
    df.drop(columns=['date','hour'], inplace=True)

    with pd.ExcelWriter(outpath + '/obs_sites.xlsx') as writer:
        if variables is None:
            variables = df['type'].unique()

        dfs = {}
        for var in variables:
            dfs[var] = df.groupby(['type']).get_group(var)
            dfs[var].drop(columns=['type'], inplace=True)
            dfs[var].to_excel(writer, sheet_name=var, index=True)

    return None

if __name__ == '__main__':
    csvpath = '/your/directory/obs_files'
    locpath = '/your/directory/site_locations'
    outpath = '/your/directory/outpath'
    start = '2024-04-11'
    end = '2024-04-13'
    extend = [100, 120, 30, 40]
    variables = ['O3','PM2.5','PM10']

    obs = load_obs(csvpath, start, end)
    sitelocations = load_location(locpath, extend)
    process_data(outpath, obs, sitelocations, variables)