Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions scripts/us_epa/ejscreen/config.json
Comment thread
Rohit231998 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
Comment thread
Rohit231998 marked this conversation as resolved.
"NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
"CSV_COLUMNS_BY_YEAR": {
Comment thread
Rohit231998 marked this conversation as resolved.
"2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
"2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2024": ["ID", "DSLPM", "OZONE", "PM25"]
},
"ZIP_FILENAMES": {
"2015": "EJSCREEN_20150505.csv",
"2016": "EJSCREEN_V3_USPR_090216_CSV",
"2017": null,
"2018": "EJSCREEN_2018_USPR_csv",
"2019": "EJSCREEN_2019_USPR.csv",
"2020": "EJSCREEN_2020_USPR.csv",
"2021": "EJSCREEN_2021_USPR.csv",
"2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
"2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
"2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
},
"FILENAMES": {
Comment thread
Rohit231998 marked this conversation as resolved.
"2015": "EJSCREEN_20150505",
"2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
"2017": "EJSCREEN_2017_USPR_Public",
"2018": "EJSCREEN_Full_USPR_2018",
"2019": "EJSCREEN_2019_USPR",
"2020": "EJSCREEN_2020_USPR",
"2021": "EJSCREEN_2021_USPR",
"2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
"2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
"2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
},
"TEMPLATE_MCF": [
{
"Node": "E:ejscreen_airpollutants->E0",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->DSLPM",
"unit": "dcs:MicrogramsPerCubicMeter"
},
{
"Node": "E:ejscreen_airpollutants->E1",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:AirPollutant_Cancer_Risk",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->CANCER",
"unit": "dcs:PerMillionPerson"
},
{
"Node": "E:ejscreen_airpollutants->E2",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->RESP"
},
{
"Node": "E:ejscreen_airpollutants->E3",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->OZONE",
"unit": "dcs:PartsPerBillion"
},
{
"Node": "E:ejscreen_airpollutants->E4",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->PM25",
"unit": "dcs:MicrogramsPerCubicMeter"
}
]
,
Comment thread
Rohit231998 marked this conversation as resolved.
Outdated
"BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
"URL_SUFFIX": {
"2023": "2.22_September_UseMe",
"2024": "2.32_August_UseMe"
}
}
228 changes: 118 additions & 110 deletions scripts/us_epa/ejscreen/ejscreen.py
Comment thread
Rohit231998 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -1,103 +1,78 @@
'''
Generates the cleaned CSV and the TMCF for the EPA EJSCREEN data.
Usage: python3 ejscreen.py
'''
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os
import zipfile
import requests
import pandas as pd
import json
from absl import logging, flags, app
import sys

_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
print(_MODULE_DIR)
import file_util

# Set the absl logging verbosity to INFO for this script.
logging.set_verbosity(logging.INFO)
# Alias: the rest of the module logs through `logger`, which is the absl
# logging module itself.
logger = logging
_FLAGS = flags.FLAGS
# NOTE(review): this flag is defined here, but nothing in the visible code
# reads _FLAGS.config_path; the configuration appears to be loaded from the
# config.json next to this script instead. Confirm whether it is still needed.
flags.DEFINE_string('config_path',
'gs://unresolved_mcf/epa/ejscreen/config.json',
'Path to config file')

_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
# config.json is looked up in the same directory as this script.
_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')

# Load configuration from config.json. The encoding is passed explicitly so
# that decoding the JSON text does not depend on the platform's locale.
with open(_CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Expose every config section as a module-level constant. A missing section
# raises KeyError here, at import time, rather than midway through a run.
YEARS = config["YEARS"]
NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
ZIP_FILENAMES = config["ZIP_FILENAMES"]
FILENAMES = config["FILENAMES"]
TEMPLATE_MCF = config["TEMPLATE_MCF"]
BASE_URL = config["BASE_URL"]
URL_SUFFIX = config["URL_SUFFIX"]


# Function to build the correct URL for each year
def build_url(year, zip_filename=None):
    """Return the download URL for one year's data file.

    Args:
        year: Year key, used as a path segment and to look up URL_SUFFIX
            and FILENAMES.
        zip_filename: Name of the zip archive without the ``.zip``
            extension. When falsy, the URL of the plain CSV named by
            FILENAMES[year] is returned instead.

    Returns:
        The URL string, rooted at BASE_URL.
    """
    if not zip_filename:
        # No archive for this year: point straight at the CSV file.
        return f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'

    # Years listed in URL_SUFFIX keep their archive one directory deeper.
    subdir = f'{URL_SUFFIX[year]}/' if year in URL_SUFFIX else ''
    return f'{BASE_URL}/{year}/{subdir}{zip_filename}.zip'


YEARS = ['2015', '2016', '2017', '2018', '2019', '2020']

NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25']

# 2015 has different csv column names
CSV_COLUMNS_BY_YEAR = {
'2015': ['FIPS', 'dpm', 'cancer', 'resp', 'o3', 'pm'],
'2016': NORM_CSV_COLUMNS,
'2017': NORM_CSV_COLUMNS,
'2018': NORM_CSV_COLUMNS,
'2019': NORM_CSV_COLUMNS,
'2020': NORM_CSV_COLUMNS
}

ZIP_FILENAMES = {
'2015': 'EJSCREEN_20150505.csv',
'2016': 'EJSCREEN_V3_USPR_090216_CSV',
'2017': None,
'2018': 'EJSCREEN_2018_USPR_csv',
'2019': 'EJSCREEN_2019_USPR.csv',
'2020': 'EJSCREEN_2020_USPR.csv'
}

FILENAMES = {
'2015': 'EJSCREEN_20150505',
'2016': 'EJSCREEN_Full_V3_USPR_TSDFupdate',
'2017': 'EJSCREEN_2017_USPR_Public',
'2018': 'EJSCREEN_Full_USPR_2018',
'2019': 'EJSCREEN_2019_USPR',
'2020': 'EJSCREEN_2020_USPR'
}

TEMPLATE_MCF = '''
Node: E:ejscreen_airpollutants->E0
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->DSLPM
unit: dcs:MicrogramsPerCubicMeter

Node: E:ejscreen_airpollutants->E1
typeOf: dcs:StatVarObservation
variableMeasured: dcs:AirPollutant_Cancer_Risk
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->CANCER

Node: E:ejscreen_airpollutants->E2
typeOf: dcs:StatVarObservation
variableMeasured: dcs:AirPollutant_Respiratory_Hazard
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->RESP

Node: E:ejscreen_airpollutants->E3
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->OZONE
unit: dcs:PartsPerBillion

Node: E:ejscreen_airpollutants->E4
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->PM25
unit: dcs:MicrogramsPerCubicMeter
'''


# data: dictionary of dataframes in the format {year: dataframe}
# outfilename: name of the csv that data will be written to
# write_csv concatenates the dataframe from each year together
# Data processing function
def write_csv(data, outfilename):
full_df = pd.DataFrame()
for curr_year, one_year_df in data.items():
one_year_df['year'] = curr_year # add year column
full_df = pd.concat(
[full_df, one_year_df],
ignore_index=True) # concatenate year onto larger dataframe
one_year_df['year'] = curr_year
full_df = pd.concat([full_df, one_year_df], ignore_index=True)

# sort by FIPS and make into dcid
# Sort by FIPS and make into dcid
full_df = full_df.rename(columns={'ID': 'FIPS'})
full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
full_df['FIPS'] = 'dcid:geoId/' + (
Expand All @@ -108,32 +83,65 @@ def write_csv(data, outfilename):


def write_tmcf(outfilename, template=None):
    """Write the TMCF template to ``outfilename``.

    The template loaded from config.json is a list of nodes, each a mapping
    of property name to value. Every node is written as one ``prop: value``
    line per entry, in the mapping's own order, and nodes are separated by a
    blank line. Calling ``str()`` on a node would write a Python dict repr,
    which is not TMCF.

    Args:
        outfilename: Path of the file to create or overwrite.
        template: Template to write. Defaults to the module-level
            TEMPLATE_MCF. A list/tuple of nodes (or a single node mapping)
            is formatted as described above; any other value is written as
            ``str(template)`` unchanged.
    """
    if template is None:
        template = TEMPLATE_MCF

    def _format_node(node):
        # Mapping nodes become "prop: value" lines; anything else is assumed
        # to be pre-formatted text.
        if isinstance(node, dict):
            return '\n'.join(f'{prop}: {val}' for prop, val in node.items())
        return str(node)

    if isinstance(template, dict):
        template = [template]

    if isinstance(template, (list, tuple)):
        template_content = '\n\n'.join(
            _format_node(node) for node in template) + '\n'
    else:
        template_content = str(template)

    with open(outfilename, 'w', encoding='utf-8') as f_out:
        f_out.write(template_content)


if __name__ == '__main__':
def main(_):
    """Download each configured year, normalize its columns, write outputs.

    For every year in YEARS the source file is fetched (zipped or plain CSV,
    depending on ZIP_FILENAMES), only the columns listed in
    CSV_COLUMNS_BY_YEAR are read, and they are renamed to the normalized
    column names. The per-year frames are then handed to write_csv, and the
    TMCF is written with write_tmcf.

    Args:
        _: Positional argument supplied by ``app.run``; unused.
    """
    dfs = {}
    for year in YEARS:
        logger.info(f"Processing year: {year}")
        columns = CSV_COLUMNS_BY_YEAR[year]
        # A missing or null entry means the year is served as a plain CSV.
        zip_filename = ZIP_FILENAMES.get(year, None)

        url = build_url(year, zip_filename)

        logger.info(f"Requesting file: {url}")
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the download may depend on it; confirm whether
        # it is still required and remove it if not.
        response = requests.get(url, verify=False)

        if response.status_code != 200:
            logger.error(
                f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
            )
            # Nothing was loaded for this year, so skip the rename step
            # below; it would otherwise fail with KeyError on dfs[year].
            continue

        if zip_filename:
            # The archive holds the CSV under the name given in FILENAMES.
            with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
                    dfs[year] = pd.read_csv(newfile,
                                            engine='python',
                                            encoding='latin1',
                                            usecols=columns)
        else:
            dfs[year] = pd.read_csv(io.StringIO(response.text),
                                    sep=',',
                                    usecols=columns)
        logger.info(f"File downloaded and processed for {year} successfully")

        # Rename columns to match other years. 2024 is mapped onto the
        # shorter NORM_CSV_COLUMNS1 list; every other year uses
        # NORM_CSV_COLUMNS.
        target_columns = (NORM_CSV_COLUMNS1
                          if year == '2024' else NORM_CSV_COLUMNS)
        cols_renamed = dict(zip(columns, target_columns))
        dfs[year] = dfs[year].rename(columns=cols_renamed)
        logger.info(f"Columns renamed for {year} successfully")

    logger.info("Writing data to CSV")
    write_csv(dfs, 'ejscreen_airpollutants.csv')
    logger.info("Writing template to TMCF")
    write_tmcf('ejscreen.tmcf')
    logger.info("Process completed successfully")


if __name__ == '__main__':
app.run(main)
Loading