From 7cbaaad62309006700c56505fac5dea69fe555a0 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Mon, 9 Dec 2024 09:52:00 +0000 Subject: [PATCH 1/7] created new PR because old PR is failing for cla issue --- scripts/us_eia/opendata/README.md | 51 +++- scripts/us_eia/opendata/download_bulk.py | 61 ----- .../us_eia/opendata/generate_jsonl_for_bq.py | 33 +-- scripts/us_eia/opendata/manifest.json | 130 ++++++++++ scripts/us_eia/opendata/process.py | 104 ++++++++ scripts/us_eia/opendata/process/README.md | 11 +- scripts/us_eia/opendata/process/coal.py | 243 ------------------ scripts/us_eia/opendata/process/common.py | 222 ++++++++++++++-- .../us_eia/opendata/process/common_test.py | 4 +- scripts/us_eia/opendata/process/elec.py | 5 +- scripts/us_eia/opendata/process/nuclear.py | 2 +- scripts/us_eia/opendata/process/pet.py | 2 +- .../opendata/process/test_data/categories.csv | 10 +- .../process/test_data/categories.tmcf | 1 + .../opendata/process/test_data/coal.csv | 24 +- .../opendata/process/test_data/coal.tmcf | 1 + .../opendata/process/test_data/elec.csv | 20 +- .../opendata/process/test_data/elec.tmcf | 1 + .../opendata/process/test_data/intl.csv | 14 +- .../opendata/process/test_data/intl.tmcf | 1 + .../us_eia/opendata/process/test_data/ng.csv | 24 +- .../us_eia/opendata/process/test_data/ng.tmcf | 1 + .../opendata/process/test_data/nuc_status.csv | 26 +- .../process/test_data/nuc_status.tmcf | 1 + .../us_eia/opendata/process/test_data/pet.csv | 16 +- .../opendata/process/test_data/pet.tmcf | 1 + .../opendata/process/test_data/seds.csv | 10 +- .../opendata/process/test_data/seds.tmcf | 1 + .../opendata/process/test_data/total.csv | 6 +- .../opendata/process/test_data/total.tmcf | 1 + 30 files changed, 587 insertions(+), 440 deletions(-) delete mode 100644 scripts/us_eia/opendata/download_bulk.py create mode 100644 scripts/us_eia/opendata/manifest.json create mode 100644 scripts/us_eia/opendata/process.py delete mode 100644 
scripts/us_eia/opendata/process/coal.py diff --git a/scripts/us_eia/opendata/README.md b/scripts/us_eia/opendata/README.md index f61b576b76..954d48e029 100644 --- a/scripts/us_eia/opendata/README.md +++ b/scripts/us_eia/opendata/README.md @@ -6,12 +6,6 @@ Each dataset available as a Zip-file of JSONL content. See [here](https://www.eia.gov/opendata/bulkfiles.php) for more details. -To download the latest versions of ALL datasets available, run the following command. Files will be downloaded and extracted to a tmp_raw_data folder. - -```bash -python3 download_bulk.py -``` - ### Data Exploration To ease analysis of the datasets, see [`generate_jsonl_for_bq.py`](generate_jsonl_for_bq.py) for instructions to convert and import the data into BigQuery. @@ -20,11 +14,44 @@ To ease analysis of the datasets, see [`generate_jsonl_for_bq.py`](generate_json This dataset is available for public use, license is available at https://www.eia.gov/about/copyrights_reuse.php -### Import procedure -- Download data - ```bash - python3 download_bulk.py - ``` +- Run the [processor](process/README.md) + +### Downloading and Processing Data + + + If you want to perform "only download", run the below command: -- Run the [processor](process/README.md) \ No newline at end of file + python3 process.py --dataset=INTL --mode=download + python3 process.py --dataset=ELEC --mode=download + python3 process.py --dataset=PET --mode=download + python3 process.py --dataset=NG --mode=download + python3 process.py --dataset=SEDS --mode=download + python3 process.py --dataset=NUC_STATUS --mode=download + python3 process.py --dataset=TOTAL --mode=download + + + + If you want to perform "only process", run the below command: + + Running this command generates input_files and csv, mcf, tmcf, svg.mcf files. 
+ + python3 process.py --dataset=INTL --mode=process + python3 process.py --dataset=ELEC --mode=process + python3 process.py --dataset=PET --mode=process + python3 process.py --dataset=NG --mode=process + python3 process.py --dataset=SEDS --mode=process + python3 process.py --dataset=NUC_STATUS --mode=process + python3 process.py --dataset=TOTAL --mode=process + + To Download and process the data together, run the below command: + ```bash + python3 process.py --dataset=TOTAL + python3 process.py --dataset=INTL + python3 process.py --dataset=ELEC + python3 process.py --dataset=NG + python3 process.py --dataset=PET + python3 process.py --dataset=SEDS + python3 process.py --dataset=NUC_STATUS + + ``` \ No newline at end of file diff --git a/scripts/us_eia/opendata/download_bulk.py b/scripts/us_eia/opendata/download_bulk.py deleted file mode 100644 index 554187e9b6..0000000000 --- a/scripts/us_eia/opendata/download_bulk.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utility to download all EIA data from https://api.eia.gov/bulk/manifest.txt -Files are stored in raw_data. 
- -Run this script in this folder: -python3 download_bulk.py -""" - -import io -import zipfile - -import requests - -from absl import flags -from absl import app - -MANIFEST_URL = "https://api.eia.gov/bulk/manifest.txt" - -FLAGS = flags.FLAGS -flags.DEFINE_string('data_dir', 'tmp_raw_data', 'Data dir to download into') -flags.DEFINE_list('datasets', [], 'Datasets to download. Everything, if empty.') - - -def download_file(url: str, save_path: str): - print(f'Downloading {url} to {save_path}') - r = requests.get(url, stream=True) - z = zipfile.ZipFile(io.BytesIO(r.content)) - z.extractall(save_path) - - -def download_manifest(): - return requests.get(MANIFEST_URL).json() - - -def main(_): - assert FLAGS.data_dir - manifest_json = download_manifest() - datasets = manifest_json.get('dataset', {}) - for dataset_name in datasets: - if FLAGS.datasets and dataset_name not in FLAGS.datasets: - continue - print(dataset_name) - dataset = datasets[dataset_name] - download_file(dataset['accessURL'], f'{FLAGS.data_dir}/{dataset_name}') - - -if __name__ == '__main__': - app.run(main) diff --git a/scripts/us_eia/opendata/generate_jsonl_for_bq.py b/scripts/us_eia/opendata/generate_jsonl_for_bq.py index 27227de946..af1579277c 100644 --- a/scripts/us_eia/opendata/generate_jsonl_for_bq.py +++ b/scripts/us_eia/opendata/generate_jsonl_for_bq.py @@ -40,10 +40,9 @@ IN_DATA_PATH = 'tmp_raw_data' OUT_DATA_PATH = 'tmp_bq_import' DATASETS = [ - 'AEO.2014', 'AEO.2015', 'AEO.2016', 'AEO.2017', 'AEO.2018', 'AEO.2019', - 'AEO.2020', 'AEO.2021', 'COAL', 'EBA', 'ELEC', 'EMISS', 'IEO.2017', - 'IEO.2019', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS', 'SEDS', - 'STEO', 'TOTAL' + 'AEO.2020', 'AEO.2021', 'AEO.2022', 'AEO.2023', 'AEO.IEO2', 'COAL', 'EBA', + 'ELEC', 'EMISS', 'IEO', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS', + 'SEDS', 'STEO', 'TOTAL' ] @@ -77,17 +76,18 @@ def process_dataset(dataset, in_file_path, out_file_path): with open(out_file_path + '.series.jsonl', 'w+') as series_fp: 
with open(out_file_path + '.categories.jsonl', 'w+') as category_fp: for line in data_fp: - data = json.loads(line) - series_id = data.get('series_id', None) - if series_id: - jsonl = extract_series_to_jsonl(line, dataset) - series_fp.write(json.dumps(jsonl)) - series_fp.write('\n') - category_id = data.get('category_id', None) - if category_id: - jsonl = extract_category_to_jsonl(line, dataset) - category_fp.write(json.dumps(jsonl)) - category_fp.write('\n') + if line.startswith('{'): + data = json.loads(line) + series_id = data.get('series_id', None) + if series_id: + jsonl = extract_series_to_jsonl(line, dataset) + series_fp.write(json.dumps(jsonl)) + series_fp.write('\n') + category_id = data.get('category_id', None) + if category_id: + jsonl = extract_category_to_jsonl(line, dataset) + category_fp.write(json.dumps(jsonl)) + category_fp.write('\n') def process_single(subdir, file): @@ -103,7 +103,8 @@ def process_all(): for file in sorted(files): if not file.endswith('.txt'): continue - print(f'Processing {subdir}/{file}') + print(f'Processing1 {subdir}/{file}') + process_single(subdir, file) diff --git a/scripts/us_eia/opendata/manifest.json b/scripts/us_eia/opendata/manifest.json new file mode 100644 index 0000000000..c0dbc85d6b --- /dev/null +++ b/scripts/us_eia/opendata/manifest.json @@ -0,0 +1,130 @@ +{ + "import_specifications": [ + { + "import_name": "EIA_Electricity", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "Electricity dataset has country, state-level and plant-level information on electricity generation, consumption, sales etc by energy source and “sectors” (like residential, commercial, etc.).", + "scripts": [ + "process.py --dataset=ELEC" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/ELEC/ELEC.tmcf", + "cleaned_csv": "tmp_raw_data/ELEC/ELEC.csv" + } + ], + "cron_schedule": "0 1 1 * *" + }, + { + "import_name": 
"EIA_NaturalGas", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "Natural gas dataset has country and state-level data.", + "scripts": [ + "process.py --dataset=NG" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/NG/NG.tmcf", + "cleaned_csv": "tmp_raw_data/NG/NG.csv" + } + ], + "cron_schedule": "0 2 1 * *" + }, + { + "import_name": "EIA_NuclearOutages", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "Nuclear outage dataset has nuclear-plant and national data about Nuclear energy generation capacity and planned outages.", + "scripts": [ + "process.py --dataset=NUC_STATUS" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/NUC_STATUS/NUC_STATUS.tmcf", + "cleaned_csv": "tmp_raw_data/NUC_STATUS/NUC_STATUS.csv" + } + ], + "cron_schedule": "0 3 1 * *" + }, + { + "import_name": "EIA_Petroleum", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "EIA Petroleum dataset has country and state-level data.", + "scripts": [ + "process.py --dataset=PET" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/PET/PET.tmcf", + "cleaned_csv": "tmp_raw_data/PET/PET.csv" + } + ], + "cron_schedule": "0 4 1 * *" + }, + { + "import_name": "EIA_International", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "EIA International Energy dataset has country, continent and world-level data.", + "scripts": [ + "process.py --dataset=INTL" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/INTL/INTL.tmcf", + "cleaned_csv": "tmp_raw_data/INTL/INTL.csv" + } + ], + "cron_schedule": "0 5 1 * *" + }, + { + "import_name": "EIA_SEDS", + 
"curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "EIA SEDS International Energy dataset has US country-level and state-level data.", + "scripts": [ + "process.py --dataset=SEDS" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/SEDS/SEDS.tmcf", + "cleaned_csv": "tmp_raw_data/SEDS/SEDS.csv" + } + ], + "cron_schedule": "0 6 1 * *" + }, + { + "import_name": "EIA_TotalEnergy", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0", + "provenance_description": "Total Energy dataset has US country-level data.", + "scripts": [ + "process.py --dataset=TOTAL" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/TOTAL/TOTAL.tmcf", + "cleaned_csv": "tmp_raw_data/TOTAL/TOTAL.csv" + } + ], + "cron_schedule": "20 6 1 * *" + } + ] +} \ No newline at end of file diff --git a/scripts/us_eia/opendata/process.py b/scripts/us_eia/opendata/process.py new file mode 100644 index 0000000000..feeac4ebce --- /dev/null +++ b/scripts/us_eia/opendata/process.py @@ -0,0 +1,104 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utility to download all EIA data from https://api.eia.gov/bulk/manifest.txt +Files are stored in raw_data. 
+ +Run this script in this folder: +python3 process.py --dataset=INTL --mode=download + +Replace `INTL` with any of the other dataset codes +""" + +import io +import os +import sys +import zipfile +import requests + +from absl import flags +from absl import app +from absl import logging + +from process import common, elec, intl, ng, nuclear, pet, seds, total + +MANIFEST_URL = "https://api.eia.gov/bulk/manifest.txt" + +FLAGS = flags.FLAGS +flags.DEFINE_string('data_dir', 'tmp_raw_data', 'Data dir to download into') +flags.DEFINE_string('dataset', '', + 'Datasets to download. Everything, if empty.') +flags.DEFINE_string('mode', '', 'Options: download or process') + +## Value: (name, extract_fn, schema_fn) +_DATASETS = { + 'ELEC': ('Electricity', elec.extract_place_statvar, + elec.generate_statvar_schema), + 'INTL': ('Energy Overview (INTL)', intl.extract_place_statvar, None), + 'PET': ('Petroleum', pet.extract_place_statvar, None), + 'NG': ('Natural Gas', ng.extract_place_statvar, None), + 'NUC_STATUS': ('Nuclear Outages', nuclear.extract_place_statvar, + nuclear.generate_statvar_schema), + 'SEDS': ('Consumption, Production, Prices and Expenditure (SEDS)', + seds.extract_place_statvar, None), + 'TOTAL': ('Energy Overview (TOTAL)', total.extract_place_statvar, None) +} + + +def download_file(url: str, save_path: str): + try: + r = requests.get(url, stream=True) + z = zipfile.ZipFile(io.BytesIO(r.content)) + z.extractall(save_path) + except Exception as e: + logging.fatal(f"error while downloading the file,{url} -{e}") + + +def download_manifest(): + try: + return requests.get(MANIFEST_URL).json() + except Exception as e: + logging.fatal( + f"error while downloading the manifest,{MANIFEST_URL} -{e}") + + +def main(_): + mode = FLAGS.mode + assert FLAGS.data_dir + manifest_json = download_manifest() + datasets = manifest_json.get('dataset', {}) + for dataset_name in datasets: + if FLAGS.dataset and dataset_name not in FLAGS.dataset: + continue + dataset = 
datasets[dataset_name] + if mode == "" or mode == "download": + download_file(dataset['accessURL'], + f'{FLAGS.data_dir}/{dataset_name}') + if mode == "" or mode == "process": + file_prefix = os.path.join(f'{FLAGS.data_dir}/{dataset_name}', + FLAGS.dataset) + common.process( + dataset=FLAGS.dataset, + dataset_name=_DATASETS[FLAGS.dataset], + in_json=file_prefix + '.txt', + out_csv=file_prefix + '.csv', + out_sv_mcf=file_prefix + '.mcf', + out_svg_mcf=file_prefix + '.svg.mcf', + out_tmcf=file_prefix + '.tmcf', + extract_place_statvar_fn=_DATASETS[FLAGS.dataset][1], + generate_statvar_schema_fn=_DATASETS[FLAGS.dataset][2]) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_eia/opendata/process/README.md b/scripts/us_eia/opendata/process/README.md index 1e3a543ace..ad5c8d8bbe 100644 --- a/scripts/us_eia/opendata/process/README.md +++ b/scripts/us_eia/opendata/process/README.md @@ -58,16 +58,21 @@ takes a raw stat-var and generates a fully defined stat-var for it. Download and unzip the data files based on the [manifest](https://api.eia.gov/bulk/manifest.txt) by running the -[`download_bulk.py`](https://github.com/datacommonsorg/data/blob/master/scripts/us_eia/opendata/download_bulk.py) +[`python3 process.py --dataset=TOTAL`](https://github.com/datacommonsorg/data/blob/master/scripts/us_eia/opendata/process.py) script. To generate CSV, TMCF and stat-var MCF for a supported dataset: ```bash -python3 main.py --data_dir=tmp_raw_data/ELEC --dataset=ELEC +python3 process.py --dataset=INTL --mode=process + python3 process.py --dataset=ELEC --mode=process + python3 process.py --dataset=PET --mode=process + python3 process.py --dataset=NG --mode=process + python3 process.py --dataset=SEDS --mode=process + python3 process.py --dataset=NUC_STATUS --mode=process + python3 process.py --dataset=TOTAL --mode=process ``` -Replace `ELEC` with any of the other dataset codes listed above. 
To run tests: diff --git a/scripts/us_eia/opendata/process/coal.py b/scripts/us_eia/opendata/process/coal.py deleted file mode 100644 index 6b11850b83..0000000000 --- a/scripts/us_eia/opendata/process/coal.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""EIA Coal Dataset specific functions.""" - -import logging -import re - -from . import common - - -def extract_place_statvar(series_id, counters): - """Given the series_id, extract the raw place and stat-var ID. - - Args: - series_id: EIA series ID - counters: map for updating error statistics - - Returns a (place, raw-stat-var, is_us_place) tuple. - """ - # Pattern #1: COAL.{Measure}.{Region}-{Code}.{Period} - # Region could include 3-leter codes (e.g. MAT - Middle Atlantic) - m = re.match(r"^COAL\.([^._]+_?[^._]+)\.([A-Z]+)-([0-9]+)\.([AQM])$", - series_id) - if m: - measure = m.group(1) - place = m.group(2) - code = m.group(3) - period = m.group(4) - sv_id = f'COAL.{measure}.{code}.{period}' - return (place, sv_id, True) - - # TODO: Ignore this pattern until we have a way to model multi-location SV's. 
- # Pattern #2: COAL.{EXPORT|IMPORT}_{Measure}.{Type}-{CountryIso}-{UsPortIso}.{Period} - # Pattern #3: COAL.{SHIPMENT}_{Submeasure}.{Source}-{Destination}-{Material}.{Period} - # m = re.match(r"^COAL\.([A-Z]+)_([A-Z]+)\.([^-]+)-([^-]+)-([^.]+)\.([AQM])$", - # series_id) - # if m: - # return (None, None, None) - # activity = m.group(1) - # measure = m.group(2) - # if activity in ['EXPORT', 'IMPORT']: - # # Pattern #2 - # # TODO: model destination / source port as well - # type = m.group(3) - # place = m.group(4) - # port = m.group(5) - # period = m.group(6) - # sv_id = f'COAL.{activity}_{measure}.{type}.{period}' - # return (place, sv_id, False) - # elif activity == 'SHIPMENT': - # # Pattern #3 - # source = m.group(3) - # if source.isalpha(): # could include 3-letter region codes - # destination_power_plant = m.group(4) - # material = m.group(5) - # period = m.group(6) - # sv_id = f'COAL.SHIPMENT_{measure}.{material}.{period}' - # return (source, sv_id, True) - # else: - # # TODO: Handle remaining places - coal mines - # counters[f'error_unknown_coal_mine SHIPMENT '] += 1 - # return (None, None, None) - # else: - # counters[f'unknown #2,3 activity ({activity})'] += 1 - # return (None, None, None) - - # Pattern #4: COAL.PROD_DIST_STOCKS.TOT-{Place}.{Period} - # Pattern #4: COAL.PRICE_BY_RANK.{Region}-{Material}.{Period} - # Pattern #4: COAL.SHIP_{MINE|PLANT}_{ASH|HEAT|PRICE|QTY|SULFUR}.{Region}-{Material}.{Period} - m = re.match(r"^COAL\.([A-Z]+_[A-Z]+_[A-Z]+)\.([^-]+)-([^.]+)\.([AQM])$", - series_id) - if m: - measure = m.group(1) - if measure.startswith("SHIP"): - # TODO: model destination / source port as well - return (None, None, None) - if measure == "PROD_DIST_STOCKS": - assert m.group(2) == "TOT" - place = m.group(3) - period = m.group(4) - return (place, f'COAL.PROD_DIST_STOCKS.TOT.{period}', True) - else: - place = m.group(2) - material = m.group(3) - period = m.group(4) - return (place, f'COAL.{measure}.{material}.{period}', True) - - return (None, None, 
None) - - -## -## Maps for Schema - more definitions at https://www.eia.gov/coal/data/browser/data/termsAndDefs.php?rseAvailable=false&showFilterValues=true&showDetail=true&showTransportationMode=true&showPrimeMovers=true&showPlantFuelTypes=true&showMineType=true&showMineStatus=true&topic=26 -## - -# Each value is a list where first entry is StatVar ID component, and the rest -# are StatVar PVs. -### Make sure each constraint is added to SV name -_MEASURE_MAP = { - 'ASH_CONTENT': [ - 'Average_AshContent_Coal_For', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:ashContent', - 'statType: dcs:meanValue', - ], - 'CONS_TOT': [ - 'Consumption_Coal', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:consumption', - 'statType: dcs:measuredValue', - ], - 'COST': [ - 'Average_Cost_Coal', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:cost', - 'statType: dcs:meanValue', - ], - 'HEAT_CONTENT': [ - 'Average_HeatContent_Coal_For', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:heatContent', - 'statType: dcs:meanValue', - ], - 'RECEIPTS': [ - 'Receipt_Coal', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:receipt', - 'statType: dcs:measuredValue', - ], - 'STOCKS': [ - 'Stock_Coal', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:stock', - 'statType: dcs:measuredValue', - ], - 'SULFUR_CONTENT': [ - 'Average_SulfurContent_Coal_For', - 'populationType: dcs:Coal', - 'measuredProperty: dcs:sulfurContent', - 'statType: dcs:meanValue', - ], -} - -_CONSUMING_SECTOR = { - '1': 'ElectricUtility', - '2': 'ElectricUtilityNonCogen', - '3': 'ElectricUtilityCogen', - '8': 'CommercialAndInstitutional', - '9': 'CokePlants', - '10': 'OtherIndustrial', - '94': 'IndependentPowerProducers', - '98': 'ElectricPower', -} - -_UNIT_MAP = { - 'ASH_CONTENT': ('', '100'), - 'HEAT_CONTENT': ('BtuPerPound', ''), - 'SULFUR_CONTENT': ('', '100'), - 'CONS_TOT': ('ShortTon', ''), - 'RECEIPTS': ('ShortTon', ''), - 'STOCKS': ('ShortTon', ''), - 'COST': 
('USDollarPerShortTon', ''), -} - - -def generate_statvar_schema(raw_sv, rows, sv_map, counters): - """Generate StatVar with full schema. - - Args: - raw_sv: Raw stat-var returned by extract_place_statvar() - rows: List of dicts corresponding to CSV row. See common._COLUMNS. - sv_map: Map from stat-var to its MCF content. - counters: Map updated with error statistics. - - Returns schema-ful stat-var ID if schema was generated, None otherwise. - """ - counters['generate_statvar_schema'] += 1 - - # COAL.{Measure}.{ConsumingSector}.{Period} - m = re.match(r"^COAL\.([^._]+_?[^._]+)\.([0-9]+)\.([AQM])$", raw_sv) - if m: - measure = m.group(1) - consuming_sector = m.group(2) - period = m.group(3) - else: - counters['error_unparsable_raw_statvar'] += 1 - return None - counters[f'measure-{measure}'] += 1 - - # Get popType and mprop based on measure. - measure_pvs = _MEASURE_MAP.get(measure, None) - if not measure_pvs: - counters[f'error_missing_measure-{measure}'] += 1 - return None - - sv_id_parts = [common.PERIOD_MAP[period], measure_pvs[0]] - sv_pvs = measure_pvs[1:] + [ - 'typeOf: dcs:StatisticalVariable', - # TODO(shanth): use new property in next iteration - f'measurementQualifier: dcs:{common.PERIOD_MAP[period]}', - ] - - if consuming_sector: - cs = _CONSUMING_SECTOR.get(consuming_sector, None) - if not cs: - counters[f'error_missing_consuming_sector-{consumingSector}'] += 1 - return None - sv_id_parts.append(cs) - sv_pvs.append(f'consumingSector: dcs:{cs}') - - if measure not in _UNIT_MAP: - counters[f'error_missing_unit-{measure}'] += 1 - return None - (unit, sfactor) = _UNIT_MAP[measure] - - sv_id = '_'.join(sv_id_parts) - - # Update the rows with new StatVar ID value and additional properties. - for row in rows: - row['stat_var'] = f'dcid:{sv_id}' - if unit: - row['unit'] = f'dcid:{unit}' - else: - # Reset unit to empty to clear the raw unit value. 
- row['unit'] = '' - if sfactor: - row['scaling_factor'] = sfactor - - if sv_id not in sv_map: - node = f'Node: dcid:{sv_id}' - sv_map[sv_id] = '\n'.join([node] + sv_pvs) - - return sv_id diff --git a/scripts/us_eia/opendata/process/common.py b/scripts/us_eia/opendata/process/common.py index 6624da1f2f..317d223d83 100644 --- a/scripts/us_eia/opendata/process/common.py +++ b/scripts/us_eia/opendata/process/common.py @@ -12,16 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. """Process EIA datasets to produce TMCF and CSV.""" - +import os +import sys import csv import json -import logging import re +from absl import logging from collections import defaultdict from sys import path -# For import util.alpha2_to_dcid -path.insert(1, '../../../../') +## For import util.alpha2_to_dcid +## Setup path for import from data/util + +_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(1, os.path.join(_MODULE_DIR, '../../../../')) import util.alpha2_to_dcid as alpha2_to_dcid import util.name_to_alpha2 as name_to_alpha2 @@ -34,9 +38,153 @@ 'Q': 'Quarterly', } +MMETHOD_MAPPING_DICT = { + # input source unit wise mapping to measurmentMethod + 'Index1982-1984=100': 'BasePeriod1982_1984', + '2017=1.00000': 'BaseYear2017', + 'Real(1982-1984)CentsPerKilowatthour': 'BasePeriod1982_1984', + 'Real(1982-1984)DollarsPerGallon': 'BasePeriod1982_1984', + 'Real(1982-1984)DollarsPerMillionBtu': 'BasePeriod1982_1984', + 'Real(1982-1984)DollarsPerThousandCubicFeet': 'BasePeriod1982_1984', + 'ThousandBtuPerChained(2017)Dollar': 'BasePeriod2017', + 'BillionChained(2017)Dollars': 'BasePeriod2017', + 'MetricTonsCarbonDioxidePerMillionChained(2017)Dollars': 'BasePeriod2017' +} + +UNIT_MAPPING_DICT = { + # input source unit : DC unit + 'Days': + 'Day', + 'ThousandsOfRegisteredVehicles': + '', + 'RegisteredVehicle': + '', + 'NumberOfDays': + 'Day', + 'Dollars': + 'USDollar', + 'MillionBarrels': + 'MillionsBarrels', + 
'ThousandBarrels': + 'Barrel', + 'ThousandDollars': + 'USDollar', + '1000MetricTons': + 'ThousandMetricTons', + 'BillionKilowatthours': + 'BillionKilowattHours', + 'Terajoules': + 'Terajoule', + 'DollarsPerMillionBtu': + 'USDollarPerMillionBtu', + 'DollarsPerThousandCubicFeet': + 'USDollarPerThousandCubicFeet', + 'CentsPerKilowatthour': + 'CentsPerKilowattHour', + 'MillionKilowatthours': + 'MillionKilowattHours', + 'DollarsPerGallon': + 'USDollarPerGallon', + 'Kilowatthours': + 'KilowattHour', + 'Barrels': + 'Barrel', + 'MillionDollars': + 'USDollar', + 'BillionDollars': + 'USDollar', + 'DollarsPerPoundUraniumOxide': + 'USDollarPerPoundUraniumOxide', + 'ThousandKilowatts': + 'Kilowatt', + 'DollarsPerBarrel': + 'USDollarPerBarrel', + 'NumberOfCustomers': + '', + 'NumberOfElements': + '', + 'Thousand': + "", + 'ThousandGallons': + 'USGallon', + 'MillionPounds': + 'GBP', + 'DollarsPerFoot': + 'USDollarPerFoot', + 'ThousandDollarsPerWell': + 'ThousandUSDollarsPerWell', + 'ThousandFeet': + 'Foot', + 'FeetPerWell': + 'Foot', + 'Cost': + 'USDollar', + 'Index1982-1984=100': + '', + '2017=1.00000': + '', + 'NumberOfRigs': + '', + 'Number': + '', + 'Real(1982-1984)DollarsPerGallon': + 'USDollarPerGallon', + 'Real(1982-1984)DollarsPerMillionBtu': + 'USDollarPerMillionBtu', + 'DollarsPerMillionBtu': + 'USDollarPerMillionBtu', + 'Real(1982-1984)CentsPerKilowatthour': + 'USCentPerKilowattHour', + 'Real(1982-1984)DollarsPerThousandCubicFeet': + 'USDollarPerThousandCubicFeet', + 'MetricTonsCarbonDioxidePerMillionChained(2017)Dollars': + 'MetricTonsCarbonDioxidePerMillionChainedUSDollars', + 'ThousandBtuPerChained(2017)Dollar': + 'BtuPerChainedUSDollar', + 'BillionChained(2017)Dollars': + 'ChainedUSDollar', + 'CentsPerKilowatthour,IncludingTaxes': + 'CentsPerKilowattHour', + 'TrillionBtu': + 'Btu', + 'MillionGallons': + 'USGallon', + 'MillionPeople': + '', + 'MillionNominalDollars': + 'NominalUSDollar', + 'NominalDollars': + 'NominalUSDollar', + 'DollarsPerGallonIncludingTaxes': + 
'USDollarPerGallon', + 'DollarsPerGallonExcludingTaxes': + 'USDollarPerGallon', + 'DollarsPerMillionBtu,IncludingTaxes': + 'USDollarPerMillionBtu' +} + +UNIT_CONVERT_DICT = { + 'ThousandCubicFeet': 1000, + 'ThousandBtuPerChained(2017)Dollar': 1000, + 'Thousand': 1000, + 'ThousandFeet': 1000, + 'ThousandDollars': 1000, + 'ThousandGallons': 1000, + 'ThousandBarrels': 1000, + 'ThousandsOfRegisteredVehicles': 1000, + 'MillionDollars': 1000000, + 'MillionPeople': 1000000, + 'MillionNominalDollars': 1000000, + 'MillionGallons': 1000000, + 'ThousandKilowatts': 1000, + 'MillionPounds': 1000000, + 'BillionDollars': 10000000000, + 'BillionChained(2017)Dollars': 10000000000, + 'TrillionBtu': 1000000000000 +} _COLUMNS = [ 'place', 'stat_var', 'date', 'value', 'unit', 'scaling_factor', - 'eia_series_id' + 'eia_series_id', 'measurementMethod' ] _TMCF_STRING = """ @@ -49,6 +197,7 @@ unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod """ _DATE_RE = re.compile('[0-9WMQ]') @@ -82,11 +231,13 @@ def _parse_date(d): m_or_q = d[4:] if m_or_q.startswith('Q'): + #print("withQ",yr + '-' + _QUARTER_MAP[m_or_q]) # Quarterly if m_or_q in _QUARTER_MAP: return yr + '-' + _QUARTER_MAP[m_or_q] else: # Monthly + #print("withOutQ",yr + '-' + m_or_q) return yr + '-' + m_or_q if len(d) == 8: @@ -106,6 +257,26 @@ def _sv_dcid(raw_sv): return 'eia/' + raw_sv +def _check_unit_with_mapping(in_str): + if in_str in UNIT_MAPPING_DICT: + in_str = UNIT_MAPPING_DICT[in_str] + return in_str + + +def _check_mMethod_with_mapping(in_str): + if in_str in MMETHOD_MAPPING_DICT: + in_str = MMETHOD_MAPPING_DICT[in_str] + else: + in_str = "" + return in_str + + +def _unitConvert(unit, value): + if unit in UNIT_CONVERT_DICT: + value = float(value) * UNIT_CONVERT_DICT[unit] + return value + + def _enumify(in_str): return in_str.title().replace(' ', '') @@ -146,7 +317,6 @@ def _find_dc_place(raw_place, is_us_place, 
counters): if raw_place == 'WORL': return 'Earth' - # logging.error('ERROR: unsupported place %s %r', raw_place, is_us_place) counters[f'error_unsupported_places_{raw_place}'] += 1 return None @@ -223,21 +393,24 @@ def _maybe_parse_name(name, raw_place, is_us_place, counters): def _generate_sv_nodes(dataset, sv_map, sv_name_map, sv_membership_map, sv_schemaful2raw, svg_info): nodes = [] - for sv, mcf in sv_map.items(): - raw_sv = sv_schemaful2raw[sv] if sv in sv_schemaful2raw else sv + try: + for sv, mcf in sv_map.items(): + raw_sv = sv_schemaful2raw[sv] if sv in sv_schemaful2raw else sv - pvs = [mcf] - if raw_sv in sv_name_map: - pvs.append(f'name: "{sv_name_map[raw_sv]}"') + pvs = [mcf] + if raw_sv in sv_name_map: + pvs.append(f'name: "{sv_name_map[raw_sv]}"') - if dataset == 'NUC_STATUS': - pvs.append(f'memberOf: dcid:{category.NUC_STATUS_ROOT}') - if raw_sv in sv_membership_map: - for svg in sorted(sv_membership_map[raw_sv]): - if svg in svg_info: - pvs.append(f'memberOf: dcid:{svg}') + if dataset == 'NUC_STATUS': + pvs.append(f'memberOf: dcid:{category.NUC_STATUS_ROOT}') + if raw_sv in sv_membership_map: + for svg in sorted(sv_membership_map[raw_sv]): + if svg in svg_info: + pvs.append(f'memberOf: dcid:{svg}') - nodes.append('\n'.join(pvs)) + nodes.append('\n'.join(pvs)) + except Exception as e: + logging.fatal(f"error while generating the SV nodes,{sv_name_map} -{e}") return nodes @@ -294,7 +467,8 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, counters['info_lines_processed'] += 1 if counters['info_lines_processed'] % 100000 == 99999: _print_counters(counters) - + if not line.startswith('{'): + continue data = json.loads(line) # Preliminary checks @@ -325,6 +499,8 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, continue raw_unit = _enumify(data.get('units', '')) + dc_unit = _check_unit_with_mapping(raw_unit) + m_method = _check_mMethod_with_mapping(raw_unit) if raw_sv not in sv_name_map: name 
= _maybe_parse_name(data.get('name', ''), raw_place, @@ -365,9 +541,10 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, 'place': f"dcid:{dc_place}", 'stat_var': f"dcid:{_sv_dcid(raw_sv)}", 'date': dt, - 'value': v, + 'value': _unitConvert(raw_unit, v), 'eia_series_id': series_id, - 'unit': raw_unit, + 'unit': dc_unit, + 'measurementMethod': m_method }) if not rows: @@ -407,5 +584,4 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, with open(out_tmcf, 'w') as out_fp: out_fp.write(_TMCF_STRING) - print('=== FINAL COUNTERS ===') - _print_counters(counters) + logging.info(f"FINAL COUNTERS {_print_counters(counters)}") diff --git a/scripts/us_eia/opendata/process/common_test.py b/scripts/us_eia/opendata/process/common_test.py index 042927a6af..6e39677255 100644 --- a/scripts/us_eia/opendata/process/common_test.py +++ b/scripts/us_eia/opendata/process/common_test.py @@ -24,7 +24,7 @@ os.path.dirname( os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) -from us_eia.opendata.process import coal, common, elec, intl, ng, nuclear, pet, seds, total +from us_eia.opendata.process import common, elec, intl, ng, nuclear, pet, seds, total # module_dir_ is the path to where this test is running from. 
module_dir_ = os.path.dirname(__file__) @@ -32,8 +32,6 @@ _TEST_CASES = [ # dataset-code, dataset-name, test-case-filename, # extract-fn, schema-fn - ('COAL', 'Coal', 'coal', coal.extract_place_statvar, - coal.generate_statvar_schema), ('ELEC', 'Electricity', 'elec', elec.extract_place_statvar, elec.generate_statvar_schema), ('INTL', 'Internationa', 'intl', intl.extract_place_statvar, None), diff --git a/scripts/us_eia/opendata/process/elec.py b/scripts/us_eia/opendata/process/elec.py index fade060fd3..5b2da7fb0c 100644 --- a/scripts/us_eia/opendata/process/elec.py +++ b/scripts/us_eia/opendata/process/elec.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ def extract_place_statvar(series_id, counters): # ELEC.{MEASURE}.{FUEL_TYPE}-{PLACE}-{PRODUCER_SECTOR}.{PERIOD} m = re.match(r"^ELEC\.([^.]+)\.([^-]+)-([^-]+)-([^.]+)\.([AQM])$", series_id) + if m: measure = m.group(1) fuel_type = m.group(2) @@ -222,7 +223,7 @@ def extract_place_statvar(series_id, counters): 'CONS_EG': (_PLACEHOLDER_FUEL_UNIT, '', 1000), 'CONS_EG_BTU': ('MMBtu', '', 1000000), 'COST': (_PLACEHOLDER_FUEL_UNIT, '', 1), - 'COST_BTU': ('MMBtu', '', 1), + 'COST_BTU': ('USDollarPerMMBtu', '', 1), 'CUSTOMERS': ('', '', 1), 'GEN': ('GigawattHour', '', 1), 'PRICE': ('USCentPerKilowattHour', '', 1), diff --git a/scripts/us_eia/opendata/process/nuclear.py b/scripts/us_eia/opendata/process/nuclear.py index c21d253fe9..6bb90e0326 100644 --- a/scripts/us_eia/opendata/process/nuclear.py +++ b/scripts/us_eia/opendata/process/nuclear.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/scripts/us_eia/opendata/process/pet.py b/scripts/us_eia/opendata/process/pet.py index b795fb4e1c..9740a7bda7 100644 --- a/scripts/us_eia/opendata/process/pet.py +++ b/scripts/us_eia/opendata/process/pet.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/us_eia/opendata/process/test_data/categories.csv b/scripts/us_eia/opendata/process/test_data/categories.csv index 2678990eda..11d61d34c0 100644 --- a/scripts/us_eia/opendata/process/test_data/categories.csv +++ b/scripts/us_eia/opendata/process/test_data/categories.csv @@ -1,5 +1,5 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:country/USA,dcid:eia/NG.N9140_2.A,2020,30482049,MillionCubicFeet,,NG.N9140US2.A -dcid:country/USA,dcid:eia/NG.N9140_2.A,2019,31099061,MillionCubicFeet,,NG.N9140US2.A -dcid:country/USA,dcid:eia/NG.N9140_2.M,2021-02,3036972,MillionCubicFeet,,NG.N9140US2.M -dcid:country/USA,dcid:eia/NG.N9140_2.M,2021-01,3286266,MillionCubicFeet,,NG.N9140US2.M +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:country/USA,dcid:eia/NG.N9140_2.A,2020,30482049,MillionCubicFeet,,NG.N9140US2.A, +dcid:country/USA,dcid:eia/NG.N9140_2.A,2019,31099061,MillionCubicFeet,,NG.N9140US2.A, +dcid:country/USA,dcid:eia/NG.N9140_2.M,2021-02,3036972,MillionCubicFeet,,NG.N9140US2.M, +dcid:country/USA,dcid:eia/NG.N9140_2.M,2021-01,3286266,MillionCubicFeet,,NG.N9140US2.M, diff --git a/scripts/us_eia/opendata/process/test_data/categories.tmcf b/scripts/us_eia/opendata/process/test_data/categories.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/categories.tmcf +++ b/scripts/us_eia/opendata/process/test_data/categories.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: 
C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/coal.csv b/scripts/us_eia/opendata/process/test_data/coal.csv index 6b31796c42..c29aa9febf 100644 --- a/scripts/us_eia/opendata/process/test_data/coal.csv +++ b/scripts/us_eia/opendata/process/test_data/coal.csv @@ -1,12 +1,12 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:geoId/01,dcid:Quarterly_Average_AshContent_Coal_For_ElectricUtility,2020-12,6.744021229492053,,100,COAL.ASH_CONTENT.AL-1.Q -dcid:geoId/01,dcid:Quarterly_Average_AshContent_Coal_For_ElectricUtility,2020-09,6.767757786979022,,100,COAL.ASH_CONTENT.AL-1.Q -dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2002,0,,100,COAL.ASH_CONTENT.KY-8.A -dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2001,0,,100,COAL.ASH_CONTENT.KY-8.A -dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2000,0,,100,COAL.ASH_CONTENT.KY-8.A -dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-09,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q -dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-06,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q -dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-03,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q -dcid:geoId/36,dcid:Annual_Receipt_Coal_ElectricUtilityNonCogen,2008,8236048,dcid:ShortTon,,COAL.RECEIPTS.NY-2.A -dcid:geoId/12,dcid:Quarterly_Stock_Coal_ElectricUtility,2008-03,4067084,dcid:ShortTon,,COAL.STOCKS.FL-1.Q -dcid:geoId/46,dcid:Annual_Average_SulfurContent_Coal_For_ElectricUtility,2008,0.31,,100,COAL.SULFUR_CONTENT.SD-1.A +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:geoId/01,dcid:Quarterly_Average_AshContent_Coal_For_ElectricUtility,2020-12,6.744021229492053,,100,COAL.ASH_CONTENT.AL-1.Q, 
+dcid:geoId/01,dcid:Quarterly_Average_AshContent_Coal_For_ElectricUtility,2020-09,6.767757786979022,,100,COAL.ASH_CONTENT.AL-1.Q, +dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2002,0,,100,COAL.ASH_CONTENT.KY-8.A, +dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2001,0,,100,COAL.ASH_CONTENT.KY-8.A, +dcid:geoId/21,dcid:Annual_Average_AshContent_Coal_For_CommercialAndInstitutional,2000,0,,100,COAL.ASH_CONTENT.KY-8.A, +dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-09,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q, +dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-06,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q, +dcid:geoId/17,dcid:Quarterly_Average_HeatContent_Coal_For_CommercialAndInstitutional,2000-03,0,dcid:BtuPerPound,,COAL.HEAT_CONTENT.IL-8.Q, +dcid:geoId/36,dcid:Annual_Receipt_Coal_ElectricUtilityNonCogen,2008,8236048,dcid:ShortTon,,COAL.RECEIPTS.NY-2.A, +dcid:geoId/12,dcid:Quarterly_Stock_Coal_ElectricUtility,2008-03,4067084,dcid:ShortTon,,COAL.STOCKS.FL-1.Q, +dcid:geoId/46,dcid:Annual_Average_SulfurContent_Coal_For_ElectricUtility,2008,0.31,,100,COAL.SULFUR_CONTENT.SD-1.A, diff --git a/scripts/us_eia/opendata/process/test_data/coal.tmcf b/scripts/us_eia/opendata/process/test_data/coal.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/coal.tmcf +++ b/scripts/us_eia/opendata/process/test_data/coal.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/elec.csv b/scripts/us_eia/opendata/process/test_data/elec.csv index 55071994a6..47c39f5acd 100644 --- a/scripts/us_eia/opendata/process/test_data/elec.csv +++ b/scripts/us_eia/opendata/process/test_data/elec.csv @@ -1,10 +1,10 @@ 
-place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-02,33.52617,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M -dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-01,33.77782,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M -dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-03,0.0,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M -dcid:geoId/25,dcid:Quarterly_RetailSales_Electricity_Residential,2021-03,1809.70299,dcid:GigawattHour,,ELEC.SALES.MA-RES.Q -dcid:geoId/25,dcid:Quarterly_RetailSales_Electricity_Residential,2021-06,1956.15091,dcid:GigawattHour,,ELEC.SALES.MA-RES.Q -dcid:geoId/05,dcid:Quarterly_Consumption_Fuel_ForElectricityGeneration_Coal_ElectricUtilityNonCogen,2021-06,6586120.0,dcid:MMBtu,,ELEC.CONS_EG_BTU.COW-AR-2.Q -dcid:geoId/05,dcid:Quarterly_Consumption_Fuel_ForElectricityGeneration_Coal_ElectricUtilityNonCogen,2021-03,10431100.0,dcid:MMBtu,,ELEC.CONS_EG_BTU.COW-AR-2.Q -dcid:geoId/06,dcid:Monthly_SalesRevenue_Electricity,2021-06,4523307770.0,dcid:USDollar,,ELEC.REV.CA-ALL.M -dcid:geoId/06,dcid:Monthly_SalesRevenue_Electricity,2021-05,3461923010.0,dcid:USDollar,,ELEC.REV.CA-ALL.M +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-02,33.52617,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M, +dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-01,33.77782,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M, +dcid:geoId/24,dcid:Monthly_Generation_Electricity_Solar_IndependentPowerProducers,2021-03,0.0,dcid:GigawattHour,,ELEC.GEN.TSN-MD-94.M, +dcid:geoId/25,dcid:Quarterly_RetailSales_Electricity_Residential,2021-03,1809.70299,dcid:GigawattHour,,ELEC.SALES.MA-RES.Q, 
+dcid:geoId/25,dcid:Quarterly_RetailSales_Electricity_Residential,2021-06,1956.15091,dcid:GigawattHour,,ELEC.SALES.MA-RES.Q, +dcid:geoId/05,dcid:Quarterly_Consumption_Fuel_ForElectricityGeneration_Coal_ElectricUtilityNonCogen,2021-06,6586120.0,dcid:MMBtu,,ELEC.CONS_EG_BTU.COW-AR-2.Q, +dcid:geoId/05,dcid:Quarterly_Consumption_Fuel_ForElectricityGeneration_Coal_ElectricUtilityNonCogen,2021-03,10431100.0,dcid:MMBtu,,ELEC.CONS_EG_BTU.COW-AR-2.Q, +dcid:geoId/06,dcid:Monthly_SalesRevenue_Electricity,2021-06,4523307770000000.0,dcid:USDollar,,ELEC.REV.CA-ALL.M, +dcid:geoId/06,dcid:Monthly_SalesRevenue_Electricity,2021-05,3461923010000000.0,dcid:USDollar,,ELEC.REV.CA-ALL.M, diff --git a/scripts/us_eia/opendata/process/test_data/elec.tmcf b/scripts/us_eia/opendata/process/test_data/elec.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/elec.tmcf +++ b/scripts/us_eia/opendata/process/test_data/elec.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/intl.csv b/scripts/us_eia/opendata/process/test_data/intl.csv index fda1cd461a..6afb7ca9f9 100644 --- a/scripts/us_eia/opendata/process/test_data/intl.csv +++ b/scripts/us_eia/opendata/process/test_data/intl.csv @@ -1,7 +1,7 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2020,924.4588369430336,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A -dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2019,986.3134487671233,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A -dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2018,1017.5592096438356,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A -dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2017,1017.3772797808219,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A 
-dcid:Earth,dcid:eia/INTL.55-1-TBPD.A,2020,91753.99016207967,ThousandBarrelsPerDay,,INTL.55-1-WORL-TBPD.A -dcid:Earth,dcid:eia/INTL.55-1-TBPD.A,2019,97993.61794135909,ThousandBarrelsPerDay,,INTL.55-1-WORL-TBPD.A +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2020,924.4588369430336,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A, +dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2019,986.3134487671233,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A, +dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2018,1017.5592096438356,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A, +dcid:country/IND,dcid:eia/INTL.53-1-TBPD.A,2017,1017.3772797808219,ThousandBarrelsPerDay,,INTL.53-1-IND-TBPD.A, +dcid:Earth,dcid:eia/INTL.55-1-TBPD.A,2020,91753.99016207967,ThousandBarrelsPerDay,,INTL.55-1-WORL-TBPD.A, +dcid:Earth,dcid:eia/INTL.55-1-TBPD.A,2019,97993.61794135909,ThousandBarrelsPerDay,,INTL.55-1-WORL-TBPD.A, diff --git a/scripts/us_eia/opendata/process/test_data/intl.tmcf b/scripts/us_eia/opendata/process/test_data/intl.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/intl.tmcf +++ b/scripts/us_eia/opendata/process/test_data/intl.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/ng.csv b/scripts/us_eia/opendata/process/test_data/ng.csv index a1722b09cf..96af54c914 100644 --- a/scripts/us_eia/opendata/process/test_data/ng.csv +++ b/scripts/us_eia/opendata/process/test_data/ng.csv @@ -1,12 +1,12 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:geoId/01,dcid:eia/NG.N3035_4.M,2021-02,26.1,Percent,,NG.N3035AL4.M -dcid:geoId/01,dcid:eia/NG.N3035_4.M,2021-01,25.2,Percent,,NG.N3035AL4.M -dcid:geoId/01,dcid:eia/NG.N3035_4.M,2020-12,24.7,Percent,,NG.N3035AL4.M 
-dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2008,6,MillionBarrels,,NG.RL2R02SOK_1.A -dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2007,-4,MillionBarrels,,NG.RL2R02SOK_1.A -dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2006,13,MillionBarrels,,NG.RL2R02SOK_1.A -dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2005,16,MillionBarrels,,NG.RL2R02SOK_1.A -dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2004,40,MillionBarrels,,NG.RL2R02SOK_1.A -dcid:country/USA,dcid:eia/NG.NA1350_2.A,2019,58084,MillionCubicFeet,,NG.NA1350_NUS_2.A -dcid:country/USA,dcid:eia/NG.NA1350_2.A,2018,9248,MillionCubicFeet,,NG.NA1350_NUS_2.A -dcid:country/USA,dcid:eia/NG.NA1350_2.A,2017,-256,MillionCubicFeet,,NG.NA1350_NUS_2.A +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:geoId/01,dcid:eia/NG.N3035_4.M,2021-02,26.1,Percent,,NG.N3035AL4.M, +dcid:geoId/01,dcid:eia/NG.N3035_4.M,2021-01,25.2,Percent,,NG.N3035AL4.M, +dcid:geoId/01,dcid:eia/NG.N3035_4.M,2020-12,24.7,Percent,,NG.N3035AL4.M, +dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2008,6,MillionsBarrels,,NG.RL2R02SOK_1.A, +dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2007,-4,MillionsBarrels,,NG.RL2R02SOK_1.A, +dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2006,13,MillionsBarrels,,NG.RL2R02SOK_1.A, +dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2005,16,MillionsBarrels,,NG.RL2R02SOK_1.A, +dcid:geoId/40,dcid:eia/NG.RL2R02_1.A,2004,40,MillionsBarrels,,NG.RL2R02SOK_1.A, +dcid:country/USA,dcid:eia/NG.NA1350_2.A,2019,58084,MillionCubicFeet,,NG.NA1350_NUS_2.A, +dcid:country/USA,dcid:eia/NG.NA1350_2.A,2018,9248,MillionCubicFeet,,NG.NA1350_NUS_2.A, +dcid:country/USA,dcid:eia/NG.NA1350_2.A,2017,-256,MillionCubicFeet,,NG.NA1350_NUS_2.A, diff --git a/scripts/us_eia/opendata/process/test_data/ng.tmcf b/scripts/us_eia/opendata/process/test_data/ng.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/ng.tmcf +++ b/scripts/us_eia/opendata/process/test_data/ng.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor 
eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/nuc_status.csv b/scripts/us_eia/opendata/process/test_data/nuc_status.csv index d1304b4d04..bd3eb5649b 100644 --- a/scripts/us_eia/opendata/process/test_data/nuc_status.csv +++ b/scripts/us_eia/opendata/process/test_data/nuc_status.csv @@ -1,13 +1,13 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2021-05-11,1197.1,dcid:Megawatt,,NUC_STATUS.CAP.4046.D -dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2021-05-10,1197.1,dcid:Megawatt,,NUC_STATUS.CAP.4046.D -dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2007-01-05,1036,dcid:Megawatt,,NUC_STATUS.CAP.4046.D -dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-11,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D -dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-10,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D -dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2007-01-01,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D -dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-11,73,,,NUC_STATUS.OUT_PCT.869-2.D -dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-10,80,,,NUC_STATUS.OUT_PCT.869-2.D -dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-09,75,,,NUC_STATUS.OUT_PCT.869-2.D -dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-11,16404.767,dcid:Megawatt,,NUC_STATUS.OUT.US.D -dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-10,16960.869,dcid:Megawatt,,NUC_STATUS.OUT.US.D -dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-09,17374.955,dcid:Megawatt,,NUC_STATUS.OUT.US.D 
+place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2021-05-11,1197.1,dcid:Megawatt,,NUC_STATUS.CAP.4046.D, +dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2021-05-10,1197.1,dcid:Megawatt,,NUC_STATUS.CAP.4046.D, +dcid:eia/pp/4046,dcid:Daily_Capacity_Nuclear_ForEnergyGeneration,2007-01-05,1036,dcid:Megawatt,,NUC_STATUS.CAP.4046.D, +dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-11,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D, +dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-10,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D, +dcid:eia/pp/621,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2007-01-01,0,dcid:Megawatt,,NUC_STATUS.OUT.621.D, +dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-11,73,,,NUC_STATUS.OUT_PCT.869-2.D, +dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-10,80,,,NUC_STATUS.OUT_PCT.869-2.D, +dcid:eia/pp/869-2,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration_AsAFractionOf_Capacity,2021-05-09,75,,,NUC_STATUS.OUT_PCT.869-2.D, +dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-11,16404.767,dcid:Megawatt,,NUC_STATUS.OUT.US.D, +dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-10,16960.869,dcid:Megawatt,,NUC_STATUS.OUT.US.D, +dcid:country/USA,dcid:Daily_CapacityOutage_Nuclear_ForEnergyGeneration,2021-05-09,17374.955,dcid:Megawatt,,NUC_STATUS.OUT.US.D, diff --git a/scripts/us_eia/opendata/process/test_data/nuc_status.tmcf b/scripts/us_eia/opendata/process/test_data/nuc_status.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/nuc_status.tmcf +++ b/scripts/us_eia/opendata/process/test_data/nuc_status.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: 
C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/pet.csv b/scripts/us_eia/opendata/process/test_data/pet.csv index 05d459da20..894f1122a8 100644 --- a/scripts/us_eia/opendata/process/test_data/pet.csv +++ b/scripts/us_eia/opendata/process/test_data/pet.csv @@ -1,8 +1,8 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:geoId/08,dcid:eia/PET.KDLVIS_1.A,2019,40989,ThousandGallons,,PET.KDLVISSCO1.A -dcid:geoId/08,dcid:eia/PET.KDLVIS_1.A,2018,37954,ThousandGallons,,PET.KDLVISSCO1.A -dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2019,3,MillionBarrels,,PET.RCRR06SLA_1.A -dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2018,6,MillionBarrels,,PET.RCRR06SLA_1.A -dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2017,29,MillionBarrels,,PET.RCRR06SLA_1.A -dcid:country/USA,dcid:eia/PET.M_EPC0_SPT_PER.W,2020-11-30,32.2,Percent,,PET.M_EPC0_SPT_NUS_PER.W -dcid:country/USA,dcid:eia/PET.M_EPC0_SPT_PER.W,2020-12-06,33.1,Percent,,PET.M_EPC0_SPT_NUS_PER.W +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:geoId/08,dcid:eia/PET.KDLVIS_1.A,2019,40989000.0,USGallon,,PET.KDLVISSCO1.A, +dcid:geoId/08,dcid:eia/PET.KDLVIS_1.A,2018,37954000.0,USGallon,,PET.KDLVISSCO1.A, +dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2019,3,MillionsBarrels,,PET.RCRR06SLA_1.A, +dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2018,6,MillionsBarrels,,PET.RCRR06SLA_1.A, +dcid:geoId/22,dcid:eia/PET.RCRR06_1.A,2017,29,MillionsBarrels,,PET.RCRR06SLA_1.A, +dcid:country/USA,dcid:eia/PET.M_EPC0_SPT_PER.W,2020-11-30,32.2,Percent,,PET.M_EPC0_SPT_NUS_PER.W, +dcid:country/USA,dcid:eia/PET.M_EPC0_SPT_PER.W,2020-12-06,33.1,Percent,,PET.M_EPC0_SPT_NUS_PER.W, diff --git a/scripts/us_eia/opendata/process/test_data/pet.tmcf b/scripts/us_eia/opendata/process/test_data/pet.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/pet.tmcf +++ 
b/scripts/us_eia/opendata/process/test_data/pet.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/seds.csv b/scripts/us_eia/opendata/process/test_data/seds.csv index 25dd6e8f74..06b1a06881 100644 --- a/scripts/us_eia/opendata/process/test_data/seds.csv +++ b/scripts/us_eia/opendata/process/test_data/seds.csv @@ -1,5 +1,5 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:geoId/06,dcid:eia/SEDS.TNISB.A,2018,1053194,BillionBtu,,SEDS.TNISB.CA.A -dcid:geoId/06,dcid:eia/SEDS.TNISB.A,2017,1056142,BillionBtu,,SEDS.TNISB.CA.A -dcid:country/USA,dcid:eia/SEDS.WXICD.A,2019,29.79,DollarsPerMillionBtu,,SEDS.WXICD.US.A -dcid:country/USA,dcid:eia/SEDS.WXICD.A,2018,32.94,DollarsPerMillionBtu,,SEDS.WXICD.US.A +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:geoId/06,dcid:eia/SEDS.TNISB.A,2018,1053194,BillionBtu,,SEDS.TNISB.CA.A, +dcid:geoId/06,dcid:eia/SEDS.TNISB.A,2017,1056142,BillionBtu,,SEDS.TNISB.CA.A, +dcid:country/USA,dcid:eia/SEDS.WXICD.A,2019,29.79,USDollarPerMillionBtu,,SEDS.WXICD.US.A, +dcid:country/USA,dcid:eia/SEDS.WXICD.A,2018,32.94,USDollarPerMillionBtu,,SEDS.WXICD.US.A, diff --git a/scripts/us_eia/opendata/process/test_data/seds.tmcf b/scripts/us_eia/opendata/process/test_data/seds.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/seds.tmcf +++ b/scripts/us_eia/opendata/process/test_data/seds.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod diff --git a/scripts/us_eia/opendata/process/test_data/total.csv b/scripts/us_eia/opendata/process/test_data/total.csv index e7bc7babbb..8426b42903 100644 --- 
a/scripts/us_eia/opendata/process/test_data/total.csv +++ b/scripts/us_eia/opendata/process/test_data/total.csv @@ -1,3 +1,3 @@ -place,stat_var,date,value,unit,scaling_factor,eia_series_id -dcid:country/USA,dcid:eia/TOTAL.LUACP.A,2020,52.852,ThousandBarrelsPerDay,,TOTAL.LUACPUS.A -dcid:country/USA,dcid:eia/TOTAL.LUACP.A,2019,59.325,ThousandBarrelsPerDay,,TOTAL.LUACPUS.A +place,stat_var,date,value,unit,scaling_factor,eia_series_id,measurementMethod +dcid:country/USA,dcid:eia/TOTAL.LUACP.A,2020,52.852,ThousandBarrelsPerDay,,TOTAL.LUACPUS.A, +dcid:country/USA,dcid:eia/TOTAL.LUACP.A,2019,59.325,ThousandBarrelsPerDay,,TOTAL.LUACPUS.A, diff --git a/scripts/us_eia/opendata/process/test_data/total.tmcf b/scripts/us_eia/opendata/process/test_data/total.tmcf index e1ef4499a7..f198290a7a 100644 --- a/scripts/us_eia/opendata/process/test_data/total.tmcf +++ b/scripts/us_eia/opendata/process/test_data/total.tmcf @@ -8,3 +8,4 @@ value: C:EIATable->value unit: C:EIATable->unit scalingFactor: C:EIATable->scaling_factor eiaSeriesId: C:EIATable->eia_series_id +measurementMethod: C:EIATable->measurementMethod From 438687645db96164477db8addfc1469290d32688 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Fri, 17 Jan 2025 08:33:30 +0000 Subject: [PATCH 2/7] including coal import --- scripts/us_eia/opendata/process/coal.py | 244 ++++++++++++++++++ scripts/us_eia/opendata/process/common.py | 10 +- .../us_eia/opendata/process/common_test.py | 4 +- 3 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 scripts/us_eia/opendata/process/coal.py diff --git a/scripts/us_eia/opendata/process/coal.py b/scripts/us_eia/opendata/process/coal.py new file mode 100644 index 0000000000..24890e67c3 --- /dev/null +++ b/scripts/us_eia/opendata/process/coal.py @@ -0,0 +1,244 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EIA Coal Dataset specific functions."""
+
+import logging
+import re
+
+from . import common
+
+
+def extract_place_statvar(series_id, counters):
+    """Given the series_id, extract the raw place and stat-var ID.
+
+    Args:
+        series_id: EIA series ID
+        counters: map for updating error statistics
+
+    Returns a (place, raw-stat-var, is_us_place) tuple; all None if unsupported.
+    """
+    # Pattern #1: COAL.{Measure}.{Region}-{Code}.{Period}
+    # Region could include 3-letter codes (e.g. MAT - Middle Atlantic)
+    m = re.match(r"^COAL\.([^._]+_?[^._]+)\.([A-Z]+)-([0-9]+)\.([AQM])$",
+                 series_id)
+    if m:
+        measure = m.group(1)
+        place = m.group(2)
+        code = m.group(3)
+        period = m.group(4)
+        sv_id = f'COAL.{measure}.{code}.{period}'
+        return (place, sv_id, True)
+
+    # TODO: Ignore this pattern until we have a way to model multi-location SV's.
+    # Pattern #2: COAL.{EXPORT|IMPORT}_{Measure}.{Type}-{CountryIso}-{UsPortIso}.{Period}
+    # Pattern #3: COAL.{SHIPMENT}_{Submeasure}.{Source}-{Destination}-{Material}.{Period}
+    # m = re.match(r"^COAL\.([A-Z]+)_([A-Z]+)\.([^-]+)-([^-]+)-([^.]+)\.([AQM])$",
+    #              series_id)
+    # if m:
+    #     return (None, None, None)
+    #     activity = m.group(1)
+    #     measure = m.group(2)
+    #     if activity in ['EXPORT', 'IMPORT']:
+    #         # Pattern #2
+    #         # TODO: model destination / source port as well
+    #         type = m.group(3)
+    #         place = m.group(4)
+    #         port = m.group(5)
+    #         period = m.group(6)
+    #         sv_id = f'COAL.{activity}_{measure}.{type}.{period}'
+    #         return (place, sv_id, False)
+    #     elif activity == 'SHIPMENT':
+    #         # Pattern #3
+    #         source = m.group(3)
+    #         if source.isalpha(): # could include 3-letter region codes
+    #             destination_power_plant = m.group(4)
+    #             material = m.group(5)
+    #             period = m.group(6)
+    #             sv_id = f'COAL.SHIPMENT_{measure}.{material}.{period}'
+    #             return (source, sv_id, True)
+    #         else:
+    #             # TODO: Handle remaining places - coal mines
+    #             counters[f'error_unknown_coal_mine SHIPMENT '] += 1
+    #             return (None, None, None)
+    #     else:
+    #         counters[f'unknown #2,3 activity ({activity})'] += 1
+    #         return (None, None, None)
+
+    # Pattern #4: COAL.PROD_DIST_STOCKS.TOT-{Place}.{Period}
+    # Pattern #4: COAL.PRICE_BY_RANK.{Region}-{Material}.{Period}
+    # Pattern #4: COAL.SHIP_{MINE|PLANT}_{ASH|HEAT|PRICE|QTY|SULFUR}.{Region}-{Material}.{Period}
+    m = re.match(r"^COAL\.([A-Z]+_[A-Z]+_[A-Z]+)\.([^-]+)-([^.]+)\.([AQM])$",
+                 series_id)
+    if m:
+        measure = m.group(1)
+        if measure.startswith("SHIP"):
+            # TODO: model destination / source port as well
+            counters['info_skipped_coal_shipment_series'] += 1
+            return (None, None, None)
+        if measure == "PROD_DIST_STOCKS":
+            assert m.group(2) == "TOT"
+            place = m.group(3)
+            period = m.group(4)
+            return (place, f'COAL.PROD_DIST_STOCKS.TOT.{period}', True)
+        else:
+            place = m.group(2)
+            material = m.group(3)
+            period = m.group(4)
+            return (place, f'COAL.{measure}.{material}.{period}', True)
+
+    return (None, None, None)
+
+
+##
+## Maps for Schema - more definitions at https://www.eia.gov/coal/data/browser/data/termsAndDefs.php?rseAvailable=false&showFilterValues=true&showDetail=true&showTransportationMode=true&showPrimeMovers=true&showPlantFuelTypes=true&showMineType=true&showMineStatus=true&topic=26
+##
+
+# Each value is a list where first entry is StatVar ID component, and the rest
+# are StatVar PVs.
+### Make sure each constraint is added to SV name
+_MEASURE_MAP = {
+    'ASH_CONTENT': [
+        'Average_AshContent_Coal_For',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:ashContent',
+        'statType: dcs:meanValue',
+    ],
+    'CONS_TOT': [
+        'Consumption_Coal',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:consumption',
+        'statType: dcs:measuredValue',
+    ],
+    'COST': [
+        'Average_Cost_Coal',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:cost',
+        'statType: dcs:meanValue',
+    ],
+    'HEAT_CONTENT': [
+        'Average_HeatContent_Coal_For',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:heatContent',
+        'statType: dcs:meanValue',
+    ],
+    'RECEIPTS': [
+        'Receipt_Coal',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:receipt',
+        'statType: dcs:measuredValue',
+    ],
+    'STOCKS': [
+        'Stock_Coal',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:stock',
+        'statType: dcs:measuredValue',
+    ],
+    'SULFUR_CONTENT': [
+        'Average_SulfurContent_Coal_For',
+        'populationType: dcs:Coal',
+        'measuredProperty: dcs:sulfurContent',
+        'statType: dcs:meanValue',
+    ],
+}
+
+_CONSUMING_SECTOR = {
+    '1': 'ElectricUtility',
+    '2': 'ElectricUtilityNonCogen',
+    '3': 'ElectricUtilityCogen',
+    '8': 'CommercialAndInstitutional',
+    '9': 'CokePlants',
+    '10': 'OtherIndustrial',
+    '94': 'IndependentPowerProducers',
+    '98': 'ElectricPower',
+}
+
+_UNIT_MAP = {
+    'ASH_CONTENT': ('', '100'),
+    'HEAT_CONTENT': ('BtuPerPound', ''),
+    'SULFUR_CONTENT': ('', '100'),
+    'CONS_TOT': ('ShortTon', ''),
+    'RECEIPTS': ('ShortTon', ''),
+    'STOCKS': ('ShortTon', ''),
+    'COST':
('USDollarPerShortTon', ''), +} + + +def generate_statvar_schema(raw_sv, rows, sv_map, counters): + """Generate StatVar with full schema. + + Args: + raw_sv: Raw stat-var returned by extract_place_statvar() + rows: List of dicts corresponding to CSV row. See common._COLUMNS. + sv_map: Map from stat-var to its MCF content. + counters: Map updated with error statistics. + + Returns schema-ful stat-var ID if schema was generated, None otherwise. + """ + counters['generate_statvar_schema'] += 1 + + # COAL.{Measure}.{ConsumingSector}.{Period} + m = re.match(r"^COAL\.([^._]+_?[^._]+)\.([0-9]+)\.([AQM])$", raw_sv) + if m: + measure = m.group(1) + consuming_sector = m.group(2) + period = m.group(3) + else: + counters['error_unparsable_raw_statvar'] += 1 + return None + counters[f'measure-{measure}'] += 1 + + # Get popType and mprop based on measure. + measure_pvs = _MEASURE_MAP.get(measure, None) + if not measure_pvs: + counters[f'error_missing_measure-{measure}'] += 1 + return None + + sv_id_parts = [common.PERIOD_MAP[period], measure_pvs[0]] + sv_pvs = measure_pvs[1:] + [ + 'typeOf: dcs:StatisticalVariable', + # TODO(shanth): use new property in next iteration + f'measurementQualifier: dcs:{common.PERIOD_MAP[period]}', + ] + + if consuming_sector: + cs = _CONSUMING_SECTOR.get(consuming_sector, None) + if not cs: + counters[f'error_missing_consuming_sector-{consuming_sector}'] += 1 + return None + sv_id_parts.append(cs) + sv_pvs.append(f'consumingSector: dcs:{cs}') + + if measure not in _UNIT_MAP: + counters[f'error_missing_unit-{measure}'] += 1 + return None + (unit, sfactor) = _UNIT_MAP[measure] + + sv_id = '_'.join(sv_id_parts) + + # Update the rows with new StatVar ID value and additional properties. + for row in rows: + row['stat_var'] = f'dcid:{sv_id}' + if unit: + row['unit'] = f'dcid:{unit}' + else: + # Reset unit to empty to clear the raw unit value. 
+ row['unit'] = '' + if sfactor: + row['scaling_factor'] = sfactor + + if sv_id not in sv_map: + node = f'Node: dcid:{sv_id}' + sv_map[sv_id] = '\n'.join([node] + sv_pvs) + + return sv_id diff --git a/scripts/us_eia/opendata/process/common.py b/scripts/us_eia/opendata/process/common.py index 317d223d83..6eb566d38f 100644 --- a/scripts/us_eia/opendata/process/common.py +++ b/scripts/us_eia/opendata/process/common.py @@ -16,14 +16,13 @@ import sys import csv import json +import logging import re -from absl import logging from collections import defaultdict from sys import path -## For import util.alpha2_to_dcid -## Setup path for import from data/util - +# For import util.alpha2_to_dcid +# Setup path for import from data/util _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(1, os.path.join(_MODULE_DIR, '../../../../')) import util.alpha2_to_dcid as alpha2_to_dcid @@ -61,6 +60,8 @@ '', 'NumberOfDays': 'Day', + '$/ShortTon': + 'USDollarPerShortTon', 'Dollars': 'USDollar', 'MillionBarrels': @@ -317,6 +318,7 @@ def _find_dc_place(raw_place, is_us_place, counters): if raw_place == 'WORL': return 'Earth' + # logging.error('ERROR: unsupported place %s %r', raw_place, is_us_place) counters[f'error_unsupported_places_{raw_place}'] += 1 return None diff --git a/scripts/us_eia/opendata/process/common_test.py b/scripts/us_eia/opendata/process/common_test.py index 6e39677255..042927a6af 100644 --- a/scripts/us_eia/opendata/process/common_test.py +++ b/scripts/us_eia/opendata/process/common_test.py @@ -24,7 +24,7 @@ os.path.dirname( os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) -from us_eia.opendata.process import common, elec, intl, ng, nuclear, pet, seds, total +from us_eia.opendata.process import coal, common, elec, intl, ng, nuclear, pet, seds, total # module_dir_ is the path to where this test is running from. 
module_dir_ = os.path.dirname(__file__) @@ -32,6 +32,8 @@ _TEST_CASES = [ # dataset-code, dataset-name, test-case-filename, # extract-fn, schema-fn + ('COAL', 'Coal', 'coal', coal.extract_place_statvar, + coal.generate_statvar_schema), ('ELEC', 'Electricity', 'elec', elec.extract_place_statvar, elec.generate_statvar_schema), ('INTL', 'Internationa', 'intl', intl.extract_place_statvar, None), From d1bff976e0ad1d235b310cf3d2350dd59f3d06a8 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Fri, 17 Jan 2025 08:40:28 +0000 Subject: [PATCH 3/7] modified manifest.json file --- scripts/us_eia/opendata/manifest.json | 32 +++++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/scripts/us_eia/opendata/manifest.json b/scripts/us_eia/opendata/manifest.json index c0dbc85d6b..8b89cca72b 100644 --- a/scripts/us_eia/opendata/manifest.json +++ b/scripts/us_eia/opendata/manifest.json @@ -1,5 +1,23 @@ { "import_specifications": [ + { + "import_name": "EIA_Coal", + "curator_emails": [ + "chandaluri@google.com" + ], + "provenance_url": "https://www.eia.gov/opendata/qb.php?category=717234", + "provenance_description": "Coal dataset has country-level and state-level information.", + "scripts": [ + "process.py --dataset=COAL" + ], + "import_inputs": [ + { + "template_mcf": "tmp_raw_data/COAL/COAL.tmcf", + "cleaned_csv": "tmp_raw_data/COAL/COAL.csv" + } + ], + "cron_schedule": "0 6 1 2 *" + }, { "import_name": "EIA_Electricity", "curator_emails": [ @@ -16,7 +34,7 @@ "cleaned_csv": "tmp_raw_data/ELEC/ELEC.csv" } ], - "cron_schedule": "0 1 1 * *" + "cron_schedule": "0 8 1 2 *" }, { "import_name": "EIA_NaturalGas", @@ -34,7 +52,7 @@ "cleaned_csv": "tmp_raw_data/NG/NG.csv" } ], - "cron_schedule": "0 2 1 * *" + "cron_schedule": "05 10 * * *" }, { "import_name": "EIA_NuclearOutages", @@ -52,7 +70,7 @@ "cleaned_csv": "tmp_raw_data/NUC_STATUS/NUC_STATUS.csv" } ], - "cron_schedule": "0 3 1 * *" + "cron_schedule": "01 9 * * *" }, { "import_name": 
"EIA_Petroleum", @@ -70,7 +88,7 @@ "cleaned_csv": "tmp_raw_data/PET/PET.csv" } ], - "cron_schedule": "0 4 1 * *" + "cron_schedule": "5 9 2 2 *" }, { "import_name": "EIA_International", @@ -88,7 +106,7 @@ "cleaned_csv": "tmp_raw_data/INTL/INTL.csv" } ], - "cron_schedule": "0 5 1 * *" + "cron_schedule": "1 7 * 1,4,7,10 *" }, { "import_name": "EIA_SEDS", @@ -106,7 +124,7 @@ "cleaned_csv": "tmp_raw_data/SEDS/SEDS.csv" } ], - "cron_schedule": "0 6 1 * *" + "cron_schedule": "0 0 1 1 *" }, { "import_name": "EIA_TotalEnergy", @@ -124,7 +142,7 @@ "cleaned_csv": "tmp_raw_data/TOTAL/TOTAL.csv" } ], - "cron_schedule": "20 6 1 * *" + "cron_schedule": "0 0 1 * *" } ] } \ No newline at end of file From 93c4c56bac7d19281b3f262e828ac5cf4906a66e Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Fri, 17 Jan 2025 08:54:57 +0000 Subject: [PATCH 4/7] updated READ.ME --- scripts/us_eia/opendata/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/us_eia/opendata/README.md b/scripts/us_eia/opendata/README.md index 954d48e029..88dc7f38f7 100644 --- a/scripts/us_eia/opendata/README.md +++ b/scripts/us_eia/opendata/README.md @@ -24,6 +24,7 @@ This dataset is available for public use, license is available at https://www.ei python3 process.py --dataset=INTL --mode=download python3 process.py --dataset=ELEC --mode=download + python3 process.py --dataset=COAL --mode=download python3 process.py --dataset=PET --mode=download python3 process.py --dataset=NG --mode=download python3 process.py --dataset=SEDS --mode=download @@ -38,6 +39,7 @@ This dataset is available for public use, license is available at https://www.ei python3 process.py --dataset=INTL --mode=process python3 process.py --dataset=ELEC --mode=process + python3 process.py --dataset=COAL --mode=process python3 process.py --dataset=PET --mode=process python3 process.py --dataset=NG --mode=process python3 process.py --dataset=SEDS --mode=process @@ -49,9 +51,10 @@ This dataset is available 
for public use, license is available at https://www.ei python3 process.py --dataset=TOTAL python3 process.py --dataset=INTL python3 process.py --dataset=ELEC + python3 process.py --dataset=COAL python3 process.py --dataset=NG python3 process.py --dataset=PET python3 process.py --dataset=SEDS python3 process.py --dataset=NUC_STATUS - ``` \ No newline at end of file + ``` From 154fc02193ab4ab02dc3117fb3d9627272707d80 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Mon, 20 Jan 2025 05:59:53 +0000 Subject: [PATCH 5/7] updated process script --- scripts/us_eia/opendata/process.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/us_eia/opendata/process.py b/scripts/us_eia/opendata/process.py index feeac4ebce..8f086bf973 100644 --- a/scripts/us_eia/opendata/process.py +++ b/scripts/us_eia/opendata/process.py @@ -31,7 +31,7 @@ from absl import app from absl import logging -from process import common, elec, intl, ng, nuclear, pet, seds, total +from process import coal, common, elec, intl, ng, nuclear, pet, seds, total MANIFEST_URL = "https://api.eia.gov/bulk/manifest.txt" @@ -41,8 +41,9 @@ 'Datasets to download. 
Everything, if empty.') flags.DEFINE_string('mode', '', 'Options: download or process') -## Value: (name, extract_fn, schema_fn) +# Value: (name, extract_fn, schema_fn) _DATASETS = { + 'COAL': ('Coal', coal.extract_place_statvar, coal.generate_statvar_schema), 'ELEC': ('Electricity', elec.extract_place_statvar, elec.generate_statvar_schema), 'INTL': ('Energy Overview (INTL)', intl.extract_place_statvar, None), From 14ac6f65478cb8efdabd45f5df232ec1f1f6b08d Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Mon, 20 Jan 2025 06:53:26 +0000 Subject: [PATCH 6/7] updated requirements.txt --- requirements.txt | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index edfbd2bc1f..e2c9d46dfe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,34 +1,44 @@ -# Requirements for Python scripts in this repo that have automation enabled! +# Requirements for all Python code in this repo, except for import-automation absl-py arcgis2geojson -chardet +chembl-webresource-client>=0.10.2 dataclasses==0.6 datacommons==1.4.3 -frozendict +deepdiff==6.3.0 +earthengine-api +flask_restful==0.3.9 +frozendict==1.2 func-timeout==4.3.5 geojson==2.5.0 +geopandas==0.8.1 +geopy google-cloud-bigquery -google-cloud-run google-cloud-storage>=2.7.0 google-cloud-logging==3.4.0 google-cloud-scheduler==2.10.0 -gspread==5.12.0 +gspread lxml==4.9.1 -numpy==1.26.4 -openpyxl>=3.1.0 +matplotlib==3.3.0 +netCDF4==1.6.4 +numpy +openpyxl==3.0.7 pandas -psutil pylint +pyspellchecker pytest -requests==2.27.1 -requests_cache +rasterio +rdp==0.8 +requests==2.31.0 retry==0.9.2 +s2sphere==0.2.5 shapely==1.8.5 -urllib3==1.26.8 +tabula-py +urllib3==1.26.17 xarray==0.19.0 -xlrd +xlrd==1.2.0 +websockets==12.0 +yapf zipp beautifulsoup4 ratelimit -xlsxwriter==3.2.0 From 974b11d6b76f51232a3d58a94213c50c7985cf92 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Chandaluri Date: Mon, 20 Jan 2025 09:10:52 +0000 Subject: [PATCH 
7/7] updated scripts --- scripts/us_eia/opendata/process.py | 7 +++++-- scripts/us_eia/opendata/process/common.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/us_eia/opendata/process.py b/scripts/us_eia/opendata/process.py index 8f086bf973..3d67164ba8 100644 --- a/scripts/us_eia/opendata/process.py +++ b/scripts/us_eia/opendata/process.py @@ -31,7 +31,7 @@ from absl import app from absl import logging -from process import coal, common, elec, intl, ng, nuclear, pet, seds, total +from process import common, coal, elec, intl, ng, nuclear, pet, seds, total MANIFEST_URL = "https://api.eia.gov/bulk/manifest.txt" @@ -41,7 +41,7 @@ 'Datasets to download. Everything, if empty.') flags.DEFINE_string('mode', '', 'Options: download or process') -# Value: (name, extract_fn, schema_fn) +## Value: (name, extract_fn, schema_fn) _DATASETS = { 'COAL': ('Coal', coal.extract_place_statvar, coal.generate_statvar_schema), 'ELEC': ('Electricity', elec.extract_place_statvar, @@ -79,6 +79,7 @@ def main(_): assert FLAGS.data_dir manifest_json = download_manifest() datasets = manifest_json.get('dataset', {}) + logging.info("================Calling main method") for dataset_name in datasets: if FLAGS.dataset and dataset_name not in FLAGS.dataset: continue @@ -89,6 +90,7 @@ def main(_): if mode == "" or mode == "process": file_prefix = os.path.join(f'{FLAGS.data_dir}/{dataset_name}', FLAGS.dataset) + logging.info("================Calling process method") common.process( dataset=FLAGS.dataset, dataset_name=_DATASETS[FLAGS.dataset], @@ -99,6 +101,7 @@ def main(_): out_tmcf=file_prefix + '.tmcf', extract_place_statvar_fn=_DATASETS[FLAGS.dataset][1], generate_statvar_schema_fn=_DATASETS[FLAGS.dataset][2]) + logging.info("================process completed") if __name__ == '__main__': diff --git a/scripts/us_eia/opendata/process/common.py b/scripts/us_eia/opendata/process/common.py index 6eb566d38f..964db8289b 100644 --- 
a/scripts/us_eia/opendata/process/common.py +++ b/scripts/us_eia/opendata/process/common.py @@ -512,6 +512,7 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, # Add to rows. rows = [] + for k, v in time_series: try: @@ -529,6 +530,7 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, # # TODO: Handle some these better. _ = float(v) + except Exception: counters['error_non_numeric_values'] += 1 continue @@ -557,17 +559,24 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, if generate_statvar_schema_fn: schema_sv = generate_statvar_schema_fn(raw_sv, rows, sv_map, counters) + logging.info("================ process5 {sv_map}") if schema_sv: sv_schemaful2raw[schema_sv] = raw_sv counters['info_schemaful_series'] += 1 + logging.info("================ process6 {raw_sv}") else: counters['info_schemaless_series'] += 1 _generate_default_statvar(raw_sv, sv_map) + logging.info("================ process7") csvwriter.writerows(rows) + logging.info("================ process8 {rows}") counters['info_rows_output'] += len(rows) + logging.info("================ process9 {rows}") + logging.info("================ process4") category.trim_area_categories(svg_info, counters) + logging.info("================ process3 {counters} {svg_info}") with open(out_sv_mcf, 'w') as out_fp: nodes = _generate_sv_nodes(dataset, sv_map, sv_name_map, @@ -576,14 +585,17 @@ def process(dataset, dataset_name, in_json, out_csv, out_sv_mcf, out_svg_mcf, out_fp.write('\n\n'.join(nodes)) out_fp.write('\n') + logging.info("================ process {nodes}") with open(out_svg_mcf, 'w') as out_fp: nodes = category.generate_svg_nodes(dataset, dataset_name, svg_info) out_fp.write('\n\n'.join(nodes)) out_fp.write('\n') + logging.info("================ process1 {dataset_name} {dataset}") with open(out_tmcf, 'w') as out_fp: out_fp.write(_TMCF_STRING) + logging.info("================ process2 {_TMCF_STRING}") logging.info(f"FINAL 
COUNTERS {_print_counters(counters)}")