From 4b8e28a29b8f0faf5ab3457b5cded2079e73f3fd Mon Sep 17 00:00:00 2001 From: moritzj29 <18733473+moritzj29@users.noreply.github.com> Date: Mon, 9 Jan 2023 07:30:04 +0100 Subject: [PATCH] [Paypal] handle case gracefully when only JSON can be extracted but no HTML (#81) * [Paypal] handle case gracefully when only JSON can be extracted but no HTML --- finance_dl/paypal.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/finance_dl/paypal.py b/finance_dl/paypal.py index d4aead6..31b120d 100644 --- a/finance_dl/paypal.py +++ b/finance_dl/paypal.py @@ -51,6 +51,7 @@ from selenium.webdriver.support.ui import Select from selenium.webdriver.common.keys import Keys from selenium.common.exceptions import NoSuchElementException +from requests.exceptions import HTTPError import jsonschema from atomicwrites import atomic_write from . import scrape_lib @@ -232,15 +233,6 @@ def save_transactions(self): + transaction_id) html_path = output_prefix + '.html' json_path = output_prefix + '.json' - if not os.path.exists(html_path): - logging.info('Retrieving HTML %s', details_url) - html_resp = self.driver.request('GET', details_url) - html_resp.raise_for_status() - with atomic_write( - html_path, mode='w', encoding='utf-8', - newline='\n', overwrite=True) as f: - # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 - f.write('\ufeff' + html_resp.text) if not os.path.exists(json_path): logging.info('Retrieving JSON %s', inline_details_url) json_resp = self.make_json_request(inline_details_url) @@ -250,6 +242,25 @@ def save_transactions(self): with atomic_write(json_path, mode='wb', overwrite=True) as f: f.write( json.dumps(j['data'], indent=' ', sort_keys=True).encode()) + if not os.path.exists(html_path): + logging.info('Retrieving HTML %s', details_url) + html_resp = self.driver.request('GET', details_url) + try: + html_resp.raise_for_status() + except HTTPError as e: + # in rare cases no HTML detail page exists but JSON could be extracted + # if JSON is present gracefully skip HTML download if it fails + if os.path.exists(json_path): + # HTML download failed but JSON present -> only log warning + logging.warning('Retrieving HTML %s failed due to %s but JSON is already present. Continuing...', details_url, e) + else: + logging.error('Retrieving HTML %s failed due to %s and no JSON is present. Aborting...', details_url, e) + raise e + with atomic_write( + html_path, mode='w', encoding='utf-8', + newline='\n', overwrite=True) as f: + # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8 + f.write('\ufeff' + html_resp.text) def run(self): if not os.path.exists(self.output_directory):