From 4b8e28a29b8f0faf5ab3457b5cded2079e73f3fd Mon Sep 17 00:00:00 2001
From: moritzj29 <18733473+moritzj29@users.noreply.github.com>
Date: Mon, 9 Jan 2023 07:30:04 +0100
Subject: [PATCH] [Paypal] handle case gracefully when only JSON can be
 extracted but no HTML (#81)

* [Paypal] handle case gracefully when only JSON can be extracted but no HTML
---
 finance_dl/paypal.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/finance_dl/paypal.py b/finance_dl/paypal.py
index d4aead6..31b120d 100644
--- a/finance_dl/paypal.py
+++ b/finance_dl/paypal.py
@@ -51,6 +51,7 @@
 from selenium.webdriver.support.ui import Select
 from selenium.webdriver.common.keys import Keys
 from selenium.common.exceptions import NoSuchElementException
+from requests.exceptions import HTTPError
 import jsonschema
 from atomicwrites import atomic_write
 from . import scrape_lib
@@ -232,15 +233,6 @@ def save_transactions(self):
                 + transaction_id)
             html_path = output_prefix + '.html'
             json_path = output_prefix + '.json'
-            if not os.path.exists(html_path):
-                logging.info('Retrieving HTML %s', details_url)
-                html_resp = self.driver.request('GET', details_url)
-                html_resp.raise_for_status()
-                with atomic_write(
-                        html_path, mode='w', encoding='utf-8',
-                        newline='\n', overwrite=True) as f:
-                    # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8
-                    f.write('\ufeff' + html_resp.text)
             if not os.path.exists(json_path):
                 logging.info('Retrieving JSON %s', inline_details_url)
                 json_resp = self.make_json_request(inline_details_url)
@@ -250,6 +242,25 @@ def save_transactions(self):
                 with atomic_write(json_path, mode='wb', overwrite=True) as f:
                     f.write(
                         json.dumps(j['data'], indent='  ', sort_keys=True).encode())
+            if not os.path.exists(html_path):
+                logging.info('Retrieving HTML %s', details_url)
+                html_resp = self.driver.request('GET', details_url)
+                try:
+                    html_resp.raise_for_status()
+                except HTTPError as e:
+                    # In rare cases no HTML detail page exists even though the JSON could be extracted.
+                    # If the JSON is already present, only warn about the failed HTML request instead of aborting.
+                    if os.path.exists(json_path):
+                        # HTML download failed but JSON present -> only log warning
+                        logging.warning('Retrieving HTML %s failed due to %s but JSON is already present. Continuing...', details_url, e)
+                    else:
+                        logging.error('Retrieving HTML %s failed due to %s and no JSON is present. Aborting...', details_url, e)
+                        raise e
+                with atomic_write(
+                        html_path, mode='w', encoding='utf-8',
+                        newline='\n', overwrite=True) as f:
+                    # Write with Unicode Byte Order Mark to ensure content will be properly interpreted as UTF-8
+                    f.write('\ufeff' + html_resp.text)
 
     def run(self):
         if not os.path.exists(self.output_directory):