From a87f8f71bea5498fd20e72c78a308a79723f3f03 Mon Sep 17 00:00:00 2001 From: Jonathan Klabunde Tomer Date: Tue, 16 Apr 2024 20:25:04 -0700 Subject: [PATCH] minor improvements to PG&E scraper (#95) * actually skip downloads when skipping them * pge: use mobile site for login it seems to work more reliably * more PG&E fixes: * don't wait for page reload after entering credentials; website is now an SPA and does not reload * use CSS selectors rather than link text to find billing links for significant speedup --- finance_dl/pge.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/finance_dl/pge.py b/finance_dl/pge.py index dcf2a8f..689390d 100644 --- a/finance_dl/pge.py +++ b/finance_dl/pge.py @@ -104,15 +104,15 @@ def login(self): if self.logged_in: return logger.info('Initiating log in') - self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page') + self.driver.get('https://m.pge.com/') (username, password), = self.wait_and_return( self.find_username_and_password_in_any_frame) logger.info('Entering username and password') username.send_keys(self.credentials['username']) password.send_keys(self.credentials['password']) - with self.wait_for_page_load(): - password.send_keys(Keys.ENTER) + password.send_keys(Keys.ENTER) + self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory')) logger.info('Logged in') self.logged_in = True @@ -136,7 +136,7 @@ def process_download(self, download_result, output_dir): new_path = self.get_output_path(output_dir, date) if os.path.exists(new_path): logger.info('Skipping duplicate download: %s', date) - return True + return False tmp_path = new_path.replace('.pdf', '.tmp.pdf') with open(tmp_path, 'wb') as f: download_data = download_result[1] @@ -157,15 +157,11 @@ def get_bills(self, output_dir): actions.send_keys(Keys.ESCAPE) actions.perform() logger.info('Looking for download link') - (bills_link, ), = self.wait_and_return( - lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2')) + (bills_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory')) scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2) - (more_link, ), = self.wait_and_return( - lambda: self.find_visible_elements_by_descendant_partial_text('View up to 24 months of activity', 'a')) + (more_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'href-view-24month-history')) scrape_lib.retry(lambda: self.click(more_link), retry_delay=2) - links, = self.wait_and_return( - lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF") - ) + links, = self.wait_and_return(lambda: self.find_visible_elements(By.CSS_SELECTOR, ".utag-bill-history-view-bill-pdf")) for link in links: if not self.do_download_from_link(link, output_dir) and self.stop_early: