minor improvements to PG&E scraper (#95)
* actually skip downloads when skipping them
* pge: use mobile site for login; it seems to work more reliably
* more PG&E fixes:
* don't wait for page reload after entering credentials; website is now an SPA
and does not reload
* use CSS selectors rather than link text to find billing links for
significant speedup
jktomer authored Apr 17, 2024
1 parent dca4505 commit a87f8f7
Showing 1 changed file with 7 additions and 11 deletions.
finance_dl/pge.py: 18 changes (7 additions, 11 deletions)
@@ -104,15 +104,15 @@ def login(self):
         if self.logged_in:
             return
         logger.info('Initiating log in')
-        self.driver.get('https://www.pge.com/en/myhome/myaccount/index.page')
+        self.driver.get('https://m.pge.com/')
 
         (username, password), = self.wait_and_return(
             self.find_username_and_password_in_any_frame)
         logger.info('Entering username and password')
         username.send_keys(self.credentials['username'])
         password.send_keys(self.credentials['password'])
-        with self.wait_for_page_load():
-            password.send_keys(Keys.ENTER)
+        password.send_keys(Keys.ENTER)
+        self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
         logger.info('Logged in')
         self.logged_in = True
 
@@ -136,7 +136,7 @@ def process_download(self, download_result, output_dir):
         new_path = self.get_output_path(output_dir, date)
         if os.path.exists(new_path):
             logger.info('Skipping duplicate download: %s', date)
-            return True
+            return False
         tmp_path = new_path.replace('.pdf', '.tmp.pdf')
         with open(tmp_path, 'wb') as f:
             download_data = download_result[1]
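The return-value change above is what makes "actually skip downloads when skipping them" work: returning False for an already-downloaded bill tells the caller that nothing new was fetched. Below is a hedged standalone sketch of that contract; save_bill and download_new_bills are illustrative names rather than finance_dl APIs, and the rename-into-place step is assumed since it falls outside the visible part of the hunk.

# Hedged sketch of the duplicate-skip contract implied by the hunk above.
import os


def save_bill(pdf_bytes: bytes, new_path: str) -> bool:
    """Write a bill PDF atomically; return False if it was already downloaded."""
    if os.path.exists(new_path):
        return False  # duplicate: signal the caller that nothing new was fetched
    tmp_path = new_path.replace('.pdf', '.tmp.pdf')
    with open(tmp_path, 'wb') as f:
        f.write(pdf_bytes)
    os.replace(tmp_path, new_path)  # publish the finished file in one step (assumed)
    return True


def download_new_bills(bills, output_dir: str, stop_early: bool = True) -> None:
    # bills: iterable of (pdf_bytes, filename) pairs, newest first (illustrative).
    for pdf_bytes, name in bills:
        if not save_bill(pdf_bytes, os.path.join(output_dir, name)) and stop_early:
            break  # older bills are already on disk; stop walking the history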
@@ -157,15 +157,11 @@ def get_bills(self, output_dir):
         actions.send_keys(Keys.ESCAPE)
         actions.perform()
         logger.info('Looking for download link')
-        (bills_link, ), = self.wait_and_return(
-            lambda: self.find_visible_elements_by_descendant_partial_text('BILL & PAYMENT HISTORY', 'h2'))
+        (bills_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'arrowBillPaymentHistory'))
         scrape_lib.retry(lambda: self.click(bills_link), retry_delay=2)
-        (more_link, ), = self.wait_and_return(
-            lambda: self.find_visible_elements_by_descendant_partial_text('View up to 24 months of activity', 'a'))
+        (more_link, ), = self.wait_and_return(lambda: self.find_visible_elements(By.ID, 'href-view-24month-history'))
         scrape_lib.retry(lambda: self.click(more_link), retry_delay=2)
-        links, = self.wait_and_return(
-            lambda: self.find_visible_elements(By.PARTIAL_LINK_TEXT, "View Bill PDF")
-        )
+        links, = self.wait_and_return(lambda: self.find_visible_elements(By.CSS_SELECTOR, ".utag-bill-history-view-bill-pdf"))
 
         for link in links:
             if not self.do_download_from_link(link, output_dir) and self.stop_early:
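The get_bills changes swap text-based locators for id and CSS-class lookups. Partial-link-text locators have to be matched against the visible text of anchors on the page, which tends to be much slower than a direct id or CSS query; that is the "significant speedup" the commit message refers to. A minimal sketch of the new locator strategy in plain Selenium follows; the ids and the CSS class come from the diff above, and the real code wraps each lookup in wait_and_return and scrape_lib.retry rather than the bare calls shown here.

# Hedged sketch (plain Selenium) of the new locator strategy in get_bills.
from selenium import webdriver
from selenium.webdriver.common.by import By


def find_bill_pdf_links(driver: webdriver.Chrome):
    # Old approach (slow): scan anchors by their visible text.
    #   driver.find_elements(By.PARTIAL_LINK_TEXT, 'View Bill PDF')

    # New approach: direct id / CSS-class queries the browser answers quickly.
    driver.find_element(By.ID, 'arrowBillPaymentHistory').click()
    driver.find_element(By.ID, 'href-view-24month-history').click()
    return driver.find_elements(By.CSS_SELECTOR, '.utag-bill-history-view-bill-pdf')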
