Merge branch 'master' into dev/amazon
moritzj29 authored Sep 30, 2023
2 parents 7c9b860 + 4b8e28a commit 6795b4c
Showing 12 changed files with 201 additions and 81 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -76,10 +76,12 @@ pip install -e .
Configuration
==

Create a Python file like `example_finance_dl_config.py`.
Create a configuration file called something like `finance_dl_config.py`.
For a complete example of this file and some documentation,
see [example_finance_dl_config.py](example_finance_dl_config.py).
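As a rough orientation, the config is just a Python module defining `CONFIG_*` functions, one per account. A minimal sketch only (the credentials are placeholders, and the real module names and options are documented in the example file):

    # finance_dl_config.py (minimal sketch)
    import os

    data_dir = os.path.join(os.path.dirname(__file__), 'data')

    def CONFIG_discover():
        return dict(
            module='finance_dl.discover',
            credentials={'username': 'me', 'password': 'placeholder'},
            output_directory=os.path.join(data_dir, 'discover'),
        )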

Refer to the documentation of the individual scraper modules for
details.
further details.

Basic Usage
==
@@ -162,7 +164,7 @@ In this event, you have a few options:
`chromedriver_binary` somewhere other than your system's default Chrome
version, and set the environment variable `CHROMEDRIVER_CHROME_BINARY` to
point to it. (You can do this from within your finance_dl config script,
e.g. with a line like `os.environ[CHROMEDRIVER_CHROME_BINARY] = "/usr/bin/google-chrome-beta"`).
e.g. with a line like `os.environ["CHROMEDRIVER_CHROME_BINARY"] = "/usr/bin/google-chrome-beta"`).
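A sketch of that workaround inside a config script (the binary path is illustrative):

    import os
    # Point chromedriver at a pinned Chrome build instead of the system default.
    os.environ["CHROMEDRIVER_CHROME_BINARY"] = "/usr/bin/google-chrome-beta"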

License
==
14 changes: 14 additions & 0 deletions example_finance_dl_config.py
@@ -7,6 +7,20 @@
Rather than hard code your usernames and passwords into this configuration
file, you may instead wish to write some code to retrieve them from some
external password store.
For example, you can input the username/password interactively like so:

    from getpass import getpass

    def CONFIG_paypal():
        return dict(
            module='finance_dl.paypal',
            credentials={
                'username': input('PayPal username: '),   # <----
                'password': getpass('PayPal password: '), # <----
            },
            output_directory=os.path.join(data_dir, 'paypal'),
        )
"""

import os
21 changes: 19 additions & 2 deletions finance_dl/amazon.py
@@ -43,6 +43,13 @@
order page "order groups" that will be scanned for orders to download. Order groups
include years (e.g. '2020'), as well as 'last 30 days' and 'past 3 months'.
- `download_preorder_invoices`: Optional. If specified and True, invoices for
  preorders (i.e. orders that have not actually been charged yet) will be
  downloaded; by default they are skipped. Such preorder invoices are not
  typically useful for accounting since they claim a card was charged even
  though it has not actually been charged yet; they are replaced with
  invoices containing the correct information once the order is actually
  fulfilled.
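Putting the new option together with the existing ones, a hedged config sketch (credentials and paths are placeholders, following the `CONFIG_*` pattern from example_finance_dl_config.py; only `order_groups` and `download_preorder_invoices` are the options documented above):

    def CONFIG_amazon():
        return dict(
            module='finance_dl.amazon',
            credentials={'username': 'me@example.com', 'password': 'placeholder'},
            output_directory=os.path.join(data_dir, 'amazon'),
            order_groups=['last 30 days', 'past 3 months'],
            download_preorder_invoices=False,  # the default: skip pre-order invoices
        )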
Output format:
==============
@@ -112,6 +119,7 @@ class Domain():
grand_total: str
grand_total_digital: str
order_cancelled: str
pre_order: str

digital_order: str
regular_order_placed: str
@@ -144,6 +152,7 @@ def __init__(self) -> None:
grand_total='Grand Total:',
grand_total_digital='Grand Total:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digital Order: (.*)',
regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
@@ -176,6 +185,7 @@ def __init__(self) -> None:
grand_total='Grand Total:',
grand_total_digital='Grand Total:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digital Order: (.*)',
regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
@@ -206,6 +216,7 @@ def __init__(self) -> None:
grand_total='Gesamtsumme:',
grand_total_digital='Endsumme:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digitale Bestellung: (.*)',
regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})',
@@ -241,6 +252,7 @@ def __init__(self,
regular: bool = True,
digital: Optional[bool] = None,
order_groups: Optional[List[str]] = None,
download_preorder_invoices: bool = False,
**kwargs):
super().__init__(**kwargs)
if amazon_domain not in DOMAINS:
@@ -254,6 +266,7 @@
self.regular = regular
self.digital_orders_menu = digital if digital is not None else self.domain.digital_orders_menu
self.order_groups = order_groups
self.download_preorder_invoices = download_preorder_invoices

def check_url(self, url):
netloc_re = r'^([^\.@]+\.)*amazon\.' + self.domain.top_level + '$'
@@ -312,7 +325,7 @@ def finish_login(self):

def get_invoice_path(self, year, order_id):
if self.dir_per_year:
return os.path.join(self.output_directory, year, order_id + '.html')
return os.path.join(self.output_directory, str(year), order_id + '.html')
return os.path.join(self.output_directory, order_id + '.html')

def get_order_id(self, href) -> str:
@@ -393,7 +406,7 @@ def invoice_link_finder_hidden():
# submenu containing order summary takes some time to load after click
# search for order summary link and compare order_id
# repeat until order_id is different to last order_id
summary_links = self.driver.find_elements_by_link_text(
summary_links = self.driver.find_elements(By.LINK_TEXT,
self.domain.order_summary)
if summary_links:
href = summary_links[0].get_attribute('href')
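This locator change, repeated across several modules below, tracks Selenium 4's removal of the `find_element(s)_by_*` helpers in favor of `By`-based lookups; a minimal before/after sketch (the link text is illustrative):

    from selenium.webdriver.common.by import By

    # Selenium 3 style (removed in Selenium 4):
    #   links = driver.find_elements_by_link_text('Order Summary')
    # Selenium 4 style:
    links = driver.find_elements(By.LINK_TEXT, 'Order Summary')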
@@ -505,6 +518,10 @@ def get_source():
return None

page_source, = self.wait_and_return(get_source)
if self.domain.pre_order in page_source and not self.download_preorder_invoices:
# Pre-orders don't have enough information to download yet. Skip them.
logger.info(f'Skipping pre-order invoice {order_id}')
return
if order_id not in page_source:
raise ValueError(f'Failed to retrieve information for order {order_id}')

4 changes: 2 additions & 2 deletions finance_dl/comcast.py
@@ -150,7 +150,7 @@ def get_bills_link():
pass
bills_link = get_bills_link()

self.driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)
self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.ESCAPE)
bills_link.click()

def get_links():
@@ -168,7 +168,7 @@ def get_links():
cur_el = link
bill_date = None
while True:
parent = cur_el.find_element_by_xpath('..')
parent = cur_el.find_element(By.XPATH, '..')
if parent == cur_el:
break
try:
5 changes: 3 additions & 2 deletions finance_dl/discover.py
@@ -57,6 +57,7 @@ def CONFIG_discover():
import os
import shutil
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from . import scrape_lib
@@ -85,11 +86,11 @@ def check_after_wait(self):
check_url(self.driver.current_url)

def find_account_last4(self):
return self.driver.find_element_by_xpath(XPATH_OF_LAST_FOUR_DIGITS).text
return self.driver.find_element(By.XPATH, XPATH_OF_LAST_FOUR_DIGITS).text

def login(self):
try:
account = self.driver.find_element_by_xpath(XPATH_OF_LAST_FOUR_DIGITS)
account = self.driver.find_element(By.XPATH, XPATH_OF_LAST_FOUR_DIGITS)
logger.info("Already logged in")
except NoSuchElementException:
logger.info('Initiating log in')
3 changes: 2 additions & 1 deletion finance_dl/gemini.py
@@ -187,8 +187,9 @@ def get_balances_and_prices(requester, api_key, api_secret, data_dir):
logger.info(f"Got balances. Found {len(balances)} currencies.")

# Prices
tickers = [b['currency'] + "USD" for b in balances if b['currency'] != 'USD']
tickers = [b['currency'] + "USD" for b in balances if b['currency'] != 'USD' and b['currency'] != 'GUSD']
prices = {}
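# GUSD is Gemini's USD-pegged stablecoin, so its price is fixed at $1 here rather than fetched from a ticker.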
prices['GUSD'] = 1
for t in tickers:
obj = requester.make_request(TICKERS_URL+"/"+t.lower(), None, get = True)
price = (float(obj['ask']) + float(obj['bid']))/2
86 changes: 59 additions & 27 deletions finance_dl/healthequity.py
@@ -74,6 +74,9 @@ def CONFIG_healthequity():
import logging
import os
import bs4
import tempfile
import openpyxl
from openpyxl.cell.cell import MergedCell
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
@@ -101,43 +104,66 @@ def find_first_matching_date(lines, date_format):


FUND_ACTIVITY_HEADERS = [
'Fund', 'Name', 'Shares (#)', 'Closing Price', 'Closing Value'
'Fund', 'Name', 'Class', 'Target %\nallocation', 'Est. %\nholding', 'Shares\nheld', 'Closing\nprice', 'Closing\nvalue'
]

# For compatibility with beancount-import's healthequity plugin, write the old
# format for balances.csv files. The three new columns are fairly useless
# anyway, and the new (multiline) column titles are unambiguously worse even if
# a human were to actually ever read these CSVs.
OLD_FUND_ACTIVITY_HEADERS = [
'Fund','Name',None,None,None,'Shares (#)','Closing Price','Closing Value'
]

def write_balances(data, path):
rows = []
for entry in data:
keys = [x[0] for x in entry]
if keys == FUND_ACTIVITY_HEADERS:
keys, values = zip(*entry)
if list(keys) == FUND_ACTIVITY_HEADERS:
entry = [
(k, v.strip().split('\n')[0].strip('$'))
for (k, v) in zip(OLD_FUND_ACTIVITY_HEADERS, values)
if k
]
row_values = dict(entry)
row_values['Fund'] = row_values['Fund'].strip().split()[0]
row_values['Name'] = row_values['Name'].strip().split('\n')[0]
rows.append(row_values)
csv_merge.write_csv(FUND_ACTIVITY_HEADERS, rows, path)
csv_merge.write_csv([h for h in OLD_FUND_ACTIVITY_HEADERS if h], rows, path)


def write_fund_activity(raw_transactions_data, path):
input_date_format = '%m/%d/%Y'
output_date_format = '%Y-%m-%d'
soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml')
def format_cell(c):
if c.is_date:
return c.value.strftime('%Y-%m-%d')
if c.number_format[0] == '$':
base = '${:,.2f}'.format(abs(c.value))
if c.value >= 0:
return base
else:
return '(%s)' % base
return str(c.value)

wb = None
with tempfile.NamedTemporaryFile(suffix='.xlsx') as xlsx:
xlsx.write(raw_transactions_data)
xlsx.flush()
wb = openpyxl.load_workbook(xlsx.name)

ws = wb.worksheets[0]
headers = [
'Date', 'Fund', 'Category', 'Description', 'Price', 'Amount', 'Shares',
'Total Shares', 'Total Value'
]
rows = []
for row in soup.find_all('tr'):
cells = [str(x.text).strip() for x in row.find_all('td')]
while cells and not cells[-1].strip():
del cells[-1]
if len(cells) == 1:
for row in ws.rows:
if any([isinstance(c, MergedCell) for c in row]):
continue
assert len(cells) == len(headers)
assert len(row) == len(headers)
cells = [format_cell(c) for c in row]
if cells == headers:
continue
row_values = dict(zip(headers, cells))
row_values['Date'] = datetime.datetime.strptime(
row_values['Date'], input_date_format).strftime(output_date_format)
rows.append(row_values)
rows.append(dict(zip(headers, cells)))
csv_merge.merge_into_file(filename=path, field_names=headers, data=rows,
sort_by=lambda x: x['Date'])

@@ -157,12 +183,14 @@ def write_transactions(raw_transactions_data, path):
continue
if cells[0] == 'TOTAL':
continue
assert len(cells) == len(headers)
if cells == headers:
assert len(cells) >= len(headers), (cells, headers)
if cells[:len(headers)] == headers:
continue
row_values = dict(zip(headers, cells))
# Sanitize whitespace in description
row_values['Transaction'] = ' '.join(row_values['Transaction'].split())
# Remove duplicate tax year in description
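# e.g. 'Contribution (Tax year: 2023) (Tax year: 2023)' -> 'Contribution (Tax year: 2023)'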
row_values['Transaction'] = re.sub(r'(\(Tax year: \d+\)) *\1', r'\1', row_values['Transaction'])
row_values['Cash Balance'] = row_values.pop('HSA Cash Balance')

# Sanitize date_str
@@ -174,7 +202,11 @@
rows.append(row_values)
rows.reverse()
csv_merge.merge_into_file(filename=path, field_names=output_headers,
data=rows, sort_by=lambda x: x['Date'])
data=rows, sort_by=lambda x: x['Date'],
# Don't consider balance-after in comparing rows,
# because txn order (and therefore running
# balance) is not stable across visits
compare_fields=output_headers[0:3])


class Scraper(scrape_lib.Scraper):
@@ -205,7 +237,7 @@ def login(self):

def download_transaction_history(self):
(transactions_link, ), = self.wait_and_return(
lambda: self.find_visible_elements_by_descendant_partial_text('Transaction History', 'td'))
lambda: self.find_visible_elements(By.ID, 'viewAllLink'))
scrape_lib.retry(transactions_link.click, retry_delay=2)
(date_select, ), = self.wait_and_return(
lambda: self.find_visible_elements_by_descendant_partial_text('All dates', 'select'))
@@ -244,7 +276,7 @@ def download_transaction_history(self):
def get_investment_balance(self):
headers = FUND_ACTIVITY_HEADERS
(table, ), = self.wait_and_return(
lambda: scrape_lib.find_table_by_headers(self, headers))
lambda: self.driver.find_elements(By.TAG_NAME, 'table'))
data = scrape_lib.extract_table_data(table, headers)
return data

@@ -256,16 +288,16 @@ def go_to_investment_history(self):
def download_fund_activity(self):
logger.info('Looking for fund activity link')
(fund_activity_link,), = self.wait_and_return(
lambda: self.find_visible_elements(By.XPATH, '//a[contains(@href, "FundActivity")]'))
lambda: self.find_visible_elements(By.ID, 'EditPortfolioTab'))
scrape_lib.retry(fund_activity_link.click, retry_delay=2)
logger.info('Selecting date ranage for fund activity')
logger.info('Selecting date range for fund activity')
(start_date,), = self.wait_and_return(
lambda: self.find_visible_elements(By.XPATH, '//input[@type="text" and contains(@id, "dateSelectStart")]'))
lambda: self.find_visible_elements(By.XPATH, '//input[@type="text" and contains(@id, "startDate")]'))
start_date.clear()
start_date.send_keys('01011900')
start_date.send_keys('01/01/1900\n')
logger.info('Downloading fund activity')
(download_link, ), = self.wait_and_return(
lambda: self.driver.find_elements_by_link_text('Download'))
lambda: self.find_visible_elements(By.ID, 'fundPerformanceDownload'))
scrape_lib.retry(download_link.click, retry_delay=2)
logger.info('Waiting for fund activity download')
download_result, = self.wait_and_return(self.get_downloaded_file)
38 changes: 37 additions & 1 deletion finance_dl/ofx.py
@@ -132,7 +132,43 @@ def CONFIG_vanguard():
import ofxclient.institution
import ofxclient

from beancount.ingest.importers.ofx import parse_ofx_time, find_child
# find_child and parse_ofx_time were derived from implementation in beancount/ingest/importers/ofx.py{,test}
# Copyright (C) 2016 Martin Blais
# GNU GPLv2
def find_child(node, name, conversion=None):
"""Find a child under the given node and return its value.
Args:
node: A <STMTTRN> bs4.element.Tag.
name: A string, the name of the child node.
conversion: A callable object used to convert the value to a new data type.
Returns:
A string, or None.
"""
child = node.find(name)
if not child:
return None
if not child.contents:
value = ''
else:
value = child.contents[0].strip()
if conversion:
value = conversion(value)
return value


def parse_ofx_time(date_str):
"""Parse an OFX time string and return a datetime object.
Args:
date_str: A string, the date to be parsed.
Returns:
A datetime.datetime instance.
"""
if len(date_str) < 14:
return datetime.datetime.strptime(date_str[:8], '%Y%m%d')
return datetime.datetime.strptime(date_str[:14], '%Y%m%d%H%M%S')
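# Examples:
#   parse_ofx_time('20230930120000') -> datetime.datetime(2023, 9, 30, 12, 0)
#   parse_ofx_time('20230930')       -> datetime.datetime(2023, 9, 30, 0, 0)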


warnings.filterwarnings('ignore', message='split()', module='re')
