Merge branch 'master' into dev/amazon
moritzj29 authored Sep 30, 2023
2 parents 7c9b860 + 4b8e28a commit 6795b4c
Showing 12 changed files with 201 additions and 81 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -76,10 +76,12 @@ pip install -e .
Configuration
==

Create a Python file like `example_finance_dl_config.py`.
Create a configuration file called something like `finance_dl_config.py`.
For a complete example of this file and some documentation,
see [example_finance_dl_config.py](example_finance_dl_config.py).
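As a rough orientation, the config is just a Python module defining `CONFIG_*` functions, one per account. A minimal sketch only (the credentials are placeholders, and the real module names and options are documented in the example file):

    # finance_dl_config.py (minimal sketch)
    import os

    data_dir = os.path.join(os.path.dirname(__file__), 'data')

    def CONFIG_discover():
        return dict(
            module='finance_dl.discover',
            credentials={'username': 'me', 'password': 'placeholder'},
            output_directory=os.path.join(data_dir, 'discover'),
        )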

Refer to the documentation of the individual scraper modules for
details.
further details.

Basic Usage
==
@@ -162,7 +164,7 @@ In this event, you have a few options:
`chromedriver_binary` somewhere other than your system's default Chrome
version, and set the environment variable `CHROMEDRIVER_CHROME_BINARY` to
point to it. (You can do this from within your finance_dl config script,
e.g. with a line like `os.environ[CHROMEDRIVER_CHROME_BINARY] = "/usr/bin/google-chrome-beta"`).
e.g. with a line like `os.environ["CHROMEDRIVER_CHROME_BINARY"] = "/usr/bin/google-chrome-beta"`).
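A sketch of that workaround inside a config script (the binary path is illustrative):

    import os
    # Point chromedriver at a pinned Chrome build instead of the system default.
    os.environ["CHROMEDRIVER_CHROME_BINARY"] = "/usr/bin/google-chrome-beta"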

License
==
14 changes: 14 additions & 0 deletions example_finance_dl_config.py
@@ -7,6 +7,20 @@
Rather than hard code your usernames and passwords into this configuration
file, you may instead wish to write some code to retrieve them from some
external password store.
For example, you can input the username/password interactively like so:

    from getpass import getpass

    def CONFIG_paypal():
        return dict(
            module='finance_dl.paypal',
            credentials={
                'username': input('PayPal username: '),   # <----
                'password': getpass('PayPal password: '), # <----
            },
            output_directory=os.path.join(data_dir, 'paypal'),
        )
"""

import os
21 changes: 19 additions & 2 deletions finance_dl/amazon.py
@@ -43,6 +43,13 @@
order page "order groups" that will be scanned for orders to download. Order groups
include years (e.g. '2020'), as well as 'last 30 days' and 'past 3 months'.
- `download_preorder_invoices`: Optional. If specified and True, invoices for
  preorders (i.e. orders that have not actually been charged yet) will be
  downloaded; by default they are skipped. Such preorder invoices are not
  typically useful for accounting since they claim a card was charged even
  though it has not actually been charged yet; they are replaced with
  invoices containing the correct information once the order is actually
  fulfilled.
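Putting the new option together with the existing ones, a hedged config sketch (credentials and paths are placeholders, following the `CONFIG_*` pattern from example_finance_dl_config.py; only `order_groups` and `download_preorder_invoices` are the options documented above):

    def CONFIG_amazon():
        return dict(
            module='finance_dl.amazon',
            credentials={'username': 'me@example.com', 'password': 'placeholder'},
            output_directory=os.path.join(data_dir, 'amazon'),
            order_groups=['last 30 days', 'past 3 months'],
            download_preorder_invoices=False,  # the default: skip pre-order invoices
        )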
Output format:
==============
@@ -112,6 +119,7 @@ class Domain():
grand_total: str
grand_total_digital: str
order_cancelled: str
pre_order: str

digital_order: str
regular_order_placed: str
@@ -144,6 +152,7 @@ def __init__(self) -> None:
grand_total='Grand Total:',
grand_total_digital='Grand Total:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digital Order: (.*)',
regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
@@ -176,6 +185,7 @@ def __init__(self) -> None:
grand_total='Grand Total:',
grand_total_digital='Grand Total:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digital Order: (.*)',
regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})',
@@ -206,6 +216,7 @@ def __init__(self) -> None:
grand_total='Gesamtsumme:',
grand_total_digital='Endsumme:',
order_cancelled='Order Canceled',
pre_order='Pre-order',

digital_order='Digitale Bestellung: (.*)',
regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})',
@@ -241,6 +252,7 @@ def __init__(self,
regular: bool = True,
digital: Optional[bool] = None,
order_groups: Optional[List[str]] = None,
download_preorder_invoices: bool = False,
**kwargs):
super().__init__(**kwargs)
if amazon_domain not in DOMAINS:
@@ -254,6 +266,7 @@
self.regular = regular
self.digital_orders_menu = digital if digital is not None else self.domain.digital_orders_menu
self.order_groups = order_groups
self.download_preorder_invoices = download_preorder_invoices

def check_url(self, url):
netloc_re = r'^([^\.@]+\.)*amazon\.' + self.domain.top_level + '$'
@@ -312,7 +325,7 @@ def finish_login(self):

def get_invoice_path(self, year, order_id):
if self.dir_per_year:
return os.path.join(self.output_directory, year, order_id + '.html')
return os.path.join(self.output_directory, str(year), order_id + '.html')
return os.path.join(self.output_directory, order_id + '.html')

def get_order_id(self, href) -> str:
@@ -393,7 +406,7 @@ def invoice_link_finder_hidden():
# submenu containing order summary takes some time to load after click
# search for order summary link and compare order_id
# repeat until order_id is different to last order_id
summary_links = self.driver.find_elements_by_link_text(
summary_links = self.driver.find_elements(By.LINK_TEXT,
self.domain.order_summary)
if summary_links:
href = summary_links[0].get_attribute('href')
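This locator change, repeated across several modules below, tracks Selenium 4's removal of the `find_element(s)_by_*` helpers in favor of `By`-based lookups; a minimal before/after sketch (the link text is illustrative):

    from selenium.webdriver.common.by import By

    # Selenium 3 style (removed in Selenium 4):
    #   links = driver.find_elements_by_link_text('Order Summary')
    # Selenium 4 style:
    links = driver.find_elements(By.LINK_TEXT, 'Order Summary')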
@@ -505,6 +518,10 @@ def get_source():
return None

page_source, = self.wait_and_return(get_source)
if self.domain.pre_order in page_source and not self.download_preorder_invoices:
# Pre-orders don't have enough information to download yet. Skip them.
logger.info(f'Skipping pre-order invoice {order_id}')
return
if order_id not in page_source:
raise ValueError(f'Failed to retrieve information for order {order_id}')

4 changes: 2 additions & 2 deletions finance_dl/comcast.py
@@ -150,7 +150,7 @@ def get_bills_link():
pass
bills_link = get_bills_link()

self.driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)
self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.ESCAPE)
bills_link.click()

def get_links():
@@ -168,7 +168,7 @@ def get_links():
cur_el = link
bill_date = None
while True:
parent = cur_el.find_element_by_xpath('..')
parent = cur_el.find_element(By.XPATH, '..')
if parent == cur_el:
break
try:
5 changes: 3 additions & 2 deletions finance_dl/discover.py
@@ -57,6 +57,7 @@ def CONFIG_discover():
import os
import shutil
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from . import scrape_lib
@@ -85,11 +86,11 @@ def check_after_wait(self):
check_url(self.driver.current_url)

def find_account_last4(self):
return self.driver.find_element_by_xpath(XPATH_OF_LAST_FOUR_DIGITS).text
return self.driver.find_element(By.XPATH, XPATH_OF_LAST_FOUR_DIGITS).text

def login(self):
try:
account = self.driver.find_element_by_xpath(XPATH_OF_LAST_FOUR_DIGITS)
account = self.driver.find_element(By.XPATH, XPATH_OF_LAST_FOUR_DIGITS)
logger.info("Already logged in")
except NoSuchElementException:
logger.info('Initiating log in')
3 changes: 2 additions & 1 deletion finance_dl/gemini.py
@@ -187,8 +187,9 @@ def get_balances_and_prices(requester, api_key, api_secret, data_dir):
logger.info(f"Got balances. Found {len(balances)} currencies.")

# Prices
tickers = [b['currency'] + "USD" for b in balances if b['currency'] != 'USD']
tickers = [b['currency'] + "USD" for b in balances if b['currency'] != 'USD' and b['currency'] != 'GUSD']
prices = {}
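# GUSD is Gemini's USD-pegged stablecoin, so its price is fixed at $1 here rather than fetched from a ticker.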
prices['GUSD'] = 1
for t in tickers:
obj = requester.make_request(TICKERS_URL+"/"+t.lower(), None, get = True)
price = (float(obj['ask']) + float(obj['bid']))/2
86 changes: 59 additions & 27 deletions finance_dl/healthequity.py
@@ -74,6 +74,9 @@ def CONFIG_healthequity():
import logging
import os
import bs4
import tempfile
import openpyxl
from openpyxl.cell.cell import MergedCell
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
@@ -101,43 +104,66 @@ def find_first_matching_date(lines, date_format):


FUND_ACTIVITY_HEADERS = [
'Fund', 'Name', 'Shares (#)', 'Closing Price', 'Closing Value'
'Fund', 'Name', 'Class', 'Target %\nallocation', 'Est. %\nholding', 'Shares\nheld', 'Closing\nprice', 'Closing\nvalue'
]

# For compatibility with beancount-import's healthequity plugin, write the old
# format for balances.csv files. The three new columns are fairly useless
# anyway, and the new (multiline) column titles are unambiguously worse even if
# a human were to actually ever read these CSVs.
OLD_FUND_ACTIVITY_HEADERS = [
'Fund','Name',None,None,None,'Shares (#)','Closing Price','Closing Value'
]

def write_balances(data, path):
rows = []
for entry in data:
keys = [x[0] for x in entry]
if keys == FUND_ACTIVITY_HEADERS:
keys, values = zip(*entry)
if list(keys) == FUND_ACTIVITY_HEADERS:
entry = [
(k, v.strip().split('\n')[0].strip('$'))
for (k, v) in zip(OLD_FUND_ACTIVITY_HEADERS, values)
if k
]
row_values = dict(entry)
row_values['Fund'] = row_values['Fund'].strip().split()[0]
row_values['Name'] = row_values['Name'].strip().split('\n')[0]
rows.append(row_values)
csv_merge.write_csv(FUND_ACTIVITY_HEADERS, rows, path)
csv_merge.write_csv([h for h in OLD_FUND_ACTIVITY_HEADERS if h], rows, path)


def write_fund_activity(raw_transactions_data, path):
input_date_format = '%m/%d/%Y'
output_date_format = '%Y-%m-%d'
soup = bs4.BeautifulSoup(raw_transactions_data.decode('utf-8'), 'lxml')
def format_cell(c):
if c.is_date:
return c.value.strftime('%Y-%m-%d')
if c.number_format[0] == '$':
base = '${:,.2f}'.format(abs(c.value))
if c.value >= 0:
return base
else:
return '(%s)' % base
return str(c.value)

wb = None
with tempfile.NamedTemporaryFile(suffix='.xlsx') as xlsx:
xlsx.write(raw_transactions_data)
xlsx.flush()
wb = openpyxl.load_workbook(xlsx.name)

ws = wb.worksheets[0]
headers = [
'Date', 'Fund', 'Category', 'Description', 'Price', 'Amount', 'Shares',
'Total Shares', 'Total Value'
]
rows = []
for row in soup.find_all('tr'):
cells = [str(x.text).strip() for x in row.find_all('td')]
while cells and not cells[-1].strip():
del cells[-1]
if len(cells) == 1:
for row in ws.rows:
if any([isinstance(c, MergedCell) for c in row]):
continue
assert len(cells) == len(headers)
assert len(row) == len(headers)
cells = [format_cell(c) for c in row]
if cells == headers:
continue
row_values = dict(zip(headers, cells))
row_values['Date'] = datetime.datetime.strptime(
row_values['Date'], input_date_format).strftime(output_date_format)
rows.append(row_values)
rows.append(dict(zip(headers, cells)))
csv_merge.merge_into_file(filename=path, field_names=headers, data=rows,
sort_by=lambda x: x['Date'])

@@ -157,12 +183,14 @@ def write_transactions(raw_transactions_data, path):
continue
if cells[0] == 'TOTAL':
continue
assert len(cells) == len(headers)
if cells == headers:
assert len(cells) >= len(headers), (cells, headers)
if cells[:len(headers)] == headers:
continue
row_values = dict(zip(headers, cells))
# Sanitize whitespace in description
row_values['Transaction'] = ' '.join(row_values['Transaction'].split())
# Remove duplicate tax year in description
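# e.g. 'Contribution (Tax year: 2023) (Tax year: 2023)' -> 'Contribution (Tax year: 2023)'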
row_values['Transaction'] = re.sub(r'(\(Tax year: \d+\)) *\1', r'\1', row_values['Transaction'])
row_values['Cash Balance'] = row_values.pop('HSA Cash Balance')

# Sanitize date_str
@@ -174,7 +202,11 @@
rows.append(row_values)
rows.reverse()
csv_merge.merge_into_file(filename=path, field_names=output_headers,
data=rows, sort_by=lambda x: x['Date'])
data=rows, sort_by=lambda x: x['Date'],
# Don't consider balance-after in comparing rows,
# because txn order (and therefore running
# balance) is not stable across visits
compare_fields=output_headers[0:3])


class Scraper(scrape_lib.Scraper):
@@ -205,7 +237,7 @@ def login(self):

def download_transaction_history(self):
(transactions_link, ), = self.wait_and_return(
lambda: self.find_visible_elements_by_descendant_partial_text('Transaction History', 'td'))
lambda: self.find_visible_elements(By.ID, 'viewAllLink'))
scrape_lib.retry(transactions_link.click, retry_delay=2)
(date_select, ), = self.wait_and_return(
lambda: self.find_visible_elements_by_descendant_partial_text('All dates', 'select'))
@@ -244,7 +276,7 @@ def download_transaction_history(self):
def get_investment_balance(self):
headers = FUND_ACTIVITY_HEADERS
(table, ), = self.wait_and_return(
lambda: scrape_lib.find_table_by_headers(self, headers))
lambda: self.driver.find_elements(By.TAG_NAME, 'table'))
data = scrape_lib.extract_table_data(table, headers)
return data

@@ -256,16 +288,16 @@ def go_to_investment_history(self):
def download_fund_activity(self):
logger.info('Looking for fund activity link')
(fund_activity_link,), = self.wait_and_return(
lambda: self.find_visible_elements(By.XPATH, '//a[contains(@href, "FundActivity")]'))
lambda: self.find_visible_elements(By.ID, 'EditPortfolioTab'))
scrape_lib.retry(fund_activity_link.click, retry_delay=2)
logger.info('Selecting date ranage for fund activity')
logger.info('Selecting date range for fund activity')
(start_date,), = self.wait_and_return(
lambda: self.find_visible_elements(By.XPATH, '//input[@type="text" and contains(@id, "dateSelectStart")]'))
lambda: self.find_visible_elements(By.XPATH, '//input[@type="text" and contains(@id, "startDate")]'))
start_date.clear()
start_date.send_keys('01011900')
start_date.send_keys('01/01/1900\n')
logger.info('Downloading fund activity')
(download_link, ), = self.wait_and_return(
lambda: self.driver.find_elements_by_link_text('Download'))
lambda: self.find_visible_elements(By.ID, 'fundPerformanceDownload'))
scrape_lib.retry(download_link.click, retry_delay=2)
logger.info('Waiting for fund activity download')
download_result, = self.wait_and_return(self.get_downloaded_file)
38 changes: 37 additions & 1 deletion finance_dl/ofx.py
@@ -132,7 +132,43 @@ def CONFIG_vanguard():
import ofxclient.institution
import ofxclient

from beancount.ingest.importers.ofx import parse_ofx_time, find_child
# find_child and parse_ofx_time were derived from implementation in beancount/ingest/importers/ofx.py{,test}
# Copyright (C) 2016 Martin Blais
# GNU GPLv2
def find_child(node, name, conversion=None):
"""Find a child under the given node and return its value.
Args:
node: A <STMTTRN> bs4.element.Tag.
name: A string, the name of the child node.
conversion: A callable object used to convert the value to a new data type.
Returns:
A string, or None.
"""
child = node.find(name)
if not child:
return None
if not child.contents:
value = ''
else:
value = child.contents[0].strip()
if conversion:
value = conversion(value)
return value


def parse_ofx_time(date_str):
"""Parse an OFX time string and return a datetime object.
Args:
date_str: A string, the date to be parsed.
Returns:
A datetime.datetime instance.
"""
if len(date_str) < 14:
return datetime.datetime.strptime(date_str[:8], '%Y%m%d')
return datetime.datetime.strptime(date_str[:14], '%Y%m%d%H%M%S')
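# Examples:
#   parse_ofx_time('20230930120000') -> datetime.datetime(2023, 9, 30, 12, 0)
#   parse_ofx_time('20230930')       -> datetime.datetime(2023, 9, 30, 0, 0)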


warnings.filterwarnings('ignore', message='split()', module='re')
