"""Retrieves purchase and reservation history from Google.
This contains purchases that have been heuristically extracted from Gmail
messages, and possibly other sources.
This uses the `selenium` Python package in conjunction with `chromedriver` to
scrape the Google Takeout and Google purchases/reservations websites.
Configuration:
==============
The following keys may be specified as part of the configuration dict:
- `credentials`: Required. Must be a `dict` with `'username'` and `'password'`
keys.
- `output_directory`: Required. Must be a `str` that specifies the path on the
local filesystem where the output will be written. If the directory does not
exist, it will be created.
- `profile_dir`: Optional. If specified, must be a `str` that specifies the
path to a persistent Chrome browser profile to use. This should be a path
used solely for this single configuration; it should not refer to your normal
browser profile. If not specified, a fresh temporary profile will be used
each time.
Output format:
==============
For each purchase, two files are written to the specified `output_directory`:
`<id>.html` contains the raw HTML content of the order details page, and
`order_<id>.json` is a JSON file in the Google Takeout Purchases/Reservations
format.
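
For example, a downstream script might read the JSON records back like this
(an illustrative sketch only; `output_directory` stands for the directory
configured above):

    import glob
    import json
    import os

    for path in sorted(
            glob.glob(os.path.join(output_directory, 'order_*.json'))):
        with open(path, encoding='utf-8') as f:
            order = json.load(f)
        print(path, type(order).__name__)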

Example:
========

    def CONFIG_google_purchases():
        return dict(
            module='finance_dl.google_purchases',
            credentials={
                'username': 'XXXXXX',
                'password': 'XXXXXX',
            },
            output_directory=os.path.join(data_dir, 'google_purchases'),
            # profile_dir is optional.
            profile_dir=os.path.join(profile_dir, 'google_purchases'),
        )

Interactive shell:
==================

From the interactive shell, type `self.run()` to start the scraper.
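
To reach the interactive shell, you can call this module's `interactive`
function directly. A minimal sketch, assuming a config dict like
`CONFIG_google_purchases()` above with the `module` key removed (that key only
selects this module and is not a `Scraper` argument):

    from finance_dl import google_purchases

    config = CONFIG_google_purchases()
    del config['module']
    google_purchases.interactive(**config)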
"""
from typing import List, Any, Tuple

import urllib.parse
import re
import json
import logging
import os

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import jsonschema
from atomicwrites import atomic_write

from . import scrape_lib
from . import google_login
from . import google_takeout

logger = logging.getLogger('google_purchases')

# Matches google.com and its subdomains.  Note the escaped `.` so that the
# pattern cannot accidentally match a host such as `googleXcom`.
netloc_re = r'^([^\.@]+\.)*google\.com$'


def check_url(url):
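    """Raises RuntimeError unless `url` is an HTTPS URL on a google.com host.

    For illustration: 'https://myaccount.google.com/purchases' is accepted,
    while 'http://google.com/' (wrong scheme) and
    'https://example.com/google.com' (non-Google host) are rejected.
    """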
    result = urllib.parse.urlparse(url)
    if result.scheme != 'https' or not re.fullmatch(netloc_re, result.netloc):
        raise RuntimeError('Reached invalid URL: %r' % url)


class Scraper(google_takeout.Scraper):
    def __init__(self, output_directory: str, **kwargs):
        super().__init__(**kwargs)
        self.output_directory = output_directory

    def check_after_wait(self):
        check_url(self.driver.current_url)

    def extract_raw_data(self):
        source = self.driver.page_source
        prefix = 'data:function(){return '
        start_index = source.index(prefix) + len(prefix)
        source_suffix = source[start_index:]
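        # The purchase data is embedded in the page as a JSON value inside a
        # JavaScript blob of the form `data:function(){return <json>}`, with
        # no explicit terminator for the value.  Parsing the whole suffix is
        # therefore expected to fail at the first character past the value
        # (the closing `}` of the function body); `e.pos` then marks where
        # the JSON value ends, and re-parsing the truncated prefix succeeds.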
        try:
            value = json.loads(source_suffix)
            raise ValueError('Expected error parsing JSON')
        except json.JSONDecodeError as e:
            encoded_json = source_suffix[:e.pos]
            value = json.loads(encoded_json)
        return value

    def _fetch_html_pages(self, need_to_fetch: List[Tuple[str, str]]):
        logger.info('Fetching details for %d purchases', len(need_to_fetch))
        for i, (purchase_id, html_path) in enumerate(need_to_fetch):
            url = ('https://myaccount.google.com/purchases/detail?order_id=' +
                   purchase_id)
            logger.info('Fetching details %d/%d: %s', i + 1,
                        len(need_to_fetch), url)
            with self.wait_for_page_load():
                self.driver.get(url)
            content = self.driver.page_source
            with atomic_write(html_path, mode='w', encoding='utf-8',
                              newline='\n', overwrite=True) as f:
                # Write a Unicode byte order mark so that the content will be
                # properly interpreted as UTF-8 when opened later.
                f.write('\ufeff' + content)
            logger.info('Wrote details %d/%d: %s', i + 1, len(need_to_fetch),
                        html_path)

    def run(self):
        if not os.path.exists(self.output_directory):
            os.makedirs(self.output_directory)
        self.download_data()

    def download_data(self):
        takeout_zip = self.get_takeout_zipfile(['my_orders'])
        need_to_fetch = []
        for name in takeout_zip.namelist():
            m = re.match(r'.*/order_([0-9]+)\.json$', name)
            if m is None:
                logger.info('Ignoring file in takeout archive: %s', name)
                continue
            order_id = m.group(1)
            # Copy the Takeout JSON record unless it was already saved by a
            # previous run.
            json_path = os.path.join(self.output_directory,
                                     'order_' + order_id + '.json')
            if not os.path.exists(json_path):
                with atomic_write(json_path, mode='wb', overwrite=True) as f:
                    f.write(takeout_zip.read(name))
            # Likewise, fetch the HTML details page only if it is missing.
            html_path = os.path.join(self.output_directory, order_id + '.html')
            if os.path.exists(html_path):
                continue
            need_to_fetch.append((order_id, html_path))
        self._fetch_html_pages(need_to_fetch)


def run(**kwargs):
    scrape_lib.run_with_scraper(Scraper, **kwargs)


def interactive(**kwargs):
    return scrape_lib.interact_with_scraper(Scraper, **kwargs)