Hacky fixes to archive likes #114

Open
wants to merge 12 commits into master
README.md: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
# Archiving

I made some adjustments to this script several years ago to handle downloading
likes. I haven't kept up with it. However, a handful of folks have watched,
starred, or otherwise referenced this repository/fork before, so I'm leaving it
up, archived, for posterity.

# tumblr-utils

This is a collection of utilities dealing with Tumblr blogs.
tumblr_backup.py: 148 additions & 35 deletions
@@ -23,6 +23,7 @@
import sys
import threading
import time
import hashlib
import urllib
import urllib2
import urlparse
@@ -96,6 +97,7 @@ def test_jpg(h, f):
TAG_ANY = '__all__'

MAX_POSTS = 50
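# The likes endpoint returns at most 20 entries per request (the posts endpoint allows 50)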
MAX_LIKES = 20

HTTP_TIMEOUT = 90
HTTP_CHUNK_SIZE = 1024 * 1024
@@ -121,7 +123,6 @@ def urlopen(url):
def urlopen(url):
return urllib2.urlopen(url, timeout=HTTP_TIMEOUT)


def log(account, s):
if not options.quiet:
if account:
@@ -193,6 +194,33 @@ def set_period():
tm[i] += 1
options.p_stop = time.mktime(tm)

def apiparse_likes(base, count, before=0):
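# Fetch up to `count` likes from the likes endpoint; `before` is a liked_timestamp
# used to page backwards (0 means start from the most recent like).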
params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
if before > 0:
params['before'] = before
url = base + '?' + urllib.urlencode(params)
for _ in range(10):
try:
resp = urlopen(url)
data = resp.read()
except (EnvironmentError, HTTPException) as e:
sys.stderr.write("%s getting %s\n" % (e, url))
continue
if resp.info().gettype() == 'application/json':
break
sys.stderr.write("Unexpected Content-Type: '%s'\n" % resp.info().gettype())
return None
else:
return None
try:
doc = json.loads(data)
except ValueError as e:
sys.stderr.write('%s: %s\n%d %s %s\n%r\n' % (
e.__class__.__name__, e, resp.getcode(), resp.msg, resp.info().gettype(), data
))
return None
return doc if doc.get('meta', {}).get('status', 0) == 200 else None


def apiparse(base, count, start=0):
params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
@@ -414,7 +442,8 @@ def save_tag_index(self):
mkdir(path_to(tag_index_dir))
self.fixup_media_links()
tag_index = [self.blog.header('Tag index', 'tag-index', self.blog.title, True), '<ul>']
for tag, index in sorted(self.tags.items(), key=lambda kv: kv[1].name):
for index in sorted(self.tags.values(), key=lambda v: v.name):
tag = hashlib.sha256(index.name.encode('utf-8')).hexdigest()
index.save_index(tag_index_dir + os.sep + tag,
u"Tag ‛%s’" % index.name
)
@@ -514,10 +543,39 @@ def backup(self, account):
ident_max = None
if options.incremental:
try:
ident_max = max(
long(splitext(split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
if not options.likes:
ident_max = max(
long(splitext(split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
else:
# Need to read every file to find the latest timestamp we've liked;
# we can't just lean on the post ident, since the likes API endpoint
# expects before/after to be a liked_timestamp.
#
# This code operates on the assumption that, for a like backup,
# the stored date in the post html looks like this:
#
# <p><time datetime=2018-12-03T03:49:35Z>12/02/2018 09:49:35 PM</time>
#
# where the datetime is the time the user *liked* the post,
# assuming the post html was generated on a "likes" run.
log(account, "Finding latest like (may take a while)")
ident_max = 0
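# matches the ISO 8601 datetime stored in each post's <time datetime=...> element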
expr = re.compile("(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?")
globslug = join('*', dir_index) if options.dirs else '*' + post_ext
for f in glob(path_to(post_dir, globslug)):
fh = open(f,'r')
for line in fh:
res = expr.findall(line)
if res:
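# rebuild the liked time from the matched groups and convert it to a Unix timestamp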
dt = datetime(int(res[0][0]), int(res[0][1]), int(res[0][2]), int(res[0][3]), int(res[0][4]), int(res[0][5]))
tstamp = long((dt - datetime(1970,1,1)).total_seconds())
if tstamp > ident_max:
ident_max = tstamp
# no need to keep evaluating the file; there should only ever be one <time/> element
break
log(account, "Backing up posts after %d\r" % ident_max)
except ValueError: # max() arg is an empty sequence
pass
@@ -536,11 +594,12 @@ def backup(self, account):
_get_content = lambda soup: soup['response']['liked_posts']
blog = {}
last_post = resp['liked_count']
self.title = escape(blog.get('title', account)) + " likes"
else:
_get_content = lambda soup: soup['response']['posts']
blog = resp['blog']
last_post = blog['posts']
self.title = escape(blog.get('title', account))
self.title = escape(blog.get('title', account))
self.subtitle = blog.get('description', '')

# use the meta information to create a HTML header
@@ -551,10 +610,15 @@ def backup(self, account):
last_post = min(last_post, options.count + options.skip)

def _backup(posts):
for p in sorted(posts, key=lambda x: x['id'], reverse=True):
for p in sorted(posts, key=lambda x: x['id'] if not options.likes else x['liked_timestamp'], reverse=True):
post = post_class(p)
if ident_max and long(post.ident) <= ident_max:
return False
if ident_max:
if not options.likes:
if long(post.ident) <= ident_max:
return False
else:
if long(post.date) <= ident_max:
return False
if options.period:
if post.date >= options.p_stop:
continue
@@ -581,28 +645,69 @@ def _backup(posts):
# start the thread pool
backup_pool = ThreadPool()
try:
# Get the JSON entries from the API, which we can only do for max 50 posts at once.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
last_batch = MAX_POSTS
i = options.skip
while i < last_post:
# find the upper bound
j = min(i + MAX_POSTS, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse(base, j - i, i)
if soup is None:
i += last_batch # try the next batch
self.errors = True
continue

posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
break
if not options.likes:
# Get the JSON entries from the API, which we can only do for max 50 posts at once.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
last_batch = MAX_POSTS
i = options.skip
while i < last_post:
# find the upper bound
j = min(i + MAX_POSTS, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse(base, j - i, i)
if soup is None:
i += last_batch # try the next batch
self.errors = True
continue

last_batch = len(posts)
i += last_batch
posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
break

last_batch = len(posts)
i += last_batch
else:
# Get the JSON entries from the API, which we can only do for max 20 likes at once.
# Likes "arrive" in reverse chronological order. Post #0 is the most recent one.
i = options.skip
finished_with_likes = False
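# 0 = no cursor yet; apiparse_likes omits 'before' and the API starts from the newest like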
before_timestamp = 0
#before_timestamp = 1485673434
#before_timestamp = 1488326400
#before_timestamp = 1326153600
while not finished_with_likes:
# find the upper bound
j = min(i + MAX_LIKES, last_post)
log(account, "Getting likes %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse_likes(base, MAX_LIKES, before_timestamp)
if soup is None:
i += MAX_LIKES # note the failed batch; without a 'before' cursor we can't page further
self.errors = True
break
else:
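# advance pagination using the 'before' cursor the API returns in _links.next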
try:
before_timestamp = soup['response']['_links']['next']['query_params']['before']
except KeyError:
if soup['meta']['status'] == 200 and not soup['response']['liked_posts']:
finished_with_likes = True
continue
else:
raise

posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
finished_with_likes = True

# Don't want to blow through hourly or daily quota.
time.sleep(10)

i += MAX_LIKES


except:
# ensure proper thread pool termination
backup_pool.cancel()
@@ -638,7 +743,11 @@ def __init__(self, post):
self.url = post['post_url']
self.shorturl = post['short_url']
self.typ = str(post['type'])
self.date = post['timestamp']
if options.likes:
self.creator = post['blog_name']
self.date = post['liked_timestamp']
else:
self.date = post['timestamp']
self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z'
self.tm = time.localtime(self.date)
self.title = ''
@@ -682,6 +791,7 @@ def append_try(elt, fmt=u'%s'):
)
append(elt, fmt)


self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir
self.media_url = save_dir + self.media_dir
self.media_folder = path_to(self.media_dir)
@@ -766,6 +876,8 @@ def append_try(elt, fmt=u'%s'):
)

else:
if self.typ is None or self.typ == '':
self.typ = 'none'
sys.stderr.write(
u"Unknown post type '%s' in post #%s%-50s\n" % (self.typ, self.ident, ' ')
)
@@ -944,10 +1056,11 @@ def get_post(self):
"""returns this post in HTML"""
typ = ('liked-' if options.likes else '') + self.typ
post = self.post_header + u'<article class=%s id=p-%s>\n' % (typ, self.ident)
post += u'<header>\n'
if options.likes:
post += u'<p><a href=\"http://{0}.tumblr.com/\" class=\"tumblr_blog\">{0}</a>:</p>\n'.format(self.creator)
post += u'<p><time datetime=%s>%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
post += u'<header><p><a href=\"https://{0}.tumblr.com/\" class=\"tumblr_blog\">{0}</a>:</p>\n'.format(self.creator)
post += u'<p><time datetime=%s class=\"tumblr_time_of_like\">%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
else:
post += u'<header>\n<p><time datetime=%s>%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
post += u'<a class=llink href=%s%s/%s>¶</a>\n' % (save_dir, post_dir, self.llink)
post += u'<a href=%s>●</a>\n' % self.shorturl
if self.reblogged_from and self.reblogged_from != self.reblogged_root: