Hacky fixes to archive likes #114

Open
wants to merge 12 commits into master
README.md: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
# Archiving

I made some adjustments to this script several years ago to handle downloading
likes. I haven't kept up with it. However, a handful of folks have watched,
starred, or otherwise referenced this repository/fork before, so I'm leaving it
up, archived, for posterity.

# tumblr-utils

This is a collection of utilities dealing with Tumblr blogs.
tumblr_backup.py: 148 additions & 35 deletions
@@ -23,6 +23,7 @@
import sys
import threading
import time
import hashlib
import urllib
import urllib2
import urlparse
@@ -96,6 +97,7 @@ def test_jpg(h, f):
TAG_ANY = '__all__'

MAX_POSTS = 50
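# The likes endpoint returns at most 20 entries per request (the posts endpoint allows 50)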
MAX_LIKES = 20

HTTP_TIMEOUT = 90
HTTP_CHUNK_SIZE = 1024 * 1024
@@ -121,7 +123,6 @@ def urlopen(url):
def urlopen(url):
return urllib2.urlopen(url, timeout=HTTP_TIMEOUT)


def log(account, s):
if not options.quiet:
if account:
@@ -193,6 +194,33 @@ def set_period():
tm[i] += 1
options.p_stop = time.mktime(tm)

def apiparse_likes(base, count, before=0):
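# Fetch up to `count` likes from the likes endpoint; `before` is a liked_timestamp
# used to page backwards (0 means start from the most recent like).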
params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
if before > 0:
params['before'] = before
url = base + '?' + urllib.urlencode(params)
for _ in range(10):
try:
resp = urlopen(url)
data = resp.read()
except (EnvironmentError, HTTPException) as e:
sys.stderr.write("%s getting %s\n" % (e, url))
continue
if resp.info().gettype() == 'application/json':
break
sys.stderr.write("Unexpected Content-Type: '%s'\n" % resp.info().gettype())
return None
else:
return None
try:
doc = json.loads(data)
except ValueError as e:
sys.stderr.write('%s: %s\n%d %s %s\n%r\n' % (
e.__class__.__name__, e, resp.getcode(), resp.msg, resp.info().gettype(), data
))
return None
return doc if doc.get('meta', {}).get('status', 0) == 200 else None


def apiparse(base, count, start=0):
params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
@@ -414,7 +442,8 @@ def save_tag_index(self):
mkdir(path_to(tag_index_dir))
self.fixup_media_links()
tag_index = [self.blog.header('Tag index', 'tag-index', self.blog.title, True), '<ul>']
for tag, index in sorted(self.tags.items(), key=lambda kv: kv[1].name):
for index in sorted(self.tags.values(), key=lambda v: v.name):
tag = hashlib.sha256(index.name.encode('utf-8')).hexdigest()
index.save_index(tag_index_dir + os.sep + tag,
u"Tag ‛%s’" % index.name
)
@@ -514,10 +543,39 @@ def backup(self, account):
ident_max = None
if options.incremental:
try:
ident_max = max(
long(splitext(split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
if not options.likes:
ident_max = max(
long(splitext(split(f)[1])[0])
for f in glob(path_to(post_dir, '*' + post_ext))
)
else:
# Need to read every file to find the latest timestamp we've liked;
# we can't just lean on the post ident, since the likes API endpoint
# expects before/after to be a liked_timestamp.
#
# This code operates on the assumption that, for a like backup,
# the stored date in the post html looks like this:
#
# <p><time datetime=2018-12-03T03:49:35Z>12/02/2018 09:49:35 PM</time>
#
# where the datetime is the time the user *liked* the post,
# assuming the post html was generated on a "likes" run.
log(account, "Finding latest like (may take a while)")
ident_max = 0
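# matches the ISO 8601 datetime stored in each post's <time datetime=...> element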
expr = re.compile("(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?")
globslug = join('*', dir_index) if options.dirs else '*' + post_ext
for f in glob(path_to(post_dir, globslug)):
fh = open(f,'r')
for line in fh:
res = expr.findall(line)
if res:
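# rebuild the liked time from the matched groups and convert it to a Unix timestamp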
dt = datetime(int(res[0][0]), int(res[0][1]), int(res[0][2]), int(res[0][3]), int(res[0][4]), int(res[0][5]))
tstamp = long((dt - datetime(1970,1,1)).total_seconds())
if tstamp > ident_max:
ident_max = tstamp
# no need to keep evaluating the file; there should only ever be one <time/> element
break
log(account, "Backing up posts after %d\r" % ident_max)
except ValueError: # max() arg is an empty sequence
pass
@@ -536,11 +594,12 @@ def backup(self, account):
_get_content = lambda soup: soup['response']['liked_posts']
blog = {}
last_post = resp['liked_count']
self.title = escape(blog.get('title', account)) + " likes"
else:
_get_content = lambda soup: soup['response']['posts']
blog = resp['blog']
last_post = blog['posts']
self.title = escape(blog.get('title', account))
self.title = escape(blog.get('title', account))
self.subtitle = blog.get('description', '')

# use the meta information to create a HTML header
@@ -551,10 +610,15 @@ def backup(self, account):
last_post = min(last_post, options.count + options.skip)

def _backup(posts):
for p in sorted(posts, key=lambda x: x['id'], reverse=True):
for p in sorted(posts, key=lambda x: x['id'] if not options.likes else x['liked_timestamp'], reverse=True):
post = post_class(p)
if ident_max and long(post.ident) <= ident_max:
return False
if ident_max:
if not options.likes:
if long(post.ident) <= ident_max:
return False
else:
if long(post.date) <= ident_max:
return False
if options.period:
if post.date >= options.p_stop:
continue
@@ -581,28 +645,69 @@ def _backup(posts):
# start the thread pool
backup_pool = ThreadPool()
try:
# Get the JSON entries from the API, which we can only do for max 50 posts at once.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
last_batch = MAX_POSTS
i = options.skip
while i < last_post:
# find the upper bound
j = min(i + MAX_POSTS, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse(base, j - i, i)
if soup is None:
i += last_batch # try the next batch
self.errors = True
continue

posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
break
if not options.likes:
# Get the JSON entries from the API, which we can only do for max 50 posts at once.
# Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
last_batch = MAX_POSTS
i = options.skip
while i < last_post:
# find the upper bound
j = min(i + MAX_POSTS, last_post)
log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse(base, j - i, i)
if soup is None:
i += last_batch # try the next batch
self.errors = True
continue

last_batch = len(posts)
i += last_batch
posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
break

last_batch = len(posts)
i += last_batch
else:
# Get the JSON entries from the API, which we can only do for max 20 likes at once.
# Likes "arrive" in reverse chronological order. Post #0 is the most recent one.
i = options.skip
finished_with_likes = False
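# 0 = no cursor yet; apiparse_likes omits 'before' and the API starts from the newest like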
before_timestamp = 0
#before_timestamp = 1485673434
#before_timestamp = 1488326400
#before_timestamp = 1326153600
while not finished_with_likes:
# find the upper bound
j = min(i + MAX_LIKES, last_post)
log(account, "Getting likes %d to %d of %d\r" % (i, j - 1, last_post))

soup = apiparse_likes(base, MAX_LIKES, before_timestamp)
if soup is None:
i += MAX_LIKES # note the failed batch; without a 'before' cursor we can't page further
self.errors = True
break
else:
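# advance pagination using the 'before' cursor the API returns in _links.next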
try:
before_timestamp = soup['response']['_links']['next']['query_params']['before']
except KeyError:
if soup['meta']['status'] == 200 and not soup['response']['liked_posts']:
finished_with_likes = True
continue
else:
raise

posts = _get_content(soup)
# posts can be empty if we don't backup reblogged posts
if not posts or not _backup(posts):
finished_with_likes = True

# Don't want to blow through hourly or daily quota.
time.sleep(10)

i += MAX_LIKES


except:
# ensure proper thread pool termination
backup_pool.cancel()
@@ -638,7 +743,11 @@ def __init__(self, post):
self.url = post['post_url']
self.shorturl = post['short_url']
self.typ = str(post['type'])
self.date = post['timestamp']
if options.likes:
self.creator = post['blog_name']
self.date = post['liked_timestamp']
else:
self.date = post['timestamp']
self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z'
self.tm = time.localtime(self.date)
self.title = ''
@@ -682,6 +791,7 @@ def append_try(elt, fmt=u'%s'):
)
append(elt, fmt)


self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir
self.media_url = save_dir + self.media_dir
self.media_folder = path_to(self.media_dir)
@@ -766,6 +876,8 @@ def append_try(elt, fmt=u'%s'):
)

else:
if self.typ is None or self.typ == '':
self.typ = 'none'
sys.stderr.write(
u"Unknown post type '%s' in post #%s%-50s\n" % (self.typ, self.ident, ' ')
)
@@ -944,10 +1056,11 @@ def get_post(self):
"""returns this post in HTML"""
typ = ('liked-' if options.likes else '') + self.typ
post = self.post_header + u'<article class=%s id=p-%s>\n' % (typ, self.ident)
post += u'<header>\n'
if options.likes:
post += u'<p><a href=\"http://{0}.tumblr.com/\" class=\"tumblr_blog\">{0}</a>:</p>\n'.format(self.creator)
post += u'<p><time datetime=%s>%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
post += u'<header><p><a href=\"https://{0}.tumblr.com/\" class=\"tumblr_blog\">{0}</a>:</p>\n'.format(self.creator)
post += u'<p><time datetime=%s class=\"tumblr_time_of_like\">%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
else:
post += u'<header>\n<p><time datetime=%s>%s</time>\n' % (self.isodate, strftime('%x %X', self.tm))
post += u'<a class=llink href=%s%s/%s>¶</a>\n' % (save_dir, post_dir, self.llink)
post += u'<a href=%s>●</a>\n' % self.shorturl
if self.reblogged_from and self.reblogged_from != self.reblogged_root: