Skip to content

Commit e687b96

Browse files
Merge pull request #203 from obsidianforensics/bluesky
Add parser for Bluesky TIDs (timestamp ids)
2 parents 4793e00 + cd444d3 commit e687b96

File tree

3 files changed

+119
-0
lines changed

3 files changed

+119
-0
lines changed

unfurl/parsers/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"parse_base64",
33
"parse_bing",
44
"parse_brave",
5+
"parse_bluesky",
56
"parse_compressed",
67
"parse_discord",
78
"parse_dns",

unfurl/parsers/parse_bluesky.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright 2024 Ryan Benson
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import base64
16+
import re
17+
18+
import logging
19+
log = logging.getLogger(__name__)
20+
21+
# Styling applied to the graph edge that links a TID node to its decoded timestamp.
bluesky_edge = {
    'color': {
        'color': '#1185fe'
    },
    'title': 'Bluesky TID',
    'label': '🦋'
}

# Loose shape check: exactly 13 characters drawn from the base32-sortable alphabet.
# The stricter "top bit must be 0" rule is enforced inside parse_bluesky_tid().
tid_re = re.compile(r'[2-7a-z]{13}')

# Create a mapping from "base32-sortable" alphabet to standard base32 alphabet
BASE32_SORTABLE_ALPHABET = "234567abcdefghijklmnopqrstuvwxyz"
STANDARD_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
BASE32_SORTABLE_TRANS = str.maketrans(BASE32_SORTABLE_ALPHABET, STANDARD_ALPHABET)


def parse_bluesky_tid(unfurl, node):
    """Decode a Bluesky TID and queue its embedded timestamp as a child node.

    Ref: https://atproto.com/specs/record-key#record-key-type-tid

    Args:
        unfurl: Object exposing ``add_to_queue(...)``; receives the new
            'epoch-microseconds' item.
        node: Object whose ``value`` is the 13-character TID string and whose
            ``node_id`` becomes the parent of the queued item.

    Raises:
        ValueError: If ``node.value`` is not 13 base32-sortable characters, or
            if the top bit of the encoded value is set. These checks used to
            be ``assert`` statements, which are skipped under ``python -O``.
    """
    tid = node.value
    if not tid_re.fullmatch(tid):
        raise ValueError("Bluesky TID is not in the expected format (base32-sortable)")

    # Each character carries 5 bits. The top bit of the first character's
    # 5-bit value must be 0, i.e. the first character has to be one of the
    # first 16 alphabet symbols ("234567abcdefghij"). Testing the character's
    # ASCII code instead would wrongly reject the valid 'a'-'j' range.
    if BASE32_SORTABLE_ALPHABET.index(tid[0]) & 0x10:
        raise ValueError("Bluesky TID high bit is set; it must be 0")

    # Translate the base32-sortable string to standard base32, then decode it.
    # 13 characters hold 65 bits; with "===" padding b32decode returns 8 bytes,
    # i.e. the upper 64 of those bits (the lowest bit is dropped).
    translated_str = tid.translate(BASE32_SORTABLE_TRANS)
    decoded_bytes = base64.b32decode(translated_str + "===")

    # Those 64 bits are: 2 leading zero bits, the 53-bit timestamp (microseconds
    # since the UNIX epoch), then the upper 9 bits of the 10-bit random "clock
    # identifier". Shifting right by 9 therefore leaves only the timestamp.
    timestamp = int.from_bytes(decoded_bytes, byteorder="big") >> 9

    unfurl.add_to_queue(
        data_type='epoch-microseconds', key=None, value=timestamp, label=f'TID Timestamp: {timestamp}',
        hover='Bluesky uses <i>timestamp identifiers</i> ("TIDs") as a way to reference records, '
              'which contain an embedded timestamp.',
        parent_id=node.node_id, incoming_edge_config=bluesky_edge)
54+
55+
56+
def run(unfurl, node):
    """Parser entry point: hand TID-shaped node values to parse_bluesky_tid().

    A node is considered only when its value is a string that fully matches
    ``tid_re``. It is then parsed in one of two situations:

    * it is a URL path segment whose preceding domain is ``bsky.app``; or
    * it is the "root" node (``node_id == 1``), which covers someone parsing
      just an ID rather than a full URL.
    """
    candidate = node.value
    if not isinstance(candidate, str):
        return
    if tid_re.fullmatch(candidate) is None:
        return

    if node.data_type == 'url.path.segment':
        # Path segments are only trusted as TIDs when they sit under a known Bluesky domain.
        if unfurl.find_preceding_domain(node) in ['bsky.app']:
            parse_bluesky_tid(unfurl, node)
        return

    # Not a path segment: only a bare ID supplied as the root node qualifies.
    if node.node_id == 1:
        parse_bluesky_tid(unfurl, node)

unfurl/tests/unit/test_bluesky.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from unfurl.core import Unfurl
2+
import unittest
3+
4+
5+
class TestBluesky(unittest.TestCase):
    """Unit tests for the Bluesky TID (timestamp identifier) parser."""

    @staticmethod
    def _unfurl(value):
        """Queue `value` as a 'url' item on a fresh Unfurl, parse it, and return the instance."""
        instance = Unfurl()
        instance.add_to_queue(data_type='url', key=None, value=value)
        instance.parse_queue()
        return instance

    def test_bluesky_post(self):
        """ Test a typical Bluesky post URL """

        parsed = self._unfurl('https://bsky.app/profile/jay.bsky.team/post/3lbd2ebt3wk2r')

        # check the number of nodes
        self.assertEqual(len(parsed.nodes.keys()), 13)
        self.assertEqual(parsed.total_nodes, 13)

        tid_node = parsed.nodes[12]
        timestamp_node = parsed.nodes[13]

        # confirm that TID was detected
        self.assertIn('timestamp identifiers', tid_node.hover)

        # confirm that TID was extracted correctly
        self.assertEqual(1732040395098000, tid_node.value)

        # embedded timestamp parses correctly
        self.assertEqual('2024-11-19 18:19:55.098000', timestamp_node.value)

    def test_bluesky_bare_tid(self):
        """ Test parsing a Bluesky/ATProto TID"""

        parsed = self._unfurl('3laulgolrfz2f')

        # check the number of nodes
        self.assertEqual(len(parsed.nodes.keys()), 3)
        self.assertEqual(parsed.total_nodes, 3)

        tid_node = parsed.nodes[2]
        timestamp_node = parsed.nodes[3]

        # confirm that TID was detected
        self.assertIn('timestamp identifiers', tid_node.hover)

        # confirm that TID was extracted correctly
        self.assertEqual(1731543333133695, tid_node.value)

        # embedded timestamp parses correctly
        self.assertEqual('2024-11-14 00:15:33.133695', timestamp_node.value)


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)