-
Notifications
You must be signed in to change notification settings - Fork 8
/
encode_emoji.py
executable file
·61 lines (48 loc) · 1.9 KB
/
encode_emoji.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import codecs
import json
import re
import sys
def replace_emoji_characters(s):
    """Swap every 4-byte Unicode character in ``s`` for an HTML span.

    A character above U+FFFF (e.g. U+1F600) takes four bytes in UTF-8.
    Each such character is UTF-8 encoded and its byte values are written
    as a JSON array into the ``data-emoji-bytes`` attribute of an empty
    span, e.g.:

        <span class='emoji-bytes' data-emoji-bytes='[240, 159, 152, 128]'></span>

    Everything else in the string is passed through unchanged.

    Args:
        s (Unicode string): text that may contain 4-byte characters

    Returns:
        Unicode string in which every 4-byte character of the source
        string has been replaced with its HTML span

    Raises:
        UnicodeError: if ``sys.maxunicode`` is neither 0x10FFFF nor 0xFFFF
    """
    # The way these characters are matched is based on this
    # StackOverflow post:
    # http://stackoverflow.com/questions/12636489/python-convert-4-byte-char-to-avoid-mysql-error-incorrect-string-value
    patterns_by_maxunicode = {
        # Python was built with '--enable-unicode=ucs4': each character
        # is a single code point.
        0x10FFFF: u'[\U00010000-\U0010ffff]',
        # Python was built with '--enable-unicode=ucs2': each character
        # is stored as a high/low surrogate pair.
        0xFFFF: u'[\uD800-\uDBFF][\uDC00-\uDFFF]',
    }
    pattern_text = patterns_by_maxunicode.get(sys.maxunicode)
    if pattern_text is None:
        raise UnicodeError(
            "Unable to determine if Python was built using UCS-2 or UCS-4")

    def _span_for(match):
        """Build the replacement span for one matched character.

        Args:
            match (MatchObject): regex match covering a single character

        Returns:
            Unicode string holding the span markup
        """
        utf8_values = list(bytearray(match.group().encode('utf-8')))
        markup = u"<span class='emoji-bytes' data-emoji-bytes='%s'></span>"
        return markup % json.dumps(utf8_values)

    return re.compile(pattern_text).sub(_span_for, s)
def main():
    """Print the contents of ``emoji.txt`` with 4-byte characters replaced.

    Reads ``emoji.txt`` (relative path) as UTF-8, strips leading and
    trailing whitespace, passes the text through
    ``replace_emoji_characters`` and prints the result.
    """
    # Open the file in a ``with`` block so the handle is closed as soon as
    # the text has been read, instead of being left open.
    with codecs.open('emoji.txt', encoding='utf-8') as emoji_file:
        emoji = emoji_file.read().strip()
    print(replace_emoji_characters(emoji))
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()