Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoding and decoding of non-BMP characters #27

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cl-json.asd
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
(:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder"))))))

(defsystem :cl-json.test
:depends-on (:cl-json :fiveam )
:depends-on (:cl-json :cl-unicode :fiveam)
;; newer ASDF versions have this implicitly, but I know of no good way to detect this. [2010/01/02:rpg]
:in-order-to ((test-op (load-op "cl-json.test")))
:components ((:module :t
Expand Down
18 changes: 18 additions & 0 deletions src/common.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@
Strings. If nil, translate any such sequence to the char after
slash.")

(defun bmp-code-p (code)
(< code #x10000))

(defun high-surrogate-code-p (code)
(<= #xD800 code #xDBFF))

(defun low-surrogate-code-p (code)
(<= #xDC00 code #xDFFF))

(defun surrogate-pair (code)
(let ((reduced (- code #x10000)))
(cons (+ #xD800 (ldb (byte 10 10) reduced))
(+ #xDC00 (ldb (byte 10 0) reduced)))))

(defun surrogate-pair-to-code (pair)
(+ (+ (ash (- (car pair) #xD800) 10)
(- (cdr pair) #xDC00))
#x10000))

;;; Symbols

Expand Down
57 changes: 35 additions & 22 deletions src/decoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -155,28 +155,41 @@ return NIL."
(escaped-char-dispatch c
:code-handler
((len rdx)
(let ((code
(let ((repr (make-string len)))
(dotimes (i len)
(setf (aref repr i) (read-char stream)))
(handler-case (parse-integer repr :radix rdx)
(parse-error ()
(json-syntax-error stream esc-error-fmt
(format nil "\\~C" c)
repr))))))
(restart-case
(or (and (< code char-code-limit) (code-char code))
(error 'no-char-for-code :code code))
(substitute-char (char)
:report "Substitute another char."
:interactive
(lambda ()
(format *query-io* "Char: ")
(list (read-char *query-io*)))
char)
(pass-code ()
:report "Pass the code to char handler."
code))))
(flet ((parse-code ()
(let ((repr (make-string len)))
(dotimes (i len)
(setf (aref repr i) (read-char stream)))
(handler-case (parse-integer repr :radix rdx)
(parse-error ()
(json-syntax-error stream esc-error-fmt
(format nil "\\~C" c)
repr))))))
(let ((code (parse-code)))
(when (high-surrogate-code-p code)
;; if code falls in the surrogate code range, the next
;; char is its low surrogate again encoded as "\uXXXX"
(assert (and (char= (read-char stream) #\\)
(char= (read-char stream) c))
nil
"Expected start of low surrogate code sequence")
(let ((next (parse-code)))
(assert (low-surrogate-code-p next)
nil
"Expected low surrogate code but instead got ~A" next)
(setf code (surrogate-pair-to-code (cons code next)))))
(restart-case
(or (and (< code char-code-limit) (code-char code))
(error 'no-char-for-code :code code))
(substitute-char (char)
:report "Substitute another char."
:interactive
(lambda ()
(format *query-io* "Char: ")
(list (read-char *query-io*)))
char)
(pass-code ()
:report "Pass the code to char handler."
code)))))
:default-handler
(if *use-strict-json-rules*
(json-syntax-error stream esc-error-fmt "\\" c)
Expand Down
8 changes: 7 additions & 1 deletion src/encoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,13 @@ characters in string S to STREAM."
else
do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+)))
(destructuring-bind (esc . (width . radix)) special
(format stream "\\~C~V,V,'0R" esc radix width code)))))
(flet ((write-code (code)
(format stream "\\~C~V,V,'0R" esc radix width code)))
(if (bmp-code-p code)
(write-code code)
(let ((pair (surrogate-pair code)))
(write-code (car pair))
(write-code (cdr pair)))))))))

(eval-when (:compile-toplevel)
(if (subtypep 'long-float 'single-float)
Expand Down
15 changes: 15 additions & 0 deletions t/testdecoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,18 @@ safe-symbols-parsing function here for a cure."
(is (equal (symbol-package (first-bound-slot-name x))
(find-package :keyword)))))))

(test non-ascii-char-decoding
(is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign"))))
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\u00A9\"}")))))

(test non-bmp-char-decoding
(is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face"))))
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}"))))
(signals error
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}")))
(signals error
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}"))))
9 changes: 9 additions & 0 deletions t/testencoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,12 @@
(is (string= json-bar
"{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}")))))

(test non-ascii-char-encoding
(is (string= "{\"foo\":\"\\u00A9\"}"
(with-explicit-encoder
(json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign"))))))))

(test non-bmp-char-encoding
(is (string= "{\"foo\":\"\\uD83D\\uDE00\"}"
(with-explicit-encoder
(json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face"))))))))