From 987a8c7171baafa6b938b9741668a8b5db508cba Mon Sep 17 00:00:00 2001 From: Chaitanya Gupta Date: Sat, 17 Nov 2018 19:04:22 +0530 Subject: [PATCH 1/2] Encode and decode non-BMP chars into unicode surrogate pairs --- src/common.lisp | 18 +++++++++++++++ src/decoder.lisp | 57 +++++++++++++++++++++++++++++------------------- src/encoder.lisp | 8 ++++++- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/src/common.lisp b/src/common.lisp index 97d20e1..cfc0fe4 100644 --- a/src/common.lisp +++ b/src/common.lisp @@ -68,6 +68,24 @@ Strings. If nil, translate any such sequence to the char after slash.") +(defun bmp-code-p (code) + (< code #x10000)) + +(defun high-surrogate-code-p (code) + (<= #xD800 code #xDBFF)) + +(defun low-surrogate-code-p (code) + (<= #xDC00 code #xDFFF)) + +(defun surrogate-pair (code) + (let ((reduced (- code #x10000))) + (cons (+ #xD800 (ldb (byte 10 10) reduced)) + (+ #xDC00 (ldb (byte 10 0) reduced))))) + +(defun surrogate-pair-to-code (pair) + (+ (+ (ash (- (car pair) #xD800) 10) + (- (cdr pair) #xDC00)) + #x10000)) ;;; Symbols diff --git a/src/decoder.lisp b/src/decoder.lisp index 3957178..917095d 100644 --- a/src/decoder.lisp +++ b/src/decoder.lisp @@ -155,28 +155,41 @@ return NIL." (escaped-char-dispatch c :code-handler ((len rdx) - (let ((code - (let ((repr (make-string len))) - (dotimes (i len) - (setf (aref repr i) (read-char stream))) - (handler-case (parse-integer repr :radix rdx) - (parse-error () - (json-syntax-error stream esc-error-fmt - (format nil "\\~C" c) - repr)))))) - (restart-case - (or (and (< code char-code-limit) (code-char code)) - (error 'no-char-for-code :code code)) - (substitute-char (char) - :report "Substitute another char." - :interactive - (lambda () - (format *query-io* "Char: ") - (list (read-char *query-io*))) - char) - (pass-code () - :report "Pass the code to char handler." - code)))) + (flet ((parse-code () + (let ((repr (make-string len))) + (dotimes (i len) + (setf (aref repr i) (read-char stream))) + (handler-case (parse-integer repr :radix rdx) + (parse-error () + (json-syntax-error stream esc-error-fmt + (format nil "\\~C" c) + repr)))))) + (let ((code (parse-code))) + (when (high-surrogate-code-p code) + ;; if code falls in the surrogate code range, the next + ;; char is its low surrogate again encoded as "\uXXXX" + (assert (and (char= (read-char stream) #\\) + (char= (read-char stream) c)) + nil + "Expected start of low surrogate code sequence") + (let ((next (parse-code))) + (assert (low-surrogate-code-p next) + nil + "Expected low surrogate code but instead got ~A" next) + (setf code (surrogate-pair-to-code (cons code next))))) + (restart-case + (or (and (< code char-code-limit) (code-char code)) + (error 'no-char-for-code :code code)) + (substitute-char (char) + :report "Substitute another char." + :interactive + (lambda () + (format *query-io* "Char: ") + (list (read-char *query-io*))) + char) + (pass-code () + :report "Pass the code to char handler." + code))))) :default-handler (if *use-strict-json-rules* (json-syntax-error stream esc-error-fmt "\\" c) diff --git a/src/encoder.lisp b/src/encoder.lisp index d409362..0bdc02d 100644 --- a/src/encoder.lisp +++ b/src/encoder.lisp @@ -388,7 +388,13 @@ characters in string S to STREAM." else do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+))) (destructuring-bind (esc . (width . radix)) special - (format stream "\\~C~V,V,'0R" esc radix width code))))) + (flet ((write-code (code) + (format stream "\\~C~V,V,'0R" esc radix width code))) + (if (bmp-code-p code) + (write-code code) + (let ((pair (surrogate-pair code))) + (write-code (car pair)) + (write-code (cdr pair))))))))) (eval-when (:compile-toplevel) (if (subtypep 'long-float 'single-float) From 8e8d1b4aebfcc0b42f834e57d1d361888bfef54b Mon Sep 17 00:00:00 2001 From: Chaitanya Gupta Date: Sat, 17 Nov 2018 21:50:13 +0530 Subject: [PATCH 2/2] Add tests for encoding and decoding non-ASCII and non-BMP chars --- cl-json.asd | 2 +- t/testdecoder.lisp | 15 +++++++++++++++ t/testencoder.lisp | 9 +++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cl-json.asd b/cl-json.asd index 97fa096..236de01 100644 --- a/cl-json.asd +++ b/cl-json.asd @@ -38,7 +38,7 @@ (:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder")))))) (defsystem :cl-json.test - :depends-on (:cl-json :fiveam ) + :depends-on (:cl-json :cl-unicode :fiveam) ;; newer ASDF versions have this implicitly, but I know of no good way to detect this. [2010/01/02:rpg] :in-order-to ((test-op (load-op "cl-json.test"))) :components ((:module :t diff --git a/t/testdecoder.lisp b/t/testdecoder.lisp index 644f905..0a8ad0b 100644 --- a/t/testdecoder.lisp +++ b/t/testdecoder.lisp @@ -411,3 +411,18 @@ safe-symbols-parsing function here for a cure." (is (equal (symbol-package (first-bound-slot-name x)) (find-package :keyword))))))) +(test non-ascii-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\u00A9\"}"))))) + +(test non-bmp-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}")))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}"))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}")))) diff --git a/t/testencoder.lisp b/t/testencoder.lisp index 1c7ff9a..5789d1c 100644 --- a/t/testencoder.lisp +++ b/t/testencoder.lisp @@ -470,3 +470,12 @@ (is (string= json-bar "{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}"))))) +(test non-ascii-char-encoding + (is (string= "{\"foo\":\"\\u00A9\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign")))))))) + +(test non-bmp-char-encoding + (is (string= "{\"foo\":\"\\uD83D\\uDE00\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face"))))))))