From b2c91fa961a5b25fc8b80857a1c9e014e8da4a0f Mon Sep 17 00:00:00 2001 From: sternenseemann Date: Fri, 3 Jun 2022 16:03:31 +0200 Subject: [PATCH 1/3] Skip test for known bug --- t/testmisc.lisp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/t/testmisc.lisp b/t/testmisc.lisp index 8f6391f..b6ffb57 100644 --- a/t/testmisc.lisp +++ b/t/testmisc.lisp @@ -21,7 +21,9 @@ (is (string= sub-obj.even-deeper-obj.some-stuff "Guten Tag")))) (test json-bind-in-bind-bug + (skip "This test case would fail as it asserts the absence of a known bug in cl-json") ;; A problem with json-bind. TODO: Fix it, but leave this testcase + #+nil (let* ((input-json-rpc "{\"method\":\"rc\",\"id\":\"1\",\"params\": [\"0\",{\"id\":\"pingId\",\"name\":\"ping\"},[], [{\"name\":\"tableTennisGroupName\",\"id\":\"tableTennisGroupId\"}]]}") From 479685029c511cb2011f2f2a99ca6c63aa2e4865 Mon Sep 17 00:00:00 2001 From: sternenseemann Date: Sat, 17 Nov 2018 21:50:13 +0530 Subject: [PATCH 2/3] En- and decoding of characters outside the Basic Multilingual Plane RFC4627 prescribes that everything has to in some Unicode encoding (which we comply with by using ASCII (UTF-8) and encoding everything else) and that any character may be escaped. When escaping, however, we need to take care to only escape characters in the Basic Multilingual Plane (BMP) which is U+0000 to U+FFFF: > Any character may be escaped. If the character is in the Basic > Multilingual Plane (U+0000 through U+FFFF), then it may be > represented as a six-character sequence: a reverse solidus, followed > by the lowercase letter u, followed by four hexadecimal digits that > encode the character's code point. [...] > > To escape an extended character that is not in the Basic Multilingual > Plane, the character is represented as a twelve-character sequence, > encoding the UTF-16 surrogate pair. So, for example, a string > containing only the G clef character (U+1D11E) may be represented as > "\uD834\uDD1E". > > - RFC4627, p. 3 This commit implements en- and decoding of UTF-16 surrogate pairs and the necessary error handling logic required by the ordering requirements and the fact that a lone surrogate code unit/point may never be decoded nor encoded. Test cases partially taken from #3. BREAKING CHANGES: Note that the broken behavior can all be considered a bug insofar as it violates the JSON spec. * A Unicode code point outside the BMP will now always be encoded as an UTF-16 surrogate pair. * A valid UTF-16 surrogate pair will now always be decoded to a single Unicode codepoint. * When *use-strict-json-rules* encoding a surrogate codepoint or decoding a lone surrogate code unit will result in an error. If *use-strict-json-rules* is NIL, it'll behave as before. Co-Authored-By: Chaitanya Gupta --- cl-json.asd | 2 +- src/common.lisp | 33 ++++++++++++ src/decoder.lisp | 126 ++++++++++++++++++++++++++++++++++++--------- src/encoder.lisp | 49 +++++++++++++++++- t/testdecoder.lisp | 60 +++++++++++++++++++++ t/testencoder.lisp | 20 +++++++ 6 files changed, 265 insertions(+), 25 deletions(-) diff --git a/cl-json.asd b/cl-json.asd index cac59e3..3654e3b 100644 --- a/cl-json.asd +++ b/cl-json.asd @@ -36,7 +36,7 @@ (:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder")))))) (defsystem :cl-json/test - :depends-on (:cl-json :fiveam ) + :depends-on (:cl-json :cl-unicode :fiveam ) :components ((:module :t :components ((:file "package") (:file "testmisc" :depends-on ("package" "testdecoder" "testencoder")) diff --git a/src/common.lisp b/src/common.lisp index ee6a460..f24f178 100644 --- a/src/common.lisp +++ b/src/common.lisp @@ -69,6 +69,39 @@ Strings. If nil, translate any such sequence to the char after slash.") +;;;; UTF-16 magic values and common helper functions +;;;; actual implementation lives in encoder.lisp and decoder.lisp + +(declaim (type (unsigned-byte 16) + +utf16-min-surrogate+ + +utf16-max-surrogate+ + +utf16-min-low-surrogate+) + (type (unsigned-byte 32) +utf16-pair-offset+)) +(defconstant +utf16-min-surrogate+ #xd800) +(defconstant +utf16-max-surrogate+ #xdfff) +(defconstant +utf16-min-low-surrogate+ #xdc00) +(defconstant +utf16-pair-offset+ #x10000) + +(declaim (ftype (function ((unsigned-byte 32)) boolean) + utf16-high-surrogate-p + utf16-low-surrogate-p + utf16-surrogate-p) + (inline utf16-high-surrogate-p + utf16-low-surrogate-p + utf16-surrogate-p)) +(defun utf16-high-surrogate-p (code) + "Check if the given integer represents a high surrogate Unicode codepoint." + (and (>= code +utf16-min-surrogate+) + (< code +utf16-min-low-surrogate+))) + +(defun utf16-low-surrogate-p (code) + "Check if the given integer represents a low surrogate Unicode codepoint." + (<= +utf16-min-low-surrogate+ code +utf16-max-surrogate+)) + +(defun utf16-surrogate-p (code) + "Check if the given integer represents a surrogate Unicode codepoint." + (<= +utf16-min-surrogate+ code +utf16-max-surrogate+)) + ;;; Symbols diff --git a/src/decoder.lisp b/src/decoder.lisp index 7686d96..657b3ff 100644 --- a/src/decoder.lisp +++ b/src/decoder.lisp @@ -156,28 +156,14 @@ return NIL." (escaped-char-dispatch c :code-handler ((len rdx) - (let ((code - (let ((repr (make-string len))) - (dotimes (i len) - (setf (aref repr i) (read-char stream))) - (handler-case (parse-integer repr :radix rdx) - (parse-error () - (json-syntax-error stream esc-error-fmt - (format nil "\\~C" c) - repr)))))) - (restart-case - (or (and (< code char-code-limit) (code-char code)) - (error 'no-char-for-code :code code)) - (substitute-char (char) - :report "Substitute another char." - :interactive - (lambda () - (format *query-io* "Char: ") - (list (read-char *query-io*))) - char) - (pass-code () - :report "Pass the code to char handler." - code)))) + (let ((repr (make-string len))) + (dotimes (i len) + (setf (aref repr i) (read-char stream))) + (handler-case (parse-integer repr :radix rdx) + (parse-error () + (json-syntax-error stream esc-error-fmt + (format nil "\\~C" c) + repr))))) :default-handler (if *use-strict-json-rules* (json-syntax-error stream esc-error-fmt "\\" c) @@ -390,6 +376,69 @@ closing brace, calling object handlers as it goes." input but found `~A'" token))))))) +(defun bounded-code-to-char (code) + "Wrapper around CODE-CHAR which invokes NO-CHAR-FOR-CODE if the passed CODE +is greater than the implementation's CHAR-CODE-LIMIT, allowing to deal with the +condition with uniform restarts." + (restart-case + (or (and (< code char-code-limit) (code-char code)) + (error 'no-char-for-code :code code)) + (substitute-char (char) + :report "Substitute another char." + :interactive + (lambda () + (format *query-io* "Char: ") + (list (read-char *query-io*))) + char) + (pass-code () + :report "Pass the code to char handler." + code))) + +(declaim (ftype (function ((unsigned-byte 16) (unsigned-byte 16)) character) + utf16-decode-surrogate-pair)) +(defun utf16-decode-surrogate-pair (high low) + "Takes the two UTF-16 code units of a UTF-16 surrogate pair and decodes them +into a Common Lisp character. Expects the caller to verify that actual surrogates +are passed." + ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9 + (bounded-code-to-char + (+ +utf16-pair-offset+ + (ash (- high +utf16-min-surrogate+) 10) + (- low +utf16-min-low-surrogate+)))) + +(defun json-surrogate-error (stream expected-high &optional expected-low) + "Raises a JSON-SYNTAX-ERROR for the encountered invalid surrogate code point(s) +and offers restarts allowing to pass in an alternative decoding result which is +returned." + (flet ((format-str-el (x) + (format nil + (typecase x + (integer "U+~16,4,'0R") + (character "~C") + (otherwise (if (null x) "end of string" "~A"))) + x))) + (restart-case + (json-syntax-error + stream + (cond + ((utf16-low-surrogate-p expected-high) + "Unexpected lone low surrogate code point ~A") + ((utf16-high-surrogate-p expected-high) + "Expected low-surrogate codepoint after ~A, but got: ~A") + (t "Invalid surrogate pair: ~A and ~A")) + (format-str-el expected-high) + (format-str-el expected-low)) + (substitute-char (char) + :report "Substitute another char." + :interactive + (lambda () + (format *query-io* "Char: ") + (list (read-char *query-io*))) + char) + (substitute-replacement-char () + :report "Substitute the Unicode replacement character." + (bounded-code-to-char #xfffd))))) + (defun decode-json-string (stream) "Read JSON String characters / escape sequences until a closing double quote, calling string handlers as it goes." @@ -398,7 +447,38 @@ double quote, calling string handlers as it goes." (loop initially (funcall *beginning-of-string-handler*) for c = (read-json-string-char stream) while c - do (funcall *string-char-handler* c) + do (etypecase c + ;; Normal characters, named escape sequences + (character (funcall *string-char-handler* c)) + ;; \uXXXX escape sequences + (integer + (cond + ;; high surrogate: expect a low surrogate escape sequence next + ((utf16-high-surrogate-p c) + (let ((next (read-json-string-char stream))) + (cond + ((and (typep next 'integer) (utf16-low-surrogate-p next)) + (funcall *string-char-handler* + (utf16-decode-surrogate-pair c next))) + (*use-strict-json-rules* + (funcall *string-char-handler* + (json-surrogate-error stream c next))) + (t (progn + (funcall *string-char-handler* (bounded-code-to-char c)) + (if next + (funcall *string-char-handler* + (etypecase next + (integer (bounded-code-to-char next)) + (character next))) + (return (funcall *end-of-string-handler*)))))))) + ;; low surrogate: should never appear on its own + ((utf16-low-surrogate-p c) + (funcall *string-char-handler* + (if *use-strict-json-rules* + (json-surrogate-error stream c) + (bounded-code-to-char c)))) + ;; Codepoints in the Basic Multilingual Plane + (t (funcall *string-char-handler* (bounded-code-to-char c)))))) finally (return (funcall *end-of-string-handler*)))))) ;;; handling numerical read errors in ACL diff --git a/src/encoder.lisp b/src/encoder.lisp index 5e378f5..d8a9f3f 100644 --- a/src/encoder.lisp +++ b/src/encoder.lisp @@ -376,6 +376,51 @@ STREAM (or to *JSON-OUTPUT*)." (write-char #\" stream) nil) +(defmacro make-typed-vector (type &rest elements) + `(make-array ,(length elements) + :element-type ,type + :initial-contents + (list ,@elements))) + +(define-condition utf16-unencodeable-codepoint (type-error) () + (:documentation + "Signalled when a codepoint can't be encoded using UTF-16, e.g. if it is +an isolated surrogate codepoint.") + (:report + (lambda (condition stream) + (with-accessors ((datum type-error-datum)) + condition + (format stream "Can't encode ~16R using UTF-16" datum) + (when (utf16-surrogate-p datum) + (format stream " because it is a surrogate codepoint")) + (format stream ".")))) + (:default-initargs :expected-type t)) + +(declaim (ftype (function ((unsigned-byte 32)) (vector (unsigned-byte 16) *)) + utf16-encode-codepoint)) +(defun utf16-encode-codepoint (code) + "Takes a Unicode codepoint and encodes it as one or two UTF-16 code units, +returned in a vector." + ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9 + (cond + ;; Supplementary planes: U+010000 - U+10FFFF + ((>= code +utf16-pair-offset+) + (let ((minus-offset (- code +utf16-pair-offset+))) + (make-typed-vector + '(unsigned-byte 16) + (+ +utf16-min-surrogate+ (ldb (byte 10 10) minus-offset)) + (+ +utf16-min-low-surrogate+ (ldb (byte 10 0) minus-offset))))) + ;; High and Low Surrogate Codepoints: U+D800 - U+DFFF + ;; The Unicode Standard forbids encoding these. + ((and (utf16-surrogate-p code) *use-strict-json-rules*) + (restart-case + (error 'utf16-unencodeable-codepoint :datum code) + (substitute-replacement-char () + :report "Substitute the Unicode replacement character." + (make-typed-vector '(unsigned-byte 16) #xfffd)))) + ;; Basic Multilingual Plane: U+0000 - U+D7FF and U+E000 - U+FFFF + (t (make-typed-vector '(unsigned-byte 16) code)))) + (defun write-json-chars (s stream) "Write JSON representations (chars or escape sequences) of characters in string S to STREAM." @@ -389,7 +434,9 @@ characters in string S to STREAM." else do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+))) (destructuring-bind (esc . (width . radix)) special - (format stream "\\~C~V,V,'0R" esc radix width code))))) + (loop + for code-unit across (utf16-encode-codepoint code) + do (format stream "\\~C~V,V,'0R" esc radix width code-unit)))))) (eval-when (:compile-toplevel :execute) (if (subtypep 'long-float 'single-float) diff --git a/t/testdecoder.lisp b/t/testdecoder.lisp index 184a7a5..c007b17 100644 --- a/t/testdecoder.lisp +++ b/t/testdecoder.lisp @@ -412,3 +412,63 @@ safe-symbols-parsing function here for a cure." (is (equal (symbol-package (first-bound-slot-name x)) (find-package :keyword))))))) +(test non-ascii-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\u00A9\"}"))))) + +(test non-bmp-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}")))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}"))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}")))) + +(test surrogate-decoding + ;; lone low surrogate + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uDC80\"")))) + + (is (string= (string (code-char #xdc80)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uDC80\""))))) + + ;; high surrogate and end of string + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\"")))) + + (is (string= (string (code-char #xd800)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\""))))) + + ;; high surrogate and normal character + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800c\"")))) + + (is (string= (format nil "~Cc" (code-char #xd800)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800c\""))))) + + ;; high surrogate and non surrogate escape + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\\u0012\"")))) + + (is (string= (format nil "~C~C" (code-char #xd800) (code-char #x0012)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\\u0012\"")))))) diff --git a/t/testencoder.lisp b/t/testencoder.lisp index 7520265..a4b92b3 100644 --- a/t/testencoder.lisp +++ b/t/testencoder.lisp @@ -471,3 +471,23 @@ (is (string= json-bar "{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}"))))) +(test non-ascii-char-encoding + (is (string= "{\"foo\":\"\\u00A9\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign")))))))) + +(test non-bmp-char-encoding + (is (string= "{\"foo\":\"\\uD83D\\uDE00\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face")))))))) + +(test surrogate-encoding + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-explicit-encoder + (json:encode-json-to-string (string (code-char #xdc80)))))) + + (is (string= "\"\\uDC80\"" + (let ((json:*use-strict-json-rules* nil)) + (with-explicit-encoder + (json:encode-json-to-string (string (code-char #xdc80)))))))) From c059bec94e28a11102a994d6949e2e52764f21fd Mon Sep 17 00:00:00 2001 From: sternenseemann Date: Tue, 30 Aug 2022 15:48:25 +0200 Subject: [PATCH 3/3] Fix test suite execution in CCL CCL's unicode implementation doesn't allow lone surrogate code point in a string, preventing us from ever creating a string that would trigger the tested behavior here. Other CL implementations are more lenient here, whereas CCL follows the Unicode standard strictly. --- t/testdecoder.lisp | 2 ++ t/testencoder.lisp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/t/testdecoder.lisp b/t/testdecoder.lisp index c007b17..aab41cc 100644 --- a/t/testdecoder.lisp +++ b/t/testdecoder.lisp @@ -428,6 +428,8 @@ safe-symbols-parsing function here for a cure." (with-decoder-simple-list-semantics (decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}")))) +;; Can't construct a string with a lone surrogate code point in CCL +#-ccl (test surrogate-decoding ;; lone low surrogate (signals error diff --git a/t/testencoder.lisp b/t/testencoder.lisp index a4b92b3..fe55bb9 100644 --- a/t/testencoder.lisp +++ b/t/testencoder.lisp @@ -481,6 +481,8 @@ (with-explicit-encoder (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face")))))))) +;; Can't construct a string with a lone surrogate code point in CCL +#-ccl (test surrogate-encoding (signals error (let ((json:*use-strict-json-rules* t))