diff --git a/cl-json.asd b/cl-json.asd index cac59e3..3654e3b 100644 --- a/cl-json.asd +++ b/cl-json.asd @@ -36,7 +36,7 @@ (:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder")))))) (defsystem :cl-json/test - :depends-on (:cl-json :fiveam ) + :depends-on (:cl-json :cl-unicode :fiveam ) :components ((:module :t :components ((:file "package") (:file "testmisc" :depends-on ("package" "testdecoder" "testencoder")) diff --git a/src/common.lisp b/src/common.lisp index ee6a460..f24f178 100644 --- a/src/common.lisp +++ b/src/common.lisp @@ -69,6 +69,39 @@ Strings. If nil, translate any such sequence to the char after slash.") +;;;; UTF-16 magic values and common helper functions +;;;; actual implementation lives in encoder.lisp and decoder.lisp + +(declaim (type (unsigned-byte 16) + +utf16-min-surrogate+ + +utf16-max-surrogate+ + +utf16-min-low-surrogate+) + (type (unsigned-byte 32) +utf16-pair-offset+)) +(defconstant +utf16-min-surrogate+ #xd800) +(defconstant +utf16-max-surrogate+ #xdfff) +(defconstant +utf16-min-low-surrogate+ #xdc00) +(defconstant +utf16-pair-offset+ #x10000) + +(declaim (ftype (function ((unsigned-byte 32)) boolean) + utf16-high-surrogate-p + utf16-low-surrogate-p + utf16-surrogate-p) + (inline utf16-high-surrogate-p + utf16-low-surrogate-p + utf16-surrogate-p)) +(defun utf16-high-surrogate-p (code) + "Check if the given integer represents a high surrogate Unicode codepoint." + (and (>= code +utf16-min-surrogate+) + (< code +utf16-min-low-surrogate+))) + +(defun utf16-low-surrogate-p (code) + "Check if the given integer represents a low surrogate Unicode codepoint." + (<= +utf16-min-low-surrogate+ code +utf16-max-surrogate+)) + +(defun utf16-surrogate-p (code) + "Check if the given integer represents a surrogate Unicode codepoint." + (<= +utf16-min-surrogate+ code +utf16-max-surrogate+)) + ;;; Symbols diff --git a/src/decoder.lisp b/src/decoder.lisp index 7686d96..657b3ff 100644 --- a/src/decoder.lisp +++ b/src/decoder.lisp @@ -156,28 +156,14 @@ return NIL." (escaped-char-dispatch c :code-handler ((len rdx) - (let ((code - (let ((repr (make-string len))) - (dotimes (i len) - (setf (aref repr i) (read-char stream))) - (handler-case (parse-integer repr :radix rdx) - (parse-error () - (json-syntax-error stream esc-error-fmt - (format nil "\\~C" c) - repr)))))) - (restart-case - (or (and (< code char-code-limit) (code-char code)) - (error 'no-char-for-code :code code)) - (substitute-char (char) - :report "Substitute another char." - :interactive - (lambda () - (format *query-io* "Char: ") - (list (read-char *query-io*))) - char) - (pass-code () - :report "Pass the code to char handler." - code)))) + (let ((repr (make-string len))) + (dotimes (i len) + (setf (aref repr i) (read-char stream))) + (handler-case (parse-integer repr :radix rdx) + (parse-error () + (json-syntax-error stream esc-error-fmt + (format nil "\\~C" c) + repr))))) :default-handler (if *use-strict-json-rules* (json-syntax-error stream esc-error-fmt "\\" c) @@ -390,6 +376,69 @@ closing brace, calling object handlers as it goes." input but found `~A'" token))))))) +(defun bounded-code-to-char (code) + "Wrapper around CODE-CHAR which invokes NO-CHAR-FOR-CODE if the passed CODE +is greater than the implementation's CHAR-CODE-LIMIT, allowing to deal with the +condition with uniform restarts." + (restart-case + (or (and (< code char-code-limit) (code-char code)) + (error 'no-char-for-code :code code)) + (substitute-char (char) + :report "Substitute another char." + :interactive + (lambda () + (format *query-io* "Char: ") + (list (read-char *query-io*))) + char) + (pass-code () + :report "Pass the code to char handler." + code))) + +(declaim (ftype (function ((unsigned-byte 16) (unsigned-byte 16)) character) + utf16-decode-surrogate-pair)) +(defun utf16-decode-surrogate-pair (high low) + "Takes the two UTF-16 code units of a UTF-16 surrogate pair and decodes them +into a Common Lisp character. Expects the caller to verify that actual surrogates +are passed." + ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9 + (bounded-code-to-char + (+ +utf16-pair-offset+ + (ash (- high +utf16-min-surrogate+) 10) + (- low +utf16-min-low-surrogate+)))) + +(defun json-surrogate-error (stream expected-high &optional expected-low) + "Raises a JSON-SYNTAX-ERROR for the encountered invalid surrogate code point(s) +and offers restarts allowing to pass in an alternative decoding result which is +returned." + (flet ((format-str-el (x) + (format nil + (typecase x + (integer "U+~16,4,'0R") + (character "~C") + (otherwise (if (null x) "end of string" "~A"))) + x))) + (restart-case + (json-syntax-error + stream + (cond + ((utf16-low-surrogate-p expected-high) + "Unexpected lone low surrogate code point ~A") + ((utf16-high-surrogate-p expected-high) + "Expected low-surrogate codepoint after ~A, but got: ~A") + (t "Invalid surrogate pair: ~A and ~A")) + (format-str-el expected-high) + (format-str-el expected-low)) + (substitute-char (char) + :report "Substitute another char." + :interactive + (lambda () + (format *query-io* "Char: ") + (list (read-char *query-io*))) + char) + (substitute-replacement-char () + :report "Substitute the Unicode replacement character." + (bounded-code-to-char #xfffd))))) + (defun decode-json-string (stream) "Read JSON String characters / escape sequences until a closing double quote, calling string handlers as it goes." @@ -398,7 +447,38 @@ double quote, calling string handlers as it goes." (loop initially (funcall *beginning-of-string-handler*) for c = (read-json-string-char stream) while c - do (funcall *string-char-handler* c) + do (etypecase c + ;; Normal characters, named escape sequences + (character (funcall *string-char-handler* c)) + ;; \uXXXX escape sequences + (integer + (cond + ;; high surrogate: expect a low surrogate escape sequence next + ((utf16-high-surrogate-p c) + (let ((next (read-json-string-char stream))) + (cond + ((and (typep next 'integer) (utf16-low-surrogate-p next)) + (funcall *string-char-handler* + (utf16-decode-surrogate-pair c next))) + (*use-strict-json-rules* + (funcall *string-char-handler* + (json-surrogate-error stream c next))) + (t (progn + (funcall *string-char-handler* (bounded-code-to-char c)) + (if next + (funcall *string-char-handler* + (etypecase next + (integer (bounded-code-to-char next)) + (character next))) + (return (funcall *end-of-string-handler*)))))))) + ;; low surrogate: should never appear on its own + ((utf16-low-surrogate-p c) + (funcall *string-char-handler* + (if *use-strict-json-rules* + (json-surrogate-error stream c) + (bounded-code-to-char c)))) + ;; Codepoints in the Basic Multilingual Plane + (t (funcall *string-char-handler* (bounded-code-to-char c)))))) finally (return (funcall *end-of-string-handler*)))))) ;;; handling numerical read errors in ACL diff --git a/src/encoder.lisp b/src/encoder.lisp index 5e378f5..d8a9f3f 100644 --- a/src/encoder.lisp +++ b/src/encoder.lisp @@ -376,6 +376,51 @@ STREAM (or to *JSON-OUTPUT*)." (write-char #\" stream) nil) +(defmacro make-typed-vector (type &rest elements) + `(make-array ,(length elements) + :element-type ,type + :initial-contents + (list ,@elements))) + +(define-condition utf16-unencodeable-codepoint (type-error) () + (:documentation + "Signalled when a codepoint can't be encoded using UTF-16, e.g. if it is +an isolated surrogate codepoint.") + (:report + (lambda (condition stream) + (with-accessors ((datum type-error-datum)) + condition + (format stream "Can't encode ~16R using UTF-16" datum) + (when (utf16-surrogate-p datum) + (format stream " because it is a surrogate codepoint")) + (format stream ".")))) + (:default-initargs :expected-type t)) + +(declaim (ftype (function ((unsigned-byte 32)) (vector (unsigned-byte 16) *)) + utf16-encode-codepoint)) +(defun utf16-encode-codepoint (code) + "Takes a Unicode codepoint and encodes it as one or two UTF-16 code units, +returned in a vector." + ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9 + (cond + ;; Supplementary planes: U+010000 - U+10FFFF + ((>= code +utf16-pair-offset+) + (let ((minus-offset (- code +utf16-pair-offset+))) + (make-typed-vector + '(unsigned-byte 16) + (+ +utf16-min-surrogate+ (ldb (byte 10 10) minus-offset)) + (+ +utf16-min-low-surrogate+ (ldb (byte 10 0) minus-offset))))) + ;; High and Low Surrogate Codepoints: U+D800 - U+DFFF + ;; The Unicode Standard forbids encoding these. + ((and (utf16-surrogate-p code) *use-strict-json-rules*) + (restart-case + (error 'utf16-unencodeable-codepoint :datum code) + (substitute-replacement-char () + :report "Substitute the Unicode replacement character." + (make-typed-vector '(unsigned-byte 16) #xfffd)))) + ;; Basic Multilingual Plane: U+0000 - U+D7FF and U+E000 - U+FFFF + (t (make-typed-vector '(unsigned-byte 16) code)))) + (defun write-json-chars (s stream) "Write JSON representations (chars or escape sequences) of characters in string S to STREAM." @@ -389,7 +434,9 @@ characters in string S to STREAM." else do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+))) (destructuring-bind (esc . (width . radix)) special - (format stream "\\~C~V,V,'0R" esc radix width code))))) + (loop + for code-unit across (utf16-encode-codepoint code) + do (format stream "\\~C~V,V,'0R" esc radix width code-unit)))))) (eval-when (:compile-toplevel :execute) (if (subtypep 'long-float 'single-float) diff --git a/t/testdecoder.lisp b/t/testdecoder.lisp index 184a7a5..c007b17 100644 --- a/t/testdecoder.lisp +++ b/t/testdecoder.lisp @@ -412,3 +412,63 @@ safe-symbols-parsing function here for a cure." (is (equal (symbol-package (first-bound-slot-name x)) (find-package :keyword))))))) +(test non-ascii-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\u00A9\"}"))))) + +(test non-bmp-char-decoding + (is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face")))) + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}")))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}"))) + (signals error + (with-decoder-simple-list-semantics + (decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}")))) + +(test surrogate-decoding + ;; lone low surrogate + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uDC80\"")))) + + (is (string= (string (code-char #xdc80)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uDC80\""))))) + + ;; high surrogate and end of string + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\"")))) + + (is (string= (string (code-char #xd800)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\""))))) + + ;; high surrogate and normal character + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800c\"")))) + + (is (string= (format nil "~Cc" (code-char #xd800)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800c\""))))) + + ;; high surrogate and non surrogate escape + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\\u0012\"")))) + + (is (string= (format nil "~C~C" (code-char #xd800) (code-char #x0012)) + (let ((json:*use-strict-json-rules* nil)) + (with-decoder-simple-list-semantics + (decode-json-from-string "\"\\uD800\\u0012\"")))))) diff --git a/t/testencoder.lisp b/t/testencoder.lisp index 7520265..a4b92b3 100644 --- a/t/testencoder.lisp +++ b/t/testencoder.lisp @@ -471,3 +471,23 @@ (is (string= json-bar "{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}"))))) +(test non-ascii-char-encoding + (is (string= "{\"foo\":\"\\u00A9\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign")))))))) + +(test non-bmp-char-encoding + (is (string= "{\"foo\":\"\\uD83D\\uDE00\"}" + (with-explicit-encoder + (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face")))))))) + +(test surrogate-encoding + (signals error + (let ((json:*use-strict-json-rules* t)) + (with-explicit-encoder + (json:encode-json-to-string (string (code-char #xdc80)))))) + + (is (string= "\"\\uDC80\"" + (let ((json:*use-strict-json-rules* nil)) + (with-explicit-encoder + (json:encode-json-to-string (string (code-char #xdc80))))))))