Skip to content

Commit

Permalink
En- and decoding of characters outside the Basic Multilingual Plane
Browse files Browse the repository at this point in the history
RFC4627 prescribes that everything has to in some Unicode
encoding (which we comply with by using ASCII (UTF-8) and encoding
everything else) and that any character may be escaped. When escaping,
however, we need to take care to only escape characters in the Basic
Multilingual Plane (BMP) which is U+0000 to U+FFFF:

> Any character may be escaped.  If the character is in the Basic
> Multilingual Plane (U+0000 through U+FFFF), then it may be
> represented as a six-character sequence: a reverse solidus, followed
> by the lowercase letter u, followed by four hexadecimal digits that
> encode the character's code point. [...]
>
> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair.  So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".
>
> - RFC4627, p. 3

This commit implements en- and decoding of UTF-16 surrogate pairs and
the necessary error handling logic required by the ordering requirements
and the fact that a lone surrogate code unit/point may never be decoded
nor encoded.

Test cases partially taken from sharplispers#3.

BREAKING CHANGES:

Note that the broken behavior can all be considered a bug insofar as it
violates the JSON spec.

* A Unicode code point outside the BMP will now always be encoded as an
  UTF-16 surrogate pair.
* A valid UTF-16 surrogate pair will now always be decoded to a single
  Unicode codepoint.
* When *use-strict-json-rules* encoding a surrogate codepoint or
  decoding a lone surrogate code unit will result in an error.
  If *use-strict-json-rules* is NIL, it'll behave as before.

Co-Authored-By: Chaitanya Gupta <[email protected]>
  • Loading branch information
sternenseemann and chaitanyagupta committed Jun 18, 2022
1 parent b2c91fa commit 4796850
Show file tree
Hide file tree
Showing 6 changed files with 265 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cl-json.asd
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
(:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder"))))))

(defsystem :cl-json/test
:depends-on (:cl-json :fiveam )
:depends-on (:cl-json :cl-unicode :fiveam )
:components ((:module :t
:components ((:file "package")
(:file "testmisc" :depends-on ("package" "testdecoder" "testencoder"))
Expand Down
33 changes: 33 additions & 0 deletions src/common.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,39 @@
Strings. If nil, translate any such sequence to the char after
slash.")

;;;; UTF-16 magic values and common helper functions
;;;; actual implementation lives in encoder.lisp and decoder.lisp

(declaim (type (unsigned-byte 16)
+utf16-min-surrogate+
+utf16-max-surrogate+
+utf16-min-low-surrogate+)
(type (unsigned-byte 32) +utf16-pair-offset+))
(defconstant +utf16-min-surrogate+ #xd800)
(defconstant +utf16-max-surrogate+ #xdfff)
(defconstant +utf16-min-low-surrogate+ #xdc00)
(defconstant +utf16-pair-offset+ #x10000)

(declaim (ftype (function ((unsigned-byte 32)) boolean)
utf16-high-surrogate-p
utf16-low-surrogate-p
utf16-surrogate-p)
(inline utf16-high-surrogate-p
utf16-low-surrogate-p
utf16-surrogate-p))
(defun utf16-high-surrogate-p (code)
"Check if the given integer represents a high surrogate Unicode codepoint."
(and (>= code +utf16-min-surrogate+)
(< code +utf16-min-low-surrogate+)))

(defun utf16-low-surrogate-p (code)
"Check if the given integer represents a low surrogate Unicode codepoint."
(<= +utf16-min-low-surrogate+ code +utf16-max-surrogate+))

(defun utf16-surrogate-p (code)
"Check if the given integer represents a surrogate Unicode codepoint."
(<= +utf16-min-surrogate+ code +utf16-max-surrogate+))


;;; Symbols

Expand Down
126 changes: 103 additions & 23 deletions src/decoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -156,28 +156,14 @@ return NIL."
(escaped-char-dispatch c
:code-handler
((len rdx)
(let ((code
(let ((repr (make-string len)))
(dotimes (i len)
(setf (aref repr i) (read-char stream)))
(handler-case (parse-integer repr :radix rdx)
(parse-error ()
(json-syntax-error stream esc-error-fmt
(format nil "\\~C" c)
repr))))))
(restart-case
(or (and (< code char-code-limit) (code-char code))
(error 'no-char-for-code :code code))
(substitute-char (char)
:report "Substitute another char."
:interactive
(lambda ()
(format *query-io* "Char: ")
(list (read-char *query-io*)))
char)
(pass-code ()
:report "Pass the code to char handler."
code))))
(let ((repr (make-string len)))
(dotimes (i len)
(setf (aref repr i) (read-char stream)))
(handler-case (parse-integer repr :radix rdx)
(parse-error ()
(json-syntax-error stream esc-error-fmt
(format nil "\\~C" c)
repr)))))
:default-handler
(if *use-strict-json-rules*
(json-syntax-error stream esc-error-fmt "\\" c)
Expand Down Expand Up @@ -390,6 +376,69 @@ closing brace, calling object handlers as it goes."
input but found `~A'"
token)))))))

(defun bounded-code-to-char (code)
"Wrapper around CODE-CHAR which invokes NO-CHAR-FOR-CODE if the passed CODE
is greater than the implementation's CHAR-CODE-LIMIT, allowing to deal with the
condition with uniform restarts."
(restart-case
(or (and (< code char-code-limit) (code-char code))
(error 'no-char-for-code :code code))
(substitute-char (char)
:report "Substitute another char."
:interactive
(lambda ()
(format *query-io* "Char: ")
(list (read-char *query-io*)))
char)
(pass-code ()
:report "Pass the code to char handler."
code)))

(declaim (ftype (function ((unsigned-byte 16) (unsigned-byte 16)) character)
utf16-decode-surrogate-pair))
(defun utf16-decode-surrogate-pair (high low)
"Takes the two UTF-16 code units of a UTF-16 surrogate pair and decodes them
into a Common Lisp character. Expects the caller to verify that actual surrogates
are passed."
;; Based on The Unicode Standard, Version 14.0.0, Section 3.9
(bounded-code-to-char
(+ +utf16-pair-offset+
(ash (- high +utf16-min-surrogate+) 10)
(- low +utf16-min-low-surrogate+))))

(defun json-surrogate-error (stream expected-high &optional expected-low)
"Raises a JSON-SYNTAX-ERROR for the encountered invalid surrogate code point(s)
and offers restarts allowing to pass in an alternative decoding result which is
returned."
(flet ((format-str-el (x)
(format nil
(typecase x
(integer "U+~16,4,'0R")
(character "~C")
(otherwise (if (null x) "end of string" "~A")))
x)))
(restart-case
(json-syntax-error
stream
(cond
((utf16-low-surrogate-p expected-high)
"Unexpected lone low surrogate code point ~A")
((utf16-high-surrogate-p expected-high)
"Expected low-surrogate codepoint after ~A, but got: ~A")
(t "Invalid surrogate pair: ~A and ~A"))
(format-str-el expected-high)
(format-str-el expected-low))
(substitute-char (char)
:report "Substitute another char."
:interactive
(lambda ()
(format *query-io* "Char: ")
(list (read-char *query-io*)))
char)
(substitute-replacement-char ()
:report "Substitute the Unicode replacement character."
(bounded-code-to-char #xfffd)))))

(defun decode-json-string (stream)
"Read JSON String characters / escape sequences until a closing
double quote, calling string handlers as it goes."
Expand All @@ -398,7 +447,38 @@ double quote, calling string handlers as it goes."
(loop initially (funcall *beginning-of-string-handler*)
for c = (read-json-string-char stream)
while c
do (funcall *string-char-handler* c)
do (etypecase c
;; Normal characters, named escape sequences
(character (funcall *string-char-handler* c))
;; \uXXXX escape sequences
(integer
(cond
;; high surrogate: expect a low surrogate escape sequence next
((utf16-high-surrogate-p c)
(let ((next (read-json-string-char stream)))
(cond
((and (typep next 'integer) (utf16-low-surrogate-p next))
(funcall *string-char-handler*
(utf16-decode-surrogate-pair c next)))
(*use-strict-json-rules*
(funcall *string-char-handler*
(json-surrogate-error stream c next)))
(t (progn
(funcall *string-char-handler* (bounded-code-to-char c))
(if next
(funcall *string-char-handler*
(etypecase next
(integer (bounded-code-to-char next))
(character next)))
(return (funcall *end-of-string-handler*))))))))
;; low surrogate: should never appear on its own
((utf16-low-surrogate-p c)
(funcall *string-char-handler*
(if *use-strict-json-rules*
(json-surrogate-error stream c)
(bounded-code-to-char c))))
;; Codepoints in the Basic Multilingual Plane
(t (funcall *string-char-handler* (bounded-code-to-char c))))))
finally (return (funcall *end-of-string-handler*))))))

;;; handling numerical read errors in ACL
Expand Down
49 changes: 48 additions & 1 deletion src/encoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,51 @@ STREAM (or to *JSON-OUTPUT*)."
(write-char #\" stream)
nil)

(defmacro make-typed-vector (type &rest elements)
`(make-array ,(length elements)
:element-type ,type
:initial-contents
(list ,@elements)))

(define-condition utf16-unencodeable-codepoint (type-error) ()
(:documentation
"Signalled when a codepoint can't be encoded using UTF-16, e.g. if it is
an isolated surrogate codepoint.")
(:report
(lambda (condition stream)
(with-accessors ((datum type-error-datum))
condition
(format stream "Can't encode ~16R using UTF-16" datum)
(when (utf16-surrogate-p datum)
(format stream " because it is a surrogate codepoint"))
(format stream "."))))
(:default-initargs :expected-type t))

(declaim (ftype (function ((unsigned-byte 32)) (vector (unsigned-byte 16) *))
utf16-encode-codepoint))
(defun utf16-encode-codepoint (code)
"Takes a Unicode codepoint and encodes it as one or two UTF-16 code units,
returned in a vector."
;; Based on The Unicode Standard, Version 14.0.0, Section 3.9
(cond
;; Supplementary planes: U+010000 - U+10FFFF
((>= code +utf16-pair-offset+)
(let ((minus-offset (- code +utf16-pair-offset+)))
(make-typed-vector
'(unsigned-byte 16)
(+ +utf16-min-surrogate+ (ldb (byte 10 10) minus-offset))
(+ +utf16-min-low-surrogate+ (ldb (byte 10 0) minus-offset)))))
;; High and Low Surrogate Codepoints: U+D800 - U+DFFF
;; The Unicode Standard forbids encoding these.
((and (utf16-surrogate-p code) *use-strict-json-rules*)
(restart-case
(error 'utf16-unencodeable-codepoint :datum code)
(substitute-replacement-char ()
:report "Substitute the Unicode replacement character."
(make-typed-vector '(unsigned-byte 16) #xfffd))))
;; Basic Multilingual Plane: U+0000 - U+D7FF and U+E000 - U+FFFF
(t (make-typed-vector '(unsigned-byte 16) code))))

(defun write-json-chars (s stream)
"Write JSON representations (chars or escape sequences) of
characters in string S to STREAM."
Expand All @@ -389,7 +434,9 @@ characters in string S to STREAM."
else
do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+)))
(destructuring-bind (esc . (width . radix)) special
(format stream "\\~C~V,V,'0R" esc radix width code)))))
(loop
for code-unit across (utf16-encode-codepoint code)
do (format stream "\\~C~V,V,'0R" esc radix width code-unit))))))

(eval-when (:compile-toplevel :execute)
(if (subtypep 'long-float 'single-float)
Expand Down
60 changes: 60 additions & 0 deletions t/testdecoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -412,3 +412,63 @@ safe-symbols-parsing function here for a cure."
(is (equal (symbol-package (first-bound-slot-name x))
(find-package :keyword)))))))

(test non-ascii-char-decoding
(is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign"))))
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\u00A9\"}")))))

(test non-bmp-char-decoding
(is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face"))))
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}"))))
(signals error
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}")))
(signals error
(with-decoder-simple-list-semantics
(decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}"))))

(test surrogate-decoding
;; lone low surrogate
(signals error
(let ((json:*use-strict-json-rules* t))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uDC80\""))))

(is (string= (string (code-char #xdc80))
(let ((json:*use-strict-json-rules* nil))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uDC80\"")))))

;; high surrogate and end of string
(signals error
(let ((json:*use-strict-json-rules* t))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800\""))))

(is (string= (string (code-char #xd800))
(let ((json:*use-strict-json-rules* nil))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800\"")))))

;; high surrogate and normal character
(signals error
(let ((json:*use-strict-json-rules* t))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800c\""))))

(is (string= (format nil "~Cc" (code-char #xd800))
(let ((json:*use-strict-json-rules* nil))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800c\"")))))

;; high surrogate and non surrogate escape
(signals error
(let ((json:*use-strict-json-rules* t))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800\\u0012\""))))

(is (string= (format nil "~C~C" (code-char #xd800) (code-char #x0012))
(let ((json:*use-strict-json-rules* nil))
(with-decoder-simple-list-semantics
(decode-json-from-string "\"\\uD800\\u0012\""))))))
20 changes: 20 additions & 0 deletions t/testencoder.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -471,3 +471,23 @@
(is (string= json-bar
"{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}")))))

(test non-ascii-char-encoding
(is (string= "{\"foo\":\"\\u00A9\"}"
(with-explicit-encoder
(json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign"))))))))

(test non-bmp-char-encoding
(is (string= "{\"foo\":\"\\uD83D\\uDE00\"}"
(with-explicit-encoder
(json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face"))))))))

(test surrogate-encoding
(signals error
(let ((json:*use-strict-json-rules* t))
(with-explicit-encoder
(json:encode-json-to-string (string (code-char #xdc80))))))

(is (string= "\"\\uDC80\""
(let ((json:*use-strict-json-rules* nil))
(with-explicit-encoder
(json:encode-json-to-string (string (code-char #xdc80))))))))

0 comments on commit 4796850

Please sign in to comment.