From 4528fc324a0eab31dc06dd857f876d83d85a50e6 Mon Sep 17 00:00:00 2001
From: Chaitanya Gupta <mail@chaitanyagupta.com>
Date: Sat, 17 Nov 2018 21:50:13 +0530
Subject: [PATCH] En- and decoding of characters outside the Basic Multilingual
 Plane

RFC4627 prescribes that everything has to in some Unicode
encoding (which we comply with by using ASCII (UTF-8) and encoding
everything else) and that any character may be escaped. When escaping,
however, we need to take care to only escape characters in the Basic
Multilingual Plane (BMP) which is U+0000 to U+FFFF:

> Any character may be escaped.  If the character is in the Basic
> Multilingual Plane (U+0000 through U+FFFF), then it may be
> represented as a six-character sequence: a reverse solidus, followed
> by the lowercase letter u, followed by four hexadecimal digits that
> encode the character's code point. [...]
>
> To escape an extended character that is not in the Basic Multilingual
> Plane, the character is represented as a twelve-character sequence,
> encoding the UTF-16 surrogate pair.  So, for example, a string
> containing only the G clef character (U+1D11E) may be represented as
> "\uD834\uDD1E".
>
> - RFC4627, p. 3

This commit implements en- and decoding of UTF-16 surrogate pairs and
the necessary error handling logic required by the ordering requirements
and the fact that a lone surrogate code unit/point may never be decoded
nor encoded.

Test cases partially taken from #3.

BREAKING CHANGES:

Note that the broken behavior can all be considered a bug insofar as it
violates the JSON spec.

* A Unicode code point outside the BMP will now always be encoded as an
  UTF-16 surrogate pair.
* A valid UTF-16 surrogate pair will now always be decoded to a single
  Unicode codepoint.
* When *use-strict-json-rules* encoding a surrogate codepoint or
  decoding a lone surrogate code unit will result in an error.
  If *use-strict-json-rules* is NIL, it'll behave as before.

Co-Authored-By: Chaitanya Gupta <mail@chaitanyagupta.com>
---
 cl-json.asd        |   2 +-
 src/common.lisp    |  33 ++++++++++++
 src/decoder.lisp   | 126 ++++++++++++++++++++++++++++++++++++---------
 src/encoder.lisp   |  49 +++++++++++++++++-
 t/testdecoder.lisp |  60 +++++++++++++++++++++
 t/testencoder.lisp |  20 +++++++
 6 files changed, 265 insertions(+), 25 deletions(-)

diff --git a/cl-json.asd b/cl-json.asd
index cac59e3..3654e3b 100644
--- a/cl-json.asd
+++ b/cl-json.asd
@@ -36,7 +36,7 @@
                              (:file "json-rpc" :depends-on ("package" "common" "utils" "encoder" "decoder"))))))
 
 (defsystem :cl-json/test
-  :depends-on (:cl-json :fiveam )
+  :depends-on (:cl-json :cl-unicode :fiveam )
   :components ((:module :t
                :components ((:file "package")
                             (:file "testmisc" :depends-on ("package" "testdecoder" "testencoder"))
diff --git a/src/common.lisp b/src/common.lisp
index ee6a460..f24f178 100644
--- a/src/common.lisp
+++ b/src/common.lisp
@@ -69,6 +69,39 @@
 Strings.  If nil, translate any such sequence to the char after
 slash.")
 
+;;;; UTF-16 magic values and common helper functions
+;;;; actual implementation lives in encoder.lisp and decoder.lisp
+
+(declaim (type (unsigned-byte 16)
+               +utf16-min-surrogate+
+               +utf16-max-surrogate+
+               +utf16-min-low-surrogate+)
+         (type (unsigned-byte 32) +utf16-pair-offset+))
+(defconstant +utf16-min-surrogate+ #xd800)
+(defconstant +utf16-max-surrogate+ #xdfff)
+(defconstant +utf16-min-low-surrogate+ #xdc00)
+(defconstant +utf16-pair-offset+ #x10000)
+
+(declaim (ftype (function ((unsigned-byte 32)) boolean)
+                utf16-high-surrogate-p
+                utf16-low-surrogate-p
+                utf16-surrogate-p)
+         (inline utf16-high-surrogate-p
+                 utf16-low-surrogate-p
+                 utf16-surrogate-p))
+(defun utf16-high-surrogate-p (code)
+  "Check if the given integer represents a high surrogate Unicode codepoint."
+  (and (>= code +utf16-min-surrogate+)
+       (<  code +utf16-min-low-surrogate+)))
+
+(defun utf16-low-surrogate-p (code)
+  "Check if the given integer represents a low surrogate Unicode codepoint."
+  (<= +utf16-min-low-surrogate+ code +utf16-max-surrogate+))
+
+(defun utf16-surrogate-p (code)
+  "Check if the given integer represents a surrogate Unicode codepoint."
+  (<= +utf16-min-surrogate+ code +utf16-max-surrogate+))
+
 
 ;;; Symbols
 
diff --git a/src/decoder.lisp b/src/decoder.lisp
index 7686d96..657b3ff 100644
--- a/src/decoder.lisp
+++ b/src/decoder.lisp
@@ -156,28 +156,14 @@ return NIL."
              (escaped-char-dispatch c
                :code-handler
                  ((len rdx)
-                  (let ((code
-                         (let ((repr (make-string len)))
-                           (dotimes (i len)
-                             (setf (aref repr i) (read-char stream)))
-                           (handler-case (parse-integer repr :radix rdx)
-                             (parse-error ()
-                               (json-syntax-error stream esc-error-fmt
-                                                  (format nil "\\~C" c)
-                                                  repr))))))
-                    (restart-case
-                        (or (and (< code char-code-limit) (code-char code))
-                            (error 'no-char-for-code :code code))
-                      (substitute-char (char)
-                        :report "Substitute another char."
-                        :interactive
-                        (lambda ()
-                          (format *query-io* "Char: ")
-                          (list (read-char *query-io*)))
-                        char)
-                      (pass-code ()
-                        :report "Pass the code to char handler."
-                        code))))
+                  (let ((repr (make-string len)))
+                    (dotimes (i len)
+                      (setf (aref repr i) (read-char stream)))
+                    (handler-case (parse-integer repr :radix rdx)
+                      (parse-error ()
+                        (json-syntax-error stream esc-error-fmt
+                                           (format nil "\\~C" c)
+                                           repr)))))
                  :default-handler
                    (if *use-strict-json-rules*
                        (json-syntax-error stream esc-error-fmt "\\" c)
@@ -390,6 +376,69 @@ closing brace, calling object handlers as it goes."
                    input but found `~A'"
                   token)))))))
 
+(defun bounded-code-to-char (code)
+  "Wrapper around CODE-CHAR which invokes NO-CHAR-FOR-CODE if the passed CODE
+is greater than the implementation's CHAR-CODE-LIMIT, allowing to deal with the
+condition with uniform restarts."
+  (restart-case
+      (or (and (< code char-code-limit) (code-char code))
+          (error 'no-char-for-code :code code))
+    (substitute-char (char)
+      :report "Substitute another char."
+      :interactive
+      (lambda ()
+        (format *query-io* "Char: ")
+        (list (read-char *query-io*)))
+      char)
+    (pass-code ()
+      :report "Pass the code to char handler."
+      code)))
+
+(declaim (ftype (function ((unsigned-byte 16) (unsigned-byte 16)) character)
+                utf16-decode-surrogate-pair))
+(defun utf16-decode-surrogate-pair (high low)
+  "Takes the two UTF-16 code units of a UTF-16 surrogate pair and decodes them
+into a Common Lisp character. Expects the caller to verify that actual surrogates
+are passed."
+  ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9
+  (bounded-code-to-char
+   (+ +utf16-pair-offset+
+      (ash (- high +utf16-min-surrogate+) 10)
+      (- low +utf16-min-low-surrogate+))))
+
+(defun json-surrogate-error (stream expected-high &optional expected-low)
+  "Raises a JSON-SYNTAX-ERROR for the encountered invalid surrogate code point(s)
+and offers restarts allowing to pass in an alternative decoding result which is
+returned."
+  (flet ((format-str-el (x)
+           (format nil
+                   (typecase x
+                     (integer "U+~16,4,'0R")
+                     (character "~C")
+                     (otherwise (if (null x) "end of string" "~A")))
+                   x)))
+    (restart-case
+        (json-syntax-error
+         stream
+         (cond
+           ((utf16-low-surrogate-p expected-high)
+            "Unexpected lone low surrogate code point ~A")
+           ((utf16-high-surrogate-p expected-high)
+            "Expected low-surrogate codepoint after ~A, but got: ~A")
+           (t "Invalid surrogate pair: ~A and ~A"))
+         (format-str-el expected-high)
+         (format-str-el expected-low))
+      (substitute-char (char)
+        :report "Substitute another char."
+        :interactive
+        (lambda ()
+          (format *query-io* "Char: ")
+          (list (read-char *query-io*)))
+        char)
+      (substitute-replacement-char ()
+        :report "Substitute the Unicode replacement character."
+        (bounded-code-to-char #xfffd)))))
+
 (defun decode-json-string (stream)
   "Read JSON String characters / escape sequences until a closing
 double quote, calling string handlers as it goes."
@@ -398,7 +447,38 @@ double quote, calling string handlers as it goes."
       (loop initially (funcall *beginning-of-string-handler*)
          for c = (read-json-string-char stream)
          while c
-         do (funcall *string-char-handler* c)
+         do (etypecase c
+              ;; Normal characters, named escape sequences
+              (character (funcall *string-char-handler* c))
+              ;; \uXXXX escape sequences
+              (integer
+               (cond
+                 ;; high surrogate: expect a low surrogate escape sequence next
+                 ((utf16-high-surrogate-p c)
+                  (let ((next (read-json-string-char stream)))
+                    (cond
+                      ((and (typep next 'integer) (utf16-low-surrogate-p next))
+                       (funcall *string-char-handler*
+                                (utf16-decode-surrogate-pair c next)))
+                      (*use-strict-json-rules*
+                       (funcall *string-char-handler*
+                                 (json-surrogate-error stream c next)))
+                      (t (progn
+                           (funcall *string-char-handler* (bounded-code-to-char c))
+                           (if next
+                               (funcall *string-char-handler*
+                                        (etypecase next
+                                          (integer (bounded-code-to-char next))
+                                          (character next)))
+                               (return (funcall *end-of-string-handler*))))))))
+                 ;; low surrogate: should never appear on its own
+                 ((utf16-low-surrogate-p c)
+                  (funcall *string-char-handler*
+                           (if *use-strict-json-rules*
+                               (json-surrogate-error stream c)
+                               (bounded-code-to-char c))))
+                 ;; Codepoints in the Basic Multilingual Plane
+                 (t (funcall *string-char-handler* (bounded-code-to-char c))))))
          finally (return (funcall *end-of-string-handler*))))))
 
 ;;; handling numerical read errors in ACL
diff --git a/src/encoder.lisp b/src/encoder.lisp
index 5e378f5..d8a9f3f 100644
--- a/src/encoder.lisp
+++ b/src/encoder.lisp
@@ -376,6 +376,51 @@ STREAM (or to *JSON-OUTPUT*)."
   (write-char #\" stream)
   nil)
 
+(defmacro make-typed-vector (type &rest elements)
+  `(make-array ,(length elements)
+               :element-type ,type
+               :initial-contents
+               (list ,@elements)))
+
+(define-condition utf16-unencodeable-codepoint (type-error) ()
+  (:documentation
+   "Signalled when a codepoint can't be encoded using UTF-16, e.g. if it is
+an isolated surrogate codepoint.")
+  (:report
+   (lambda (condition stream)
+     (with-accessors ((datum type-error-datum))
+         condition
+         (format stream "Can't encode ~16R using UTF-16" datum)
+         (when (utf16-surrogate-p datum)
+           (format stream " because it is a surrogate codepoint"))
+         (format stream "."))))
+  (:default-initargs :expected-type t))
+
+(declaim (ftype (function ((unsigned-byte 32)) (vector (unsigned-byte 16) *))
+                utf16-encode-codepoint))
+(defun utf16-encode-codepoint (code)
+  "Takes a Unicode codepoint and encodes it as one or two UTF-16 code units,
+returned in a vector."
+  ;; Based on The Unicode Standard, Version 14.0.0, Section 3.9
+  (cond
+    ;; Supplementary planes: U+010000 - U+10FFFF
+    ((>= code +utf16-pair-offset+)
+     (let ((minus-offset (- code +utf16-pair-offset+)))
+       (make-typed-vector
+        '(unsigned-byte 16)
+        (+ +utf16-min-surrogate+ (ldb (byte 10 10) minus-offset))
+        (+ +utf16-min-low-surrogate+ (ldb (byte 10 0) minus-offset)))))
+    ;; High and Low Surrogate Codepoints: U+D800 - U+DFFF
+    ;; The Unicode Standard forbids encoding these.
+    ((and (utf16-surrogate-p code) *use-strict-json-rules*)
+     (restart-case
+         (error 'utf16-unencodeable-codepoint :datum code)
+       (substitute-replacement-char ()
+         :report "Substitute the Unicode replacement character."
+         (make-typed-vector '(unsigned-byte 16) #xfffd))))
+    ;; Basic Multilingual Plane: U+0000 - U+D7FF and U+E000 - U+FFFF
+    (t (make-typed-vector '(unsigned-byte 16) code))))
+
 (defun write-json-chars (s stream)
   "Write JSON representations (chars or escape sequences) of
 characters in string S to STREAM."
@@ -389,7 +434,9 @@ characters in string S to STREAM."
      else
        do (let ((special '#.(rassoc-if #'consp +json-lisp-escaped-chars+)))
             (destructuring-bind (esc . (width . radix)) special
-              (format stream "\\~C~V,V,'0R" esc radix width code)))))
+              (loop
+                for code-unit across (utf16-encode-codepoint code)
+                do (format stream "\\~C~V,V,'0R" esc radix width code-unit))))))
 
 (eval-when (:compile-toplevel :execute)
     (if (subtypep 'long-float 'single-float)
diff --git a/t/testdecoder.lisp b/t/testdecoder.lisp
index 184a7a5..c007b17 100644
--- a/t/testdecoder.lisp
+++ b/t/testdecoder.lisp
@@ -412,3 +412,63 @@ safe-symbols-parsing function here for a cure."
         (is (equal (symbol-package (first-bound-slot-name x))
                    (find-package :keyword)))))))
 
+(test non-ascii-char-decoding
+  (is (equal `((:foo . ,(string (cl-unicode:character-named "Copyright Sign"))))
+             (with-decoder-simple-list-semantics
+               (decode-json-from-string "{\"foo\":\"\\u00A9\"}")))))
+
+(test non-bmp-char-decoding
+  (is (equal `((:foo . ,(string (cl-unicode:character-named "Grinning Face"))))
+             (with-decoder-simple-list-semantics
+               (decode-json-from-string "{\"foo\":\"\\uD83D\\uDE00\"}"))))
+  (signals error
+    (with-decoder-simple-list-semantics
+      (decode-json-from-string "{\"foo\":\"\\uD83Dabc\"}")))
+  (signals error
+    (with-decoder-simple-list-semantics
+      (decode-json-from-string "{\"foo\":\"\\uD83D\\xDE00\"}"))))
+
+(test surrogate-decoding
+  ;; lone low surrogate
+  (signals error
+    (let ((json:*use-strict-json-rules* t))
+      (with-decoder-simple-list-semantics
+        (decode-json-from-string "\"\\uDC80\""))))
+
+  (is (string= (string (code-char #xdc80))
+               (let ((json:*use-strict-json-rules* nil))
+                 (with-decoder-simple-list-semantics
+                   (decode-json-from-string "\"\\uDC80\"")))))
+
+  ;; high surrogate and end of string
+  (signals error
+    (let ((json:*use-strict-json-rules* t))
+      (with-decoder-simple-list-semantics
+        (decode-json-from-string "\"\\uD800\""))))
+
+  (is (string= (string (code-char #xd800))
+               (let ((json:*use-strict-json-rules* nil))
+                 (with-decoder-simple-list-semantics
+                   (decode-json-from-string "\"\\uD800\"")))))
+
+  ;; high surrogate and normal character
+  (signals error
+    (let ((json:*use-strict-json-rules* t))
+      (with-decoder-simple-list-semantics
+        (decode-json-from-string "\"\\uD800c\""))))
+
+  (is (string= (format nil "~Cc" (code-char #xd800))
+               (let ((json:*use-strict-json-rules* nil))
+                 (with-decoder-simple-list-semantics
+                   (decode-json-from-string "\"\\uD800c\"")))))
+
+  ;; high surrogate and non surrogate escape
+  (signals error
+    (let ((json:*use-strict-json-rules* t))
+      (with-decoder-simple-list-semantics
+        (decode-json-from-string "\"\\uD800\\u0012\""))))
+
+  (is (string= (format nil "~C~C" (code-char #xd800) (code-char #x0012))
+               (let ((json:*use-strict-json-rules* nil))
+                 (with-decoder-simple-list-semantics
+                   (decode-json-from-string "\"\\uD800\\u0012\""))))))
diff --git a/t/testencoder.lisp b/t/testencoder.lisp
index 7520265..a4b92b3 100644
--- a/t/testencoder.lisp
+++ b/t/testencoder.lisp
@@ -471,3 +471,23 @@
       (is (string= json-bar
 "{\"method\":\"fooBarCheck\",\"id\":0,\"params\":{\"bar\":{\"bar\":true},\"isFoo\":false,\"isBar\":true}}")))))
 
+(test non-ascii-char-encoding
+  (is (string= "{\"foo\":\"\\u00A9\"}"
+               (with-explicit-encoder
+                 (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Copyright Sign"))))))))
+
+(test non-bmp-char-encoding
+  (is (string= "{\"foo\":\"\\uD83D\\uDE00\"}"
+               (with-explicit-encoder
+                 (json:encode-json-to-string (list :object "foo" (string (cl-unicode:character-named "Grinning Face"))))))))
+
+(test surrogate-encoding
+  (signals error
+    (let ((json:*use-strict-json-rules* t))
+      (with-explicit-encoder
+        (json:encode-json-to-string (string (code-char #xdc80))))))
+
+  (is (string= "\"\\uDC80\""
+               (let ((json:*use-strict-json-rules* nil))
+                 (with-explicit-encoder
+                   (json:encode-json-to-string (string (code-char #xdc80))))))))