55#include  < nlohmann/json.hpp> 
66
77#include  < string> 
8+ #include  < regex> 
89
910using  json = nlohmann::ordered_json;
1011
@@ -168,6 +169,47 @@ bool common_json_parse(
168169                }
169170            }
170171
172+             //  Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
173+             static  const  std::regex partial_unicode_regex (R"( \\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)" 
174+ 
175+             auto  is_high_surrogate = [&](const  std::string & s) {
176+                 //  Check whether s is a (possibly partial) \u escape of a high surrogate (U+D800-U+DBFF)
177+                 return  s.length () >= 4  &&
178+                     s[0 ] == ' \\ ' 1 ] == ' u' 
179+                     std::tolower (s[2 ]) == ' d' 
180+                     (s[3 ] == ' 8' 3 ] == ' 9' std::tolower (s[3 ]) == ' a' std::tolower (s[3 ]) == ' b' 
181+             };
182+ 
183+             //  Initialize the unicode marker to a low surrogate to handle the edge case
184+             //  where a high surrogate (U+D800-U+DBFF) is immediately followed by a
185+             //  backslash (\)
186+             std::string unicode_marker_padding = " udc00" 
187+             std::smatch last_unicode_seq;
188+ 
189+             if  (std::regex_search (str, last_unicode_seq, partial_unicode_regex)) {
190+                 std::smatch second_last_seq;
191+                 std::string prelude = str.substr (0 , last_unicode_seq.position ());
192+ 
193+                 //  Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
194+                 unicode_marker_padding = std::string (6  - last_unicode_seq.length (), ' 0' 
195+ 
196+                 if  (is_high_surrogate (last_unicode_seq.str ())) {
197+                     //  If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+DFFF)
198+                     unicode_marker_padding += " \\ udc00" 
199+                 } else  if  (std::regex_search (prelude, second_last_seq, partial_unicode_regex)) {
200+                     if  (is_high_surrogate (second_last_seq.str ())) {
201+                         //  If this follows a high surrogate, pad it to be a low surrogate
202+                         if  (last_unicode_seq.length () == 2 ) {
203+                             unicode_marker_padding = " dc00" 
204+                         } else  if  (last_unicode_seq.length () == 3 ) {
205+                             unicode_marker_padding = " c00" 
206+                         } else  {
207+                             //  The original unicode_marker_padding is already padded with 0s
208+                         }
209+                     }
210+                 }
211+             }
212+ 
171213            const  auto  & magic_seed = out.healing_marker .marker  = healing_marker;// "$llama.cpp.json$";
172214
173215            if  (err_loc.stack .back ().type  == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
186228                } else  if  (str[str.length () - 1 ] == ' \\ ' can_parse (str + " \\\" " 
187229                    //  Was inside an object value string after an escape
188230                    str += (out.healing_marker .json_dump_marker  = " \\ " " \" " 
231+                 } else  if  (can_parse (str + unicode_marker_padding + " \" " 
232+                     //  Was inside an object value string after a partial unicode escape
233+                     str += (out.healing_marker .json_dump_marker  = unicode_marker_padding + magic_seed) + " \" " 
189234                } else  {
190235                    //  find last :
191236                    auto  last_pos = str.find_last_of (' :' 
@@ -205,6 +250,9 @@ bool common_json_parse(
205250                } else  if  (str[str.length () - 1 ] == ' \\ ' can_parse (str + " \\\" " 
206251                    //  Was inside an array value string after an escape
207252                    str += (out.healing_marker .json_dump_marker  = " \\ " " \" " 
253+                 } else  if  (can_parse (str + unicode_marker_padding + " \" " 
254+                     //  Was inside an array value string after a partial unicode escape
255+                     str += (out.healing_marker .json_dump_marker  = unicode_marker_padding + magic_seed) + " \" " 
208256                } else  if  (!was_maybe_number () && can_parse (str + " , 1" 
209257                    //  Had just finished a value
210258                    str += (out.healing_marker .json_dump_marker  = " ,\" " " \" " 
@@ -230,6 +278,9 @@ bool common_json_parse(
230278                } else  if  (str[str.length () - 1 ] == ' \\ ' can_parse (str + " \\\" : 1" 
231279                    //  Was inside an object key string after an escape
232280                    str += (out.healing_marker .json_dump_marker  = " \\ " " \" : 1" 
281+                 } else  if  (can_parse (str + unicode_marker_padding + " \" : 1" 
282+                     //  Was inside an object key string after a partial unicode escape
283+                     str += (out.healing_marker .json_dump_marker  = unicode_marker_padding + magic_seed) + " \" : 1" 
233284                } else  {
234285                    auto  last_pos = str.find_last_of (' :' 
235286                    if  (last_pos == std::string::npos) {
0 commit comments