Quellcode durchsuchen

Improved decoding of UTF-16 surrogate pairs (closes #1157)

Kay Sievers vor 6 Jahren
Ursprung
Commit
91b808381e

+ 4 - 1
CHANGELOG.md

@@ -7,7 +7,10 @@ HEAD
 * Added `BasicJsonDocument::shrinkToFit()`
 * Added support of `uint8_t` for `serializeJson()`, `serializeJsonPretty()`, and `serializeMsgPack()` (issue #1142)
 * Auto enable support for `std::string` and `std::stream` on modern compilers (issue #1156)
-  No need to define `ARDUINOJSON_ENABLE_STD_STRING` and `ARDUINOJSON_ENABLE_STD_STREAM`.
+  (No need to define `ARDUINOJSON_ENABLE_STD_STRING` and `ARDUINOJSON_ENABLE_STD_STREAM` anymore)
+* Improved decoding of UTF-16 surrogate pairs (PR #1157 by @kaysievers)
+  (ArduinoJson now produces standard UTF-8 instead of CESU-8)
+
 
 v6.13.0 (2019-11-01)
 -------

+ 4 - 3
extras/tests/JsonDeserializer/invalid_input.cpp

@@ -7,9 +7,10 @@
 #include <catch.hpp>
 
 TEST_CASE("Invalid JSON input") {
-  const char* testCases[] = {"'\\u'",     "'\\u000g'", "'\\u000'", "'\\u000G'",
-                             "'\\u000/'", "\\x1234",   "6a9",      "1,",
-                             "2]",        "3}"};
+  const char* testCases[] = {
+      "'\\u'",     "'\\u000g'",  "'\\u000'",  "'\\u000G'", "'\\ud83d\\ud83d'",
+      "'\\udda4'", "'\\ud83d_'", "'\\u000/'", "\\x1234",   "6a9",
+      "1,",        "2]",         "3}"};
   const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
 
   DynamicJsonDocument doc(4096);

+ 4 - 4
extras/tests/JsonDeserializer/string.cpp

@@ -17,10 +17,10 @@ TEST_CASE("Valid JSON strings value") {
       {"\'hello world\'", "hello world"},
       {"\"1\\\"2\\\\3\\/4\\b5\\f6\\n7\\r8\\t9\"", "1\"2\\3/4\b5\f6\n7\r8\t9"},
       {"'\\u0041'", "A"},
-      {"'\\u00e4'", "\xc3\xa4"},      // ä
-      {"'\\u00E4'", "\xc3\xa4"},      // ä
-      {"'\\u3042'", "\xe3\x81\x82"},  // あ
-
+      {"'\\u00e4'", "\xc3\xa4"},                 // ä
+      {"'\\u00E4'", "\xc3\xa4"},                 // ä
+      {"'\\u3042'", "\xe3\x81\x82"},             // あ
+      {"'\\ud83d\\udda4'", "\xf0\x9f\x96\xa4"},  // 🖤
   };
   const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
 

+ 16 - 1
src/ArduinoJson/Json/JsonDeserializer.hpp

@@ -189,6 +189,7 @@ class JsonDeserializer {
 
   DeserializationError parseQuotedString(const char *&result) {
     StringBuilder builder = _stringStorage.startString();
+    uint16_t surrogate1 = 0;
     const char stopChar = current();
 
     move();
@@ -208,7 +209,19 @@ class JsonDeserializer {
           move();
           DeserializationError err = parseCodepoint(codepoint);
           if (err) return err;
-          Utf8::encodeCodepoint(codepoint, builder);
+          if (codepoint >= 0xd800 && codepoint <= 0xdbff) {
+            if (surrogate1 > 0) return DeserializationError::InvalidInput;
+            surrogate1 = codepoint;
+          } else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) {
+            if (surrogate1 == 0) return DeserializationError::InvalidInput;
+            uint32_t codepoint32 = 0x10000;
+            codepoint32 += static_cast<uint32_t>(surrogate1 - 0xd800) << 10;
+            codepoint32 += codepoint - 0xdc00;
+            Utf8::encodeCodepoint(codepoint32, builder);
+            surrogate1 = 0;
+          } else {
+            Utf8::encodeCodepoint(codepoint, builder);
+          }
           continue;
 #else
           return DeserializationError::NotSupported;
@@ -220,6 +233,8 @@ class JsonDeserializer {
         move();
       }
 
+      if (surrogate1 > 0) return DeserializationError::InvalidInput;
+
       builder.append(c);
     }
 

+ 8 - 4
src/ArduinoJson/Json/Utf8.hpp

@@ -10,17 +10,21 @@ namespace ARDUINOJSON_NAMESPACE {
 
 namespace Utf8 {
 template <typename TStringBuilder>
-inline void encodeCodepoint(uint16_t codepoint, TStringBuilder &str) {
+inline void encodeCodepoint(uint32_t codepoint, TStringBuilder &str) {
   if (codepoint < 0x80) {
     str.append(char(codepoint));
     return;
   }
 
-  if (codepoint >= 0x00000800) {
+  if (codepoint < 0x00000800) {
+    str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6)));
+  } else if (codepoint < 0x00010000) {
     str.append(char(0xe0 /*0b11100000*/ | (codepoint >> 12)));
     str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
-  } else {
-    str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6)));
+  } else if (codepoint < 0x00110000) {
+    str.append(char(0xf0 /*0b11110000*/ | (codepoint >> 18)));
+    str.append(char(((codepoint >> 12) & 0x3f /*0b00111111*/) | 0x80));
+    str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
   }
   str.append(char((codepoint & 0x3f /*0b00111111*/) | 0x80));
 }