Browse Source

Reduced Unicode conversion code size (-122 bytes on AVR)

Benoit Blanchon 6 years ago
parent
commit
5ec062cc71

+ 11 - 5
extras/tests/JsonDeserializer/invalid_input.cpp

@@ -7,10 +7,9 @@
 #include <catch.hpp>
 
 TEST_CASE("Invalid JSON input") {
-  const char* testCases[] = {
-      "'\\u'",     "'\\u000g'",  "'\\u000'",  "'\\u000G'", "'\\ud83d\\ud83d'",
-      "'\\udda4'", "'\\ud83d_'", "'\\u000/'", "\\x1234",   "6a9",
-      "1,",        "2]",         "3}"};
+  const char* testCases[] = {"'\\u'",     "'\\u000g'", "'\\u000'", "'\\u000G'",
+                             "'\\u000/'", "\\x1234",   "6a9",      "1,",
+                             "2]",        "3}"};
   const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
 
   DynamicJsonDocument doc(4096);
@@ -23,7 +22,14 @@ TEST_CASE("Invalid JSON input") {
 }
 
 TEST_CASE("Invalid JSON input that should pass") {
-  const char* testCases[] = {"nulL", "tru3", "fals3"};
+  const char* testCases[] = {
+      "nulL",
+      "tru3",
+      "fals3",
+      "'\\ud83d'",         // leading surrogate without a trailing surrogate
+      "'\\udda4'",         // trailing surrogate without a leading surrogate
+      "'\\ud83d\\ud83d'",  // two leading surrogates
+  };
   const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
 
   DynamicJsonDocument doc(4096);

+ 1 - 0
extras/tests/Misc/CMakeLists.txt

@@ -10,6 +10,7 @@ add_executable(MiscTests
 	StringWriter.cpp
 	TypeTraits.cpp
 	unsigned_char.cpp
+	Utf8.cpp
 	version.cpp
 )
 

+ 59 - 0
extras/tests/Misc/Utf8.cpp

@@ -0,0 +1,59 @@
+// ArduinoJson - arduinojson.org
+// Copyright Benoit Blanchon 2014-2019
+// MIT License
+
+#include <ArduinoJson.h>
+#include <catch.hpp>
+
+#include <string>
+
+using namespace ARDUINOJSON_NAMESPACE;
+
+static void testCodepoint(uint32_t codepoint, std::string expected) {
+  char buffer[4096];
+  MemoryPool pool(buffer, 4096);
+  StringBuilder str(&pool);
+
+  CAPTURE(codepoint);
+  Utf8::encodeCodepoint(codepoint, str);
+
+  REQUIRE(str.complete() == expected);
+}
+
+TEST_CASE("Utf8::encodeCodepoint()") {
+  SECTION("U+0000") {
+    testCodepoint(0x0000, "");
+  }
+
+  SECTION("U+0001") {
+    testCodepoint(0x0001, "\x01");
+  }
+
+  SECTION("U+007F") {
+    testCodepoint(0x007F, "\x7f");
+  }
+
+  SECTION("U+0080") {
+    testCodepoint(0x0080, "\xc2\x80");
+  }
+
+  SECTION("U+07FF") {
+    testCodepoint(0x07FF, "\xdf\xbf");
+  }
+
+  SECTION("U+0800") {
+    testCodepoint(0x0800, "\xe0\xa0\x80");
+  }
+
+  SECTION("U+FFFF") {
+    testCodepoint(0xFFFF, "\xef\xbf\xbf");
+  }
+
+  SECTION("U+10000") {
+    testCodepoint(0x10000, "\xf0\x90\x80\x80");
+  }
+
+  SECTION("U+10FFFF") {
+    testCodepoint(0x10FFFF, "\xf4\x8f\xbf\xbf");
+  }
+}

+ 16 - 18
src/ArduinoJson/Json/JsonDeserializer.hpp

@@ -189,7 +189,9 @@ class JsonDeserializer {
 
   DeserializationError parseQuotedString(const char *&result) {
     StringBuilder builder = _stringStorage.startString();
+#if ARDUINOJSON_DECODE_UNICODE
     uint16_t surrogate1 = 0;
+#endif
     const char stopChar = current();
 
     move();
@@ -205,23 +207,21 @@ class JsonDeserializer {
         if (c == '\0') return DeserializationError::IncompleteInput;
         if (c == 'u') {
 #if ARDUINOJSON_DECODE_UNICODE
-          uint16_t codepoint;
           move();
-          DeserializationError err = parseCodepoint(codepoint);
+          uint32_t codepoint;
+          uint16_t codeunit;
+          DeserializationError err = parseHex4(codeunit);
           if (err) return err;
-          if (codepoint >= 0xd800 && codepoint <= 0xdbff) {
-            if (surrogate1 > 0) return DeserializationError::InvalidInput;
-            surrogate1 = codepoint;
-          } else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) {
-            if (surrogate1 == 0) return DeserializationError::InvalidInput;
-            uint32_t codepoint32 = 0x10000;
-            codepoint32 += static_cast<uint32_t>(surrogate1 - 0xd800) << 10;
-            codepoint32 += codepoint - 0xdc00;
-            Utf8::encodeCodepoint(codepoint32, builder);
-            surrogate1 = 0;
+          if (codeunit >= 0xDC00) {
+            codepoint =
+                uint32_t(0x10000 | ((surrogate1 << 10) | (codeunit & 0x3FF)));
+          } else if (codeunit < 0xd800) {
+            codepoint = codeunit;
           } else {
-            Utf8::encodeCodepoint(codepoint, builder);
+            surrogate1 = codeunit & 0x3FF;
+            continue;
           }
+          Utf8::encodeCodepoint(codepoint, builder);
           continue;
 #else
           return DeserializationError::NotSupported;
@@ -233,8 +233,6 @@ class JsonDeserializer {
         move();
       }
 
-      if (surrogate1 > 0) return DeserializationError::InvalidInput;
-
       builder.append(c);
     }
 
@@ -312,14 +310,14 @@ class JsonDeserializer {
     return DeserializationError::InvalidInput;
   }
 
-  DeserializationError parseCodepoint(uint16_t &codepoint) {
-    codepoint = 0;
+  DeserializationError parseHex4(uint16_t &result) {
+    result = 0;
     for (uint8_t i = 0; i < 4; ++i) {
       char digit = current();
       if (!digit) return DeserializationError::IncompleteInput;
       uint8_t value = decodeHex(digit);
       if (value > 0x0F) return DeserializationError::InvalidInput;
-      codepoint = uint16_t((codepoint << 4) | value);
+      result = uint16_t((result << 4) | value);
       move();
     }
     return DeserializationError::Ok;

+ 28 - 14
src/ArduinoJson/Json/Utf8.hpp

@@ -10,23 +10,37 @@ namespace ARDUINOJSON_NAMESPACE {
 
 namespace Utf8 {
 template <typename TStringBuilder>
-inline void encodeCodepoint(uint32_t codepoint, TStringBuilder &str) {
-  if (codepoint < 0x80) {
-    str.append(char(codepoint));
-    return;
+inline void encodeCodepoint(uint32_t codepoint32, TStringBuilder& str) {
+  // this function was optimized for code size on AVR
+
+  // a buffer to store the string in reverse
+  char buf[5];
+  char* p = buf;
+
+  *(p++) = 0;
+  if (codepoint32 < 0x80) {
+    *(p++) = char((codepoint32));
+  } else {
+    *(p++) = char((codepoint32 | 0x80) & 0xBF);
+    uint16_t codepoint16 = uint16_t(codepoint32 >> 6);
+    if (codepoint16 < 0x20) {  // 0x800
+      *(p++) = char(codepoint16 | 0xC0);
+    } else {
+      *(p++) = char((codepoint16 | 0x80) & 0xBF);
+      codepoint16 = uint16_t(codepoint16 >> 6);
+      if (codepoint16 < 0x10) {  // 0x10000
+        *(p++) = char(codepoint16 | 0xE0);
+      } else {
+        *(p++) = char((codepoint16 | 0x80) & 0xBF);
+        codepoint16 = uint16_t(codepoint16 >> 6);
+        *(p++) = char(codepoint16 | 0xF0);
+      }
+    }
   }
 
-  if (codepoint < 0x00000800) {
-    str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6)));
-  } else if (codepoint < 0x00010000) {
-    str.append(char(0xe0 /*0b11100000*/ | (codepoint >> 12)));
-    str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
-  } else if (codepoint < 0x00110000) {
-    str.append(char(0xf0 /*0b11110000*/ | (codepoint >> 18)));
-    str.append(char(((codepoint >> 12) & 0x3f /*0b00111111*/) | 0x80));
-    str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
+  while (*(--p)) {
+    str.append(*p);
   }
-  str.append(char((codepoint & 0x3f /*0b00111111*/) | 0x80));
 }
 }  // namespace Utf8
 }  // namespace ARDUINOJSON_NAMESPACE