Просмотр исходного кода

Reimplement the utf8 string check (#389)

The previous implementation did not reject overlong UTF-8 encodings.

Signed-off-by: Xiaokang Qin <xiaokang.qxk@antgroup.com>
Xiaokang Qin 5 лет назад
Родитель
Коммит
2d06567cd1
1 изменённый файл: 59 добавлений и 23 удаления
  1. 59 23
      core/iwasm/interpreter/wasm_loader.c

+ 59 - 23
core/iwasm/interpreter/wasm_loader.c

@@ -250,36 +250,72 @@ loader_malloc(uint64 size, char *error_buf, uint32 error_buf_size)
 static bool
 check_utf8_str(const uint8* str, uint32 len)
 {
-    const uint8 *p = str, *p_end = str + len, *p_end1;
-    uint8 chr, n_bytes;
+    /* Check that str[0..len) is well-formed UTF-8; byte ranges per page 125 of
+       https://www.unicode.org/versions/Unicode9.0.0/ch03.pdf */
+    const uint8 *p = str, *p_end = str + len;
+    uint8 chr;
 
     while (p < p_end) {
-        chr = *p++;
-        if (chr >= 0x80) {
-            /* Calculate the byte count: the first byte must be
-               110XXXXX, 1110XXXX, 11110XXX, 111110XX, or 1111110X,
-               the count of leading '1' denotes the total byte count */
-            n_bytes = 0;
-            while ((chr & 0x80) != 0) {
-                chr = (uint8)(chr << 1);
-                n_bytes++;
-            }
-
-            /* Check byte count */
-            if (n_bytes < 2 || n_bytes > 6
-                || p + n_bytes - 1 > p_end)
+        chr = *p;
+        if (chr < 0x80) {
+            p++; /* single-byte (ASCII) */
+        }
+        else if (chr >= 0xC2 && chr <= 0xDF && p + 1 < p_end) {
+            if (p[1] < 0x80 || p[1] > 0xBF) {
                 return false;
-
-            /* Check the following bytes, which must be 10XXXXXX */
-            p_end1 = p + n_bytes - 1;
-            while (p < p_end1) {
-                if (!(*p & 0x80) || (*p | 0x40))
+            }
+            p += 2; /* 2-byte form; overlong leads 0xC0/0xC1 hit the final else */
+        }
+        else if (chr >= 0xE0 && chr <= 0xEF && p + 2 < p_end) {
+            if (chr == 0xE0) { /* 2nd byte >= 0xA0 excludes overlong forms */
+                if (p[1] < 0xA0 || p[1] > 0xBF
+                    || p[2] < 0x80 || p[2] > 0xBF) {
                     return false;
-                p++;
+                }
+            }
+            else if (chr == 0xED) { /* 2nd byte <= 0x9F excludes surrogates */
+                if (p[1] < 0x80 || p[1] > 0x9F
+                    || p[2] < 0x80 || p[2] > 0xBF) {
+                    return false;
+                }
+            }
+            else if (chr >= 0xE1 && chr <= 0xEF) {
+                if (p[1] < 0x80 || p[1] > 0xBF
+                    || p[2] < 0x80 || p[2] > 0xBF) {
+                    return false;
+                }
             }
+            p += 3;
+        }
+        else if (chr >= 0xF0 && chr <= 0xF4 && p + 3 < p_end) {
+            if (chr == 0xF0) { /* 2nd byte >= 0x90 excludes overlong forms */
+                if (p[1] < 0x90 || p[1] > 0xBF
+                    || p[2] < 0x80 || p[2] > 0xBF
+                    || p[3] < 0x80 || p[3] > 0xBF) {
+                    return false;
+                }
+            }
+            else if (chr >= 0xF1 && chr <= 0xF3) {
+                if (p[1] < 0x80 || p[1] > 0xBF
+                    || p[2] < 0x80 || p[2] > 0xBF
+                    || p[3] < 0x80 || p[3] > 0xBF) {
+                    return false;
+                }
+            }
+            else if (chr == 0xF4) { /* 2nd byte <= 0x8F caps at U+10FFFF */
+                if (p[1] < 0x80 || p[1] > 0x8F
+                    || p[2] < 0x80 || p[2] > 0xBF
+                    || p[3] < 0x80 || p[3] > 0xBF) {
+                    return false;
+                }
+            }
+            p += 4;
+        }
+        else {
+            return false; /* invalid lead byte or truncated sequence */
         }
     }
-    return true;
+    return (p == p_end);
 }
 
 static char*