  // Tencent is pleased to support the open source community by making RapidJSON available.
  //
  // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
  //
  // Licensed under the MIT License (the "License"); you may not use this file except
  // in compliance with the License. You may obtain a copy of the License at
  //
  // http://opensource.org/licenses/MIT
  //
  // Unless required by applicable law or agreed to in writing, software distributed
  // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  // specific language governing permissions and limitations under the License.

  #include "unittest.h"
  #include "rapidjson/filereadstream.h"
  #include "rapidjson/stringbuffer.h"

  #include <iostream>

  using namespace rapidjson;

  // Verification of encoders/decoders with Hoehrmann's UTF8 decoder

  // Code point ranges iterated by the encoding round-trip tests in this file.
  //
  // Layout: a flat list of inclusive [first, last] pairs, one pair per Unicode
  // block, in ascending order, terminated by the single sentinel 0xFFFFFFFF.
  // Consumers walk it two entries at a time until they see the sentinel.
  //
  // The three surrogate blocks (U+D800..U+DFFF) are deliberately commented
  // out, so those values are never fed to the encoders by the range loops.
  //
  // Block names: http://www.unicode.org/Public/UNIDATA/Blocks.txt
  static const unsigned kCodepointRanges[] = {
      0x0000, 0x007F, // Basic Latin
      0x0080, 0x00FF, // Latin-1 Supplement
      0x0100, 0x017F, // Latin Extended-A
      0x0180, 0x024F, // Latin Extended-B
      0x0250, 0x02AF, // IPA Extensions
      0x02B0, 0x02FF, // Spacing Modifier Letters
      0x0300, 0x036F, // Combining Diacritical Marks
      0x0370, 0x03FF, // Greek and Coptic
      0x0400, 0x04FF, // Cyrillic
      0x0500, 0x052F, // Cyrillic Supplement
      0x0530, 0x058F, // Armenian
      0x0590, 0x05FF, // Hebrew
      0x0600, 0x06FF, // Arabic
      0x0700, 0x074F, // Syriac
      0x0750, 0x077F, // Arabic Supplement
      0x0780, 0x07BF, // Thaana
      0x07C0, 0x07FF, // NKo
      0x0800, 0x083F, // Samaritan
      0x0840, 0x085F, // Mandaic
      0x0900, 0x097F, // Devanagari
      0x0980, 0x09FF, // Bengali
      0x0A00, 0x0A7F, // Gurmukhi
      0x0A80, 0x0AFF, // Gujarati
      0x0B00, 0x0B7F, // Oriya
      0x0B80, 0x0BFF, // Tamil
      0x0C00, 0x0C7F, // Telugu
      0x0C80, 0x0CFF, // Kannada
      0x0D00, 0x0D7F, // Malayalam
      0x0D80, 0x0DFF, // Sinhala
      0x0E00, 0x0E7F, // Thai
      0x0E80, 0x0EFF, // Lao
      0x0F00, 0x0FFF, // Tibetan
      0x1000, 0x109F, // Myanmar
      0x10A0, 0x10FF, // Georgian
      0x1100, 0x11FF, // Hangul Jamo
      0x1200, 0x137F, // Ethiopic
      0x1380, 0x139F, // Ethiopic Supplement
      0x13A0, 0x13FF, // Cherokee
      0x1400, 0x167F, // Unified Canadian Aboriginal Syllabics
      0x1680, 0x169F, // Ogham
      0x16A0, 0x16FF, // Runic
      0x1700, 0x171F, // Tagalog
      0x1720, 0x173F, // Hanunoo
      0x1740, 0x175F, // Buhid
      0x1760, 0x177F, // Tagbanwa
      0x1780, 0x17FF, // Khmer
      0x1800, 0x18AF, // Mongolian
      0x18B0, 0x18FF, // Unified Canadian Aboriginal Syllabics Extended
      0x1900, 0x194F, // Limbu
      0x1950, 0x197F, // Tai Le
      0x1980, 0x19DF, // New Tai Lue
      0x19E0, 0x19FF, // Khmer Symbols
      0x1A00, 0x1A1F, // Buginese
      0x1A20, 0x1AAF, // Tai Tham
      0x1B00, 0x1B7F, // Balinese
      0x1B80, 0x1BBF, // Sundanese
      0x1BC0, 0x1BFF, // Batak
      0x1C00, 0x1C4F, // Lepcha
      0x1C50, 0x1C7F, // Ol Chiki
      0x1CD0, 0x1CFF, // Vedic Extensions
      0x1D00, 0x1D7F, // Phonetic Extensions
      0x1D80, 0x1DBF, // Phonetic Extensions Supplement
      0x1DC0, 0x1DFF, // Combining Diacritical Marks Supplement
      0x1E00, 0x1EFF, // Latin Extended Additional
      0x1F00, 0x1FFF, // Greek Extended
      0x2000, 0x206F, // General Punctuation
      0x2070, 0x209F, // Superscripts and Subscripts
      0x20A0, 0x20CF, // Currency Symbols
      0x20D0, 0x20FF, // Combining Diacritical Marks for Symbols
      0x2100, 0x214F, // Letterlike Symbols
      0x2150, 0x218F, // Number Forms
      0x2190, 0x21FF, // Arrows
      0x2200, 0x22FF, // Mathematical Operators
      0x2300, 0x23FF, // Miscellaneous Technical
      0x2400, 0x243F, // Control Pictures
      0x2440, 0x245F, // Optical Character Recognition
      0x2460, 0x24FF, // Enclosed Alphanumerics
      0x2500, 0x257F, // Box Drawing
      0x2580, 0x259F, // Block Elements
      0x25A0, 0x25FF, // Geometric Shapes
      0x2600, 0x26FF, // Miscellaneous Symbols
      0x2700, 0x27BF, // Dingbats
      0x27C0, 0x27EF, // Miscellaneous Mathematical Symbols-A
      0x27F0, 0x27FF, // Supplemental Arrows-A
      0x2800, 0x28FF, // Braille Patterns
      0x2900, 0x297F, // Supplemental Arrows-B
      0x2980, 0x29FF, // Miscellaneous Mathematical Symbols-B
      0x2A00, 0x2AFF, // Supplemental Mathematical Operators
      0x2B00, 0x2BFF, // Miscellaneous Symbols and Arrows
      0x2C00, 0x2C5F, // Glagolitic
      0x2C60, 0x2C7F, // Latin Extended-C
      0x2C80, 0x2CFF, // Coptic
      0x2D00, 0x2D2F, // Georgian Supplement
      0x2D30, 0x2D7F, // Tifinagh
      0x2D80, 0x2DDF, // Ethiopic Extended
      0x2DE0, 0x2DFF, // Cyrillic Extended-A
      0x2E00, 0x2E7F, // Supplemental Punctuation
      0x2E80, 0x2EFF, // CJK Radicals Supplement
      0x2F00, 0x2FDF, // Kangxi Radicals
      0x2FF0, 0x2FFF, // Ideographic Description Characters
      0x3000, 0x303F, // CJK Symbols and Punctuation
      0x3040, 0x309F, // Hiragana
      0x30A0, 0x30FF, // Katakana
      0x3100, 0x312F, // Bopomofo
      0x3130, 0x318F, // Hangul Compatibility Jamo
      0x3190, 0x319F, // Kanbun
      0x31A0, 0x31BF, // Bopomofo Extended
      0x31C0, 0x31EF, // CJK Strokes
      0x31F0, 0x31FF, // Katakana Phonetic Extensions
      0x3200, 0x32FF, // Enclosed CJK Letters and Months
      0x3300, 0x33FF, // CJK Compatibility
      0x3400, 0x4DBF, // CJK Unified Ideographs Extension A
      0x4DC0, 0x4DFF, // Yijing Hexagram Symbols
      0x4E00, 0x9FFF, // CJK Unified Ideographs
      0xA000, 0xA48F, // Yi Syllables
      0xA490, 0xA4CF, // Yi Radicals
      0xA4D0, 0xA4FF, // Lisu
      0xA500, 0xA63F, // Vai
      0xA640, 0xA69F, // Cyrillic Extended-B
      0xA6A0, 0xA6FF, // Bamum
      0xA700, 0xA71F, // Modifier Tone Letters
      0xA720, 0xA7FF, // Latin Extended-D
      0xA800, 0xA82F, // Syloti Nagri
      0xA830, 0xA83F, // Common Indic Number Forms
      0xA840, 0xA87F, // Phags-pa
      0xA880, 0xA8DF, // Saurashtra
      0xA8E0, 0xA8FF, // Devanagari Extended
      0xA900, 0xA92F, // Kayah Li
      0xA930, 0xA95F, // Rejang
      0xA960, 0xA97F, // Hangul Jamo Extended-A
      0xA980, 0xA9DF, // Javanese
      0xAA00, 0xAA5F, // Cham
      0xAA60, 0xAA7F, // Myanmar Extended-A
      0xAA80, 0xAADF, // Tai Viet
      0xAB00, 0xAB2F, // Ethiopic Extended-A
      0xABC0, 0xABFF, // Meetei Mayek
      0xAC00, 0xD7AF, // Hangul Syllables
      0xD7B0, 0xD7FF, // Hangul Jamo Extended-B
      //0xD800, 0xDB7F, // High Surrogates
      //0xDB80, 0xDBFF, // High Private Use Surrogates
      //0xDC00, 0xDFFF, // Low Surrogates
      0xE000, 0xF8FF, // Private Use Area
      0xF900, 0xFAFF, // CJK Compatibility Ideographs
      0xFB00, 0xFB4F, // Alphabetic Presentation Forms
      0xFB50, 0xFDFF, // Arabic Presentation Forms-A
      0xFE00, 0xFE0F, // Variation Selectors
      0xFE10, 0xFE1F, // Vertical Forms
      0xFE20, 0xFE2F, // Combining Half Marks
      0xFE30, 0xFE4F, // CJK Compatibility Forms
      0xFE50, 0xFE6F, // Small Form Variants
      0xFE70, 0xFEFF, // Arabic Presentation Forms-B
      0xFF00, 0xFFEF, // Halfwidth and Fullwidth Forms
      0xFFF0, 0xFFFF, // Specials
      0x10000, 0x1007F, // Linear B Syllabary
      0x10080, 0x100FF, // Linear B Ideograms
      0x10100, 0x1013F, // Aegean Numbers
      0x10140, 0x1018F, // Ancient Greek Numbers
      0x10190, 0x101CF, // Ancient Symbols
      0x101D0, 0x101FF, // Phaistos Disc
      0x10280, 0x1029F, // Lycian
      0x102A0, 0x102DF, // Carian
      0x10300, 0x1032F, // Old Italic
      0x10330, 0x1034F, // Gothic
      0x10380, 0x1039F, // Ugaritic
      0x103A0, 0x103DF, // Old Persian
      0x10400, 0x1044F, // Deseret
      0x10450, 0x1047F, // Shavian
      0x10480, 0x104AF, // Osmanya
      0x10800, 0x1083F, // Cypriot Syllabary
      0x10840, 0x1085F, // Imperial Aramaic
      0x10900, 0x1091F, // Phoenician
      0x10920, 0x1093F, // Lydian
      0x10A00, 0x10A5F, // Kharoshthi
      0x10A60, 0x10A7F, // Old South Arabian
      0x10B00, 0x10B3F, // Avestan
      0x10B40, 0x10B5F, // Inscriptional Parthian
      0x10B60, 0x10B7F, // Inscriptional Pahlavi
      0x10C00, 0x10C4F, // Old Turkic
      0x10E60, 0x10E7F, // Rumi Numeral Symbols
      0x11000, 0x1107F, // Brahmi
      0x11080, 0x110CF, // Kaithi
      0x12000, 0x123FF, // Cuneiform
      0x12400, 0x1247F, // Cuneiform Numbers and Punctuation
      0x13000, 0x1342F, // Egyptian Hieroglyphs
      0x16800, 0x16A3F, // Bamum Supplement
      0x1B000, 0x1B0FF, // Kana Supplement
      0x1D000, 0x1D0FF, // Byzantine Musical Symbols
      0x1D100, 0x1D1FF, // Musical Symbols
      0x1D200, 0x1D24F, // Ancient Greek Musical Notation
      0x1D300, 0x1D35F, // Tai Xuan Jing Symbols
      0x1D360, 0x1D37F, // Counting Rod Numerals
      0x1D400, 0x1D7FF, // Mathematical Alphanumeric Symbols
      0x1F000, 0x1F02F, // Mahjong Tiles
      0x1F030, 0x1F09F, // Domino Tiles
      0x1F0A0, 0x1F0FF, // Playing Cards
      0x1F100, 0x1F1FF, // Enclosed Alphanumeric Supplement
      0x1F200, 0x1F2FF, // Enclosed Ideographic Supplement
      0x1F300, 0x1F5FF, // Miscellaneous Symbols And Pictographs
      0x1F600, 0x1F64F, // Emoticons
      0x1F680, 0x1F6FF, // Transport And Map Symbols
      0x1F700, 0x1F77F, // Alchemical Symbols
      0x20000, 0x2A6DF, // CJK Unified Ideographs Extension B
      0x2A700, 0x2B73F, // CJK Unified Ideographs Extension C
      0x2B740, 0x2B81F, // CJK Unified Ideographs Extension D
      0x2F800, 0x2FA1F, // CJK Compatibility Ideographs Supplement
      0xE0000, 0xE007F, // Tags
      0xE0100, 0xE01EF, // Variation Selectors Supplement
      0xF0000, 0xFFFFF, // Supplementary Private Use Area-A
      0x100000, 0x10FFFF, // Supplementary Private Use Area-B
      0xFFFFFFFF // Sentinel: end of table
  };

  // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

  // State value meaning "a complete code point has just been decoded".
  #define UTF8_ACCEPT 0u

  // Combined lookup table for the table-driven UTF-8 decoder below.
  // Entries [0, 256) map a byte to a character class; entries from 256 on are
  // the state transition table, indexed by (state + character class).
  static const unsigned char utf8d[] = {
      // The first part of the table maps bytes to character classes, which
      // reduces the size of the transition table and creates bitmasks.
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
      7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
      8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

      // The second part is a transition table that maps a combination
      // of a state of the automaton and a character class to a state.
      // States are multiples of 12 (the number of character classes).
      // State 12 is a trap: every transition out of it leads back to 12.
      0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
      12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
      12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
      12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
      12,36,12,12,12,12,12,12,12,12,12,12,
  };

  // Feeds one byte into the UTF-8 decoding automaton.
  //
  // state  in/out: current automaton state; start a sequence with UTF8_ACCEPT.
  // codep  in/out: code point accumulated so far. It holds a finished code
  //                point only when the returned state is UTF8_ACCEPT.
  // byte   the next input byte. It must be below 256 because it indexes the
  //        256-entry class section of utf8d directly (callers in this file
  //        pass a value cast from unsigned char).
  //
  // Returns the new state, which is also stored in *state.
  static inline unsigned decode(unsigned* state, unsigned* codep, unsigned byte) {
      const unsigned type = utf8d[byte];

      if (*state != UTF8_ACCEPT) {
          // Inside a multi-byte sequence: append the low six payload bits.
          *codep = (byte & 0x3fu) | (*codep << 6);
      }
      else {
          // First byte of a sequence: the class doubles as the shift that
          // masks off the length-marker bits of a lead byte.
          *codep = (0xffu >> type) & (byte);
      }

      *state = utf8d[256 + *state + type];
      return *state;
  }

  //static bool IsUTF8(unsigned char* s) {
  //    unsigned codepoint, state = 0;
  //
  //    while (*s)
  //        decode(&state, &codepoint, *s++);
  //
  //    return state == UTF8_ACCEPT;
  //}

  270. TEST(EncodingsTest, UTF8) {
  271. StringBuffer os, os2;
  272. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  273. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  274. os.Clear();
  275. UTF8<>::Encode(os, codepoint);
  276. const char* encodedStr = os.GetString();
  277. // Decode with Hoehrmann
  278. {
  279. unsigned decodedCodepoint = 0;
  280. unsigned state = 0;
  281. unsigned decodedCount = 0;
  282. for (const char* s = encodedStr; *s; ++s)
  283. if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s))) {
  284. EXPECT_EQ(codepoint, decodedCodepoint);
  285. decodedCount++;
  286. }
  287. if (*encodedStr) { // This decoder cannot handle U+0000
  288. EXPECT_EQ(1u, decodedCount); // Should only contain one code point
  289. }
  290. EXPECT_EQ(UTF8_ACCEPT, state);
  291. if (UTF8_ACCEPT != state)
  292. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  293. }
  294. // Decode
  295. {
  296. StringStream is(encodedStr);
  297. unsigned decodedCodepoint;
  298. bool result = UTF8<>::Decode(is, &decodedCodepoint);
  299. EXPECT_TRUE(result);
  300. EXPECT_EQ(codepoint, decodedCodepoint);
  301. if (!result || codepoint != decodedCodepoint)
  302. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  303. }
  304. // Validate
  305. {
  306. StringStream is(encodedStr);
  307. os2.Clear();
  308. bool result = UTF8<>::Validate(is, os2);
  309. EXPECT_TRUE(result);
  310. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  311. }
  312. }
  313. }
  314. // Validate code point before encoding
  315. EXPECT_FALSE(ValidatableEncoder<>::Encode<UTF8<>>(os, 0xFFFFFFFF));
  316. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<UTF8<>>(os, 0xFFFFFFFF));
  317. EXPECT_THROW(ValidatableEncoder<false>::Encode<UTF8<>>(os, 0xFFFFFFFF), AssertException);
  318. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<UTF8<>>(os, 0xFFFFFFFF), AssertException);
  319. }
  320. TEST(EncodingsTest, UTF16) {
  321. GenericStringBuffer<UTF16<> > os, os2;
  322. GenericStringBuffer<UTF8<> > utf8os;
  323. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  324. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  325. os.Clear();
  326. UTF16<>::Encode(os, codepoint);
  327. const UTF16<>::Ch* encodedStr = os.GetString();
  328. // Encode with Hoehrmann's code
  329. if (codepoint != 0) // cannot handle U+0000
  330. {
  331. // encode with UTF8<> first
  332. utf8os.Clear();
  333. UTF8<>::Encode(utf8os, codepoint);
  334. // transcode from UTF8 to UTF16 with Hoehrmann's code
  335. unsigned decodedCodepoint = 0;
  336. unsigned state = 0;
  337. UTF16<>::Ch buffer[3], *p = &buffer[0];
  338. for (const char* s = utf8os.GetString(); *s; ++s) {
  339. if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s)))
  340. break;
  341. }
  342. if (codepoint <= 0xFFFF)
  343. *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
  344. else {
  345. // Encode code points above U+FFFF as surrogate pair.
  346. *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
  347. *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
  348. }
  349. *p++ = '\0';
  350. EXPECT_EQ(0, StrCmp(buffer, encodedStr));
  351. }
  352. // Decode
  353. {
  354. GenericStringStream<UTF16<> > is(encodedStr);
  355. unsigned decodedCodepoint;
  356. bool result = UTF16<>::Decode(is, &decodedCodepoint);
  357. EXPECT_TRUE(result);
  358. EXPECT_EQ(codepoint, decodedCodepoint);
  359. if (!result || codepoint != decodedCodepoint)
  360. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  361. }
  362. // Validate
  363. {
  364. GenericStringStream<UTF16<> > is(encodedStr);
  365. os2.Clear();
  366. bool result = UTF16<>::Validate(is, os2);
  367. EXPECT_TRUE(result);
  368. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  369. }
  370. }
  371. }
  372. // Validate code point before encoding
  373. EXPECT_FALSE(ValidatableEncoder<>::Encode<UTF16<>>(os, 0xFFFFFFFF));
  374. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<UTF16<>>(os, 0xFFFFFFFF));
  375. EXPECT_FALSE(ValidatableEncoder<>::Encode<UTF16<>>(os, 0xD800));
  376. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<UTF16<>>(os, 0xD800));
  377. EXPECT_FALSE(ValidatableEncoder<>::Encode<UTF16<>>(os, 0xDFFF));
  378. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<UTF16<>>(os, 0xDFFF));
  379. EXPECT_THROW(ValidatableEncoder<false>::Encode<UTF16<>>(os, 0xFFFFFFFF), AssertException);
  380. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<UTF16<>>(os, 0xFFFFFFFF), AssertException);
  381. EXPECT_THROW(ValidatableEncoder<false>::Encode<UTF16<>>(os, 0xD800), AssertException);
  382. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<UTF16<>>(os, 0xD800), AssertException);
  383. EXPECT_THROW(ValidatableEncoder<false>::Encode<UTF16<>>(os, 0xDFFF), AssertException);
  384. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<UTF16<>>(os, 0xDFFF), AssertException);
  385. }
  386. TEST(EncodingsTest, UTF32) {
  387. GenericStringBuffer<UTF32<> > os, os2;
  388. for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
  389. for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
  390. os.Clear();
  391. UTF32<>::Encode(os, codepoint);
  392. const UTF32<>::Ch* encodedStr = os.GetString();
  393. // Decode
  394. {
  395. GenericStringStream<UTF32<> > is(encodedStr);
  396. unsigned decodedCodepoint;
  397. bool result = UTF32<>::Decode(is, &decodedCodepoint);
  398. EXPECT_TRUE(result);
  399. EXPECT_EQ(codepoint, decodedCodepoint);
  400. if (!result || codepoint != decodedCodepoint)
  401. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  402. }
  403. // Validate
  404. {
  405. GenericStringStream<UTF32<> > is(encodedStr);
  406. os2.Clear();
  407. bool result = UTF32<>::Validate(is, os2);
  408. EXPECT_TRUE(result);
  409. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  410. }
  411. }
  412. }
  413. // Validate code point before encoding
  414. EXPECT_FALSE(ValidatableEncoder<>::Encode<UTF32<>>(os, 0xFFFFFFFF));
  415. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<UTF32<>>(os, 0xFFFFFFFF));
  416. EXPECT_THROW(ValidatableEncoder<false>::Encode<UTF32<>>(os, 0xFFFFFFFF), AssertException);
  417. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<UTF32<>>(os, 0xFFFFFFFF), AssertException);
  418. }
  419. TEST(EncodingsTest, ASCII) {
  420. StringBuffer os, os2;
  421. for (unsigned codepoint = 0; codepoint < 128; codepoint++) {
  422. os.Clear();
  423. ASCII<>::Encode(os, codepoint);
  424. const ASCII<>::Ch* encodedStr = os.GetString();
  425. {
  426. StringStream is(encodedStr);
  427. unsigned decodedCodepoint;
  428. bool result = ASCII<>::Decode(is, &decodedCodepoint);
  429. if (!result || codepoint != decodedCodepoint)
  430. std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
  431. }
  432. // Validate
  433. {
  434. StringStream is(encodedStr);
  435. os2.Clear();
  436. bool result = ASCII<>::Validate(is, os2);
  437. EXPECT_TRUE(result);
  438. EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
  439. }
  440. }
  441. // Validate code point before encoding
  442. EXPECT_FALSE(ValidatableEncoder<>::Encode<ASCII<>>(os, 0x0080));
  443. EXPECT_FALSE(ValidatableEncoder<>::EncodeUnsafe<ASCII<>>(os, 0x0080));
  444. EXPECT_THROW(ValidatableEncoder<false>::Encode<ASCII<>>(os, 0x0080), AssertException);
  445. EXPECT_THROW(ValidatableEncoder<false>::EncodeUnsafe<ASCII<>>(os, 0x0080), AssertException);
  446. }