utf.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. /*
  2. ** 2004 April 13
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. ** This file contains routines used to translate between UTF-8,
  13. ** UTF-16, UTF-16BE, and UTF-16LE.
  14. **
  15. ** Notes on UTF-8:
  16. **
  **   Byte-0    Byte-1    Byte-2    Byte-3    Value
  **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
  **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
  **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
  **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
  22. **
  23. **
  24. ** Notes on UTF-16: (with wwww+1==uuuuu)
  25. **
  **      Word-0               Word-1          Value
  **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
  **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
  29. **
  30. **
  31. ** BOM or Byte Order Mark:
  32. ** 0xff 0xfe little-endian utf-16 follows
  33. ** 0xfe 0xff big-endian utf-16 follows
  34. **
  35. */
  36. #include "sqliteInt.h"
  37. #include <assert.h>
  38. #include "vdbeInt.h"
  39. #ifndef SQLITE_AMALGAMATION
  /*
  ** sqlite3one is an int constant holding the value 1.  It exists for the
  ** benefit of the SQLITE_BIGENDIAN and SQLITE_LITTLEENDIAN macros, whose
  ** definitions are not part of this file.
  **
  ** NOTE(review): presumably those macros look at the first byte of this
  ** object to detect the host byte order at run time -- confirm against
  ** their definitions.
  **
  ** The enclosing #ifndef SQLITE_AMALGAMATION means this definition is
  ** compiled only when the file is built outside the amalgamation.
  */
  const int sqlite3one = 1;
  45. #endif /* SQLITE_AMALGAMATION */
  /*
  ** Payload bits carried by the lead byte of a multi-byte UTF-8 sequence.
  **
  ** The table is indexed by (lead byte - 0xC0), so it has one entry for
  ** each byte value 0xC0 through 0xFF.  Each entry is the lead byte with
  ** its length-marker bits stripped, i.e. the starting value of the code
  ** point before any continuation bytes are shifted in.
  */
  static const unsigned char sqlite3Utf8Trans1[] = {
    /* lead bytes 0xC0..0xCF: 110yyyyy, low five bits */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

    /* lead bytes 0xD0..0xDF: 110yyyyy, low five bits */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

    /* lead bytes 0xE0..0xEF: 1110zzzz, low four bits */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

    /* lead bytes 0xF0..0xF7: 11110uuu, low three bits */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,

    /* lead bytes 0xF8..0xFB, then 0xFC..0xFD, then 0xFE and 0xFF */
    0x00, 0x01, 0x02, 0x03,
    0x00, 0x01,
    0x00,
    0x00,
  };
  /*
  ** Append the UTF-8 encoding of the code point c at the output cursor
  ** zOut and advance zOut past the bytes written (1 to 4 of them):
  **
  **     c <  0x80       1 byte
  **     c <  0x800      2 bytes
  **     c <  0x10000    3 bytes
  **     otherwise       4 bytes (only the low 21 bits of c are stored)
  **
  ** zOut must be a modifiable pointer to u8-sized elements.  No bounds
  ** checking is done; the caller guarantees room for four bytes.
  **
  ** Every use of an argument is parenthesized so that an expression such
  ** as "base + 1" is encoded correctly, and the body is wrapped in
  ** do{...}while(0) so that the macro behaves as a single statement (it
  ** can be the body of an unbraced if/else).  Note that c is still
  ** evaluated more than once, so it must not have side effects.
  */
  #define WRITE_UTF8(zOut, c) do{                            \
    if( (c)<0x00080 ){                                       \
      *(zOut)++ = (u8)((c)&0xFF);                            \
    }else if( (c)<0x00800 ){                                 \
      *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);                \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
    }else if( (c)<0x10000 ){                                 \
      *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);               \
      *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
    }else{                                                   \
      *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);             \
      *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);             \
      *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
      *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
    }                                                        \
  }while(0)
  /*
  ** Append the little-endian UTF-16 encoding of the code point c at the
  ** output cursor zOut and advance zOut past the bytes written.
  **
  ** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, low byte
  ** first).  Larger values are written as a surrogate pair (4 bytes):
  ** a 0xD8xx high unit followed by a 0xDCxx low unit, each stored low byte
  ** first.  No bounds checking is done.
  **
  ** Every use of an argument is parenthesized so that expression
  ** arguments are encoded correctly, and the body is wrapped in
  ** do{...}while(0) so the macro acts as a single statement.  c is
  ** evaluated more than once and must not have side effects.
  */
  #define WRITE_UTF16LE(zOut, c) do{                                        \
    if( (c)<=0xFFFF ){                                                      \
      *(zOut)++ = (u8)((c)&0x00FF);                                         \
      *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    }else{                                                                  \
      *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
      *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
      *(zOut)++ = (u8)((c)&0x00FF);                                         \
      *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    }                                                                       \
  }while(0)
  /*
  ** Append the big-endian UTF-16 encoding of the code point c at the
  ** output cursor zOut and advance zOut past the bytes written.
  **
  ** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, high
  ** byte first).  Larger values are written as a surrogate pair (4 bytes):
  ** a 0xD8xx high unit followed by a 0xDCxx low unit, each stored high
  ** byte first.  No bounds checking is done.
  **
  ** Every use of an argument is parenthesized so that expression
  ** arguments are encoded correctly, and the body is wrapped in
  ** do{...}while(0) so the macro acts as a single statement.  c is
  ** evaluated more than once and must not have side effects.
  */
  #define WRITE_UTF16BE(zOut, c) do{                                        \
    if( (c)<=0xFFFF ){                                                      \
      *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
      *(zOut)++ = (u8)((c)&0x00FF);                                         \
    }else{                                                                  \
      *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
      *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
      *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
      *(zOut)++ = (u8)((c)&0x00FF);                                         \
    }                                                                       \
  }while(0)
  /*
  ** Read one character of little-endian UTF-16 from the input cursor zIn
  ** into c, advancing zIn past the bytes consumed.
  **
  ** One 16-bit unit (2 bytes) is always read.  If that unit lies in the
  ** surrogate range 0xD800..0xDFFF and TERM is true, a second unit is read
  ** and the two are combined into a single value above 0xFFFF.  TERM is
  ** evaluated after the first unit has been consumed, so a caller can pass
  ** "zIn<zTerm" to stop a trailing surrogate from reading past the end of
  ** the input; when TERM is false the surrogate value is returned as is.
  **
  ** Every use of an argument is parenthesized so that a TERM expression
  ** such as "a || b" combines correctly with the surrogate test, and the
  ** body is wrapped in do{...}while(0) so the macro acts as a single
  ** statement.  c must be a modifiable lvalue.
  */
  #define READ_UTF16LE(zIn, TERM, c) do{                                     \
    (c) = (*(zIn)++);                                                        \
    (c) += ((*(zIn)++)<<8);                                                  \
    if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                               \
      int c2 = (*(zIn)++);                                                   \
      c2 += ((*(zIn)++)<<8);                                                 \
      (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);  \
    }                                                                        \
  }while(0)
  /*
  ** Read one character of big-endian UTF-16 from the input cursor zIn
  ** into c, advancing zIn past the bytes consumed.
  **
  ** One 16-bit unit (2 bytes) is always read.  If that unit lies in the
  ** surrogate range 0xD800..0xDFFF and TERM is true, a second unit is read
  ** and the two are combined into a single value above 0xFFFF.  TERM is
  ** evaluated after the first unit has been consumed, so a caller can pass
  ** "zIn<zTerm" to stop a trailing surrogate from reading past the end of
  ** the input; when TERM is false the surrogate value is returned as is.
  **
  ** Every use of an argument is parenthesized so that a TERM expression
  ** such as "a || b" combines correctly with the surrogate test, and the
  ** body is wrapped in do{...}while(0) so the macro acts as a single
  ** statement.  c must be a modifiable lvalue.
  */
  #define READ_UTF16BE(zIn, TERM, c) do{                                     \
    (c) = ((*(zIn)++)<<8);                                                   \
    (c) += (*(zIn)++);                                                       \
    if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                               \
      int c2 = ((*(zIn)++)<<8);                                              \
      c2 += (*(zIn)++);                                                      \
      (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);  \
    }                                                                        \
  }while(0)
  /*
  ** Translate a single UTF-8 character.  Store the unicode value in c and
  ** advance zIn to the next unread byte.
  **
  ** During translation, the byte that zTerm points to is never read:
  ** continuation bytes are consumed only while zIn!=zTerm, so the input
  ** is treated as if it ended there.  The first byte of the character is
  ** read unconditionally; the caller must ensure zIn!=zTerm on entry.
  **
  ** Notes On Invalid UTF-8:
  **
  **  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
  **     be encoded as a multi-byte character.  Any multi-byte character that
  **     attempts to encode a value between 0x00 and 0x7f is rendered as
  **     0xfffd.
  **
  **  *  This routine never allows a UTF16 surrogate value to be encoded.
  **     If a multi-byte character attempts to encode a value from 0xd800
  **     through 0xdfff then it is rendered as 0xfffd.
  **
  **  *  A multi-byte character that decodes to 0xfffe or 0xffff is also
  **     rendered as 0xfffd.
  **
  **  *  Bytes in the range of 0x80 through 0xbf which occur as the first
  **     byte of a character are interpreted as single-byte characters
  **     and rendered as themselves even though they are technically
  **     invalid characters.
  **
  **  *  This routine accepts any number of continuation bytes after a lead
  **     byte, so there are many different encodings for unicode values
  **     0x80 and greater.  It does not change over-length encodings to
  **     0xfffd as some systems recommend.
  **
  ** The body is wrapped in do{...}while(0) so that the macro expands to a
  ** single statement (an unbraced "if( x ) READ_UTF8(...);" guards the
  ** whole decode, not just the first byte fetch), and every use of an
  ** argument is parenthesized.  c must be a modifiable lvalue.
  */
  #define READ_UTF8(zIn, zTerm, c) do{                       \
    (c) = *((zIn)++);                                        \
    if( (c)>=0xc0 ){                                         \
      (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
      while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
        (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
      }                                                      \
      if( (c)<0x80                                           \
          || ((c)&0xFFFFF800)==0xD800                        \
          || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
    }                                                        \
  }while(0)
  157. u32 sqlite3Utf8Read(
  158. const unsigned char **pz /* Pointer to string from which to read char */
  159. ){
  160. unsigned int c;
  161. /* Same as READ_UTF8() above but without the zTerm parameter.
  162. ** For this routine, we assume the UTF8 string is always zero-terminated.
  163. */
  164. c = *((*pz)++);
  165. if( c>=0xc0 ){
  166. c = sqlite3Utf8Trans1[c-0xc0];
  167. while( (*(*pz) & 0xc0)==0x80 ){
  168. c = (c<<6) + (0x3f & *((*pz)++));
  169. }
  170. if( c<0x80
  171. || (c&0xFFFFF800)==0xD800
  172. || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
  173. }
  174. return c;
  175. }
  176. /*
  177. ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
  178. ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
  179. */
  180. /* #define TRANSLATE_TRACE 1 */
  181. #ifndef SQLITE_OMIT_UTF16
  182. /*
  183. ** This routine transforms the internal text encoding used by pMem to
  184. ** desiredEnc. It is an error if the string is already of the desired
  185. ** encoding, or if *pMem does not contain a string value.
  186. */
  187. int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  188. int len; /* Maximum length of output string in bytes */
  189. unsigned char *zOut; /* Output buffer */
  190. unsigned char *zIn; /* Input iterator */
  191. unsigned char *zTerm; /* End of input */
  192. unsigned char *z; /* Output iterator */
  193. unsigned int c;
  194. assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
  195. assert( pMem->flags&MEM_Str );
  196. assert( pMem->enc!=desiredEnc );
  197. assert( pMem->enc!=0 );
  198. assert( pMem->n>=0 );
  199. #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  200. {
  201. char zBuf[100];
  202. sqlite3VdbeMemPrettyPrint(pMem, zBuf);
  203. fprintf(stderr, "INPUT: %s\n", zBuf);
  204. }
  205. #endif
  206. /* If the translation is between UTF-16 little and big endian, then
  207. ** all that is required is to swap the byte order. This case is handled
  208. ** differently from the others.
  209. */
  210. if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
  211. u8 temp;
  212. int rc;
  213. rc = sqlite3VdbeMemMakeWriteable(pMem);
  214. if( rc!=SQLITE_OK ){
  215. assert( rc==SQLITE_NOMEM );
  216. return SQLITE_NOMEM;
  217. }
  218. zIn = (u8*)pMem->z;
  219. zTerm = &zIn[pMem->n&~1];
  220. while( zIn<zTerm ){
  221. temp = *zIn;
  222. *zIn = *(zIn+1);
  223. zIn++;
  224. *zIn++ = temp;
  225. }
  226. pMem->enc = desiredEnc;
  227. goto translate_out;
  228. }
  229. /* Set len to the maximum number of bytes required in the output buffer. */
  230. if( desiredEnc==SQLITE_UTF8 ){
  231. /* When converting from UTF-16, the maximum growth results from
  232. ** translating a 2-byte character to a 4-byte UTF-8 character.
  233. ** A single byte is required for the output string
  234. ** nul-terminator.
  235. */
  236. pMem->n &= ~1;
  237. len = pMem->n * 2 + 1;
  238. }else{
  239. /* When converting from UTF-8 to UTF-16 the maximum growth is caused
  240. ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
  241. ** character. Two bytes are required in the output buffer for the
  242. ** nul-terminator.
  243. */
  244. len = pMem->n * 2 + 2;
  245. }
  246. /* Set zIn to point at the start of the input buffer and zTerm to point 1
  247. ** byte past the end.
  248. **
  249. ** Variable zOut is set to point at the output buffer, space obtained
  250. ** from sqlite3_malloc().
  251. */
  252. zIn = (u8*)pMem->z;
  253. zTerm = &zIn[pMem->n];
  254. zOut = sqlite3DbMallocRaw(pMem->db, len);
  255. if( !zOut ){
  256. return SQLITE_NOMEM;
  257. }
  258. z = zOut;
  259. if( pMem->enc==SQLITE_UTF8 ){
  260. if( desiredEnc==SQLITE_UTF16LE ){
  261. /* UTF-8 -> UTF-16 Little-endian */
  262. while( zIn<zTerm ){
  263. READ_UTF8(zIn, zTerm, c);
  264. WRITE_UTF16LE(z, c);
  265. }
  266. }else{
  267. assert( desiredEnc==SQLITE_UTF16BE );
  268. /* UTF-8 -> UTF-16 Big-endian */
  269. while( zIn<zTerm ){
  270. READ_UTF8(zIn, zTerm, c);
  271. WRITE_UTF16BE(z, c);
  272. }
  273. }
  274. pMem->n = (int)(z - zOut);
  275. *z++ = 0;
  276. }else{
  277. assert( desiredEnc==SQLITE_UTF8 );
  278. if( pMem->enc==SQLITE_UTF16LE ){
  279. /* UTF-16 Little-endian -> UTF-8 */
  280. while( zIn<zTerm ){
  281. READ_UTF16LE(zIn, zIn<zTerm, c);
  282. WRITE_UTF8(z, c);
  283. }
  284. }else{
  285. /* UTF-16 Big-endian -> UTF-8 */
  286. while( zIn<zTerm ){
  287. READ_UTF16BE(zIn, zIn<zTerm, c);
  288. WRITE_UTF8(z, c);
  289. }
  290. }
  291. pMem->n = (int)(z - zOut);
  292. }
  293. *z = 0;
  294. assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
  295. sqlite3VdbeMemRelease(pMem);
  296. pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
  297. pMem->enc = desiredEnc;
  298. pMem->flags |= (MEM_Term|MEM_Dyn);
  299. pMem->z = (char*)zOut;
  300. pMem->zMalloc = pMem->z;
  301. translate_out:
  302. #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
  303. {
  304. char zBuf[100];
  305. sqlite3VdbeMemPrettyPrint(pMem, zBuf);
  306. fprintf(stderr, "OUTPUT: %s\n", zBuf);
  307. }
  308. #endif
  309. return SQLITE_OK;
  310. }
  311. /*
  312. ** This routine checks for a byte-order mark at the beginning of the
  313. ** UTF-16 string stored in *pMem. If one is present, it is removed and
  314. ** the encoding of the Mem adjusted. This routine does not do any
  315. ** byte-swapping, it just sets Mem.enc appropriately.
  316. **
  317. ** The allocation (static, dynamic etc.) and encoding of the Mem may be
  318. ** changed by this function.
  319. */
  320. int sqlite3VdbeMemHandleBom(Mem *pMem){
  321. int rc = SQLITE_OK;
  322. u8 bom = 0;
  323. assert( pMem->n>=0 );
  324. if( pMem->n>1 ){
  325. u8 b1 = *(u8 *)pMem->z;
  326. u8 b2 = *(((u8 *)pMem->z) + 1);
  327. if( b1==0xFE && b2==0xFF ){
  328. bom = SQLITE_UTF16BE;
  329. }
  330. if( b1==0xFF && b2==0xFE ){
  331. bom = SQLITE_UTF16LE;
  332. }
  333. }
  334. if( bom ){
  335. rc = sqlite3VdbeMemMakeWriteable(pMem);
  336. if( rc==SQLITE_OK ){
  337. pMem->n -= 2;
  338. memmove(pMem->z, &pMem->z[2], pMem->n);
  339. pMem->z[pMem->n] = '\0';
  340. pMem->z[pMem->n+1] = '\0';
  341. pMem->flags |= MEM_Term;
  342. pMem->enc = bom;
  343. }
  344. }
  345. return rc;
  346. }
  347. #endif /* SQLITE_OMIT_UTF16 */
  /*
  ** zIn is a UTF-8 encoded unicode string.  If nByte is less than zero,
  ** return the number of unicode characters in zIn up to (but not
  ** including) the first 0x00 byte.  If nByte is not less than zero,
  ** return the number of unicode characters in the first nByte bytes of
  ** zIn (or up to the first 0x00, whichever comes first).
  **
  ** A character is one byte, plus, when that byte is 0xC0 or greater, all
  ** of the continuation bytes (10xxxxxx) that follow it.  A character that
  ** starts inside the first nByte bytes is counted even if its
  ** continuation bytes extend beyond that limit.
  **
  ** When nByte is non-negative, no byte at or beyond zIn[nByte] is ever
  ** read: the bound is tested before each byte is examined, so the input
  ** does not need to be zero-terminated.  The scan of continuation bytes
  ** is an inline, bounded equivalent of the SQLITE_SKIP_UTF8() macro.
  */
  int sqlite3Utf8CharLen(const char *zIn, int nByte){
    const unsigned char *z = (const unsigned char*)zIn;
    int nChar = 0;

    if( nByte<0 ){
      /* Unbounded: rely on the 0x00 terminator, which also stops the
      ** continuation-byte scan because 0x00 is not of the form 10xxxxxx. */
      while( *z!=0 ){
        if( *z++>=0xc0 ){
          while( (*z & 0xc0)==0x80 ){ z++; }
        }
        nChar++;
      }
    }else{
      const unsigned char *zEnd = &z[nByte];
      while( z<zEnd && *z!=0 ){
        if( *z++>=0xc0 ){
          while( z<zEnd && (*z & 0xc0)==0x80 ){ z++; }
        }
        nChar++;
      }
    }
    return nChar;
  }
  371. /* This test function is not currently used by the automated test-suite.
  372. ** Hence it is only available in debug builds.
  373. */
  374. #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
  375. /*
  376. ** Translate UTF-8 to UTF-8.
  377. **
  378. ** This has the effect of making sure that the string is well-formed
  379. ** UTF-8. Miscoded characters are removed.
  380. **
  381. ** The translation is done in-place and aborted if the output
  382. ** overruns the input.
  383. */
  384. int sqlite3Utf8To8(unsigned char *zIn){
  385. unsigned char *zOut = zIn;
  386. unsigned char *zStart = zIn;
  387. u32 c;
  388. while( zIn[0] && zOut<=zIn ){
  389. c = sqlite3Utf8Read((const u8**)&zIn);
  390. if( c!=0xfffd ){
  391. WRITE_UTF8(zOut, c);
  392. }
  393. }
  394. *zOut = 0;
  395. return (int)(zOut - zStart);
  396. }
  397. #endif
  398. #ifndef SQLITE_OMIT_UTF16
  399. /*
  400. ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
  401. ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
  402. ** be freed by the calling function.
  403. **
  404. ** NULL is returned if there is an allocation error.
  405. */
  406. char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
  407. Mem m;
  408. memset(&m, 0, sizeof(m));
  409. m.db = db;
  410. sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
  411. sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
  412. if( db->mallocFailed ){
  413. sqlite3VdbeMemRelease(&m);
  414. m.z = 0;
  415. }
  416. assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
  417. assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
  418. assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed );
  419. assert( m.z || db->mallocFailed );
  420. return m.z;
  421. }
  422. /*
  423. ** zIn is a UTF-16 encoded unicode string at least nChar characters long.
  424. ** Return the number of bytes in the first nChar unicode characters
  425. ** in pZ. nChar must be non-negative.
  426. */
  427. int sqlite3Utf16ByteLen(const void *zIn, int nChar){
  428. int c;
  429. unsigned char const *z = zIn;
  430. int n = 0;
  431. if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
  432. while( n<nChar ){
  433. READ_UTF16BE(z, 1, c);
  434. n++;
  435. }
  436. }else{
  437. while( n<nChar ){
  438. READ_UTF16LE(z, 1, c);
  439. n++;
  440. }
  441. }
  442. return (int)(z-(unsigned char const *)zIn);
  443. }
  444. #if defined(SQLITE_TEST)
  445. /*
  446. ** This routine is called from the TCL test function "translate_selftest".
  447. ** It checks that the primitives for serializing and deserializing
  448. ** characters in each encoding are inverses of each other.
  449. */
  450. void sqlite3UtfSelfTest(void){
  451. unsigned int i, t;
  452. unsigned char zBuf[20];
  453. unsigned char *z;
  454. int n;
  455. unsigned int c;
  456. for(i=0; i<0x00110000; i++){
  457. z = zBuf;
  458. WRITE_UTF8(z, i);
  459. n = (int)(z-zBuf);
  460. assert( n>0 && n<=4 );
  461. z[0] = 0;
  462. z = zBuf;
  463. c = sqlite3Utf8Read((const u8**)&z);
  464. t = i;
  465. if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
  466. if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
  467. assert( c==t );
  468. assert( (z-zBuf)==n );
  469. }
  470. for(i=0; i<0x00110000; i++){
  471. if( i>=0xD800 && i<0xE000 ) continue;
  472. z = zBuf;
  473. WRITE_UTF16LE(z, i);
  474. n = (int)(z-zBuf);
  475. assert( n>0 && n<=4 );
  476. z[0] = 0;
  477. z = zBuf;
  478. READ_UTF16LE(z, 1, c);
  479. assert( c==i );
  480. assert( (z-zBuf)==n );
  481. }
  482. for(i=0; i<0x00110000; i++){
  483. if( i>=0xD800 && i<0xE000 ) continue;
  484. z = zBuf;
  485. WRITE_UTF16BE(z, i);
  486. n = (int)(z-zBuf);
  487. assert( n>0 && n<=4 );
  488. z[0] = 0;
  489. z = zBuf;
  490. READ_UTF16BE(z, 1, c);
  491. assert( c==i );
  492. assert( (z-zBuf)==n );
  493. }
  494. }
  495. #endif /* SQLITE_TEST */
  496. #endif /* SQLITE_OMIT_UTF16 */