PikaStdData_String.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824
  1. #include "PikaStdData_String.h"
  2. #include "PikaStdData_List.h"
  3. #include "PikaStdData_String_Util.h"
  4. #include "dataStrs.h"
  5. char* _strlwr(char* str);
  6. static int string_len(char* str);
  7. Arg* PikaStdData_String___iter__(PikaObj* self) {
  8. obj_setInt(self, "__iter_i", 0);
  9. return arg_newRef(self);
  10. }
  11. void PikaStdData_String_set(PikaObj* self, char* s) {
  12. #if PIKA_STRING_UTF8_ENABLE
  13. int r = _valid_utf8(s, -1);
  14. if (r >= 0) {
  15. obj_setErrorCode(self, __LINE__);
  16. __platform_printf("Error invaliad character %x\r\n", s[r]);
  17. return;
  18. }
  19. #endif
  20. obj_setStr(self, "str", s);
  21. }
  22. void PikaStdData_String___init__(PikaObj* self, char* s) {
  23. #if PIKA_STRING_UTF8_ENABLE
  24. int r = _valid_utf8(s, -1);
  25. if (r >= 0) {
  26. obj_setErrorCode(self, __LINE__);
  27. __platform_printf("Error invaliad character %x\r\n", s[r]);
  28. return;
  29. }
  30. #endif
  31. PikaStdData_String_set(self, s);
  32. }
  33. char* PikaStdData_String_get(PikaObj* self) {
  34. return obj_getStr(self, "str");
  35. }
  36. Arg* PikaStdData_String___next__(PikaObj* self) {
  37. int __iter_i = args_getInt(self->list, "__iter_i");
  38. char* str = obj_getStr(self, "str");
  39. uint16_t len = strGetSize(str);
  40. #if PIKA_STRING_UTF8_ENABLE
  41. char char_buff[5];
  42. int r = _utf8_get(str, len, __iter_i, char_buff);
  43. if (r < 0) {
  44. return arg_newNull();
  45. }
  46. args_setInt(self->list, "__iter_i", __iter_i + 1);
  47. return arg_newStr((char*)char_buff);
  48. #else
  49. Arg* res = NULL;
  50. char char_buff[] = " ";
  51. if (__iter_i < len) {
  52. char_buff[0] = str[__iter_i];
  53. res = arg_newStr((char*)char_buff);
  54. } else {
  55. return arg_newNull();
  56. }
  57. args_setInt(self->list, "__iter_i", __iter_i + 1);
  58. return res;
  59. #endif
  60. }
  /* Fetch the character at index `key_i` of `str` into `char_buff`.
   *
   * A negative `key_i` counts from the end (string_len(str) is added once).
   * With PIKA_STRING_UTF8_ENABLE the result of _utf8_get() is returned.
   * Otherwise only char_buff[0] is written (no terminator is added here, so
   * callers pass a zero-initialised buffer) and 0 is returned.
   *
   * Returns a negative value when the index is out of range. */
  static int _str_get(char* str, int key_i, char* char_buff) {
      /* int instead of uint16_t: no truncation for strings > 65535 bytes. */
      int len = (int)strGetSize(str);
      if (key_i < 0) {
          key_i += string_len(str);
      }
  #if PIKA_STRING_UTF8_ENABLE
      return _utf8_get(str, len, key_i, char_buff);
  #else
      /* An index that is still negative after the wrap-around used to be
       * accepted and read memory in front of the string. */
      if (key_i < 0 || key_i >= len) {
          return -1;
      }
      char_buff[0] = str[key_i];
      return 0;
  #endif
  }
  76. char* string_slice(Args* outBuffs, char* str, int start, int end) {
  77. char* res = args_getBuff(outBuffs, strGetSize(str));
  78. if (start < 0) {
  79. start += string_len(str);
  80. }
  81. /* magic code, to the end */
  82. if (end == -99999) {
  83. end = string_len(str);
  84. }
  85. if (end < 0) {
  86. end += string_len(str);
  87. }
  88. for (int i = start; i < end; i++) {
  89. char char_buff[5] = {0};
  90. int r = _str_get(str, i, char_buff);
  91. if (r < 0) {
  92. return NULL;
  93. }
  94. res = strAppend(res, char_buff);
  95. }
  96. return res;
  97. }
  98. Arg* PikaStdData_String___getitem__(PikaObj* self, Arg* __key) {
  99. int key_i = arg_getInt(__key);
  100. char* str = obj_getStr(self, "str");
  101. char char_buff[5] = {0};
  102. int r = _str_get(str, key_i, char_buff);
  103. if (r < 0) {
  104. return arg_newNull();
  105. }
  106. return arg_newStr((char*)char_buff);
  107. }
  108. void PikaStdData_String___setitem__(PikaObj* self, Arg* __key, Arg* __val) {
  109. int key_i = arg_getInt(__key);
  110. char* str = obj_getStr(self, "str");
  111. char* val = arg_getStr(__val);
  112. uint16_t len = strGetSize(str);
  113. #if PIKA_STRING_UTF8_ENABLE
  114. int len2 = strlen(val);
  115. int is_invalid = _valid_utf8(val, len2);
  116. if (is_invalid >= 0) {
  117. obj_setErrorCode(self, __LINE__);
  118. __platform_printf("Error String invalid\r\n");
  119. return;
  120. }
  121. int ulen_val = _utf8_strlen(val, len2);
  122. if (ulen_val != 1) {
  123. obj_setErrorCode(self, __LINE__);
  124. __platform_printf("Error String invalid char\r\n");
  125. return;
  126. }
  127. int char_len;
  128. int repl_at = _utf8_get_offset(str, len, key_i, &char_len);
  129. if (repl_at < 0) {
  130. obj_setErrorCode(self, __LINE__);
  131. __platform_printf("Error String Overflow\r\n");
  132. return;
  133. }
  134. int ok = __str_repl(self, str, len, repl_at, char_len, val, len2);
  135. if (ok < 0) {
  136. obj_setErrorCode(self, __LINE__);
  137. __platform_printf("Error. Internal error(-%d)\r\n", __LINE__);
  138. return;
  139. }
  140. #else
  141. if (key_i >= len) {
  142. obj_setErrorCode(self, 1);
  143. __platform_printf("Error String Overflow\r\n");
  144. return;
  145. }
  146. str[key_i] = val[0];
  147. #endif
  148. }
  149. char* PikaStdData_String___str__(PikaObj* self) {
  150. return obj_getStr(self, "str");
  151. }
  152. int PikaStdData_String_startswith(PikaObj* self, char* prefix) {
  153. char* str = obj_getStr(self, "str");
  154. char* p = prefix;
  155. int i = 0;
  156. while (*p != '\0') {
  157. if (*p != str[i])
  158. return 0;
  159. p++;
  160. i++;
  161. }
  162. return 1;
  163. }
  164. int PikaStdData_String_endswith(PikaObj* self, char* suffix) {
  165. char* str = obj_getStr(self, "str");
  166. int len1 = strlen(str);
  167. int len2 = strlen(suffix);
  168. while (len2 >= 1) {
  169. if (suffix[len2 - 1] != str[len1 - 1])
  170. return 0;
  171. len2--;
  172. len1--;
  173. }
  174. return 1;
  175. }
  176. int PikaStdData_String_isdigit(PikaObj* self) {
  177. char* str = obj_getStr(self, "str");
  178. int i = 0;
  179. while (str[i] != '\0') {
  180. if (!isdigit((int)str[i]))
  181. return 0;
  182. i++;
  183. }
  184. return 1;
  185. }
  186. int PikaStdData_String_islower(PikaObj* self) {
  187. char* str = obj_getStr(self, "str");
  188. int i = 0;
  189. while (str[i] != '\0') {
  190. if (!islower((int)str[i]))
  191. return 0;
  192. i++;
  193. }
  194. return 1;
  195. }
  196. int PikaStdData_String_isalnum(PikaObj* self) {
  197. char* str = obj_getStr(self, "str");
  198. int i = 0;
  199. while (str[i] != '\0') {
  200. if (!isalnum((int)str[i]))
  201. return 0;
  202. i++;
  203. }
  204. return 1;
  205. }
  206. int PikaStdData_String_isalpha(PikaObj* self) {
  207. char* str = obj_getStr(self, "str");
  208. int i = 0;
  209. while (str[i] != '\0') {
  210. if (!isalpha((int)str[i]))
  211. return 0;
  212. i++;
  213. }
  214. return 1;
  215. }
  216. int PikaStdData_String_isspace(PikaObj* self) {
  217. char* str = obj_getStr(self, "str");
  218. int i = 0;
  219. while (str[i] != '\0') {
  220. if (!isspace((int)str[i]))
  221. return 0;
  222. i++;
  223. }
  224. return 1;
  225. }
  226. PikaObj* PikaStdData_String_split(PikaObj* self, char* s) {
  227. /* 创建 list 对象 */
  228. PikaObj* list = newNormalObj(New_PikaStdData_List);
  229. /* 初始化 list */
  230. PikaStdData_List___init__(list);
  231. Args buffs = {0};
  232. char* str = strsCopy(&buffs, obj_getStr(self, "str"));
  233. /* split str with s by strstr() */
  234. size_t spliter_len = strGetSize(s);
  235. char* p = str;
  236. while (1) {
  237. char* q = strstr(p, s);
  238. if (q == NULL) {
  239. break;
  240. }
  241. *q = '\0';
  242. Arg* arg_item = arg_newStr(p);
  243. PikaStdData_List_append(list, arg_item);
  244. arg_deinit(arg_item);
  245. p = q + spliter_len;
  246. }
  247. if (*p != '\0') {
  248. Arg* arg_item = arg_newStr(p);
  249. PikaStdData_List_append(list, arg_item);
  250. arg_deinit(arg_item);
  251. }
  252. strsDeinit(&buffs);
  253. return list;
  254. }
  /* Length of `str`: the result of _utf8_strlen() when
   * PIKA_STRING_UTF8_ENABLE is set (which can be negative on failure),
   * otherwise the size reported by strGetSize(). */
  static int string_len(char* str) {
  #if PIKA_STRING_UTF8_ENABLE
      return _utf8_strlen(str, -1);
  #else
      return strGetSize(str);
  #endif
  }
  263. int PikaStdData_String___len__(PikaObj* self) {
  264. char* str = obj_getStr(self, "str");
  265. int n = string_len(str);
  266. if (n < 0) {
  267. obj_setErrorCode(self, __LINE__);
  268. __platform_printf("Error. Internal error(%d)\r\n", __LINE__);
  269. }
  270. return n;
  271. }
  272. char* PikaStdData_String_strip(PikaObj* self, PikaTuple* chrs) {
  273. Args buffs = {0};
  274. char to_strip = ' ';
  275. if (pikaTuple_getSize(chrs) > 1) {
  276. obj_setErrorCode(self, PIKA_RES_ERR_INVALID_PARAM);
  277. obj_setSysOut(self, "Error. Invalid param");
  278. }
  279. if (pikaTuple_getSize(chrs) == 1) {
  280. char* ch_str = pikaTuple_getStr(chrs, 0);
  281. to_strip = ch_str[0];
  282. }
  283. char* str = strsCopy(&buffs, obj_getStr(self, "str"));
  284. /* strip */
  285. char* str_start = str;
  286. size_t len = strlen(str);
  287. for (size_t i = 0; i < len; i++) {
  288. if (str[i] != to_strip) {
  289. str_start = (char*)(str + i);
  290. break;
  291. }
  292. }
  293. len = strlen(str);
  294. for (int i = len - 1; i >= 0; i--) {
  295. if (str[i] != to_strip) {
  296. str[i + 1] = '\0';
  297. break;
  298. }
  299. }
  300. obj_setStr(self, "_buf", str_start);
  301. strsDeinit(&buffs);
  302. return obj_getStr(self, "_buf");
  303. }
  304. char* PikaStdData_String_replace(PikaObj* self, char* old, char* new) {
  305. Args buffs = {0};
  306. char* str = strsCopy(&buffs, obj_getStr(self, "str"));
  307. str = strsReplace(&buffs, str, old, new);
  308. obj_setStr(self, "_buf", str);
  309. strsDeinit(&buffs);
  310. return obj_getStr(self, "_buf");
  311. }
  312. Arg* PikaStdData_String_encode(PikaObj* self, PikaTuple* encoding) {
  313. char* str = obj_getStr(self, "str");
  314. #if PIKA_STRING_UTF8_ENABLE
  315. char* to_code = NULL;
  316. int argn = pikaTuple_getSize(encoding);
  317. if (argn < 1) {
  318. return arg_newBytes((uint8_t*)str, strGetSize(str));
  319. }
  320. Arg* arg_i = pikaTuple_getArg(encoding, 0);
  321. if (arg_getType(arg_i) != ARG_TYPE_STRING) {
  322. obj_setErrorCode(self, __LINE__);
  323. __platform_printf("Error invaliad arguments\r\n");
  324. return NULL;
  325. }
  326. to_code = arg_getStr(arg_i);
  327. _strlwr(to_code);
  328. Arg* res = _str_encode(str, to_code);
  329. if (!res) {
  330. obj_setErrorCode(self, __LINE__);
  331. __platform_printf("Error internal error\r\n");
  332. return NULL;
  333. }
  334. return res;
  335. #else
  336. return arg_newBytes((uint8_t*)str, strGetSize(str));
  337. #endif
  338. }
  339. #if PIKA_STRING_UTF8_ENABLE
  /* Number of bytes that follow a UTF-8 lead byte, indexed by
   * (lead byte & 0x3f). */
  static const uint8_t _pcre_utf8_table4[] = {
      /* index 0x00-0x1f: one following byte */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* index 0x20-0x2f: two following bytes */
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      /* index 0x30-0x37: three following bytes */
      3, 3, 3, 3, 3, 3, 3, 3,
      /* index 0x38-0x3b: four, index 0x3c-0x3f: five following bytes */
      4, 4, 4, 4, 5, 5, 5, 5};

  /* High-bit masks; mask1 is the "not ASCII" test bit. */
  const char mask1 = (char)0x80;
  const char mask2 = (char)0xc0;
  const char mask3 = (char)0xe0;
  const char mask4 = (char)0xf0;

  /* Payload masks: nmask1 for a continuation byte, nmask2 / nmask3 /
   * nmask4 for the lead byte of a 2- / 3- / 4-byte sequence. */
  const char nmask1 = 0x3f;
  const char nmask2 = 0x1f;
  const char nmask3 = 0x0f;
  const char nmask4 = 0x07;
  352. int _valid_utf8(const char* string, int length) {
  353. const uint8_t* p;
  354. if (length < 0) {
  355. length = strlen(string);
  356. }
  357. for (p = (const uint8_t*)string; length-- > 0; p++) {
  358. int ab;
  359. int c = *p;
  360. if (!(c & 0x80))
  361. continue;
  362. if (c < 0xc0)
  363. return (uintptr_t)p - (uintptr_t)string;
  364. ab = _pcre_utf8_table4[c & 0x3f];
  365. if (length < ab || ab > 3)
  366. return (uintptr_t)p - (uintptr_t)string;
  367. length -= ab;
  368. if ((*(++p) & 0xc0) != 0x80)
  369. return (uintptr_t)p - (uintptr_t)string;
  370. switch (ab) {
  371. case 1:
  372. if ((c & 0x3e) == 0)
  373. return (uintptr_t)p - (uintptr_t)string;
  374. continue;
  375. case 2:
  376. if ((c == 0xe0 && (*p & 0x20) == 0) ||
  377. (c == 0xed && *p >= 0xa0))
  378. return (uintptr_t)p - (uintptr_t)string;
  379. break;
  380. case 3:
  381. if ((c == 0xf0 && (*p & 0x30) == 0) || (c > 0xf4) ||
  382. (c == 0xf4 && *p > 0x8f))
  383. return (uintptr_t)p - (uintptr_t)string;
  384. break;
  385. }
  386. while (--ab > 0) {
  387. if ((*(++p) & 0xc0) != 0x80)
  388. return (uintptr_t)p - (uintptr_t)string;
  389. }
  390. }
  391. return -1;
  392. }
  393. int _utf8_get(const char* string, int length, int at, char* out_buf) {
  394. const uint8_t* p;
  395. int ab, c;
  396. if (length < 0) {
  397. length = strlen(string);
  398. }
  399. if (at < 0 || at >= length)
  400. return -1;
  401. for (p = (const uint8_t*)string; length > 0 && at; p++, at--) {
  402. c = *p;
  403. if (!(c & 0x80)) {
  404. length--;
  405. continue;
  406. }
  407. ab = _pcre_utf8_table4[c & 0x3f];
  408. p += ab++;
  409. length -= ab;
  410. }
  411. if (at || length <= 0)
  412. return -2;
  413. c = *p;
  414. if (!(c & 0x80)) {
  415. *out_buf = c;
  416. out_buf[1] = 0;
  417. return 1;
  418. };
  419. ab = _pcre_utf8_table4[c & 0x3f] + 1;
  420. __platform_memcpy(out_buf, p, ab);
  421. out_buf[ab] = '\0';
  422. return ab;
  423. }
  424. int _utf8_get_offset(const char* string,
  425. int length,
  426. int at,
  427. int* out_char_len) {
  428. const uint8_t* p;
  429. int ab, c;
  430. if (length < 0) {
  431. length = strlen(string);
  432. }
  433. if (at < 0 || at >= length)
  434. return -1;
  435. for (p = (const uint8_t*)string; length > 0 && at; p++, at--) {
  436. c = *p;
  437. if (!(c & 0x80)) {
  438. length--;
  439. continue;
  440. }
  441. ab = _pcre_utf8_table4[c & 0x3f];
  442. p += ab++;
  443. length -= ab;
  444. }
  445. if (at)
  446. return -2;
  447. c = *p;
  448. if (!(c & 0x80)) {
  449. if (out_char_len)
  450. *out_char_len = 1;
  451. return (uintptr_t)p - (uintptr_t)string;
  452. };
  453. ab = _pcre_utf8_table4[c & 0x3f] + 1;
  454. if (out_char_len)
  455. *out_char_len = ab;
  456. return (uintptr_t)p - (uintptr_t)string;
  457. }
  458. int _utf8_strlen(const char* string, int length) {
  459. const uint8_t* p;
  460. int i, ab, c;
  461. if (length < 0) {
  462. length = strlen(string);
  463. }
  464. for (i = 0, p = (const uint8_t*)string; length > 0; i++, p++) {
  465. c = *p;
  466. if (!(c & 0x80)) {
  467. length--;
  468. continue;
  469. }
  470. ab = _pcre_utf8_table4[c & 0x3f];
  471. p += ab++;
  472. length -= ab;
  473. }
  474. if (length < 0)
  475. return -1;
  476. return i;
  477. }
  478. int __str_repl(PikaObj* self,
  479. char* str,
  480. int str_len,
  481. int repl_at,
  482. int repl_len,
  483. char* val,
  484. int val_len) {
  485. if (val_len > repl_len) {
  486. str[repl_at] = 0;
  487. Arg* s_new = arg_newStr(str);
  488. if (!s_new)
  489. return -1;
  490. s_new = arg_strAppend(s_new, val);
  491. s_new = arg_strAppend(s_new, str + repl_at + repl_len);
  492. obj_removeArg(self, "str");
  493. int rs = obj_setArg(self, "str", s_new);
  494. arg_deinit(s_new);
  495. if (rs)
  496. return -rs;
  497. return 0;
  498. }
  499. char* s = str + repl_at;
  500. __platform_memcpy(s, val, val_len);
  501. __platform_memmove(s + val_len, s + repl_len,
  502. str_len - repl_at - repl_len + 1);
  503. return 0;
  504. }
  505. int __utf8_to_utf32_char_LE(const char* utf8, char* out_buf) {
  506. char c = *utf8;
  507. if (!(c & mask1)) {
  508. *out_buf = c;
  509. out_buf[1] = 0;
  510. out_buf[2] = 0;
  511. out_buf[3] = 0;
  512. return 1;
  513. }
  514. int left_length = _pcre_utf8_table4[c & 0x3f];
  515. char a, b, d;
  516. switch (left_length) {
  517. case 1:
  518. a = c & nmask2;
  519. b = utf8[1] & nmask1;
  520. out_buf[0] = b | a << 6;
  521. out_buf[1] = a >> 2;
  522. out_buf[2] = 0;
  523. out_buf[3] = 0;
  524. return 2;
  525. case 2:
  526. a = c & nmask3;
  527. b = utf8[1] & nmask1;
  528. c = utf8[2] & nmask1;
  529. out_buf[0] = c | b << 6;
  530. out_buf[1] = b >> 2 | a << 4;
  531. out_buf[2] = 0;
  532. out_buf[3] = 0;
  533. return 3;
  534. case 3:
  535. a = c & nmask4;
  536. b = utf8[1] & nmask1;
  537. c = utf8[2] & nmask1;
  538. d = utf8[3] & nmask1;
  539. out_buf[0] = d | c << 6;
  540. out_buf[1] = c >> 2 | b << 4;
  541. out_buf[2] = b >> 4 | a << 2;
  542. out_buf[3] = 0;
  543. return 4;
  544. default:
  545. return 0;
  546. }
  547. }
  /* Number of bytes needed to hold the first `len` bytes of `utf8`
   * converted to UTF-32 without a BOM (four per character), or -1 when a
   * character cannot be converted. */
  int __utf8_to_utf32_LE_noBOM_get_size(const char* utf8, int len) {
      const char* cur = utf8;
      char scratch[4];
      int n_chars = 0;
      for (; len > 0; n_chars++) {
          int used = __utf8_to_utf32_char_LE(cur, scratch);
          if (used == 0) {
              return -1;
          }
          cur += used;
          len -= used;
      }
      return n_chars * 4;
  }
  /* Convert the first `len` bytes of `utf8` to little-endian UTF-32
   * without a BOM, writing four bytes per character to `out_buf`.
   *
   * Returns the number of bytes written, or -1 when a character cannot be
   * converted. */
  int __utf8_to_utf32_LE_noBOM(const char* utf8, int len, char* out_buf) {
      const char* src = utf8;
      char* dst = out_buf;
      for (; len > 0; dst += 4) {
          int used = __utf8_to_utf32_char_LE(src, dst);
          if (used == 0) {
              return -1;
          }
          src += used;
          len -= used;
      }
      return dst - out_buf;
  }
  /* Same as __utf8_to_utf32_LE_noBOM(), but the output starts with the
   * four BOM bytes FF FE 00 00.
   *
   * Returns the total number of bytes written including the BOM, or the
   * negative result of the conversion; the BOM is not written then. */
  int __utf8_to_utf32_LE_withBOM(const char* utf8, int len, char* out_buf) {
      enum { BOM_BYTES = 4 };
      int body_size = __utf8_to_utf32_LE_noBOM(utf8, len, out_buf + BOM_BYTES);
      if (body_size < 0) {
          return body_size;
      }
      out_buf[0] = '\xff';
      out_buf[1] = '\xfe';
      out_buf[2] = 0;
      out_buf[3] = 0;
      return body_size + BOM_BYTES;
  }
  586. int32_t __utf8_decode(const char* utf8, int left_length) {
  587. int ucode = -1;
  588. char c = *utf8;
  589. if (!(c & mask1)) {
  590. return c;
  591. }
  592. char a, b, d;
  593. switch (left_length) {
  594. case 1:
  595. a = c & nmask2;
  596. b = utf8[1] & nmask1;
  597. ucode = b | (a & 0x03) << 6;
  598. ucode |= (a >> 2) << 8;
  599. break;
  600. case 2:
  601. a = c & nmask3;
  602. b = utf8[1] & nmask1;
  603. c = utf8[2] & nmask1;
  604. ucode = c | (b & 0x03) << 6;
  605. ucode |= (b >> 2 | a << 4) << 8;
  606. break;
  607. case 3:
  608. a = c & nmask4;
  609. b = utf8[1] & nmask1;
  610. c = utf8[2] & nmask1;
  611. d = utf8[3] & nmask1;
  612. ucode = d | (c & 0x03) << 6;
  613. ucode |= (c >> 2 | (b & 0x0f) << 4) << 8;
  614. ucode |= (b >> 4 | a << 2) << 16;
  615. break;
  616. default:
  617. return -1;
  618. }
  619. return ucode;
  620. }
  /* Encode code point `u` as little-endian UTF-16 into `out_buf`.
   *
   * Values whose upper 16 bits are all zero are written as one 16-bit unit
   * (returns 2); everything else is written as a surrogate pair, high
   * surrogate first (returns 4). */
  int __unicode_to_utf16_char_LE(int32_t u, char* out_buf) {
      if (((uint32_t)u >> 16) == 0) {
          out_buf[0] = u & 0xff;
          out_buf[1] = (u & 0xff00) >> 8;
          return 2;
      }

      /* 20-bit offset split into two 10-bit halves. */
      int32_t offset = u - 0x10000;
      int32_t high_surrogate = (offset >> 10) | 0xd800;
      int32_t low_surrogate = (offset & 0x3ff) | 0xdc00;

      out_buf[0] = high_surrogate & 0xff;
      out_buf[1] = (high_surrogate & 0xff00) >> 8;
      out_buf[2] = low_surrogate & 0xff;
      out_buf[3] = (low_surrogate & 0xff00) >> 8;
      return 4;
  }
  638. int __utf8_to_utf16_LE_noBOM(const char* utf8, int len, char* out_buf) {
  639. char* q = out_buf;
  640. char* p = (char*)utf8;
  641. while (len > 0) {
  642. char c = *p;
  643. int32_t ucode;
  644. if (!(c & mask1)) {
  645. ucode = c;
  646. p++;
  647. len--;
  648. } else {
  649. int left_size = _pcre_utf8_table4[c & 0x3f];
  650. ucode = __utf8_decode(p, left_size++);
  651. if (ucode < 0)
  652. return ucode;
  653. p += left_size;
  654. len -= left_size;
  655. }
  656. int size = __unicode_to_utf16_char_LE(ucode, q);
  657. q += size;
  658. }
  659. return q - out_buf;
  660. }
  661. int __utf8_to_utf16_LE_noBOM_get_size(const char* utf8, int len) {
  662. char out_buf[4];
  663. char* p = (char*)utf8;
  664. int need_space = 0;
  665. while (len > 0) {
  666. char c = *p;
  667. int32_t ucode;
  668. if (!(c & mask1)) {
  669. ucode = c;
  670. p++;
  671. len--;
  672. } else {
  673. int left_size = _pcre_utf8_table4[c & 0x3f];
  674. ucode = __utf8_decode(p, left_size++);
  675. if (ucode < 0)
  676. return ucode;
  677. p += left_size;
  678. len -= left_size;
  679. }
  680. int size = __unicode_to_utf16_char_LE(ucode, out_buf);
  681. need_space += size;
  682. }
  683. return need_space;
  684. }
  /* Same as __utf8_to_utf16_LE_noBOM(), but the output starts with the two
   * BOM bytes FF FE.
   *
   * Returns the total number of bytes written including the BOM, or the
   * negative result of the conversion; the BOM is not written then. */
  int __utf8_to_utf16_LE_withBOM(const char* utf8, int len, char* out_buf) {
      enum { BOM_BYTES = 2 };
      int body_size = __utf8_to_utf16_LE_noBOM(utf8, len, out_buf + BOM_BYTES);
      if (body_size < 0) {
          return body_size;
      }
      out_buf[0] = '\xff';
      out_buf[1] = '\xfe';
      return body_size + BOM_BYTES;
  }
  694. Arg* _str_encode(char* str, char* encoding) {
  695. if (strEqu(encoding, "utf-8")) {
  696. return arg_newBytes((uint8_t*)str, strGetSize(str));
  697. }
  698. int len = strlen(str);
  699. if (strEqu(encoding, "ascii")) {
  700. int ulen = _utf8_strlen(str, len);
  701. if (ulen == len) {
  702. return arg_newBytes((uint8_t*)str, strGetSize(str));
  703. }
  704. __platform_printf("Warning there is non-ascii characters\r\n");
  705. char* b = (char*)pikaMalloc(len + 1);
  706. if (!b) {
  707. return NULL;
  708. }
  709. char* p = str;
  710. char* q = b;
  711. char c = *p++;
  712. while (c) {
  713. if (!(c & 0x80)) {
  714. *q++ = c;
  715. }
  716. c = *p++;
  717. }
  718. *q = 0;
  719. Arg* arg = arg_newBytes((uint8_t*)b, strGetSize(b));
  720. pikaFree(b, len + 1);
  721. return arg;
  722. }
  723. if (strEqu(encoding, "utf-16")) {
  724. int size_needed = __utf8_to_utf16_LE_noBOM_get_size(str, len);
  725. if (size_needed <= 0) {
  726. return NULL;
  727. }
  728. size_needed += 2;
  729. char* b = (char*)pikaMalloc(size_needed);
  730. if (!b) {
  731. return NULL;
  732. }
  733. int ok = __utf8_to_utf16_LE_withBOM(str, len, b);
  734. if (ok < 0) {
  735. pikaFree(b, size_needed);
  736. return NULL;
  737. }
  738. Arg* arg = arg_newBytes((uint8_t*)b, size_needed);
  739. pikaFree(b, size_needed);
  740. return arg;
  741. }
  742. if (strEqu(encoding, "utf-32")) {
  743. int size_needed = __utf8_to_utf32_LE_noBOM_get_size(str, len);
  744. if (size_needed <= 0) {
  745. return NULL;
  746. }
  747. size_needed += 4;
  748. char* b = (char*)pikaMalloc(size_needed);
  749. if (!b) {
  750. return NULL;
  751. }
  752. int ok = __utf8_to_utf32_LE_withBOM(str, len, b);
  753. if (ok < 0) {
  754. pikaFree(b, size_needed);
  755. return NULL;
  756. }
  757. Arg* arg = arg_newBytes((uint8_t*)b, size_needed);
  758. pikaFree(b, size_needed);
  759. return arg;
  760. }
  761. return NULL;
  762. }
  /* Lower-case `str` in place with tolower() and return it. */
  char* _strlwr(char* str) {
      for (char* p = str; *p != '\0'; p++) {
          /* tolower() requires a value representable as unsigned char; a
           * negative plain char (any byte >= 0x80) is undefined
           * behaviour. */
          *p = (char)tolower((unsigned char)*p);
      }
      return str;
  }
  771. #endif
  772. char* PikaStdData_String_format(PikaObj* self, PikaTuple* vars) {
  773. /* 'test{}'.format(123) */
  774. return NULL;
  775. }