// objstrunicode.c
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2013, 2014 Damien P. George
  7. * Copyright (c) 2014-2016 Paul Sokolovsky
  8. *
  9. * Permission is hereby granted, free of charge, to any person obtaining a copy
  10. * of this software and associated documentation files (the "Software"), to deal
  11. * in the Software without restriction, including without limitation the rights
  12. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13. * copies of the Software, and to permit persons to whom the Software is
  14. * furnished to do so, subject to the following conditions:
  15. *
  16. * The above copyright notice and this permission notice shall be included in
  17. * all copies or substantial portions of the Software.
  18. *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  25. * THE SOFTWARE.
  26. */
  27. #include <string.h>
  28. #include <assert.h>
  29. #include "py/objstr.h"
  30. #include "py/objlist.h"
  31. #include "py/runtime.h"
  32. #if MICROPY_PY_BUILTINS_STR_UNICODE
  33. STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
  34. /******************************************************************************/
  35. /* str */
  36. STATIC void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint str_len) {
  37. // this escapes characters, but it will be very slow to print (calling print many times)
  38. bool has_single_quote = false;
  39. bool has_double_quote = false;
  40. for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
  41. if (*s == '\'') {
  42. has_single_quote = true;
  43. } else if (*s == '"') {
  44. has_double_quote = true;
  45. }
  46. }
  47. unichar quote_char = '\'';
  48. if (has_single_quote && !has_double_quote) {
  49. quote_char = '"';
  50. }
  51. mp_printf(print, "%c", quote_char);
  52. const byte *s = str_data, *top = str_data + str_len;
  53. while (s < top) {
  54. unichar ch;
  55. ch = utf8_get_char(s);
  56. s = utf8_next_char(s);
  57. if (ch == quote_char) {
  58. mp_printf(print, "\\%c", quote_char);
  59. } else if (ch == '\\') {
  60. mp_print_str(print, "\\\\");
  61. } else if (32 <= ch && ch <= 126) {
  62. mp_printf(print, "%c", ch);
  63. } else if (ch == '\n') {
  64. mp_print_str(print, "\\n");
  65. } else if (ch == '\r') {
  66. mp_print_str(print, "\\r");
  67. } else if (ch == '\t') {
  68. mp_print_str(print, "\\t");
  69. } else if (ch < 0x100) {
  70. mp_printf(print, "\\x%02x", ch);
  71. } else if (ch < 0x10000) {
  72. mp_printf(print, "\\u%04x", ch);
  73. } else {
  74. mp_printf(print, "\\U%08x", ch);
  75. }
  76. }
  77. mp_printf(print, "%c", quote_char);
  78. }
  79. STATIC void uni_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  80. GET_STR_DATA_LEN(self_in, str_data, str_len);
  81. #if MICROPY_PY_UJSON
  82. if (kind == PRINT_JSON) {
  83. mp_str_print_json(print, str_data, str_len);
  84. return;
  85. }
  86. #endif
  87. if (kind == PRINT_STR) {
  88. mp_printf(print, "%.*s", str_len, str_data);
  89. } else {
  90. uni_print_quoted(print, str_data, str_len);
  91. }
  92. }
  93. STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
  94. GET_STR_DATA_LEN(self_in, str_data, str_len);
  95. switch (op) {
  96. case MP_UNARY_OP_BOOL:
  97. return mp_obj_new_bool(str_len != 0);
  98. case MP_UNARY_OP_LEN:
  99. return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len));
  100. default:
  101. return MP_OBJ_NULL; // op not supported
  102. }
  103. }
  104. // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
  105. // be capped to the first/last character of the string, depending on is_slice.
  106. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
  107. mp_obj_t index, bool is_slice) {
  108. // All str functions also handle bytes objects, and they call str_index_to_ptr(),
  109. // so it must handle bytes.
  110. if (type == &mp_type_bytes) {
  111. // Taken from objstr.c:str_index_to_ptr()
  112. size_t index_val = mp_get_index(type, self_len, index, is_slice);
  113. return self_data + index_val;
  114. }
  115. mp_int_t i;
  116. // Copied from mp_get_index; I don't want bounds checking, just give me
  117. // the integer as-is. (I can't bounds-check without scanning the whole
  118. // string; an out-of-bounds index will be caught in the loops below.)
  119. if (mp_obj_is_small_int(index)) {
  120. i = MP_OBJ_SMALL_INT_VALUE(index);
  121. } else if (!mp_obj_get_int_maybe(index, &i)) {
  122. mp_raise_msg_varg(&mp_type_TypeError, MP_ERROR_TEXT("string indices must be integers, not %s"), mp_obj_get_type_str(index));
  123. }
  124. const byte *s, *top = self_data + self_len;
  125. if (i < 0) {
  126. // Negative indexing is performed by counting from the end of the string.
  127. for (s = top - 1; i; --s) {
  128. if (s < self_data) {
  129. if (is_slice) {
  130. return self_data;
  131. }
  132. mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range"));
  133. }
  134. if (!UTF8_IS_CONT(*s)) {
  135. ++i;
  136. }
  137. }
  138. ++s;
  139. } else {
  140. // Positive indexing, correspondingly, counts from the start of the string.
  141. // It's assumed that negative indexing will generally be used with small
  142. // absolute values (eg str[-1], not str[-1000000]), which means it'll be
  143. // more efficient this way.
  144. s = self_data;
  145. while (1) {
  146. // First check out-of-bounds
  147. if (s >= top) {
  148. if (is_slice) {
  149. return top;
  150. }
  151. mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range"));
  152. }
  153. // Then check completion
  154. if (i-- == 0) {
  155. break;
  156. }
  157. // Then skip UTF-8 char
  158. ++s;
  159. while (UTF8_IS_CONT(*s)) {
  160. ++s;
  161. }
  162. }
  163. }
  164. return s;
  165. }
  166. STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
  167. const mp_obj_type_t *type = mp_obj_get_type(self_in);
  168. assert(type == &mp_type_str);
  169. GET_STR_DATA_LEN(self_in, self_data, self_len);
  170. if (value == MP_OBJ_SENTINEL) {
  171. // load
  172. #if MICROPY_PY_BUILTINS_SLICE
  173. if (mp_obj_is_type(index, &mp_type_slice)) {
  174. mp_obj_t ostart, ostop, ostep;
  175. mp_obj_slice_t *slice = MP_OBJ_TO_PTR(index);
  176. ostart = slice->start;
  177. ostop = slice->stop;
  178. ostep = slice->step;
  179. if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
  180. mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported"));
  181. }
  182. const byte *pstart, *pstop;
  183. if (ostart != mp_const_none) {
  184. pstart = str_index_to_ptr(type, self_data, self_len, ostart, true);
  185. } else {
  186. pstart = self_data;
  187. }
  188. if (ostop != mp_const_none) {
  189. // pstop will point just after the stop character. This depends on
  190. // the \0 at the end of the string.
  191. pstop = str_index_to_ptr(type, self_data, self_len, ostop, true);
  192. } else {
  193. pstop = self_data + self_len;
  194. }
  195. if (pstop < pstart) {
  196. return MP_OBJ_NEW_QSTR(MP_QSTR_);
  197. }
  198. return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
  199. }
  200. #endif
  201. const byte *s = str_index_to_ptr(type, self_data, self_len, index, false);
  202. int len = 1;
  203. if (UTF8_IS_NONASCII(*s)) {
  204. // Count the number of 1 bits (after the first)
  205. for (char mask = 0x40; *s & mask; mask >>= 1) {
  206. ++len;
  207. }
  208. }
  209. return mp_obj_new_str_via_qstr((const char *)s, len); // This will create a one-character string
  210. } else {
  211. return MP_OBJ_NULL; // op not supported
  212. }
  213. }
  214. STATIC const mp_rom_map_elem_t struni_locals_dict_table[] = {
  215. #if MICROPY_CPYTHON_COMPAT
  216. { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
  217. #endif
  218. { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
  219. { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
  220. { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
  221. { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
  222. { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
  223. { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
  224. #if MICROPY_PY_BUILTINS_STR_SPLITLINES
  225. { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
  226. #endif
  227. { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
  228. { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
  229. { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
  230. { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
  231. { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
  232. { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
  233. { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
  234. { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
  235. #if MICROPY_PY_BUILTINS_STR_COUNT
  236. { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
  237. #endif
  238. #if MICROPY_PY_BUILTINS_STR_PARTITION
  239. { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
  240. { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
  241. #endif
  242. #if MICROPY_PY_BUILTINS_STR_CENTER
  243. { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
  244. #endif
  245. { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
  246. { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
  247. { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
  248. { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
  249. { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
  250. { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
  251. { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
  252. };
  253. STATIC MP_DEFINE_CONST_DICT(struni_locals_dict, struni_locals_dict_table);
  254. const mp_obj_type_t mp_type_str = {
  255. { &mp_type_type },
  256. .name = MP_QSTR_str,
  257. .print = uni_print,
  258. .make_new = mp_obj_str_make_new,
  259. .unary_op = uni_unary_op,
  260. .binary_op = mp_obj_str_binary_op,
  261. .subscr = str_subscr,
  262. .getiter = mp_obj_new_str_iterator,
  263. .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
  264. .locals_dict = (mp_obj_dict_t *)&struni_locals_dict,
  265. };
  266. /******************************************************************************/
  267. /* str iterator */
  268. typedef struct _mp_obj_str_it_t {
  269. mp_obj_base_t base;
  270. mp_fun_1_t iternext;
  271. mp_obj_t str;
  272. size_t cur;
  273. } mp_obj_str_it_t;
  274. STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
  275. mp_obj_str_it_t *self = MP_OBJ_TO_PTR(self_in);
  276. GET_STR_DATA_LEN(self->str, str, len);
  277. if (self->cur < len) {
  278. const byte *cur = str + self->cur;
  279. const byte *end = utf8_next_char(str + self->cur);
  280. mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)cur, end - cur);
  281. self->cur += end - cur;
  282. return o_out;
  283. } else {
  284. return MP_OBJ_STOP_ITERATION;
  285. }
  286. }
  287. STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
  288. assert(sizeof(mp_obj_str_it_t) <= sizeof(mp_obj_iter_buf_t));
  289. mp_obj_str_it_t *o = (mp_obj_str_it_t *)iter_buf;
  290. o->base.type = &mp_type_polymorph_iter;
  291. o->iternext = str_it_iternext;
  292. o->str = str;
  293. o->cur = 0;
  294. return MP_OBJ_FROM_PTR(o);
  295. }
  296. #endif // MICROPY_PY_BUILTINS_STR_UNICODE