modure.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2014 Paul Sokolovsky
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. */
  26. #include <stdio.h>
  27. #include <assert.h>
  28. #include <string.h>
  29. #include "py/runtime.h"
  30. #include "py/binary.h"
  31. #include "py/objstr.h"
  32. #include "py/stackctrl.h"
  33. #if MICROPY_PY_URE
  34. #define re1_5_stack_chk() MP_STACK_CHECK()
  35. #include "re1.5/re1.5.h"
  36. #define FLAG_DEBUG 0x1000
  37. typedef struct _mp_obj_re_t {
  38. mp_obj_base_t base;
  39. ByteProg re;
  40. } mp_obj_re_t;
  41. typedef struct _mp_obj_match_t {
  42. mp_obj_base_t base;
  43. int num_matches;
  44. mp_obj_t str;
  45. const char *caps[0];
  46. } mp_obj_match_t;
  47. STATIC void match_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  48. (void)kind;
  49. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  50. mp_printf(print, "<match num=%d>", self->num_matches);
  51. }
  52. STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
  53. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  54. mp_int_t no = mp_obj_get_int(no_in);
  55. if (no < 0 || no >= self->num_matches) {
  56. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, no_in));
  57. }
  58. const char *start = self->caps[no * 2];
  59. if (start == NULL) {
  60. // no match for this group
  61. return mp_const_none;
  62. }
  63. return mp_obj_new_str_of_type(mp_obj_get_type(self->str),
  64. (const byte*)start, self->caps[no * 2 + 1] - start);
  65. }
  66. MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
  67. #if MICROPY_PY_URE_MATCH_GROUPS
  68. STATIC mp_obj_t match_groups(mp_obj_t self_in) {
  69. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  70. if (self->num_matches <= 1) {
  71. return mp_const_empty_tuple;
  72. }
  73. mp_obj_tuple_t *groups = MP_OBJ_TO_PTR(mp_obj_new_tuple(self->num_matches - 1, NULL));
  74. for (int i = 1; i < self->num_matches; ++i) {
  75. groups->items[i - 1] = match_group(self_in, MP_OBJ_NEW_SMALL_INT(i));
  76. }
  77. return MP_OBJ_FROM_PTR(groups);
  78. }
  79. MP_DEFINE_CONST_FUN_OBJ_1(match_groups_obj, match_groups);
  80. #endif
  81. #if MICROPY_PY_URE_MATCH_SPAN_START_END
  82. STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span[2]) {
  83. mp_obj_match_t *self = MP_OBJ_TO_PTR(args[0]);
  84. mp_int_t no = 0;
  85. if (n_args == 2) {
  86. no = mp_obj_get_int(args[1]);
  87. if (no < 0 || no >= self->num_matches) {
  88. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, args[1]));
  89. }
  90. }
  91. mp_int_t s = -1;
  92. mp_int_t e = -1;
  93. const char *start = self->caps[no * 2];
  94. if (start != NULL) {
  95. // have a match for this group
  96. const char *begin = mp_obj_str_get_str(self->str);
  97. s = start - begin;
  98. e = self->caps[no * 2 + 1] - begin;
  99. }
  100. span[0] = mp_obj_new_int(s);
  101. span[1] = mp_obj_new_int(e);
  102. }
  103. STATIC mp_obj_t match_span(size_t n_args, const mp_obj_t *args) {
  104. mp_obj_t span[2];
  105. match_span_helper(n_args, args, span);
  106. return mp_obj_new_tuple(2, span);
  107. }
  108. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_span_obj, 1, 2, match_span);
  109. STATIC mp_obj_t match_start(size_t n_args, const mp_obj_t *args) {
  110. mp_obj_t span[2];
  111. match_span_helper(n_args, args, span);
  112. return span[0];
  113. }
  114. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_start_obj, 1, 2, match_start);
  115. STATIC mp_obj_t match_end(size_t n_args, const mp_obj_t *args) {
  116. mp_obj_t span[2];
  117. match_span_helper(n_args, args, span);
  118. return span[1];
  119. }
  120. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_end_obj, 1, 2, match_end);
  121. #endif
  122. #if !MICROPY_ENABLE_DYNRUNTIME
  123. STATIC const mp_rom_map_elem_t match_locals_dict_table[] = {
  124. { MP_ROM_QSTR(MP_QSTR_group), MP_ROM_PTR(&match_group_obj) },
  125. #if MICROPY_PY_URE_MATCH_GROUPS
  126. { MP_ROM_QSTR(MP_QSTR_groups), MP_ROM_PTR(&match_groups_obj) },
  127. #endif
  128. #if MICROPY_PY_URE_MATCH_SPAN_START_END
  129. { MP_ROM_QSTR(MP_QSTR_span), MP_ROM_PTR(&match_span_obj) },
  130. { MP_ROM_QSTR(MP_QSTR_start), MP_ROM_PTR(&match_start_obj) },
  131. { MP_ROM_QSTR(MP_QSTR_end), MP_ROM_PTR(&match_end_obj) },
  132. #endif
  133. };
  134. STATIC MP_DEFINE_CONST_DICT(match_locals_dict, match_locals_dict_table);
  135. STATIC const mp_obj_type_t match_type = {
  136. { &mp_type_type },
  137. .name = MP_QSTR_match,
  138. .print = match_print,
  139. .locals_dict = (void*)&match_locals_dict,
  140. };
  141. #endif
  142. STATIC void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  143. (void)kind;
  144. mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
  145. mp_printf(print, "<re %p>", self);
  146. }
  147. STATIC mp_obj_t ure_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
  148. (void)n_args;
  149. mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
  150. Subject subj;
  151. size_t len;
  152. subj.begin = mp_obj_str_get_data(args[1], &len);
  153. subj.end = subj.begin + len;
  154. int caps_num = (self->re.sub + 1) * 2;
  155. mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num);
  156. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  157. memset((char*)match->caps, 0, caps_num * sizeof(char*));
  158. int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, is_anchored);
  159. if (res == 0) {
  160. m_del_var(mp_obj_match_t, char*, caps_num, match);
  161. return mp_const_none;
  162. }
  163. match->base.type = &match_type;
  164. match->num_matches = caps_num / 2; // caps_num counts start and end pointers
  165. match->str = args[1];
  166. return MP_OBJ_FROM_PTR(match);
  167. }
  168. STATIC mp_obj_t re_match(size_t n_args, const mp_obj_t *args) {
  169. return ure_exec(true, n_args, args);
  170. }
  171. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_match_obj, 2, 4, re_match);
  172. STATIC mp_obj_t re_search(size_t n_args, const mp_obj_t *args) {
  173. return ure_exec(false, n_args, args);
  174. }
  175. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search);
  176. STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
  177. mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
  178. Subject subj;
  179. size_t len;
  180. const mp_obj_type_t *str_type = mp_obj_get_type(args[1]);
  181. subj.begin = mp_obj_str_get_data(args[1], &len);
  182. subj.end = subj.begin + len;
  183. int caps_num = (self->re.sub + 1) * 2;
  184. int maxsplit = 0;
  185. if (n_args > 2) {
  186. maxsplit = mp_obj_get_int(args[2]);
  187. }
  188. mp_obj_t retval = mp_obj_new_list(0, NULL);
  189. const char **caps = mp_local_alloc(caps_num * sizeof(char*));
  190. while (true) {
  191. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  192. memset((char**)caps, 0, caps_num * sizeof(char*));
  193. int res = re1_5_recursiveloopprog(&self->re, &subj, caps, caps_num, false);
  194. // if we didn't have a match, or had an empty match, it's time to stop
  195. if (!res || caps[0] == caps[1]) {
  196. break;
  197. }
  198. mp_obj_t s = mp_obj_new_str_of_type(str_type, (const byte*)subj.begin, caps[0] - subj.begin);
  199. mp_obj_list_append(retval, s);
  200. if (self->re.sub > 0) {
  201. mp_raise_NotImplementedError("Splitting with sub-captures");
  202. }
  203. subj.begin = caps[1];
  204. if (maxsplit > 0 && --maxsplit == 0) {
  205. break;
  206. }
  207. }
  208. // cast is a workaround for a bug in msvc (see above)
  209. mp_local_free((char**)caps);
  210. mp_obj_t s = mp_obj_new_str_of_type(str_type, (const byte*)subj.begin, subj.end - subj.begin);
  211. mp_obj_list_append(retval, s);
  212. return retval;
  213. }
  214. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
  215. #if MICROPY_PY_URE_SUB
  216. STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
  217. mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
  218. mp_obj_t replace = args[1];
  219. mp_obj_t where = args[2];
  220. mp_int_t count = 0;
  221. if (n_args > 3) {
  222. count = mp_obj_get_int(args[3]);
  223. // Note: flags are currently ignored
  224. }
  225. size_t where_len;
  226. const char *where_str = mp_obj_str_get_data(where, &where_len);
  227. Subject subj;
  228. subj.begin = where_str;
  229. subj.end = subj.begin + where_len;
  230. int caps_num = (self->re.sub + 1) * 2;
  231. vstr_t vstr_return;
  232. vstr_return.buf = NULL; // We'll init the vstr after the first match
  233. mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
  234. match->base.type = &match_type;
  235. match->num_matches = caps_num / 2; // caps_num counts start and end pointers
  236. match->str = where;
  237. for (;;) {
  238. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  239. memset((char*)match->caps, 0, caps_num * sizeof(char*));
  240. int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);
  241. // If we didn't have a match, or had an empty match, it's time to stop
  242. if (!res || match->caps[0] == match->caps[1]) {
  243. break;
  244. }
  245. // Initialise the vstr if it's not already
  246. if (vstr_return.buf == NULL) {
  247. vstr_init(&vstr_return, match->caps[0] - subj.begin);
  248. }
  249. // Add pre-match string
  250. vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
  251. // Get replacement string
  252. const char* repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
  253. // Append replacement string to result, substituting any regex groups
  254. while (*repl != '\0') {
  255. if (*repl == '\\') {
  256. ++repl;
  257. bool is_g_format = false;
  258. if (*repl == 'g' && repl[1] == '<') {
  259. // Group specified with syntax "\g<number>"
  260. repl += 2;
  261. is_g_format = true;
  262. }
  263. if ('0' <= *repl && *repl <= '9') {
  264. // Group specified with syntax "\g<number>" or "\number"
  265. unsigned int match_no = 0;
  266. do {
  267. match_no = match_no * 10 + (*repl++ - '0');
  268. } while ('0' <= *repl && *repl <= '9');
  269. if (is_g_format && *repl == '>') {
  270. ++repl;
  271. }
  272. if (match_no >= (unsigned int)match->num_matches) {
  273. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
  274. }
  275. const char *start_match = match->caps[match_no * 2];
  276. if (start_match != NULL) {
  277. // Add the substring matched by group
  278. const char *end_match = match->caps[match_no * 2 + 1];
  279. vstr_add_strn(&vstr_return, start_match, end_match - start_match);
  280. }
  281. }
  282. } else {
  283. // Just add the current byte from the replacement string
  284. vstr_add_byte(&vstr_return, *repl++);
  285. }
  286. }
  287. // Move start pointer to end of last match
  288. subj.begin = match->caps[1];
  289. // Stop substitutions if count was given and gets to 0
  290. if (count > 0 && --count == 0) {
  291. break;
  292. }
  293. }
  294. mp_local_free(match);
  295. if (vstr_return.buf == NULL) {
  296. // Optimisation for case of no substitutions
  297. return where;
  298. }
  299. // Add post-match string
  300. vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);
  301. return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
  302. }
  303. STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
  304. return re_sub_helper(args[0], n_args, args);
  305. }
  306. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);
  307. #endif
  308. #if !MICROPY_ENABLE_DYNRUNTIME
  309. STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
  310. { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
  311. { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
  312. { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
  313. #if MICROPY_PY_URE_SUB
  314. { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
  315. #endif
  316. };
  317. STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
  318. STATIC const mp_obj_type_t re_type = {
  319. { &mp_type_type },
  320. .name = MP_QSTR_ure,
  321. .print = re_print,
  322. .locals_dict = (void*)&re_locals_dict,
  323. };
  324. #endif
  325. STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
  326. (void)n_args;
  327. const char *re_str = mp_obj_str_get_str(args[0]);
  328. int size = re1_5_sizecode(re_str);
  329. if (size == -1) {
  330. goto error;
  331. }
  332. mp_obj_re_t *o = m_new_obj_var(mp_obj_re_t, char, size);
  333. o->base.type = &re_type;
  334. #if MICROPY_PY_URE_DEBUG
  335. int flags = 0;
  336. if (n_args > 1) {
  337. flags = mp_obj_get_int(args[1]);
  338. }
  339. #endif
  340. int error = re1_5_compilecode(&o->re, re_str);
  341. if (error != 0) {
  342. error:
  343. mp_raise_ValueError("Error in regex");
  344. }
  345. #if MICROPY_PY_URE_DEBUG
  346. if (flags & FLAG_DEBUG) {
  347. re1_5_dumpcode(&o->re);
  348. }
  349. #endif
  350. return MP_OBJ_FROM_PTR(o);
  351. }
  352. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_compile_obj, 1, 2, mod_re_compile);
  353. STATIC mp_obj_t mod_re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
  354. (void)n_args;
  355. mp_obj_t self = mod_re_compile(1, args);
  356. const mp_obj_t args2[] = {self, args[1]};
  357. mp_obj_t match = ure_exec(is_anchored, 2, args2);
  358. return match;
  359. }
  360. STATIC mp_obj_t mod_re_match(size_t n_args, const mp_obj_t *args) {
  361. return mod_re_exec(true, n_args, args);
  362. }
  363. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_match_obj, 2, 4, mod_re_match);
  364. STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
  365. return mod_re_exec(false, n_args, args);
  366. }
  367. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
  368. #if MICROPY_PY_URE_SUB
  369. STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
  370. mp_obj_t self = mod_re_compile(1, args);
  371. return re_sub_helper(self, n_args, args);
  372. }
  373. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
  374. #endif
  375. #if !MICROPY_ENABLE_DYNRUNTIME
  376. STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
  377. { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
  378. { MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
  379. { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
  380. { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
  381. #if MICROPY_PY_URE_SUB
  382. { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
  383. #endif
  384. #if MICROPY_PY_URE_DEBUG
  385. { MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
  386. #endif
  387. };
  388. STATIC MP_DEFINE_CONST_DICT(mp_module_re_globals, mp_module_re_globals_table);
  389. const mp_obj_module_t mp_module_ure = {
  390. .base = { &mp_type_module },
  391. .globals = (mp_obj_dict_t*)&mp_module_re_globals,
  392. };
  393. #endif
  394. // Source files #include'd here to make sure they're compiled in
  395. // only if module is enabled by config setting.
  396. #define re1_5_fatal(x) assert(!x)
  397. #include "re1.5/compilecode.c"
  398. #if MICROPY_PY_URE_DEBUG
  399. #include "re1.5/dumpcode.c"
  400. #endif
  401. #include "re1.5/recursiveloop.c"
  402. #include "re1.5/charclass.c"
  403. #endif //MICROPY_PY_URE