compilecode.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. // Copyright 2014 Paul Sokolovsky.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. #include "re1.5.h"
  5. #define REL(at, to) (to - at - 2)
  6. #define PC (prog->bytelen)
  7. #if defined(__ICCARM__)
  8. #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
  9. #define INSERT_CODE(at, num, pc) \
  10. ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
  11. #else
  12. #define INSERT_CODE(at, num, pc) \
  13. ((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
  14. #define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at))
  15. #endif /* defined(__ICCARM__) */
  16. static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
  17. {
  18. char *code = sizecode ? NULL : prog->insts;
  19. int start = PC;
  20. int term = PC;
  21. int alt_label = 0;
  22. for (; *re && *re != ')'; re++) {
  23. switch (*re) {
  24. case '\\':
  25. re++;
  26. if (!*re) return NULL; // Trailing backslash
  27. if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
  28. term = PC;
  29. EMIT(PC++, NamedClass);
  30. EMIT(PC++, *re);
  31. prog->len++;
  32. break;
  33. }
  34. default:
  35. term = PC;
  36. EMIT(PC++, Char);
  37. EMIT(PC++, *re);
  38. prog->len++;
  39. break;
  40. case '.':
  41. term = PC;
  42. EMIT(PC++, Any);
  43. prog->len++;
  44. break;
  45. case '[': {
  46. int cnt;
  47. term = PC;
  48. re++;
  49. if (*re == '^') {
  50. EMIT(PC++, ClassNot);
  51. re++;
  52. } else {
  53. EMIT(PC++, Class);
  54. }
  55. PC++; // Skip # of pair byte
  56. prog->len++;
  57. for (cnt = 0; *re != ']'; re++, cnt++) {
  58. if (!*re) return NULL;
  59. EMIT(PC++, *re);
  60. if (re[1] == '-' && re[2] != ']') {
  61. re += 2;
  62. }
  63. EMIT(PC++, *re);
  64. }
  65. EMIT(term + 1, cnt);
  66. break;
  67. }
  68. case '(': {
  69. term = PC;
  70. int sub = 0;
  71. int capture = re[1] != '?' || re[2] != ':';
  72. if (capture) {
  73. sub = ++prog->sub;
  74. EMIT(PC++, Save);
  75. EMIT(PC++, 2 * sub);
  76. prog->len++;
  77. } else {
  78. re += 2;
  79. }
  80. re = _compilecode(re + 1, prog, sizecode);
  81. if (re == NULL || *re != ')') return NULL; // error, or no matching paren
  82. if (capture) {
  83. EMIT(PC++, Save);
  84. EMIT(PC++, 2 * sub + 1);
  85. prog->len++;
  86. }
  87. break;
  88. }
  89. case '?':
  90. if (PC == term) return NULL; // nothing to repeat
  91. INSERT_CODE(term, 2, PC);
  92. if (re[1] == '?') {
  93. EMIT(term, RSplit);
  94. re++;
  95. } else {
  96. EMIT(term, Split);
  97. }
  98. EMIT(term + 1, REL(term, PC));
  99. prog->len++;
  100. term = PC;
  101. break;
  102. case '*':
  103. if (PC == term) return NULL; // nothing to repeat
  104. INSERT_CODE(term, 2, PC);
  105. EMIT(PC, Jmp);
  106. EMIT(PC + 1, REL(PC, term));
  107. PC += 2;
  108. if (re[1] == '?') {
  109. EMIT(term, RSplit);
  110. re++;
  111. } else {
  112. EMIT(term, Split);
  113. }
  114. EMIT(term + 1, REL(term, PC));
  115. prog->len += 2;
  116. term = PC;
  117. break;
  118. case '+':
  119. if (PC == term) return NULL; // nothing to repeat
  120. if (re[1] == '?') {
  121. EMIT(PC, Split);
  122. re++;
  123. } else {
  124. EMIT(PC, RSplit);
  125. }
  126. EMIT(PC + 1, REL(PC, term));
  127. PC += 2;
  128. prog->len++;
  129. term = PC;
  130. break;
  131. case '|':
  132. if (alt_label) {
  133. EMIT(alt_label, REL(alt_label, PC) + 1);
  134. }
  135. INSERT_CODE(start, 2, PC);
  136. EMIT(PC++, Jmp);
  137. alt_label = PC++;
  138. EMIT(start, Split);
  139. EMIT(start + 1, REL(start, PC));
  140. prog->len += 2;
  141. term = PC;
  142. break;
  143. case '^':
  144. EMIT(PC++, Bol);
  145. prog->len++;
  146. term = PC;
  147. break;
  148. case '$':
  149. EMIT(PC++, Eol);
  150. prog->len++;
  151. term = PC;
  152. break;
  153. }
  154. }
  155. if (alt_label) {
  156. EMIT(alt_label, REL(alt_label, PC) + 1);
  157. }
  158. return re;
  159. }
  160. int re1_5_sizecode(const char *re)
  161. {
  162. ByteProg dummyprog = {
  163. // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
  164. .bytelen = 5 + NON_ANCHORED_PREFIX
  165. };
  166. if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
  167. return dummyprog.bytelen;
  168. }
  169. int re1_5_compilecode(ByteProg *prog, const char *re)
  170. {
  171. prog->len = 0;
  172. prog->bytelen = 0;
  173. prog->sub = 0;
  174. // Add code to implement non-anchored operation ("search"),
  175. // for anchored operation ("match"), this code will be just skipped.
  176. // TODO: Implement search in much more efficient manner
  177. prog->insts[prog->bytelen++] = RSplit;
  178. prog->insts[prog->bytelen++] = 3;
  179. prog->insts[prog->bytelen++] = Any;
  180. prog->insts[prog->bytelen++] = Jmp;
  181. prog->insts[prog->bytelen++] = -5;
  182. prog->len += 3;
  183. prog->insts[prog->bytelen++] = Save;
  184. prog->insts[prog->bytelen++] = 0;
  185. prog->len++;
  186. re = _compilecode(re, prog, /*sizecode*/0);
  187. if (re == NULL || *re) return 1;
  188. prog->insts[prog->bytelen++] = Save;
  189. prog->insts[prog->bytelen++] = 1;
  190. prog->len++;
  191. prog->insts[prog->bytelen++] = Match;
  192. prog->len++;
  193. return 0;
  194. }
  195. #if 0
  196. int main(int argc, char *argv[])
  197. {
  198. int pc = 0;
  199. ByteProg *code = re1_5_compilecode(argv[1]);
  200. re1_5_dumpcode(code);
  201. }
  202. #endif