compilecode.c 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. // Copyright 2014 Paul Sokolovsky.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. #include "re1.5.h"
  5. #define INSERT_CODE(at, num, pc) \
  6. ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
  7. #define REL(at, to) (to - at - 2)
  8. #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
  9. #define PC (prog->bytelen)
  10. static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
  11. {
  12. char *code = sizecode ? NULL : prog->insts;
  13. int start = PC;
  14. int term = PC;
  15. int alt_label = 0;
  16. for (; *re && *re != ')'; re++) {
  17. switch (*re) {
  18. case '\\':
  19. re++;
  20. if (!*re) return NULL; // Trailing backslash
  21. if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
  22. term = PC;
  23. EMIT(PC++, NamedClass);
  24. EMIT(PC++, *re);
  25. prog->len++;
  26. break;
  27. }
  28. default:
  29. term = PC;
  30. EMIT(PC++, Char);
  31. EMIT(PC++, *re);
  32. prog->len++;
  33. break;
  34. case '.':
  35. term = PC;
  36. EMIT(PC++, Any);
  37. prog->len++;
  38. break;
  39. case '[': {
  40. int cnt;
  41. term = PC;
  42. re++;
  43. if (*re == '^') {
  44. EMIT(PC++, ClassNot);
  45. re++;
  46. } else {
  47. EMIT(PC++, Class);
  48. }
  49. PC++; // Skip # of pair byte
  50. prog->len++;
  51. for (cnt = 0; *re != ']'; re++, cnt++) {
  52. if (*re == '\\') {
  53. ++re;
  54. }
  55. if (!*re) return NULL;
  56. EMIT(PC++, *re);
  57. if (re[1] == '-' && re[2] != ']') {
  58. re += 2;
  59. }
  60. EMIT(PC++, *re);
  61. }
  62. EMIT(term + 1, cnt);
  63. break;
  64. }
  65. case '(': {
  66. term = PC;
  67. int sub = 0;
  68. int capture = re[1] != '?' || re[2] != ':';
  69. if (capture) {
  70. sub = ++prog->sub;
  71. EMIT(PC++, Save);
  72. EMIT(PC++, 2 * sub);
  73. prog->len++;
  74. } else {
  75. re += 2;
  76. }
  77. re = _compilecode(re + 1, prog, sizecode);
  78. if (re == NULL || *re != ')') return NULL; // error, or no matching paren
  79. if (capture) {
  80. EMIT(PC++, Save);
  81. EMIT(PC++, 2 * sub + 1);
  82. prog->len++;
  83. }
  84. break;
  85. }
  86. case '?':
  87. if (PC == term) return NULL; // nothing to repeat
  88. INSERT_CODE(term, 2, PC);
  89. if (re[1] == '?') {
  90. EMIT(term, RSplit);
  91. re++;
  92. } else {
  93. EMIT(term, Split);
  94. }
  95. EMIT(term + 1, REL(term, PC));
  96. prog->len++;
  97. term = PC;
  98. break;
  99. case '*':
  100. if (PC == term) return NULL; // nothing to repeat
  101. INSERT_CODE(term, 2, PC);
  102. EMIT(PC, Jmp);
  103. EMIT(PC + 1, REL(PC, term));
  104. PC += 2;
  105. if (re[1] == '?') {
  106. EMIT(term, RSplit);
  107. re++;
  108. } else {
  109. EMIT(term, Split);
  110. }
  111. EMIT(term + 1, REL(term, PC));
  112. prog->len += 2;
  113. term = PC;
  114. break;
  115. case '+':
  116. if (PC == term) return NULL; // nothing to repeat
  117. if (re[1] == '?') {
  118. EMIT(PC, Split);
  119. re++;
  120. } else {
  121. EMIT(PC, RSplit);
  122. }
  123. EMIT(PC + 1, REL(PC, term));
  124. PC += 2;
  125. prog->len++;
  126. term = PC;
  127. break;
  128. case '|':
  129. if (alt_label) {
  130. EMIT(alt_label, REL(alt_label, PC) + 1);
  131. }
  132. INSERT_CODE(start, 2, PC);
  133. EMIT(PC++, Jmp);
  134. alt_label = PC++;
  135. EMIT(start, Split);
  136. EMIT(start + 1, REL(start, PC));
  137. prog->len += 2;
  138. term = PC;
  139. break;
  140. case '^':
  141. EMIT(PC++, Bol);
  142. prog->len++;
  143. term = PC;
  144. break;
  145. case '$':
  146. EMIT(PC++, Eol);
  147. prog->len++;
  148. term = PC;
  149. break;
  150. }
  151. }
  152. if (alt_label) {
  153. EMIT(alt_label, REL(alt_label, PC) + 1);
  154. }
  155. return re;
  156. }
  157. int re1_5_sizecode(const char *re)
  158. {
  159. ByteProg dummyprog = {
  160. // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
  161. .bytelen = 5 + NON_ANCHORED_PREFIX
  162. };
  163. if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
  164. return dummyprog.bytelen;
  165. }
  166. int re1_5_compilecode(ByteProg *prog, const char *re)
  167. {
  168. prog->len = 0;
  169. prog->bytelen = 0;
  170. prog->sub = 0;
  171. // Add code to implement non-anchored operation ("search"),
  172. // for anchored operation ("match"), this code will be just skipped.
  173. // TODO: Implement search in much more efficient manner
  174. prog->insts[prog->bytelen++] = RSplit;
  175. prog->insts[prog->bytelen++] = 3;
  176. prog->insts[prog->bytelen++] = Any;
  177. prog->insts[prog->bytelen++] = Jmp;
  178. prog->insts[prog->bytelen++] = -5;
  179. prog->len += 3;
  180. prog->insts[prog->bytelen++] = Save;
  181. prog->insts[prog->bytelen++] = 0;
  182. prog->len++;
  183. re = _compilecode(re, prog, /*sizecode*/0);
  184. if (re == NULL || *re) return 1;
  185. prog->insts[prog->bytelen++] = Save;
  186. prog->insts[prog->bytelen++] = 1;
  187. prog->len++;
  188. prog->insts[prog->bytelen++] = Match;
  189. prog->len++;
  190. return 0;
  191. }
  192. #if 0
  193. int main(int argc, char *argv[])
  194. {
  195. int pc = 0;
  196. ByteProg *code = re1_5_compilecode(argv[1]);
  197. re1_5_dumpcode(code);
  198. }
  199. #endif