compilecode.c 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. // Copyright 2014 Paul Sokolovsky.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. #include "re1.5.h"
  5. #define INSERT_CODE(at, num, pc) \
  6. ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
  7. #define REL(at, to) (to - at - 2)
  8. #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
  9. #define PC (prog->bytelen)
  10. static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
  11. {
  12. char *code = sizecode ? NULL : prog->insts;
  13. int start = PC;
  14. int term = PC;
  15. int alt_label = 0;
  16. for (; *re && *re != ')'; re++) {
  17. switch (*re) {
  18. case '\\':
  19. re++;
  20. if (!*re) return NULL; // Trailing backslash
  21. if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
  22. term = PC;
  23. EMIT(PC++, NamedClass);
  24. EMIT(PC++, *re);
  25. prog->len++;
  26. break;
  27. }
  28. default:
  29. term = PC;
  30. EMIT(PC++, Char);
  31. EMIT(PC++, *re);
  32. prog->len++;
  33. break;
  34. case '.':
  35. term = PC;
  36. EMIT(PC++, Any);
  37. prog->len++;
  38. break;
  39. case '[': {
  40. int cnt;
  41. term = PC;
  42. re++;
  43. if (*re == '^') {
  44. EMIT(PC++, ClassNot);
  45. re++;
  46. } else {
  47. EMIT(PC++, Class);
  48. }
  49. PC++; // Skip # of pair byte
  50. prog->len++;
  51. for (cnt = 0; *re != ']'; re++, cnt++) {
  52. if (!*re) return NULL;
  53. EMIT(PC++, *re);
  54. if (re[1] == '-' && re[2] != ']') {
  55. re += 2;
  56. }
  57. EMIT(PC++, *re);
  58. }
  59. EMIT(term + 1, cnt);
  60. break;
  61. }
  62. case '(': {
  63. term = PC;
  64. int sub = 0;
  65. int capture = re[1] != '?' || re[2] != ':';
  66. if (capture) {
  67. sub = ++prog->sub;
  68. EMIT(PC++, Save);
  69. EMIT(PC++, 2 * sub);
  70. prog->len++;
  71. } else {
  72. re += 2;
  73. }
  74. re = _compilecode(re + 1, prog, sizecode);
  75. if (re == NULL || *re != ')') return NULL; // error, or no matching paren
  76. if (capture) {
  77. EMIT(PC++, Save);
  78. EMIT(PC++, 2 * sub + 1);
  79. prog->len++;
  80. }
  81. break;
  82. }
  83. case '?':
  84. if (PC == term) return NULL; // nothing to repeat
  85. INSERT_CODE(term, 2, PC);
  86. if (re[1] == '?') {
  87. EMIT(term, RSplit);
  88. re++;
  89. } else {
  90. EMIT(term, Split);
  91. }
  92. EMIT(term + 1, REL(term, PC));
  93. prog->len++;
  94. term = PC;
  95. break;
  96. case '*':
  97. if (PC == term) return NULL; // nothing to repeat
  98. INSERT_CODE(term, 2, PC);
  99. EMIT(PC, Jmp);
  100. EMIT(PC + 1, REL(PC, term));
  101. PC += 2;
  102. if (re[1] == '?') {
  103. EMIT(term, RSplit);
  104. re++;
  105. } else {
  106. EMIT(term, Split);
  107. }
  108. EMIT(term + 1, REL(term, PC));
  109. prog->len += 2;
  110. term = PC;
  111. break;
  112. case '+':
  113. if (PC == term) return NULL; // nothing to repeat
  114. if (re[1] == '?') {
  115. EMIT(PC, Split);
  116. re++;
  117. } else {
  118. EMIT(PC, RSplit);
  119. }
  120. EMIT(PC + 1, REL(PC, term));
  121. PC += 2;
  122. prog->len++;
  123. term = PC;
  124. break;
  125. case '|':
  126. if (alt_label) {
  127. EMIT(alt_label, REL(alt_label, PC) + 1);
  128. }
  129. INSERT_CODE(start, 2, PC);
  130. EMIT(PC++, Jmp);
  131. alt_label = PC++;
  132. EMIT(start, Split);
  133. EMIT(start + 1, REL(start, PC));
  134. prog->len += 2;
  135. term = PC;
  136. break;
  137. case '^':
  138. EMIT(PC++, Bol);
  139. prog->len++;
  140. term = PC;
  141. break;
  142. case '$':
  143. EMIT(PC++, Eol);
  144. prog->len++;
  145. term = PC;
  146. break;
  147. }
  148. }
  149. if (alt_label) {
  150. EMIT(alt_label, REL(alt_label, PC) + 1);
  151. }
  152. return re;
  153. }
  154. int re1_5_sizecode(const char *re)
  155. {
  156. ByteProg dummyprog = {
  157. // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
  158. .bytelen = 5 + NON_ANCHORED_PREFIX
  159. };
  160. if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
  161. return dummyprog.bytelen;
  162. }
  163. int re1_5_compilecode(ByteProg *prog, const char *re)
  164. {
  165. prog->len = 0;
  166. prog->bytelen = 0;
  167. prog->sub = 0;
  168. // Add code to implement non-anchored operation ("search"),
  169. // for anchored operation ("match"), this code will be just skipped.
  170. // TODO: Implement search in much more efficient manner
  171. prog->insts[prog->bytelen++] = RSplit;
  172. prog->insts[prog->bytelen++] = 3;
  173. prog->insts[prog->bytelen++] = Any;
  174. prog->insts[prog->bytelen++] = Jmp;
  175. prog->insts[prog->bytelen++] = -5;
  176. prog->len += 3;
  177. prog->insts[prog->bytelen++] = Save;
  178. prog->insts[prog->bytelen++] = 0;
  179. prog->len++;
  180. re = _compilecode(re, prog, /*sizecode*/0);
  181. if (re == NULL || *re) return 1;
  182. prog->insts[prog->bytelen++] = Save;
  183. prog->insts[prog->bytelen++] = 1;
  184. prog->len++;
  185. prog->insts[prog->bytelen++] = Match;
  186. prog->len++;
  187. return 0;
  188. }
  189. #if 0
  190. int main(int argc, char *argv[])
  191. {
  192. int pc = 0;
  193. ByteProg *code = re1_5_compilecode(argv[1]);
  194. re1_5_dumpcode(code);
  195. }
  196. #endif