| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- // Copyright 2014 Paul Sokolovsky.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- #include "re1.5.h"
- #define INSERT_CODE(at, num, pc) \
- ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
- #define REL(at, to) (to - at - 2)
- #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
- #define PC (prog->bytelen)
- static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
- {
- char *code = sizecode ? NULL : prog->insts;
- int start = PC;
- int term = PC;
- int alt_label = 0;
- for (; *re && *re != ')'; re++) {
- switch (*re) {
- case '\\':
- re++;
- if (!*re) return NULL; // Trailing backslash
- if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
- term = PC;
- EMIT(PC++, NamedClass);
- EMIT(PC++, *re);
- prog->len++;
- break;
- }
- default:
- term = PC;
- EMIT(PC++, Char);
- EMIT(PC++, *re);
- prog->len++;
- break;
- case '.':
- term = PC;
- EMIT(PC++, Any);
- prog->len++;
- break;
- case '[': {
- int cnt;
- term = PC;
- re++;
- if (*re == '^') {
- EMIT(PC++, ClassNot);
- re++;
- } else {
- EMIT(PC++, Class);
- }
- PC++; // Skip # of pair byte
- prog->len++;
- for (cnt = 0; *re != ']'; re++, cnt++) {
- if (!*re) return NULL;
- EMIT(PC++, *re);
- if (re[1] == '-' && re[2] != ']') {
- re += 2;
- }
- EMIT(PC++, *re);
- }
- EMIT(term + 1, cnt);
- break;
- }
- case '(': {
- term = PC;
- int sub = 0;
- int capture = re[1] != '?' || re[2] != ':';
- if (capture) {
- sub = ++prog->sub;
- EMIT(PC++, Save);
- EMIT(PC++, 2 * sub);
- prog->len++;
- } else {
- re += 2;
- }
- re = _compilecode(re + 1, prog, sizecode);
- if (re == NULL || *re != ')') return NULL; // error, or no matching paren
- if (capture) {
- EMIT(PC++, Save);
- EMIT(PC++, 2 * sub + 1);
- prog->len++;
- }
- break;
- }
- case '?':
- if (PC == term) return NULL; // nothing to repeat
- INSERT_CODE(term, 2, PC);
- if (re[1] == '?') {
- EMIT(term, RSplit);
- re++;
- } else {
- EMIT(term, Split);
- }
- EMIT(term + 1, REL(term, PC));
- prog->len++;
- term = PC;
- break;
- case '*':
- if (PC == term) return NULL; // nothing to repeat
- INSERT_CODE(term, 2, PC);
- EMIT(PC, Jmp);
- EMIT(PC + 1, REL(PC, term));
- PC += 2;
- if (re[1] == '?') {
- EMIT(term, RSplit);
- re++;
- } else {
- EMIT(term, Split);
- }
- EMIT(term + 1, REL(term, PC));
- prog->len += 2;
- term = PC;
- break;
- case '+':
- if (PC == term) return NULL; // nothing to repeat
- if (re[1] == '?') {
- EMIT(PC, Split);
- re++;
- } else {
- EMIT(PC, RSplit);
- }
- EMIT(PC + 1, REL(PC, term));
- PC += 2;
- prog->len++;
- term = PC;
- break;
- case '|':
- if (alt_label) {
- EMIT(alt_label, REL(alt_label, PC) + 1);
- }
- INSERT_CODE(start, 2, PC);
- EMIT(PC++, Jmp);
- alt_label = PC++;
- EMIT(start, Split);
- EMIT(start + 1, REL(start, PC));
- prog->len += 2;
- term = PC;
- break;
- case '^':
- EMIT(PC++, Bol);
- prog->len++;
- term = PC;
- break;
- case '$':
- EMIT(PC++, Eol);
- prog->len++;
- term = PC;
- break;
- }
- }
- if (alt_label) {
- EMIT(alt_label, REL(alt_label, PC) + 1);
- }
- return re;
- }
- int re1_5_sizecode(const char *re)
- {
- ByteProg dummyprog = {
- // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
- .bytelen = 5 + NON_ANCHORED_PREFIX
- };
- if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
- return dummyprog.bytelen;
- }
- int re1_5_compilecode(ByteProg *prog, const char *re)
- {
- prog->len = 0;
- prog->bytelen = 0;
- prog->sub = 0;
- // Add code to implement non-anchored operation ("search"),
- // for anchored operation ("match"), this code will be just skipped.
- // TODO: Implement search in much more efficient manner
- prog->insts[prog->bytelen++] = RSplit;
- prog->insts[prog->bytelen++] = 3;
- prog->insts[prog->bytelen++] = Any;
- prog->insts[prog->bytelen++] = Jmp;
- prog->insts[prog->bytelen++] = -5;
- prog->len += 3;
- prog->insts[prog->bytelen++] = Save;
- prog->insts[prog->bytelen++] = 0;
- prog->len++;
- re = _compilecode(re, prog, /*sizecode*/0);
- if (re == NULL || *re) return 1;
- prog->insts[prog->bytelen++] = Save;
- prog->insts[prog->bytelen++] = 1;
- prog->len++;
- prog->insts[prog->bytelen++] = Match;
- prog->len++;
- return 0;
- }
- #if 0
- int main(int argc, char *argv[])
- {
- int pc = 0;
- ByteProg *code = re1_5_compilecode(argv[1]);
- re1_5_dumpcode(code);
- }
- #endif
|