arg_rex.c 33 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012
  1. /*
  2. * SPDX-FileCopyrightText: 1998-2001,2003-2011,2013 Stewart Heitmann
  3. *
  4. * SPDX-License-Identifier: BSD-3-Clause
  5. */
  6. /*******************************************************************************
  7. * arg_rex: Implements the regex command-line option
  8. *
  9. * This file is part of the argtable3 library.
  10. *
  11. * Copyright (C) 1998-2001,2003-2011,2013 Stewart Heitmann
  12. * <sheitmann@users.sourceforge.net>
  13. * All rights reserved.
  14. *
  15. * Redistribution and use in source and binary forms, with or without
  16. * modification, are permitted provided that the following conditions are met:
  17. * * Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. * * Redistributions in binary form must reproduce the above copyright
  20. * notice, this list of conditions and the following disclaimer in the
  21. * documentation and/or other materials provided with the distribution.
  22. * * Neither the name of STEWART HEITMANN nor the names of its contributors
  23. * may be used to endorse or promote products derived from this software
  24. * without specific prior written permission.
  25. *
  26. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29. * ARE DISCLAIMED. IN NO EVENT SHALL STEWART HEITMANN BE LIABLE FOR ANY DIRECT,
  30. * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  31. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  32. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  33. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  34. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  35. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36. ******************************************************************************/
  37. #include "argtable3.h"
  38. #ifndef ARG_AMALGAMATION
  39. #include "argtable3_private.h"
  40. #endif
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #ifndef _TREX_H_
  44. #define _TREX_H_
  45. /*
  46. * This module uses the T-Rex regular expression library to implement the regex
  47. * logic. Here is the copyright notice of the library:
  48. *
  49. * Copyright (C) 2003-2006 Alberto Demichelis
  50. *
  51. * This software is provided 'as-is', without any express
  52. * or implied warranty. In no event will the authors be held
  53. * liable for any damages arising from the use of this software.
  54. *
  55. * Permission is granted to anyone to use this software for
  56. * any purpose, including commercial applications, and to alter
  57. * it and redistribute it freely, subject to the following restrictions:
  58. *
  59. * 1. The origin of this software must not be misrepresented;
  60. * you must not claim that you wrote the original software.
  61. * If you use this software in a product, an acknowledgment
  62. * in the product documentation would be appreciated but
  63. * is not required.
  64. *
  65. * 2. Altered source versions must be plainly marked as such,
  66. * and must not be misrepresented as being the original software.
  67. *
  68. * 3. This notice may not be removed or altered from any
  69. * source distribution.
  70. */
  71. #ifdef __cplusplus
  72. extern "C" {
  73. #endif
  74. #define TRexChar char
  75. #define MAX_CHAR 0xFF
  76. #define _TREXC(c) (c)
  77. #define trex_strlen strlen
  78. #define trex_printf printf
  79. #ifndef TREX_API
  80. #define TREX_API extern
  81. #endif
  82. #define TRex_True 1
  83. #define TRex_False 0
  84. #define TREX_ICASE ARG_REX_ICASE
  85. typedef unsigned int TRexBool;
  86. typedef struct TRex TRex;
  87. typedef struct {
  88. const TRexChar* begin;
  89. int len;
  90. } TRexMatch;
  91. #ifdef __GNUC__
  92. TREX_API TRex* trex_compile(const TRexChar* pattern, const TRexChar** error, int flags) __attribute__((optimize(0)));
  93. #else
  94. TREX_API TRex* trex_compile(const TRexChar* pattern, const TRexChar** error, int flags);
  95. #endif
  96. TREX_API void trex_free(TRex* exp);
  97. TREX_API TRexBool trex_match(TRex* exp, const TRexChar* text);
  98. TREX_API TRexBool trex_search(TRex* exp, const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end);
  99. TREX_API TRexBool
  100. trex_searchrange(TRex* exp, const TRexChar* text_begin, const TRexChar* text_end, const TRexChar** out_begin, const TRexChar** out_end);
  101. TREX_API int trex_getsubexpcount(TRex* exp);
  102. TREX_API TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch* subexp);
  103. #ifdef __cplusplus
  104. }
  105. #endif
  106. #endif
  107. struct privhdr {
  108. const char* pattern;
  109. int flags;
  110. };
  111. static void arg_rex_resetfn(struct arg_rex* parent) {
  112. ARG_TRACE(("%s:resetfn(%p)\n", __FILE__, parent));
  113. parent->count = 0;
  114. }
  115. static int arg_rex_scanfn(struct arg_rex* parent, const char* argval) {
  116. int errorcode = 0;
  117. const TRexChar* error = NULL;
  118. TRex* rex = NULL;
  119. TRexBool is_match = TRex_False;
  120. if (parent->count == parent->hdr.maxcount) {
  121. /* maximum number of arguments exceeded */
  122. errorcode = ARG_ERR_MAXCOUNT;
  123. } else if (!argval) {
  124. /* a valid argument with no argument value was given. */
  125. /* This happens when an optional argument value was invoked. */
  126. /* leave parent argument value unaltered but still count the argument. */
  127. parent->count++;
  128. } else {
  129. struct privhdr* priv = (struct privhdr*)parent->hdr.priv;
  130. /* test the current argument value for a match with the regular expression */
  131. /* if a match is detected, record the argument value in the arg_rex struct */
  132. rex = trex_compile(priv->pattern, &error, priv->flags);
  133. is_match = trex_match(rex, argval);
  134. if (!is_match)
  135. errorcode = ARG_ERR_REGNOMATCH;
  136. else
  137. parent->sval[parent->count++] = argval;
  138. trex_free(rex);
  139. }
  140. ARG_TRACE(("%s:scanfn(%p) returns %d\n", __FILE__, parent, errorcode));
  141. return errorcode;
  142. }
  143. static int arg_rex_checkfn(struct arg_rex* parent) {
  144. int errorcode = (parent->count < parent->hdr.mincount) ? ARG_ERR_MINCOUNT : 0;
  145. #if 0
  146. struct privhdr *priv = (struct privhdr*)parent->hdr.priv;
  147. /* free the regex "program" we constructed in resetfn */
  148. regfree(&(priv->regex));
  149. /*printf("%s:checkfn(%p) returns %d\n",__FILE__,parent,errorcode);*/
  150. #endif
  151. return errorcode;
  152. }
  153. static void arg_rex_errorfn(struct arg_rex* parent, arg_dstr_t ds, int errorcode, const char* argval, const char* progname) {
  154. const char* shortopts = parent->hdr.shortopts;
  155. const char* longopts = parent->hdr.longopts;
  156. const char* datatype = parent->hdr.datatype;
  157. /* make argval NULL safe */
  158. argval = argval ? argval : "";
  159. arg_dstr_catf(ds, "%s: ", progname);
  160. switch (errorcode) {
  161. case ARG_ERR_MINCOUNT:
  162. arg_dstr_cat(ds, "missing option ");
  163. arg_print_option_ds(ds, shortopts, longopts, datatype, "\n");
  164. break;
  165. case ARG_ERR_MAXCOUNT:
  166. arg_dstr_cat(ds, "excess option ");
  167. arg_print_option_ds(ds, shortopts, longopts, argval, "\n");
  168. break;
  169. case ARG_ERR_REGNOMATCH:
  170. arg_dstr_cat(ds, "illegal value ");
  171. arg_print_option_ds(ds, shortopts, longopts, argval, "\n");
  172. break;
  173. default: {
  174. #if 0
  175. char errbuff[256];
  176. regerror(errorcode, NULL, errbuff, sizeof(errbuff));
  177. printf("%s\n", errbuff);
  178. #endif
  179. } break;
  180. }
  181. }
  182. struct arg_rex* arg_rex0(const char* shortopts, const char* longopts, const char* pattern, const char* datatype, int flags, const char* glossary) {
  183. return arg_rexn(shortopts, longopts, pattern, datatype, 0, 1, flags, glossary);
  184. }
  185. struct arg_rex* arg_rex1(const char* shortopts, const char* longopts, const char* pattern, const char* datatype, int flags, const char* glossary) {
  186. return arg_rexn(shortopts, longopts, pattern, datatype, 1, 1, flags, glossary);
  187. }
  188. struct arg_rex* arg_rexn(const char* shortopts,
  189. const char* longopts,
  190. const char* pattern,
  191. const char* datatype,
  192. int mincount,
  193. int maxcount,
  194. int flags,
  195. const char* glossary) {
  196. size_t nbytes;
  197. struct arg_rex* result;
  198. struct privhdr* priv;
  199. int i;
  200. const TRexChar* error = NULL;
  201. TRex* rex = NULL;
  202. if (!pattern) {
  203. printf("argtable: ERROR - illegal regular expression pattern \"(NULL)\"\n");
  204. printf("argtable: Bad argument table.\n");
  205. return NULL;
  206. }
  207. /* foolproof things by ensuring maxcount is not less than mincount */
  208. maxcount = (maxcount < mincount) ? mincount : maxcount;
  209. nbytes = sizeof(struct arg_rex) /* storage for struct arg_rex */
  210. + sizeof(struct privhdr) /* storage for private arg_rex data */
  211. + (size_t)maxcount * sizeof(char*); /* storage for sval[maxcount] array */
  212. /* init the arg_hdr struct */
  213. result = (struct arg_rex*)xmalloc(nbytes);
  214. result->hdr.flag = ARG_HASVALUE;
  215. result->hdr.shortopts = shortopts;
  216. result->hdr.longopts = longopts;
  217. result->hdr.datatype = datatype ? datatype : pattern;
  218. result->hdr.glossary = glossary;
  219. result->hdr.mincount = mincount;
  220. result->hdr.maxcount = maxcount;
  221. result->hdr.parent = result;
  222. result->hdr.resetfn = (arg_resetfn*)arg_rex_resetfn;
  223. result->hdr.scanfn = (arg_scanfn*)arg_rex_scanfn;
  224. result->hdr.checkfn = (arg_checkfn*)arg_rex_checkfn;
  225. result->hdr.errorfn = (arg_errorfn*)arg_rex_errorfn;
  226. /* store the arg_rex_priv struct immediately after the arg_rex struct */
  227. result->hdr.priv = result + 1;
  228. priv = (struct privhdr*)(result->hdr.priv);
  229. priv->pattern = pattern;
  230. priv->flags = flags;
  231. /* store the sval[maxcount] array immediately after the arg_rex_priv struct */
  232. result->sval = (const char**)(priv + 1);
  233. result->count = 0;
  234. /* foolproof the string pointers by initializing them to reference empty strings */
  235. for (i = 0; i < maxcount; i++)
  236. result->sval[i] = "";
  237. /* here we construct and destroy a regex representation of the regular
  238. * expression for no other reason than to force any regex errors to be
  239. * trapped now rather than later. If we don't, then errors may go undetected
  240. * until an argument is actually parsed.
  241. */
  242. rex = trex_compile(priv->pattern, &error, priv->flags);
  243. if (rex == NULL) {
  244. ARG_LOG(("argtable: %s \"%s\"\n", error ? error : _TREXC("undefined"), priv->pattern));
  245. ARG_LOG(("argtable: Bad argument table.\n"));
  246. }
  247. trex_free(rex);
  248. ARG_TRACE(("arg_rexn() returns %p\n", result));
  249. return result;
  250. }
  251. /* see copyright notice in trex.h */
  252. #include <ctype.h>
  253. #include <setjmp.h>
  254. #include <stdlib.h>
  255. #include <string.h>
  256. #ifdef _UINCODE
  257. #define scisprint iswprint
  258. #define scstrlen wcslen
  259. #define scprintf wprintf
  260. #define _SC(x) L(x)
  261. #else
  262. #define scisprint isprint
  263. #define scstrlen strlen
  264. #define scprintf printf
  265. #define _SC(x) (x)
  266. #endif
  267. #ifdef ARG_REX_DEBUG
  268. #include <stdio.h>
  269. static const TRexChar* g_nnames[] = {_SC("NONE"), _SC("OP_GREEDY"), _SC("OP_OR"), _SC("OP_EXPR"), _SC("OP_NOCAPEXPR"),
  270. _SC("OP_DOT"), _SC("OP_CLASS"), _SC("OP_CCLASS"), _SC("OP_NCLASS"), _SC("OP_RANGE"),
  271. _SC("OP_CHAR"), _SC("OP_EOL"), _SC("OP_BOL"), _SC("OP_WB")};
  272. #endif
  273. #define OP_GREEDY (MAX_CHAR + 1) /* * + ? {n} */
  274. #define OP_OR (MAX_CHAR + 2)
  275. #define OP_EXPR (MAX_CHAR + 3) /* parentesis () */
  276. #define OP_NOCAPEXPR (MAX_CHAR + 4) /* parentesis (?:) */
  277. #define OP_DOT (MAX_CHAR + 5)
  278. #define OP_CLASS (MAX_CHAR + 6)
  279. #define OP_CCLASS (MAX_CHAR + 7)
  280. #define OP_NCLASS (MAX_CHAR + 8) /* negates class the [^ */
  281. #define OP_RANGE (MAX_CHAR + 9)
  282. #define OP_CHAR (MAX_CHAR + 10)
  283. #define OP_EOL (MAX_CHAR + 11)
  284. #define OP_BOL (MAX_CHAR + 12)
  285. #define OP_WB (MAX_CHAR + 13)
  286. #define TREX_SYMBOL_ANY_CHAR ('.')
  287. #define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
  288. #define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
  289. #define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
  290. #define TREX_SYMBOL_BRANCH ('|')
  291. #define TREX_SYMBOL_END_OF_STRING ('$')
  292. #define TREX_SYMBOL_BEGINNING_OF_STRING ('^')
  293. #define TREX_SYMBOL_ESCAPE_CHAR ('\\')
  294. typedef int TRexNodeType;
  295. typedef struct tagTRexNode {
  296. TRexNodeType type;
  297. int left;
  298. int right;
  299. int next;
  300. } TRexNode;
  301. struct TRex {
  302. const TRexChar* _eol;
  303. const TRexChar* _bol;
  304. const TRexChar* _p;
  305. int _first;
  306. int _op;
  307. TRexNode* _nodes;
  308. int _nallocated;
  309. int _nsize;
  310. int _nsubexpr;
  311. TRexMatch* _matches;
  312. int _currsubexp;
  313. void* _jmpbuf;
  314. const TRexChar** _error;
  315. int _flags;
  316. };
  317. static int trex_list(TRex* exp);
  318. static int trex_newnode(TRex* exp, TRexNodeType type) {
  319. TRexNode n;
  320. int newid;
  321. n.type = type;
  322. n.next = n.right = n.left = -1;
  323. if (type == OP_EXPR)
  324. n.right = exp->_nsubexpr++;
  325. if (exp->_nallocated < (exp->_nsize + 1)) {
  326. exp->_nallocated *= 2;
  327. exp->_nodes = (TRexNode*)xrealloc(exp->_nodes, (size_t)exp->_nallocated * sizeof(TRexNode));
  328. }
  329. exp->_nodes[exp->_nsize++] = n;
  330. newid = exp->_nsize - 1;
  331. return (int)newid;
  332. }
  333. static void trex_error(TRex* exp, const TRexChar* error) {
  334. if (exp->_error)
  335. *exp->_error = error;
  336. longjmp(*((jmp_buf*)exp->_jmpbuf), -1);
  337. }
  338. static void trex_expect(TRex* exp, int n) {
  339. if ((*exp->_p) != n)
  340. trex_error(exp, _SC("expected paren"));
  341. exp->_p++;
  342. }
  343. static TRexChar trex_escapechar(TRex* exp) {
  344. if (*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
  345. exp->_p++;
  346. switch (*exp->_p) {
  347. case 'v':
  348. exp->_p++;
  349. return '\v';
  350. case 'n':
  351. exp->_p++;
  352. return '\n';
  353. case 't':
  354. exp->_p++;
  355. return '\t';
  356. case 'r':
  357. exp->_p++;
  358. return '\r';
  359. case 'f':
  360. exp->_p++;
  361. return '\f';
  362. default:
  363. return (*exp->_p++);
  364. }
  365. } else if (!scisprint((int)(*exp->_p)))
  366. trex_error(exp, _SC("letter expected"));
  367. return (*exp->_p++);
  368. }
  369. static int trex_charclass(TRex* exp, int classid) {
  370. int n = trex_newnode(exp, OP_CCLASS);
  371. exp->_nodes[n].left = classid;
  372. return n;
  373. }
  374. static int trex_charnode(TRex* exp, TRexBool isclass) {
  375. TRexChar t;
  376. if (*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
  377. exp->_p++;
  378. switch (*exp->_p) {
  379. case 'n':
  380. exp->_p++;
  381. return trex_newnode(exp, '\n');
  382. case 't':
  383. exp->_p++;
  384. return trex_newnode(exp, '\t');
  385. case 'r':
  386. exp->_p++;
  387. return trex_newnode(exp, '\r');
  388. case 'f':
  389. exp->_p++;
  390. return trex_newnode(exp, '\f');
  391. case 'v':
  392. exp->_p++;
  393. return trex_newnode(exp, '\v');
  394. case 'a':
  395. case 'A':
  396. case 'w':
  397. case 'W':
  398. case 's':
  399. case 'S':
  400. case 'd':
  401. case 'D':
  402. case 'x':
  403. case 'X':
  404. case 'c':
  405. case 'C':
  406. case 'p':
  407. case 'P':
  408. case 'l':
  409. case 'u': {
  410. t = *exp->_p;
  411. exp->_p++;
  412. return trex_charclass(exp, t);
  413. }
  414. case 'b':
  415. case 'B':
  416. if (!isclass) {
  417. int node = trex_newnode(exp, OP_WB);
  418. exp->_nodes[node].left = *exp->_p;
  419. exp->_p++;
  420. return node;
  421. }
  422. /* fall through */
  423. default:
  424. t = *exp->_p;
  425. exp->_p++;
  426. return trex_newnode(exp, t);
  427. }
  428. } else if (!scisprint((int)(*exp->_p))) {
  429. trex_error(exp, _SC("letter expected"));
  430. }
  431. t = *exp->_p;
  432. exp->_p++;
  433. return trex_newnode(exp, t);
  434. }
  435. static int trex_class(TRex* exp) {
  436. int ret = -1;
  437. int first = -1, chain;
  438. if (*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
  439. ret = trex_newnode(exp, OP_NCLASS);
  440. exp->_p++;
  441. } else
  442. ret = trex_newnode(exp, OP_CLASS);
  443. if (*exp->_p == ']')
  444. trex_error(exp, _SC("empty class"));
  445. chain = ret;
  446. while (*exp->_p != ']' && exp->_p != exp->_eol) {
  447. if (*exp->_p == '-' && first != -1) {
  448. int r, t;
  449. if (*exp->_p++ == ']')
  450. trex_error(exp, _SC("unfinished range"));
  451. r = trex_newnode(exp, OP_RANGE);
  452. if (first > *exp->_p)
  453. trex_error(exp, _SC("invalid range"));
  454. if (exp->_nodes[first].type == OP_CCLASS)
  455. trex_error(exp, _SC("cannot use character classes in ranges"));
  456. exp->_nodes[r].left = exp->_nodes[first].type;
  457. t = trex_escapechar(exp);
  458. exp->_nodes[r].right = t;
  459. exp->_nodes[chain].next = r;
  460. chain = r;
  461. first = -1;
  462. } else {
  463. if (first != -1) {
  464. int c = first;
  465. exp->_nodes[chain].next = c;
  466. chain = c;
  467. first = trex_charnode(exp, TRex_True);
  468. } else {
  469. first = trex_charnode(exp, TRex_True);
  470. }
  471. }
  472. }
  473. if (first != -1) {
  474. int c = first;
  475. exp->_nodes[chain].next = c;
  476. chain = c;
  477. first = -1;
  478. }
  479. /* hack? */
  480. exp->_nodes[ret].left = exp->_nodes[ret].next;
  481. exp->_nodes[ret].next = -1;
  482. return ret;
  483. }
  484. static int trex_parsenumber(TRex* exp) {
  485. int ret = *exp->_p - '0';
  486. int positions = 10;
  487. exp->_p++;
  488. while (isdigit((int)(*exp->_p))) {
  489. ret = ret * 10 + (*exp->_p++ - '0');
  490. if (positions == 1000000000)
  491. trex_error(exp, _SC("overflow in numeric constant"));
  492. positions *= 10;
  493. };
  494. return ret;
  495. }
  496. static int trex_element(TRex* exp) {
  497. int ret = -1;
  498. switch (*exp->_p) {
  499. case '(': {
  500. int expr, newn;
  501. exp->_p++;
  502. if (*exp->_p == '?') {
  503. exp->_p++;
  504. trex_expect(exp, ':');
  505. expr = trex_newnode(exp, OP_NOCAPEXPR);
  506. } else
  507. expr = trex_newnode(exp, OP_EXPR);
  508. newn = trex_list(exp);
  509. exp->_nodes[expr].left = newn;
  510. ret = expr;
  511. trex_expect(exp, ')');
  512. } break;
  513. case '[':
  514. exp->_p++;
  515. ret = trex_class(exp);
  516. trex_expect(exp, ']');
  517. break;
  518. case TREX_SYMBOL_END_OF_STRING:
  519. exp->_p++;
  520. ret = trex_newnode(exp, OP_EOL);
  521. break;
  522. case TREX_SYMBOL_ANY_CHAR:
  523. exp->_p++;
  524. ret = trex_newnode(exp, OP_DOT);
  525. break;
  526. default:
  527. ret = trex_charnode(exp, TRex_False);
  528. break;
  529. }
  530. {
  531. TRexBool isgreedy = TRex_False;
  532. unsigned short p0 = 0, p1 = 0;
  533. switch (*exp->_p) {
  534. case TREX_SYMBOL_GREEDY_ZERO_OR_MORE:
  535. p0 = 0;
  536. p1 = 0xFFFF;
  537. exp->_p++;
  538. isgreedy = TRex_True;
  539. break;
  540. case TREX_SYMBOL_GREEDY_ONE_OR_MORE:
  541. p0 = 1;
  542. p1 = 0xFFFF;
  543. exp->_p++;
  544. isgreedy = TRex_True;
  545. break;
  546. case TREX_SYMBOL_GREEDY_ZERO_OR_ONE:
  547. p0 = 0;
  548. p1 = 1;
  549. exp->_p++;
  550. isgreedy = TRex_True;
  551. break;
  552. case '{':
  553. exp->_p++;
  554. if (!isdigit((int)(*exp->_p)))
  555. trex_error(exp, _SC("number expected"));
  556. p0 = (unsigned short)trex_parsenumber(exp);
  557. /*******************************/
  558. switch (*exp->_p) {
  559. case '}':
  560. p1 = p0;
  561. exp->_p++;
  562. break;
  563. case ',':
  564. exp->_p++;
  565. p1 = 0xFFFF;
  566. if (isdigit((int)(*exp->_p))) {
  567. p1 = (unsigned short)trex_parsenumber(exp);
  568. }
  569. trex_expect(exp, '}');
  570. break;
  571. default:
  572. trex_error(exp, _SC(", or } expected"));
  573. }
  574. /*******************************/
  575. isgreedy = TRex_True;
  576. break;
  577. }
  578. if (isgreedy) {
  579. int nnode = trex_newnode(exp, OP_GREEDY);
  580. exp->_nodes[nnode].left = ret;
  581. exp->_nodes[nnode].right = ((p0) << 16) | p1;
  582. ret = nnode;
  583. }
  584. }
  585. if ((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') && (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) &&
  586. (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) && (*exp->_p != '\0')) {
  587. int nnode = trex_element(exp);
  588. exp->_nodes[ret].next = nnode;
  589. }
  590. return ret;
  591. }
  592. static int trex_list(TRex* exp) {
  593. int ret = -1, e;
  594. if (*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
  595. exp->_p++;
  596. ret = trex_newnode(exp, OP_BOL);
  597. }
  598. e = trex_element(exp);
  599. if (ret != -1) {
  600. exp->_nodes[ret].next = e;
  601. } else
  602. ret = e;
  603. if (*exp->_p == TREX_SYMBOL_BRANCH) {
  604. int temp, tright;
  605. exp->_p++;
  606. temp = trex_newnode(exp, OP_OR);
  607. exp->_nodes[temp].left = ret;
  608. tright = trex_list(exp);
  609. exp->_nodes[temp].right = tright;
  610. ret = temp;
  611. }
  612. return ret;
  613. }
  614. static TRexBool trex_matchcclass(int cclass, TRexChar c) {
  615. switch (cclass) {
  616. case 'a':
  617. return isalpha(c) ? TRex_True : TRex_False;
  618. case 'A':
  619. return !isalpha(c) ? TRex_True : TRex_False;
  620. case 'w':
  621. return (isalnum(c) || c == '_') ? TRex_True : TRex_False;
  622. case 'W':
  623. return (!isalnum(c) && c != '_') ? TRex_True : TRex_False;
  624. case 's':
  625. return isspace(c) ? TRex_True : TRex_False;
  626. case 'S':
  627. return !isspace(c) ? TRex_True : TRex_False;
  628. case 'd':
  629. return isdigit(c) ? TRex_True : TRex_False;
  630. case 'D':
  631. return !isdigit(c) ? TRex_True : TRex_False;
  632. case 'x':
  633. return isxdigit(c) ? TRex_True : TRex_False;
  634. case 'X':
  635. return !isxdigit(c) ? TRex_True : TRex_False;
  636. case 'c':
  637. return iscntrl(c) ? TRex_True : TRex_False;
  638. case 'C':
  639. return !iscntrl(c) ? TRex_True : TRex_False;
  640. case 'p':
  641. return ispunct(c) ? TRex_True : TRex_False;
  642. case 'P':
  643. return !ispunct(c) ? TRex_True : TRex_False;
  644. case 'l':
  645. return islower(c) ? TRex_True : TRex_False;
  646. case 'u':
  647. return isupper(c) ? TRex_True : TRex_False;
  648. }
  649. return TRex_False; /*cannot happen*/
  650. }
  651. static TRexBool trex_matchclass(TRex* exp, TRexNode* node, TRexChar c) {
  652. do {
  653. switch (node->type) {
  654. case OP_RANGE:
  655. if (exp->_flags & TREX_ICASE) {
  656. if (c >= toupper(node->left) && c <= toupper(node->right))
  657. return TRex_True;
  658. if (c >= tolower(node->left) && c <= tolower(node->right))
  659. return TRex_True;
  660. } else {
  661. if (c >= node->left && c <= node->right)
  662. return TRex_True;
  663. }
  664. break;
  665. case OP_CCLASS:
  666. if (trex_matchcclass(node->left, c))
  667. return TRex_True;
  668. break;
  669. default:
  670. if (exp->_flags & TREX_ICASE) {
  671. if (c == tolower(node->type) || c == toupper(node->type))
  672. return TRex_True;
  673. } else {
  674. if (c == node->type)
  675. return TRex_True;
  676. }
  677. }
  678. } while ((node->next != -1) && ((node = &exp->_nodes[node->next]) != NULL));
  679. return TRex_False;
  680. }
  681. static const TRexChar* trex_matchnode(TRex* exp, TRexNode* node, const TRexChar* str, TRexNode* next) {
  682. TRexNodeType type = node->type;
  683. switch (type) {
  684. case OP_GREEDY: {
  685. /* TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL; */
  686. TRexNode* greedystop = NULL;
  687. int p0 = (node->right >> 16) & 0x0000FFFF, p1 = node->right & 0x0000FFFF, nmaches = 0;
  688. const TRexChar *s = str, *good = str;
  689. if (node->next != -1) {
  690. greedystop = &exp->_nodes[node->next];
  691. } else {
  692. greedystop = next;
  693. }
  694. while ((nmaches == 0xFFFF || nmaches < p1)) {
  695. const TRexChar* stop;
  696. if ((s = trex_matchnode(exp, &exp->_nodes[node->left], s, greedystop)) == NULL)
  697. break;
  698. nmaches++;
  699. good = s;
  700. if (greedystop) {
  701. /* checks that 0 matches satisfy the expression(if so skips) */
  702. /* if not would always stop(for instance if is a '?') */
  703. if (greedystop->type != OP_GREEDY || (greedystop->type == OP_GREEDY && ((greedystop->right >> 16) & 0x0000FFFF) != 0)) {
  704. TRexNode* gnext = NULL;
  705. if (greedystop->next != -1) {
  706. gnext = &exp->_nodes[greedystop->next];
  707. } else if (next && next->next != -1) {
  708. gnext = &exp->_nodes[next->next];
  709. }
  710. stop = trex_matchnode(exp, greedystop, s, gnext);
  711. if (stop) {
  712. /* if satisfied stop it */
  713. if (p0 == p1 && p0 == nmaches)
  714. break;
  715. else if (nmaches >= p0 && p1 == 0xFFFF)
  716. break;
  717. else if (nmaches >= p0 && nmaches <= p1)
  718. break;
  719. }
  720. }
  721. }
  722. if (s >= exp->_eol)
  723. break;
  724. }
  725. if (p0 == p1 && p0 == nmaches)
  726. return good;
  727. else if (nmaches >= p0 && p1 == 0xFFFF)
  728. return good;
  729. else if (nmaches >= p0 && nmaches <= p1)
  730. return good;
  731. return NULL;
  732. }
  733. case OP_OR: {
  734. const TRexChar* asd = str;
  735. TRexNode* temp = &exp->_nodes[node->left];
  736. while ((asd = trex_matchnode(exp, temp, asd, NULL)) != NULL) {
  737. if (temp->next != -1)
  738. temp = &exp->_nodes[temp->next];
  739. else
  740. return asd;
  741. }
  742. asd = str;
  743. temp = &exp->_nodes[node->right];
  744. while ((asd = trex_matchnode(exp, temp, asd, NULL)) != NULL) {
  745. if (temp->next != -1)
  746. temp = &exp->_nodes[temp->next];
  747. else
  748. return asd;
  749. }
  750. return NULL;
  751. break;
  752. }
  753. case OP_EXPR:
  754. case OP_NOCAPEXPR: {
  755. TRexNode* n = &exp->_nodes[node->left];
  756. const TRexChar* cur = str;
  757. int capture = -1;
  758. if (node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {
  759. capture = exp->_currsubexp;
  760. exp->_matches[capture].begin = cur;
  761. exp->_currsubexp++;
  762. }
  763. do {
  764. TRexNode* subnext = NULL;
  765. if (n->next != -1) {
  766. subnext = &exp->_nodes[n->next];
  767. } else {
  768. subnext = next;
  769. }
  770. if ((cur = trex_matchnode(exp, n, cur, subnext)) == NULL) {
  771. if (capture != -1) {
  772. exp->_matches[capture].begin = 0;
  773. exp->_matches[capture].len = 0;
  774. }
  775. return NULL;
  776. }
  777. } while ((n->next != -1) && ((n = &exp->_nodes[n->next]) != NULL));
  778. if (capture != -1)
  779. exp->_matches[capture].len = (int)(cur - exp->_matches[capture].begin);
  780. return cur;
  781. }
  782. case OP_WB:
  783. if ((str == exp->_bol && !isspace((int)(*str))) || (str == exp->_eol && !isspace((int)(*(str - 1)))) || (!isspace((int)(*str)) && isspace((int)(*(str + 1)))) ||
  784. (isspace((int)(*str)) && !isspace((int)(*(str + 1))))) {
  785. return (node->left == 'b') ? str : NULL;
  786. }
  787. return (node->left == 'b') ? NULL : str;
  788. case OP_BOL:
  789. if (str == exp->_bol)
  790. return str;
  791. return NULL;
  792. case OP_EOL:
  793. if (str == exp->_eol)
  794. return str;
  795. return NULL;
  796. case OP_DOT: {
  797. str++;
  798. }
  799. return str;
  800. case OP_NCLASS:
  801. case OP_CLASS:
  802. if (trex_matchclass(exp, &exp->_nodes[node->left], *str) ? (type == OP_CLASS ? TRex_True : TRex_False)
  803. : (type == OP_NCLASS ? TRex_True : TRex_False)) {
  804. str++;
  805. return str;
  806. }
  807. return NULL;
  808. case OP_CCLASS:
  809. if (trex_matchcclass(node->left, *str)) {
  810. str++;
  811. return str;
  812. }
  813. return NULL;
  814. default: /* char */
  815. if (exp->_flags & TREX_ICASE) {
  816. if (*str != tolower(node->type) && *str != toupper(node->type))
  817. return NULL;
  818. } else {
  819. if (*str != node->type)
  820. return NULL;
  821. }
  822. str++;
  823. return str;
  824. }
  825. }
  826. /* public api */
  827. TRex* trex_compile(const TRexChar* pattern, const TRexChar** error, int flags) {
  828. TRex* exp = (TRex*)xmalloc(sizeof(TRex));
  829. exp->_eol = exp->_bol = NULL;
  830. exp->_p = pattern;
  831. exp->_nallocated = (int)(scstrlen(pattern) * sizeof(TRexChar));
  832. exp->_nodes = (TRexNode*)xmalloc((size_t)exp->_nallocated * sizeof(TRexNode));
  833. exp->_nsize = 0;
  834. exp->_matches = 0;
  835. exp->_nsubexpr = 0;
  836. exp->_first = trex_newnode(exp, OP_EXPR);
  837. exp->_error = error;
  838. exp->_jmpbuf = xmalloc(sizeof(jmp_buf));
  839. exp->_flags = flags;
  840. if (setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) {
  841. int res = trex_list(exp);
  842. exp->_nodes[exp->_first].left = res;
  843. if (*exp->_p != '\0')
  844. trex_error(exp, _SC("unexpected character"));
  845. #ifdef ARG_REX_DEBUG
  846. {
  847. int nsize, i;
  848. nsize = exp->_nsize;
  849. scprintf(_SC("\n"));
  850. for (i = 0; i < nsize; i++) {
  851. if (exp->_nodes[i].type > MAX_CHAR)
  852. scprintf(_SC("[%02d] %10s "), i, g_nnames[exp->_nodes[i].type - MAX_CHAR]);
  853. else
  854. scprintf(_SC("[%02d] %10c "), i, exp->_nodes[i].type);
  855. scprintf(_SC("left %02d right %02d next %02d\n"), exp->_nodes[i].left, exp->_nodes[i].right, exp->_nodes[i].next);
  856. }
  857. scprintf(_SC("\n"));
  858. }
  859. #endif
  860. exp->_matches = (TRexMatch*)xmalloc((size_t)exp->_nsubexpr * sizeof(TRexMatch));
  861. memset(exp->_matches, 0, (size_t)exp->_nsubexpr * sizeof(TRexMatch));
  862. } else {
  863. trex_free(exp);
  864. return NULL;
  865. }
  866. return exp;
  867. }
  868. void trex_free(TRex* exp) {
  869. if (exp) {
  870. xfree(exp->_nodes);
  871. xfree(exp->_jmpbuf);
  872. xfree(exp->_matches);
  873. xfree(exp);
  874. }
  875. }
  876. TRexBool trex_match(TRex* exp, const TRexChar* text) {
  877. const TRexChar* res = NULL;
  878. exp->_bol = text;
  879. exp->_eol = text + scstrlen(text);
  880. exp->_currsubexp = 0;
  881. res = trex_matchnode(exp, exp->_nodes, text, NULL);
  882. if (res == NULL || res != exp->_eol)
  883. return TRex_False;
  884. return TRex_True;
  885. }
  886. TRexBool trex_searchrange(TRex* exp, const TRexChar* text_begin, const TRexChar* text_end, const TRexChar** out_begin, const TRexChar** out_end) {
  887. const TRexChar* cur = NULL;
  888. int node = exp->_first;
  889. if (text_begin >= text_end)
  890. return TRex_False;
  891. exp->_bol = text_begin;
  892. exp->_eol = text_end;
  893. do {
  894. cur = text_begin;
  895. while (node != -1) {
  896. exp->_currsubexp = 0;
  897. cur = trex_matchnode(exp, &exp->_nodes[node], cur, NULL);
  898. if (!cur)
  899. break;
  900. node = exp->_nodes[node].next;
  901. }
  902. text_begin++;
  903. } while (cur == NULL && text_begin != text_end);
  904. if (cur == NULL)
  905. return TRex_False;
  906. --text_begin;
  907. if (out_begin)
  908. *out_begin = text_begin;
  909. if (out_end)
  910. *out_end = cur;
  911. return TRex_True;
  912. }
  913. TRexBool trex_search(TRex* exp, const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end) {
  914. return trex_searchrange(exp, text, text + scstrlen(text), out_begin, out_end);
  915. }
  916. int trex_getsubexpcount(TRex* exp) {
  917. return exp->_nsubexpr;
  918. }
  919. TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch* subexp) {
  920. if (n < 0 || n >= exp->_nsubexpr)
  921. return TRex_False;
  922. *subexp = exp->_matches[n];
  923. return TRex_True;
  924. }