cre.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. /*
  2. *
  3. * Generally additional utility functions.
  4. * L flag, also known as re.LOCALE in Python is not available here.
  5. * Wrong results may be returned by the re_sub-like functions when 'repl'
  6. * contains '\' ('\\\\1', for example).
  7. *
  8. * 4/9/2022
  9. */
  10. #include "cre.h"
  11. #include <stdio.h>
  12. #include <string.h>
  13. #include "pcre.h"
  14. int* _re_get_vec_table(pcre* re, int* out_groups_number) {
  15. int brackets_number = 0;
  16. pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &brackets_number);
  17. brackets_number++;
  18. if (out_groups_number)
  19. *out_groups_number = brackets_number;
  20. brackets_number *= 3;
  21. int* vec = (int*)malloc(brackets_number * sizeof(int));
  22. return vec;
  23. }
  24. int* pcre_match(const char* _pat,
  25. const char* s,
  26. int len,
  27. int* out_vec_number,
  28. int opt) {
  29. int* vec = NULL;
  30. pcre* re = re_get_match_re(_pat, opt);
  31. if (!re)
  32. return NULL;
  33. vec = re_match2(re, s, len, out_vec_number, opt);
  34. pcre_free(re);
  35. return vec;
  36. }
  37. int* re_match2(pcre* re, const char* s, int len, int* out_vec_number, int opt) {
  38. int* vec = NULL;
  39. int group_n = 0;
  40. int rc;
  41. int start_offset = 0;
  42. vec = _re_get_vec_table(re, &group_n);
  43. if (out_vec_number)
  44. *out_vec_number = group_n;
  45. group_n *= 3;
  46. if (!vec)
  47. goto e_er;
  48. match:
  49. rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
  50. if (rc == PCRE_ERROR_NOMATCH) {
  51. free(vec);
  52. return NULL;
  53. }
  54. if (rc <= 0)
  55. goto e_er;
  56. if (vec[0] == vec[1]) {
  57. start_offset++;
  58. if (start_offset >= len)
  59. goto e_er;
  60. goto match;
  61. }
  62. return vec;
  63. e_er:
  64. if (vec)
  65. free(vec);
  66. return NULL;
  67. }
  68. int* pcre_fullmatch(const char* _pat,
  69. const char* s,
  70. int len,
  71. int* out_vec_number,
  72. int opt) {
  73. int* vec = NULL;
  74. opt &= ~PCRE_MULTILINE;
  75. pcre* re = re_get_fullmatch_re(_pat, opt);
  76. if (!re)
  77. return NULL;
  78. vec = re_fullmatch2(re, s, len, out_vec_number, opt);
  79. pcre_free(re);
  80. return vec;
  81. }
  82. int* re_fullmatch2(pcre* re,
  83. const char* s,
  84. int len,
  85. int* out_vec_number,
  86. int opt) {
  87. int* vec = NULL;
  88. int group_n = 0;
  89. int rc;
  90. int start_offset = 0;
  91. vec = _re_get_vec_table(re, &group_n);
  92. if (out_vec_number)
  93. *out_vec_number = group_n;
  94. group_n *= 3;
  95. if (!vec)
  96. goto e_er;
  97. // opt &= ~PCRE_MULTILINE;
  98. match:
  99. rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
  100. if (rc == PCRE_ERROR_NOMATCH) {
  101. free(vec);
  102. return NULL;
  103. }
  104. if (rc <= 0)
  105. goto e_er;
  106. if (vec[0] == vec[1]) {
  107. start_offset++;
  108. if (start_offset >= len)
  109. goto e_er;
  110. goto match;
  111. }
  112. return vec;
  113. e_er:
  114. if (vec)
  115. free(vec);
  116. return NULL;
  117. }
  118. pcre* re_get_match_re(const char* _pat, int opt) {
  119. const char* pat = _pat;
  120. if (!*pat) {
  121. return NULL;
  122. }
  123. if (*pat != '^') {
  124. int pat_len = strlen(_pat);
  125. char* p = (char*)pcre_malloc(pat_len + 2);
  126. if (!p)
  127. return NULL;
  128. *p = '^';
  129. memcpy(p + 1, _pat, pat_len + 1);
  130. pat = p;
  131. }
  132. const char* error;
  133. int erroffset;
  134. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  135. if (pat != _pat)
  136. free((void*)pat);
  137. return re;
  138. }
  139. pcre* re_get_fullmatch_re(const char* _pat, int opt) {
  140. const char* pat = _pat;
  141. if (!*pat) {
  142. return NULL;
  143. }
  144. int prefix = 0, suffix = 0;
  145. if (*pat != '^') {
  146. prefix = 1;
  147. }
  148. int pat_len = strlen(_pat);
  149. if (_pat[pat_len - 1] != '$')
  150. suffix = 1;
  151. else {
  152. int n = pat_len - 2;
  153. int i = 0;
  154. while (_pat[n] == '\\') {
  155. i++;
  156. n--;
  157. }
  158. if (i % 2) {
  159. suffix = 1;
  160. }
  161. }
  162. int dn = prefix + suffix;
  163. if (dn) {
  164. char* q = (char*)malloc(pat_len + dn + 1);
  165. if (!q)
  166. return NULL;
  167. pat = q;
  168. if (prefix) {
  169. *q = '^';
  170. q++;
  171. }
  172. memcpy(q, _pat, pat_len);
  173. q += pat_len;
  174. if (suffix) {
  175. *q = '$';
  176. q++;
  177. }
  178. *q = '\0';
  179. }
  180. const char* error;
  181. int erroffset;
  182. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  183. if (pat != _pat)
  184. free((void*)pat);
  185. return re;
  186. }
  187. /* The following functions return one or more vectors/tables allocated on the
  188. * heap, which the caller must free after use. */
  189. int* pcre_search(const char* pat,
  190. const char* s,
  191. int len,
  192. int* out_vec_number,
  193. int opt) {
  194. const char* error;
  195. int erroffset;
  196. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  197. if (!re)
  198. return NULL;
  199. int* res = re_search2(re, s, len, out_vec_number, opt);
  200. pcre_free(re);
  201. return res;
  202. }
  203. int* re_search2(pcre* re,
  204. const char* s,
  205. int len,
  206. int* out_vec_number,
  207. int opt) {
  208. int* vec = NULL;
  209. int group_n = 0;
  210. int rc;
  211. int start_offset = 0;
  212. vec = _re_get_vec_table(re, &group_n);
  213. if (out_vec_number)
  214. *out_vec_number = group_n;
  215. group_n *= 3;
  216. if (!vec)
  217. goto e_er;
  218. match:
  219. rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
  220. if (rc == PCRE_ERROR_NOMATCH) {
  221. free(vec);
  222. return NULL;
  223. }
  224. if (rc <= 0)
  225. goto e_er;
  226. if (vec[0] == vec[1]) {
  227. start_offset++;
  228. if (start_offset >= len)
  229. goto e_er;
  230. goto match;
  231. }
  232. return vec;
  233. e_er:
  234. if (vec)
  235. free(vec);
  236. return NULL;
  237. }
  238. int** re_searchall(const char* pat,
  239. const char* s,
  240. int len,
  241. int* out_number,
  242. int* out_vec_number,
  243. int opt) {
  244. const char* error;
  245. int erroffset;
  246. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  247. if (!re)
  248. return NULL;
  249. int** res = re_searchall2(re, s, len, out_number, out_vec_number, opt);
  250. pcre_free(re);
  251. return res;
  252. }
  253. int** re_searchall2(pcre* re,
  254. const char* s,
  255. int len,
  256. int* out_number,
  257. int* out_vec_number,
  258. int opt) {
  259. int start_offset = 0;
  260. int** vecs = NULL;
  261. int vec_cap = 4;
  262. int vec_n = 0;
  263. int* vec = NULL;
  264. int group_n = 0;
  265. while (1) {
  266. if (group_n)
  267. vec = (int*)malloc(group_n * sizeof(int));
  268. else {
  269. vec = _re_get_vec_table(re, &group_n);
  270. if (out_vec_number)
  271. *out_vec_number = group_n;
  272. group_n *= 3;
  273. }
  274. if (!vec) {
  275. goto e_er;
  276. }
  277. int rc;
  278. match:
  279. rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
  280. if (rc == PCRE_ERROR_NOMATCH) {
  281. if (out_number)
  282. *out_number = vec_n;
  283. free(vec);
  284. return vecs;
  285. }
  286. if (rc <= 0)
  287. goto e_er;
  288. if (vec[0] == vec[1]) {
  289. start_offset++;
  290. if (start_offset >= len)
  291. goto e_er;
  292. goto match;
  293. }
  294. if (!vecs) {
  295. vecs = (int**)malloc(sizeof(int*) * vec_cap);
  296. if (!vecs)
  297. goto e_er;
  298. }
  299. if (vec_n >= vec_cap) {
  300. vec_cap *= 2;
  301. void* p = realloc(vecs, vec_cap * sizeof(int*));
  302. if (!p)
  303. goto e_er;
  304. vecs = (int**)p;
  305. }
  306. vecs[vec_n++] = vec;
  307. start_offset = vec[1];
  308. }
  309. e_er:
  310. if (vec)
  311. free(vec);
  312. if (!vecs)
  313. return NULL;
  314. for (int j = 0; j < vec_n; j++) {
  315. if (vecs[j])
  316. free((void*)(vecs[j]));
  317. }
  318. free(vecs);
  319. return NULL;
  320. }
  /*
   * Release a result of re_searchall()/re_searchall2(): the first `n` entries
   * of `vecs` are freed, followed by the array itself. A NULL `vecs` is
   * ignored.
   */
  void re_free_searchall(int** vecs, int n) {
      if (vecs == NULL)
          return;

      int** cursor = vecs;
      for (int left = n; left > 0; left--)
          free(*cursor++); /* free(NULL) is a no-op */

      free(vecs);
  }
  330. /* The following functions return one or more strings allocated on the heap,
  331. * which the caller must free after use. */
  /*
   * Copy the whole-match substring of each offset vector out of `s`.
   *
   * s     subject the offsets in `vecs` refer to
   * vecs  `n` offset vectors; v[0] and v[1] are the start and end offsets
   * n     number of vectors
   *
   * Returns an array of `n` NUL-terminated heap strings. Everything is
   * allocated with malloc() so that it can be released with free()
   * (see re_free_findall()); the previous code allocated with pcre_malloc()
   * but released with free(). Returns NULL when `vecs` is NULL, `n` is not
   * positive, or an allocation fails (nothing is leaked in that case).
   */
  char** _re_extract_substring(const char* s, int** vecs, int n) {
      if (!vecs || n <= 0)
          return NULL;

      char** res = (char**)malloc(sizeof(char*) * (size_t)n);
      if (!res)
          return NULL;

      int made = 0;
      for (; made < n; made++) {
          const int* v = vecs[made];
          int sub_len = v[1] - v[0];
          char* copy = (char*)malloc((size_t)sub_len + 1);

          if (!copy)
              break;
          memcpy(copy, s + v[0], (size_t)sub_len);
          copy[sub_len] = '\0';
          res[made] = copy;
      }

      if (made == n)
          return res;

      /* Allocation failed part-way: undo what was built so far. */
      while (made > 0)
          free(res[--made]);
      free(res);
      return NULL;
  }
  359. char* re_find(const char* pat, const char* s, int len, int opt) {
  360. const char* error;
  361. int erroffset;
  362. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  363. if (!re)
  364. return NULL;
  365. char* res = re_find2(re, s, len, opt);
  366. pcre_free(re);
  367. return res;
  368. }
  369. char* re_find2(pcre* re, const char* s, int len, int opt) {
  370. int* vec = NULL;
  371. int group_n = 0;
  372. int rc;
  373. int start_offset = 0;
  374. char* res_s = NULL;
  375. vec = _re_get_vec_table(re, &group_n);
  376. if (!vec)
  377. goto e_er;
  378. group_n *= 3;
  379. match:
  380. rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
  381. if (rc == PCRE_ERROR_NOMATCH) {
  382. free(vec);
  383. return NULL;
  384. }
  385. if (rc <= 0)
  386. goto e_er;
  387. if (vec[0] == vec[1]) {
  388. start_offset++;
  389. if (start_offset >= len)
  390. goto e_er;
  391. goto match;
  392. }
  393. len = vec[1] - vec[0];
  394. if (!len)
  395. goto e_er;
  396. res_s = (char*)malloc(len + 1);
  397. if (!res_s)
  398. goto e_er;
  399. memcpy(res_s, s + vec[0], len);
  400. res_s[len] = 0;
  401. if (vec)
  402. free(vec);
  403. return res_s;
  404. e_er:
  405. if (vec)
  406. free(vec);
  407. return NULL;
  408. }
  409. char** pcre_findall(const char* pat,
  410. const char* s,
  411. int len,
  412. int* out_number,
  413. int opt) {
  414. const char* error;
  415. int erroffset;
  416. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  417. if (!re)
  418. return NULL;
  419. char** res = re_findall2(re, s, len, out_number, opt);
  420. pcre_free(re);
  421. return res;
  422. }
  423. char** re_findall2(pcre* re, const char* s, int len, int* out_number, int opt) {
  424. int out_vec_number;
  425. int** vecs;
  426. char** res;
  427. vecs = re_searchall2(re, s, len, out_number, &out_vec_number, opt);
  428. if (!vecs)
  429. goto e_er;
  430. res = _re_extract_substring(s, vecs, *out_number);
  431. if (!res)
  432. goto e_er;
  433. re_free_searchall(vecs, *out_number);
  434. return res;
  435. e_er:
  436. if (vecs)
  437. re_free_searchall(vecs, *out_number);
  438. return NULL;
  439. }
  /*
   * Release a result of pcre_findall()/re_findall2(): the first `n` strings
   * of `ss` are freed, followed by the array itself. A NULL `ss` is ignored.
   */
  void re_free_findall(char** ss, int n) {
      if (ss == NULL)
          return;

      char** cursor = ss;
      for (int left = n; left > 0; left--)
          free(*cursor++); /* free(NULL) is a no-op */

      free(ss);
  }
  449. char* pcre_sub(const char* pat,
  450. const char* to,
  451. const char* s,
  452. int len,
  453. int opt) {
  454. const char* error;
  455. int erroffset;
  456. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  457. if (!re)
  458. return NULL;
  459. char* res = re_sub2(re, to, s, len, opt);
  460. pcre_free(re);
  461. return res;
  462. }
  463. char* pcre_subn(const char* pat,
  464. const char* to,
  465. const char* s,
  466. int len,
  467. int n,
  468. int opt,
  469. int* out_repl_times) {
  470. const char* error;
  471. int erroffset;
  472. pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL);
  473. if (!re)
  474. return NULL;
  475. char* res = re_subn2(re, to, s, len, n, opt, out_repl_times);
  476. pcre_free(re);
  477. return res;
  478. }
  479. char* re_subn2(pcre* re,
  480. const char* to,
  481. const char* s,
  482. int len,
  483. int n,
  484. int opt,
  485. int* out_repl_times) {
  486. int group_n = 0;
  487. pcre* re2 = NULL;
  488. int vcs1_n = 0, vcs2_n = 0;
  489. int** vcs1 = re_searchall2(re, s, len, &vcs1_n, &group_n, opt);
  490. int** vcs2 = NULL;
  491. int match_limit = 0;
  492. if (!vcs1_n) {
  493. return (char*)s;
  494. }
  495. const char* p2 = "(\\\\\\\\|\\\\\\d{1,2})";
  496. int erroffset;
  497. const char* error;
  498. int len_to, remain_size, remain_length2, pi = 0, qi = 0;
  499. char* new_s = NULL;
  500. re2 = pcre_compile(p2, 0, &error, &erroffset, NULL);
  501. if (!re2)
  502. goto exit_error;
  503. len_to = strlen(to);
  504. vcs2 = re_searchall2(re2, to, len_to, &vcs2_n, NULL, 0);
  505. pcre_free(re2);
  506. re2 = NULL;
  507. remain_length2 = len_to;
  508. for (int i = 0; i < vcs2_n; i++) {
  509. int* vc = vcs2[i];
  510. int vc0 = vc[0] + 1;
  511. if (to[vc0] == '\\') {
  512. vc[2] = 0;
  513. remain_length2 -= 2;
  514. } else {
  515. int wanted_number = 0;
  516. int l_n = vc[1] - vc0;
  517. if (l_n == 1) {
  518. wanted_number = to[vc0] - '0';
  519. remain_length2 -= 2;
  520. } else {
  521. wanted_number = (to[vc0] - '0') * 10 + to[vc0 + 1] - '0';
  522. remain_length2 -= 3;
  523. }
  524. if (wanted_number <= 0 || wanted_number >= group_n)
  525. goto exit_error;
  526. vc[2] = wanted_number;
  527. }
  528. }
  529. match_limit = n ? (n <= vcs1_n ? n : vcs1_n) : vcs1_n;
  530. remain_size = len + remain_length2 * match_limit;
  531. for (int i = 0; i < match_limit; i++) {
  532. int* vc = vcs1[i];
  533. remain_size -= vc[1] - vc[0];
  534. for (int j = 0; j < vcs2_n; j++) {
  535. int* v2 = vcs2[j];
  536. if (v2[2]) {
  537. remain_size += GetGroupLen(vc, v2[2]);
  538. } else {
  539. remain_size++;
  540. }
  541. }
  542. }
  543. new_s = (char*)malloc(remain_size + 1);
  544. if (!new_s)
  545. goto exit_error;
  546. for (int i = 0; i < match_limit; i++) {
  547. int* vc = vcs1[i];
  548. memcpy(new_s + pi, s + qi, vc[0] - qi);
  549. pi += vc[0] - qi;
  550. int m_start = 0, m_len = 0;
  551. for (int j = 0; j < vcs2_n; j++) {
  552. int* v2 = vcs2[j];
  553. m_len = v2[0] - m_start;
  554. memcpy(new_s + pi, to + m_start, m_len);
  555. pi += m_len;
  556. int to_group = v2[2];
  557. if (to_group) {
  558. int to_group_at = vc[to_group * 2];
  559. int to_group_end = vc[to_group * 2 + 1];
  560. int g_l = to_group_end - to_group_at;
  561. memcpy(new_s + pi, s + to_group_at, g_l);
  562. pi += g_l;
  563. } else {
  564. new_s[pi++] = '\\';
  565. }
  566. m_start = v2[1];
  567. }
  568. m_len = len_to - m_start;
  569. memcpy(new_s + pi, to + m_start, m_len);
  570. pi += m_len;
  571. qi = vc[1];
  572. }
  573. if (out_repl_times)
  574. *out_repl_times = match_limit;
  575. if (vcs1)
  576. re_free_searchall(vcs1, vcs1_n);
  577. if (vcs2)
  578. re_free_searchall(vcs2, vcs2_n);
  579. len -= qi;
  580. if (len)
  581. memcpy(new_s + pi, s + qi, len);
  582. pi += len;
  583. new_s[pi] = '\0';
  584. return new_s;
  585. exit_error:
  586. if (vcs1)
  587. re_free_searchall(vcs1, vcs1_n);
  588. if (vcs2)
  589. re_free_searchall(vcs2, vcs2_n);
  590. if (re2)
  591. pcre_free(re2);
  592. return NULL;
  593. }
  594. char* re_sub2(pcre* re, const char* to, const char* s, int len, int opt) {
  595. return re_subn2(re, to, s, len, 0, opt, NULL);
  596. }