| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624 |
- /*
- *
- * General-purpose additional utility functions.
- * The L flag, also known as re.LOCALE in Python, is not available here.
- * Wrong results may be returned by the re_sub-like functions when 'repl'
- * contains '\', for example '\\\\1'.
- *
- * 4/9/2022
- */
- #include "cre.h"
- #include <stdio.h>
- #include <string.h>
- #include "pcre.h"
- /*
-  * Allocate an offset vector sized for the compiled pattern `re`.
-  *
-  * The capture count is queried with pcre_fullinfo(); the vector holds three
-  * ints per group, where the whole match counts as one extra group.
-  *
-  * re                 compiled pattern to inspect
-  * out_groups_number  if non-NULL, receives the group count (captures + 1),
-  *                    or 0 when the pattern could not be inspected
-  *
-  * Returns a malloc'd array of (3 * groups) ints that the caller must free(),
-  * or NULL when pcre_fullinfo() or the allocation fails.
-  */
- int* _re_get_vec_table(pcre* re, int* out_groups_number) {
-     int capture_count = 0;
-     /* pcre_fullinfo() returns 0 on success; on failure the count is not
-      * trustworthy, so do not size a vector from it. */
-     if (pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &capture_count) != 0) {
-         if (out_groups_number)
-             *out_groups_number = 0;
-         return NULL;
-     }
-     int groups = capture_count + 1;
-     if (out_groups_number)
-         *out_groups_number = groups;
-     return (int*)malloc((size_t)groups * 3 * sizeof(int));
- }
- /*
-  * Compile `_pat` with re_get_match_re() and run re_match2() on `s`.
-  *
-  * _pat            pattern text
-  * s, len         subject buffer and its length
-  * out_vec_number forwarded to re_match2()
-  * opt            options used for compilation and forwarded to re_match2()
-  *
-  * Returns the offset vector produced by re_match2() (to be freed by the
-  * caller), or NULL when the pattern cannot be compiled or nothing matched.
-  */
- int* pcre_match(const char* _pat,
-                 const char* s,
-                 int len,
-                 int* out_vec_number,
-                 int opt) {
-     pcre* compiled = re_get_match_re(_pat, opt);
-     if (compiled == NULL)
-         return NULL;
-     int* offsets = re_match2(compiled, s, len, out_vec_number, opt);
-     /* The compiled pattern is only needed for this one call. */
-     pcre_free(compiled);
-     return offsets;
- }
- /*
-  * Run the compiled pattern `re` against `s` and return the first non-empty
-  * match.
-  *
-  * re             compiled pattern
-  * s, len         subject buffer and its length
-  * out_vec_number if non-NULL, receives the group count reported by
-  *                _re_get_vec_table(), even when the allocation failed
-  * opt            accepted for interface symmetry; pcre_exec() is called with
-  *                options 0
-  *
-  * Returns a heap-allocated offset vector (caller frees), or NULL on no
-  * match, on a pcre_exec() error, or when only empty matches were found.
-  */
- int* re_match2(pcre* re, const char* s, int len, int* out_vec_number, int opt) {
-     int groups = 0;
-     int* offsets = _re_get_vec_table(re, &groups);
-     if (out_vec_number)
-         *out_vec_number = groups;
-     if (offsets == NULL)
-         return NULL;
-     const int slots = groups * 3;
-     for (int from = 0;;) {
-         int rc = pcre_exec(re, NULL, s, len, from, 0, offsets, slots);
-         /* No match, a PCRE error, or rc == 0 all end the attempt. */
-         if (rc == PCRE_ERROR_NOMATCH || rc <= 0)
-             break;
-         if (offsets[0] != offsets[1])
-             return offsets;
-         /* Empty match: retry one position later, stopping at the end. */
-         if (++from >= len)
-             break;
-     }
-     free(offsets);
-     return NULL;
- }
- /*
-  * Compile `_pat` with re_get_fullmatch_re() and run re_fullmatch2() on `s`.
-  *
-  * PCRE_MULTILINE is cleared from `opt` before it is used.
-  *
-  * _pat            pattern text
-  * s, len         subject buffer and its length
-  * out_vec_number forwarded to re_fullmatch2()
-  * opt            compile options (minus PCRE_MULTILINE)
-  *
-  * Returns the offset vector from re_fullmatch2() (caller frees), or NULL
-  * when compilation fails or there is no match.
-  */
- int* pcre_fullmatch(const char* _pat,
-                     const char* s,
-                     int len,
-                     int* out_vec_number,
-                     int opt) {
-     const int effective_opt = opt & ~PCRE_MULTILINE;
-     pcre* compiled = re_get_fullmatch_re(_pat, effective_opt);
-     if (compiled == NULL)
-         return NULL;
-     int* offsets =
-         re_fullmatch2(compiled, s, len, out_vec_number, effective_opt);
-     pcre_free(compiled);
-     return offsets;
- }
- /*
-  * Execute an already compiled (and already anchored) pattern on `s` and
-  * return the first non-empty match.
-  *
-  * re             compiled pattern
-  * s, len         subject buffer and its length
-  * out_vec_number if non-NULL, receives the group count reported by
-  *                _re_get_vec_table()
-  * opt            not passed to pcre_exec(), which is called with options 0
-  *
-  * Returns a heap-allocated offset vector (caller frees) or NULL on no
-  * match, on error, or when only empty matches exist.
-  */
- int* re_fullmatch2(pcre* re,
-                    const char* s,
-                    int len,
-                    int* out_vec_number,
-                    int opt) {
-     int groups = 0;
-     int* offsets = _re_get_vec_table(re, &groups);
-     if (out_vec_number)
-         *out_vec_number = groups;
-     if (!offsets)
-         return NULL;
-     const int slots = groups * 3;
-     int from = 0;
-     do {
-         int rc = pcre_exec(re, NULL, s, len, from, 0, offsets, slots);
-         if (rc == PCRE_ERROR_NOMATCH || rc <= 0) {
-             free(offsets);
-             return NULL;
-         }
-         if (offsets[0] != offsets[1])
-             return offsets;
-         /* Empty match: advance the start position and try again. */
-         from++;
-     } while (from < len);
-     free(offsets);
-     return NULL;
- }
- /*
-  * Compile `_pat` so that it is anchored at the start: a '^' is prepended
-  * unless the pattern already begins with one.
-  *
-  * _pat  pattern text; NULL or "" yields NULL
-  * opt   options passed to pcre_compile()
-  *
-  * Returns the compiled pattern (release with pcre_free()), or NULL on an
-  * empty pattern, allocation failure or compile error.
-  *
-  * NOTE(review): the '^' is prepended textually, so with a top-level
-  * alternation such as "a|b" it binds to the first alternative only —
-  * confirm whether callers rely on that.
-  */
- pcre* re_get_match_re(const char* _pat, int opt) {
-     const char* error;
-     int erroffset;
-     if (!_pat || !*_pat)
-         return NULL;
-     if (*_pat == '^')
-         return pcre_compile(_pat, opt, &error, &erroffset, NULL);
-     size_t pat_len = strlen(_pat);
-     /* Allocate with malloc() so the free() below matches; the buffer was
-      * previously obtained from pcre_malloc, which may be a different
-      * allocator. */
-     char* anchored = (char*)malloc(pat_len + 2);
-     if (!anchored)
-         return NULL;
-     anchored[0] = '^';
-     memcpy(anchored + 1, _pat, pat_len + 1); /* includes the terminator */
-     pcre* re = pcre_compile(anchored, opt, &error, &erroffset, NULL);
-     free(anchored);
-     return re;
- }
- /*
-  * Compile `_pat` so that it must cover the whole subject: '^' is prepended
-  * unless the pattern starts with one, and '$' is appended unless the pattern
-  * already ends with an unescaped '$'.
-  *
-  * _pat  pattern text; NULL or "" yields NULL
-  * opt   options passed to pcre_compile()
-  *
-  * Returns the compiled pattern (release with pcre_free()), or NULL on an
-  * empty pattern, allocation failure or compile error.
-  */
- pcre* re_get_fullmatch_re(const char* _pat, int opt) {
-     const char* error;
-     int erroffset;
-     if (!_pat || !*_pat)
-         return NULL;
-     size_t pat_len = strlen(_pat);
-     int prefix = (_pat[0] != '^');
-     int suffix = 1;
-     if (_pat[pat_len - 1] == '$') {
-         /* Count the backslashes directly before the final '$'. The bound
-          * keeps the scan inside the pattern (e.g. for "$" or "\\$"). */
-         size_t backslashes = 0;
-         while (backslashes + 1 < pat_len &&
-                _pat[pat_len - 2 - backslashes] == '\\')
-             backslashes++;
-         /* An odd count means the '$' is escaped, i.e. a literal dollar. */
-         suffix = (backslashes % 2) != 0;
-     }
-     if (!prefix && !suffix)
-         return pcre_compile(_pat, opt, &error, &erroffset, NULL);
-     char* wrapped = (char*)malloc(pat_len + (size_t)(prefix + suffix) + 1);
-     if (!wrapped)
-         return NULL;
-     char* q = wrapped;
-     if (prefix)
-         *q++ = '^';
-     memcpy(q, _pat, pat_len);
-     q += pat_len;
-     if (suffix)
-         *q++ = '$';
-     *q = '\0';
-     pcre* re = pcre_compile(wrapped, opt, &error, &erroffset, NULL);
-     free(wrapped);
-     return re;
- }
- /* The following functions return a heap-allocated vector/table, which must
- * be freed by the caller after use. */
- /*
-  * Compile `pat` as given and search `s` with re_search2().
-  *
-  * pat            pattern text
-  * s, len         subject buffer and its length
-  * out_vec_number forwarded to re_search2()
-  * opt            options for pcre_compile(), also forwarded
-  *
-  * Returns the offset vector from re_search2() (caller frees), or NULL when
-  * compilation fails or nothing is found.
-  */
- int* pcre_search(const char* pat,
-                  const char* s,
-                  int len,
-                  int* out_vec_number,
-                  int opt) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     int* offsets = re_search2(compiled, s, len, out_vec_number, opt);
-     pcre_free(compiled);
-     return offsets;
- }
- /*
-  * Find the first non-empty match of the compiled pattern `re` in `s`.
-  *
-  * re             compiled pattern
-  * s, len         subject buffer and its length
-  * out_vec_number if non-NULL, receives the group count reported by
-  *                _re_get_vec_table()
-  * opt            not passed to pcre_exec(), which is called with options 0
-  *
-  * Returns a heap-allocated offset vector (caller frees) or NULL on no
-  * match, on error, or when only empty matches exist.
-  */
- int* re_search2(pcre* re,
-                 const char* s,
-                 int len,
-                 int* out_vec_number,
-                 int opt) {
-     int groups = 0;
-     int* offsets = _re_get_vec_table(re, &groups);
-     if (out_vec_number)
-         *out_vec_number = groups;
-     if (!offsets)
-         return NULL;
-     const int slots = groups * 3;
-     int from = 0;
-     while (1) {
-         int rc = pcre_exec(re, NULL, s, len, from, 0, offsets, slots);
-         int failed = (rc == PCRE_ERROR_NOMATCH) || (rc <= 0);
-         if (!failed && offsets[0] != offsets[1])
-             return offsets;
-         /* On an empty match, move the start forward; otherwise give up. */
-         if (failed || ++from >= len)
-             break;
-     }
-     free(offsets);
-     return NULL;
- }
- /*
-  * Compile `pat` and collect all matches in `s` via re_searchall2().
-  *
-  * pat            pattern text
-  * s, len         subject buffer and its length
-  * out_number     forwarded to re_searchall2() (number of matches)
-  * out_vec_number forwarded to re_searchall2() (group count)
-  * opt            options for pcre_compile(), also forwarded
-  *
-  * Returns the array produced by re_searchall2() (release with
-  * re_free_searchall()), or NULL when compilation fails.
-  */
- int** re_searchall(const char* pat,
-                    const char* s,
-                    int len,
-                    int* out_number,
-                    int* out_vec_number,
-                    int opt) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     int** matches =
-         re_searchall2(compiled, s, len, out_number, out_vec_number, opt);
-     pcre_free(compiled);
-     return matches;
- }
- /*
-  * Collect every non-empty, non-overlapping match of `re` in `s`.
-  *
-  * re             compiled pattern
-  * s, len         subject buffer and its length
-  * out_number     if non-NULL, receives the number of matches (0 on error)
-  * out_vec_number if non-NULL, receives the group count reported by
-  *                _re_get_vec_table()
-  * opt            not passed to pcre_exec(), which is called with options 0
-  *
-  * Returns a heap array of `*out_number` offset vectors (release with
-  * re_free_searchall()). Returns NULL when there are no matches or when an
-  * allocation or pcre_exec() error occurs.
-  */
- int** re_searchall2(pcre* re,
-                     const char* s,
-                     int len,
-                     int* out_number,
-                     int* out_vec_number,
-                     int opt) {
-     int start_offset = 0;
-     int** vecs = NULL;
-     int vec_cap = 4;
-     int vec_n = 0;
-     int* vec = NULL;
-     int group_n = 0; /* ints per offset vector once the first one exists */
-     if (out_number)
-         *out_number = 0;
-     for (;;) {
-         if (group_n) {
-             vec = (int*)malloc(group_n * sizeof(int));
-         } else {
-             vec = _re_get_vec_table(re, &group_n);
-             if (out_vec_number)
-                 *out_vec_number = group_n;
-             group_n *= 3;
-         }
-         if (!vec)
-             goto e_er;
-         int found = 0;
-         for (;;) {
-             int rc =
-                 pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n);
-             if (rc == PCRE_ERROR_NOMATCH)
-                 break;
-             if (rc <= 0)
-                 goto e_er;
-             if (vec[0] != vec[1]) {
-                 found = 1;
-                 break;
-             }
-             /* Empty match: skip ahead. Reaching the end of the subject just
-              * means there is nothing more to find; it must not throw away
-              * the matches already collected. */
-             if (++start_offset >= len)
-                 break;
-         }
-         if (!found) {
-             free(vec);
-             if (out_number)
-                 *out_number = vec_n;
-             return vecs;
-         }
-         if (!vecs) {
-             vecs = (int**)malloc(sizeof(int*) * vec_cap);
-             if (!vecs)
-                 goto e_er;
-         }
-         if (vec_n >= vec_cap) {
-             void* grown = realloc(vecs, (size_t)vec_cap * 2 * sizeof(int*));
-             if (!grown)
-                 goto e_er;
-             vecs = (int**)grown;
-             vec_cap *= 2;
-         }
-         vecs[vec_n++] = vec;
-         start_offset = vec[1];
-         vec = NULL; /* now owned by vecs */
-     }
- e_er:
-     free(vec);
-     if (vecs) {
-         for (int j = 0; j < vec_n; j++)
-             free(vecs[j]);
-         free(vecs);
-     }
-     return NULL;
- }
- /*
-  * Release an array of offset vectors.
-  *
-  * vecs  array of `n` heap-allocated vectors; NULL is accepted and ignored
-  * n     number of entries in `vecs`
-  */
- void re_free_searchall(int** vecs, int n) {
-     if (vecs == NULL)
-         return;
-     /* free(NULL) is a no-op, so empty slots need no special handling. */
-     for (int i = n - 1; i >= 0; i--)
-         free(vecs[i]);
-     free(vecs);
- }
- /* The following functions return a heap-allocated string, which must be
- * freed by the caller after use. */
- /*
-  * Copy the whole-match text of each offset vector into its own string.
-  *
-  * s     subject buffer the offsets refer to
-  * vecs  array of `n` offset vectors; only elements [0] and [1] are read
-  * n     number of vectors
-  *
-  * Returns a malloc'd array of `n` NUL-terminated malloc'd strings (release
-  * with re_free_findall()), or NULL when `vecs` is NULL or an allocation
-  * fails.
-  */
- char** _re_extract_substring(const char* s, int** vecs, int n) {
-     if (!vecs || n < 0)
-         return NULL;
-     /* Use malloc() throughout: these buffers are released with free(), both
-      * below and in re_free_findall(), so they must not come from
-      * pcre_malloc, which may be a different allocator. */
-     char** res = (char**)malloc(sizeof(char*) * (size_t)n);
-     if (!res)
-         return NULL;
-     for (int j = 0; j < n; j++) {
-         const int* v = vecs[j];
-         int len = v[1] - v[0];
-         char* p = (char*)malloc((size_t)len + 1);
-         if (!p) {
-             /* Undo the strings created so far. */
-             while (j-- > 0)
-                 free(res[j]);
-             free(res);
-             return NULL;
-         }
-         memcpy(p, s + v[0], len);
-         p[len] = '\0';
-         res[j] = p;
-     }
-     return res;
- }
- /*
-  * Compile `pat` and return the text of its first match in `s`.
-  *
-  * pat     pattern text
-  * s, len  subject buffer and its length
-  * opt     options for pcre_compile(), also forwarded to re_find2()
-  *
-  * Returns the string produced by re_find2() (caller frees), or NULL when
-  * compilation fails or nothing is found.
-  */
- char* re_find(const char* pat, const char* s, int len, int opt) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     char* text = re_find2(compiled, s, len, opt);
-     pcre_free(compiled);
-     return text;
- }
- /*
-  * Return a copy of the first non-empty match of `re` in `s`.
-  *
-  * re      compiled pattern
-  * s, len  subject buffer and its length
-  * opt     not passed to pcre_exec(), which is called with options 0
-  *
-  * Returns a malloc'd, NUL-terminated copy of the matched text (caller
-  * frees), or NULL on no match, on error, or on allocation failure.
-  */
- char* re_find2(pcre* re, const char* s, int len, int opt) {
-     int groups = 0;
-     int* offsets = _re_get_vec_table(re, &groups);
-     if (!offsets)
-         return NULL;
-     const int slots = groups * 3;
-     char* text = NULL;
-     int from = 0;
-     while (1) {
-         int rc = pcre_exec(re, NULL, s, len, from, 0, offsets, slots);
-         if (rc == PCRE_ERROR_NOMATCH || rc <= 0)
-             break;
-         if (offsets[0] == offsets[1]) {
-             /* Empty match: look again one position further on. */
-             if (++from >= len)
-                 break;
-             continue;
-         }
-         int found_len = offsets[1] - offsets[0];
-         text = (char*)malloc(found_len + 1);
-         if (text) {
-             memcpy(text, s + offsets[0], found_len);
-             text[found_len] = 0;
-         }
-         break;
-     }
-     /* The offset vector is scratch space in every outcome. */
-     free(offsets);
-     return text;
- }
- /*
-  * Compile `pat` and return the text of every match in `s`.
-  *
-  * pat         pattern text
-  * s, len      subject buffer and its length
-  * out_number  forwarded to re_findall2() (number of strings returned)
-  * opt         options for pcre_compile(), also forwarded
-  *
-  * Returns the array produced by re_findall2() (release with
-  * re_free_findall()), or NULL when compilation fails.
-  */
- char** pcre_findall(const char* pat,
-                     const char* s,
-                     int len,
-                     int* out_number,
-                     int opt) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     char** texts = re_findall2(compiled, s, len, out_number, opt);
-     pcre_free(compiled);
-     return texts;
- }
- /*
-  * Return the text of every match of `re` in `s`.
-  *
-  * re          compiled pattern
-  * s, len      subject buffer and its length
-  * out_number  if non-NULL, receives the number of matches found
-  * opt         forwarded to re_searchall2()
-  *
-  * Returns an array of `*out_number` heap strings (release with
-  * re_free_findall()), or NULL when re_searchall2() returns no matches or
-  * when copying the substrings fails.
-  */
- char** re_findall2(pcre* re, const char* s, int len, int* out_number, int opt) {
-     int out_vec_number;
-     /* Count into a local so a NULL out_number is never dereferenced. */
-     int count = 0;
-     int** vecs = re_searchall2(re, s, len, &count, &out_vec_number, opt);
-     if (out_number)
-         *out_number = count;
-     if (!vecs)
-         return NULL;
-     char** res = _re_extract_substring(s, vecs, count);
-     /* The offset vectors are no longer needed whether or not the copy
-      * succeeded. */
-     re_free_searchall(vecs, count);
-     return res;
- }
- /*
-  * Release an array of strings.
-  *
-  * ss  array of `n` heap-allocated strings; NULL is accepted and ignored
-  * n   number of entries in `ss`
-  */
- void re_free_findall(char** ss, int n) {
-     if (ss == NULL)
-         return;
-     int k = 0;
-     /* free(NULL) is harmless, so empty slots are simply passed through. */
-     while (k < n)
-         free(ss[k++]);
-     free(ss);
- }
- /*
-  * Compile `pat` and substitute its matches in `s` through re_sub2().
-  *
-  * pat     pattern text
-  * to      replacement text
-  * s, len  subject buffer and its length
-  * opt     options for pcre_compile(), also forwarded
-  *
-  * Returns whatever re_sub2() returns, or NULL when compilation fails.
-  */
- char* pcre_sub(const char* pat,
-                const char* to,
-                const char* s,
-                int len,
-                int opt) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     char* replaced = re_sub2(compiled, to, s, len, opt);
-     pcre_free(compiled);
-     return replaced;
- }
- /*
-  * Compile `pat` and substitute its matches in `s` through re_subn2().
-  *
-  * pat             pattern text
-  * to              replacement text
-  * s, len          subject buffer and its length
-  * n               replacement limit, forwarded to re_subn2()
-  * opt             options for pcre_compile(), also forwarded
-  * out_repl_times  forwarded to re_subn2()
-  *
-  * Returns whatever re_subn2() returns, or NULL when compilation fails.
-  */
- char* pcre_subn(const char* pat,
-                 const char* to,
-                 const char* s,
-                 int len,
-                 int n,
-                 int opt,
-                 int* out_repl_times) {
-     const char* err_text;
-     int err_pos;
-     pcre* compiled = pcre_compile(pat, opt, &err_text, &err_pos, NULL);
-     if (compiled == NULL)
-         return NULL;
-     char* replaced = re_subn2(compiled, to, s, len, n, opt, out_repl_times);
-     pcre_free(compiled);
-     return replaced;
- }
- /*
-  * Replace matches of `re` in `s` with the replacement text `to`.
-  *
-  * In `to`, a backslash followed by one or two digits inserts the text of
-  * that capture group, and two consecutive backslashes produce one literal
-  * backslash.
-  *
-  * re              compiled pattern
-  * to              NUL-terminated replacement text
-  * s, len          subject buffer and its length
-  * n               maximum number of replacements; 0 replaces every match, a
-  *                 negative value replaces nothing
-  * opt             forwarded to re_searchall2()
-  * out_repl_times  if non-NULL, receives the number of replacements made
-  *
-  * Returns a malloc'd, NUL-terminated string that the caller must free() —
-  * including when nothing was replaced, in which case it is a copy of `s`.
-  * Returns NULL on allocation failure, when the helper pattern cannot be
-  * compiled, or when `to` refers to group 0 or to a group the pattern does
-  * not have.
-  */
- char* re_subn2(pcre* re,
-                const char* to,
-                const char* s,
-                int len,
-                int n,
-                int opt,
-                int* out_repl_times) {
-     /* Matches a doubled backslash or a backslash plus 1-2 digits. */
-     const char* p2 = "(\\\\\\\\|\\\\\\d{1,2})";
-     const char* error;
-     int erroffset;
-     int group_n = 0;
-     int vcs1_n = 0, vcs2_n = 0;
-     int** vcs1 = re_searchall2(re, s, len, &vcs1_n, &group_n, opt);
-     int** vcs2 = NULL;
-     pcre* re2 = NULL;
-     char* new_s = NULL;
-     int len_to, remain_size, remain_length2, pi = 0, qi = 0;
-     /* A negative limit must not reach the size arithmetic below, where it
-      * would yield an undersized buffer. */
-     int match_limit = vcs1_n;
-     if (n < 0)
-         match_limit = 0;
-     else if (n > 0 && n < vcs1_n)
-         match_limit = n;
-     if (match_limit == 0) {
-         /* Nothing to replace: return an owned copy so the caller can always
-          * free() the result. */
-         size_t keep = len > 0 ? (size_t)len : 0;
-         new_s = (char*)malloc(keep + 1);
-         if (new_s) {
-             memcpy(new_s, s, keep);
-             new_s[keep] = '\0';
-             if (out_repl_times)
-                 *out_repl_times = 0;
-         }
-         goto cleanup;
-     }
-     re2 = pcre_compile(p2, 0, &error, &erroffset, NULL);
-     if (!re2)
-         goto cleanup;
-     len_to = (int)strlen(to);
-     vcs2 = re_searchall2(re2, to, len_to, &vcs2_n, NULL, 0);
-     pcre_free(re2);
-     re2 = NULL;
-     /* Classify every reference in `to`. Slot [2] of each vector is reused as
-      * a tag: 0 for an escaped backslash, otherwise the group number.
-      * remain_length2 ends up as the length of the literal part of `to`. */
-     remain_length2 = len_to;
-     for (int i = 0; i < vcs2_n; i++) {
-         int* vc = vcs2[i];
-         int vc0 = vc[0] + 1; /* character after the leading backslash */
-         if (to[vc0] == '\\') {
-             vc[2] = 0;
-             remain_length2 -= 2;
-             continue;
-         }
-         int wanted_number;
-         if (vc[1] - vc0 == 1) {
-             wanted_number = to[vc0] - '0';
-             remain_length2 -= 2;
-         } else {
-             wanted_number = (to[vc0] - '0') * 10 + to[vc0 + 1] - '0';
-             remain_length2 -= 3;
-         }
-         if (wanted_number <= 0 || wanted_number >= group_n)
-             goto cleanup;
-         vc[2] = wanted_number;
-     }
-     /* Size of the result: the subject, minus each replaced match, plus one
-      * expanded replacement per match. */
-     remain_size = len + remain_length2 * match_limit;
-     for (int i = 0; i < match_limit; i++) {
-         int* vc = vcs1[i];
-         remain_size -= vc[1] - vc[0];
-         for (int j = 0; j < vcs2_n; j++) {
-             int* v2 = vcs2[j];
-             if (v2[2])
-                 remain_size += GetGroupLen(vc, v2[2]);
-             else
-                 remain_size++;
-         }
-     }
-     new_s = (char*)malloc(remain_size + 1);
-     if (!new_s)
-         goto cleanup;
-     for (int i = 0; i < match_limit; i++) {
-         int* vc = vcs1[i];
-         /* Subject text between the previous match and this one. */
-         memcpy(new_s + pi, s + qi, vc[0] - qi);
-         pi += vc[0] - qi;
-         int m_start = 0, m_len = 0;
-         for (int j = 0; j < vcs2_n; j++) {
-             int* v2 = vcs2[j];
-             /* Literal replacement text before this reference. */
-             m_len = v2[0] - m_start;
-             memcpy(new_s + pi, to + m_start, m_len);
-             pi += m_len;
-             int to_group = v2[2];
-             if (to_group) {
-                 int to_group_at = vc[to_group * 2];
-                 int g_l = vc[to_group * 2 + 1] - to_group_at;
-                 if (g_l > 0) {
-                     memcpy(new_s + pi, s + to_group_at, g_l);
-                     pi += g_l;
-                 }
-             } else {
-                 new_s[pi++] = '\\';
-             }
-             m_start = v2[1];
-         }
-         /* Literal replacement text after the last reference. */
-         m_len = len_to - m_start;
-         memcpy(new_s + pi, to + m_start, m_len);
-         pi += m_len;
-         qi = vc[1];
-     }
-     /* Remainder of the subject after the last replaced match. */
-     if (len - qi > 0) {
-         memcpy(new_s + pi, s + qi, len - qi);
-         pi += len - qi;
-     }
-     new_s[pi] = '\0';
-     if (out_repl_times)
-         *out_repl_times = match_limit;
- cleanup:
-     /* Both helpers accept NULL; new_s is still NULL on every failure. */
-     re_free_searchall(vcs1, vcs1_n);
-     re_free_searchall(vcs2, vcs2_n);
-     return new_s;
- }
- /*
-  * Substitute matches of `re` in `s` with `to`.
-  *
-  * Delegates to re_subn2() with a limit of 0 and no replacement counter.
-  *
-  * Returns whatever re_subn2() returns.
-  */
- char* re_sub2(pcre* re, const char* to, const char* s, int len, int opt) {
-     const int no_limit = 0;
-     int* const no_counter = NULL;
-     return re_subn2(re, to, s, len, no_limit, opt, no_counter);
- }
|