regex_executor.h 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2023 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_executor.h
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. // FIXME convert comments to doxygen format.
  26. namespace std _GLIBCXX_VISIBILITY(default)
  27. {
  28. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  29. namespace __detail
  30. {
  31. /**
  32. * @addtogroup regex-detail
  33. * @{
  34. */
  35. /**
  36. * @brief Takes a regex and an input string and does the matching.
  37. *
  38. * The %_Executor class has two modes: DFS mode and BFS mode, controlled
  39. * by the template parameter %__dfs_mode.
  40. */
  41. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  42. bool __dfs_mode>
  43. class _Executor
  44. {
  45. using __search_mode = integral_constant<bool, __dfs_mode>;
  46. using __dfs = true_type;
  47. using __bfs = false_type;
  48. enum class _Match_mode : unsigned char { _Exact, _Prefix };
  49. public:
  50. typedef typename iterator_traits<_BiIter>::value_type _CharT;
  51. typedef basic_regex<_CharT, _TraitsT> _RegexT;
  52. typedef _GLIBCXX_STD_C::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
  53. typedef regex_constants::match_flag_type _FlagT;
  54. typedef typename _TraitsT::char_class_type _ClassT;
  55. typedef _NFA<_TraitsT> _NFAT;
  56. public:
  57. _Executor(_BiIter __begin,
  58. _BiIter __end,
  59. _ResultsVec& __results,
  60. const _RegexT& __re,
  61. _FlagT __flags)
  62. : _M_cur_results(__results.get_allocator()),
  63. _M_begin(__begin),
  64. _M_end(__end),
  65. _M_re(__re),
  66. _M_nfa(*__re._M_automaton),
  67. _M_results(__results),
  68. _M_rep_count(_M_nfa.size()),
  69. _M_states(_M_nfa._M_start(), _M_nfa.size()),
  70. _M_flags(__flags)
  71. {
  72. using namespace regex_constants;
  73. if (__flags & match_prev_avail) // ignore not_bol and not_bow
  74. _M_flags &= ~(match_not_bol | match_not_bow);
  75. }
  76. // Set matched when string exactly matches the pattern.
  77. bool
  78. _M_match()
  79. {
  80. _M_current = _M_begin;
  81. return _M_main(_Match_mode::_Exact);
  82. }
  83. // Set matched when some prefix of the string matches the pattern.
  84. bool
  85. _M_search_from_first()
  86. {
  87. _M_current = _M_begin;
  88. return _M_main(_Match_mode::_Prefix);
  89. }
  90. bool
  91. _M_search();
  92. private:
  93. void
  94. _M_rep_once_more(_Match_mode __match_mode, _StateIdT);
  95. void
  96. _M_handle_repeat(_Match_mode, _StateIdT);
  97. void
  98. _M_handle_subexpr_begin(_Match_mode, _StateIdT);
  99. void
  100. _M_handle_subexpr_end(_Match_mode, _StateIdT);
  101. void
  102. _M_handle_line_begin_assertion(_Match_mode, _StateIdT);
  103. void
  104. _M_handle_line_end_assertion(_Match_mode, _StateIdT);
  105. void
  106. _M_handle_word_boundary(_Match_mode, _StateIdT);
  107. void
  108. _M_handle_subexpr_lookahead(_Match_mode, _StateIdT);
  109. void
  110. _M_handle_match(_Match_mode, _StateIdT);
  111. void
  112. _M_handle_backref(_Match_mode, _StateIdT);
  113. void
  114. _M_handle_accept(_Match_mode, _StateIdT);
  115. void
  116. _M_handle_alternative(_Match_mode, _StateIdT);
  117. void
  118. _M_dfs(_Match_mode __match_mode, _StateIdT __start);
  119. bool
  120. _M_main(_Match_mode __match_mode)
  121. { return _M_main_dispatch(__match_mode, __search_mode{}); }
  122. bool
  123. _M_main_dispatch(_Match_mode __match_mode, __dfs);
  124. bool
  125. _M_main_dispatch(_Match_mode __match_mode, __bfs);
  126. bool
  127. _M_is_word(_CharT __ch) const
  128. {
  129. static const _CharT __s[2] = { 'w' };
  130. return _M_re._M_automaton->_M_traits.isctype
  131. (__ch, _M_re._M_automaton->_M_traits.lookup_classname(__s, __s+1));
  132. }
  133. bool
  134. _M_at_begin() const
  135. {
  136. if (_M_current == _M_begin)
  137. {
  138. // match_not_bol means ^ does not match [_M_begin,_M_begin)
  139. if (_M_flags & regex_constants::match_not_bol)
  140. return false;
  141. // match_prev_avail means _M_begin is not the start of the input.
  142. if (_M_flags & regex_constants::match_prev_avail)
  143. {
  144. // For ECMAScript multiline matches, check if the previous
  145. // character is a line terminator.
  146. if (_M_match_multiline())
  147. return _M_is_line_terminator(*std::prev(_M_current));
  148. else
  149. return false;
  150. }
  151. else // ^ matches at _M_begin
  152. return true;
  153. }
  154. else if (_M_match_multiline())
  155. return _M_is_line_terminator(*std::prev(_M_current));
  156. else
  157. return false;
  158. }
  159. bool
  160. _M_at_end() const
  161. {
  162. if (_M_current == _M_end)
  163. return !(_M_flags & regex_constants::match_not_eol);
  164. else if (_M_match_multiline())
  165. return _M_is_line_terminator(*_M_current);
  166. else
  167. return false;
  168. }
  169. bool
  170. _M_word_boundary() const;
  171. bool
  172. _M_lookahead(_StateIdT __next);
  173. bool
  174. _M_is_line_terminator(_CharT __c) const
  175. {
  176. const auto& __traits = _M_re._M_automaton->_M_traits;
  177. const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc());
  178. const char __n{ __ct.narrow(__c, ' ') };
  179. if (__n == '\n')
  180. return true;
  181. if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript)
  182. {
  183. if (__n == '\r')
  184. return true;
  185. // FIXME: U+2028 (line separator) and U+2029 (paragraph separator)
  186. }
  187. return false;
  188. }
  189. bool
  190. _M_match_multiline() const noexcept
  191. {
  192. constexpr auto __m
  193. = regex_constants::ECMAScript | regex_constants::__multiline;
  194. return (_M_re._M_automaton->_M_options() & __m) == __m;
  195. }
  196. // Holds additional information used in BFS-mode.
  197. template<typename _SearchMode, typename _ResultsVec>
  198. struct _State_info;
  199. template<typename _ResultsVec>
  200. struct _State_info<__bfs, _ResultsVec>
  201. {
  202. explicit
  203. _State_info(_StateIdT __start, size_t __n)
  204. : _M_visited_states(new bool[__n]()), _M_start(__start)
  205. { }
  206. ~_State_info() { delete[] _M_visited_states; }
  207. _State_info(const _State_info&) = delete;
  208. _State_info& operator=(const _State_info&) = delete;
  209. bool _M_visited(_StateIdT __i)
  210. {
  211. if (_M_visited_states[__i])
  212. return true;
  213. _M_visited_states[__i] = true;
  214. return false;
  215. }
  216. void _M_queue(_StateIdT __i, const _ResultsVec& __res)
  217. { _M_match_queue.emplace_back(__i, __res); }
  218. // Dummy implementations for BFS mode.
  219. _BiIter* _M_get_sol_pos() { return nullptr; }
  220. // Saves states that need to be considered for the next character.
  221. _GLIBCXX_STD_C::vector<pair<_StateIdT, _ResultsVec>> _M_match_queue;
  222. // Indicates which states are already visited.
  223. bool* _M_visited_states;
  224. // To record current solution.
  225. _StateIdT _M_start;
  226. };
  227. template<typename _ResultsVec>
  228. struct _State_info<__dfs, _ResultsVec>
  229. {
  230. explicit
  231. _State_info(_StateIdT __start, size_t) : _M_start(__start)
  232. { }
  233. // Dummy implementations for DFS mode.
  234. bool _M_visited(_StateIdT) const { return false; }
  235. void _M_queue(_StateIdT, const _ResultsVec&) { }
  236. _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
  237. // To record current solution.
  238. _StateIdT _M_start;
  239. _BiIter _M_sol_pos;
  240. };
  241. public:
  242. _ResultsVec _M_cur_results;
  243. _BiIter _M_current;
  244. _BiIter _M_begin;
  245. const _BiIter _M_end;
  246. const _RegexT& _M_re;
  247. const _NFAT& _M_nfa;
  248. _ResultsVec& _M_results;
  249. _GLIBCXX_STD_C::vector<pair<_BiIter, int>> _M_rep_count;
  250. _State_info<__search_mode, _ResultsVec> _M_states;
  251. _FlagT _M_flags;
  252. // Do we have a solution so far?
  253. bool _M_has_sol;
  254. };
  255. ///@} regex-detail
  256. } // namespace __detail
  257. _GLIBCXX_END_NAMESPACE_VERSION
  258. } // namespace std
  259. #include <bits/regex_executor.tcc>