sre_parse.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. from sre_constants import *
# Characters that are not plain literals outside a character class; _parse()
# treats every other non-escape character as a LITERAL.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repetition qualifier for the preceding item.
REPEAT_CHARS = "*+?{"

# Character sets used for membership tests while scanning escapes and counts.
DIGITS = frozenset("0123456789")

OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Characters skipped by _parse() when parsing in verbose mode.
WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcodes that SubPattern.getwidth() treats as repetitions / as matching
# exactly one character.
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})

# Escapes that stand for one literal character, keyed by the two-character
# escape token and mapped to a ready-made (LITERAL, code point) item.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Escapes that stand for a position assertion (AT) or a character category
# (IN).  Note that r"\b" appears both here and in ESCAPES: _escape() consults
# this table first, _class_escape() consults ESCAPES first.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Inline flag letters accepted in "(?...)" constructs, mapped to flag bits.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# Flags that _parse_flags() refuses to combine with each other or turn off.
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
# Flags that _parse_flags() refuses to switch on or off for a single group.
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
  58. class Verbose(Exception):
  59. pass
  60. class Pattern:
  61. # master pattern object. keeps track of global attributes
  62. def __init__(self):
  63. self.flags = 0
  64. self.groupdict = {}
  65. self.groupwidths = [None] # group 0
  66. self.lookbehindgroups = None
  67. @property
  68. def groups(self):
  69. return len(self.groupwidths)
  70. def opengroup(self, name=None):
  71. gid = self.groups
  72. self.groupwidths.append(None)
  73. if self.groups > MAXGROUPS:
  74. raise error("too many groups")
  75. if name is not None:
  76. ogid = self.groupdict.get(name, None)
  77. if ogid is not None:
  78. raise error("redefinition of group name %r as group %d; "
  79. "was group %d" % (name, gid, ogid))
  80. self.groupdict[name] = gid
  81. return gid
  82. def closegroup(self, gid, p):
  83. self.groupwidths[gid] = p.getwidth()
  84. def checkgroup(self, gid):
  85. return gid < self.groups and self.groupwidths[gid] is not None
  86. def checklookbehindgroup(self, gid, source):
  87. if self.lookbehindgroups is not None:
  88. if not self.checkgroup(gid):
  89. raise source.error('cannot refer to an open group')
  90. if gid >= self.lookbehindgroups:
  91. raise source.error('cannot refer to group defined in the same '
  92. 'lookbehind subpattern')
  93. class SubPattern:
  94. # a subpattern, in intermediate form
  95. def __init__(self, pattern, data=None):
  96. self.pattern = pattern
  97. if data is None:
  98. data = []
  99. self.data = data
  100. self.width = None
  101. def dump(self, level=0):
  102. nl = True
  103. seqtypes = (tuple, list)
  104. for op, av in self.data:
  105. print(level*" " + str(op), end='')
  106. if op is IN:
  107. # member sublanguage
  108. print()
  109. for op, a in av:
  110. print((level+1)*" " + str(op), a)
  111. elif op is BRANCH:
  112. print()
  113. for i, a in enumerate(av[1]):
  114. if i:
  115. print(level*" " + "OR")
  116. a.dump(level+1)
  117. elif op is GROUPREF_EXISTS:
  118. condgroup, item_yes, item_no = av
  119. print('', condgroup)
  120. item_yes.dump(level+1)
  121. if item_no:
  122. print(level*" " + "ELSE")
  123. item_no.dump(level+1)
  124. elif isinstance(av, seqtypes):
  125. nl = False
  126. for a in av:
  127. if isinstance(a, SubPattern):
  128. if not nl:
  129. print()
  130. a.dump(level+1)
  131. nl = True
  132. else:
  133. if not nl:
  134. print(' ', end='')
  135. print(a, end='')
  136. nl = False
  137. if not nl:
  138. print()
  139. else:
  140. print('', av)
  141. def __repr__(self):
  142. return repr(self.data)
  143. def __len__(self):
  144. return len(self.data)
  145. def __delitem__(self, index):
  146. del self.data[index]
  147. def __getitem__(self, index):
  148. if isinstance(index, slice):
  149. return SubPattern(self.pattern, self.data[index])
  150. return self.data[index]
  151. def __setitem__(self, index, code):
  152. self.data[index] = code
  153. def insert(self, index, code):
  154. self.data.insert(index, code)
  155. def append(self, code):
  156. self.data.append(code)
  157. def getwidth(self):
  158. # determine the width (min, max) for this subpattern
  159. if self.width is not None:
  160. return self.width
  161. lo = hi = 0
  162. for op, av in self.data:
  163. if op is BRANCH:
  164. i = MAXREPEAT - 1
  165. j = 0
  166. for av in av[1]:
  167. l, h = av.getwidth()
  168. i = min(i, l)
  169. j = max(j, h)
  170. lo = lo + i
  171. hi = hi + j
  172. elif op is CALL:
  173. i, j = av.getwidth()
  174. lo = lo + i
  175. hi = hi + j
  176. elif op is SUBPATTERN:
  177. i, j = av[-1].getwidth()
  178. lo = lo + i
  179. hi = hi + j
  180. elif op in _REPEATCODES:
  181. i, j = av[2].getwidth()
  182. lo = lo + i * av[0]
  183. hi = hi + j * av[1]
  184. elif op in _UNITCODES:
  185. lo = lo + 1
  186. hi = hi + 1
  187. elif op is GROUPREF:
  188. i, j = self.pattern.groupwidths[av]
  189. lo = lo + i
  190. hi = hi + j
  191. elif op is GROUPREF_EXISTS:
  192. i, j = av[1].getwidth()
  193. if av[2] is not None:
  194. l, h = av[2].getwidth()
  195. i = min(i, l)
  196. j = max(j, h)
  197. else:
  198. i = 0
  199. lo = lo + i
  200. hi = hi + j
  201. elif op is SUCCESS:
  202. break
  203. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  204. return self.width
  205. class Tokenizer:
  206. def __init__(self, string):
  207. self.istext = isinstance(string, str)
  208. self.string = string
  209. if not self.istext:
  210. string = str(string, 'latin1')
  211. self.decoded_string = string
  212. self.index = 0
  213. self.next = None
  214. self.__next()
  215. def __next(self):
  216. index = self.index
  217. try:
  218. char = self.decoded_string[index]
  219. except IndexError:
  220. self.next = None
  221. return
  222. if char == "\\":
  223. index += 1
  224. try:
  225. char += self.decoded_string[index]
  226. except IndexError:
  227. raise error("bad escape (end of pattern)",
  228. self.string, len(self.string) - 1) from None
  229. self.index = index + 1
  230. self.next = char
  231. def match(self, char):
  232. if char == self.next:
  233. self.__next()
  234. return True
  235. return False
  236. def get(self):
  237. this = self.next
  238. self.__next()
  239. return this
  240. def getwhile(self, n, charset):
  241. result = ''
  242. for _ in range(n):
  243. c = self.next
  244. if c not in charset:
  245. break
  246. result += c
  247. self.__next()
  248. return result
  249. def getuntil(self, terminator):
  250. result = ''
  251. while True:
  252. c = self.next
  253. self.__next()
  254. if c is None:
  255. if not result:
  256. raise self.error("missing group name")
  257. raise self.error("missing %s, unterminated name" % terminator,
  258. len(result))
  259. if c == terminator:
  260. if not result:
  261. raise self.error("missing group name", 1)
  262. break
  263. result += c
  264. return result
  265. @property
  266. def pos(self):
  267. return self.index - len(self.next or '')
  268. def tell(self):
  269. return self.index - len(self.next or '')
  270. def seek(self, index):
  271. self.index = index
  272. self.__next()
  273. def error(self, msg, offset=0):
  274. return error(msg, self.string, self.tell() - offset)
def _class_escape(source, escape):
    """Translate an escape token found inside a character class.

    *source* is the tokenizer positioned just after *escape*; extra digits
    belonging to the escape are consumed from it.  Returns an ``(op, av)``
    item.  Raises the error built by ``source.error`` for incomplete,
    out-of-range or unknown escapes.
    """
    # handle escape code inside character class
    # ESCAPES is consulted first, so r"\b" means backspace here.
    code = ESCAPES.get(escape)
    if code:
        return code
    # Only category escapes (IN items) are accepted from CATEGORIES; position
    # assertions fall through to the checks below.
    code = CATEGORIES.get(escape)
    if code and code[0] is IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            c = int(escape[1:], 8)
            if c > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, c
        elif c in DIGITS:
            # Remaining digits are rejected; the ValueError is turned into
            # the generic "bad escape" error at the bottom.
            raise ValueError
        if len(escape) == 2:
            if c in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            # Any other escaped character stands for itself.
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
def _escape(source, escape, state):
    """Translate an escape token found outside a character class.

    *source* is the tokenizer positioned just after *escape*; extra digits
    belonging to the escape are consumed from it.  *state* is the pattern
    state used to validate group references.  Returns an ``(op, av)`` item.
    Raises the error built by ``source.error`` for incomplete, out-of-range
    or unknown escapes and for invalid group references.
    """
    # handle escape code in expression
    # CATEGORIES is consulted first, so r"\b" means a word boundary here.
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    c = int(escape[1:], 8)
                    if c > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, c
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise source.error("cannot refer to an open group",
                                       len(escape))
                state.checklookbehindgroup(group, source)
                return GROUPREF, group
            raise source.error("invalid group reference %d" % group, len(escape) - 1)
        if len(escape) == 2:
            if c in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            # Any other escaped character stands for itself.
            return LITERAL, ord(escape[1])
    except ValueError:
        # Raised by chr() above for an invalid code point; reported as a
        # generic bad escape below.
        pass
    raise source.error("bad escape %s" % escape, len(escape))
  386. def _uniq(items):
  387. return list(dict.fromkeys(items))
  388. def _parse_sub(source, state, verbose, nested):
  389. # parse an alternation: a|b|c
  390. items = []
  391. itemsappend = items.append
  392. sourcematch = source.match
  393. start = source.tell()
  394. while True:
  395. itemsappend(_parse(source, state, verbose, nested + 1,
  396. not nested and not items))
  397. if not sourcematch("|"):
  398. break
  399. if len(items) == 1:
  400. return items[0]
  401. subpattern = SubPattern(state)
  402. # check if all items share a common prefix
  403. while True:
  404. prefix = None
  405. for item in items:
  406. if not item:
  407. break
  408. if prefix is None:
  409. prefix = item[0]
  410. elif item[0] != prefix:
  411. break
  412. else:
  413. # all subitems start with a common "prefix".
  414. # move it out of the branch
  415. for item in items:
  416. del item[0]
  417. subpattern.append(prefix)
  418. continue # check next one
  419. break
  420. # check if the branch can be replaced by a character set
  421. set = []
  422. for item in items:
  423. if len(item) != 1:
  424. break
  425. op, av = item[0]
  426. if op is LITERAL:
  427. set.append((op, av))
  428. elif op is IN and av[0][0] is not NEGATE:
  429. set.extend(av)
  430. else:
  431. break
  432. else:
  433. # we can store this as a character set instead of a
  434. # branch (the compiler may optimize this even more)
  435. subpattern.append((IN, _uniq(set)))
  436. return subpattern
  437. subpattern.append((BRANCH, (None, items)))
  438. return subpattern
  439. def _parse(source, state, verbose, nested, first=False):
  440. # parse a simple pattern
  441. subpattern = SubPattern(state)
  442. # precompute constants into local variables
  443. subpatternappend = subpattern.append
  444. sourceget = source.get
  445. sourcematch = source.match
  446. _len = len
  447. _ord = ord
  448. while True:
  449. this = source.next
  450. if this is None:
  451. break # end of pattern
  452. if this in "|)":
  453. break # end of subpattern
  454. sourceget()
  455. if verbose:
  456. # skip whitespace and comments
  457. if this in WHITESPACE:
  458. continue
  459. if this == "#":
  460. while True:
  461. this = sourceget()
  462. if this is None or this == "\n":
  463. break
  464. continue
  465. if this[0] == "\\":
  466. code = _escape(source, this, state)
  467. subpatternappend(code)
  468. elif this not in SPECIAL_CHARS:
  469. subpatternappend((LITERAL, _ord(this)))
  470. elif this == "[":
  471. here = source.tell() - 1
  472. # character set
  473. set = []
  474. setappend = set.append
  475. ## if sourcematch(":"):
  476. ## pass # handle character classes
  477. if source.next == '[':
  478. import warnings
  479. warnings.warn(
  480. 'Possible nested set at position %d' % source.tell(),
  481. FutureWarning, stacklevel=nested + 6
  482. )
  483. negate = sourcematch("^")
  484. # check remaining characters
  485. while True:
  486. this = sourceget()
  487. if this is None:
  488. raise source.error("unterminated character set",
  489. source.tell() - here)
  490. if this == "]" and set:
  491. break
  492. elif this[0] == "\\":
  493. code1 = _class_escape(source, this)
  494. else:
  495. if set and this in '-&~|' and source.next == this:
  496. import warnings
  497. warnings.warn(
  498. 'Possible set %s at position %d' % (
  499. 'difference' if this == '-' else
  500. 'intersection' if this == '&' else
  501. 'symmetric difference' if this == '~' else
  502. 'union',
  503. source.tell() - 1),
  504. FutureWarning, stacklevel=nested + 6
  505. )
  506. code1 = LITERAL, _ord(this)
  507. if sourcematch("-"):
  508. # potential range
  509. that = sourceget()
  510. if that is None:
  511. raise source.error("unterminated character set",
  512. source.tell() - here)
  513. if that == "]":
  514. if code1[0] is IN:
  515. code1 = code1[1][0]
  516. setappend(code1)
  517. setappend((LITERAL, _ord("-")))
  518. break
  519. if that[0] == "\\":
  520. code2 = _class_escape(source, that)
  521. else:
  522. if that == '-':
  523. import warnings
  524. warnings.warn(
  525. 'Possible set difference at position %d' % (
  526. source.tell() - 2),
  527. FutureWarning, stacklevel=nested + 6
  528. )
  529. code2 = LITERAL, _ord(that)
  530. if code1[0] != LITERAL or code2[0] != LITERAL:
  531. msg = "bad character range %s-%s" % (this, that)
  532. raise source.error(msg, len(this) + 1 + len(that))
  533. lo = code1[1]
  534. hi = code2[1]
  535. if hi < lo:
  536. msg = "bad character range %s-%s" % (this, that)
  537. raise source.error(msg, len(this) + 1 + len(that))
  538. setappend((RANGE, (lo, hi)))
  539. else:
  540. if code1[0] is IN:
  541. code1 = code1[1][0]
  542. setappend(code1)
  543. set = _uniq(set)
  544. # XXX: <fl> should move set optimization to compiler!
  545. if _len(set) == 1 and set[0][0] is LITERAL:
  546. # optimization
  547. if negate:
  548. subpatternappend((NOT_LITERAL, set[0][1]))
  549. else:
  550. subpatternappend(set[0])
  551. else:
  552. if negate:
  553. set.insert(0, (NEGATE, None))
  554. # charmap optimization can't be added here because
  555. # global flags still are not known
  556. subpatternappend((IN, set))
  557. elif this in REPEAT_CHARS:
  558. # repeat previous item
  559. here = source.tell()
  560. if this == "?":
  561. min, max = 0, 1
  562. elif this == "*":
  563. min, max = 0, MAXREPEAT
  564. elif this == "+":
  565. min, max = 1, MAXREPEAT
  566. elif this == "{":
  567. if source.next == "}":
  568. subpatternappend((LITERAL, _ord(this)))
  569. continue
  570. min, max = 0, MAXREPEAT
  571. lo = hi = ""
  572. while source.next in DIGITS:
  573. lo += sourceget()
  574. if sourcematch(","):
  575. while source.next in DIGITS:
  576. hi += sourceget()
  577. else:
  578. hi = lo
  579. if not sourcematch("}"):
  580. subpatternappend((LITERAL, _ord(this)))
  581. source.seek(here)
  582. continue
  583. if lo:
  584. min = int(lo)
  585. if min >= MAXREPEAT:
  586. raise OverflowError("the repetition number is too large")
  587. if hi:
  588. max = int(hi)
  589. if max >= MAXREPEAT:
  590. raise OverflowError("the repetition number is too large")
  591. if max < min:
  592. raise source.error("min repeat greater than max repeat",
  593. source.tell() - here)
  594. else:
  595. raise AssertionError("unsupported quantifier %r" % (char,))
  596. # figure out which item to repeat
  597. if subpattern:
  598. item = subpattern[-1:]
  599. else:
  600. item = None
  601. if not item or item[0][0] is AT:
  602. raise source.error("nothing to repeat",
  603. source.tell() - here + len(this))
  604. if item[0][0] in _REPEATCODES:
  605. raise source.error("multiple repeat",
  606. source.tell() - here + len(this))
  607. if item[0][0] is SUBPATTERN:
  608. group, add_flags, del_flags, p = item[0][1]
  609. if group is None and not add_flags and not del_flags:
  610. item = p
  611. if sourcematch("?"):
  612. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  613. else:
  614. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  615. elif this == ".":
  616. subpatternappend((ANY, None))
  617. elif this == "(":
  618. start = source.tell() - 1
  619. group = True
  620. name = None
  621. add_flags = 0
  622. del_flags = 0
  623. if sourcematch("?"):
  624. # options
  625. char = sourceget()
  626. if char is None:
  627. raise source.error("unexpected end of pattern")
  628. if char == "P":
  629. # python extensions
  630. if sourcematch("<"):
  631. # named group: skip forward to end of name
  632. name = source.getuntil(">")
  633. if not name.isidentifier():
  634. msg = "bad character in group name %r" % name
  635. raise source.error(msg, len(name) + 1)
  636. elif sourcematch("="):
  637. # named backreference
  638. name = source.getuntil(")")
  639. if not name.isidentifier():
  640. msg = "bad character in group name %r" % name
  641. raise source.error(msg, len(name) + 1)
  642. gid = state.groupdict.get(name)
  643. if gid is None:
  644. msg = "unknown group name %r" % name
  645. raise source.error(msg, len(name) + 1)
  646. if not state.checkgroup(gid):
  647. raise source.error("cannot refer to an open group",
  648. len(name) + 1)
  649. state.checklookbehindgroup(gid, source)
  650. subpatternappend((GROUPREF, gid))
  651. continue
  652. else:
  653. char = sourceget()
  654. if char is None:
  655. raise source.error("unexpected end of pattern")
  656. raise source.error("unknown extension ?P" + char,
  657. len(char) + 2)
  658. elif char == ":":
  659. # non-capturing group
  660. group = None
  661. elif char == "#":
  662. # comment
  663. while True:
  664. if source.next is None:
  665. raise source.error("missing ), unterminated comment",
  666. source.tell() - start)
  667. if sourceget() == ")":
  668. break
  669. continue
  670. elif char in "=!<":
  671. # lookahead assertions
  672. dir = 1
  673. if char == "<":
  674. char = sourceget()
  675. if char is None:
  676. raise source.error("unexpected end of pattern")
  677. if char not in "=!":
  678. raise source.error("unknown extension ?<" + char,
  679. len(char) + 2)
  680. dir = -1 # lookbehind
  681. lookbehindgroups = state.lookbehindgroups
  682. if lookbehindgroups is None:
  683. state.lookbehindgroups = state.groups
  684. p = _parse_sub(source, state, verbose, nested + 1)
  685. if dir < 0:
  686. if lookbehindgroups is None:
  687. state.lookbehindgroups = None
  688. if not sourcematch(")"):
  689. raise source.error("missing ), unterminated subpattern",
  690. source.tell() - start)
  691. if char == "=":
  692. subpatternappend((ASSERT, (dir, p)))
  693. else:
  694. subpatternappend((ASSERT_NOT, (dir, p)))
  695. continue
  696. elif char == "(":
  697. # conditional backreference group
  698. condname = source.getuntil(")")
  699. if condname.isidentifier():
  700. condgroup = state.groupdict.get(condname)
  701. if condgroup is None:
  702. msg = "unknown group name %r" % condname
  703. raise source.error(msg, len(condname) + 1)
  704. else:
  705. try:
  706. condgroup = int(condname)
  707. if condgroup < 0:
  708. raise ValueError
  709. except ValueError:
  710. msg = "bad character in group name %r" % condname
  711. raise source.error(msg, len(condname) + 1) from None
  712. if not condgroup:
  713. raise source.error("bad group number",
  714. len(condname) + 1)
  715. if condgroup >= MAXGROUPS:
  716. msg = "invalid group reference %d" % condgroup
  717. raise source.error(msg, len(condname) + 1)
  718. state.checklookbehindgroup(condgroup, source)
  719. item_yes = _parse(source, state, verbose, nested + 1)
  720. if source.match("|"):
  721. item_no = _parse(source, state, verbose, nested + 1)
  722. if source.next == "|":
  723. raise source.error("conditional backref with more than two branches")
  724. else:
  725. item_no = None
  726. if not source.match(")"):
  727. raise source.error("missing ), unterminated subpattern",
  728. source.tell() - start)
  729. subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  730. continue
  731. elif char in FLAGS or char == "-":
  732. # flags
  733. flags = _parse_flags(source, state, char)
  734. if flags is None: # global flags
  735. if not first or subpattern:
  736. import warnings
  737. warnings.warn(
  738. 'Flags not at the start of the expression %r%s' % (
  739. source.string[:20], # truncate long regexes
  740. ' (truncated)' if len(source.string) > 20 else '',
  741. ),
  742. DeprecationWarning, stacklevel=nested + 6
  743. )
  744. if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
  745. raise Verbose
  746. continue
  747. add_flags, del_flags = flags
  748. group = None
  749. else:
  750. raise source.error("unknown extension ?" + char,
  751. len(char) + 1)
  752. # parse group contents
  753. if group is not None:
  754. try:
  755. group = state.opengroup(name)
  756. except error as err:
  757. raise source.error(err.msg, len(name) + 1) from None
  758. sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
  759. not (del_flags & SRE_FLAG_VERBOSE))
  760. p = _parse_sub(source, state, sub_verbose, nested + 1)
  761. if not source.match(")"):
  762. raise source.error("missing ), unterminated subpattern",
  763. source.tell() - start)
  764. if group is not None:
  765. state.closegroup(group, p)
  766. subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
  767. elif this == "^":
  768. subpatternappend((AT, AT_BEGINNING))
  769. elif this == "$":
  770. subpatternappend((AT, AT_END))
  771. else:
  772. raise AssertionError("unsupported special character %r" % (char,))
  773. # unpack non-capturing groups
  774. for i in range(len(subpattern))[::-1]:
  775. op, av = subpattern[i]
  776. if op is SUBPATTERN:
  777. group, add_flags, del_flags, p = av
  778. if group is None and not add_flags and not del_flags:
  779. subpattern[i: i+1] = p
  780. return subpattern
def _parse_flags(source, state, char):
    """Parse the flag letters of an inline "(?...)" construct.

    *char* is the first character after "(?", already consumed from *source*
    (a flag letter or "-").  Returns None for the global form ending in ")",
    after OR-ing the flags into ``state.flags``; returns ``(add_flags,
    del_flags)`` for the scoped form ending in ":".  Raises the error built by
    ``source.error`` for unknown, conflicting or misplaced flags.
    """
    sourceget = source.get
    add_flags = 0
    del_flags = 0
    # Phase 1: flags to switch on, up to ")", "-" or ":".
    if char != "-":
        while True:
            flag = FLAGS[char]
            if source.istext:
                if char == 'L':
                    msg = "bad inline flags: cannot use 'L' flag with a str pattern"
                    raise source.error(msg)
            else:
                if char == 'u':
                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
                    raise source.error(msg)
            add_flags |= flag
            # At most one of the type flags may be requested.
            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
                raise source.error(msg)
            char = sourceget()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))
    if char == ")":
        # Global form: the flags apply to the whole pattern state.
        state.flags |= add_flags
        return None
    if add_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    # Phase 2: flags to switch off, after "-" and up to ":".
    if char == "-":
        char = sourceget()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        while True:
            flag = FLAGS[char]
            if flag & TYPE_FLAGS:
                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
                raise source.error(msg)
            del_flags |= flag
            char = sourceget()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))
    assert char == ":"
    if del_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if add_flags & del_flags:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return add_flags, del_flags
  840. def fix_flags(src, flags):
  841. # Check and fix flags according to the type of pattern (str or bytes)
  842. if isinstance(src, str):
  843. if flags & SRE_FLAG_LOCALE:
  844. raise ValueError("cannot use LOCALE flag with a str pattern")
  845. if not flags & SRE_FLAG_ASCII:
  846. flags |= SRE_FLAG_UNICODE
  847. elif flags & SRE_FLAG_UNICODE:
  848. raise ValueError("ASCII and UNICODE flags are incompatible")
  849. else:
  850. if flags & SRE_FLAG_UNICODE:
  851. raise ValueError("cannot use UNICODE flag with a bytes pattern")
  852. if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
  853. raise ValueError("ASCII and LOCALE flags are incompatible")
  854. return flags
  855. def parse(str, flags=0, pattern=None):
  856. # parse 're' pattern into list of (opcode, argument) tuples
  857. source = Tokenizer(str)
  858. if pattern is None:
  859. pattern = Pattern()
  860. pattern.flags = flags
  861. pattern.str = str
  862. try:
  863. p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
  864. except Verbose:
  865. # the VERBOSE flag was switched on inside the pattern. to be
  866. # on the safe side, we'll parse the whole thing again...
  867. pattern = Pattern()
  868. pattern.flags = flags | SRE_FLAG_VERBOSE
  869. pattern.str = str
  870. source.seek(0)
  871. p = _parse_sub(source, pattern, True, 0)
  872. p.pattern.flags = fix_flags(str, p.pattern.flags)
  873. if source.next is not None:
  874. assert source.next == ")"
  875. raise source.error("unbalanced parenthesis")
  876. if flags & SRE_FLAG_DEBUG:
  877. p.dump()
  878. return p
  879. def parse_template(source, pattern):
  880. # parse 're' replacement string into list of literals and
  881. # group references
  882. s = Tokenizer(source)
  883. sget = s.get
  884. groups = []
  885. literals = []
  886. literal = []
  887. lappend = literal.append
  888. def addgroup(index, pos):
  889. if index > pattern.groups:
  890. raise s.error("invalid group reference %d" % index, pos)
  891. if literal:
  892. literals.append(''.join(literal))
  893. del literal[:]
  894. groups.append((len(literals), index))
  895. literals.append(None)
  896. groupindex = pattern.groupindex
  897. while True:
  898. this = sget()
  899. if this is None:
  900. break # end of replacement string
  901. if this[0] == "\\":
  902. # group
  903. c = this[1]
  904. if c == "g":
  905. name = ""
  906. if not s.match("<"):
  907. raise s.error("missing <")
  908. name = s.getuntil(">")
  909. if name.isidentifier():
  910. try:
  911. index = groupindex[name]
  912. except KeyError:
  913. raise IndexError("unknown group name %r" % name)
  914. else:
  915. try:
  916. index = int(name)
  917. if index < 0:
  918. raise ValueError
  919. except ValueError:
  920. raise s.error("bad character in group name %r" % name,
  921. len(name) + 1) from None
  922. if index >= MAXGROUPS:
  923. raise s.error("invalid group reference %d" % index,
  924. len(name) + 1)
  925. addgroup(index, len(name) + 1)
  926. elif c == "0":
  927. if s.next in OCTDIGITS:
  928. this += sget()
  929. if s.next in OCTDIGITS:
  930. this += sget()
  931. lappend(chr(int(this[1:], 8) & 0xff))
  932. elif c in DIGITS:
  933. isoctal = False
  934. if s.next in DIGITS:
  935. this += sget()
  936. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  937. s.next in OCTDIGITS):
  938. this += sget()
  939. isoctal = True
  940. c = int(this[1:], 8)
  941. if c > 0o377:
  942. raise s.error('octal escape value %s outside of '
  943. 'range 0-0o377' % this, len(this))
  944. lappend(chr(c))
  945. if not isoctal:
  946. addgroup(int(this[1:]), len(this) - 1)
  947. else:
  948. try:
  949. this = chr(ESCAPES[this][1])
  950. except KeyError:
  951. if c in ASCIILETTERS:
  952. raise s.error('bad escape %s' % this, len(this))
  953. lappend(this)
  954. else:
  955. lappend(this)
  956. if literal:
  957. literals.append(''.join(literal))
  958. if not isinstance(source, str):
  959. # The tokenizer implicitly decodes bytes objects as latin-1, we must
  960. # therefore re-encode the final representation.
  961. literals = [None if s is None else s.encode('latin-1') for s in literals]
  962. return groups, literals
  963. def expand_template(template, match):
  964. g = match.group
  965. empty = match.string[:0]
  966. groups, literals = template
  967. literals = literals[:]
  968. try:
  969. for index, group in groups:
  970. literals[index] = g(group) or empty
  971. except IndexError:
  972. raise error("invalid group reference %d" % index)
  973. return empty.join(literals)