grammar.py

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""

# Python imports
import collections
import pickle

# Local imports
from . import token


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py. The parsing engine
    accesses the instance variables directly. The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses. The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass). The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers. Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is an (i, j) pair where i is a label and j is
                     a state number. The DFA number is the index into
                     this list. (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords. The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled
        file is not exactly the same as what was passed in to dump(). load()
        uses the pickled file to create the tables, but only changes
        OrderedDict to dict at the top level; it does not recursively change
        OrderedDict to dict. So, the loaded tables are different from the
        original tables that were passed to dump() in that some of the
        OrderedDict (from the pickled file) are not changed back to dict.
        For parsing, this has no effect on performance because OrderedDict
        uses dict's __getitem__ with nothing in between.
        """
        with open(filename, "wb") as f:
            d = _make_deterministic(self.__dict__)
            pickle.dump(d, f, 2)
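
    # Editorial illustration, not part of the original module: the round-trip
    # described above, sketched with an arbitrary scratch filename.
    #
    #     g = Grammar()
    #     g.dump("grammar-example.pickle")   # nested dicts pickled as OrderedDict
    #     h = Grammar()
    #     h.load("grammar-example.pickle")
    #     # The top-level attribute mapping is a plain dict, but table
    #     # attributes such as h.symbol2number come back as OrderedDict.
    #     assert h.labels == [(0, "EMPTY")]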

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """
        Copy the grammar.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)
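

# The function below is an editorial illustration, not part of the original
# module: it hand-builds the tables described in the Grammar docstring for a
# hypothetical one-rule grammar. The concrete numbers are arbitrary; real
# tables are produced by the pgen/conv subclasses.
def _example_grammar():
    """Toy Grammar for a hypothetical rule: expr: NAME ('+' NAME)*."""
    g = Grammar()
    g.symbol2number = {"expr": 256}
    g.number2symbol = {256: "expr"}
    # One DFA; each state is a list of (label-index, next-state) arcs, and an
    # arc of the form (0, j) on state j marks it as an accepting state.
    g.states = [[
        [(1, 1)],            # state 0: NAME -> state 1
        [(2, 2), (0, 1)],    # state 1: '+' -> state 2, or accept
        [(1, 1)],            # state 2: NAME -> state 1
    ]]
    # The first set of 'expr' is {NAME}, keyed by label index, values always 1.
    g.dfas = {256: (g.states[0], {1: 1})}
    g.labels = [(0, "EMPTY"), (token.NAME, None), (token.PLUS, None)]
    g.tokens = {token.NAME: 1, token.PLUS: 2}
    g.start = 256
    return g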


def _make_deterministic(top):
    """Recursively convert dicts to OrderedDicts with sorted keys (lists and
    tuples are rebuilt element-wise) so that dump() output is deterministic."""
    if isinstance(top, dict):
        return collections.OrderedDict(
            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
    if isinstance(top, list):
        return [_make_deterministic(e) for e in top]
    if isinstance(top, tuple):
        return tuple(_make_deterministic(e) for e in top)
    return top


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)
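

if __name__ == "__main__":
    # Editorial self-check sketch, not part of the original module. Run it as
    # a module from inside its package (with `python -m`) so the relative
    # `token` import above resolves. It exercises the opmap table and a
    # dump()/load() round-trip through a temporary file.
    import os
    import tempfile

    assert opmap["+"] == token.PLUS
    assert opmap["**="] == token.DOUBLESTAREQUAL

    g = Grammar()
    fd, path = tempfile.mkstemp(suffix=".pickle")
    os.close(fd)
    try:
        g.dump(path)
        h = Grammar()
        h.load(path)
    finally:
        os.remove(path)
    assert h.labels == [(0, "EMPTY")]
    h.report()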