# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

import marshal

from mycpp.mylib import log
from mycpp import mylib

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing import IO, Dict, List, Tuple

    # Type aliases
    arc_t = Tuple[int, int]
    first_t = Dict[int, int]
    states_t = List[List[arc_t]]
    dfa_t = Tuple[states_t, first_t]
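    # A minimal illustration of these shapes (hypothetical numbers, not taken
    # from a real grammar): a DFA for a rule that matches one token whose arc
    # label is 5.  State 0 has a single arc labeled 5 to state 1; state 1 is
    # final, marked by the special arc (0, 1) pointing at its own number.
    #
    #   states = [[(5, 1)], [(0, 1)]]  # type: states_t
    #   first = {5: 1}                 # type: first_t
    #   dfa = (states, first)          # type: dfa_t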


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The loads() method reads the tables from a string of marshal data,
    which is much faster than the other ways offered by subclasses.
    That data is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always NT_OFFSET or higher, to
                     distinguish them from token numbers, which are
                     between 0 and 255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is an (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

                     Oils patch: this became List[int], where the int
                     is the token/symbol number.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """
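    # Sketch of how the tables relate (hypothetical numbers): if pgen assigned
    # arc label 15 to the keyword 'if', then keywords == {'if': 15}, labels[15]
    # holds the corresponding token number, and an arc (15, j) in some DFA is
    # taken when the next token is the keyword 'if', moving the parser to
    # state j.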

    def __init__(self):
        # type: () -> None
        self.symbol2number = {}  # type: Dict[str, int]
        self.number2symbol = {}  # type: Dict[int, str]

        # TODO: See MakeGrammar in pgen2/pgen.py for how these are built.
        #   states: List[List[arc]], where arc is (int, int)
        #   dfas:   Dict[int, Tuple[states, first]]

        self.states = []  # type: states_t
        self.dfas = {}  # type: Dict[int, dfa_t]
        # Oils patch: used to be [(0, "EMPTY")].  I suppose 0 is a special
        # value?  Or is it ENDMARKER?
        self.labels = [0]  # type: List[int]
        self.keywords = {}  # type: Dict[str, int]
        self.tokens = {}  # type: Dict[int, int]
        self.symbol2label = {}  # type: Dict[str, int]
        self.start = 256  # TODO: move to pgen.NT_OFFSET, breaks OVM tarball
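
        # For example (hypothetical name and number), after pgen builds a
        # grammar the two symbol tables might contain:
        #   symbol2number == {'eval_input': 258}
        #   number2symbol == {258: 'eval_input'}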

    if mylib.PYTHON:
        def dump(self, f):
            # type: (IO[str]) -> None
            """Dump the grammar tables to a marshal file.

            Oils patch: changed pickle to marshal.

            Note from the original pickle-based lib2to3 implementation:
            dump() recursively changed all dict to OrderedDict, so the
            pickled file was not exactly the same as what was passed in to
            dump().  load() used the pickled file to create the tables, but
            only changed OrderedDict back to dict at the top level, so some
            of the loaded tables stayed OrderedDict.  For parsing this had no
            effect on performance, because OrderedDict uses dict's
            __getitem__ with nothing in between.  With marshal, the tables
            are written and read back as plain dicts, so none of this applies
            here.
            """
            # Hack to get rid of Id_t
            labels = [int(i) for i in self.labels]
            tokens = dict((int(k), v) for (k, v) in self.tokens.iteritems())

            #self.report()
            payload = (
                self.MARSHAL_HEADER,
                self.symbol2number,
                self.number2symbol,
                self.states,
                self.dfas,
                labels,
                self.keywords,
                tokens,
                self.symbol2label,
                self.start,
            )  # tuple
            marshal.dump(payload, f)  # version 2 is latest
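
        # Typical round trip, as a sketch (file name and variable names are
        # hypothetical):
        #
        #   g = ...  # a Grammar populated by the pgen module
        #   with open('grammar.marshal', 'wb') as f:
        #       g.dump(f)
        #
        #   g2 = Grammar()
        #   with open('grammar.marshal', 'rb') as f:
        #       g2.loads(f.read())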

        def dump_nonterminals_py(self, f):
            # type: (IO[str]) -> None
            """Write a Python module with nonterminals.

            Analogous to the 'symbol' module in Python.
            """
            f.write('# This code is generated by pgen2/grammar.py\n\n')
            for num in sorted(self.number2symbol):
                name = self.number2symbol[num]
                f.write('%s = %d\n' % (name, num))
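
        # The generated module is one assignment per nonterminal, e.g.
        # (hypothetical name and number):
        #
        #   # This code is generated by pgen2/grammar.py
        #
        #   eval_input = 258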

        def dump_nonterminals_cpp(self, f):
            # type: (IO[str]) -> None
            """Write C++ constants for the nonterminals.

            Analogous to the 'symbol' module in Python.
            """
            f.write("""\
// This code is generated by pgen2/grammar.py

namespace grammar_nt {
""")
            for num in sorted(self.number2symbol):
                name = self.number2symbol[num]
                f.write('  const int %s = %d;\n' % (name, num))
            f.write("""\

}  // namespace grammar_nt
""")

        def dump_cpp(self, src_f):
            # type: (IO[str]) -> None
            """Dump a C++ implementation of the grammar tables."""
            src_f.write("""\
// This code is generated by pgen2/grammar.py

#include "cpp/pgen2.h"
#include "mycpp/runtime.h"

namespace grammar {

""")

            src_f.write('Grammar::Grammar() {\n')
            src_f.write('  symbol2number = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  number2symbol = Alloc<Dict<int, BigStr*>>();\n')
            src_f.write('  dfas = Alloc<Dict<int, dfa_t*>>();\n')
            src_f.write('  keywords = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  tokens = Alloc<Dict<int, int>>();\n')
            src_f.write('  symbol2label = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  start = %d;\n' % self.start)

            src_f.write('\n')
            for i, (states, first) in self.dfas.items():
                src_f.write('  {\n')
                src_f.write('    states_t* st = Alloc<states_t>();\n')
                for s in states:
                    src_f.write('    st->append(NewList<arc_t*>(\n')
                    src_f.write('        std::initializer_list<arc_t*>{\n')
                    for a, b in s:
                        src_f.write('            Alloc<Tuple2<int, int>>(%d, %d),\n' % (a, b))

                    src_f.write('        })\n')
                    src_f.write('    );\n')

                src_f.write('    first_t* first = Alloc<first_t>();\n')
                for k, v in first.items():
                    src_f.write('    first->set(%d, %d);\n' % (k, v))

                src_f.write('    dfa_t* dfa = Alloc<dfa_t>(st, first);\n')
                src_f.write('    dfas->set(%d, dfa);\n' % i)
                src_f.write(' \n')
                src_f.write('  }\n')
                src_f.write('\n')

            for symbol, num in self.symbol2number.items():
                src_f.write('  symbol2number->set(StrFromC("%s"), %d);\n' % (symbol, num))

            src_f.write('\n')
            for num, symbol in self.number2symbol.items():
                src_f.write('  number2symbol->set(%d, StrFromC("%s"));\n' % (num, symbol))

            src_f.write('\n')
            for symbol, label in self.symbol2label.items():
                src_f.write('  symbol2label->set(StrFromC("%s"), %d);\n' % (symbol, label))

            src_f.write('\n')
            for ks, ki in self.keywords.items():
                src_f.write('  keywords->set(StrFromC("%s"), %d);\n' % (ks, ki))

            src_f.write('\n')
            for k, v in self.tokens.items():
                src_f.write('  tokens->set(%d, %d);\n' % (k, v))

            src_f.write('\n')
            src_f.write('\n')
            src_f.write('  labels = NewList<int>(\n')
            src_f.write('      std::initializer_list<int>{\n')
            src_f.write('          ')
            src_f.write(',\n          '.join([str(label) for label in self.labels]))
            src_f.write('\n')
            src_f.write('      }\n')
            src_f.write('  );\n')

            src_f.write('\n')
            src_f.write('};\n')

            src_f.write("""\

}  // namespace grammar
""")
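
        # The generated constructor looks roughly like this (sketch with
        # hypothetical numbers):
        #
        #   Grammar::Grammar() {
        #     symbol2number = Alloc<Dict<BigStr*, int>>();
        #     ...
        #     dfas->set(258, dfa);
        #     ...
        #     labels = NewList<int>(std::initializer_list<int>{ ... });
        #   };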

    MARSHAL_HEADER = 'PGEN2\n'  # arbitrary header

    def loads(self, s):
        # type: (str) -> None
        """Load the grammar from a string.

        We have to use a string rather than a file because the marshal module
        can't load from the "fake" file objects that zipimport provides.
        """
        payload = marshal.loads(s)
        if payload[0] != self.MARSHAL_HEADER:
            raise RuntimeError('Invalid header %r' % payload[0])

        (_,
         self.symbol2number,
         self.number2symbol,
         self.states,
         self.dfas,
         self.labels,
         self.keywords,
         self.tokens,
         self.symbol2label,
         self.start,
         ) = payload
        #self.report()

        assert isinstance(self.symbol2number, dict), self.symbol2number
        assert isinstance(self.number2symbol, dict), self.number2symbol

    def report(self):
        # type: () -> None
        """Dump the grammar tables to standard output, for debugging."""
        log("symbol2number: %d entries", len(self.symbol2number))
        log("number2symbol: %d entries", len(self.number2symbol))
        log("states: %d entries", len(self.states))
        log("dfas: %d entries", len(self.dfas))
        return
        # Unreachable: more verbose debug output, disabled by the early return.
        from pprint import pprint
        print("labels")
        pprint(self.labels)
        print("keywords")
        pprint(self.keywords)
        print("tokens")
        pprint(self.tokens)
        print("symbol2label")
        pprint(self.symbol2label)
        print("start", self.start)
|