OILS / pgen2 / grammar.py

303 lines, 142 significant
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

import marshal

from mycpp.mylib import log
from mycpp import mylib

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing import IO, Dict, List, Tuple

    # Type aliases
    arc_t = Tuple[int, int]
    first_t = Dict[int, int]
    states_t = List[List[arc_t]]
    dfa_t = Tuple[states_t, first_t]


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The loads() method reads the tables from a marshaled string, which
    is much faster than the other ways offered by subclasses.  That
    string is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always NT_OFFSET or higher, to distinguish
                     them from token numbers, which are between 0 and 255
                     (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

                     Oils patch: this became List[int] where int is the
                     token/symbol number.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

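    # The following is an illustrative sketch, NOT taken from the real Oils
    # grammar: it shows what the tables above might hold for one hypothetical
    # rule, assuming CPython-style token numbers (NAME=1, PLUS=14):
    #
    #     expr: NAME '+' NAME
    #
    #   symbol2number = {'expr': 256}
    #   number2symbol = {256: 'expr'}
    #   labels        = [0, 1, 14]      # index 0 is reserved; 1=NAME, 2='+'
    #   states        = [[              # DFA 0, for 'expr'
    #       [(1, 1)],                   # state 0: on label 1 (NAME) -> state 1
    #       [(2, 2)],                   # state 1: on label 2 ('+')  -> state 2
    #       [(1, 3)],                   # state 2: on label 1 (NAME) -> state 3
    #       [(0, 3)],                   # state 3: final (arc to itself)
    #   ]]
    #   dfas          = {256: (states[0], {1: 1})}  # 'expr' can start with NAME
    #   tokens        = {1: 1, 14: 2}   # token number -> label index
    #   start         = 256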
    def __init__(self):
        # type: () -> None
        self.symbol2number = {}  # type: Dict[str, int]
        self.number2symbol = {}  # type: Dict[int, str]

        # TODO: See MakeGrammar in pgen2/pgen.py to see the types:
        #   states: List[List[arc]]
        #   arc: (int, int)
        #   dfas = Dict[int, Tuple[states, ...]]

        self.states = []  # type: states_t
        self.dfas = {}  # type: Dict[int, dfa_t]
        # Oils patch: used to be [(0, "EMPTY")].  I suppose 0 is a special value?
        # Or is it ENDMARKER?
        self.labels = [0]  # type: List[int]
        self.keywords = {}  # type: Dict[str, int]
        self.tokens = {}  # type: Dict[int, int]
        self.symbol2label = {}  # type: Dict[str, int]
        self.start = 256  # TODO: move to pgen.NT_OFFSET, breaks OVM tarball

    if mylib.PYTHON:
        def dump(self, f):
            # type: (IO[str]) -> None
            """Dump the grammar tables to a marshal file.

            Oils patch: changed pickle to marshal.

            dump() recursively changes all dict to OrderedDict, so the pickled
            file is not exactly the same as what was passed in to dump().
            load() uses the pickled file to create the tables, but only changes
            OrderedDict to dict at the top level; it does not recursively
            change OrderedDict to dict.  So, the loaded tables are different
            from the original tables that were passed to dump() in that some
            of the OrderedDict (from the pickled file) are not changed back to
            dict.  For parsing, this has no effect on performance because
            OrderedDict uses dict's __getitem__ with nothing in between.
            """
            # Hack to get rid of Id_t
            labels = [int(i) for i in self.labels]
            tokens = dict((int(k), v) for (k, v) in self.tokens.iteritems())

            #self.report()
            payload = (
                self.MARSHAL_HEADER,
                self.symbol2number,
                self.number2symbol,
                self.states,
                self.dfas,
                labels,
                self.keywords,
                tokens,
                self.symbol2label,
                self.start,
            )  # tuple
            marshal.dump(payload, f)  # version 2 is latest

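        # Hedged debugging sketch (not part of the Oils build; the file name
        # is hypothetical): the payload written above can be inspected by
        # hand, since the tuple layout matches what loads() unpacks below.
        #
        #   import marshal
        #   with open('demo.marshal', 'rb') as f:
        #       payload = marshal.loads(f.read())
        #   assert payload[0] == Grammar.MARSHAL_HEADER
        #   symbol2number = payload[1]   # and so on, in dump() order
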
        def dump_nonterminals_py(self, f):
            # type: (IO[str]) -> None
            """Write a Python module with nonterminals.

            Analogous to the 'symbol' module in Python.
            """
            f.write('# This code is generated by pgen2/grammar.py\n\n')
            for num in sorted(self.number2symbol):
                name = self.number2symbol[num]
                f.write('%s = %d\n' % (name, num))

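        # For illustration only (the symbol names and numbers below are
        # hypothetical, not from the real Oils grammar), the generated module
        # looks like:
        #
        #   # This code is generated by pgen2/grammar.py
        #
        #   arglist = 257
        #   argument = 258
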
        def dump_nonterminals_cpp(self, f):
            # type: (IO[str]) -> None
            """Write a C++ namespace with nonterminals.

            Analogous to the 'symbol' module in Python.
            """
            f.write("""\
// This code is generated by pgen2/grammar.py

namespace grammar_nt {
""")
            for num in sorted(self.number2symbol):
                name = self.number2symbol[num]
                f.write('  const int %s = %d;\n' % (name, num))
            f.write("""\

} // namespace grammar_nt
""")

        def dump_cpp(self, src_f):
            # type: (IO[str]) -> None
            """Dump a C++ implementation of the grammar tables."""
            src_f.write("""\
// This code is generated by pgen2/grammar.py
# include "cpp/pgen2.h"
# include "mycpp/runtime.h"

namespace grammar {

""")

            src_f.write('Grammar::Grammar() {\n')
            src_f.write('  symbol2number = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  number2symbol = Alloc<Dict<int, BigStr*>>();\n')
            src_f.write('  dfas = Alloc<Dict<int, dfa_t*>>();\n')
            src_f.write('  keywords = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  tokens = Alloc<Dict<int, int>>();\n')
            src_f.write('  symbol2label = Alloc<Dict<BigStr*, int>>();\n')
            src_f.write('  start = %d;\n' % self.start)

            src_f.write('\n')
            for i, (states, first) in self.dfas.items():
                src_f.write('  {\n')
                src_f.write('    states_t* st = Alloc<states_t>();\n')
                for s in states:
                    src_f.write('    st->append(NewList<arc_t*>(\n')
                    src_f.write('        std::initializer_list<arc_t*>{\n')
                    for a, b in s:
                        src_f.write('            Alloc<Tuple2<int, int>>(%d, %d),\n' % (a, b))

                    src_f.write('        })\n')
                    src_f.write('    );\n')

                src_f.write('    first_t* first = Alloc<first_t>();\n')
                for k, v in first.items():
                    src_f.write('    first->set(%d, %d);\n' % (k, v))

                src_f.write('    dfa_t* dfa = Alloc<dfa_t>(st, first);\n')
                src_f.write('    dfas->set(%d, dfa);\n' % i)
                src_f.write('  \n')
                src_f.write('  }\n')
            src_f.write('\n')

            for symbol, num in self.symbol2number.items():
                src_f.write('  symbol2number->set(StrFromC("%s"), %d);\n' % (symbol, num))

            src_f.write('\n')
            for num, symbol in self.number2symbol.items():
                src_f.write('  number2symbol->set(%d, StrFromC("%s"));\n' % (num, symbol))

            src_f.write('\n')
            for symbol, label in self.symbol2label.items():
                src_f.write('  symbol2label->set(StrFromC("%s"), %d);\n' % (symbol, label))

            src_f.write('\n')
            for ks, ki in self.keywords.items():
                src_f.write('  keywords->set(StrFromC("%s"), %d);\n' % (ks, ki))

            src_f.write('\n')
            for k, v in self.tokens.items():
                src_f.write('  tokens->set(%d, %d);\n' % (k, v))

            src_f.write('\n')
            src_f.write('\n')
            src_f.write('  labels = NewList<int>(\n')
            src_f.write('      std::initializer_list<int>{\n')
            src_f.write('          ')
            src_f.write(',\n          '.join([str(label) for label in self.labels]))
            src_f.write('\n')
            src_f.write('      }\n')
            src_f.write('  );\n')

            src_f.write('\n')
            src_f.write('};\n')

            src_f.write("""\

} // namespace grammar
""")

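    # A hedged sketch (numbers illustrative, not from the real grammar) of the
    # C++ that dump_cpp() above emits for a single DFA entry, mirroring the
    # writes in the method:
    #
    #   Grammar::Grammar() {
    #     symbol2number = Alloc<Dict<BigStr*, int>>();
    #     ...
    #     {
    #       states_t* st = Alloc<states_t>();
    #       st->append(NewList<arc_t*>(
    #           std::initializer_list<arc_t*>{
    #               Alloc<Tuple2<int, int>>(1, 1),
    #           })
    #       );
    #       ...                          // one append per state
    #       first_t* first = Alloc<first_t>();
    #       first->set(1, 1);
    #       dfa_t* dfa = Alloc<dfa_t>(st, first);
    #       dfas->set(256, dfa);
    #     }
    #     ...
    #   };
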
    MARSHAL_HEADER = 'PGEN2\n'  # arbitrary header

    def loads(self, s):
        # type: (str) -> None
        """Load the grammar from a string.

        We have to use a string rather than a file because the marshal
        module doesn't support the "fake" file objects that zipimport
        provides.
        """
        payload = marshal.loads(s)
        if payload[0] != self.MARSHAL_HEADER:
            raise RuntimeError('Invalid header %r' % payload[0])

        (_,
         self.symbol2number,
         self.number2symbol,
         self.states,
         self.dfas,
         self.labels,
         self.keywords,
         self.tokens,
         self.symbol2label,
         self.start,
        ) = payload
        #self.report()

        assert isinstance(self.symbol2number, dict), self.symbol2number
        assert isinstance(self.number2symbol, dict), self.number2symbol

    def report(self):
        # type: () -> None
        """Dump the grammar tables to standard output, for debugging."""
        log("symbol2number: %d entries", len(self.symbol2number))
        log("number2symbol: %d entries", len(self.number2symbol))
        log("states: %d entries", len(self.states))
        log("dfas: %d entries", len(self.dfas))
        return
        # NOTE: the verbose dump below is unreachable unless the 'return'
        # above is removed.
        from pprint import pprint
        print("labels")
        pprint(self.labels)
        print("keywords")
        pprint(self.keywords)
        print("tokens")
        pprint(self.tokens)
        print("symbol2label")
        pprint(self.symbol2label)
        print("start", self.start)