# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.
"""

from mycpp.mylib import log
_ = log

from typing import TYPE_CHECKING, Optional, List
from pgen2.pnode import PNode, PNodeAllocator

if TYPE_CHECKING:
    from _devbuild.gen.syntax_asdl import Token
    from pgen2.grammar import Grammar, dfa_t

# Duplicated to avoid dependency
# from pgen2 import token

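# Token type numbers are < NT_OFFSET; symbol (nonterminal) numbers in the
# grammar tables are >= NT_OFFSET.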
NT_OFFSET = 256


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(self, msg, type_, tok):
        # type: (str, int, Token) -> None
        self.msg = msg
        self.type = type_
        self.tok = tok

    def __repr__(self):
        # type: () -> str
        return "%s: type=%d, tok=%r" % (self.msg, self.type, self.tok)


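# Replaces the (dfa, state, node) tuples that the original pgen2 parser kept
# on its stack, presumably because a class with fixed fields translates more
# cleanly with mycpp.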
class _StackItem(object):
    def __init__(self, dfa, state, node):
        # type: (dfa_t, int, PNode) -> None
        self.dfa = dfa
        self.state = state
        self.node = node


class Parser(object):
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar)              # create instance
    p.setup(start, pnode_alloc)      # prepare for parsing
    <for each input token>:
        if p.addtoken(...):          # parse a token; may raise ParseError
            break
    root = p.rootnode                # root of the parse tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    parse tree can then be retrieved from the rootnode instance
    variable. When a syntax error occurs, addtoken() raises the
    ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).
    """

    def __init__(self, grammar):
        # type: (Grammar) -> None
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        Parse tree nodes are PNode instances (see pgen2/pnode.py): each
        one carries a type (a token or symbol number), the Token it was
        created from (None for the start node made by setup()), and the
        children attached as parsing proceeds.
        """
        self.grammar = grammar
        self.rootnode = None  # type: Optional[PNode]
        self.stack = []  # type: List[_StackItem]
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def setup(self, start, pnode_alloc):
        # type: (int, PNodeAllocator) -> None
        """Prepare for parsing.

        This *must* be called before starting to parse.

        start is the grammar symbol to parse toward; pnode_alloc
        allocates the PNode instances for this parse.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the start symbol.
        """
        self.pnode_alloc = pnode_alloc
        newnode = self.pnode_alloc.NewPNode(start, None)
        # Each stack entry is a _StackItem: (dfa, state, node).
        self.stack = [_StackItem(self.grammar.dfas[start], 0, newnode)]
        self.rootnode = None

    def addtoken(self, typ, opaque, ilabel):
        # type: (int, Token, int) -> bool
        """Add a token; return True iff this is the end of the program."""
        # Loop until the token is shifted; may raise exceptions

        # Andy NOTE: This is not linear time, i.e. a constant amount of work
        # for each token?  Is it O(n^2) as the ANTLR paper says?
        # Do the "accelerators" in pgen.c have anything to do with it?

        while True:
            top = self.stack[-1]
            states, _ = top.dfa
            state = top.state

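            # Each arc in the current state is an (ilabel, next_state) pair,
            # where ilabel indexes self.grammar.labels.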
            arcs = states[state]

            # Look for an arc labeled with this token
            found = False
            for ilab, newstate in arcs:
                t = self.grammar.labels[ilab]
                if ilabel == ilab:
                    # Look it up in the list of labels
                    assert t < NT_OFFSET, t
                    # Shift a token; we're done with it
                    self.shift(typ, opaque, newstate)
                    # Pop while we are in an accept-only state
                    state = newstate

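                    # An accept-only state has exactly one arc, (0, state):
                    # a 0-labeled arc that loops back to itself.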
                    while True:
                        # mycpp: rewrite of tuple-in-list comparison
                        if len(states[state]) != 1:
                            break

                        s0, s1 = states[state][0]
                        if s0 != 0 or s1 != state:
                            break

                        self.pop()
                        if len(self.stack) == 0:
                            # Done parsing!
                            return True
                        top = self.stack[-1]
                        states, _ = top.dfa
                        state = top.state

                    # Done with this token
                    return False
                elif t >= NT_OFFSET:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    _, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, opaque, self.grammar.dfas[t], newstate)
                        found = True
                        break  # To continue the outer while loop

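            # No arc matched the token directly and no nonterminal could start
            # with it.  If the current state accepts (it has a (0, state) arc
            # to itself), the symbol on top of the stack is complete: pop it
            # and retry the token one level up.  Otherwise it's a syntax error.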
            if not found:
                # Note: this condition was rewritten for mycpp translation.
                # if (0, state) in arcs:
                #   ...
                # else:
                #   ...
                found2 = False
                for left, right in arcs:
                    if left == 0 and right == state:
                        # An accepting state, pop it and try something else
                        self.pop()
                        if len(self.stack) == 0:
                            # Done parsing, but another token is input
                            raise ParseError("too much input", typ, opaque)
                        found2 = True

                if not found2:
                    # No success finding a transition
                    raise ParseError("bad input", typ, opaque)

    def shift(self, typ, opaque, newstate):
        # type: (int, Token, int) -> None
        """Shift a token. (Internal)"""
        top = self.stack[-1]
        newnode = self.pnode_alloc.NewPNode(typ, opaque)
        top.node.AddChild(newnode)
        self.stack[-1].state = newstate

    def push(self, typ, opaque, newdfa, newstate):
        # type: (int, Token, dfa_t, int) -> None
        """Push a nonterminal. (Internal)"""
        top = self.stack[-1]
        newnode = self.pnode_alloc.NewPNode(typ, opaque)
        self.stack[-1].state = newstate
        self.stack.append(_StackItem(newdfa, 0, newnode))

    def pop(self):
        # type: () -> None
        """Pop a nonterminal. (Internal)"""
        top = self.stack.pop()
        newnode = top.node
        if len(self.stack):
            top2 = self.stack[-1]
            top2.node.AddChild(newnode)
        else:
            self.rootnode = newnode
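
# A rough sketch of the driving loop, assuming a Grammar has already been
# loaded and each token has been mapped to an ilabel (driver.py does this for
# real; the names below are illustrative only):
#
#   p = Parser(gr)
#   p.setup(start_symbol, PNodeAllocator())
#   for typ, tok, ilabel in labeled_tokens:
#       if p.addtoken(typ, tok, ilabel):
#           break          # the start symbol was completed
#   root = p.rootnode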
|