| 1 | """
 | 
| 2 | tdop.py
 | 
| 3 | """
 | 
| 4 | 
 | 
| 5 | import re
 | 
| 6 | 
 | 
| 7 | 
 | 
class ParseError(Exception):
  """Raised for any syntax error found while tokenizing or parsing."""
 | 
| 10 | 
 | 
| 11 | 
 | 
| 12 | #
 | 
| 13 | # Default parsing functions give errors
 | 
| 14 | #
 | 
| 15 | 
 | 
def NullError(p, token, bp):
  """Default nud: signal that the token is not valid in prefix position."""
  msg = "%s can't be used in prefix position" % token
  raise ParseError(msg)
 | 
| 18 | 
 | 
| 19 | 
 | 
def LeftError(p, token, left, rbp):
  """Default led: signal that the token is not valid in infix position."""
  # Hm is this not called because of binding power?
  msg = "%s can't be used in infix position" % token
  raise ParseError(msg)
 | 
| 23 | 
 | 
| 24 | 
 | 
| 25 | #
 | 
| 26 | # Input
 | 
| 27 | #
 | 
| 28 | 
 | 
class Token:
  """A lexical token produced by Tokenize.

  Attributes set in __init__:
    type -- token type: 'number', 'name', the operator/bracket text, or 'eof'
    val  -- token value: int for numbers, str otherwise
    loc  -- optional source location; Tokenize passes (0, 0) -- TODO: real positions
  """

  def __init__(self, type, val, loc=None):
    self.type = type
    self.val = val
    # BUG FIX: loc was accepted but silently discarded; keep it so callers
    # can report error locations.
    self.loc = loc

  def __repr__(self):
    return '<Token %s %s>' % (self.type, self.val)
 | 
| 36 | 
 | 
| 37 | 
 | 
| 38 | #
 | 
| 39 | # Using the pattern here: http://effbot.org/zone/xml-scanner.htm
 | 
| 40 | #
 | 
| 41 | 
 | 
# NOTE: () and [] need to be on their own so (-1+2) works
# BUG FIX: the pattern must be a raw string -- escapes like \s and \d inside
# a normal string literal are invalid escape sequences (a SyntaxWarning on
# modern Python, and slated to become an error).
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)
 | 
| 46 | 
 | 
def Tokenize(s):
  """Yield a Token for each lexeme TOKEN_RE finds in s.

  Exactly one of the four regex groups is non-empty per match; characters
  that match no group are silently skipped by findall.
  """
  for number, name, op, bracket in TOKEN_RE.findall(s):
    if number:
      tok = Token('number', int(number), loc=(0, 0))
    elif name:
      tok = Token('name', name, loc=(0, 0))
    elif op:
      tok = Token(op, op, loc=(0, 0))
    else:
      tok = Token(bracket, bracket, loc=(0, 0))
    yield tok
 | 
| 62 | 
 | 
| 63 | 
 | 
| 64 | #
 | 
| 65 | # Simple and Composite AST nodes
 | 
| 66 | #
 | 
| 67 | 
 | 
class Node(object):
  """Leaf AST node: wraps a single token."""

  def __init__(self, token):
    """
    Args:
      token: the Token this node represents; its val matters for
        numbers and names.
    """
    self.token = token

  def __repr__(self):
    return '%s' % self.token.val
 | 
| 79 | 
 | 
| 80 | 
 | 
class CompositeNode(Node):
  """Interior AST node: an operator token plus its operand subtrees."""

  def __init__(self, token, children):
    """
    Args:
      token: operator Token (only its type is shown in repr)
      children: list of child Node instances
    """
    Node.__init__(self, token)
    self.children = children

  def __repr__(self):
    # Lisp-style: (op child1 child2 ...)
    parts = [self.token.type] + [repr(c) for c in self.children]
    return '(%s)' % ' '.join(parts)
 | 
| 93 | 
 | 
| 94 | 
 | 
| 95 | #
 | 
| 96 | # Parser definition
 | 
| 97 | #
 | 
| 98 | 
 | 
class LeftInfo(object):
  """Table row for a token used in infix (left) position.

  In C++ this should be a big array.
  """

  def __init__(self, led=None, lbp=0, rbp=0):
    # Fall back to the error handler when no led is supplied.
    self.led = led if led else LeftError
    self.lbp = lbp
    self.rbp = rbp
 | 
| 108 | 
 | 
| 109 | 
 | 
class NullInfo(object):
  """Table row for a token used in prefix (null) position.

  In C++ this should be a big array.
  """

  def __init__(self, nud=None, bp=0):
    # Fall back to the error handler when no nud is supplied.
    self.nud = nud if nud else NullError
    self.bp = bp
 | 
| 118 | 
 | 
| 119 | 
 | 
class ParserSpec(object):
  """Specification for a TDOP parser.

  Maps each token type to its prefix (null) and infix (left) table rows;
  Parser consults these via LookupNull/LookupLeft.
  """

  def __init__(self):
    self.null_lookup = {}
    self.left_lookup = {}

  def Null(self, bp, nud, tokens):
    """Register a token that doesn't take anything on the left.

    Examples: constant, prefix operator, error.
    """
    for tok in tokens:
      self.null_lookup[tok] = NullInfo(nud=nud, bp=bp)
      # Infix use of this token is an error unless registered elsewhere.
      self.left_lookup.setdefault(tok, LeftInfo())

  def _RegisterLed(self, lbp, rbp, led, tokens):
    for tok in tokens:
      # Prefix use of this token is an error unless registered elsewhere.
      self.null_lookup.setdefault(tok, NullInfo(NullError))
      self.left_lookup[tok] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

  def Left(self, bp, led, tokens):
    """Register a token that takes an expression on the left."""
    self._RegisterLed(bp, bp, led, tokens)

  def LeftRightAssoc(self, bp, led, tokens):
    """Register a right associative operator."""
    # rbp one lower than lbp makes the recursion keep going on the same
    # operator, which yields right associativity.
    self._RegisterLed(bp, bp - 1, led, tokens)

  def LookupNull(self, token):
    """Get the parsing function and precedence for a null position token."""
    try:
      return self.null_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)

  def LookupLeft(self, token):
    """Get the parsing function and precedence for a left position token."""
    try:
      return self.left_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
 | 
| 166 | 
 | 
| 167 | 
 | 
EOF_TOKEN = Token('eof', 'eof')  # sentinel installed by Parser.Next when the lexer is exhausted
 | 
| 169 | 
 | 
| 170 | 
 | 
class Parser(object):
  """Recursive TDOP (top-down operator precedence) parser.

  Drives the nud/led functions registered in a ParserSpec over a stream
  of tokens.
  """

  def __init__(self, spec, lexer):
    """
    Args:
      spec: ParserSpec holding the operator tables
      lexer: iterator of Token instances (e.g. the Tokenize generator)
    """
    self.spec = spec
    self.lexer = lexer  # iterable
    self.token = None  # current token

  def AtToken(self, token_type):
    """Test if we are looking at a token of the given type."""
    return self.token.type == token_type

  def Next(self):
    """Move to the next token, or to EOF_TOKEN when the lexer runs out."""
    try:
      # BUG FIX: was self.lexer.next(), the Python 2-only iterator method;
      # the built-in next() works on both Python 2.6+ and Python 3.
      t = next(self.lexer)
    except StopIteration:
      t = EOF_TOKEN
    self.token = t

  def Eat(self, val):
    """Assert the value of the current token, then move to the next token."""
    if val and not self.AtToken(val):
      raise ParseError('expected %s, got %s' % (val, self.token))
    self.Next()

  def ParseUntil(self, rbp):
    """
    Parse to the right, eating tokens until we encounter a token with binding
    power LESS THAN OR EQUAL TO rbp.

    Returns:
      Node (or CompositeNode) for the parsed sub-expression.
    Raises:
      ParseError: on unexpected EOF or an unregistered token.
    """
    if self.AtToken('eof'):
      raise ParseError('Unexpected end of input')

    t = self.token
    self.Next()  # skip over the token, e.g. ! ~ + -

    # The first token of an expression is always in null (prefix) position.
    null_info = self.spec.LookupNull(t.type)
    node = null_info.nud(self, t, null_info.bp)

    while True:
      t = self.token
      left_info = self.spec.LookupLeft(t.type)

      # Examples:
      # If we see 1*2+  , rbp = 27 and lbp = 25, so stop.
      # If we see 1+2+  , rbp = 25 and lbp = 25, so stop.
      # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
      if rbp >= left_info.lbp:
        break
      self.Next()  # skip over the token, e.g. / *

      node = left_info.led(self, t, node, left_info.rbp)

    return node

  def Parse(self):
    """Parse a complete expression and return the root of the AST."""
    self.Next()
    return self.ParseUntil(0)
 |