OILS / demo / shedskin / tdop.py

1"""
2tdop.py
3"""
4
5import re
6
7
8class ParseError(Exception):
9 pass
10
11
12#
13# Default parsing functions give errors
14#
15
16def NullError(p, token, bp):
17 raise ParseError("%s can't be used in prefix position" % token)
18
19
20def LeftError(p, token, left, rbp):
21 # Hm is this not called because of binding power?
22 raise ParseError("%s can't be used in infix position" % token)
23
24
25#
26# Input
27#
28
class Token:
  def __init__(self, type, val, loc=None):
    self.type = type
    self.val = val
    # NOTE: 'loc' is accepted but not stored; Tokenize() passes a dummy
    # location.

  def __repr__(self):
    return '<Token %s %s>' % (self.type, self.val)


#
# Using the pattern here: http://effbot.org/zone/xml-scanner.htm
#

# NOTE: () and [] need to be on their own so (-1+2) works
# Groups: (number) | (name) | (run of operator chars) | (single bracket)
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)

def Tokenize(s):
  for item in TOKEN_RE.findall(s):
    if item[0]:
      typ = 'number'
      val = int(item[0])
    elif item[1]:
      typ = 'name'
      val = item[1]
    elif item[2]:
      typ = item[2]
      val = item[2]
    elif item[3]:
      typ = item[3]
      val = item[3]
    yield Token(typ, val, loc=(0, 0))

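# Illustrative example (not in the original module), using the tokenizer above:
#
#   >>> list(Tokenize('(1+2)'))
#   [<Token ( (>, <Token number 1>, <Token + +>, <Token number 2>, <Token ) )>]
#
# Brackets and runs of operator characters become their own token types, and
# numbers are converted to ints.
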

#
# Simple and Composite AST nodes
#

class Node(object):
  def __init__(self, token):
    """
    Args:
      token: the Token this leaf node wraps; its val only matters for numbers
        and names.
    """
    self.token = token

  def __repr__(self):
    return str(self.token.val)


class CompositeNode(Node):
  def __init__(self, token, children):
    """
    Args:
      token: the operator Token for this node
      children: list of child Nodes
    """
    Node.__init__(self, token)
    self.children = children

  def __repr__(self):
    args = ''.join([" " + repr(c) for c in self.children])
    return "(" + self.token.type + args + ")"


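# Illustrative note (not in the original module): CompositeNode.__repr__
# renders trees in prefix (Lisp-like) form, e.g. a tree for 1 + 2*3 prints as
# (+ 1 (* 2 3)).
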
#
# Parser definition
#

class LeftInfo(object):
  """Row for operator.

  In C++ this should be a big array.
  """
  def __init__(self, led=None, lbp=0, rbp=0):
    self.led = led or LeftError
    self.lbp = lbp
    self.rbp = rbp


class NullInfo(object):
  """Row for operator.

  In C++ this should be a big array.
  """
  def __init__(self, nud=None, bp=0):
    self.nud = nud or NullError
    self.bp = bp


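# Illustrative note (not in the original module): a 'nud' handler has the
# signature nud(parser, token, bp) and a 'led' handler has the signature
# led(parser, token, left_node, rbp), matching NullError and LeftError above.
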
class ParserSpec(object):
  """Specification for a TDOP parser."""

  def __init__(self):
    self.null_lookup = {}
    self.left_lookup = {}

  def Null(self, bp, nud, tokens):
    """Register a token that doesn't take anything on the left.

    Examples: constant, prefix operator, error.
    """
    for token in tokens:
      self.null_lookup[token] = NullInfo(nud=nud, bp=bp)
      if token not in self.left_lookup:
        self.left_lookup[token] = LeftInfo()  # error

  def _RegisterLed(self, lbp, rbp, led, tokens):
    for token in tokens:
      if token not in self.null_lookup:
        self.null_lookup[token] = NullInfo(NullError)
      self.left_lookup[token] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

  def Left(self, bp, led, tokens):
    """Register a token that takes an expression on the left."""
    self._RegisterLed(bp, bp, led, tokens)

  def LeftRightAssoc(self, bp, led, tokens):
    """Register a right associative operator."""
    self._RegisterLed(bp, bp-1, led, tokens)

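  # Illustrative note (not in the original module): registering '**' with
  # LeftRightAssoc(29, ...) gives lbp=29, rbp=28, so a second '**' seen while
  # parsing the right operand (lbp 29 > rbp 28) keeps going, and a**b**c
  # groups as a**(b**c). The binding power 29 is an assumed example value.
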
  def LookupNull(self, token):
    """Get the parsing function and precedence for a null position token."""
    try:
      nud = self.null_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
    return nud

  def LookupLeft(self, token):
    """Get the parsing function and precedence for a left position token."""
    try:
      led = self.left_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
    return led


EOF_TOKEN = Token('eof', 'eof')


class Parser(object):
  """Recursive TDOP parser."""

  def __init__(self, spec, lexer):
    self.spec = spec
    self.lexer = lexer  # iterable
    self.token = None  # current token

  def AtToken(self, token_type):
    """Test if we are looking at a token."""
    return self.token.type == token_type

  def Next(self):
    """Move to the next token."""
    try:
      # Python 2 iterator protocol (Shed Skin targets Python 2).
      t = self.lexer.next()
    except StopIteration:
      t = EOF_TOKEN
    self.token = t

  def Eat(self, val):
    """Assert the value of the current token, then move to the next token."""
    if val and not self.AtToken(val):
      raise ParseError('expected %s, got %s' % (val, self.token))
    self.Next()

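  # Illustrative note (not in the original module): Eat() is what a grammar's
  # grouping handler would use, e.g. a '(' nud could call p.ParseUntil(0) and
  # then p.Eat(')'). The concrete handlers live outside this module.
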
  def ParseUntil(self, rbp):
    """
    Parse to the right, eating tokens until we encounter a token with binding
    power LESS THAN OR EQUAL TO rbp.
    """
    if self.AtToken('eof'):
      raise ParseError('Unexpected end of input')

    t = self.token
    self.Next()  # skip over the token, e.g. ! ~ + -

    null_info = self.spec.LookupNull(t.type)
    node = null_info.nud(self, t, null_info.bp)

    while True:
      t = self.token
      left_info = self.spec.LookupLeft(t.type)

      # Examples:
      # If we see 1*2+  , rbp = 27 and lbp = 25, so stop.
      # If we see 1+2+  , rbp = 25 and lbp = 25, so stop.
      # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
      if rbp >= left_info.lbp:
        break
      self.Next()  # skip over the token, e.g. / *

      node = left_info.led(self, t, node, left_info.rbp)

    return node

  def Parse(self):
    self.Next()
    return self.ParseUntil(0)
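

# Minimal usage sketch (not part of the original module). The handler and spec
# names below (NullConstant, LeftBinaryOp, MakeDemoSpec) and the binding powers
# are assumptions for illustration; the real grammar for this demo is defined
# outside this file.

def NullConstant(p, token, bp):
  """nud for literals and names: produce a leaf node."""
  return Node(token)


def LeftBinaryOp(p, token, left, rbp):
  """led for binary operators: parse the right operand with rbp."""
  return CompositeNode(token, [left, p.ParseUntil(rbp)])


def MakeDemoSpec():
  spec = ParserSpec()
  spec.Null(0, NullConstant, ['number', 'name'])
  # 'eof' needs an entry so ParseUntil's left lookup can find it and stop.
  spec.Null(-1, NullError, ['eof'])
  spec.Left(25, LeftBinaryOp, ['+', '-'])
  spec.Left(27, LeftBinaryOp, ['*', '/'])
  return spec


if __name__ == '__main__':
  p = Parser(MakeDemoSpec(), Tokenize('1 + 2 * 3'))
  print(p.Parse())  # prints the prefix form: (+ 1 (* 2 3))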