| 1 | """
 | 
| 2 | tdop.py
 | 
| 3 | """
 | 
| 4 | 
 | 
| 5 | import re
 | 
| 6 | 
 | 
| 7 | 
 | 
class ParseError(Exception):
  """Raised for any syntax error found while tokenizing or parsing."""
 | 
| 10 | 
 | 
| 11 | 
 | 
| 12 | #
 | 
| 13 | # Default parsing functions give errors
 | 
| 14 | #
 | 
| 15 | 
 | 
def NullError(p, token, bp):
  """Default nud: signal that the token is not valid in prefix position."""
  msg = "%s can't be used in prefix position" % token
  raise ParseError(msg)
 | 
| 18 | 
 | 
| 19 | 
 | 
def LeftError(p, token, left, rbp):
  """Default led: signal that the token is not valid in infix position."""
  # Hm is this not called because of binding power?
  msg = "%s can't be used in infix position" % token
  raise ParseError(msg)
 | 
| 23 | 
 | 
| 24 | 
 | 
| 25 | #
 | 
| 26 | # Input
 | 
| 27 | #
 | 
| 28 | 
 | 
class Token:
  """A lexical token produced by Tokenize.

  Attributes set in __init__:
    type -- token type: 'number', 'name', the operator/bracket text, or 'eof'
    val  -- token value: int for numbers, str otherwise
    loc  -- optional source location; Tokenize passes (0, 0) -- TODO: real positions
  """

  def __init__(self, type, val, loc=None):
    self.type = type
    self.val = val
    # BUG FIX: loc was accepted but silently discarded; keep it so callers
    # can report error locations.
    self.loc = loc

  def __repr__(self):
    return '<Token %s %s>' % (self.type, self.val)
 | 
| 36 | 
 | 
| 37 | 
 | 
| 38 | #
 | 
| 39 | # Using the pattern here: http://effbot.org/zone/xml-scanner.htm
 | 
| 40 | #
 | 
| 41 | 
 | 
# NOTE: () and [] need to be on their own so (-1+2) works
# BUG FIX: the pattern must be a raw string -- escapes like \s and \d inside
# a normal string literal are invalid escape sequences (a SyntaxWarning on
# modern Python, and slated to become an error).
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)
 | 
| 46 | 
 | 
def Tokenize(s):
  """Yield a Token for each lexeme TOKEN_RE finds in s.

  Exactly one of the four regex groups is non-empty per match; characters
  that match no group are silently skipped by findall.
  """
  for number, name, op, bracket in TOKEN_RE.findall(s):
    if number:
      tok = Token('number', int(number), loc=(0, 0))
    elif name:
      tok = Token('name', name, loc=(0, 0))
    elif op:
      tok = Token(op, op, loc=(0, 0))
    else:
      tok = Token(bracket, bracket, loc=(0, 0))
    yield tok
 | 
| 62 | 
 | 
| 63 | 
 | 
| 64 | #
 | 
| 65 | # Simple and Composite AST nodes
 | 
| 66 | #
 | 
| 67 | 
 | 
class Node(object):
  """Leaf AST node: wraps a single token."""

  def __init__(self, token):
    """
    Args:
      token: the Token this node represents; its val matters for
        numbers and names.
    """
    self.token = token

  def __repr__(self):
    return '%s' % self.token.val
 | 
| 79 | 
 | 
| 80 | 
 | 
class CompositeNode(Node):
  """Interior AST node: an operator token plus its operand subtrees."""

  def __init__(self, token, children):
    """
    Args:
      token: operator Token (only its type is shown in repr)
      children: list of child Node instances
    """
    Node.__init__(self, token)
    self.children = children

  def __repr__(self):
    # Lisp-style: (op child1 child2 ...)
    parts = [self.token.type] + [repr(c) for c in self.children]
    return '(%s)' % ' '.join(parts)
 | 
| 93 | 
 | 
| 94 | 
 | 
| 95 | #
 | 
| 96 | # Parser definition
 | 
| 97 | #
 | 
| 98 | 
 | 
class LeftInfo(object):
  """Table row for a token used in infix (left) position.

  In C++ this should be a big array.
  """

  def __init__(self, led=None, lbp=0, rbp=0):
    # Fall back to the error handler when no led is supplied.
    self.led = led if led else LeftError
    self.lbp = lbp
    self.rbp = rbp
 | 
| 108 | 
 | 
| 109 | 
 | 
class NullInfo(object):
  """Table row for a token used in prefix (null) position.

  In C++ this should be a big array.
  """

  def __init__(self, nud=None, bp=0):
    # Fall back to the error handler when no nud is supplied.
    self.nud = nud if nud else NullError
    self.bp = bp
 | 
| 118 | 
 | 
| 119 | 
 | 
class ParserSpec(object):
  """Specification for a TDOP parser.

  Maps each token type to its prefix (null) and infix (left) table rows;
  Parser consults these via LookupNull/LookupLeft.
  """

  def __init__(self):
    self.null_lookup = {}
    self.left_lookup = {}

  def Null(self, bp, nud, tokens):
    """Register a token that doesn't take anything on the left.

    Examples: constant, prefix operator, error.
    """
    for tok in tokens:
      self.null_lookup[tok] = NullInfo(nud=nud, bp=bp)
      # Infix use of this token is an error unless registered elsewhere.
      self.left_lookup.setdefault(tok, LeftInfo())

  def _RegisterLed(self, lbp, rbp, led, tokens):
    for tok in tokens:
      # Prefix use of this token is an error unless registered elsewhere.
      self.null_lookup.setdefault(tok, NullInfo(NullError))
      self.left_lookup[tok] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

  def Left(self, bp, led, tokens):
    """Register a token that takes an expression on the left."""
    self._RegisterLed(bp, bp, led, tokens)

  def LeftRightAssoc(self, bp, led, tokens):
    """Register a right associative operator."""
    # rbp one lower than lbp makes the recursion keep going on the same
    # operator, which yields right associativity.
    self._RegisterLed(bp, bp - 1, led, tokens)

  def LookupNull(self, token):
    """Get the parsing function and precedence for a null position token."""
    try:
      return self.null_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)

  def LookupLeft(self, token):
    """Get the parsing function and precedence for a left position token."""
    try:
      return self.left_lookup[token]
    except KeyError:
      raise ParseError('Unexpected token %r' % token)
 | 
| 166 | 
 | 
| 167 | 
 | 
EOF_TOKEN = Token('eof', 'eof')  # sentinel installed by Parser.Next when the lexer is exhausted
 | 
| 169 | 
 | 
| 170 | 
 | 
class Parser(object):
  """Recursive TDOP (top-down operator precedence) parser.

  Drives the nud/led functions registered in a ParserSpec over a stream
  of tokens.
  """

  def __init__(self, spec, lexer):
    """
    Args:
      spec: ParserSpec holding the operator tables
      lexer: iterator of Token instances (e.g. the Tokenize generator)
    """
    self.spec = spec
    self.lexer = lexer  # iterable
    self.token = None  # current token

  def AtToken(self, token_type):
    """Test if we are looking at a token of the given type."""
    return self.token.type == token_type

  def Next(self):
    """Move to the next token, or to EOF_TOKEN when the lexer runs out."""
    try:
      # BUG FIX: was self.lexer.next(), the Python 2-only iterator method;
      # the built-in next() works on both Python 2.6+ and Python 3.
      t = next(self.lexer)
    except StopIteration:
      t = EOF_TOKEN
    self.token = t

  def Eat(self, val):
    """Assert the value of the current token, then move to the next token."""
    if val and not self.AtToken(val):
      raise ParseError('expected %s, got %s' % (val, self.token))
    self.Next()

  def ParseUntil(self, rbp):
    """
    Parse to the right, eating tokens until we encounter a token with binding
    power LESS THAN OR EQUAL TO rbp.

    Returns:
      Node (or CompositeNode) for the parsed sub-expression.
    Raises:
      ParseError: on unexpected EOF or an unregistered token.
    """
    if self.AtToken('eof'):
      raise ParseError('Unexpected end of input')

    t = self.token
    self.Next()  # skip over the token, e.g. ! ~ + -

    # The first token of an expression is always in null (prefix) position.
    null_info = self.spec.LookupNull(t.type)
    node = null_info.nud(self, t, null_info.bp)

    while True:
      t = self.token
      left_info = self.spec.LookupLeft(t.type)

      # Examples:
      # If we see 1*2+  , rbp = 27 and lbp = 25, so stop.
      # If we see 1+2+  , rbp = 25 and lbp = 25, so stop.
      # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
      if rbp >= left_info.lbp:
        break
      self.Next()  # skip over the token, e.g. / *

      node = left_info.led(self, t, node, left_info.rbp)

    return node

  def Parse(self):
    """Parse a complete expression and return the root of the AST."""
    self.Next()
    return self.ParseUntil(0)
 |