| 1 | #!/usr/bin/env python
 | 
| 2 | """
 | 
| 3 | tdop.py
 | 
| 4 | """
 | 
| 5 | 
 | 
| 6 | import re
 | 
| 7 | 
 | 
| 8 | 
 | 
class ParseError(Exception):
  """Raised for any syntax error found while tokenizing or parsing."""
 | 
| 11 | 
 | 
| 12 | 
 | 
| 13 | #
 | 
| 14 | # Default parsing functions give errors
 | 
| 15 | #
 | 
| 16 | 
 | 
def NullError(p, token, bp):
  """Default null denotation: reject a token that can't begin an expression."""
  raise ParseError("%s can't be used in prefix position" % token)
 | 
| 19 | 
 | 
| 20 | 
 | 
def LeftError(p, token, left, rbp):
  """Default left denotation: reject a token that can't follow an expression.

  (Hm is this not called because of binding power?)
  """
  raise ParseError("%s can't be used in infix position" % token)
 | 
| 24 | 
 | 
| 25 | 
 | 
| 26 | #
 | 
| 27 | # Input
 | 
| 28 | #
 | 
| 29 | 
 | 
class Token:
  """A lexical token produced by Tokenize().

  Attributes:
    type: token type ('number', 'name', 'eof', or the operator/bracket text)
    val: token value; an int for 'number' tokens, otherwise the lexeme
    loc: optional source location tuple, or None
  """

  def __init__(self, type, val, loc=None):
    self.type = type
    self.val = val
    # BUG FIX: loc was accepted but silently discarded; keep it so callers
    # (e.g. error reporting) can actually use the location they pass in.
    self.loc = loc

  def __repr__(self):
    return '<Token %s %s>' % (self.type, self.val)
 | 
| 37 | 
 | 
| 38 | 
 | 
| 39 | #
 | 
| 40 | # Using the pattern here: http://effbot.org/zone/xml-scanner.htm
 | 
| 41 | #
 | 
| 42 | 
 | 
# NOTE: () and [] need to be on their own so (-1+2) works
# BUG FIX: the pattern must be a raw string — without r'' the backslash
# sequences (\s, \d, \w, \-, ...) are invalid string escapes and raise
# DeprecationWarning/SyntaxWarning on Python 3.
TOKEN_RE = re.compile(r"""
\s* (?: (\d+) | (\w+) | ( [\-\+\*/%!~<>=&^|?:,]+ ) | ([\(\)\[\]]) )
""", re.VERBOSE)
 | 
| 47 | 
 | 
def Tokenize(s):
  """Yield Token objects for the expression string s.

  Characters the regex doesn't recognize are silently skipped.
  """
  for num, name, op, bracket in TOKEN_RE.findall(s):
    if num:
      tok = Token('number', int(num), loc=(0, 0))
    elif name:
      tok = Token('name', name, loc=(0, 0))
    elif op:
      tok = Token(op, op, loc=(0, 0))
    else:
      tok = Token(bracket, bracket, loc=(0, 0))
    yield tok
 | 
| 63 | 
 | 
| 64 | 
 | 
| 65 | #
 | 
| 66 | # Simple and Composite AST nodes
 | 
| 67 | #
 | 
| 68 | 
 | 
class Node(object):
  """Leaf node of the AST: wraps a single token.

  Args:
    token: the Token this leaf represents; val only matters for
      number and string tokens.
  """

  def __init__(self, token):
    self.token = token

  def __repr__(self):
    return str(self.token.val)
 | 
| 80 | 
 | 
| 81 | 
 | 
class CompositeNode(Node):
  """Interior AST node: an operator token plus a list of child nodes."""

  def __init__(self, token, children):
    """
    Args:
      token: operator token; its type labels this node.
      children: list of Node/CompositeNode operands.
    """
    Node.__init__(self, token)
    self.children = children

  def __repr__(self):
    # Lisp-style rendering, e.g. (+ 1 (* 2 3))
    parts = [self.token.type]
    parts.extend(repr(c) for c in self.children)
    return '(' + ' '.join(parts) + ')'
 | 
| 94 | 
 | 
| 95 | 
 | 
| 96 | #
 | 
| 97 | # Parser definition
 | 
| 98 | #
 | 
| 99 | 
 | 
class LeftInfo(object):
  """Parse-table row for a token in infix (left) position.

  In C++ this should be a big array.
  """

  def __init__(self, led=None, lbp=0, rbp=0):
    # Fall back to the error handler when no led is supplied.
    self.led = led if led else LeftError
    self.lbp = lbp  # left binding power
    self.rbp = rbp  # right binding power
 | 
| 109 | 
 | 
| 110 | 
 | 
class NullInfo(object):
  """Parse-table row for a token in prefix (null) position.

  In C++ this should be a big array.
  """

  def __init__(self, nud=None, bp=0):
    # Fall back to the error handler when no nud is supplied.
    self.nud = nud if nud else NullError
    self.bp = bp  # binding power used when parsing the operand
 | 
| 119 | 
 | 
| 120 | 
 | 
class ParserSpec(object):
  """Specification for a TDOP parser.

  Maps token types to their prefix (null) and infix (left) parse rules.
  """

  def __init__(self):
    self.null_lookup = {}  # token type -> NullInfo
    self.left_lookup = {}  # token type -> LeftInfo

  def Null(self, bp, nud, tokens):
    """Register a token that doesn't take anything on the left.

    Examples: constant, prefix operator, error.
    """
    for tok in tokens:
      self.null_lookup[tok] = NullInfo(nud=nud, bp=bp)
      # Using such a token in infix position is an error by default.
      self.left_lookup.setdefault(tok, LeftInfo())

  def _RegisterLed(self, lbp, rbp, led, tokens):
    for tok in tokens:
      # Using such a token in prefix position is an error by default.
      self.null_lookup.setdefault(tok, NullInfo(NullError))
      self.left_lookup[tok] = LeftInfo(lbp=lbp, rbp=rbp, led=led)

  def Left(self, bp, led, tokens):
    """Register a token that takes an expression on the left."""
    self._RegisterLed(bp, bp, led, tokens)

  def LeftRightAssoc(self, bp, led, tokens):
    """Register a right associative operator."""
    self._RegisterLed(bp, bp - 1, led, tokens)

  def LookupNull(self, token):
    """Get the parsing function and precedence for a null position token."""
    info = self.null_lookup.get(token)
    if info is None:
      raise ParseError('Unexpected token %r' % token)
    return info

  def LookupLeft(self, token):
    """Get the parsing function and precedence for a left position token."""
    info = self.left_lookup.get(token)
    if info is None:
      raise ParseError('Unexpected token %r' % token)
    return info
 | 
| 167 | 
 | 
| 168 | 
 | 
EOF_TOKEN = Token('eof', 'eof')  # sentinel the parser sees when the lexer is exhausted
 | 
| 170 | 
 | 
| 171 | 
 | 
class Parser(object):
  """Recursive TDOP (top-down operator precedence) parser."""

  def __init__(self, spec, lexer):
    """
    Args:
      spec: ParserSpec holding the null/left parse tables.
      lexer: iterator yielding Token objects.
    """
    self.spec = spec
    self.lexer = lexer  # iterable
    self.token = None  # current token

  def AtToken(self, token_type):
    """Test if we are looking at a token of the given type."""
    return self.token.type == token_type

  def Next(self):
    """Move to the next token, substituting EOF_TOKEN when input runs out."""
    try:
      # BUG FIX: self.lexer.next() is Python-2-only; the next() builtin
      # (available since 2.6) works on both Python 2 and 3.
      t = next(self.lexer)
    except StopIteration:
      t = EOF_TOKEN
    self.token = t

  def Eat(self, val):
    """Assert the type of the current token, then move to the next token."""
    if val and not self.AtToken(val):
      raise ParseError('expected %s, got %s' % (val, self.token))
    self.Next()

  def ParseUntil(self, rbp):
    """
    Parse to the right, eating tokens until we encounter a token with binding
    power LESS THAN OR EQUAL TO rbp.

    Returns:
      Node/CompositeNode for the parsed expression.
    Raises:
      ParseError: on unexpected end of input or an unregistered token.
    """
    if self.AtToken('eof'):
      raise ParseError('Unexpected end of input')

    t = self.token
    self.Next()  # skip over the token, e.g. ! ~ + -

    null_info = self.spec.LookupNull(t.type)
    node = null_info.nud(self, t, null_info.bp)

    while True:
      t = self.token
      left_info = self.spec.LookupLeft(t.type)

      # Examples:
      # If we see 1*2+  , rbp = 27 and lbp = 25, so stop.
      # If we see 1+2+  , rbp = 25 and lbp = 25, so stop.
      # If we see 1**2**, rbp = 26 and lbp = 27, so keep going.
      if rbp >= left_info.lbp:
        break
      self.Next()  # skip over the token, e.g. / *

      node = left_info.led(self, t, node, left_info.rbp)

    return node

  def Parse(self):
    """Parse a complete expression, starting from the first token."""
    self.Next()
    return self.ParseUntil(0)
 |