# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
#import grammar, token, tokenize
# NOTE: Need these special versions of token/tokenize for BACKQUOTE and such.
from . import grammar, token, tokenize
from mycpp.mylib import log

_ = log


class PythonTokDef(object):
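  """Token/keyword/operator lookups used by make_label() below.

  make_label() calls GetTerminalNum/GetKeywordNum/GetOpNum to turn the labels
  it finds in the metagrammar (NAME, 'for', '>=', ...) into integer numbers.
  """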
 | 
  def GetTerminalNum(self, label):
    """ e.g. NAME -> 1 """
    itoken = getattr(token, label, None)
    assert isinstance(itoken, int), label
    assert itoken in token.tok_name, label
    return itoken

  def GetKeywordNum(self, value):
    """
    e.g. 'xor' -> Id.Expr_Xor

    Python doesn't have this, but Oil does.  Returns None if not found.
    """
    return None

  def GetOpNum(self, op_str):
    """
    e.g. '(' -> LPAR

    Raises an exception if it's not found.
    """
    return token.opmap[op_str]


class ParserGenerator(object):

    def __init__(self, lexer):
        self.lexer = lexer

    def parse(self):
        self.gettoken()  # Initialize lookahead

        dfas = {}
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
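            # e.g. a grammar line like
            #   arglist: argument (',' argument)* [',']
            # is parsed into an NFA (parse_rhs), then converted to a DFA below.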
 | 
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            #self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            #self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            #print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
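        # e.g. for the rule "x: 'a' ['b']", the NFA has an epsilon arc that
        # skips 'b'; the DFA state reached after 'a' therefore contains the
        # NFA's finish state and is final, with an optional 'b' arc out of it.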
 | 
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)
        def closure(state):
            base = {}
            addclosure(state, base)
            return base
        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)
        states = [DFAState(closure(start), finish)]
        for state in states: # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.
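        # e.g. for "x: 'a' | 'b'", the subset construction yields two distinct
        # final states (one reached via 'a', one via 'b'); both have no
        # outgoing arcs, so they compare equal and get merged into one.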
 | 
        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i+1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        #print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
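        # Alternation builds a fresh (aa, zz) pair with epsilon arcs (label
        # None) into each alternative's start and out of each alternative's end.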
 | 
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (token.NAME, token.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
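        # For '+' we return (a, z) plus a back-arc z->a, so the atom must be
        # matched at least once; for '*' we return (a, a), so zero matches are
        # also accepted.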
 | 
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.lexer)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.lexer)
        self.type, self.value, self.begin, self.end, self.line = tup
        #print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg, *args):
        if args:
            msg = msg % args
        raise SyntaxError(msg, ('<grammar>', self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = [] # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {} # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None # For Py3 compatibility.


def calcfirst(dfas, first, name):
    """Recursive function that mutates first."""
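    # first[name] ends up as a dict whose keys are the terminal labels (names
    # or quoted strings) that can begin an expansion of this rule; e.g. FIRST
    # of "x: 'a' | y" is {'a'} plus FIRST(y).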
 | 
    dfa = dfas[name]
    first[name] = None # dummy to detect left recursion
    state = dfa[0]
    totalset = {}
    overlapcheck = {}
    for label, _ in state.arcs.items():
        if label in dfas:
            if label in first:
                fset = first[label]
                if fset is None:
                    raise ValueError("recursion for rule %r" % name)
            else:
                calcfirst(dfas, first, label)
                fset = first[label]
            totalset.update(fset)
            overlapcheck[label] = fset
        else:
            totalset[label] = 1
            overlapcheck[label] = {label: 1}
    inverse = {}
    for label, itsfirst in overlapcheck.items():
        for symbol in itsfirst:
            if symbol in inverse:
                raise ValueError("rule %s is ambiguous; %s is in the"
                                 " first sets of %s as well as %s" %
                                 (name, symbol, label, inverse[symbol]))
            inverse[symbol] = label
    first[name] = totalset


def make_label(tok_def, gr, label):
    """Given a grammar item, return a unique integer representing it.

    It could be:
    1. or_test      - a non-terminal
    2. Expr_Name    - a terminal
    3. 'for'        - keyword   (quotes)
    4. '>='         - operator  (quotes)

    Oil addition
    5. Op_RBracket -- anything with _ is assumed to be in the Id namespace.
    """
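    # The returned ilabel is an index into gr.labels.  gr.symbol2label,
    # gr.tokens, and gr.keywords cache previously assigned indices so each
    # distinct grammar item maps to exactly one label number.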
 | 
    #log('make_label %r', label)
    # XXX Maybe this should be a method on a subclass of converter?
    ilabel = len(gr.labels)
    if label[0].isalpha():
        if label in gr.symbol2number:  # NON-TERMINAL
            if label in gr.symbol2label:
                return gr.symbol2label[label]
            else:
                gr.labels.append(gr.symbol2number[label])
                gr.symbol2label[label] = ilabel
                return ilabel
        else:  # TERMINAL like (NAME, NUMBER, STRING)
            itoken = tok_def.GetTerminalNum(label)
            if itoken in gr.tokens:
                return gr.tokens[itoken]
            else:
                gr.labels.append(itoken)
                #log('%s %d -> %s', token.tok_name[itoken], itoken, ilabel)
                gr.tokens[itoken] = ilabel
                return ilabel
    else:
        # Either a keyword or an operator
        assert label[0] in ('"', "'"), label
        value = eval(label)

        # Treat 'xor' just like '^'.  TODO: I think this code can be
        # simplified.
        n = tok_def.GetKeywordNum(value)  # int or None

        if value[0].isalpha() and n is None:  # A word like 'for', 'xor'
            # Then look in the keywords automatically extracted from the
            # grammar.
            if value in gr.keywords:
                return gr.keywords[value]
            else:
                gr.labels.append(token.NAME)  # arbitrary number < NT_OFFSET
                gr.keywords[value] = ilabel
                return ilabel

        else:  # An operator e.g. '>='
            if n is None:
                itoken = tok_def.GetOpNum(value)
            else:
                itoken = n

            if itoken in gr.tokens:
                return gr.tokens[itoken]
            else:
                gr.labels.append(itoken)
                gr.tokens[itoken] = ilabel
                return ilabel


def make_first(tok_def, rawfirst, gr):
    first = {}
    for label in sorted(rawfirst):
        ilabel = make_label(tok_def, gr, label)
        ##assert ilabel not in first # XXX failed on <> ... !=
        first[ilabel] = 1
    return first


def MakeGrammar(f, tok_def=None):
  """Construct a Grammar object from a file."""

  lexer = tokenize.generate_tokens(f.readline)
  p = ParserGenerator(lexer)
  dfas, startsymbol = p.parse()

  first = {}  # map from symbol name to set of tokens
  for name in sorted(dfas):
    if name not in first:
      calcfirst(dfas, first, name)
      #print name, self.first[name].keys()

  # TODO: startsymbol support could be removed.  The call to p.setup() in
  # PushTokens() can always specify it explicitly.
  names = sorted(dfas)
  names.remove(startsymbol)
  names.insert(0, startsymbol)

  gr = grammar.Grammar()
  for name in names:
      i = token.NT_OFFSET + len(gr.symbol2number)
      gr.symbol2number[name] = i
      gr.number2symbol[i] = name

  tok_def = tok_def or PythonTokDef()
  for name in names:
      dfa = dfas[name]
      states = []
      for state in dfa:
          arcs = []
          for label, next_ in sorted(state.arcs.items()):
              arcs.append((make_label(tok_def, gr, label), dfa.index(next_)))
          if state.isfinal:
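              # Label 0 is the reserved 'EMPTY' label; an arc on label 0 back
              # to this state marks it as an accepting (final) state in the
              # generated tables.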
 | 
              arcs.append((0, dfa.index(state)))
          states.append(arcs)
      gr.states.append(states)
      fi = make_first(tok_def, first[name], gr)
      gr.dfas[gr.symbol2number[name]] = (states, fi)

  gr.start = gr.symbol2number[startsymbol]
  return gr
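

# A minimal usage sketch (the file name is hypothetical; any pgen-style
# metagrammar with lines like "arglist: argument (',' argument)* [',']" works):
#
#   with open('expr.grammar') as f:
#     gr = MakeGrammar(f)
#   # gr.start is the number assigned to the first rule in the file.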
 |