| 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
 | 
| 2 | # Licensed to PSF under a Contributor Agreement.
 | 
| 3 | 
 | 
| 4 | """Parser engine for the grammar tables generated by pgen.
 | 
| 5 | 
 | 
| 6 | The grammar table must be loaded first.
 | 
| 7 | 
 | 
| 8 | See Parser/parser.c in the Python distribution for additional info on
 | 
| 9 | how this parsing engine works.
 | 
| 10 | """
 | 
| 11 | 
 | 
| 12 | from mycpp.mylib import log
 | 
| 13 | _ = log
 | 
| 14 | 
 | 
| 15 | from typing import TYPE_CHECKING, Optional, List
 | 
| 16 | from pgen2.pnode import PNode, PNodeAllocator
 | 
| 17 | 
 | 
| 18 | if TYPE_CHECKING:
 | 
| 19 |   from _devbuild.gen.syntax_asdl import Token
 | 
| 20 |   from pgen2.grammar import Grammar, dfa_t
 | 
| 21 | 
 | 
| 22 | # Duplicated to avoid dependency
 | 
| 23 | # from pgen2 import token
 | 
| 24 | 
 | 
| 25 | NT_OFFSET = 256
 | 
| 26 | 
 | 
| 27 | 
 | 
| 28 | class ParseError(Exception):
 | 
| 29 |     """Exception to signal the parser is stuck."""
 | 
| 30 | 
 | 
| 31 |     def __init__(self, msg, type_, tok):
 | 
| 32 |         # type: (str, int, Token) -> None
 | 
| 33 |         self.msg = msg
 | 
| 34 |         self.type = type_
 | 
| 35 |         self.tok = tok
 | 
| 36 | 
 | 
| 37 |     def __repr__(self):
 | 
| 38 |         # type: () -> str
 | 
| 39 |         return "%s: type=%d, tok=%r" % (self.msg, self.type, self.tok)
 | 
| 40 | 
 | 
| 41 | 
 | 
| 42 | class _StackItem(object):
 | 
| 43 |   def __init__(self, dfa, state, node):
 | 
| 44 |     # type: (dfa_t, int, PNode) -> None
 | 
| 45 |     self.dfa = dfa
 | 
| 46 |     self.state = state
 | 
| 47 |     self.node = node
 | 
| 48 | 
 | 
| 49 | 
 | 
| 50 | class Parser(object):
 | 
| 51 |     """Parser engine.
 | 
| 52 | 
 | 
| 53 |     The proper usage sequence is:
 | 
| 54 | 
 | 
| 55 |     p = Parser(grammar, [converter])  # create instance
 | 
| 56 |     p.setup(start)                    # prepare for parsing
 | 
| 57 |     <for each input token>:
 | 
| 58 |         if p.addtoken(...):           # parse a token; may raise ParseError
 | 
| 59 |             break
 | 
| 60 |     root = p.rootnode                 # root of abstract syntax tree
 | 
| 61 | 
 | 
| 62 |     A Parser instance may be reused by calling setup() repeatedly.
 | 
| 63 | 
 | 
| 64 |     A Parser instance contains state pertaining to the current token
 | 
| 65 |     sequence, and should not be used concurrently by different threads
 | 
| 66 |     to parse separate token sequences.
 | 
| 67 | 
 | 
| 68 |     See driver.py for how to get input tokens by tokenizing a file or
 | 
| 69 |     string.
 | 
| 70 | 
 | 
| 71 |     Parsing is complete when addtoken() returns True; the root of the
 | 
| 72 |     abstract syntax tree can then be retrieved from the rootnode
 | 
| 73 |     instance variable.  When a syntax error occurs, addtoken() raises
 | 
| 74 |     the ParseError exception.  There is no error recovery; the parser
 | 
| 75 |     cannot be used after a syntax error was reported (but it can be
 | 
| 76 |     reinitialized by calling setup()).
 | 
| 77 |     """
 | 
| 78 | 
 | 
| 79 |     def __init__(self, grammar):
 | 
| 80 |         # type: (Grammar) -> None
 | 
| 81 |         """Constructor.
 | 
| 82 | 
 | 
| 83 |         The grammar argument is a grammar.Grammar instance; see the
 | 
| 84 |         grammar module for more information.
 | 
| 85 | 
 | 
| 86 |         The parser is not ready yet for parsing; you must call the
 | 
| 87 |         setup() method to get it started.
 | 
| 88 | 
 | 
| 89 |         A concrete syntax tree node is a (type, value, context, nodes)
 | 
| 90 |         tuple, where type is the node type (a token or symbol number),
 | 
| 91 |         value is None for symbols and a string for tokens, context is
 | 
| 92 |         None or an opaque value used for error reporting (typically a
 | 
| 93 |         (lineno, offset) pair), and nodes is a list of children for
 | 
| 94 |         symbols, and None for tokens.
 | 
| 95 | 
 | 
| 96 |         An abstract syntax tree node may be anything; this is entirely
 | 
| 97 |         up to the converter function.
 | 
| 98 |         """
 | 
| 99 |         self.grammar = grammar
 | 
| 100 |         self.rootnode = None  # type: Optional[PNode]
 | 
| 101 |         self.stack = [] # type: List[_StackItem]
 | 
| 102 |         self.pnode_alloc = None # type: Optional[PNodeAllocator]
 | 
| 103 | 
 | 
| 104 |     def setup(self, start, pnode_alloc):
 | 
| 105 |         # type: (int, PNodeAllocator) -> None
 | 
| 106 |         """Prepare for parsing.
 | 
| 107 | 
 | 
| 108 |         This *must* be called before starting to parse.
 | 
| 109 | 
 | 
| 110 |         The optional argument is an alternative start symbol; it
 | 
| 111 |         defaults to the grammar's start symbol.
 | 
| 112 | 
 | 
| 113 |         You can use a Parser instance to parse any number of programs;
 | 
| 114 |         each time you call setup() the parser is reset to an initial
 | 
| 115 |         state determined by the (implicit or explicit) start symbol.
 | 
| 116 |         """
 | 
| 117 |         self.pnode_alloc = pnode_alloc
 | 
| 118 |         newnode = self.pnode_alloc.NewPNode(start, None)
 | 
| 119 |         # Each stack entry is a tuple: (dfa, state, node).
 | 
| 120 |         self.stack = [_StackItem(self.grammar.dfas[start], 0, newnode)]
 | 
| 121 |         self.rootnode = None
 | 
| 122 | 
 | 
| 123 |     def addtoken(self, typ, opaque, ilabel):
 | 
| 124 |         # type: (int, Token, int) -> bool
 | 
| 125 |         """Add a token; return True iff this is the end of the program."""
 | 
| 126 |         # Loop until the token is shifted; may raise exceptions
 | 
| 127 | 
 | 
| 128 |         # Andy NOTE: This is not linear time, i.e. a constant amount of work
 | 
| 129 |         # for each token?  Is it O(n^2) as the ANTLR paper says?
 | 
| 130 |         # Do the "accelerators" in pgen.c have anything to do with it?
 | 
| 131 | 
 | 
| 132 |         while True:
 | 
| 133 |             top = self.stack[-1]
 | 
| 134 |             states, _ = top.dfa
 | 
| 135 |             state = top.state
 | 
| 136 | 
 | 
| 137 |             arcs = states[state]
 | 
| 138 | 
 | 
| 139 |             # Look for a state with this label
 | 
| 140 |             found = False
 | 
| 141 |             for ilab, newstate in arcs:
 | 
| 142 |                 t = self.grammar.labels[ilab]
 | 
| 143 |                 if ilabel == ilab:
 | 
| 144 |                     # Look it up in the list of labels
 | 
| 145 |                     assert t < NT_OFFSET, t
 | 
| 146 |                     # Shift a token; we're done with it
 | 
| 147 |                     self.shift(typ, opaque, newstate)
 | 
| 148 |                     # Pop while we are in an accept-only state
 | 
| 149 |                     state = newstate
 | 
| 150 |                      
 | 
| 151 |                     while True:
 | 
| 152 |                         # mycpp: rewrite of tuple-in-list comparison
 | 
| 153 |                         if len(states[state]) != 1:
 | 
| 154 |                             break
 | 
| 155 | 
 | 
| 156 |                         s0, s1 = states[state][0]
 | 
| 157 |                         if s0 != 0 or s1 != state:
 | 
| 158 |                             break
 | 
| 159 | 
 | 
| 160 |                         self.pop()
 | 
| 161 |                         if len(self.stack) == 0:
 | 
| 162 |                             # Done parsing!
 | 
| 163 |                             return True
 | 
| 164 |                         top = self.stack[-1]
 | 
| 165 |                         states, _ = top.dfa
 | 
| 166 |                         state = top.state
 | 
| 167 | 
 | 
| 168 |                     # Done with this token
 | 
| 169 |                     return False
 | 
| 170 |                 elif t >= NT_OFFSET:
 | 
| 171 |                     # See if it's a symbol and if we're in its first set
 | 
| 172 |                     itsdfa = self.grammar.dfas[t]
 | 
| 173 |                     _, itsfirst = itsdfa
 | 
| 174 |                     if ilabel in itsfirst:
 | 
| 175 |                         # Push a symbol
 | 
| 176 |                         self.push(t, opaque, self.grammar.dfas[t], newstate)
 | 
| 177 |                         found = True
 | 
| 178 |                         break # To continue the outer while loop
 | 
| 179 | 
 | 
| 180 |             if not found:
 | 
| 181 |                 # Note: this condition was rewritten for mycpp tarnslation.
 | 
| 182 |                 # if (0, state) in arcs:
 | 
| 183 |                 #   ...
 | 
| 184 |                 # else:
 | 
| 185 |                 #   ...
 | 
| 186 |                 found2 = False
 | 
| 187 |                 for left, right in arcs:
 | 
| 188 |                     if left == 0 and right == state:
 | 
| 189 |                         # An accepting state, pop it and try something else
 | 
| 190 |                         self.pop()
 | 
| 191 |                         if len(self.stack) == 0:
 | 
| 192 |                             # Done parsing, but another token is input
 | 
| 193 |                             raise ParseError("too much input", typ, opaque)
 | 
| 194 |                         found2 = True
 | 
| 195 | 
 | 
| 196 |                 if not found2:
 | 
| 197 |                     # No success finding a transition
 | 
| 198 |                     raise ParseError("bad input", typ, opaque)
 | 
| 199 | 
 | 
| 200 |     def shift(self, typ, opaque, newstate):
 | 
| 201 |         # type: (int, Token, int) -> None
 | 
| 202 |         """Shift a token.  (Internal)"""
 | 
| 203 |         top = self.stack[-1]
 | 
| 204 |         newnode = self.pnode_alloc.NewPNode(typ, opaque)
 | 
| 205 |         top.node.AddChild(newnode)
 | 
| 206 |         self.stack[-1].state = newstate
 | 
| 207 | 
 | 
| 208 |     def push(self, typ, opaque, newdfa, newstate):
 | 
| 209 |         # type: (int, Token, dfa_t, int) -> None
 | 
| 210 |         """Push a nonterminal.  (Internal)"""
 | 
| 211 |         top = self.stack[-1]
 | 
| 212 |         newnode = self.pnode_alloc.NewPNode(typ, opaque)
 | 
| 213 |         self.stack[-1].state = newstate
 | 
| 214 |         self.stack.append(_StackItem(newdfa, 0, newnode))
 | 
| 215 | 
 | 
| 216 |     def pop(self):
 | 
| 217 |         # type: () -> None
 | 
| 218 |         """Pop a nonterminal.  (Internal)"""
 | 
| 219 |         top = self.stack.pop()
 | 
| 220 |         newnode = top.node
 | 
| 221 |         if len(self.stack):
 | 
| 222 |             top2 = self.stack[-1]
 | 
| 223 |             top2.node.AddChild(newnode)
 | 
| 224 |         else:
 | 
| 225 |             self.rootnode = newnode
 |