# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
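
# A minimal usage sketch (illustrative only; "example.py" is a hypothetical
# file name):
#
#     from lib2to3.pgen2.tokenize import generate_tokens
#     from lib2to3.pgen2.token import tok_name
#     with open("example.py") as f:
#         for toktype, tok, start, end, line in generate_tokens(f.readline):
#             print(tok_name[toktype], repr(tok), start, end)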

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
#from lib2to3.pgen2.token import *

from . import token
from .token import *

__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
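
# Illustrative examples of the patterns these helpers build:
#   group('a', 'b')  ->  '(a|b)'
#   any(r'\d')       ->  '(\d)*'
#   maybe('_')       ->  '(_)?'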

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
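# Illustration: pseudoprog skips leading whitespace and captures the token
# itself as group 1, which is what generate_tokens() relies on below:
#   m = pseudoprog.match("    x = 1\n", 0)
#   m.span(1)  ->  (4, 5)    # the Name token "x"
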
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
SingleQuoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    SingleQuoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
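
# A small illustration (io.StringIO stands in for a real readline source and
# is not required by this module):
#
#     import io
#     tokenize(io.StringIO(u"x = 1\n").readline)
#     # printtoken writes one "row,col-row,col:  TYPE  'text'" line per token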

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

# re.ASCII does not exist in Python 2; compiling these patterns without it
# keeps the module importable there, and detect_encoding() below depends on
# both of them being defined.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)')

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
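
# For example: _get_normal_name("UTF_8") returns "utf-8", and
# _get_normal_name("Latin-1") returns "iso-8859-1".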

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
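
# A short sketch of typical use (io.BytesIO stands in for a source file opened
# in binary mode):
#
#     import io
#     buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])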

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
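
# Round-trip sketch for the full-input case (io.StringIO is used only for the
# example):
#
#     import io
#     src = u"def f(x):\n    return x + 1\n"
#     toks = list(generate_tokens(io.StringIO(src).readline))
#     assert untokenize(toks) == src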

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
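    # For example, feeding the single line "x = 1\n" yields, in order:
    #   (NAME, 'x', (1, 0), (1, 1), 'x = 1\n')
    #   (OP, '=', (1, 2), (1, 3), 'x = 1\n')
    #   (NUMBER, '1', (1, 4), (1, 5), 'x = 1\n')
    #   (NEWLINE, '\n', (1, 5), (1, 6), 'x = 1\n')
    #   (ENDMARKER, '', (2, 0), (2, 0), '')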
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in SingleQuoted or \
                    token[:2] in SingleQuoted or \
                    token[:3] in SingleQuoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)