OILS / pgen2 / tokenize.py
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
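
# Illustrative usage (a minimal sketch, not part of the original module; the
# input string is made up for the example):
#
#     from io import StringIO
#     for typ, tok, start, end, line in generate_tokens(StringIO(u'x = 1\n').readline):
#         print(tok_name[typ], repr(tok), start, end)
#
# would report NAME 'x', OP '=', NUMBER '1', NEWLINE '\n', then ENDMARKER ''.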

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
#from lib2to3.pgen2.token import *

from . import token
from .token import *

__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
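
# For instance, group('a', 'b') produces '(a|b)', any('x') produces '(x)*',
# and maybe('x') produces '(x)?'; the token patterns below are assembled from
# these helpers.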

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
SingleQuoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    SingleQuoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
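
# Illustrative callback use (a sketch; 'my_module.py' is a made-up path):
#
#     def eater(typ, tok, start, end, line):
#         print(tok_name[typ], repr(tok))
#
#     tokenize(open('my_module.py').readline, eater)
#
# calls eater() once per token instead of handing back a generator.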

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

# Commented out because re.ASCII not in Python 2.
#cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
#blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
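# Note that detect_encoding() below still refers to cookie_re and blank_re,
# so as written it would raise NameError if called while these definitions
# stay commented out.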

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
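
# Illustrative use (a sketch; assumes cookie_re and blank_re are defined, and
# 'some_file.py' is a made-up path):
#
#     with open('some_file.py', 'rb') as f:
#         enc, header_lines = detect_encoding(f.readline)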

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
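
# Illustrative full-input round trip (a sketch, Python 3 spelling; 'src' is a
# made-up string):
#
#     src = 'if x:\n    y = 1\n'
#     readline = iter(src.splitlines(True)).__next__
#     toks = list(generate_tokens(readline))
#     assert untokenize(toks) == src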

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
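    # indents is a stack of the columns of the currently open indentation
    # levels; INDENT is emitted when a deeper column is pushed and DEDENT as
    # columns are popped, mirroring Python's block structure.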

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in SingleQuoted or \
                    token[:2] in SingleQuoted or \
                    token[:3] in SingleQuoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                            and stashed[0] == NAME
                            and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)