# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
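
# A minimal usage sketch (illustrative only; "example.py" is a hypothetical
# file name):
#
#     from lib2to3.pgen2.tokenize import generate_tokens
#     from lib2to3.pgen2.token import tok_name
#     with open("example.py") as f:
#         for toktype, tok, start, end, line in generate_tokens(f.readline):
#             print(tok_name[toktype], repr(tok), start, end)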

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
#from lib2to3.pgen2.token import *

from . import token
from .token import *

__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
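
# Illustrative examples of the patterns these helpers build:
#   group('a', 'b')  ->  '(a|b)'
#   any(r'\d')       ->  '(\d)*'
#   maybe('_')       ->  '(_)?'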

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
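# Illustration: pseudoprog skips leading whitespace and captures the token
# itself as group 1, which is what generate_tokens() relies on below:
#   m = pseudoprog.match("    x = 1\n", 0)
#   m.span(1)  ->  (4, 5)    # the Name token "x"
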
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
SingleQuoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    SingleQuoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
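
# A small illustration (io.StringIO stands in for a real readline source and
# is not required by this module):
#
#     import io
#     tokenize(io.StringIO(u"x = 1\n").readline)
#     # printtoken writes one "row,col-row,col:  TYPE  'text'" line per token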

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

# re.ASCII does not exist in Python 2; compiling these patterns without it
# keeps the module importable there, and detect_encoding() below depends on
# both of them being defined.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)')

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
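
# For example: _get_normal_name("UTF_8") returns "utf-8", and
# _get_normal_name("Latin-1") returns "iso-8859-1".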

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
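
# A short sketch of typical use (io.BytesIO stands in for a source file opened
# in binary mode):
#
#     import io
#     buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])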

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
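
# Round-trip sketch for the full-input case (io.StringIO is used only for the
# example):
#
#     import io
#     src = u"def f(x):\n    return x + 1\n"
#     toks = list(generate_tokens(io.StringIO(src).readline))
#     assert untokenize(toks) == src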

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
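    # For example, feeding the single line "x = 1\n" yields, in order:
    #   (NAME, 'x', (1, 0), (1, 1), 'x = 1\n')
    #   (OP, '=', (1, 2), (1, 3), 'x = 1\n')
    #   (NUMBER, '1', (1, 4), (1, 5), 'x = 1\n')
    #   (NEWLINE, '\n', (1, 5), (1, 6), 'x = 1\n')
    #   (ENDMARKER, '', (2, 0), (2, 0), '')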
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in SingleQuoted or \
                    token[:2] in SingleQuoted or \
                    token[:3] in SingleQuoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)