#!/usr/bin/env python2
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""
from __future__ import print_function

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic to EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


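# A rough picture of the mapping above (illustrative tokens, not taken from
# the test suite):
#
#   \u{3bc}  Id.Char_UBraced  ->  CharCode(tok, 0x3bc, True)   # u_braced
#   \n       Id.Char_OneChar  ->  CharCode(tok, 10, False)
#   \x41     Id.Char_Hex      ->  CharCode(tok, 0x41, False)
#   a        Id.Lit_Chars     ->  CharCode(tok, 97, False)

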
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)


    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))


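# A few concrete mappings (illustrative; the values follow from the branches
# above, not from the test suite):
#
#   EvalCStringToken(Id.Char_OneChar, r'\n')      -> '\n'
#   EvalCStringToken(Id.Char_Octal3, r'\377')     -> '\xff'
#   EvalCStringToken(Id.Char_Hex, r'\x41')        -> 'A'
#   EvalCStringToken(Id.Char_UBraced, r'\u{3bc}') -> '\xce\xbc'  (UTF-8, U+03BC)
#   EvalCStringToken(Id.Char_Stop, r'\c')         -> None (stop-output sentinel)

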
def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """ Done at parse time """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)


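# Example of the $'' case (illustrative token stream, not taken from the test
# suite): for $'a\nb', tokens is roughly
#
#   [Lit_Chars 'a', Char_OneChar for \n, Lit_Chars 'b']
#
# and the evaluated pieces join to 'a' + '\n' + 'b' == 'a\nb'.

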
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return true if every byte in the token's span is in byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the token before ''' etc. is space to trim """
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the space/newlines after ''' should be trimmed

    Like s.isspace(), without legacy \\f \\v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


# Whitespace trimming algorithms:
#
# 1. Trim what's after opening ''' or """, if it's whitespace
# 2. Determine what's before closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to maintain
#      the lossless invariant


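# Worked example of the algorithm (illustrative):
#
#     var x = """
#         hello
#           there
#         """
#
# Step 1 removes the all-whitespace token after the opening quote.  Step 2
# finds the 8 spaces before the closing quote, so to_strip is 8 spaces.
# Step 3 strips that prefix from each line, leaving 'hello\n  there\n'.

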
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there are many
            # lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


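# Sketch of the mutation above (illustrative numbers): a Lit_Chars token for
# the line '    hello\n', with col=0 and length=10, becomes col=4 and
# length=6 when to_strip is 4 spaces, with id Id.Lit_CharsWithoutPrefix.  The
# underlying line is never modified, which preserves the lossless invariant.

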
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating them in place.

    Tokens that begin a line have their col bumped and length shortened, and
    their ID changed to Id.Lit_CharsWithoutPrefix.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #   x
    #   '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return


    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)


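# Example (illustrative): for
#
#     var x = '''
#       hello
#       '''
#
# tokens arrives roughly as [Lit_Chars '\n', Lit_Chars '      hello\n',
# Lit_Chars '      '].  The first and last tokens are popped, and the middle
# token is narrowed to 'hello\n' by the loop above.
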
# vim: sw=4