#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic to EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)
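
# Illustrative sketch (not repo test output): in an Eggex char class like
# [a-z \u{3bc}], the lexer yields Id.Expr_Name tokens for 'a' and 'z' and an
# Id.Char_UBraced token for \u{3bc}, so this function would return
# CharCode(tok, 97, False), CharCode(tok, 122, False), and
# CharCode(tok, 0x3bc, True).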


def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))
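
# Illustrative sketch of expected values (assumed inputs, not repo tests):
#
#   EvalCStringToken(Id.Char_OneChar, r'\n')       # => '\n'
#   EvalCStringToken(Id.Char_Octal3, r'\377')      # => '\xff'
#   EvalCStringToken(Id.Char_Hex, r'\x41')         # => 'A'
#   EvalCStringToken(Id.Char_UBraced, r'\u{3bc}')  # => '\xce\xbc' (UTF-8)
#   EvalCStringToken(Id.Char_Stop, r'\c')          # => None, the stop sentinel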


def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """ Done at parse time """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)
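
# Illustrative sketch: for u'\u{3bc}' the parser calls
# EvalSingleQuoted(Id.Left_USingleQuote, tokens) and gets the UTF-8 string
# '\xce\xbc' back, while u'\u{d800}' dies at parse time with p_die(), since
# 0xd800 is in the surrogate range checked above.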


def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return True if every byte the token covers is in byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True
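
# Sketch: a token at col 4 with length 4 on the line '    echo' makes this
# check the bytes 'e', 'c', 'h', 'o'.  The byte-wise mylib.ByteAt() /
# mylib.ByteInSet() calls avoid str.isspace()'s legacy \f \v and Unicode
# semantics (see _IsTrailingSpace below).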


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the token before ''' etc. is space to trim """
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the space/newlines after ''' should be trimmed

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')
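
# Sketch of the distinction: a token spanning '  \t' satisfies both
# predicates, while one spanning '  \n' satisfies only _IsTrailingSpace(),
# because newlines may follow the opening quotes but can't be part of the
# indentation stripped before the closing quotes.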


# Whitespace trimming algorithm, shared by the DQ and SQ variants below:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant
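
# Example of the intended effect (illustrative):
#
#     var x = """
#         hello
#         """
#
# Step 1 drops the newline after the opening quotes, step 2 takes the run of
# spaces before the closing quotes as the prefix, and step 3 narrows each
# line's token past that prefix, so the value is 'hello\n'.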


def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Double-quoted analog of RemoveLeadingSpaceSQ(), over word parts."""
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part. TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix
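
# Note: the DQ variant walks word_part_t because a multiline "" string may
# contain non-literal parts like $x or $(hostname); only Literal tokens that
# start a line at col 0 are narrowed, and other parts pass through untouched.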


def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from a multiline string's tokens, in place.

    Tokens may be popped from either end of the list, and tokens that begin
    a line are narrowed by bumping col and shrinking length; no new tokens
    are created.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #     x
    #     '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
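
# Illustrative sketch of the mutation: with to_strip = '    ' (4 spaces), a
# token at col 0 with length 9 spanning '    hello' becomes col 4, length 5,
# id Id.Lit_CharsWithoutPrefix.  The underlying line is unchanged, which is
# what --tool lossless-cat relies on.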


# vim: sw=4