OILS / builtin / printf_osh.py View on Github | oilshell.org

504 lines, 335 significant
1#!/usr/bin/env python2
2"""Builtin_printf.py."""
3from __future__ import print_function
4
5import time as time_ # avoid name conflict
6
7from _devbuild.gen import arg_types
8from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9from _devbuild.gen.runtime_asdl import cmd_value
10from _devbuild.gen.syntax_asdl import (
11 loc,
12 loc_e,
13 loc_t,
14 source,
15 Token,
16 CompoundWord,
17 printf_part,
18 printf_part_e,
19 printf_part_t,
20)
21from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22from _devbuild.gen.value_asdl import (value, value_e)
23
24from core import alloc
25from core import error
26from core.error import e_die, p_die
27from core import state
28from core import vm
29from frontend import flag_util
30from frontend import consts
31from frontend import lexer
32from frontend import match
33from frontend import reader
34from mycpp import mylib
35from mycpp.mylib import log
36from osh import sh_expr_eval
37from osh import word_compile
38from data_lang import j8_lite
39
40import posix_ as posix
41
42from typing import Dict, List, TYPE_CHECKING, cast
43
44if TYPE_CHECKING:
45 from core import ui
46 from frontend import parse_lib
47
48_ = log
49
50
class _FormatStringParser(object):
    """Turn a printf format string into a list of printf_part_t.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    Time is the bash-style %(strftime)T conversion.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # Placeholders; real values are filled in by the first _Next() call.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in lex_mode and cache its id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse the 'fmt' production; the current token is the leading %."""
        self._Next(lex_mode_e.PrintfPercent)  # step over the %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while self.token_type in (Id.Format_Flag, Id.Format_Zero):
            flag_tok = self.cur_token
            # space and + could be implemented
            flag_str = lexer.TokenVal(flag_tok)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      flag_tok)

            percent.flags.append(flag_tok)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if self.token_type in (Id.Format_Num, Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?  A lone dot is recorded as the precision token itself.
        if self.token_type == Id.Format_Dot:
            dot_tok = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)
            else:
                percent.precision = dot_tok

        # (Type | Time), otherwise a parse error
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif self.token_type in (Id.Format_Type, Id.Format_Time):
            type_tok = self.cur_token
            percent.type = type_tok

            # ADDITIONAL VALIDATION outside the "grammar".
            type_str = lexer.TokenVal(type_tok)  # allocation will be cached
            if type_str in 'eEfFgG':
                p_die("osh printf doesn't support floating point", type_tok)
            # These two could be implemented.  %c needs utf-8 decoding.
            if type_str == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      type_tok)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string and return its parts in order."""
        parts = []  # type: List[printf_part_t]
        literal_ids = (Id.Format_EscapedPercent, Id.Unknown_Backslash)

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            tok_id = self.token_type

            if (self.token_kind in (Kind.Lit, Kind.Char) or
                    tok_id in literal_ids):
                # Note: like in echo -e, we don't fail with Unknown_Backslash
                # here when shopt -u parse_backslash because it's at runtime
                # rather than parse time.
                # Users should use $'' or the future static printf ${x %.3f}.
                parts.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                parts.append(self._ParseFormatStr())

            elif tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                return parts

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)
156
157
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]"""

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt

        # Parsed format strings, keyed by the raw format text, so a format
        # that is used repeatedly is only parsed once.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # Used for the special %(...)T argument -2.
        self.shell_start_time = time_.time(
        )  # this object initialized in main()

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Applies the parsed format to the arguments.  If arguments remain
        after one pass over the format, the format is applied again ("arg
        recycling") until every argument has been consumed.

        Args:
          parts: the parsed format string (literals and % conversions)
          varargs: the argument strings that follow the format
          locs: locations parallel to varargs, used to blame bad arguments
          out: list that receives the output fragments (mutated in place)

        Returns:
          0 on success, or 1 after reporting an invalid width, precision or
          integer argument through self.errfmt.

        A negative number given to %o %u %x %X is rejected with e_die().
        """

        arg_index = 0
        num_args = len(varargs)
        backslash_c = False

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))
                    out.append(s)

                elif part.tag() == printf_part_e.Percent:
                    # Note: This case is very long, but hard to refactor
                    # because of the error cases and "recycling" of args!
                    # (arg_index, return 1, etc.)
                    part = cast(printf_part.Percent, UP_part)

                    # TODO: These calculations are independent of the data,
                    # so could be cached
                    flags = []  # type: List[str]
                    if len(part.flags) > 0:
                        for flag_token in part.flags:
                            flags.append(lexer.TokenVal(flag_token))

                    width = -1  # nonexistent
                    if part.width:
                        if part.width.id in (Id.Format_Num, Id.Format_Zero):
                            width_str = lexer.TokenVal(part.width)
                            width_loc = part.width  # type: loc_t
                        elif part.width.id == Id.Format_Star:
                            # '*' takes the width from the next argument
                            if arg_index < num_args:
                                width_str = varargs[arg_index]
                                width_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                width_str = ''  # invalid
                                width_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            width = int(width_str)
                        except ValueError:
                            # No argument to blame: point at the '*' instead
                            if width_loc.tag() == loc_e.Missing:
                                width_loc = part.width
                            self.errfmt.Print_("printf got invalid width %r" %
                                               width_str,
                                               blame_loc=width_loc)
                            return 1

                    precision = -1  # nonexistent
                    if part.precision:
                        if part.precision.id == Id.Format_Dot:
                            # A lone '.' means precision 0
                            precision_str = '0'
                            precision_loc = part.precision  # type: loc_t
                        elif part.precision.id in (Id.Format_Num,
                                                   Id.Format_Zero):
                            precision_str = lexer.TokenVal(part.precision)
                            precision_loc = part.precision
                        elif part.precision.id == Id.Format_Star:
                            # '*' takes the precision from the next argument
                            if arg_index < num_args:
                                precision_str = varargs[arg_index]
                                precision_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                precision_str = ''
                                precision_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            precision = int(precision_str)
                        except ValueError:
                            if precision_loc.tag() == loc_e.Missing:
                                precision_loc = part.precision
                            self.errfmt.Print_(
                                'printf got invalid precision %r' %
                                precision_str,
                                blame_loc=precision_loc)
                            return 1

                    # The value to format; a missing argument acts like ''
                    if arg_index < num_args:
                        s = varargs[arg_index]
                        word_loc = locs[arg_index]  # type: loc_t
                        arg_index += 1
                        has_arg = True
                    else:
                        s = ''
                        word_loc = loc.Missing
                        has_arg = False

                    # Note: %s could be lexed into Id.Percent_S.  Although
                    # small string optimization would remove the allocation
                    # as well.
                    typ = lexer.TokenVal(part.type)
                    if typ == 's':
                        if precision >= 0:
                            s = s[:precision]  # truncate

                    elif typ == 'q':
                        # Most shells give \' for single quote, while OSH
                        # gives $'\'' this could matter when SSH'ing.
                        # Ditto for $'\\' vs. '\'

                        s = j8_lite.MaybeShellEncode(s)

                    elif typ == 'b':
                        # Process just like echo -e, except \c handling is
                        # simpler.

                        c_parts = []  # type: List[str]
                        lex = match.EchoLexer(s)
                        while True:
                            id_, tok_val = lex.Next()
                            if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                                break

                            p = word_compile.EvalCStringToken(id_, tok_val)

                            # Unusual behavior: '\c' aborts processing!
                            if p is None:
                                backslash_c = True
                                break

                            c_parts.append(p)
                        s = ''.join(c_parts)

                    elif part.type.id == Id.Format_Time or typ in 'diouxX':
                        # %(...)T and %d share this complex integer
                        # conversion logic

                        try:
                            # note: spaces like ' -42 ' accepted and normalized
                            d = int(s)
                        except ValueError:
                            # 'a is interpreted as the ASCII value of 'a'
                            if len(s) >= 1 and s[0] in '\'"':
                                # TODO: utf-8 decode s[1:] to be more correct.
                                # Probably depends on issue #366, a utf-8
                                # library.
                                # Note: len(s) == 1 means there is a NUL (0)
                                # after the quote..
                                d = ord(s[1]) if len(s) >= 2 else 0

                            # No argument means -1 for %(...)T as in Bash
                            # Reference Manual 4.2 "If no argument is
                            # specified, conversion behaves as if -1 had been
                            # given."
                            elif not has_arg and part.type.id == Id.Format_Time:
                                d = -1

                            else:
                                if has_arg:
                                    blame_loc = word_loc  # type: loc_t
                                else:
                                    blame_loc = part.type
                                self.errfmt.Print_(
                                    'printf expected an integer, got %r' % s,
                                    blame_loc)
                                return 1

                        if part.type.id == Id.Format_Time:
                            # Initialize timezone:
                            #   `localtime' uses the current timezone
                            #   information initialized by `tzset'.  The
                            #   function `tzset' refers to the environment
                            #   variable `TZ'.  When the exported variable
                            #   `TZ' is present, its value should be
                            #   reflected in the real environment variable
                            #   `TZ' before call of `tzset'.
                            #
                            # Note: unlike LANG, TZ doesn't seem to change
                            # behavior if it's not exported.
                            #
                            # TODO: In YSH, provide an API that doesn't rely
                            # on libc's global state.

                            tzcell = self.mem.GetCell('TZ')
                            if tzcell and tzcell.exported and tzcell.val.tag(
                            ) == value_e.Str:
                                tzval = cast(value.Str, tzcell.val)
                                posix.putenv('TZ', tzval.s)

                            time_.tzset()

                            # Handle special values:
                            #   User can specify two special values -1 and -2
                            #   as in Bash Reference Manual 4.2: "Two special
                            #   argument values may be used: -1 represents
                            #   the current time, and -2 represents the time
                            #   the shell was invoked." from
                            #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                            if d == -1:  # the current time
                                ts = time_.time()
                            elif d == -2:  # the shell start time
                                ts = self.shell_start_time
                            else:
                                ts = d

                            # typ looks like '(FORMAT)T'; strip '(' and ')T'
                            s = time_.strftime(typ[1:-2], time_.localtime(ts))
                            if precision >= 0:
                                s = s[:precision]  # truncate

                        else:  # typ in 'diouxX'
                            # Disallowed because it depends on 32- or 64- bit
                            if d < 0 and typ in 'ouxX':
                                e_die(
                                    "Can't format negative number %d with %%%s"
                                    % (d, typ), part.type)

                            if typ == 'o':
                                s = mylib.octal(d)
                            elif typ == 'x':
                                s = mylib.hex_lower(d)
                            elif typ == 'X':
                                s = mylib.hex_upper(d)
                            else:  # diu
                                s = str(d)  # without spaces like ' -42 '

                            # Separate the sign from the digits up front: the
                            # precision is a minimum number of DIGITS, so the
                            # '-' must not count toward it.
                            negative = s.startswith('-')
                            if negative:
                                sign = '-'
                                digits = s[1:]
                            else:
                                sign = ''
                                digits = s

                            # There are TWO different ways to ZERO PAD, and
                            # they differ on the negative sign!  See
                            # spec/builtin-printf

                            zero_pad = 0  # no zero padding
                            if width >= 0 and '0' in flags:
                                zero_pad = 1  # style 1
                            elif precision > 0 and len(digits) < precision:
                                zero_pad = 2  # style 2

                            if zero_pad == 1:
                                # [%06d] -42 becomes [-00042] (6 TOTAL)
                                if negative:
                                    n = width - 1
                                else:
                                    n = width
                                s = sign + digits.rjust(n, '0')
                            elif zero_pad == 2:
                                # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                                s = sign + digits.rjust(precision, '0')

                    else:
                        raise AssertionError()

                    if width >= 0:
                        if '-' in flags:
                            s = s.ljust(width, ' ')
                        else:
                            s = s.rjust(width, ' ')

                    out.append(s)

                else:
                    raise AssertionError()

                if backslash_c:
                    # 'printf %b a\cb xx' - \c terminates processing!
                    # Return rather than break, so that neither the rest of
                    # the format nor any recycled argument is emitted.
                    return 0

            if arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if arg_index >= num_args:
                # We printed all args
                break
            # There are more arg: Implement the 'arg recycling' behavior.

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns 2 if the format string doesn't parse, the non-zero status of
        _Format() if an argument is invalid, and 0 otherwise.  With -v the
        result is assigned to the named variable instead of written to
        stdout.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so the imported 'lexer' module isn't shadowed.
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed formats are cached
            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0