OILS / builtin / printf_osh.py View on Github | oilshell.org

541 lines, 357 significant
1#!/usr/bin/env python2
2"""Builtin_printf.py."""
3from __future__ import print_function
4
5import time as time_ # avoid name conflict
6
7from _devbuild.gen import arg_types
8from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9from _devbuild.gen.runtime_asdl import cmd_value
10from _devbuild.gen.syntax_asdl import (
11 loc,
12 loc_e,
13 loc_t,
14 source,
15 Token,
16 CompoundWord,
17 printf_part,
18 printf_part_e,
19 printf_part_t,
20)
21from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22from _devbuild.gen.value_asdl import (value, value_e)
23
24from core import alloc
25from core import error
26from core.error import e_die, p_die
27from core import state
28from core import vm
29from frontend import flag_util
30from frontend import consts
31from frontend import lexer
32from frontend import match
33from frontend import reader
34from mycpp import mops
35from mycpp import mylib
36from mycpp.mylib import log
37from osh import sh_expr_eval
38from osh import string_ops
39from osh import word_compile
40from data_lang import j8_lite
41
42import posix_ as posix
43
44from typing import Dict, List, Optional, TYPE_CHECKING, cast
45
46if TYPE_CHECKING:
47 from core import ui
48 from frontend import parse_lib
49
50_ = log
51
52
class _FormatStringParser(object):
    """Turn a printf format string into a list of printf_part_t.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    'Time' is the token for the %(...)T conversion.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # One-token lookahead.  These are placeholders until the first call to
        # _Next() fills them in.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read the next token in lex_mode and cache its id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse the 'fmt' rule.  The current token is the leading Percent.

        On return, the current token is the Type/Time token; the caller
        advances past it.  Unsupported or malformed specs are reported with
        p_die().
        """
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while (self.token_type == Id.Format_Flag or
               self.token_type == Id.Format_Zero):
            flag_tok = self.cur_token
            # space and + could be implemented
            flag_str = lexer.TokenVal(flag_tok)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      flag_tok)

            percent.flags.append(flag_tok)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if (self.token_type == Id.Format_Num or
                self.token_type == Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?
        if self.token_type == Id.Format_Dot:
            dot_tok = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)
            else:
                # A bare '.' -- the Dot token itself records the precision
                percent.precision = dot_tok

        # (Type | Time), with the two error cases handled up front
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        if (self.token_type != Id.Format_Type and
                self.token_type != Id.Format_Time):
            p_die('Expected a printf format character', self.cur_token)

        type_tok = self.cur_token
        percent.type = type_tok

        # ADDITIONAL VALIDATION outside the "grammar".
        type_str = lexer.TokenVal(type_tok)  # allocation will be cached
        if type_str in 'eEfFgG':
            p_die("osh printf doesn't support floating point", type_tok)
        # These two could be implemented.  %c needs utf-8 decoding.
        if type_str == 'c':
            p_die("osh printf doesn't support single characters (bytes)",
                  type_tok)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string (the 'printf_format' rule)."""
        parts = []  # type: List[printf_part_t]

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            # Note: like in echo -e, we don't fail with Unknown_Backslash here
            # when shopt -u parse_backslash because it's at runtime rather
            # than parse time.
            # Users should use $'' or the future static printf ${x %.3f}.
            is_literal = (self.token_kind == Kind.Lit or
                          self.token_kind == Kind.Char or
                          self.token_type == Id.Format_EscapedPercent or
                          self.token_type == Id.Unknown_Backslash)

            if is_literal:
                parts.append(self.cur_token)

            elif self.token_type == Id.Format_Percent:
                parts.append(self._ParseFormatStr())

            elif (self.token_type == Id.Eof_Real or
                  self.token_type == Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(self.token_type))

            self._Next(lex_mode_e.PrintfOuter)

        return parts
158
159
class _PrintfState(object):
    """Mutable bookkeeping threaded through the formatting of one printf call."""

    def __init__(self):
        # type: () -> None

        # Exit status; set to 1 before returning
        self.status = 0
        # Becomes True once a %b argument contains \c
        self.backslash_c = False
        # Index of the next argument to consume
        self.arg_index = 0
167
168
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]"""

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        """
        Args:
          mem: shell state; read for TZ and written by 'printf -v'
          parse_ctx: supplies the arena and lexer for parsing format strings
          unsafe_arith: parses the LValue given to -v
          errfmt: where errors and warnings are printed
        """
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # format string -> parsed parts, so each distinct format is parsed once
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        # Used as the "time the shell was invoked" for %(...)T with -2
        self.shell_start_time = time_.time()

    def _Percent(self, pr, part, varargs, locs):
        # type: (_PrintfState, printf_part.Percent, List[str], List[CompoundWord]) -> Optional[str]
        """Format a single % conversion.

        Args:
          pr: per-call state.  pr.arg_index is advanced for every argument
              consumed (including '*' width/precision), pr.backslash_c is set
              when a %b argument contains \\c, and pr.status is set to 1 on
              error.
          part: the parsed conversion spec
          varargs: arguments after the format string
          locs: locations parallel to varargs, used to blame errors

        Returns:
          The formatted string, or None after printing an error (in which
          case pr.status is 1).

        Raises:
          Whatever e_die() raises, when a negative number is given to
          %o %u %x %X.
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        if len(part.flags) > 0:
            for flag_token in part.flags:
                flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                # No argument to blame, so blame the '*' in the format
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A bare '.' means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The value to format; a missing argument is the empty string
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                d = mops.FromStr(s)

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information
                #   initialized by `tzset'.  The function `tzset' refers to
                #   the environment variable `TZ'.  When the exported variable
                #   `TZ' is present, its value should be reflected in the real
                #   environment variable `TZ' before call of `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if
                # it's not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's
                # global state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents
                #   the time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T'; strip '(' and ')T'
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    e_die(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # Precision is a minimum number of DIGITS, so the sign must
                # not count toward it: [%.3d] -42 becomes [-042].
                num_digits = len(s)
                if s.startswith('-'):
                    num_digits -= 1

                # There are TWO different ways to ZERO PAD, and they differ on
                # the negative sign!  See spec/builtin-printf

                zero_pad = 0  # no zero padding
                if width >= 0 and '0' in flags:
                    zero_pad = 1  # style 1
                elif precision > 0 and num_digits < precision:
                    zero_pad = 2  # style 2

                if zero_pad:
                    negative = (s[0] == '-')
                    if negative:
                        digits = s[1:]
                        sign = '-'
                        if zero_pad == 1:
                            # [%06d] -42 becomes [-00042] (6 TOTAL)
                            n = width - 1
                        else:
                            # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                            n = precision
                    else:
                        digits = s
                        sign = ''
                        if zero_pad == 1:
                            n = width
                        else:
                            n = precision
                    s = sign + digits.rjust(n, '0')

        else:
            raise AssertionError()

        # Space padding to the field width; the '-' flag left-justifies
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Applies the parsed format to varargs, re-running the format while
        arguments remain ('arg recycling'), and appends the pieces to out.

        Returns:
          0 on success, or the nonzero status set by _Percent() on the first
          conversion error.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # \c stops ALL output, so don't recycle the format for the
                # remaining args.  The flag is never reset, so continuing
                # would re-enter the loop above and break again right after
                # the first part -- possibly without ever consuming another
                # argument, which would never terminate.
                break
            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg recycling'
            # behavior
            #    printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns 0 on success, 2 if the format string fails to parse, and
        otherwise the nonzero status from formatting.  Nothing is written or
        assigned when formatting fails.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # (named fmt_lexer so it doesn't shadow the 'lexer' module)
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed formats are cached
            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
            print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0