builtin/printf

OILS / builtin / printf_osh.py View on Github | oilshell.org

541 lines, 357 significant

1	#!/usr/bin/env python2
2	"""Builtin_printf.py."""
3	from __future__ import print_function
4
5	import time as time_ # avoid name conflict
6
7	from _devbuild.gen import arg_types
8	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9	from _devbuild.gen.runtime_asdl import cmd_value
10	from _devbuild.gen.syntax_asdl import (
11	loc,
12	loc_e,
13	loc_t,
14	source,
15	Token,
16	CompoundWord,
17	printf_part,
18	printf_part_e,
19	printf_part_t,
20	)
21	from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22	from _devbuild.gen.value_asdl import (value, value_e)
23
24	from core import alloc
25	from core import error
26	from core.error import e_die, p_die
27	from core import state
28	from core import vm
29	from frontend import flag_util
30	from frontend import consts
31	from frontend import lexer
32	from frontend import match
33	from frontend import reader
34	from mycpp import mops
35	from mycpp import mylib
36	from mycpp.mylib import log
37	from osh import sh_expr_eval
38	from osh import string_ops
39	from osh import word_compile
40	from data_lang import j8_lite
41
42	import posix_ as posix
43
44	from typing import Dict, List, Optional, TYPE_CHECKING, cast
45
46	if TYPE_CHECKING:
47	from core import ui
48	from frontend import parse_lib
49
50	_ = log
51
52
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of parts.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    Maybe: bash also supports %(strftime)T
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # One-token lookahead.  These hold placeholder values until the first
        # call to _Next().
        self.token_kind = Kind.Undefined  # type: Kind_t
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.cur_token = None  # type: Token

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read the next token in lex_mode and make it the current token."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse one directive: fmt = Percent (Flag | Zero)* width? precision? (Type | Time)

        The first thing this does is advance past the current token (the %).
        On return, the current token is the type token; the caller advances
        past it.  Malformed or unsupported directives are reported with
        p_die().
        """
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        directive = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*  -- space and + could be implemented
        while self.token_type in (Id.Format_Flag, Id.Format_Zero):
            flag_tok = self.cur_token
            flag_str = lexer.TokenVal(flag_tok)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      flag_tok)

            directive.flags.append(flag_tok)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if self.token_type in (Id.Format_Num, Id.Format_Star):
            directive.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?  A bare dot is recorded as the precision token itself; it
        # is overwritten when a number, star or zero follows it.
        if self.token_type == Id.Format_Dot:
            directive.precision = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                directive.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)

        # (Type | Time), which is required
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif self.token_type in (Id.Format_Type, Id.Format_Time):
            directive.type = self.cur_token

            # ADDITIONAL VALIDATION outside the "grammar".
            type_val = lexer.TokenVal(directive.type)  # allocation will be cached
            if type_val in 'eEfFgG':
                p_die("osh printf doesn't support floating point",
                      directive.type)
            # These two could be implemented.  %c needs utf-8 decoding.
            if type_val == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      directive.type)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return directive

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the entire format string: printf_format = part* Eof_Real

        Returns the parts in source order: literal tokens are appended as-is,
        and each % directive becomes a printf_part.Percent.
        """
        result = []  # type: List[printf_part_t]
        self._Next(lex_mode_e.PrintfOuter)

        while True:
            tok_id = self.token_type

            # Note: like in echo -e, we don't fail with Unknown_Backslash here
            # when shopt -u parse_backslash because it's at runtime rather than
            # parse time.
            # Users should use $'' or the future static printf ${x %.3f}.
            is_literal = (self.token_kind in (Kind.Lit, Kind.Char) or
                          tok_id in (Id.Format_EscapedPercent,
                                     Id.Unknown_Backslash))

            if is_literal:
                result.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)

        return result
158
159
160	class _PrintfState(object):
161
162	def __init__(self):
163	# type: () -> None
164	self.arg_index = 0
165	self.backslash_c = False
166	self.status = 0 # set to 1 before returning
167
168
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]

    The format string is parsed into a list of parts, which is cached per
    distinct format string in self.parse_cache.  The parts are then applied to
    the arguments, and the result is either written to stdout or assigned to
    the variable named by -v.
    """

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # Parsed format strings, keyed by the raw format string.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        # Used for %(...)T with the special argument -2.
        self.shell_start_time = time_.time()

    def _Percent(self, pr, part, varargs, locs):
        # type: (_PrintfState, printf_part.Percent, List[str], List[CompoundWord]) -> Optional[str]
        """Format a single % directive.

        Args:
          pr: state for this run.  pr.arg_index is advanced for every argument
            consumed (including * width and * precision); pr.backslash_c is
            set when a %b argument contains \\c; pr.status is set to 1 on
            error.
          part: the parsed directive (flags, width, precision, type).
          varargs: all arguments after the format string.
          locs: locations parallel to varargs, used to blame errors.

        Returns:
          The formatted string, or None after printing an error and setting
          pr.status = 1 (invalid width, invalid precision, or a non-integer
          argument for an integer / time conversion).

        A negative number with %o %u %x %X is reported via e_die().
        """

        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        if len(part.flags) > 0:
            for flag_token in part.flags:
                flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                # No argument to blame, so blame the * in the format string
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A dot with nothing after it means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                # No argument to blame, so blame the * in the format string
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The argument to be formatted.  A missing argument is treated as the
        # empty string.
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                d = mops.FromStr(s)

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information initialized
                #   by `tzset'.  The function `tzset' refers to the environment
                #   variable `TZ'.  When the exported variable `TZ' is present,
                #   its value should be reflected in the real environment
                #   variable `TZ' before call of `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if it's
                # not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's global
                # state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents the
                #   time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T'; strip the leading '(' and the
                # trailing ')T' to get the strftime format.
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    e_die(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # There are TWO different ways to ZERO PAD, and they differ on
                # the negative sign!  See spec/builtin-printf

                zero_pad = 0  # no zero padding
                if width >= 0 and '0' in flags:
                    zero_pad = 1  # style 1
                elif precision > 0 and len(s) < precision:
                    zero_pad = 2  # style 2

                if zero_pad:
                    negative = (s[0] == '-')
                    if negative:
                        digits = s[1:]
                        sign = '-'
                        if zero_pad == 1:
                            # [%06d] -42 becomes [-00042] (6 TOTAL)
                            n = width - 1
                        else:
                            # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                            n = precision
                    else:
                        digits = s
                        sign = ''
                        if zero_pad == 1:
                            n = width
                        else:
                            n = precision
                    s = sign + digits.rjust(n, '0')

        else:
            raise AssertionError()

        # Pad with spaces to the field width; the '-' flag left-justifies.
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Applies the parsed format string to the arguments, appending each
        piece of output to `out`.  The format is re-applied while unconsumed
        arguments remain ('arg recycling').

        Args:
          parts: parsed format string, literal tokens and % directives.
          varargs: the arguments after the format string.
          locs: locations parallel to varargs, for error messages.
          out: output list; mutated in place.

        Returns:
          0 on success, or the non-zero pr.status set by _Percent() on error.
        """

        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:
                    # 'printf %b a\cb xx' - \c terminates processing!
                    #
                    # Return rather than break: \c must discard the rest of
                    # the format string AND all remaining arguments.  Leaving
                    # only one of the two loops would either recycle the
                    # format over the remaining arguments or keep emitting the
                    # rest of the format string.
                    return 0

            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg recycling'
            # behavior
            #     printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns 2 if the format string fails to parse, the status from
        _Format() if formatting fails, and 0 otherwise.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so it doesn't shadow the imported lexer module.
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed format strings are cached
            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0