builtin/printf

OILS / builtin / printf_osh.py View on Github | oilshell.org

504 lines, 335 significant

1	#!/usr/bin/env python2
2	"""Builtin_printf.py."""
3	from __future__ import print_function
4
5	import time as time_ # avoid name conflict
6
7	from _devbuild.gen import arg_types
8	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9	from _devbuild.gen.runtime_asdl import cmd_value
10	from _devbuild.gen.syntax_asdl import (
11	loc,
12	loc_e,
13	loc_t,
14	source,
15	Token,
16	CompoundWord,
17	printf_part,
18	printf_part_e,
19	printf_part_t,
20	)
21	from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22	from _devbuild.gen.value_asdl import (value, value_e)
23
24	from core import alloc
25	from core import error
26	from core.error import e_die, p_die
27	from core import state
28	from core import vm
29	from frontend import flag_util
30	from frontend import consts
31	from frontend import lexer
32	from frontend import match
33	from frontend import reader
34	from mycpp import mylib
35	from mycpp.mylib import log
36	from osh import sh_expr_eval
37	from osh import word_compile
38	from data_lang import j8_lite
39
40	import posix_ as posix
41
42	from typing import Dict, List, TYPE_CHECKING, cast
43
44	if TYPE_CHECKING:
45	from core import ui
46	from frontend import parse_lib
47
48	_ = log
49
50
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of parts.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    Maybe: bash also supports %(strftime)T
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        """Remember the lexer; the lookahead is filled by the first _Next()."""
        self.lexer = lexer

        # Placeholder lookahead state until a token has been read.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in lex_mode and refresh the cached id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse the 'fmt' production, starting with % as the current token.

        Returns a printf_part.Percent with flags, width, precision and type
        filled in from the tokens that were present.  Unsupported flags,
        unsupported conversion types, and a missing/invalid conversion
        character are reported through p_die().
        """
        percent_mode = lex_mode_e.PrintfPercent
        self._Next(percent_mode)  # step over the %

        result = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while self.token_type in (Id.Format_Flag, Id.Format_Zero):
            flag_tok = self.cur_token
            # space and + could be implemented
            flag_str = lexer.TokenVal(flag_tok)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      flag_tok)

            result.flags.append(flag_tok)
            self._Next(percent_mode)

        # width?
        if self.token_type in (Id.Format_Num, Id.Format_Star):
            result.width = self.cur_token
            self._Next(percent_mode)

        # precision?  A bare dot is recorded first, then replaced by the
        # number or star that follows it, if any.
        if self.token_type == Id.Format_Dot:
            result.precision = self.cur_token
            self._Next(percent_mode)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                result.precision = self.cur_token
                self._Next(percent_mode)

        # (Type | Time), which is mandatory
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif self.token_type in (Id.Format_Type, Id.Format_Time):
            type_tok = self.cur_token
            result.type = type_tok

            # ADDITIONAL VALIDATION outside the "grammar".
            conv = lexer.TokenVal(type_tok)  # allocation will be cached
            if conv in 'eEfFgG':
                p_die("osh printf doesn't support floating point", type_tok)
            # These two could be implemented.  %c needs utf-8 decoding.
            if conv == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      type_tok)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return result

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string.

        Returns the parts in order: literal tokens are stored as-is, and each
        % directive becomes the result of _ParseFormatStr().
        """
        outer_mode = lex_mode_e.PrintfOuter

        self._Next(outer_mode)
        result = []  # type: List[printf_part_t]
        while True:
            tok_id = self.token_type

            if (self.token_kind in (Kind.Lit, Kind.Char) or
                    tok_id in (Id.Format_EscapedPercent,
                               Id.Unknown_Backslash)):
                # Note: like in echo -e, we don't fail with Unknown_Backslash
                # here when shopt -u parse_backslash because it's at runtime
                # rather than parse time.
                # Users should use $'' or the future static printf ${x %.3f}.
                result.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(outer_mode)

        return result
156
157
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]"""

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        """Store collaborators and record the construction time.

        mem is used to look up TZ and to assign the -v variable, parse_ctx
        supplies the arena and lexer for format strings, unsafe_arith parses
        the -v lvalue, and errfmt reports errors.
        """
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt

        # Parsed format strings, keyed by the raw format text.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # Used for the special %(...)T argument -2.
        # this object initialized in main()
        self.shell_start_time = time_.time()

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Args:
          parts: parsed format string, as returned by _FormatStringParser
          varargs: the arguments following the format string
          locs: locations for blaming errors; indexed in parallel with varargs
          out: list that the formatted pieces are appended to

        Returns:
          0 on success, or 1 after printing an error for an invalid width,
          precision, or integer argument.

        The format is re-applied ("recycled") while arguments remain.  A \\c
        escape inside a %b argument stops ALL further output: the rest of the
        format string and any remaining arguments are ignored.
        """

        arg_index = 0
        num_args = len(varargs)
        backslash_c = False

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))
                    out.append(s)

                elif part.tag() == printf_part_e.Percent:
                    # Note: This case is very long, but hard to refactor because of the
                    # error cases and "recycling" of args!  (arg_index, return 1, etc.)
                    part = cast(printf_part.Percent, UP_part)

                    # TODO: These calculations are independent of the data, so could be
                    # cached
                    flags = []  # type: List[str]
                    if len(part.flags) > 0:
                        for flag_token in part.flags:
                            flags.append(lexer.TokenVal(flag_token))

                    width = -1  # nonexistent
                    if part.width:
                        if part.width.id in (Id.Format_Num, Id.Format_Zero):
                            width_str = lexer.TokenVal(part.width)
                            width_loc = part.width  # type: loc_t
                        elif part.width.id == Id.Format_Star:
                            # '*' consumes the next argument as the width
                            if arg_index < num_args:
                                width_str = varargs[arg_index]
                                width_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                width_str = ''  # invalid
                                width_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            width = int(width_str)
                        except ValueError:
                            # No argument to blame: point at the '*' instead
                            if width_loc.tag() == loc_e.Missing:
                                width_loc = part.width
                            self.errfmt.Print_("printf got invalid width %r" %
                                               width_str,
                                               blame_loc=width_loc)
                            return 1

                    precision = -1  # nonexistent
                    if part.precision:
                        if part.precision.id == Id.Format_Dot:
                            # A bare '.' means precision 0
                            precision_str = '0'
                            precision_loc = part.precision  # type: loc_t
                        elif part.precision.id in (Id.Format_Num,
                                                   Id.Format_Zero):
                            precision_str = lexer.TokenVal(part.precision)
                            precision_loc = part.precision
                        elif part.precision.id == Id.Format_Star:
                            # '*' consumes the next argument as the precision
                            if arg_index < num_args:
                                precision_str = varargs[arg_index]
                                precision_loc = locs[arg_index]
                                arg_index += 1
                            else:
                                precision_str = ''
                                precision_loc = loc.Missing
                        else:
                            raise AssertionError()

                        try:
                            precision = int(precision_str)
                        except ValueError:
                            if precision_loc.tag() == loc_e.Missing:
                                precision_loc = part.precision
                            self.errfmt.Print_(
                                'printf got invalid precision %r' %
                                precision_str,
                                blame_loc=precision_loc)
                            return 1

                    # The argument to be formatted; '' when args ran out.
                    if arg_index < num_args:
                        s = varargs[arg_index]
                        word_loc = locs[arg_index]  # type: loc_t
                        arg_index += 1
                        has_arg = True
                    else:
                        s = ''
                        word_loc = loc.Missing
                        has_arg = False

                    # Note: %s could be lexed into Id.Percent_S.  Although small string
                    # optimization would remove the allocation as well.
                    typ = lexer.TokenVal(part.type)
                    if typ == 's':
                        if precision >= 0:
                            s = s[:precision]  # truncate

                    elif typ == 'q':
                        # Most shells give \' for single quote, while OSH gives
                        # $'\'' this could matter when SSH'ing.
                        # Ditto for $'\\' vs. '\'

                        s = j8_lite.MaybeShellEncode(s)

                    elif typ == 'b':
                        # Process just like echo -e, except \c handling is simpler.

                        c_parts = []  # type: List[str]
                        lex = match.EchoLexer(s)
                        while True:
                            id_, tok_val = lex.Next()
                            if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                                break

                            p = word_compile.EvalCStringToken(id_, tok_val)

                            # Unusual behavior: '\c' aborts processing!
                            if p is None:
                                backslash_c = True
                                break

                            c_parts.append(p)
                        s = ''.join(c_parts)

                    elif part.type.id == Id.Format_Time or typ in 'diouxX':
                        # %(...)T and %d share this complex integer conversion logic

                        try:
                            # note: spaces like ' -42 ' accepted and normalized
                            d = int(s)
                        except ValueError:
                            # 'a is interpreted as the ASCII value of 'a'
                            if len(s) >= 1 and s[0] in '\'"':
                                # TODO: utf-8 decode s[1:] to be more correct.  Probably
                                # depends on issue #366, a utf-8 library.
                                # Note: len(s) == 1 means there is a NUL (0) after the quote..
                                d = ord(s[1]) if len(s) >= 2 else 0

                            # No argument means -1 for %(...)T as in Bash Reference Manual
                            # 4.2 "If no argument is specified, conversion behaves as if -1
                            # had been given."
                            elif not has_arg and part.type.id == Id.Format_Time:
                                d = -1

                            else:
                                if has_arg:
                                    blame_loc = word_loc  # type: loc_t
                                else:
                                    blame_loc = part.type
                                self.errfmt.Print_(
                                    'printf expected an integer, got %r' % s,
                                    blame_loc)
                                return 1

                        if part.type.id == Id.Format_Time:
                            # Initialize timezone:
                            #   `localtime' uses the current timezone information initialized
                            #   by `tzset'.  The function `tzset' refers to the environment
                            #   variable `TZ'.  When the exported variable `TZ' is present,
                            #   its value should be reflected in the real environment
                            #   variable `TZ' before call of `tzset'.
                            #
                            # Note: unlike LANG, TZ doesn't seem to change behavior if it's
                            # not exported.
                            #
                            # TODO: In YSH, provide an API that doesn't rely on libc's global
                            # state.

                            tzcell = self.mem.GetCell('TZ')
                            if (tzcell and tzcell.exported and
                                    tzcell.val.tag() == value_e.Str):
                                tzval = cast(value.Str, tzcell.val)
                                posix.putenv('TZ', tzval.s)

                            time_.tzset()

                            # Handle special values:
                            #   User can specify two special values -1 and -2 as in Bash
                            #   Reference Manual 4.2: "Two special argument values may be
                            #   used: -1 represents the current time, and -2 represents the
                            #   time the shell was invoked." from
                            #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                            if d == -1:  # the current time
                                ts = time_.time()
                            elif d == -2:  # the shell start time
                                ts = self.shell_start_time
                            else:
                                ts = d

                            # typ looks like '(FORMAT)T': strip '(' and ')T'
                            s = time_.strftime(typ[1:-2], time_.localtime(ts))
                            if precision >= 0:
                                s = s[:precision]  # truncate

                        else:  # typ in 'diouxX'
                            # Disallowed because it depends on 32- or 64- bit
                            if d < 0 and typ in 'ouxX':
                                e_die(
                                    "Can't format negative number %d with %%%s"
                                    % (d, typ), part.type)

                            if typ == 'o':
                                s = mylib.octal(d)
                            elif typ == 'x':
                                s = mylib.hex_lower(d)
                            elif typ == 'X':
                                s = mylib.hex_upper(d)
                            else:  # diu
                                s = str(d)  # without spaces like ' -42 '

                            # There are TWO different ways to ZERO PAD, and they differ on
                            # the negative sign!  See spec/builtin-printf

                            zero_pad = 0  # no zero padding
                            if width >= 0 and '0' in flags:
                                zero_pad = 1  # style 1
                            elif precision > 0 and len(s) < precision:
                                zero_pad = 2  # style 2

                            if zero_pad:
                                negative = (s[0] == '-')
                                if negative:
                                    digits = s[1:]
                                    sign = '-'
                                    if zero_pad == 1:
                                        # [%06d] -42 becomes [-00042] (6 TOTAL)
                                        n = width - 1
                                    else:
                                        # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                                        n = precision
                                else:
                                    digits = s
                                    sign = ''
                                    if zero_pad == 1:
                                        n = width
                                    else:
                                        n = precision
                                s = sign + digits.rjust(n, '0')

                    else:
                        raise AssertionError()

                    # Space padding to the field width; the '-' flag left-justifies.
                    if width >= 0:
                        if '-' in flags:
                            s = s.ljust(width, ' ')
                        else:
                            s = s.rjust(width, ' ')

                    out.append(s)

                else:
                    raise AssertionError()

                if backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if backslash_c:
                # \c must also discard the REMAINING ARGUMENTS, not just the
                # rest of the format string, so don't recycle the format.
                break

            if arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: printf x y
                break
            if arg_index >= num_args:
                # We printed all args
                break
            # There are more args: Implement the 'arg recycling' behavior.

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Parses the format string (parsed formats are cached by their text),
        formats the arguments, and either writes the result to stdout or, with
        -v, assigns it to the named variable.

        Returns 2 if the format string fails to parse, the non-zero status of
        _Format() if formatting fails, and 0 otherwise.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so it doesn't shadow the imported 'lexer' module.
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.ArgvWord('printf', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0