osh/glob

OILS / osh / glob_.py View on Github | oilshell.org

494 lines, 256 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (
7	CompoundWord,
8	Token,
9	word_part_e,
10	glob_part,
11	glob_part_e,
12	glob_part_t,
13	)
14	from core import pyutil
15	from frontend import match
16	from mycpp import mylib
17	from mycpp.mylib import log, print_stderr
18
19	from typing import List, Tuple, cast, TYPE_CHECKING
20	if TYPE_CHECKING:
21	from core import optview
22	from frontend.match import SimpleLexer
23
24	_ = log
25
26
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether the string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calling glob() unless the string has glob
    metacharacters.  A string "looks like a glob" when it contains an
    unescaped * or ?, or an unescaped [ that is later followed by an
    unescaped ].

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open_bracket = False
    pos = 0
    length = len(s)
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # The byte after a backslash is escaped, so step over it.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open_bracket = True
            continue

        if seen_open_bracket and mylib.ByteEquals(byte, ']'):
            # At least one balanced pair of [].  Stray [ or ] are not
            # checked.
            return True

    return False
63
64
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but inspects the Literal parts of a static word.

    Returns True when a Literal part is a * or ? token, or when a literal ]
    token appears after a literal [ token.  Non-Literal parts are ignored.
    """
    saw_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            saw_lbracket = True
        elif saw_lbracket and tok_id == Id.Lit_RBracket:
            return True

    return False
80
81
# Glob helpers for WordParts.
#
# Notes on the set of characters that get a backslash:
# - Escaping / doesn't work, because it's not a filename character.
# - ! : - are metacharacters within character classes.
# - ( ) \| are extended glob characters.  It's OK to add an extra \ when the
#   underlying library doesn't support extended globs.
# - The @ in @(cc) doesn't need escaping, because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()\|'


def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
95
96
# Characters escaped for extended regexes.
# Bug fix: [] is included so that [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()\|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in s.

    Quoted parts of a regex must be regex-escaped, e.g. [[ $a =~ "{" ]].
    libc doesn't appear to have a function for this, so the characters in
    ERE_META_CHARS are escaped here.  See:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
109
110
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: A string in which glob metacharacters may be preceded by a backslash.

    Returns:
      The string with each backslash that precedes a character in
      GLOB_META_CHARS removed.  A backslash that is the last byte of s is
      kept as-is.

    Raises:
      AssertionError: if a backslash is followed by a character that is not
        in GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash has nothing to escape, so it is excluded here
        # and copied through literally by the else branch.  (The former
        # 'Trailing backslash' assertion was suppressed to fix bug #698; #628
        # is still there.)
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
142
143
144	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
145	# positions of matches. So we convert globs to regexps.
146
147	# Problems:
148	# - What about unicode? Do we have to set any global variables? We want it to
149	# always use utf-8?
150
151
class _GlobParser(object):
    """Turns the tokens of a glob lexer into a list of glob_part_t nodes.

    Syntax problems don't raise; they are recorded in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # The current token, updated by _Next().
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if the parse succeeds.  If
          the input ends before the brackets balance, a list of Literal parts
          (the [ and everything after it), and a warning is appended.
        """
        opener = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # The opening [ was already consumed
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # This is punted on for now because the rule isn't clear and
        # consistent between shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opener]  # type: List[glob_part_t]
                for (lit_id, lit_str) in inner:
                    literals.append(glob_part.Literal(lit_id, lit_str))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # The closing ] is not part of the class contents

            inner.append((tok_id, self.token_val))

        negated = False
        if len(inner):
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for (_, text) in inner:
            strs.append(text)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse all tokens up to the end of the pattern.

        Returns:
          The list of glob parts (Operator, CharClass, Literal)
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type
            s = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if id_ == Id.Eol_Tok:
                return parts, self.warnings

            if id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))
                continue

            if id_ == Id.Glob_LBracket:
                # Could yield Literal parts or a CharClass
                parts.extend(self._ParseCharClass())
                continue

            # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
            # EscapedChar,BadBackslash}
            parts.append(glob_part.Literal(id_, s))

            # Also check for warnings.  TODO: location info.
            if id_ == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif id_ == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')
253
254
# Characters that get a backslash when they appear as literals in the
# generated ERE.  The backslashes are written explicitly (\\) so the literal
# contains no invalid escape sequence; the value is unchanged.
_REGEX_CHARS_TO_ESCAPE = '.\\|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Generate an extended regular expression from parsed glob parts.

    Args:
      parts: Glob parts as produced by _GlobParser.Parse().

    Returns:
      The ERE as a string.  Literal parts are escaped as needed, ? becomes
      '.', * becomes '.*', and character classes are emitted as bracket
      expressions.

    Raises:
      AssertionError: on a Literal with an unexpected id or an Operator with
        an unexpected op_id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only special inside a char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only special inside a char class, so outside of one it is
            # a literal.  It is in _REGEX_CHARS_TO_ESCAPE and therefore comes
            # out as \^ rather than as a regex anchor.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
355
356
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: The glob pattern.

    Returns:
      The generated ERE string
      A list of warnings about the glob syntax, from the parser
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: If there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would
    # have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
379
380
381	# Notes for implementing extglob
382	# - libc glob() doesn't have any extension!
383	# - Nix stdenv uses !(foo) and @(foo\|bar)
384	# - can we special case these for now?
385	# - !(foo\|bar) -- change it to *, and then just do fnmatch() to filter the
386	# result!
387	# - Actually I guess we can do that for all of them. That seems fine.
388	# - But we have to get the statically parsed arg in here?
389	# - or do dynamic parsing
390	# - LooksLikeGlob() would have to respect extglob! ugh!
391	# - See 2 calls in osh/word_eval.py
392
393
class Globber(object):
    """Expands glob patterns, consulting shell options on exec_opts."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # dotglob           dotfiles are matched
        # globstar          ** for directories
        # globasciiranges   ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob           the @() !() syntax -- libc helps us with
        #                   fnmatch(), but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the matches to 'out'.

        Returns:
          Number of items appended.

        Raises:
          RuntimeError: re-raised from libc.glob() after printing a message
            to stderr.
        """
        try:
            results = libc.glob(arg)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise

        if len(results) == 0:
            return 0  # Nothing matched

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            kept = [s for s in results if not s.startswith('-')]
            results = kept  # idiom to work around mycpp limitation

        out.extend(results)
        return len(results)

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        opts = self.exec_opts

        if opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        num_matched = self._Glob(arg, out)
        if num_matched != 0:
            return num_matched

        # Nothing matched
        if opts.failglob():
            return -1

        if opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(arg))
        return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand glob_pat, keep the results that match fnmatch_pat, and
        append them to 'out'.

        Returns:
          Number of items appended, or -1 when nothing matched and failglob
          is on.
        """
        opts = self.exec_opts

        if opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)

        matched = []  # type: List[str]
        for candidate in candidates:
            if libc.fnmatch(fnmatch_pat, candidate):
                matched.append(candidate)

        num_matched = len(matched)
        if num_matched != 0:
            out.extend(matched)
            return num_matched

        if opts.failglob():
            return -1  # nothing matched

        if opts.nullglob():
            return 0

        # Same as in Expand(): fall back to the unescaped pattern
        out.append(GlobUnescape(fnmatch_pat))
        return 1