OILS / osh / glob_.py View on Github | oilshell.org

494 lines, 256 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (
7 CompoundWord,
8 Token,
9 word_part_e,
10 glob_part,
11 glob_part_e,
12 glob_part_t,
13)
14from core import pyutil
15from frontend import match
16from mycpp import mylib
17from mycpp.mylib import log, print_stderr
18
19from typing import List, Tuple, cast, TYPE_CHECKING
20if TYPE_CHECKING:
21 from core import optview
22 from frontend.match import SimpleLexer
23
24_ = log
25
26
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Return True if 's' has unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string looks like a glob when it contains an unescaped * or ?, or an
    unescaped ] that appears after an unescaped [.  A backslash hides the
    byte that follows it.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_lbracket = False
    pos = 0
    length = len(s)
    while pos < length:
        byte = mylib.ByteAt(s, pos)

        if mylib.ByteEquals(byte, '\\'):
            # Skip the backslash AND the byte it escapes.
            pos += 2
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_lbracket = True
        elif seen_lbracket and mylib.ByteEquals(byte, ']'):
            # It has at least one pair of balanced [].  Not bothering to
            # check stray [ or ].
            return True

        pos += 1
    return False
63
64
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for static words.

    Only Literal word parts are inspected: a * or ? token, or a ] token that
    follows a [ token, makes the word look like a glob.
    """
    seen_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True
        if tok_id == Id.Lit_LBracket:
            seen_lbracket = True
        elif tok_id == Id.Lit_RBracket and seen_lbracket:
            return True
    return False
80
81
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
GLOB_META_CHARS = r'\*?[]-:!()|'


def GlobEscape(s):
    # type: (str) -> str
    """Escape the characters in GLOB_META_CHARS via pyutil.BackslashEscape.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
95
96
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Escape the characters in ERE_META_CHARS via pyutil.BackslashEscape.

    Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].  I don't
    think libc has a function to do this.  The escaped characters are the
    ones described here:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
109
110
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: a glob-escaped string

    Returns:
      's' with each backslash that precedes a GLOB_META_CHARS byte removed.
      A backslash that is the very last byte is copied through unchanged.

    Raises:
      AssertionError: if a backslash precedes a byte that is not in
        GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash has nothing to escape, so it falls through to
        # the literal branch instead of failing.  This was relaxed to fix
        # bug #698; #628 is still there.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
142
143
144# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
145# positions of matches. So we convert globs to regexps.
146
147# Problems:
148# - What about unicode? Do we have to set any global variables? We want it to
149# always use utf-8?
150
151
class _GlobParser(object):
    """Turns the tokens of a glob lexer into a list of glob_part_t nodes."""

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token, updating token_type and token_val."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse the remainder of a character class; the current token is [.

        Returns:
          A one-element list holding a CharClass if the brackets balance.
          Otherwise, every token consumed (including the opening [) is
          returned as a Literal, and a warning is appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (lit_id, lit_val) in inner:
                    literals.append(glob_part.Literal(lit_id, lit_val))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1

            if depth != 0:  # Don't append the last ]
                inner.append((tok_id, self.token_val))

        negated = False
        if len(inner) != 0:
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for (_, tok_val) in inner:
            strs.append(tok_val)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Consume every token up to Eol_Tok.

        Returns:
          A list of glob parts (Literal, Operator, CharClass)
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        self._Next()
        while self.token_type != Id.Eol_Tok:
            tok_id = self.token_type
            tok_val = self.token_val
            #log('%s %r', self.token_type, self.token_val)

            if tok_id == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())

            elif tok_id == Id.Glob_Star or tok_id == Id.Glob_QMark:
                parts.append(glob_part.Operator(tok_id))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(tok_id, tok_val))

                # Also check for warnings.  TODO: location info.
                if tok_id == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif tok_id == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

            self._Next()

        return parts, self.warnings
253
254
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Translate parsed glob parts into an extended regular expression.

    Args:
      parts: glob_part_t nodes, e.g. from _GlobParser.Parse()

    Returns:
      The ERE source string.  No anchors are added here.

    Raises:
      AssertionError: on a Literal ID or Operator ID that isn't handled.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, and
            # it gets a backslash because it's in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                # Backslashes are doubled so these are valid string escapes;
                # the values are still the 2-byte strings \- and \]
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
355
356
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      The ERE string generated from the parsed pattern
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    parser = _GlobParser(lexer)
    parts, warnings = parser.Parse()

    # NOTE: If there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would
    # have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    return regex, warnings
379
380
381# Notes for implementing extglob
382# - libc glob() doesn't have any extension!
383# - Nix stdenv uses !(foo) and @(foo|bar)
384# - can we special case these for now?
385# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
386# result!
387# - Actually I guess we can do that for all of them. That seems fine.
388# - But we have to get the statically parsed arg in here?
389# - or do dynamic parsing
390# - LooksLikeGlob() would have to respect extglob! ugh!
391# - See 2 calls in osh/word_eval.py
392
393
class Globber(object):
    """Expands glob patterns with libc, honoring the shell's glob options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # dotglob           dotfiles are matched
        # globstar          ** for directories
        # globasciiranges   ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob           the @() !() syntax -- libc helps us with
        #                   fnmatch(), but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on 'arg' and append the matches to 'out'.

        Returns:
          Number of items appended.

        Raises:
          RuntimeError: re-raised from libc.glob() after printing a message.
        """
        try:
            results = libc.glob(arg)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        out.extend(results)
        return len(results)

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        num_matched = self._Glob(arg, out)
        if num_matched != 0:
            return num_matched

        # Nothing matched
        if self.exec_opts.failglob():
            return -1
        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(arg))
        return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Glob with 'glob_pat', keeping only results matching 'fnmatch_pat'.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)
        filtered = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]
        num_kept = len(filtered)

        if num_kept != 0:
            out.extend(filtered)
            return num_kept

        if self.exec_opts.failglob():
            return -1  # nothing matched
        if self.exec_opts.nullglob():
            return 0

        # Same fallback as Expand(): give back the unescaped pattern
        out.append(GlobUnescape(fnmatch_pat))
        return 1