#!/usr/bin/env python2
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""
from __future__ import print_function

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic to EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


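# A rough picture of the mapping above (illustrative tokens, not taken from
# the test suite):
#
#   \u{3bc}  Id.Char_UBraced  ->  CharCode(tok, 0x3bc, True)   # u_braced
#   \n       Id.Char_OneChar  ->  CharCode(tok, 10, False)
#   \x41     Id.Char_Hex      ->  CharCode(tok, 0x41, False)
#   a        Id.Lit_Chars     ->  CharCode(tok, 97, False)

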
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)


    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))


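# A few concrete mappings (illustrative; the values follow from the branches
# above, not from the test suite):
#
#   EvalCStringToken(Id.Char_OneChar, r'\n')      -> '\n'
#   EvalCStringToken(Id.Char_Octal3, r'\377')     -> '\xff'
#   EvalCStringToken(Id.Char_Hex, r'\x41')        -> 'A'
#   EvalCStringToken(Id.Char_UBraced, r'\u{3bc}') -> '\xce\xbc'  (UTF-8, U+03BC)
#   EvalCStringToken(Id.Char_Stop, r'\c')         -> None (stop-output sentinel)

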
def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """ Done at parse time """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)


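# Example of the $'' case (illustrative token stream, not taken from the test
# suite): for $'a\nb', tokens is roughly
#
#   [Lit_Chars 'a', Char_OneChar for \n, Lit_Chars 'b']
#
# and the evaluated pieces join to 'a' + '\n' + 'b' == 'a\nb'.

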
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return true if every byte in the token's span is in byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the token before ''' etc. is space to trim """
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the space/newlines after ''' should be trimmed

    Like s.isspace(), without legacy \\f \\v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


# Whitespace trimming algorithms:
#
# 1. Trim what's after opening ''' or """, if it's whitespace
# 2. Determine what's before closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to maintain
#      the lossless invariant


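# Worked example of the algorithm (illustrative):
#
#     var x = """
#         hello
#           there
#         """
#
# Step 1 removes the all-whitespace token after the opening quote.  Step 2
# finds the 8 spaces before the closing quote, so to_strip is 8 spaces.
# Step 3 strips that prefix from each line, leaving 'hello\n  there\n'.

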
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there are many
            # lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


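# Sketch of the mutation above (illustrative numbers): a Lit_Chars token for
# the line '    hello\n', with col=0 and length=10, becomes col=4 and
# length=6 when to_strip is 4 spaces, with id Id.Lit_CharsWithoutPrefix.  The
# underlying line is never modified, which preserves the lossless invariant.

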
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating them in place.

    Tokens that begin a line have their col bumped and length shortened, and
    their ID changed to Id.Lit_CharsWithoutPrefix.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #   x
    #   '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return


    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)


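# Example (illustrative): for
#
#     var x = '''
#       hello
#       '''
#
# tokens arrives roughly as [Lit_Chars '\n', Lit_Chars '      hello\n',
# Lit_Chars '      '].  The first and last tokens are popped, and the middle
# token is narrowed to 'hello\n' by the loop above.
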
# vim: sw=4