| 1 | ---
 | 
| 2 | default_highlighter: oils-sh
 | 
| 3 | ---
 | 
| 4 | 
 | 
| 5 | YSH Regex API - Convenient and Powerful
 | 
| 6 | =======================================
 | 
| 7 | 
 | 
| 8 | YSH has [Egg Expressions](eggex.html), a composable and readable syntax for
 | 
| 9 | regular expressions.  You can use *Eggex* with both:
 | 
| 10 | 
 | 
| 11 | - A convenient Perl-like operator: `'mystr' ~ / [a-z]+ /`
 | 
| 12 |   - access submatches with global `_group()`   `_start()`   `_end()`
 | 
| 13 | 
 | 
| 14 | - A powerful Python-like API: `'mystr' => search(/ [a-z]+ /)` and `leftMatch()`
 | 
| 15 |   - access submatches with `Match` object methods `m => group()`   `m =>
 | 
| 16 |     start()`   `m => end()`
 | 
| 17 | 
 | 
| 18 | You can also use plain POSIX regular expressions ([ERE]($xref)) instead of
 | 
| 19 | Eggex.
 | 
| 20 | 
 | 
| 21 | <div id="toc">
 | 
| 22 | </div>
 | 
| 23 | 
 | 
| 24 | <!--
 | 
| 25 | TODO: need $help-topic shortcut
 | 
| 26 | 
 | 
| 27 | - [`_group()`]($help-topic:_group)
 | 
| 28 | - [`Match => group()`]($help-topic:group)
 | 
| 29 | - [`Str => search()`]($help-topic:search)
 | 
| 30 | - [`Str => leftMatch()`]($help-topic:leftMatch)
 | 
| 31 | -->
 | 
| 32 | 
 | 
| 33 | ## Perl-Like `~` operator
 | 
| 34 | 
 | 
| 35 | The `~` operator tests if a string matches a pattern.  The captured groups are
 | 
| 36 | available through "global register" functions starting with `_`.
 | 
| 37 | 
 | 
| 38 |     var s = 'days 04-01 and 10-31'
 | 
| 39 |     var eggex = /<capture d+ as month> '-' <capture d+ as day>/
 | 
| 40 | 
 | 
| 41 |     if (s ~ eggex) {
 | 
| 42 |       = _group(1)  # => '04', the first capture
 | 
| 43 |       = _group(2)  # => '01', the second capture
 | 
| 44 | 
 | 
| 45 |       = _start(1)  # => 5, start index of the first capture
 | 
| 46 |       = _end(1)    # => 7, end index of the first capture
 | 
| 47 |     }
 | 
| 48 | 
 | 
| 49 | The eggex pattern has **named captures** like `as month`, so it's more typical to
 | 
| 50 | write:
 | 
| 51 | 
 | 
| 52 |     if (s ~ eggex) {
 | 
| 53 |       = _group('month')  # => '04'
 | 
| 54 |       = _group('day')    # => '01'
 | 
| 55 | 
 | 
| 56 |       = _start('month')  # => 5
 | 
| 57 |       = _end('month')    # => 7
 | 
| 58 |     }
 | 
| 59 | 
 | 
| 60 | You can test if a string does **not** match a pattern with `!~`:
 | 
| 61 | 
 | 
| 62 |     if (s !~ / space /) {
 | 
| 63 |       echo 'no whitespace'
 | 
| 64 |     }
 | 
| 65 | 
 | 
| 66 | The pattern can also be a string, in plain [ERE]($xref) syntax:
 | 
| 67 | 
 | 
| 68 |     if (s ~ '([[:digit:]]+)') {
 | 
| 69 |       = _group(1)
 | 
| 70 |     }
 | 
| 71 | 
 | 
| 72 | Help topics:
 | 
| 73 | 
 | 
| 74 | - [match-ops](ref/chap-expr-lang.html#match-ops)
 | 
| 75 |   - [`_group()`](ref/chap-builtin-func.html#_group)
 | 
| 76 |   - [`_start()`](ref/chap-builtin-func.html#_start)
 | 
| 77 |   - [`_end()`](ref/chap-builtin-func.html#_end)
 | 
| 78 | 
 | 
| 79 | ## Python-like API
 | 
| 80 | 
 | 
| 81 | ### `search()` returns a value.Match object
 | 
| 82 | 
 | 
| 83 | The `search()` method is like the `~` operator, but it returns either `null` or
 | 
| 84 | a `Match` object.
 | 
| 85 | 
 | 
| 86 | `Match` objects have `group()`, `start()`, and `end()` methods.
 | 
| 87 | 
 | 
| 88 |     var m = s => search(eggex)
 | 
| 89 |     if (m) {  # test if it matched
 | 
| 90 |       = m => group('month')  # => '04'
 | 
| 91 |       = m => group('day')    # => '01'
 | 
| 92 |     }
 | 
| 93 | 
 | 
| 94 | You can search from a given starting position:
 | 
| 95 | 
 | 
| 96 |     var m = s => search(eggex, pos=12)
 | 
| 97 |     if (m) {
 | 
| 98 |       = m => group('month')  # => '10', first month after pos 12
 | 
| 99 |       = m => group('day')    # => '31', first day after pos 12
 | 
| 100 |     }
 | 
| 101 | 
 | 
| 102 | The `search()` method is a bit like `Str => find()`, which searches for a
 | 
| 103 | substring rather than a pattern.
 | 
| 104 | 
 | 
| 105 | Help topics:
 | 
| 106 | 
 | 
| 107 | - [search()](ref/chap-type-method.html#search) for a pattern
 | 
| 108 |   - [Match => group()](ref/chap-type-method.html#group)
 | 
| 109 |   - [Match => start()](ref/chap-type-method.html#start)
 | 
| 110 |   - [Match => end()](ref/chap-type-method.html#end)
 | 
| 111 | - [find()](ref/chap-type-method.html#find) a substring
 | 
| 112 | 
 | 
| 113 | ### `leftMatch()` for Iterative matching / Lexers
 | 
| 114 | 
 | 
| 115 | The `leftMatch()` method is like `search()`, but the string must match the
 | 
| 116 | pattern at the left-most position.
 | 
| 117 | 
 | 
| 118 | It's useful for writing iterative lexers.
 | 
| 119 | 
 | 
| 120 |     var s = 'hi 123'
 | 
| 121 | 
 | 
| 122 |     var Name  = / <capture [a-z]+ as name> /
 | 
| 123 |     var Num   = / <capture d+ as num> /
 | 
| 124 |     var Space = / <capture s+ as space> /
 | 
| 125 | 
 | 
| 126 |     # 3 kinds of tokens.
 | 
| 127 |     # (For CapWords variables, splicing @Name doesn't require @.)
 | 
| 128 |     var lexer = / Name | Num | Space /
 | 
| 129 | 
 | 
| 130 |     var pos = 0  # start at position 0
 | 
| 131 |     while (true) {
 | 
| 132 |       var m = s => leftMatch(lexer, pos=pos)
 | 
| 133 |       if (not m) {
 | 
| 134 |         break
 | 
| 135 |       }
 | 
| 136 |       # Test which subgroup matched
 | 
| 137 |       var id = null
 | 
| 138 |       if (m => group('name') !== null) {
 | 
| 139 |         setvar id = 'name'
 | 
| 140 |       } elif (m => group('num') !== null) {
 | 
| 141 |         setvar id = 'num'
 | 
| 142 |       } elif (m => group('space') !== null) {
 | 
| 143 |         setvar id = 'space'
 | 
| 144 |       }
 | 
| 145 |       # Calculate the token value
 | 
| 146 |       var end_pos = m => end(0)
 | 
| 147 |       var val = s[pos:end_pos]
 | 
| 148 | 
 | 
| 149 |       echo "Token $id $val"
 | 
| 150 | 
 | 
| 151 |       setvar pos = end_pos  # Advance position
 | 
| 152 |     }
 | 
| 153 | 
 | 
| 154 | (YSH `leftMatch()` vs. `search()` is like Python's `re.match()` vs.
 | 
| 155 | `re.search()`.)
 | 
| 156 | 
 | 
| 157 | - Help topic: [leftMatch()](ref/chap-type-method.html#leftMatch)
 | 
| 158 | 
 | 
| 159 | ## More Features
 | 
| 160 | 
 | 
| 161 | ### Named Captures
 | 
| 162 | 
 | 
| 163 | As noted above, you can name a capture group with, say, `<capture d+ as month>`,
 | 
| 164 | and access it with either
 | 
| 165 | 
 | 
| 166 | - `_group('month')` for the global match
 | 
| 167 | - `m => group('month')` when using `Str` methods
 | 
| 168 | 
 | 
| 169 | ### Type Conversion Funcs - A Better `scanf()`
 | 
| 170 | 
 | 
| 171 | You can also add `: funcName` to convert the captured string to a different
 | 
| 172 | value.
 | 
| 173 | 
 | 
| 174 |     var pat = / <capture d+ as month: int> /
 | 
| 175 |     if ('10-31' ~ pat) {
 | 
| 176 |       = _group('month')  # the integer 10, not the string '10'
 | 
| 177 |     }
 | 
| 178 | 
 | 
| 179 | The `func` should accept a string, and return any type of value.
 | 
| 180 | 
 | 
| 181 | Conversion funcs also work with positional captures: `/<capture d+ : int>/`.
 | 
| 182 | 
 | 
| 183 | - Help topic: [re-capture](ref/chap-expr-lang.html#re-capture)
 | 
| 184 | 
 | 
| 185 | ### Replacement / Substitution (TODO)
 | 
| 186 | 
 | 
| 187 | We plan to use unevaluated string literals like `^"hello $1"` ("quotations") as
 | 
| 188 | the replacement object.
 | 
| 189 | 
 | 
| 190 | This is instead of Python's custom language like `'hello \g<1>'`.
 | 
| 191 | 
 | 
| 192 |     # var new = s => replace(/<capture d+ as month>/, ^"month is $month")
 | 
| 193 | 
 | 
| 194 | - Help topic: [replace()](ref/chap-type-method.html#replace)
 | 
| 195 | 
 | 
| 196 | <!--
 | 
| 197 | 
 | 
| 198 | Notes:
 | 
| 199 | - replace() can be for both substring and eggex?
 | 
| 200 | - replace() can also be a func taking a match object?
 | 
| 201 | 
 | 
| 202 | -->
 | 
| 203 | 
 | 
| 204 | ## Summary
 | 
| 205 | 
 | 
| 206 | YSH is designed to have the **convenience** of Perl and Awk, and the **power**
 | 
| 207 | of Python and JavaScript.
 | 
| 208 | 
 | 
| 209 | Eggexes can be composed by *splicing*.  Splicing works on expressions, not
 | 
| 210 | strings.
 | 
| 211 | 
 | 
| 212 | Replacement will use shell's string literal syntax, rather than a new
 | 
| 213 | `printf`-like mini-language.
 | 
| 214 | 
 | 
| 215 | ## Appendix: Python-like wrappers around the API
 | 
| 216 | 
 | 
| 217 | ### Slurping All Matches
 | 
| 218 | 
 | 
| 219 | Python's `findall()` function can be emulated by using `search()` in a loop,
 | 
| 220 | similar to the lexer example above:
 | 
| 221 | 
 | 
| 222 |     func findAll(s, pat) {
 | 
| 223 |       var pos = 0
 | 
| 224 |       var result = []
 | 
| 225 |       while (true) {
 | 
| 226 |         var m = s => search(pat, pos=pos)
 | 
| 227 |         if (not m) {
 | 
| 228 |           break
 | 
| 229 |         }
 | 
| 230 |         var left = m => start(0)
 | 
| 231 |         var right = m => end(0)
 | 
| 232 |         call result->append(s[left:right])
 | 
| 233 |         setvar pos = right
 | 
| 234 |       }
 | 
| 235 |       return (result)
 | 
| 236 |     }
 | 
| 237 | 
 | 
| 238 |     var matches = findAll('days 04-01 and 10-31', / d+ '-' d+ /)
 | 
| 239 |     json write (matches)  # => ['04-01', '10-31']
 | 
| 240 | 
 | 
| 241 | ### Split by Pattern
 | 
| 242 | 
 | 
| 243 | Python's `re.split()` can also be emulated by using `search()` in a loop.
 | 
| 244 | 
 | 
| 245 | ## Eggex Help Topics
 | 
| 246 | 
 | 
| 247 | - [re-literal](ref/chap-expr-lang.html#re-literal)
 | 
| 248 | - [re-primitive](ref/chap-expr-lang.html#re-primitive)
 | 
| 249 | - [class-literal](ref/chap-expr-lang.html#class-literal)
 | 
| 250 | - [named-class](ref/chap-expr-lang.html#named-class)
 | 
| 251 | - [re-repeat](ref/chap-expr-lang.html#re-repeat)
 | 
| 252 | - [re-compound](ref/chap-expr-lang.html#re-compound)
 | 
| 253 | - [re-capture](ref/chap-expr-lang.html#re-capture)
 | 
| 254 | - [re-splice](ref/chap-expr-lang.html#re-splice)
 | 
| 255 | - [re-flags](ref/chap-expr-lang.html#re-flags)
 | 
| 256 | 
 |