1 | #include "mycpp/gc_str.h"
|
2 |
|
3 | #include <ctype.h> // isalpha(), isdigit()
|
4 | #include <stdarg.h>
|
5 |
|
6 | #include <regex>
|
7 |
|
8 | #include "mycpp/common.h"
|
9 | #include "mycpp/gc_alloc.h" // NewStr()
|
10 | #include "mycpp/gc_builtins.h" // StringToInt()
|
11 | #include "mycpp/gc_list.h" // join(), split() use it
|
12 |
|
13 | GLOBAL_STR(kEmptyString, "");
|
14 |
|
15 | static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
|
16 | static const int kMaxFmtWidth = 256; // arbitrary...
|
17 |
|
18 | int BigStr::find(BigStr* needle, int start, int end) {
|
19 | if (end == -1) {
|
20 | end = len(this);
|
21 | }
|
22 | int needle_len = len(needle);
|
23 |
|
24 | if (needle_len > (end - start)) {
|
25 | return -1; // needle is too long to be found (Python behavior)
|
26 | }
|
27 |
|
28 | if (needle_len == 1) {
|
29 | char c = needle->data_[0];
|
30 | // For 'aaa'.find('a', 0, 1)
|
31 | // end = 1, needle_len = 1, last_start = 1 which means we go through once
|
32 | for (int i = start; i < end; ++i) {
|
33 | if (data_[i] == c) {
|
34 | return i;
|
35 | }
|
36 | }
|
37 | } else {
|
38 | // Note: this works for finding the empty string. Empty string is found in
|
39 | // empty range like [5, 5), but not in [5, 4)
|
40 |
|
41 | // For 'aaa'.find('aa', 0, 2)
|
42 | // end = 2, needle_len = 2, last_start = 1 which means we go through once
|
43 |
|
44 | int last_start = end - needle_len + 1;
|
45 | // could use a smarter substring search algorithm
|
46 | for (int i = start; i < last_start; ++i) {
|
47 | if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
|
48 | return i;
|
49 | }
|
50 | }
|
51 | }
|
52 | return -1;
|
53 | }
|
54 |
|
55 | int BigStr::rfind(BigStr* needle) {
|
56 | int length = len(this);
|
57 | DCHECK(len(needle) == 1); // Oils usage
|
58 | char c = needle->data_[0];
|
59 | for (int i = length - 1; i >= 0; --i) {
|
60 | if (data_[i] == c) {
|
61 | return i;
|
62 | }
|
63 | }
|
64 | return -1;
|
65 | }
|
66 |
|
67 | bool BigStr::isdigit() {
|
68 | int n = len(this);
|
69 | if (n == 0) {
|
70 | return false; // special case
|
71 | }
|
72 | for (int i = 0; i < n; ++i) {
|
73 | if (!::isdigit(data_[i])) {
|
74 | return false;
|
75 | }
|
76 | }
|
77 | return true;
|
78 | }
|
79 |
|
80 | bool BigStr::isalpha() {
|
81 | int n = len(this);
|
82 | if (n == 0) {
|
83 | return false; // special case
|
84 | }
|
85 | for (int i = 0; i < n; ++i) {
|
86 | if (!::isalpha(data_[i])) {
|
87 | return false;
|
88 | }
|
89 | }
|
90 | return true;
|
91 | }
|
92 |
|
93 | // e.g. for osh/braces.py
|
94 | bool BigStr::isupper() {
|
95 | int n = len(this);
|
96 | if (n == 0) {
|
97 | return false; // special case
|
98 | }
|
99 | for (int i = 0; i < n; ++i) {
|
100 | if (!::isupper(data_[i])) {
|
101 | return false;
|
102 | }
|
103 | }
|
104 | return true;
|
105 | }
|
106 |
|
107 | bool BigStr::startswith(BigStr* s) {
|
108 | int n = len(s);
|
109 | if (n > len(this)) {
|
110 | return false;
|
111 | }
|
112 | return memcmp(data_, s->data_, n) == 0;
|
113 | }
|
114 |
|
115 | bool BigStr::endswith(BigStr* s) {
|
116 | int len_s = len(s);
|
117 | int len_this = len(this);
|
118 | if (len_s > len_this) {
|
119 | return false;
|
120 | }
|
121 | const char* start = data_ + len_this - len_s;
|
122 | return memcmp(start, s->data_, len_s) == 0;
|
123 | }
|
124 |
|
125 | // Get a string with one character
|
126 | BigStr* BigStr::at(int i) {
|
127 | int length = len(this);
|
128 | if (i < 0) {
|
129 | i = length + i;
|
130 | }
|
131 | DCHECK(0 <= i);
|
132 | DCHECK(i < length); // had a problem here!
|
133 |
|
134 | BigStr* result = NewStr(1);
|
135 | result->data_[0] = data_[i];
|
136 | return result;
|
137 | }
|
138 |
|
139 | // s[begin:]
|
140 | BigStr* BigStr::slice(int begin) {
|
141 | return slice(begin, len(this));
|
142 | }
|
143 |
|
144 | // s[begin:end]
|
145 | BigStr* BigStr::slice(int begin, int end) {
|
146 | int length = len(this);
|
147 | SLICE_ADJUST(begin, end, length);
|
148 |
|
149 | DCHECK(0 <= begin && begin <= length);
|
150 | DCHECK(0 <= end && end <= length);
|
151 |
|
152 | int new_len = end - begin;
|
153 | DCHECK(0 <= new_len && new_len <= length);
|
154 |
|
155 | BigStr* result = NewStr(new_len); // has kEmptyString optimization
|
156 | memcpy(result->data_, data_ + begin, new_len);
|
157 |
|
158 | return result;
|
159 | }
|
160 |
|
161 | // Used by 'help' builtin and --help, neither of which translate yet.
|
162 |
|
163 | List<BigStr*>* BigStr::splitlines(bool keep) {
|
164 | DCHECK(keep == true);
|
165 | FAIL(kNotImplemented);
|
166 | }
|
167 |
|
168 | BigStr* BigStr::upper() {
|
169 | int length = len(this);
|
170 | BigStr* result = NewStr(length);
|
171 | char* buffer = result->data();
|
172 | for (int char_index = 0; char_index < length; ++char_index) {
|
173 | buffer[char_index] = toupper(data_[char_index]);
|
174 | }
|
175 | return result;
|
176 | }
|
177 |
|
178 | BigStr* BigStr::lower() {
|
179 | int length = len(this);
|
180 | BigStr* result = NewStr(length);
|
181 | char* buffer = result->data();
|
182 | for (int char_index = 0; char_index < length; ++char_index) {
|
183 | buffer[char_index] = tolower(data_[char_index]);
|
184 | }
|
185 | return result;
|
186 | }
|
187 |
|
188 | BigStr* BigStr::ljust(int width, BigStr* fillchar) {
|
189 | DCHECK(len(fillchar) == 1);
|
190 |
|
191 | int length = len(this);
|
192 | int num_fill = width - length;
|
193 | if (num_fill < 0) {
|
194 | return this;
|
195 | } else {
|
196 | BigStr* result = NewStr(width);
|
197 | char c = fillchar->data_[0];
|
198 | memcpy(result->data_, data_, length);
|
199 | for (int i = length; i < width; ++i) {
|
200 | result->data_[i] = c;
|
201 | }
|
202 | return result;
|
203 | }
|
204 | }
|
205 |
|
206 | BigStr* BigStr::rjust(int width, BigStr* fillchar) {
|
207 | DCHECK(len(fillchar) == 1);
|
208 |
|
209 | int length = len(this);
|
210 | int num_fill = width - length;
|
211 | if (num_fill < 0) {
|
212 | return this;
|
213 | } else {
|
214 | BigStr* result = NewStr(width);
|
215 | char c = fillchar->data_[0];
|
216 | for (int i = 0; i < num_fill; ++i) {
|
217 | result->data_[i] = c;
|
218 | }
|
219 | memcpy(result->data_ + num_fill, data_, length);
|
220 | return result;
|
221 | }
|
222 | }
|
223 |
|
224 | BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
|
225 | // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
|
226 | return replace(old, new_str, -1);
|
227 | }
|
228 |
|
229 | BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
|
230 | // log("replacing %s with %s", old_data, new_str->data_);
|
231 | const char* old_data = old->data_;
|
232 |
|
233 | int this_len = len(this);
|
234 | int old_len = len(old);
|
235 |
|
236 | const char* last_possible = data_ + this_len - old_len;
|
237 |
|
238 | const char* p_this = data_; // advances through 'this'
|
239 |
|
240 | // First pass: Calculate number of replacements, and hence new length
|
241 | int replace_count = 0;
|
242 | if (old_len == 0) {
|
243 | replace_count = this_len + 1;
|
244 | if (count > 0) {
|
245 | replace_count = min(replace_count, count);
|
246 | }
|
247 | } else {
|
248 | while (p_this <= last_possible) {
|
249 | if (replace_count != count && // limit replacements (if count != -1)
|
250 | memcmp(p_this, old_data, old_len) == 0) { // equal
|
251 | replace_count++;
|
252 | p_this += old_len;
|
253 | } else {
|
254 | p_this++;
|
255 | }
|
256 | }
|
257 | }
|
258 |
|
259 | // log("replacements %d", replace_count);
|
260 |
|
261 | if (replace_count == 0) {
|
262 | return this; // Reuse the string if there were no replacements
|
263 | }
|
264 |
|
265 | int new_str_len = len(new_str);
|
266 | int result_len =
|
267 | this_len - (replace_count * old_len) + (replace_count * new_str_len);
|
268 |
|
269 | BigStr* result = NewStr(result_len);
|
270 |
|
271 | const char* new_data = new_str->data_;
|
272 | const size_t new_len = new_str_len;
|
273 |
|
274 | // Second pass: Copy pieces into 'result'
|
275 | p_this = data_; // back to beginning
|
276 | char* p_result = result->data_; // advances through 'result'
|
277 | replace_count = 0;
|
278 |
|
279 | if (old_len == 0) {
|
280 | // Should place new_str between each char in this
|
281 | while (p_this < last_possible && replace_count != count) {
|
282 | replace_count++;
|
283 | memcpy(p_result, new_data, new_len); // Copy from new_str
|
284 | p_result += new_len; // Move past new_str
|
285 |
|
286 | // Write a char from this
|
287 | *p_result = *p_this;
|
288 | p_this++;
|
289 | p_result++;
|
290 | }
|
291 |
|
292 | if (replace_count != count) {
|
293 | // Write a copy of new_str at the end
|
294 | assert(p_this == last_possible);
|
295 | memcpy(p_result, new_data, new_len);
|
296 | } else if (p_this <= last_possible) {
|
297 | // Write the last part of string
|
298 | memcpy(p_result, p_this, data_ + this_len - p_this);
|
299 | }
|
300 | } else {
|
301 | while (p_this <= last_possible) {
|
302 | // Note: would be more efficient if we remembered the match positions
|
303 | if (replace_count != count && // limit replacements (if count != -1)
|
304 | memcmp(p_this, old_data, old_len) == 0) { // equal
|
305 | memcpy(p_result, new_data, new_len); // Copy from new_str
|
306 | replace_count++;
|
307 | p_result += new_len;
|
308 | p_this += old_len;
|
309 | } else { // copy 1 byte
|
310 | *p_result = *p_this;
|
311 | p_result++;
|
312 | p_this++;
|
313 | }
|
314 | }
|
315 | memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
|
316 | }
|
317 |
|
318 | return result;
|
319 | }
|
320 |
|
321 | enum class StripWhere {
|
322 | Left,
|
323 | Right,
|
324 | Both,
|
325 | };
|
326 |
|
327 | const int kWhitespace = -1;
|
328 |
|
329 | bool OmitChar(int ch, int what) {
|
330 | if (what == kWhitespace) {
|
331 | // Intentional incompatibility with Python, where say \v is whitespace
|
332 | // '\v'.strip() == ''
|
333 | //
|
334 | // But it is consistent with the JSON spec [ \t\r\n] and the rules in
|
335 | // frontend/lexer_def.py
|
336 | //
|
337 | // Note that the YSH is separate, and Str => trim() respects Unicode.
|
338 | return IsAsciiWhitespace(ch);
|
339 | } else {
|
340 | return what == ch;
|
341 | }
|
342 | }
|
343 |
|
344 | // StripAny is modeled after CPython's do_strip() in stringobject.c, and can
|
345 | // implement 6 functions:
|
346 | //
|
347 | // strip / lstrip / rstrip
|
348 | // strip(char) / lstrip(char) / rstrip(char)
|
349 | //
|
350 | // Args:
|
351 | // where: which ends to strip from
|
352 | // what: kWhitespace, or an ASCII code 0-255
|
353 |
|
354 | BigStr* StripAny(BigStr* s, StripWhere where, int what) {
|
355 | int length = len(s);
|
356 | const char* char_data = s->data();
|
357 |
|
358 | int i = 0;
|
359 | if (where != StripWhere::Right) {
|
360 | while (i < length && OmitChar(char_data[i], what)) {
|
361 | i++;
|
362 | }
|
363 | }
|
364 |
|
365 | int j = length;
|
366 | if (where != StripWhere::Left) {
|
367 | do {
|
368 | j--;
|
369 | } while (j >= i && OmitChar(char_data[j], what));
|
370 | j++;
|
371 | }
|
372 |
|
373 | if (i == j) { // Optimization to reuse existing object
|
374 | return kEmptyString;
|
375 | }
|
376 |
|
377 | if (i == 0 && j == length) { // nothing stripped
|
378 | return s;
|
379 | }
|
380 |
|
381 | // Note: makes a copy in leaky version, and will in GC version too
|
382 | int new_len = j - i;
|
383 | BigStr* result = NewStr(new_len);
|
384 | memcpy(result->data(), s->data() + i, new_len);
|
385 | return result;
|
386 | }
|
387 |
|
388 | BigStr* BigStr::strip() {
|
389 | return StripAny(this, StripWhere::Both, kWhitespace);
|
390 | }
|
391 |
|
392 | // Used for CommandSub in osh/cmd_exec.py
|
393 | BigStr* BigStr::rstrip(BigStr* chars) {
|
394 | DCHECK(len(chars) == 1);
|
395 | int c = chars->data_[0];
|
396 | return StripAny(this, StripWhere::Right, c);
|
397 | }
|
398 |
|
399 | BigStr* BigStr::rstrip() {
|
400 | return StripAny(this, StripWhere::Right, kWhitespace);
|
401 | }
|
402 |
|
403 | BigStr* BigStr::lstrip(BigStr* chars) {
|
404 | DCHECK(len(chars) == 1);
|
405 | int c = chars->data_[0];
|
406 | return StripAny(this, StripWhere::Left, c);
|
407 | }
|
408 |
|
409 | BigStr* BigStr::lstrip() {
|
410 | return StripAny(this, StripWhere::Left, kWhitespace);
|
411 | }
|
412 |
|
413 | BigStr* BigStr::join(List<BigStr*>* items) {
|
414 | int length = 0;
|
415 |
|
416 | int num_parts = len(items);
|
417 |
|
418 | // " ".join([]) == ""
|
419 | if (num_parts == 0) {
|
420 | return kEmptyString;
|
421 | }
|
422 |
|
423 | // Common case
|
424 | // 'anything'.join(["foo"]) == "foo"
|
425 | if (num_parts == 1) {
|
426 | return items->at(0);
|
427 | }
|
428 |
|
429 | for (int i = 0; i < num_parts; ++i) {
|
430 | length += len(items->at(i));
|
431 | }
|
432 | // add length of all the separators
|
433 | int this_len = len(this);
|
434 | length += this_len * (num_parts - 1);
|
435 |
|
436 | BigStr* result = NewStr(length);
|
437 | char* p_result = result->data_; // advances through
|
438 |
|
439 | for (int i = 0; i < num_parts; ++i) {
|
440 | // log("i %d", i);
|
441 | if (i != 0 && this_len) { // optimize common case of ''.join()
|
442 | memcpy(p_result, data_, this_len); // copy the separator
|
443 | p_result += this_len;
|
444 | // log("this_len %d", this_len);
|
445 | }
|
446 |
|
447 | int n = len(items->at(i));
|
448 | // log("n: %d", n);
|
449 | memcpy(p_result, items->at(i)->data_, n); // copy the list item
|
450 | p_result += n;
|
451 | }
|
452 |
|
453 | return result;
|
454 | }
|
455 |
|
456 | static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
|
457 | int new_len = right - left;
|
458 | BigStr* part;
|
459 | if (new_len == 0) {
|
460 | part = kEmptyString;
|
461 | } else {
|
462 | part = NewStr(new_len);
|
463 | memcpy(part->data_, s->data_ + left, new_len);
|
464 | }
|
465 | result->append(part);
|
466 | }
|
467 |
|
468 | // Split BigStr into List<BigStr*> of parts separated by 'sep'.
|
469 | // The code structure is taken from CPython's Objects/stringlib/split.h.
|
470 | List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
|
471 | DCHECK(sep != nullptr);
|
472 | DCHECK(len(sep) == 1); // we can only split one char
|
473 | char sep_char = sep->data_[0];
|
474 |
|
475 | int str_len = len(this);
|
476 | if (str_len == 0) {
|
477 | // weird case consistent with Python: ''.split(':') == ['']
|
478 | return NewList<BigStr*>({kEmptyString});
|
479 | }
|
480 |
|
481 | List<BigStr*>* result = NewList<BigStr*>({});
|
482 | int left = 0;
|
483 | int right = 0;
|
484 | int num_parts = 0; // 3 splits results in 4 parts
|
485 |
|
486 | while (right < str_len && num_parts < max_split) {
|
487 | // search for separator
|
488 | for (; right < str_len; right++) {
|
489 | if (data_[right] == sep_char) {
|
490 | AppendPart(result, this, left, right);
|
491 | right++;
|
492 | left = right;
|
493 | num_parts++;
|
494 | break;
|
495 | }
|
496 | }
|
497 | }
|
498 | if (num_parts == 0) { // Optimization when there is no split
|
499 | result->append(this);
|
500 | } else if (left <= str_len) { // Last part
|
501 | AppendPart(result, this, left, str_len);
|
502 | }
|
503 |
|
504 | return result;
|
505 | }
|
506 |
|
507 | List<BigStr*>* BigStr::split(BigStr* sep) {
|
508 | return this->split(sep, len(this));
|
509 | }
|
510 |
|
511 | unsigned BigStr::hash(HashFunc h) {
|
512 | if (!is_hashed_) {
|
513 | hash_ = h(data_, len(this)) >> 1;
|
514 | is_hashed_ = 1;
|
515 | }
|
516 | return hash_;
|
517 | }
|
518 |
|
519 | static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
|
520 | auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
|
521 | auto end = std::cregex_iterator();
|
522 |
|
523 | char int_buf[kMaxFmtWidth];
|
524 | std::string buf;
|
525 | for (std::cregex_iterator it = beg; it != end; ++it) {
|
526 | const std::cmatch& match = *it;
|
527 |
|
528 | const std::csub_match& lit_m = match[1];
|
529 | DCHECK(lit_m.matched);
|
530 | const std::string& lit_s = lit_m.str();
|
531 | buf.append(lit_s);
|
532 |
|
533 | int width = 0;
|
534 | bool zero_pad = false;
|
535 | bool pad_back = false;
|
536 | const std::csub_match& width_m = match[2];
|
537 | const std::string& width_s = width_m.str();
|
538 | bool ok = false;
|
539 | if (width_m.matched && !width_s.empty()) {
|
540 | if (width_s[0] == '0') {
|
541 | zero_pad = true;
|
542 | DCHECK(width_s.size() > 1);
|
543 | ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
|
544 | DCHECK(ok);
|
545 | (void)ok; // silence unused var warning in opt
|
546 | } else {
|
547 | ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
|
548 | DCHECK(ok);
|
549 | }
|
550 | if (width < 0) {
|
551 | pad_back = true;
|
552 | width *= -1;
|
553 | }
|
554 | DCHECK(0 <= width && width < kMaxFmtWidth);
|
555 | }
|
556 |
|
557 | char const* str_to_add = nullptr;
|
558 | int add_len = 0;
|
559 | const std::csub_match& code_m = match[3];
|
560 | const std::string& code_s = code_m.str();
|
561 | if (!code_m.matched) {
|
562 | DCHECK(!width_m.matched); // python errors on invalid format operators
|
563 | break;
|
564 | }
|
565 | DCHECK(code_s.size() == 1);
|
566 | switch (code_s[0]) {
|
567 | case '%': {
|
568 | str_to_add = code_s.c_str();
|
569 | add_len = 1;
|
570 | break;
|
571 | }
|
572 | case 's': {
|
573 | BigStr* s = va_arg(args, BigStr*);
|
574 | // Check type unconditionally because mycpp doesn't always check it
|
575 | CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
|
576 |
|
577 | str_to_add = s->data();
|
578 | add_len = len(s);
|
579 | zero_pad = false; // python ignores the 0 directive for strings
|
580 | break;
|
581 | }
|
582 | case 'r': {
|
583 | BigStr* s = va_arg(args, BigStr*);
|
584 | // Check type unconditionally because mycpp doesn't always check it
|
585 | CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
|
586 |
|
587 | s = repr(s);
|
588 | str_to_add = s->data();
|
589 | add_len = len(s);
|
590 | zero_pad = false; // python ignores the 0 directive for strings
|
591 | break;
|
592 | }
|
593 | case 'd': // fallthrough
|
594 | case 'o': {
|
595 | int d = va_arg(args, int);
|
596 | add_len = snprintf(int_buf, kMaxFmtWidth,
|
597 | match.str().c_str() + lit_s.size(), d);
|
598 | DCHECK(add_len > 0);
|
599 | str_to_add = int_buf;
|
600 | break;
|
601 | }
|
602 | default:
|
603 | DCHECK(0);
|
604 | break;
|
605 | }
|
606 | DCHECK(str_to_add != nullptr);
|
607 |
|
608 | if (pad_back) {
|
609 | buf.append(str_to_add, add_len);
|
610 | }
|
611 | if (add_len < width) {
|
612 | for (int i = 0; i < width - add_len; ++i) {
|
613 | buf.push_back(zero_pad ? '0' : ' ');
|
614 | }
|
615 | }
|
616 | if (!pad_back) {
|
617 | buf.append(str_to_add, add_len);
|
618 | }
|
619 | }
|
620 |
|
621 | return StrFromC(buf.c_str(), buf.size());
|
622 | }
|
623 |
|
624 | BigStr* StrIter::Value() { // similar to at()
|
625 | BigStr* result = NewStr(1);
|
626 | result->data_[0] = s_->data_[i_];
|
627 | DCHECK(result->data_[1] == '\0');
|
628 | return result;
|
629 | }
|
630 |
|
631 | BigStr* StrFormat(const char* fmt, ...) {
|
632 | va_list args;
|
633 | va_start(args, fmt);
|
634 | BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
|
635 | va_end(args);
|
636 | return ret;
|
637 | }
|
638 |
|
639 | BigStr* StrFormat(BigStr* fmt, ...) {
|
640 | va_list args;
|
641 | va_start(args, fmt);
|
642 | BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
|
643 | va_end(args);
|
644 | return ret;
|
645 | }
|