| 1 | #ifndef MYCPP_GC_STR_H
|
| 2 | #define MYCPP_GC_STR_H
|
| 3 |
|
| 4 | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
|
| 5 | #include "mycpp/gc_obj.h" // GC_OBJ
|
| 6 | #include "mycpp/hash.h" // HashFunc
|
| 7 |
|
| 8 | template <typename T>
|
| 9 | class List;
|
| 10 |
|
| 11 | class BigStr {
|
| 12 | public:
|
| 13 | // Don't call this directly. Call NewStr() instead, which calls this.
|
| 14 | BigStr() {
|
| 15 | }
|
| 16 |
|
| 17 | char* data() {
|
| 18 | return data_;
|
| 19 | }
|
| 20 |
|
| 21 | // Call this after writing into buffer created by OverAllocatedStr()
|
| 22 | void MaybeShrink(int str_len);
|
| 23 |
|
| 24 | BigStr* at(int i);
|
| 25 |
|
| 26 | int find(BigStr* needle, int start = 0, int end = -1);
|
| 27 | int rfind(BigStr* needle);
|
| 28 |
|
| 29 | BigStr* slice(int begin);
|
| 30 | BigStr* slice(int begin, int end);
|
| 31 |
|
| 32 | BigStr* strip();
|
| 33 | // Used for CommandSub in osh/cmd_exec.py
|
| 34 | BigStr* rstrip(BigStr* chars);
|
| 35 | BigStr* rstrip();
|
| 36 |
|
| 37 | BigStr* lstrip(BigStr* chars);
|
| 38 | BigStr* lstrip();
|
| 39 |
|
| 40 | BigStr* ljust(int width, BigStr* fillchar);
|
| 41 | BigStr* rjust(int width, BigStr* fillchar);
|
| 42 |
|
| 43 | // Can take (start, end) so Tokens can be compared without allocation
|
| 44 | bool startswith(BigStr* s);
|
| 45 | bool endswith(BigStr* s);
|
| 46 |
|
| 47 | BigStr* replace(BigStr* old, BigStr* new_str);
|
| 48 | BigStr* replace(BigStr* old, BigStr* new_str, int count);
|
| 49 | BigStr* join(List<BigStr*>* items);
|
| 50 |
|
| 51 | List<BigStr*>* split(BigStr* sep);
|
| 52 | List<BigStr*>* split(BigStr* sep, int max_split);
|
| 53 | List<BigStr*>* splitlines(bool keep);
|
| 54 |
|
| 55 | // TODO: Move unicode functions out of mycpp runtime? Because we won't match
|
| 56 | // Python exactly
|
| 57 | bool isdigit();
|
| 58 | bool isalpha();
|
| 59 | bool isupper();
|
| 60 |
|
| 61 | BigStr* upper();
|
| 62 | BigStr* lower();
|
| 63 |
|
| 64 | // Other options for fast comparison / hashing / string interning:
|
| 65 | // - unique_id_: an index into intern table. I don't think this works unless
|
| 66 | // you want to deal with rehashing all strings when the set grows.
|
| 67 | // - although note that the JVM has -XX:StringTableSize=FIXED, which means
|
| 68 | // - it can degrade into linked list performance
|
| 69 | // - Hashed strings become GLOBAL_STR(). Never deallocated.
|
| 70 | // - Hashed strings become part of the "large object space", which might be
|
| 71 | // managed by mark and sweep. This requires linked list overhead.
|
| 72 | // (doubly-linked?)
|
| 73 | // - Intern strings at GARBAGE COLLECTION TIME, with
|
| 74 | // LayoutForwarded::new_location_? Is this possible? Does it introduce
|
| 75 | // too much coupling between strings, hash tables, and GC?
|
| 76 |
|
| 77 | static constexpr ObjHeader obj_header() {
|
| 78 | return ObjHeader::BigStr();
|
| 79 | }
|
| 80 |
|
| 81 | unsigned hash(HashFunc h);
|
| 82 |
|
| 83 | int len_;
|
| 84 | unsigned hash_ : 31;
|
| 85 | unsigned is_hashed_ : 1;
|
| 86 | char data_[1]; // flexible array
|
| 87 |
|
| 88 | private:
|
| 89 | int _strip_left_pos();
|
| 90 | int _strip_right_pos();
|
| 91 |
|
| 92 | DISALLOW_COPY_AND_ASSIGN(BigStr)
|
| 93 | };
|
| 94 |
|
| 95 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
|
| 96 |
|
| 97 | // Note: for SmallStr, we might copy into the VALUE
|
| 98 | inline void BigStr::MaybeShrink(int str_len) {
|
| 99 | len_ = str_len;
|
| 100 | data_[len_] = '\0'; // NUL terminate
|
| 101 | }
|
| 102 |
|
| 103 | inline int len(const BigStr* s) {
|
| 104 | return s->len_;
|
| 105 | }
|
| 106 |
|
| 107 | BigStr* StrFormat(const char* fmt, ...);
|
| 108 | BigStr* StrFormat(BigStr* fmt, ...);
|
| 109 |
|
| 110 | // NOTE: This iterates over bytes.
|
| 111 | class StrIter {
|
| 112 | public:
|
| 113 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
|
| 114 | // Cheney only: s_ could be moved during iteration.
|
| 115 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
|
| 116 | }
|
| 117 | ~StrIter() {
|
| 118 | // gHeap.PopRoot();
|
| 119 | }
|
| 120 | void Next() {
|
| 121 | i_++;
|
| 122 | }
|
| 123 | bool Done() {
|
| 124 | return i_ >= len_;
|
| 125 | }
|
| 126 | BigStr* Value(); // similar to at()
|
| 127 |
|
| 128 | private:
|
| 129 | BigStr* s_;
|
| 130 | int i_;
|
| 131 | int len_;
|
| 132 |
|
| 133 | DISALLOW_COPY_AND_ASSIGN(StrIter)
|
| 134 | };
|
| 135 |
|
| 136 | extern BigStr* kEmptyString;
|
| 137 |
|
| 138 | // GlobalStr notes:
|
| 139 | // - sizeof("foo") == 4, for the NUL terminator.
|
| 140 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
|
| 141 | // don't put it here because it triggers -Winvalid-offsetof
|
| 142 |
|
| 143 | template <int N>
|
| 144 | class GlobalStr {
|
| 145 | // A template type with the same layout as BigStr with length N-1 (which needs
|
| 146 | // a buffer of size N). For initializing global constant instances.
|
| 147 | public:
|
| 148 | int len_;
|
| 149 | unsigned hash_ : 31;
|
| 150 | unsigned is_hashed_ : 1;
|
| 151 | const char data_[N];
|
| 152 |
|
| 153 | DISALLOW_COPY_AND_ASSIGN(GlobalStr)
|
| 154 | };
|
| 155 |
|
| 156 | union Str {
|
| 157 | public:
|
| 158 | // Instead of this at the start of every function:
|
| 159 | // Str* s = nullptr;
|
| 160 | // It will now be:
|
| 161 | // Str s(nullptr);
|
| 162 | //
|
| 163 | // StackRoot _root(&s);
|
| 164 | explicit Str(BigStr* big) : big_(big) {
|
| 165 | }
|
| 166 |
|
| 167 | char* data() {
|
| 168 | return big_->data();
|
| 169 | }
|
| 170 |
|
| 171 | Str at(int i) {
|
| 172 | return Str(big_->at(i));
|
| 173 | }
|
| 174 |
|
| 175 | Str upper() {
|
| 176 | return Str(big_->upper());
|
| 177 | }
|
| 178 |
|
| 179 | uint64_t raw_bytes_;
|
| 180 | BigStr* big_;
|
| 181 | // TODO: add SmallStr, see mycpp/small_str_test.cc
|
| 182 | };
|
| 183 |
|
| 184 | inline int len(const Str s) {
|
| 185 | return len(s.big_);
|
| 186 | }
|
| 187 |
|
| 188 | // This macro is a workaround for the fact that it's impossible to have a
|
| 189 | // a constexpr initializer for char[N]. The "String Literals as Non-Type
|
| 190 | // Template Parameters" feature of C++ 20 would have done it, but it's not
|
| 191 | // there.
|
| 192 | //
|
| 193 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
|
| 194 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
|
| 195 | //
|
| 196 | // TODO: Can we hash values at compile time so they can be in the intern table?
|
| 197 |
|
| 198 | #define GLOBAL_STR(name, val) \
|
| 199 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
| 200 | ObjHeader::Global(TypeTag::BigStr), \
|
| 201 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
| 202 | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
|
| 203 |
|
| 204 | // New style for SmallStr compatibility
|
| 205 | #define GLOBAL_STR2(name, val) \
|
| 206 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
| 207 | ObjHeader::Global(TypeTag::BigStr), \
|
| 208 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
| 209 | Str name(reinterpret_cast<BigStr*>(&_##name.obj));
|
| 210 |
|
| 211 | // Helper function that's consistent with JSON definition of ASCII whitespace,
|
| 212 | // e.g.
|
| 213 | // {"age": \t 42} is OK
|
| 214 | // {"age": \v 42} is NOT OK
|
| 215 | inline bool IsAsciiWhitespace(int ch) {
|
| 216 | return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
| 217 | }
|
| 218 |
|
| 219 | #endif // MYCPP_GC_STR_H
|