| 1 | #ifndef MYCPP_GC_STR_H
 | 
| 2 | #define MYCPP_GC_STR_H
 | 
| 3 | 
 | 
| 4 | #include "mycpp/common.h"  // DISALLOW_COPY_AND_ASSIGN
 | 
| 5 | #include "mycpp/gc_obj.h"  // GC_OBJ
 | 
| 6 | #include "mycpp/hash.h"    // HashFunc
 | 
| 7 | 
 | 
| 8 | template <typename T>
 | 
| 9 | class List;
 | 
| 10 | 
 | 
| 11 | class BigStr {
 | 
| 12 |  public:
 | 
| 13 |   // Don't call this directly.  Call NewStr() instead, which calls this.
 | 
| 14 |   BigStr() {
 | 
| 15 |   }
 | 
| 16 | 
 | 
| 17 |   char* data() {
 | 
| 18 |     return data_;
 | 
| 19 |   }
 | 
| 20 | 
 | 
| 21 |   // Call this after writing into buffer created by OverAllocatedStr()
 | 
| 22 |   void MaybeShrink(int str_len);
 | 
| 23 | 
 | 
| 24 |   BigStr* at(int i);
 | 
| 25 | 
 | 
| 26 |   int find(BigStr* needle, int start = 0, int end = -1);
 | 
| 27 |   int rfind(BigStr* needle);
 | 
| 28 | 
 | 
| 29 |   BigStr* slice(int begin);
 | 
| 30 |   BigStr* slice(int begin, int end);
 | 
| 31 | 
 | 
| 32 |   BigStr* strip();
 | 
| 33 |   // Used for CommandSub in osh/cmd_exec.py
 | 
| 34 |   BigStr* rstrip(BigStr* chars);
 | 
| 35 |   BigStr* rstrip();
 | 
| 36 | 
 | 
| 37 |   BigStr* lstrip(BigStr* chars);
 | 
| 38 |   BigStr* lstrip();
 | 
| 39 | 
 | 
| 40 |   BigStr* ljust(int width, BigStr* fillchar);
 | 
| 41 |   BigStr* rjust(int width, BigStr* fillchar);
 | 
| 42 | 
 | 
| 43 |   // Can take (start, end) so Tokens can be compared without allocation
 | 
| 44 |   bool startswith(BigStr* s);
 | 
| 45 |   bool endswith(BigStr* s);
 | 
| 46 | 
 | 
| 47 |   BigStr* replace(BigStr* old, BigStr* new_str);
 | 
| 48 |   BigStr* replace(BigStr* old, BigStr* new_str, int count);
 | 
| 49 |   BigStr* join(List<BigStr*>* items);
 | 
| 50 | 
 | 
| 51 |   List<BigStr*>* split(BigStr* sep);
 | 
| 52 |   List<BigStr*>* split(BigStr* sep, int max_split);
 | 
| 53 |   List<BigStr*>* splitlines(bool keep);
 | 
| 54 | 
 | 
| 55 |   // TODO: Move unicode functions out of mycpp runtime?  Because we won't match
 | 
| 56 |   // Python exactly
 | 
| 57 |   bool isdigit();
 | 
| 58 |   bool isalpha();
 | 
| 59 |   bool isupper();
 | 
| 60 | 
 | 
| 61 |   BigStr* upper();
 | 
| 62 |   BigStr* lower();
 | 
| 63 | 
 | 
| 64 |   // Other options for fast comparison / hashing / string interning:
 | 
| 65 |   // - unique_id_: an index into intern table.  I don't think this works unless
 | 
| 66 |   //   you want to deal with rehashing all strings when the set grows.
 | 
| 67 |   //   - although note that the JVM has -XX:StringTableSize=FIXED, which means
 | 
| 68 |   //   - it can degrade into linked list performance
 | 
| 69 |   // - Hashed strings become GLOBAL_STR().  Never deallocated.
 | 
| 70 |   // - Hashed strings become part of the "large object space", which might be
 | 
| 71 |   //   managed by mark and sweep.  This requires linked list overhead.
 | 
| 72 |   //   (doubly-linked?)
 | 
| 73 |   // - Intern strings at GARBAGE COLLECTION TIME, with
 | 
| 74 |   //   LayoutForwarded::new_location_?  Is this possible?  Does it introduce
 | 
| 75 |   //   too much coupling between strings, hash tables, and GC?
 | 
| 76 | 
 | 
| 77 |   static constexpr ObjHeader obj_header() {
 | 
| 78 |     return ObjHeader::BigStr();
 | 
| 79 |   }
 | 
| 80 | 
 | 
| 81 |   unsigned hash(HashFunc h);
 | 
| 82 | 
 | 
| 83 |   int len_;
 | 
| 84 |   unsigned hash_ : 31;
 | 
| 85 |   unsigned is_hashed_ : 1;
 | 
| 86 |   char data_[1];  // flexible array
 | 
| 87 | 
 | 
| 88 |  private:
 | 
| 89 |   int _strip_left_pos();
 | 
| 90 |   int _strip_right_pos();
 | 
| 91 | 
 | 
| 92 |   DISALLOW_COPY_AND_ASSIGN(BigStr)
 | 
| 93 | };
 | 
| 94 | 
 | 
| 95 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
 | 
| 96 | 
 | 
| 97 | // Note: for SmallStr, we might copy into the VALUE
 | 
| 98 | inline void BigStr::MaybeShrink(int str_len) {
 | 
| 99 |   len_ = str_len;
 | 
| 100 |   data_[len_] = '\0';  // NUL terminate
 | 
| 101 | }
 | 
| 102 | 
 | 
| 103 | inline int len(const BigStr* s) {
 | 
| 104 |   return s->len_;
 | 
| 105 | }
 | 
| 106 | 
 | 
| 107 | BigStr* StrFormat(const char* fmt, ...);
 | 
| 108 | BigStr* StrFormat(BigStr* fmt, ...);
 | 
| 109 | 
 | 
| 110 | // NOTE: This iterates over bytes.
 | 
| 111 | class StrIter {
 | 
| 112 |  public:
 | 
| 113 |   explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
 | 
| 114 |     // Cheney only: s_ could be moved during iteration.
 | 
| 115 |     // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
 | 
| 116 |   }
 | 
| 117 |   ~StrIter() {
 | 
| 118 |     // gHeap.PopRoot();
 | 
| 119 |   }
 | 
| 120 |   void Next() {
 | 
| 121 |     i_++;
 | 
| 122 |   }
 | 
| 123 |   bool Done() {
 | 
| 124 |     return i_ >= len_;
 | 
| 125 |   }
 | 
| 126 |   BigStr* Value();  // similar to at()
 | 
| 127 | 
 | 
| 128 |  private:
 | 
| 129 |   BigStr* s_;
 | 
| 130 |   int i_;
 | 
| 131 |   int len_;
 | 
| 132 | 
 | 
| 133 |   DISALLOW_COPY_AND_ASSIGN(StrIter)
 | 
| 134 | };
 | 
| 135 | 
 | 
| 136 | extern BigStr* kEmptyString;
 | 
| 137 | 
 | 
| 138 | // GlobalStr notes:
 | 
| 139 | // - sizeof("foo") == 4, for the NUL terminator.
 | 
| 140 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr.  We
 | 
| 141 | // don't put it here because it triggers -Winvalid-offsetof
 | 
| 142 | 
 | 
| 143 | template <int N>
 | 
| 144 | class GlobalStr {
 | 
| 145 |   // A template type with the same layout as BigStr with length N-1 (which needs
 | 
| 146 |   // a buffer of size N).  For initializing global constant instances.
 | 
| 147 |  public:
 | 
| 148 |   int len_;
 | 
| 149 |   unsigned hash_ : 31;
 | 
| 150 |   unsigned is_hashed_ : 1;
 | 
| 151 |   const char data_[N];
 | 
| 152 | 
 | 
| 153 |   DISALLOW_COPY_AND_ASSIGN(GlobalStr)
 | 
| 154 | };
 | 
| 155 | 
 | 
| 156 | union Str {
 | 
| 157 |  public:
 | 
| 158 |   // Instead of this at the start of every function:
 | 
| 159 |   //   Str* s = nullptr;
 | 
| 160 |   // It will now be:
 | 
| 161 |   //   Str s(nullptr);
 | 
| 162 |   //
 | 
| 163 |   //   StackRoot _root(&s);
 | 
| 164 |   explicit Str(BigStr* big) : big_(big) {
 | 
| 165 |   }
 | 
| 166 | 
 | 
| 167 |   char* data() {
 | 
| 168 |     return big_->data();
 | 
| 169 |   }
 | 
| 170 | 
 | 
| 171 |   Str at(int i) {
 | 
| 172 |     return Str(big_->at(i));
 | 
| 173 |   }
 | 
| 174 | 
 | 
| 175 |   Str upper() {
 | 
| 176 |     return Str(big_->upper());
 | 
| 177 |   }
 | 
| 178 | 
 | 
| 179 |   uint64_t raw_bytes_;
 | 
| 180 |   BigStr* big_;
 | 
| 181 |   // TODO: add SmallStr, see mycpp/small_str_test.cc
 | 
| 182 | };
 | 
| 183 | 
 | 
| 184 | inline int len(const Str s) {
 | 
| 185 |   return len(s.big_);
 | 
| 186 | }
 | 
| 187 | 
 | 
| 188 | // This macro is a workaround for the fact that it's impossible to have a
 | 
| 189 | // a constexpr initializer for char[N].  The "String Literals as Non-Type
 | 
| 190 | // Template Parameters" feature of C++ 20 would have done it, but it's not
 | 
| 191 | // there.
 | 
| 192 | //
 | 
| 193 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
 | 
| 194 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
 | 
| 195 | //
 | 
| 196 | // TODO: Can we hash values at compile time so they can be in the intern table?
 | 
| 197 | 
 | 
| 198 | #define GLOBAL_STR(name, val)                                                \
 | 
| 199 |   GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
 | 
| 200 |       ObjHeader::Global(TypeTag::BigStr),                                    \
 | 
| 201 |       {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
 | 
| 202 |   BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
 | 
| 203 | 
 | 
| 204 | // New style for SmallStr compatibility
 | 
| 205 | #define GLOBAL_STR2(name, val)                                               \
 | 
| 206 |   GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
 | 
| 207 |       ObjHeader::Global(TypeTag::BigStr),                                    \
 | 
| 208 |       {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
 | 
| 209 |   Str name(reinterpret_cast<BigStr*>(&_##name.obj));
 | 
| 210 | 
 | 
| 211 | // Helper function that's consistent with JSON definition of ASCII whitespace,
 | 
| 212 | // e.g.
 | 
| 213 | // {"age": \t 42} is OK
 | 
| 214 | // {"age": \v 42} is NOT OK
 | 
| 215 | inline bool IsAsciiWhitespace(int ch) {
 | 
| 216 |   return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
 | 
| 217 | }
 | 
| 218 | 
 | 
| 219 | #endif  // MYCPP_GC_STR_H
 |