1 | #ifndef MYCPP_GC_STR_H
|
2 | #define MYCPP_GC_STR_H
|
3 |
|
4 | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
|
5 | #include "mycpp/gc_obj.h" // GC_OBJ
|
6 | #include "mycpp/hash.h" // HashFunc
|
7 |
|
8 | template <typename T>
|
9 | class List;
|
10 |
|
11 | class BigStr {
|
12 | public:
|
13 | // Don't call this directly. Call NewStr() instead, which calls this.
|
14 | BigStr() {
|
15 | }
|
16 |
|
17 | char* data() {
|
18 | return data_;
|
19 | }
|
20 |
|
21 | // Call this after writing into buffer created by OverAllocatedStr()
|
22 | void MaybeShrink(int str_len);
|
23 |
|
24 | BigStr* at(int i);
|
25 |
|
26 | int find(BigStr* needle, int start = 0, int end = -1);
|
27 | int rfind(BigStr* needle);
|
28 |
|
29 | BigStr* slice(int begin);
|
30 | BigStr* slice(int begin, int end);
|
31 |
|
32 | BigStr* strip();
|
33 | // Used for CommandSub in osh/cmd_exec.py
|
34 | BigStr* rstrip(BigStr* chars);
|
35 | BigStr* rstrip();
|
36 |
|
37 | BigStr* lstrip(BigStr* chars);
|
38 | BigStr* lstrip();
|
39 |
|
40 | BigStr* ljust(int width, BigStr* fillchar);
|
41 | BigStr* rjust(int width, BigStr* fillchar);
|
42 |
|
43 | // Can take (start, end) so Tokens can be compared without allocation
|
44 | bool startswith(BigStr* s);
|
45 | bool endswith(BigStr* s);
|
46 |
|
47 | BigStr* replace(BigStr* old, BigStr* new_str);
|
48 | BigStr* replace(BigStr* old, BigStr* new_str, int count);
|
49 | BigStr* join(List<BigStr*>* items);
|
50 |
|
51 | List<BigStr*>* split(BigStr* sep);
|
52 | List<BigStr*>* split(BigStr* sep, int max_split);
|
53 | List<BigStr*>* splitlines(bool keep);
|
54 |
|
55 | // TODO: Move unicode functions out of mycpp runtime? Because we won't match
|
56 | // Python exactly
|
57 | bool isdigit();
|
58 | bool isalpha();
|
59 | bool isupper();
|
60 |
|
61 | BigStr* upper();
|
62 | BigStr* lower();
|
63 |
|
64 | // Other options for fast comparison / hashing / string interning:
|
65 | // - unique_id_: an index into intern table. I don't think this works unless
|
66 | // you want to deal with rehashing all strings when the set grows.
|
67 | // - although note that the JVM has -XX:StringTableSize=FIXED, which means
|
68 | // - it can degrade into linked list performance
|
69 | // - Hashed strings become GLOBAL_STR(). Never deallocated.
|
70 | // - Hashed strings become part of the "large object space", which might be
|
71 | // managed by mark and sweep. This requires linked list overhead.
|
72 | // (doubly-linked?)
|
73 | // - Intern strings at GARBAGE COLLECTION TIME, with
|
74 | // LayoutForwarded::new_location_? Is this possible? Does it introduce
|
75 | // too much coupling between strings, hash tables, and GC?
|
76 |
|
77 | static constexpr ObjHeader obj_header() {
|
78 | return ObjHeader::BigStr();
|
79 | }
|
80 |
|
81 | unsigned hash(HashFunc h);
|
82 |
|
83 | int len_;
|
84 | unsigned hash_ : 31;
|
85 | unsigned is_hashed_ : 1;
|
86 | char data_[1]; // flexible array
|
87 |
|
88 | private:
|
89 | int _strip_left_pos();
|
90 | int _strip_right_pos();
|
91 |
|
92 | DISALLOW_COPY_AND_ASSIGN(BigStr)
|
93 | };
|
94 |
|
95 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
|
96 |
|
97 | // Note: for SmallStr, we might copy into the VALUE
|
98 | inline void BigStr::MaybeShrink(int str_len) {
|
99 | len_ = str_len;
|
100 | data_[len_] = '\0'; // NUL terminate
|
101 | }
|
102 |
|
103 | inline int len(const BigStr* s) {
|
104 | return s->len_;
|
105 | }
|
106 |
|
107 | BigStr* StrFormat(const char* fmt, ...);
|
108 | BigStr* StrFormat(BigStr* fmt, ...);
|
109 |
|
110 | // NOTE: This iterates over bytes.
|
111 | class StrIter {
|
112 | public:
|
113 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
|
114 | // Cheney only: s_ could be moved during iteration.
|
115 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
|
116 | }
|
117 | ~StrIter() {
|
118 | // gHeap.PopRoot();
|
119 | }
|
120 | void Next() {
|
121 | i_++;
|
122 | }
|
123 | bool Done() {
|
124 | return i_ >= len_;
|
125 | }
|
126 | BigStr* Value(); // similar to at()
|
127 |
|
128 | private:
|
129 | BigStr* s_;
|
130 | int i_;
|
131 | int len_;
|
132 |
|
133 | DISALLOW_COPY_AND_ASSIGN(StrIter)
|
134 | };
|
135 |
|
136 | extern BigStr* kEmptyString;
|
137 |
|
138 | // GlobalStr notes:
|
139 | // - sizeof("foo") == 4, for the NUL terminator.
|
140 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
|
141 | // don't put it here because it triggers -Winvalid-offsetof
|
142 |
|
143 | template <int N>
|
144 | class GlobalStr {
|
145 | // A template type with the same layout as BigStr with length N-1 (which needs
|
146 | // a buffer of size N). For initializing global constant instances.
|
147 | public:
|
148 | int len_;
|
149 | unsigned hash_ : 31;
|
150 | unsigned is_hashed_ : 1;
|
151 | const char data_[N];
|
152 |
|
153 | DISALLOW_COPY_AND_ASSIGN(GlobalStr)
|
154 | };
|
155 |
|
156 | union Str {
|
157 | public:
|
158 | // Instead of this at the start of every function:
|
159 | // Str* s = nullptr;
|
160 | // It will now be:
|
161 | // Str s(nullptr);
|
162 | //
|
163 | // StackRoot _root(&s);
|
164 | explicit Str(BigStr* big) : big_(big) {
|
165 | }
|
166 |
|
167 | char* data() {
|
168 | return big_->data();
|
169 | }
|
170 |
|
171 | Str at(int i) {
|
172 | return Str(big_->at(i));
|
173 | }
|
174 |
|
175 | Str upper() {
|
176 | return Str(big_->upper());
|
177 | }
|
178 |
|
179 | uint64_t raw_bytes_;
|
180 | BigStr* big_;
|
181 | // TODO: add SmallStr, see mycpp/small_str_test.cc
|
182 | };
|
183 |
|
184 | inline int len(const Str s) {
|
185 | return len(s.big_);
|
186 | }
|
187 |
|
188 | // This macro is a workaround for the fact that it's impossible to have a
|
189 | // a constexpr initializer for char[N]. The "String Literals as Non-Type
|
190 | // Template Parameters" feature of C++ 20 would have done it, but it's not
|
191 | // there.
|
192 | //
|
193 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
|
194 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
|
195 | //
|
196 | // TODO: Can we hash values at compile time so they can be in the intern table?
|
197 |
|
198 | #define GLOBAL_STR(name, val) \
|
199 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
200 | ObjHeader::Global(TypeTag::BigStr), \
|
201 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
202 | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
|
203 |
|
204 | // New style for SmallStr compatibility
|
205 | #define GLOBAL_STR2(name, val) \
|
206 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
207 | ObjHeader::Global(TypeTag::BigStr), \
|
208 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
209 | Str name(reinterpret_cast<BigStr*>(&_##name.obj));
|
210 |
|
211 | // Helper function that's consistent with JSON definition of ASCII whitespace,
|
212 | // e.g.
|
213 | // {"age": \t 42} is OK
|
214 | // {"age": \v 42} is NOT OK
|
215 | inline bool IsAsciiWhitespace(int ch) {
|
216 | return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
217 | }
|
218 |
|
219 | #endif // MYCPP_GC_STR_H
|