// OILS / mycpp / gc_str.h -- View on Github | oilshell.org
//
// 219 lines, 110 significant
1#ifndef MYCPP_GC_STR_H
2#define MYCPP_GC_STR_H
3
4#include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
5#include "mycpp/gc_obj.h" // GC_OBJ
6#include "mycpp/hash.h" // HashFunc
7
// Forward declaration: this header only uses List<T> through pointers
// (see BigStr::join / split / splitlines).
template <typename T>
class List;
10
11class BigStr {
12 public:
13 // Don't call this directly. Call NewStr() instead, which calls this.
14 BigStr() {
15 }
16
17 char* data() {
18 return data_;
19 }
20
21 // Call this after writing into buffer created by OverAllocatedStr()
22 void MaybeShrink(int str_len);
23
24 BigStr* at(int i);
25
26 int find(BigStr* needle, int start = 0, int end = -1);
27 int rfind(BigStr* needle);
28
29 BigStr* slice(int begin);
30 BigStr* slice(int begin, int end);
31
32 BigStr* strip();
33 // Used for CommandSub in osh/cmd_exec.py
34 BigStr* rstrip(BigStr* chars);
35 BigStr* rstrip();
36
37 BigStr* lstrip(BigStr* chars);
38 BigStr* lstrip();
39
40 BigStr* ljust(int width, BigStr* fillchar);
41 BigStr* rjust(int width, BigStr* fillchar);
42
43 // Can take (start, end) so Tokens can be compared without allocation
44 bool startswith(BigStr* s);
45 bool endswith(BigStr* s);
46
47 BigStr* replace(BigStr* old, BigStr* new_str);
48 BigStr* replace(BigStr* old, BigStr* new_str, int count);
49 BigStr* join(List<BigStr*>* items);
50
51 List<BigStr*>* split(BigStr* sep);
52 List<BigStr*>* split(BigStr* sep, int max_split);
53 List<BigStr*>* splitlines(bool keep);
54
55 // TODO: Move unicode functions out of mycpp runtime? Because we won't match
56 // Python exactly
57 bool isdigit();
58 bool isalpha();
59 bool isupper();
60
61 BigStr* upper();
62 BigStr* lower();
63
64 // Other options for fast comparison / hashing / string interning:
65 // - unique_id_: an index into intern table. I don't think this works unless
66 // you want to deal with rehashing all strings when the set grows.
67 // - although note that the JVM has -XX:StringTableSize=FIXED, which means
68 // - it can degrade into linked list performance
69 // - Hashed strings become GLOBAL_STR(). Never deallocated.
70 // - Hashed strings become part of the "large object space", which might be
71 // managed by mark and sweep. This requires linked list overhead.
72 // (doubly-linked?)
73 // - Intern strings at GARBAGE COLLECTION TIME, with
74 // LayoutForwarded::new_location_? Is this possible? Does it introduce
75 // too much coupling between strings, hash tables, and GC?
76
77 static constexpr ObjHeader obj_header() {
78 return ObjHeader::BigStr();
79 }
80
81 unsigned hash(HashFunc h);
82
83 int len_;
84 unsigned hash_ : 31;
85 unsigned is_hashed_ : 1;
86 char data_[1]; // flexible array
87
88 private:
89 int _strip_left_pos();
90 int _strip_right_pos();
91
92 DISALLOW_COPY_AND_ASSIGN(BigStr)
93};
94
95constexpr int kStrHeaderSize = offsetof(BigStr, data_);
96
97// Note: for SmallStr, we might copy into the VALUE
98inline void BigStr::MaybeShrink(int str_len) {
99 len_ = str_len;
100 data_[len_] = '\0'; // NUL terminate
101}
102
103inline int len(const BigStr* s) {
104 return s->len_;
105}
106
107BigStr* StrFormat(const char* fmt, ...);
108BigStr* StrFormat(BigStr* fmt, ...);
109
110// NOTE: This iterates over bytes.
111class StrIter {
112 public:
113 explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
114 // Cheney only: s_ could be moved during iteration.
115 // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
116 }
117 ~StrIter() {
118 // gHeap.PopRoot();
119 }
120 void Next() {
121 i_++;
122 }
123 bool Done() {
124 return i_ >= len_;
125 }
126 BigStr* Value(); // similar to at()
127
128 private:
129 BigStr* s_;
130 int i_;
131 int len_;
132
133 DISALLOW_COPY_AND_ASSIGN(StrIter)
134};
135
136extern BigStr* kEmptyString;
137
138// GlobalStr notes:
139// - sizeof("foo") == 4, for the NUL terminator.
140// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
141// don't put it here because it triggers -Winvalid-offsetof
142
143template <int N>
144class GlobalStr {
145 // A template type with the same layout as BigStr with length N-1 (which needs
146 // a buffer of size N). For initializing global constant instances.
147 public:
148 int len_;
149 unsigned hash_ : 31;
150 unsigned is_hashed_ : 1;
151 const char data_[N];
152
153 DISALLOW_COPY_AND_ASSIGN(GlobalStr)
154};
155
156union Str {
157 public:
158 // Instead of this at the start of every function:
159 // Str* s = nullptr;
160 // It will now be:
161 // Str s(nullptr);
162 //
163 // StackRoot _root(&s);
164 explicit Str(BigStr* big) : big_(big) {
165 }
166
167 char* data() {
168 return big_->data();
169 }
170
171 Str at(int i) {
172 return Str(big_->at(i));
173 }
174
175 Str upper() {
176 return Str(big_->upper());
177 }
178
179 uint64_t raw_bytes_;
180 BigStr* big_;
181 // TODO: add SmallStr, see mycpp/small_str_test.cc
182};
183
184inline int len(const Str s) {
185 return len(s.big_);
186}
187
// This macro is a workaround for the fact that it's impossible to have a
// constexpr initializer for char[N].  The "String Literals as Non-Type
// Template Parameters" feature of C++ 20 would have done it, but it's not
// there.
//
// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
//
// TODO: Can we hash values at compile time so they can be in the intern table?
//
// GLOBAL_STR(name, val) expands to two definitions:
//   - `_name`, a GcGlobal<GlobalStr<sizeof(val)>> whose header is
//     ObjHeader::Global(TypeTag::BigStr), with len_ = sizeof(val) - 1, the
//     hash fields zeroed, and data_ initialized from val
//   - `name`, a BigStr* pointing at `_name.obj`
// `val` must be a string literal, since sizeof(val) is used as the buffer
// size (including the NUL).  The expansion already ends with a semicolon.

#define GLOBAL_STR(name, val)                                                \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
      ObjHeader::Global(TypeTag::BigStr),                                    \
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
203
// New style for SmallStr compatibility
//
// Defines the same `_name` backing object as GLOBAL_STR, but `name` is a Str
// constructed from the pointer to `_name.obj` rather than a raw BigStr*.
// `val` must be a string literal.  The expansion already ends with a
// semicolon.
#define GLOBAL_STR2(name, val)                                               \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
      ObjHeader::Global(TypeTag::BigStr),                                    \
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
210
// Helper function that's consistent with JSON definition of ASCII whitespace,
// e.g.
//   {"age": \t 42} is OK
//   {"age": \v 42} is NOT OK
//
// Returns true only for space, tab, carriage return and newline; every other
// value (including '\v', '\f' and negative values) yields false.
//
// constexpr so it can also be used in constant expressions; kept to a single
// return statement so it remains valid under C++11 rules.
inline constexpr bool IsAsciiWhitespace(int ch) {
  return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
218
219#endif // MYCPP_GC_STR_H