OILS / mycpp / small_str_test.cc View on Github | oilshell.org

502 lines, 289 significant
1// small_str_test.cc - Demo for new Str implementation
2
3#include <inttypes.h>
4#include <limits.h> // HOST_NAME_MAX
5#include <unistd.h> // gethostname()
6
7#include <new> // placement new
8
9// #include "mycpp/runtime.h"
10#include "mycpp/common.h"
11#include "mycpp/gc_obj.h" // ObjHeader
12#include "vendor/greatest.h"
13
14namespace small_str_test {
15
16//
17// STRING IMPLEMENTATION
18//
19
20// SmallStr is used as a VALUE
21
22const int kSmallStrThreshold = 6;
23const int kSmallStrInvalidLength = 0b1111;
24
25// Layout compatible with SmallStr, and globally initialized
26struct GlobalSmallStr {
27 unsigned is_present_ : 1; // reserved
28 unsigned pad_ : 3;
29 unsigned length_ : 4; // max string length is 6
30
31 char data_[7]; // NUL-terminated C string
32};
33
34// SmallStr is an 8-byte value type (even on 32-bit machines)
35class SmallStr {
36 public:
37 SmallStr(int n) : is_present_(1), pad_(0), length_(n), data_{0} {
38 }
39
40 unsigned is_present_ : 1; // reserved
41 unsigned pad_ : 3;
42 unsigned length_ : 4; // 0 to 6 bytes of data payload
43
44 char data_[7];
45};
46
47// HeapStr is used as POINTER
48
49class HeapStr {
50 public:
51 HeapStr() {
52 }
53 int Length() {
54#ifdef MARK_SWEEP
55 return header_.u_mask_npointers;
56#elif BUMP_LEAK
57 #error "TODO: add field to HeapStr"
58#else
59 // derive string length from GC object length
60 return header.obj_len - kStrHeaderSize - 1;
61#endif
62 }
63 void SetLength(int len) {
64 // Important invariant that makes str_equals() simpler: "abc" in a HeapStr
65 // is INVALID.
66 assert(len > kSmallStrThreshold);
67
68#ifdef MARK_SWEEP
69 header_.u_mask_npointers = len;
70#elif BUMP_LEAK
71 #error "TODO: add field to HeapStr"
72#else
73 // set object length, which can derive string length
74 header.obj_len = kStrHeaderSize + len + 1; // +1 for
75#endif
76 }
77
78 static constexpr ObjHeader obj_header() {
79 return ObjHeader::BigStr();
80 }
81
82 ObjHeader header_;
83 char data_[1];
84};
85
86constexpr int kStrHeaderSize = offsetof(HeapStr, data_);
87
88// AllocHeapStr() is a helper that allocates a HeapStr but doesn't set its
89// length. It's NOT part of the public API; use NewStr() instead
90static HeapStr* AllocHeapStr(int n) {
91 void* place = malloc(kStrHeaderSize + n + 1); // +1 for NUL terminator
92 return new (place) HeapStr();
93}
94
95// Str is a value type that can be small or big!
96union Str {
97 // small_ is the whole 8 bytes
98 Str(SmallStr small) : small_(small) {
99 }
100 // big_ may be 4 bytes, so we need raw_bytes_ first
101 Str(HeapStr* big) : raw_bytes_(0) {
102 big_ = big;
103 }
104
105 bool IsSmall() {
106 return small_.is_present_;
107 }
108
109 // Returns a NUL-terminated C string, like std::string::c_str()
110 char* c_str() {
111 if (small_.is_present_) {
112 return small_.data_;
113 } else {
114 return big_->data_;
115 }
116 }
117
118 // Mutate in place, like OverAllocatedStr then SetObjLenFromStrLen()
119 // Assumes the caller already NUL-terminate the string to this length!
120 // e.g. read(), snprintf
121 void MaybeShrink(int new_len) {
122 if (new_len <= kSmallStrThreshold) {
123 if (small_.is_present_) { // It's already small, just set length
124
125 // Callers like strftime() should have NUL-terminated it!
126 assert(small_.data_[new_len] == '\0');
127
128 small_.length_ = new_len;
129
130 } else { // Shrink from big to small
131 HeapStr* copy_of_big = big_; // Important!
132
133 raw_bytes_ = 0; // maintain invariants for fast str_equals()
134 small_.is_present_ = 1;
135 memcpy(small_.data_, copy_of_big->data_, new_len);
136 small_.data_[new_len] = '\0'; // NUL terminate
137 }
138 } else { // It's already bit, set length
139 // OverAllocatedStr always starts with a big string
140 assert(!small_.is_present_);
141
142 // Callers like strftime() should have NUL-terminated it!
143 assert(big_->data_[new_len] == '\0');
144
145 big_->SetLength(new_len);
146 }
147 }
148
149 void CopyTo(char* dest) {
150 char* src;
151 int n;
152 if (small_.is_present_) {
153 src = small_.data_;
154 n = small_.length_;
155 } else {
156 src = big_->data_;
157 n = big_->Length();
158 }
159 memcpy(dest, src, n);
160 }
161
162 Str upper() {
163 if (small_.is_present_) {
164 // Mutate
165 for (int i = 0; i < small_.length_; ++i) {
166 small_.data_[i] = toupper(small_.data_[i]);
167 }
168 return Str(small_); // return a copy BY VALUE
169 } else {
170 int n = big_->Length();
171 HeapStr* result = AllocHeapStr(n);
172
173 for (int i = 0; i < n; ++i) {
174 result->data_[i] = toupper(big_->data_[i]);
175 }
176 result->data_[n] = '\0';
177 result->SetLength(n);
178
179 return Str(result);
180 }
181 }
182
183 uint64_t raw_bytes_;
184 SmallStr small_;
185 HeapStr* big_;
186};
187
188// Invariants affecting Str equality
189//
190// 1. The contents of Str are normalized
191// - SmallStr: the bytes past the NUL terminator are zero-initialized.
192// - HeapStr*: if sizeof(HeapStr*) == 4, then the rest of the bytes are
193// zero-initialized.
194//
195// 2. If len(s) <= kSmallStrThreshold, then s.IsSmall()
196// Conversely, If len(s) > kSmallStrThreshold, then NOT s.IsSmall()
197//
198// This is enforced by the fact that all strings are created by:
199//
200// 1. StrFromC()
201// 2. OverAllocatedStr(), then MaybeShrink()
202// 3. Str:: methods that use the above functions, or NewStr()
203
204bool str_equals(Str a, Str b) {
205 // Fast path takes care of two cases: Identical small strings, or identical
206 // pointers to big strings!
207 if (a.raw_bytes_ == b.raw_bytes_) {
208 return true;
209 }
210
211 bool a_small = a.IsSmall();
212 bool b_small = b.IsSmall();
213
214 // Str instances are normalized so a SmallStr can't equal a HeapStr*
215 if (a_small != b_small) {
216 return false;
217 }
218
219 // Both are small, and we already failed the fast path
220 if (a_small) {
221 return false;
222 }
223
224 // Both are big
225 int a_len = a.big_->Length();
226 int b_len = b.big_->Length();
227
228 if (a_len != b_len) {
229 return false;
230 }
231
232 return memcmp(a.big_->data_, b.big_->data_, a_len) == 0;
233}
234
235#define G_SMALL_STR(name, s, small_len) \
236 GlobalSmallStr _##name = {1, 0, small_len, s}; \
237 Str name = *(reinterpret_cast<Str*>(&_##name));
238
239G_SMALL_STR(kEmptyString, "", 0);
240
241G_SMALL_STR(gSmall, "global", 6);
242
243Str NewStr(int n) {
244 if (n <= kSmallStrThreshold) {
245 SmallStr small(n);
246 return Str(small);
247 } else {
248 HeapStr* big = AllocHeapStr(n);
249 big->SetLength(n);
250 return Str(big);
251 }
252}
253
254// NOTE: must call MaybeShrink(n) afterward to set length! Should it NUL
255// terminate?
256Str OverAllocatedStr(int n) {
257 // There's no point in overallocating small strings
258 assert(n > kSmallStrThreshold);
259
260 HeapStr* big = AllocHeapStr(n);
261 // Not setting length!
262 return Str(big);
263}
264
265Str StrFromC(const char* s, int n) {
266 if (n <= kSmallStrThreshold) {
267 SmallStr small(n);
268 memcpy(small.data_, s, n + 1); // copy NUL terminator too
269 return Str(small);
270 } else {
271 HeapStr* big = AllocHeapStr(n);
272 memcpy(big->data_, s, n + 1); // copy NUL terminator too
273 big->SetLength(n);
274 return Str(big);
275 }
276}
277
278Str StrFromC(const char* s) {
279 return StrFromC(s, strlen(s));
280}
281
282int len(Str s) {
283 if (s.small_.is_present_) {
284 return s.small_.length_;
285 } else {
286 return s.big_->Length();
287 }
288}
289
290Str str_concat(Str a, Str b) {
291 int a_len = len(a);
292 int b_len = len(b);
293 int new_len = a_len + b_len;
294
295 // Create both on the stack so we can share the logic
296 HeapStr* big;
297 SmallStr small(kSmallStrInvalidLength);
298
299 char* dest;
300
301 if (new_len <= kSmallStrThreshold) {
302 dest = small.data_;
303 small.length_ = new_len;
304 } else {
305 big = AllocHeapStr(new_len);
306
307 dest = big->data_;
308 big->SetLength(new_len);
309 }
310
311 a.CopyTo(dest);
312 dest += a_len;
313
314 b.CopyTo(dest);
315 dest += b_len;
316
317 *dest = '\0';
318
319 if (new_len <= kSmallStrThreshold) {
320 return Str(small);
321 } else {
322 return Str(big);
323 }
324}
325
326static_assert(sizeof(SmallStr) == 8, "SmallStr should be 8 bytes");
327static_assert(sizeof(Str) == 8, "Str should be 8 bytes");
328
329TEST small_str_test() {
330 log("sizeof(Str) = %d", sizeof(Str));
331 log("sizeof(SmallStr) = %d", sizeof(SmallStr));
332 log("sizeof(HeapStr*) = %d", sizeof(HeapStr*));
333
334 log("");
335 log("---- SmallStrFromC() / StrFromC() / global G_SMALL_STR() ---- ");
336 log("");
337
338 log("gSmall = %s", gSmall.small_.data_);
339
340 // Str s { 1, 0, 3, "foo" };
341 SmallStr local_small(0);
342 ASSERT(local_small.is_present_);
343
344 // It just has 1 bit set
345 log("local_small as integer %d", local_small);
346 log("local_small = %s", local_small.data_);
347
348 Str local_s = StrFromC("little");
349 ASSERT(local_s.IsSmall());
350 log("local_s = %s", local_s.small_.data_);
351
352 Str local_big = StrFromC("big long string");
353 ASSERT(!local_big.IsSmall());
354
355 log("");
356 log("---- c_str() ---- ");
357 log("");
358
359 log("gSmall = %s %d", gSmall.c_str(), len(gSmall));
360 log("local_small = %s %d", local_s.c_str(), len(local_s));
361 log("local_big = %s %d", local_big.c_str(), len(local_big));
362
363 log("");
364 log("---- Str_upper() ---- ");
365 log("");
366
367 Str u1 = local_s.upper();
368 ASSERT(u1.IsSmall());
369
370 Str u2 = gSmall.upper();
371 ASSERT(u2.IsSmall());
372
373 Str u3 = local_big.upper();
374 ASSERT(!u3.IsSmall());
375
376 log("local_small = %s %d", u1.c_str(), len(u1));
377 log("gSmall = %s %d", u2.c_str(), len(u2));
378 log("local_big = %s %d", u3.c_str(), len(u3));
379
380 log("");
381 log("---- NewStr() ---- ");
382 log("");
383
384 Str small_empty = NewStr(6);
385 ASSERT(small_empty.IsSmall());
386 ASSERT_EQ(6, len(small_empty));
387
388 Str big_empty = NewStr(7);
389 ASSERT(!big_empty.IsSmall());
390 ASSERT_EQ_FMT(7, len(big_empty), "%d");
391
392 log("");
393 log("---- str_concat() ---- ");
394 log("");
395
396 Str empty_empty = str_concat(kEmptyString, kEmptyString);
397 ASSERT(empty_empty.IsSmall());
398 log("empty_empty (%d) = %s", len(empty_empty), empty_empty.c_str());
399
400 Str empty_small = str_concat(kEmptyString, StrFromC("b"));
401 ASSERT(empty_small.IsSmall());
402 log("empty_small (%d) = %s", len(empty_small), empty_small.c_str());
403
404 Str small_small = str_concat(StrFromC("a"), StrFromC("b"));
405 ASSERT(small_small.IsSmall());
406 log("small_small (%d) %s", len(small_small), small_small.c_str());
407
408 Str small_big = str_concat(StrFromC("small"), StrFromC("big string"));
409 ASSERT(!small_big.IsSmall());
410 log("small_big (%d) %s", len(small_big), small_big.c_str());
411
412 Str big_small = str_concat(StrFromC("big string"), StrFromC("small"));
413 ASSERT(!big_small.IsSmall());
414 log("big_small (%d) %s", len(big_small), big_small.c_str());
415
416 Str big_big = str_concat(StrFromC("abcdefghij"), StrFromC("0123456789"));
417 ASSERT(!big_big.IsSmall());
418 log("big_big (%d) = %s ", len(big_big), big_big.c_str());
419
420 log("");
421 log("---- str_equals() ---- ");
422 log("");
423
424 ASSERT(str_equals(kEmptyString, StrFromC("")));
425 ASSERT(str_equals(kEmptyString, NewStr(0)));
426
427 // small vs. small
428 ASSERT(!str_equals(kEmptyString, StrFromC("a")));
429
430 ASSERT(str_equals(StrFromC("a"), StrFromC("a")));
431 ASSERT(!str_equals(StrFromC("a"), StrFromC("b"))); // same length
432 ASSERT(!str_equals(StrFromC("a"), StrFromC("two"))); // different length
433
434 // small vs. big
435 ASSERT(!str_equals(StrFromC("small"), StrFromC("big string")));
436 ASSERT(!str_equals(StrFromC("big string"), StrFromC("small")));
437
438 // big vs. big
439 ASSERT(str_equals(StrFromC("big string"), StrFromC("big string")));
440 ASSERT(!str_equals(StrFromC("big string"), StrFromC("big strinZ")));
441 ASSERT(!str_equals(StrFromC("big string"), StrFromC("longer string")));
442
443 // TODO:
444 log("");
445 log("---- OverAllocatedStr() ---- ");
446 log("");
447
448 Str hostname = OverAllocatedStr(HOST_NAME_MAX);
449 int status = ::gethostname(hostname.big_->data_, HOST_NAME_MAX);
450 if (status != 0) {
451 assert(0);
452 }
453 hostname.MaybeShrink(strlen(hostname.big_->data_));
454
455 log("hostname = %s", hostname.c_str());
456
457 time_t ts = 0;
458 tm* loc_time = ::localtime(&ts);
459
460 const int max_len = 1024;
461 Str t1 = OverAllocatedStr(max_len);
462
463 int n = strftime(t1.big_->data_, max_len, "%Y-%m-%d", loc_time);
464 if (n == 0) { // exceeds max length
465 assert(0);
466 }
467 t1.MaybeShrink(n);
468
469 log("t1 = %s", t1.c_str());
470
471 Str t2 = OverAllocatedStr(max_len);
472 n = strftime(t2.big_->data_, max_len, "%Y", loc_time);
473 if (n == 0) { // exceeds max length
474 assert(0);
475 }
476 t2.MaybeShrink(n);
477
478 log("t2 = %s", t2.c_str());
479
480 // TODO:
481 // BufWriter (rename StrWriter, and uses MutableHeapStr ?)
482 // writer.getvalue(); // may copy into data_
483
484 PASS();
485}
486
487} // namespace small_str_test
488
489GREATEST_MAIN_DEFS();
490
491int main(int argc, char** argv) {
492 // gHeap.Init();
493
494 GREATEST_MAIN_BEGIN();
495
496 RUN_TEST(small_str_test::small_str_test);
497
498 // gHeap.CleanProcessExit();
499
500 GREATEST_MAIN_END();
501 return 0;
502}