OILS / mycpp / gc_dict.h
// Hash table based on CPython's "compact dict":
//
// https://mail.python.org/pipermail/python-dev/2012-December/123028.html
// https://code.activestate.com/recipes/578375/
//
// Main differences:
// - It's type-specialized in C++ -- Dict<K, V>.
// - It's integrated with our mark and sweep GC, using Slab<int>, Slab<K>, and
//   Slab<V>
// - We use linear probing, not the pseudo-random number generator
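//
// Layout sketch (illustrative, with made-up hash buckets): after d->set(k1, v1)
// and d->set(k2, v2), with index_len_ == 8 and the keys hashing to buckets 1
// and 4:
//
//   index_  = {-2, 0, -2, -2, 1, -2, -2, -2}   (sparse; -2 is kEmptyEntry)
//   keys_   = {k1, k2, ...}                    (dense, in insertion order)
//   values_ = {v1, v2, ...}                    (dense, in insertion order)
//
// index_ stores positions into keys_ and values_, so keys(), values(), and
// iteration can walk the dense slabs directly.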

#ifndef MYCPP_GC_DICT_H
#define MYCPP_GC_DICT_H

#include "mycpp/comparators.h"
#include "mycpp/gc_builtins.h"
#include "mycpp/gc_list.h"
#include "mycpp/hash.h"

// Non-negative entries in index_ are array indices into keys_ and values_.
// There are two special negative entries:

// index_ value to say this Dict item was deleted (a tombstone).
const int kDeletedEntry = -1;

// index_ value to say this Dict entry is free.
const int kEmptyEntry = -2;

// Return value for find_kv_index(), not stored in index_.
const int kNotFound = -3;

// Return value for hash_and_probe(), not stored in index_.
const int kTooSmall = -4;

// Helper for keys() and values()
template <typename T>
List<T>* ListFromDictSlab(Slab<T>* slab, int n) {
  List<T>* result = Alloc<List<T>>();
  result->reserve(n);

  for (int i = 0; i < n; ++i) {
    result->append(slab->items_[i]);
  }
  return result;
}

// GlobalDict is layout-compatible with Dict (unit tests assert this), and it
// can be a true C global (incurs zero startup time)

template <typename K, typename V, int N>
class GlobalDict {
 public:
  int len_;
  int capacity_;
  int index_len_;
  GlobalSlab<int, N>* index_;
  GlobalSlab<K, N>* keys_;
  GlobalSlab<V, N>* values_;
};

#define GLOBAL_DICT(name, K, V, N, keys, vals)                                 \
  GcGlobal<GlobalSlab<K, N>> _keys_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = keys}};                \
  GcGlobal<GlobalSlab<V, N>> _vals_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = vals}};                \
  GcGlobal<GlobalDict<K, V, N>> _dict_##name = {                               \
      ObjHeader::Global(TypeTag::Dict),                                        \
      {.len_ = N,                                                              \
       .capacity_ = N,                                                         \
       .index_len_ = 0,                                                        \
       .index_ = nullptr,                                                      \
       .keys_ = &_keys_##name.obj,                                             \
       .values_ = &_vals_##name.obj},                                          \
  };                                                                           \
  Dict<K, V>* name = reinterpret_cast<Dict<K, V>*>(&_dict_##name.obj);
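
// Hypothetical usage sketch (the name and values are made up): a one-entry
// Dict<int, int> that lives in the data segment, with no startup cost.
//
//   GLOBAL_DICT(gAnswer, int, int, 1, {42}, {43})
//
// gAnswer->at(42) returns 43.  Because index_ is nullptr, lookups fall back to
// the linear search in find_kv_index().  A comma inside the brace lists would
// split the macro arguments, so multi-entry dicts need the commas hidden
// (e.g. behind a COMMA macro).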

template <class K, class V>
class Dict {
 public:
  Dict()
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
  }

  Dict(std::initializer_list<K> keys, std::initializer_list<V> values)
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
    DCHECK(keys.size() == values.size());
    auto v = values.begin();  // This simulates a "zip" loop
    for (auto key : keys) {
      // note: calls reserve(), and may allocate
      this->set(key, *v);
      ++v;
    }
  }

  // Reserve enough space for at LEAST this many key-value pairs.
  void reserve(int num_desired);

  // d[key] in Python: raises KeyError if not found
  V at(K key) const;

  // d.get(key) in Python. (Can't use this if V isn't a pointer!)
  V get(K key) const;

  // Get a key, but return a default if not found.
  // expr_parse.py uses this with OTHER_BALANCE
  V get(K key, V default_val) const;

  // Implements d[k] = v. May resize the dictionary.
  void set(K key, V val);

  void update(List<Tuple2<K, V>*>* pairs);
  void update(Dict<K, V>* other);

  List<K>* keys() const;

  List<V>* values() const;

  void clear();

  // Helper used by find_kv_index(), set(), mylib::dict_erase() in
  // gc_mylib.h
  // Returns either:
  // - the slot for an existing key, or an empty slot for a new key
  // - kTooSmall if the table is full
  int hash_and_probe(K key) const;

  // Helper used by at(), get(), dict_contains()
  // Given a key, returns either:
  // - an index into keys_ and values_
  // - kNotFound
  int find_kv_index(K key) const;

  static constexpr ObjHeader obj_header() {
    return ObjHeader::ClassFixed(field_mask(), sizeof(Dict));
  }

  int len_;        // number of entries (keys and values, almost dense)
  int capacity_;   // number of k/v slots
  int index_len_;  // number of index slots

  // These 3 slabs are resized at the same time.
  Slab<int>* index_;  // kEmptyEntry, kDeletedEntry, or a valid index into
                      // keys_ and values_
  Slab<K>* keys_;     // Dict<K, int>
  Slab<V>* values_;   // Dict<int, V>

  // A dict has 3 pointers the GC needs to follow.
  static constexpr uint32_t field_mask() {
    return maskbit(offsetof(Dict, index_)) | maskbit(offsetof(Dict, keys_)) |
           maskbit(offsetof(Dict, values_));
  }

  DISALLOW_COPY_AND_ASSIGN(Dict);

  // kItemSize is max of K and V size. That is, on 64-bit machines, the RARE
  // Dict<int, int> is smaller than other dicts
  static constexpr int kItemSize = sizeof(K) > sizeof(V) ? sizeof(K)
                                                         : sizeof(V);

  // Matches mark_sweep_heap.h
  static constexpr int kPoolBytes2 = 48 - sizeof(ObjHeader);
  static_assert(kPoolBytes2 % kItemSize == 0,
                "An integral number of items should fit in second pool");
  static constexpr int kNumItems2 = kPoolBytes2 / kItemSize;

  static const int kHeaderFudge = sizeof(ObjHeader) / kItemSize;
  static_assert(sizeof(ObjHeader) % kItemSize == 0,
                "Slab header size should be multiple of key size");

#if 0
  static constexpr int kMinBytes2 = 128 - sizeof(ObjHeader);
  static_assert(kMinBytes2 % kItemSize == 0,
                "An integral number of items should fit");
  static constexpr int kMinItems2 = kMinBytes2 / kItemSize;
#endif

  int HowManyPairs(int num_desired) {
    // See gc_list.h for comments on nearly identical logic

    if (num_desired <= kNumItems2) {  // use full cell in pool 2
      return kNumItems2;
    }
#if 0
    if (num_desired <= kMinItems2) {  // 48 -> 128, not 48 -> 64
      return kMinItems2;
    }
#endif
    return RoundUp(num_desired + kHeaderFudge) - kHeaderFudge;
  }
};
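
// Sizing sketch with made-up numbers (assuming 64-bit pointers and an 8-byte
// ObjHeader): for Dict<BigStr*, int>, kItemSize == 8, kPoolBytes2 == 40, and
// kNumItems2 == 5, so HowManyPairs(3) returns 5 and each slab fills one
// 48-byte pool cell exactly.  For larger requests, assuming RoundUp() rounds
// up to the next power of 2, HowManyPairs(9) == RoundUp(9 + 1) - 1 == 15.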

template <typename K, typename V>
inline bool dict_contains(const Dict<K, V>* haystack, K needle) {
  return haystack->find_kv_index(needle) != kNotFound;
}

template <typename K, typename V>
void Dict<K, V>::reserve(int num_desired) {
  if (capacity_ >= num_desired) {
    return;  // Don't do anything if there's already enough space.
  }

  int old_len = len_;
  Slab<K>* old_k = keys_;
  Slab<V>* old_v = values_;

  // Calculate the number of keys and values we should have
  capacity_ = HowManyPairs(num_desired);

  // 1) Ensure index_len_ is a power of 2, to avoid an expensive modulus %
  //    operation
  // 2) Introduce a hash table load factor.  Use capacity_ + 1 to simulate
  //    ceil() division, not floor() division.
  index_len_ = RoundUp((capacity_ + 1) * 5 / 4);
  DCHECK(index_len_ > capacity_);

  index_ = NewSlab<int>(index_len_);
  for (int i = 0; i < index_len_; ++i) {
    index_->items_[i] = kEmptyEntry;
  }

  // These are DENSE, while index_ is sparse.
  keys_ = NewSlab<K>(capacity_);
  values_ = NewSlab<V>(capacity_);

  if (old_k != nullptr) {  // rehash if there were any entries
    len_ = 0;
    for (int i = 0; i < old_len; ++i) {
      set(old_k->items_[i], old_v->items_[i]);
    }
  }
}
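
// Worked example of the sizing above (assuming RoundUp() rounds up to the
// next power of 2): with capacity_ == 5, (5 + 1) * 5 / 4 == 7 and
// RoundUp(7) == 8, so index_len_ == 8.  The index always has more slots than
// k/v pairs, so the load factor stays below 1.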

template <typename K, typename V>
V Dict<K, V>::at(K key) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    throw Alloc<KeyError>();
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
V Dict<K, V>::get(K key) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    return nullptr;
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
V Dict<K, V>::get(K key, V default_val) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    return default_val;
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
List<K>* Dict<K, V>::keys() const {
  return ListFromDictSlab<K>(keys_, len_);
}

template <typename K, typename V>
List<V>* Dict<K, V>::values() const {
  return ListFromDictSlab<V>(values_, len_);
}

template <typename K, typename V>
void Dict<K, V>::clear() {
  // Maintain the invariant: every index_ slot reads kEmptyEntry again.
  for (int i = 0; i < index_len_; ++i) {
    index_->items_[i] = kEmptyEntry;
  }

  if (keys_) {
    memset(keys_->items_, 0, len_ * sizeof(K));  // zero for GC scan
  }
  if (values_) {
    memset(values_->items_, 0, len_ * sizeof(V));  // zero for GC scan
  }
  len_ = 0;
}

// TODO:
// - Special case to intern BigStr* when it's hashed?  How?
// - Should we have wrappers like:
//   - V GetAndIntern<V>(D, &string_key)
//   - SetAndIntern<V>(D, &string_key, value)
//   This will enable duplicate copies of the string to be garbage collected
template <typename K, typename V>
int Dict<K, V>::hash_and_probe(K key) const {
  if (capacity_ == 0) {
    return kTooSmall;
  }

  // Hash the key onto a slot in the index.  If the first slot is occupied,
  // probe until an empty one is found.
  unsigned h = hash_key(key);
  // faster % using & -- assuming index_len_ is a power of 2
  int init_bucket = h & (index_len_ - 1);

  // If we see a tombstone along the probing path, stash it.
  int open_slot = -1;

  for (int i = 0; i < index_len_; ++i) {
    // Start at init_bucket and wrap around

    // faster % using & -- assuming index_len_ is a power of 2
    int slot = (i + init_bucket) & (index_len_ - 1);

    int kv_index = index_->items_[slot];
    DCHECK(kv_index < len_);
    // Optimistically this is the common case once the table has been
    // populated.
    if (kv_index >= 0) {
      unsigned h2 = hash_key(keys_->items_[kv_index]);
      if (h == h2 && keys_equal(keys_->items_[kv_index], key)) {
        return slot;
      }
    }

    if (kv_index == kEmptyEntry) {
      if (open_slot != -1) {
        slot = open_slot;
      }
      // If there isn't room in the entry arrays, tell the caller to resize.
      return len_ < capacity_ ? slot : kTooSmall;
    }

    // Tombstone, or a collision with an unequal key.  Keep scanning.
    DCHECK(kv_index >= 0 || kv_index == kDeletedEntry);
    if (kv_index == kDeletedEntry && open_slot == -1) {
      // NOTE: We only record the open slot here.  We DON'T return it.  If
      // we're looking for a key that was written before this tombstone was
      // written to the index, we should continue probing until we get to that
      // key.  If we get to an empty index slot or the end of the index, then
      // we know we are dealing with a new key and can safely replace the
      // tombstone without disrupting any existing keys.
      open_slot = slot;
    }
  }

  if (open_slot != -1) {
    return len_ < capacity_ ? open_slot : kTooSmall;
  }

  return kTooSmall;
}
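
// Probe sequence sketch (illustrative numbers): with index_len_ == 8, a key
// whose hash maps to init_bucket == 6 inspects slots 6, 7, 0, 1, ... in that
// order.  Probing stops at an equal key or at kEmptyEntry; kDeletedEntry slots
// are only remembered in open_slot, so a key written before the deletion can
// still be found further along the chain.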

template <typename K, typename V>
int Dict<K, V>::find_kv_index(K key) const {
  if (index_ != nullptr) {  // Common case
    int pos = hash_and_probe(key);
    if (pos == kTooSmall) {
      return kNotFound;
    }
    DCHECK(pos >= 0);
    int kv_index = index_->items_[pos];
    if (kv_index < 0) {
      return kNotFound;
    }
    return kv_index;
  }

  // Linear search on GlobalDict instances.
  // TODO: Should we populate and compare their hash values?
  for (int i = 0; i < len_; ++i) {
    if (keys_equal(keys_->items_[i], key)) {
      return i;
    }
  }

  return kNotFound;
}

template <typename K, typename V>
void Dict<K, V>::set(K key, V val) {
  DCHECK(obj_header().heap_tag != HeapTag::Global);
  int pos = hash_and_probe(key);
  if (pos == kTooSmall) {
    reserve(len_ + 1);
    pos = hash_and_probe(key);
  }
  DCHECK(pos >= 0);

  int kv_index = index_->items_[pos];
  DCHECK(kv_index < len_);
  if (kv_index < 0) {
    // Write new entries to the end of the k/v arrays.  This allows us to
    // recall insertion order until the first deletion.
    keys_->items_[len_] = key;
    values_->items_[len_] = val;
    index_->items_[pos] = len_;
    len_++;
    DCHECK(len_ <= capacity_);
  } else {
    values_->items_[kv_index] = val;
  }
}

template <typename K, typename V>
inline int len(const Dict<K, V>* d) {
  return d->len_;
}

template <class K, class V>
class DictIter {
 public:
  explicit DictIter(Dict<K, V>* D) : D_(D), pos_(0) {
  }
  void Next() {
    pos_++;
  }
  bool Done() {
    return pos_ == D_->len_;
  }
  K Key() {
    return D_->keys_->items_[pos_];
  }
  V Value() {
    return D_->values_->items_[pos_];
  }

 private:
  const Dict<K, V>* D_;
  int pos_;
};
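
// Iteration sketch (the dict d below is hypothetical).  This visits entries in
// insertion order, which holds until the first deletion:
//
//   for (DictIter<BigStr*, int> it(d); !it.Done(); it.Next()) {
//     BigStr* key = it.Key();
//     int val = it.Value();
//     // use key and val ...
//   }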

// dict(mylist) converts a list of (k, v) tuples into a dict
template <typename K, typename V>
Dict<K, V>* dict(List<Tuple2<K, V>*>* l) {
  auto ret = Alloc<Dict<K, V>>();
  ret->reserve(len(l));
  for (ListIter<Tuple2<K, V>*> it(l); !it.Done(); it.Next()) {
    ret->set(it.Value()->at0(), it.Value()->at1());
  }
  return ret;
}
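
// For example (hypothetical call site), a List<Tuple2<BigStr*, int>*>* named
// pairs could be converted with:
//
//   Dict<BigStr*, int>* d = dict(pairs);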

template <class K, class V>
void Dict<K, V>::update(List<Tuple2<K, V>*>* pairs) {
  reserve(len_ + len(pairs));
  for (ListIter<Tuple2<K, V>*> it(pairs); !it.Done(); it.Next()) {
    set(it.Value()->at0(), it.Value()->at1());
  }
}

template <class K, class V>
void Dict<K, V>::update(Dict<K, V>* other) {
  reserve(len_ + len(other));
  for (DictIter<K, V> it(other); !it.Done(); it.Next()) {
    set(it.Key(), it.Value());
  }
}

#endif  // MYCPP_GC_DICT_H