OILS / mycpp / gc_dict.h

// Hash table based on CPython's "compact dict":
//
// https://mail.python.org/pipermail/python-dev/2012-December/123028.html
// https://code.activestate.com/recipes/578375/
//
// Main differences:
// - It's type-specialized in C++ -- Dict<K, V>.
// - It's integrated with our mark and sweep GC, using Slab<int>, Slab<K>, and
//   Slab<V>.
// - We use linear probing, not the pseudo-random number generator.

#ifndef MYCPP_GC_DICT_H
#define MYCPP_GC_DICT_H

#include "mycpp/comparators.h"
#include "mycpp/gc_builtins.h"
#include "mycpp/gc_list.h"
#include "mycpp/hash.h"

// Non-negative entries in index_ are array indices into keys_ and values_.
// There are two special negative entries:

// index_ value to say this Dict item was deleted (a tombstone).
const int kDeletedEntry = -1;

// index_ value to say this Dict entry is free.
const int kEmptyEntry = -2;

// Return value for find_kv_index(), not stored in index_.
const int kNotFound = -3;

// Return value for hash_and_probe(), not stored in index_.
const int kTooSmall = -4;

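// For illustration only (a sketch, not part of the implementation): after
// set("a", 1) and set("b", 2) on a Dict<BigStr*, int> with index_len_ == 4,
// the state could look like
//
//   keys_   = ["a", "b"]       values_ = [1, 2]             (dense)
//   index_  = [kEmptyEntry, 0, kEmptyEntry, 1]              (sparse)
//
// Each non-negative index_ entry is a position in keys_ and values_; which
// index_ slot a key occupies depends on hash_key(key) & (index_len_ - 1).
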
// Helper for keys() and values()
template <typename T>
List<T>* ListFromDictSlab(Slab<T>* slab, int n) {
  List<T>* result = Alloc<List<T>>();
  result->reserve(n);

  for (int i = 0; i < n; ++i) {
    result->append(slab->items_[i]);
  }
  return result;
}

// GlobalDict is layout-compatible with Dict (unit tests assert this), and it
// can be a true C global (incurs zero startup time).

template <typename K, typename V, int N>
class GlobalDict {
 public:
  int len_;
  int capacity_;
  int index_len_;
  GlobalSlab<int, N>* index_;
  GlobalSlab<K, N>* keys_;
  GlobalSlab<V, N>* values_;
};

#define GLOBAL_DICT(name, K, V, N, keys, vals)                                 \
  GcGlobal<GlobalSlab<K, N>> _keys_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = keys}};                \
  GcGlobal<GlobalSlab<V, N>> _vals_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = vals}};                \
  GcGlobal<GlobalDict<K, V, N>> _dict_##name = {                               \
      ObjHeader::Global(TypeTag::Dict),                                        \
      {.len_ = N,                                                              \
       .capacity_ = N,                                                         \
       .index_len_ = 0,                                                        \
       .index_ = nullptr,                                                      \
       .keys_ = &_keys_##name.obj,                                             \
       .values_ = &_vals_##name.obj},                                          \
  };                                                                           \
  Dict<K, V>* name = reinterpret_cast<Dict<K, V>*>(&_dict_##name.obj);

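// Hypothetical usage sketch (not from this file): a 2-entry global
// Dict<int, int> with zero startup cost.  Because the key and value lists are
// macro arguments, embedded commas usually have to be hidden from the
// preprocessor, e.g. behind a COMMA-style macro (an assumption here, not
// defined in this header):
//
//   GLOBAL_DICT(gSquares, int, int, 2, {2 COMMA 3}, {4 COMMA 9});
//
// Lookups on such a dict use the linear search in find_kv_index(), since
// index_ is nullptr.
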
template <class K, class V>
class Dict {
 public:
  Dict()
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
  }

  Dict(std::initializer_list<K> keys, std::initializer_list<V> values)
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
    DCHECK(keys.size() == values.size());
    auto v = values.begin();  // This simulates a "zip" loop
    for (auto key : keys) {
      // note: calls reserve(), and may allocate
      this->set(key, *v);
      ++v;
    }
  }

  // Reserve enough space for at LEAST this many key-value pairs.
  void reserve(int num_desired);

  // d[key] in Python: raises KeyError if not found
  V at(K key) const;

  // d.get(key) in Python. (Can't use this if V isn't a pointer!)
  V get(K key) const;

  // Get a key, but return a default if not found.
  // expr_parse.py uses this with OTHER_BALANCE
  V get(K key, V default_val) const;

  // Implements d[k] = v. May resize the dictionary.
  void set(K key, V val);

  void update(List<Tuple2<K, V>*>* pairs);
  void update(Dict<K, V>* other);

  List<K>* keys() const;

  List<V>* values() const;

  void clear();

  // Helper used by find_kv_index(), set(), mylib::dict_erase() in
  // gc_mylib.h
  // Returns either:
  // - the slot for an existing key, or an empty slot for a new key
  // - kTooSmall if the table is full
  int hash_and_probe(K key) const;

  // Helper used by at(), get(), dict_contains()
  // Given a key, returns either:
  // - an index into keys_ and values_
  // - kNotFound
  int find_kv_index(K key) const;

  static constexpr ObjHeader obj_header() {
    return ObjHeader::ClassFixed(field_mask(), sizeof(Dict));
  }

  int len_;        // number of entries (keys and values, almost dense)
  int capacity_;   // number of k/v slots
  int index_len_;  // number of index slots

  // These 3 slabs are resized at the same time.
  Slab<int>* index_;  // kEmptyEntry, kDeletedEntry, or a valid index into
                      // keys_ and values_
  Slab<K>* keys_;     // Dict<K, int>
  Slab<V>* values_;   // Dict<int, V>

  // A dict has 3 pointers the GC needs to follow.
  static constexpr uint32_t field_mask() {
    return maskbit(offsetof(Dict, index_)) | maskbit(offsetof(Dict, keys_)) |
           maskbit(offsetof(Dict, values_));
  }

  DISALLOW_COPY_AND_ASSIGN(Dict);

  // kItemSize is max of K and V size. That is, on 64-bit machines, the RARE
  // Dict<int, int> is smaller than other dicts
  static constexpr int kItemSize = sizeof(K) > sizeof(V) ? sizeof(K)
                                                         : sizeof(V);

  // Matches mark_sweep_heap.h
  static constexpr int kPoolBytes2 = 48 - sizeof(ObjHeader);
  static_assert(kPoolBytes2 % kItemSize == 0,
                "An integral number of items should fit in second pool");
  static constexpr int kNumItems2 = kPoolBytes2 / kItemSize;

  static const int kHeaderFudge = sizeof(ObjHeader) / kItemSize;
  static_assert(sizeof(ObjHeader) % kItemSize == 0,
                "Slab header size should be multiple of key size");

#if 0
  static constexpr int kMinBytes2 = 128 - sizeof(ObjHeader);
  static_assert(kMinBytes2 % kItemSize == 0,
                "An integral number of items should fit");
  static constexpr int kMinItems2 = kMinBytes2 / kItemSize;
#endif

  int HowManyPairs(int num_desired) {
    // See gc_list.h for comments on nearly identical logic

    if (num_desired <= kNumItems2) {  // use full cell in pool 2
      return kNumItems2;
    }
#if 0
    if (num_desired <= kMinItems2) {  // 48 -> 128, not 48 -> 64
      return kMinItems2;
    }
#endif
    return RoundUp(num_desired + kHeaderFudge) - kHeaderFudge;
  }
};

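// A minimal usage sketch (typical of mycpp-generated code; BigStr* and
// StrFromC() come from other mycpp headers):
//
//   auto d = Alloc<Dict<BigStr*, int>>();
//   d->set(StrFromC("one"), 1);           // d["one"] = 1
//   int a = d->at(StrFromC("one"));       // 1; throws KeyError if missing
//   int b = d->get(StrFromC("two"), -1);  // -1, since "two" isn't present
//   bool has = dict_contains(d, StrFromC("one"));
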
template <typename K, typename V>
inline bool dict_contains(const Dict<K, V>* haystack, K needle) {
  return haystack->find_kv_index(needle) != kNotFound;
}

template <typename K, typename V>
void Dict<K, V>::reserve(int num_desired) {
  if (capacity_ >= num_desired) {
    return;  // Don't do anything if there's already enough space.
  }

  int old_len = len_;
  Slab<K>* old_k = keys_;
  Slab<V>* old_v = values_;

  // Calculate the number of keys and values we should have
  capacity_ = HowManyPairs(num_desired);

  // 1) Ensure the index length is a power of 2, to avoid the expensive
  //    modulus % operation.
  // 2) Introduce a hash table load factor.  Use capacity_ + 1 to simulate
  //    ceil() division, not floor() division.
  index_len_ = RoundUp((capacity_ + 1) * 5 / 4);
  DCHECK(index_len_ > capacity_);

  index_ = NewSlab<int>(index_len_);
  for (int i = 0; i < index_len_; ++i) {
    index_->items_[i] = kEmptyEntry;
  }

  // These are DENSE, while index_ is sparse.
  keys_ = NewSlab<K>(capacity_);
  values_ = NewSlab<V>(capacity_);

  if (old_k != nullptr) {  // rehash if there were any entries
    // log("REHASH num_desired %d", num_desired);
    len_ = 0;
    for (int i = 0; i < old_len; ++i) {
      set(old_k->items_[i], old_v->items_[i]);
    }
  }
}

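// A worked example of the sizing above, assuming RoundUp() rounds up to the
// next power of 2, plus an 8-byte ObjHeader and 8-byte items, so that
// kNumItems2 == (48 - 8) / 8 == 5 and kHeaderFudge == 1:
//
//   reserve(3)   ->  capacity_ = 5  (a full pool-2 cell)
//                    index_len_ = RoundUp(6 * 5 / 4)  = RoundUp(7)  = 8
//   reserve(10)  ->  capacity_ = RoundUp(10 + 1) - 1 = 15
//                    index_len_ = RoundUp(16 * 5 / 4) = RoundUp(20) = 32
//
// so even a full table stays at or below an 80% load factor.
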
template <typename K, typename V>
V Dict<K, V>::at(K key) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    throw Alloc<KeyError>();
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
V Dict<K, V>::get(K key) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    return nullptr;
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
V Dict<K, V>::get(K key, V default_val) const {
  int kv_index = find_kv_index(key);
  if (kv_index == kNotFound) {
    return default_val;
  } else {
    return values_->items_[kv_index];
  }
}

template <typename K, typename V>
List<K>* Dict<K, V>::keys() const {
  return ListFromDictSlab<K>(keys_, len_);
}

template <typename K, typename V>
List<V>* Dict<K, V>::values() const {
  return ListFromDictSlab<V>(values_, len_);
}

template <typename K, typename V>
void Dict<K, V>::clear() {
  // Maintain invariant
  for (int i = 0; i < index_len_; ++i) {
    index_->items_[i] = kEmptyEntry;
  }

  if (keys_) {
    memset(keys_->items_, 0, len_ * sizeof(K));  // zero for GC scan
  }
  if (values_) {
    memset(values_->items_, 0, len_ * sizeof(V));  // zero for GC scan
  }
  len_ = 0;
}

// TODO:
// - Special case to intern BigStr* when it's hashed? How?
// - Should we have wrappers like:
//   - V GetAndIntern<V>(D, &string_key)
//   - SetAndIntern<V>(D, &string_key, value)
//   This will enable duplicate copies of the string to be garbage collected
template <typename K, typename V>
int Dict<K, V>::hash_and_probe(K key) const {
  if (capacity_ == 0) {
    return kTooSmall;
  }

  // Hash the key onto a slot in the index. If the first slot is occupied,
  // probe until an empty one is found.
  unsigned h = hash_key(key);
  // faster % using & -- assuming index_len_ is power of 2
  int init_bucket = h & (index_len_ - 1);

  // If we see a tombstone along the probing path, stash it.
  int open_slot = -1;

  for (int i = 0; i < index_len_; ++i) {
    // Start at init_bucket and wrap around

    // faster % using & -- assuming index_len_ is power of 2
    int slot = (i + init_bucket) & (index_len_ - 1);

    int kv_index = index_->items_[slot];
    DCHECK(kv_index < len_);
    // Optimistically this is the common case once the table has been
    // populated.
    if (kv_index >= 0) {
      unsigned h2 = hash_key(keys_->items_[kv_index]);
      if (h == h2 && keys_equal(keys_->items_[kv_index], key)) {
        return slot;
      }
    }

    if (kv_index == kEmptyEntry) {
      if (open_slot != -1) {
        slot = open_slot;
      }
      // If there isn't room in the entry arrays, tell the caller to resize.
      return len_ < capacity_ ? slot : kTooSmall;
    }

    // Tombstone, or a collision with an unequal key.  Keep scanning.
    DCHECK(kv_index >= 0 || kv_index == kDeletedEntry);
    if (kv_index == kDeletedEntry && open_slot == -1) {
      // NOTE: We only record the open slot here.  We DON'T return it.  If
      // we're looking for a key that was written before this tombstone was
      // written to the index, we should continue probing until we get to that
      // key.  If we get to an empty index slot or the end of the index, then
      // we know we are dealing with a new key and can safely replace the
      // tombstone without disrupting any existing keys.
      open_slot = slot;
    }
  }

  if (open_slot != -1) {
    return len_ < capacity_ ? open_slot : kTooSmall;
  }

  return kTooSmall;
}

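// Probe-order illustration: with index_len_ == 8, a key whose hash maps to
// init_bucket 6 is looked for in index_ slots 6, 7, 0, 1, ... and the scan
// stops at the first slot that either holds an equal key or is kEmptyEntry.
// Tombstones are skipped, but the first one seen is remembered as the
// insertion point for a new key.
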
template <typename K, typename V>
int Dict<K, V>::find_kv_index(K key) const {
  if (index_ != nullptr) {  // Common case
    int pos = hash_and_probe(key);
    if (pos == kTooSmall) {
      return kNotFound;
    }
    DCHECK(pos >= 0);
    int kv_index = index_->items_[pos];
    if (kv_index < 0) {
      return kNotFound;
    }
    return kv_index;
  }

  // Linear search on GlobalDict instances.
  // TODO: Should we populate and compare their hash values?
  for (int i = 0; i < len_; ++i) {
    if (keys_equal(keys_->items_[i], key)) {
      return i;
    }
  }

  return kNotFound;
}

template <typename K, typename V>
void Dict<K, V>::set(K key, V val) {
  DCHECK(obj_header().heap_tag != HeapTag::Global);
  int pos = hash_and_probe(key);
  if (pos == kTooSmall) {
    reserve(len_ + 1);
    pos = hash_and_probe(key);
  }
  DCHECK(pos >= 0);

  int kv_index = index_->items_[pos];
  DCHECK(kv_index < len_);
  if (kv_index < 0) {
    // Write new entries to the end of the k/v arrays.  This allows us to
    // recall insertion order until the first deletion.
    keys_->items_[len_] = key;
    values_->items_[len_] = val;
    index_->items_[pos] = len_;
    len_++;
    DCHECK(len_ <= capacity_);
  } else {
    values_->items_[kv_index] = val;
  }
}

template <typename K, typename V>
inline int len(const Dict<K, V>* d) {
  return d->len_;
}

template <class K, class V>
class DictIter {
 public:
  explicit DictIter(Dict<K, V>* D) : D_(D), pos_(0) {
  }
  void Next() {
    pos_++;
  }
  bool Done() {
    return pos_ == D_->len_;
  }
  K Key() {
    return D_->keys_->items_[pos_];
  }
  V Value() {
    return D_->values_->items_[pos_];
  }

 private:
  const Dict<K, V>* D_;
  int pos_;
};

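// A minimal iteration sketch (insertion order is preserved until the first
// deletion, as noted in set()):
//
//   for (DictIter<BigStr*, int> it(d); !it.Done(); it.Next()) {
//     BigStr* k = it.Key();
//     int v = it.Value();
//   }
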
// dict(mylist) converts a list of (k, v) tuples into a dict
template <typename K, typename V>
Dict<K, V>* dict(List<Tuple2<K, V>*>* l) {
  auto ret = Alloc<Dict<K, V>>();
  ret->reserve(len(l));
  for (ListIter<Tuple2<K, V>*> it(l); !it.Done(); it.Next()) {
    ret->set(it.Value()->at0(), it.Value()->at1());
  }
  return ret;
}

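// Sketch of use:
//
//   // given List<Tuple2<BigStr*, int>*>* pairs
//   Dict<BigStr*, int>* d = dict(pairs);
//
// As with Python's dict(pairs), a later tuple with the same key overwrites an
// earlier one, since set() updates existing entries in place.
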
template <class K, class V>
void Dict<K, V>::update(List<Tuple2<K, V>*>* pairs) {
  reserve(len_ + len(pairs));
  for (ListIter<Tuple2<K, V>*> it(pairs); !it.Done(); it.Next()) {
    set(it.Value()->at0(), it.Value()->at1());
  }
}

template <class K, class V>
void Dict<K, V>::update(Dict<K, V>* other) {
  reserve(len_ + len(other));
  for (DictIter<K, V> it(other); !it.Done(); it.Next()) {
    set(it.Key(), it.Value());
  }
}

#endif  // MYCPP_GC_DICT_H