1 | // Hash table based on CPython's "compact dict":
|
2 | //
|
3 | // https://mail.python.org/pipermail/python-dev/2012-December/123028.html
|
4 | // https://code.activestate.com/recipes/578375/
|
5 | //
|
6 | // Main differences:
|
7 | // - It's type-specialized in C++ -- Dict<K, V>.
|
8 | // - It's integrated with our mark and sweep GC, using Slab<int>, Slab<K>, and
|
9 | // Slab<V>
|
10 | // - We use linear probing, not the pseudo-random number generator
|
11 |
|
12 | #ifndef MYCPP_GC_DICT_H
|
13 | #define MYCPP_GC_DICT_H
|
14 |
|
15 | #include "mycpp/comparators.h"
|
16 | #include "mycpp/gc_builtins.h"
|
17 | #include "mycpp/gc_list.h"
|
18 | #include "mycpp/hash.h"
|
19 |
|
20 | // Non-negative entries in index_ are array indices into keys_ and values_.
|
21 | // There are two special negative entries:
|
22 |
|
23 | // index_ value to say this Dict item was deleted (a tombstone).
|
24 | const int kDeletedEntry = -1;
|
25 |
|
26 | // index_ value to say this Dict entry is free.
|
27 | const int kEmptyEntry = -2;
|
28 |
|
29 | // Return value for find_kv_index(), not stored in index_.
|
30 | const int kNotFound = -3;
|
31 |
|
32 | // Return value for hash_and_probe(), not stored in index_.
|
33 | const int kTooSmall = -4;
|
34 |
|
35 | // Helper for keys() and values()
|
36 | template <typename T>
|
37 | List<T>* ListFromDictSlab(Slab<T>* slab, int n) {
|
38 | List<T>* result = Alloc<List<T>>();
|
39 | result->reserve(n);
|
40 |
|
41 | for (int i = 0; i < n; ++i) {
|
42 | result->append(slab->items_[i]);
|
43 | }
|
44 | return result;
|
45 | }
|
46 |
|
47 | // GlobalDict is layout-compatible with Dict (unit tests assert this), and it
|
48 | // can be a true C global (incurs zero startup time)
|
49 |
|
50 | template <typename K, typename V, int N>
|
51 | class GlobalDict {
|
52 | public:
|
53 | int len_;
|
54 | int capacity_;
|
55 | int index_len_;
|
56 | GlobalSlab<int, N>* index_;
|
57 | GlobalSlab<K, N>* keys_;
|
58 | GlobalSlab<V, N>* values_;
|
59 | };
|
60 |
|
61 | #define GLOBAL_DICT(name, K, V, N, keys, vals) \
|
62 | GcGlobal<GlobalSlab<K, N>> _keys_##name = {ObjHeader::Global(TypeTag::Slab), \
|
63 | {.items_ = keys}}; \
|
64 | GcGlobal<GlobalSlab<V, N>> _vals_##name = {ObjHeader::Global(TypeTag::Slab), \
|
65 | {.items_ = vals}}; \
|
66 | GcGlobal<GlobalDict<K, V, N>> _dict_##name = { \
|
67 | ObjHeader::Global(TypeTag::Dict), \
|
68 | {.len_ = N, \
|
69 | .capacity_ = N, \
|
70 | .index_len_ = 0, \
|
71 | .index_ = nullptr, \
|
72 | .keys_ = &_keys_##name.obj, \
|
73 | .values_ = &_vals_##name.obj}, \
|
74 | }; \
|
75 | Dict<K, V>* name = reinterpret_cast<Dict<K, V>*>(&_dict_##name.obj);
|
76 |
|
77 | template <class K, class V>
|
78 | class Dict {
|
79 | public:
|
80 | Dict()
|
81 | : len_(0),
|
82 | capacity_(0),
|
83 | index_len_(0),
|
84 | index_(nullptr),
|
85 | keys_(nullptr),
|
86 | values_(nullptr) {
|
87 | }
|
88 |
|
89 | Dict(std::initializer_list<K> keys, std::initializer_list<V> values)
|
90 | : len_(0),
|
91 | capacity_(0),
|
92 | index_len_(0),
|
93 | index_(nullptr),
|
94 | keys_(nullptr),
|
95 | values_(nullptr) {
|
96 | DCHECK(keys.size() == values.size());
|
97 | auto v = values.begin(); // This simulates a "zip" loop
|
98 | for (auto key : keys) {
|
99 | // note: calls reserve(), and may allocate
|
100 | this->set(key, *v);
|
101 | ++v;
|
102 | }
|
103 | }
|
104 |
|
105 | // Reserve enough space for at LEAST this many key-value pairs.
|
106 | void reserve(int num_desired);
|
107 |
|
108 | // d[key] in Python: raises KeyError if not found
|
109 | V at(K key) const;
|
110 |
|
111 | // d.get(key) in Python. (Can't use this if V isn't a pointer!)
|
112 | V get(K key) const;
|
113 |
|
114 | // Get a key, but return a default if not found.
|
115 | // expr_parse.py uses this with OTHER_BALANCE
|
116 | V get(K key, V default_val) const;
|
117 |
|
118 | // Implements d[k] = v. May resize the dictionary.
|
119 | void set(K key, V val);
|
120 |
|
121 | void update(List<Tuple2<K, V>*>* pairs);
|
122 | void update(Dict<K, V>* other);
|
123 |
|
124 | List<K>* keys() const;
|
125 |
|
126 | List<V>* values() const;
|
127 |
|
128 | void clear();
|
129 |
|
130 | // Helper used by find_kv_index(), set(), mylib::dict_erase() in
|
131 | // gc_mylib.h
|
132 | // Returns either:
|
133 | // - the slot for an existing key, or an empty slot for a new key
|
134 | // - kTooSmall if the table is full
|
135 | int hash_and_probe(K key) const;
|
136 |
|
137 | // Helper used by at(), get(), dict_contains()
|
138 | // Given a key, returns either:
|
139 | // - an index into keys_ and values_
|
140 | // - kNotFound
|
141 | int find_kv_index(K key) const;
|
142 |
|
143 | static constexpr ObjHeader obj_header() {
|
144 | return ObjHeader::ClassFixed(field_mask(), sizeof(Dict));
|
145 | }
|
146 |
|
147 | int len_; // number of entries (keys and values, almost dense)
|
148 | int capacity_; // number of k/v slots
|
149 | int index_len_; // number of index slots
|
150 |
|
151 | // These 3 slabs are resized at the same time.
|
152 | Slab<int>* index_; // kEmptyEntry, kDeletedEntry, or a valid index into
|
153 | // keys_ and values_
|
154 | Slab<K>* keys_; // Dict<K, int>
|
155 | Slab<V>* values_; // Dict<int, V>
|
156 |
|
157 | // A dict has 3 pointers the GC needs to follow.
|
158 | static constexpr uint32_t field_mask() {
|
159 | return maskbit(offsetof(Dict, index_)) | maskbit(offsetof(Dict, keys_)) |
|
160 | maskbit(offsetof(Dict, values_));
|
161 | }
|
162 |
|
163 | DISALLOW_COPY_AND_ASSIGN(Dict);
|
164 |
|
165 | // kItemSize is max of K and V size. That is, on 64-bit machines, the RARE
|
166 | // Dict<int, int> is smaller than other dicts
|
167 | static constexpr int kItemSize = sizeof(K) > sizeof(V) ? sizeof(K)
|
168 | : sizeof(V);
|
169 |
|
170 | // Matches mark_sweep_heap.h
|
171 | static constexpr int kPoolBytes2 = 48 - sizeof(ObjHeader);
|
172 | static_assert(kPoolBytes2 % kItemSize == 0,
|
173 | "An integral number of items should fit in second pool");
|
174 | static constexpr int kNumItems2 = kPoolBytes2 / kItemSize;
|
175 |
|
176 | static const int kHeaderFudge = sizeof(ObjHeader) / kItemSize;
|
177 | static_assert(sizeof(ObjHeader) % kItemSize == 0,
|
178 | "Slab header size should be multiple of key size");
|
179 |
|
180 | #if 0
|
181 | static constexpr int kMinBytes2 = 128 - sizeof(ObjHeader);
|
182 | static_assert(kMinBytes2 % kItemSize == 0,
|
183 | "An integral number of items should fit");
|
184 | static constexpr int kMinItems2 = kMinBytes2 / kItemSize;
|
185 | #endif
|
186 |
|
187 | int HowManyPairs(int num_desired) {
|
188 | // See gc_list.h for comments on nearly identical logic
|
189 |
|
190 | if (num_desired <= kNumItems2) { // use full cell in pool 2
|
191 | return kNumItems2;
|
192 | }
|
193 | #if 0
|
194 | if (num_desired <= kMinItems2) { // 48 -> 128, not 48 -> 64
|
195 | return kMinItems2;
|
196 | }
|
197 | #endif
|
198 | return RoundUp(num_desired + kHeaderFudge) - kHeaderFudge;
|
199 | }
|
200 | };
|
201 |
|
202 | template <typename K, typename V>
|
203 | inline bool dict_contains(const Dict<K, V>* haystack, K needle) {
|
204 | return haystack->find_kv_index(needle) != kNotFound;
|
205 | }
|
206 |
|
207 | template <typename K, typename V>
|
208 | void Dict<K, V>::reserve(int num_desired) {
|
209 | if (capacity_ >= num_desired) {
|
210 | return; // Don't do anything if there's already enough space.
|
211 | }
|
212 |
|
213 | int old_len = len_;
|
214 | Slab<K>* old_k = keys_;
|
215 | Slab<V>* old_v = values_;
|
216 |
|
217 | // Calculate the number of keys and values we should have
|
218 | capacity_ = HowManyPairs(num_desired);
|
219 |
|
220 | // 1) Ensure index len a power of 2, to avoid expensive modulus % operation
|
221 | // 2) Introduce hash table load factor. Use capacity_+1 to simulate ceil()
|
222 | // div, not floor() div.
|
223 | index_len_ = RoundUp((capacity_ + 1) * 5 / 4);
|
224 | DCHECK(index_len_ > capacity_);
|
225 |
|
226 | index_ = NewSlab<int>(index_len_);
|
227 | for (int i = 0; i < index_len_; ++i) {
|
228 | index_->items_[i] = kEmptyEntry;
|
229 | }
|
230 |
|
231 | // These are DENSE, while index_ is sparse.
|
232 | keys_ = NewSlab<K>(capacity_);
|
233 | values_ = NewSlab<V>(capacity_);
|
234 |
|
235 | if (old_k != nullptr) { // rehash if there were any entries
|
236 | // log("REHASH num_desired %d", num_desired);
|
237 | len_ = 0;
|
238 | for (int i = 0; i < old_len; ++i) {
|
239 | set(old_k->items_[i], old_v->items_[i]);
|
240 | }
|
241 | }
|
242 | }
|
243 |
|
244 | template <typename K, typename V>
|
245 | V Dict<K, V>::at(K key) const {
|
246 | int kv_index = find_kv_index(key);
|
247 | if (kv_index == kNotFound) {
|
248 | throw Alloc<KeyError>();
|
249 | } else {
|
250 | return values_->items_[kv_index];
|
251 | }
|
252 | }
|
253 |
|
254 | template <typename K, typename V>
|
255 | V Dict<K, V>::get(K key) const {
|
256 | int kv_index = find_kv_index(key);
|
257 | if (kv_index == kNotFound) {
|
258 | return nullptr;
|
259 | } else {
|
260 | return values_->items_[kv_index];
|
261 | }
|
262 | }
|
263 |
|
264 | template <typename K, typename V>
|
265 | V Dict<K, V>::get(K key, V default_val) const {
|
266 | int kv_index = find_kv_index(key);
|
267 | if (kv_index == kNotFound) {
|
268 | return default_val;
|
269 | } else {
|
270 | return values_->items_[kv_index];
|
271 | }
|
272 | }
|
273 |
|
274 | template <typename K, typename V>
|
275 | List<K>* Dict<K, V>::keys() const {
|
276 | return ListFromDictSlab<K>(keys_, len_);
|
277 | }
|
278 |
|
279 | template <typename K, typename V>
|
280 | List<V>* Dict<K, V>::values() const {
|
281 | return ListFromDictSlab<V>(values_, len_);
|
282 | }
|
283 |
|
284 | template <typename K, typename V>
|
285 | void Dict<K, V>::clear() {
|
286 | // Maintain invariant
|
287 | for (int i = 0; i < index_len_; ++i) {
|
288 | index_->items_[i] = kEmptyEntry;
|
289 | }
|
290 |
|
291 | if (keys_) {
|
292 | memset(keys_->items_, 0, len_ * sizeof(K)); // zero for GC scan
|
293 | }
|
294 | if (values_) {
|
295 | memset(values_->items_, 0, len_ * sizeof(V)); // zero for GC scan
|
296 | }
|
297 | len_ = 0;
|
298 | }
|
299 |
|
300 | // TODO:
|
301 | // - Special case to intern BigStr* when it's hashed? How?
|
302 | // - Should we have wrappers like:
|
303 | // - V GetAndIntern<V>(D, &string_key)
|
304 | // - SetAndIntern<V>(D, &string_key, value)
|
305 | // This will enable duplicate copies of the string to be garbage collected
|
306 | template <typename K, typename V>
|
307 | int Dict<K, V>::hash_and_probe(K key) const {
|
308 | if (capacity_ == 0) {
|
309 | return kTooSmall;
|
310 | }
|
311 |
|
312 | // Hash the key onto a slot in the index. If the first slot is occupied,
|
313 | // probe until an empty one is found.
|
314 | unsigned h = hash_key(key);
|
315 | // faster % using & -- assuming index_len_ is power of 2
|
316 | int init_bucket = h & (index_len_ - 1);
|
317 |
|
318 | // If we see a tombstone along the probing path, stash it.
|
319 | int open_slot = -1;
|
320 |
|
321 | for (int i = 0; i < index_len_; ++i) {
|
322 | // Start at init_bucket and wrap araound
|
323 |
|
324 | // faster % using & -- assuming index_len_ is power of 2
|
325 | int slot = (i + init_bucket) & (index_len_ - 1);
|
326 |
|
327 | int kv_index = index_->items_[slot];
|
328 | DCHECK(kv_index < len_);
|
329 | // Optimistically this is the common case once the table has been populated.
|
330 | if (kv_index >= 0) {
|
331 | unsigned h2 = hash_key(keys_->items_[kv_index]);
|
332 | if (h == h2 && keys_equal(keys_->items_[kv_index], key)) {
|
333 | return slot;
|
334 | }
|
335 | }
|
336 |
|
337 | if (kv_index == kEmptyEntry) {
|
338 | if (open_slot != -1) {
|
339 | slot = open_slot;
|
340 | }
|
341 | // If there isn't room in the entry arrays, tell the caller to resize.
|
342 | return len_ < capacity_ ? slot : kTooSmall;
|
343 | }
|
344 |
|
345 | // Tombstone or collided keys unequal. Keep scanning.
|
346 | DCHECK(kv_index >= 0 || kv_index == kDeletedEntry);
|
347 | if (kv_index == kDeletedEntry && open_slot == -1) {
|
348 | // NOTE: We only record the open slot here. We DON'T return it. If we're
|
349 | // looking for a key that was writen before this tombstone was written to
|
350 | // the index we should continue probing until we get to that key. If we
|
351 | // get to an empty index slot or the end of the index then we know we are
|
352 | // dealing with a new key and can safely replace the tombstone without
|
353 | // disrupting any existing keys.
|
354 | open_slot = slot;
|
355 | }
|
356 | }
|
357 |
|
358 | if (open_slot != -1) {
|
359 | return len_ < capacity_ ? open_slot : kTooSmall;
|
360 | }
|
361 |
|
362 | return kTooSmall;
|
363 | }
|
364 |
|
365 | template <typename K, typename V>
|
366 | int Dict<K, V>::find_kv_index(K key) const {
|
367 | if (index_ != nullptr) { // Common case
|
368 | int pos = hash_and_probe(key);
|
369 | if (pos == kTooSmall) {
|
370 | return kNotFound;
|
371 | }
|
372 | DCHECK(pos >= 0);
|
373 | int kv_index = index_->items_[pos];
|
374 | if (kv_index < 0) {
|
375 | return kNotFound;
|
376 | }
|
377 | return kv_index;
|
378 | }
|
379 |
|
380 | // Linear search on GlobalDict instances.
|
381 | // TODO: Should we populate and compare their hash values?
|
382 | for (int i = 0; i < len_; ++i) {
|
383 | if (keys_equal(keys_->items_[i], key)) {
|
384 | return i;
|
385 | }
|
386 | }
|
387 |
|
388 | return kNotFound;
|
389 | }
|
390 |
|
391 | template <typename K, typename V>
|
392 | void Dict<K, V>::set(K key, V val) {
|
393 | DCHECK(obj_header().heap_tag != HeapTag::Global);
|
394 | int pos = hash_and_probe(key);
|
395 | if (pos == kTooSmall) {
|
396 | reserve(len_ + 1);
|
397 | pos = hash_and_probe(key);
|
398 | }
|
399 | DCHECK(pos >= 0);
|
400 |
|
401 | int kv_index = index_->items_[pos];
|
402 | DCHECK(kv_index < len_);
|
403 | if (kv_index < 0) {
|
404 | // Write new entries to the end of the k/v arrays. This allows us to recall
|
405 | // insertion order until the first deletion.
|
406 | keys_->items_[len_] = key;
|
407 | values_->items_[len_] = val;
|
408 | index_->items_[pos] = len_;
|
409 | len_++;
|
410 | DCHECK(len_ <= capacity_);
|
411 | } else {
|
412 | values_->items_[kv_index] = val;
|
413 | }
|
414 | }
|
415 |
|
416 | template <typename K, typename V>
|
417 | inline int len(const Dict<K, V>* d) {
|
418 | return d->len_;
|
419 | }
|
420 |
|
421 | template <class K, class V>
|
422 | class DictIter {
|
423 | public:
|
424 | explicit DictIter(Dict<K, V>* D) : D_(D), pos_(0) {
|
425 | }
|
426 | void Next() {
|
427 | pos_++;
|
428 | }
|
429 | bool Done() {
|
430 | return pos_ == D_->len_;
|
431 | }
|
432 | K Key() {
|
433 | return D_->keys_->items_[pos_];
|
434 | }
|
435 | V Value() {
|
436 | return D_->values_->items_[pos_];
|
437 | }
|
438 |
|
439 | private:
|
440 | const Dict<K, V>* D_;
|
441 | int pos_;
|
442 | };
|
443 |
|
444 | // dict(mylist) converts a list of (k, v) tuples into a dict
|
445 | template <typename K, typename V>
|
446 | Dict<K, V>* dict(List<Tuple2<K, V>*>* l) {
|
447 | auto ret = Alloc<Dict<K, V>>();
|
448 | ret->reserve(len(l));
|
449 | for (ListIter<Tuple2<K, V>*> it(l); !it.Done(); it.Next()) {
|
450 | ret->set(it.Value()->at0(), it.Value()->at1());
|
451 | }
|
452 | return ret;
|
453 | }
|
454 |
|
455 | template <class K, class V>
|
456 | void Dict<K, V>::update(List<Tuple2<K, V>*>* pairs) {
|
457 | reserve(len_ + len(pairs));
|
458 | for (ListIter<Tuple2<K, V>*> it(pairs); !it.Done(); it.Next()) {
|
459 | set(it.Value()->at0(), it.Value()->at1());
|
460 | }
|
461 | }
|
462 |
|
463 | template <class K, class V>
|
464 | void Dict<K, V>::update(Dict<K, V>* other) {
|
465 | reserve(len_ + len(other));
|
466 | for (DictIter<K, V> it(other); !it.Done(); it.Next()) {
|
467 | set(it.Key(), it.Value());
|
468 | }
|
469 | }
|
470 |
|
471 | #endif // MYCPP_GC_DICT_H
|