| 1 | // Hash table based on CPython's "compact dict":
|
| 2 | //
|
| 3 | // https://mail.python.org/pipermail/python-dev/2012-December/123028.html
|
| 4 | // https://code.activestate.com/recipes/578375/
|
| 5 | //
|
| 6 | // Main differences:
|
| 7 | // - It's type-specialized in C++ -- Dict<K, V>.
|
| 8 | // - It's integrated with our mark and sweep GC, using Slab<int>, Slab<K>, and
|
| 9 | // Slab<V>
|
| 10 | // - We use linear probing, not the pseudo-random number generator
|
| 11 |
|
| 12 | #ifndef MYCPP_GC_DICT_H
|
| 13 | #define MYCPP_GC_DICT_H
|
| 14 |
|
| 15 | #include "mycpp/comparators.h"
|
| 16 | #include "mycpp/gc_builtins.h"
|
| 17 | #include "mycpp/gc_list.h"
|
| 18 | #include "mycpp/hash.h"
|
| 19 |
|
| 20 | // Non-negative entries in index_ are array indices into keys_ and values_.
|
| 21 | // There are two special negative entries:
|
| 22 |
|
| 23 | // index_ value to say this Dict item was deleted (a tombstone).
|
| 24 | const int kDeletedEntry = -1;
|
| 25 |
|
| 26 | // index_ value to say this Dict entry is free.
|
| 27 | const int kEmptyEntry = -2;
|
| 28 |
|
| 29 | // Return value for find_kv_index(), not stored in index_.
|
| 30 | const int kNotFound = -3;
|
| 31 |
|
| 32 | // Return value for hash_and_probe(), not stored in index_.
|
| 33 | const int kTooSmall = -4;
|
| 34 |
|
| 35 | // Helper for keys() and values()
|
| 36 | template <typename T>
|
| 37 | List<T>* ListFromDictSlab(Slab<T>* slab, int n) {
|
| 38 | List<T>* result = Alloc<List<T>>();
|
| 39 | result->reserve(n);
|
| 40 |
|
| 41 | for (int i = 0; i < n; ++i) {
|
| 42 | result->append(slab->items_[i]);
|
| 43 | }
|
| 44 | return result;
|
| 45 | }
|
| 46 |
|
| 47 | // GlobalDict is layout-compatible with Dict (unit tests assert this), and it
|
| 48 | // can be a true C global (incurs zero startup time)
|
| 49 |
|
| 50 | template <typename K, typename V, int N>
|
| 51 | class GlobalDict {
|
| 52 | public:
|
| 53 | int len_;
|
| 54 | int capacity_;
|
| 55 | int index_len_;
|
| 56 | GlobalSlab<int, N>* index_;
|
| 57 | GlobalSlab<K, N>* keys_;
|
| 58 | GlobalSlab<V, N>* values_;
|
| 59 | };
|
| 60 |
|
| 61 | #define GLOBAL_DICT(name, K, V, N, keys, vals) \
|
| 62 | GcGlobal<GlobalSlab<K, N>> _keys_##name = {ObjHeader::Global(TypeTag::Slab), \
|
| 63 | {.items_ = keys}}; \
|
| 64 | GcGlobal<GlobalSlab<V, N>> _vals_##name = {ObjHeader::Global(TypeTag::Slab), \
|
| 65 | {.items_ = vals}}; \
|
| 66 | GcGlobal<GlobalDict<K, V, N>> _dict_##name = { \
|
| 67 | ObjHeader::Global(TypeTag::Dict), \
|
| 68 | {.len_ = N, \
|
| 69 | .capacity_ = N, \
|
| 70 | .index_len_ = 0, \
|
| 71 | .index_ = nullptr, \
|
| 72 | .keys_ = &_keys_##name.obj, \
|
| 73 | .values_ = &_vals_##name.obj}, \
|
| 74 | }; \
|
| 75 | Dict<K, V>* name = reinterpret_cast<Dict<K, V>*>(&_dict_##name.obj);
|
| 76 |
|
| 77 | template <class K, class V>
|
| 78 | class Dict {
|
| 79 | public:
|
| 80 | Dict()
|
| 81 | : len_(0),
|
| 82 | capacity_(0),
|
| 83 | index_len_(0),
|
| 84 | index_(nullptr),
|
| 85 | keys_(nullptr),
|
| 86 | values_(nullptr) {
|
| 87 | }
|
| 88 |
|
| 89 | Dict(std::initializer_list<K> keys, std::initializer_list<V> values)
|
| 90 | : len_(0),
|
| 91 | capacity_(0),
|
| 92 | index_len_(0),
|
| 93 | index_(nullptr),
|
| 94 | keys_(nullptr),
|
| 95 | values_(nullptr) {
|
| 96 | DCHECK(keys.size() == values.size());
|
| 97 | auto v = values.begin(); // This simulates a "zip" loop
|
| 98 | for (auto key : keys) {
|
| 99 | // note: calls reserve(), and may allocate
|
| 100 | this->set(key, *v);
|
| 101 | ++v;
|
| 102 | }
|
| 103 | }
|
| 104 |
|
| 105 | // Reserve enough space for at LEAST this many key-value pairs.
|
| 106 | void reserve(int num_desired);
|
| 107 |
|
| 108 | // d[key] in Python: raises KeyError if not found
|
| 109 | V at(K key) const;
|
| 110 |
|
| 111 | // d.get(key) in Python. (Can't use this if V isn't a pointer!)
|
| 112 | V get(K key) const;
|
| 113 |
|
| 114 | // Get a key, but return a default if not found.
|
| 115 | // expr_parse.py uses this with OTHER_BALANCE
|
| 116 | V get(K key, V default_val) const;
|
| 117 |
|
| 118 | // Implements d[k] = v. May resize the dictionary.
|
| 119 | void set(K key, V val);
|
| 120 |
|
| 121 | void update(List<Tuple2<K, V>*>* pairs);
|
| 122 | void update(Dict<K, V>* other);
|
| 123 |
|
| 124 | List<K>* keys() const;
|
| 125 |
|
| 126 | List<V>* values() const;
|
| 127 |
|
| 128 | void clear();
|
| 129 |
|
| 130 | // Helper used by find_kv_index(), set(), mylib::dict_erase() in
|
| 131 | // gc_mylib.h
|
| 132 | // Returns either:
|
| 133 | // - the slot for an existing key, or an empty slot for a new key
|
| 134 | // - kTooSmall if the table is full
|
| 135 | int hash_and_probe(K key) const;
|
| 136 |
|
| 137 | // Helper used by at(), get(), dict_contains()
|
| 138 | // Given a key, returns either:
|
| 139 | // - an index into keys_ and values_
|
| 140 | // - kNotFound
|
| 141 | int find_kv_index(K key) const;
|
| 142 |
|
| 143 | static constexpr ObjHeader obj_header() {
|
| 144 | return ObjHeader::ClassFixed(field_mask(), sizeof(Dict));
|
| 145 | }
|
| 146 |
|
| 147 | int len_; // number of entries (keys and values, almost dense)
|
| 148 | int capacity_; // number of k/v slots
|
| 149 | int index_len_; // number of index slots
|
| 150 |
|
| 151 | // These 3 slabs are resized at the same time.
|
| 152 | Slab<int>* index_; // kEmptyEntry, kDeletedEntry, or a valid index into
|
| 153 | // keys_ and values_
|
| 154 | Slab<K>* keys_; // Dict<K, int>
|
| 155 | Slab<V>* values_; // Dict<int, V>
|
| 156 |
|
| 157 | // A dict has 3 pointers the GC needs to follow.
|
| 158 | static constexpr uint32_t field_mask() {
|
| 159 | return maskbit(offsetof(Dict, index_)) | maskbit(offsetof(Dict, keys_)) |
|
| 160 | maskbit(offsetof(Dict, values_));
|
| 161 | }
|
| 162 |
|
| 163 | DISALLOW_COPY_AND_ASSIGN(Dict);
|
| 164 |
|
| 165 | // kItemSize is max of K and V size. That is, on 64-bit machines, the RARE
|
| 166 | // Dict<int, int> is smaller than other dicts
|
| 167 | static constexpr int kItemSize = sizeof(K) > sizeof(V) ? sizeof(K)
|
| 168 | : sizeof(V);
|
| 169 |
|
| 170 | // Matches mark_sweep_heap.h
|
| 171 | static constexpr int kPoolBytes2 = 48 - sizeof(ObjHeader);
|
| 172 | static_assert(kPoolBytes2 % kItemSize == 0,
|
| 173 | "An integral number of items should fit in second pool");
|
| 174 | static constexpr int kNumItems2 = kPoolBytes2 / kItemSize;
|
| 175 |
|
| 176 | static const int kHeaderFudge = sizeof(ObjHeader) / kItemSize;
|
| 177 | static_assert(sizeof(ObjHeader) % kItemSize == 0,
|
| 178 | "Slab header size should be multiple of key size");
|
| 179 |
|
| 180 | #if 0
|
| 181 | static constexpr int kMinBytes2 = 128 - sizeof(ObjHeader);
|
| 182 | static_assert(kMinBytes2 % kItemSize == 0,
|
| 183 | "An integral number of items should fit");
|
| 184 | static constexpr int kMinItems2 = kMinBytes2 / kItemSize;
|
| 185 | #endif
|
| 186 |
|
| 187 | int HowManyPairs(int num_desired) {
|
| 188 | // See gc_list.h for comments on nearly identical logic
|
| 189 |
|
| 190 | if (num_desired <= kNumItems2) { // use full cell in pool 2
|
| 191 | return kNumItems2;
|
| 192 | }
|
| 193 | #if 0
|
| 194 | if (num_desired <= kMinItems2) { // 48 -> 128, not 48 -> 64
|
| 195 | return kMinItems2;
|
| 196 | }
|
| 197 | #endif
|
| 198 | return RoundUp(num_desired + kHeaderFudge) - kHeaderFudge;
|
| 199 | }
|
| 200 | };
|
| 201 |
|
| 202 | template <typename K, typename V>
|
| 203 | inline bool dict_contains(const Dict<K, V>* haystack, K needle) {
|
| 204 | return haystack->find_kv_index(needle) != kNotFound;
|
| 205 | }
|
| 206 |
|
| 207 | template <typename K, typename V>
|
| 208 | void Dict<K, V>::reserve(int num_desired) {
|
| 209 | if (capacity_ >= num_desired) {
|
| 210 | return; // Don't do anything if there's already enough space.
|
| 211 | }
|
| 212 |
|
| 213 | int old_len = len_;
|
| 214 | Slab<K>* old_k = keys_;
|
| 215 | Slab<V>* old_v = values_;
|
| 216 |
|
| 217 | // Calculate the number of keys and values we should have
|
| 218 | capacity_ = HowManyPairs(num_desired);
|
| 219 |
|
| 220 | // 1) Ensure index len a power of 2, to avoid expensive modulus % operation
|
| 221 | // 2) Introduce hash table load factor. Use capacity_+1 to simulate ceil()
|
| 222 | // div, not floor() div.
|
| 223 | index_len_ = RoundUp((capacity_ + 1) * 5 / 4);
|
| 224 | DCHECK(index_len_ > capacity_);
|
| 225 |
|
| 226 | index_ = NewSlab<int>(index_len_);
|
| 227 | for (int i = 0; i < index_len_; ++i) {
|
| 228 | index_->items_[i] = kEmptyEntry;
|
| 229 | }
|
| 230 |
|
| 231 | // These are DENSE, while index_ is sparse.
|
| 232 | keys_ = NewSlab<K>(capacity_);
|
| 233 | values_ = NewSlab<V>(capacity_);
|
| 234 |
|
| 235 | if (old_k != nullptr) { // rehash if there were any entries
|
| 236 | // log("REHASH num_desired %d", num_desired);
|
| 237 | len_ = 0;
|
| 238 | for (int i = 0; i < old_len; ++i) {
|
| 239 | set(old_k->items_[i], old_v->items_[i]);
|
| 240 | }
|
| 241 | }
|
| 242 | }
|
| 243 |
|
| 244 | template <typename K, typename V>
|
| 245 | V Dict<K, V>::at(K key) const {
|
| 246 | int kv_index = find_kv_index(key);
|
| 247 | if (kv_index == kNotFound) {
|
| 248 | throw Alloc<KeyError>();
|
| 249 | } else {
|
| 250 | return values_->items_[kv_index];
|
| 251 | }
|
| 252 | }
|
| 253 |
|
| 254 | template <typename K, typename V>
|
| 255 | V Dict<K, V>::get(K key) const {
|
| 256 | int kv_index = find_kv_index(key);
|
| 257 | if (kv_index == kNotFound) {
|
| 258 | return nullptr;
|
| 259 | } else {
|
| 260 | return values_->items_[kv_index];
|
| 261 | }
|
| 262 | }
|
| 263 |
|
| 264 | template <typename K, typename V>
|
| 265 | V Dict<K, V>::get(K key, V default_val) const {
|
| 266 | int kv_index = find_kv_index(key);
|
| 267 | if (kv_index == kNotFound) {
|
| 268 | return default_val;
|
| 269 | } else {
|
| 270 | return values_->items_[kv_index];
|
| 271 | }
|
| 272 | }
|
| 273 |
|
| 274 | template <typename K, typename V>
|
| 275 | List<K>* Dict<K, V>::keys() const {
|
| 276 | return ListFromDictSlab<K>(keys_, len_);
|
| 277 | }
|
| 278 |
|
| 279 | template <typename K, typename V>
|
| 280 | List<V>* Dict<K, V>::values() const {
|
| 281 | return ListFromDictSlab<V>(values_, len_);
|
| 282 | }
|
| 283 |
|
| 284 | template <typename K, typename V>
|
| 285 | void Dict<K, V>::clear() {
|
| 286 | // Maintain invariant
|
| 287 | for (int i = 0; i < index_len_; ++i) {
|
| 288 | index_->items_[i] = kEmptyEntry;
|
| 289 | }
|
| 290 |
|
| 291 | if (keys_) {
|
| 292 | memset(keys_->items_, 0, len_ * sizeof(K)); // zero for GC scan
|
| 293 | }
|
| 294 | if (values_) {
|
| 295 | memset(values_->items_, 0, len_ * sizeof(V)); // zero for GC scan
|
| 296 | }
|
| 297 | len_ = 0;
|
| 298 | }
|
| 299 |
|
| 300 | // TODO:
|
| 301 | // - Special case to intern BigStr* when it's hashed? How?
|
| 302 | // - Should we have wrappers like:
|
| 303 | // - V GetAndIntern<V>(D, &string_key)
|
| 304 | // - SetAndIntern<V>(D, &string_key, value)
|
| 305 | // This will enable duplicate copies of the string to be garbage collected
|
| 306 | template <typename K, typename V>
|
| 307 | int Dict<K, V>::hash_and_probe(K key) const {
|
| 308 | if (capacity_ == 0) {
|
| 309 | return kTooSmall;
|
| 310 | }
|
| 311 |
|
| 312 | // Hash the key onto a slot in the index. If the first slot is occupied,
|
| 313 | // probe until an empty one is found.
|
| 314 | unsigned h = hash_key(key);
|
| 315 | // faster % using & -- assuming index_len_ is power of 2
|
| 316 | int init_bucket = h & (index_len_ - 1);
|
| 317 |
|
| 318 | // If we see a tombstone along the probing path, stash it.
|
| 319 | int open_slot = -1;
|
| 320 |
|
| 321 | for (int i = 0; i < index_len_; ++i) {
|
| 322 | // Start at init_bucket and wrap araound
|
| 323 |
|
| 324 | // faster % using & -- assuming index_len_ is power of 2
|
| 325 | int slot = (i + init_bucket) & (index_len_ - 1);
|
| 326 |
|
| 327 | int kv_index = index_->items_[slot];
|
| 328 | DCHECK(kv_index < len_);
|
| 329 | // Optimistically this is the common case once the table has been populated.
|
| 330 | if (kv_index >= 0) {
|
| 331 | unsigned h2 = hash_key(keys_->items_[kv_index]);
|
| 332 | if (h == h2 && keys_equal(keys_->items_[kv_index], key)) {
|
| 333 | return slot;
|
| 334 | }
|
| 335 | }
|
| 336 |
|
| 337 | if (kv_index == kEmptyEntry) {
|
| 338 | if (open_slot != -1) {
|
| 339 | slot = open_slot;
|
| 340 | }
|
| 341 | // If there isn't room in the entry arrays, tell the caller to resize.
|
| 342 | return len_ < capacity_ ? slot : kTooSmall;
|
| 343 | }
|
| 344 |
|
| 345 | // Tombstone or collided keys unequal. Keep scanning.
|
| 346 | DCHECK(kv_index >= 0 || kv_index == kDeletedEntry);
|
| 347 | if (kv_index == kDeletedEntry && open_slot == -1) {
|
| 348 | // NOTE: We only record the open slot here. We DON'T return it. If we're
|
| 349 | // looking for a key that was writen before this tombstone was written to
|
| 350 | // the index we should continue probing until we get to that key. If we
|
| 351 | // get to an empty index slot or the end of the index then we know we are
|
| 352 | // dealing with a new key and can safely replace the tombstone without
|
| 353 | // disrupting any existing keys.
|
| 354 | open_slot = slot;
|
| 355 | }
|
| 356 | }
|
| 357 |
|
| 358 | if (open_slot != -1) {
|
| 359 | return len_ < capacity_ ? open_slot : kTooSmall;
|
| 360 | }
|
| 361 |
|
| 362 | return kTooSmall;
|
| 363 | }
|
| 364 |
|
| 365 | template <typename K, typename V>
|
| 366 | int Dict<K, V>::find_kv_index(K key) const {
|
| 367 | if (index_ != nullptr) { // Common case
|
| 368 | int pos = hash_and_probe(key);
|
| 369 | if (pos == kTooSmall) {
|
| 370 | return kNotFound;
|
| 371 | }
|
| 372 | DCHECK(pos >= 0);
|
| 373 | int kv_index = index_->items_[pos];
|
| 374 | if (kv_index < 0) {
|
| 375 | return kNotFound;
|
| 376 | }
|
| 377 | return kv_index;
|
| 378 | }
|
| 379 |
|
| 380 | // Linear search on GlobalDict instances.
|
| 381 | // TODO: Should we populate and compare their hash values?
|
| 382 | for (int i = 0; i < len_; ++i) {
|
| 383 | if (keys_equal(keys_->items_[i], key)) {
|
| 384 | return i;
|
| 385 | }
|
| 386 | }
|
| 387 |
|
| 388 | return kNotFound;
|
| 389 | }
|
| 390 |
|
| 391 | template <typename K, typename V>
|
| 392 | void Dict<K, V>::set(K key, V val) {
|
| 393 | DCHECK(obj_header().heap_tag != HeapTag::Global);
|
| 394 | int pos = hash_and_probe(key);
|
| 395 | if (pos == kTooSmall) {
|
| 396 | reserve(len_ + 1);
|
| 397 | pos = hash_and_probe(key);
|
| 398 | }
|
| 399 | DCHECK(pos >= 0);
|
| 400 |
|
| 401 | int kv_index = index_->items_[pos];
|
| 402 | DCHECK(kv_index < len_);
|
| 403 | if (kv_index < 0) {
|
| 404 | // Write new entries to the end of the k/v arrays. This allows us to recall
|
| 405 | // insertion order until the first deletion.
|
| 406 | keys_->items_[len_] = key;
|
| 407 | values_->items_[len_] = val;
|
| 408 | index_->items_[pos] = len_;
|
| 409 | len_++;
|
| 410 | DCHECK(len_ <= capacity_);
|
| 411 | } else {
|
| 412 | values_->items_[kv_index] = val;
|
| 413 | }
|
| 414 | }
|
| 415 |
|
| 416 | template <typename K, typename V>
|
| 417 | inline int len(const Dict<K, V>* d) {
|
| 418 | return d->len_;
|
| 419 | }
|
| 420 |
|
| 421 | template <class K, class V>
|
| 422 | class DictIter {
|
| 423 | public:
|
| 424 | explicit DictIter(Dict<K, V>* D) : D_(D), pos_(0) {
|
| 425 | }
|
| 426 | void Next() {
|
| 427 | pos_++;
|
| 428 | }
|
| 429 | bool Done() {
|
| 430 | return pos_ == D_->len_;
|
| 431 | }
|
| 432 | K Key() {
|
| 433 | return D_->keys_->items_[pos_];
|
| 434 | }
|
| 435 | V Value() {
|
| 436 | return D_->values_->items_[pos_];
|
| 437 | }
|
| 438 |
|
| 439 | private:
|
| 440 | const Dict<K, V>* D_;
|
| 441 | int pos_;
|
| 442 | };
|
| 443 |
|
| 444 | // dict(mylist) converts a list of (k, v) tuples into a dict
|
| 445 | template <typename K, typename V>
|
| 446 | Dict<K, V>* dict(List<Tuple2<K, V>*>* l) {
|
| 447 | auto ret = Alloc<Dict<K, V>>();
|
| 448 | ret->reserve(len(l));
|
| 449 | for (ListIter<Tuple2<K, V>*> it(l); !it.Done(); it.Next()) {
|
| 450 | ret->set(it.Value()->at0(), it.Value()->at1());
|
| 451 | }
|
| 452 | return ret;
|
| 453 | }
|
| 454 |
|
| 455 | template <class K, class V>
|
| 456 | void Dict<K, V>::update(List<Tuple2<K, V>*>* pairs) {
|
| 457 | reserve(len_ + len(pairs));
|
| 458 | for (ListIter<Tuple2<K, V>*> it(pairs); !it.Done(); it.Next()) {
|
| 459 | set(it.Value()->at0(), it.Value()->at1());
|
| 460 | }
|
| 461 | }
|
| 462 |
|
| 463 | template <class K, class V>
|
| 464 | void Dict<K, V>::update(Dict<K, V>* other) {
|
| 465 | reserve(len_ + len(other));
|
| 466 | for (DictIter<K, V> it(other); !it.Done(); it.Next()) {
|
| 467 | set(it.Key(), it.Value());
|
| 468 | }
|
| 469 | }
|
| 470 |
|
| 471 | #endif // MYCPP_GC_DICT_H
|