From d597012acf7554e071698194c741a08aabb986b5 Mon Sep 17 00:00:00 2001
From: Le <_@I-am-Le.me>
Date: Sun, 3 Sep 2023 10:45:15 +0200
Subject: [PATCH] Add functionality for mass deletion

Using hashmap_delete() with hashmap_scan() or hashmap_iter() is unsafe,
as a resize of the bucket list may happen at any time, throwing off the
current cursor. However, mass deletion based on some part of the item
that isn't the key may still be useful in some cases.

This introduces voluntary tombstoning of buckets, with the possibility
to clean them up on demand. Tombstoning a bucket is implemented as a
simple flag, which effectively keeps the bucket "occupied" but makes it
otherwise inaccessible. This can also be viewed as delayed deletion.

These two functions are added to make this possible:

- hashmap_unset (and hashmap_unset_with_hash), which simply sets the
  tombstone flag on the item.
- hashmap_vacuum, which goes through the whole bucket list and deletes
  every bucket that has the tombstone flag set.

Example code with hashmap_scan:

    bool filter_unset(const void *item, void *udata) {
        if (/* Some condition on |item| */)
            hashmap_unset(udata, item);
        return true;
    }

    // ...

    hashmap_scan(map, filter_unset, map);
    hashmap_vacuum(map);

Example code with hashmap_iter:

    size_t i = 0;
    void *item;
    while (hashmap_iter(map, &i, &item)) {
        if (/* Some condition on |item| */)
            hashmap_unset(map, item);
    }
    hashmap_vacuum(map);

A test is included.

Signed-off-by: Le <_@I-am-Le.me>
---
 README.md |   7 ++
 hashmap.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 167 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index c61403e..0721478 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ hashmap_count # returns the number of items in the hash map
 hashmap_set # insert or replace an existing item and return the previous
 hashmap_get # get an existing item
 hashmap_delete # delete and return an item
+hashmap_unset # make the item a tombstone and return it
 hashmap_clear # clear the hash map
 ```
 
@@ -122,6 +123,12 @@ hashmap_iter # loop based iteration over all items in hash map
 hashmap_scan # callback based iteration over all items in hash map
 ```
 
+### Maintenance
+
+```sh
+hashmap_vacuum # vacuum tombstones
+```
+
 ### Hash helpers
 
 ```sh
diff --git a/hashmap.c b/hashmap.c
index 2685f6f..5d0f08b 100644
--- a/hashmap.c
+++ b/hashmap.c
@@ -26,7 +26,8 @@ void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)) {
 
 struct bucket {
     uint64_t hash:48;
-    uint64_t dib:16;
+    uint64_t tomb:1;
+    uint64_t dib:15;
 };
 
 // hashmap is an open addressed hash map using robinhood hashing.
@@ -45,6 +46,7 @@ struct hashmap {
     size_t bucketsz;
     size_t nbuckets;
     size_t count;
+    size_t tombs;
     size_t mask;
     size_t growat;
     size_t shrinkat;
@@ -210,7 +212,7 @@ static bool resize0(struct hashmap *map, size_t new_cap) {
     if (!map2) return false;
     for (size_t i = 0; i < map->nbuckets; i++) {
         struct bucket *entry = bucket_at(map, i);
-        if (!entry->dib) {
+        if (entry->tomb || !entry->dib) {
             continue;
         }
         entry->dib = 1;
@@ -261,6 +263,7 @@ const void *hashmap_set_with_hash(struct hashmap *map, const void *item,
 
     struct bucket *entry = map->edata;
     entry->hash = hash;
+    entry->tomb = 0;
     entry->dib = 1;
     void *eitem = bucket_item(entry);
     memcpy(eitem, item, map->elsize);
@@ -275,9 +278,10 @@ const void *hashmap_set_with_hash(struct hashmap *map, const void *item,
             return NULL;
         }
         bitem = bucket_item(bucket);
-        if (entry->hash == bucket->hash && (!map->compare ||
-            map->compare(eitem, bitem, map->udata) == 0))
-        {
+        if (!bucket->tomb
+            && entry->hash == bucket->hash
+            && (!map->compare
+                || map->compare(eitem, bitem, map->udata) == 0)) {
             memcpy(map->spare, bitem, map->elsize);
             memcpy(bitem, eitem, map->elsize);
             return map->spare;
@@ -312,7 +316,7 @@ const void *hashmap_get_with_hash(struct hashmap *map, const void *key,
     while(1) {
         struct bucket *bucket = bucket_at(map, i);
         if (!bucket->dib) return NULL;
-        if (bucket->hash == hash) {
+        if (!bucket->tomb && bucket->hash == hash) {
             void *bitem = bucket_item(bucket);
             if (!map->compare || map->compare(key, bitem, map->udata) == 0) {
                 return bitem;
@@ -334,12 +338,90 @@ const void *hashmap_get(struct hashmap *map, const void *key) {
 const void *hashmap_probe(struct hashmap *map, uint64_t position) {
     size_t i = position & map->mask;
     struct bucket *bucket = bucket_at(map, i);
-    if (!bucket->dib) {
+    if (bucket->tomb || !bucket->dib) {
         return NULL;
     }
     return bucket_item(bucket);
 }
 
+// hashmap_unset_with_hash works like hashmap_unset but you provide your
+// own hash. The 'hash' callback provided to the hashmap_new function
+// will not be called.
+const void *hashmap_unset_with_hash(struct hashmap *map, const void *key,
+    uint64_t hash)
+{
+    hash = clip_hash(hash);
+    map->oom = false;
+    size_t i = hash & map->mask;
+    while(1) {
+        struct bucket *bucket = bucket_at(map, i);
+        if (!bucket->dib) {
+            return NULL;
+        }
+        void *bitem = bucket_item(bucket);
+        if (!bucket->tomb
+            && bucket->hash == hash
+            && (!map->compare
+                || map->compare(key, bitem, map->udata) == 0)) {
+            memcpy(map->spare, bitem, map->elsize);
+            bucket->tomb = 1;
+            map->tombs++;
+            return map->spare;
+        }
+        i = (i + 1) & map->mask;
+    }
+}
+
+// hashmap_unset removes an item from the hash map and returns it. If the
+// item is not found then NULL is returned.
+// This differs from hashmap_delete in that it makes no adjustments to the
+// bucket list, but rather marks the item as a tombstone. A tombstoned
+// item is still the same item, but with a mark that makes it inaccessible.
+// It's safe for the caller to do anything with the returned item, as its
+// contents will not be used any more, not even for comparison.
+// This function is mainly intended for use with functions like hashmap_scan
+// or hashmap_iter, and should be followed by a call to hashmap_vacuum
+// after the iteration is done, to effectively delete all the tombstones.
+// This can be seen as a delayed delete.
+const void *hashmap_unset(struct hashmap *map, const void *key) {
+    return hashmap_unset_with_hash(map, key, get_hash(map, key));
+}
+
+// Helper function used both by hashmap_delete_with_hash and hashmap_vacuum
+static void *delete_at(struct hashmap *map, uint64_t position)
+{
+    struct bucket *bucket = bucket_at(map, position);
+    if (!bucket->dib) {
+        return NULL;
+    }
+
+    void *bitem = bucket_item(bucket);
+    memcpy(map->spare, bitem, map->elsize);
+
+    // The copy in map->spare is the return value for a live bucket;
+    // deleting a tombstone yields NULL instead.
+    bitem = bucket->tomb ? NULL : map->spare;
+
+    while(1) {
+        struct bucket *prev = bucket;
+        position = (position + 1) & map->mask;
+        bucket = bucket_at(map, position);
+        if (bucket->dib <= 1) {
+            prev->tomb = 0;
+            prev->dib = 0;
+            break;
+        }
+        memcpy(prev, bucket, map->bucketsz);
+        if (! --prev->dib)
+            prev->tomb = 0;
+    }
+    map->count--;
+
+    if (bitem == NULL)
+        map->tombs--;
+    return bitem;
+}
+
 // hashmap_delete_with_hash works like hashmap_delete but you provide your
 // own hash. The 'hash' callback provided to the hashmap_new function
 // will not be called
@@ -355,23 +437,14 @@ const void *hashmap_delete_with_hash(struct hashmap *map, const void *key,
             return NULL;
         }
         void *bitem = bucket_item(bucket);
-        if (bucket->hash == hash && (!map->compare ||
-            map->compare(key, bitem, map->udata) == 0))
-        {
-            memcpy(map->spare, bitem, map->elsize);
-            bucket->dib = 0;
-            while(1) {
-                struct bucket *prev = bucket;
-                i = (i + 1) & map->mask;
-                bucket = bucket_at(map, i);
-                if (bucket->dib <= 1) {
-                    prev->dib = 0;
-                    break;
-                }
-                memcpy(prev, bucket, map->bucketsz);
-                prev->dib--;
-            }
-            map->count--;
+        if (!bucket->tomb
+            && bucket->hash == hash
+            && (!map->compare
+                || map->compare(key, bitem, map->udata) == 0)) {
+            // |i| is the position of a filled bucket, so delete_at will
+            // save its item in |map->spare|. No need to check the return
+            // value here.
+            delete_at(map, i);
             if (map->nbuckets > map->cap && map->count <= map->shrinkat) {
                 // Ignore the return value. It's ok for the resize operation to
                 // fail to allocate enough memory because a shrink operation
@@ -390,9 +463,34 @@ const void *hashmap_delete(struct hashmap *map, const void *key) {
     return hashmap_delete_with_hash(map, key, get_hash(map, key));
 }
 
+// hashmap_vacuum deletes all remaining tombstones (buckets with tomb == 1).
+void hashmap_vacuum(struct hashmap *map)
+{
+    size_t i = 0;
+    do {
+        struct bucket *bucket = bucket_at(map, i);
+
+        if (bucket->tomb && bucket->dib)
+            delete_at(map, i);
+        // Since delete_at does backward shifting, the bucket now at
+        // position |i| is a different one, possibly another tombstone.
+        // Therefore, the position is only advanced when the current
+        // bucket is live (not a tombstone) or completely empty (not a
+        // tombstone, and with bucket->dib == 0).
+        if (!bucket->tomb)
+            i++;
+    } while(i < map->nbuckets);
+    if (map->nbuckets > map->cap && map->count <= map->shrinkat) {
+        // Ignore the return value. It's ok for the resize operation to
+        // fail to allocate enough memory because a shrink operation
+        // does not change the integrity of the data.
+        resize(map, map->nbuckets/2);
+    }
+}
+
 // hashmap_count returns the number of items in the hash map.
 size_t hashmap_count(struct hashmap *map) {
-    return map->count;
+    return map->count - map->tombs;
 }
 
 // hashmap_free frees the hash map
@@ -419,7 +517,9 @@ bool hashmap_scan(struct hashmap *map,
 {
     for (size_t i = 0; i < map->nbuckets; i++) {
         struct bucket *bucket = bucket_at(map, i);
-        if (bucket->dib && !iter(bucket_item(bucket), udata)) {
+        if (!bucket->tomb
+            && bucket->dib
+            && !iter(bucket_item(bucket), udata)) {
             return false;
         }
     }
@@ -450,7 +550,7 @@ bool hashmap_iter(struct hashmap *map, size_t *i, void **item) {
         if (*i >= map->nbuckets) return false;
         bucket = bucket_at(map, *i);
         (*i)++;
-    } while (!bucket->dib);
+    } while (bucket->tomb || !bucket->dib);
     *item = bucket_item(bucket);
     return true;
 }
@@ -985,6 +1085,39 @@ static void all(void) {
 
     hashmap_clear(map, false);
     assert(prev_cap == map->cap);
+
+    for (int i = 0; i < N; i++) {
+        while (true) {
+            assert(!hashmap_set(map, &vals[i]));
+            if (!hashmap_oom(map)) {
+                break;
+            }
+        }
+    }
+
+    // Test unset in a hashmap_iter loop
+    size_t prev_count = map->count;
+    assert(map->tombs == 0);
+    for (iter = 0; hashmap_iter(map, &iter, &iter_val);)
+        // Unset the odd ones
+        if (*(int *)iter_val % 2)
+            assert(hashmap_unset(map, iter_val));
+    // About half the buckets should be tombstones
+    fprintf(stderr, "map->count = %zu, map->tombs = %zu\n",
+        map->count, map->tombs);
+    assert(map->tombs != 0);
+    assert(map->count == N);
+    if (N % 2)
+        assert(hashmap_count(map) == map->tombs + 1);
+    else
+        assert(hashmap_count(map) == map->tombs);
+    assert(map->count == prev_count);
+    prev_count = map->count;
+    hashmap_vacuum(map);
+    assert(map->tombs == 0);
+    assert(map->count == prev_count - prev_count / 2);
+
+
     hashmap_free(map);
     xfree(vals);
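
For reviewers who want to try the new API outside the test suite, here is a
minimal standalone sketch (not part of the patch; it assumes hashmap.h from
this repository with the patch applied, and the struct user type with its
user_hash and user_compare helpers is made up for illustration):

    // Hypothetical build line: cc example.c hashmap.c -o example
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "hashmap.h"

    struct user {
        char name[32];
        int age;
    };

    static int user_compare(const void *a, const void *b, void *udata) {
        (void)udata;
        return strcmp(((const struct user *)a)->name,
                      ((const struct user *)b)->name);
    }

    static uint64_t user_hash(const void *item, uint64_t seed0, uint64_t seed1) {
        const struct user *user = item;
        return hashmap_sip(user->name, strlen(user->name), seed0, seed1);
    }

    int main(void) {
        struct hashmap *map = hashmap_new(sizeof(struct user), 0, 0, 0,
            user_hash, user_compare, NULL, NULL);
        hashmap_set(map, &(struct user){ .name = "Ada", .age = 36 });
        hashmap_set(map, &(struct user){ .name = "Bob", .age = 17 });
        hashmap_set(map, &(struct user){ .name = "Cleo", .age = 52 });

        // Tombstone every minor while iterating; hashmap_unset never
        // resizes or shifts the bucket list, so the cursor stays valid.
        size_t i = 0;
        void *item;
        while (hashmap_iter(map, &i, &item)) {
            if (((struct user *)item)->age < 18) {
                hashmap_unset(map, item);
            }
        }
        // Physically remove the tombstones once the iteration is done.
        hashmap_vacuum(map);

        printf("%zu adults remain\n", hashmap_count(map));
        hashmap_free(map);
        return 0;
    }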