From 991b63b5c9562eaaf6a1c01cdce7127c1d063f70 Mon Sep 17 00:00:00 2001 From: tidwall Date: Mon, 6 Jul 2020 13:40:46 -0700 Subject: [PATCH] first commit --- LICENSE | 20 ++ README.md | 140 ++++++++++ hashmap.c | 743 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ hashmap.h | 32 +++ 4 files changed, 935 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 hashmap.c create mode 100644 hashmap.h diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..541ae74 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2020 Joshua J Baker + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..26c31b4 --- /dev/null +++ b/README.md @@ -0,0 +1,140 @@ +# hashmap.c + +[Hash map](https://en.wikipedia.org/wiki/Hash_table) implementation in C. + +## Features + +- [Open addressing](https://en.wikipedia.org/wiki/Hash_table#Open_addressing) using [Robin Hood](https://en.wikipedia.org/wiki/Hash_table#Robin_Hood_hashing) hashing +- Generic interface with support for variable sized items. +- Built-in [SipHash](https://en.wikipedia.org/wiki/SipHash) or [MurmurHash3](https://en.wikipedia.org/wiki/MurmurHash) and allows for alternative algorithms. +- ANSI C (C99) +- Supports custom allocators +- Pretty darn good performance. 🚀 + +## Example + +```c +#include +#include +#include "hashmap.h" + +struct user { + char *name; + int age; +}; + +int user_compare(const void *a, const void *b) { + const struct user *ua = a; + const struct user *ub = b; + return strcmp(ua->name, ub->name); +} + +bool user_iter(const void *item, void *udata) { + const struct user *user = item; + printf("%s (age=%d)\n", user->name, user->age); + return true; +} + +uint64_t user_hash(const void *item, uint64_t seed0, uint64_t seed1) { + const struct user *user = item; + return hashmap_sip(user->name, strlen(user->name), seed0, seed1); +} + +int main() { + // create a new hash map where each item is a `struct user`. The second + // argument is the initial capacity. The third and fourth arguments are + // optional seeds that are passed to the following hash function. + struct hashmap *map = hashmap_new(sizeof(struct user), 0, 0, 0, + user_hash, user_compare); + + // Here we'll load some users into the hash map. Each set operation + // performs a copy of the data that is pointed to in the second argument. + hashmap_set(map, &(struct user){ .name="Dale", .age=44 }); + hashmap_set(map, &(struct user){ .name="Roger", .age=68 }); + hashmap_set(map, &(struct user){ .name="Jane", .age=47 }); + + struct user *user; + + printf("\n-- get some users --\n"); + user = hashmap_get(map, &(struct user){ .name="Jane" }); + printf("%s age=%d\n", user->name, user->age); + + user = hashmap_get(map, &(struct user){ .name="Roger" }); + printf("%s age=%d\n", user->name, user->age); + + user = hashmap_get(map, &(struct user){ .name="Dale" }); + printf("%s age=%d\n", user->name, user->age); + + user = hashmap_get(map, &(struct user){ .name="Tom" }); + printf("%s\n", user?"exists":"not exists"); + + printf("\n-- iterate over all users --\n"); + hashmap_scan(map, user_iter, NULL); + + hashmap_free(map); +} + +// output: +// -- get some users -- +// Jane age=47 +// Roger age=68 +// Dale age=44 +// not exists +// +// -- iterate over all users -- +// Dale (age=44) +// Roger (age=68) +// Jane (age=47) +``` + +## Functions + +### Basic + +```sh +hashmap_new # allocate a new hash map +hashmap_free # free the hash map +hashmap_count # returns the number of items in the hash map +hashmap_set # insert or replace an existing item and return the previous +hashmap_get # get an existing item +hashmap_delete # delete and return an item +``` + +### Iteration + +```sh +hashmap_scan # iterate over items in hash map +``` + +### Hash helpers + +```sh +hashmap_sip # returns hash value for data using SipHash-2-4 +hashmap_murmur # returns hash value for data using MurmurHash3 +``` + +## Testing and benchmarks + +```sh +$ cc -DHASHMAP_TEST hashmap.c && ./a.out # run tests +$ cc -DHASHMAP_TEST -O3 hashmap.c && BENCH=1 ./a.out # run benchmarks +``` + +The following benchmarks were run on my 2019 Macbook Pro (2.4 GHz 8-Core Intel Core i9) using gcc-9. +The items are simple 4-byte ints. +The hash function is MurmurHash3. +Testing with 5,000,000 items. +The `(cap)` results are hashmaps that are created with an inital capacity of 5,000,000. + +``` +set 5000000 ops in 0.708 secs, 142 ns/op, 7057960 op/sec, 26.84 bytes/op +get 5000000 ops in 0.303 secs, 61 ns/op, 16492723 op/sec +delete 5000000 ops in 0.486 secs, 97 ns/op, 10280873 op/sec +set (cap) 5000000 ops in 0.429 secs, 86 ns/op, 11641660 op/sec +get (cap) 5000000 ops in 0.303 secs, 61 ns/op, 16490493 op/sec +delete (cap) 5000000 ops in 0.410 secs, 82 ns/op, 12200091 op/sec +``` + +## License + +hashmap.c source code is available under the MIT License. diff --git a/hashmap.c b/hashmap.c new file mode 100644 index 0000000..469f493 --- /dev/null +++ b/hashmap.c @@ -0,0 +1,743 @@ +// Copyright 2020 Joshua J Baker. All rights reserved. +// Use of this source code is governed by an MIT-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include "hashmap.h" + +static void *(*_malloc)(size_t) = NULL; +static void (*_free)(void *) = NULL; + +#define hmmalloc (_malloc?_malloc:malloc) +#define hmfree (_free?_free:free) + +// hashmap_set_allocator allows for configuring a custom allocator for +// all hashmap library operations. This function, if needed, should be called +// only once at startup and a prior to calling hashmap_new(). +void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)) +{ + _malloc = malloc; + _free = free; +} + +#define panic(_msg_) { \ + fprintf(stderr, "panic: %s (%s:%d)\n", (_msg_), __FILE__, __LINE__); \ + exit(1); \ +} + +struct bucket { + uint64_t hash:48; + uint64_t dib:16; +}; + +// hashmap is an open addressed hash map using robinhood hashing. +struct hashmap { + bool oom; + size_t elsize; + size_t cap; + uint64_t seed0; + uint64_t seed1; + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1); + int (*compare)(const void *a, const void *b); + size_t bucketsz; + size_t nbuckets; + size_t count; + size_t mask; + size_t growat; + size_t shrinkat; + void *buckets; + void *spare; +}; + +static struct bucket *bucket_at(struct hashmap *map, size_t index) { + return (struct bucket*)(((char*)map->buckets)+(map->bucketsz*index)); +} + +static void *bucket_item(struct bucket *entry) { + return ((char*)entry)+sizeof(struct bucket); +} + +static uint64_t get_hash(struct hashmap *map, void *key) { + return map->hash(key, map->seed0, map->seed1) << 16 >> 16; +} + +// hashmap_new returns a new hash map. +// Param `elsize` is the size of each element in the tree. Every element that +// is inserted, deleted, or retrieved will be this size. +// Param `cap` is the default lower capacity of the hashmap. Setting this to +// zero will default to 16. +// Params `seed0` and `seed1` are optional seed values that are passed to the +// following `hash` function. These can be any value you wish but it's often +// best to use randomly generated values. +// Param `hash` is a function that generates a hash value for an item. It's +// important that you provice good hash function, otherwise will perform poorly +// or be vulnerable to Denial-of-service attacks. This implementation comes +// with two helper functions `hashmap_sip()` and `hashmap_murmur()` +// Param `compare` is a function that compares items in the tree. See the +// qsort stdlib function for an example of how this function works. +// The hashmap must be freed with hashmap_free(). +struct hashmap *hashmap_new(size_t elsize, size_t cap, + uint64_t seed0, uint64_t seed1, + uint64_t (*hash)(const void *item, + uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b)) +{ + int ncap = 16; + if (cap < ncap) { + cap = ncap; + } else { + while (ncap < cap) { + ncap *= 2; + } + cap = ncap; + } + size_t bucketsz = sizeof(struct bucket) + elsize; + while (bucketsz & (sizeof(uintptr_t)-1)) { + bucketsz++; + } + struct hashmap *map = hmmalloc(sizeof(struct hashmap)+bucketsz); + if (!map) { + return NULL; + } + memset(map, 0, sizeof(struct hashmap)); + map->elsize = elsize; + map->bucketsz = bucketsz; + map->seed0 = seed0; + map->seed1 = seed1; + map->hash = hash; + map->compare = compare; + map->spare = ((char*)map)+sizeof(struct hashmap); + map->cap = cap; + map->nbuckets = cap; + map->mask = map->nbuckets-1; + map->buckets = hmmalloc(map->bucketsz*map->nbuckets); + if (!map->buckets) { + hmfree(map); + return NULL; + } + memset(map->buckets, 0, map->bucketsz*map->nbuckets); + map->growat = map->nbuckets*0.75; + map->shrinkat = map->nbuckets*0.10; + return map; +} + +static bool resize(struct hashmap *map, size_t new_cap) { + struct hashmap *map2 = hashmap_new(map->elsize, new_cap, map->seed1, + map->seed1, map->hash, map->compare); + if (!map2) { + return false; + } + for (size_t i = 0; i < map->nbuckets; i++) { + struct bucket *entry = bucket_at(map, i); + if (!entry->dib) { + continue; + } + entry->dib = 1; + size_t j = entry->hash & map2->mask; + for (;;) { + struct bucket *bucket = bucket_at(map2, j); + if (bucket->dib == 0) { + memcpy(bucket, entry, map->bucketsz); + break; + } + if (bucket->dib < entry->dib) { + memcpy(map2->spare, bucket, map->bucketsz); + memcpy(bucket, entry, map->bucketsz); + memcpy(entry, map2->spare, map->bucketsz); + } + j = (j + 1) & map2->mask; + entry->dib += 1; + } + } + hmfree(map->buckets); + map->buckets = map2->buckets; + map->nbuckets = map2->nbuckets; + map->mask = map2->mask; + map->growat = map2->growat; + map->shrinkat = map2->shrinkat; + hmfree(map2); + return true; +} + +// hashmap_set inserts or replaces an item in the hash map. If an item is +// replaced then it is returned otherwise NULL is returned. This operation +// may allocate memory. If the system is unable to allocate additional +// memory then NULL is returned and hashmap_oom() returns true. +void *hashmap_set(struct hashmap *map, void *item) { + if (!item) { + panic("item is null"); + } + map->oom = false; + if (map->count == map->growat) { + if (!resize(map, map->nbuckets*2)) { + map->oom = true; + return NULL; + } + } + + char edata[map->bucketsz]; // VLA + struct bucket *entry = (void*)edata; + entry->hash = get_hash(map, item); + entry->dib = 1; + memcpy(bucket_item(entry), item, map->elsize); + + size_t i = entry->hash & map->mask; + for (;;) { + struct bucket *bucket = bucket_at(map, i); + if (bucket->dib == 0) { + memcpy(bucket, entry, map->bucketsz); + map->count++; + return NULL; + } + if (entry->hash == bucket->hash && + map->compare(bucket_item(entry), bucket_item(bucket)) == 0) + { + memcpy(map->spare, bucket_item(bucket), map->elsize); + memcpy(bucket_item(bucket), bucket_item(entry), map->elsize); + return map->spare; + } + if (bucket->dib < entry->dib) { + memcpy(map->spare, bucket, map->bucketsz); + memcpy(bucket, entry, map->bucketsz); + memcpy(entry, map->spare, map->bucketsz); + } + i = (i + 1) & map->mask; + entry->dib += 1; + } +} + +// hashmap_get returns the item based on the provided key. If the item is not +// found then NULL is returned. +void *hashmap_get(struct hashmap *map, void *key) { + if (!key) { + panic("key is null"); + } + uint64_t hash = get_hash(map, key); + size_t i = hash & map->mask; + for (;;) { + struct bucket *bucket = bucket_at(map, i); + if (!bucket->dib) { + return NULL; + } + if (bucket->hash == hash && + map->compare(key, bucket_item(bucket)) == 0) + { + return bucket_item(bucket); + } + i = (i + 1) & map->mask; + } +} + +// hashmap_delete removes an item from the hash map and returns it. If the +// item is not found then NULL is returned. +void *hashmap_delete(struct hashmap *map, void *key) { + if (!key) { + panic("key is null"); + } + map->oom = false; + uint64_t hash = get_hash(map, key); + size_t i = hash & map->mask; + for (;;) { + struct bucket *bucket = bucket_at(map, i); + if (!bucket->dib) { + return NULL; + } + if (bucket->hash == hash && + map->compare(key, bucket_item(bucket)) == 0) + { + memcpy(map->spare, bucket_item(bucket), map->elsize); + bucket->dib = 0; + for (;;) { + struct bucket *prev = bucket; + i = (i + 1) & map->mask; + bucket = bucket_at(map, i); + if (bucket->dib <= 1) { + prev->dib = 0; + break; + } + memcpy(prev, bucket, map->bucketsz); + prev->dib--; + } + map->count--; + if (map->nbuckets > map->cap && map->count <= map->shrinkat) { + // Ignore the return value. It's ok for the resize operation to + // fail to allocate enough memory because a shrink operation + // does not change the integrity of the data. + resize(map, map->nbuckets/2); + } + return map->spare; + } + i = (i + 1) & map->mask; + } +} + +// hashmap_count returns the number of items in the hash map. +size_t hashmap_count(struct hashmap *map) { + return map->count; +} + +// hashmap_free frees the hash map +void hashmap_free(struct hashmap *map) { + if (!map) return; + hmfree(map->buckets); + hmfree(map); +} + +// hashmap_scan iterates over all items in the hash map +// Param `iter` can return false to stop iteration early. +// Returns false if the iteration has been stopped early. +bool hashmap_scan(struct hashmap *map, + bool (*iter)(const void *item, void *udata), void *udata) +{ + for (size_t i = 0; i < map->nbuckets; i++) { + struct bucket *bucket = bucket_at(map, i); + if (bucket->dib) { + if (!iter(bucket_item(bucket), udata)) { + return false; + } + } + } + return true; +} + +//----------------------------------------------------------------------------- +// SipHash reference C implementation +// +// Copyright (c) 2012-2016 Jean-Philippe Aumasson +// +// Copyright (c) 2012-2014 Daniel J. Bernstein +// +// To the extent possible under law, the author(s) have dedicated all copyright +// and related and neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// +// You should have received a copy of the CC0 Public Domain Dedication along +// with this software. If not, see +// . +// +// default: SipHash-2-4 +//----------------------------------------------------------------------------- +static uint64_t SIP64(const uint8_t *in, const size_t inlen, + uint64_t seed0, uint64_t seed1) +{ +#define U8TO64_LE(p) \ + { (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) } +#define U64TO8_LE(p, v) \ + { U32TO8_LE((p), (uint32_t)((v))); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); } +#define U32TO8_LE(p, v) \ + { (p)[0] = (uint8_t)((v)); \ + (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); \ + (p)[3] = (uint8_t)((v) >> 24); } +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) +#define SIPROUND \ + { v0 += v1; v1 = ROTL(v1, 13); \ + v1 ^= v0; v0 = ROTL(v0, 32); \ + v2 += v3; v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; v1 = ROTL(v1, 17); \ + v1 ^= v2; v2 = ROTL(v2, 32); } + uint64_t k0 = U8TO64_LE((uint8_t*)&seed0); + uint64_t k1 = U8TO64_LE((uint8_t*)&seed1); + uint64_t v3 = UINT64_C(0x7465646279746573) ^ k1; + uint64_t v2 = UINT64_C(0x6c7967656e657261) ^ k0; + uint64_t v1 = UINT64_C(0x646f72616e646f6d) ^ k1; + uint64_t v0 = UINT64_C(0x736f6d6570736575) ^ k0; + const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t)); + for (; in != end; in += 8) { + uint64_t m = U8TO64_LE(in); + v3 ^= m; + SIPROUND; SIPROUND; + v0 ^= m; + } + const int left = inlen & 7; + uint64_t b = ((uint64_t)inlen) << 56; + switch (left) { + case 7: b |= ((uint64_t)in[6]) << 48; + case 6: b |= ((uint64_t)in[5]) << 40; + case 5: b |= ((uint64_t)in[4]) << 32; + case 4: b |= ((uint64_t)in[3]) << 24; + case 3: b |= ((uint64_t)in[2]) << 16; + case 2: b |= ((uint64_t)in[1]) << 8; + case 1: b |= ((uint64_t)in[0]); break; + case 0: break; + } + v3 ^= b; + SIPROUND; SIPROUND; + v0 ^= b; + v2 ^= 0xff; + SIPROUND; SIPROUND; SIPROUND; SIPROUND; + b = v0 ^ v1 ^ v2 ^ v3; + uint64_t out = 0; + U64TO8_LE((uint8_t*)&out, b); + return out; +} + +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +// +// Murmur3_86_128 +//----------------------------------------------------------------------------- +static void MM86128(const void *key, const int len, uint32_t seed, void *out) { +#define ROTL32(x, r) ((x << r) | (x >> (32 - r))) +#define FMIX32(h) h^=h>>16; h*=0x85ebca6b; h^=h>>13; h*=0xc2b2ae35; h^=h>>16; + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + for (int i = -nblocks; i; i++) { + uint32_t k1 = blocks[i*4+0]; + uint32_t k2 = blocks[i*4+1]; + uint32_t k3 = blocks[i*4+2]; + uint32_t k4 = blocks[i*4+3]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + switch(len & 15) { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + FMIX32(h1); FMIX32(h2); FMIX32(h3); FMIX32(h4); + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +// hashmap_sip returns a hash value for `data` using SipHash-2-4. +uint64_t hashmap_sip(const void *data, size_t len, + uint64_t seed0, uint64_t seed1) +{ + return SIP64((uint8_t*)data, len, seed0, seed1); +} + +// hashmap_murmur returns a hash value for `data` using Murmur3_86_128. +uint64_t hashmap_murmur(const void *data, size_t len, + uint64_t seed0, uint64_t seed1) +{ + char out[16]; + MM86128(data, len, seed0, &out); + return *(uint64_t*)out; +} + +//============================================================================== +// TESTS AND BENCHMARKS +// $ cc -DHASHMAP_TEST hashmap.c && ./a.out # run tests +// $ cc -DHASHMAP_TEST -O3 hashmap.c && BENCH=1 ./a.out # run benchmarks +//============================================================================== +// #ifndef HASHMAP_TEST +// #define HASHMAP_TEST +// #endif +#ifdef HASHMAP_TEST + +static size_t deepcount(struct hashmap *map) { + size_t count = 0; + for (size_t i = 0; i < map->nbuckets; i++) { + if (bucket_at(map, i)->dib) { + count++; + } + } + return count; +} + + +#pragma GCC diagnostic ignored "-Wextra" + + +#include +#include +#include +#include +#include +#include "hashmap.h" + +static uintptr_t total_allocs = 0; +static uintptr_t total_mem = 0; + +static void *xmalloc(size_t size) { + void *mem = malloc(sizeof(uintptr_t)+size); + assert(mem); + *(uintptr_t*)mem = size; + total_allocs++; + total_mem += size; + return (char*)mem+sizeof(uintptr_t); +} + +static void xfree(void *ptr) { + if (ptr) { + total_mem -= *(uintptr_t*)((char*)ptr-sizeof(uintptr_t)); + free((char*)ptr-sizeof(uintptr_t)); + total_allocs--; + } +} + +static void shuffle(void *array, size_t numels, size_t elsize) { + char tmp[elsize]; + char *arr = array; + for (size_t i = 0; i < numels - 1; i++) { + int j = i + rand() / (RAND_MAX / (numels - i) + 1); + memcpy(tmp, arr + j * elsize, elsize); + memcpy(arr + j * elsize, arr + i * elsize, elsize); + memcpy(arr + i * elsize, tmp, elsize); + } +} + +static bool iter_ints(const void *item, void *udata) { + int *vals = *(int**)udata; + vals[*(int*)item] = 1; + return true; +} + +static int compare_ints(const void *a, const void *b) { + return *(int*)a - *(int*)b; +} + +static uint64_t hash_int(const void *item, uint64_t seed0, uint64_t seed1) { + return hashmap_murmur(item, sizeof(int), seed0, seed1); +} + +static void all() { + int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL); + int N = getenv("N")?atoi(getenv("N")):2000; + printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int)); + srand(seed); + + // test sip and murmur hashes + assert(hashmap_sip("hello", 5, 1, 2) == 2957200328589801622); + assert(hashmap_murmur("hello", 5, 1, 2) == 1682575153221130884); + + int *vals = xmalloc(N * sizeof(int)); + for (int i = 0; i < N; i++) { + vals[i] = i; + } + + struct hashmap *map = hashmap_new(sizeof(int), 0, seed, seed, + hash_int, compare_ints); + shuffle(vals, N, sizeof(int)); + for (int i = 0; i < N; i++) { + // // printf("== %d ==\n", vals[i]); + assert(map->count == i); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + int *v; + assert(!hashmap_get(map, &vals[i])); + assert(!hashmap_delete(map, &vals[i])); + assert(!hashmap_set(map, &vals[i])); + for (int j = 0; j < i; j++) { + v = hashmap_get(map, &vals[j]); + assert(v && *v == vals[j]); + } + v = hashmap_set(map, &vals[i]); + assert(v && *v == vals[i]); + v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + assert(!hashmap_get(map, &vals[i])); + assert(!hashmap_delete(map, &vals[i])); + assert(!hashmap_set(map, &vals[i])); + assert(map->count == i+1); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + } + + int *vals2 = xmalloc(N * sizeof(int)); + memset(vals2, 0, N * sizeof(int)); + assert(hashmap_scan(map, iter_ints, &vals2)); + for (int i = 0; i < N; i++) { + assert(vals2[i] == 1); + } + xfree(vals2); + + shuffle(vals, N, sizeof(int)); + for (int i = 0; i < N; i++) { + int *v; + v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + assert(!hashmap_get(map, &vals[i])); + assert(map->count == N-i-1); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + for (int j = N-1; j > i; j--) { + v = hashmap_get(map, &vals[j]); + assert(v && *v == vals[j]); + } + } + assert(map->cap == map->nbuckets); + + + + + hashmap_free(map); + + xfree(vals); + + if (total_allocs != 0) { + fprintf(stderr, "total_allocs: expected 0, got %lu\n", total_allocs); + exit(1); + } +} + +#define bench(name, N, code) {{ \ + if (strlen(name) > 0) { \ + printf("%-14s ", name); \ + } \ + size_t tmem = total_mem; \ + size_t tallocs = total_allocs; \ + uint64_t bytes = 0; \ + clock_t begin = clock(); \ + for (int i = 0; i < N; i++) { \ + (code); \ + } \ + clock_t end = clock(); \ + double elapsed_secs = (double)(end - begin) / CLOCKS_PER_SEC; \ + double bytes_sec = (double)bytes/elapsed_secs; \ + printf("%d ops in %.3f secs, %.0f ns/op, %.0f op/sec", \ + N, elapsed_secs, \ + elapsed_secs/(double)N*1e9, \ + (double)N/elapsed_secs \ + ); \ + if (bytes > 0) { \ + printf(", %.1f GB/sec", bytes_sec/1024/1024/1024); \ + } \ + if (total_mem > tmem) { \ + size_t used_mem = total_mem-tmem; \ + printf(", %.2f bytes/op", (double)used_mem/N); \ + } \ + if (total_allocs > tallocs) { \ + size_t used_allocs = total_allocs-tallocs; \ + printf(", %.2f allocs/op", (double)used_allocs/N); \ + } \ + printf("\n"); \ +}} + +static void benchmarks() { + int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL); + int N = getenv("N")?atoi(getenv("N")):5000000; + printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int)); + srand(seed); + + + int *vals = xmalloc(N * sizeof(int)); + for (int i = 0; i < N; i++) { + vals[i] = i; + } + + shuffle(vals, N, sizeof(int)); + + struct hashmap *map; + shuffle(vals, N, sizeof(int)); + + map = hashmap_new(sizeof(int), 0, seed, seed, hash_int, compare_ints); + bench("set", N, { + int *v = hashmap_set(map, &vals[i]); + assert(!v); + }) + shuffle(vals, N, sizeof(int)); + bench("get", N, { + int *v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + }) + shuffle(vals, N, sizeof(int)); + bench("delete", N, { + int *v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + }) + hashmap_free(map); + + map = hashmap_new(sizeof(int), N, seed, seed, hash_int, compare_ints); + bench("set (cap)", N, { + int *v = hashmap_set(map, &vals[i]); + assert(!v); + }) + shuffle(vals, N, sizeof(int)); + bench("get (cap)", N, { + int *v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + }) + shuffle(vals, N, sizeof(int)); + bench("delete (cap)" , N, { + int *v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + }) + + hashmap_free(map); + + + xfree(vals); + + if (total_allocs != 0) { + fprintf(stderr, "total_allocs: expected 0, got %lu\n", total_allocs); + exit(1); + } +} + +int main() { + hashmap_set_allocator(xmalloc, xfree); + + if (getenv("BENCH")) { + printf("Running hashmap.c benchmarks...\n"); + benchmarks(); + } else { + printf("Running hashmap.c tests...\n"); + all(); + printf("PASSED\n"); + } +} + + +#endif + + + diff --git a/hashmap.h b/hashmap.h new file mode 100644 index 0000000..54165db --- /dev/null +++ b/hashmap.h @@ -0,0 +1,32 @@ +// Copyright 2020 Joshua J Baker. All rights reserved. +// Use of this source code is governed by an MIT-style +// license that can be found in the LICENSE file. + +#ifndef HASHMAP_H +#define HASHMAP_H + +#include +#include +#include + +struct hashmap; + +struct hashmap *hashmap_new(size_t elsize, size_t cap, + uint64_t seed0, uint64_t seed1, + uint64_t (*hash)(const void *item, + uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b)); +void hashmap_free(struct hashmap *map); +size_t hashmap_count(struct hashmap *map); +void *hashmap_get(struct hashmap *map, void *item); +void *hashmap_set(struct hashmap *map, void *item); +void *hashmap_delete(struct hashmap *map, void *item); +bool hashmap_scan(struct hashmap *map, + bool (*iter)(const void *item, void *udata), void *udata); +void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)); +uint64_t hashmap_sip(const void *data, size_t len, + uint64_t seed0, uint64_t seed1); +uint64_t hashmap_murmur(const void *data, size_t len, + uint64_t seed0, uint64_t seed1); + +#endif