perfecthash

perfect hash code generators
git clone https://noulin.net/git/perfecthash.git
Log | Files | Refs | README | LICENSE

commit 285d3f5cd6a21c454613b64761df3c8a30d642ca
parent c4c539955dc732ed432bd36c1348d80f826ebd93
Author: Remy Noulin <loader2x@gmail.com>
Date:   Sun,  3 Feb 2019 15:57:20 +0100

add genHanov generator

README.md   |  39 +++++++-
genHanov.c  | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
package.yml |  32 +++++++
3 files changed, 365 insertions(+), 1 deletion(-)

Diffstat:
MREADME.md | 39++++++++++++++++++++++++++++++++++++++-
AgenHanov.c | 295+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apackage.yml | 32++++++++++++++++++++++++++++++++
3 files changed, 365 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md @@ -1,2 +1,39 @@ +# Sheepy +This is a sheepy package for [sheepy](https://github.com/RemyNoulin/sheepy) and using [libsheepy](https://github.com/RemyNoulin/libsheepy) + # perfecthash -perfect hash code generators + +[What is a perfect hash function?](https://en.wikipedia.org/wiki/Perfect_hash_function) + +`genHanov` generates a perfect hash function C source code given a list of keys in a text file. +`genHanov` expects one key per line. + +genHanov is inspired by the Steve Hanov implementation of the Dr Daoud perfect hash algorithm. + +# Usage + +Install with spm: `spm -g install perfecthash` + +Generate a perfect hash function: + +``` +genHanov keys.txt +``` + +The generated code depends on the crc package, to install it: +`spm install crc` + +Just running `genHanov` prints the help. + +To configure `genHanov`, create a `hanovConfig.yml` file with this content: + +```yml +--- + type: hanovt + perfectHashData: perfectHash + keysArray: keys + funcScope: static inline + hashFunc: hanovHash + lookupFunc: hanovLookup + findFunc: hanovFind +``` diff --git a/genHanov.c b/genHanov.c @@ -0,0 +1,295 @@ +#! /usr/bin/env sheepy +/* or direct path to sheepy: #! /usr/local/bin/sheepy */ + +/** + * simple C implementation of hanov's perfect hash: http://stevehanov.ca/blog/index.php?id=119 + * Minimal Perfect Hashing - Dr. Amjad M Daoud - http://iswsa.acm.org/mphf/index.html + */ + +/* Libsheepy documentation: http://spartatek.se/libsheepy/ */ +#include "libsheepyObject.h" +#include "shpPackages/crc/crc.h" + +/* enable/disable logging */ +/* #undef pLog */ +/* #define pLog(...) */ + +const char *type = "hanovt"; +const char *perfectHashData = "perfectHash"; +const char *keysArray = "keys"; +const char *funcScope = "static inline"; +const char *hashFunc = "hanovHash"; +const char *lookupFunc = "hanovLookup"; +const char *findFunc = "hanovFind"; + +#define CFG_FILENAME "hanovConfig.yml" + +// column count in the perfectHashData and keysArray arrays +#define colCount 4 + +u32 hash(u32 d, const char *str) { + if (!d) d = 0x811c9dc5; + ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff; +} + +typ struct {smallArrayt *G; u32 size;} hanovt; + +hanovt create(const char **keys) { + var size = lenG(keys); + createSmallArray(buckets); + loop(size) {pushG(&buckets, NULL);} + hanovt res; + res.G = allocG(rtSmallArrayt); + loop(size) {pushG(res.G, NULL);} + res.size = size; + + // Place all of the keys into buckets + cast(char**, ks, keys); + forEachS(ks, key) { + var bkey = hash(0, key) % size; + var bucket = getG(&buckets, rtSmallArrayt, bkey); + if (!bucket) { + bucket = allocG(bucket); + } + pushG(bucket, key); + setNFreePG(&buckets, bkey, bucket); + } + + int bucketCmp(const void *a, const void *b) { + smallArrayt *A = (smallArrayt*)a; + smallArrayt *B = (smallArrayt*)b; + size_t la,lb; + if (!A or !isOSmallArray(A)) la = 0; else la = lenG(A); + if (!B or !isOSmallArray(B)) lb = 0; else lb = lenG(B); + ret lb - la; + } + + sortFG(&buckets, bucketCmp); + + staticBitsetT(usedSlotst, u64 , size); + usedSlotst usedSlots; + staticBitsetClear(&usedSlots); + + size_t b; + for(b = 0; b < size; b++) { + var bucket = getG(&buckets, rtSmallArrayt, b); + if (!bucket) continue; + + if(lenG(bucket) <= 1) { finishG(bucket); break;} + + u32 d = 1; size_t item = 0; u32 slots[size]; u32 slot; bool used[size]; + indexer slotsi; + indexerInit(slotsi, size); + range(i, size) used[i] = false; + + // Repeatedly try different values of d until we find a hash function + // that places all items in the bucket into free slots + while(item < lenG(bucket)) { + slot = hash(d, getG(bucket, rtChar, item)) % size; + // need hasObjectG + if(staticBitsetGet(&usedSlots, slot) || used[slot]) { + d++; + item = 0; + indexerInit(slotsi, size); + range(i, size) used[i] = false; + } else { + used[slot] = true; + indexerPush(slotsi); + slots[slotsi.last] = slot; + item++; + } + } + + setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, d); + range(i, lenG(bucket)) { + staticBitset1(&usedSlots, slots[i]); + } + setNFreePG(&buckets, b, bucket); + } + + // Only buckets with 1 item remain. Process them more quickly by directly + // placing them into a free slot. Use a negative value of d to indicate + // this. + + createSmallArray(freelist); + range(i, size) { + if (!staticBitsetGet(&usedSlots, i)) { + pushG(&freelist, i); + } + } + + for(; b < size; b++ ) { + var bucket = getG(&buckets, rtSmallArrayt, b); + if (!bucket) break; + var slot = popG(&freelist, rtU32); + + // We subtract one to ensure it's negative even if the zeroeth slot was used. + setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, 0-slot-1); + setNFreePG(&buckets, b, bucket); + } + + freeG(&freelist); + freeG(&buckets); + + ret res; +} + +u32 lookup(hanovt *GV, const char *key) { + var d = getG(GV->G, rtI32, hash(0, key) % lenG(GV->G)); + if (d < 0) ret 0-d-1; + ret hash(d, key) % GV->size; +} + +void freeHanov(hanovt *h) { + terminateG(h->G); +} + +int main(int ARGC, char** ARGV) { + + initLibsheepy(ARGV[0]); + setLogMode(LOG_CONCISE); + setLogSymbols(LOG_UTF8); + + char **vector = NULL; + + if (ARGC < 2) { + logE("key list missing"); + logP("\n"BLD GRN"Usage:"RST); + puts(BLD "genHanov keyFileList\n\n" RST + "Optional configuration file: "CFG_FILENAME"\n" + "(located in directory where genHanov runs)\n\n" + "---\n" + " type: hanovt\n" + " perfectHashData: perfectHash\n" + " keysArray: keys\n" + " funcScope: static inline\n" + " hashFunc: hanovHash\n" + " lookupFunc: hanovLookup\n" + " findFunc: hanovFind" + ); + XFAILURE; + } + + vector = readFileG(vector, ARGV[1]); + + if (!vector) { + logE("Couldn't key list file "BLD RED"'%s'"RST, ARGV[1]); + XFAILURE; + } + + var han = create((const char**)vector); + + // load configuration + createSmallJson(cfg); + if (fileExists(CFG_FILENAME)) { + readFileG(&cfg, CFG_FILENAME); + type = (const char *) getG(&cfg, rtChar, "type"); + perfectHashData = (const char *) getG(&cfg, rtChar, "perfectHashData"); + keysArray = (const char *) getG(&cfg, rtChar, "keysArray"); + funcScope = (const char *) getG(&cfg, rtChar, "funcScope"); + hashFunc = (const char *) getG(&cfg, rtChar, "hashFunc"); + lookupFunc = (const char *) getG(&cfg, rtChar, "lookupFunc"); + findFunc = (const char *) getG(&cfg, rtChar, "findFunc"); + } + + // print minimal perfect hash function C code + + char *dataDecl = formatS("const %s %s = {{", type, perfectHashData); + var ts = getCurrentDateYMD(); + + printf("// generated by genHanov.c from perfecthash sheepy package at %s\n\n" + "#include "_"libsheepyObject.h"_"\n" + "#include "_"shpPackages/crc/crc.h"_"\n\n" + "typ struct { u32 G[%zu]; u32 size;} %s;\n\n" + "%s", + ts, + lenG(han.G), type, + dataDecl); + free(ts); + + size_t dataDeclLen = lenG(dataDecl) -1; + free(dataDecl); + + char *comma[] = {"", ","}; + + indexer cols; + indexerInit(cols, colCount); + + iter(han.G, D) { + cast(smallIntt*,d,D); + if (isOSmallInt(D)) { + printf("%s%u", comma[iterIndexG(han.G) ? 1 : 0], getValG(d)); + } + else printf("%s0", comma[iterIndexG(han.G) ? 1 : 0]); + indexerPush(cols); + if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' '); + } + + printf("}, %u};\n\n", han.size); + + staticBitsetT(usedSlotst, u64 , lenG(vector)); + usedSlotst usedSlots; + staticBitsetClear(&usedSlots); + + char *keys[han.size]; + + forEachS(vector, key) { + var o = lookup(&han, key); + if (staticBitsetGet(&usedSlots, o)) { + logE("Collision: hash %u", o); + XFAILURE; + } + keys[o] = key; + /* logVarG(key); */ + /* logVarG(o); */ + staticBitset1(&usedSlots, o); + } + + char *keysDecl = formatS("const char *%s[%u] = {", keysArray, han.size); + printf("%s", keysDecl); + + dataDeclLen = lenG(keysDecl) -1; + free(keysDecl); + + indexerInit(cols, colCount); + + range(i, han.size) { + printf("%s"_"%s"_, comma[i ? 1 : 0], keys[i]); + indexerPush(cols); + if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' '); + } + + printf("};\n\n"); + + printf("%s u32 %s(u32 d, const char *str) {\n" + " if (!d) d = 0x811c9dc5;\n" + " ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;\n" + "}\n\n" + "%s u32 %s(const %s *GV, const char *key) {\n" + " var d = (i32)GV->G[%s(0, key) %% ARRAY_SIZE(GV->G)];\n" + " if (d < 0) ret 0-d-1;\n" + " ret %s(d, key) %% GV->size;\n" + "}\n\n" + "// lookup and return -1 when key is not found\n" + "%s ssize_t %s(const %s *GV, const char *key) {\n" + " var idx = %s(GV, key);\n\n" + " if (eqG(key, %s[idx])) ret idx;\n" + " else ret -1; // key is not found\n" + "}\n\n" + "// Loopup: u32 idx = %s(&%s, "_"key"_");\n" + "// Find key: u32 idx = %s(&%s, "_"key"_");", + funcScope, hashFunc, + funcScope, lookupFunc, type, + hashFunc, + hashFunc, + funcScope, findFunc, type, + lookupFunc, + keysArray, + lookupFunc, perfectHashData, + findFunc, perfectHashData); + + freeG(&cfg); + freeG(vector); + freeHanov(&han); +} +// vim: set expandtab ts=2 sw=2: diff --git a/package.yml b/package.yml @@ -0,0 +1,32 @@ +--- + name: perfecthash + version: 0.0.1 + description: "perfect hash code generators: hanov, daoud" + bin: ./genHanov.c + #cflags: -DA -ggdb -std=gnu11 -fPIC -pipe + #lflags: -lpcre + repository: + type: git + url: git+https://github.com/RemyNoulin/perfecthash.git + keywords: + - utility + - command + author: Remy + license: MIT + bugs: + url: https://github.com/RemyNoulin/perfecthash/issues + homepage: https://github.com/RemyNoulin/perfecthash#readme + #compileHelp: # text displayed when there is a compilation error + dependencies: + crc: + # Test configuration: + #testBin: ./testPerfecthash.c + #testCflags: -ggdb -std=gnu11 -fPIC -pipe -fprofile-arcs -ftest-coverage -Wall -Wextra + #testLflags: -lcheck_pic -lrt -lm -lsubunit -fprofile-arcs -ftest-coverage -rdynamic + # Memcheck configuration: + #memcheckBin: ./memcheckPerfecthash.c + #memcheckCmd: valgrind --leak-check=full --show-leak-kinds=all + #memcheckCflags: -ggdb -std=gnu11 -fPIC -pipe + #memcheckLflags: -rdynamic + #documentationCmd: # command for generating the documentation with spm doc + private: false # true for private package