commit 285d3f5cd6a21c454613b64761df3c8a30d642ca
parent c4c539955dc732ed432bd36c1348d80f826ebd93
Author: Remy Noulin <loader2x@gmail.com>
Date: Sun, 3 Feb 2019 15:57:20 +0100
add genHanov generator
README.md | 39 +++++++-
genHanov.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
package.yml | 32 +++++++
3 files changed, 365 insertions(+), 1 deletion(-)
Diffstat:
| M | README.md | | | 39 | ++++++++++++++++++++++++++++++++++++++- |
| A | genHanov.c | | | 295 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | package.yml | | | 32 | ++++++++++++++++++++++++++++++++ |
3 files changed, 365 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
@@ -1,2 +1,39 @@
+# Sheepy
+This is a sheepy package for [sheepy](https://github.com/RemyNoulin/sheepy) and using [libsheepy](https://github.com/RemyNoulin/libsheepy)
+
# perfecthash
-perfect hash code generators
+
+[What is a perfect hash function?](https://en.wikipedia.org/wiki/Perfect_hash_function)
+
+`genHanov` generates a perfect hash function C source code given a list of keys in a text file.
+`genHanov` expects one key per line.
+
+genHanov is inspired by the Steve Hanov implementation of the Dr Daoud perfect hash algorithm.
+
+# Usage
+
+Install with spm: `spm -g install perfecthash`
+
+Generate a perfect hash function:
+
+```
+genHanov keys.txt
+```
+
+The generated code depends on the crc package, to install it:
+`spm install crc`
+
+Just running `genHanov` prints the help.
+
+To configure `genHanov`, create a `hanovConfig.yml` file with this content:
+
+```yml
+---
+ type: hanovt
+ perfectHashData: perfectHash
+ keysArray: keys
+ funcScope: static inline
+ hashFunc: hanovHash
+ lookupFunc: hanovLookup
+ findFunc: hanovFind
+```
diff --git a/genHanov.c b/genHanov.c
@@ -0,0 +1,295 @@
+#! /usr/bin/env sheepy
+/* or direct path to sheepy: #! /usr/local/bin/sheepy */
+
+/**
+ * simple C implementation of hanov's perfect hash: http://stevehanov.ca/blog/index.php?id=119
+ * Minimal Perfect Hashing - Dr. Amjad M Daoud - http://iswsa.acm.org/mphf/index.html
+ */
+
+/* Libsheepy documentation: http://spartatek.se/libsheepy/ */
+#include "libsheepyObject.h"
+#include "shpPackages/crc/crc.h"
+
+/* enable/disable logging */
+/* #undef pLog */
+/* #define pLog(...) */
+
+const char *type = "hanovt";
+const char *perfectHashData = "perfectHash";
+const char *keysArray = "keys";
+const char *funcScope = "static inline";
+const char *hashFunc = "hanovHash";
+const char *lookupFunc = "hanovLookup";
+const char *findFunc = "hanovFind";
+
+#define CFG_FILENAME "hanovConfig.yml"
+
+// column count in the perfectHashData and keysArray arrays
+#define colCount 4
+
+u32 hash(u32 d, const char *str) {
+ if (!d) d = 0x811c9dc5;
+ ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;
+}
+
+typ struct {smallArrayt *G; u32 size;} hanovt;
+
+hanovt create(const char **keys) {
+ var size = lenG(keys);
+ createSmallArray(buckets);
+ loop(size) {pushG(&buckets, NULL);}
+ hanovt res;
+ res.G = allocG(rtSmallArrayt);
+ loop(size) {pushG(res.G, NULL);}
+ res.size = size;
+
+ // Place all of the keys into buckets
+ cast(char**, ks, keys);
+ forEachS(ks, key) {
+ var bkey = hash(0, key) % size;
+ var bucket = getG(&buckets, rtSmallArrayt, bkey);
+ if (!bucket) {
+ bucket = allocG(bucket);
+ }
+ pushG(bucket, key);
+ setNFreePG(&buckets, bkey, bucket);
+ }
+
+ int bucketCmp(const void *a, const void *b) {
+ smallArrayt *A = (smallArrayt*)a;
+ smallArrayt *B = (smallArrayt*)b;
+ size_t la,lb;
+ if (!A or !isOSmallArray(A)) la = 0; else la = lenG(A);
+ if (!B or !isOSmallArray(B)) lb = 0; else lb = lenG(B);
+ ret lb - la;
+ }
+
+ sortFG(&buckets, bucketCmp);
+
+ staticBitsetT(usedSlotst, u64 , size);
+ usedSlotst usedSlots;
+ staticBitsetClear(&usedSlots);
+
+ size_t b;
+ for(b = 0; b < size; b++) {
+ var bucket = getG(&buckets, rtSmallArrayt, b);
+ if (!bucket) continue;
+
+ if(lenG(bucket) <= 1) { finishG(bucket); break;}
+
+ u32 d = 1; size_t item = 0; u32 slots[size]; u32 slot; bool used[size];
+ indexer slotsi;
+ indexerInit(slotsi, size);
+ range(i, size) used[i] = false;
+
+ // Repeatedly try different values of d until we find a hash function
+ // that places all items in the bucket into free slots
+ while(item < lenG(bucket)) {
+ slot = hash(d, getG(bucket, rtChar, item)) % size;
+ // need hasObjectG
+ if(staticBitsetGet(&usedSlots, slot) || used[slot]) {
+ d++;
+ item = 0;
+ indexerInit(slotsi, size);
+ range(i, size) used[i] = false;
+ } else {
+ used[slot] = true;
+ indexerPush(slotsi);
+ slots[slotsi.last] = slot;
+ item++;
+ }
+ }
+
+ setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, d);
+ range(i, lenG(bucket)) {
+ staticBitset1(&usedSlots, slots[i]);
+ }
+ setNFreePG(&buckets, b, bucket);
+ }
+
+ // Only buckets with 1 item remain. Process them more quickly by directly
+ // placing them into a free slot. Use a negative value of d to indicate
+ // this.
+
+ createSmallArray(freelist);
+ range(i, size) {
+ if (!staticBitsetGet(&usedSlots, i)) {
+ pushG(&freelist, i);
+ }
+ }
+
+ for(; b < size; b++ ) {
+ var bucket = getG(&buckets, rtSmallArrayt, b);
+ if (!bucket) break;
+ var slot = popG(&freelist, rtU32);
+
+ // We subtract one to ensure it's negative even if the zeroeth slot was used.
+ setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, 0-slot-1);
+ setNFreePG(&buckets, b, bucket);
+ }
+
+ freeG(&freelist);
+ freeG(&buckets);
+
+ ret res;
+}
+
+u32 lookup(hanovt *GV, const char *key) {
+ var d = getG(GV->G, rtI32, hash(0, key) % lenG(GV->G));
+ if (d < 0) ret 0-d-1;
+ ret hash(d, key) % GV->size;
+}
+
+void freeHanov(hanovt *h) {
+ terminateG(h->G);
+}
+
+int main(int ARGC, char** ARGV) {
+
+ initLibsheepy(ARGV[0]);
+ setLogMode(LOG_CONCISE);
+ setLogSymbols(LOG_UTF8);
+
+ char **vector = NULL;
+
+ if (ARGC < 2) {
+ logE("key list missing");
+ logP("\n"BLD GRN"Usage:"RST);
+ puts(BLD "genHanov keyFileList\n\n" RST
+ "Optional configuration file: "CFG_FILENAME"\n"
+ "(located in directory where genHanov runs)\n\n"
+ "---\n"
+ " type: hanovt\n"
+ " perfectHashData: perfectHash\n"
+ " keysArray: keys\n"
+ " funcScope: static inline\n"
+ " hashFunc: hanovHash\n"
+ " lookupFunc: hanovLookup\n"
+ " findFunc: hanovFind"
+ );
+ XFAILURE;
+ }
+
+ vector = readFileG(vector, ARGV[1]);
+
+ if (!vector) {
+ logE("Couldn't key list file "BLD RED"'%s'"RST, ARGV[1]);
+ XFAILURE;
+ }
+
+ var han = create((const char**)vector);
+
+ // load configuration
+ createSmallJson(cfg);
+ if (fileExists(CFG_FILENAME)) {
+ readFileG(&cfg, CFG_FILENAME);
+ type = (const char *) getG(&cfg, rtChar, "type");
+ perfectHashData = (const char *) getG(&cfg, rtChar, "perfectHashData");
+ keysArray = (const char *) getG(&cfg, rtChar, "keysArray");
+ funcScope = (const char *) getG(&cfg, rtChar, "funcScope");
+ hashFunc = (const char *) getG(&cfg, rtChar, "hashFunc");
+ lookupFunc = (const char *) getG(&cfg, rtChar, "lookupFunc");
+ findFunc = (const char *) getG(&cfg, rtChar, "findFunc");
+ }
+
+ // print minimal perfect hash function C code
+
+ char *dataDecl = formatS("const %s %s = {{", type, perfectHashData);
+ var ts = getCurrentDateYMD();
+
+ printf("// generated by genHanov.c from perfecthash sheepy package at %s\n\n"
+ "#include "_"libsheepyObject.h"_"\n"
+ "#include "_"shpPackages/crc/crc.h"_"\n\n"
+ "typ struct { u32 G[%zu]; u32 size;} %s;\n\n"
+ "%s",
+ ts,
+ lenG(han.G), type,
+ dataDecl);
+ free(ts);
+
+ size_t dataDeclLen = lenG(dataDecl) -1;
+ free(dataDecl);
+
+ char *comma[] = {"", ","};
+
+ indexer cols;
+ indexerInit(cols, colCount);
+
+ iter(han.G, D) {
+ cast(smallIntt*,d,D);
+ if (isOSmallInt(D)) {
+ printf("%s%u", comma[iterIndexG(han.G) ? 1 : 0], getValG(d));
+ }
+ else printf("%s0", comma[iterIndexG(han.G) ? 1 : 0]);
+ indexerPush(cols);
+ if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
+ }
+
+ printf("}, %u};\n\n", han.size);
+
+ staticBitsetT(usedSlotst, u64 , lenG(vector));
+ usedSlotst usedSlots;
+ staticBitsetClear(&usedSlots);
+
+ char *keys[han.size];
+
+ forEachS(vector, key) {
+ var o = lookup(&han, key);
+ if (staticBitsetGet(&usedSlots, o)) {
+ logE("Collision: hash %u", o);
+ XFAILURE;
+ }
+ keys[o] = key;
+ /* logVarG(key); */
+ /* logVarG(o); */
+ staticBitset1(&usedSlots, o);
+ }
+
+ char *keysDecl = formatS("const char *%s[%u] = {", keysArray, han.size);
+ printf("%s", keysDecl);
+
+ dataDeclLen = lenG(keysDecl) -1;
+ free(keysDecl);
+
+ indexerInit(cols, colCount);
+
+ range(i, han.size) {
+ printf("%s"_"%s"_, comma[i ? 1 : 0], keys[i]);
+ indexerPush(cols);
+ if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
+ }
+
+ printf("};\n\n");
+
+ printf("%s u32 %s(u32 d, const char *str) {\n"
+ " if (!d) d = 0x811c9dc5;\n"
+ " ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;\n"
+ "}\n\n"
+ "%s u32 %s(const %s *GV, const char *key) {\n"
+ " var d = (i32)GV->G[%s(0, key) %% ARRAY_SIZE(GV->G)];\n"
+ " if (d < 0) ret 0-d-1;\n"
+ " ret %s(d, key) %% GV->size;\n"
+ "}\n\n"
+ "// lookup and return -1 when key is not found\n"
+ "%s ssize_t %s(const %s *GV, const char *key) {\n"
+ " var idx = %s(GV, key);\n\n"
+ " if (eqG(key, %s[idx])) ret idx;\n"
+ " else ret -1; // key is not found\n"
+ "}\n\n"
+ "// Loopup: u32 idx = %s(&%s, "_"key"_");\n"
+ "// Find key: u32 idx = %s(&%s, "_"key"_");",
+ funcScope, hashFunc,
+ funcScope, lookupFunc, type,
+ hashFunc,
+ hashFunc,
+ funcScope, findFunc, type,
+ lookupFunc,
+ keysArray,
+ lookupFunc, perfectHashData,
+ findFunc, perfectHashData);
+
+ freeG(&cfg);
+ freeG(vector);
+ freeHanov(&han);
+}
+// vim: set expandtab ts=2 sw=2:
diff --git a/package.yml b/package.yml
@@ -0,0 +1,32 @@
+---
+ name: perfecthash
+ version: 0.0.1
+ description: "perfect hash code generators: hanov, daoud"
+ bin: ./genHanov.c
+ #cflags: -DA -ggdb -std=gnu11 -fPIC -pipe
+ #lflags: -lpcre
+ repository:
+ type: git
+ url: git+https://github.com/RemyNoulin/perfecthash.git
+ keywords:
+ - utility
+ - command
+ author: Remy
+ license: MIT
+ bugs:
+ url: https://github.com/RemyNoulin/perfecthash/issues
+ homepage: https://github.com/RemyNoulin/perfecthash#readme
+ #compileHelp: # text displayed when there is a compilation error
+ dependencies:
+ crc:
+ # Test configuration:
+ #testBin: ./testPerfecthash.c
+ #testCflags: -ggdb -std=gnu11 -fPIC -pipe -fprofile-arcs -ftest-coverage -Wall -Wextra
+ #testLflags: -lcheck_pic -lrt -lm -lsubunit -fprofile-arcs -ftest-coverage -rdynamic
+ # Memcheck configuration:
+ #memcheckBin: ./memcheckPerfecthash.c
+ #memcheckCmd: valgrind --leak-check=full --show-leak-kinds=all
+ #memcheckCflags: -ggdb -std=gnu11 -fPIC -pipe
+ #memcheckLflags: -rdynamic
+ #documentationCmd: # command for generating the documentation with spm doc
+ private: false # true for private package