perfecthash

perfect hash code generators
git clone https://noulin.net/git/perfecthash.git
Log | Files | Refs | README | LICENSE

genHanov.c (8309B)


      1 #! /usr/bin/env sheepy
      2 /* or direct path to sheepy: #! /usr/local/bin/sheepy */
      3 
      4 /**
      5  * simple C implementation of hanov's perfect hash: http://stevehanov.ca/blog/index.php?id=119
      6  * Minimal Perfect Hashing - Dr. Amjad M Daoud - http://iswsa.acm.org/mphf/index.html
      7  */
      8 
      9 /* Libsheepy documentation: http://spartatek.se/libsheepy/ */
     10 #include "libsheepyObject.h"
     11 #include "shpPackages/crc/crc.h"
     12 
     13 /* enable/disable logging */
     14 /* #undef pLog */
     15 /* #define pLog(...) */
     16 
     17 const char *type            = "hanovt";
     18 const char *perfectHashData = "perfectHash";
     19 const char *keysArray       = "keys";
     20 const char *funcScope       = "static inline";
     21 const char *hashFunc        = "hanovHash";
     22 const char *lookupFunc      = "hanovLookup";
     23 const char *findFunc        = "hanovFind";
     24 
     25 #define CFG_FILENAME "hanovConfig.yml"
     26 
     27 // column count in the perfectHashData and keysArray arrays
     28 #define colCount 4
     29 
     30 u32 hash(u32 d, const char *str) {
     31   if (!d) d = 0x811c9dc5;
     32   ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;
     33 }
     34 
     35 typ struct {smallArrayt *G; u32 size;} hanovt;
     36 
     37 hanovt create(const char **keys) {
     38   var size = lenG(keys);
     39   createSmallArray(buckets);
     40   loop(size) {pushG(&buckets, NULL);}
     41   hanovt res;
     42   res.G = allocG(rtSmallArrayt);
     43   loop(size) {pushG(res.G, NULL);}
     44   res.size = size;
     45 
     46   // Place all of the keys into buckets
     47   cast(char**, ks, keys);
     48   forEachS(ks, key) {
     49     var bkey   = hash(0, key) % size;
     50     var bucket = getG(&buckets, rtSmallArrayt, bkey);
     51     if (!bucket) {
     52       bucket = allocG(bucket);
     53     }
     54     pushG(bucket, key);
     55     setNFreePG(&buckets, bkey, bucket);
     56   }
     57 
     58   int bucketCmp(const void *a, const void *b) {
     59     smallArrayt *A = (smallArrayt*)a;
     60     smallArrayt *B = (smallArrayt*)b;
     61     size_t la,lb;
     62     if (!A or !isOSmallArray(A)) la = 0; else la = lenG(A);
     63     if (!B or !isOSmallArray(B)) lb = 0; else lb = lenG(B);
     64     ret lb - la;
     65   }
     66 
     67   sortFG(&buckets, bucketCmp);
     68 
     69   staticBitsetT(usedSlotst, u64 , size);
     70   usedSlotst usedSlots;
     71   staticBitsetClear(&usedSlots);
     72 
     73   size_t b;
     74   for(b = 0; b < size; b++) {
     75     var bucket = getG(&buckets, rtSmallArrayt, b);
     76     if (!bucket) continue;
     77 
     78     if(lenG(bucket) <= 1) { finishG(bucket); break;}
     79 
     80     u32 d = 1; size_t item = 0; u32 slots[size]; u32 slot; bool used[size];
     81     indexer slotsi;
     82     indexerInit(slotsi, size);
     83     range(i, size) used[i] = false;
     84 
     85     // Repeatedly try different values of d until we find a hash function
     86     // that places all items in the bucket into free slots
     87     while(item < lenG(bucket)) {
     88       slot = hash(d, getG(bucket, rtChar, item)) % size;
     89       // need hasObjectG
     90       if(staticBitsetGet(&usedSlots, slot) || used[slot]) {
     91         d++;
     92         item = 0;
     93         indexerInit(slotsi, size);
     94         range(i, size) used[i] = false;
     95       } else {
     96         used[slot] = true;
     97         indexerPush(slotsi);
     98         slots[slotsi.last] = slot;
     99         item++;
    100       }
    101     }
    102 
    103     setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, d);
    104     range(i, lenG(bucket)) {
    105       staticBitset1(&usedSlots, slots[i]);
    106     }
    107     setNFreePG(&buckets, b, bucket);
    108   }
    109 
    110   // Only buckets with 1 item remain. Process them more quickly by directly
    111   // placing them into a free slot. Use a negative value of d to indicate
    112   // this.
    113 
    114   createSmallArray(freelist);
    115   range(i, size) {
    116     if (!staticBitsetGet(&usedSlots, i)) {
    117       pushG(&freelist, i);
    118     }
    119   }
    120 
    121   for(; b < size; b++ ) {
    122     var bucket = getG(&buckets, rtSmallArrayt, b);
    123     if (!bucket) break;
    124     var slot = popG(&freelist, rtU32);
    125 
    126     // We subtract one to ensure it's negative even if the zeroeth slot was used.
    127     setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, 0-slot-1);
    128     setNFreePG(&buckets, b, bucket);
    129   }
    130 
    131   freeG(&freelist);
    132   freeG(&buckets);
    133 
    134   ret res;
    135 }
    136 
    137 u32 lookup(hanovt *GV, const char *key) {
    138   var d = getG(GV->G, rtI32, hash(0, key) % lenG(GV->G));
    139   if (d < 0) ret 0-d-1;
    140   ret hash(d, key) % GV->size;
    141 }
    142 
    143 void freeHanov(hanovt *h) {
    144   terminateG(h->G);
    145 }
    146 
    147 int main(int ARGC, char** ARGV) {
    148 
    149   initLibsheepy(ARGV[0]);
    150   setLogMode(LOG_CONCISE);
    151   setLogSymbols(LOG_UTF8);
    152 
    153   char **vector = NULL;
    154 
    155   if (ARGC < 2) {
    156     logE("key list missing");
    157     logP("\n"BLD GRN"Usage:"RST);
    158     puts(BLD "genHanov keyFileList\n\n" RST
    159          "Optional configuration file: "CFG_FILENAME"\n"
    160          "(located in directory where genHanov runs)\n\n"
    161          "---\n"
    162          "  type: hanovt\n"
    163          "  perfectHashData: perfectHash\n"
    164          "  keysArray: keys\n"
    165          "  funcScope: static inline\n"
    166          "  hashFunc: hanovHash\n"
    167          "  lookupFunc: hanovLookup\n"
    168          "  findFunc:  hanovFind"
    169          );
    170     XFAILURE;
    171   }
    172 
    173   vector = readFileG(&vector, ARGV[1]);
    174 
    175   if (!vector) {
    176     logE("Couldn't key list file "BLD RED"'%s'"RST, ARGV[1]);
    177     XFAILURE;
    178   }
    179 
    180   var han = create((const char**)vector);
    181 
    182   // load configuration
    183   createSmallJson(cfg);
    184   if (fileExists(CFG_FILENAME)) {
    185     readFileG(&cfg, CFG_FILENAME);
    186     type            = (const char *) getG(&cfg, rtChar, "type");
    187     perfectHashData = (const char *) getG(&cfg, rtChar, "perfectHashData");
    188     keysArray       = (const char *) getG(&cfg, rtChar, "keysArray");
    189     funcScope       = (const char *) getG(&cfg, rtChar, "funcScope");
    190     hashFunc        = (const char *) getG(&cfg, rtChar, "hashFunc");
    191     lookupFunc      = (const char *) getG(&cfg, rtChar, "lookupFunc");
    192     findFunc        = (const char *) getG(&cfg, rtChar, "findFunc");
    193   }
    194 
    195   // print minimal perfect hash function C code
    196 
    197   char *dataDecl = formatS("const %s %s = {{", type, perfectHashData);
    198   var ts = getCurrentDateYMD();
    199 
    200   printf("// generated by genHanov.c from perfecthash sheepy package at %s\n\n"
    201          "#include "_"libsheepyObject.h"_"\n"
    202          "#include "_"shpPackages/crc/crc.h"_"\n\n"
    203          "typ struct { u32 G[%zu]; u32 size;} %s;\n\n"
    204          "%s",
    205          ts,
    206          lenG(han.G), type,
    207          dataDecl);
    208   free(ts);
    209 
    210   size_t dataDeclLen = lenG(dataDecl) -1;
    211   free(dataDecl);
    212 
    213   char *comma[] = {"", ","};
    214 
    215   indexer cols;
    216   indexerInit(cols, colCount);
    217 
    218   iter(han.G, D) {
    219     cast(smallIntt*,d,D);
    220     if (isOSmallInt(D)) {
    221       printf("%s%u", comma[iterIndexG(han.G) ? 1 : 0], getValG(d));
    222     }
    223     else printf("%s0", comma[iterIndexG(han.G) ? 1 : 0]);
    224     indexerPush(cols);
    225     if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
    226   }
    227 
    228   printf("}, %u};\n\n", han.size);
    229 
    230   staticBitsetT(usedSlotst, u64 , lenG(vector));
    231   usedSlotst usedSlots;
    232   staticBitsetClear(&usedSlots);
    233 
    234   char *keys[han.size];
    235 
    236   forEachS(vector, key) {
    237     var o = lookup(&han, key);
    238     if (staticBitsetGet(&usedSlots, o)) {
    239       logE("Collision: hash %u", o);
    240       XFAILURE;
    241     }
    242     keys[o] = key;
    243     /* logVarG(key); */
    244     /* logVarG(o); */
    245     staticBitset1(&usedSlots, o);
    246   }
    247 
    248   char *keysDecl = formatS("const char *%s[%u] = {", keysArray, han.size);
    249   printf("%s", keysDecl);
    250 
    251   dataDeclLen = lenG(keysDecl) -1;
    252   free(keysDecl);
    253 
    254   indexerInit(cols, colCount);
    255 
    256   range(i, han.size) {
    257     printf("%s"_"%s"_, comma[i ? 1 : 0], keys[i]);
    258     indexerPush(cols);
    259     if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
    260   }
    261 
    262   printf("};\n\n");
    263 
    264   printf("%s u32 %s(u32 d, const char *str) {\n"
    265          "  if (!d) d = 0x811c9dc5;\n"
    266          "  ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;\n"
    267          "}\n\n"
    268          "%s u32 %s(const %s *GV, const char *key) {\n"
    269          "  var d = (i32)GV->G[%s(0, key) %% ARRAY_SIZE(GV->G)];\n"
    270          "  if (d < 0) ret 0-d-1;\n"
    271          "  ret %s(d, key) %% GV->size;\n"
    272          "}\n\n"
    273          "// lookup and return -1 when key is not found\n"
    274          "%s ssize_t %s(const %s *GV, const char *key) {\n"
    275          "  var idx = %s(GV, key);\n\n"
    276          "  if (eqG(key, %s[idx])) ret idx;\n"
    277          "  else ret -1; // key is not found\n"
    278          "}\n\n"
    279          "// Loopup:   u32 idx = %s(&%s, "_"key"_");\n"
    280          "// Find key: u32 idx = %s(&%s, "_"key"_");",
    281          funcScope, hashFunc,
    282          funcScope, lookupFunc, type,
    283          hashFunc,
    284          hashFunc,
    285          funcScope, findFunc, type,
    286          lookupFunc,
    287          keysArray,
    288          lookupFunc, perfectHashData,
    289          findFunc, perfectHashData);
    290 
    291   freeG(&cfg);
    292   freeG(vector);
    293   freeHanov(&han);
    294 }
    295 // vim: set expandtab ts=2 sw=2: