Merge branch 'hash' - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit 6bc0b86101e1efa18eaa8d0c045d83982b89ec47
parent 07c5dba9baf0ec1638441e225a91b6bb9cf892c3
Author: Martin Mitas <mity@morous.org>
Date:   Tue, 18 Jul 2017 18:50:07 +0200

Merge branch 'hash'

Diffstat:
M md4c/md4c.c  | 545 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M test/coverage.txt  | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 512 insertions(+), 151 deletions(-)
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -75,7 +75,7 @@
 typedef struct MD_MARK_tag MD_MARK;
 typedef struct MD_BLOCK_tag MD_BLOCK;
 typedef struct MD_CONTAINER_tag MD_CONTAINER;
-typedef struct MD_LINK_REF_DEF_tag MD_LINK_REF_DEF;
+typedef struct MD_REF_DEF_tag MD_REF_DEF;
 
 
 /* During analyzes of inline marks, we need to manage some "mark chains",
@@ -101,10 +101,12 @@ struct MD_CTX_tag {
     CHAR* buffer;
     unsigned alloc_buffer;
 
-    /* Link reference definitions. */
-    MD_LINK_REF_DEF* link_ref_defs;
-    int n_link_ref_defs;
-    int alloc_link_ref_defs;
+    /* Reference definitions. */
+    MD_REF_DEF* ref_defs;
+    int n_ref_defs;
+    int alloc_ref_defs;
+    void** ref_def_hashtable;
+    int ref_def_hashtable_size;
 
     /* Stack of inline/span markers.
      * This is only used for parsing a single block contents but by storing it
@@ -910,6 +912,22 @@ md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_
     return 0;
 }
 
+static OFF
+md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
+{
+    SZ char_size;
+    int codepoint;
+
+    while(off < size) {
+        codepoint = md_decode_unicode(label, off, size, &char_size);
+        if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
+            break;
+        off += char_size;
+    }
+
+    return off;
+}
+
 
 /******************************
  ***  Recognizing raw HTML  ***
@@ -1459,17 +1477,33 @@ md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE* attr)
 }
 
 
-/***************************
- ***  Recognizing Links  ***
- ***************************/
+/*********************************************
+ ***  Dictionary of Reference Definitions  ***
+ *********************************************/
 
-/* Note this code is partially shared between processing inlines and blocks
- * as link reference definitions and links share some helper parser functions.
- */
+#define MD_FNV1A_BASE       2166136261
+#define MD_FNV1A_PRIME      16777619
+
+static inline unsigned
+md_fnv1a(unsigned base, const void* data, size_t n)
+{
+    const unsigned char* buf = (const unsigned char*) data;
+    unsigned hash = base;
+    size_t i;
+
+    for(i = 0; i < n; i++) {
+        hash ^= buf[i];
+        hash *= MD_FNV1A_PRIME;
+    }
+
+    return hash;
+}
 
-struct MD_LINK_REF_DEF_tag {
+
+struct MD_REF_DEF_tag {
     CHAR* label;
     CHAR* title;
+    unsigned hash;
     SZ label_size                   : 24;
     unsigned label_needs_free       :  1;
     unsigned title_needs_free       :  1;
@@ -1478,6 +1512,323 @@ struct MD_LINK_REF_DEF_tag {
     OFF dest_end;
 };
 
+/* Label equivalence is quite complicated with regards to whitespace and case
+ * folding. This complicates computing a hash of it as well as direct comparison
+ * of two labels. */
+
+static unsigned
+md_link_label_hash(const CHAR* label, SZ size)
+{
+    unsigned hash = MD_FNV1A_BASE;
+    OFF off;
+    int codepoint;
+    int is_whitespace = FALSE;
+
+    off = md_skip_unicode_whitespace(label, 0, size);
+    while(off < size) {
+        SZ char_size;
+
+        codepoint = md_decode_unicode(label, off, size, &char_size);
+        is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
+
+        if(is_whitespace) {
+            codepoint = ' ';
+            hash = md_fnv1a(hash, &codepoint, 1 * sizeof(int));
+
+            off = md_skip_unicode_whitespace(label, off, size);
+        } else {
+            MD_UNICODE_FOLD_INFO fold_info;
+
+            md_get_unicode_fold_info(codepoint, &fold_info);
+            hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(int));
+
+            off += char_size;
+        }
+    }
+
+    if(!is_whitespace) {
+        codepoint = ' ';
+        hash = md_fnv1a(hash, &codepoint, 1 * sizeof(int));
+    }
+
+    return hash;
+}
+
+static int
+md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
+{
+    OFF a_off;
+    OFF b_off;
+
+    /* The slow path, with Unicode case folding and Unicode whitespace collapsing. */
+    a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
+    b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
+    while(a_off < a_size  ||  b_off < b_size) {
+        int a_codepoint, b_codepoint;
+        SZ a_char_size, b_char_size;
+        int a_is_whitespace, b_is_whitespace;
+
+        if(a_off < a_size) {
+            a_codepoint = md_decode_unicode(a_label, a_off, a_size, &a_char_size);
+            a_is_whitespace = ISUNICODEWHITESPACE_(a_codepoint) || ISNEWLINE_(a_label[a_off]);
+        } else {
+            /* Treat end of label as a whitespace. */
+            a_codepoint = -1;
+            a_is_whitespace = TRUE;
+        }
+
+        if(b_off < b_size) {
+            b_codepoint = md_decode_unicode(b_label, b_off, b_size, &b_char_size);
+            b_is_whitespace = ISUNICODEWHITESPACE_(b_codepoint) || ISNEWLINE_(b_label[b_off]);
+        } else {
+            /* Treat end of label as a whitespace. */
+            b_codepoint = -1;
+            b_is_whitespace = TRUE;
+        }
+
+        if(a_is_whitespace || b_is_whitespace) {
+            if(!a_is_whitespace || !b_is_whitespace)
+                return (a_is_whitespace ? -1 : +1);
+
+            a_off = md_skip_unicode_whitespace(a_label, a_off, a_size);
+            b_off = md_skip_unicode_whitespace(b_label, b_off, b_size);
+        } else {
+            MD_UNICODE_FOLD_INFO a_fold_info, b_fold_info;
+            int cmp;
+
+            md_get_unicode_fold_info(a_codepoint, &a_fold_info);
+            md_get_unicode_fold_info(b_codepoint, &b_fold_info);
+
+            if(a_fold_info.n_codepoints != b_fold_info.n_codepoints)
+                return (a_fold_info.n_codepoints - b_fold_info.n_codepoints);
+            cmp = memcmp(a_fold_info.codepoints, b_fold_info.codepoints, a_fold_info.n_codepoints * sizeof(int));
+            if(cmp != 0)
+                return cmp;
+
+            a_off += a_char_size;
+            b_off += b_char_size;
+        }
+    }
+
+    return 0;
+}
+
+typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
+struct MD_REF_DEF_LIST_tag {
+    int n_ref_defs;
+    int alloc_ref_defs;
+    MD_REF_DEF* ref_defs[];  /* Valid items always  point into ctx->ref_defs[] */
+};
+
+static int
+md_ref_def_cmp(const void* a, const void* b)
+{
+    const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
+    const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
+
+    if(a_ref->hash < b_ref->hash)
+        return -1;
+    else if(a_ref->hash > b_ref->hash)
+        return +1;
+    else
+        return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
+}
+
+static int
+md_ref_def_cmp_stable(const void* a, const void* b)
+{
+    int cmp;
+
+    cmp = md_ref_def_cmp(a, b);
+
+    /* Ensure stability of the sorting. */
+    if(cmp == 0) {
+        const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
+        const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
+
+        if(a_ref < b_ref)
+            cmp = -1;
+        else if(a_ref > b_ref)
+            cmp = +1;
+        else
+            cmp = 0;
+    }
+
+    return cmp;
+}
+
+static int
+md_build_ref_def_hashtable(MD_CTX* ctx)
+{
+    int i;
+
+    if(ctx->n_ref_defs == 0)
+        return 0;
+
+    ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
+    ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
+    if(ctx->ref_def_hashtable == NULL) {
+        MD_LOG("malloc() failed.");
+        goto abort;
+    }
+    memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
+
+    /* Each member of ctx->ref_def_hashtable[] can be:
+     *  -- NULL,
+     *  -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
+     *  -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
+     *     such MD_REF_DEFs.
+     */
+    for(i = 0; i < ctx->n_ref_defs; i++) {
+        MD_REF_DEF* def = &ctx->ref_defs[i];
+        void* bucket;
+        MD_REF_DEF_LIST* list;
+
+        def->hash = md_link_label_hash(def->label, def->label_size);
+        bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
+
+        if(bucket == NULL) {
+            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
+            continue;
+        }
+
+        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
+            /* The bucket already contains one ref. def. Lets see whether it
+             * is the same label (ref. def. duplicate) or different one
+             * (hash conflict). */
+            MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
+
+            if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
+                /* Ignore this ref. def. */
+                continue;
+            }
+
+            /* Make the bucket capable of holding more ref. defs. */
+            list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 4 * sizeof(MD_REF_DEF));
+            if(list == NULL) {
+                MD_LOG("malloc() failed.");
+                goto abort;
+            }
+            list->ref_defs[0] = old_def;
+            list->ref_defs[1] = def;
+            list->n_ref_defs = 2;
+            list->alloc_ref_defs = 4;
+            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
+            continue;
+        }
+
+        /* Append the def to the bucket list. */
+        list = (MD_REF_DEF_LIST*) bucket;
+        if(list->n_ref_defs >= list->alloc_ref_defs) {
+            MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
+                        sizeof(MD_REF_DEF_LIST) + 2 * list->alloc_ref_defs * sizeof(MD_REF_DEF));
+            if(list_tmp == NULL) {
+                MD_LOG("realloc() failed.");
+                goto abort;
+            }
+            list = list_tmp;
+            list->alloc_ref_defs *= 2;
+            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
+        }
+
+        list->ref_defs[list->n_ref_defs] = def;
+        list->n_ref_defs++;
+    }
+
+    /* Sort the complex buckets so we can use bsearch() with them. */
+    for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
+        void* bucket = ctx->ref_def_hashtable[i];
+        MD_REF_DEF_LIST* list;
+
+        if(bucket == NULL)
+            continue;
+        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
+            continue;
+
+        list = (MD_REF_DEF_LIST*) bucket;
+        qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_stable);
+
+        /* Disable duplicates. */
+        for(i = 1; i < list->n_ref_defs; i++) {
+            if(md_ref_def_cmp(&list->ref_defs[i-1], &list->ref_defs[i]) == 0)
+                list->ref_defs[i] = list->ref_defs[i-1];
+        }
+    }
+
+    return 0;
+
+abort:
+    return -1;
+}
+
+static void
+md_free_ref_def_hashtable(MD_CTX* ctx)
+{
+    if(ctx->ref_def_hashtable != NULL) {
+        int i;
+
+        for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
+            void* bucket = ctx->ref_def_hashtable[i];
+            if(bucket == NULL)
+                continue;
+            if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
+                continue;
+            free(bucket);
+        }
+
+        free(ctx->ref_def_hashtable);
+    }
+}
+
+static const MD_REF_DEF*
+md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
+{
+    unsigned hash;
+    void* bucket;
+
+    if(ctx->ref_def_hashtable_size == 0)
+        return NULL;
+
+    hash = md_link_label_hash(label, label_size);
+    bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
+
+    if(bucket == NULL) {
+        return NULL;
+    } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
+        const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
+
+        if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
+            return def;
+        else
+            return NULL;
+    } else {
+        MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
+        MD_REF_DEF key_buf;
+        const MD_REF_DEF* key = &key_buf;
+        const MD_REF_DEF** ret;
+
+        key_buf.label = (CHAR*) label;
+        key_buf.label_size = label_size;
+        key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
+
+        ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
+                    list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
+        if(ret != NULL)
+            return *ret;
+        else
+            return NULL;
+    }
+}
+
+
+/***************************
+ ***  Recognizing Links  ***
+ ***************************/
+
+/* Note this code is partially shared between processing inlines and blocks
+ * as reference definitions and links share some helper parser functions.
+ */
+
 typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
 struct MD_LINK_ATTR_tag {
     OFF dest_beg;
@@ -1695,9 +2046,9 @@ md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
     return FALSE;
 }
 
-/* Returns 0 if it is not a link reference definition.
+/* Returns 0 if it is not a reference definition.
  *
- * Returns N > 0 if it is not a link reference definition (then N corresponds
+ * Returns N > 0 if it is not a reference definition (then N corresponds
  * to the number of lines forming it). In this case the definition is stored
  * for resolving any links referring to it.
  *
@@ -1722,7 +2073,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     OFF off;
     int line_index = 0;
     int tmp_line_index;
-    MD_LINK_REF_DEF* def;
+    MD_REF_DEF* def;
     int ret;
 
     /* Link label. */
@@ -1786,23 +2137,23 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
         label_needs_free = TRUE;
     }
 
-    /* Store the link reference definition. */
-    if(ctx->n_link_ref_defs >= ctx->alloc_link_ref_defs) {
-        MD_LINK_REF_DEF* new_defs;
+    /* Store the reference definition. */
+    if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
+        MD_REF_DEF* new_defs;
 
-        ctx->alloc_link_ref_defs = (ctx->alloc_link_ref_defs > 0 ? ctx->alloc_link_ref_defs * 2 : 16);
-        new_defs = (MD_LINK_REF_DEF*) realloc(ctx->link_ref_defs, ctx->alloc_link_ref_defs * sizeof(MD_LINK_REF_DEF));
+        ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 ? ctx->alloc_ref_defs * 2 : 16);
+        new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
         if(new_defs == NULL) {
             MD_LOG("realloc() failed.");
             ret = -1;
             goto abort;
         }
 
-        ctx->link_ref_defs = new_defs;
+        ctx->ref_defs = new_defs;
     }
 
-    def = &ctx->link_ref_defs[ctx->n_link_ref_defs];
-    memset(def, 0, sizeof(MD_LINK_REF_DEF));
+    def = &ctx->ref_defs[ctx->n_ref_defs];
+    memset(def, 0, sizeof(MD_REF_DEF));
 
     def->label = label;
     def->label_size = label_size;
@@ -1825,7 +2176,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     }
 
     /* Success. */
-    ctx->n_link_ref_defs++;
+    ctx->n_ref_defs++;
     return line_index + 1;
 
 abort:
@@ -1835,118 +2186,11 @@ abort:
     return -1;
 }
 
-
-static OFF
-md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
-{
-    SZ char_size;
-    int codepoint;
-
-    while(off < size) {
-        codepoint = md_decode_unicode(label, off, size, &char_size);
-        if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
-            break;
-        off += char_size;
-    }
-
-    return off;
-}
-
-static int
-md_link_label_cmp(const void* a, const void* b)
-{
-    const CHAR* a_label = ((const MD_LINK_REF_DEF*)a)->label;
-    const CHAR* b_label = ((const MD_LINK_REF_DEF*)b)->label;
-    SZ a_size = ((const MD_LINK_REF_DEF*)a)->label_size;
-    SZ b_size = ((const MD_LINK_REF_DEF*)b)->label_size;
-    OFF a_off;
-    OFF b_off;
-
-    /* The slow path, with Unicode case folding and Unicode whitespace collapsing. */
-    a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
-    b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
-    while(a_off < a_size  ||  b_off < b_size) {
-        int a_codepoint, b_codepoint;
-        SZ a_char_size, b_char_size;
-        int a_is_whitespace, b_is_whitespace;
-
-        if(a_off < a_size) {
-            a_codepoint = md_decode_unicode(a_label, a_off, a_size, &a_char_size);
-            a_is_whitespace = ISUNICODEWHITESPACE_(a_codepoint) || ISNEWLINE_(a_label[a_off]);
-        } else {
-            /* Treat end of label as a whitespace. */
-            a_codepoint = -1;
-            a_is_whitespace = TRUE;
-        }
-
-        if(b_off < b_size) {
-            b_codepoint = md_decode_unicode(b_label, b_off, b_size, &b_char_size);
-            b_is_whitespace = ISUNICODEWHITESPACE_(b_codepoint) || ISNEWLINE_(b_label[b_off]);
-        } else {
-            /* Treat end of label as a whitespace. */
-            b_codepoint = -1;
-            b_is_whitespace = TRUE;
-        }
-
-        if(a_is_whitespace || b_is_whitespace) {
-            if(!a_is_whitespace || !b_is_whitespace)
-                return (a_is_whitespace ? -1 : +1);
-
-            a_off = md_skip_unicode_whitespace(a_label, a_off, a_size);
-            b_off = md_skip_unicode_whitespace(b_label, b_off, b_size);
-        } else {
-            MD_UNICODE_FOLD_INFO a_fold_info, b_fold_info;
-            int cmp;
-
-            md_get_unicode_fold_info(a_codepoint, &a_fold_info);
-            md_get_unicode_fold_info(b_codepoint, &b_fold_info);
-
-            if(a_fold_info.n_codepoints != b_fold_info.n_codepoints)
-                return (a_fold_info.n_codepoints - b_fold_info.n_codepoints);
-            cmp = memcmp(a_fold_info.codepoints, b_fold_info.codepoints, a_fold_info.n_codepoints * sizeof(int));
-            if(cmp != 0)
-                return cmp;
-
-            a_off += a_char_size;
-            b_off += b_char_size;
-        }
-    }
-
-    return 0;
-}
-
-static int
-md_link_label_cmp_for_qsort(const void* a, const void* b)
-{
-    int cmp;
-
-    cmp = md_link_label_cmp(a, b);
-    if(cmp != 0)
-        return cmp;
-
-    /* The specification requests that only first link reference definition
-     * with the same label is valid. So make sure we make a stable sort
-     * from the qsort(). */
-    return ((MD_LINK_REF_DEF*) a - (MD_LINK_REF_DEF*) b);
-}
-
-static inline const MD_LINK_REF_DEF*
-md_lookup_link_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
-{
-    MD_LINK_REF_DEF key;
-
-    key.label = (CHAR*) label;
-    key.label_size = label_size;
-
-    return bsearch(&key, ctx->link_ref_defs, ctx->n_link_ref_defs,
-                   sizeof(MD_LINK_REF_DEF), md_link_label_cmp);
-}
-
 static int
 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
                      OFF beg, OFF end, MD_LINK_ATTR* attr)
 {
-    const MD_LINK_REF_DEF* def;
+    const MD_REF_DEF* def;
     const MD_LINE* beg_line;
     const MD_LINE* end_line;
     CHAR* label;
@@ -1978,7 +2222,7 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
         label_size = end - beg;
     }
 
-    def = md_lookup_link_ref_def(ctx, label, label_size);
+    def = md_lookup_ref_def(ctx, label, label_size);
     if(def != NULL) {
         attr->dest_beg = def->dest_beg;
         attr->dest_end = def->dest_end;
@@ -2084,12 +2328,12 @@ abort:
 }
 
 static void
-md_free_link_ref_defs(MD_CTX* ctx)
+md_free_ref_defs(MD_CTX* ctx)
 {
     int i;
 
-    for(i = 0; i < ctx->n_link_ref_defs; i++) {
-        MD_LINK_REF_DEF* def = &ctx->link_ref_defs[i];
+    for(i = 0; i < ctx->n_ref_defs; i++) {
+        MD_REF_DEF* def = &ctx->ref_defs[i];
 
         if(def->label_needs_free)
             free(def->label);
@@ -2097,7 +2341,7 @@ md_free_link_ref_defs(MD_CTX* ctx)
             free(def->title);
     }
 
-    free(ctx->link_ref_defs);
+    free(ctx->ref_defs);
 }
 
 
@@ -4335,10 +4579,10 @@ md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
     return 0;
 }
 
-/* Eat from start of current (textual) block any link reference definitions
- * and remember them so we can resolve any links referring to them.
+/* Eat from start of current (textual) block any reference definitions and
+ * remember them so we can resolve any links referring to them.
  *
- * (Link reference definitions can only be at start of it as they cannot break
+ * (Reference definitions can only be at start of it as they cannot break
  * a paragraph.)
  */
 static int
@@ -4349,17 +4593,17 @@ md_consume_link_reference_definitions(MD_CTX* ctx)
     int n = 0;
 
     /* Compute how many lines at the start of the block form one or more
-     * link reference definitions. */
+     * reference definitions. */
     while(n < n_lines) {
         int n_link_ref_lines;
 
         n_link_ref_lines = md_is_link_reference_definition(ctx,
                                     lines + n, n_lines - n);
-        /* Not a link reference definition? */
+        /* Not a reference definition? */
         if(n_link_ref_lines == 0)
             break;
 
-        /* We fail if it is the link ref. def. but it could not be stored due
+        /* We fail if it is the ref. def. but it could not be stored due
          * a memory allocation error. */
         if(n_link_ref_lines < 0)
             return -1;
@@ -4367,7 +4611,7 @@ md_consume_link_reference_definitions(MD_CTX* ctx)
         n += n_link_ref_lines;
     }
 
-    /* If there was at least one link reference definition, we need to remove
+    /* If there was at least one reference definition, we need to remove
      * its lines from the block, or perhaps even the whole block. */
     if(n > 0) {
         if(n == n_lines) {
@@ -4393,9 +4637,9 @@ md_end_current_block(MD_CTX* ctx)
     if(ctx->current_block == NULL)
         return ret;
 
-    /* Check whether there is a link reference definition. (We do this here
-     * instead of in md_analyze_line() because link reference definition can
-     * take multiple lines.) */
+    /* Check whether there is a reference definition. (We do this here instead
+     * of in md_analyze_line() because reference definition can take multiple
+     * lines.) */
     if(ctx->current_block->type == MD_BLOCK_P) {
         MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
         if(CH(lines[0].beg) == _T('['))
@@ -5556,9 +5800,7 @@ md_process_doc(MD_CTX *ctx)
 
     md_end_current_block(ctx);
 
-    /* Sort link reference definitions so we can use bsearch() for their lookup. */
-    qsort(ctx->link_ref_defs, ctx->n_link_ref_defs, sizeof(MD_LINK_REF_DEF),
-          md_link_label_cmp_for_qsort);
+    MD_CHECK(md_build_ref_def_hashtable(ctx));
 
     /* Process all blocks. */
     MD_CHECK(md_leave_child_containers(ctx, 0));
@@ -5626,7 +5868,8 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u
     ret = md_process_doc(&ctx);
 
     /* Clean-up. */
-    md_free_link_ref_defs(&ctx);
+    md_free_ref_defs(&ctx);
+    md_free_ref_def_hashtable(&ctx);
     free(ctx.buffer);
     free(ctx.marks);
     free(ctx.block_bytes);
diff --git a/test/coverage.txt b/test/coverage.txt
@@ -183,3 +183,121 @@ Ditto for Unicode punctuation (here U+00A1).
 bar">link</a></p>
 </blockquote>
 ````````````````````````````````
+
+
+### `md_build_ref_def_hashtable()`
+
+All link labels in the following example all have the same FNV1a hash (after
+normalization of the label, which means after converting to a vector of Unicode
+codepoints and lowercase folding).
+
+So the example triggers quite complex code paths which are not otherwise easily
+tested.
+
+```````````````````````````````` example
+[foo]: /foo
+[qnptgbh]: /qnptgbh
+[abgbrwcv]: /abgbrwcv
+[abgbrwcv]: /abgbrwcv2
+[abgbrwcv]: /abgbrwcv3
+[abgbrwcv]: /abgbrwcv4
+[alqadfgn]: /alqadfgn
+
+[foo]
+[qnptgbh]
+[abgbrwcv]
+[alqadfgn]
+[axgydtdu]
+.
+<p><a href="/foo">foo</a>
+<a href="/qnptgbh">qnptgbh</a>
+<a href="/abgbrwcv">abgbrwcv</a>
+<a href="/alqadfgn">alqadfgn</a>
+[axgydtdu]</p>
+````````````````````````````````
+
+For the sake of completeness, the following C program was used to find the hash
+collisions by brute force:
+
+~~~
+
+#include <stdio.h>
+#include <string.h>
+
+
+static unsigned etalon;
+
+
+
+#define MD_FNV1A_BASE       2166136261
+#define MD_FNV1A_PRIME      16777619
+
+static inline unsigned
+fnv1a(unsigned base, const void* data, size_t n)
+{
+    const unsigned char* buf = (const unsigned char*) data;
+    unsigned hash = base;
+    size_t i;
+
+    for(i = 0; i < n; i++) {
+        hash ^= buf[i];
+        hash *= MD_FNV1A_PRIME;
+    }
+
+    return hash;
+}
+
+
+static unsigned
+unicode_hash(const char* data, size_t n)
+{
+    unsigned value;
+    unsigned hash = MD_FNV1A_BASE;
+    int i;
+
+    for(i = 0; i < n; i++) {
+        value = data[i];
+        hash = fnv1a(hash, &value, sizeof(unsigned));
+    }
+
+    return hash;
+}
+
+
+static void
+recurse(char* buffer, size_t off, size_t len)
+{
+    int ch;
+
+    if(off < len - 1) {
+        for(ch = 'a'; ch <= 'z'; ch++) {
+            buffer[off] = ch;
+            recurse(buffer, off+1, len);
+        }
+    } else {
+        for(ch = 'a'; ch <= 'z'; ch++) {
+            buffer[off] = ch;
+            if(unicode_hash(buffer, len) == etalon) {
+                printf("Dup: %.*s\n", (int)len, buffer);
+            }
+        }
+    }
+}
+
+int
+main(int argc, char** argv)
+{
+    char buffer[32];
+    int len;
+
+    if(argc < 2)
+        etalon = unicode_hash("foo", 3);
+    else
+        etalon = unicode_hash(argv[1], strlen(argv[1]));
+
+    for(len = 1; len <= sizeof(buffer); len++)
+        recurse(buffer, 0, len);
+
+    return 0;
+}
+~~~

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log \| Files \| Refs \| README \| LICENSE

M	md4c/md4c.c	\|	545	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M	test/coverage.txt	\|	118	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++