add effects in markdown syntax - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit 6d9cac663cfc143d2e2634266081adb06dbe8d26
parent f814d89369829e5aabcbac0a059f972949c9ccd6
Author: Remy Noulin <loader2x@gmail.com>
Date:   Mon, 26 Dec 2022 20:20:17 +0100

add effects in markdown syntax

add '-' for faint span.
add '%' for inverse span.
add '!' for conceal/hidden span.
add '^' for blink span.
add anchor with syntax: [|id] and link syntax [to anchor id](|id)
add autolink for gemini, gopher and spartan protocols
add MD_SPAN_COLOR for color support in program using this library

md4c/md4c.c      | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
md4c/md4c.h      |  16 ++-
md4c/package.yml |   2 +-
3 files changed, 314 insertions(+), 20 deletions(-)

Diffstat:
M md4c/md4c.c  | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M md4c/md4c.h  | 16 ++++++++++++++--
M md4c/package.yml  | 2 +-

3 files changed, 314 insertions(+), 20 deletions(-)
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -1,4 +1,4 @@
-/* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - added _ for underline and __ for bold instead of underline
+/* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - mity/md4c repo
  * MD4C: Markdown parser for C
  * (http://github.com/mity/md4c)
  *
@@ -178,7 +178,7 @@ struct MD_CTX_tag {
 #endif
 
     /* For resolving of inline spans. */
-    MD_MARKCHAIN mark_chains[13];
+    MD_MARKCHAIN mark_chains[17];
 #define PTR_CHAIN                               (ctx->mark_chains[0])
 #define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
 #define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
@@ -192,8 +192,12 @@ struct MD_CTX_tag {
 #define TILDE_OPENERS_2                         (ctx->mark_chains[10])
 #define BRACKET_OPENERS                         (ctx->mark_chains[11])
 #define DOLLAR_OPENERS                          (ctx->mark_chains[12])
+#define FAINT_OPENERS                           (ctx->mark_chains[13])
+#define INVERSE_OPENERS                         (ctx->mark_chains[14])
+#define CONCEAL_OPENERS                         (ctx->mark_chains[15])
+#define BLINK_OPENERS                           (ctx->mark_chains[16])
 #define OPENERS_CHAIN_FIRST                     1
-#define OPENERS_CHAIN_LAST                      12
+#define OPENERS_CHAIN_LAST                      16
 
     int n_table_cell_boundaries;
 
@@ -2513,9 +2517,13 @@ md_mark_chain(MD_CTX* ctx, int mark_index)
         case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
         case _T('_'):   return &UNDERSCORE_OPENERS;
         case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
-        case _T('!'):   MD_FALLTHROUGH();
+        /* case _T('!'):   MD_FALLTHROUGH(); */
         case _T('['):   return &BRACKET_OPENERS;
         case _T('|'):   return &TABLECELLBOUNDARIES;
+        case _T('-'):   return &FAINT_OPENERS;
+        case _T('%'):   return &INVERSE_OPENERS;
+        case _T('!'):   return &CONCEAL_OPENERS;
+        case _T('^'):   return &BLINK_OPENERS;
         default:        return NULL;
     }
 }
@@ -2723,6 +2731,9 @@ md_build_mark_char_map(MD_CTX* ctx)
     memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
 
     ctx->mark_char_map['\\'] = 1;
+    ctx->mark_char_map['^'] = 1;
+    ctx->mark_char_map['%'] = 1;
+    ctx->mark_char_map['-'] = 1;
     ctx->mark_char_map['*'] = 1;
     ctx->mark_char_map['_'] = 1;
     ctx->mark_char_map['`'] = 1;
@@ -2887,6 +2898,141 @@ md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
     return TRUE;
 }
 
+/* Detect an anchor "[|id]" whose '[' is at off; it must fit on lines[0].
+ * On success store the offset of the closing ']' in *p_closer_beg. */
+static int
+md_is_anchor_span(MD_CTX* ctx, const MD_LINE* lines, OFF off, OFF* p_closer_beg)
+{
+    OFF line_end = lines[0].end;
+    OFF id_beg;
+
+    /* The smallest anchor, "[|x]", takes 4 characters: off+4 <= line_end. */
+    if (off+4 > line_end)
+        return FALSE;
+
+    /* Skip the two opener characters "[|"; the id starts right after them. */
+    id_beg = off + 2;
+    for (off = id_beg; off < line_end; off++) {
+        if (CH(off) == _T(']')) {
+            /* An empty id ("[|]") is not an anchor. */
+            if (off == id_beg)
+                return FALSE;
+            *p_closer_beg = off;
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+/* IS_MARK_CHAR(off): non-zero when the character at offset off has an entry
+ * set in ctx->mark_char_map[]. Expands to code using ctx, so a variable
+ * named ctx must be in scope at the point of use. */
+#ifdef MD4C_USE_UTF16
+    /* For UTF-16, mark_char_map[] covers only ASCII, so CH(off) is
+     * range-checked against the array size before it is used as an index. */
+    #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
+                                (ctx->mark_char_map[(unsigned char) CH(off)]))
+#else
+    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
+    #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
+#endif
+
+/* Shared scanner for the single-character effect spans (faint '-',
+ * inverse '%', conceal '!', blink '^').
+ *
+ * beg is the offset of the candidate opener and marker is the character
+ * that has to close the span. The whole span must fit on lines[0]:
+ *  - the opener must be followed by a non-whitespace character;
+ *  - the closer must be preceded by a non-whitespace character and be
+ *    followed by the end of the line, whitespace or another mark character.
+ *
+ * Returns TRUE and stores the offset of the closer in *p_closer_beg when
+ * such a closer exists; returns FALSE otherwise and leaves *p_closer_beg
+ * untouched.
+ */
+static int
+md_is_effect_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, CHAR marker, OFF* p_closer_beg)
+{
+    OFF line_end = lines[0].end;
+    OFF off;
+
+    /* The shortest possible span is opener + one character + closer. */
+    if(beg+2 >= line_end)
+        return FALSE;
+    if(ISUNICODEWHITESPACE(beg+1))
+        return FALSE;
+
+    /* Start at beg+2 so that at least one character separates the marks. */
+    for(off = beg+2; off < line_end; off++) {
+        if(CH(off) == marker
+            && (off+1 == line_end || ISUNICODEWHITESPACE(off+1) || IS_MARK_CHAR(off+1))
+            && !ISUNICODEWHITESPACE(off-1)) {
+            *p_closer_beg = off;
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+
+/* detect faint effect: -text text- */
+static int
+md_is_faint_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
+{
+    return md_is_effect_span(ctx, lines, beg, _T('-'), p_closer_beg);
+}
+
+/* detect inverse effect: %text text% */
+static int
+md_is_inverse_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
+{
+    return md_is_effect_span(ctx, lines, beg, _T('%'), p_closer_beg);
+}
+
+/* detect conceal effect: !text text! */
+static int
+md_is_conceal_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
+{
+    return md_is_effect_span(ctx, lines, beg, _T('!'), p_closer_beg);
+}
+
+/* detect blink effect: ^text text^ */
+static int
+md_is_blink_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
+{
+    return md_is_effect_span(ctx, lines, beg, _T('^'), p_closer_beg);
+}
+
 static int
 md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
 {
@@ -3009,15 +3155,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
         while(TRUE) {
             CHAR ch;
 
-#ifdef MD4C_USE_UTF16
-    /* For UTF-16, mark_char_map[] covers only ASCII. */
-    #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
-                                (ctx->mark_char_map[(unsigned char) CH(off)]))
-#else
-    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
-    #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
-#endif
-
             /* Optimization: Use some loop unrolling. */
             while(off + 3 < line_end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
                                       &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
@@ -3138,6 +3275,89 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                 continue;
             }
 
+            /* A potential faint span start/end. */
+            if(ch == _T('-')) {
+                OFF closer_beg;
+                int is_faint_span;
+
+                if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
+                    || IS_MARK_CHAR(off-1)) {
+
+                    is_faint_span = md_is_faint_span(ctx, line, off, &closer_beg);
+                    if(is_faint_span) {
+                        PUSH_MARK(_T('-'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                        PUSH_MARK(_T('-'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+                    }
+                }
+                off++;
+                continue;
+            }
+
+            /* A potential inverse span start/end. */
+            if(ch == _T('%')) {
+                OFF closer_beg;
+                int is_inverse_span;
+
+                if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
+                    || IS_MARK_CHAR(off-1)) {
+
+                    is_inverse_span = md_is_inverse_span(ctx, line, off, &closer_beg);
+                    if(is_inverse_span) {
+                        PUSH_MARK(_T('%'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                        PUSH_MARK(_T('%'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+
+                    }
+                }
+                off++;
+                continue;
+            }
+
+            /* A potential conceal span start/end. */
+            if(ch == _T('!')) {
+                OFF closer_beg;
+                int is_conceal_span;
+
+                if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
+                    || IS_MARK_CHAR(off-1)) {
+
+                    is_conceal_span = md_is_conceal_span(ctx, line, off, &closer_beg);
+                    if(is_conceal_span) {
+                        PUSH_MARK(_T('!'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                        PUSH_MARK(_T('!'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+
+                    }
+                }
+                off++;
+                continue;
+            }
+
+            /* A potential blink span start/end. */
+            if(ch == _T('^')) {
+                OFF closer_beg;
+                int is_blink_span;
+
+                if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
+                    || IS_MARK_CHAR(off-1)) {
+
+                    is_blink_span = md_is_blink_span(ctx, line, off, &closer_beg);
+                    if(is_blink_span) {
+                        PUSH_MARK(_T('^'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                        PUSH_MARK(_T('^'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+
+                    }
+                }
+                off++;
+                continue;
+            }
+
             /* A potential entity start. */
             if(ch == _T('&')) {
                 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
@@ -3203,6 +3423,21 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                 continue;
             }
 
+            /* A potential anchor */
+            if(ch == _T('[') && off+1 < line_end && CH(off+1) == _T('|')) {
+                OFF closer_beg;
+                int is_anchor_span = md_is_anchor_span(ctx, line, off, &closer_beg);
+                if (is_anchor_span) {
+                    PUSH_MARK(_T('['), off, off+2, MD_MARK_OPENER | MD_MARK_RESOLVED);
+                    PUSH_MARK(_T(']'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
+                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
+                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
+                    off = closer_beg+1;
+                    continue;
+                }
+                // continue analyzing [ mark
+            }
+
             /* A potential link or its part. */
             if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
                 OFF tmp = (ch == _T('[') ? off+1 : off+2);
@@ -3243,8 +3478,11 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
                     SZ suffix_size;
                 } scheme_map[] = {
                     /* In the order from the most frequently used, arguably. */
-                    { _T("http"), 4,    _T("//"), 2 },
                     { _T("https"), 5,   _T("//"), 2 },
+                    { _T("gemini"), 6,   _T("//"), 2 },
+                    { _T("http"), 4,    _T("//"), 2 },
+                    { _T("gopher"), 6,   _T("//"), 2 },
+                    { _T("spartan"), 7,    _T("//"), 2 },
                     { _T("ftp"), 3,     _T("//"), 2 }
                 };
                 int scheme_index;
@@ -4204,6 +4442,30 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                     }
                     break;
 
+                case '-': /* faint */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN(MD_SPAN_FNT, NULL);
+                    } else {
+                        MD_LEAVE_SPAN(MD_SPAN_FNT, NULL);
+                    }
+                    break;
+
+                case '%': /* inverse */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN(MD_SPAN_INV, NULL);
+                    } else {
+                        MD_LEAVE_SPAN(MD_SPAN_INV, NULL);
+                    }
+                    break;
+
+                case '^': /* blink */
+                    if(mark->flags & MD_MARK_OPENER) {
+                        MD_ENTER_SPAN(MD_SPAN_BLI, NULL);
+                    } else {
+                        MD_LEAVE_SPAN(MD_SPAN_BLI, NULL);
+                    }
+                    break;
+
                 case '_':       /* Underline (or emphasis if we fall through). */
                     if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
                         if(mark->flags & MD_MARK_OPENER) {
@@ -4259,7 +4521,7 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                     }
                     break;
 
-                case '~':
+                case '~': /* crossed */
                     if(mark->flags & MD_MARK_OPENER)
                         MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
                     else
@@ -4276,8 +4538,16 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                     }
                     break;
 
-                case '[':       /* Link, wiki link, image. */
-                case '!':
+                case '!': /* conceal/hidden */
+                    if (mark->prev == -1) {
+                        if(mark->flags & MD_MARK_OPENER) {
+                            MD_ENTER_SPAN(MD_SPAN_COC, NULL);
+                        } else {
+                            MD_LEAVE_SPAN(MD_SPAN_COC, NULL);
+                        }
+                        break;
+                    }
+                case '[':       /* Link, wiki link, image, anchor. */
                 case ']':
                 {
                     const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
@@ -4304,6 +4574,18 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                         break;
                     }
 
+                    if ((opener->ch == '[' && closer->ch == ']') &&
+                        opener->end - opener->beg == 2 &&
+                        closer->end - closer->beg == 1 &&
+                        CH(opener->beg+1) == _T('|'))
+                    {
+                        if(mark->flags & MD_MARK_OPENER) {
+                            MD_ENTER_SPAN(MD_SPAN_ANCHOR, NULL);
+                        } else {
+                            MD_LEAVE_SPAN(MD_SPAN_ANCHOR, NULL);
+                        }
+                    }
+
                     dest_mark = opener+1;
                     MD_ASSERT(dest_mark->ch == 'D');
                     title_mark = opener+2;
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -145,7 +145,19 @@ typedef enum MD_SPANTYPE {
 
     /* <u>...</u>
      * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
-    MD_SPAN_U
+    MD_SPAN_U,
+    MD_SPAN_FNT,
+    MD_SPAN_INV,
+    MD_SPAN_COC,
+    MD_SPAN_BLI,
+    MD_SPAN_ANCHOR,
+    /* This span type is issued by md4c
+     * MD_SPAN_COLOR allows supporting RGB colors:
+     * [text with colors](#1#13)
+     * md4c treats colors as MD_SPAN_A and the parsing of the color
+     * is done by the user.
+     */
+    MD_SPAN_COLOR,
 } MD_SPANTYPE;
 
 /* Text is the actual textual contents of span. */
@@ -164,7 +176,7 @@ typedef enum MD_TEXTTYPE {
     MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */
 
     /* Entity.
-     * (a) Named entity, e.g. &nbsp; 
+     * (a) Named entity, e.g. &nbsp;
      *     (Note MD4C does not have a list of known entities.
      *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
      *     treated as a named entity.)
diff --git a/md4c/package.yml b/md4c/package.yml
@@ -1,6 +1,6 @@
 ---
   name: md4c
-  version: 0.0.1
+  version: 0.0.2
   description: "md4c is a markdown parser library (forked from mity/md4c)"
   bin: ./md4c.c
   scripts:

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log \| Files \| Refs \| README \| LICENSE

M	md4c/md4c.c	\|	316	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M	md4c/md4c.h	\|	16	++++++++++++++--
M	md4c/package.yml	\|	2	+-