md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 6d9cac663cfc143d2e2634266081adb06dbe8d26
parent f814d89369829e5aabcbac0a059f972949c9ccd6
Author: Remy Noulin <loader2x@gmail.com>
Date:   Mon, 26 Dec 2022 20:20:17 +0100

add effects in markdown syntax

add '-' for faint span.
add '%' for inverse span.
add '!' for conceal/hidden span.
add '^' for blink span.
add anchors with syntax [|id] and links to anchors with syntax [to anchor id](|id)
add autolink for gemini, gopher and spartan protocols
add MD_SPAN_COLOR for color support in programs using this library

md4c/md4c.c      | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
md4c/md4c.h      |  16 ++-
md4c/package.yml |   2 +-
3 files changed, 314 insertions(+), 20 deletions(-)

Diffstat:
M md4c/md4c.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M md4c/md4c.h | 16 ++++++++++++++--
M md4c/package.yml | 2 +-
3 files changed, 314 insertions(+), 20 deletions(-)

diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -1,4 +1,4 @@ -/* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - added _ for underline and __ for bold instead of underline +/* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - mity/md4c repo * MD4C: Markdown parser for C * (http://github.com/mity/md4c) * @@ -178,7 +178,7 @@ struct MD_CTX_tag { #endif /* For resolving of inline spans. */ - MD_MARKCHAIN mark_chains[13]; + MD_MARKCHAIN mark_chains[17]; #define PTR_CHAIN (ctx->mark_chains[0]) #define TABLECELLBOUNDARIES (ctx->mark_chains[1]) #define ASTERISK_OPENERS_extraword_mod3_0 (ctx->mark_chains[2]) @@ -192,8 +192,12 @@ struct MD_CTX_tag { #define TILDE_OPENERS_2 (ctx->mark_chains[10]) #define BRACKET_OPENERS (ctx->mark_chains[11]) #define DOLLAR_OPENERS (ctx->mark_chains[12]) +#define FAINT_OPENERS (ctx->mark_chains[13]) +#define INVERSE_OPENERS (ctx->mark_chains[14]) +#define CONCEAL_OPENERS (ctx->mark_chains[15]) +#define BLINK_OPENERS (ctx->mark_chains[16]) #define OPENERS_CHAIN_FIRST 1 -#define OPENERS_CHAIN_LAST 12 +#define OPENERS_CHAIN_LAST 16 int n_table_cell_boundaries; @@ -2513,9 +2517,13 @@ md_mark_chain(MD_CTX* ctx, int mark_index) case _T('*'): return md_asterisk_chain(ctx, mark->flags); case _T('_'): return &UNDERSCORE_OPENERS; case _T('~'): return (mark->end - mark->beg == 1) ? 
&TILDE_OPENERS_1 : &TILDE_OPENERS_2; - case _T('!'): MD_FALLTHROUGH(); + /* case _T('!'): MD_FALLTHROUGH(); */ case _T('['): return &BRACKET_OPENERS; case _T('|'): return &TABLECELLBOUNDARIES; + case _T('-'): return &FAINT_OPENERS; + case _T('%'): return &INVERSE_OPENERS; + case _T('!'): return &CONCEAL_OPENERS; + case _T('^'): return &BLINK_OPENERS; default: return NULL; } } @@ -2723,6 +2731,9 @@ md_build_mark_char_map(MD_CTX* ctx) memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map)); ctx->mark_char_map['\\'] = 1; + ctx->mark_char_map['^'] = 1; + ctx->mark_char_map['%'] = 1; + ctx->mark_char_map['-'] = 1; ctx->mark_char_map['*'] = 1; ctx->mark_char_map['_'] = 1; ctx->mark_char_map['`'] = 1; @@ -2887,6 +2898,141 @@ md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, return TRUE; } +/* detect anchors with syntax: [|anchorId] */ +static int +md_is_anchor_span(MD_CTX* ctx, const MD_LINE* lines, OFF off, OFF* p_closer_beg) +{ + OFF line_end = lines[0].end; + // Smallest anchor is [|x] + // An anchor must be on a single line + if (off+4 >= line_end) + return FALSE; + off += 2; + + // Find closer mark + int opener_end = off; + while (off < line_end) { + if (CH(off) == _T(']')) { + // Check if there an id for the anchor + if (off == opener_end) + return FALSE; + *p_closer_beg = off; + return TRUE; + } + off++; + } + return FALSE; +} + +#ifdef MD4C_USE_UTF16 + /* For UTF-16, mark_char_map[] covers only ASCII. */ + #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \ + (ctx->mark_char_map[(unsigned char) CH(off)])) +#else + /* For 8-bit encodings, mark_char_map[] covers all 256 elements. 
*/ + #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)]) +#endif + +/* detect faint effect: -text text- */ +static int +md_is_faint_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) +{ + OFF tmp; + OFF line_end; + + line_end = lines[0].end; + if (beg+2 >= line_end) + return FALSE; + if (ISUNICODEWHITESPACE(beg+1)) + return FALSE; + tmp = beg+2; + while (tmp < line_end) { + if (CH(tmp) == _T('-') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) + && (!ISUNICODEWHITESPACE(tmp-1))) { + *p_closer_beg = tmp; + return TRUE; + } + tmp++; + } + + return FALSE; +} + +/* detect inverse effect: %text text% */ +static int +md_is_inverse_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) +{ + OFF tmp; + OFF line_end; + + line_end = lines[0].end; + if (beg+2 >= line_end) + return FALSE; + if (ISUNICODEWHITESPACE(beg+1)) + return FALSE; + tmp = beg+2; + while (tmp < line_end) { + if (CH(tmp) == _T('%') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) + && (!ISUNICODEWHITESPACE(tmp-1))) { + *p_closer_beg = tmp; + return TRUE; + } + tmp++; + } + + return FALSE; +} + +/* detect conceal effect: !text text! 
*/ +static int +md_is_conceal_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) +{ + OFF tmp; + OFF line_end; + + line_end = lines[0].end; + if (beg+2 >= line_end) + return FALSE; + if (ISUNICODEWHITESPACE(beg+1)) + return FALSE; + tmp = beg+2; + while (tmp < line_end) { + if (CH(tmp) == _T('!') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) + && (!ISUNICODEWHITESPACE(tmp-1))) { + *p_closer_beg = tmp; + return TRUE; + } + tmp++; + } + + return FALSE; +} + +/* detect blink effect: ^text text^ */ +static int +md_is_blink_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) +{ + OFF tmp; + OFF line_end; + + line_end = lines[0].end; + if (beg+2 >= line_end) + return FALSE; + if (ISUNICODEWHITESPACE(beg+1)) + return FALSE; + tmp = beg+2; + while (tmp < line_end) { + if (CH(tmp) == _T('^') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) + && (!ISUNICODEWHITESPACE(tmp-1))) { + *p_closer_beg = tmp; + return TRUE; + } + tmp++; + } + + return FALSE; +} + static int md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) { @@ -3009,15 +3155,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) while(TRUE) { CHAR ch; -#ifdef MD4C_USE_UTF16 - /* For UTF-16, mark_char_map[] covers only ASCII. */ - #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \ - (ctx->mark_char_map[(unsigned char) CH(off)])) -#else - /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */ - #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)]) -#endif - /* Optimization: Use some loop unrolling. */ while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1) && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3)) @@ -3138,6 +3275,89 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) continue; } + /* A potential faint span start/end. 
*/ + if(ch == _T('-')) { + OFF closer_beg; + int is_faint_span; + + if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) + || IS_MARK_CHAR(off-1)) { + + is_faint_span = md_is_faint_span(ctx, line, off, &closer_beg); + if(is_faint_span) { + PUSH_MARK(_T('-'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('-'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + } + } + off++; + continue; + } + + /* A potential inverse span start/end. */ + if(ch == _T('%')) { + OFF closer_beg; + int is_inverse_span; + + if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) + || IS_MARK_CHAR(off-1)) { + + is_inverse_span = md_is_inverse_span(ctx, line, off, &closer_beg); + if(is_inverse_span) { + PUSH_MARK(_T('%'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('%'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + + } + } + off++; + continue; + } + + /* A potential conceal span start/end. */ + if(ch == _T('!')) { + OFF closer_beg; + int is_conceal_span; + + if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) + || IS_MARK_CHAR(off-1)) { + + is_conceal_span = md_is_conceal_span(ctx, line, off, &closer_beg); + if(is_conceal_span) { + PUSH_MARK(_T('!'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('!'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + + } + } + off++; + continue; + } + + /* A potential blink span start/end. 
*/ + if(ch == _T('^')) { + OFF closer_beg; + int is_blink_span; + + if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) + || IS_MARK_CHAR(off-1)) { + + is_blink_span = md_is_blink_span(ctx, line, off, &closer_beg); + if(is_blink_span) { + PUSH_MARK(_T('^'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T('^'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + + } + } + off++; + continue; + } + /* A potential entity start. */ if(ch == _T('&')) { PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); @@ -3203,6 +3423,21 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) continue; } + /* A potential anchor */ + if(ch == _T('[') && off+1 < line_end && CH(off+1) == _T('|')) { + OFF closer_beg; + int is_anchor_span = md_is_anchor_span(ctx, line, off, &closer_beg); + if (is_anchor_span) { + PUSH_MARK(_T('['), off, off+2, MD_MARK_OPENER | MD_MARK_RESOLVED); + PUSH_MARK(_T(']'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); + ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; + ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; + off = closer_beg+1; + continue; + } + // continue analyzing [ mark + } + /* A potential link or its part. */ if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) { OFF tmp = (ch == _T('[') ? off+1 : off+2); @@ -3243,8 +3478,11 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) SZ suffix_size; } scheme_map[] = { /* In the order from the most frequently used, arguably. 
*/ - { _T("http"), 4, _T("//"), 2 }, { _T("https"), 5, _T("//"), 2 }, + { _T("gemini"), 6, _T("//"), 2 }, + { _T("http"), 4, _T("//"), 2 }, + { _T("gopher"), 6, _T("//"), 2 }, + { _T("spartan"), 7, _T("//"), 2 }, { _T("ftp"), 3, _T("//"), 2 } }; int scheme_index; @@ -4204,6 +4442,30 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; + case '-': /* faint */ + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_FNT, NULL); + } else { + MD_LEAVE_SPAN(MD_SPAN_FNT, NULL); + } + break; + + case '%': /* inverse */ + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_INV, NULL); + } else { + MD_LEAVE_SPAN(MD_SPAN_INV, NULL); + } + break; + + case '^': /* blink */ + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_BLI, NULL); + } else { + MD_LEAVE_SPAN(MD_SPAN_BLI, NULL); + } + break; + case '_': /* Underline (or emphasis if we fall through). */ if(ctx->parser.flags & MD_FLAG_UNDERLINE) { if(mark->flags & MD_MARK_OPENER) { @@ -4259,7 +4521,7 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; - case '~': + case '~': /* crossed */ if(mark->flags & MD_MARK_OPENER) MD_ENTER_SPAN(MD_SPAN_DEL, NULL); else @@ -4276,8 +4538,16 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; - case '[': /* Link, wiki link, image. */ - case '!': + case '!': /* conceal/hidden */ + if (mark->prev == -1) { + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_COC, NULL); + } else { + MD_LEAVE_SPAN(MD_SPAN_COC, NULL); + } + break; + } + case '[': /* Link, wiki link, image, anchor. */ case ']': { const MD_MARK* opener = (mark->ch != ']' ? 
mark : &ctx->marks[mark->prev]); @@ -4304,6 +4574,18 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) break; } + if ((opener->ch == '[' && closer->ch == ']') && + opener->end - opener->beg == 2 && + closer->end - closer->beg == 1 && + CH(opener->beg+1) == _T('|')) + { + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_ANCHOR, NULL); + } else { + MD_LEAVE_SPAN(MD_SPAN_ANCHOR, NULL); + } + } + dest_mark = opener+1; MD_ASSERT(dest_mark->ch == 'D'); title_mark = opener+2; diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -145,7 +145,19 @@ typedef enum MD_SPANTYPE { /* <u>...</u> * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */ - MD_SPAN_U + MD_SPAN_U, + MD_SPAN_FNT, + MD_SPAN_INV, + MD_SPAN_COC, + MD_SPAN_BLI, + MD_SPAN_ANCHOR, + /* This span type is issued by md4c + * MD_SPAN_COLOR allows supporting RGB colors: + * [text with colors](#1#13) + * md4c treats colors as MD_SPAN_A and the parsing of the color + * is done by the user. + */ + MD_SPAN_COLOR, } MD_SPANTYPE; /* Text is the actual textual contents of span. */ @@ -164,7 +176,7 @@ typedef enum MD_TEXTTYPE { MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */ /* Entity. - * (a) Named entity, e.g. &nbsp; + * (a) Named entity, e.g. &nbsp; * (Note MD4C does not have a list of known entities. * Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is * treated as a named entity.) diff --git a/md4c/package.yml b/md4c/package.yml @@ -1,6 +1,6 @@ --- name: md4c - version: 0.0.1 + version: 0.0.2 description: "md4c is a markdown parser library (forked from mity/md4c)" bin: ./md4c.c scripts: