md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit e01275811fef8e046940546d16431d252da74350
parent d2da226c445841b1c975722079d79344aacbf86f
Author: Martin Mitas <mity@morous.org>
Date:   Tue,  4 Oct 2016 20:29:22 +0200

Implement raw HTML blocks.

Diffstat:
MREADME.md | 4++--
Mmd2html/md2html.c | 10++++++++--
Mmd4c/md4c.c | 249++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mmd4c/md4c.h | 13+++++++++++--
4 files changed, 263 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md @@ -85,8 +85,8 @@ more or less forms our to do list. - [x] 4.2 ATX headings - [x] 4.3 Setext headings - [x] 4.4 Indented code blocks - - [ ] 4.5 Fenced code blocks - - [ ] 4.6 HTML blocks + - [x] 4.5 Fenced code blocks + - [x] 4.6 HTML blocks - [ ] 4.7 Link reference definitions - [x] 4.8 Paragraphs - [x] 4.9 Blank lines diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -151,6 +151,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) case MD_BLOCK_HR: MEMBUF_APPEND_LITERAL(out, "<hr>\n"); break; case MD_BLOCK_H: MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; case MD_BLOCK_CODE: open_code_block(out, (const MD_BLOCK_CODE_DETAIL*) detail); break; + case MD_BLOCK_HTML: /* noop */ break; case MD_BLOCK_P: MEMBUF_APPEND_LITERAL(out, "<p>"); break; } @@ -168,6 +169,7 @@ leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) case MD_BLOCK_HR: /*noop*/ break; case MD_BLOCK_H: MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; case MD_BLOCK_CODE: MEMBUF_APPEND_LITERAL(out, "</code></pre>\n"); break; + case MD_BLOCK_HTML: /* noop */ break; case MD_BLOCK_P: MEMBUF_APPEND_LITERAL(out, "</p>\n"); break; } @@ -192,6 +194,7 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat struct membuffer* out = (struct membuffer*) userdata; switch(type) { + case MD_TEXT_HTML: membuf_append(out, text, size); break; default: membuf_append_escaped(out, text, size); break; } @@ -304,6 +307,7 @@ static const option cmdline_options[] = { { "help", 'h', 'h', OPTION_ARG_NONE }, { "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE }, { "fno-indented-code", 0, 'I', OPTION_ARG_NONE }, + { "fno-html-blocks", 0, 'H', OPTION_ARG_NONE }, { 0 } }; @@ -322,7 +326,8 @@ usage(void) "\n" "Markdown dialect options:\n" " --fpermissive-atx-headers allow ATX headers without delimiting space\n" - " --fno-indented-code disabled indented code 
blocks\n" + " --fno-indented-code disable indented code blocks\n" + " --fno-html-blocks disable raw HTML blocks\n" ); } @@ -351,7 +356,8 @@ cmdline_callback(int opt, char const* value, void* data) case 'h': usage(); exit(0); break; case 'A': renderer_flags |= MD_FLAG_PERMISSIVEATXHEADERS; break; - case 'I': renderer_flags |= MD_FLAG_NOINDENTEDCODE; break; + case 'I': renderer_flags |= MD_FLAG_NOINDENTEDCODEBLOCKS; break; + case 'H': renderer_flags |= MD_FLAG_NOHTMLBLOCKS; break; default: fprintf(stderr, "Illegal option: %s\n", value); diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -88,6 +88,9 @@ struct MD_CTX_tag { OFF code_fence_indent; OFF code_fence_info_beg; OFF code_fence_info_end; + + /* For MD_BLOCK_HTML. */ + unsigned html_block_type; }; typedef enum MD_LINETYPE_tag MD_LINETYPE; @@ -100,6 +103,7 @@ enum MD_LINETYPE_tag { MD_LINE_INDENTEDCODE, MD_LINE_CODEFENCE, MD_LINE_FENCEDCODE, + MD_LINE_HTML, MD_LINE_TEXT }; @@ -205,6 +209,25 @@ md_strchr(const CHAR* str, CHAR ch) return NULL; } +/* Case insensitive check of string equality. */ +static inline int +md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) +{ + OFF i; + for(i = 0; i < n; i++) { + CHAR ch1 = s1[i]; + CHAR ch2 = s2[i]; + + if(ISLOWER_(ch1)) + ch1 += ('A'-'a'); + if(ISLOWER_(ch2)) + ch2 += ('A'-'a'); + if(ch1 != ch2) + return -1; + } + return 0; +} + #define MD_ENTER_BLOCK(type, arg) \ do { \ @@ -274,7 +297,7 @@ abort: } static int -md_process_verbatim_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) +md_process_verbatim_block(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_LINE* lines, int n_lines) { static const CHAR indent_str[16] = _T(" "); int i; @@ -286,17 +309,17 @@ md_process_verbatim_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) /* Output code indentation. 
*/ while(indent > SIZEOF_ARRAY(indent_str)) { - MD_TEXT(MD_TEXT_CODEBLOCK, indent_str, SIZEOF_ARRAY(indent_str)); + MD_TEXT(text_type, indent_str, SIZEOF_ARRAY(indent_str)); indent -= SIZEOF_ARRAY(indent_str); } if(indent > 0) - MD_TEXT(MD_TEXT_CODEBLOCK, indent_str, indent); + MD_TEXT(text_type, indent_str, indent); /* Output the code line itself. */ - MD_TEXT(MD_TEXT_CODEBLOCK, STR(line->beg), line->end - line->beg); + MD_TEXT(text_type, STR(line->beg), line->end - line->beg); /* Enforce end-of-line. */ - MD_TEXT(MD_TEXT_CODEBLOCK, _T("\n"), 1); + MD_TEXT(text_type, _T("\n"), 1); } abort: @@ -441,6 +464,190 @@ out: return ret; } +/* Returns type of the raw HTML block, or -1 if it is not HTML block. + * (Refer to CommonMark specification for details about the types.) + */ +static int +md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) +{ + typedef struct TAG_tag TAG; + struct TAG_tag { + const CHAR* name; + unsigned len : 8; + }; + + /* Type 6 is started by a long list of allowed tags. We use two-level + * tree to speed-up the search. 
*/ +#ifdef X + #undef X +#endif +#define X(name) { _T(name), sizeof(name)-1 } +#define Xend { NULL, 0 } + static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend }; + + static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend }; + static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend }; + static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend }; + static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"), + X("div"), X("dl"), X("dt"), Xend }; + static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"), + X("form"), X("frame"), X("frameset"), Xend }; + static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend }; + static const TAG i6[] = { X("iframe"), Xend }; + static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend }; + static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), X("meta"), Xend }; + static const TAG n6[] = { X("nav"), X("noframes"), Xend }; + static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend }; + static const TAG p6[] = { X("p"), X("param"), Xend }; + static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend }; + static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"), + X("thead"), X("title"), X("tr"), X("track"), Xend }; + static const TAG u6[] = { X("ul"), Xend }; + static const TAG xx[] = { Xend }; +#undef X + + static const TAG* map6[26] = { + a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6, + n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx + }; + OFF off = beg + 1; + int i; + + MD_ASSERT(CH(beg) == _T('<')); + + /* Check for type 1: <script, <pre, or <style */ + for(i = 0; t1[i].name != NULL; i++) { + if(off + t1[i].len < ctx->size) { + if(md_str_case_eq(STR(off), t1[i].name, t1[i].len) == 0) + return 1; + } + } + + /* Check for type 2: <!-- */ + if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off) == _T('-') && CH(off+1) 
== _T('-')) + return 2; + + /* Check for type 3: <? */ + if(off < ctx->size && CH(off) == _T('?')) + return 3; + + /* Check for type 4 or 5: <! */ + if(off < ctx->size && CH(off) == _T('!')) { + /* Check for type 4: <! followed by uppercase letter. */ + if(off + 1 < ctx->size && ISUPPER(off+1)) + return 4; + + /* Check for type 5: <![CDATA[ */ + if(off + 8 < ctx->size) { + if(memcmp(STR(off), _T("![CDATA["), 8 * sizeof(CHAR)) == 0) + return 5; + } + } + + /* Check for type 6: Many possible starting tags listed above. */ + if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) { + int slot; + const TAG* tags; + + if(CH(off) == _T('/')) + off++; + + slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a'); + tags = map6[slot]; + + for(i = 0; tags[i].name != NULL; i++) { + if(off + tags[i].len <= ctx->size) { + if(md_str_case_eq(STR(off), tags[i].name, tags[i].len) == 0) { + OFF tmp = off + tags[i].len; + if(tmp >= ctx->size) + return 6; + if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>')) + return 6; + if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>')) + return 6; + break; + } + } + } + } + + /* Check for type 7: any COMPLETE other opening or closing tag. */ + // TODO: Rework this: This should be shared with some part of + // inline raw html (spec section 6.8). + if(off + 1 < ctx->size) { + if(ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1))) { + OFF tmp = off + 1; + + /* Eat tag name. */ + while(tmp < ctx->size && (ISALNUM(tmp) || CH(tmp) == _T('-'))) + tmp++; + + /* If opening tag, eat any attributes. */ + if(tmp < ctx->size && CH(tmp) != _T('/')) { + // TODO + } + + /* Eat any whitespace */ + while(tmp < ctx->size && ISWHITESPACE(tmp)) + tmp++; + + if(tmp < ctx->size && CH(tmp) == _T('/')) + tmp++; + + if(tmp < ctx->size && CH(tmp) == _T('>')) + return 7; + } + } + + return -1; +} + +/* Case insensitive check whether line starting at the offset contains 'what'. 
*/ +static int +md_line_case_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len) +{ + OFF i; + for(i = beg; i + what_len < ctx->size; i++) { + if(ISNEWLINE(i)) + break; + if(md_str_case_eq(STR(i), what, what_len) == 0) + return 0; + } + return -1; +} + +/* Case sensitive check whether line starting at the offset contains 'what'. */ +static int +md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len) +{ + OFF i; + for(i = beg; i + what_len < ctx->size; i++) { + if(ISNEWLINE(i)) + break; + if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) + return 0; + } + return -1; +} + +/* Returns type of HTML block end condition or -1 if not an end condition. */ +static int +md_is_html_block_end_condition(MD_CTX* ctx, OFF beg) +{ + switch(ctx->html_block_type) { + case 1: return (md_line_case_contains(ctx, beg, _T("</script>"), 9) == 0 + || md_line_case_contains(ctx, beg, _T("</pre>"), 6) == 0 + || md_line_case_contains(ctx, beg, _T("</style>"), 8) == 0 ? 1 : -1); + case 2: return (md_line_contains(ctx, beg, _T("-->"), 3) == 0 ? 2 : -1); + case 3: return (md_line_contains(ctx, beg, _T("?>"), 2) == 0 ? 3 : -1); + case 4: return (md_line_contains(ctx, beg, _T(">"), 1) == 0 ? 4 : -1); + case 5: return (md_line_contains(ctx, beg, _T("]]>"), 3) == 0 ? 5 : -1); + case 6: /* Pass through */ + case 7: return (ISNEWLINE(beg) ? ctx->html_block_type : -1); + default: return -1; + } +} + /* Analyze type of the line and find some its properties. This serves as a * main input for determining type and boundaries of a block. */ static void @@ -477,6 +684,17 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ goto done; } + /* Check whether we are HTML block continuation. */ + if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) { + if(md_is_html_block_end_condition(ctx, off) == ctx->html_block_type) { + /* Make sure this is the last line of the block. 
*/ + ctx->html_block_type = 0; + } + + line->type = MD_LINE_HTML; + goto done; + } + /* Check whether we are blank line. * Note blank lines after indented code are treated as part of that block. * If they are at the end of the block, it is discarded by caller. @@ -546,6 +764,15 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ } } + /* Check whether we are start of raw HTML block. */ + if(CH(off) == _T('<') && !(ctx->r.flags & MD_FLAG_NOHTMLBLOCKS)) { + ctx->html_block_type = md_is_html_block_start_condition(ctx, off); + if(ctx->html_block_type > 0) { + line->type = MD_LINE_HTML; + goto done; + } + } + /* By default, we are normal text line. */ line->type = MD_LINE_TEXT; @@ -632,6 +859,10 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) block_type = MD_BLOCK_P; break; + case MD_LINE_HTML: + block_type = MD_BLOCK_HTML; + break; + case MD_LINE_SETEXTUNDERLINE: case MD_LINE_CODEFENCE: /* Noop. */ @@ -651,7 +882,11 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) break; case MD_BLOCK_CODE: - ret = md_process_verbatim_block(ctx, lines, n_lines); + ret = md_process_verbatim_block(ctx, MD_TEXT_CODEBLOCK, lines, n_lines); + break; + + case MD_BLOCK_HTML: + ret = md_process_verbatim_block(ctx, MD_TEXT_HTML, lines, n_lines); break; default: @@ -781,7 +1016,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u ctx.userdata = userdata; /* Offset for indented code block. */ - ctx.code_indent_offset = (ctx.r.flags & MD_FLAG_NOINDENTEDCODE) ? (OFF)(-1) : 4; + ctx.code_indent_offset = (ctx.r.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4; /* Do all the hard work. */ return md_process_doc(&ctx); diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -72,6 +72,11 @@ enum MD_BLOCKTYPE_tag { * Note the text lines (spans) within blocks are terminated with '\n'. */ MD_BLOCK_CODE, + /* Raw HTML block. This itself does not correspond to any particular HTML + * tag. 
The contents of it _IS_ raw HTML source intended to be put + * in verbatim form to the HTML output. */ + MD_BLOCK_HTML, + /* <p>...</p> */ MD_BLOCK_P }; @@ -93,7 +98,10 @@ enum MD_TEXTTYPE_tag { /* Text in a code block (inside MD_BLOCK_CODE). * Includes spaces for indentation and '\n' for new lines. * MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this kind of text. */ - MD_TEXT_CODEBLOCK + MD_TEXT_CODEBLOCK, + + /* Text is a raw HTML. */ + MD_TEXT_HTML }; @@ -117,7 +125,8 @@ struct MD_BLOCK_CODE_DETAIL_tag { * The following flags may allow some extensions or deviations from it. */ #define MD_FLAG_PERMISSIVEATXHEADERS 0x0001 /* Do not require space in ATX headers ( ###header ) */ -#define MD_FLAG_NOINDENTEDCODE 0x0002 /* Recognize only fenced code blocks. */ +#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0002 /* Disable indented code blocks. (Only fenced code works.) */ +#define MD_FLAG_NOHTMLBLOCKS 0x0004 /* Disable raw HTML blocks. */ /* Caller-provided callbacks. *