md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit d2da226c445841b1c975722079d79344aacbf86f
parent 43bd28445b639afbd91e0155ffb5b5072c88e181
Author: Martin Mitas <mity@morous.org>
Date:   Tue,  4 Oct 2016 19:48:06 +0200

Implement fenced code blocks.

Diffstat:
M md2html/md2html.c | 17 ++++++++++++++++-
M md4c/md4c.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M md4c/md4c.h | 7 +++++++
3 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -125,6 +125,21 @@ membuf_append_escaped(struct membuffer* buf, const char* data, MD_SIZE size) *** HTML renderer implementation *** **************************************/ +static void +open_code_block(struct membuffer* out, const MD_BLOCK_CODE_DETAIL* det) +{ + MEMBUF_APPEND_LITERAL(out, "<pre><code"); + + /* If known, output the HTML 5 attribute class="language-LANGNAME". */ + if(det->lang != NULL) { + MEMBUF_APPEND_LITERAL(out, " class=\"language-"); + membuf_append_escaped(out, det->lang, det->lang_size); + MEMBUF_APPEND_LITERAL(out, "\""); + } + + MEMBUF_APPEND_LITERAL(out, ">"); +} + static int enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) { @@ -135,7 +150,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) case MD_BLOCK_DOC: /* noop */ break; case MD_BLOCK_HR: MEMBUF_APPEND_LITERAL(out, "<hr>\n"); break; case MD_BLOCK_H: MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; - case MD_BLOCK_CODE: MEMBUF_APPEND_LITERAL(out, "<pre><code>"); break; + case MD_BLOCK_CODE: open_code_block(out, (const MD_BLOCK_CODE_DETAIL*) detail); break; case MD_BLOCK_P: MEMBUF_APPEND_LITERAL(out, "<p>"); break; } diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -81,6 +81,13 @@ struct MD_CTX_tag { /* For MD_BLOCK_HEADER. */ unsigned header_level; + + /* For MD_BLOCK_CODE (fenced). 
*/ + CHAR code_fence_char; /* '~' or '`' */ + SZ code_fence_length; + OFF code_fence_indent; + OFF code_fence_info_beg; + OFF code_fence_info_end; }; typedef enum MD_LINETYPE_tag MD_LINETYPE; @@ -91,6 +98,8 @@ enum MD_LINETYPE_tag { MD_LINE_SETEXTHEADER, MD_LINE_SETEXTUNDERLINE, MD_LINE_INDENTEDCODE, + MD_LINE_CODEFENCE, + MD_LINE_FENCEDCODE, MD_LINE_TEXT }; @@ -364,6 +373,74 @@ md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end) return 0; } +static int +md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) +{ + OFF off = beg; + + MD_ASSERT(CH(beg) == _T('`') || CH(beg) == _T('~')); + + while(off < ctx->size && CH(off) == CH(beg)) + off++; + + /* Fence must have at least three characters. */ + if(off - beg < 3) + return -1; + + ctx->code_fence_length = off - beg; + + /* Optionally, space(s) can follow. */ + while(off < ctx->size && CH(off) == _T(' ')) + off++; + + /* Optionally, language info can follow. It must not contain '`'. */ + ctx->code_fence_info_beg = off; + while(off < ctx->size && CH(off) != _T('`') && !ISNEWLINE(off)) + off++; + if(off < ctx->size && !ISNEWLINE(off)) + return -1; + + *p_end = off; + + /* Right trim of language info. */ + while(off > ctx->code_fence_info_beg && CH(off-1) == _T(' ')) + off--; + ctx->code_fence_info_end = off; + + ctx->code_fence_char = CH(beg); + return 0; +} + +static int +md_is_closing_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) +{ + OFF off = beg; + int ret = -1; + + /* Closing fence must have at least the same length and use same char as + * opening one. */ + while(off < ctx->size && CH(off) == ctx->code_fence_char) + off++; + if(off - beg < ctx->code_fence_length) + goto out; + + /* Optionally, space(s) can follow */ + while(off < ctx->size && CH(off) == _T(' ')) + off++; + + /* But nothing more is allowed on the line. 
*/ + if(off < ctx->size && !ISNEWLINE(off)) + goto out; + + ret = 0; + +out: + /* Note we set *p_end even on failure: If we are not closing fence, caller + * would eat the line anyway without any parsing. */ + *p_end = off; + return ret; +} + /* Analyze type of the line and find some its properties. This serves as a * main input for determining type and boundaries of a block. */ static void @@ -385,6 +462,21 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ line->beg = off; + /* Check whether we are fenced code continuation. */ + if(pivot_line->type == MD_LINE_FENCEDCODE || pivot_line->type == MD_LINE_CODEFENCE) { + /* We are another MD_LINE_FENCEDCODE unless we are closing fence + * which we transform into MD_LINE_BLANK. */ + if(line->indent < ctx->code_indent_offset) { + if(md_is_closing_code_fence(ctx, off, &off) == 0) { + line->type = MD_LINE_BLANK; + goto done; + } + } + + line->type = MD_LINE_FENCEDCODE; + goto done; + } + /* Check whether we are blank line. * Note blank lines after indented code are treated as part of that block. * If they are at the end of the block, it is discarded by caller. @@ -428,7 +520,7 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ } } - /* Check whether we are setext underline. */ + /* Check whether we are Setext underline. */ if(pivot_line->type == MD_LINE_TEXT && (CH(off) == _T('=') || CH(off) == _T('-'))) { if(md_is_setext_underline(ctx, off, &off) == 0) { line->type = MD_LINE_SETEXTUNDERLINE; @@ -436,7 +528,8 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ } } - /* Check whether we are thematic break line. */ + /* Check whether we are thematic break line. + * (Keep this after check for Setext underline as that one has higher priority). 
*/ if(ISANYOF(off, _T("-_*"))) { if(md_is_hr_line(ctx, off, &off) == 0) { line->type = MD_LINE_HR; @@ -444,6 +537,15 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_ } } + /* Check whether we are starting code fence. */ + if(CH(off) == _T('`') || CH(off) == _T('~')) { + if(md_is_opening_code_fence(ctx, off, &off) == 0) { + ctx->code_fence_indent = line->indent; + line->type = MD_LINE_CODEFENCE; + goto done; + } + } + /* By default, we are normal text line. */ line->type = MD_LINE_TEXT; @@ -489,6 +591,7 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) MD_BLOCKTYPE block_type; union { MD_BLOCK_H_DETAIL header; + MD_BLOCK_CODE_DETAIL code; } det; int ret = 0; @@ -511,7 +614,18 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) break; case MD_LINE_INDENTEDCODE: + det.code.lang = NULL; + det.code.lang_size = 0; + block_type = MD_BLOCK_CODE; + break; + + case MD_LINE_FENCEDCODE: block_type = MD_BLOCK_CODE; + if(ctx->code_fence_info_beg < ctx->code_fence_info_end) + det.code.lang = STR(ctx->code_fence_info_beg); + else + det.code.lang = NULL; + det.code.lang_size = ctx->code_fence_info_end - ctx->code_fence_info_beg; break; case MD_LINE_TEXT: @@ -519,6 +633,10 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) break; case MD_LINE_SETEXTUNDERLINE: + case MD_LINE_CODEFENCE: + /* Noop. */ + return 0; + default: MD_UNREACHABLE(); break; diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -103,6 +103,13 @@ struct MD_BLOCK_H_DETAIL_tag { unsigned level; /* Header level (1 - 6) */ }; +/* Detailed info for MD_BLOCK_CODE. */ +typedef struct MD_BLOCK_CODE_DETAIL_tag MD_BLOCK_CODE_DETAIL; +struct MD_BLOCK_CODE_DETAIL_tag { + const MD_CHAR* lang; /* Not zero-terminated, use lang_size. */ + MD_SIZE lang_size; +}; + /* Flags specifying Markdown dialect. *