Implement fenced code blocks. - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit d2da226c445841b1c975722079d79344aacbf86f
parent 43bd28445b639afbd91e0155ffb5b5072c88e181
Author: Martin Mitas <mity@morous.org>
Date:   Tue,  4 Oct 2016 19:48:06 +0200

Implement fenced code blocks.

Diffstat:
M md2html/md2html.c  | 17 ++++++++++++++++-
M md4c/md4c.c  | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M md4c/md4c.h  | 7 +++++++

3 files changed, 143 insertions(+), 3 deletions(-)
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -125,6 +125,21 @@ membuf_append_escaped(struct membuffer* buf, const char* data, MD_SIZE size)
  ***  HTML renderer implementation  ***
  **************************************/
 
+/* Append the opening tags of a code block to out: "<pre><code", then the
+ * HTML 5 attribute class="language-LANGNAME" if det->lang is set, then ">". */
+static void
+open_code_block(struct membuffer* out, const MD_BLOCK_CODE_DETAIL* det)
+{
+    const int lang_known = (det->lang != NULL);
+    MEMBUF_APPEND_LITERAL(out, "<pre><code");
+    if(lang_known) {
+        MEMBUF_APPEND_LITERAL(out, " class=\"language-");
+        membuf_append_escaped(out, det->lang, det->lang_size);
+        MEMBUF_APPEND_LITERAL(out, "\"");
+    }
+    MEMBUF_APPEND_LITERAL(out, ">");
+}
+
 static int
 enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
 {
@@ -135,7 +150,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
         case MD_BLOCK_DOC:      /* noop */ break;
         case MD_BLOCK_HR:       MEMBUF_APPEND_LITERAL(out, "<hr>\n"); break;
         case MD_BLOCK_H:        MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
-        case MD_BLOCK_CODE:     MEMBUF_APPEND_LITERAL(out, "<pre><code>"); break;
+        case MD_BLOCK_CODE:     open_code_block(out, (const MD_BLOCK_CODE_DETAIL*) detail); break;
         case MD_BLOCK_P:        MEMBUF_APPEND_LITERAL(out, "<p>"); break;
     }
 
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -81,6 +81,13 @@ struct MD_CTX_tag {
 
     /* For MD_BLOCK_HEADER. */
     unsigned header_level;
+
+    /* For MD_BLOCK_CODE (fenced). */
+    CHAR code_fence_char;   /* '~' or '`' */
+    SZ code_fence_length;
+    OFF code_fence_indent;
+    OFF code_fence_info_beg;
+    OFF code_fence_info_end;
 };
 
 typedef enum MD_LINETYPE_tag MD_LINETYPE;
@@ -91,6 +98,8 @@ enum MD_LINETYPE_tag {
     MD_LINE_SETEXTHEADER,
     MD_LINE_SETEXTUNDERLINE,
     MD_LINE_INDENTEDCODE,
+    MD_LINE_CODEFENCE,
+    MD_LINE_FENCEDCODE,
     MD_LINE_TEXT
 };
 
@@ -364,6 +373,74 @@ md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end)
     return 0;
 }
 
+/* Recognize an opening code fence (a run of 3+ '`' or '~') starting at beg.
+ * On success returns 0, sets *p_end to the end of the line's content and
+ * records fence char, length and info string range in ctx; else returns -1. */
+static int
+md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
+{
+    OFF off = beg;
+    OFF info_beg;
+    SZ fence_length;
+
+    MD_ASSERT(CH(beg) == _T('`') || CH(beg) == _T('~'));
+    while(off < ctx->size && CH(off) == CH(beg))
+        off++;
+    if(off - beg < 3)   /* Fence must have at least three characters. */
+        return -1;
+    fence_length = off - beg;
+
+    /* Optionally, space(s) and then an info string can follow. */
+    while(off < ctx->size  &&  CH(off) == _T(' '))
+        off++;
+    info_beg = off;
+    for( ; off < ctx->size  &&  !ISNEWLINE(off); off++) {
+        /* Only a backtick fence forbids '`' in its info string (the line
+         * could be an inline code span then); a tilde fence allows it. */
+        if(CH(beg) == _T('`')  &&  CH(off) == _T('`'))
+            return -1;
+    }
+    *p_end = off;
+    while(off > info_beg  &&  CH(off-1) == _T(' '))   /* Right trim the info. */
+        off--;
+
+    /* Touch ctx only now so that a rejected line leaves it intact. */
+    ctx->code_fence_char = CH(beg);
+    ctx->code_fence_length = fence_length;
+    ctx->code_fence_info_beg = info_beg;
+    ctx->code_fence_info_end = off;
+    return 0;
+}
+
+/* Test whether the line starting at beg is a closing code fence, i.e. a run
+ * of ctx->code_fence_char at least ctx->code_fence_length long, optionally
+ * followed by space(s) and nothing else. Returns 0 if so, -1 otherwise;
+ * *p_end is written in both cases. */
+static int
+md_is_closing_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
+{
+    OFF pos;
+    int is_closing = 0;
+
+    /* Measure the run of fence characters; it has to use the same character
+     * as the opening fence and be at least as long. */
+    for(pos = beg; pos < ctx->size  &&  CH(pos) == ctx->code_fence_char; pos++)
+        ;
+
+    if(pos - beg >= ctx->code_fence_length) {
+        /* Trailing space(s) are tolerated... */
+        while(pos < ctx->size  &&  CH(pos) == _T(' '))
+            pos++;
+
+        /* ...but the line has to end right after them. */
+        is_closing = (pos >= ctx->size  ||  ISNEWLINE(pos));
+    }
+
+    /* Report the position even for a non-closing line: the caller eats such
+     * a line anyway without any parsing. */
+    *p_end = pos;
+    return (is_closing ? 0 : -1);
+}
+
 /* Analyze type of the line and find some its properties. This serves as a
  * main input for determining type and boundaries of a block. */
 static void
@@ -385,6 +462,21 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
 
     line->beg = off;
 
+    /* Check whether we are fenced code continuation. */
+    if(pivot_line->type == MD_LINE_FENCEDCODE || pivot_line->type == MD_LINE_CODEFENCE) {
+        /* We are another MD_LINE_FENCEDCODE unless we are closing fence
+         * which we transform into MD_LINE_BLANK. */
+        if(line->indent < ctx->code_indent_offset) {
+            if(md_is_closing_code_fence(ctx, off, &off) == 0) {
+                line->type = MD_LINE_BLANK;
+                goto done;
+            }
+        }
+
+        line->type = MD_LINE_FENCEDCODE;
+        goto done;
+    }
+
     /* Check whether we are blank line.
      * Note blank lines after indented code are treated as part of that block.
      * If they are at the end of the block, it is discarded by caller.
@@ -428,7 +520,7 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
         }
     }
 
-    /* Check whether we are setext underline. */
+    /* Check whether we are Setext underline. */
     if(pivot_line->type == MD_LINE_TEXT  &&  (CH(off) == _T('=') || CH(off) == _T('-'))) {
         if(md_is_setext_underline(ctx, off, &off) == 0) {
             line->type = MD_LINE_SETEXTUNDERLINE;
@@ -436,7 +528,8 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
         }
     }
 
-    /* Check whether we are thematic break line. */
+    /* Check whether we are thematic break line.
+     * (Keep this after check for Setext underline as that one has higher priority). */
     if(ISANYOF(off, _T("-_*"))) {
         if(md_is_hr_line(ctx, off, &off) == 0) {
             line->type = MD_LINE_HR;
@@ -444,6 +537,15 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
         }
     }
 
+    /* Check whether we are starting code fence. */
+    if(CH(off) == _T('`') || CH(off) == _T('~')) {
+        if(md_is_opening_code_fence(ctx, off, &off) == 0) {
+            ctx->code_fence_indent = line->indent;
+            line->type = MD_LINE_CODEFENCE;
+            goto done;
+        }
+    }
+
     /* By default, we are normal text line. */
     line->type = MD_LINE_TEXT;
 
@@ -489,6 +591,7 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     MD_BLOCKTYPE block_type;
     union {
         MD_BLOCK_H_DETAIL header;
+        MD_BLOCK_CODE_DETAIL code;
     } det;
     int ret = 0;
 
@@ -511,7 +614,18 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
             break;
 
         case MD_LINE_INDENTEDCODE:
+            det.code.lang = NULL;
+            det.code.lang_size = 0;
+            block_type = MD_BLOCK_CODE;
+            break;
+
+        case MD_LINE_FENCEDCODE:
             block_type = MD_BLOCK_CODE;
+            if(ctx->code_fence_info_beg < ctx->code_fence_info_end)
+                det.code.lang = STR(ctx->code_fence_info_beg);
+            else
+                det.code.lang = NULL;
+            det.code.lang_size = ctx->code_fence_info_end - ctx->code_fence_info_beg;
             break;
 
         case MD_LINE_TEXT:
@@ -519,6 +633,10 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
             break;
 
         case MD_LINE_SETEXTUNDERLINE:
+        case MD_LINE_CODEFENCE:
+            /* Noop. */
+            return 0;
+
         default:
             MD_UNREACHABLE();
             break;
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -103,6 +103,13 @@ struct MD_BLOCK_H_DETAIL_tag {
     unsigned level;         /* Header level (1 - 6) */
 };
 
+/* Detailed info for MD_BLOCK_CODE (the detail argument of the block callback). */
+typedef struct MD_BLOCK_CODE_DETAIL_tag MD_BLOCK_CODE_DETAIL;
+struct MD_BLOCK_CODE_DETAIL_tag {
+    const MD_CHAR* lang;    /* Fence info string, or NULL if none. Not zero-terminated, use lang_size. */
+    MD_SIZE lang_size;      /* Length of lang in characters; 0 if lang is NULL. */
+};
+
 
 /* Flags specifying Markdown dialect.
  *

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

M	md2html/md2html.c	|	17	++++++++++++++++-
M	md4c/md4c.c	|	122	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	md4c/md4c.h	|	7	+++++++