commit e01275811fef8e046940546d16431d252da74350
parent d2da226c445841b1c975722079d79344aacbf86f
Author: Martin Mitas <mity@morous.org>
Date: Tue, 4 Oct 2016 20:29:22 +0200
Implement raw HTML blocks.
Diffstat:
4 files changed, 263 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
@@ -85,8 +85,8 @@ more or less forms our to do list.
- [x] 4.2 ATX headings
- [x] 4.3 Setext headings
- [x] 4.4 Indented code blocks
- - [ ] 4.5 Fenced code blocks
- - [ ] 4.6 HTML blocks
+ - [x] 4.5 Fenced code blocks
+ - [x] 4.6 HTML blocks
- [ ] 4.7 Link reference definitions
- [x] 4.8 Paragraphs
- [x] 4.9 Blank lines
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -151,6 +151,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
case MD_BLOCK_HR: MEMBUF_APPEND_LITERAL(out, "<hr>\n"); break;
case MD_BLOCK_H: MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
case MD_BLOCK_CODE: open_code_block(out, (const MD_BLOCK_CODE_DETAIL*) detail); break;
+ case MD_BLOCK_HTML: /* noop */ break;
case MD_BLOCK_P: MEMBUF_APPEND_LITERAL(out, "<p>"); break;
}
@@ -168,6 +169,7 @@ leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
case MD_BLOCK_HR: /*noop*/ break;
case MD_BLOCK_H: MEMBUF_APPEND_LITERAL(out, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
case MD_BLOCK_CODE: MEMBUF_APPEND_LITERAL(out, "</code></pre>\n"); break;
+ case MD_BLOCK_HTML: /* noop */ break;
case MD_BLOCK_P: MEMBUF_APPEND_LITERAL(out, "</p>\n"); break;
}
@@ -192,6 +194,7 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat
struct membuffer* out = (struct membuffer*) userdata;
switch(type) {
+ case MD_TEXT_HTML: membuf_append(out, text, size); break;
default: membuf_append_escaped(out, text, size); break;
}
@@ -304,6 +307,7 @@ static const option cmdline_options[] = {
{ "help", 'h', 'h', OPTION_ARG_NONE },
{ "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE },
{ "fno-indented-code", 0, 'I', OPTION_ARG_NONE },
+ { "fno-html-blocks", 0, 'H', OPTION_ARG_NONE },
{ 0 }
};
@@ -322,7 +326,8 @@ usage(void)
"\n"
"Markdown dialect options:\n"
" --fpermissive-atx-headers allow ATX headers without delimiting space\n"
- " --fno-indented-code disabled indented code blocks\n"
+ " --fno-indented-code disable indented code blocks\n"
+ " --fno-html-blocks disable raw HTML blocks\n"
);
}
@@ -351,7 +356,8 @@ cmdline_callback(int opt, char const* value, void* data)
case 'h': usage(); exit(0); break;
case 'A': renderer_flags |= MD_FLAG_PERMISSIVEATXHEADERS; break;
- case 'I': renderer_flags |= MD_FLAG_NOINDENTEDCODE; break;
+ case 'I': renderer_flags |= MD_FLAG_NOINDENTEDCODEBLOCKS; break;
+ case 'H': renderer_flags |= MD_FLAG_NOHTMLBLOCKS; break;
default:
fprintf(stderr, "Illegal option: %s\n", value);
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -88,6 +88,9 @@ struct MD_CTX_tag {
OFF code_fence_indent;
OFF code_fence_info_beg;
OFF code_fence_info_end;
+
+ /* For MD_BLOCK_HTML. */
+ unsigned html_block_type;
};
typedef enum MD_LINETYPE_tag MD_LINETYPE;
@@ -100,6 +103,7 @@ enum MD_LINETYPE_tag {
MD_LINE_INDENTEDCODE,
MD_LINE_CODEFENCE,
MD_LINE_FENCEDCODE,
+ MD_LINE_HTML,
MD_LINE_TEXT
};
@@ -205,6 +209,25 @@ md_strchr(const CHAR* str, CHAR ch)
return NULL;
}
+/* Compare the first 'n' characters of 's1' and 's2', treating characters
+ * recognized by ISLOWER_() as equal to their upper-case counterparts.
+ *
+ * Returns 0 if all 'n' characters match (also when 'n' is zero), -1 otherwise.
+ * Beware: zero means "equal" here (as with memcmp()), not "false". */
+static inline int
+md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
+{
+    const CHAR* p1 = s1;
+    const CHAR* p2 = s2;
+    const CHAR* end1 = s1 + n;
+
+    while(p1 < end1) {
+        CHAR up1 = *p1++;
+        CHAR up2 = *p2++;
+
+        /* Fold both characters to upper case before comparing them. */
+        if(ISLOWER_(up1))
+            up1 += ('A'-'a');
+        if(ISLOWER_(up2))
+            up2 += ('A'-'a');
+
+        if(up1 != up2)
+            return -1;
+    }
+
+    return 0;
+}
+
#define MD_ENTER_BLOCK(type, arg) \
do { \
@@ -274,7 +297,7 @@ abort:
}
static int
-md_process_verbatim_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
+md_process_verbatim_block(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_LINE* lines, int n_lines)
{
static const CHAR indent_str[16] = _T(" ");
int i;
@@ -286,17 +309,17 @@ md_process_verbatim_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
/* Output code indentation. */
while(indent > SIZEOF_ARRAY(indent_str)) {
- MD_TEXT(MD_TEXT_CODEBLOCK, indent_str, SIZEOF_ARRAY(indent_str));
+ MD_TEXT(text_type, indent_str, SIZEOF_ARRAY(indent_str));
indent -= SIZEOF_ARRAY(indent_str);
}
if(indent > 0)
- MD_TEXT(MD_TEXT_CODEBLOCK, indent_str, indent);
+ MD_TEXT(text_type, indent_str, indent);
/* Output the code line itself. */
- MD_TEXT(MD_TEXT_CODEBLOCK, STR(line->beg), line->end - line->beg);
+ MD_TEXT(text_type, STR(line->beg), line->end - line->beg);
/* Enforce end-of-line. */
- MD_TEXT(MD_TEXT_CODEBLOCK, _T("\n"), 1);
+ MD_TEXT(text_type, _T("\n"), 1);
}
abort:
@@ -441,6 +464,190 @@ out:
return ret;
}
+/* Returns type of the raw HTML block (1 to 7), or -1 if the text at 'beg'
+ * does not satisfy any HTML block start condition.
+ * (Refer to CommonMark specification for details about the types.)
+ *
+ * 'beg' must be the offset of the opening '<' (this is asserted).
+ * Note the result is signed; do not store it into an unsigned variable
+ * without checking for -1 first.
+ */
+static int
+md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
+{
+    typedef struct TAG_tag TAG;
+    struct TAG_tag {
+        const CHAR* name;
+        unsigned len : 8;
+    };
+
+    /* Type 6 is started by a long list of allowed tags. We use two-level
+     * tree to speed-up the search. */
+#ifdef X
+    #undef X
+#endif
+#define X(name)     { _T(name), sizeof(name)-1 }
+#define Xend        { NULL, 0 }
+    static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
+
+    static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
+    static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
+    static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
+    static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
+                              X("div"), X("dl"), X("dt"), Xend };
+    static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
+                              X("form"), X("frame"), X("frameset"), Xend };
+    static const TAG h6[] = { X("h1"), X("h2"), X("h3"), X("h4"), X("h5"), X("h6"),
+                              X("head"), X("header"), X("hr"), X("html"), Xend };
+    static const TAG i6[] = { X("iframe"), Xend };
+    static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
+    static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), X("meta"), Xend };
+    static const TAG n6[] = { X("nav"), X("noframes"), Xend };
+    static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
+    static const TAG p6[] = { X("p"), X("param"), Xend };
+    static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
+    static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
+                              X("thead"), X("title"), X("tr"), X("track"), Xend };
+    static const TAG u6[] = { X("ul"), Xend };
+    static const TAG xx[] = { Xend };
+#undef X
+
+    /* First level of the tree: one slot per initial letter 'a' to 'z'. */
+    static const TAG* map6[26] = {
+        a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
+        n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
+    };
+    OFF off = beg + 1;
+    int i;
+
+    MD_ASSERT(CH(beg) == _T('<'));
+
+    /* Check for type 1: <script, <pre, or <style
+     * The tag name has to be followed by whitespace, '>' or end of line;
+     * otherwise e.g. <president> would be mistaken for <pre. */
+    for(i = 0; t1[i].name != NULL; i++) {
+        if(off + t1[i].len <= ctx->size) {
+            if(md_str_case_eq(STR(off), t1[i].name, t1[i].len) == 0) {
+                OFF tmp = off + t1[i].len;
+                if(tmp >= ctx->size)
+                    return 1;
+                if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
+                    return 1;
+            }
+        }
+    }
+
+    /* Check for type 2: <!-- */
+    if(off + 2 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
+        return 2;
+
+    /* Check for type 3: <? */
+    if(off < ctx->size && CH(off) == _T('?'))
+        return 3;
+
+    /* Check for type 4 or 5: <! */
+    if(off < ctx->size && CH(off) == _T('!')) {
+        /* Check for type 4: <! followed by uppercase letter. */
+        if(off + 1 < ctx->size && ISUPPER(off+1))
+            return 4;
+
+        /* Check for type 5: <![CDATA[ */
+        if(off + 8 <= ctx->size) {
+            if(memcmp(STR(off), _T("![CDATA["), 8 * sizeof(CHAR)) == 0)
+                return 5;
+        }
+    }
+
+    /* Check for type 6: Many possible starting tags listed above. */
+    if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
+        int slot;
+        const TAG* tags;
+
+        /* Skip the slash of a closing tag; 'off' now points to tag name. */
+        if(CH(off) == _T('/'))
+            off++;
+
+        slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
+        tags = map6[slot];
+
+        /* Do not stop at the first name which matches as a prefix: some
+         * names are prefixes of others (e.g. "base" and "basefont", "col"
+         * and "colgroup"), so the remaining candidates have to be tried
+         * too when the character after the match does not end the name. */
+        for(i = 0; tags[i].name != NULL; i++) {
+            if(off + tags[i].len <= ctx->size) {
+                if(md_str_case_eq(STR(off), tags[i].name, tags[i].len) == 0) {
+                    OFF tmp = off + tags[i].len;
+                    if(tmp >= ctx->size)
+                        return 6;
+                    if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
+                        return 6;
+                    if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
+                        return 6;
+                }
+            }
+        }
+    }
+
+    /* Check for type 7: any COMPLETE other opening or closing tag.
+     * Note 'off' may already have been moved past the '/' of a closing tag
+     * by the type 6 check above; either way 'tmp' ends up inside or just
+     * after the tag name. */
+    // TODO: Rework this: This should be shared with some part of
+    //       inline raw html (spec section 6.8).
+    if(off + 1 < ctx->size) {
+        if(ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1))) {
+            OFF tmp = off + 1;
+
+            /* Eat tag name. */
+            while(tmp < ctx->size && (ISALNUM(tmp) || CH(tmp) == _T('-')))
+                tmp++;
+
+            /* If opening tag, eat any attributes. */
+            if(tmp < ctx->size && CH(tmp) != _T('/')) {
+                // TODO
+            }
+
+            /* Eat any whitespace */
+            while(tmp < ctx->size && ISWHITESPACE(tmp))
+                tmp++;
+
+            if(tmp < ctx->size && CH(tmp) == _T('/'))
+                tmp++;
+
+            if(tmp < ctx->size && CH(tmp) == _T('>'))
+                return 7;
+        }
+    }
+
+    return -1;
+}
+
+/* Case insensitive check whether line starting at the offset contains 'what'.
+ *
+ * The search starts at 'beg' and stops at the first new line character or
+ * when fewer than 'what_len' characters remain in the document.
+ * Returns 0 if 'what' is found, -1 otherwise. */
+static int
+md_line_case_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len)
+{
+    OFF i;
+    /* '<=' so that a match ending exactly at the end of document is found. */
+    for(i = beg; i + what_len <= ctx->size; i++) {
+        if(ISNEWLINE(i))
+            break;
+        if(md_str_case_eq(STR(i), what, what_len) == 0)
+            return 0;
+    }
+    return -1;
+}
+
+/* Case sensitive check whether line starting at the offset contains 'what'.
+ *
+ * The search starts at 'beg' and stops at the first new line character or
+ * when fewer than 'what_len' characters remain in the document.
+ * Returns 0 if 'what' is found, -1 otherwise. */
+static int
+md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len)
+{
+    OFF i;
+    /* '<=' so that a match ending exactly at the end of document is found. */
+    for(i = beg; i + what_len <= ctx->size; i++) {
+        if(ISNEWLINE(i))
+            break;
+        if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0)
+            return 0;
+    }
+    return -1;
+}
+
+/* Returns type of HTML block end condition or -1 if not an end condition.
+ *
+ * The line starting at 'beg' is tested only against the end condition of the
+ * currently open block type (ctx->html_block_type), so the result is either
+ * that very type or -1. */
+static int
+md_is_html_block_end_condition(MD_CTX* ctx, OFF beg)
+{
+    switch(ctx->html_block_type) {
+        case 1:     return (md_line_case_contains(ctx, beg, _T("</script>"), 9) == 0
+                            || md_line_case_contains(ctx, beg, _T("</pre>"), 6) == 0
+                            || md_line_case_contains(ctx, beg, _T("</style>"), 8) == 0 ? 1 : -1);
+        case 2:     return (md_line_contains(ctx, beg, _T("-->"), 3) == 0 ? 2 : -1);
+        case 3:     return (md_line_contains(ctx, beg, _T("?>"), 2) == 0 ? 3 : -1);
+        case 4:     return (md_line_contains(ctx, beg, _T(">"), 1) == 0 ? 4 : -1);
+        case 5:     return (md_line_contains(ctx, beg, _T("]]>"), 3) == 0 ? 5 : -1);
+        case 6:     /* Pass through */
+        case 7:
+            /* Types 6 and 7 are ended by a blank line. The explicit cast
+             * keeps both branches of the conditional signed, so -1 is not
+             * converted to unsigned on its way out. The bounds check avoids
+             * reading past the end of input. */
+            return ((beg >= ctx->size || ISNEWLINE(beg)) ? (int) ctx->html_block_type : -1);
+        default:    return -1;
+    }
+}
+
/* Analyze type of the line and find some its properties. This serves as a
* main input for determining type and boundaries of a block. */
static void
@@ -477,6 +684,17 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
goto done;
}
+ /* Check whether we are HTML block continuation. */
+ if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
+ if(md_is_html_block_end_condition(ctx, off) == ctx->html_block_type) {
+ /* Make sure this is the last line of the block. */
+ ctx->html_block_type = 0;
+ }
+
+ line->type = MD_LINE_HTML;
+ goto done;
+ }
+
/* Check whether we are blank line.
* Note blank lines after indented code are treated as part of that block.
* If they are at the end of the block, it is discarded by caller.
@@ -546,6 +764,15 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, const MD_LINE* pivot_line, MD_
}
}
+    /* Check whether we are start of raw HTML block. */
+    if(CH(off) == _T('<') && !(ctx->r.flags & MD_FLAG_NOHTMLBLOCKS)) {
+        /* Keep the result in a signed variable: the function returns -1 for
+         * "not an HTML block" and ctx->html_block_type is unsigned, so a
+         * direct assignment would turn -1 into a huge positive value and
+         * every line starting with '<' would be taken as raw HTML. */
+        int html_type = md_is_html_block_start_condition(ctx, off);
+
+        if(html_type > 0) {
+            ctx->html_block_type = (unsigned) html_type;
+
+            /* The very same line may also satisfy the end condition
+             * (e.g. "<!-- comment -->"). Make sure it is then the last
+             * line of the block. */
+            if(md_is_html_block_end_condition(ctx, off) == html_type)
+                ctx->html_block_type = 0;
+
+            line->type = MD_LINE_HTML;
+            goto done;
+        }
+
+        ctx->html_block_type = 0;
+    }
+
/* By default, we are normal text line. */
line->type = MD_LINE_TEXT;
@@ -632,6 +859,10 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
block_type = MD_BLOCK_P;
break;
+ case MD_LINE_HTML:
+ block_type = MD_BLOCK_HTML;
+ break;
+
case MD_LINE_SETEXTUNDERLINE:
case MD_LINE_CODEFENCE:
/* Noop. */
@@ -651,7 +882,11 @@ md_process_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
break;
case MD_BLOCK_CODE:
- ret = md_process_verbatim_block(ctx, lines, n_lines);
+ ret = md_process_verbatim_block(ctx, MD_TEXT_CODEBLOCK, lines, n_lines);
+ break;
+
+ case MD_BLOCK_HTML:
+ ret = md_process_verbatim_block(ctx, MD_TEXT_HTML, lines, n_lines);
break;
default:
@@ -781,7 +1016,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u
ctx.userdata = userdata;
/* Offset for indented code block. */
- ctx.code_indent_offset = (ctx.r.flags & MD_FLAG_NOINDENTEDCODE) ? (OFF)(-1) : 4;
+ ctx.code_indent_offset = (ctx.r.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
/* Do all the hard work. */
return md_process_doc(&ctx);
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -72,6 +72,11 @@ enum MD_BLOCKTYPE_tag {
* Note the text lines (spans) within blocks are terminated with '\n'. */
MD_BLOCK_CODE,
+ /* Raw HTML block. This itself does not correspond to any particular HTML
+ * tag. The contents of it _IS_ raw HTML source intended to be put
+ * in verbatim form to the HTML output. */
+ MD_BLOCK_HTML,
+
/* <p>...</p> */
MD_BLOCK_P
};
@@ -93,7 +98,10 @@ enum MD_TEXTTYPE_tag {
/* Text in a code block (inside MD_BLOCK_CODE).
* Includes spaces for indentation and '\n' for new lines.
* MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this kind of text. */
- MD_TEXT_CODEBLOCK
+ MD_TEXT_CODEBLOCK,
+
+ /* Text is raw HTML (inside MD_BLOCK_HTML). */
+ MD_TEXT_HTML
};
@@ -117,7 +125,8 @@ struct MD_BLOCK_CODE_DETAIL_tag {
* The following flags may allow some extensions or deviations from it.
*/
#define MD_FLAG_PERMISSIVEATXHEADERS 0x0001 /* Do not require space in ATX headers ( ###header ) */
-#define MD_FLAG_NOINDENTEDCODE 0x0002 /* Recognize only fenced code blocks. */
+#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0002 /* Disable indented code blocks. (Only fenced code works.) */
+#define MD_FLAG_NOHTMLBLOCKS 0x0004 /* Disable raw HTML blocks. */
/* Caller-provided callbacks.
*