Implement raw HTML spans. - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit 4f65b45bd6e1a979beefd43cd124e08b83a258ed
parent 87b41e1aba93c29f0ea378fbd923a3ebf42ce974
Author: Martin Mitas <mity@morous.org>
Date:   Sat,  8 Oct 2016 20:04:38 +0200

Implement raw HTML spans.

Diffstat:
M README.md  | 4 ++--
M md4c/md4c.c  | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M md4c/md4c.h  | 2 ++

3 files changed, 373 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
@@ -104,7 +104,7 @@ more or less forms our to do list.
   - [ ] 6.5 Links
   - [ ] 6.6 Images
   - [ ] 6.7 Autolinks
-  - [ ] 6.8 Raw HTML
+  - [x] 6.8 Raw HTML
   - [x] 6.9 Hard line breaks
   - [x] 6.10 Soft line breaks
   - [x] 6.11 Textual content
@@ -142,7 +142,7 @@ consideration.
   - [x] Permissive ATX headers: `###Header` (without space)
   - [ ] Permissive autolinks: `http://google.com` (without `<`...`>`)
   - [x] Disabling indented code blocks
-  - [ ] Disabling raw HTML blocks/spans
+  - [x] Disabling raw HTML blocks/spans
 
 
 ## License
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -298,6 +298,300 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
     } while(0)
 
 
+/******************************
+ ***  Recognizing raw HTML  ***
+ ******************************/
+
+/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
+ * or when breaking document to blocks (checking for start of HTML block type 7).
+ *
+ * When breaking document to blocks, we do not yet know line boundaries, but
+ * in that case the whole tag has to live on a single line. We distinguish this
+ * by n_lines == 0; lines is then not dereferenced and the scan is bounded
+ * by ctx->size instead of by a line end.
+ *
+ * Returns 0 and stores the offset just past the closing '>' into *p_end if
+ * an open tag or a closing tag starts at offset beg. Returns -1 otherwise;
+ * *p_end is not written in that case.
+ */
+static int
+md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    int attr_state;
+    OFF off = beg;
+    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
+    int i = 0;
+
+    /* At least '<' and one more character are needed. */
+    if(off + 1 >= line_end)
+        return -1;
+    if(CH(off) != _T('<'))
+        return -1;
+    off++;
+
+    /* For parsing attributes, we need a little state automaton below.
+     * State -1: no attributes are allowed.
+     * State 0: attribute could follow after some whitespace.
+     * State 1: after a whitespace (attribute name may follow).
+     * State 2: after attribute name ('=' MAY follow).
+     * State 3: after '=' (value specification MUST follow).
+     * State 41: in middle of unquoted attribute value.
+     * State 42: in middle of single-quoted attribute value.
+     * State 43: in middle of double-quoted attribute value.
+     */
+    attr_state = 0;
+
+    if(CH(off) == _T('/')) {
+        /* Closer tag "</ ... >". No attributes may be present. */
+        attr_state = -1;
+        off++;
+    }
+
+    /* Tag name */
+    if(off >= line_end  ||  !ISALPHA(off))
+        return -1;
+    off++;
+    while(off < line_end  &&  (ISALNUM(off)  ||  ISANYOF(off, _T("_.:-"))))
+        off++;
+
+    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
+     * and final '>'. */
+    while(1) {
+        while(off < line_end) {
+            if(attr_state > 40) {
+                /* An unquoted value is terminated by whitespace or a new
+                 * line as well as by any of the characters which may not
+                 * be part of it. */
+                if(attr_state == 41 && (ISWHITESPACE(off) || ISNEWLINE(off) || ISANYOF(off, _T("\"'=<>`")))) {
+                    attr_state = 0;
+                    off--;  /* Put the char back for re-inspection in the new state. */
+                } else if(attr_state == 42 && CH(off) == _T('\'')) {
+                    attr_state = 0;
+                } else if(attr_state == 43 && CH(off) == _T('"')) {
+                    attr_state = 0;
+                }
+                off++;
+            } else if(ISWHITESPACE(off)) {
+                if(attr_state == 0)
+                    attr_state = 1;
+                off++;
+            } else if(attr_state <= 2 && CH(off) == _T('>')) {
+                /* End. */
+                *p_end = off+1;
+                return 0;
+            } else if(attr_state >= 0 && attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
+                /* End with digraph '/>'. Not allowed for a closer tag
+                 * (attr_state == -1). */
+                *p_end = off+2;
+                return 0;
+            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
+                off++;
+                /* Attribute name */
+                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
+                    off++;
+                attr_state = 2;
+            } else if(attr_state == 2 && CH(off) == _T('=')) {
+                /* Attribute assignment sign */
+                off++;
+                attr_state = 3;
+            } else if(attr_state == 3) {
+                /* Expecting start of attribute value. */
+                if(CH(off) == _T('"'))
+                    attr_state = 43;
+                else if(CH(off) == _T('\''))
+                    attr_state = 42;
+                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
+                    attr_state = 41;
+                else
+                    return -1;
+                off++;
+            } else {
+                /* Anything unexpected. */
+                return -1;
+            }
+        }
+
+        /* We have to be on a single line. See definition of start condition
+         * of HTML block, type 7. */
+        if(n_lines == 0)
+            break;
+
+        i++;
+        if(i >= n_lines)
+            break;
+
+        off = lines[i].beg;
+        line_end = lines[i].end;
+
+        /* The line break acts as a whitespace: it allows a new attribute
+         * to follow and it terminates an unquoted attribute value. */
+        if(attr_state == 0  ||  attr_state == 41)
+            attr_state = 1;
+    }
+
+    return -1;
+}
+
+/* Check whether an HTML comment ("<!--" ... "-->") starts at offset beg.
+ *
+ * The comment may continue over the lines lines[0] ... lines[n_lines-1].
+ * Returns 0 and stores the offset just past the closing "-->" into *p_end
+ * on success; returns -1 otherwise.
+ */
+static int
+md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    OFF pos = beg;
+    int line_index = 0;
+
+    /* The opening "<!--" and at least one more character have to fit on
+     * the first line. */
+    if(pos + 4 >= lines[0].end)
+        return -1;
+    if(!(CH(pos) == _T('<')  &&  CH(pos+1) == _T('!')  &&  CH(pos+2) == _T('-')  &&  CH(pos+3) == _T('-')))
+        return -1;
+    pos += 4;
+
+    /* Neither ">" nor "->" may directly follow the opening. */
+    if(pos < lines[0].end  &&  CH(pos) == _T('>'))
+        return -1;
+    if(pos+1 < lines[0].end  &&  CH(pos) == _T('-')  &&  CH(pos+1) == _T('>'))
+        return -1;
+
+    for(;;) {
+        for(; pos + 2 < lines[line_index].end; pos++) {
+            if(CH(pos) != _T('-')  ||  CH(pos+1) != _T('-'))
+                continue;
+
+            /* We are at "--". It is prohibited inside the comment, so it
+             * either is part of the closing "-->" or we fail. */
+            if(CH(pos+2) != _T('>'))
+                return -1;
+
+            *p_end = pos + 3;
+            return 0;
+        }
+
+        /* Continue on the next line, if there is any. */
+        line_index++;
+        if(line_index >= n_lines)
+            break;
+        pos = lines[line_index].beg;
+    }
+
+    return -1;
+}
+
+/* Check whether a processing instruction ("<?" ... "?>") starts at offset
+ * beg.
+ *
+ * The instruction may continue over the lines lines[0] ... lines[n_lines-1].
+ * Returns 0 and stores the offset just past the closing "?>" into *p_end
+ * on success; returns -1 otherwise.
+ */
+static int
+md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    OFF pos = beg;
+    int line_index = 0;
+
+    /* The opening "<?" and at least one more character have to fit on
+     * the first line. */
+    if(pos + 2 >= lines[0].end)
+        return -1;
+    if(!(CH(pos) == _T('<')  &&  CH(pos+1) == _T('?')))
+        return -1;
+    pos += 2;
+
+    for(;;) {
+        /* Look for the closing digraph on the current line. */
+        for(; pos + 1 < lines[line_index].end; pos++) {
+            if(CH(pos) == _T('?')  &&  CH(pos+1) == _T('>')) {
+                *p_end = pos + 2;
+                return 0;
+            }
+        }
+
+        /* Continue on the next line, if there is any. */
+        line_index++;
+        if(line_index >= n_lines)
+            break;
+        pos = lines[line_index].beg;
+    }
+
+    return -1;
+}
+
+/* Check whether an HTML declaration ("<!" name ... ">") starts at offset
+ * beg.
+ *
+ * The name has to live on the first line and it has to be followed by
+ * a whitespace or by the end of the line. The rest of the declaration may
+ * continue over the lines lines[0] ... lines[n_lines-1].
+ * Returns 0 and stores the offset just past the closing '>' into *p_end
+ * on success; returns -1 otherwise.
+ */
+static int
+md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    OFF off = beg;
+    int i = 0;
+
+    if(off + 2 >= lines[0].end)
+        return -1;
+    if(CH(off) != _T('<')  ||  CH(off+1) != _T('!'))
+        return -1;
+    off += 2;
+
+    /* Declaration name. */
+    if(off >= lines[0].end  ||  !ISALPHA(off))
+        return -1;
+    off++;
+    while(off < lines[0].end  &&  ISALPHA(off))
+        off++;
+    if(off < lines[0].end  &&  !ISWHITESPACE(off))
+        return -1;
+
+    while(1) {
+        while(off < lines[i].end) {
+            /* Inspect the current character, never the one behind it:
+             * that could reach beyond the end of the line and it would
+             * miss '>' standing at the very beginning of a line. */
+            if(CH(off) == _T('>')) {
+                /* Success. */
+                *p_end = off + 1;
+                return 0;
+            }
+
+            off++;
+        }
+
+        i++;
+        if(i >= n_lines)
+            break;
+
+        off = lines[i].beg;
+    }
+
+    return -1;
+}
+
+/* Check whether a CDATA section ("<![CDATA[" ... "]]>") starts at offset
+ * beg.
+ *
+ * The section may continue over the lines lines[0] ... lines[n_lines-1].
+ * Returns 0 and stores the offset just past the closing "]]>" into *p_end
+ * on success; returns -1 otherwise.
+ */
+static int
+md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    static const CHAR open_str[9] = _T("<![CDATA[");
+
+    OFF pos = beg;
+    int line_index = 0;
+
+    /* The whole opening string and at least one more character have to
+     * fit on the first line. */
+    if(pos + SIZEOF_ARRAY(open_str) >= lines[0].end)
+        return -1;
+    if(memcmp(STR(pos), open_str, sizeof(open_str)) != 0)
+        return -1;
+    pos += SIZEOF_ARRAY(open_str);
+
+    for(;;) {
+        /* Look for the closing "]]>" on the current line. */
+        for(; pos + 2 < lines[line_index].end; pos++) {
+            if(CH(pos) != _T(']'))
+                continue;
+            if(CH(pos+1) == _T(']')  &&  CH(pos+2) == _T('>')) {
+                *p_end = pos + 3;
+                return 0;
+            }
+        }
+
+        /* Continue on the next line, if there is any. */
+        line_index++;
+        if(line_index >= n_lines)
+            break;
+        pos = lines[line_index].beg;
+    }
+
+    return -1;
+}
+
+/* Check whether any kind of raw HTML construct starts at offset beg.
+ *
+ * The recognizers are tried in this order: tag, comment, processing
+ * instruction, declaration, CDATA section. The first one which succeeds
+ * wins and it is the one which sets *p_end.
+ * Returns 0 on success; returns -1 if none of them matches.
+ */
+static int
+md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end)
+{
+    int is_html =
+            md_is_html_tag(ctx, lines, n_lines, beg, p_end) == 0  ||
+            md_is_html_comment(ctx, lines, n_lines, beg, p_end) == 0  ||
+            md_is_html_processing_instruction(ctx, lines, n_lines, beg, p_end) == 0  ||
+            md_is_html_declaration(ctx, lines, n_lines, beg, p_end) == 0  ||
+            md_is_html_cdata(ctx, lines, n_lines, beg, p_end) == 0;
+
+    return is_html ? 0 : -1;
+}
+
+
 /******************************************************
  ***  Processing Sequence of Inlines (a.k.a Spans)  ***
  ******************************************************/
@@ -309,6 +603,8 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
  *  '`': Maybe code span start/end.
  *  '&': Maybe start of entity.
  *  ';': Maybe end of entity.
+ *  '<': Maybe start of raw HTML.
+ *  '>': Maybe end of raw HTML.
  *
  * Note that not all instances of these chars in the text imply creation of the
  * structure. Only those which have (or may have, after we see more context)
@@ -456,6 +752,15 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                 }
             }
 
+            /* A potential raw HTML start/end. */
+            if(ch == _T('<') || ch == _T('>')) {
+                if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS)) {
+                    PUSH(ch, off, off+1, (ch == _T('<') ? MD_MARK_OPENER : MD_MARK_CLOSER));
+                    off++;
+                    continue;
+                }
+            }
+
             off++;
         }
     }
@@ -495,7 +800,7 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers)
             /* Make the opener point to us as its closer. */
             op->next = mark_index;
 
-            /* Cancel any escapes inside the code span. */
+            /* Cancel any already resolved marks in the code span. */
             if(mark_index - opener > 1)
                 memset(ctx->marks + opener + 1, 0, sizeof(MD_MARK) * (mark_index - opener - 1));
 
@@ -521,6 +826,49 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers)
     }
 }
 
+/* Analyze whether the mark '<' at mark_index starts a raw HTML span.
+ * If so, resolve it together with the mark '>' which closes the span,
+ * and cancel all the marks in between.
+ *
+ * If no raw HTML starts at the mark, or if no mark '>' ends exactly where
+ * the recognized raw HTML ends, no mark is modified.
+ */
+static void
+md_analyze_raw_html(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines)
+{
+    MD_MARK* opener = &ctx->marks[mark_index];
+    MD_MARK* closer;
+    OFF end;
+    int closer_index;
+    int i = 0;
+    int k;
+
+    /* Identify the line where the mark lives. Never walk beyond the last
+     * line. */
+    while(i < n_lines  &&  opener->beg >= lines[i].end)
+        i++;
+    if(i >= n_lines)
+        return;
+
+    /* Return if we are not really raw HTML. */
+    if(md_is_html_any(ctx, lines + i, n_lines - i, opener->beg, &end) < 0)
+        return;
+
+    /* Find the mark of the closing '>'. It has to exist and it has to end
+     * exactly where the raw HTML ends, or something is severely broken.
+     * Verify that before we touch any mark, so that we never access
+     * ctx->marks[] out of its bounds and never resolve a wrong mark. */
+    closer_index = mark_index + 1;
+    while(closer_index < ctx->n_marks  &&  ctx->marks[closer_index].end < end)
+        closer_index++;
+    if(closer_index >= ctx->n_marks)
+        return;
+    closer = &ctx->marks[closer_index];
+    if(closer->ch != _T('>')  ||  closer->end != end)
+        return;
+
+    /* Cancel any already resolved marks in the range up to the closer. */
+    for(k = mark_index + 1; k < closer_index; k++) {
+        ctx->marks[k].ch = _T('\0');
+        ctx->marks[k].flags = 0;
+    }
+
+    opener->flags |= MD_MARK_RESOLVED;
+    opener->next = closer_index;
+    closer->flags |= MD_MARK_RESOLVED;
+
+    /* Make these marker zero width so the '<' and '>' are part of its
+     * contents. */
+    opener->end = opener->beg;
+    closer->beg = closer->end;
+}
+
 /* Analyze whether the mark '&' starts a HTML entity.
  * If so, update its flags as well as flags of corresponding closer ';'. */
 static void
@@ -588,11 +936,12 @@ md_analyze_entity(MD_CTX* ctx, int mark_index)
 /* Table of precedence of various span types. */
 static const CHAR* md_precedence_table[] = {
     _T("`"),        /* Code spans. */
+    _T("<"),        /* Raw HTML. */
     _T("&")         /* Entities. */
 };
 
 static void
-md_analyze_marks(MD_CTX* ctx, int precedence_level)
+md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_level)
 {
     const CHAR* mark_chars = md_precedence_table[precedence_level];
     /* Chain of potential/unresolved code span openers. */
@@ -623,6 +972,10 @@ md_analyze_marks(MD_CTX* ctx, int precedence_level)
                 md_analyze_backtick(ctx, i, &code_span_unresolved_openers);
                 break;
 
+            case _T('<'):
+                md_analyze_raw_html(ctx, i, lines, n_lines);
+                break;
+
             case _T('&'):
                 md_analyze_entity(ctx, i);
                 break;
@@ -639,7 +992,7 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
     int i;
 
     for(i = 0; i < SIZEOF_ARRAY(md_precedence_table); i++)
-        md_analyze_marks(ctx, i);
+        md_analyze_marks(ctx, lines, n_lines, i);
 }
 
 /* Render the output, accordingly to the analyzed ctx->marks. */
@@ -697,6 +1050,13 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                     }
                     break;
 
+                case _T('<'):       /* Raw HTML. */
+                    text_type = MD_TEXT_HTML;
+                    break;
+                case _T('>'):
+                    text_type = MD_TEXT_NORMAL;
+                    break;
+
                 case _T('&'):       /* Entity. */
                     MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                     break;
@@ -1044,29 +1404,14 @@ md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
     }
 
     /* Check for type 7: any COMPLETE other opening or closing tag. */
-    // TODO: Rework this: This should be shared with some part of
-    // inline raw html (spec section 6.8).
     if(off + 1 < ctx->size) {
-        if(ISALPHA(off)  ||  (CH(off) == _T('/') && ISALPHA(off+1))) {
-            OFF tmp = off + 1;
-
-            /* Eat tag name. */
-            while(tmp < ctx->size  &&  (ISALNUM(tmp) || CH(tmp) == _T('-')))
-                tmp++;
-
-            /* If opening tag, eat any attributes. */
-            if(tmp < ctx->size  &&  CH(tmp) != _T('/')) {
-                // TODO
-            }
-
-            /* Eat any whitespace */
-            while(tmp < ctx->size  &&  ISWHITESPACE(tmp))
-                tmp++;
-
-            if(tmp < ctx->size  &&  CH(tmp) == _T('/'))
-                tmp++;
+        OFF end;
 
-            if(tmp < ctx->size  &&  CH(tmp) == _T('>'))
+        if(md_is_html_tag(ctx, NULL, 0, beg, &end) == 0) {
+            /* Only optional whitespace and new line may follow. */
+            while(end < ctx->size  &&  ISWHITESPACE(end))
+                end++;
+            if(end >= ctx->size  ||  ISNEWLINE(end))
                 return 7;
         }
     }
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -148,6 +148,8 @@ struct MD_BLOCK_CODE_DETAIL_tag {
 #define MD_FLAG_PERMISSIVEATXHEADERS    0x0001  /* Do not require space in ATX headers ( ###header ) */
 #define MD_FLAG_NOINDENTEDCODEBLOCKS    0x0002  /* Disable indented code blocks. (Only fenced code works.) */
 #define MD_FLAG_NOHTMLBLOCKS            0x0004  /* Disable raw HTML blocks. */
+#define MD_FLAG_NOHTMLSPANS             0x0008  /* Disable raw HTML (inline). */
+#define MD_FLAG_NOHTML                  (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
 
 /* Caller-provided callbacks.
  *

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log \| Files \| Refs \| README \| LICENSE

M	README.md	\|	4	++--
M	md4c/md4c.c	\|	393	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M	md4c/md4c.h	\|	2	++