md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 4f65b45bd6e1a979beefd43cd124e08b83a258ed
parent 87b41e1aba93c29f0ea378fbd923a3ebf42ce974
Author: Martin Mitas <mity@morous.org>
Date:   Sat,  8 Oct 2016 20:04:38 +0200

Implement raw HTML spans.

Diffstat:
MREADME.md | 4++--
Mmd4c/md4c.c | 393++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mmd4c/md4c.h | 2++
3 files changed, 373 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md @@ -104,7 +104,7 @@ more or less forms our to do list. - [ ] 6.5 Links - [ ] 6.6 Images - [ ] 6.7 Autolinks - - [ ] 6.8 Raw HTML + - [x] 6.8 Raw HTML - [x] 6.9 Hard line breaks - [x] 6.10 Soft line breaks - [x] 6.11 Textual content @@ -142,7 +142,7 @@ consideration. - [x] Permissive ATX headers: `###Header` (without space) - [ ] Permissive autolinks: `http://google.com` (without `<`...`>`) - [x] Disabling indented code blocks - - [ ] Disabling raw HTML blocks/spans + - [x] Disabling raw HTML blocks/spans ## License diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -298,6 +298,300 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) } while(0) +/****************************** + *** Recognizing raw HTML *** + ******************************/ + +/* md_is_html_tag() may be called when processing inlines (inline raw HTML) + * or when breaking document to blocks (checking for start of HTML block type 7). + * + * When breaking document to blocks, we do not yet know line boundaries, but + * in that case th whole tag has to live on a single line. We distinguish this + * by n_lines == 0. + */ +static int +md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + int attr_state; + OFF off = beg; + OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size; + int i = 0; + + if(off + 1 >= line_end) + return -1; + if(CH(off) != _T('<')) + return -1; + off++; + + /* For parsing attributes, we need a little state automaton below. + * State -1: no attributes are allowed. + * State 0: attribute could follow after some whitespace. + * State 1: after a whitespace (attribute name may follow). + * State 2: after attribute name ('=' MAY follow). + * State 3: after '=' (value specification MUST follow). + * State 41: in middle of unquoted attribute value. + * State 42: in middle of single-quoted attribute value. + * State 43: in middle of double-quoted attribute value. 
+ */ + attr_state = 0; + + if(CH(off) == _T('/')) { + /* Closer tag "</ ... >". No attributes may be present. */ + attr_state = -1; + off++; + } + + /* Tag name */ + if(off >= line_end || !ISALPHA(off)) + return -1; + off++; + while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-")))) + off++; + + /* (Optional) attributes (if not closer), (optional) '/' (if not closer) + * and final '>'. */ + while(1) { + while(off < line_end) { + if(attr_state > 40) { + if(attr_state == 41 && ISANYOF(off, _T("\"'=<>`"))) { + attr_state = 0; + off--; /* Put the char back for re-inspection in the new state. */ + } else if(attr_state == 42 && CH(off) == _T('\'')) { + attr_state = 0; + } else if(attr_state == 43 && CH(off) == _T('"')) { + attr_state = 0; + } + off++; + } else if(ISWHITESPACE(off)) { + if(attr_state == 0) + attr_state = 1; + off++; + } else if(attr_state <= 2 && CH(off) == _T('>')) { + /* End. */ + *p_end = off+1; + return 0; + } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) { + /* End with digraph '/>' */ + *p_end = off+2; + return 0; + } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) { + off++; + /* Attribute name */ + while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-")))) + off++; + attr_state = 2; + } else if(attr_state == 2 && CH(off) == _T('=')) { + /* Attribute assignment sign */ + off++; + attr_state = 3; + } else if(attr_state == 3) { + /* Expecting start of attribute value. */ + if(CH(off) == _T('"')) + attr_state = 43; + else if(CH(off) == _T('\'')) + attr_state = 42; + else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off)) + attr_state = 41; + else + return -1; + off++; + } else { + /* Anything unexpected. */ + return -1; + } + } + + /* We have to be on a single line. See definition of start condition + * of HTML block, type 7. 
*/ + if(n_lines == 0) + break; + + i++; + if(i >= n_lines) + break; + + off = lines[i].beg; + line_end = lines[i].end; + + if(attr_state == 0) + attr_state = 1; + } + + return -1; +} + +static int +md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + OFF off = beg; + int i = 0; + + if(off + 4 >= lines[0].end) + return -1; + if(CH(off) != _T('<') || CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-')) + return -1; + off += 4; + + /* ">" and "->" must follow the opening. */ + if(off < lines[0].end && CH(off) == _T('>')) + return -1; + if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>')) + return -1; + + while(1) { + while(off + 2 < lines[i].end) { + if(CH(off) == _T('-') && CH(off+1) == _T('-')) { + if(CH(off+2) == _T('>')) { + /* Success. */ + *p_end = off + 3; + return 0; + } else { + /* "--" is prohibited inside the comment. */ + return -1; + } + } + + off++; + } + + i++; + if(i >= n_lines) + break; + + off = lines[i].beg; + } + + return -1; +} + +static int +md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + OFF off = beg; + int i = 0; + + if(off + 2 >= lines[0].end) + return -1; + if(CH(off) != _T('<') || CH(off+1) != _T('?')) + return -1; + off += 2; + + while(1) { + while(off + 1 < lines[i].end) { + if(CH(off) == _T('?') && CH(off+1) == _T('>')) { + /* Success. */ + *p_end = off + 2; + return 0; + } + + off++; + } + + i++; + if(i >= n_lines) + break; + + off = lines[i].beg; + } + + return -1; +} + +static int +md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + OFF off = beg; + int i = 0; + + if(off + 2 >= lines[0].end) + return -1; + if(CH(off) != _T('<') || CH(off+1) != _T('!')) + return -1; + off += 2; + + /* Declaration name. 
*/ + if(off >= lines[0].end || !ISALPHA(off)) + return -1; + off++; + while(off < lines[0].end && ISALPHA(off)) + off++; + if(off < lines[0].end && !ISWHITESPACE(off)) + return -1; + + while(1) { + while(off < lines[i].end) { + if(CH(off+1) == _T('>')) { + /* Success. */ + *p_end = off + 2; + return 0; + } + + off++; + } + + i++; + if(i >= n_lines) + break; + + off = lines[i].beg; + } + + return -1; +} + +static int +md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + static const CHAR open_str[9] = _T("<![CDATA["); + + OFF off = beg; + int i = 0; + + if(off + SIZEOF_ARRAY(open_str) >= lines[0].end) + return -1; + if(memcmp(STR(off), open_str, sizeof(open_str)) != 0) + return -1; + off += SIZEOF_ARRAY(open_str); + + while(1) { + while(off + 2 < lines[i].end) { + if(CH(off) == _T(']') && CH(off+1) == _T(']') && CH(off+2) == _T('>')) { + /* Success. */ + *p_end = off + 3; + return 0; + } + + off++; + } + + i++; + if(i >= n_lines) + break; + + off = lines[i].beg; + } + + return -1; +} + +static int +md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end) +{ + if(md_is_html_tag(ctx, lines, n_lines, beg, p_end) == 0) + return 0; + if(md_is_html_comment(ctx, lines, n_lines, beg, p_end) == 0) + return 0; + if(md_is_html_processing_instruction(ctx, lines, n_lines, beg, p_end) == 0) + return 0; + if(md_is_html_declaration(ctx, lines, n_lines, beg, p_end) == 0) + return 0; + if(md_is_html_cdata(ctx, lines, n_lines, beg, p_end) == 0) + return 0; + + return -1; +} + + /****************************************************** *** Processing Sequence of Inlines (a.k.a Spans) *** ******************************************************/ @@ -309,6 +603,8 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) * '`': Maybe code span start/end. * '&': Maybe start of entity. * ';': Maybe end of entity. + * '<': Maybe start of raw HTML. + * '>': Maybe end of raw HTML. 
* * Note that not all instances of these chars in the text imply creation of the * structure. Only those which have (or may have, after we see more context) @@ -456,6 +752,15 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } } + /* A potential raw HTML start/end. */ + if(ch == _T('<') || ch == _T('>')) { + if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS)) { + PUSH(ch, off, off+1, (ch == _T('<') ? MD_MARK_OPENER : MD_MARK_CLOSER)); + off++; + continue; + } + } + off++; } } @@ -495,7 +800,7 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers) /* Make the opener point to us as its closer. */ op->next = mark_index; - /* Cancel any escapes inside the code span. */ + /* Cancel any already resolved marks in the code span. */ if(mark_index - opener > 1) memset(ctx->marks + opener + 1, 0, sizeof(MD_MARK) * (mark_index - opener - 1)); @@ -521,6 +826,49 @@ md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers) } } +static void +md_analyze_raw_html(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines) +{ + MD_MARK* opener = &ctx->marks[mark_index]; + MD_MARK* closer; + OFF end; + int i = 0; + + /* Identify the line where the mark lives. */ + while(1) { + if(opener->beg < lines[i].end) + break; + i++; + } + + /* Return if we are not really raw HTML. */ + if(md_is_html_any(ctx, lines + i, n_lines - i, opener->beg, &end) < 0) + return; + + /* Cancel any already resolved marks in the range up to the closer. + * We have to find there the close '>' or something is severly broken. 
*/ + mark_index++; + while(mark_index < ctx->n_marks && ctx->marks[mark_index].end < end) { + ctx->marks[mark_index].ch = _T('\0'); + ctx->marks[mark_index].flags = 0; + mark_index++; + } + closer = &ctx->marks[mark_index]; +/* + MD_ASSERT(closer->end == end); + MD_ASSERT(closer->ch == _T('>')); +*/ + + opener->flags |= MD_MARK_RESOLVED; + opener->next = mark_index; + closer->flags |= MD_MARK_RESOLVED; + + /* Make these marker zero width so the '<' and '>' are part of its + * contents. */ + opener->end = opener->beg; + closer->beg = closer->end; +} + /* Analyze whether the mark '&' starts a HTML entity. * If so, update its flags as well as flags of corresponding closer ';'. */ static void @@ -588,11 +936,12 @@ md_analyze_entity(MD_CTX* ctx, int mark_index) /* Table of precedence of various span types. */ static const CHAR* md_precedence_table[] = { _T("`"), /* Code spans. */ + _T("<"), /* Raw HTML. */ _T("&") /* Entities. */ }; static void -md_analyze_marks(MD_CTX* ctx, int precedence_level) +md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_level) { const CHAR* mark_chars = md_precedence_table[precedence_level]; /* Chain of potential/unresolved code span openers. */ @@ -623,6 +972,10 @@ md_analyze_marks(MD_CTX* ctx, int precedence_level) md_analyze_backtick(ctx, i, &code_span_unresolved_openers); break; + case _T('<'): + md_analyze_raw_html(ctx, i, lines, n_lines); + break; + case _T('&'): md_analyze_entity(ctx, i); break; @@ -639,7 +992,7 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) int i; for(i = 0; i < SIZEOF_ARRAY(md_precedence_table); i++) - md_analyze_marks(ctx, i); + md_analyze_marks(ctx, lines, n_lines, i); } /* Render the output, accordingly to the analyzed ctx->marks. */ @@ -697,6 +1050,13 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; + case _T('<'): /* Raw HTML. 
*/ + text_type = MD_TEXT_HTML; + break; + case _T('>'): + text_type = MD_TEXT_NORMAL; + break; + case _T('&'): /* Entity. */ MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg); break; @@ -1044,29 +1404,14 @@ md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) } /* Check for type 7: any COMPLETE other opening or closing tag. */ - // TODO: Rework this: This should be shared with some part of - // inline raw html (spec section 6.8). if(off + 1 < ctx->size) { - if(ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1))) { - OFF tmp = off + 1; - - /* Eat tag name. */ - while(tmp < ctx->size && (ISALNUM(tmp) || CH(tmp) == _T('-'))) - tmp++; - - /* If opening tag, eat any attributes. */ - if(tmp < ctx->size && CH(tmp) != _T('/')) { - // TODO - } - - /* Eat any whitespace */ - while(tmp < ctx->size && ISWHITESPACE(tmp)) - tmp++; - - if(tmp < ctx->size && CH(tmp) == _T('/')) - tmp++; + OFF end; - if(tmp < ctx->size && CH(tmp) == _T('>')) + if(md_is_html_tag(ctx, NULL, 0, beg, &end) == 0) { + /* Only optional whitespace and new line may follow. */ + while(end < ctx->size && ISWHITESPACE(end)) + end++; + if(end >= ctx->size || ISNEWLINE(end)) return 7; } } diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -148,6 +148,8 @@ struct MD_BLOCK_CODE_DETAIL_tag { #define MD_FLAG_PERMISSIVEATXHEADERS 0x0001 /* Do not require space in ATX headers ( ###header ) */ #define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0002 /* Disable indented code blocks. (Only fenced code works.) */ #define MD_FLAG_NOHTMLBLOCKS 0x0004 /* Disable raw HTML blocks. */ +#define MD_FLAG_NOHTMLSPANS 0x0008 /* Disable raw HTML (inline). */ +#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) /* Caller-provided callbacks. *