md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit a284a382ea5026c456e3cf52834c525adf619fbc
parent 479a117937dc869b2ef3cc685698b53c1f875945
Author: Martin Mitas <mity@morous.org>
Date:   Thu,  6 Oct 2016 23:50:56 +0200

Implement code spans.

Diffstat:
MREADME.md | 2+-
Mmd2html/md2html.c | 12++++++++++++
Mmd4c/md4c.c | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Mmd4c/md4c.h | 2+-
4 files changed, 214 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md @@ -99,7 +99,7 @@ more or less forms our to do list. - **Inlines:** - [x] 6.1 Backslash escapes - [ ] 6.2 Entity and numeric character references - - [ ] 6.3 Code spans + - [x] 6.3 Code spans - [ ] 6.4 Emphasis and strong emphasis - [ ] 6.5 Links - [ ] 6.6 Images diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -181,12 +181,24 @@ leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) static int enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) { + struct membuffer* out = (struct membuffer*) userdata; + + switch(type) { + case MD_SPAN_CODE: MEMBUF_APPEND_LITERAL(out, "<code>"); break; + } + return 0; } static int leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) { + struct membuffer* out = (struct membuffer*) userdata; + + switch(type) { + case MD_SPAN_CODE: MEMBUF_APPEND_LITERAL(out, "</code>"); break; + } + return 0; } diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -305,8 +305,8 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) /* Structure marking an offset which needs special attention. The type * of the attention is determined by the member ch: * - * '\\': Escape sequence. - * (beg points to '\\'; beg+1 to the escaped char.) + * '\\': Maybe escape sequence. + * '`': Maybe code span start/end. * * Note that not all instances of these chars in the text imply creation of the * structure. Only those which have (or may have, after we see more context) @@ -315,14 +315,20 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) struct MD_MARK_tag { OFF beg; OFF end; + + /* Index of another mark. Before resolving the member may be used for + * arbitrary purpose during the analyzes phase. + * For resolved openers, it has to point to the corresponding closer. */ + int next; + MD_CHAR ch; unsigned short flags; }; /* Mark flags. 
*/ -#define MD_MARK_RESOLVED 0x0001 -#define MD_MARK_OPENER 0x0002 -#define MD_MARK_CLOSER 0x0004 +#define MD_MARK_RESOLVED 0x0001 /* Yes, the special meaning is indeed recognized. */ +#define MD_MARK_OPENER 0x0002 /* This opens (or potentially may open) a span. */ +#define MD_MARK_CLOSER 0x0004 /* This closes (or potentially may close) a span. */ static MD_MARK* @@ -374,9 +380,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) int ret = 0; MD_MARK* mark; - /* Reset the previously collected stack of marks. */ - ctx->n_marks = 0; - for(i = 0; i < n_lines; i++) { const MD_LINE* line = &lines[i]; OFF off = line->beg; @@ -384,14 +387,53 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) while(off < end) { CHAR ch = CH(off); - /* Analyze backslash escapes. - * Note it can go beyond line->end as it may involve - * escaped new line to form a hard break. */ + /* A backslash escape. + * It can go beyond line->end as it may involve escaped new + * line to form a hard break. */ if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { /* Hard-break cannot be on the last line of the block. */ if(!ISNEWLINE(off+1) || i+1 < n_lines) PUSH(ch, off, off+2, MD_MARK_RESOLVED); - off += 2; + + /* If '`' follows, we need both marks as the backslash may be + * inside a code span. */ + if(CH(off+1) == _T('`')) + off++; + else + off += 2; + continue; + } + + /* Turn non-trivial whitespace into single space. */ + if(ISWHITESPACE_(ch)) { + OFF tmp = off+1; + + while(tmp < end && ISWHITESPACE(tmp)) + tmp++; + + if(tmp - end > 1 || ch != _T(' ')) { + PUSH(ch, off, tmp, MD_MARK_RESOLVED); + off = tmp; + continue; + } + } + + /* A potential code span start/end. */ + if(ch == _T('`')) { + unsigned flags; + OFF tmp = off+1; + + /* It may be opener only if it is not escaped. 
*/ + if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].beg == off-1 && CH(off-1) == _T('\\')) + flags = MD_MARK_CLOSER; + else + flags = MD_MARK_OPENER | MD_MARK_CLOSER; + + while(tmp < end && CH(tmp) == _T('`')) + tmp++; + PUSH(ch, off, tmp, flags); + + off = tmp; continue; } @@ -409,23 +451,115 @@ abort: return ret; } + +/* Table of precedence of various span types. */ +static const CHAR* md_precedence_table[] = { + _T("`"), /* Code spans. */ + _T("\\") /* Backslash escapes. */ +}; + + +static void +md_analyze_backtick(MD_CTX* ctx, int mark_index, int* p_unresolved_openers) +{ + MD_MARK* mark = &ctx->marks[mark_index]; + int opener = *p_unresolved_openers; + + /* Try to find unresolved opener of the same length. If we find it, + * we form a code span. */ + while(opener >= 0) { + MD_MARK* op = &ctx->marks[opener]; + + if(op->end - op->beg == mark->end - mark->beg) { + /* Resolve the span. */ + op->flags = MD_MARK_OPENER | MD_MARK_RESOLVED; + mark->flags = MD_MARK_CLOSER | MD_MARK_RESOLVED; + + /* Shorten the list of unresolved openers. */ + *p_unresolved_openers = op->next; + + /* Make the opener point to us as its closer. */ + op->next = mark_index; + + /* Cancel any escapes inside the code span. */ + if(mark_index - opener > 1) + memset(ctx->marks + opener + 1, 0, sizeof(MD_MARK) * (mark_index - opener - 1)); + + /* Append any space or new line inside the span into the mark itself + * to swallow it. */ + while(CH(op->end) == _T(' ') || ISNEWLINE(op->end)) + op->end++; + while(CH(mark->beg-1) == _T(' ') || ISNEWLINE(mark->beg-1)) + mark->beg--; + + /* Done. */ + return; + } + + opener = ctx->marks[opener].next; + } + + /* We didn't find any matching opener, remember it as a potential opener. 
*/ + if(mark->flags & MD_MARK_OPENER) { + mark->next = *p_unresolved_openers; + *p_unresolved_openers = mark_index; + } +} + +static void +md_analyze_marks(MD_CTX* ctx, int precedence_level) +{ + const CHAR* mark_chars = md_precedence_table[precedence_level]; + /* Chain of potential/unresolved code span openers. */ + int code_span_unresolved_openers = -1; + int i = 0; + + while(i < ctx->n_marks) { + MD_MARK* mark = &ctx->marks[i]; + + /* Skip resolved spans. */ + if(mark->flags & MD_MARK_RESOLVED) { + if(mark->flags & MD_MARK_OPENER) + i = mark->next + 1; + else + i++; + continue; + } + + /* Skip marks we do not want to deal with. */ + if(!ISANYOF_(mark->ch, mark_chars)) { + i++; + continue; + } + + /* Analyze the mark. */ + switch(mark->ch) { + case _T('`'): + md_analyze_backtick(ctx, i, &code_span_unresolved_openers); + break; + } + + i++; + } +} + /* Analyze marks (build ctx->marks). */ -static int +static void md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { - int ret = 0; - - MD_CHECK(md_collect_marks(ctx, lines, n_lines)); + int i; -abort: - return ret; + for(i = 0; i < SIZEOF_ARRAY(md_precedence_table); i++) + md_analyze_marks(ctx, i); } /* Render the output, accordingly to the analyzed ctx->marks. */ static int md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { + MD_TEXTTYPE text_type; const MD_LINE* line = lines; + const MD_MARK* prev_mark = NULL; const MD_MARK* mark; OFF off = lines[0].beg; OFF end = lines[n_lines-1].end; @@ -440,28 +574,45 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) while(!(mark->flags & MD_MARK_RESOLVED)) mark++; + text_type = MD_TEXT_NORMAL; + while(1) { /* Process the text up to the next mark or end-of-line. */ OFF tmp = (line->end < mark->beg ? line->end : mark->beg); if(tmp > off) { - MD_TEXT(MD_TEXT_NORMAL, STR(off), tmp - off); + MD_TEXT(text_type, STR(off), tmp - off); off = tmp; } /* If reached the mark, process it and move to next one. 
*/ if(off >= mark->beg) { switch(mark->ch) { - case _T('\\'): /* Backslash escape. */ - if(ISNEWLINE(mark->beg+1)) - enforce_hardbreak = 1; - else - MD_TEXT(MD_TEXT_NORMAL, STR(mark->beg+1), 1); - break; + case _T('\\'): /* Backslash escape. */ + if(ISNEWLINE(mark->beg+1)) + enforce_hardbreak = 1; + else + MD_TEXT(text_type, STR(mark->beg+1), 1); + break; + + case _T(' '): /* Non-trivial space. */ + MD_TEXT(text_type, _T(" "), 1); + break; + + case _T('`'): /* Code span. */ + if(mark->flags & MD_MARK_OPENER) { + MD_ENTER_SPAN(MD_SPAN_CODE, NULL); + text_type = MD_TEXT_CODE; + } else { + MD_LEAVE_SPAN(MD_SPAN_CODE, NULL); + text_type = MD_TEXT_NORMAL; + } + break; } off = mark->end; /* Move to next resolved mark. */ + prev_mark = mark; mark++; while(!(mark->flags & MD_MARK_RESOLVED)) mark++; @@ -475,12 +626,23 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) if(off >= end) break; - /* Output soft or hard line break. */ - if(enforce_hardbreak || (CH(line->end) == _T(' ') && CH(line->end+1) == _T(' '))) - break_type = MD_TEXT_BR; - else - break_type = MD_TEXT_SOFTBR; - MD_TEXT(break_type, _T("\n"), 1); + if(text_type == MD_TEXT_CODE) { + /* Inside code spans, new lines are transformed into single + * spaces. */ + MD_ASSERT(prev_mark != NULL); + MD_ASSERT(prev_mark->ch == _T('`') && (prev_mark->flags & MD_MARK_OPENER)); + MD_ASSERT(mark->ch == _T('`') && (mark->flags & MD_MARK_CLOSER)); + + if(prev_mark->end < off && off < mark->beg) + MD_TEXT(MD_SPAN_CODE, _T(" "), 1); + } else { + /* Output soft or hard line break. */ + if(enforce_hardbreak || (CH(line->end) == _T(' ') && CH(line->end+1) == _T(' '))) + break_type = MD_TEXT_BR; + else + break_type = MD_TEXT_SOFTBR; + MD_TEXT(break_type, _T("\n"), 1); + } /* Switch to the following line. 
*/ line++; @@ -504,7 +666,13 @@ md_process_normal_block(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { int ret; - MD_CHECK(md_analyze_inlines(ctx, lines, n_lines)); + /* Reset the previously collected stack of marks. */ + ctx->n_marks = 0; + + MD_CHECK(md_collect_marks(ctx, lines, n_lines)); + + md_analyze_inlines(ctx, lines, n_lines); + MD_CHECK(md_process_inlines(ctx, lines, n_lines)); abort: @@ -616,8 +784,6 @@ md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) { OFF off = beg; - MD_ASSERT(CH(beg) == _T('`') || CH(beg) == _T('~')); - while(off < ctx->size && CH(off) == CH(beg)) off++; @@ -728,8 +894,6 @@ md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) OFF off = beg + 1; int i; - MD_ASSERT(CH(beg) == _T('<')); - /* Check for type 1: <script, <pre, or <style */ for(i = 0; t1[i].name != NULL; i++) { if(off + t1[i].len < ctx->size) { diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -89,7 +89,7 @@ enum MD_BLOCKTYPE_tag { * like paragraph or list item. */ typedef enum MD_SPANTYPE_tag MD_SPANTYPE; enum MD_SPANTYPE_tag { - MD_SPAN_DUMMY = 0 /* not yet used... */ + MD_SPAN_CODE }; /* Text is the actual textual contents of span. */