md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 1ba03589c0962849a8a45926f53263867404ef6a
parent 7c9f5982c66dd737b651f9361dac8aaca443efd5
Author: Martin Mitas <mity@morous.org>
Date:   Mon,  7 Nov 2016 20:50:11 +0100

md_collect_marks: Optimize the function.

Use character map for a fast path and minimize count of branches for
specially handled characters.

When profiling md2html on a larger documents with output redirected to
/dev/null to mitigate I/O, this function was quite a bottleneck.
It consummed about 33% of CPU cycles on a longer document input, with
this patch applied it drops down to 12%.

Diffstat:
Mmd4c/md4c.c | 99++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 73 insertions(+), 26 deletions(-)

diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -103,6 +103,8 @@ struct MD_CTX_tag { unsigned n_marks; unsigned alloc_marks; + char mark_char_map[128]; + /* For resolving of inline spans. */ MD_MARKCHAIN mark_chains[4]; #define BACKTICK_OPENERS ctx->mark_chains[0] @@ -1045,6 +1047,37 @@ md_split_mark(MD_CTX* ctx, int mark_index, SZ n) return mark_index + 1; } +static void +md_build_mark_char_map(MD_CTX* ctx) +{ + memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map)); + + ctx->mark_char_map['\\'] = 1; + ctx->mark_char_map['*'] = 1; + ctx->mark_char_map['_'] = 1; + ctx->mark_char_map['`'] = 1; + ctx->mark_char_map['&'] = 1; + ctx->mark_char_map[';'] = 1; + ctx->mark_char_map['<'] = 1; + ctx->mark_char_map['>'] = 1; + ctx->mark_char_map['\0'] = 1; + + if(ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) + ctx->mark_char_map[':'] = 1; + + if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) + ctx->mark_char_map['@'] = 1; + + if(ctx->r.flags & MD_FLAG_COLLAPSEWHITESPACE) { + int i; + + for(i = 0; i < sizeof(ctx->mark_char_map); i++) { + if(ISWHITESPACE_(i)) + ctx->mark_char_map[i] = 1; + } + } +} + static int md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { @@ -1059,6 +1092,13 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) while(off < line_end) { CHAR ch = CH(off); + + /* Optimization: Fast path. */ + if(ch >= sizeof(ctx->mark_char_map) || !ctx->mark_char_map[(int) ch]) { + off++; + continue; + } + /* A backslash escape. * It can go beyond line->end as it may involve escaped new * line to form a hard break. */ @@ -1076,20 +1116,6 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) continue; } - /* Turn non-trivial whitespace into single space. */ - if((ctx->r.flags & MD_FLAG_COLLAPSEWHITESPACE) && ISWHITESPACE_(ch)) { - OFF tmp = off+1; - - while(tmp < line_end && ISWHITESPACE(tmp)) - tmp++; - - if(tmp - off > 1 || ch != _T(' ')) { - PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED); - off = tmp; - continue; - } - } - /* A potential (string) emphasis start/end. */ if(ch == _T('*') || ch == _T('_')) { OFF tmp = off+1; @@ -1143,6 +1169,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } off = tmp; + continue; } /* A potential code span start/end. */ @@ -1174,24 +1201,24 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) /* A potential entity end. */ if(ch == _T(';')) { /* We surely cannot be entity unless the previous mark is '&'. */ - if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&')) { + if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&')) PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); - off++; - continue; - } + + off++; + continue; } /* A potential autolink or raw HTML start/end. */ if(ch == _T('<') || ch == _T('>')) { - if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS)) { + if(!(ctx->r.flags & MD_FLAG_NOHTMLSPANS)) PUSH_MARK(ch, off, off+1, (ch == _T('<') ? MD_MARK_POTENTIAL_OPENER : MD_MARK_POTENTIAL_CLOSER)); - off++; - continue; - } + + off++; + continue; } /* A potential permissive URL autolink. */ - if((ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) && ch == _T(':')) { + if(ch == _T(':')) { static struct { const CHAR* scheme; SZ scheme_size; @@ -1223,19 +1250,37 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) continue; } } + + off++; + continue; } /* A potential permissive e-mail autolink. */ - if((ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) && ch == _T('@')) { + if(ch == _T('@')) { if(line->beg + 1 <= off && ISALNUM(off-1) && off + 3 < line->end && ISALNUM(off+1)) { PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); /* Push a dummy as a reserve for a closer. */ PUSH_MARK('D', off, off, 0); - off++; - continue; } + + off++; + continue; + } + + /* Turn non-trivial whitespace into single space. */ + if(ISWHITESPACE_(ch)) { + OFF tmp = off+1; + + while(tmp < line_end && ISWHITESPACE(tmp)) + tmp++; + + if(tmp - off > 1 || ch != _T(' ')) + PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED); + + off = tmp; + continue; } /* NULL character. */ @@ -2589,6 +2634,8 @@ md_process_doc(MD_CTX *ctx) OFF off = 0; int ret = 0; + md_build_mark_char_map(ctx); + MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL); while(off < ctx->size) {