md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit f3f9404e5346e3e0c1d5c0baa8855af28e5eca15
parent 8818ff14d3b301a42c5483f666ee007e99500eee
Author: Martin Mitas <mity@morous.org>
Date:   Fri, 14 Jul 2017 02:06:23 +0200

Improve URL autolinks extension.

It is now much more compatible with Cmark-gfm.

With the flag MD_FLAG_PERMISSIVEWWWAUTOLINKS, we now also support the
WWW autolinks (when the http: scheme is omitted).

Diffstat:
Mmd2html/md2html.c | 7++++++-
Mmd4c/md4c.c | 121+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mmd4c/md4c.h | 6++++--
Mscripts/run-tests.sh | 1+
Mtest/permissive-url-autolinks.txt | 30+++++++++++-------------------
Atest/permissive-www-autolinks.txt | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 199 insertions(+), 58 deletions(-)

diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -198,6 +198,7 @@ static const option cmdline_options[] = { { "fverbatim-entities", 0, 'E', OPTION_ARG_NONE }, { "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE }, { "fpermissive-url-autolinks", 0, 'U', OPTION_ARG_NONE }, + { "fpermissive-www-autolinks", 0, '.', OPTION_ARG_NONE }, { "fpermissive-email-autolinks", 0, '@', OPTION_ARG_NONE }, { "fpermissive-autolinks", 0, 'V', OPTION_ARG_NONE }, { "fno-indented-code", 0, 'I', OPTION_ARG_NONE }, @@ -238,10 +239,13 @@ usage(void) " Allow ATX headers without delimiting space\n" " --fpermissive-url-autolinks\n" " Allow URL autolinks without '<', '>'\n" + " --fpermissive-www-autolinks\n" + " Allow WWW autolinks without any scheme (e.g. 'www.example.com')\n" " --fpermissive-email-autolinks \n" " Allow e-mail autolinks without '<', '>' and 'mailto:'\n" " --fpermissive-autolinks\n" - " Same as --fpermissive-url-autolinks --fpermissive-email-autolinks\n" + " Same as --fpermissive-url-autolinks --fpermissive-www-autolinks\n" + " --fpermissive-email-autolinks\n" " --fno-indented-code\n" " Disable indented code blocks\n" " --fno-html-blocks\n" @@ -293,6 +297,7 @@ cmdline_callback(int opt, char const* value, void* data) case 'H': parser_flags |= MD_FLAG_NOHTML; break; case 'W': parser_flags |= MD_FLAG_COLLAPSEWHITESPACE; break; case 'U': parser_flags |= MD_FLAG_PERMISSIVEURLAUTOLINKS; break; + case '.': parser_flags |= MD_FLAG_PERMISSIVEWWWAUTOLINKS; break; case '@': parser_flags |= MD_FLAG_PERMISSIVEEMAILAUTOLINKS; break; case 'V': parser_flags |= MD_FLAG_PERMISSIVEAUTOLINKS; break; case 'T': parser_flags |= MD_FLAG_TABLES; break; diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -2154,8 +2154,9 @@ md_free_link_ref_defs(MD_CTX* ctx) * '[': Maybe start of link label or link text. * '!': Equivalent of '[' for image. * ']': Maybe end of link label or link text. - * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS). 
* '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS). + * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS). + * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS). * 'D': Dummy mark, it reserves a space for splitting a previous mark * (e.g. emphasis) or to make more space for storing some special data * related to the preceding mark (e.g. link). @@ -2414,11 +2415,14 @@ md_build_mark_char_map(MD_CTX* ctx) if(ctx->r.flags & MD_FLAG_STRIKETHROUGH) ctx->mark_char_map['~'] = 1; + if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) + ctx->mark_char_map['@'] = 1; + if(ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) ctx->mark_char_map[':'] = 1; - if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) - ctx->mark_char_map['@'] = 1; + if(ctx->r.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS) + ctx->mark_char_map['.'] = 1; if(ctx->r.flags & MD_FLAG_TABLES) ctx->mark_char_map['|'] = 1; @@ -2613,6 +2617,20 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) continue; } + /* A potential permissive e-mail autolink. */ + if(ch == _T('@')) { + if(line->beg + 1 <= off && ISALNUM(off-1) && + off + 3 < line->end && ISALNUM(off+1)) + { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + } + + off++; + continue; + } + /* A potential permissive URL autolink. */ if(ch == _T(':')) { static struct { @@ -2624,8 +2642,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) /* In the order from the most frequently used, arguably. 
*/ { _T("http"), 4, _T("//"), 2 }, { _T("https"), 5, _T("//"), 2 }, - { _T("mailto"), 6, NULL, 0 }, - { _T("ftp"), 3, _T("//"), 2 }, + { _T("ftp"), 3, _T("//"), 2 } }; int scheme_index; @@ -2636,7 +2653,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) const SZ suffix_size = scheme_map[scheme_index].suffix_size; if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) && - (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1)) && + (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) && off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size)) { PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER); @@ -2651,14 +2668,17 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) continue; } - /* A potential permissive e-mail autolink. */ - if(ch == _T('@')) { - if(line->beg + 1 <= off && ISALNUM(off-1) && - off + 3 < line->end && ISALNUM(off+1)) + /* A potential permissive WWW autolink. */ + if(ch == _T('.')) { + if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) && + (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) && + off + 1 < line_end) { - PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER); /* Push a dummy as a reserve for a closer. 
*/ PUSH_MARK('D', off, off, 0); + off++; + continue; } off++; @@ -3285,38 +3305,63 @@ static void md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index) { MD_MARK* opener = &ctx->marks[mark_index]; - int closer_index; - MD_MARK* closer; + int closer_index = mark_index + 1; + MD_MARK* closer = &ctx->marks[mark_index + 1]; + MD_MARK* next_resolved_mark; OFF off = opener->end; + int seen_dot = FALSE; + int seen_underscore_or_hyphen[2] = { FALSE, FALSE }; - if(off < ctx->size && ISALNUM(off)) - off++; - else + /* Check for domain. */ + while(off < ctx->size) { + if(ISALNUM(off)) { + off++; + } else if(CH(off) == _T('.')) { + seen_dot = TRUE; + seen_underscore_or_hyphen[0] = seen_underscore_or_hyphen[1]; + seen_underscore_or_hyphen[1] = FALSE; + off++; + } else if(ISANYOF2(off, _T('-'), _T('_'))) { + seen_underscore_or_hyphen[1] = TRUE; + off++; + } else { + break; + } + } + + if(off <= opener->end || !seen_dot || seen_underscore_or_hyphen[0] || seen_underscore_or_hyphen[1]) return; - while(1) { - while(off < ctx->size && (ISALNUM(off) || CH(off) == _T('/'))) - off++; + /* Check for path. */ + next_resolved_mark = closer + 1; + while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED)) + next_resolved_mark++; + while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) + off++; - /* We need to be relatively careful to not include too much into the URL. - * Consider e.g. a dot or question mark: - * "Go to http://example.com." versus "http://example.com.uk" - * "Do you know http://zombo.com?" versus "http://example.com/?page=2" - * Therefore we include some named punctuation characters only if they - * are immediately followed by alnum char. - */ - if(off + 1 < ctx->size && ISANYOF(off, _T("@.?=&%+-_#")) && ISALNUM(off+1)) - off += 2; - else - break; + /* Path validation. 
*/ + if(ISANYOF(off-1, _T("?!.,:*_~)"))) { + if(CH(off-1) != _T(')')) { + off--; + } else { + int parenthesis_balance = 0; + OFF tmp; + + for(tmp = opener->end; tmp < off; tmp++) { + if(CH(tmp) == _T('(')) + parenthesis_balance++; + else if(CH(tmp) == _T(')')) + parenthesis_balance--; + } + + if(parenthesis_balance < 0) + off--; + } } /* Ok. Lets call it auto-link. Adapt opener and create closer to zero * length so all the contents becomes the link text. */ - closer_index = mark_index + 1; - closer = &ctx->marks[closer_index]; MD_ASSERT(closer->ch == 'D'); - opener->end = opener->beg; closer->ch = opener->ch; closer->beg = off; @@ -3436,6 +3481,7 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF en case '*': md_analyze_asterisk(ctx, i); break; case '_': md_analyze_underscore(ctx, i); break; case '~': md_analyze_tilde(ctx, i); break; + case '.': /* Pass through. */ case ':': md_analyze_permissive_url_autolink(ctx, i); break; case '@': md_analyze_permissive_email_autolink(ctx, i); break; } @@ -3494,7 +3540,7 @@ abort: static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF end) { - md_analyze_marks(ctx, lines, n_lines, beg, end, _T("*_~@:")); + md_analyze_marks(ctx, lines, n_lines, beg, end, _T("*_~@:.")); ASTERISK_OPENERS.head = -1; ASTERISK_OPENERS.tail = -1; UNDERSCORE_OPENERS.head = -1; @@ -3645,16 +3691,19 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) case '@': /* Permissive e-mail autolink. */ case ':': /* Permissive URL autolink. */ + case '.': /* Permissive WWW autolink. */ { const MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? 
mark : &ctx->marks[mark->prev]); const MD_MARK* closer = &ctx->marks[opener->next]; const CHAR* dest = STR(opener->end); SZ dest_size = closer->beg - opener->end; - if(opener->ch == '@') { + if(opener->ch == '@' || opener->ch == '.') { dest_size += 7; MD_TEMP_BUFFER(dest_size * sizeof(CHAR)); - memcpy(ctx->buffer, _T("mailto:"), 7 * sizeof(CHAR)); + memcpy(ctx->buffer, + (opener->ch == '@' ? _T("mailto:") : _T("http://")), + 7 * sizeof(CHAR)); memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR)); dest = ctx->buffer; } diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -262,13 +262,15 @@ typedef struct MD_SPAN_IMG_DETAIL { #define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */ #define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x0004 /* Recognize URLs as autolinks even without '<', '>' */ #define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x0008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */ -#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS) #define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0010 /* Disable indented code blocks. (Only fenced code works.) */ #define MD_FLAG_NOHTMLBLOCKS 0x0020 /* Disable raw HTML blocks. */ #define MD_FLAG_NOHTMLSPANS 0x0040 /* Disable raw HTML (inline). */ -#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) #define MD_FLAG_TABLES 0x0100 /* Enable tables extension. */ #define MD_FLAG_STRIKETHROUGH 0x0200 /* Enable strikethrough extension. */ +#define MD_FLAG_PERMISSIVEWWWAUTOLINKS 0x0400 /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */ + +#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS) +#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) /* Convenient sets of flags corresponding to well-known Markdown dialects. * Note we may only support subset of features of the referred dialect. 
diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh @@ -34,5 +34,6 @@ $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/coverage.txt" -p "$PROGRAM" # Test various extensions and deviations from the specifications: $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-email-autolinks.txt" -p "$PROGRAM --fpermissive-email-autolinks" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-url-autolinks.txt" -p "$PROGRAM --fpermissive-url-autolinks" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-www-autolinks.txt" -p "$PROGRAM --fpermissive-www-autolinks" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/tables.txt" -p "$PROGRAM --ftables" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/strikethrough.txt" -p "$PROGRAM --fstrikethrough" diff --git a/test/permissive-url-autolinks.txt b/test/permissive-url-autolinks.txt @@ -22,8 +22,7 @@ Homepage: https://github.com/mity/md4c ```````````````````````````````` But this permissive autolink feature can work only for very widely used URL -schemes, in alphabetical order `ftp:`, `http:`, `https:` and `mailto:`, -so this is not an permissive autolink. +schemes, in alphabetical order `ftp:`, `http:`, `https:`. That's why this is not a permissive autolink: @@ -33,12 +32,8 @@ ssh://root@example.com <p>ssh://root@example.com</p> ```````````````````````````````` -As certain characters (`.`, `?`, `=`, `&`, `%`, `+`, `-`, `_`, `#`, `@`) can collide -with a normal text flow, they are not recognized as part of the URL unless another -alphanumeric character immediately follows. - -Therefore the question mark in this question is not part of the autolink's -contents: +The same rules for path validation as for permissive WWW autolinks apply. +Therefore the final question mark here is not part of the autolink: ```````````````````````````````` example Have you ever visited http://www.zombo.com? 
@@ -54,19 +49,16 @@ http://www.bing.com/search?q=md4c <p><a href="http://www.bing.com/search?q=md4c">http://www.bing.com/search?q=md4c</a></p> ```````````````````````````````` -Please note that e-mail addresses without the `mailto:` scheme are not permissive URL -autolinks, but MD4C implements another extension `MD_FLAG_PERMISSIVEEMAILAUTOLINKS` -which can be used to enable that too. +And finally one complex example: ```````````````````````````````` example -mailto:john.doe@example.com -. -<p><a href="mailto:john.doe@example.com">mailto:john.doe@example.com</a></p> -```````````````````````````````` +http://commonmark.org -```````````````````````````````` example -john.doe@example.com +(Visit https://encrypted.google.com/search?q=Markup+(business)) + +Anonymous FTP is available at ftp://foo.bar.baz. . -<p>john.doe@example.com</p> +<p><a href="http://commonmark.org">http://commonmark.org</a></p> +<p>(Visit <a href="https://encrypted.google.com/search?q=Markup+(business)">https://encrypted.google.com/search?q=Markup+(business)</a>)</p> +<p>Anonymous FTP is available at <a href="ftp://foo.bar.baz">ftp://foo.bar.baz</a>.</p> ```````````````````````````````` - diff --git a/test/permissive-www-autolinks.txt b/test/permissive-www-autolinks.txt @@ -0,0 +1,92 @@ + +# Permissive WWW Autolinks + +With the flag `MD_FLAG_PERMISSIVEWWWAUTOLINKS`, MD4C enables recognition of +autolinks starting with `www.`, even if they do not exactly follow the syntax +of autolink as specified in CommonMark specification. + +These do not have to be enclosed in `<` and `>`, and they even do not need +any preceding scheme specification. + +The WWW autolink will be recognized when a valid domain is found. + +A valid domain consists of the text `www.`, followed by alphanumeric characters, +underscores (`_`), hyphens (`-`) and periods (`.`). There must be at least one +period, and no underscores may be present in the last two segments of the domain. 
+ +The scheme `http` will be inserted automatically: + +```````````````````````````````` example +www.commonmark.org +. +<p><a href="http://www.commonmark.org">www.commonmark.org</a></p> +```````````````````````````````` + +After a valid domain, zero or more non-space non-`<` characters may follow: + +```````````````````````````````` example +Visit www.commonmark.org/help for more information. +. +<p>Visit <a href="http://www.commonmark.org/help">www.commonmark.org/help</a> for more information.</p> +```````````````````````````````` + +We then apply extended autolink path validation as follows: + +Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`) +will not be considered part of the autolink, though they may be included in the +interior of the link: + +```````````````````````````````` example +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. +. +<p>Visit <a href="http://www.commonmark.org">www.commonmark.org</a>.</p> +<p>Visit <a href="http://www.commonmark.org/a.b">www.commonmark.org/a.b</a>.</p> +```````````````````````````````` + +When an autolink ends in `)`, we scan the entire autolink for the total number +of parentheses. If there is a greater number of closing parentheses than +opening ones, we don't consider the last character part of the autolink, in +order to facilitate including an autolink inside a parenthesis: + +```````````````````````````````` example +www.google.com/search?q=Markup+(business) + +(www.google.com/search?q=Markup+(business)) +. 
+<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p> +<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p> +```````````````````````````````` + +This check is only done when the link ends in a closing parenthesis `)`, so if +the only parentheses are in the interior of the autolink, no special rules are +applied: + +```````````````````````````````` example +www.google.com/search?q=(business))+ok +. +<p><a href="http://www.google.com/search?q=(business))+ok">www.google.com/search?q=(business))+ok</a></p> +```````````````````````````````` + +If an autolink ends in a semicolon (`;`), we check to see if it appears to +resemble an [entity reference][entity references]; if the preceding text is `&` +followed by one or more alphanumeric characters. If so, it is excluded from +the autolink: + +```````````````````````````````` example +www.google.com/search?q=commonmark&hl=en + +www.google.com/search?q=commonmark&hl; +. +<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p> +<p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&amp;hl;</p> +```````````````````````````````` + +`<` immediately ends an autolink. + +```````````````````````````````` example +www.commonmark.org/he<lp +. +<p><a href="http://www.commonmark.org/he">www.commonmark.org/he</a>&lt;lp</p> +````````````````````````````````