commit f3f9404e5346e3e0c1d5c0baa8855af28e5eca15
parent 8818ff14d3b301a42c5483f666ee007e99500eee
Author: Martin Mitas <mity@morous.org>
Date: Fri, 14 Jul 2017 02:06:23 +0200
Improve URL autolinks extension.
It is now much more compatible with Cmark-gfm.
With the flag MD_FLAG_PERMISSIVEWWWAUTOLINKS, we now also support the
WWW autolinks (when the http: scheme is omitted).
Diffstat:
6 files changed, 199 insertions(+), 58 deletions(-)
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -198,6 +198,7 @@ static const option cmdline_options[] = {
{ "fverbatim-entities", 0, 'E', OPTION_ARG_NONE },
{ "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE },
{ "fpermissive-url-autolinks", 0, 'U', OPTION_ARG_NONE },
+ { "fpermissive-www-autolinks", 0, '.', OPTION_ARG_NONE },
{ "fpermissive-email-autolinks", 0, '@', OPTION_ARG_NONE },
{ "fpermissive-autolinks", 0, 'V', OPTION_ARG_NONE },
{ "fno-indented-code", 0, 'I', OPTION_ARG_NONE },
@@ -238,10 +239,13 @@ usage(void)
" Allow ATX headers without delimiting space\n"
" --fpermissive-url-autolinks\n"
" Allow URL autolinks without '<', '>'\n"
+ " --fpermissive-www-autolinks\n"
+ " Allow WWW autolinks without any scheme (e.g. 'www.example.com')\n"
" --fpermissive-email-autolinks \n"
" Allow e-mail autolinks without '<', '>' and 'mailto:'\n"
" --fpermissive-autolinks\n"
- " Same as --fpermissive-url-autolinks --fpermissive-email-autolinks\n"
+ " Same as --fpermissive-url-autolinks --fpermissive-www-autolinks\n"
+ " --fpermissive-email-autolinks\n"
" --fno-indented-code\n"
" Disable indented code blocks\n"
" --fno-html-blocks\n"
@@ -293,6 +297,7 @@ cmdline_callback(int opt, char const* value, void* data)
case 'H': parser_flags |= MD_FLAG_NOHTML; break;
case 'W': parser_flags |= MD_FLAG_COLLAPSEWHITESPACE; break;
case 'U': parser_flags |= MD_FLAG_PERMISSIVEURLAUTOLINKS; break;
+ case '.': parser_flags |= MD_FLAG_PERMISSIVEWWWAUTOLINKS; break;
case '@': parser_flags |= MD_FLAG_PERMISSIVEEMAILAUTOLINKS; break;
case 'V': parser_flags |= MD_FLAG_PERMISSIVEAUTOLINKS; break;
case 'T': parser_flags |= MD_FLAG_TABLES; break;
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -2154,8 +2154,9 @@ md_free_link_ref_defs(MD_CTX* ctx)
* '[': Maybe start of link label or link text.
* '!': Equivalent of '[' for image.
* ']': Maybe end of link label or link text.
- * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
* '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
+ * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
+ * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
* 'D': Dummy mark, it reserves a space for splitting a previous mark
* (e.g. emphasis) or to make more space for storing some special data
* related to the preceding mark (e.g. link).
@@ -2414,11 +2415,14 @@ md_build_mark_char_map(MD_CTX* ctx)
if(ctx->r.flags & MD_FLAG_STRIKETHROUGH)
ctx->mark_char_map['~'] = 1;
+ if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
+ ctx->mark_char_map['@'] = 1;
+
if(ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
ctx->mark_char_map[':'] = 1;
- if(ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
- ctx->mark_char_map['@'] = 1;
+ if(ctx->r.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
+ ctx->mark_char_map['.'] = 1;
if(ctx->r.flags & MD_FLAG_TABLES)
ctx->mark_char_map['|'] = 1;
@@ -2613,6 +2617,20 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
continue;
}
+ /* A potential permissive e-mail autolink. */
+ if(ch == _T('@')) {
+ if(line->beg + 1 <= off && ISALNUM(off-1) &&
+ off + 3 < line->end && ISALNUM(off+1))
+ {
+ PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
+ /* Push a dummy as a reserve for a closer. */
+ PUSH_MARK('D', off, off, 0);
+ }
+
+ off++;
+ continue;
+ }
+
/* A potential permissive URL autolink. */
if(ch == _T(':')) {
static struct {
@@ -2624,8 +2642,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
/* In the order from the most frequently used, arguably. */
{ _T("http"), 4, _T("//"), 2 },
{ _T("https"), 5, _T("//"), 2 },
- { _T("mailto"), 6, NULL, 0 },
- { _T("ftp"), 3, _T("//"), 2 },
+ { _T("ftp"), 3, _T("//"), 2 }
};
int scheme_index;
@@ -2636,7 +2653,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
const SZ suffix_size = scheme_map[scheme_index].suffix_size;
if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) &&
- (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1)) &&
+ (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size))
{
PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
@@ -2651,14 +2668,17 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
continue;
}
- /* A potential permissive e-mail autolink. */
- if(ch == _T('@')) {
- if(line->beg + 1 <= off && ISALNUM(off-1) &&
- off + 3 < line->end && ISALNUM(off+1))
+ /* A potential permissive WWW autolink. */
+ if(ch == _T('.')) {
+ if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) &&
+ (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
+ off + 1 < line_end)
{
- PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
+ PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
/* Push a dummy as a reserve for a closer. */
PUSH_MARK('D', off, off, 0);
+ off++;
+ continue;
}
off++;
@@ -3285,38 +3305,63 @@ static void
md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
{
MD_MARK* opener = &ctx->marks[mark_index];
- int closer_index;
- MD_MARK* closer;
+ int closer_index = mark_index + 1;
+ MD_MARK* closer = &ctx->marks[mark_index + 1];
+ MD_MARK* next_resolved_mark;
OFF off = opener->end;
+ int seen_dot = FALSE;
+ int seen_underscore_or_hyphen[2] = { FALSE, FALSE };
- if(off < ctx->size && ISALNUM(off))
- off++;
- else
+ /* Check for domain. */
+ while(off < ctx->size) {
+ if(ISALNUM(off)) {
+ off++;
+ } else if(CH(off) == _T('.')) {
+ seen_dot = TRUE;
+ seen_underscore_or_hyphen[0] = seen_underscore_or_hyphen[1];
+ seen_underscore_or_hyphen[1] = FALSE;
+ off++;
+ } else if(ISANYOF2(off, _T('-'), _T('_'))) {
+ seen_underscore_or_hyphen[1] = TRUE;
+ off++;
+ } else {
+ break;
+ }
+ }
+
+ if(off <= opener->end || !seen_dot || seen_underscore_or_hyphen[0] || seen_underscore_or_hyphen[1])
return;
- while(1) {
- while(off < ctx->size && (ISALNUM(off) || CH(off) == _T('/')))
- off++;
+ /* Check for path. */
+ next_resolved_mark = closer + 1;
+ while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
+ next_resolved_mark++;
+ while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off))
+ off++;
- /* We need to be relatively careful to not include too much into the URL.
- * Consider e.g. a dot or question mark:
- * "Go to http://example.com." versus "http://example.com.uk"
- * "Do you know http://zombo.com?" versus "http://example.com/?page=2"
- * Therefore we include some named punctuation characters only if they
- * are immediately followed by alnum char.
- */
- if(off + 1 < ctx->size && ISANYOF(off, _T("@.?=&%+-_#")) && ISALNUM(off+1))
- off += 2;
- else
- break;
+ /* Path validation. */
+ if(ISANYOF(off-1, _T("?!.,:*_~)"))) {
+ if(CH(off-1) != _T(')')) {
+ off--;
+ } else {
+ int parenthesis_balance = 0;
+ OFF tmp;
+
+ for(tmp = opener->end; tmp < off; tmp++) {
+ if(CH(tmp) == _T('('))
+ parenthesis_balance++;
+ else if(CH(tmp) == _T(')'))
+ parenthesis_balance--;
+ }
+
+ if(parenthesis_balance < 0)
+ off--;
+ }
}
/* Ok. Lets call it auto-link. Adapt opener and create closer to zero
* length so all the contents becomes the link text. */
- closer_index = mark_index + 1;
- closer = &ctx->marks[closer_index];
MD_ASSERT(closer->ch == 'D');
-
opener->end = opener->beg;
closer->ch = opener->ch;
closer->beg = off;
@@ -3436,6 +3481,7 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF en
case '*': md_analyze_asterisk(ctx, i); break;
case '_': md_analyze_underscore(ctx, i); break;
case '~': md_analyze_tilde(ctx, i); break;
+ case '.': /* Pass through. */
case ':': md_analyze_permissive_url_autolink(ctx, i); break;
case '@': md_analyze_permissive_email_autolink(ctx, i); break;
}
@@ -3494,7 +3540,7 @@ abort:
static void
md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF end)
{
- md_analyze_marks(ctx, lines, n_lines, beg, end, _T("*_~@:"));
+ md_analyze_marks(ctx, lines, n_lines, beg, end, _T("*_~@:."));
ASTERISK_OPENERS.head = -1;
ASTERISK_OPENERS.tail = -1;
UNDERSCORE_OPENERS.head = -1;
@@ -3645,16 +3691,19 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
case '@': /* Permissive e-mail autolink. */
case ':': /* Permissive URL autolink. */
+ case '.': /* Permissive WWW autolink. */
{
const MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
const MD_MARK* closer = &ctx->marks[opener->next];
const CHAR* dest = STR(opener->end);
SZ dest_size = closer->beg - opener->end;
- if(opener->ch == '@') {
+ if(opener->ch == '@' || opener->ch == '.') {
dest_size += 7;
MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
- memcpy(ctx->buffer, _T("mailto:"), 7 * sizeof(CHAR));
+ memcpy(ctx->buffer,
+ (opener->ch == '@' ? _T("mailto:") : _T("http://")),
+ 7 * sizeof(CHAR));
memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
dest = ctx->buffer;
}
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -262,13 +262,15 @@ typedef struct MD_SPAN_IMG_DETAIL {
#define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x0004 /* Recognize URLs as autolinks even without '<', '>' */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x0008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
-#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0010 /* Disable indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS 0x0020 /* Disable raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS 0x0040 /* Disable raw HTML (inline). */
-#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
#define MD_FLAG_TABLES 0x0100 /* Enable tables extension. */
#define MD_FLAG_STRIKETHROUGH 0x0200 /* Enable strikethrough extension. */
+#define MD_FLAG_PERMISSIVEWWWAUTOLINKS 0x0400 /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
+
+#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
+#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
/* Convenient sets of flags corresponding to well-known Markdown dialects.
* Note we may only support subset of features of the referred dialect.
diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh
@@ -34,5 +34,6 @@ $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/coverage.txt" -p "$PROGRAM"
# Test various extensions and deviations from the specifications:
$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-email-autolinks.txt" -p "$PROGRAM --fpermissive-email-autolinks"
$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-url-autolinks.txt" -p "$PROGRAM --fpermissive-url-autolinks"
+$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-www-autolinks.txt" -p "$PROGRAM --fpermissive-www-autolinks"
$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/tables.txt" -p "$PROGRAM --ftables"
$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/strikethrough.txt" -p "$PROGRAM --fstrikethrough"
diff --git a/test/permissive-url-autolinks.txt b/test/permissive-url-autolinks.txt
@@ -22,8 +22,7 @@ Homepage: https://github.com/mity/md4c
````````````````````````````````
But this permissive autolink feature can work only for very widely used URL
-schemes, in alphabetical order `ftp:`, `http:`, `https:` and `mailto:`,
-so this is not an permissive autolink.
+schemes, in alphabetical order `ftp:`, `http:`, `https:`.
That's why this is not a permissive autolink:
@@ -33,12 +32,8 @@ ssh://root@example.com
<p>ssh://root@example.com</p>
````````````````````````````````
-As certain characters (`.`, `?`, `=`, `&`, `%`, `+`, `-`, `_`, `#`, `@`) can collide
-with a normal text flow, they are not recognized as part of the URL unless another
-alphanumeric character immediately follows.
-
-Therefore the question mark in this question is not part of the autolink's
-contents:
+The same rules for path validation as for permissive WWW autolinks apply.
+Therefore the final question mark here is not part of the autolink:
```````````````````````````````` example
Have you ever visited http://www.zombo.com?
@@ -54,19 +49,16 @@ http://www.bing.com/search?q=md4c
<p><a href="http://www.bing.com/search?q=md4c">http://www.bing.com/search?q=md4c</a></p>
````````````````````````````````
-Please note that e-mail addresses without the `mailto:` scheme are not permissive URL
-autolinks, but MD4C implements another extension `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`
-which can be used to enable that too.
+And finally one complex example:
```````````````````````````````` example
-mailto:john.doe@example.com
-.
-<p><a href="mailto:john.doe@example.com">mailto:john.doe@example.com</a></p>
-````````````````````````````````
+http://commonmark.org
-```````````````````````````````` example
-john.doe@example.com
+(Visit https://encrypted.google.com/search?q=Markup+(business))
+
+Anonymous FTP is available at ftp://foo.bar.baz.
.
-<p>john.doe@example.com</p>
+<p><a href="http://commonmark.org">http://commonmark.org</a></p>
+<p>(Visit <a href="https://encrypted.google.com/search?q=Markup+(business)">https://encrypted.google.com/search?q=Markup+(business)</a>)</p>
+<p>Anonymous FTP is available at <a href="ftp://foo.bar.baz">ftp://foo.bar.baz</a>.</p>
````````````````````````````````
-
diff --git a/test/permissive-www-autolinks.txt b/test/permissive-www-autolinks.txt
@@ -0,0 +1,92 @@
+
+# Permissive WWW Autolinks
+
+With the flag `MD_FLAG_PERMISSIVEWWWAUTOLINKS`, MD4C enables recognition of
+autolinks starting with `www.`, even if they do not exactly follow the syntax
+of an autolink as specified in the CommonMark specification.
+
+These do not have to be enclosed in `<` and `>`, and they do not even need
+any preceding scheme specification.
+
+The WWW autolink will be recognized when a valid domain is found.
+
+A valid domain consists of the text `www.`, followed by alphanumeric characters,
+underscores (`_`), hyphens (`-`) and periods (`.`). There must be at least one
+period, and no underscores may be present in the last two segments of the domain.
+
+The scheme `http` will be inserted automatically:
+
+```````````````````````````````` example
+www.commonmark.org
+.
+<p><a href="http://www.commonmark.org">www.commonmark.org</a></p>
+````````````````````````````````
+
+After a valid domain, zero or more non-space non-`<` characters may follow:
+
+```````````````````````````````` example
+Visit www.commonmark.org/help for more information.
+.
+<p>Visit <a href="http://www.commonmark.org/help">www.commonmark.org/help</a> for more information.</p>
+````````````````````````````````
+
+We then apply extended autolink path validation as follows:
+
+Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`)
+will not be considered part of the autolink, though they may be included in the
+interior of the link:
+
+```````````````````````````````` example
+Visit www.commonmark.org.
+
+Visit www.commonmark.org/a.b.
+.
+<p>Visit <a href="http://www.commonmark.org">www.commonmark.org</a>.</p>
+<p>Visit <a href="http://www.commonmark.org/a.b">www.commonmark.org/a.b</a>.</p>
+````````````````````````````````
+
+When an autolink ends in `)`, we scan the entire autolink for the total number
+of parentheses. If there is a greater number of closing parentheses than
+opening ones, we don't consider the last character part of the autolink, in
+order to facilitate including an autolink inside a parenthesis:
+
+```````````````````````````````` example
+www.google.com/search?q=Markup+(business)
+
+(www.google.com/search?q=Markup+(business))
+.
+<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+````````````````````````````````
+
+This check is only done when the link ends in a closing parenthesis `)`, so if
+the only parentheses are in the interior of the autolink, no special rules are
+applied:
+
+```````````````````````````````` example
+www.google.com/search?q=(business))+ok
+.
+<p><a href="http://www.google.com/search?q=(business))+ok">www.google.com/search?q=(business))+ok</a></p>
+````````````````````````````````
+
+If an autolink ends in a semicolon (`;`), we check to see if it appears to
+resemble an [entity reference][entity references]; if the preceding text is `&`
+followed by one or more alphanumeric characters. If so, it is excluded from
+the autolink:
+
+```````````````````````````````` example
+www.google.com/search?q=commonmark&hl=en
+
+www.google.com/search?q=commonmark&hl;
+.
+<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&hl=en</a></p>
+<p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p>
+````````````````````````````````
+
+`<` immediately ends an autolink.
+
+```````````````````````````````` example
+www.commonmark.org/he<lp
+.
+<p><a href="http://www.commonmark.org/he">www.commonmark.org/he</a><lp</p>
+````````````````````````````````