md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit f2821cbd8ed1cfbfd55de102cfa1249d4172e7dd
parent a83db2b70212178fa3542531b3bc393d22aa6c08
Author: Martin Mitas <mity@morous.org>
Date:   Fri, 14 Jul 2017 17:10:45 +0200

md_analyze_permissive_email_autolink: Make it compatible with CMark-gfm.

Diffstat:
M md4c/md4c.c | 49 ++++++++++++++++---------------------------------
M test/permissive-email-autolinks.txt | 34 +++++++++++++++++++++++++++++++---
2 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -3381,45 +3381,28 @@ md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
     MD_MARK* closer;
     OFF beg = opener->beg;
     OFF end = opener->end;
-    int right_dot_count = 0;
+    int dot_count = 0;
 
     MD_ASSERT(CH(beg) == _T('@'));
 
-    /* Accept any alphanumeric sequences delimited with dot before the '@'.
-     * There must be a whitespace or start of line before it. */
-    while(1) {
-        while(beg > 0 && ISALNUM(beg-1))
-            beg--;
+    /* Scan for name before '@'. */
+    while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
+        beg--;
 
-        if(beg > 1 && CH(beg-1) == _T('.') && ISALNUM(beg-2))
-            beg -= 2;
-        else if(beg == 0 || ISWHITESPACE(beg-1) || ISNEWLINE(beg-1))
-            break;
-        else
-            return;
+    /* Scan for domain after '@'. */
+    while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
+        if(CH(end) == _T('.'))
+            dot_count++;
+        end++;
     }
-
-    /* Accept any alphanumeric sequences delimited with dot after the '@',
-     * limiting the sequences length by 64 characters. */
-    while(1) {
-        OFF label_start = end;
-        while(end + 1 < ctx->size && ISALNUM(end))
-            end++;
-        if(end - label_start > 63)
-            return;
-
-        if(end + 1 < ctx->size && CH(end) == _T('.') && ISALNUM(end+1)) {
-            right_dot_count++;
-            end += 2;
-        } else if(right_dot_count > 0) {
-            /* Although "user@machine" is technically correct e-mail address,
-             * we request at least one dot, as in e.g. "user@machine.com" to
-             * prevent some false positives with this very loose format. */
-            break;
-        } else {
-            return;
-        }
+    if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
+        dot_count--;
+        end--;
     }
+    else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
+        return;
+    if(CH(end-1) == _T('@') || dot_count == 0)
+        return;
 
     /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
      * length so all the contents becomes the link text. */
diff --git a/test/permissive-email-autolinks.txt b/test/permissive-email-autolinks.txt
@@ -1,9 +1,10 @@
 
 # Permissive E-mail Autolinks
 
-With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C enables more permissive recognition
-of e-mail addresses and transforms them to autolinks, even if they do not exactly follow
-the syntax of autolink as specified in CommonMark specification.
+With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C enables more permissive
+recognition of e-mail addresses and transforms them to autolinks, even if they
+do not exactly follow the syntax of autolink as specified in CommonMark
+specification.
 
 This is standard CommonMark e-mail autolink:
 
@@ -20,3 +21,30 @@ E-mail: john.doe@gmail.com
 .
 <p>E-mail: <a href="mailto:john.doe@gmail.com">john.doe@gmail.com</a></p>
 ````````````````````````````````
+
+`+` can occur before the `@`, but not after.
+
+```````````````````````````````` example
+hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
+.
+<p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
+````````````````````````````````
+
+`.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at
+the end of the email address, in which case it will not be considered part of
+the address:
+
+```````````````````````````````` example
+a.b-c_d@a.b
+
+a.b-c_d@a.b.
+
+a.b-c_d@a.b-
+
+a.b-c_d@a.b_
+.
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
+<p>a.b-c_d@a.b-</p>
+<p>a.b-c_d@a.b_</p>
+````````````````````````````````