md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit ef5f230ffaef6e655b46f3f842681d52251658de
parent 63a3141108fe0961add6671efc650c42fc64a10b
Author: Martin Mitas <mity@morous.org>
Date:   Fri, 14 Oct 2016 19:56:05 +0200

Implement permissive autolinks extensions.

With MD_FLAG_PERMISSIVEURLAUTOLINKS, we treat not overly complicated URLs
as autolinks even without '<' and '>'.

With MD_FLAG_PERMISSIVEEMAILAUTOLINKS, we treat not overly complicated
e-mail addresses as autolinks even without '<', '>' and without the
'mailto:' scheme.

Also expanded md2html utility and tests to cover these.

Diffstat:
MREADME.md | 4+++-
Mmd2html/md2html.c | 48++++++++++++++++++++++++++++++++++++------------
Mmd4c/md4c.c | 232+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mmd4c/md4c.h | 15+++++++++------
Mscripts/run-tests.sh | 9+++++++--
Atest/permissive-email-autolinks.txt | 22++++++++++++++++++++++
Atest/permissive-url-autolinks.txt | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 364 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md @@ -140,7 +140,9 @@ consideration. - **Miscellaneous:** - [x] Permissive ATX headers: `###Header` (without space) - - [ ] Permissive autolinks: `http://google.com` (without `<`...`>`) + - [x] Permissive URL autolinks: `http://google.com` (without `<`...`>`) + - [ ] Permissive e-mail autolinks: `john.dow@example.com` + (without `<`...`>` and `mailto:`) - [x] Disabling indented code blocks - [x] Disabling raw HTML blocks/spans diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -440,8 +440,13 @@ static const option cmdline_options[] = { { "help", 'h', 'h', OPTION_ARG_NONE }, { "fverbatim-entities", 0, 'E', OPTION_ARG_NONE }, { "fpermissive-atx-headers", 0, 'A', OPTION_ARG_NONE }, + { "fpermissive-url-autolinks", 0, 'U', OPTION_ARG_NONE }, + { "fpermissive-email-autolinks", 0, '@', OPTION_ARG_NONE }, + { "fpermissive-autolinks", 0, 'V', OPTION_ARG_NONE }, { "fno-indented-code", 0, 'I', OPTION_ARG_NONE }, - { "fno-html-blocks", 0, 'H', OPTION_ARG_NONE }, + { "fno-html-blocks", 0, 'F', OPTION_ARG_NONE }, + { "fno-html-spans", 0, 'G', OPTION_ARG_NONE }, + { "fno-html", 0, 'H', OPTION_ARG_NONE }, { "fcollapse-whitespace", 0, 'W', OPTION_ARG_NONE }, { 0 } }; @@ -454,17 +459,31 @@ usage(void) "Convert input FILE (or standard input) in Markdown format to HTML.\n" "\n" "General options:\n" - " -o --output=FILE output file (default is standard output)\n" - " -f, --full-html generate full HTML document, including header\n" - " -s, --stat measure time of input parsing\n" - " -h, --help display this help and exit\n" + " -o --output=FILE Output file (default is standard output)\n" + " -f, --full-html Generate full HTML document, including header\n" + " -s, --stat Measure time of input parsing\n" + " -h, --help Display this help and exit\n" "\n" - "Markdown dialect options:\n" - " --fcollapse-whitespace collapse non-trivial whitespace\n" - " --fverbatim-entities do not translate entities\n" - " --fpermissive-atx-headers allow ATX headers without 
delimiting space\n" - " --fno-indented-code disable indented code blocks\n" - " --fno-html-blocks disable raw HTML blocks\n" + "Markdown extension options:\n" + " --fcollapse-whitespace\n" + " Collapse non-trivial whitespace\n" + " --fverbatim-entities\n" + " Do not translate entities\n" + " --fpermissive-atx-headers\n" + " Allow ATX headers without delimiting space\n" + " --fpermissive-url-autolinks\n" + " Allow URL autolinks without '<', '>'\n" + " --fpermissive-email-autolinks \n" + " Allow e-mail autolinks without '<', '>' and 'mailto:'\n" + " --fpermissive-autolinks\n" + " Same as --fpermissive-url-autolinks --fpermissive-email-autolinks\n" + " --fno-indented-code\n" + " Disable indented code blocks\n" + " --fno-html-blocks\n" + " Disable raw HTML blocks\n" + " --fno-html-spans\n" + " Disable raw HTML spans\n" + " --fno-html Same as --fno-html-blocks --fno-html-spans\n" ); } @@ -492,8 +511,13 @@ cmdline_callback(int opt, char const* value, void* data) case 'E': want_verbatim_entities = 1; break; case 'A': renderer_flags |= MD_FLAG_PERMISSIVEATXHEADERS; break; case 'I': renderer_flags |= MD_FLAG_NOINDENTEDCODEBLOCKS; break; - case 'H': renderer_flags |= MD_FLAG_NOHTMLBLOCKS; break; + case 'F': renderer_flags |= MD_FLAG_NOHTMLBLOCKS; break; + case 'G': renderer_flags |= MD_FLAG_NOHTMLSPANS; break; + case 'H': renderer_flags |= MD_FLAG_NOHTML; break; case 'W': renderer_flags |= MD_FLAG_COLLAPSEWHITESPACE; break; + case 'U': renderer_flags |= MD_FLAG_PERMISSIVEURLAUTOLINKS; break; + case '@': renderer_flags |= MD_FLAG_PERMISSIVEEMAILAUTOLINKS; break; + case 'V': renderer_flags |= MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS; break; default: fprintf(stderr, "Illegal option: %s\n", value); diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -91,6 +91,10 @@ struct MD_CTX_tag { MD_RENDERER r; void* userdata; + /* Helper temporary growing buffer. */ + CHAR* buffer; + unsigned alloc_buffer; + /* Stack of inline/span markers. 
* This is only used for parsing a single block contents but by storing it * here we may reuse the stack for subsequent blocks; i.e. we have fewer @@ -253,6 +257,12 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) return 0; } +static inline int +md_str_eq(const CHAR* s1, const CHAR* s2, SZ n) +{ + return memcmp(s1, s2, n * sizeof(CHAR)); +} + static int md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size) { @@ -290,6 +300,26 @@ md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ goto abort; \ } while(0) + +#define MD_TEMP_BUFFER(sz) \ + do { \ + if(sz > ctx->alloc_buffer) { \ + CHAR* new_buffer; \ + SZ new_size = ((sz) + (sz) / 2 + 128) & ~127; \ + \ + new_buffer = realloc(ctx->buffer, new_size); \ + if(new_buffer == NULL) { \ + MD_LOG("realloc() failed."); \ + ret = -1; \ + goto abort; \ + } \ + \ + ctx->buffer = new_buffer; \ + ctx->alloc_buffer = new_size; \ + } \ + } while(0) + + #define MD_ENTER_BLOCK(type, arg) \ do { \ ret = ctx->r.enter_block((type), (arg), ctx->userdata); \ @@ -768,13 +798,15 @@ md_is_autolink(MD_CTX* ctx, OFF beg, OFF end) /* The mark structure. * * '\\': Maybe escape sequence. + * '\0': NULL char. * '*': Maybe (strong) emphasis start/end. * '`': Maybe code span start/end. * '&': Maybe start of entity. * ';': Maybe end of entity. * '<': Maybe start of raw HTML or autolink. * '>': Maybe end of raw HTML or autolink. - * '0': NULL char. + * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS) + * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS) * * Note that not all instances of these chars in the text imply creation of the * structure. Only those which have (or may have, after we see more context) @@ -1161,6 +1193,54 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } } + /* A potential permissive URL autolink. 
*/ + if((ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) && ch == _T(':')) { + static struct { + const CHAR* scheme; + SZ scheme_size; + const CHAR* suffix; + SZ suffix_size; + } scheme_map[] = { + /* In the order from the most frequently used, arguably. */ + { _T("http"), 4, _T("//"), 2 }, + { _T("https"), 5, _T("//"), 2 }, + { _T("mailto"), 6, NULL, 0 }, + { _T("ftp"), 3, _T("//"), 2 }, + }; + int scheme_index; + + for(scheme_index = 0; scheme_index < SIZEOF_ARRAY(scheme_map); scheme_index++) { + const CHAR* scheme = scheme_map[scheme_index].scheme; + const SZ scheme_size = scheme_map[scheme_index].scheme_size; + const CHAR* suffix = scheme_map[scheme_index].suffix; + const SZ suffix_size = scheme_map[scheme_index].suffix_size; + + if(line->beg + scheme_size <= off && md_str_eq(STR(off-scheme_size), scheme, scheme_size) == 0 && + (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1)) && + off + 1 + suffix_size < line->end && md_str_eq(STR(off+1), suffix, suffix_size) == 0) + { + PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + off += 1 + suffix_size; + continue; + } + } + } + + /* A potential permissive e-mail autolink. */ + if((ctx->r.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS) && ch == _T('@')) { + if(line->beg + 1 <= off && ISALNUM(off-1) && + off + 3 < line->end && ISALNUM(off+1)) + { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + /* Push a dummy as a reserve for a closer. */ + PUSH_MARK('D', off, off, 0); + off++; + continue; + } + } + /* NULL character. 
*/ if(ch == _T('\0')) { PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED); @@ -1412,10 +1492,107 @@ md_analyze_underscore(MD_CTX* ctx, int mark_index) md_analyze_simple_pairing_mark(ctx, &UNDERSCORE_OPENERS, mark_index, 1); } +static void +md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index) +{ + MD_MARK* opener = &ctx->marks[mark_index]; + int closer_index; + MD_MARK* closer; + OFF off = opener->end; + + if(off < ctx->size && ISALNUM(off)) + off++; + else + return; + + while(1) { + while(off < ctx->size && (ISALNUM(off) || CH(off) == _T('/'))) + off++; + + /* We need to be relatively careful to not include too much into the URL. + * Consider e.g. a dot or question mark: + * "Go to http://example.com." versus "http://example.com.uk" + * "Do you know http://zombo.com?" versus "http://example.com/?page=2" + * Therefore we include some named punctuation characters only if they + * are immediately followed by alnum char. + */ + if(off + 1 < ctx->size && ISANYOF(off, _T("@.?=&%+-_#")) && ISALNUM(off+1)) + off += 2; + else + break; + } + + /* Ok. Lets call it auto-link. Adapt opener and create closer to zero + * length so all the contents becomes the link text. */ + closer_index = md_split_mark(ctx, mark_index, 0); + closer = &ctx->marks[closer_index]; + + opener->end = opener->beg; + closer->beg = off; + closer->end = off; + md_resolve_range(ctx, NULL, mark_index, closer_index); +} + +static void +md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index) +{ + MD_MARK* opener = &ctx->marks[mark_index]; + int closer_index; + MD_MARK* closer; + OFF beg = opener->beg; + OFF end = opener->end; + int right_dot_count = 0; + + MD_ASSERT(CH(beg) == _T('@')); + + /* Accept any alphanumeric sequences delimited with dot before the '@'. + * There must be a whitespace or start of line before it. 
*/ + while(1) { + while(beg > 0 && ISALNUM(beg-1)) + beg--; + + if(beg > 1 && CH(beg-1) == _T('.') && ISALNUM(beg-2)) + beg -= 2; + else if(beg == 0 || ISWHITESPACE(beg-1) || ISNEWLINE(beg-1)) + break; + else + return; + } + + /* Accept any alphanumeric sequences delimited with dot after the '@'. */ + while(1) { + while(end + 1 < ctx->size && ISALNUM(end)) + end++; + + if(end + 1 < ctx->size && CH(end) == _T('.') && ISALNUM(end+1)) { + right_dot_count++; + end += 2; + } else if(right_dot_count > 0) { + /* Although "user@machine" is technically correct e-mail address, + * we request at least one dot, as in e.g. "user@machine.com" to + * prevent some false positives with this very loose format. */ + break; + } else { + return; + } + } + + /* Ok. Lets call it auto-link. Adapt opener and create closer to zero + * length so all the contents becomes the link text. */ + closer_index = md_split_mark(ctx, mark_index, 0); + closer = &ctx->marks[closer_index]; + + opener->beg = beg; + opener->end = beg; + closer->beg = end; + closer->end = end; + md_resolve_range(ctx, NULL, mark_index, closer_index); +} + /* Table of precedence of various span types. */ static const CHAR* md_precedence_table[] = { - _T("&`<>"), /* Code spans; autolinks; raw HTML. */ - _T("*_") /* Emphasis and string emphasis. */ + _T("&`<>"), /* Entities; code spans; autolinks; raw HTML. */ + _T("*_@:") /* Emphasis and string emphasis; permissive autolinks. */ }; static void @@ -1452,6 +1629,8 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_ case '&': md_analyze_entity(ctx, i); break; case '*': md_analyze_asterisk(ctx, i); break; case '_': md_analyze_underscore(ctx, i); break; + case ':': md_analyze_permissive_url_autolink(ctx, i); break; + case '@': md_analyze_permissive_email_autolink(ctx, i); break; } i++; @@ -1563,24 +1742,42 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; - case '<': /* Autolink or raw HTML. 
*/ - if(mark->flags & MD_MARK_AUTOLINK) { - det.a.href = STR(mark->end); - det.a.href_size = ctx->marks[mark->next].beg - mark->end; - MD_ENTER_SPAN(MD_SPAN_A, (void*) &det); - } else { - text_type = MD_TEXT_HTML; + case '<': + case '>': /* Autolink or raw HTML. */ + if(!(mark->flags & MD_MARK_AUTOLINK)) { + if(mark->flags & MD_MARK_OPENER) + text_type = MD_TEXT_HTML; + else + text_type = MD_TEXT_NORMAL; + break; } - break; + /* Pass through, if auto-link. */ - case '>': - if(mark->flags & MD_MARK_AUTOLINK) + case '@': /* Permissive e-mail autolink. */ + case ':': /* Permissive URL autolink. */ + if(mark->flags & MD_MARK_OPENER) { + if(mark->ch == '@') { + SZ sz = 7 + ctx->marks[mark->next].beg - mark->end; + + MD_TEMP_BUFFER(sz); + memcpy(ctx->buffer, _T("mailto:"), 7 * sizeof(CHAR)); + memcpy(ctx->buffer + 7, STR(mark->end), (sz-7) * sizeof(CHAR)); + + det.a.href_size = sz; + det.a.href = ctx->buffer; + } else { + det.a.href_size = ctx->marks[mark->next].beg - mark->end; + det.a.href = STR(mark->end); + } + det.a.title = NULL; + det.a.title_size = 0; + MD_ENTER_SPAN(MD_SPAN_A, (void*) &det); + } else { /* The detail already has to be initialized: There cannot - * be any resolved mark between the autlink opener and + * be any resolved mark between the autolink opener and * closer. */ MD_LEAVE_SPAN(MD_SPAN_A, (void*) &det); - else - text_type = MD_TEXT_NORMAL; + } break; case '&': /* Entity. */ @@ -1930,7 +2127,7 @@ md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) /* Check for type 5: <![CDATA[ */ if(off + 8 < ctx->size) { - if(memcmp(STR(off), _T("![CDATA["), 8 * sizeof(CHAR)) == 0) + if(md_str_eq(STR(off), _T("![CDATA["), 8 * sizeof(CHAR)) == 0) return 5; } } @@ -2500,6 +2697,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u /* Clean-up. 
*/ free(ctx.marks); + free(ctx.buffer); return ret; } diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -177,12 +177,15 @@ struct MD_BLOCK_CODE_DETAIL_tag { * By default (when MD_RENDERER::flags == 0), we follow CommonMark specification. * The following flags may allow some extensions or deviations from it. */ -#define MD_FLAG_COLLAPSEWHITESPACE 0x0001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */ -#define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */ -#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0004 /* Disable indented code blocks. (Only fenced code works.) */ -#define MD_FLAG_NOHTMLBLOCKS 0x0010 /* Disable raw HTML blocks. */ -#define MD_FLAG_NOHTMLSPANS 0x0020 /* Disable raw HTML (inline). */ -#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) +#define MD_FLAG_COLLAPSEWHITESPACE 0x0001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */ +#define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */ +#define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x0004 /* Recognize URLs as autolinks even without '<', '>' */ +#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x0008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */ +#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEWWWAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS) +#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0010 /* Disable indented code blocks. (Only fenced code works.) */ +#define MD_FLAG_NOHTMLBLOCKS 0x0020 /* Disable raw HTML blocks. */ +#define MD_FLAG_NOHTMLSPANS 0x0040 /* Disable raw HTML (inline). */ +#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) /* Caller-provided callbacks. * diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh @@ -2,7 +2,7 @@ # # Run this script from build directory. -set -e +#set -e SELF_DIR=`dirname $0` PROJECT_DIR="$SELF_DIR/.." 
@@ -24,4 +24,9 @@ elif which python 2>/dev/null; then fi fi -$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/spec.txt" -p "$PROGRAM" "$@" +# Test CommonMark specification compliance (with default options): +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/spec.txt" -p "$PROGRAM" + +# Test various extensions and deviations from the specifications: +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-email-autolinks.txt" -p "$PROGRAM --fpermissive-email-autolinks" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/permissive-url-autolinks.txt" -p "$PROGRAM --fpermissive-url-autolinks" diff --git a/test/permissive-email-autolinks.txt b/test/permissive-email-autolinks.txt @@ -0,0 +1,22 @@ + +# Permissive E-mail Autolinks + +With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C enables more permissive recognition +of e-mail addresses and transforms them to autolinks, even if they do not exactly follow +the syntax of autolink as specified in CommonMark specification. + +This is standard CommonMark e-mail autolink: + +```````````````````````````````` example +E-mail: <mailto:john.doe@gmail.com> +. +<p>E-mail: <a href="mailto:john.doe@gmail.com">mailto:john.doe@gmail.com</a></p> +```````````````````````````````` + +With the permissive autolinks enabled, this is sufficient: + +```````````````````````````````` example +E-mail: john.doe@gmail.com +. +<p>E-mail: <a href="mailto:john.doe@gmail.com">john.doe@gmail.com</a></p> +```````````````````````````````` diff --git a/test/permissive-url-autolinks.txt b/test/permissive-url-autolinks.txt @@ -0,0 +1,72 @@ + +# Permissive URL Autolinks + +With the flag `MD_FLAG_PERMISSIVEURLAUTOLINKS`, MD4C enables more permissive recognition +of URLs and transform them to autolinks, even if they do not exactly follow the syntax +of autolink as specified in CommonMark specification. + +This is standard CommonMark autolink: + +```````````````````````````````` example +Homepage: <https://github.com/mity/md4c> +. 
+<p>Homepage: <a href="https://github.com/mity/md4c">https://github.com/mity/md4c</a></p> +```````````````````````````````` + +With the permissive autolinks enabled, this is sufficient: + +```````````````````````````````` example +Homepage: https://github.com/mity/md4c +. +<p>Homepage: <a href="https://github.com/mity/md4c">https://github.com/mity/md4c</a></p> +```````````````````````````````` + +But this permissive autolink feature can work only for very widely used URL +schemes, in alphabetical order `ftp:`, `http:`, `https:` and `mailto:`, +so URLs with any other scheme are not recognized. + +That's why this is not a permissive autolink: + +```````````````````````````````` example +ssh://root@example.com +. +<p>ssh://root@example.com</p> +```````````````````````````````` + +As certain characters (`.`, `?`, `=`, `&`, `%`, `+`, `-`, `_`, `#`, `@`) can collide +with a normal text flow, they are not recognized as part of the URL unless another +alphanumeric character immediately follows. + +Therefore the question mark in this question is not part of the autolink's +contents: + +```````````````````````````````` example +Have you ever visited http://www.zombo.com? +. +<p>Have you ever visited <a href="http://www.zombo.com">http://www.zombo.com</a>?</p> +```````````````````````````````` + +But in contrast, in this example it is: + +```````````````````````````````` example +http://www.bing.com/search?q=md4c +. +<p><a href="http://www.bing.com/search?q=md4c">http://www.bing.com/search?q=md4c</a></p> +```````````````````````````````` + +Please note that e-mail addresses without the `mailto:` scheme are not permissive URL +autolinks, but MD4C implements another extension `MD_FLAG_PERMISSIVEEMAILAUTOLINKS` +which can be used to enable that too. + +```````````````````````````````` example +mailto:john.doe@example.com +. 
+<p><a href="mailto:john.doe@example.com">mailto:john.doe@example.com</a></p> +```````````````````````````````` + +```````````````````````````````` example +john.doe@example.com +. +<p>john.doe@example.com</p> +```````````````````````````````` +