md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit c085ab5cfed594592ad9ad0b3d8a801cc2ef0db8
parent ba29d0075eb54cfa8ceee10a0d193fef96a0e674
Author: Martin Mitas <mity@morous.org>
Date:   Mon, 12 Dec 2016 23:23:51 +0100

Implement support for entities outside normal text flow (issue #5).

 * Change API (md4c.h) to propagate different substring type info to renderer.
 * Implement/refactor related code in the parser.
 * Adapt renderer (md2html) to the new API.

Diffstat:
M md2html/md2html.c | 210 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M md4c/md4c.c | 544 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M md4c/md4c.h | 52 +++++++++++++++++++++++++++++++++++-----------------
3 files changed, 475 insertions(+), 331 deletions(-)

diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -180,89 +180,6 @@ membuf_append_url_escaped(struct membuffer* buf, const char* data, MD_SIZE size) *** HTML rendering helper functions *** *****************************************/ -static int image_nesting_level = 0; - -static void -open_ol_block(struct membuffer* out, const MD_BLOCK_OL_DETAIL* det) -{ - char buf[64]; - - if(det->start == 1) { - MEMBUF_APPEND_LITERAL(out, "<ol>"); - return; - } - - snprintf(buf, sizeof(buf), "<ol start=\"%u\">", det->start); - MEMBUF_APPEND_LITERAL(out, buf); -} - -static void -open_code_block(struct membuffer* out, const MD_BLOCK_CODE_DETAIL* det) -{ - MEMBUF_APPEND_LITERAL(out, "<pre><code"); - - /* If known, output the HTML 5 attribute class="language-LANGNAME". */ - if(det->lang != NULL) { - MEMBUF_APPEND_LITERAL(out, " class=\"language-"); - membuf_append_escaped(out, det->lang, det->lang_size); - MEMBUF_APPEND_LITERAL(out, "\""); - } - - MEMBUF_APPEND_LITERAL(out, ">"); -} - -static void -open_td_block(struct membuffer* out, const char* cell_type, const MD_BLOCK_TD_DETAIL* det) -{ - MEMBUF_APPEND_LITERAL(out, "<"); - MEMBUF_APPEND_LITERAL(out, cell_type); - - switch(det->align) { - case MD_ALIGN_LEFT: MEMBUF_APPEND_LITERAL(out, " align=\"left\">"); break; - case MD_ALIGN_CENTER: MEMBUF_APPEND_LITERAL(out, " align=\"center\">"); break; - case MD_ALIGN_RIGHT: MEMBUF_APPEND_LITERAL(out, " align=\"right\">"); break; - default: MEMBUF_APPEND_LITERAL(out, ">"); break; - } -} - -static void -open_a_span(struct membuffer* out, const MD_SPAN_A_DETAIL* det) -{ - MEMBUF_APPEND_LITERAL(out, "<a href=\""); - membuf_append_url_escaped(out, det->href, det->href_size); - - if(det->title != NULL) { - MEMBUF_APPEND_LITERAL(out, "\" title=\""); - membuf_append_escaped(out, det->title, det->title_size); - } - - MEMBUF_APPEND_LITERAL(out, "\">"); -} - -static void -open_img_span(struct membuffer* out, const MD_SPAN_IMG_DETAIL* det) -{ - MEMBUF_APPEND_LITERAL(out, "<img src=\""); - 
membuf_append_url_escaped(out, det->src, det->src_size); - - MEMBUF_APPEND_LITERAL(out, "\" alt=\""); - - image_nesting_level++; -} - -static void -close_img_span(struct membuffer* out, const MD_SPAN_IMG_DETAIL* det) -{ - if(det->title != NULL) { - MEMBUF_APPEND_LITERAL(out, "\" title=\""); - membuf_append_escaped(out, det->title, det->title_size); - } - - MEMBUF_APPEND_LITERAL(out, "\">"); - - image_nesting_level--; -} - static unsigned hex_val(char ch) { @@ -275,7 +192,8 @@ hex_val(char ch) } static void -render_utf8_codepoint(struct membuffer* out, unsigned codepoint) +render_utf8_codepoint(struct membuffer* out, unsigned codepoint, + void (*fn_append)(struct membuffer*, const char*, MD_SIZE)) { static const char utf8_replacement_char[] = { 0xef, 0xbf, 0xbd }; @@ -303,18 +221,19 @@ render_utf8_codepoint(struct membuffer* out, unsigned codepoint) } if(0 < codepoint && codepoint <= 0x10ffff) - membuf_append_escaped(out, (char*)utf8, n); + fn_append(out, (char*)utf8, n); else - membuf_append(out, utf8_replacement_char, 3); + fn_append(out, utf8_replacement_char, 3); } /* Translate entity to its UTF-8 equivalent, or output the verbatim one * if such entity is unknown (or if the translation is disabled). */ static void -render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size) +render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size, + void (*fn_append)(struct membuffer*, const char*, MD_SIZE)) { if(want_verbatim_entities) { - membuf_append(out, text, size); + fn_append(out, text, size); return; } @@ -334,20 +253,123 @@ render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size) codepoint = 10 * codepoint + (text[i] - '0'); } - render_utf8_codepoint(out, codepoint); + render_utf8_codepoint(out, codepoint, fn_append); return; } else { - /* Named entity (e.g. "&nbsp;". */ + /* Named entity (e.g. "&nbsp;"). 
*/ const char* ent; ent = entity_lookup(text, size); if(ent != NULL) { - membuf_append_escaped(out, ent, strlen(ent)); + fn_append(out, ent, strlen(ent)); return; } } - membuf_append_escaped(out, text, size); + fn_append(out, text, size); +} + +static void +render_attribute(struct membuffer* out, const MD_ATTRIBUTE* attr, + void (*fn_append)(struct membuffer*, const char*, MD_SIZE)) +{ + int i; + + for(i = 0; attr->substr_offsets[i] < attr->size; i++) { + MD_TEXTTYPE type = attr->substr_types[i]; + MD_OFFSET off = attr->substr_offsets[i]; + MD_SIZE size = attr->substr_offsets[i+1] - off; + const MD_CHAR* text = attr->text + off; + + switch(type) { + case MD_TEXT_ENTITY: render_entity(out, text, size, fn_append); break; + default: fn_append(out, text, size); break; + } + } +} + + +static int image_nesting_level = 0; + +static void +open_ol_block(struct membuffer* out, const MD_BLOCK_OL_DETAIL* det) +{ + char buf[64]; + + if(det->start == 1) { + MEMBUF_APPEND_LITERAL(out, "<ol>"); + return; + } + + snprintf(buf, sizeof(buf), "<ol start=\"%u\">", det->start); + MEMBUF_APPEND_LITERAL(out, buf); +} + +static void +open_code_block(struct membuffer* out, const MD_BLOCK_CODE_DETAIL* det) +{ + MEMBUF_APPEND_LITERAL(out, "<pre><code"); + + /* If known, output the HTML 5 attribute class="language-LANGNAME". 
*/ + if(det->lang.text != NULL) { + MEMBUF_APPEND_LITERAL(out, " class=\"language-"); + render_attribute(out, &det->lang, membuf_append_escaped); + MEMBUF_APPEND_LITERAL(out, "\""); + } + + MEMBUF_APPEND_LITERAL(out, ">"); +} + +static void +open_td_block(struct membuffer* out, const char* cell_type, const MD_BLOCK_TD_DETAIL* det) +{ + MEMBUF_APPEND_LITERAL(out, "<"); + MEMBUF_APPEND_LITERAL(out, cell_type); + + switch(det->align) { + case MD_ALIGN_LEFT: MEMBUF_APPEND_LITERAL(out, " align=\"left\">"); break; + case MD_ALIGN_CENTER: MEMBUF_APPEND_LITERAL(out, " align=\"center\">"); break; + case MD_ALIGN_RIGHT: MEMBUF_APPEND_LITERAL(out, " align=\"right\">"); break; + default: MEMBUF_APPEND_LITERAL(out, ">"); break; + } +} + +static void +open_a_span(struct membuffer* out, const MD_SPAN_A_DETAIL* det) +{ + MEMBUF_APPEND_LITERAL(out, "<a href=\""); + render_attribute(out, &det->href, membuf_append_url_escaped); + + if(det->title.text != NULL) { + MEMBUF_APPEND_LITERAL(out, "\" title=\""); + render_attribute(out, &det->title, membuf_append_escaped); + } + + MEMBUF_APPEND_LITERAL(out, "\">"); +} + +static void +open_img_span(struct membuffer* out, const MD_SPAN_IMG_DETAIL* det) +{ + MEMBUF_APPEND_LITERAL(out, "<img src=\""); + render_attribute(out, &det->src, membuf_append_url_escaped); + + MEMBUF_APPEND_LITERAL(out, "\" alt=\""); + + image_nesting_level++; +} + +static void +close_img_span(struct membuffer* out, const MD_SPAN_IMG_DETAIL* det) +{ + if(det->title.text != NULL) { + MEMBUF_APPEND_LITERAL(out, "\" title=\""); + render_attribute(out, &det->title, membuf_append_escaped); + } + + MEMBUF_APPEND_LITERAL(out, "\">"); + + image_nesting_level--; } @@ -463,11 +485,11 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat struct membuffer* out = (struct membuffer*) userdata; switch(type) { - case MD_TEXT_NULLCHAR: render_utf8_codepoint(out, 0x0000); break; + case MD_TEXT_NULLCHAR: render_utf8_codepoint(out, 0x0000, membuf_append); break; 
case MD_TEXT_BR: MEMBUF_APPEND_LITERAL(out, (image_nesting_level == 0 ? "<br>\n" : " ")); break; case MD_TEXT_SOFTBR: MEMBUF_APPEND_LITERAL(out, (image_nesting_level == 0 ? "\n" : " ")); break; case MD_TEXT_HTML: membuf_append(out, text, size); break; - case MD_TEXT_ENTITY: render_entity(out, text, size); break; + case MD_TEXT_ENTITY: render_entity(out, text, size, membuf_append_escaped); break; default: membuf_append_escaped(out, text, size); break; } diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -840,17 +840,15 @@ struct MD_UNICODE_FOLD_INFO_tag { *************************************/ /* Fill buffer with copy of the string between 'beg' and 'end' but replace any - * line breaks with given replacement character and also optionally resolve any - * escape sequences. + * line breaks with given replacement character. * * NOTE: Caller is responsible to make sure the buffer is large enough. * (Given the output is always shorter then input, (end - beg) is good idea * what the caller should allocate.) */ static void -md_do_normalize_string(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, - CHAR line_break_replacement_char, int resolve_escapes, - CHAR* buffer, SZ* p_size) +md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, + CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size) { CHAR* ptr = buffer; int line_index = 0; @@ -863,13 +861,6 @@ md_do_normalize_string(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int line_end = end; while(off < line_end) { - if(resolve_escapes && CH(off) == _T('\\') && - off+1 < end && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { - if(ISNEWLINE(off+1)) - break; - off++; - } - *ptr = CH(off); ptr++; off++; @@ -888,12 +879,11 @@ md_do_normalize_string(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int } } -/* Wrapper of md_do_normalize_string() which allocates new buffer for the - * output string. */ +/* Wrapper of md_merge_lines() which allocates new buffer for the output string. 
+ */ static int -md_normalize_string(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, - CHAR line_break_replacement_char, int resolve_escapes, - CHAR** p_str, SZ* p_size) +md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, + CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size) { CHAR* buffer; @@ -903,8 +893,8 @@ md_normalize_string(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_l return -1; } - md_do_normalize_string(ctx, beg, end, lines, n_lines, - line_break_replacement_char, resolve_escapes, buffer, p_size); + md_merge_lines(ctx, beg, end, lines, n_lines, + line_break_replacement_char, buffer, p_size); *p_str = buffer; return 0; @@ -1246,6 +1236,218 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_ } +/**************************** + *** Recognizing Entity *** + ****************************/ + +static int +md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8) + off++; + + if(1 <= off - beg && off - beg <= 8) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8) + off++; + + if(1 <= off - beg && off - beg <= 8) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) +{ + OFF off = beg; + + if(off <= max_end && ISALPHA_(text[off])) + off++; + else + return FALSE; + + while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48) + off++; + + if(2 <= off - beg && off - beg <= 48) { + *p_end = off; + return TRUE; + } else { + return FALSE; + } +} + +static int +md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF 
max_end, OFF* p_end) +{ + int is_contents; + OFF off = beg; + + MD_ASSERT(text[off] == _T('&')); + off++; + + if(off+1 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X'))) + is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off); + else if(off < max_end && CH(off) == _T('#')) + is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off); + else + is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off); + + if(is_contents && off < max_end && text[off] == _T(';')) { + *p_end = off+1; + return TRUE; + } else { + return FALSE; + } +} + +static inline int +md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) +{ + return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end); +} + + +/****************************** + *** Attribute Management *** + ******************************/ + +typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD; +struct MD_ATTRIBUTE_BUILD_tag { + MD_TEXTTYPE* substr_types; + OFF* substr_offsets; + int substr_count; + int substr_alloc; +}; + + +#define MD_BUILD_ATTR_NO_ESCAPES 0x0001 + +static int +md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build, + MD_TEXTTYPE type, OFF off) +{ + if(build->substr_count >= build->substr_alloc) { + MD_TEXTTYPE* new_substr_types; + OFF* new_substr_offsets; + + build->substr_alloc = (build->substr_alloc == 0 ? 
8 : build->substr_alloc * 2); + + new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types, + (build->substr_alloc+1) * sizeof(MD_TEXTTYPE)); + if(new_substr_types == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + new_substr_offsets = (OFF*) realloc(build->substr_offsets, + build->substr_alloc * sizeof(OFF)); + if(new_substr_offsets == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + + build->substr_types = new_substr_types; + build->substr_offsets = new_substr_offsets; + } + + build->substr_types[build->substr_count] = type; + build->substr_offsets[build->substr_count] = off; + build->substr_count++; + return 0; +} + +static int +md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, + unsigned flags, MD_ATTRIBUTE* attr) +{ + MD_ATTRIBUTE_BUILD build = {0}; + CHAR* text; + OFF raw_off = 0; + OFF off = 0; + int ret = 0; + + if(raw_size == 0) { + static const MD_TEXTTYPE empty_types[] = { MD_TEXT_NORMAL }; + static const OFF empty_offsets[] = { 0, 0 }; + + attr->text = NULL; + attr->size = 0; + attr->substr_types = empty_types; + attr->substr_offsets = empty_offsets; + return 0; + } + + text = (CHAR*) malloc(raw_size * sizeof(CHAR)); + if(text == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + + while(raw_off < raw_size) { + if(raw_text[raw_off] == _T('&')) { + OFF ent_end; + + if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) { + MD_CHECK(md_build_attr_append_substr(ctx, &build, MD_TEXT_ENTITY, off)); + memcpy(text + off, raw_text + raw_off, ent_end - raw_off); + off += ent_end - raw_off; + raw_off = ent_end; + continue; + } + } + + if(build.substr_count == 0 || build.substr_types[build.substr_count-1] != MD_TEXT_NORMAL) + MD_CHECK(md_build_attr_append_substr(ctx, &build, MD_TEXT_NORMAL, off)); + + if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) && + raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size && + (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1]))) + raw_off++; + + text[off++] = 
raw_text[raw_off++]; + } + build.substr_offsets[build.substr_count] = off; + + attr->text = text; + attr->size = off; + attr->substr_offsets = build.substr_offsets; + attr->substr_types = build.substr_types; + return 0; + +abort: + free(text); + free(build.substr_offsets); + free(build.substr_types); + return -1; +} + +static void +md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE* attr) +{ + if(attr->size > 0) { + free((void*) attr->text); + free((void*) attr->substr_types); + free((void*) attr->substr_offsets); + } +} + + /*************************** *** Recognizing Links *** ***************************/ @@ -1260,7 +1462,6 @@ struct MD_LINK_REF_DEF_tag { SZ label_size : 24; unsigned label_needs_free : 1; unsigned title_needs_free : 1; - unsigned dest_contains_escape : 1; SZ title_size; OFF dest_beg; OFF dest_end; @@ -1273,7 +1474,6 @@ struct MD_LINK_ATTR_tag { CHAR* title; SZ title_size; - int dest_contains_escape; int title_needs_free; }; @@ -1347,8 +1547,7 @@ md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, static int md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end, - int* p_contains_escape) + OFF* p_contents_beg, OFF* p_contents_end) { OFF off = beg; @@ -1356,11 +1555,8 @@ md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, return FALSE; off++; - *p_contains_escape = FALSE; - while(off < max_end) { if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { - *p_contains_escape = TRUE; off += 2; continue; } @@ -1384,17 +1580,13 @@ md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, static int md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end, - int* p_contains_escape) + OFF* p_contents_beg, OFF* p_contents_end) { OFF off = beg; int in_parentheses = 0; - *p_contains_escape = FALSE; - while(off < max_end) { if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { - 
*p_contains_escape = TRUE; off += 2; continue; } @@ -1429,25 +1621,18 @@ md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, return TRUE; } -static int +static inline int md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end, - int* p_contains_escape) + OFF* p_contents_beg, OFF* p_contents_end) { - if(md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end, p_contains_escape)) - return TRUE; - - if(md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end, p_contains_escape)) - return TRUE; - - return FALSE; + return (md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end) || + md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end)); } static int md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF* p_end, int* p_beg_line_index, int* p_end_line_index, - OFF* p_contents_beg, OFF* p_contents_end, - int* p_has_escape) + OFF* p_contents_beg, OFF* p_contents_end) { OFF off = beg; CHAR closer_char; @@ -1475,14 +1660,12 @@ md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, off++; *p_contents_beg = off; - *p_has_escape = FALSE; while(line_index < n_lines) { OFF line_end = lines[line_index].end; while(off < line_end) { if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { - *p_has_escape = TRUE; off++; } else if(CH(off) == closer_char) { /* Success. 
*/ @@ -1518,12 +1701,10 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) int label_is_multiline; OFF dest_contents_beg; OFF dest_contents_end; - int dest_contains_escape; OFF title_contents_beg; OFF title_contents_end; int title_contents_line_index; int title_is_multiline; - int title_has_escape; OFF off; int line_index = 0; int tmp_line_index; @@ -1554,14 +1735,14 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) /* Link destination. */ if(!md_is_link_destination(ctx, off, lines[line_index].end, - &off, &dest_contents_beg, &dest_contents_end, &dest_contains_escape)) + &off, &dest_contents_beg, &dest_contents_end)) return FALSE; /* (Optional) title. Note we interpret it as an title only if nothing * more follows on its last line. */ if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, &off, &title_contents_line_index, &tmp_line_index, - &title_contents_beg, &title_contents_end, &title_has_escape) + &title_contents_beg, &title_contents_end) && off >= lines[line_index + tmp_line_index].end) { title_is_multiline = (tmp_line_index != title_contents_line_index); @@ -1604,27 +1785,26 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } else { SZ label_size; - MD_CHECK(md_normalize_string(ctx, label_contents_beg, label_contents_end, + MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end, lines + label_contents_line_index, n_lines - label_contents_line_index, - _T(' '), FALSE, &def->label, &label_size)); + _T(' '), &def->label, &label_size)); def->label_size = label_size; def->label_needs_free = TRUE; } def->dest_beg = dest_contents_beg; def->dest_end = dest_contents_end; - def->dest_contains_escape = dest_contains_escape; if(title_contents_beg >= title_contents_end) { def->title = NULL; def->title_size = 0; - } else if(!title_is_multiline && !title_has_escape) { + } else if(!title_is_multiline) { def->title = (CHAR*) 
STR(title_contents_beg); def->title_size = title_contents_end - title_contents_beg; } else { - MD_CHECK(md_normalize_string(ctx, title_contents_beg, title_contents_end, + MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, lines + title_contents_line_index, n_lines - title_contents_line_index, - _T('\n'), TRUE, &def->title, &def->title_size)); + _T('\n'), &def->title, &def->title_size)); def->title_needs_free = TRUE; } @@ -1758,8 +1938,8 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, end_line++; if(beg_line != end_line) { - MD_CHECK(md_normalize_string(ctx, beg, end, beg_line, - n_lines - (beg_line - lines), _T(' '), FALSE, &label, &label_size)); + MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line, + n_lines - (beg_line - lines), _T(' '), &label, &label_size)); } else { label = (CHAR*) STR(beg); label_size = end - beg; @@ -1769,7 +1949,6 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, if(ret == TRUE) { attr->dest_beg = def->dest_beg; attr->dest_end = def->dest_end; - attr->dest_contains_escape = def->dest_contains_escape; attr->title = def->title; attr->title_size = def->title_size; attr->title_needs_free = FALSE; @@ -1792,7 +1971,6 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF title_contents_end; int title_contents_line_index; int title_is_multiline; - int title_has_escape; OFF off = beg; int ret = FALSE; @@ -1814,7 +1992,7 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, /* (Optional) link destination. */ if(!md_is_link_destination(ctx, off, lines[line_index].end, - &off, &attr->dest_beg, &attr->dest_end, &attr->dest_contains_escape)) { + &off, &attr->dest_beg, &attr->dest_end)) { attr->dest_beg = off; attr->dest_end = off; } @@ -1822,7 +2000,7 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, /* (Optional) title. 
*/ if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, &off, &title_contents_line_index, &tmp_line_index, - &title_contents_beg, &title_contents_end, &title_has_escape)) + &title_contents_beg, &title_contents_end)) { title_is_multiline = (tmp_line_index != title_contents_line_index); title_contents_line_index += line_index; @@ -1852,14 +2030,14 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, attr->title = NULL; attr->title_size = 0; attr->title_needs_free = FALSE; - } else if(!title_is_multiline && !title_has_escape) { + } else if(!title_is_multiline) { attr->title = (CHAR*) STR(title_contents_beg); attr->title_size = title_contents_end - title_contents_beg; attr->title_needs_free = FALSE; } else { - MD_CHECK(md_normalize_string(ctx, title_contents_beg, title_contents_end, + MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, lines + title_contents_line_index, n_lines - title_contents_line_index, - _T('\n'), TRUE, &attr->title, &attr->title_size)); + _T('\n'), &attr->title, &attr->title_size)); attr->title_needs_free = TRUE; } @@ -1978,7 +2156,6 @@ struct MD_MARK_tag { /* Mark flags specific for various mark types (so they can share bits). */ #define MD_MARK_INTRAWORD 0x40 /* Helper for emphasis '*', '_' ("the rule of 3"). */ #define MD_MARK_AUTOLINK 0x40 /* Distinguisher for '<', '>'. */ -#define MD_MARK_LINKDESTCONTAINESESCAPE 0x40 /* Flag that link destination contains an escape. 
*/ static MD_MARK* @@ -2872,8 +3049,6 @@ md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines) MD_ASSERT(ctx->marks[opener_index+1].ch == 'D'); ctx->marks[opener_index+1].beg = attr.dest_beg; ctx->marks[opener_index+1].end = attr.dest_end; - if(attr.dest_contains_escape) - ctx->marks[opener_index+1].flags |= MD_MARK_LINKDESTCONTAINESESCAPE; MD_ASSERT(ctx->marks[opener_index+2].ch == 'D'); md_mark_store_ptr(ctx, opener_index+2, attr.title); @@ -2905,7 +3080,7 @@ md_analyze_entity(MD_CTX* ctx, int mark_index) { MD_MARK* opener = &ctx->marks[mark_index]; MD_MARK* closer; - OFF beg, end, off; + OFF off; /* Cannot be entity if there is no closer as the next mark. * (Any other mark between would mean strange character which cannot be @@ -2920,50 +3095,12 @@ md_analyze_entity(MD_CTX* ctx, int mark_index) if(closer->ch != ';') return; - if(CH(opener->end) == _T('#')) { - if(CH(opener->end+1) == _T('x') || CH(opener->end+1) == _T('X')) { - /* It can be only a hexadecimal entity. - * Check it has 1 - 8 hexadecimal digits. */ - beg = opener->end+2; - end = closer->beg; - if(!(1 <= end - beg && end - beg <= 8)) - return; - for(off = beg; off < end; off++) { - if(!ISXDIGIT(off)) - return; - } - } else { - /* It can be only a decimal entity. - * Check it has 1 - 8 decimal digits. */ - beg = opener->end+1; - end = closer->beg; - if(!(1 <= end - beg && end - beg <= 8)) - return; - for(off = beg; off < end; off++) { - if(!ISDIGIT(off)) - return; - } - } - } else { - /* It can be only a named entity. - * Check it starts with letter and 1-47 alnum chars follow. */ - beg = opener->end; - end = closer->beg; - if(!(2 <= end - beg && end - beg <= 48)) - return; - if(!ISALPHA(beg)) - return; - for(off = beg + 1; off < end; off++) { - if(!ISALNUM(off)) - return; - } - } + if(md_is_entity(ctx, opener->beg, closer->end, &off)) { + MD_ASSERT(off == closer->end); - /* Mark us as an entity. - * As entity has no span, we may just turn the range into a single mark. 
- * (This also causes we do not get called for ';'. */ - md_resolve_range(ctx, NULL, mark_index, mark_index+1); - opener->end = closer->end; + md_resolve_range(ctx, NULL, mark_index, mark_index+1); + opener->end = closer->end; + } } static void @@ -3230,75 +3367,36 @@ md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg } static int -md_unescape_link_dest(MD_CTX* ctx, OFF beg, OFF end, SZ* p_size) +md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type, + const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest, + const CHAR* title, SZ title_size) { - CHAR* ptr; - OFF off = beg; + MD_SPAN_A_DETAIL det; int ret = 0; - MD_TEMP_BUFFER((end - beg) * sizeof(CHAR)); - ptr = ctx->buffer; + /* Note we here rely on fact that MD_SPAN_A_DETAIL and + * MD_SPAN_IMG_DETAIL are binary-compatible. */ + memset(&det, 0, sizeof(MD_SPAN_A_DETAIL)); + MD_CHECK(md_build_attribute(ctx, dest, dest_size, + (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0), + &det.href)); + MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title)); - while(off < end) { - if(CH(off) == _T('\\') && off+1 < end && ISPUNCT(off+1)) { - off++; - continue; - } - - *ptr = CH(off); - ptr++; - off++; - } - - *p_size = ptr - ctx->buffer; - -abort: - return ret; -} - -static int -md_setup_span_a_detail(MD_CTX* ctx, const MD_MARK* mark, MD_SPAN_A_DETAIL* det) -{ - const MD_MARK* dest_mark = mark+1; - const MD_MARK* title_mark = mark+2; - int ret = 0; - - MD_ASSERT(dest_mark->ch == 'D'); - if(dest_mark->beg < dest_mark->end) { - if(dest_mark->flags & MD_MARK_LINKDESTCONTAINESESCAPE) { - MD_CHECK(md_unescape_link_dest(ctx, dest_mark->beg, dest_mark->end, &det->href_size)); - det->href = ctx->buffer; - } else { - det->href = STR(dest_mark->beg); - det->href_size = dest_mark->end - dest_mark->beg; - } - } else { - det->href = NULL; - det->href_size = 0; - } - - MD_ASSERT(title_mark->ch == 'D'); - det->title = md_mark_get_ptr(ctx, title_mark - ctx->marks); - 
det->title_size = title_mark->prev; + if(enter) + MD_ENTER_SPAN(type, &det); + else + MD_LEAVE_SPAN(type, &det); abort: + md_free_attribute(ctx, &det.href); + md_free_attribute(ctx, &det.title); return ret; } -static inline int -md_setup_span_img_detail(MD_CTX* ctx, MD_MARK* mark, MD_SPAN_IMG_DETAIL* det) -{ - /* MD_SPAN_A_DETAIL and MD_SPAN_IMG_DETAIL are binary-compatible. */ - return md_setup_span_a_detail(ctx, mark, (MD_SPAN_A_DETAIL*) det); -} - /* Render the output, accordingly to the analyzed ctx->marks. */ static int md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { - union { - MD_SPAN_A_DETAIL a; - } det; MD_TEXTTYPE text_type; const MD_LINE* line = lines; MD_MARK* prev_mark = NULL; @@ -3373,15 +3471,21 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) case '[': /* Link, image. */ case '!': - /* Note we here rely on fact that MD_SPAN_A_DETAIL and - * MD_SPAN_IMG_DETAIL are binary-compatible. */ - MD_CHECK(md_setup_span_a_detail(ctx, mark, &det.a)); - MD_ENTER_SPAN((mark->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A), &det.a); - break; case ']': - MD_CHECK(md_setup_span_a_detail(ctx, &ctx->marks[mark->prev], &det.a)); - MD_LEAVE_SPAN((ctx->marks[mark->prev].ch == '!' ? MD_SPAN_IMG : MD_SPAN_A), &det.a); + { + const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]); + const MD_MARK* dest_mark = opener+1; + const MD_MARK* title_mark = opener+2; + + MD_ASSERT(dest_mark->ch == 'D'); + MD_ASSERT(title_mark->ch == 'D'); + + MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'), + (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A), + STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE, + md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev)); break; + } case '<': case '>': /* Autolink or raw HTML. */ @@ -3397,30 +3501,24 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) case '@': /* Permissive e-mail autolink. */ case ':': /* Permissive URL autolink. 
*/ - if(mark->flags & MD_MARK_OPENER) { - if(mark->ch == '@') { - SZ sz = 7 + ctx->marks[mark->next].beg - mark->end; - - MD_TEMP_BUFFER(sz * sizeof(CHAR)); - memcpy(ctx->buffer, _T("mailto:"), 7 * sizeof(CHAR)); - memcpy(ctx->buffer + 7, STR(mark->end), (sz-7) * sizeof(CHAR)); - - det.a.href_size = sz; - det.a.href = ctx->buffer; - } else { - det.a.href_size = ctx->marks[mark->next].beg - mark->end; - det.a.href = STR(mark->end); - } - det.a.title = NULL; - det.a.title_size = 0; - MD_ENTER_SPAN(MD_SPAN_A, (void*) &det); - } else { - /* The detail already has to be initialized: There cannot - * be any resolved mark between the autolink opener and - * closer. */ - MD_LEAVE_SPAN(MD_SPAN_A, (void*) &det); + { + const MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]); + const MD_MARK* closer = &ctx->marks[opener->next]; + const CHAR* dest = STR(opener->end); + SZ dest_size = closer->beg - opener->end; + + if(opener->ch == '@') { + dest_size += 7; + MD_TEMP_BUFFER(dest_size * sizeof(CHAR)); + memcpy(ctx->buffer, _T("mailto:"), 7 * sizeof(CHAR)); + memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR)); + dest = ctx->buffer; } + + MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER), + MD_SPAN_A, dest, dest_size, TRUE, NULL, 0)); break; + } case '&': /* Entity. */ MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg); @@ -3726,6 +3824,7 @@ md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DE const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1); OFF beg = fence_line->beg; OFF end = fence_line->end; + OFF lang_end; CHAR fence_ch = CH(fence_line->beg); int ret = 0; @@ -3740,25 +3839,26 @@ md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DE while(end > beg && CH(end-1) == _T(' ')) end--; - if(beg < end) { - MD_LINE line = { beg, end }; - SZ size; + /* Build info string attribute. 
*/ + MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info)); - MD_TEMP_BUFFER((end - beg) * sizeof(CHAR)); - md_do_normalize_string(ctx, beg, end, &line, 1, _T(' '), TRUE, ctx->buffer, &size); - - det->info = ctx->buffer; - det->info_size = size; - - det->lang = det->info; - while(det->lang_size < det->info_size && !ISWHITESPACE_(det->lang[det->lang_size])) - det->lang_size++; - } + /* Build lang string attribute. */ + lang_end = beg; + while(lang_end < end && !ISWHITESPACE(lang_end)) + lang_end++; + MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang)); abort: return ret; } +static inline void +md_clean_fenced_code_detail(MD_CTX* ctx, MD_BLOCK_CODE_DETAIL* det) +{ + md_free_attribute(ctx, &det->info); + md_free_attribute(ctx, &det->lang); +} + static int md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) { @@ -3767,6 +3867,7 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) MD_BLOCK_CODE_DETAIL code; } det; int is_in_tight_list; + int clean_fence_code_detail = FALSE; int ret = 0; memset(&det, 0, sizeof(det)); @@ -3783,8 +3884,11 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) case MD_BLOCK_CODE: /* For fenced code block, we may need to set the info string. 
*/ - if(block->data != 0) + if(block->data != 0) { + memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL)); + clean_fence_code_detail = TRUE; MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code)); + } break; default: @@ -3802,32 +3906,32 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) break; case MD_BLOCK_CODE: - ret = md_process_code_block_contents(ctx, (block->data != 0), - (const MD_VERBATIMLINE*)(block + 1), block->n_lines); + MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0), + (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); break; case MD_BLOCK_HTML: - ret = md_process_verbatim_block_contents(ctx, MD_TEXT_HTML, - (const MD_VERBATIMLINE*)(block + 1), block->n_lines); + MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML, + (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); break; case MD_BLOCK_TABLE: - ret = md_process_table_block_contents(ctx, block->data, - (const MD_LINE*)(block + 1), block->n_lines); + MD_CHECK(md_process_table_block_contents(ctx, block->data, + (const MD_LINE*)(block + 1), block->n_lines)); break; default: - ret = md_process_normal_block_contents(ctx, - (const MD_LINE*)(block + 1), block->n_lines); + MD_CHECK(md_process_normal_block_contents(ctx, + (const MD_LINE*)(block + 1), block->n_lines)); break; } - if(ret != 0) - goto abort; if(!is_in_tight_list || block->type != MD_BLOCK_P) MD_LEAVE_BLOCK(block->type, (void*) &det); abort: + if(clean_fence_code_detail) + md_clean_fenced_code_detail(ctx, &det.code); return ret; } diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -176,6 +176,35 @@ enum MD_ALIGN_tag { }; +/* String attribute. + * + * This wraps strings which are outside of a normal text flow and which are + * propagated within various detailed structures, but which still may contain + * string portions of different types like e.g. entities. + * + * So, for example, let's consider an image that has a title attribute string + * set to "foo &quot; bar". (Note the string size is 14.) 
+ * + * Then: + * -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0) + * -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4) + * -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10) + * -- [3]: (n/a) (n/a ; substr_offsets[3] == 14) + * + * Note that these conditions are guaranteed: + * -- substr_offsets[0] == 0 + * -- substr_offsets[LAST+1] == size + * -- Only MD_TEXT_NORMAL and MD_TEXT_ENTITY substrings can appear. + */ +typedef struct MD_ATTRIBUTE_tag MD_ATTRIBUTE; +struct MD_ATTRIBUTE_tag { + const MD_CHAR* text; + MD_SIZE size; + const MD_TEXTTYPE* substr_types; + const MD_OFFSET* substr_offsets; +}; + + /* Detailed info for MD_BLOCK_OL_DETAIL. */ typedef struct MD_BLOCK_OL_DETAIL_tag MD_BLOCK_OL_DETAIL; struct MD_BLOCK_OL_DETAIL_tag { @@ -191,13 +220,8 @@ struct MD_BLOCK_H_DETAIL_tag { /* Detailed info for MD_BLOCK_CODE. */ typedef struct MD_BLOCK_CODE_DETAIL_tag MD_BLOCK_CODE_DETAIL; struct MD_BLOCK_CODE_DETAIL_tag { - /* Complete "info string" */ - const MD_CHAR* info; - MD_SIZE info_size; - - /* Language portion of the info string. */ - const MD_CHAR* lang; - MD_SIZE lang_size; + MD_ATTRIBUTE info; + MD_ATTRIBUTE lang; }; /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */ @@ -209,21 +233,15 @@ struct MD_BLOCK_TD_DETAIL_tag { /* Detailed info for MD_SPAN_A. */ typedef struct MD_SPAN_A_DETAIL_tag MD_SPAN_A_DETAIL; struct MD_SPAN_A_DETAIL_tag { - const MD_CHAR* href; - MD_SIZE href_size; - - const MD_CHAR* title; - MD_SIZE title_size; + MD_ATTRIBUTE href; + MD_ATTRIBUTE title; }; /* Detailed info for MD_SPAN_IMG. */ typedef struct MD_SPAN_IMG_DETAIL_tag MD_SPAN_IMG_DETAIL; struct MD_SPAN_IMG_DETAIL_tag { - const MD_CHAR* src; - MD_SIZE src_size; - - const MD_CHAR* title; - MD_SIZE title_size; + MD_ATTRIBUTE src; + MD_ATTRIBUTE title; };