md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 2b740798ca631b2bcb139b7226e9a287e4034fa5
parent 04c82c6c0f918c8d88e180a0452dc77c1bca9deb
Author: Martin Mitas <mity@morous.org>
Date:   Tue, 11 Oct 2016 00:36:39 +0200

Implement insecure character (NULL) replacement.

Diffstat:
M README.md | 2 +-
M md2html/md2html.c | 14 +++++++++-----
M md4c/md4c.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M md4c/md4c.h | 15 ++++++++++-----
4 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md @@ -74,7 +74,7 @@ more or less forms our to do list. - **Preliminaries:** - [x] 2.1 Character and lines - [x] 2.2 Tabs - - [ ] 2.3 Insecure characters + - [x] 2.3 Insecure characters - **Blocks and Inlines:** - [x] 3.1 Precedence diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -163,6 +163,8 @@ hex_val(char ch) static void render_utf8_codepoint(struct membuffer* out, unsigned codepoint) { + static const char utf8_replacement_char[] = { 0xef, 0xbf, 0xbd }; + unsigned char utf8[4]; size_t n; @@ -186,7 +188,10 @@ render_utf8_codepoint(struct membuffer* out, unsigned codepoint) utf8[3] = 0x80 + ((codepoint >> 0) & 0x3f); } - membuf_append_escaped(out, (char*)utf8, n); + if(0 < codepoint && codepoint <= 0x10ffff) + membuf_append_escaped(out, (char*)utf8, n); + else + membuf_append(out, utf8_replacement_char, 3); } /* Translate entity to its UTF-8 equivalent, or output the verbatim one @@ -215,10 +220,8 @@ render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size) codepoint = 10 * codepoint + (text[i] - '0'); } - if(codepoint <= 0x10ffff) { /* Max. Unicode codepoint. */ - render_utf8_codepoint(out, codepoint); - return; - } + render_utf8_codepoint(out, codepoint); + return; } else { /* Named entity (e.g. "&nbsp;". 
*/ const char* ent; @@ -306,6 +309,7 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat struct membuffer* out = (struct membuffer*) userdata; switch(type) { + case MD_TEXT_NULLCHAR: render_utf8_codepoint(out, 0x0000); break; case MD_TEXT_BR: MEMBUF_APPEND_LITERAL(out, "<br>\n"); break; case MD_TEXT_SOFTBR: MEMBUF_APPEND_LITERAL(out, "\n"); break; case MD_TEXT_HTML: membuf_append(out, text, size); break; diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -258,6 +258,35 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) return 0; } +static int +md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size) +{ + OFF off = 0; + int ret = 0; + + while(1) { + while(off < size && str[off] != _T('\0')) + off++; + + if(off > 0) { + ret = ctx->r.text(type, str, off, ctx->userdata); + if(ret != 0) + return ret; + + str += off; + size -= off; + off = 0; + } + + if(off >= size) + return 0; + + ret = ctx->r.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata); + if(ret != 0) + return ret; + } +} + #define MD_CHECK(func) \ do { \ @@ -313,6 +342,17 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n) } \ } while(0) +#define MD_TEXT_INSECURE(type, str, size) \ + do { \ + if(size > 0) { \ + ret = md_text_with_null_replacement(ctx, type, str, size); \ + if(ret != 0) { \ + md_log(ctx, "Aborted from text() callback."); \ + goto abort; \ + } \ + } \ + } while(0) + /****************************** *** Recognizing raw HTML *** @@ -689,6 +729,7 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_ * ';': Maybe end of entity. * '<': Maybe start of raw HTML. * '>': Maybe end of raw HTML. + * '0': NULL char (need replacement). * * Note that not all instances of these chars in the text imply creation of the * structure. Only those which have (or may have, after we see more context) @@ -940,6 +981,13 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } } + /* NULL character. 
*/ + if(ch == _T('\0')) { + PUSH_MARK(ch, off, off+1, 0); + off++; + continue; + } + off++; } } @@ -1119,7 +1167,7 @@ md_analyze_entity(MD_CTX* ctx, int mark_index) /* Table of precedence of various span types. */ static const CHAR* md_precedence_table[] = { _T("`<>"), /* Code spans; raw HTML. */ - _T("&") /* Entities. */ + _T("&"), /* Entities. */ }; static void @@ -1164,6 +1212,13 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_ i++; } + + for(i = 0; i < ctx->n_marks; i++) { + MD_MARK* mark = &ctx->marks[i]; + + if(mark->ch == '\0') + mark->flags |= MD_MARK_RESOLVED; + } } /* Analyze marks (build ctx->marks). */ @@ -1257,6 +1312,10 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) case '&': /* Entity. */ MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg); break; + + case '\0': + MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1); + break; } off = mark->end; @@ -1346,7 +1405,7 @@ md_process_verbatim_block(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_LINE* lin MD_TEXT(text_type, indent_str, indent); /* Output the code line itself. */ - MD_TEXT(text_type, STR(line->beg), line->end - line->beg); + MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg); /* Enforce end-of-line. */ MD_TEXT(text_type, _T("\n"), 1); diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -98,6 +98,16 @@ enum MD_TEXTTYPE_tag { /* Normal text. */ MD_TEXT_NORMAL = 0, + /* NULL character. Markdown is supposed to replace NULL character with + * the replacement char U+FFFD but since we are encoding agnostic, caller + * has to do that. */ + MD_TEXT_NULLCHAR, + + /* Line breaks. + * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */ + MD_TEXT_BR, /* <br> (hard break) */ + MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */ + /* Entity. * (a) Named entity, e.g. &nbsp; * (Note MD4C does not have a lsit of known entities. 
@@ -110,11 +120,6 @@ enum MD_TEXTTYPE_tag { * text into the MD_RENDERER::text_callback(). */ MD_TEXT_ENTITY, - /* Line breaks. - * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */ - MD_TEXT_BR, /* <br> (hard break) */ - MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */ - /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`). * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this