commit 2b740798ca631b2bcb139b7226e9a287e4034fa5
parent 04c82c6c0f918c8d88e180a0452dc77c1bca9deb
Author: Martin Mitas <mity@morous.org>
Date: Tue, 11 Oct 2016 00:36:39 +0200
Implement insecure character (NULL) replacement.
Diffstat:
4 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
@@ -74,7 +74,7 @@ more or less forms our to do list.
- **Preliminaries:**
- [x] 2.1 Character and lines
- [x] 2.2 Tabs
- - [ ] 2.3 Insecure characters
+ - [x] 2.3 Insecure characters
- **Blocks and Inlines:**
- [x] 3.1 Precedence
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -163,6 +163,8 @@ hex_val(char ch)
static void
render_utf8_codepoint(struct membuffer* out, unsigned codepoint)
{
+ static const char utf8_replacement_char[] = { 0xef, 0xbf, 0xbd };
+
unsigned char utf8[4];
size_t n;
@@ -186,7 +188,10 @@ render_utf8_codepoint(struct membuffer* out, unsigned codepoint)
utf8[3] = 0x80 + ((codepoint >> 0) & 0x3f);
}
- membuf_append_escaped(out, (char*)utf8, n);
+ if(0 < codepoint && codepoint <= 0x10ffff)
+ membuf_append_escaped(out, (char*)utf8, n);
+ else
+ membuf_append(out, utf8_replacement_char, 3);
}
/* Translate entity to its UTF-8 equivalent, or output the verbatim one
@@ -215,10 +220,8 @@ render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size)
codepoint = 10 * codepoint + (text[i] - '0');
}
- if(codepoint <= 0x10ffff) { /* Max. Unicode codepoint. */
- render_utf8_codepoint(out, codepoint);
- return;
- }
+ render_utf8_codepoint(out, codepoint);
+ return;
} else {
/* Named entity (e.g. "&nbsp;"). */
const char* ent;
@@ -306,6 +309,7 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat
struct membuffer* out = (struct membuffer*) userdata;
switch(type) {
+ case MD_TEXT_NULLCHAR: render_utf8_codepoint(out, 0x0000); break;
case MD_TEXT_BR: MEMBUF_APPEND_LITERAL(out, "<br>\n"); break;
case MD_TEXT_SOFTBR: MEMBUF_APPEND_LITERAL(out, "\n"); break;
case MD_TEXT_HTML: membuf_append(out, text, size); break;
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -258,6 +258,35 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
return 0;
}
+/* Send text to the renderer's text() callback, reporting every embedded NUL
+ * character as a separate MD_TEXT_NULLCHAR chunk instead of passing it
+ * through as part of the text.
+ *
+ * The input is split into maximal runs that contain no NUL character; each
+ * run is sent with the given type, and one MD_TEXT_NULLCHAR callback is
+ * issued for each NUL character found between the runs.
+ *
+ * Returns 0 when the whole input has been sent, or the first non-zero value
+ * returned by the text() callback (the rest of the input is then skipped).
+ */
+static int
+md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
+{
+    OFF off = 0;
+    int ret = 0;
+
+    while(1) {
+        /* Find the end of the current run free of NUL characters. */
+        while(off < size  &&  str[off] != _T('\0'))
+            off++;
+
+        if(off > 0) {
+            ret = ctx->r.text(type, str, off, ctx->userdata);
+            if(ret != 0)
+                return ret;
+
+            str += off;
+            size -= off;
+            off = 0;
+        }
+
+        if(off >= size)
+            return 0;
+
+        /* Input remains and the scan stopped at offset 0, hence str[0] is
+         * a NUL character. */
+        ret = ctx->r.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
+        if(ret != 0)
+            return ret;
+
+        /* Consume the NUL. Without this the scan above would stop at the
+         * very same character again and the loop would never terminate. */
+        str++;
+        size--;
+    }
+}
+
#define MD_CHECK(func) \
do { \
@@ -313,6 +342,17 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
} \
} while(0)
+/* Send text which may contain NUL characters to the renderer by way of
+ * md_text_with_null_replacement(). Nothing is sent when size is zero.
+ *
+ * The macro relies on the enclosing function: it uses the variable `ctx`,
+ * assigns to the variable `ret` and, when the callback returns non-zero,
+ * logs the failure and jumps to the label `abort`.
+ *
+ * Arguments are parenthesized so that arbitrary expressions can be passed;
+ * note that `size` is evaluated twice.
+ */
+#define MD_TEXT_INSECURE(type, str, size)                                    \
+    do {                                                                     \
+        if((size) > 0) {                                                     \
+            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
+            if(ret != 0) {                                                   \
+                md_log(ctx, "Aborted from text() callback.");                \
+                goto abort;                                                  \
+            }                                                                \
+        }                                                                    \
+    } while(0)
+
/******************************
*** Recognizing raw HTML ***
@@ -689,6 +729,7 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_
* ';': Maybe end of entity.
* '<': Maybe start of raw HTML.
* '>': Maybe end of raw HTML.
+ * '\0': NUL character (needs replacement).
*
* Note that not all instances of these chars in the text imply creation of the
* structure. Only those which have (or may have, after we see more context)
@@ -940,6 +981,13 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
}
}
+ /* NULL character. */
+ if(ch == _T('\0')) {
+ PUSH_MARK(ch, off, off+1, 0);
+ off++;
+ continue;
+ }
+
off++;
}
}
@@ -1119,7 +1167,7 @@ md_analyze_entity(MD_CTX* ctx, int mark_index)
/* Table of precedence of various span types. */
static const CHAR* md_precedence_table[] = {
_T("`<>"), /* Code spans; raw HTML. */
- _T("&") /* Entities. */
+ _T("&"), /* Entities. */
};
static void
@@ -1164,6 +1212,13 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_
i++;
}
+
+ for(i = 0; i < ctx->n_marks; i++) {
+ MD_MARK* mark = &ctx->marks[i];
+
+ if(mark->ch == '\0')
+ mark->flags |= MD_MARK_RESOLVED;
+ }
}
/* Analyze marks (build ctx->marks). */
@@ -1257,6 +1312,10 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
case '&': /* Entity. */
MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
break;
+
+ case '\0':
+ MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
+ break;
}
off = mark->end;
@@ -1346,7 +1405,7 @@ md_process_verbatim_block(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_LINE* lin
MD_TEXT(text_type, indent_str, indent);
/* Output the code line itself. */
- MD_TEXT(text_type, STR(line->beg), line->end - line->beg);
+ MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
/* Enforce end-of-line. */
MD_TEXT(text_type, _T("\n"), 1);
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -98,6 +98,16 @@ enum MD_TEXTTYPE_tag {
/* Normal text. */
MD_TEXT_NORMAL = 0,
+ /* NUL character. Markdown is supposed to replace the NUL character with
+ * the replacement character U+FFFD, but since we are encoding-agnostic,
+ * the caller has to do that. */
+ MD_TEXT_NULLCHAR,
+
+ /* Line breaks.
+ * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */
+ MD_TEXT_BR, /* <br> (hard break) */
+ MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */
+
/* Entity.
* (a) Named entity, e.g. &nbsp;
* (Note MD4C does not have a list of known entities.
@@ -110,11 +120,6 @@ enum MD_TEXTTYPE_tag {
* text into the MD_RENDERER::text_callback(). */
MD_TEXT_ENTITY,
- /* Line breaks.
- * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */
- MD_TEXT_BR, /* <br> (hard break) */
- MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */
-
/* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
* If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
* '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this