Implement insecure character (NULL) replacement. - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit 2b740798ca631b2bcb139b7226e9a287e4034fa5
parent 04c82c6c0f918c8d88e180a0452dc77c1bca9deb
Author: Martin Mitas <mity@morous.org>
Date:   Tue, 11 Oct 2016 00:36:39 +0200

Implement insecure character (NULL) replacement.

Diffstat:
M README.md  | 2 +-
M md2html/md2html.c  | 14 +++++++++-----
M md4c/md4c.c  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M md4c/md4c.h  | 15 ++++++++++-----

4 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
@@ -74,7 +74,7 @@ more or less forms our to do list.
 - **Preliminaries:**
   - [x] 2.1 Character and lines
   - [x] 2.2 Tabs
-  - [ ] 2.3 Insecure characters
+  - [x] 2.3 Insecure characters
 
 - **Blocks and Inlines:**
   - [x] 3.1 Precedence
diff --git a/md2html/md2html.c b/md2html/md2html.c
@@ -163,6 +163,8 @@ hex_val(char ch)
 static void
 render_utf8_codepoint(struct membuffer* out, unsigned codepoint)
 {
+    static const char utf8_replacement_char[] = { 0xef, 0xbf, 0xbd };
+
     unsigned char utf8[4];
     size_t n;
 
@@ -186,7 +188,10 @@ render_utf8_codepoint(struct membuffer* out, unsigned codepoint)
         utf8[3] = 0x80 + ((codepoint >>  0) & 0x3f);
     }
 
-    membuf_append_escaped(out, (char*)utf8, n);
+    if(0 < codepoint  &&  codepoint <= 0x10ffff)
+        membuf_append_escaped(out, (char*)utf8, n);
+    else
+        membuf_append(out, utf8_replacement_char, 3);
 }
 
 /* Translate entity to its UTF-8 equivalent, or output the verbatim one
@@ -215,10 +220,8 @@ render_entity(struct membuffer* out, const MD_CHAR* text, MD_SIZE size)
                 codepoint = 10 * codepoint + (text[i] - '0');
         }
 
-        if(codepoint <= 0x10ffff) {     /* Max. Unicode codepoint. */
-            render_utf8_codepoint(out, codepoint);
-            return;
-        }
+        render_utf8_codepoint(out, codepoint);
+        return;
     } else {
         /* Named entity (e.g. "&nbsp;"). */
         const char* ent;
@@ -306,6 +309,7 @@ text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdat
     struct membuffer* out = (struct membuffer*) userdata;
 
     switch(type) {
+        case MD_TEXT_NULLCHAR:  render_utf8_codepoint(out, 0x0000); break;
         case MD_TEXT_BR:        MEMBUF_APPEND_LITERAL(out, "<br>\n"); break;
         case MD_TEXT_SOFTBR:    MEMBUF_APPEND_LITERAL(out, "\n"); break;
         case MD_TEXT_HTML:      membuf_append(out, text, size); break;
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -258,6 +258,35 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
     return 0;
 }
 
+static int
+md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
+{
+    OFF off = 0;
+    int ret = 0;
+
+    while(1) {
+        while(off < size  &&  str[off] != _T('\0'))
+            off++;
+
+        if(off > 0) {
+            ret = ctx->r.text(type, str, off, ctx->userdata);
+            if(ret != 0)
+                return ret;
+
+            str += off;
+            size -= off;
+            off = 0;
+        }
+
+        if(off >= size)
+            return 0;
+
+        ret = ctx->r.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
+        if(ret != 0)
+            return ret;
+    }
+}
+
 
 #define MD_CHECK(func)                                                  \
     do {                                                                \
@@ -313,6 +342,17 @@ md_str_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
         }                                                               \
     } while(0)
 
+#define MD_TEXT_INSECURE(type, str, size)                               \
+    do {                                                                \
+        if(size > 0) {                                                  \
+            ret = md_text_with_null_replacement(ctx, type, str, size);  \
+            if(ret != 0) {                                              \
+                md_log(ctx, "Aborted from text() callback.");           \
+                goto abort;                                             \
+            }                                                           \
+        }                                                               \
+    } while(0)
+
 
 /******************************
  ***  Recognizing raw HTML  ***
@@ -689,6 +729,7 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_
  *  ';': Maybe end of entity.
  *  '<': Maybe start of raw HTML.
  *  '>': Maybe end of raw HTML.
+ *  '\0': NULL char (need replacement).
  *
  * Note that not all instances of these chars in the text imply creation of the
  * structure. Only those which have (or may have, after we see more context)
@@ -940,6 +981,13 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                 }
             }
 
+            /* NULL character. */
+            if(ch == _T('\0')) {
+                PUSH_MARK(ch, off, off+1, 0);
+                off++;
+                continue;
+            }
+
             off++;
         }
     }
@@ -1119,7 +1167,7 @@ md_analyze_entity(MD_CTX* ctx, int mark_index)
 /* Table of precedence of various span types. */
 static const CHAR* md_precedence_table[] = {
     _T("`<>"),      /* Code spans; raw HTML. */
-    _T("&")         /* Entities. */
+    _T("&"),        /* Entities. */
 };
 
 static void
@@ -1164,6 +1212,13 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_
 
         i++;
     }
+
+    for(i = 0; i < ctx->n_marks; i++) {
+        MD_MARK* mark = &ctx->marks[i];
+
+        if(mark->ch == '\0')
+            mark->flags |= MD_MARK_RESOLVED;
+    }
 }
 
 /* Analyze marks (build ctx->marks). */
@@ -1257,6 +1312,10 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                 case '&':       /* Entity. */
                     MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                     break;
+
+                case '\0':
+                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
+                    break;
             }
 
             off = mark->end;
@@ -1346,7 +1405,7 @@ md_process_verbatim_block(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_LINE* lin
             MD_TEXT(text_type, indent_str, indent);
 
         /* Output the code line itself. */
-        MD_TEXT(text_type, STR(line->beg), line->end - line->beg);
+        MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
 
         /* Enforce end-of-line. */
         MD_TEXT(text_type, _T("\n"), 1);
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -98,6 +98,16 @@ enum MD_TEXTTYPE_tag {
     /* Normal text. */
     MD_TEXT_NORMAL = 0,
 
+    /* NULL character. Markdown is supposed to replace NULL character with
+     * the replacement char U+FFFD but since we are encoding agnostic, caller
+     * has to do that. */
+    MD_TEXT_NULLCHAR,
+
+    /* Line breaks.
+     * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */
+    MD_TEXT_BR,         /* <br> (hard break) */
+    MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */
+
     /* Entity.
      * (a) Named entity, e.g. &nbsp; 
      *     (Note MD4C does not have a list of known entities.
@@ -110,11 +120,6 @@ enum MD_TEXTTYPE_tag {
      * text into the MD_RENDERER::text_callback(). */
     MD_TEXT_ENTITY,
 
-    /* Line breaks.
-     * Note these are only sent within MD_BLOCK_CODE or MD_BLOCK_HTML. */
-    MD_TEXT_BR,         /* <br> (hard break) */
-    MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */
-
     /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
      * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
      * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log | Files | Refs | README | LICENSE

M	README.md	|	2	+-
M	md2html/md2html.c	|	14	+++++++++-----
M	md4c/md4c.c	|	63	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	md4c/md4c.h	|	15	++++++++++-----