Better Unicode support. - md4c - C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.

commit 8f672646c1c25af06a74bb1e347a131ef32d1403
parent 15a3a81fe8e4525e52d8db4b4828db0a49f962c0
Author: Martin Mitas <mity@morous.org>
Date:   Sat, 19 Nov 2016 13:38:06 +0100

Better Unicode support.

Diffstat:
M md4c/CMakeLists.txt  | 2 ++
M md4c/md4c.c  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M md4c/md4c.h  | 4 +---

3 files changed, 99 insertions(+), 11 deletions(-)
diff --git a/md4c/CMakeLists.txt b/md4c/CMakeLists.txt
@@ -1,4 +1,6 @@
 
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DDEBUG")
 
+add_definitions(-DMD4C_USE_UNICODE)
+
 add_library(md4c STATIC md4c.c md4c.h)
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -47,7 +47,7 @@
 #ifdef _T
     #undef _T
 #endif
-#if defined _WIN32  &&  defined MD_WIN_UNICODE
+#if defined MD4C_USE_WIN_UNICODE
     #define _T(x)           L##x
 #else
     #define _T(x)           x
@@ -65,9 +65,9 @@
  ************************/
 
 /* These are omnipresent so let's save some typing. */
-typedef MD_CHAR CHAR;
-typedef MD_SIZE SZ;
-typedef MD_OFFSET OFF;
+#define CHAR    MD_CHAR
+#define SZ      MD_SIZE
+#define OFF     MD_OFFSET
 
 typedef struct MD_MARK_tag MD_MARK;
 typedef struct MD_BLOCK_tag MD_BLOCK;
@@ -247,6 +247,94 @@ struct MD_VERBATIMLINE_tag {
 #define ISANYOF(off, palette)   ISANYOF_(CH(off), (palette))
 
 
+#if defined MD4C_USE_WIN_UNICODE
+    #include <ctype.h>
+
+    #define ISUNICODEWHITESPACE(off)        iswspace(CH(off))
+    #define ISUNICODEPUNCT(off)             iswpunct(CH(off))
+    #define ISUNICODEWHITESPACEBEFORE(off)  iswspace(CH((off)-1))
+    #define ISUNICODEPUNCTBEFORE(off)       iswpunct(CH((off)-1))
+#elif defined MD4C_USE_UNICODE
+    #ifdef _WIN32
+        /* Note Win32 supports only Unicode plane 0 but better than nothing. */
+        #include <ctype.h>
+    #else
+        #include <wctype.h>
+
+        #ifndef __STDC_ISO_10646__
+            #error "MD4C relies on wchar_t to support Unicode properly."
+        #endif
+    #endif
+
+    #define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
+    #define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
+    #define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
+    #define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
+    #define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
+
+    static int
+    md_decode_utf8(MD_CTX* ctx, OFF off)
+    {
+        /* Decode the UTF-8 sequence that starts at byte offset off and return
+         * its code point. The lead byte CH(off) selects a sequence length of
+         * 1 to 4 bytes; the payload bits of every byte are masked out and
+         * shifted into their place in the result.
+         *
+         * For any invalid UTF-8 sequence we use the Unicode replacement char
+         * for purposes of character classification. Concretely, 0xfffd is
+         * what remains in codepoint when CH(off) matches none of the lead
+         * byte patterns or when the sequence would run past ctx->size.
+         *
+         * NOTE(review): the continuation bytes are not checked with
+         * IS_UTF8_TAIL(), so a malformed sequence that starts with a valid
+         * lead byte still yields a computed value rather than 0xfffd. */
+        int codepoint = 0xfffd;
+
+        if(IS_UTF8_LEAD1(CH(off))) {                /* single byte, <= 0x7f */
+            codepoint = CH(off);
+        } else if(IS_UTF8_LEAD2(CH(off))) {         /* 110xxxxx: two bytes */
+            if(off+1 < ctx->size)
+                codepoint = (((unsigned int)CH(off) & 0x1f) << 6) |
+                            (((unsigned int)CH(off+1) & 0x3f) << 0);
+        } else if(IS_UTF8_LEAD3(CH(off))) {         /* 1110xxxx: three bytes */
+            if(off+2 < ctx->size)
+                codepoint = (((unsigned int)CH(off) & 0x0f) << 12) |
+                            (((unsigned int)CH(off+1) & 0x3f) << 6) |
+                            (((unsigned int)CH(off+2) & 0x3f) << 0);
+        } else if(IS_UTF8_LEAD4(CH(off))) {         /* 11110xxx: four bytes */
+            if(off+3 < ctx->size)
+                codepoint = (((unsigned int)CH(off) & 0x07) << 18) |
+                            (((unsigned int)CH(off+1) & 0x3f) << 12) |
+                            (((unsigned int)CH(off+2) & 0x3f) << 6) |
+                            (((unsigned int)CH(off+3) & 0x3f) << 0);
+        }
+
+#ifdef _WIN32
+        /* On Windows, iswspace() et al. get garbage for codepoints above
+         * the Unicode plane 0, so map those to the replacement char too. */
+        if(codepoint > 0xffff)
+            codepoint = 0xfffd;
+#endif
+
+        return codepoint;
+    }
+
+    static int
+    md_decode_utf8_before(MD_CTX* ctx, OFF off)
+    {   /* Return the code point of the character whose encoding ends right
+         * before byte offset off. Looks back 1, 2, 3 and then 4 bytes for a
+         * lead byte announcing a sequence of exactly that length; a
+         * single-byte predecessor is returned directly, longer ones are
+         * decoded with md_decode_utf8(). Returns 0xfffd when no such lead
+         * byte is found, which includes off == 0.
+         *
+         * NOTE(review): the bytes between the candidate lead byte and off
+         * are not verified to be continuation bytes. */
+        if(off > 0  &&  IS_UTF8_LEAD1(CH(off-1)))   /* 1-byte char before off */
+            return CH(off-1);
+        if(off > 1  &&  IS_UTF8_LEAD2(CH(off-2)))   /* 2-byte sequence */
+            return md_decode_utf8(ctx, off-2);
+        if(off > 2  &&  IS_UTF8_LEAD3(CH(off-3)))   /* 3-byte sequence */
+            return md_decode_utf8(ctx, off-3);
+        if(off > 3  &&  IS_UTF8_LEAD4(CH(off-4)))   /* 4-byte sequence */
+            return md_decode_utf8(ctx, off-4);
+
+        return 0xfffd;
+    }
+
+    #define ISUNICODEWHITESPACE(off)        iswspace(md_decode_utf8(ctx, off))
+    #define ISUNICODEPUNCT(off)             iswpunct(md_decode_utf8(ctx, off))
+    #define ISUNICODEWHITESPACEBEFORE(off)  iswspace(md_decode_utf8_before(ctx, off))
+    #define ISUNICODEPUNCTBEFORE(off)       iswpunct(md_decode_utf8_before(ctx, off))
+#else
+    #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
+    #define ISUNICODEPUNCT(off)             ISPUNCT(off)
+    #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)
+    #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
+#endif
+
+
 static inline const CHAR*
 md_strchr(const CHAR* str, CHAR ch)
 {
@@ -1103,16 +1191,16 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
                 while(tmp < line_end  &&  CH(tmp) == ch)
                     tmp++;
 
-                if(off == line->beg  ||  ISWHITESPACE(off-1))
+                if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
                     left_level = 0;
-                else if(ISPUNCT(off-1))
+                else if(ISUNICODEPUNCTBEFORE(off))
                     left_level = 1;
                 else
                     left_level = 2;
 
-                if(tmp == line_end  ||  ISWHITESPACE(tmp))
+                if(tmp == line_end  ||  ISUNICODEWHITESPACE(tmp))
                     right_level = 0;
-                else if(ISPUNCT(tmp))
+                else if(ISUNICODEPUNCT(tmp))
                     right_level = 1;
                 else
                     right_level = 2;
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -41,10 +41,8 @@
  * On Windows, when UNICODE is defined, we by default switch to WCHAR.
  * This behavior may be disabled by predefining MD4C_DISABLE_WIN_UNICODE.
  */
-#if defined _WIN32  &&  defined UNICODE  &&  !defined MD4C_DISABLE_WIN_UNICODE
+#if defined MD4C_USE_WIN_UNICODE
     #include <windows.h>
-
-    #define MD4C_USE_WIN_UNICODE
     typedef WCHAR   MD_CHAR;
 #else
     typedef char    MD_CHAR;

	md4c C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
	git clone https://noulin.net/git/md4c.git
	Log \| Files \| Refs \| README \| LICENSE

M	md4c/CMakeLists.txt	\|	2	++
M	md4c/md4c.c	\|	104	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	md4c/md4c.h	\|	4	+---