md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 6c90b37f1280a1061a82325fe02efefabfa8535b
parent f9e58913ce00cdd4cd4fb56c33fd2adf30d082f5
Author: Martin Mitas <mity@morous.org>
Date:   Mon,  5 Dec 2016 21:17:45 +0100

More fixes and enhancements to Windows Unicode support (issue #3).

 * Rename MD4C_USE_WIN_UNICODE to MD4C_USE_UTF16.
 * Update and improve related documentation in README.md.

Diffstat:
M README.md  | 23 +++++++++++++----------
M md4c/md4c.c | 17 +++++++----------
M md4c/md4c.h | 16 +++++++++-------
3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md @@ -111,6 +111,10 @@ inspection Unicode is actually used on very few occasions: * Unicode case folding. This is used to perform case-independent matching of link labels when resolving reference links. + * Translating HTML entities and numeric character references (e.g. `&amp;`, + `&#35;`). However MD4C leaves the translation on the renderer/application; + as the renderer is supposed to really know output encoding. + MD4C uses this property of the standard and its implementation is, to a large degree, encoding-agnostic. Most of the code only assumes that the encoding of your choice is compatible with ASCII, i.e. that the codepoints below 128 have @@ -119,21 +123,20 @@ the same numeric values as ASCII. All input MD4C does not understand is seen as a text and sent to the callbacks unchanged. -The behavior of MD4C in the isolated situations where the encoding really -matters is determined by preprocessor macros: +The behavior of MD4C in the isolated listed situations where the encoding +really matters is determined by preprocessor macros: * If preprocessor macro `MD4C_USE_UTF8` is defined, MD4C assumes UTF-8 in the specific situations. - * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C - assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of - MD4C directly within Unicode applications on Windows, without any text - conversions. + * On Windows, if preprocessor macro `MD4C_USE_UTF16` is defined, MD4C assumes + UTF-16 and uses `WCHAR` instead of `char`. (UTF-16 is what Windows + developers usually call just "Unicode" and what Win32API works with.) - * When none of the macros is defined, ASCII-only approach is used even in - the listed situations. This effectively means that non-ASCII whitespace or - punctuation characters won't be recognized as such and that case-folding is - performed only on ASCII letters (i.e. `[a-zA-Z]`). 
+ * By default (when none of the macros is defined), ASCII-only mode is used + even in the situations listed above. This effectively means that non-ASCII + whitespace or punctuation characters won't be recognized as such and that + case-folding is performed only on ASCII letters (i.e. `[a-zA-Z]`). (Adding support for yet another encodings should be relatively simple due the isolation of the respective code.) diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -45,7 +45,7 @@ #ifdef _T #undef _T #endif -#if defined MD4C_USE_WIN_UNICODE +#if defined MD4C_USE_UTF16 #define _T(x) L##x #else #define _T(x) x @@ -432,7 +432,7 @@ struct MD_UNICODE_FOLD_INFO_tag { }; -#if defined MD4C_USE_WIN_UNICODE || defined MD4C_USE_UTF8 +#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8 static int md_is_unicode_whitespace__(int codepoint) { @@ -441,7 +441,7 @@ struct MD_UNICODE_FOLD_INFO_tag { return ISWHITESPACE_(codepoint); /* Check for Unicode codepoints in Zs class above 127. */ - if(codepoint == 0x00A0 || codepoint == 0x1680) + if(codepoint == 0x00a0 || codepoint == 0x1680) return TRUE; if(0x2000 <= codepoint && codepoint <= 0x200a) return TRUE; @@ -685,13 +685,10 @@ struct MD_UNICODE_FOLD_INFO_tag { #endif -#if defined MD4C_USE_WIN_UNICODE - /* The encoding known called on Windows simply as "Unicode" is actually - * UTF-16. 
 */ - - #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800) - #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00) - #define UTF16_DECODE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)) +#if defined MD4C_USE_UTF16 + #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800) + #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00) + #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))) static int md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size) diff --git a/md4c/md4c.h b/md4c/md4c.h @@ -31,14 +31,16 @@ #endif -/* Magic to support UTF16-LE (i.e. what is called Unicode among Windows - * developers) input/output on Windows. - */ -#if defined MD4C_USE_WIN_UNICODE - #include <windows.h> - typedef WCHAR MD_CHAR; +/* Magic to support UTF16. */ +#if defined MD4C_USE_UTF16 + #ifdef _WIN32 + #include <wchar.h> + typedef WCHAR MD_CHAR; + #else + #error MD4C_USE_UTF16 is only supported on Windows. + #endif #else - typedef char MD_CHAR; + typedef char MD_CHAR; #endif typedef unsigned MD_SIZE;