md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 7d20152c39dbf094a774bbf34a808bf689dd2b6a
parent 0d10f6dbe26117ca84943d3518c2879193e55c5e
Author: Martin Mitas <mity@morous.org>
Date:   Mon,  5 Dec 2016 13:45:57 +0100

Fix UTF-16 surrogate decoding (with -DMD4C_USE_UNICODE).

See https://github.com/mity/md4c/pull/1#issuecomment-264842360

Diffstat:
MREADME.md | 10+++++-----
Mmd4c/md4c.c | 22+++++++++++-----------
2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md @@ -44,8 +44,8 @@ MD4C is C Markdown parser with the following features: be fairly simple to make it run also on most other systems. * **Encoding:** MD4C can be compiled to recognize ASCII-only control characters, - UTF-8 and, on Windows, also UTF-16 little endian, i.e. what is on Windows - commonly called just "Unicode". See more details below. + UTF-8 and, on Windows, also UTF-16, i.e. what is on Windows commonly called + just "Unicode". See more details below. * **Permissive license:** MD4C is available under the MIT license. @@ -126,9 +126,9 @@ matters is determined by preprocessor macros: in the specific situations. * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C - assumes little-endian UTF-16 and uses `WCHAR` instead of `char`. This allows - usage of MD4C directly within Unicode applications on Windows, without any - text conversion. + assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of + MD4C directly within Unicode applications on Windows, without any text + conversions. * When none of the macros is defined, ASCII-only approach is used even in the listed situations. This effectively means that non-ASCII whitespace or diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -686,21 +686,21 @@ struct MD_UNICODE_FOLD_INFO_tag { #if defined MD4C_USE_WIN_UNICODE - #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800) - #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00) - #define UTF16_COMPUTE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)) + /* The encoding known called on Windows simply as "Unicode" is actually + * UTF-16. 
*/ + + #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800) + #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00) + #define UTF16_DECODE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)) static int md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size) { - /* The encoding known called on Windows simply as "Unicode" is actually - * little-endian UTF-16, i.e. the low surrogate precedes the high - * surrogate. */ - if(IS_UTF16_SURROGATE_LO(str[0])) { - if(1 < str_size && IS_UTF16_SURROGATE_HI(str[1])) { + if(IS_UTF16_SURROGATE_HI(str[0])) { + if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) { if(p_size != NULL) *p_size = 2; - return UTF16_COMPUTE_SURROGATE(str[1], str[0]); + return UTF16_DECODE_SURROGATE(str[0], str[1]); } } @@ -712,8 +712,8 @@ struct MD_UNICODE_FOLD_INFO_tag { static int md_decode_utf16le_before__(MD_CTX* ctx, OFF off) { - if(off > 2 && IS_UTF16_SURROGATE_LO(CH(off-2)) && IS_UTF16_SURROGATE_HI(CH(off-1))) - return UTF16_COMPUTE_SURROGATE(CH(off-1), CH(off-2)); + if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1))) + return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1)); return CH(off); }