commit 6c90b37f1280a1061a82325fe02efefabfa8535b
parent f9e58913ce00cdd4cd4fb56c33fd2adf30d082f5
Author: Martin Mitas <mity@morous.org>
Date: Mon, 5 Dec 2016 21:17:45 +0100
More fixes and enhancements to Windows Unicode support (issue #3).
* Rename MD4C_USE_WIN_UNICODE to MD4C_USE_UTF16.
* Update and improve related documentation in README.md.
Diffstat:
3 files changed, 29 insertions(+), 27 deletions(-)
diff --git a/README.md b/README.md
@@ -111,6 +111,10 @@ inspection Unicode is actually used on very few occasions:
* Unicode case folding. This is used to perform case-independent matching
of link labels when resolving reference links.
+ * Translating HTML entities (e.g. `&amp;`) and numeric character
+   references (e.g. `&#35;`). However, MD4C leaves this translation to the
+   renderer/application, as it is the renderer that really knows the
+   output encoding.
+
MD4C uses this property of the standard and its implementation is, to a large
degree, encoding-agnostic. Most of the code only assumes that the encoding of
your choice is compatible with ASCII, i.e. that the codepoints below 128 have
@@ -119,21 +123,20 @@ the same numeric values as ASCII.
All input MD4C does not understand is seen as text and sent to the callbacks
unchanged.
-The behavior of MD4C in the isolated situations where the encoding really
-matters is determined by preprocessor macros:
+The behavior of MD4C in the few isolated situations listed above, where the
+encoding really matters, is determined by preprocessor macros:
 * If preprocessor macro `MD4C_USE_UTF8` is defined, MD4C assumes UTF-8
   in those situations.
- * On Windows, if preprocessor macro `MD4C_USE_WIN_UNICODE` is defined, MD4C
- assumes UTF-16 and uses `WCHAR` instead of `char`. This allows usage of
- MD4C directly within Unicode applications on Windows, without any text
- conversions.
+ * On Windows, if preprocessor macro `MD4C_USE_UTF16` is defined, MD4C assumes
+   UTF-16 and uses `WCHAR` instead of `char`. (UTF-16 is what Windows
+   developers usually call simply "Unicode" and what the Win32 API works with.)
- * When none of the macros is defined, ASCII-only approach is used even in
- the listed situations. This effectively means that non-ASCII whitespace or
- punctuation characters won't be recognized as such and that case-folding is
- performed only on ASCII letters (i.e. `[a-zA-Z]`).
+ * By default (when none of the macros is defined), ASCII-only mode is used
+ even in the situations listed above. This effectively means that non-ASCII
+ whitespace or punctuation characters won't be recognized as such and that
+ case-folding is performed only on ASCII letters (i.e. `[a-zA-Z]`).
(Adding support for additional encodings should be relatively simple due to
the isolation of the respective code.)
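
To make the macro selection above concrete, here is a minimal sketch (not part
of this commit; the document string is illustrative, and how the macro gets
defined is up to the build) of opting into the UTF-16 build on Windows:

    /* MD4C_USE_UTF16 must be defined consistently for this translation
     * unit and for md4c.c, typically on the compiler command line
     * (e.g. cl /DMD4C_USE_UTF16 ...). */
    #define MD4C_USE_UTF16
    #include "md4c.h"

    /* With MD4C_USE_UTF16 defined, MD_CHAR is WCHAR, so a document is an
     * ordinary wide string: */
    static const MD_CHAR doc[] = L"Hello, *world*!";

    /* With no encoding macro defined, MD_CHAR is plain char and the
     * ASCII-only behavior described above applies. */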
diff --git a/md4c/md4c.c b/md4c/md4c.c
@@ -45,7 +45,7 @@
#ifdef _T
#undef _T
#endif
-#if defined MD4C_USE_WIN_UNICODE
+#if defined MD4C_USE_UTF16
#define _T(x) L##x
#else
#define _T(x) x
@@ -432,7 +432,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
};
-#if defined MD4C_USE_WIN_UNICODE || defined MD4C_USE_UTF8
+#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
static int
md_is_unicode_whitespace__(int codepoint)
{
@@ -441,7 +441,7 @@ struct MD_UNICODE_FOLD_INFO_tag {
return ISWHITESPACE_(codepoint);
/* Check for Unicode codepoints in Zs class above 127. */
- if(codepoint == 0x00A0 || codepoint == 0x1680)
+ if(codepoint == 0x00a0 || codepoint == 0x1680)
return TRUE;
if(0x2000 <= codepoint && codepoint <= 0x200a)
return TRUE;
@@ -685,13 +685,10 @@ struct MD_UNICODE_FOLD_INFO_tag {
#endif
-#if defined MD4C_USE_WIN_UNICODE
- /* The encoding known called on Windows simply as "Unicode" is actually
- * UTF-16. */
-
- #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800)
- #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00)
- #define UTF16_DECODE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))
+#if defined MD4C_USE_UTF16
+ #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800)
+ #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00)
+ #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
static int
md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
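
The substance of this hunk is the mask fix and the added offset: with the old
`0xfc` mask, `(word & 0xfc)` can never equal `0xd800` or `0xdc00`, so
surrogates went undetected, and the old decode expression also lacked the
`0x10000` offset. A quick standalone check (not in the tree; `WORD` is stubbed
here instead of including <windows.h>) that the corrected macros decode
U+1F600 from its surrogate pair D83D DE00:

    #include <assert.h>

    typedef unsigned short WORD;  /* stand-in for the <windows.h> typedef */

    #define IS_UTF16_SURROGATE_HI(word)    (((WORD)(word) & 0xfc00) == 0xd800)
    #define IS_UTF16_SURROGATE_LO(word)    (((WORD)(word) & 0xfc00) == 0xdc00)
    #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))

    int main(void)
    {
        assert(IS_UTF16_SURROGATE_HI(0xd83d));   /* high (leading) surrogate */
        assert(IS_UTF16_SURROGATE_LO(0xde00));   /* low (trailing) surrogate */
        assert(!IS_UTF16_SURROGATE_HI(0x0041));  /* 'A' is not a surrogate */
        assert(UTF16_DECODE_SURROGATE(0xd83d, 0xde00) == 0x1f600);
        return 0;
    }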
diff --git a/md4c/md4c.h b/md4c/md4c.h
@@ -31,14 +31,16 @@
#endif
-/* Magic to support UTF16-LE (i.e. what is called Unicode among Windows
- * developers) input/output on Windows.
- */
-#if defined MD4C_USE_WIN_UNICODE
- #include <windows.h>
- typedef WCHAR MD_CHAR;
+/* Magic to support UTF-16. */
+#if defined MD4C_USE_UTF16
+ #ifdef _WIN32
+ #include <wchar.h>
+ typedef WCHAR MD_CHAR;
+ #else
+ #error MD4C_USE_UTF16 is only supported on Windows.
+ #endif
#else
- typedef char MD_CHAR;
+ typedef char MD_CHAR;
#endif
typedef unsigned MD_SIZE;
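
Assuming `MD_SIZE` counts `MD_CHAR` units (as the adjacent typedefs suggest),
a UTF-16 build measures input in `WCHAR`s rather than bytes. A hypothetical
helper (`doc_len` is not part of md4c.h) making that explicit:

    #include <string.h>
    #include <wchar.h>
    #include "md4c.h"

    /* Length of a zero-terminated document in MD_CHAR units: WCHARs when
     * compiled with MD4C_USE_UTF16, bytes otherwise. */
    static MD_SIZE doc_len(const MD_CHAR* text)
    {
    #ifdef MD4C_USE_UTF16
        return (MD_SIZE) wcslen(text);
    #else
        return (MD_SIZE) strlen(text);
    #endif
    }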