md4c

C Markdown parser. Fast. SAX-like interface. Compliant to CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

commit 1a2477952a4f751fb320f08cd011a2a09ac5e48e
parent ef38ecbc5c4dfa7f8f8560c497dd8a4dc3eea845
Author: Martin Mitas <mity@morous.org>
Date:   Thu, 10 Nov 2016 13:22:29 +0100

Implement link reference definitions and, partially, link references.

Diffstat:
MREADME.md | 2+-
Mmd2html/md2html.c | 6++++++
Mmd4c/md4c.c | 1402++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
3 files changed, 1293 insertions(+), 117 deletions(-)

diff --git a/README.md b/README.md @@ -87,7 +87,7 @@ more or less forms our to do list. - [x] 4.4 Indented code blocks - [x] 4.5 Fenced code blocks - [x] 4.6 HTML blocks - - [ ] 4.7 Link reference definitions + - [x] 4.7 Link reference definitions - [x] 4.8 Paragraphs - [x] 4.9 Blank lines diff --git a/md2html/md2html.c b/md2html/md2html.c @@ -154,6 +154,12 @@ open_a_span(struct membuffer* out, MD_SPAN_A_DETAIL* det) { MEMBUF_APPEND_LITERAL(out, "<a href=\""); membuf_append_escaped(out, det->href, det->href_size); + + if(det->title != NULL) { + MEMBUF_APPEND_LITERAL(out, "\" href=\""); + membuf_append_escaped(out, det->title, det->title_size); + } + MEMBUF_APPEND_LITERAL(out, "\">"); } diff --git a/md4c/md4c.c b/md4c/md4c.c @@ -42,8 +42,6 @@ #endif #endif -/* Magic to support UTF16-LE (i.e. what is called Unicode among Windows - * developers) input/output on Windows. */ #ifdef _T #undef _T #endif @@ -76,6 +74,7 @@ typedef struct MD_MARK_tag MD_MARK; typedef struct MD_BLOCK_tag MD_BLOCK; +typedef struct MD_LINK_REF_DEF_tag MD_LINK_REF_DEF; /* During analyzes of inline marks, we need to manage some "mark chains", @@ -101,6 +100,8 @@ struct MD_CTX_tag { CHAR* buffer; unsigned alloc_buffer; + MD_LINK_REF_DEF* link_ref_defs; + /* Stack of inline/span markers. * This is only used for parsing a single block contents but by storing it * here we may reuse the stack for subsequent blocks; i.e. we have fewer @@ -112,11 +113,17 @@ struct MD_CTX_tag { char mark_char_map[128]; /* For resolving of inline spans. */ - MD_MARKCHAIN mark_chains[4]; -#define BACKTICK_OPENERS ctx->mark_chains[0] -#define LOWERTHEN_OPENERS ctx->mark_chains[1] -#define ASTERISK_OPENERS ctx->mark_chains[2] -#define UNDERSCORE_OPENERS ctx->mark_chains[3] + MD_MARKCHAIN mark_chains[6]; +#define PTR_CHAIN ctx->mark_chains[0] +#define BACKTICK_OPENERS ctx->mark_chains[1] +#define LOWERTHEN_OPENERS ctx->mark_chains[2] +#define ASTERISK_OPENERS ctx->mark_chains[3] +#define UNDERSCORE_OPENERS ctx->mark_chains[4] +#define BRACKET_OPENERS ctx->mark_chains[5] + + /* For resolving links. */ + int unresolved_link_head; + int unresolved_link_tail; /* For block analysis. * Notes: @@ -207,7 +214,7 @@ struct MD_VERBATIMLINE_tag { #define MD_ASSERT(cond) do { __assume(cond); } while(0) #define MD_UNREACHABLE() do { __assume(0); } while(0) #else - #define MD_ASSERT(cond) do {} while(0) + define MD_ASSERT(cond) do {} while(0) #define MD_UNREACHABLE() do {} while(0) #endif #endif @@ -251,95 +258,6 @@ struct MD_VERBATIMLINE_tag { #define ISALNUM(off) ISALNUM_(CH(off)) #define ISANYOF(off, palette) ISANYOF_(CH(off), (palette)) - -#if defined MD4C_USE_WIN_UNICODE - #include <ctype.h> - - #define ISUNICODEWHITESPACE(off) iswspace(CH(off)) - #define ISUNICODEPUNCT(off) iswpunct(CH(off)) - #define ISUNICODEWHITESPACEBEFORE(off) iswspace(CH((off)-1)) - #define ISUNICODEPUNCTBEFORE(off) iswpunct(CH((off)-1)) -#elif defined MD4C_USE_UNICODE - #ifdef _WIN32 - /* Note Win32 supports only Unicode plane 0 but better then nothing. */ - #include <ctype.h> - #else - #include <wctype.h> - - #ifndef __STDC_ISO_10646__ - #error "MD4C relies on wchar_t to support Unicode properly." - #endif - #endif - - #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) - #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) - #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) - #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) - #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) - - static int - md_decode_utf8(MD_CTX* ctx, OFF off) - { - /* For any invalid UTF-8 sequence we use the Unicode replacement char - * for purposes of character classification. */ - int codepoint = 0xfffd; - - if(IS_UTF8_LEAD1(CH(off))) { - codepoint = CH(off); - } else if(IS_UTF8_LEAD2(CH(off))) { - if(off+1 < ctx->size) - codepoint = (((unsigned int)CH(off) & 0x1f) << 6) | - (((unsigned int)CH(off+1) & 0x3f) << 0); - } else if(IS_UTF8_LEAD3(CH(off))) { - if(off+2 < ctx->size) - codepoint = (((unsigned int)CH(off) & 0x0f) << 12) | - (((unsigned int)CH(off+1) & 0x3f) << 6) | - (((unsigned int)CH(off+2) & 0x3f) << 0); - } else if(IS_UTF8_LEAD4(CH(off))) { - if(off+3 < ctx->size) - codepoint = (((unsigned int)CH(off) & 0x07) << 18) | - (((unsigned int)CH(off+1) & 0x3f) << 12) | - (((unsigned int)CH(off+2) & 0x3f) << 6) | - (((unsigned int)CH(off+3) & 0x3f) << 0); - } - -#ifdef _WIN32 - /* On Windows, iswpace() et al. gets garbage for codepoints above - * the Unicode plane 0. */ - if(codepoint > 0xffff) - codepoint = 0xfffd; -#endif - - return codepoint; - } - - static int - md_decode_utf8_before(MD_CTX* ctx, OFF off) - { - if(off > 0 && IS_UTF8_LEAD1(CH(off-1))) - return CH(off-1); - if(off > 1 && IS_UTF8_LEAD2(CH(off-2))) - return md_decode_utf8(ctx, off-2); - if(off > 2 && IS_UTF8_LEAD3(CH(off-3))) - return md_decode_utf8(ctx, off-3); - if(off > 3 && IS_UTF8_LEAD4(CH(off-4))) - return md_decode_utf8(ctx, off-4); - - return 0xfffd; - } - - #define ISUNICODEWHITESPACE(off) iswspace(md_decode_utf8(ctx, off)) - #define ISUNICODEPUNCT(off) iswpunct(md_decode_utf8(ctx, off)) - #define ISUNICODEWHITESPACEBEFORE(off) iswspace(md_decode_utf8_before(ctx, off)) - #define ISUNICODEPUNCTBEFORE(off) iswpunct(md_decode_utf8_before(ctx, off)) -#else - #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) - #define ISUNICODEPUNCT(off) ISPUNCT(off) - #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) - #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) -#endif - - static inline const CHAR* md_strchr(const CHAR* str, CHAR ch) { @@ -492,6 +410,424 @@ md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ } while(0) + +/************************* + *** Unicode Support *** + *************************/ + +typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO; +struct MD_UNICODE_FOLD_INFO_tag { + int codepoints[3]; + int n_codepoints; +}; + + +#if defined MD4C_USE_WIN_UNICODE || defined MD4C_USE_UNICODE + static int + md_is_unicode_whitespace__(int codepoint) + { + /* The ASCII ones are the most frequently used ones, so lets check them first. */ + if(codepoint <= 0x7f) + return ISWHITESPACE_(codepoint); + + /* Check for Unicode codepoints in Zs class above 127. */ + if(codepoint == 0x00A0 || codepoint == 0x1680) + return TRUE; + if(0x2000 <= codepoint && codepoint <= 0x200a) + return TRUE; + if(codepoint == 0x202f || codepoint == 0x205f || codepoint == 0x3000) + return TRUE; + + return FALSE; + } + + static int + md_unicode_cmp__(const void* p_codepoint_a, const void* p_codepoint_b) + { + return (*(const int*)p_codepoint_a - *(const int*)p_codepoint_b); + } + + static int + md_is_unicode_punct__(int codepoint) + { + /* non-ASCII (above 127) Unicode punctuation codepoints (classes + * Pc, Pd, Pe, Pf, Pi, Po, Ps). + * + * Warning: Keep the array sorted. + */ + static const int punct_list[] = { + 0x00a1, 0x00a7, 0x00ab, 0x00b6, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f, 0x0589, + 0x058a, 0x05be, 0x05c0, 0x05c3, 0x05c6, 0x05f3, 0x05f4, 0x0609, 0x060a, 0x060c, 0x060d, 0x061b, 0x061e, 0x061f, 0x066a, 0x066b, + 0x066c, 0x066d, 0x06d4, 0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707, 0x0708, 0x0709, 0x070a, 0x070b, 0x070c, + 0x070d, 0x07f7, 0x07f8, 0x07f9, 0x0830, 0x0831, 0x0832, 0x0833, 0x0834, 0x0835, 0x0836, 0x0837, 0x0838, 0x0839, 0x083a, 0x083b, + 0x083c, 0x083d, 0x083e, 0x085e, 0x0964, 0x0965, 0x0970, 0x0af0, 0x0df4, 0x0e4f, 0x0e5a, 0x0e5b, 0x0f04, 0x0f05, 0x0f06, 0x0f07, + 0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f, 0x0f10, 0x0f11, 0x0f12, 0x0f14, 0x0f3a, 0x0f3b, 0x0f3c, 0x0f3d, + 0x0f85, 0x0fd0, 0x0fd1, 0x0fd2, 0x0fd3, 0x0fd4, 0x0fd9, 0x0fda, 0x104a, 0x104b, 0x104c, 0x104d, 0x104e, 0x104f, 0x10fb, 0x1360, + 0x1361, 0x1362, 0x1363, 0x1364, 0x1365, 0x1366, 0x1367, 0x1368, 0x1400, 0x166d, 0x166e, 0x169b, 0x169c, 0x16eb, 0x16ec, 0x16ed, + 0x1735, 0x1736, 0x17d4, 0x17d5, 0x17d6, 0x17d8, 0x17d9, 0x17da, 0x1800, 0x1801, 0x1802, 0x1803, 0x1804, 0x1805, 0x1806, 0x1807, + 0x1808, 0x1809, 0x180a, 0x1944, 0x1945, 0x1a1e, 0x1a1f, 0x1aa0, 0x1aa1, 0x1aa2, 0x1aa3, 0x1aa4, 0x1aa5, 0x1aa6, 0x1aa8, 0x1aa9, + 0x1aaa, 0x1aab, 0x1aac, 0x1aad, 0x1b5a, 0x1b5b, 0x1b5c, 0x1b5d, 0x1b5e, 0x1b5f, 0x1b60, 0x1bfc, 0x1bfd, 0x1bfe, 0x1bff, 0x1c3b, + 0x1c3c, 0x1c3d, 0x1c3e, 0x1c3f, 0x1c7e, 0x1c7f, 0x1cc0, 0x1cc1, 0x1cc2, 0x1cc3, 0x1cc4, 0x1cc5, 0x1cc6, 0x1cc7, 0x1cd3, 0x2010, + 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201a, 0x201b, 0x201c, 0x201d, 0x201e, 0x201f, 0x2020, + 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, + 0x2039, 0x203a, 0x203b, 0x203c, 0x203d, 0x203e, 0x203f, 0x2040, 0x2041, 0x2042, 0x2043, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, + 0x204a, 0x204b, 0x204c, 0x204d, 0x204e, 0x204f, 0x2050, 0x2051, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205a, + 0x205b, 0x205c, 0x205d, 0x205e, 0x207d, 0x207e, 0x208d, 0x208e, 0x2308, 0x2309, 0x230a, 0x230b, 0x2329, 0x232a, 0x2768, 0x2769, + 0x276a, 0x276b, 0x276c, 0x276d, 0x276e, 0x276f, 0x2770, 0x2771, 0x2772, 0x2773, 0x2774, 0x2775, 0x27c5, 0x27c6, 0x27e6, 0x27e7, + 0x27e8, 0x27e9, 0x27ea, 0x27eb, 0x27ec, 0x27ed, 0x27ee, 0x27ef, 0x2983, 0x2984, 0x2985, 0x2986, 0x2987, 0x2988, 0x2989, 0x298a, + 0x298b, 0x298c, 0x298d, 0x298e, 0x298f, 0x2990, 0x2991, 0x2992, 0x2993, 0x2994, 0x2995, 0x2996, 0x2997, 0x2998, 0x29d8, 0x29d9, + 0x29da, 0x29db, 0x29fc, 0x29fd, 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfe, 0x2cff, 0x2d70, 0x2e00, 0x2e01, 0x2e02, 0x2e03, 0x2e04, + 0x2e05, 0x2e06, 0x2e07, 0x2e08, 0x2e09, 0x2e0a, 0x2e0b, 0x2e0c, 0x2e0d, 0x2e0e, 0x2e0f, 0x2e10, 0x2e11, 0x2e12, 0x2e13, 0x2e14, + 0x2e15, 0x2e16, 0x2e17, 0x2e18, 0x2e19, 0x2e1a, 0x2e1b, 0x2e1c, 0x2e1d, 0x2e1e, 0x2e1f, 0x2e20, 0x2e21, 0x2e22, 0x2e23, 0x2e24, + 0x2e25, 0x2e26, 0x2e27, 0x2e28, 0x2e29, 0x2e2a, 0x2e2b, 0x2e2c, 0x2e2d, 0x2e2e, 0x2e30, 0x2e31, 0x2e32, 0x2e33, 0x2e34, 0x2e35, + 0x2e36, 0x2e37, 0x2e38, 0x2e39, 0x2e3a, 0x2e3b, 0x2e3c, 0x2e3d, 0x2e3e, 0x2e3f, 0x2e40, 0x2e41, 0x2e42, 0x2e43, 0x2e44, 0x3001, + 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, + 0x3018, 0x3019, 0x301a, 0x301b, 0x301c, 0x301d, 0x301e, 0x301f, 0x3030, 0x303d, 0x30a0, 0x30fb, 0xa4fe, 0xa4ff, 0xa60d, 0xa60e, + 0xa60f, 0xa673, 0xa67e, 0xa6f2, 0xa6f3, 0xa6f4, 0xa6f5, 0xa6f6, 0xa6f7, 0xa874, 0xa875, 0xa876, 0xa877, 0xa8ce, 0xa8cf, 0xa8f8, + 0xa8f9, 0xa8fa, 0xa8fc, 0xa92e, 0xa92f, 0xa95f, 0xa9c1, 0xa9c2, 0xa9c3, 0xa9c4, 0xa9c5, 0xa9c6, 0xa9c7, 0xa9c8, 0xa9c9, 0xa9ca, + 0xa9cb, 0xa9cc, 0xa9cd, 0xa9de, 0xa9df, 0xaa5c, 0xaa5d, 0xaa5e, 0xaa5f, 0xaade, 0xaadf, 0xaaf0, 0xaaf1, 0xabeb, 0xfd3e, 0xfd3f, + 0xfe10, 0xfe11, 0xfe12, 0xfe13, 0xfe14, 0xfe15, 0xfe16, 0xfe17, 0xfe18, 0xfe19, 0xfe30, 0xfe31, 0xfe32, 0xfe33, 0xfe34, 0xfe35, + 0xfe36, 0xfe37, 0xfe38, 0xfe39, 0xfe3a, 0xfe3b, 0xfe3c, 0xfe3d, 0xfe3e, 0xfe3f, 0xfe40, 0xfe41, 0xfe42, 0xfe43, 0xfe44, 0xfe45, + 0xfe46, 0xfe47, 0xfe48, 0xfe49, 0xfe4a, 0xfe4b, 0xfe4c, 0xfe4d, 0xfe4e, 0xfe4f, 0xfe50, 0xfe51, 0xfe52, 0xfe54, 0xfe55, 0xfe56, + 0xfe57, 0xfe58, 0xfe59, 0xfe5a, 0xfe5b, 0xfe5c, 0xfe5d, 0xfe5e, 0xfe5f, 0xfe60, 0xfe61, 0xfe63, 0xfe68, 0xfe6a, 0xfe6b, 0xff01, + 0xff02, 0xff03, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09, 0xff0a, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1f, 0xff20, + 0xff3b, 0xff3c, 0xff3d, 0xff3f, 0xff5b, 0xff5d, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0x10100, 0x10101, 0x10102, + 0x1039f, 0x103d0, 0x1056f, 0x10857, 0x1091f, 0x1093f, 0x10a50, 0x10a51, 0x10a52, 0x10a53, 0x10a54, 0x10a55, 0x10a56, 0x10a57, 0x10a58, 0x10a7f, + 0x10af0, 0x10af1, 0x10af2, 0x10af3, 0x10af4, 0x10af5, 0x10af6, 0x10b39, 0x10b3a, 0x10b3b, 0x10b3c, 0x10b3d, 0x10b3e, 0x10b3f, 0x10b99, 0x10b9a, + 0x10b9b, 0x10b9c, 0x11047, 0x11048, 0x11049, 0x1104a, 0x1104b, 0x1104c, 0x1104d, 0x110bb, 0x110bc, 0x110be, 0x110bf, 0x110c0, 0x110c1, 0x11140, + 0x11141, 0x11142, 0x11143, 0x11174, 0x11175, 0x111c5, 0x111c6, 0x111c7, 0x111c8, 0x111c9, 0x111cd, 0x111db, 0x111dd, 0x111de, 0x111df, 0x11238, + 0x11239, 0x1123a, 0x1123b, 0x1123c, 0x1123d, 0x112a9, 0x1144b, 0x1144c, 0x1144d, 0x1144e, 0x1144f, 0x1145b, 0x1145d, 0x114c6, 0x115c1, 0x115c2, + 0x115c3, 0x115c4, 0x115c5, 0x115c6, 0x115c7, 0x115c8, 0x115c9, 0x115ca, 0x115cb, 0x115cc, 0x115cd, 0x115ce, 0x115cf, 0x115d0, 0x115d1, 0x115d2, + 0x115d3, 0x115d4, 0x115d5, 0x115d6, 0x115d7, 0x11641, 0x11642, 0x11643, 0x11660, 0x11661, 0x11662, 0x11663, 0x11664, 0x11665, 0x11666, 0x11667, + 0x11668, 0x11669, 0x1166a, 0x1166b, 0x1166c, 0x1173c, 0x1173d, 0x1173e, 0x11c41, 0x11c42, 0x11c43, 0x11c44, 0x11c45, 0x11c70, 0x11c71, 0x12470, + 0x12471, 0x12472, 0x12473, 0x12474, 0x16a6e, 0x16a6f, 0x16af5, 0x16b37, 0x16b38, 0x16b39, 0x16b3a, 0x16b3b, 0x16b44, 0x1bc9f, 0x1da87, 0x1da88, + 0x1da89, 0x1da8a, 0x1da8b, 0x1e95e, 0x1e95f + }; + + /* The ASCII ones are the most frequently used ones, so lets check them first. */ + if(codepoint <= 0x7f) + return ISPUNCT_(codepoint); + + return (bsearch(&codepoint, punct_list, SIZEOF_ARRAY(punct_list), sizeof(int), md_unicode_cmp__) != NULL); + } + + static void + md_get_unicode_fold_info(int codepoint, MD_UNICODE_FOLD_INFO* info) + { + /* This maps single codepoint within a range to a single codepoint + * within an offseted range. */ + static const struct { + int min_codepoint; + int max_codepoint; + int offset; + } range_map[] = { + { 0x00c0, 0x00d6, 32 }, { 0x00d8, 0x00de, 32 }, { 0x0388, 0x038a, 37 }, { 0x0391, 0x03a1, 32 }, { 0x03a3, 0x03ab, 32 }, { 0x0400, 0x040f, 80 }, + { 0x0410, 0x042f, 32 }, { 0x0531, 0x0556, 48 }, { 0x1f08, 0x1f0f, -8 }, { 0x1f18, 0x1f1d, -8 }, { 0x1f28, 0x1f2f, -8 }, { 0x1f38, 0x1f3f, -8 }, + { 0x1f48, 0x1f4d, -8 }, { 0x1f68, 0x1f6f, -8 }, { 0x1fc8, 0x1fcb, -86 }, { 0x2160, 0x216f, 16 }, { 0x24b6, 0x24cf, 26 }, { 0xff21, 0xff3a, 32 }, + { 0x10400, 0x10425, 40 } + }; + + /* This maps single codepoint to another single codepoint. */ + static const struct { + int src_codepoint; + int dest_codepoint; + } single_map[] = { + { 0x00b5, 0x03bc }, { 0x0100, 0x0101 }, { 0x0102, 0x0103 }, { 0x0104, 0x0105 }, { 0x0106, 0x0107 }, { 0x0108, 0x0109 }, { 0x010a, 0x010b }, { 0x010c, 0x010d }, + { 0x010e, 0x010f }, { 0x0110, 0x0111 }, { 0x0112, 0x0113 }, { 0x0114, 0x0115 }, { 0x0116, 0x0117 }, { 0x0118, 0x0119 }, { 0x011a, 0x011b }, { 0x011c, 0x011d }, + { 0x011e, 0x011f }, { 0x0120, 0x0121 }, { 0x0122, 0x0123 }, { 0x0124, 0x0125 }, { 0x0126, 0x0127 }, { 0x0128, 0x0129 }, { 0x012a, 0x012b }, { 0x012c, 0x012d }, + { 0x012e, 0x012f }, { 0x0132, 0x0133 }, { 0x0134, 0x0135 }, { 0x0136, 0x0137 }, { 0x0139, 0x013a }, { 0x013b, 0x013c }, { 0x013d, 0x013e }, { 0x013f, 0x0140 }, + { 0x0141, 0x0142 }, { 0x0143, 0x0144 }, { 0x0145, 0x0146 }, { 0x0147, 0x0148 }, { 0x014a, 0x014b }, { 0x014c, 0x014d }, { 0x014e, 0x014f }, { 0x0150, 0x0151 }, + { 0x0152, 0x0153 }, { 0x0154, 0x0155 }, { 0x0156, 0x0157 }, { 0x0158, 0x0159 }, { 0x015a, 0x015b }, { 0x015c, 0x015d }, { 0x015e, 0x015f }, { 0x0160, 0x0161 }, + { 0x0162, 0x0163 }, { 0x0164, 0x0165 }, { 0x0166, 0x0167 }, { 0x0168, 0x0169 }, { 0x016a, 0x016b }, { 0x016c, 0x016d }, { 0x016e, 0x016f }, { 0x0170, 0x0171 }, + { 0x0172, 0x0173 }, { 0x0174, 0x0175 }, { 0x0176, 0x0177 }, { 0x0178, 0x00ff }, { 0x0179, 0x017a }, { 0x017b, 0x017c }, { 0x017d, 0x017e }, { 0x017f, 0x0073 }, + { 0x0181, 0x0253 }, { 0x0182, 0x0183 }, { 0x0184, 0x0185 }, { 0x0186, 0x0254 }, { 0x0187, 0x0188 }, { 0x0189, 0x0256 }, { 0x018a, 0x0257 }, { 0x018b, 0x018c }, + { 0x018e, 0x01dd }, { 0x018f, 0x0259 }, { 0x0190, 0x025b }, { 0x0191, 0x0192 }, { 0x0193, 0x0260 }, { 0x0194, 0x0263 }, { 0x0196, 0x0269 }, { 0x0197, 0x0268 }, + { 0x0198, 0x0199 }, { 0x019c, 0x026f }, { 0x019d, 0x0272 }, { 0x019f, 0x0275 }, { 0x01a0, 0x01a1 }, { 0x01a2, 0x01a3 }, { 0x01a4, 0x01a5 }, { 0x01a6, 0x0280 }, + { 0x01a7, 0x01a8 }, { 0x01a9, 0x0283 }, { 0x01ac, 0x01ad }, { 0x01ae, 0x0288 }, { 0x01af, 0x01b0 }, { 0x01b1, 0x028a }, { 0x01b2, 0x028b }, { 0x01b3, 0x01b4 }, + { 0x01b5, 0x01b6 }, { 0x01b7, 0x0292 }, { 0x01b8, 0x01b9 }, { 0x01bc, 0x01bd }, { 0x01c4, 0x01c6 }, { 0x01c5, 0x01c6 }, { 0x01c7, 0x01c9 }, { 0x01c8, 0x01c9 }, + { 0x01ca, 0x01cc }, { 0x01cb, 0x01cc }, { 0x01cd, 0x01ce }, { 0x01cf, 0x01d0 }, { 0x01d1, 0x01d2 }, { 0x01d3, 0x01d4 }, { 0x01d5, 0x01d6 }, { 0x01d7, 0x01d8 }, + { 0x01d9, 0x01da }, { 0x01db, 0x01dc }, { 0x01de, 0x01df }, { 0x01e0, 0x01e1 }, { 0x01e2, 0x01e3 }, { 0x01e4, 0x01e5 }, { 0x01e6, 0x01e7 }, { 0x01e8, 0x01e9 }, + { 0x01ea, 0x01eb }, { 0x01ec, 0x01ed }, { 0x01ee, 0x01ef }, { 0x01f1, 0x01f3 }, { 0x01f2, 0x01f3 }, { 0x01f4, 0x01f5 }, { 0x01f6, 0x0195 }, { 0x01f7, 0x01bf }, + { 0x01f8, 0x01f9 }, { 0x01fa, 0x01fb }, { 0x01fc, 0x01fd }, { 0x01fe, 0x01ff }, { 0x0200, 0x0201 }, { 0x0202, 0x0203 }, { 0x0204, 0x0205 }, { 0x0206, 0x0207 }, + { 0x0208, 0x0209 }, { 0x020a, 0x020b }, { 0x020c, 0x020d }, { 0x020e, 0x020f }, { 0x0210, 0x0211 }, { 0x0212, 0x0213 }, { 0x0214, 0x0215 }, { 0x0216, 0x0217 }, + { 0x0218, 0x0219 }, { 0x021a, 0x021b }, { 0x021c, 0x021d }, { 0x021e, 0x021f }, { 0x0220, 0x019e }, { 0x0222, 0x0223 }, { 0x0224, 0x0225 }, { 0x0226, 0x0227 }, + { 0x0228, 0x0229 }, { 0x022a, 0x022b }, { 0x022c, 0x022d }, { 0x022e, 0x022f }, { 0x0230, 0x0231 }, { 0x0232, 0x0233 }, { 0x0345, 0x03b9 }, { 0x0386, 0x03ac }, + { 0x038c, 0x03cc }, { 0x038e, 0x03cd }, { 0x038f, 0x03ce }, { 0x03c2, 0x03c3 }, { 0x03d0, 0x03b2 }, { 0x03d1, 0x03b8 }, { 0x03d5, 0x03c6 }, { 0x03d6, 0x03c0 }, + { 0x03d8, 0x03d9 }, { 0x03da, 0x03db }, { 0x03dc, 0x03dd }, { 0x03de, 0x03df }, { 0x03e0, 0x03e1 }, { 0x03e2, 0x03e3 }, { 0x03e4, 0x03e5 }, { 0x03e6, 0x03e7 }, + { 0x03e8, 0x03e9 }, { 0x03ea, 0x03eb }, { 0x03ec, 0x03ed }, { 0x03ee, 0x03ef }, { 0x03f0, 0x03ba }, { 0x03f1, 0x03c1 }, { 0x03f2, 0x03c3 }, { 0x03f4, 0x03b8 }, + { 0x03f5, 0x03b5 }, { 0x0460, 0x0461 }, { 0x0462, 0x0463 }, { 0x0464, 0x0465 }, { 0x0466, 0x0467 }, { 0x0468, 0x0469 }, { 0x046a, 0x046b }, { 0x046c, 0x046d }, + { 0x046e, 0x046f }, { 0x0470, 0x0471 }, { 0x0472, 0x0473 }, { 0x0474, 0x0475 }, { 0x0476, 0x0477 }, { 0x0478, 0x0479 }, { 0x047a, 0x047b }, { 0x047c, 0x047d }, + { 0x047e, 0x047f }, { 0x0480, 0x0481 }, { 0x048a, 0x048b }, { 0x048c, 0x048d }, { 0x048e, 0x048f }, { 0x0490, 0x0491 }, { 0x0492, 0x0493 }, { 0x0494, 0x0495 }, + { 0x0496, 0x0497 }, { 0x0498, 0x0499 }, { 0x049a, 0x049b }, { 0x049c, 0x049d }, { 0x049e, 0x049f }, { 0x04a0, 0x04a1 }, { 0x04a2, 0x04a3 }, { 0x04a4, 0x04a5 }, + { 0x04a6, 0x04a7 }, { 0x04a8, 0x04a9 }, { 0x04aa, 0x04ab }, { 0x04ac, 0x04ad }, { 0x04ae, 0x04af }, { 0x04b0, 0x04b1 }, { 0x04b2, 0x04b3 }, { 0x04b4, 0x04b5 }, + { 0x04b6, 0x04b7 }, { 0x04b8, 0x04b9 }, { 0x04ba, 0x04bb }, { 0x04bc, 0x04bd }, { 0x04be, 0x04bf }, { 0x04c1, 0x04c2 }, { 0x04c3, 0x04c4 }, { 0x04c5, 0x04c6 }, + { 0x04c7, 0x04c8 }, { 0x04c9, 0x04ca }, { 0x04cb, 0x04cc }, { 0x04cd, 0x04ce }, { 0x04d0, 0x04d1 }, { 0x04d2, 0x04d3 }, { 0x04d4, 0x04d5 }, { 0x04d6, 0x04d7 }, + { 0x04d8, 0x04d9 }, { 0x04da, 0x04db }, { 0x04dc, 0x04dd }, { 0x04de, 0x04df }, { 0x04e0, 0x04e1 }, { 0x04e2, 0x04e3 }, { 0x04e4, 0x04e5 }, { 0x04e6, 0x04e7 }, + { 0x04e8, 0x04e9 }, { 0x04ea, 0x04eb }, { 0x04ec, 0x04ed }, { 0x04ee, 0x04ef }, { 0x04f0, 0x04f1 }, { 0x04f2, 0x04f3 }, { 0x04f4, 0x04f5 }, { 0x04f8, 0x04f9 }, + { 0x0500, 0x0501 }, { 0x0502, 0x0503 }, { 0x0504, 0x0505 }, { 0x0506, 0x0507 }, { 0x0508, 0x0509 }, { 0x050a, 0x050b }, { 0x050c, 0x050d }, { 0x050e, 0x050f }, + { 0x1e00, 0x1e01 }, { 0x1e02, 0x1e03 }, { 0x1e04, 0x1e05 }, { 0x1e06, 0x1e07 }, { 0x1e08, 0x1e09 }, { 0x1e0a, 0x1e0b }, { 0x1e0c, 0x1e0d }, { 0x1e0e, 0x1e0f }, + { 0x1e10, 0x1e11 }, { 0x1e12, 0x1e13 }, { 0x1e14, 0x1e15 }, { 0x1e16, 0x1e17 }, { 0x1e18, 0x1e19 }, { 0x1e1a, 0x1e1b }, { 0x1e1c, 0x1e1d }, { 0x1e1e, 0x1e1f }, + { 0x1e20, 0x1e21 }, { 0x1e22, 0x1e23 }, { 0x1e24, 0x1e25 }, { 0x1e26, 0x1e27 }, { 0x1e28, 0x1e29 }, { 0x1e2a, 0x1e2b }, { 0x1e2c, 0x1e2d }, { 0x1e2e, 0x1e2f }, + { 0x1e30, 0x1e31 }, { 0x1e32, 0x1e33 }, { 0x1e34, 0x1e35 }, { 0x1e36, 0x1e37 }, { 0x1e38, 0x1e39 }, { 0x1e3a, 0x1e3b }, { 0x1e3c, 0x1e3d }, { 0x1e3e, 0x1e3f }, + { 0x1e40, 0x1e41 }, { 0x1e42, 0x1e43 }, { 0x1e44, 0x1e45 }, { 0x1e46, 0x1e47 }, { 0x1e48, 0x1e49 }, { 0x1e4a, 0x1e4b }, { 0x1e4c, 0x1e4d }, { 0x1e4e, 0x1e4f }, + { 0x1e50, 0x1e51 }, { 0x1e52, 0x1e53 }, { 0x1e54, 0x1e55 }, { 0x1e56, 0x1e57 }, { 0x1e58, 0x1e59 }, { 0x1e5a, 0x1e5b }, { 0x1e5c, 0x1e5d }, { 0x1e5e, 0x1e5f }, + { 0x1e60, 0x1e61 }, { 0x1e62, 0x1e63 }, { 0x1e64, 0x1e65 }, { 0x1e66, 0x1e67 }, { 0x1e68, 0x1e69 }, { 0x1e6a, 0x1e6b }, { 0x1e6c, 0x1e6d }, { 0x1e6e, 0x1e6f }, + { 0x1e70, 0x1e71 }, { 0x1e72, 0x1e73 }, { 0x1e74, 0x1e75 }, { 0x1e76, 0x1e77 }, { 0x1e78, 0x1e79 }, { 0x1e7a, 0x1e7b }, { 0x1e7c, 0x1e7d }, { 0x1e7e, 0x1e7f }, + { 0x1e80, 0x1e81 }, { 0x1e82, 0x1e83 }, { 0x1e84, 0x1e85 }, { 0x1e86, 0x1e87 }, { 0x1e88, 0x1e89 }, { 0x1e8a, 0x1e8b }, { 0x1e8c, 0x1e8d }, { 0x1e8e, 0x1e8f }, + { 0x1e90, 0x1e91 }, { 0x1e92, 0x1e93 }, { 0x1e94, 0x1e95 }, { 0x1e9b, 0x1e61 }, { 0x1ea0, 0x1ea1 }, { 0x1ea2, 0x1ea3 }, { 0x1ea4, 0x1ea5 }, { 0x1ea6, 0x1ea7 }, + { 0x1ea8, 0x1ea9 }, { 0x1eaa, 0x1eab }, { 0x1eac, 0x1ead }, { 0x1eae, 0x1eaf }, { 0x1eb0, 0x1eb1 }, { 0x1eb2, 0x1eb3 }, { 0x1eb4, 0x1eb5 }, { 0x1eb6, 0x1eb7 }, + { 0x1eb8, 0x1eb9 }, { 0x1eba, 0x1ebb }, { 0x1ebc, 0x1ebd }, { 0x1ebe, 0x1ebf }, { 0x1ec0, 0x1ec1 }, { 0x1ec2, 0x1ec3 }, { 0x1ec4, 0x1ec5 }, { 0x1ec6, 0x1ec7 }, + { 0x1ec8, 0x1ec9 }, { 0x1eca, 0x1ecb }, { 0x1ecc, 0x1ecd }, { 0x1ece, 0x1ecf }, { 0x1ed0, 0x1ed1 }, { 0x1ed2, 0x1ed3 }, { 0x1ed4, 0x1ed5 }, { 0x1ed6, 0x1ed7 }, + { 0x1ed8, 0x1ed9 }, { 0x1eda, 0x1edb }, { 0x1edc, 0x1edd }, { 0x1ede, 0x1edf }, { 0x1ee0, 0x1ee1 }, { 0x1ee2, 0x1ee3 }, { 0x1ee4, 0x1ee5 }, { 0x1ee6, 0x1ee7 }, + { 0x1ee8, 0x1ee9 }, { 0x1eea, 0x1eeb }, { 0x1eec, 0x1eed }, { 0x1eee, 0x1eef }, { 0x1ef0, 0x1ef1 }, { 0x1ef2, 0x1ef3 }, { 0x1ef4, 0x1ef5 }, { 0x1ef6, 0x1ef7 }, + { 0x1ef8, 0x1ef9 }, { 0x1f59, 0x1f51 }, { 0x1f5b, 0x1f53 }, { 0x1f5d, 0x1f55 }, { 0x1f5f, 0x1f57 }, { 0x1fb8, 0x1fb0 }, { 0x1fb9, 0x1fb1 }, { 0x1fba, 0x1f70 }, + { 0x1fbb, 0x1f71 }, { 0x1fbe, 0x03b9 }, { 0x1fd8, 0x1fd0 }, { 0x1fd9, 0x1fd1 }, { 0x1fda, 0x1f76 }, { 0x1fdb, 0x1f77 }, { 0x1fe8, 0x1fe0 }, { 0x1fe9, 0x1fe1 }, + { 0x1fea, 0x1f7a }, { 0x1feb, 0x1f7b }, { 0x1fec, 0x1fe5 }, { 0x1ff8, 0x1f78 }, { 0x1ff9, 0x1f79 }, { 0x1ffa, 0x1f7c }, { 0x1ffb, 0x1f7d }, { 0x2126, 0x03c9 }, + { 0x212a, 0x006b }, { 0x212b, 0x00e5 }, + }; + + /* This maps single codepoint to two codepoints. */ + static const struct { + int src_codepoint; + int dest_codepoint0; + int dest_codepoint1; + int dest_codepoint2; + } double_map[] = { + { 0x00df, 0x0073, 0x0073 }, { 0x0130, 0x0069, 0x0307 }, { 0x0149, 0x02bc, 0x006e }, { 0x01f0, 0x006a, 0x030c }, { 0x0587, 0x0565, 0x0582 }, { 0x1e96, 0x0068, 0x0331 }, + { 0x1e97, 0x0074, 0x0308 }, { 0x1e98, 0x0077, 0x030a }, { 0x1e99, 0x0079, 0x030a }, { 0x1e9a, 0x0061, 0x02be }, { 0x1f50, 0x03c5, 0x0313 }, { 0x1f80, 0x1f00, 0x03b9 }, + { 0x1f81, 0x1f01, 0x03b9 }, { 0x1f82, 0x1f02, 0x03b9 }, { 0x1f83, 0x1f03, 0x03b9 }, { 0x1f84, 0x1f04, 0x03b9 }, { 0x1f85, 0x1f05, 0x03b9 }, { 0x1f86, 0x1f06, 0x03b9 }, + { 0x1f87, 0x1f07, 0x03b9 }, { 0x1f88, 0x1f00, 0x03b9 }, { 0x1f89, 0x1f01, 0x03b9 }, { 0x1f8a, 0x1f02, 0x03b9 }, { 0x1f8b, 0x1f03, 0x03b9 }, { 0x1f8c, 0x1f04, 0x03b9 }, + { 0x1f8d, 0x1f05, 0x03b9 }, { 0x1f8e, 0x1f06, 0x03b9 }, { 0x1f8f, 0x1f07, 0x03b9 }, { 0x1f90, 0x1f20, 0x03b9 }, { 0x1f91, 0x1f21, 0x03b9 }, { 0x1f92, 0x1f22, 0x03b9 }, + { 0x1f93, 0x1f23, 0x03b9 }, { 0x1f94, 0x1f24, 0x03b9 }, { 0x1f95, 0x1f25, 0x03b9 }, { 0x1f96, 0x1f26, 0x03b9 }, { 0x1f97, 0x1f27, 0x03b9 }, { 0x1f98, 0x1f20, 0x03b9 }, + { 0x1f99, 0x1f21, 0x03b9 }, { 0x1f9a, 0x1f22, 0x03b9 }, { 0x1f9b, 0x1f23, 0x03b9 }, { 0x1f9c, 0x1f24, 0x03b9 }, { 0x1f9d, 0x1f25, 0x03b9 }, { 0x1f9e, 0x1f26, 0x03b9 }, + { 0x1f9f, 0x1f27, 0x03b9 }, { 0x1fa0, 0x1f60, 0x03b9 }, { 0x1fa1, 0x1f61, 0x03b9 }, { 0x1fa2, 0x1f62, 0x03b9 }, { 0x1fa3, 0x1f63, 0x03b9 }, { 0x1fa4, 0x1f64, 0x03b9 }, + { 0x1fa5, 0x1f65, 0x03b9 }, { 0x1fa6, 0x1f66, 0x03b9 }, { 0x1fa7, 0x1f67, 0x03b9 }, { 0x1fa8, 0x1f60, 0x03b9 }, { 0x1fa9, 0x1f61, 0x03b9 }, { 0x1faa, 0x1f62, 0x03b9 }, + { 0x1fab, 0x1f63, 0x03b9 }, { 0x1fac, 0x1f64, 0x03b9 }, { 0x1fad, 0x1f65, 0x03b9 }, { 0x1fae, 0x1f66, 0x03b9 }, { 0x1faf, 0x1f67, 0x03b9 }, { 0x1fb2, 0x1f70, 0x03b9 }, + { 0x1fb3, 0x03b1, 0x03b9 }, { 0x1fb4, 0x03ac, 0x03b9 }, { 0x1fb6, 0x03b1, 0x0342 }, { 0x1fbc, 0x03b1, 0x03b9 }, { 0x1fc2, 0x1f74, 0x03b9 }, { 0x1fc3, 0x03b7, 0x03b9 }, + { 0x1fc4, 0x03ae, 0x03b9 }, { 0x1fc6, 0x03b7, 0x0342 }, { 0x1fcc, 0x03b7, 0x03b9 }, { 0x1fd6, 0x03b9, 0x0342 }, { 0x1fe4, 0x03c1, 0x0313 }, { 0x1fe6, 0x03c5, 0x0342 }, + { 0x1ff2, 0x1f7c, 0x03b9 }, { 0x1ff3, 0x03c9, 0x03b9 }, { 0x1ff4, 0x03ce, 0x03b9 }, { 0x1ff6, 0x03c9, 0x0342 }, { 0x1ffc, 0x03c9, 0x03b9 }, { 0xfb00, 0x0066, 0x0066 }, + { 0xfb01, 0x0066, 0x0069 }, { 0xfb02, 0x0066, 0x006c }, { 0xfb05, 0x0073, 0x0074 }, { 0xfb06, 0x0073, 0x0074 }, { 0xfb13, 0x0574, 0x0576 }, { 0xfb14, 0x0574, 0x0565 }, + { 0xfb15, 0x0574, 0x056b }, { 0xfb16, 0x057e, 0x0576 }, { 0xfb17, 0x0574, 0x056d } + }; + + /* This maps single codepoint to three codepoints. */ + static const struct { + int src_codepoint; + int dest_codepoint0; + int dest_codepoint1; + int dest_codepoint2; + } triple_map[] = { + { 0x0390, 0x03b9, 0x0308, 0x0301 }, { 0x03b0, 0x03c5, 0x0308, 0x0301 }, { 0x1f52, 0x03c5, 0x0313, 0x0300 }, { 0x1f54, 0x03c5, 0x0313, 0x0301 }, + { 0x1f56, 0x03c5, 0x0313, 0x0342 }, { 0x1fb7, 0x03b1, 0x0342, 0x03b9 }, { 0x1fc7, 0x03b7, 0x0342, 0x03b9 }, { 0x1fd2, 0x03b9, 0x0308, 0x0300 }, + { 0x1fd3, 0x03b9, 0x0308, 0x0301 }, { 0x1fd7, 0x03b9, 0x0308, 0x0342 }, { 0x1fe2, 0x03c5, 0x0308, 0x0300 }, { 0x1fe3, 0x03c5, 0x0308, 0x0301 }, + { 0x1fe7, 0x03c5, 0x0308, 0x0342 }, { 0x1ff7, 0x03c9, 0x0342, 0x03b9 }, { 0xfb03, 0x0066, 0x0066, 0x0069 }, { 0xfb04, 0x0066, 0x0066, 0x006c } + }; + + int i; + + /* Fast path for ASCII characters. */ + if(codepoint <= 0x7f) { + info->codepoints[0] = codepoint; + if(ISUPPER_(codepoint)) + info->codepoints[0] += 'a' - 'A'; + info->n_codepoints = 1; + return; + } + + for(i = 0; i < SIZEOF_ARRAY(range_map); i++) { + if(range_map[i].min_codepoint <= codepoint && codepoint <= range_map[i].max_codepoint) { + info->codepoints[0] = codepoint + range_map[i].offset; + info->n_codepoints = 1; + return; + } + } + + for(i = 0; i < SIZEOF_ARRAY(single_map); i++) { + if(codepoint == single_map[i].src_codepoint) { + info->codepoints[0] = single_map[i].dest_codepoint; + info->n_codepoints = 1; + return; + } + } + + for(i = 0; i < SIZEOF_ARRAY(double_map); i++) { + if(codepoint == double_map[i].src_codepoint) { + info->codepoints[0] = double_map[i].dest_codepoint0; + info->codepoints[1] = double_map[i].dest_codepoint1; + info->n_codepoints = 2; + return; + } + } + + for(i = 0; i < SIZEOF_ARRAY(triple_map); i++) { + if(codepoint == triple_map[i].src_codepoint) { + info->codepoints[0] = triple_map[i].dest_codepoint0; + info->codepoints[1] = triple_map[i].dest_codepoint1; + info->codepoints[2] = triple_map[i].dest_codepoint2; + info->n_codepoints = 3; + return; + } + } + + info->codepoints[0] = codepoint; + info->n_codepoints = 1; + } +#endif + + +#if defined MD4C_USE_WIN_UNICODE + #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc) == 0xd800) + #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc) == 0xdc00) + #define UTF16_COMPUTE_SURROGATE(hi, lo) ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)) + + static int + md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size) + { + /* The encoding known called on Windows simply as "Unicode" is actually + * little-endian UTF-16, i.e. the low surrogate precedes the high + * surrogate. */ + if(IS_UTF16_SURROGATE_LO(str[0])) { + if(off+1 < str_size && IS_UTF16_SURROGATE_HI(str[1])) { + if(p_size != NULL) + *p_size = 2; + return UTF16_COMPUTE_SURROGATE(str[1], str[0]); + } + } + + if(p_size != NULL) + *p_size = 1; + return str[0]; + } + + static int + md_decode_utf16le_before__(MD_CTX* ctx, OFF off) + { + if(off > 2 && IS_UTF16_SURROGATE_LO(CH(off-2)) && IS_UTF16_SURROGATE_HI(CH(off-1))) + return UTF16_COMPUTE_SURROGATE(CH(off-1), CH(off-2)); + + return CH(off); + } + + /* No whitespace uses surrogates, so no decoding needed here. */ + #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) + #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off)) + #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1)) + + #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off)) + + static inline int + md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) + { + return md_decode_utf16le__(str+off, str_size-off, p_char_size); + } +#elif defined MD4C_USE_UNICODE + #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) + #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) + #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) + #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) + #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) + + static int + md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size) + { + if(!IS_UTF8_LEAD1(str[0])) { + if(IS_UTF8_LEAD2(str[0])) { + if(1 < str_size && IS_UTF8_TAIL(str[1])) { + if(p_size != NULL) + *p_size = 2; + + return (((unsigned int)str[0] & 0x1f) << 6) | + (((unsigned int)str[1] & 0x3f) << 0); + } + } else if(IS_UTF8_LEAD3(str[0])) { + if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) { + if(p_size != NULL) + *p_size = 3; + + return (((unsigned int)str[0] & 0x0f) << 12) | + (((unsigned int)str[1] & 0x3f) << 6) | + (((unsigned int)str[2] & 0x3f) << 0); + } + } else if(IS_UTF8_LEAD4(str[0])) { + if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) { + if(p_size != NULL) + *p_size = 4; + + return (((unsigned int)str[0] & 0x07) << 18) | + (((unsigned int)str[1] & 0x3f) << 12) | + (((unsigned int)str[2] & 0x3f) << 6) | + (((unsigned int)str[3] & 0x3f) << 0); + } + } + } + + if(p_size != NULL) + *p_size = 1; + return str[0]; + } + + static int + md_decode_utf8_before__(MD_CTX* ctx, OFF off) + { + if(!IS_UTF8_LEAD1(CH(off-1))) { + if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-2) & 0x1f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + + if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-3) & 0x0f) << 12) | + (((unsigned int)CH(off-2) & 0x3f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + + if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) + return (((unsigned int)CH(off-4) & 0x07) << 18) | + (((unsigned int)CH(off-3) & 0x3f) << 12) | + (((unsigned int)CH(off-2) & 0x3f) << 6) | + (((unsigned int)CH(off-1) & 0x3f) << 0); + } + + return CH(off-1); + } + + #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) + #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off)) + + #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) + #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) + + static inline int + md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) + { + return md_decode_utf8__(str+off, str_size-off, p_char_size); + } +#else + #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint) + #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) + #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) + + #define ISUNICODEPUNCT(off) ISPUNCT(off) + #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) + + static inline void + md_get_unicode_fold_info(int codepoint, MD_UNICODE_FOLD_INFO* info) + { + info->codepoints[0] = codepoint; + if(ISUPPER_(codepoint)) + info->codepoints[0] += 'a' - 'A'; + info->n_codepoints = 1; + } + + static inline int + md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size) + { + *p_size = 1; + return str[off]; + } +#endif + + /****************************** *** Recognizing raw HTML *** ******************************/ @@ -827,6 +1163,569 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_ } +/*************************** + *** Recognizing Links *** + ***************************/ + +/* Note this code is partially shared between processing inlines and blocks + * as link reference definitions and links share some helper parser functions. + */ + +struct MD_LINK_REF_DEF_tag { + CHAR* label; + CHAR* title; + SZ label_size : 24; + unsigned label_needs_free : 1; + unsigned title_needs_free : 1; + SZ title_size; + OFF dest_beg; + OFF dest_end; + + MD_LINK_REF_DEF* next; +}; + +typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR; +struct MD_LINK_ATTR_tag { + OFF dest_beg; + OFF dest_end; + + CHAR* title; + SZ title_size; + int title_needs_free; +}; + + +static int +md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, + OFF* p_end, int* p_beg_line_index, int* p_end_line_index, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + OFF contents_beg = 0; + OFF contents_end = 0; + int line_index = 0; + int len = 0; + + if(CH(off) != _T('[')) + return -1; + off++; + + while(line_index < n_lines) { + OFF line_end = lines[line_index].end; + + while(off < line_end) { + if(CH(off) == _T('\\') && off < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { + off++; + if(contents_end == 0) + contents_beg = off; + contents_end = off + 2; + } else if(CH(off) == _T('[')) { + return -1; + } else if(CH(off) == _T(']')) { + if(contents_beg < contents_end) { + /* Success. */ + *p_contents_beg = contents_beg; + *p_contents_end = contents_end; + *p_end = off+1; + *p_end_line_index = line_index; + return 0; + } else { + /* Link label must have some non-whitespace contents. */ + return -1; + } + } else { + if(contents_end == 0) { + contents_beg = off; + *p_beg_line_index = line_index; + } + contents_end = off + 1; + } + + len++; + if(len > 999) + return -1; + off++; + } + + line_index++; + len++; + } + + return -1; +} + +static int +md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + + if(off >= max_end || CH(off) != _T('<')) + return -1; + off++; + + while(off < max_end) { + if(CH(off) == _T('\\') && off < max_end && ISPUNCT(off+1)) { + off += 2; + continue; + } + + if(ISWHITESPACE(off) || CH(off) == _T('<')) + return -1; + + if(CH(off) == _T('>')) { + /* Success. */ + *p_contents_beg = beg+1; + *p_contents_end = off; + *p_end = off+1; + return 0; + } + + off++; + } + + return -1; +} + +static int +md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + int in_parentheses = 0; + + while(off < max_end) { + if(CH(off) == _T('\\') && off < max_end && ISPUNCT(off+1)) { + off += 2; + continue; + } + + if(ISWHITESPACE(off) || ISCNTRL(off)) + break; + + /* Link destination may include balanced pair of unescaped '(' ')' + * but only if they are not nested. */ + if(CH(off) == _T('(')) { + if(in_parentheses) + return -1; + else + in_parentheses = 1; + } else if(CH(off) == _T(')')) { + if(in_parentheses) + in_parentheses = 0; + else + return -1; + } + + off++; + } + + if(in_parentheses || off == beg) + return -1; + + /* Success. */ + *p_contents_beg = beg; + *p_contents_end = off; + *p_end = off; + return 0; +} + +static int +md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, + OFF* p_contents_beg, OFF* p_contents_end) +{ + if(md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end) == 0) + return 0; + + if(md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end) == 0) + return 0; + + return -1; +} + +static int +md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, + OFF* p_end, int* p_beg_line_index, int* p_end_line_index, + OFF* p_contents_beg, OFF* p_contents_end) +{ + OFF off = beg; + CHAR closer_char; + int line_index = 0; + + /* Optional white space with up to one line break. */ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end && ISNEWLINE(off)) { + line_index++; + if(line_index >= n_lines) + return -1; + off = lines[line_index].beg; + } + + *p_beg_line_index = line_index; + + /* First char determines how to detect end of it. */ + switch(CH(off)) { + case _T('"'): closer_char = _T('"'); break; + case _T('\''): closer_char = _T('\''); break; + case _T('('): closer_char = _T(')'); break; + default: return -1; + } + off++; + + *p_contents_beg = off; + + while(line_index < n_lines) { + OFF line_end = lines[line_index].end; + + while(off < line_end) { + if(CH(off) == _T('\\') && off < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { + off++; + } else if(CH(off) == closer_char) { + /* Success. */ + *p_contents_end = off; + *p_end = off+1; + *p_end_line_index = line_index; + return 0; + } + + off++; + } + + line_index++; + } + + return -1; +} + +/* Allocate new buffer, and fill it with copy of the string between + * 'beg' and 'end' but replace any line breaks with single space. + */ +static int +md_remove_line_breaks(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, + CHAR** p_str, SZ* p_size) +{ + CHAR* buffer; + CHAR* ptr; + int line_index = 0; + OFF off = beg; + + buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg)); + if(buffer == NULL) { + MD_LOG("malloc() failed."); + return -1; + } + ptr = buffer; + + while(1) { + const MD_LINE* line = &lines[line_index]; + OFF line_end = line->end; + + while(off < line_end) { + *ptr = CH(off); + ptr++; + + off++; + if(off >= end) { + *p_str = buffer; + *p_size = ptr - buffer; + return 0; + } + } + + *ptr = _T(' '); + ptr++; + + line_index++; + off = lines[line_index].beg; + } +} + +/* Returns 0 if it is not a link reference definition. + * + * Returns N > 0 if it is not a link reference definition (then N corresponds + * to the number of lines forming it). In this case the definition is stored + * for resolving any links referring to it. + * + * If there is an error (cannot alloc memory for storing it), we return -1. + */ +static int +md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) +{ + OFF label_contents_beg; + OFF label_contents_end; + int label_contents_line_index; + int label_is_multiline; + OFF dest_contents_beg; + OFF dest_contents_end; + OFF title_contents_beg; + OFF title_contents_end; + int title_contents_line_index; + int title_is_multiline; + OFF off; + int line_index = 0; + int tmp_line_index; + MD_LINK_REF_DEF* def; + int ret = 0; + + /* Link label. */ + if(md_is_link_label(ctx, lines, n_lines, lines[0].beg, + &off, &label_contents_line_index, &line_index, + &label_contents_beg, &label_contents_end) != 0) + return 0; + label_is_multiline = (label_contents_line_index != line_index); + + /* Colon. */ + if(off >= lines[line_index].end || CH(off) != _T(':')) + return 0; + off++; + + /* Optional white space with up to one line break. */ + while(off < lines[line_index].end && ISWHITESPACE(off)) + off++; + if(off >= lines[line_index].end && ISNEWLINE(off)) { + line_index++; + if(line_index >= n_lines) + return 0; + off = lines[line_index].beg; + } + + /* Link destination. */ + if(md_is_link_destination(ctx, off, lines[line_index].end, + &off, &dest_contents_beg, &dest_contents_end) != 0) + return 0; + + /* (Optional) title. Note we interpret it as an title only if nothing + * more follows on its last line. */ + if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, + &off, &title_contents_line_index, &tmp_line_index, &title_contents_beg, &title_contents_end) == 0 + && off >= lines[line_index].end) + { + title_is_multiline = (tmp_line_index != title_contents_line_index); + title_contents_line_index += line_index; + line_index += tmp_line_index; + } else { + /* Not a title. */ + title_contents_beg = off; + title_contents_end = off; + } + + /* Nothing more can follow on the last line. */ + if(off < lines[line_index].end) + return 0; + + /* Store the link reference definition. */ + def = (MD_LINK_REF_DEF*) malloc(sizeof(MD_LINK_REF_DEF)); + if(def == NULL) { + MD_LOG("malloc() failed."); + return -1; + } + memset(def, 0, sizeof(MD_LINK_REF_DEF)); + + if(label_is_multiline) { + def->label = (CHAR*) STR(label_contents_beg); + def->label_size = label_contents_end - label_contents_beg; + } else { + SZ label_size; + + MD_CHECK(md_remove_line_breaks(ctx, label_contents_beg, label_contents_end, + lines + label_contents_line_index, n_lines - label_contents_line_index, + &def->label, &label_size)); + def->label_size = label_size; + def->label_needs_free = 1; + } + + def->dest_beg = dest_contents_beg; + def->dest_end = dest_contents_end; + + if(title_contents_beg >= title_contents_end) { + def->title = NULL; + def->title_size = 0; + } else if(!title_is_multiline) { + def->title = (CHAR*) STR(title_contents_beg); + def->title_size = title_contents_end - title_contents_beg; + } else { + MD_CHECK(md_remove_line_breaks(ctx, title_contents_beg, title_contents_end, + lines + title_contents_line_index, n_lines - title_contents_line_index, + &def->title, &def->title_size)); + def->title_needs_free = 1; + } + + def->next = ctx->link_ref_defs; + ctx->link_ref_defs = def; + + ret = line_index + 1; + +abort: + return ret; +} + +static OFF +md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size) +{ + SZ char_size; + int codepoint; + + while(off < size) { + codepoint = md_decode_unicode(label, off, size, &char_size); + if(!ISUNICODEWHITESPACE_(codepoint)) + break; + off += char_size; + } + + return off; +} + +static int +md_link_label_eq(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size) +{ + OFF a_off; + OFF b_off; + + /* Fast path: Most real-life reference labels are using exact match. */ + if(a_size == b_size && memcmp(a_label, b_label, a_size * sizeof(CHAR)) == 0) + return TRUE; + + /* The slow path, with Unicode case folding and Unicode whitespace collapsing. */ + a_off = md_skip_unicode_whitespace(a_label, 0, a_size); + b_off = md_skip_unicode_whitespace(b_label, 0, b_size); + while(a_off < a_size || b_off < b_size) { + int a_codepoint, b_codepoint; + SZ a_char_size, b_char_size; + int a_is_whitespace, b_is_whitespace; + + if(a_off < a_size) { + a_codepoint = md_decode_unicode(a_label, a_off, a_size, &a_char_size); + a_is_whitespace = ISUNICODEWHITESPACE_(a_codepoint); + } else { + /* Treat end of label as a whitespace. */ + a_is_whitespace = TRUE; + } + + if(b_off < b_size) { + b_codepoint = md_decode_unicode(b_label, b_off, b_size, &b_char_size); + b_is_whitespace = ISUNICODEWHITESPACE_(b_codepoint); + } else { + /* Treat end of label as a whitespace. */ + b_is_whitespace = TRUE; + } + + if(a_is_whitespace || b_is_whitespace) { + if(!a_is_whitespace || !b_is_whitespace) + return FALSE; + + a_off = md_skip_unicode_whitespace(a_label, a_off, a_size); + b_off = md_skip_unicode_whitespace(b_label, b_off, b_size); + } else { + MD_UNICODE_FOLD_INFO a_fold_info, b_fold_info; + + md_get_unicode_fold_info(a_codepoint, &a_fold_info); + md_get_unicode_fold_info(b_codepoint, &b_fold_info); + + if(a_fold_info.n_codepoints != b_fold_info.n_codepoints) + return FALSE; + if(memcmp(a_fold_info.codepoints, b_fold_info.codepoints, a_fold_info.n_codepoints * sizeof(int)) != 0) + return FALSE; + + a_off += a_char_size; + b_off += b_char_size; + } + } + + return TRUE; +} + +static int +md_lookup_link_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size, MD_LINK_REF_DEF** p_def) +{ + MD_LINK_REF_DEF* def; + + def = ctx->link_ref_defs; + while(def != NULL) { + if(md_link_label_eq(def->label, def->label_size, label, label_size)) { + *p_def = def; + return TRUE; + } + + def = def->next; + } + + *p_def = NULL; + return FALSE; +} + +static int +md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, + OFF beg, OFF end, MD_LINK_ATTR* attr) +{ + MD_LINK_REF_DEF* def; + const MD_LINE* beg_line; + const MD_LINE* end_line; + CHAR* label; + SZ label_size; + int ret; + + MD_ASSERT(CH(beg) == _T('[')); + MD_ASSERT(CH(end-1) == _T(']')); + + beg++; + end--; + + /* Find lines corresponding to the beg and end positions. */ + MD_ASSERT(lines[0].beg <= beg); + beg_line = lines; + while(beg >= beg_line->end) + beg_line++; + + MD_ASSERT(end <= lines[n_lines-1].end); + end_line = beg_line; + while(end >= end_line->end) + end_line++; + + if(beg_line != end_line) { + MD_CHECK(md_remove_line_breaks(ctx, beg, end, beg_line, + n_lines - (beg_line - lines), &label, &label_size)); + } else { + label = (char*) STR(beg); + label_size = end - beg; + } + + ret = md_lookup_link_ref_def(ctx, label, label_size, &def); + if(ret == TRUE) { + attr->dest_beg = def->dest_beg; + attr->dest_end = def->dest_end; + attr->title = def->title; + attr->title_size = def->title_size; + attr->title_needs_free = 0; + } + + if(beg_line != end_line) + free(label); + +abort: + return ret; +} + +static void +md_free_link_ref_defs(MD_CTX* ctx) +{ + MD_LINK_REF_DEF* def = ctx->link_ref_defs; + MD_LINK_REF_DEF* def_next; + + while(def != NULL) { + def_next = def->next; + + if(def->label_needs_free) + free(def->label); + if(def->title_needs_free) + free(def->title); + free(def); + + def = def_next; + } +} + + /****************************************** *** Processing Inlines (a.k.a Spans) *** ******************************************/ @@ -874,8 +1773,13 @@ md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_ * ';': Maybe end of entity. * '<': Maybe start of raw HTML or autolink. * '>': Maybe end of raw HTML or autolink. - * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS) - * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS) + * '[': Maybe start of link label or link text. + * ']': Maybe end of link label or link text. + * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS). + * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS). + * 'D': Dummy mark, it reserves a space for splitting a previous mark + * (e.g. emphasis) or to make more space for storing some special data + * related to the preceding mark (e.g. link). * * Note that not all instances of these chars in the text imply creation of the * structure. Only those which have (or may have, after we see more context) @@ -969,6 +1873,29 @@ md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index) chain->tail = mark_index; } +/* Sometimes, we need to store a pointer into the mark. It is quite rare + * so we do not bother to make MD_MARK use union, and it can only happen + * for dummy marks. */ +static inline void +md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr) +{ + MD_MARK* mark = &ctx->marks[mark_index]; + MD_ASSERT(mark->ch == 'D'); + + /* Check only members beg and end are misused for this. */ + MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF)); + + *((void**) &mark->beg) = ptr; +} + +static inline void* +md_mark_get_ptr(MD_CTX* ctx, int mark_index) +{ + MD_MARK* mark = &ctx->marks[mark_index]; + MD_ASSERT(mark->ch == 'D'); + return *((void**) &mark->beg); +} + static void md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index) { @@ -1130,6 +2057,8 @@ md_build_mark_char_map(MD_CTX* ctx) ctx->mark_char_map[';'] = 1; ctx->mark_char_map['<'] = 1; ctx->mark_char_map['>'] = 1; + ctx->mark_char_map['['] = 1; + ctx->mark_char_map[']'] = 1; ctx->mark_char_map['\0'] = 1; if(ctx->r.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS) @@ -1287,6 +2216,22 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines) continue; } + /* A potential link or its part. */ + if(ch == _T('[')) { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); + off++; + /* Two dummies to make enough place for data we need if it is + * a link. */ + PUSH_MARK('D', off, off, 0); + PUSH_MARK('D', off, off, 0); + continue; + } + if(ch == _T(']')) { + PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); + off++; + continue; + } + /* A potential permissive URL autolink. */ if(ch == _T(':')) { static struct { @@ -1584,6 +2529,129 @@ md_analyze_lt_gt(MD_CTX* ctx, int mark_index, const MD_LINE* lines, int n_lines) } } +static void +md_analyze_bracket(MD_CTX* ctx, int mark_index) +{ + /* We cannot really resolve links here as for that we would need + * more context. E.g. a following pair of brackets (reference link), + * or enclosing pair of brackets (if the inner is the link, the outer + * one cannot be.) + * + * Therefore we here only construct a list of resolved '[' ']' pairs + * ordered by position of the closer. This allows ur to analyze what is + * or is not link in the right order, from inside to outside in case + * of nested brackets. + * + * The resolving itself is deferred into md_resolve_links(). + */ + + MD_MARK* mark = &ctx->marks[mark_index]; + + if(mark->flags & MD_MARK_POTENTIAL_OPENER) { + md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index); + return; + } + + if(BRACKET_OPENERS.tail >= 0) { + /* Pop the opener from the chain. */ + int opener_index = BRACKET_OPENERS.tail; + MD_MARK* opener = &ctx->marks[opener_index]; + if(opener->prev >= 0) + ctx->marks[opener->prev].next = -1; + else + BRACKET_OPENERS.head = -1; + BRACKET_OPENERS.tail = opener->prev; + + /* Interconnect the opener and closer. */ + opener->next = mark_index; + mark->prev = opener_index; + + /* Add the pair into chain of potential links for md_resolve_links(). + * Note we misuse opener->prev for this as opener->next points to its + * closer. */ + if(ctx->unresolved_link_tail >= 0) + ctx->marks[ctx->unresolved_link_tail].prev = opener_index; + else + ctx->unresolved_link_head = opener_index; + ctx->unresolved_link_tail = opener_index; + opener->prev = -1; + } +} + +static int +md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines) +{ + int opener_index = ctx->unresolved_link_head; + + // TODO: Handle nested links: we need to remember end offset of most recent + // resolved link. If we later iterate to anything with opener BEFORE that + // then it cannot be link. + + while(opener_index >= 0) { + MD_MARK* opener = &ctx->marks[opener_index]; + int closer_index = opener->next; + MD_MARK* closer = &ctx->marks[closer_index]; + int next_index = opener->prev; + MD_MARK* next_opener = &ctx->marks[next_index]; + MD_MARK* next_closer = &ctx->marks[next_opener->next]; + MD_LINK_ATTR attr; + int is_link = FALSE; + + if(next_index >= 0) { + next_opener = &ctx->marks[next_index]; + next_closer = &ctx->marks[next_opener->next]; + } else { + next_opener = NULL; + } + + // TODO: Check for inline link + + if(next_opener != NULL && next_opener->beg == closer->end) { + if(next_closer->beg > closer->end + 1) { + /* Might be full reference link. */ + is_link = (md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr)); + } else { + /* Might be shortcut reference link. */ + is_link = (md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr)); + } + + if(is_link == TRUE) { + /* Eat the 2nd "[...]". */ + closer->end = next_closer->end; + next_index = next_opener->prev; + } + } else { + /* Might be collapsed reference link. */ + is_link = (md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr)); + } + + if(is_link < 0) + return -1; + + if(is_link == TRUE) { + /* Resolve the brackets as a link. */ + opener->flags |= MD_MARK_RESOLVED; + closer->flags |= MD_MARK_RESOLVED; + + /* If it is a link, we store the destination and title in the two + * dummy marks after the opener. */ + MD_ASSERT(ctx->marks[opener_index+1].ch == 'D'); + ctx->marks[opener_index+1].beg = attr.dest_beg; + ctx->marks[opener_index+1].end = attr.dest_end; + + MD_ASSERT(ctx->marks[opener_index+2].ch == 'D'); + md_mark_store_ptr(ctx, opener_index+2, attr.title); + if(attr.title_needs_free) + md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2); + ctx->marks[opener_index+2].prev = attr.title_size; + } + + opener_index = next_index; + } + + return 0; +} + /* Analyze whether the mark '&' starts a HTML entity. * If so, update its flags as well as flags of corresponding closer ';'. */ static void @@ -1812,16 +2880,9 @@ md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index) md_resolve_range(ctx, NULL, mark_index, closer_index); } -/* Table of precedence of various span types. */ -static const CHAR* md_precedence_table[] = { - _T("&`<>"), /* Entities; code spans; autolinks; raw HTML. */ - _T("*_@:") /* Emphasis and string emphasis; permissive autolinks. */ -}; - static void -md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_level) +md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, const CHAR* mark_chars) { - const CHAR* mark_chars = md_precedence_table[precedence_level]; int i = 0; while(i < ctx->n_marks) { @@ -1849,6 +2910,8 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int precedence_ case '`': md_analyze_backtick(ctx, i); break; case '<': /* Pass through. */ case '>': md_analyze_lt_gt(ctx, i, lines, n_lines); break; + case '[': /* Pass through. */ + case ']': md_analyze_bracket(ctx, i); break; case '&': md_analyze_entity(ctx, i); break; case '*': md_analyze_asterisk(ctx, i); break; case '_': md_analyze_underscore(ctx, i); break; @@ -1865,6 +2928,7 @@ static int md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { int i; + int ret; /* Reset the previously collected stack of marks. */ ctx->n_marks = 0; @@ -1874,16 +2938,42 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) ctx->mark_chains[i].head = -1; ctx->mark_chains[i].tail = -1; } + ctx->unresolved_link_head = -1; + ctx->unresolved_link_tail = -1; /* Collect all marks. */ if(md_collect_marks(ctx, lines, n_lines) != 0) return -1; - /* Analyze all marks. */ - for(i = 0; i < SIZEOF_ARRAY(md_precedence_table); i++) - md_analyze_marks(ctx, lines, n_lines, i); + /* We analyze marks in few groups to handle their precedence. */ + /* (1) Entities; code spans; autolinks; raw HTML. */ + md_analyze_marks(ctx, lines, n_lines, _T("&`<>")); + /* (2) Links. */ + md_analyze_marks(ctx, lines, n_lines, _T("[]")); + MD_CHECK(md_resolve_links(ctx, lines, n_lines)); + /* (3) Emphasis and strong emphasis; permissive autolinks. */ + md_analyze_marks(ctx, lines, n_lines, _T("*_@:")); - return 0; +abort: + return ret; +} + +static void +md_setup_span_a_detail(MD_CTX* ctx, const MD_MARK* mark, MD_SPAN_A_DETAIL* det) +{ + const MD_MARK* dest_mark = mark+1; + const MD_MARK* title_mark = mark+2; + + MD_ASSERT(dest_mark->ch == 'D'); + if(dest_mark->beg < dest_mark->end) + det->href = STR(dest_mark->beg); + else + det->href = NULL; + det->href_size = dest_mark->end - dest_mark->beg; + + MD_ASSERT(title_mark->ch == 'D'); + det->title = md_mark_get_ptr(ctx, title_mark - ctx->marks); + det->title_size = title_mark->prev; } /* Render the output, accordingly to the analyzed ctx->marks. */ @@ -1965,9 +3055,19 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) } break; + case '[': /* Link. */ + md_setup_span_a_detail(ctx, mark, &det.a); + MD_ENTER_SPAN(MD_SPAN_A, &det.a); + break; + case ']': + md_setup_span_a_detail(ctx, &ctx->marks[mark->prev], &det.a); + MD_LEAVE_SPAN(MD_SPAN_A, &det.a); + break; + case '<': case '>': /* Autolink or raw HTML. */ if(!(mark->flags & MD_MARK_AUTOLINK)) { + /* Raw HTML. */ if(mark->flags & MD_MARK_OPENER) text_type = MD_TEXT_HTML; else @@ -2082,12 +3182,17 @@ struct MD_BLOCK_tag { static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines) { + int i; int ret; MD_CHECK(md_analyze_inlines(ctx, lines, n_lines)); MD_CHECK(md_process_inlines(ctx, lines, n_lines)); abort: + /* Free any temporary memory blocks stored within some dummy marks. */ + for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next) + free(md_mark_get_ptr(ctx, i)); + return ret; } @@ -2373,18 +3478,81 @@ md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line) return 0; } +/* Eat from start of current (textual) block any link reference definitions + * and remember them so we can resolve any links referring to them. + * + * (Link reference definitions can only be at start of it as they cannot break + * a paragraph.) + */ +static int +md_consume_link_reference_definitions(MD_CTX* ctx) +{ + MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); + int n_lines = ctx->current_block->n_lines; + int n = 0; + + /* Compute how many lines at the start of the block form one or more + * link reference definitions. */ + while(n < n_lines) { + int n_link_ref_lines; + + n_link_ref_lines = md_is_link_reference_definition(ctx, + lines + n, n_lines - n); + /* Not a link reference definition? */ + if(n_link_ref_lines == 0) + break; + + /* We fail if it is the link ref. def. but it could not be stored due + * a memory allocation error. */ + if(n_link_ref_lines < 0) + return -1; + + n += n_link_ref_lines; + } + + /* If there was at least one link reference definition, we need to remove + * its lines from the block, or perhaps even the whole block. */ + if(n > 0) { + if(n == n_lines) { + /* Remove complete block. */ + ctx->n_block_bytes -= n * sizeof(MD_LINE); + ctx->n_block_bytes -= sizeof(MD_BLOCK); + } else { + /* Remove just some initial lines from the block. */ + memmove(lines, lines + n, n_lines - n); + ctx->current_block->n_lines -= n; + ctx->n_block_bytes -= n * sizeof(MD_LINE); + } + } + + return 0; +} + static int md_end_current_block(MD_CTX* ctx) { int ret = 0; - if(ctx->current_block != NULL) { - ctx->current_block = NULL; + if(ctx->current_block == NULL) + return ret; - // TODO : consider flush of all complete blocks - //MD_CHECK(md_process_all_blocks(ctx)); + /* Check whether there is a link reference definition. (We do this here + * instead of in md_analyze_line() because link reference definition can + * take multiple lines.) */ + if(ctx->current_block->type == MD_BLOCK_P) { + MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); + if(CH(lines[0].beg) == _T('[')) + MD_CHECK(md_consume_link_reference_definitions(ctx)); } + /* Mark we are not building any block anymore. */ + ctx->current_block = NULL; + + /* Consider flush of all complete blocks */ + // TODO: we cannot do this if we look ahead for link ref. def. or if we + // are inside a list which can yet turn from tight to loose. + //MD_CHECK(md_process_all_blocks(ctx)); + abort: return ret; } @@ -3038,7 +4206,8 @@ md_process_doc(MD_CTX *ctx) MD_CHECK(md_process_line(ctx, &pivot_line, line)); } - /* Process all remaining blocks. */ + /* Process all blocks. */ + md_end_current_block(ctx); MD_CHECK(md_process_all_blocks(ctx)); /* Close any dangling parent blocks. */ @@ -3091,6 +4260,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u ret = md_process_doc(&ctx); /* Clean-up. */ + md_free_link_ref_defs(&ctx); free(ctx.block_bytes); free(ctx.marks); free(ctx.buffer);