md4c.c (234850B)
1 /* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - mity/md4c repo 2 * MD4C: Markdown parser for C 3 * (http://github.com/mity/md4c) 4 * 5 * Copyright (c) 2016-2020 Martin Mitas 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 23 * IN THE SOFTWARE. 24 */ 25 26 #include "md4c.h" 27 28 #include <limits.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 33 34 /***************************** 35 *** Miscellaneous Stuff *** 36 *****************************/ 37 38 #if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L 39 /* C89/90 or old compilers in general may not understand "inline". */ 40 #if defined __GNUC__ 41 #define inline __inline__ 42 #elif defined _MSC_VER 43 #define inline __inline 44 #else 45 #define inline 46 #endif 47 #endif 48 49 /* Make the UTF-8 support the default. 
*/ 50 #if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16 51 #define MD4C_USE_UTF8 52 #endif 53 54 /* Magic for making wide literals with MD4C_USE_UTF16. */ 55 #ifdef _T 56 #undef _T 57 #endif 58 #if defined MD4C_USE_UTF16 59 #define _T(x) L##x 60 #else 61 #define _T(x) x 62 #endif 63 64 /* Misc. macros. */ 65 #define SIZEOF_ARRAY(a) (sizeof(a) / sizeof(a[0])) 66 67 #define STRINGIZE_(x) #x 68 #define STRINGIZE(x) STRINGIZE_(x) 69 70 #ifndef TRUE 71 #define TRUE 1 72 #define FALSE 0 73 #endif 74 75 #define MD_LOG(msg) \ 76 do { \ 77 if(ctx->parser.debug_log != NULL) \ 78 ctx->parser.debug_log((msg), ctx->userdata); \ 79 } while(0) 80 81 #ifdef DEBUG 82 #define MD_ASSERT(cond) \ 83 do { \ 84 if(!(cond)) { \ 85 MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": " \ 86 "Assertion '" STRINGIZE(cond) "' failed."); \ 87 exit(1); \ 88 } \ 89 } while(0) 90 91 #define MD_UNREACHABLE() MD_ASSERT(1 == 0) 92 #else 93 #ifdef __GNUC__ 94 #define MD_ASSERT(cond) do { if(!(cond)) __builtin_unreachable(); } while(0) 95 #define MD_UNREACHABLE() do { __builtin_unreachable(); } while(0) 96 #elif defined _MSC_VER && _MSC_VER > 120 97 #define MD_ASSERT(cond) do { __assume(cond); } while(0) 98 #define MD_UNREACHABLE() do { __assume(0); } while(0) 99 #else 100 #define MD_ASSERT(cond) do {} while(0) 101 #define MD_UNREACHABLE() do {} while(0) 102 #endif 103 #endif 104 105 /* For falling through case labels in switch statements. */ 106 #if defined __clang__ && __clang_major__ >= 12 107 #define MD_FALLTHROUGH() __attribute__((fallthrough)) 108 #elif defined __GNUC__ && __GNUC__ >= 7 109 #define MD_FALLTHROUGH() __attribute__((fallthrough)) 110 #else 111 #define MD_FALLTHROUGH() ((void)0) 112 #endif 113 114 /* Suppress "unused parameter" warnings. */ 115 #define MD_UNUSED(x) ((void)x) 116 117 118 /************************ 119 *** Internal Types *** 120 ************************/ 121 122 /* These are omnipresent so lets save some typing. 
*/ 123 #define CHAR MD_CHAR 124 #define SZ MD_SIZE 125 #define OFF MD_OFFSET 126 127 typedef struct MD_MARK_tag MD_MARK; 128 typedef struct MD_BLOCK_tag MD_BLOCK; 129 typedef struct MD_CONTAINER_tag MD_CONTAINER; 130 typedef struct MD_REF_DEF_tag MD_REF_DEF; 131 132 133 /* During analyzes of inline marks, we need to manage some "mark chains", 134 * of (yet unresolved) openers. This structure holds start/end of the chain. 135 * The chain internals are then realized through MD_MARK::prev and ::next. 136 */ 137 typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN; 138 struct MD_MARKCHAIN_tag { 139 int head; /* Index of first mark in the chain, or -1 if empty. */ 140 int tail; /* Index of last mark in the chain, or -1 if empty. */ 141 }; 142 143 /* Context propagated through all the parsing. */ 144 typedef struct MD_CTX_tag MD_CTX; 145 struct MD_CTX_tag { 146 /* Immutable stuff (parameters of md_parse()). */ 147 const CHAR* text; 148 SZ size; 149 MD_PARSER parser; 150 void* userdata; 151 152 /* When this is true, it allows some optimizations. */ 153 int doc_ends_with_newline; 154 155 /* Helper temporary growing buffer. */ 156 CHAR* buffer; 157 unsigned alloc_buffer; 158 159 /* Reference definitions. */ 160 MD_REF_DEF* ref_defs; 161 int n_ref_defs; 162 int alloc_ref_defs; 163 void** ref_def_hashtable; 164 int ref_def_hashtable_size; 165 166 /* Stack of inline/span markers. 167 * This is only used for parsing a single block contents but by storing it 168 * here we may reuse the stack for subsequent blocks; i.e. we have fewer 169 * (re)allocations. */ 170 MD_MARK* marks; 171 int n_marks; 172 int alloc_marks; 173 174 #if defined MD4C_USE_UTF16 175 char mark_char_map[128]; 176 #else 177 char mark_char_map[256]; 178 #endif 179 180 /* For resolving of inline spans. 
*/ 181 MD_MARKCHAIN mark_chains[17]; 182 #define PTR_CHAIN (ctx->mark_chains[0]) 183 #define TABLECELLBOUNDARIES (ctx->mark_chains[1]) 184 #define ASTERISK_OPENERS_extraword_mod3_0 (ctx->mark_chains[2]) 185 #define ASTERISK_OPENERS_extraword_mod3_1 (ctx->mark_chains[3]) 186 #define ASTERISK_OPENERS_extraword_mod3_2 (ctx->mark_chains[4]) 187 #define ASTERISK_OPENERS_intraword_mod3_0 (ctx->mark_chains[5]) 188 #define ASTERISK_OPENERS_intraword_mod3_1 (ctx->mark_chains[6]) 189 #define ASTERISK_OPENERS_intraword_mod3_2 (ctx->mark_chains[7]) 190 #define UNDERSCORE_OPENERS (ctx->mark_chains[8]) 191 #define TILDE_OPENERS_1 (ctx->mark_chains[9]) 192 #define TILDE_OPENERS_2 (ctx->mark_chains[10]) 193 #define BRACKET_OPENERS (ctx->mark_chains[11]) 194 #define DOLLAR_OPENERS (ctx->mark_chains[12]) 195 #define FAINT_OPENERS (ctx->mark_chains[13]) 196 #define INVERSE_OPENERS (ctx->mark_chains[14]) 197 #define CONCEAL_OPENERS (ctx->mark_chains[15]) 198 #define BLINK_OPENERS (ctx->mark_chains[16]) 199 #define OPENERS_CHAIN_FIRST 1 200 #define OPENERS_CHAIN_LAST 16 201 202 int n_table_cell_boundaries; 203 204 /* For resolving links. */ 205 int unresolved_link_head; 206 int unresolved_link_tail; 207 208 /* For resolving raw HTML. */ 209 OFF html_comment_horizon; 210 OFF html_proc_instr_horizon; 211 OFF html_decl_horizon; 212 OFF html_cdata_horizon; 213 214 /* For block analysis. 215 * Notes: 216 * -- It holds MD_BLOCK as well as MD_LINE structures. After each 217 * MD_BLOCK, its (multiple) MD_LINE(s) follow. 218 * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used 219 * instead of MD_LINE(s). 220 */ 221 void* block_bytes; 222 MD_BLOCK* current_block; 223 int n_block_bytes; 224 int alloc_block_bytes; 225 226 /* For container block analysis. */ 227 MD_CONTAINER* containers; 228 int n_containers; 229 int alloc_containers; 230 231 /* Minimal indentation to call the block "indented code block". 
*/ 232 unsigned code_indent_offset; 233 234 /* Contextual info for line analysis. */ 235 SZ code_fence_length; /* For checking closing fence length. */ 236 int html_block_type; /* For checking closing raw HTML condition. */ 237 int last_line_has_list_loosening_effect; 238 int last_list_item_starts_with_two_blank_lines; 239 }; 240 241 enum MD_LINETYPE_tag { 242 MD_LINE_BLANK, 243 MD_LINE_HR, 244 MD_LINE_ATXHEADER, 245 MD_LINE_SETEXTHEADER, 246 MD_LINE_SETEXTUNDERLINE, 247 MD_LINE_INDENTEDCODE, 248 MD_LINE_FENCEDCODE, 249 MD_LINE_HTML, 250 MD_LINE_TEXT, 251 MD_LINE_TABLE, 252 MD_LINE_TABLEUNDERLINE 253 }; 254 typedef enum MD_LINETYPE_tag MD_LINETYPE; 255 256 typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS; 257 struct MD_LINE_ANALYSIS_tag { 258 MD_LINETYPE type : 16; 259 unsigned data : 16; 260 OFF beg; 261 OFF end; 262 unsigned indent; /* Indentation level. */ 263 }; 264 265 typedef struct MD_LINE_tag MD_LINE; 266 struct MD_LINE_tag { 267 OFF beg; 268 OFF end; 269 }; 270 271 typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE; 272 struct MD_VERBATIMLINE_tag { 273 OFF beg; 274 OFF end; 275 OFF indent; 276 }; 277 278 279 /***************** 280 *** Helpers *** 281 *****************/ 282 283 /* Character accessors. */ 284 #define CH(off) (ctx->text[(off)]) 285 #define STR(off) (ctx->text + (off)) 286 287 /* Character classification. 288 * Note we assume ASCII compatibility of code points < 128 here. 
*/ 289 #define ISIN_(ch, ch_min, ch_max) ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max)) 290 #define ISANYOF_(ch, palette) ((ch) != _T('\0') && md_strchr((palette), (ch)) != NULL) 291 #define ISANYOF2_(ch, ch1, ch2) ((ch) == (ch1) || (ch) == (ch2)) 292 #define ISANYOF3_(ch, ch1, ch2, ch3) ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3)) 293 #define ISASCII_(ch) ((unsigned)(ch) <= 127) 294 #define ISBLANK_(ch) (ISANYOF2_((ch), _T(' '), _T('\t'))) 295 #define ISNEWLINE_(ch) (ISANYOF2_((ch), _T('\r'), _T('\n'))) 296 #define ISWHITESPACE_(ch) (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f'))) 297 #define ISCNTRL_(ch) ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127) 298 #define ISPUNCT_(ch) (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126)) 299 #define ISUPPER_(ch) (ISIN_(ch, _T('A'), _T('Z'))) 300 #define ISLOWER_(ch) (ISIN_(ch, _T('a'), _T('z'))) 301 #define ISALPHA_(ch) (ISUPPER_(ch) || ISLOWER_(ch)) 302 #define ISDIGIT_(ch) (ISIN_(ch, _T('0'), _T('9'))) 303 #define ISXDIGIT_(ch) (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f'))) 304 #define ISALNUM_(ch) (ISALPHA_(ch) || ISDIGIT_(ch)) 305 306 #define ISANYOF(off, palette) ISANYOF_(CH(off), (palette)) 307 #define ISANYOF2(off, ch1, ch2) ISANYOF2_(CH(off), (ch1), (ch2)) 308 #define ISANYOF3(off, ch1, ch2, ch3) ISANYOF3_(CH(off), (ch1), (ch2), (ch3)) 309 #define ISASCII(off) ISASCII_(CH(off)) 310 #define ISBLANK(off) ISBLANK_(CH(off)) 311 #define ISNEWLINE(off) ISNEWLINE_(CH(off)) 312 #define ISWHITESPACE(off) ISWHITESPACE_(CH(off)) 313 #define ISCNTRL(off) ISCNTRL_(CH(off)) 314 #define ISPUNCT(off) ISPUNCT_(CH(off)) 315 #define ISUPPER(off) ISUPPER_(CH(off)) 316 #define ISLOWER(off) ISLOWER_(CH(off)) 317 #define ISALPHA(off) ISALPHA_(CH(off)) 318 #define ISDIGIT(off) ISDIGIT_(CH(off)) 319 #define ISXDIGIT(off) ISXDIGIT_(CH(off)) 320 #define ISALNUM(off) ISALNUM_(CH(off)) 321 322 323 #if defined MD4C_USE_UTF16 324 #define md_strchr wcschr 325 
#else 326 #define md_strchr strchr 327 #endif 328 329 330 /* Case insensitive check of string equality. */ 331 static inline int 332 md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n) 333 { 334 OFF i; 335 for(i = 0; i < n; i++) { 336 CHAR ch1 = s1[i]; 337 CHAR ch2 = s2[i]; 338 339 if(ISLOWER_(ch1)) 340 ch1 += ('A'-'a'); 341 if(ISLOWER_(ch2)) 342 ch2 += ('A'-'a'); 343 if(ch1 != ch2) 344 return FALSE; 345 } 346 return TRUE; 347 } 348 349 static inline int 350 md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n) 351 { 352 return memcmp(s1, s2, n * sizeof(CHAR)) == 0; 353 } 354 355 static int 356 md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size) 357 { 358 OFF off = 0; 359 int ret = 0; 360 361 while(1) { 362 while(off < size && str[off] != _T('\0')) 363 off++; 364 365 if(off > 0) { 366 ret = ctx->parser.text(type, str, off, ctx->userdata); 367 if(ret != 0) 368 return ret; 369 370 str += off; 371 size -= off; 372 off = 0; 373 } 374 375 if(off >= size) 376 return 0; 377 378 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata); 379 if(ret != 0) 380 return ret; 381 off++; 382 } 383 } 384 385 386 #define MD_CHECK(func) \ 387 do { \ 388 ret = (func); \ 389 if(ret < 0) \ 390 goto abort; \ 391 } while(0) 392 393 394 #define MD_TEMP_BUFFER(sz) \ 395 do { \ 396 if(sz > ctx->alloc_buffer) { \ 397 CHAR* new_buffer; \ 398 SZ new_size = ((sz) + (sz) / 2 + 128) & ~127; \ 399 \ 400 new_buffer = realloc(ctx->buffer, new_size); \ 401 if(new_buffer == NULL) { \ 402 MD_LOG("realloc() failed."); \ 403 ret = -1; \ 404 goto abort; \ 405 } \ 406 \ 407 ctx->buffer = new_buffer; \ 408 ctx->alloc_buffer = new_size; \ 409 } \ 410 } while(0) 411 412 413 #define MD_ENTER_BLOCK(type, arg) \ 414 do { \ 415 ret = ctx->parser.enter_block((type), (arg), ctx->userdata); \ 416 if(ret != 0) { \ 417 MD_LOG("Aborted from enter_block() callback."); \ 418 goto abort; \ 419 } \ 420 } while(0) 421 422 #define MD_LEAVE_BLOCK(type, arg) \ 423 do { \ 424 ret = 
ctx->parser.leave_block((type), (arg), ctx->userdata); \ 425 if(ret != 0) { \ 426 MD_LOG("Aborted from leave_block() callback."); \ 427 goto abort; \ 428 } \ 429 } while(0) 430 431 #define MD_ENTER_SPAN(type, arg) \ 432 do { \ 433 ret = ctx->parser.enter_span((type), (arg), ctx->userdata); \ 434 if(ret != 0) { \ 435 MD_LOG("Aborted from enter_span() callback."); \ 436 goto abort; \ 437 } \ 438 } while(0) 439 440 #define MD_LEAVE_SPAN(type, arg) \ 441 do { \ 442 ret = ctx->parser.leave_span((type), (arg), ctx->userdata); \ 443 if(ret != 0) { \ 444 MD_LOG("Aborted from leave_span() callback."); \ 445 goto abort; \ 446 } \ 447 } while(0) 448 449 #define MD_TEXT(type, str, size) \ 450 do { \ 451 if(size > 0) { \ 452 ret = ctx->parser.text((type), (str), (size), ctx->userdata); \ 453 if(ret != 0) { \ 454 MD_LOG("Aborted from text() callback."); \ 455 goto abort; \ 456 } \ 457 } \ 458 } while(0) 459 460 #define MD_TEXT_INSECURE(type, str, size) \ 461 do { \ 462 if(size > 0) { \ 463 ret = md_text_with_null_replacement(ctx, type, str, size); \ 464 if(ret != 0) { \ 465 MD_LOG("Aborted from text() callback."); \ 466 goto abort; \ 467 } \ 468 } \ 469 } while(0) 470 471 472 /* If the offset falls into a gap between line, we return the following 473 * line. 
*/ 474 static const MD_LINE* 475 md_lookup_line(OFF off, const MD_LINE* lines, int n_lines) 476 { 477 int lo, hi; 478 int pivot; 479 const MD_LINE* line; 480 481 lo = 0; 482 hi = n_lines - 1; 483 while(lo <= hi) { 484 pivot = (lo + hi) / 2; 485 line = &lines[pivot]; 486 487 if(off < line->beg) { 488 hi = pivot - 1; 489 if(hi < 0 || lines[hi].end <= off) 490 return line; 491 } else if(off > line->end) { 492 lo = pivot + 1; 493 } else { 494 return line; 495 } 496 } 497 498 return NULL; 499 } 500 501 502 /************************* 503 *** Unicode Support *** 504 *************************/ 505 506 typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO; 507 struct MD_UNICODE_FOLD_INFO_tag { 508 unsigned codepoints[3]; 509 unsigned n_codepoints; 510 }; 511 512 513 #if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8 514 /* Binary search over sorted "map" of codepoints. Consecutive sequences 515 * of codepoints may be encoded in the map by just using the 516 * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000). 517 * 518 * Returns index of the found record in the map (in the case of ranges, 519 * the minimal value is used); or -1 on failure. */ 520 static int 521 md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size) 522 { 523 int beg, end; 524 int pivot_beg, pivot_end; 525 526 beg = 0; 527 end = (int) map_size-1; 528 while(beg <= end) { 529 /* Pivot may be a range, not just a single value. 
*/ 530 pivot_beg = pivot_end = (beg + end) / 2; 531 if(map[pivot_end] & 0x40000000) 532 pivot_end++; 533 if(map[pivot_beg] & 0x80000000) 534 pivot_beg--; 535 536 if(codepoint < (map[pivot_beg] & 0x00ffffff)) 537 end = pivot_beg - 1; 538 else if(codepoint > (map[pivot_end] & 0x00ffffff)) 539 beg = pivot_end + 1; 540 else 541 return pivot_beg; 542 } 543 544 return -1; 545 } 546 547 static int 548 md_is_unicode_whitespace__(unsigned codepoint) 549 { 550 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 551 #define S(cp) (cp) 552 /* Unicode "Zs" category. 553 * (generated by scripts/build_whitespace_map.py) */ 554 static const unsigned WHITESPACE_MAP[] = { 555 S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000) 556 }; 557 #undef R 558 #undef S 559 560 /* The ASCII ones are the most frequently used ones, also CommonMark 561 * specification requests few more in this range. */ 562 if(codepoint <= 0x7f) 563 return ISWHITESPACE_(codepoint); 564 565 return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0); 566 } 567 568 static int 569 md_is_unicode_punct__(unsigned codepoint) 570 { 571 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 572 #define S(cp) (cp) 573 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
574 * (generated by scripts/build_punct_map.py) */ 575 static const unsigned PUNCT_MAP[] = { 576 R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040), 577 R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7), 578 S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0), 579 S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f), 580 R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e), 581 R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f), 582 R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4), 583 R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c), 584 R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a), 585 R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60), 586 R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027), 587 R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e), 588 R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef), 589 R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70), 590 R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f), 591 S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e), 592 R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f), 593 S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1), 594 S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), 
R(0xfe54,0xfe61), S(0xfe63), S(0xfe68), 595 R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b), 596 R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102), 597 S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f), 598 R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59), 599 R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175), 600 R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9), 601 R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643), 602 R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46), 603 R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), 604 S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44), 605 R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f) 606 }; 607 #undef R 608 #undef S 609 610 /* The ASCII ones are the most frequently used ones, also CommonMark 611 * specification requests few more in this range. */ 612 if(codepoint <= 0x7f) 613 return ISPUNCT_(codepoint); 614 615 return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0); 616 } 617 618 static void 619 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) 620 { 621 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 622 #define S(cp) (cp) 623 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
624 * (generated by scripts/build_folding_map.py) */ 625 static const unsigned FOLD_MAP_1[] = { 626 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136), 627 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182), 628 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190), 629 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f), 630 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2), 631 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8), 632 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7), 633 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241), 634 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f), 635 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab), 636 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1), 637 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f), 638 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e), 639 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81), 640 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba), 641 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d), 642 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f), 643 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8), 644 S(0x1fd9), S(0x1fda), 
S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8), 645 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183), 646 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b), 647 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2), 648 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e), 649 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792), 650 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2), 651 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5), 652 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2), 653 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921) 654 }; 655 static const unsigned FOLD_MAP_1_DATA[] = { 656 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148, 657 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257, 658 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275, 659 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292, 660 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3, 661 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242, 662 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af, 663 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0, 664 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 
0x037d, 0x0450, 0x045f, 665 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586, 666 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a, 667 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07, 668 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60, 669 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0, 670 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170, 671 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251, 672 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641, 673 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c, 674 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d, 675 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41, 676 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922, 677 0x1e943 678 }; 679 static const unsigned FOLD_MAP_2[] = { 680 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99), 681 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f), 682 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2), 683 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3), 684 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13), 685 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17) 686 }; 687 static const unsigned 
FOLD_MAP_2_DATA[] = { 688 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308, 689 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9, 690 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9, 691 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342, 692 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342, 693 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9, 694 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565, 695 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d 696 }; 697 static const unsigned FOLD_MAP_3[] = { 698 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3), 699 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04) 700 }; 701 static const unsigned FOLD_MAP_3_DATA[] = { 702 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301, 703 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300, 704 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301, 705 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c 706 }; 707 #undef R 708 #undef S 709 static const struct { 710 const unsigned* map; 711 const unsigned* data; 712 size_t map_size; 713 unsigned n_codepoints; 714 } FOLD_MAP_LIST[] = { 715 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 }, 716 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 }, 717 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 } 718 }; 719 720 int i; 721 722 /* Fast path for ASCII characters. 
*/ 723 if(codepoint <= 0x7f) { 724 info->codepoints[0] = codepoint; 725 if(ISUPPER_(codepoint)) 726 info->codepoints[0] += 'a' - 'A'; 727 info->n_codepoints = 1; 728 return; 729 } 730 731 /* Try to locate the codepoint in any of the maps. */ 732 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) { 733 int index; 734 735 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size); 736 if(index >= 0) { 737 /* Found the mapping. */ 738 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints; 739 const unsigned* map = FOLD_MAP_LIST[i].map; 740 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints); 741 742 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints); 743 info->n_codepoints = n_codepoints; 744 745 if(FOLD_MAP_LIST[i].map[index] != codepoint) { 746 /* The found mapping maps whole range of codepoints, 747 * i.e. we have to offset info->codepoints[0] accordingly. */ 748 if((map[index] & 0x00ffffff)+1 == codepoints[0]) { 749 /* Alternating type of the range. */ 750 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0); 751 } else { 752 /* Range to range kind of mapping. */ 753 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff)); 754 } 755 } 756 757 return; 758 } 759 } 760 761 /* No mapping found. Map the codepoint to itself. 
*/ 762 info->codepoints[0] = codepoint; 763 info->n_codepoints = 1; 764 } 765 #endif 766 767 768 #if defined MD4C_USE_UTF16 769 #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800) 770 #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00) 771 #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))) 772 773 static unsigned 774 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size) 775 { 776 if(IS_UTF16_SURROGATE_HI(str[0])) { 777 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) { 778 if(p_size != NULL) 779 *p_size = 2; 780 return UTF16_DECODE_SURROGATE(str[0], str[1]); 781 } 782 } 783 784 if(p_size != NULL) 785 *p_size = 1; 786 return str[0]; 787 } 788 789 static unsigned 790 md_decode_utf16le_before__(MD_CTX* ctx, OFF off) 791 { 792 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1))) 793 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1)); 794 795 return CH(off); 796 } 797 798 /* No whitespace uses surrogates, so no decoding needed here. 
*/ 799 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) 800 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off)) 801 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1)) 802 803 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL)) 804 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off)) 805 806 static inline int 807 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) 808 { 809 return md_decode_utf16le__(str+off, str_size-off, p_char_size); 810 } 811 #elif defined MD4C_USE_UTF8 812 #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) 813 #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) 814 #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) 815 #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) 816 #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) 817 818 static unsigned 819 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size) 820 { 821 if(!IS_UTF8_LEAD1(str[0])) { 822 if(IS_UTF8_LEAD2(str[0])) { 823 if(1 < str_size && IS_UTF8_TAIL(str[1])) { 824 if(p_size != NULL) 825 *p_size = 2; 826 827 return (((unsigned int)str[0] & 0x1f) << 6) | 828 (((unsigned int)str[1] & 0x3f) << 0); 829 } 830 } else if(IS_UTF8_LEAD3(str[0])) { 831 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) { 832 if(p_size != NULL) 833 *p_size = 3; 834 835 return (((unsigned int)str[0] & 0x0f) << 12) | 836 (((unsigned int)str[1] & 0x3f) << 6) | 837 (((unsigned int)str[2] & 0x3f) << 0); 838 } 839 } else if(IS_UTF8_LEAD4(str[0])) { 840 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) { 841 if(p_size != NULL) 842 *p_size = 4; 843 844 return (((unsigned int)str[0] & 0x07) << 18) | 845 (((unsigned int)str[1] & 0x3f) << 12) | 846 (((unsigned int)str[2] & 0x3f) << 6) | 847 (((unsigned 
int)str[3] & 0x3f) << 0); 848 } 849 } 850 } 851 852 if(p_size != NULL) 853 *p_size = 1; 854 return (unsigned) str[0]; 855 } 856 857 static unsigned 858 md_decode_utf8_before__(MD_CTX* ctx, OFF off) 859 { 860 if(!IS_UTF8_LEAD1(CH(off-1))) { 861 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 862 return (((unsigned int)CH(off-2) & 0x1f) << 6) | 863 (((unsigned int)CH(off-1) & 0x3f) << 0); 864 865 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 866 return (((unsigned int)CH(off-3) & 0x0f) << 12) | 867 (((unsigned int)CH(off-2) & 0x3f) << 6) | 868 (((unsigned int)CH(off-1) & 0x3f) << 0); 869 870 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 871 return (((unsigned int)CH(off-4) & 0x07) << 18) | 872 (((unsigned int)CH(off-3) & 0x3f) << 12) | 873 (((unsigned int)CH(off-2) & 0x3f) << 6) | 874 (((unsigned int)CH(off-1) & 0x3f) << 0); 875 } 876 877 return (unsigned) CH(off-1); 878 } 879 880 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) 881 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) 882 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off)) 883 884 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) 885 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) 886 887 static inline unsigned 888 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) 889 { 890 return md_decode_utf8__(str+off, str_size-off, p_char_size); 891 } 892 #else 893 #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint) 894 #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) 895 #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) 896 897 #define ISUNICODEPUNCT(off) ISPUNCT(off) 898 #define 
ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) 899 900 static inline void 901 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) 902 { 903 info->codepoints[0] = codepoint; 904 if(ISUPPER_(codepoint)) 905 info->codepoints[0] += 'a' - 'A'; 906 info->n_codepoints = 1; 907 } 908 909 static inline unsigned 910 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size) 911 { 912 *p_size = 1; 913 return (unsigned) str[off]; 914 } 915 #endif 916 917 918 /************************************* 919 *** Helper string manipulations *** 920 *************************************/ 921 922 /* Fill buffer with copy of the string between 'beg' and 'end' but replace any 923 * line breaks with given replacement character. 924 * 925 * NOTE: Caller is responsible to make sure the buffer is large enough. 926 * (Given the output is always shorter then input, (end - beg) is good idea 927 * what the caller should allocate.) 928 */ 929 static void 930 md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, 931 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size) 932 { 933 CHAR* ptr = buffer; 934 int line_index = 0; 935 OFF off = beg; 936 937 MD_UNUSED(n_lines); 938 939 while(1) { 940 const MD_LINE* line = &lines[line_index]; 941 OFF line_end = line->end; 942 if(end < line_end) 943 line_end = end; 944 945 while(off < line_end) { 946 *ptr = CH(off); 947 ptr++; 948 off++; 949 } 950 951 if(off >= end) { 952 *p_size = (MD_SIZE)(ptr - buffer); 953 return; 954 } 955 956 *ptr = line_break_replacement_char; 957 ptr++; 958 959 line_index++; 960 off = lines[line_index].beg; 961 } 962 } 963 964 /* Wrapper of md_merge_lines() which allocates new buffer for the output string. 
965 */ 966 static int 967 md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, 968 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size) 969 { 970 CHAR* buffer; 971 972 buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg)); 973 if(buffer == NULL) { 974 MD_LOG("malloc() failed."); 975 return -1; 976 } 977 978 md_merge_lines(ctx, beg, end, lines, n_lines, 979 line_break_replacement_char, buffer, p_size); 980 981 *p_str = buffer; 982 return 0; 983 } 984 985 static OFF 986 md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size) 987 { 988 SZ char_size; 989 unsigned codepoint; 990 991 while(off < size) { 992 codepoint = md_decode_unicode(label, off, size, &char_size); 993 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off])) 994 break; 995 off += char_size; 996 } 997 998 return off; 999 } 1000 1001 1002 /****************************** 1003 *** Recognizing raw HTML *** 1004 ******************************/ 1005 1006 /* md_is_html_tag() may be called when processing inlines (inline raw HTML) 1007 * or when breaking document to blocks (checking for start of HTML block type 7). 1008 * 1009 * When breaking document to blocks, we do not yet know line boundaries, but 1010 * in that case the whole tag has to live on a single line. We distinguish this 1011 * by n_lines == 0. 1012 */ 1013 static int 1014 md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1015 { 1016 int attr_state; 1017 OFF off = beg; 1018 OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size; 1019 int i = 0; 1020 1021 MD_ASSERT(CH(beg) == _T('<')); 1022 1023 if(off + 1 >= line_end) 1024 return FALSE; 1025 off++; 1026 1027 /* For parsing attributes, we need a little state automaton below. 1028 * State -1: no attributes are allowed. 1029 * State 0: attribute could follow after some whitespace. 1030 * State 1: after a whitespace (attribute name may follow). 1031 * State 2: after attribute name ('=' MAY follow). 
1032 * State 3: after '=' (value specification MUST follow). 1033 * State 41: in middle of unquoted attribute value. 1034 * State 42: in middle of single-quoted attribute value. 1035 * State 43: in middle of double-quoted attribute value. 1036 */ 1037 attr_state = 0; 1038 1039 if(CH(off) == _T('/')) { 1040 /* Closer tag "</ ... >". No attributes may be present. */ 1041 attr_state = -1; 1042 off++; 1043 } 1044 1045 /* Tag name */ 1046 if(off >= line_end || !ISALPHA(off)) 1047 return FALSE; 1048 off++; 1049 while(off < line_end && (ISALNUM(off) || CH(off) == _T('-'))) 1050 off++; 1051 1052 /* (Optional) attributes (if not closer), (optional) '/' (if not closer) 1053 * and final '>'. */ 1054 while(1) { 1055 while(off < line_end && !ISNEWLINE(off)) { 1056 if(attr_state > 40) { 1057 if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) { 1058 attr_state = 0; 1059 off--; /* Put the char back for re-inspection in the new state. */ 1060 } else if(attr_state == 42 && CH(off) == _T('\'')) { 1061 attr_state = 0; 1062 } else if(attr_state == 43 && CH(off) == _T('"')) { 1063 attr_state = 0; 1064 } 1065 off++; 1066 } else if(ISWHITESPACE(off)) { 1067 if(attr_state == 0) 1068 attr_state = 1; 1069 off++; 1070 } else if(attr_state <= 2 && CH(off) == _T('>')) { 1071 /* End. */ 1072 goto done; 1073 } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) { 1074 /* End with digraph '/>' */ 1075 off++; 1076 goto done; 1077 } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) { 1078 off++; 1079 /* Attribute name */ 1080 while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-")))) 1081 off++; 1082 attr_state = 2; 1083 } else if(attr_state == 2 && CH(off) == _T('=')) { 1084 /* Attribute assignment sign */ 1085 off++; 1086 attr_state = 3; 1087 } else if(attr_state == 3) { 1088 /* Expecting start of attribute value. 
*/ 1089 if(CH(off) == _T('"')) 1090 attr_state = 43; 1091 else if(CH(off) == _T('\'')) 1092 attr_state = 42; 1093 else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off)) 1094 attr_state = 41; 1095 else 1096 return FALSE; 1097 off++; 1098 } else { 1099 /* Anything unexpected. */ 1100 return FALSE; 1101 } 1102 } 1103 1104 /* We have to be on a single line. See definition of start condition 1105 * of HTML block, type 7. */ 1106 if(n_lines == 0) 1107 return FALSE; 1108 1109 i++; 1110 if(i >= n_lines) 1111 return FALSE; 1112 1113 off = lines[i].beg; 1114 line_end = lines[i].end; 1115 1116 if(attr_state == 0 || attr_state == 41) 1117 attr_state = 1; 1118 1119 if(off >= max_end) 1120 return FALSE; 1121 } 1122 1123 done: 1124 if(off >= max_end) 1125 return FALSE; 1126 1127 *p_end = off+1; 1128 return TRUE; 1129 } 1130 1131 static int 1132 md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len, 1133 const MD_LINE* lines, int n_lines, 1134 OFF beg, OFF max_end, OFF* p_end, 1135 OFF* p_scan_horizon) 1136 { 1137 OFF off = beg; 1138 int i = 0; 1139 1140 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) { 1141 /* We have already scanned the range up to the max_end so we know 1142 * there is nothing to see. */ 1143 return FALSE; 1144 } 1145 1146 while(TRUE) { 1147 while(off + len <= lines[i].end && off + len <= max_end) { 1148 if(md_ascii_eq(STR(off), str, len)) { 1149 /* Success. */ 1150 *p_end = off + len; 1151 return TRUE; 1152 } 1153 off++; 1154 } 1155 1156 i++; 1157 if(off >= max_end || i >= n_lines) { 1158 /* Failure. 
*/ 1159 *p_scan_horizon = off; 1160 return FALSE; 1161 } 1162 1163 off = lines[i].beg; 1164 } 1165 } 1166 1167 static int 1168 md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1169 { 1170 OFF off = beg; 1171 1172 MD_ASSERT(CH(beg) == _T('<')); 1173 1174 if(off + 4 >= lines[0].end) 1175 return FALSE; 1176 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-')) 1177 return FALSE; 1178 off += 4; 1179 1180 /* ">" and "->" must not follow the opening. */ 1181 if(off < lines[0].end && CH(off) == _T('>')) 1182 return FALSE; 1183 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>')) 1184 return FALSE; 1185 1186 /* HTML comment must not contain "--", so we scan just for "--" instead 1187 * of "-->" and verify manually that '>' follows. */ 1188 if(md_scan_for_html_closer(ctx, _T("--"), 2, 1189 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon)) 1190 { 1191 if(*p_end < max_end && CH(*p_end) == _T('>')) { 1192 *p_end = *p_end + 1; 1193 return TRUE; 1194 } 1195 } 1196 1197 return FALSE; 1198 } 1199 1200 static int 1201 md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1202 { 1203 OFF off = beg; 1204 1205 if(off + 2 >= lines[0].end) 1206 return FALSE; 1207 if(CH(off+1) != _T('?')) 1208 return FALSE; 1209 off += 2; 1210 1211 return md_scan_for_html_closer(ctx, _T("?>"), 2, 1212 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon); 1213 } 1214 1215 static int 1216 md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1217 { 1218 OFF off = beg; 1219 1220 if(off + 2 >= lines[0].end) 1221 return FALSE; 1222 if(CH(off+1) != _T('!')) 1223 return FALSE; 1224 off += 2; 1225 1226 /* Declaration name. 
*/ 1227 if(off >= lines[0].end || !ISALPHA(off)) 1228 return FALSE; 1229 off++; 1230 while(off < lines[0].end && ISALPHA(off)) 1231 off++; 1232 if(off < lines[0].end && !ISWHITESPACE(off)) 1233 return FALSE; 1234 1235 return md_scan_for_html_closer(ctx, _T(">"), 1, 1236 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon); 1237 } 1238 1239 static int 1240 md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1241 { 1242 static const CHAR open_str[] = _T("<![CDATA["); 1243 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1; 1244 1245 OFF off = beg; 1246 1247 if(off + open_size >= lines[0].end) 1248 return FALSE; 1249 if(memcmp(STR(off), open_str, open_size) != 0) 1250 return FALSE; 1251 off += open_size; 1252 1253 if(lines[n_lines-1].end < max_end) 1254 max_end = lines[n_lines-1].end - 2; 1255 1256 return md_scan_for_html_closer(ctx, _T("]]>"), 3, 1257 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon); 1258 } 1259 1260 static int 1261 md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1262 { 1263 MD_ASSERT(CH(beg) == _T('<')); 1264 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) || 1265 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) || 1266 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) || 1267 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) || 1268 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end)); 1269 } 1270 1271 1272 /**************************** 1273 *** Recognizing Entity *** 1274 ****************************/ 1275 1276 static int 1277 md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1278 { 1279 OFF off = beg; 1280 MD_UNUSED(ctx); 1281 1282 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8) 1283 off++; 1284 1285 if(1 <= off - beg && off - beg <= 6) { 1286 *p_end = off; 1287 return TRUE; 1288 } else { 1289 
return FALSE; 1290 } 1291 } 1292 1293 static int 1294 md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1295 { 1296 OFF off = beg; 1297 MD_UNUSED(ctx); 1298 1299 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8) 1300 off++; 1301 1302 if(1 <= off - beg && off - beg <= 7) { 1303 *p_end = off; 1304 return TRUE; 1305 } else { 1306 return FALSE; 1307 } 1308 } 1309 1310 static int 1311 md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1312 { 1313 OFF off = beg; 1314 MD_UNUSED(ctx); 1315 1316 if(off < max_end && ISALPHA_(text[off])) 1317 off++; 1318 else 1319 return FALSE; 1320 1321 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48) 1322 off++; 1323 1324 if(2 <= off - beg && off - beg <= 48) { 1325 *p_end = off; 1326 return TRUE; 1327 } else { 1328 return FALSE; 1329 } 1330 } 1331 1332 static int 1333 md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1334 { 1335 int is_contents; 1336 OFF off = beg; 1337 1338 MD_ASSERT(text[off] == _T('&')); 1339 off++; 1340 1341 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X'))) 1342 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off); 1343 else if(off+1 < max_end && text[off] == _T('#')) 1344 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off); 1345 else 1346 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off); 1347 1348 if(is_contents && off < max_end && text[off] == _T(';')) { 1349 *p_end = off+1; 1350 return TRUE; 1351 } else { 1352 return FALSE; 1353 } 1354 } 1355 1356 static inline int 1357 md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) 1358 { 1359 return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end); 1360 } 1361 1362 1363 /****************************** 1364 *** Attribute Management *** 1365 ******************************/ 1366 1367 typedef struct 
MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD; 1368 struct MD_ATTRIBUTE_BUILD_tag { 1369 CHAR* text; 1370 MD_TEXTTYPE* substr_types; 1371 OFF* substr_offsets; 1372 int substr_count; 1373 int substr_alloc; 1374 MD_TEXTTYPE trivial_types[1]; 1375 OFF trivial_offsets[2]; 1376 }; 1377 1378 1379 #define MD_BUILD_ATTR_NO_ESCAPES 0x0001 1380 1381 static int 1382 md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build, 1383 MD_TEXTTYPE type, OFF off) 1384 { 1385 if(build->substr_count >= build->substr_alloc) { 1386 MD_TEXTTYPE* new_substr_types; 1387 OFF* new_substr_offsets; 1388 1389 build->substr_alloc = (build->substr_alloc > 0 1390 ? build->substr_alloc + build->substr_alloc / 2 1391 : 8); 1392 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types, 1393 build->substr_alloc * sizeof(MD_TEXTTYPE)); 1394 if(new_substr_types == NULL) { 1395 MD_LOG("realloc() failed."); 1396 return -1; 1397 } 1398 /* Note +1 to reserve space for final offset (== raw_size). */ 1399 new_substr_offsets = (OFF*) realloc(build->substr_offsets, 1400 (build->substr_alloc+1) * sizeof(OFF)); 1401 if(new_substr_offsets == NULL) { 1402 MD_LOG("realloc() failed."); 1403 free(new_substr_types); 1404 return -1; 1405 } 1406 1407 build->substr_types = new_substr_types; 1408 build->substr_offsets = new_substr_offsets; 1409 } 1410 1411 build->substr_types[build->substr_count] = type; 1412 build->substr_offsets[build->substr_count] = off; 1413 build->substr_count++; 1414 return 0; 1415 } 1416 1417 static void 1418 md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) 1419 { 1420 MD_UNUSED(ctx); 1421 1422 if(build->substr_alloc > 0) { 1423 free(build->text); 1424 free(build->substr_types); 1425 free(build->substr_offsets); 1426 } 1427 } 1428 1429 static int 1430 md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, 1431 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) 1432 { 1433 OFF raw_off, off; 1434 int is_trivial; 1435 int ret = 0; 1436 1437 memset(build, 0, 
sizeof(MD_ATTRIBUTE_BUILD)); 1438 1439 /* If there is no backslash and no ampersand, build trivial attribute 1440 * without any malloc(). */ 1441 is_trivial = TRUE; 1442 for(raw_off = 0; raw_off < raw_size; raw_off++) { 1443 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) { 1444 is_trivial = FALSE; 1445 break; 1446 } 1447 } 1448 1449 if(is_trivial) { 1450 build->text = (CHAR*) (raw_size ? raw_text : NULL); 1451 build->substr_types = build->trivial_types; 1452 build->substr_offsets = build->trivial_offsets; 1453 build->substr_count = 1; 1454 build->substr_alloc = 0; 1455 build->trivial_types[0] = MD_TEXT_NORMAL; 1456 build->trivial_offsets[0] = 0; 1457 build->trivial_offsets[1] = raw_size; 1458 off = raw_size; 1459 } else { 1460 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR)); 1461 if(build->text == NULL) { 1462 MD_LOG("malloc() failed."); 1463 goto abort; 1464 } 1465 1466 raw_off = 0; 1467 off = 0; 1468 1469 while(raw_off < raw_size) { 1470 if(raw_text[raw_off] == _T('\0')) { 1471 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off)); 1472 memcpy(build->text + off, raw_text + raw_off, 1); 1473 off++; 1474 raw_off++; 1475 continue; 1476 } 1477 1478 if(raw_text[raw_off] == _T('&')) { 1479 OFF ent_end; 1480 1481 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) { 1482 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off)); 1483 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off); 1484 off += ent_end - raw_off; 1485 raw_off = ent_end; 1486 continue; 1487 } 1488 } 1489 1490 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL) 1491 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off)); 1492 1493 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) && 1494 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size && 1495 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1]))) 1496 raw_off++; 1497 1498 build->text[off++] = 
raw_text[raw_off++]; 1499 } 1500 build->substr_offsets[build->substr_count] = off; 1501 } 1502 1503 attr->text = build->text; 1504 attr->size = off; 1505 attr->substr_offsets = build->substr_offsets; 1506 attr->substr_types = build->substr_types; 1507 return 0; 1508 1509 abort: 1510 md_free_attribute(ctx, build); 1511 return -1; 1512 } 1513 1514 1515 /********************************************* 1516 *** Dictionary of Reference Definitions *** 1517 *********************************************/ 1518 1519 #define MD_FNV1A_BASE 2166136261U 1520 #define MD_FNV1A_PRIME 16777619U 1521 1522 static inline unsigned 1523 md_fnv1a(unsigned base, const void* data, size_t n) 1524 { 1525 const unsigned char* buf = (const unsigned char*) data; 1526 unsigned hash = base; 1527 size_t i; 1528 1529 for(i = 0; i < n; i++) { 1530 hash ^= buf[i]; 1531 hash *= MD_FNV1A_PRIME; 1532 } 1533 1534 return hash; 1535 } 1536 1537 1538 struct MD_REF_DEF_tag { 1539 CHAR* label; 1540 CHAR* title; 1541 unsigned hash; 1542 SZ label_size; 1543 SZ title_size; 1544 OFF dest_beg; 1545 OFF dest_end; 1546 unsigned char label_needs_free : 1; 1547 unsigned char title_needs_free : 1; 1548 }; 1549 1550 /* Label equivalence is quite complicated with regards to whitespace and case 1551 * folding. This complicates computing a hash of it as well as direct comparison 1552 * of two labels. 
*/ 1553 1554 static unsigned 1555 md_link_label_hash(const CHAR* label, SZ size) 1556 { 1557 unsigned hash = MD_FNV1A_BASE; 1558 OFF off; 1559 unsigned codepoint; 1560 int is_whitespace = FALSE; 1561 1562 off = md_skip_unicode_whitespace(label, 0, size); 1563 while(off < size) { 1564 SZ char_size; 1565 1566 codepoint = md_decode_unicode(label, off, size, &char_size); 1567 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]); 1568 1569 if(is_whitespace) { 1570 codepoint = ' '; 1571 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned)); 1572 off = md_skip_unicode_whitespace(label, off, size); 1573 } else { 1574 MD_UNICODE_FOLD_INFO fold_info; 1575 1576 md_get_unicode_fold_info(codepoint, &fold_info); 1577 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned)); 1578 off += char_size; 1579 } 1580 } 1581 1582 return hash; 1583 } 1584 1585 static OFF 1586 md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size, 1587 MD_UNICODE_FOLD_INFO* fold_info) 1588 { 1589 unsigned codepoint; 1590 SZ char_size; 1591 1592 if(off >= size) { 1593 /* Treat end of a link label as a whitespace. */ 1594 goto whitespace; 1595 } 1596 1597 codepoint = md_decode_unicode(label, off, size, &char_size); 1598 off += char_size; 1599 if(ISUNICODEWHITESPACE_(codepoint)) { 1600 /* Treat all whitespace as equivalent */ 1601 goto whitespace; 1602 } 1603 1604 /* Get real folding info. 
*/ 1605 md_get_unicode_fold_info(codepoint, fold_info); 1606 return off; 1607 1608 whitespace: 1609 fold_info->codepoints[0] = _T(' '); 1610 fold_info->n_codepoints = 1; 1611 return md_skip_unicode_whitespace(label, off, size); 1612 } 1613 1614 static int 1615 md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size) 1616 { 1617 OFF a_off; 1618 OFF b_off; 1619 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 }; 1620 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 }; 1621 OFF a_fi_off = 0; 1622 OFF b_fi_off = 0; 1623 int cmp; 1624 1625 a_off = md_skip_unicode_whitespace(a_label, 0, a_size); 1626 b_off = md_skip_unicode_whitespace(b_label, 0, b_size); 1627 while(a_off < a_size || a_fi_off < a_fi.n_codepoints || 1628 b_off < b_size || b_fi_off < b_fi.n_codepoints) 1629 { 1630 /* If needed, load fold info for next char. */ 1631 if(a_fi_off >= a_fi.n_codepoints) { 1632 a_fi_off = 0; 1633 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi); 1634 } 1635 if(b_fi_off >= b_fi.n_codepoints) { 1636 b_fi_off = 0; 1637 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi); 1638 } 1639 1640 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off]; 1641 if(cmp != 0) 1642 return cmp; 1643 1644 a_fi_off++; 1645 b_fi_off++; 1646 } 1647 1648 return 0; 1649 } 1650 1651 typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST; 1652 struct MD_REF_DEF_LIST_tag { 1653 int n_ref_defs; 1654 int alloc_ref_defs; 1655 MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */ 1656 }; 1657 1658 static int 1659 md_ref_def_cmp(const void* a, const void* b) 1660 { 1661 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; 1662 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; 1663 1664 if(a_ref->hash < b_ref->hash) 1665 return -1; 1666 else if(a_ref->hash > b_ref->hash) 1667 return +1; 1668 else 1669 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size); 1670 } 1671 1672 static int 1673 
md_ref_def_cmp_for_sort(const void* a, const void* b) 1674 { 1675 int cmp; 1676 1677 cmp = md_ref_def_cmp(a, b); 1678 1679 /* Ensure stability of the sorting. */ 1680 if(cmp == 0) { 1681 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; 1682 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; 1683 1684 if(a_ref < b_ref) 1685 cmp = -1; 1686 else if(a_ref > b_ref) 1687 cmp = +1; 1688 else 1689 cmp = 0; 1690 } 1691 1692 return cmp; 1693 } 1694 1695 static int 1696 md_build_ref_def_hashtable(MD_CTX* ctx) 1697 { 1698 int i, j; 1699 1700 if(ctx->n_ref_defs == 0) 1701 return 0; 1702 1703 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4; 1704 ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*)); 1705 if(ctx->ref_def_hashtable == NULL) { 1706 MD_LOG("malloc() failed."); 1707 goto abort; 1708 } 1709 memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*)); 1710 1711 /* Each member of ctx->ref_def_hashtable[] can be: 1712 * -- NULL, 1713 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or 1714 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to 1715 * such MD_REF_DEFs. 1716 */ 1717 for(i = 0; i < ctx->n_ref_defs; i++) { 1718 MD_REF_DEF* def = &ctx->ref_defs[i]; 1719 void* bucket; 1720 MD_REF_DEF_LIST* list; 1721 1722 def->hash = md_link_label_hash(def->label, def->label_size); 1723 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size]; 1724 1725 if(bucket == NULL) { 1726 /* The bucket is empty. Make it just point to the def. */ 1727 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def; 1728 continue; 1729 } 1730 1731 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { 1732 /* The bucket already contains one ref. def. Lets see whether it 1733 * is the same label (ref. def. duplicate) or different one 1734 * (hash conflict). 
*/ 1735 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket; 1736 1737 if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) { 1738 /* Duplicate label: Ignore this ref. def. */ 1739 continue; 1740 } 1741 1742 /* Make the bucket complex, i.e. able to hold more ref. defs. */ 1743 list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*)); 1744 if(list == NULL) { 1745 MD_LOG("malloc() failed."); 1746 goto abort; 1747 } 1748 list->ref_defs[0] = old_def; 1749 list->ref_defs[1] = def; 1750 list->n_ref_defs = 2; 1751 list->alloc_ref_defs = 2; 1752 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; 1753 continue; 1754 } 1755 1756 /* Append the def to the complex bucket list. 1757 * 1758 * Note in this case we ignore potential duplicates to avoid expensive 1759 * iterating over the complex bucket. Below, we revisit all the complex 1760 * buckets and handle it more cheaply after the complex bucket contents 1761 * is sorted. */ 1762 list = (MD_REF_DEF_LIST*) bucket; 1763 if(list->n_ref_defs >= list->alloc_ref_defs) { 1764 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2; 1765 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list, 1766 sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*)); 1767 if(list_tmp == NULL) { 1768 MD_LOG("realloc() failed."); 1769 goto abort; 1770 } 1771 list = list_tmp; 1772 list->alloc_ref_defs = alloc_ref_defs; 1773 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; 1774 } 1775 1776 list->ref_defs[list->n_ref_defs] = def; 1777 list->n_ref_defs++; 1778 } 1779 1780 /* Sort the complex buckets so we can use bsearch() with them. 
*/ 1781 for(i = 0; i < ctx->ref_def_hashtable_size; i++) { 1782 void* bucket = ctx->ref_def_hashtable[i]; 1783 MD_REF_DEF_LIST* list; 1784 1785 if(bucket == NULL) 1786 continue; 1787 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) 1788 continue; 1789 1790 list = (MD_REF_DEF_LIST*) bucket; 1791 qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort); 1792 1793 /* Disable all duplicates in the complex bucket by forcing all such 1794 * records to point to the 1st such ref. def. I.e. no matter which 1795 * record is found during the lookup, it will always point to the right 1796 * ref. def. in ctx->ref_defs[]. */ 1797 for(j = 1; j < list->n_ref_defs; j++) { 1798 if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0) 1799 list->ref_defs[j] = list->ref_defs[j-1]; 1800 } 1801 } 1802 1803 return 0; 1804 1805 abort: 1806 return -1; 1807 } 1808 1809 static void 1810 md_free_ref_def_hashtable(MD_CTX* ctx) 1811 { 1812 if(ctx->ref_def_hashtable != NULL) { 1813 int i; 1814 1815 for(i = 0; i < ctx->ref_def_hashtable_size; i++) { 1816 void* bucket = ctx->ref_def_hashtable[i]; 1817 if(bucket == NULL) 1818 continue; 1819 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) 1820 continue; 1821 free(bucket); 1822 } 1823 1824 free(ctx->ref_def_hashtable); 1825 } 1826 } 1827 1828 static const MD_REF_DEF* 1829 md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size) 1830 { 1831 unsigned hash; 1832 void* bucket; 1833 1834 if(ctx->ref_def_hashtable_size == 0) 1835 return NULL; 1836 1837 hash = md_link_label_hash(label, label_size); 1838 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size]; 1839 1840 if(bucket == NULL) { 1841 return NULL; 1842 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { 1843 const MD_REF_DEF* def = (MD_REF_DEF*) bucket; 1844 1845 
if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0) 1846 return def; 1847 else 1848 return NULL; 1849 } else { 1850 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket; 1851 MD_REF_DEF key_buf; 1852 const MD_REF_DEF* key = &key_buf; 1853 const MD_REF_DEF** ret; 1854 1855 key_buf.label = (CHAR*) label; 1856 key_buf.label_size = label_size; 1857 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size); 1858 1859 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs, 1860 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp); 1861 if(ret != NULL) 1862 return *ret; 1863 else 1864 return NULL; 1865 } 1866 } 1867 1868 1869 /*************************** 1870 *** Recognizing Links *** 1871 ***************************/ 1872 1873 /* Note this code is partially shared between processing inlines and blocks 1874 * as reference definitions and links share some helper parser functions. 1875 */ 1876 1877 typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR; 1878 struct MD_LINK_ATTR_tag { 1879 OFF dest_beg; 1880 OFF dest_end; 1881 1882 CHAR* title; 1883 SZ title_size; 1884 int title_needs_free; 1885 }; 1886 1887 1888 static int 1889 md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, 1890 OFF* p_end, int* p_beg_line_index, int* p_end_line_index, 1891 OFF* p_contents_beg, OFF* p_contents_end) 1892 { 1893 OFF off = beg; 1894 OFF contents_beg = 0; 1895 OFF contents_end = 0; 1896 int line_index = 0; 1897 int len = 0; 1898 1899 if(CH(off) != _T('[')) 1900 return FALSE; 1901 off++; 1902 1903 while(1) { 1904 OFF line_end = lines[line_index].end; 1905 1906 while(off < line_end) { 1907 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { 1908 if(contents_end == 0) { 1909 contents_beg = off; 1910 *p_beg_line_index = line_index; 1911 } 1912 contents_end = off + 2; 1913 off += 2; 1914 } else if(CH(off) == _T('[')) { 1915 return FALSE; 1916 } else if(CH(off) == _T(']')) { 1917 if(contents_beg < contents_end) { 
1918 /* Success. */ 1919 *p_contents_beg = contents_beg; 1920 *p_contents_end = contents_end; 1921 *p_end = off+1; 1922 *p_end_line_index = line_index; 1923 return TRUE; 1924 } else { 1925 /* Link label must have some non-whitespace contents. */ 1926 return FALSE; 1927 } 1928 } else { 1929 unsigned codepoint; 1930 SZ char_size; 1931 1932 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size); 1933 if(!ISUNICODEWHITESPACE_(codepoint)) { 1934 if(contents_end == 0) { 1935 contents_beg = off; 1936 *p_beg_line_index = line_index; 1937 } 1938 contents_end = off + char_size; 1939 } 1940 1941 off += char_size; 1942 } 1943 1944 len++; 1945 if(len > 999) 1946 return FALSE; 1947 } 1948 1949 line_index++; 1950 len++; 1951 if(line_index < n_lines) 1952 off = lines[line_index].beg; 1953 else 1954 break; 1955 } 1956 1957 return FALSE; 1958 } 1959 1960 static int 1961 md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 1962 OFF* p_contents_beg, OFF* p_contents_end) 1963 { 1964 OFF off = beg; 1965 1966 if(off >= max_end || CH(off) != _T('<')) 1967 return FALSE; 1968 off++; 1969 1970 while(off < max_end) { 1971 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { 1972 off += 2; 1973 continue; 1974 } 1975 1976 if(ISNEWLINE(off) || CH(off) == _T('<')) 1977 return FALSE; 1978 1979 if(CH(off) == _T('>')) { 1980 /* Success. 
*/ 1981 *p_contents_beg = beg+1; 1982 *p_contents_end = off; 1983 *p_end = off+1; 1984 return TRUE; 1985 } 1986 1987 off++; 1988 } 1989 1990 return FALSE; 1991 } 1992 1993 static int 1994 md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 1995 OFF* p_contents_beg, OFF* p_contents_end) 1996 { 1997 OFF off = beg; 1998 int parenthesis_level = 0; 1999 2000 while(off < max_end) { 2001 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { 2002 off += 2; 2003 continue; 2004 } 2005 2006 if(ISWHITESPACE(off) || ISCNTRL(off)) 2007 break; 2008 2009 /* Link destination may include balanced pairs of unescaped '(' ')'. 2010 * Note we limit the maximal nesting level by 32 to protect us from 2011 * https://github.com/jgm/cmark/issues/214 */ 2012 if(CH(off) == _T('(')) { 2013 parenthesis_level++; 2014 if(parenthesis_level > 32) 2015 return FALSE; 2016 } else if(CH(off) == _T(')')) { 2017 if(parenthesis_level == 0) 2018 break; 2019 parenthesis_level--; 2020 } 2021 2022 off++; 2023 } 2024 2025 if(parenthesis_level != 0 || off == beg) 2026 return FALSE; 2027 2028 /* Success. */ 2029 *p_contents_beg = beg; 2030 *p_contents_end = off; 2031 *p_end = off; 2032 return TRUE; 2033 } 2034 2035 static inline int 2036 md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 2037 OFF* p_contents_beg, OFF* p_contents_end) 2038 { 2039 if(CH(beg) == _T('<')) 2040 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); 2041 else 2042 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); 2043 } 2044 2045 static int 2046 md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, 2047 OFF* p_end, int* p_beg_line_index, int* p_end_line_index, 2048 OFF* p_contents_beg, OFF* p_contents_end) 2049 { 2050 OFF off = beg; 2051 CHAR closer_char; 2052 int line_index = 0; 2053 2054 /* White space with up to one line break. 
*/ 2055 while(off < lines[line_index].end && ISWHITESPACE(off)) 2056 off++; 2057 if(off >= lines[line_index].end) { 2058 line_index++; 2059 if(line_index >= n_lines) 2060 return FALSE; 2061 off = lines[line_index].beg; 2062 } 2063 if(off == beg) 2064 return FALSE; 2065 2066 *p_beg_line_index = line_index; 2067 2068 /* First char determines how to detect end of it. */ 2069 switch(CH(off)) { 2070 case _T('"'): closer_char = _T('"'); break; 2071 case _T('\''): closer_char = _T('\''); break; 2072 case _T('('): closer_char = _T(')'); break; 2073 default: return FALSE; 2074 } 2075 off++; 2076 2077 *p_contents_beg = off; 2078 2079 while(line_index < n_lines) { 2080 OFF line_end = lines[line_index].end; 2081 2082 while(off < line_end) { 2083 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { 2084 off++; 2085 } else if(CH(off) == closer_char) { 2086 /* Success. */ 2087 *p_contents_end = off; 2088 *p_end = off+1; 2089 *p_end_line_index = line_index; 2090 return TRUE; 2091 } else if(closer_char == _T(')') && CH(off) == _T('(')) { 2092 /* ()-style title cannot contain (unescaped '(')) */ 2093 return FALSE; 2094 } 2095 2096 off++; 2097 } 2098 2099 line_index++; 2100 } 2101 2102 return FALSE; 2103 } 2104 2105 /* Returns 0 if it is not a reference definition. 2106 * 2107 * Returns N > 0 if it is a reference definition. N then corresponds to the 2108 * number of lines forming it). In this case the definition is stored for 2109 * resolving any links referring to it. 2110 * 2111 * Returns -1 in case of an error (out of memory). 
2112 */ 2113 static int 2114 md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) 2115 { 2116 OFF label_contents_beg; 2117 OFF label_contents_end; 2118 int label_contents_line_index = -1; 2119 int label_is_multiline = FALSE; 2120 OFF dest_contents_beg; 2121 OFF dest_contents_end; 2122 OFF title_contents_beg; 2123 OFF title_contents_end; 2124 int title_contents_line_index; 2125 int title_is_multiline = FALSE; 2126 OFF off; 2127 int line_index = 0; 2128 int tmp_line_index; 2129 MD_REF_DEF* def = NULL; 2130 int ret = 0; 2131 2132 /* Link label. */ 2133 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg, 2134 &off, &label_contents_line_index, &line_index, 2135 &label_contents_beg, &label_contents_end)) 2136 return FALSE; 2137 label_is_multiline = (label_contents_line_index != line_index); 2138 2139 /* Colon. */ 2140 if(off >= lines[line_index].end || CH(off) != _T(':')) 2141 return FALSE; 2142 off++; 2143 2144 /* Optional white space with up to one line break. */ 2145 while(off < lines[line_index].end && ISWHITESPACE(off)) 2146 off++; 2147 if(off >= lines[line_index].end) { 2148 line_index++; 2149 if(line_index >= n_lines) 2150 return FALSE; 2151 off = lines[line_index].beg; 2152 } 2153 2154 /* Link destination. */ 2155 if(!md_is_link_destination(ctx, off, lines[line_index].end, 2156 &off, &dest_contents_beg, &dest_contents_end)) 2157 return FALSE; 2158 2159 /* (Optional) title. Note we interpret it as an title only if nothing 2160 * more follows on its last line. */ 2161 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, 2162 &off, &title_contents_line_index, &tmp_line_index, 2163 &title_contents_beg, &title_contents_end) 2164 && off >= lines[line_index + tmp_line_index].end) 2165 { 2166 title_is_multiline = (tmp_line_index != title_contents_line_index); 2167 title_contents_line_index += line_index; 2168 line_index += tmp_line_index; 2169 } else { 2170 /* Not a title. 
*/ 2171 title_is_multiline = FALSE; 2172 title_contents_beg = off; 2173 title_contents_end = off; 2174 title_contents_line_index = 0; 2175 } 2176 2177 /* Nothing more can follow on the last line. */ 2178 if(off < lines[line_index].end) 2179 return FALSE; 2180 2181 /* So, it _is_ a reference definition. Remember it. */ 2182 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) { 2183 MD_REF_DEF* new_defs; 2184 2185 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 2186 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2 2187 : 16); 2188 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF)); 2189 if(new_defs == NULL) { 2190 MD_LOG("realloc() failed."); 2191 goto abort; 2192 } 2193 2194 ctx->ref_defs = new_defs; 2195 } 2196 def = &ctx->ref_defs[ctx->n_ref_defs]; 2197 memset(def, 0, sizeof(MD_REF_DEF)); 2198 2199 if(label_is_multiline) { 2200 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end, 2201 lines + label_contents_line_index, n_lines - label_contents_line_index, 2202 _T(' '), &def->label, &def->label_size)); 2203 def->label_needs_free = TRUE; 2204 } else { 2205 def->label = (CHAR*) STR(label_contents_beg); 2206 def->label_size = label_contents_end - label_contents_beg; 2207 } 2208 2209 if(title_is_multiline) { 2210 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, 2211 lines + title_contents_line_index, n_lines - title_contents_line_index, 2212 _T('\n'), &def->title, &def->title_size)); 2213 def->title_needs_free = TRUE; 2214 } else { 2215 def->title = (CHAR*) STR(title_contents_beg); 2216 def->title_size = title_contents_end - title_contents_beg; 2217 } 2218 2219 def->dest_beg = dest_contents_beg; 2220 def->dest_end = dest_contents_end; 2221 2222 /* Success. */ 2223 ctx->n_ref_defs++; 2224 return line_index + 1; 2225 2226 abort: 2227 /* Failure. 
*/ 2228 if(def != NULL && def->label_needs_free) 2229 free(def->label); 2230 if(def != NULL && def->title_needs_free) 2231 free(def->title); 2232 return ret; 2233 } 2234 2235 static int 2236 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 2237 OFF beg, OFF end, MD_LINK_ATTR* attr) 2238 { 2239 const MD_REF_DEF* def; 2240 const MD_LINE* beg_line; 2241 int is_multiline; 2242 CHAR* label; 2243 SZ label_size; 2244 int ret; 2245 2246 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!')); 2247 MD_ASSERT(CH(end-1) == _T(']')); 2248 2249 beg += (CH(beg) == _T('!') ? 2 : 1); 2250 end--; 2251 2252 /* Find lines corresponding to the beg and end positions. */ 2253 beg_line = md_lookup_line(beg, lines, n_lines); 2254 is_multiline = (end > beg_line->end); 2255 2256 if(is_multiline) { 2257 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line, 2258 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size)); 2259 } else { 2260 label = (CHAR*) STR(beg); 2261 label_size = end - beg; 2262 } 2263 2264 def = md_lookup_ref_def(ctx, label, label_size); 2265 if(def != NULL) { 2266 attr->dest_beg = def->dest_beg; 2267 attr->dest_end = def->dest_end; 2268 attr->title = def->title; 2269 attr->title_size = def->title_size; 2270 attr->title_needs_free = FALSE; 2271 } 2272 2273 if(is_multiline) 2274 free(label); 2275 2276 ret = (def != NULL); 2277 2278 abort: 2279 return ret; 2280 } 2281 2282 static int 2283 md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 2284 OFF beg, OFF* p_end, MD_LINK_ATTR* attr) 2285 { 2286 int line_index = 0; 2287 int tmp_line_index; 2288 OFF title_contents_beg; 2289 OFF title_contents_end; 2290 int title_contents_line_index; 2291 int title_is_multiline; 2292 OFF off = beg; 2293 int ret = FALSE; 2294 2295 while(off >= lines[line_index].end) 2296 line_index++; 2297 2298 MD_ASSERT(CH(off) == _T('(')); 2299 off++; 2300 2301 /* Optional white space with up to one line break. 
*/ 2302 while(off < lines[line_index].end && ISWHITESPACE(off)) 2303 off++; 2304 if(off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) { 2305 line_index++; 2306 if(line_index >= n_lines) 2307 return FALSE; 2308 off = lines[line_index].beg; 2309 } 2310 2311 /* Link destination may be omitted, but only when not also having a title. */ 2312 if(off < ctx->size && CH(off) == _T(')')) { 2313 attr->dest_beg = off; 2314 attr->dest_end = off; 2315 attr->title = NULL; 2316 attr->title_size = 0; 2317 attr->title_needs_free = FALSE; 2318 off++; 2319 *p_end = off; 2320 return TRUE; 2321 } 2322 2323 /* Link destination. */ 2324 if(!md_is_link_destination(ctx, off, lines[line_index].end, 2325 &off, &attr->dest_beg, &attr->dest_end)) 2326 return FALSE; 2327 2328 /* (Optional) title. */ 2329 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, 2330 &off, &title_contents_line_index, &tmp_line_index, 2331 &title_contents_beg, &title_contents_end)) 2332 { 2333 title_is_multiline = (tmp_line_index != title_contents_line_index); 2334 title_contents_line_index += line_index; 2335 line_index += tmp_line_index; 2336 } else { 2337 /* Not a title. */ 2338 title_is_multiline = FALSE; 2339 title_contents_beg = off; 2340 title_contents_end = off; 2341 title_contents_line_index = 0; 2342 } 2343 2344 /* Optional whitespace followed with final ')'. 
*/ 2345 while(off < lines[line_index].end && ISWHITESPACE(off)) 2346 off++; 2347 if (off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) { 2348 line_index++; 2349 if(line_index >= n_lines) 2350 return FALSE; 2351 off = lines[line_index].beg; 2352 } 2353 if(CH(off) != _T(')')) 2354 goto abort; 2355 off++; 2356 2357 if(title_contents_beg >= title_contents_end) { 2358 attr->title = NULL; 2359 attr->title_size = 0; 2360 attr->title_needs_free = FALSE; 2361 } else if(!title_is_multiline) { 2362 attr->title = (CHAR*) STR(title_contents_beg); 2363 attr->title_size = title_contents_end - title_contents_beg; 2364 attr->title_needs_free = FALSE; 2365 } else { 2366 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, 2367 lines + title_contents_line_index, n_lines - title_contents_line_index, 2368 _T('\n'), &attr->title, &attr->title_size)); 2369 attr->title_needs_free = TRUE; 2370 } 2371 2372 *p_end = off; 2373 ret = TRUE; 2374 2375 abort: 2376 return ret; 2377 } 2378 2379 static void 2380 md_free_ref_defs(MD_CTX* ctx) 2381 { 2382 int i; 2383 2384 for(i = 0; i < ctx->n_ref_defs; i++) { 2385 MD_REF_DEF* def = &ctx->ref_defs[i]; 2386 2387 if(def->label_needs_free) 2388 free(def->label); 2389 if(def->title_needs_free) 2390 free(def->title); 2391 } 2392 2393 free(ctx->ref_defs); 2394 } 2395 2396 2397 /****************************************** 2398 *** Processing Inlines (a.k.a Spans) *** 2399 ******************************************/ 2400 2401 /* We process inlines in few phases: 2402 * 2403 * (1) We go through the block text and collect all significant characters 2404 * which may start/end a span or some other significant position into 2405 * ctx->marks[]. Core of this is what md_collect_marks() does. 2406 * 2407 * We also do some very brief preliminary context-less analysis, whether 2408 * it might be opener or closer (e.g. of an emphasis span). 
*
 *     This speeds the other steps as we do not need to re-iterate over all
 *     characters anymore.
 *
 * (2) We analyze each potential mark type, in order of precedence.
 *
 *     In each md_analyze_XXX() function, we re-iterate the list of marks,
 *     skipping regions already resolved (at preceding precedence levels),
 *     and try to resolve them.
 *
 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
 *       them as resolved.
 *
 * (2.2) For range-type marks, we analyze whether the mark could be a closer
 *       and, if yes, whether there is some preceding opener it could satisfy.
 *
 *       If not, we check whether it could really be an opener and, if yes, we
 *       remember it so subsequent closers may resolve it.
 *
 * (3) Finally, when all marks have been analyzed, we render the block contents
 *     by calling the MD_RENDERER::text() callback, interrupted by
 *     ::enter_span() or ::close_span() whenever we reach a resolved mark.
 */


/* The mark structure.
 *
 * '\\': Maybe escape sequence.
 * '\0': NULL char.
 *  '*': Maybe (strong) emphasis start/end.
 *  '_': Maybe (strong) emphasis start/end.
 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 *  '`': Maybe code span start/end.
 *  '&': Maybe start of entity.
 *  ';': Maybe end of entity.
 *  '<': Maybe start of raw HTML or autolink.
 *  '>': Maybe end of raw HTML or autolink.
 *  '[': Maybe start of link label or link text.
 *  '!': Equivalent of '[' for image.
 *  ']': Maybe end of link label or link text.
 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 *  'D': Dummy mark, it reserves a space for splitting a previous mark
 *       (e.g.
emphasis) or to make more space for storing some special data
 *       related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible to fit as much of them into CPU
 * cache line.)
 */
struct MD_MARK_tag {
    /* Offsets delimiting the mark in the input text; (end - beg) is the
     * length of the mark. For dummy marks ('D') these two members may be
     * overwritten with a raw pointer, see md_mark_store_ptr(). */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     */
    int prev;
    int next;
    /* The mark character, one of those listed in the table above. */
    CHAR ch;
    /* Combination of the MD_MARK_xxxx flags below. */
    unsigned char flags;
};

/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note 0x20 is deliberately reused by several mark types below; which meaning
 * applies follows from the mark character. The MD_MARK_EMPH_MOD3_x values
 * form a two-bit field (MD_MARK_EMPH_MOD3_MASK); md_asterisk_chain() uses it
 * together with MD_MARK_EMPH_INTRAWORD to select the chain of openers. */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks.
*/ 2494 #define MD_MARK_HASNESTEDBRACKETS 0x20 /* For '[' to rule out invalid link labels early */ 2495 2496 static MD_MARKCHAIN* 2497 md_asterisk_chain(MD_CTX* ctx, unsigned flags) 2498 { 2499 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) { 2500 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0; 2501 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1; 2502 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2; 2503 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0; 2504 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1; 2505 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2; 2506 default: MD_UNREACHABLE(); 2507 } 2508 return NULL; 2509 } 2510 2511 static MD_MARKCHAIN* 2512 md_mark_chain(MD_CTX* ctx, int mark_index) 2513 { 2514 MD_MARK* mark = &ctx->marks[mark_index]; 2515 2516 switch(mark->ch) { 2517 case _T('*'): return md_asterisk_chain(ctx, mark->flags); 2518 case _T('_'): return &UNDERSCORE_OPENERS; 2519 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2; 2520 /* case _T('!'): MD_FALLTHROUGH(); */ 2521 case _T('['): return &BRACKET_OPENERS; 2522 case _T('|'): return &TABLECELLBOUNDARIES; 2523 case _T('-'): return &FAINT_OPENERS; 2524 case _T('%'): return &INVERSE_OPENERS; 2525 case _T('!'): return &CONCEAL_OPENERS; 2526 case _T('^'): return &BLINK_OPENERS; 2527 default: return NULL; 2528 } 2529 } 2530 2531 static MD_MARK* 2532 md_push_mark(MD_CTX* ctx) 2533 { 2534 if(ctx->n_marks >= ctx->alloc_marks) { 2535 MD_MARK* new_marks; 2536 2537 ctx->alloc_marks = (ctx->alloc_marks > 0 2538 ? 
ctx->alloc_marks + ctx->alloc_marks / 2 2539 : 64); 2540 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK)); 2541 if(new_marks == NULL) { 2542 MD_LOG("realloc() failed."); 2543 return NULL; 2544 } 2545 2546 ctx->marks = new_marks; 2547 } 2548 2549 return &ctx->marks[ctx->n_marks++]; 2550 } 2551 2552 #define PUSH_MARK_() \ 2553 do { \ 2554 mark = md_push_mark(ctx); \ 2555 if(mark == NULL) { \ 2556 ret = -1; \ 2557 goto abort; \ 2558 } \ 2559 } while(0) 2560 2561 #define PUSH_MARK(ch_, beg_, end_, flags_) \ 2562 do { \ 2563 PUSH_MARK_(); \ 2564 mark->beg = (beg_); \ 2565 mark->end = (end_); \ 2566 mark->prev = -1; \ 2567 mark->next = -1; \ 2568 mark->ch = (char)(ch_); \ 2569 mark->flags = (flags_); \ 2570 } while(0) 2571 2572 2573 static void 2574 md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index) 2575 { 2576 if(chain->tail >= 0) 2577 ctx->marks[chain->tail].next = mark_index; 2578 else 2579 chain->head = mark_index; 2580 2581 ctx->marks[mark_index].prev = chain->tail; 2582 ctx->marks[mark_index].next = -1; 2583 chain->tail = mark_index; 2584 } 2585 2586 /* Sometimes, we need to store a pointer into the mark. It is quite rare 2587 * so we do not bother to make MD_MARK use union, and it can only happen 2588 * for dummy marks. */ 2589 static inline void 2590 md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr) 2591 { 2592 MD_MARK* mark = &ctx->marks[mark_index]; 2593 MD_ASSERT(mark->ch == 'D'); 2594 2595 /* Check only members beg and end are misused for this. 
*/ 2596 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF)); 2597 memcpy(mark, &ptr, sizeof(void*)); 2598 } 2599 2600 static inline void* 2601 md_mark_get_ptr(MD_CTX* ctx, int mark_index) 2602 { 2603 void* ptr; 2604 MD_MARK* mark = &ctx->marks[mark_index]; 2605 MD_ASSERT(mark->ch == 'D'); 2606 memcpy(&ptr, mark, sizeof(void*)); 2607 return ptr; 2608 } 2609 2610 static void 2611 md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index) 2612 { 2613 MD_MARK* opener = &ctx->marks[opener_index]; 2614 MD_MARK* closer = &ctx->marks[closer_index]; 2615 2616 /* Remove opener from the list of openers. */ 2617 if(chain != NULL) { 2618 if(opener->prev >= 0) 2619 ctx->marks[opener->prev].next = opener->next; 2620 else 2621 chain->head = opener->next; 2622 2623 if(opener->next >= 0) 2624 ctx->marks[opener->next].prev = opener->prev; 2625 else 2626 chain->tail = opener->prev; 2627 } 2628 2629 /* Interconnect opener and closer and mark both as resolved. */ 2630 opener->next = closer_index; 2631 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; 2632 closer->prev = opener_index; 2633 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; 2634 } 2635 2636 2637 #define MD_ROLLBACK_ALL 0 2638 #define MD_ROLLBACK_CROSSING 1 2639 2640 /* In the range ctx->marks[opener_index] ... [closer_index], undo some or all 2641 * resolvings accordingly to these rules: 2642 * 2643 * (1) All openers BEFORE the range corresponding to any closer inside the 2644 * range are un-resolved and they are re-added to their respective chains 2645 * of unresolved openers. This ensures we can reuse the opener for closers 2646 * AFTER the range. 2647 * 2648 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range 2649 * are discarded. 2650 * 2651 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled 2652 * in (1) are discarded. I.e. pairs of openers and closers which are both 2653 * inside the range are retained as well as any unpaired marks. 
*/
static void
md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
{
    int i;
    int mark_index;

    /* Cut all unresolved openers at the mark index: Every chain tail is moved
     * back until it points before opener_index. If the opener itself is on
     * the chain, it is dropped too and the walk stops right there. */
    for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
        MD_MARKCHAIN* chain = &ctx->mark_chains[i];

        while(chain->tail >= opener_index) {
            int same = chain->tail == opener_index;
            chain->tail = ctx->marks[chain->tail].prev;
            if (same) break;
        }

        if(chain->tail >= 0)
            ctx->marks[chain->tail].next = -1;
        else
            chain->head = -1;
    }

    /* Go backwards so that unresolved openers are re-added into their
     * respective chains, in the right order. */
    mark_index = closer_index - 1;
    while(mark_index > opener_index) {
        MD_MARK* mark = &ctx->marks[mark_index];
        /* Snapshot of the flags; the jump logic below needs the state from
         * before the (possible) reset of mark->flags. */
        int mark_flags = mark->flags;
        int discard_flag = (how == MD_ROLLBACK_ALL);

        if(mark->flags & MD_MARK_CLOSER) {
            int mark_opener_index = mark->prev;

            /* Undo opener BEFORE the range. */
            if(mark_opener_index < opener_index) {
                MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
                MD_MARKCHAIN* chain;

                mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
                /* NOTE(review): the chain is looked up for opener_index (the
                 * start of the rolled-back range) while the mark appended is
                 * mark_opener_index. These may be of different types; verify
                 * whether md_mark_chain(ctx, mark_opener_index) was meant. */
                chain = md_mark_chain(ctx, opener_index);
                if(chain != NULL) {
                    md_mark_chain_append(ctx, chain, mark_opener_index);
                    discard_flag = 1;
                }
            }
        }

        /* And reset our flags. */
        if(discard_flag) {
            /* Make zero-length closer a dummy mark as that's how it was born */
            if((mark->flags & MD_MARK_CLOSER) && mark->beg == mark->end)
                mark->ch = 'D';

            mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
        }

        /* Jump as far as we can over unresolved or non-interesting marks. */
        switch(how) {
            case MD_ROLLBACK_CROSSING:
                if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
                    /* If we are closer with opener INSIDE the range, there may
                     * not be any other crosser inside the subrange. */
                    mark_index = mark->prev;
                    break;
                }
                MD_FALLTHROUGH();
            default:
                mark_index--;
                break;
        }
    }
}

/* Fills ctx->mark_char_map[] so that it holds 1 for every character which
 * may form a mark and 0 for all the others. IS_MARK_CHAR() reads the map so
 * that md_collect_marks() can quickly skip over uninteresting characters.
 *
 * Some characters are enabled unconditionally, the others only when the
 * corresponding parser flag is set. */
static void
md_build_mark_char_map(MD_CTX* ctx)
{
    memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));

    ctx->mark_char_map['\\'] = 1;
    ctx->mark_char_map['^'] = 1;
    ctx->mark_char_map['%'] = 1;
    ctx->mark_char_map['-'] = 1;
    ctx->mark_char_map['*'] = 1;
    ctx->mark_char_map['_'] = 1;
    ctx->mark_char_map['`'] = 1;
    ctx->mark_char_map['&'] = 1;
    ctx->mark_char_map[';'] = 1;
    ctx->mark_char_map['<'] = 1;
    ctx->mark_char_map['>'] = 1;
    ctx->mark_char_map['['] = 1;
    ctx->mark_char_map['!'] = 1;
    ctx->mark_char_map[']'] = 1;
    ctx->mark_char_map['\0'] = 1;

    if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
        ctx->mark_char_map['~'] = 1;

    if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
        ctx->mark_char_map['$'] = 1;

    if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
        ctx->mark_char_map['@'] = 1;

    if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
        ctx->mark_char_map[':'] = 1;

    if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
        ctx->mark_char_map['.'] = 1;

    /* '|' is needed both as a table cell boundary and inside wiki links. */
    if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
        ctx->mark_char_map['|'] = 1;

    if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
        int i;

        /* With whitespace collapsing, every whitespace char is a mark too. */
        for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
            if(ISWHITESPACE_(i))
                ctx->mark_char_map[i] = 1;
        }
    }
}

/* We limit code span marks to lower than 32 backticks.
This solves the 2778 * pathologic case of too many openers, each of different length: Their 2779 * resolving would be then O(n^2). */ 2780 #define CODESPAN_MARK_MAXLEN 32 2781 2782 static int 2783 md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, 2784 OFF* p_opener_beg, OFF* p_opener_end, 2785 OFF* p_closer_beg, OFF* p_closer_end, 2786 OFF last_potential_closers[CODESPAN_MARK_MAXLEN], 2787 int* p_reached_paragraph_end) 2788 { 2789 OFF opener_beg = beg; 2790 OFF opener_end; 2791 OFF closer_beg; 2792 OFF closer_end; 2793 SZ mark_len; 2794 OFF line_end; 2795 int has_space_after_opener = FALSE; 2796 int has_eol_after_opener = FALSE; 2797 int has_space_before_closer = FALSE; 2798 int has_eol_before_closer = FALSE; 2799 int has_only_space = TRUE; 2800 int line_index = 0; 2801 2802 line_end = lines[0].end; 2803 opener_end = opener_beg; 2804 while(opener_end < line_end && CH(opener_end) == _T('`')) 2805 opener_end++; 2806 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' ')); 2807 has_eol_after_opener = (opener_end == line_end); 2808 2809 /* The caller needs to know end of the opening mark even if we fail. */ 2810 *p_opener_end = opener_end; 2811 2812 mark_len = opener_end - opener_beg; 2813 if(mark_len > CODESPAN_MARK_MAXLEN) 2814 return FALSE; 2815 2816 /* Check whether we already know there is no closer of this length. 2817 * If so, re-scan does no sense. This fixes issue #59. */ 2818 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end || 2819 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end)) 2820 return FALSE; 2821 2822 closer_beg = opener_end; 2823 closer_end = opener_end; 2824 2825 /* Find closer mark. 
*/ 2826 while(TRUE) { 2827 while(closer_beg < line_end && CH(closer_beg) != _T('`')) { 2828 if(CH(closer_beg) != _T(' ')) 2829 has_only_space = FALSE; 2830 closer_beg++; 2831 } 2832 closer_end = closer_beg; 2833 while(closer_end < line_end && CH(closer_end) == _T('`')) 2834 closer_end++; 2835 2836 if(closer_end - closer_beg == mark_len) { 2837 /* Success. */ 2838 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' ')); 2839 has_eol_before_closer = (closer_beg == lines[line_index].beg); 2840 break; 2841 } 2842 2843 if(closer_end - closer_beg > 0) { 2844 /* We have found a back-tick which is not part of the closer. */ 2845 has_only_space = FALSE; 2846 2847 /* But if we eventually fail, remember it as a potential closer 2848 * of its own length for future attempts. This mitigates needs for 2849 * rescans. */ 2850 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) { 2851 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1]) 2852 last_potential_closers[closer_end - closer_beg - 1] = closer_beg; 2853 } 2854 } 2855 2856 if(closer_end >= line_end) { 2857 line_index++; 2858 if(line_index >= n_lines) { 2859 /* Reached end of the paragraph and still nothing. */ 2860 *p_reached_paragraph_end = TRUE; 2861 return FALSE; 2862 } 2863 /* Try on the next line. */ 2864 line_end = lines[line_index].end; 2865 closer_beg = lines[line_index].beg; 2866 } else { 2867 closer_beg = closer_end; 2868 } 2869 } 2870 2871 /* If there is a space or a new line both after and before the opener 2872 * (and if the code span is not made of spaces only), consume one initial 2873 * and one trailing space as part of the marks. 
*/ 2874 if(!has_only_space && 2875 (has_space_after_opener || has_eol_after_opener) && 2876 (has_space_before_closer || has_eol_before_closer)) 2877 { 2878 if(has_space_after_opener) 2879 opener_end++; 2880 else 2881 opener_end = lines[1].beg; 2882 2883 if(has_space_before_closer) 2884 closer_beg--; 2885 else { 2886 closer_beg = lines[line_index-1].end; 2887 /* We need to eat the preceding "\r\n" but not any line trailing 2888 * spaces. */ 2889 while(closer_beg < ctx->size && ISBLANK(closer_beg)) 2890 closer_beg++; 2891 } 2892 } 2893 2894 *p_opener_beg = opener_beg; 2895 *p_opener_end = opener_end; 2896 *p_closer_beg = closer_beg; 2897 *p_closer_end = closer_end; 2898 return TRUE; 2899 } 2900 2901 /* detect anchors with syntax: [|anchorId] */ 2902 static int 2903 md_is_anchor_span(MD_CTX* ctx, const MD_LINE* lines, OFF off, OFF* p_closer_beg) 2904 { 2905 OFF line_end = lines[0].end; 2906 // Smallest anchor is [|x] 2907 // An anchor must be on a single line 2908 if (off+4 >= line_end) 2909 return FALSE; 2910 off += 2; 2911 2912 // Find closer mark 2913 int opener_end = off; 2914 while (off < line_end) { 2915 if (CH(off) == _T(']')) { 2916 // Check if there an id for the anchor 2917 if (off == opener_end) 2918 return FALSE; 2919 *p_closer_beg = off; 2920 return TRUE; 2921 } 2922 off++; 2923 } 2924 return FALSE; 2925 } 2926 2927 #ifdef MD4C_USE_UTF16 2928 /* For UTF-16, mark_char_map[] covers only ASCII. */ 2929 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \ 2930 (ctx->mark_char_map[(unsigned char) CH(off)])) 2931 #else 2932 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. 
*/ 2933 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)]) 2934 #endif 2935 2936 /* detect faint effect: -text text- */ 2937 static int 2938 md_is_faint_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) 2939 { 2940 OFF tmp; 2941 OFF line_end; 2942 2943 line_end = lines[0].end; 2944 if (beg+2 >= line_end) 2945 return FALSE; 2946 if (ISUNICODEWHITESPACE(beg+1)) 2947 return FALSE; 2948 tmp = beg+2; 2949 while (tmp < line_end) { 2950 if (CH(tmp) == _T('-') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) 2951 && (!ISUNICODEWHITESPACE(tmp-1))) { 2952 *p_closer_beg = tmp; 2953 return TRUE; 2954 } 2955 tmp++; 2956 } 2957 2958 return FALSE; 2959 } 2960 2961 /* detect inverse effect: %text text% */ 2962 static int 2963 md_is_inverse_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) 2964 { 2965 OFF tmp; 2966 OFF line_end; 2967 2968 line_end = lines[0].end; 2969 if (beg+2 >= line_end) 2970 return FALSE; 2971 if (ISUNICODEWHITESPACE(beg+1)) 2972 return FALSE; 2973 tmp = beg+2; 2974 while (tmp < line_end) { 2975 if (CH(tmp) == _T('%') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) 2976 && (!ISUNICODEWHITESPACE(tmp-1))) { 2977 *p_closer_beg = tmp; 2978 return TRUE; 2979 } 2980 tmp++; 2981 } 2982 2983 return FALSE; 2984 } 2985 2986 /* detect conceal effect: !text text! 
*/ 2987 static int 2988 md_is_conceal_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) 2989 { 2990 OFF tmp; 2991 OFF line_end; 2992 2993 line_end = lines[0].end; 2994 if (beg+2 >= line_end) 2995 return FALSE; 2996 if (ISUNICODEWHITESPACE(beg+1)) 2997 return FALSE; 2998 tmp = beg+2; 2999 while (tmp < line_end) { 3000 if (CH(tmp) == _T('!') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) 3001 && (!ISUNICODEWHITESPACE(tmp-1))) { 3002 *p_closer_beg = tmp; 3003 return TRUE; 3004 } 3005 tmp++; 3006 } 3007 3008 return FALSE; 3009 } 3010 3011 /* detect blink effect: ^text text^ */ 3012 static int 3013 md_is_blink_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg) 3014 { 3015 OFF tmp; 3016 OFF line_end; 3017 3018 line_end = lines[0].end; 3019 if (beg+2 >= line_end) 3020 return FALSE; 3021 if (ISUNICODEWHITESPACE(beg+1)) 3022 return FALSE; 3023 tmp = beg+2; 3024 while (tmp < line_end) { 3025 if (CH(tmp) == _T('^') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1)) 3026 && (!ISUNICODEWHITESPACE(tmp-1))) { 3027 *p_closer_beg = tmp; 3028 return TRUE; 3029 } 3030 tmp++; 3031 } 3032 3033 return FALSE; 3034 } 3035 3036 static int 3037 md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) 3038 { 3039 OFF off = beg+1; 3040 3041 MD_ASSERT(CH(beg) == _T('<')); 3042 3043 /* Check for scheme. */ 3044 if(off >= max_end || !ISASCII(off)) 3045 return FALSE; 3046 off++; 3047 while(1) { 3048 if(off >= max_end) 3049 return FALSE; 3050 if(off - beg > 32) 3051 return FALSE; 3052 if(CH(off) == _T(':') && off - beg >= 3) 3053 break; 3054 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.')) 3055 return FALSE; 3056 off++; 3057 } 3058 3059 /* Check the path after the scheme. 
*/ 3060 while(off < max_end && CH(off) != _T('>')) { 3061 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<')) 3062 return FALSE; 3063 off++; 3064 } 3065 3066 if(off >= max_end) 3067 return FALSE; 3068 3069 MD_ASSERT(CH(off) == _T('>')); 3070 *p_end = off+1; 3071 return TRUE; 3072 } 3073 3074 static int 3075 md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) 3076 { 3077 OFF off = beg + 1; 3078 int label_len; 3079 3080 MD_ASSERT(CH(beg) == _T('<')); 3081 3082 /* The code should correspond to this regexp: 3083 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+ 3084 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? 3085 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ 3086 */ 3087 3088 /* Username (before '@'). */ 3089 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-")))) 3090 off++; 3091 if(off <= beg+1) 3092 return FALSE; 3093 3094 /* '@' */ 3095 if(off >= max_end || CH(off) != _T('@')) 3096 return FALSE; 3097 off++; 3098 3099 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum 3100 * characters or '-', but '-' is not allowed as first or last char. 
*/ 3101 label_len = 0; 3102 while(off < max_end) { 3103 if(ISALNUM(off)) 3104 label_len++; 3105 else if(CH(off) == _T('-') && label_len > 0) 3106 label_len++; 3107 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-')) 3108 label_len = 0; 3109 else 3110 break; 3111 3112 if(label_len > 63) 3113 return FALSE; 3114 3115 off++; 3116 } 3117 3118 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-')) 3119 return FALSE; 3120 3121 *p_end = off+1; 3122 return TRUE; 3123 } 3124 3125 static int 3126 md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto) 3127 { 3128 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) { 3129 *p_missing_mailto = FALSE; 3130 return TRUE; 3131 } 3132 3133 if(md_is_autolink_email(ctx, beg, max_end, p_end)) { 3134 *p_missing_mailto = TRUE; 3135 return TRUE; 3136 } 3137 3138 return FALSE; 3139 } 3140 3141 static int 3142 md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) 3143 { 3144 const MD_LINE* line_term = lines + n_lines; 3145 const MD_LINE* line; 3146 int ret = 0; 3147 MD_MARK* mark; 3148 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 }; 3149 int codespan_scanned_till_paragraph_end = FALSE; 3150 3151 for(line = lines; line < line_term; line++) { 3152 OFF off = line->beg; 3153 OFF line_end = line->end; 3154 3155 while(TRUE) { 3156 CHAR ch; 3157 3158 /* Optimization: Use some loop unrolling. */ 3159 while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1) 3160 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3)) 3161 off += 4; 3162 while(off < line_end && !IS_MARK_CHAR(off+0)) 3163 off++; 3164 3165 if(off >= line_end) 3166 break; 3167 3168 ch = CH(off); 3169 3170 /* A backslash escape. 3171 * It can go beyond line->end as it may involve escaped new 3172 * line to form a hard break. 
*/ 3173 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { 3174 /* Hard-break cannot be on the last line of the block. */ 3175 if(!ISNEWLINE(off+1) || line+1 < line_term) 3176 PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED); 3177 off += 2; 3178 continue; 3179 } 3180 3181 /* A potential (string) emphasis start/end. */ 3182 if(ch == _T('*') || ch == _T('_')) { 3183 OFF tmp = off+1; 3184 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */ 3185 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */ 3186 3187 while(tmp < line_end && CH(tmp) == ch) 3188 tmp++; 3189 3190 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off)) 3191 left_level = 0; 3192 else if(ISUNICODEPUNCTBEFORE(off)) 3193 left_level = 1; 3194 else 3195 left_level = 2; 3196 3197 if(tmp == line_end || ISUNICODEWHITESPACE(tmp)) 3198 right_level = 0; 3199 else if(ISUNICODEPUNCT(tmp)) 3200 right_level = 1; 3201 else 3202 right_level = 2; 3203 3204 /* Intra-word underscore doesn't have special meaning. */ 3205 if(ch == _T('_') && left_level == 2 && right_level == 2) { 3206 left_level = 0; 3207 right_level = 0; 3208 } 3209 3210 if(left_level != 0 || right_level != 0) { 3211 unsigned flags = 0; 3212 3213 if(left_level > 0 && left_level >= right_level) 3214 flags |= MD_MARK_POTENTIAL_CLOSER; 3215 if(right_level > 0 && right_level >= left_level) 3216 flags |= MD_MARK_POTENTIAL_OPENER; 3217 if(left_level == 2 && right_level == 2) 3218 flags |= MD_MARK_EMPH_INTRAWORD; 3219 3220 /* For "the rule of three" we need to remember the original 3221 * size of the mark (modulo three), before we potentially 3222 * split the mark when being later resolved partially by some 3223 * shorter closer. 
*/ 3224 switch((tmp - off) % 3) { 3225 case 0: flags |= MD_MARK_EMPH_MOD3_0; break; 3226 case 1: flags |= MD_MARK_EMPH_MOD3_1; break; 3227 case 2: flags |= MD_MARK_EMPH_MOD3_2; break; 3228 } 3229 3230 PUSH_MARK(ch, off, tmp, flags); 3231 3232 /* During resolving, multiple asterisks may have to be 3233 * split into independent span start/ends. Consider e.g. 3234 * "**foo* bar*". Therefore we push also some empty dummy 3235 * marks to have enough space for that. */ 3236 off++; 3237 while(off < tmp) { 3238 PUSH_MARK('D', off, off, 0); 3239 off++; 3240 } 3241 continue; 3242 } 3243 3244 off = tmp; 3245 continue; 3246 } 3247 3248 /* A potential code span start/end. */ 3249 if(ch == _T('`')) { 3250 OFF opener_beg, opener_end; 3251 OFF closer_beg, closer_end; 3252 int is_code_span; 3253 3254 is_code_span = md_is_code_span(ctx, line, line_term - line, off, 3255 &opener_beg, &opener_end, &closer_beg, &closer_end, 3256 codespan_last_potential_closers, 3257 &codespan_scanned_till_paragraph_end); 3258 if(is_code_span) { 3259 PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED); 3260 PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3261 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3262 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3263 3264 off = closer_end; 3265 3266 /* Advance the current line accordingly. */ 3267 if(off > line_end) { 3268 line = md_lookup_line(off, line, line_term - line); 3269 line_end = line->end; 3270 } 3271 continue; 3272 } 3273 3274 off = opener_end; 3275 continue; 3276 } 3277 3278 /* A potential faint span start/end. 
*/ 3279 if(ch == _T('-')) { 3280 OFF closer_beg; 3281 int is_faint_span; 3282 3283 if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) 3284 || IS_MARK_CHAR(off-1)) { 3285 3286 is_faint_span = md_is_faint_span(ctx, line, off, &closer_beg); 3287 if(is_faint_span) { 3288 PUSH_MARK(_T('-'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); 3289 PUSH_MARK(_T('-'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3290 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3291 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3292 } 3293 } 3294 off++; 3295 continue; 3296 } 3297 3298 /* A potential inverse span start/end. */ 3299 if(ch == _T('%')) { 3300 OFF closer_beg; 3301 int is_inverse_span; 3302 3303 if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) 3304 || IS_MARK_CHAR(off-1)) { 3305 3306 is_inverse_span = md_is_inverse_span(ctx, line, off, &closer_beg); 3307 if(is_inverse_span) { 3308 PUSH_MARK(_T('%'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); 3309 PUSH_MARK(_T('%'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3310 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3311 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3312 3313 } 3314 } 3315 off++; 3316 continue; 3317 } 3318 3319 /* A potential conceal span start/end. 
*/ 3320 if(ch == _T('!')) { 3321 OFF closer_beg; 3322 int is_conceal_span; 3323 3324 if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) 3325 || IS_MARK_CHAR(off-1)) { 3326 3327 is_conceal_span = md_is_conceal_span(ctx, line, off, &closer_beg); 3328 if(is_conceal_span) { 3329 PUSH_MARK(_T('!'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); 3330 PUSH_MARK(_T('!'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3331 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3332 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3333 3334 } 3335 } 3336 off++; 3337 continue; 3338 } 3339 3340 /* A potential blink span start/end. */ 3341 if(ch == _T('^')) { 3342 OFF closer_beg; 3343 int is_blink_span; 3344 3345 if (off == line->beg || ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off) 3346 || IS_MARK_CHAR(off-1)) { 3347 3348 is_blink_span = md_is_blink_span(ctx, line, off, &closer_beg); 3349 if(is_blink_span) { 3350 PUSH_MARK(_T('^'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED); 3351 PUSH_MARK(_T('^'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3352 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3353 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3354 3355 } 3356 } 3357 off++; 3358 continue; 3359 } 3360 3361 /* A potential entity start. */ 3362 if(ch == _T('&')) { 3363 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); 3364 off++; 3365 continue; 3366 } 3367 3368 /* A potential entity end. */ 3369 if(ch == _T(';')) { 3370 /* We surely cannot be entity unless the previous mark is '&'. */ 3371 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&')) 3372 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); 3373 3374 off++; 3375 continue; 3376 } 3377 3378 /* A potential autolink or raw HTML start/end. 
*/ 3379 if(ch == _T('<')) { 3380 int is_autolink; 3381 OFF autolink_end; 3382 int missing_mailto; 3383 3384 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) { 3385 int is_html; 3386 OFF html_end; 3387 3388 /* Given the nature of the raw HTML, we have to recognize 3389 * it here. Doing so later in md_analyze_lt_gt() could 3390 * open can of worms of quadratic complexity. */ 3391 is_html = md_is_html_any(ctx, line, line_term - line, off, 3392 lines[n_lines-1].end, &html_end); 3393 if(is_html) { 3394 PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED); 3395 PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3396 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3397 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3398 off = html_end; 3399 3400 /* Advance the current line accordingly. */ 3401 if(off > line_end) { 3402 line = md_lookup_line(off, line, line_term - line); 3403 line_end = line->end; 3404 } 3405 continue; 3406 } 3407 } 3408 3409 is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end, 3410 &autolink_end, &missing_mailto); 3411 if(is_autolink) { 3412 PUSH_MARK((missing_mailto ? 
_T('@') : _T('<')), off, off+1, 3413 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK); 3414 PUSH_MARK(_T('>'), autolink_end-1, autolink_end, 3415 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK); 3416 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3417 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3418 off = autolink_end; 3419 continue; 3420 } 3421 3422 off++; 3423 continue; 3424 } 3425 3426 /* A potential anchor */ 3427 if(ch == _T('[') && off+1 < line_end && CH(off+1) == _T('|')) { 3428 OFF closer_beg; 3429 int is_anchor_span = md_is_anchor_span(ctx, line, off, &closer_beg); 3430 if (is_anchor_span) { 3431 PUSH_MARK(_T('['), off, off+2, MD_MARK_OPENER | MD_MARK_RESOLVED); 3432 PUSH_MARK(_T(']'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED); 3433 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1; 3434 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2; 3435 off = closer_beg+1; 3436 continue; 3437 } 3438 // continue analyzing [ mark 3439 } 3440 3441 /* A potential link or its part. */ 3442 if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) { 3443 OFF tmp = (ch == _T('[') ? off+1 : off+2); 3444 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER); 3445 off = tmp; 3446 /* Two dummies to make enough place for data we need if it is 3447 * a link. */ 3448 PUSH_MARK('D', off, off, 0); 3449 PUSH_MARK('D', off, off, 0); 3450 continue; 3451 } 3452 if(ch == _T(']')) { 3453 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER); 3454 off++; 3455 continue; 3456 } 3457 3458 /* A potential permissive e-mail autolink. */ 3459 if(ch == _T('@')) { 3460 if(line->beg + 1 <= off && ISALNUM(off-1) && 3461 off + 3 < line->end && ISALNUM(off+1)) 3462 { 3463 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); 3464 /* Push a dummy as a reserve for a closer. */ 3465 PUSH_MARK('D', off, off, 0); 3466 } 3467 3468 off++; 3469 continue; 3470 } 3471 3472 /* A potential permissive URL autolink. 
*/ 3473 if(ch == _T(':')) { 3474 static struct { 3475 const CHAR* scheme; 3476 SZ scheme_size; 3477 const CHAR* suffix; 3478 SZ suffix_size; 3479 } scheme_map[] = { 3480 /* In the order from the most frequently used, arguably. */ 3481 { _T("https"), 5, _T("//"), 2 }, 3482 { _T("gemini"), 6, _T("//"), 2 }, 3483 { _T("http"), 4, _T("//"), 2 }, 3484 { _T("gopher"), 6, _T("//"), 2 }, 3485 { _T("spartan"), 7, _T("//"), 2 }, 3486 { _T("ftp"), 3, _T("//"), 2 } 3487 }; 3488 int scheme_index; 3489 3490 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) { 3491 const CHAR* scheme = scheme_map[scheme_index].scheme; 3492 const SZ scheme_size = scheme_map[scheme_index].scheme_size; 3493 const CHAR* suffix = scheme_map[scheme_index].suffix; 3494 const SZ suffix_size = scheme_map[scheme_index].suffix_size; 3495 3496 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) && 3497 (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) && 3498 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size)) 3499 { 3500 PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER); 3501 /* Push a dummy as a reserve for a closer. */ 3502 PUSH_MARK('D', off, off, 0); 3503 off += 1 + suffix_size; 3504 break; 3505 } 3506 } 3507 3508 off++; 3509 continue; 3510 } 3511 3512 /* A potential permissive WWW autolink. */ 3513 if(ch == _T('.')) { 3514 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) && 3515 (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) && 3516 off + 1 < line_end) 3517 { 3518 PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER); 3519 /* Push a dummy as a reserve for a closer. */ 3520 PUSH_MARK('D', off, off, 0); 3521 off++; 3522 continue; 3523 } 3524 3525 off++; 3526 continue; 3527 } 3528 3529 /* A potential table cell boundary or wiki link label delimiter. 
*/ 3530 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) { 3531 PUSH_MARK(ch, off, off+1, 0); 3532 off++; 3533 continue; 3534 } 3535 3536 /* A potential strikethrough start/end. */ 3537 if(ch == _T('~')) { 3538 OFF tmp = off+1; 3539 3540 while(tmp < line_end && CH(tmp) == _T('~')) 3541 tmp++; 3542 3543 if(tmp - off < 3) { 3544 unsigned flags = 0; 3545 3546 if(tmp < line_end && !ISUNICODEWHITESPACE(tmp)) 3547 flags |= MD_MARK_POTENTIAL_OPENER; 3548 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off)) 3549 flags |= MD_MARK_POTENTIAL_CLOSER; 3550 if(flags != 0) 3551 PUSH_MARK(ch, off, tmp, flags); 3552 } 3553 3554 off = tmp; 3555 continue; 3556 } 3557 3558 /* A potential equation start/end */ 3559 if(ch == _T('$')) { 3560 /* We can have at most two consecutive $ signs, 3561 * where two dollar signs signify a display equation. */ 3562 OFF tmp = off+1; 3563 3564 while(tmp < line_end && CH(tmp) == _T('$')) 3565 tmp++; 3566 3567 if (tmp - off <= 2) 3568 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER); 3569 off = tmp; 3570 continue; 3571 } 3572 3573 /* Turn non-trivial whitespace into single space. */ 3574 if(ISWHITESPACE_(ch)) { 3575 OFF tmp = off+1; 3576 3577 while(tmp < line_end && ISWHITESPACE(tmp)) 3578 tmp++; 3579 3580 if(tmp - off > 1 || ch != _T(' ')) 3581 PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED); 3582 3583 off = tmp; 3584 continue; 3585 } 3586 3587 /* NULL character. */ 3588 if(ch == _T('\0')) { 3589 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED); 3590 off++; 3591 continue; 3592 } 3593 3594 off++; 3595 } 3596 } 3597 3598 /* Add a dummy mark at the end of the mark vector to simplify 3599 * process_inlines(). */ 3600 PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED); 3601 3602 abort: 3603 return ret; 3604 } 3605 3606 static void 3607 md_analyze_bracket(MD_CTX* ctx, int mark_index) 3608 { 3609 /* We cannot really resolve links here as for that we would need 3610 * more context. E.g. 
a following pair of brackets (reference link), 3611 * or enclosing pair of brackets (if the inner is the link, the outer 3612 * one cannot be.) 3613 * 3614 * Therefore we here only construct a list of '[' ']' pairs ordered by 3615 * position of the closer. This allows us to analyze what is or is not 3616 * link in the right order, from inside to outside in case of nested 3617 * brackets. 3618 * 3619 * The resolving itself is deferred to md_resolve_links(). 3620 */ 3621 3622 MD_MARK* mark = &ctx->marks[mark_index]; 3623 3624 if(mark->flags & MD_MARK_POTENTIAL_OPENER) { 3625 if(BRACKET_OPENERS.head != -1) 3626 ctx->marks[BRACKET_OPENERS.tail].flags |= MD_MARK_HASNESTEDBRACKETS; 3627 3628 md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index); 3629 return; 3630 } 3631 3632 if(BRACKET_OPENERS.tail >= 0) { 3633 /* Pop the opener from the chain. */ 3634 int opener_index = BRACKET_OPENERS.tail; 3635 MD_MARK* opener = &ctx->marks[opener_index]; 3636 if(opener->prev >= 0) 3637 ctx->marks[opener->prev].next = -1; 3638 else 3639 BRACKET_OPENERS.head = -1; 3640 BRACKET_OPENERS.tail = opener->prev; 3641 3642 /* Interconnect the opener and closer. */ 3643 opener->next = mark_index; 3644 mark->prev = opener_index; 3645 3646 /* Add the pair into chain of potential links for md_resolve_links(). 3647 * Note we misuse opener->prev for this as opener->next points to its 3648 * closer. */ 3649 if(ctx->unresolved_link_tail >= 0) 3650 ctx->marks[ctx->unresolved_link_tail].prev = opener_index; 3651 else 3652 ctx->unresolved_link_head = opener_index; 3653 ctx->unresolved_link_tail = opener_index; 3654 opener->prev = -1; 3655 } 3656 } 3657 3658 /* Forward declaration. 
*/ 3659 static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 3660 int mark_beg, int mark_end); 3661 3662 static int 3663 md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines) 3664 { 3665 int opener_index = ctx->unresolved_link_head; 3666 OFF last_link_beg = 0; 3667 OFF last_link_end = 0; 3668 OFF last_img_beg = 0; 3669 OFF last_img_end = 0; 3670 3671 while(opener_index >= 0) { 3672 MD_MARK* opener = &ctx->marks[opener_index]; 3673 int closer_index = opener->next; 3674 MD_MARK* closer = &ctx->marks[closer_index]; 3675 int next_index = opener->prev; 3676 MD_MARK* next_opener; 3677 MD_MARK* next_closer; 3678 MD_LINK_ATTR attr; 3679 int is_link = FALSE; 3680 3681 if(next_index >= 0) { 3682 next_opener = &ctx->marks[next_index]; 3683 next_closer = &ctx->marks[next_opener->next]; 3684 } else { 3685 next_opener = NULL; 3686 next_closer = NULL; 3687 } 3688 3689 /* If nested ("[ [ ] ]"), we need to make sure that: 3690 * - The outer does not end inside of (...) belonging to the inner. 3691 * - The outer cannot be link if the inner is link (i.e. not image). 3692 * 3693 * (Note we here analyze from inner to outer as the marks are ordered 3694 * by closer->beg.) 3695 */ 3696 if((opener->beg < last_link_beg && closer->end < last_link_end) || 3697 (opener->beg < last_img_beg && closer->end < last_img_end) || 3698 (opener->beg < last_link_end && opener->ch == '[')) 3699 { 3700 opener_index = next_index; 3701 continue; 3702 } 3703 3704 /* Recognize and resolve wiki links. 3705 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'. 
3706 */ 3707 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) && 3708 (opener->end - opener->beg == 1) && /* not image */ 3709 next_opener != NULL && /* double '[' opener */ 3710 next_opener->ch == '[' && 3711 (next_opener->beg == opener->beg - 1) && 3712 (next_opener->end - next_opener->beg == 1) && 3713 next_closer != NULL && /* double ']' closer */ 3714 next_closer->ch == ']' && 3715 (next_closer->beg == closer->beg + 1) && 3716 (next_closer->end - next_closer->beg == 1)) 3717 { 3718 MD_MARK* delim = NULL; 3719 int delim_index; 3720 OFF dest_beg, dest_end; 3721 3722 is_link = TRUE; 3723 3724 /* We don't allow destination to be longer than 100 characters. 3725 * Lets scan to see whether there is '|'. (If not then the whole 3726 * wiki-link has to be below the 100 characters.) */ 3727 delim_index = opener_index + 1; 3728 while(delim_index < closer_index) { 3729 MD_MARK* m = &ctx->marks[delim_index]; 3730 if(m->ch == '|') { 3731 delim = m; 3732 break; 3733 } 3734 if(m->ch != 'D' && m->beg - opener->end > 100) 3735 break; 3736 delim_index++; 3737 } 3738 dest_beg = opener->end; 3739 dest_end = (delim != NULL) ? delim->beg : closer->beg; 3740 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100) 3741 is_link = FALSE; 3742 3743 /* There may not be any new line in the destination. 
*/ 3744 if(is_link) { 3745 OFF off; 3746 for(off = dest_beg; off < dest_end; off++) { 3747 if(ISNEWLINE(off)) { 3748 is_link = FALSE; 3749 break; 3750 } 3751 } 3752 } 3753 3754 if(is_link) { 3755 if(delim != NULL) { 3756 if(delim->end < closer->beg) { 3757 md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL); 3758 md_rollback(ctx, delim_index, closer_index, MD_ROLLBACK_CROSSING); 3759 delim->flags |= MD_MARK_RESOLVED; 3760 opener->end = delim->beg; 3761 } else { 3762 /* The pipe is just before the closer: [[foo|]] */ 3763 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL); 3764 closer->beg = delim->beg; 3765 delim = NULL; 3766 } 3767 } 3768 3769 opener->beg = next_opener->beg; 3770 opener->next = closer_index; 3771 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; 3772 3773 closer->end = next_closer->end; 3774 closer->prev = opener_index; 3775 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; 3776 3777 last_link_beg = opener->beg; 3778 last_link_end = closer->end; 3779 3780 if(delim != NULL) 3781 md_analyze_link_contents(ctx, lines, n_lines, delim_index+1, closer_index); 3782 3783 opener_index = next_opener->prev; 3784 continue; 3785 } 3786 } 3787 3788 if(next_opener != NULL && next_opener->beg == closer->end) { 3789 if(next_closer->beg > closer->end + 1) { 3790 /* Might be full reference link. */ 3791 if(!(next_opener->flags & MD_MARK_HASNESTEDBRACKETS)) 3792 is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr); 3793 } else { 3794 /* Might be shortcut reference link. */ 3795 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS)) 3796 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr); 3797 } 3798 3799 if(is_link < 0) 3800 return -1; 3801 3802 if(is_link) { 3803 /* Eat the 2nd "[...]". */ 3804 closer->end = next_closer->end; 3805 3806 /* Do not analyze the label as a standalone link in the next 3807 * iteration. 
*/ 3808 next_index = ctx->marks[next_index].prev; 3809 } 3810 } else { 3811 if(closer->end < ctx->size && CH(closer->end) == _T('(')) { 3812 /* Might be inline link. */ 3813 OFF inline_link_end = UINT_MAX; 3814 3815 is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr); 3816 if(is_link < 0) 3817 return -1; 3818 3819 /* Check the closing ')' is not inside an already resolved range 3820 * (i.e. a range with a higher priority), e.g. a code span. */ 3821 if(is_link) { 3822 int i = closer_index + 1; 3823 3824 while(i < ctx->n_marks) { 3825 MD_MARK* mark = &ctx->marks[i]; 3826 3827 if(mark->beg >= inline_link_end) 3828 break; 3829 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) { 3830 if(ctx->marks[mark->next].beg >= inline_link_end) { 3831 /* Cancel the link status. */ 3832 if(attr.title_needs_free) 3833 free(attr.title); 3834 is_link = FALSE; 3835 break; 3836 } 3837 3838 i = mark->next + 1; 3839 } else { 3840 i++; 3841 } 3842 } 3843 } 3844 3845 if(is_link) { 3846 /* Eat the "(...)" */ 3847 closer->end = inline_link_end; 3848 } 3849 } 3850 3851 if(!is_link) { 3852 /* Might be collapsed reference link. */ 3853 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS)) 3854 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr); 3855 if(is_link < 0) 3856 return -1; 3857 } 3858 } 3859 3860 if(is_link) { 3861 /* Resolve the brackets as a link. */ 3862 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; 3863 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; 3864 3865 /* If it is a link, we store the destination and title in the two 3866 * dummy marks after the opener. 
*/ 3867 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D'); 3868 ctx->marks[opener_index+1].beg = attr.dest_beg; 3869 ctx->marks[opener_index+1].end = attr.dest_end; 3870 3871 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D'); 3872 md_mark_store_ptr(ctx, opener_index+2, attr.title); 3873 /* The title might or might not have been allocated for us. */ 3874 if(attr.title_needs_free) 3875 md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2); 3876 ctx->marks[opener_index+2].prev = attr.title_size; 3877 3878 if(opener->ch == '[') { 3879 last_link_beg = opener->beg; 3880 last_link_end = closer->end; 3881 } else { 3882 last_img_beg = opener->beg; 3883 last_img_end = closer->end; 3884 } 3885 3886 md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index); 3887 3888 /* If the link text is formed by nothing but permissive autolink, 3889 * suppress the autolink. 3890 * See https://github.com/mity/md4c/issues/152 for more info. */ 3891 if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) { 3892 MD_MARK* first_nested; 3893 MD_MARK* last_nested; 3894 3895 first_nested = opener + 1; 3896 while(first_nested->ch == _T('D') && first_nested < closer) 3897 first_nested++; 3898 3899 last_nested = closer - 1; 3900 while(first_nested->ch == _T('D') && last_nested > opener) 3901 last_nested--; 3902 3903 if((first_nested->flags & MD_MARK_RESOLVED) && 3904 first_nested->beg == opener->end && 3905 ISANYOF_(first_nested->ch, _T("@:.")) && 3906 first_nested->next == (last_nested - ctx->marks) && 3907 last_nested->end == closer->beg) 3908 { 3909 first_nested->ch = _T('D'); 3910 first_nested->flags &= ~MD_MARK_RESOLVED; 3911 last_nested->ch = _T('D'); 3912 last_nested->flags &= ~MD_MARK_RESOLVED; 3913 } 3914 } 3915 } 3916 3917 opener_index = next_index; 3918 } 3919 3920 return 0; 3921 } 3922 3923 /* Analyze whether the mark '&' starts a HTML entity. 3924 * If so, update its flags as well as flags of corresponding closer ';'. 
*/ 3925 static void 3926 md_analyze_entity(MD_CTX* ctx, int mark_index) 3927 { 3928 MD_MARK* opener = &ctx->marks[mark_index]; 3929 MD_MARK* closer; 3930 OFF off; 3931 3932 /* Cannot be entity if there is no closer as the next mark. 3933 * (Any other mark between would mean strange character which cannot be 3934 * part of the entity. 3935 * 3936 * So we can do all the work on '&' and do not call this later for the 3937 * closing mark ';'. 3938 */ 3939 if(mark_index + 1 >= ctx->n_marks) 3940 return; 3941 closer = &ctx->marks[mark_index+1]; 3942 if(closer->ch != ';') 3943 return; 3944 3945 if(md_is_entity(ctx, opener->beg, closer->end, &off)) { 3946 MD_ASSERT(off == closer->end); 3947 3948 md_resolve_range(ctx, NULL, mark_index, mark_index+1); 3949 opener->end = closer->end; 3950 } 3951 } 3952 3953 static void 3954 md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index) 3955 { 3956 MD_MARK* mark = &ctx->marks[mark_index]; 3957 mark->flags |= MD_MARK_RESOLVED; 3958 3959 md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index); 3960 ctx->n_table_cell_boundaries++; 3961 } 3962 3963 /* Split a longer mark into two. The new mark takes the given count of 3964 * characters. May only be called if an adequate number of dummy 'D' marks 3965 * follows. 
3966 */ 3967 static int 3968 md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n) 3969 { 3970 MD_MARK* mark = &ctx->marks[mark_index]; 3971 int new_mark_index = mark_index + (mark->end - mark->beg - n); 3972 MD_MARK* dummy = &ctx->marks[new_mark_index]; 3973 3974 MD_ASSERT(mark->end - mark->beg > n); 3975 MD_ASSERT(dummy->ch == 'D'); 3976 3977 memcpy(dummy, mark, sizeof(MD_MARK)); 3978 mark->end -= n; 3979 dummy->beg = mark->end; 3980 3981 return new_mark_index; 3982 } 3983 3984 static void 3985 md_analyze_emph(MD_CTX* ctx, int mark_index) 3986 { 3987 MD_MARK* mark = &ctx->marks[mark_index]; 3988 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index); 3989 3990 /* If we can be a closer, try to resolve with the preceding opener. */ 3991 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) { 3992 MD_MARK* opener = NULL; 3993 int opener_index = 0; 3994 3995 if(mark->ch == _T('*')) { 3996 MD_MARKCHAIN* opener_chains[6]; 3997 int i, n_opener_chains; 3998 unsigned flags = mark->flags; 3999 4000 /* Apply the "rule of three". */ 4001 n_opener_chains = 0; 4002 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0; 4003 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2) 4004 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1; 4005 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1) 4006 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2; 4007 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0; 4008 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2) 4009 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1; 4010 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1) 4011 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2; 4012 4013 /* Opener is the most recent mark from the allowed chains. 
*/ 4014 for(i = 0; i < n_opener_chains; i++) { 4015 if(opener_chains[i]->tail >= 0) { 4016 int tmp_index = opener_chains[i]->tail; 4017 MD_MARK* tmp_mark = &ctx->marks[tmp_index]; 4018 if(opener == NULL || tmp_mark->end > opener->end) { 4019 opener_index = tmp_index; 4020 opener = tmp_mark; 4021 } 4022 } 4023 } 4024 } else { 4025 /* Simple emph. mark */ 4026 if(chain->tail >= 0) { 4027 opener_index = chain->tail; 4028 opener = &ctx->marks[opener_index]; 4029 } 4030 } 4031 4032 /* Resolve, if we have found matching opener. */ 4033 if(opener != NULL) { 4034 SZ opener_size = opener->end - opener->beg; 4035 SZ closer_size = mark->end - mark->beg; 4036 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index); 4037 4038 if(opener_size > closer_size) { 4039 opener_index = md_split_emph_mark(ctx, opener_index, closer_size); 4040 md_mark_chain_append(ctx, opener_chain, opener_index); 4041 } else if(opener_size < closer_size) { 4042 md_split_emph_mark(ctx, mark_index, closer_size - opener_size); 4043 } 4044 4045 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING); 4046 md_resolve_range(ctx, opener_chain, opener_index, mark_index); 4047 return; 4048 } 4049 } 4050 4051 /* If we could not resolve as closer, we may be yet be an opener. */ 4052 if(mark->flags & MD_MARK_POTENTIAL_OPENER) 4053 md_mark_chain_append(ctx, chain, mark_index); 4054 } 4055 4056 static void 4057 md_analyze_tilde(MD_CTX* ctx, int mark_index) 4058 { 4059 MD_MARK* mark = &ctx->marks[mark_index]; 4060 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index); 4061 4062 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts 4063 * only tildes sequences of length 1 and 2, and the length of the opener 4064 * and closer has to match. 
*/ 4065 4066 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) { 4067 int opener_index = chain->head; 4068 4069 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING); 4070 md_resolve_range(ctx, chain, opener_index, mark_index); 4071 return; 4072 } 4073 4074 if(mark->flags & MD_MARK_POTENTIAL_OPENER) 4075 md_mark_chain_append(ctx, chain, mark_index); 4076 } 4077 4078 static void 4079 md_analyze_dollar(MD_CTX* ctx, int mark_index) 4080 { 4081 /* This should mimic the way inline equations work in LaTeX, so there 4082 * can only ever be one item in the chain (i.e. the dollars can't be 4083 * nested). This is basically the same as the md_analyze_tilde function, 4084 * except that we require matching openers and closers to be of the same 4085 * length. 4086 * 4087 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */ 4088 if(DOLLAR_OPENERS.head >= 0) { 4089 /* If the potential closer has a non-matching number of $, discard */ 4090 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head]; 4091 MD_MARK* close = &ctx->marks[mark_index]; 4092 4093 int opener_index = DOLLAR_OPENERS.head; 4094 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL); 4095 if (open->end - open->beg == close->end - close->beg) { 4096 /* We are the matching closer */ 4097 md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index); 4098 return; 4099 } 4100 } 4101 4102 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index); 4103 } 4104 4105 static void 4106 md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index) 4107 { 4108 MD_MARK* opener = &ctx->marks[mark_index]; 4109 int closer_index = mark_index + 1; 4110 MD_MARK* closer = &ctx->marks[closer_index]; 4111 MD_MARK* next_resolved_mark; 4112 OFF off = opener->end; 4113 int n_dots = FALSE; 4114 int has_underscore_in_last_seg = FALSE; 4115 int has_underscore_in_next_to_last_seg = FALSE; 4116 int n_opened_parenthesis = 0; 4117 int n_excess_parenthesis = 0; 4118 4119 /* Check for domain. 
*/ 4120 while(off < ctx->size) { 4121 if(ISALNUM(off) || CH(off) == _T('-')) { 4122 off++; 4123 } else if(CH(off) == _T('.')) { 4124 /* We must see at least one period. */ 4125 n_dots++; 4126 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg; 4127 has_underscore_in_last_seg = FALSE; 4128 off++; 4129 } else if(CH(off) == _T('_')) { 4130 /* No underscore may be present in the last two domain segments. */ 4131 has_underscore_in_last_seg = TRUE; 4132 off++; 4133 } else { 4134 break; 4135 } 4136 } 4137 if(off > opener->end && CH(off-1) == _T('.')) { 4138 off--; 4139 n_dots--; 4140 } 4141 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg) 4142 return; 4143 4144 /* Check for path. */ 4145 next_resolved_mark = closer + 1; 4146 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED)) 4147 next_resolved_mark++; 4148 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) { 4149 /* Parenthesis must be balanced. */ 4150 if(CH(off) == _T('(')) { 4151 n_opened_parenthesis++; 4152 } else if(CH(off) == _T(')')) { 4153 if(n_opened_parenthesis > 0) 4154 n_opened_parenthesis--; 4155 else 4156 n_excess_parenthesis++; 4157 } 4158 4159 off++; 4160 } 4161 4162 /* Trim a trailing punctuation from the end. */ 4163 while(TRUE) { 4164 if(ISANYOF(off-1, _T("?!.,:*_~"))) { 4165 off--; 4166 } else if(CH(off-1) == ')' && n_excess_parenthesis > 0) { 4167 /* Unmatched ')' can be in an interior of the path but not at the 4168 * of it, so the auto-link may be safely nested in a parenthesis 4169 * pair. */ 4170 off--; 4171 n_excess_parenthesis--; 4172 } else { 4173 break; 4174 } 4175 } 4176 4177 /* Ok. Lets call it an auto-link. Adapt opener and create closer to zero 4178 * length so all the contents becomes the link text. */ 4179 MD_ASSERT(closer->ch == 'D' || 4180 ((ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS) && 4181 (closer->ch == '.' 
|| closer->ch == ':' || closer->ch == '@'))); 4182 opener->end = opener->beg; 4183 closer->ch = opener->ch; 4184 closer->beg = off; 4185 closer->end = off; 4186 md_resolve_range(ctx, NULL, mark_index, closer_index); 4187 } 4188 4189 /* The permissive autolinks do not have to be enclosed in '<' '>' but we 4190 * instead impose stricter rules what is understood as an e-mail address 4191 * here. Actually any non-alphanumeric characters with exception of '.' 4192 * are prohibited both in username and after '@'. */ 4193 static void 4194 md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index) 4195 { 4196 MD_MARK* opener = &ctx->marks[mark_index]; 4197 int closer_index; 4198 MD_MARK* closer; 4199 OFF beg = opener->beg; 4200 OFF end = opener->end; 4201 int dot_count = 0; 4202 4203 MD_ASSERT(opener->ch == _T('@')); 4204 4205 /* Scan for name before '@'. */ 4206 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+")))) 4207 beg--; 4208 4209 /* Scan for domain after '@'. */ 4210 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) { 4211 if(CH(end) == _T('.')) 4212 dot_count++; 4213 end++; 4214 } 4215 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */ 4216 dot_count--; 4217 end--; 4218 } 4219 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */ 4220 return; 4221 if(CH(end-1) == _T('@') || dot_count == 0) 4222 return; 4223 4224 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero 4225 * length so all the contents becomes the link text. 
*/ 4226 closer_index = mark_index + 1; 4227 closer = &ctx->marks[closer_index]; 4228 if (closer->ch != 'D') return; 4229 4230 opener->beg = beg; 4231 opener->end = beg; 4232 closer->ch = opener->ch; 4233 closer->beg = end; 4234 closer->end = end; 4235 md_resolve_range(ctx, NULL, mark_index, closer_index); 4236 } 4237 4238 static inline void 4239 md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 4240 int mark_beg, int mark_end, const CHAR* mark_chars) 4241 { 4242 int i = mark_beg; 4243 MD_UNUSED(lines); 4244 MD_UNUSED(n_lines); 4245 4246 while(i < mark_end) { 4247 MD_MARK* mark = &ctx->marks[i]; 4248 4249 /* Skip resolved spans. */ 4250 if(mark->flags & MD_MARK_RESOLVED) { 4251 if(mark->flags & MD_MARK_OPENER) { 4252 MD_ASSERT(i < mark->next); 4253 i = mark->next + 1; 4254 } else { 4255 i++; 4256 } 4257 continue; 4258 } 4259 4260 /* Skip marks we do not want to deal with. */ 4261 if(!ISANYOF_(mark->ch, mark_chars)) { 4262 i++; 4263 continue; 4264 } 4265 4266 /* Analyze the mark. */ 4267 switch(mark->ch) { 4268 case '[': /* Pass through. */ 4269 case '!': /* Pass through. */ 4270 case ']': md_analyze_bracket(ctx, i); break; 4271 case '&': md_analyze_entity(ctx, i); break; 4272 case '|': md_analyze_table_cell_boundary(ctx, i); break; 4273 case '_': /* Pass through. */ 4274 case '*': md_analyze_emph(ctx, i); break; 4275 case '~': md_analyze_tilde(ctx, i); break; 4276 case '$': md_analyze_dollar(ctx, i); break; 4277 case '.': /* Pass through. */ 4278 case ':': md_analyze_permissive_url_autolink(ctx, i); break; 4279 case '@': md_analyze_permissive_email_autolink(ctx, i); break; 4280 } 4281 4282 i++; 4283 } 4284 } 4285 4286 /* Analyze marks (build ctx->marks). */ 4287 static int 4288 md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) 4289 { 4290 int ret; 4291 4292 /* Reset the previously collected stack of marks. */ 4293 ctx->n_marks = 0; 4294 4295 /* Collect all marks. 
*/ 4296 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode)); 4297 4298 /* (1) Links. */ 4299 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!")); 4300 MD_CHECK(md_resolve_links(ctx, lines, n_lines)); 4301 BRACKET_OPENERS.head = -1; 4302 BRACKET_OPENERS.tail = -1; 4303 ctx->unresolved_link_head = -1; 4304 ctx->unresolved_link_tail = -1; 4305 4306 if(table_mode) { 4307 /* (2) Analyze table cell boundaries. 4308 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(), 4309 * not after, because caller may need it. */ 4310 MD_ASSERT(n_lines == 1); 4311 TABLECELLBOUNDARIES.head = -1; 4312 TABLECELLBOUNDARIES.tail = -1; 4313 ctx->n_table_cell_boundaries = 0; 4314 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|")); 4315 return ret; 4316 } 4317 4318 /* (3) Emphasis and strong emphasis; permissive autolinks. */ 4319 md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks); 4320 4321 abort: 4322 return ret; 4323 } 4324 4325 static void 4326 md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 4327 int mark_beg, int mark_end) 4328 { 4329 int i; 4330 4331 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&")); 4332 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:.")); 4333 4334 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) { 4335 ctx->mark_chains[i].head = -1; 4336 ctx->mark_chains[i].tail = -1; 4337 } 4338 } 4339 4340 static int 4341 md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type, 4342 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest, 4343 const CHAR* title, SZ title_size) 4344 { 4345 MD_ATTRIBUTE_BUILD href_build = { 0 }; 4346 MD_ATTRIBUTE_BUILD title_build = { 0 }; 4347 MD_SPAN_A_DETAIL det; 4348 int ret = 0; 4349 4350 /* Note we here rely on fact that MD_SPAN_A_DETAIL and 4351 * MD_SPAN_IMG_DETAIL are binary-compatible. 
*/ 4352 memset(&det, 0, sizeof(MD_SPAN_A_DETAIL)); 4353 MD_CHECK(md_build_attribute(ctx, dest, dest_size, 4354 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0), 4355 &det.href, &href_build)); 4356 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build)); 4357 4358 if(enter) 4359 MD_ENTER_SPAN(type, &det); 4360 else 4361 MD_LEAVE_SPAN(type, &det); 4362 4363 abort: 4364 md_free_attribute(ctx, &href_build); 4365 md_free_attribute(ctx, &title_build); 4366 return ret; 4367 } 4368 4369 static int 4370 md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size) 4371 { 4372 MD_ATTRIBUTE_BUILD target_build = { 0 }; 4373 MD_SPAN_WIKILINK_DETAIL det; 4374 int ret = 0; 4375 4376 memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL)); 4377 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build)); 4378 4379 if (enter) 4380 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det); 4381 else 4382 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det); 4383 4384 abort: 4385 md_free_attribute(ctx, &target_build); 4386 return ret; 4387 } 4388 4389 4390 /* Render the output, accordingly to the analyzed ctx->marks. */ 4391 static int 4392 md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) 4393 { 4394 MD_TEXTTYPE text_type; 4395 const MD_LINE* line = lines; 4396 MD_MARK* prev_mark = NULL; 4397 MD_MARK* mark; 4398 OFF off = lines[0].beg; 4399 OFF end = lines[n_lines-1].end; 4400 int enforce_hardbreak = 0; 4401 int ret = 0; 4402 4403 /* Find first resolved mark. Note there is always at least one resolved 4404 * mark, the dummy last one after the end of the latest line we actually 4405 * never really reach. This saves us of a lot of special checks and cases 4406 * in this function. */ 4407 mark = ctx->marks; 4408 while(!(mark->flags & MD_MARK_RESOLVED)) 4409 mark++; 4410 4411 text_type = MD_TEXT_NORMAL; 4412 4413 while(1) { 4414 /* Process the text up to the next mark or end-of-line. 
*/ 4415 OFF tmp = (line->end < mark->beg ? line->end : mark->beg); 4416 if(tmp > off) { 4417 MD_TEXT(text_type, STR(off), tmp - off); 4418 off = tmp; 4419 } 4420 4421 /* If reached the mark, process it and move to next one. */ 4422 if(off >= mark->beg) { 4423 switch(mark->ch) { 4424 case '\\': /* Backslash escape. */ 4425 if(ISNEWLINE(mark->beg+1)) 4426 enforce_hardbreak = 1; 4427 else 4428 MD_TEXT(text_type, STR(mark->beg+1), 1); 4429 break; 4430 4431 case ' ': /* Non-trivial space. */ 4432 MD_TEXT(text_type, _T(" "), 1); 4433 break; 4434 4435 case '`': /* Code span. */ 4436 if(mark->flags & MD_MARK_OPENER) { 4437 MD_ENTER_SPAN(MD_SPAN_CODE, NULL); 4438 text_type = MD_TEXT_CODE; 4439 } else { 4440 MD_LEAVE_SPAN(MD_SPAN_CODE, NULL); 4441 text_type = MD_TEXT_NORMAL; 4442 } 4443 break; 4444 4445 case '-': /* faint */ 4446 if(mark->flags & MD_MARK_OPENER) { 4447 MD_ENTER_SPAN(MD_SPAN_FNT, NULL); 4448 } else { 4449 MD_LEAVE_SPAN(MD_SPAN_FNT, NULL); 4450 } 4451 break; 4452 4453 case '%': /* inverse */ 4454 if(mark->flags & MD_MARK_OPENER) { 4455 MD_ENTER_SPAN(MD_SPAN_INV, NULL); 4456 } else { 4457 MD_LEAVE_SPAN(MD_SPAN_INV, NULL); 4458 } 4459 break; 4460 4461 case '^': /* blink */ 4462 if(mark->flags & MD_MARK_OPENER) { 4463 MD_ENTER_SPAN(MD_SPAN_BLI, NULL); 4464 } else { 4465 MD_LEAVE_SPAN(MD_SPAN_BLI, NULL); 4466 } 4467 break; 4468 4469 case '_': /* Underline (or emphasis if we fall through). 
*/ 4470 if(ctx->parser.flags & MD_FLAG_UNDERLINE) { 4471 if(mark->flags & MD_MARK_OPENER) { 4472 /* while(off < mark->end) { */ 4473 /* MD_ENTER_SPAN(MD_SPAN_U, NULL); */ 4474 /* off++; */ 4475 /* } */ 4476 if((mark->end - off) % 2) { 4477 MD_ENTER_SPAN(MD_SPAN_U, NULL); 4478 off++; 4479 } 4480 while(off + 1 < mark->end) { 4481 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL); 4482 off += 2; 4483 } 4484 } else { 4485 /* while(off < mark->end) { */ 4486 /* MD_LEAVE_SPAN(MD_SPAN_U, NULL); */ 4487 /* off++; */ 4488 /* } */ 4489 while(off + 1 < mark->end) { 4490 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL); 4491 off += 2; 4492 } 4493 if((mark->end - off) % 2) { 4494 MD_LEAVE_SPAN(MD_SPAN_U, NULL); 4495 off++; 4496 } 4497 } 4498 break; 4499 } 4500 MD_FALLTHROUGH(); 4501 4502 case '*': /* Emphasis, strong emphasis. */ 4503 if(mark->flags & MD_MARK_OPENER) { 4504 if((mark->end - off) % 2) { 4505 MD_ENTER_SPAN(MD_SPAN_EM, NULL); 4506 off++; 4507 } 4508 while(off + 1 < mark->end) { 4509 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL); 4510 off += 2; 4511 } 4512 } else { 4513 while(off + 1 < mark->end) { 4514 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL); 4515 off += 2; 4516 } 4517 if((mark->end - off) % 2) { 4518 MD_LEAVE_SPAN(MD_SPAN_EM, NULL); 4519 off++; 4520 } 4521 } 4522 break; 4523 4524 case '~': /* crossed */ 4525 if(mark->flags & MD_MARK_OPENER) 4526 MD_ENTER_SPAN(MD_SPAN_DEL, NULL); 4527 else 4528 MD_LEAVE_SPAN(MD_SPAN_DEL, NULL); 4529 break; 4530 4531 case '$': 4532 if(mark->flags & MD_MARK_OPENER) { 4533 MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL); 4534 text_type = MD_TEXT_LATEXMATH; 4535 } else { 4536 MD_LEAVE_SPAN((mark->end - off) % 2 ? 
MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL); 4537 text_type = MD_TEXT_NORMAL; 4538 } 4539 break; 4540 4541 case '!': /* conceal/hidden */ 4542 if (mark->prev == -1) { 4543 if (mark->flags & MD_MARK_OPENER) { 4544 MD_ENTER_SPAN(MD_SPAN_COC, NULL); 4545 break; 4546 } 4547 } 4548 else { 4549 if (ctx->marks[mark->prev].ch == '!' && !(mark->flags & MD_MARK_OPENER)) { 4550 MD_LEAVE_SPAN(MD_SPAN_COC, NULL); 4551 break; 4552 } 4553 } 4554 case '[': /* Link, wiki link, image, anchor. */ 4555 case ']': 4556 { 4557 const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]); 4558 const MD_MARK* closer = &ctx->marks[opener->next]; 4559 const MD_MARK* dest_mark; 4560 const MD_MARK* title_mark; 4561 4562 if ((opener->ch == '[' && closer->ch == ']') && 4563 opener->end - opener->beg >= 2 && 4564 closer->end - closer->beg >= 2) 4565 { 4566 int has_label = (opener->end - opener->beg > 2); 4567 SZ target_sz; 4568 4569 if(has_label) 4570 target_sz = opener->end - (opener->beg+2); 4571 else 4572 target_sz = closer->beg - opener->end; 4573 4574 MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'), 4575 has_label ? STR(opener->beg+2) : STR(opener->end), 4576 target_sz)); 4577 4578 break; 4579 } 4580 4581 if ((opener->ch == '[' && closer->ch == ']') && 4582 opener->end - opener->beg == 2 && 4583 closer->end - closer->beg == 1 && 4584 CH(opener->beg+1) == _T('|')) 4585 { 4586 if(mark->flags & MD_MARK_OPENER) { 4587 MD_ENTER_SPAN(MD_SPAN_ANCHOR, NULL); 4588 } else { 4589 MD_LEAVE_SPAN(MD_SPAN_ANCHOR, NULL); 4590 } 4591 } 4592 4593 dest_mark = opener+1; 4594 MD_ASSERT(dest_mark->ch == 'D'); 4595 title_mark = opener+2; 4596 if (title_mark->ch != 'D') break; 4597 4598 MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'), 4599 (opener->ch == '!' ? 
MD_SPAN_IMG : MD_SPAN_A), 4600 STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE, 4601 md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)), 4602 title_mark->prev)); 4603 4604 /* link/image closer may span multiple lines. */ 4605 if(mark->ch == ']') { 4606 while(mark->end > line->end) 4607 line++; 4608 } 4609 4610 break; 4611 } 4612 4613 case '<': 4614 case '>': /* Autolink or raw HTML. */ 4615 if(!(mark->flags & MD_MARK_AUTOLINK)) { 4616 /* Raw HTML. */ 4617 if(mark->flags & MD_MARK_OPENER) 4618 text_type = MD_TEXT_HTML; 4619 else 4620 text_type = MD_TEXT_NORMAL; 4621 break; 4622 } 4623 /* Pass through, if auto-link. */ 4624 MD_FALLTHROUGH(); 4625 4626 case '@': /* Permissive e-mail autolink. */ 4627 case ':': /* Permissive URL autolink. */ 4628 case '.': /* Permissive WWW autolink. */ 4629 { 4630 MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]); 4631 MD_MARK* closer = &ctx->marks[opener->next]; 4632 const CHAR* dest = STR(opener->end); 4633 SZ dest_size = closer->beg - opener->end; 4634 4635 /* For permissive auto-links we do not know closer mark 4636 * position at the time of md_collect_marks(), therefore 4637 * it can be out-of-order in ctx->marks[]. 4638 * 4639 * With this flag, we make sure that we output the closer 4640 * only if we processed the opener. */ 4641 if(mark->flags & MD_MARK_OPENER) 4642 closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK; 4643 4644 if(opener->ch == '@' || opener->ch == '.') { 4645 dest_size += 7; 4646 MD_TEMP_BUFFER(dest_size * sizeof(CHAR)); 4647 memcpy(ctx->buffer, 4648 (opener->ch == '@' ? _T("mailto:") : _T("http://")), 4649 7 * sizeof(CHAR)); 4650 memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR)); 4651 dest = ctx->buffer; 4652 } 4653 4654 if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK) 4655 MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER), 4656 MD_SPAN_A, dest, dest_size, TRUE, NULL, 0)); 4657 break; 4658 } 4659 4660 case '&': /* Entity. 
*/ 4661 MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg); 4662 break; 4663 4664 case '\0': 4665 MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1); 4666 break; 4667 4668 case 127: 4669 goto abort; 4670 } 4671 4672 off = mark->end; 4673 4674 /* Move to next resolved mark. */ 4675 prev_mark = mark; 4676 mark++; 4677 while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off) 4678 mark++; 4679 } 4680 4681 /* If reached end of line, move to next one. */ 4682 if(off >= line->end) { 4683 /* If it is the last line, we are done. */ 4684 if(off >= end) 4685 break; 4686 4687 if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) { 4688 OFF tmp; 4689 4690 MD_ASSERT(prev_mark != NULL); 4691 MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER)); 4692 MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER)); 4693 4694 /* Inside a code span, trailing line whitespace has to be 4695 * outputted. */ 4696 tmp = off; 4697 while(off < ctx->size && ISBLANK(off)) 4698 off++; 4699 if(off > tmp) 4700 MD_TEXT(text_type, STR(tmp), off-tmp); 4701 4702 /* and new lines are transformed into single spaces. */ 4703 if(prev_mark->end < off && off < mark->beg) 4704 MD_TEXT(text_type, _T(" "), 1); 4705 } else if(text_type == MD_TEXT_HTML) { 4706 /* Inside raw HTML, we output the new line verbatim, including 4707 * any trailing spaces. */ 4708 OFF tmp = off; 4709 4710 while(tmp < end && ISBLANK(tmp)) 4711 tmp++; 4712 if(tmp > off) 4713 MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off); 4714 MD_TEXT(MD_TEXT_HTML, _T("\n"), 1); 4715 } else { 4716 /* Output soft or hard line break. */ 4717 MD_TEXTTYPE break_type = MD_TEXT_SOFTBR; 4718 4719 if(text_type == MD_TEXT_NORMAL) { 4720 if(enforce_hardbreak) 4721 break_type = MD_TEXT_BR; 4722 else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' '))) 4723 break_type = MD_TEXT_BR; 4724 } 4725 4726 MD_TEXT(break_type, _T("\n"), 1); 4727 } 4728 4729 /* Move to the next line. 
*/ 4730 line++; 4731 off = line->beg; 4732 4733 enforce_hardbreak = 0; 4734 } 4735 } 4736 4737 abort: 4738 return ret; 4739 } 4740 4741 4742 /*************************** 4743 *** Processing Tables *** 4744 ***************************/ 4745 4746 static void 4747 md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align) 4748 { 4749 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER }; 4750 OFF off = beg; 4751 4752 while(n_align > 0) { 4753 int index = 0; /* index into align_map[] */ 4754 4755 while(CH(off) != _T('-')) 4756 off++; 4757 if(off > beg && CH(off-1) == _T(':')) 4758 index |= 1; 4759 while(off < end && CH(off) == _T('-')) 4760 off++; 4761 if(off < end && CH(off) == _T(':')) 4762 index |= 2; 4763 4764 *align = align_map[index]; 4765 align++; 4766 n_align--; 4767 } 4768 4769 } 4770 4771 /* Forward declaration. */ 4772 static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines); 4773 4774 static int 4775 md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end) 4776 { 4777 MD_LINE line; 4778 MD_BLOCK_TD_DETAIL det; 4779 int ret = 0; 4780 4781 while(beg < end && ISWHITESPACE(beg)) 4782 beg++; 4783 while(end > beg && ISWHITESPACE(end-1)) 4784 end--; 4785 4786 det.align = align; 4787 line.beg = beg; 4788 line.end = end; 4789 4790 MD_ENTER_BLOCK(cell_type, &det); 4791 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1)); 4792 MD_LEAVE_BLOCK(cell_type, &det); 4793 4794 abort: 4795 return ret; 4796 } 4797 4798 static int 4799 md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end, 4800 const MD_ALIGN* align, int col_count) 4801 { 4802 MD_LINE line; 4803 OFF* pipe_offs = NULL; 4804 int i, j, k, n; 4805 int ret = 0; 4806 4807 line.beg = beg; 4808 line.end = end; 4809 4810 /* Break the line into table cells by identifying pipe characters who 4811 * form the cell boundary. 
*/ 4812 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE)); 4813 4814 /* We have to remember the cell boundaries in local buffer because 4815 * ctx->marks[] shall be reused during cell contents processing. */ 4816 n = ctx->n_table_cell_boundaries + 2; 4817 pipe_offs = (OFF*) malloc(n * sizeof(OFF)); 4818 if(pipe_offs == NULL) { 4819 MD_LOG("malloc() failed."); 4820 ret = -1; 4821 goto abort; 4822 } 4823 j = 0; 4824 pipe_offs[j++] = beg; 4825 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) { 4826 MD_MARK* mark = &ctx->marks[i]; 4827 pipe_offs[j++] = mark->end; 4828 } 4829 pipe_offs[j++] = end+1; 4830 4831 /* Process cells. */ 4832 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL); 4833 k = 0; 4834 for(i = 0; i < j-1 && k < col_count; i++) { 4835 if(pipe_offs[i] < pipe_offs[i+1]-1) 4836 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1)); 4837 } 4838 /* Make sure we call enough table cells even if the current table contains 4839 * too few of them. */ 4840 while(k < col_count) 4841 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0)); 4842 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL); 4843 4844 abort: 4845 free(pipe_offs); 4846 4847 /* Free any temporary memory blocks stored within some dummy marks. */ 4848 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next) 4849 free(md_mark_get_ptr(ctx, i)); 4850 PTR_CHAIN.head = -1; 4851 PTR_CHAIN.tail = -1; 4852 4853 return ret; 4854 } 4855 4856 static int 4857 md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines) 4858 { 4859 MD_ALIGN* align; 4860 int i; 4861 int ret = 0; 4862 4863 /* At least two lines have to be present: The column headers and the line 4864 * with the underlines. 
*/ 4865 MD_ASSERT(n_lines >= 2); 4866 4867 align = malloc(col_count * sizeof(MD_ALIGN)); 4868 if(align == NULL) { 4869 MD_LOG("malloc() failed."); 4870 ret = -1; 4871 goto abort; 4872 } 4873 4874 md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count); 4875 4876 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL); 4877 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH, 4878 lines[0].beg, lines[0].end, align, col_count)); 4879 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL); 4880 4881 if(n_lines > 2) { 4882 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL); 4883 for(i = 2; i < n_lines; i++) { 4884 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD, 4885 lines[i].beg, lines[i].end, align, col_count)); 4886 } 4887 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL); 4888 } 4889 4890 abort: 4891 free(align); 4892 return ret; 4893 } 4894 4895 4896 /************************** 4897 *** Processing Block *** 4898 **************************/ 4899 4900 #define MD_BLOCK_CONTAINER_OPENER 0x01 4901 #define MD_BLOCK_CONTAINER_CLOSER 0x02 4902 #define MD_BLOCK_CONTAINER (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER) 4903 #define MD_BLOCK_LOOSE_LIST 0x04 4904 #define MD_BLOCK_SETEXT_HEADER 0x08 4905 4906 struct MD_BLOCK_tag { 4907 MD_BLOCKTYPE type : 8; 4908 unsigned flags : 8; 4909 4910 /* MD_BLOCK_H: Header level (1 - 6) 4911 * MD_BLOCK_CODE: Non-zero if fenced, zero if indented. 4912 * MD_BLOCK_LI: Task mark character (0 if not task list item, 'x', 'X' or ' '). 4913 * MD_BLOCK_TABLE: Column count (as determined by the table underline). 4914 */ 4915 unsigned data : 16; 4916 4917 /* Leaf blocks: Count of lines (MD_LINE or MD_VERBATIMLINE) on the block. 4918 * MD_BLOCK_LI: Task mark offset in the input doc. 4919 * MD_BLOCK_OL: Start item number. 
4920 */ 4921 unsigned n_lines; 4922 }; 4923 4924 struct MD_CONTAINER_tag { 4925 CHAR ch; 4926 unsigned is_loose : 8; 4927 unsigned is_task : 8; 4928 unsigned start; 4929 unsigned mark_indent; 4930 unsigned contents_indent; 4931 OFF block_byte_off; 4932 OFF task_mark_off; 4933 }; 4934 4935 4936 static int 4937 md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines) 4938 { 4939 int i; 4940 int ret; 4941 4942 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE)); 4943 MD_CHECK(md_process_inlines(ctx, lines, n_lines)); 4944 4945 abort: 4946 /* Free any temporary memory blocks stored within some dummy marks. */ 4947 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next) 4948 free(md_mark_get_ptr(ctx, i)); 4949 PTR_CHAIN.head = -1; 4950 PTR_CHAIN.tail = -1; 4951 4952 return ret; 4953 } 4954 4955 static int 4956 md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines) 4957 { 4958 static const CHAR indent_chunk_str[] = _T(" "); 4959 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1; 4960 4961 int i; 4962 int ret = 0; 4963 4964 for(i = 0; i < n_lines; i++) { 4965 const MD_VERBATIMLINE* line = &lines[i]; 4966 int indent = line->indent; 4967 4968 MD_ASSERT(indent >= 0); 4969 4970 /* Output code indentation. */ 4971 while(indent > (int) indent_chunk_size) { 4972 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size); 4973 indent -= indent_chunk_size; 4974 } 4975 if(indent > 0) 4976 MD_TEXT(text_type, indent_chunk_str, indent); 4977 4978 /* Output the code line itself. */ 4979 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg); 4980 4981 /* Enforce end-of-line. 
*/ 4982 MD_TEXT(text_type, _T("\n"), 1); 4983 } 4984 4985 abort: 4986 return ret; 4987 } 4988 4989 static int 4990 md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines) 4991 { 4992 if(is_fenced) { 4993 /* Skip the first line in case of fenced code: It is the fence. 4994 * (Only the starting fence is present due to logic in md_analyze_line().) */ 4995 lines++; 4996 n_lines--; 4997 } else { 4998 /* Ignore blank lines at start/end of indented code block. */ 4999 while(n_lines > 0 && lines[0].beg == lines[0].end) { 5000 lines++; 5001 n_lines--; 5002 } 5003 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) { 5004 n_lines--; 5005 } 5006 } 5007 5008 if(n_lines == 0) 5009 return 0; 5010 5011 return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines); 5012 } 5013 5014 static int 5015 md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det, 5016 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build) 5017 { 5018 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1); 5019 OFF beg = fence_line->beg; 5020 OFF end = fence_line->end; 5021 OFF lang_end; 5022 CHAR fence_ch = CH(fence_line->beg); 5023 int ret = 0; 5024 5025 /* Skip the fence itself. */ 5026 while(beg < ctx->size && CH(beg) == fence_ch) 5027 beg++; 5028 /* Trim initial spaces. */ 5029 while(beg < ctx->size && CH(beg) == _T(' ')) 5030 beg++; 5031 5032 /* Trim trailing spaces. */ 5033 while(end > beg && CH(end-1) == _T(' ')) 5034 end--; 5035 5036 /* Build info string attribute. */ 5037 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build)); 5038 5039 /* Build info string attribute. 
*/ 5040 lang_end = beg; 5041 while(lang_end < end && !ISWHITESPACE(lang_end)) 5042 lang_end++; 5043 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build)); 5044 5045 det->fence_char = fence_ch; 5046 5047 abort: 5048 return ret; 5049 } 5050 5051 static int 5052 md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) 5053 { 5054 union { 5055 MD_BLOCK_H_DETAIL header; 5056 MD_BLOCK_CODE_DETAIL code; 5057 MD_BLOCK_TABLE_DETAIL table; 5058 } det; 5059 MD_ATTRIBUTE_BUILD info_build; 5060 MD_ATTRIBUTE_BUILD lang_build; 5061 int is_in_tight_list; 5062 int clean_fence_code_detail = FALSE; 5063 int ret = 0; 5064 5065 memset(&det, 0, sizeof(det)); 5066 5067 if(ctx->n_containers == 0) 5068 is_in_tight_list = FALSE; 5069 else 5070 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose; 5071 5072 switch(block->type) { 5073 case MD_BLOCK_H: 5074 det.header.level = block->data; 5075 break; 5076 5077 case MD_BLOCK_CODE: 5078 /* For fenced code block, we may need to set the info string. */ 5079 if(block->data != 0) { 5080 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL)); 5081 clean_fence_code_detail = TRUE; 5082 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build)); 5083 } 5084 break; 5085 5086 case MD_BLOCK_TABLE: 5087 det.table.col_count = block->data; 5088 det.table.head_row_count = 1; 5089 det.table.body_row_count = block->n_lines - 2; 5090 break; 5091 5092 default: 5093 /* Noop. */ 5094 break; 5095 } 5096 5097 if(!is_in_tight_list || block->type != MD_BLOCK_P) 5098 MD_ENTER_BLOCK(block->type, (void*) &det); 5099 5100 /* Process the block contents accordingly to is type. 
*/ 5101 switch(block->type) { 5102 case MD_BLOCK_HR: 5103 /* noop */ 5104 break; 5105 5106 case MD_BLOCK_CODE: 5107 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0), 5108 (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); 5109 break; 5110 5111 case MD_BLOCK_HTML: 5112 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML, 5113 (const MD_VERBATIMLINE*)(block + 1), block->n_lines)); 5114 break; 5115 5116 case MD_BLOCK_TABLE: 5117 MD_CHECK(md_process_table_block_contents(ctx, block->data, 5118 (const MD_LINE*)(block + 1), block->n_lines)); 5119 break; 5120 5121 default: 5122 MD_CHECK(md_process_normal_block_contents(ctx, 5123 (const MD_LINE*)(block + 1), block->n_lines)); 5124 break; 5125 } 5126 5127 if(!is_in_tight_list || block->type != MD_BLOCK_P) 5128 MD_LEAVE_BLOCK(block->type, (void*) &det); 5129 5130 abort: 5131 if(clean_fence_code_detail) { 5132 md_free_attribute(ctx, &info_build); 5133 md_free_attribute(ctx, &lang_build); 5134 } 5135 return ret; 5136 } 5137 5138 static int 5139 md_process_all_blocks(MD_CTX* ctx) 5140 { 5141 int byte_off = 0; 5142 int ret = 0; 5143 5144 /* ctx->containers now is not needed for detection of lists and list items 5145 * so we reuse it for tracking what lists are loose or tight. We rely 5146 * on the fact the vector is large enough to hold the deepest nesting 5147 * level of lists. */ 5148 ctx->n_containers = 0; 5149 5150 while(byte_off < ctx->n_block_bytes) { 5151 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off); 5152 union { 5153 MD_BLOCK_UL_DETAIL ul; 5154 MD_BLOCK_OL_DETAIL ol; 5155 MD_BLOCK_LI_DETAIL li; 5156 } det; 5157 5158 switch(block->type) { 5159 case MD_BLOCK_UL: 5160 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE; 5161 det.ul.mark = (CHAR) block->data; 5162 break; 5163 5164 case MD_BLOCK_OL: 5165 det.ol.start = block->n_lines; 5166 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? 
FALSE : TRUE; 5167 det.ol.mark_delimiter = (CHAR) block->data; 5168 break; 5169 5170 case MD_BLOCK_LI: 5171 det.li.is_task = (block->data != 0); 5172 det.li.task_mark = (CHAR) block->data; 5173 det.li.task_mark_offset = (OFF) block->n_lines; 5174 break; 5175 5176 default: 5177 /* noop */ 5178 break; 5179 } 5180 5181 if(block->flags & MD_BLOCK_CONTAINER) { 5182 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) { 5183 MD_LEAVE_BLOCK(block->type, &det); 5184 5185 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE) 5186 ctx->n_containers--; 5187 } 5188 5189 if(block->flags & MD_BLOCK_CONTAINER_OPENER) { 5190 MD_ENTER_BLOCK(block->type, &det); 5191 5192 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) { 5193 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST); 5194 ctx->n_containers++; 5195 } else if(block->type == MD_BLOCK_QUOTE) { 5196 /* This causes that any text in a block quote, even if 5197 * nested inside a tight list item, is wrapped with 5198 * <p>...</p>. */ 5199 ctx->containers[ctx->n_containers].is_loose = TRUE; 5200 ctx->n_containers++; 5201 } 5202 } 5203 } else { 5204 MD_CHECK(md_process_leaf_block(ctx, block)); 5205 5206 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML) 5207 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE); 5208 else 5209 byte_off += block->n_lines * sizeof(MD_LINE); 5210 } 5211 5212 byte_off += sizeof(MD_BLOCK); 5213 } 5214 5215 ctx->n_block_bytes = 0; 5216 5217 abort: 5218 return ret; 5219 } 5220 5221 5222 /************************************ 5223 *** Grouping Lines into Blocks *** 5224 ************************************/ 5225 5226 static void* 5227 md_push_block_bytes(MD_CTX* ctx, int n_bytes) 5228 { 5229 void* ptr; 5230 5231 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) { 5232 void* new_block_bytes; 5233 5234 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0 5235 ? 
ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2 5236 : 512); 5237 new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes); 5238 if(new_block_bytes == NULL) { 5239 MD_LOG("realloc() failed."); 5240 return NULL; 5241 } 5242 5243 /* Fix the ->current_block after the reallocation. */ 5244 if(ctx->current_block != NULL) { 5245 OFF off_current_block = (OFF) ((char*) ctx->current_block - (char*) ctx->block_bytes); 5246 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block); 5247 } 5248 5249 ctx->block_bytes = new_block_bytes; 5250 } 5251 5252 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes; 5253 ctx->n_block_bytes += n_bytes; 5254 return ptr; 5255 } 5256 5257 static int 5258 md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line) 5259 { 5260 MD_BLOCK* block; 5261 5262 MD_ASSERT(ctx->current_block == NULL); 5263 5264 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK)); 5265 if(block == NULL) 5266 return -1; 5267 5268 switch(line->type) { 5269 case MD_LINE_HR: 5270 block->type = MD_BLOCK_HR; 5271 break; 5272 5273 case MD_LINE_ATXHEADER: 5274 case MD_LINE_SETEXTHEADER: 5275 block->type = MD_BLOCK_H; 5276 break; 5277 5278 case MD_LINE_FENCEDCODE: 5279 case MD_LINE_INDENTEDCODE: 5280 block->type = MD_BLOCK_CODE; 5281 break; 5282 5283 case MD_LINE_TEXT: 5284 block->type = MD_BLOCK_P; 5285 break; 5286 5287 case MD_LINE_HTML: 5288 block->type = MD_BLOCK_HTML; 5289 break; 5290 5291 case MD_LINE_BLANK: 5292 case MD_LINE_SETEXTUNDERLINE: 5293 case MD_LINE_TABLEUNDERLINE: 5294 default: 5295 MD_UNREACHABLE(); 5296 break; 5297 } 5298 5299 block->flags = 0; 5300 block->data = line->data; 5301 block->n_lines = 0; 5302 5303 ctx->current_block = block; 5304 return 0; 5305 } 5306 5307 /* Eat from start of current (textual) block any reference definitions and 5308 * remember them so we can resolve any links referring to them. 5309 * 5310 * (Reference definitions can only be at start of it as they cannot break 5311 * a paragraph.) 
5312 */ 5313 static int 5314 md_consume_link_reference_definitions(MD_CTX* ctx) 5315 { 5316 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); 5317 int n_lines = ctx->current_block->n_lines; 5318 int n = 0; 5319 5320 /* Compute how many lines at the start of the block form one or more 5321 * reference definitions. */ 5322 while(n < n_lines) { 5323 int n_link_ref_lines; 5324 5325 n_link_ref_lines = md_is_link_reference_definition(ctx, 5326 lines + n, n_lines - n); 5327 /* Not a reference definition? */ 5328 if(n_link_ref_lines == 0) 5329 break; 5330 5331 /* We fail if it is the ref. def. but it could not be stored due 5332 * a memory allocation error. */ 5333 if(n_link_ref_lines < 0) 5334 return -1; 5335 5336 n += n_link_ref_lines; 5337 } 5338 5339 /* If there was at least one reference definition, we need to remove 5340 * its lines from the block, or perhaps even the whole block. */ 5341 if(n > 0) { 5342 if(n == n_lines) { 5343 /* Remove complete block. */ 5344 ctx->n_block_bytes -= n * sizeof(MD_LINE); 5345 ctx->n_block_bytes -= sizeof(MD_BLOCK); 5346 ctx->current_block = NULL; 5347 } else { 5348 /* Remove just some initial lines from the block. */ 5349 memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE)); 5350 ctx->current_block->n_lines -= n; 5351 ctx->n_block_bytes -= n * sizeof(MD_LINE); 5352 } 5353 } 5354 5355 return 0; 5356 } 5357 5358 static int 5359 md_end_current_block(MD_CTX* ctx) 5360 { 5361 int ret = 0; 5362 5363 if(ctx->current_block == NULL) 5364 return ret; 5365 5366 /* Check whether there is a reference definition. (We do this here instead 5367 * of in md_analyze_line() because reference definition can take multiple 5368 * lines.) 
*/ 5369 if(ctx->current_block->type == MD_BLOCK_P || 5370 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER))) 5371 { 5372 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); 5373 if(CH(lines[0].beg) == _T('[')) { 5374 MD_CHECK(md_consume_link_reference_definitions(ctx)); 5375 if(ctx->current_block == NULL) 5376 return ret; 5377 } 5378 } 5379 5380 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) { 5381 int n_lines = ctx->current_block->n_lines; 5382 5383 if(n_lines > 1) { 5384 /* Get rid of the underline. */ 5385 ctx->current_block->n_lines--; 5386 ctx->n_block_bytes -= sizeof(MD_LINE); 5387 } else { 5388 /* Only the underline has left after eating the ref. defs. 5389 * Keep the line as beginning of a new ordinary paragraph. */ 5390 ctx->current_block->type = MD_BLOCK_P; 5391 return 0; 5392 } 5393 } 5394 5395 /* Mark we are not building any block anymore. */ 5396 ctx->current_block = NULL; 5397 5398 abort: 5399 return ret; 5400 } 5401 5402 static int 5403 md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis) 5404 { 5405 MD_ASSERT(ctx->current_block != NULL); 5406 5407 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) { 5408 MD_VERBATIMLINE* line; 5409 5410 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE)); 5411 if(line == NULL) 5412 return -1; 5413 5414 line->indent = analysis->indent; 5415 line->beg = analysis->beg; 5416 line->end = analysis->end; 5417 } else { 5418 MD_LINE* line; 5419 5420 line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE)); 5421 if(line == NULL) 5422 return -1; 5423 5424 line->beg = analysis->beg; 5425 line->end = analysis->end; 5426 } 5427 ctx->current_block->n_lines++; 5428 5429 return 0; 5430 } 5431 5432 static int 5433 md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start, 5434 unsigned data, unsigned flags) 5435 { 5436 MD_BLOCK* 
block; 5437 int ret = 0; 5438 5439 MD_CHECK(md_end_current_block(ctx)); 5440 5441 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK)); 5442 if(block == NULL) 5443 return -1; 5444 5445 block->type = type; 5446 block->flags = flags; 5447 block->data = data; 5448 block->n_lines = start; 5449 5450 abort: 5451 return ret; 5452 } 5453 5454 5455 5456 /*********************** 5457 *** Line Analysis *** 5458 ***********************/ 5459 5460 static int 5461 md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer) 5462 { 5463 OFF off = beg + 1; 5464 int n = 1; 5465 5466 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) { 5467 if(CH(off) == CH(beg)) 5468 n++; 5469 off++; 5470 } 5471 5472 if(n < 3) { 5473 *p_killer = off; 5474 return FALSE; 5475 } 5476 5477 /* Nothing else can be present on the line. */ 5478 if(off < ctx->size && !ISNEWLINE(off)) { 5479 *p_killer = off; 5480 return FALSE; 5481 } 5482 5483 *p_end = off; 5484 return TRUE; 5485 } 5486 5487 static int 5488 md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level) 5489 { 5490 int n; 5491 OFF off = beg + 1; 5492 5493 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7) 5494 off++; 5495 n = off - beg; 5496 5497 if(n > 6) 5498 return FALSE; 5499 *p_level = n; 5500 5501 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size && 5502 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off)) 5503 return FALSE; 5504 5505 while(off < ctx->size && CH(off) == _T(' ')) 5506 off++; 5507 *p_beg = off; 5508 *p_end = off; 5509 return TRUE; 5510 } 5511 5512 static int 5513 md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level) 5514 { 5515 OFF off = beg + 1; 5516 5517 while(off < ctx->size && CH(off) == CH(beg)) 5518 off++; 5519 5520 /* Optionally, space(s) can follow. */ 5521 while(off < ctx->size && CH(off) == _T(' ')) 5522 off++; 5523 5524 /* But nothing more is allowed on the line. 
*/ 5525 if(off < ctx->size && !ISNEWLINE(off)) 5526 return FALSE; 5527 5528 *p_level = (CH(beg) == _T('=') ? 1 : 2); 5529 *p_end = off; 5530 return TRUE; 5531 } 5532 5533 static int 5534 md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count) 5535 { 5536 OFF off = beg; 5537 int found_pipe = FALSE; 5538 unsigned col_count = 0; 5539 5540 if(off < ctx->size && CH(off) == _T('|')) { 5541 found_pipe = TRUE; 5542 off++; 5543 while(off < ctx->size && ISWHITESPACE(off)) 5544 off++; 5545 } 5546 5547 while(1) { 5548 int delimited = FALSE; 5549 5550 /* Cell underline ("-----", ":----", "----:" or ":----:") */ 5551 if(off < ctx->size && CH(off) == _T(':')) 5552 off++; 5553 if(off >= ctx->size || CH(off) != _T('-')) 5554 return FALSE; 5555 while(off < ctx->size && CH(off) == _T('-')) 5556 off++; 5557 if(off < ctx->size && CH(off) == _T(':')) 5558 off++; 5559 5560 col_count++; 5561 5562 /* Pipe delimiter (optional at the end of line). */ 5563 while(off < ctx->size && ISWHITESPACE(off)) 5564 off++; 5565 if(off < ctx->size && CH(off) == _T('|')) { 5566 delimited = TRUE; 5567 found_pipe = TRUE; 5568 off++; 5569 while(off < ctx->size && ISWHITESPACE(off)) 5570 off++; 5571 } 5572 5573 /* Success, if we reach end of line. */ 5574 if(off >= ctx->size || ISNEWLINE(off)) 5575 break; 5576 5577 if(!delimited) 5578 return FALSE; 5579 } 5580 5581 if(!found_pipe) 5582 return FALSE; 5583 5584 *p_end = off; 5585 *p_col_count = col_count; 5586 return TRUE; 5587 } 5588 5589 static int 5590 md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) 5591 { 5592 OFF off = beg; 5593 5594 while(off < ctx->size && CH(off) == CH(beg)) 5595 off++; 5596 5597 /* Fence must have at least three characters. */ 5598 if(off - beg < 3) 5599 return FALSE; 5600 5601 ctx->code_fence_length = off - beg; 5602 5603 /* Optionally, space(s) can follow. */ 5604 while(off < ctx->size && CH(off) == _T(' ')) 5605 off++; 5606 5607 /* Optionally, an info string can follow. 
*/ 5608 while(off < ctx->size && !ISNEWLINE(off)) { 5609 /* Backtick-based fence must not contain '`' in the info string. */ 5610 if(CH(beg) == _T('`') && CH(off) == _T('`')) 5611 return FALSE; 5612 off++; 5613 } 5614 5615 *p_end = off; 5616 return TRUE; 5617 } 5618 5619 static int 5620 md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end) 5621 { 5622 OFF off = beg; 5623 int ret = FALSE; 5624 5625 /* Closing fence must have at least the same length and use same char as 5626 * opening one. */ 5627 while(off < ctx->size && CH(off) == ch) 5628 off++; 5629 if(off - beg < ctx->code_fence_length) 5630 goto out; 5631 5632 /* Optionally, space(s) can follow */ 5633 while(off < ctx->size && CH(off) == _T(' ')) 5634 off++; 5635 5636 /* But nothing more is allowed on the line. */ 5637 if(off < ctx->size && !ISNEWLINE(off)) 5638 goto out; 5639 5640 ret = TRUE; 5641 5642 out: 5643 /* Note we set *p_end even on failure: If we are not closing fence, caller 5644 * would eat the line anyway without any parsing. */ 5645 *p_end = off; 5646 return ret; 5647 } 5648 5649 /* Returns type of the raw HTML block, or FALSE if it is not HTML block. 5650 * (Refer to CommonMark specification for details about the types.) 5651 */ 5652 static int 5653 md_is_html_block_start_condition(MD_CTX* ctx, OFF beg) 5654 { 5655 typedef struct TAG_tag TAG; 5656 struct TAG_tag { 5657 const CHAR* name; 5658 unsigned len : 8; 5659 }; 5660 5661 /* Type 6 is started by a long list of allowed tags. We use two-level 5662 * tree to speed-up the search. 
*/ 5663 #ifdef X 5664 #undef X 5665 #endif 5666 #define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) } 5667 #define Xend { NULL, 0 } 5668 static const TAG t1[] = { X("pre"), X("script"), X("style"), X("textarea"), Xend }; 5669 5670 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend }; 5671 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend }; 5672 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend }; 5673 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"), 5674 X("div"), X("dl"), X("dt"), Xend }; 5675 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"), 5676 X("form"), X("frame"), X("frameset"), Xend }; 5677 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend }; 5678 static const TAG i6[] = { X("iframe"), Xend }; 5679 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend }; 5680 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend }; 5681 static const TAG n6[] = { X("nav"), X("noframes"), Xend }; 5682 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend }; 5683 static const TAG p6[] = { X("p"), X("param"), Xend }; 5684 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend }; 5685 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"), 5686 X("thead"), X("title"), X("tr"), X("track"), Xend }; 5687 static const TAG u6[] = { X("ul"), Xend }; 5688 static const TAG xx[] = { Xend }; 5689 #undef X 5690 5691 static const TAG* map6[26] = { 5692 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6, 5693 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx 5694 }; 5695 OFF off = beg + 1; 5696 int i; 5697 5698 /* Check for type 1: <script, <pre, or <style */ 5699 for(i = 0; t1[i].name != NULL; i++) { 5700 if(off + t1[i].len <= ctx->size) { 5701 if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len)) 5702 return 1; 5703 } 5704 } 
5705 5706 /* Check for type 2: <!-- */ 5707 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-')) 5708 return 2; 5709 5710 /* Check for type 3: <? */ 5711 if(off < ctx->size && CH(off) == _T('?')) 5712 return 3; 5713 5714 /* Check for type 4 or 5: <! */ 5715 if(off < ctx->size && CH(off) == _T('!')) { 5716 /* Check for type 4: <! followed by uppercase letter. */ 5717 if(off + 1 < ctx->size && ISASCII(off+1)) 5718 return 4; 5719 5720 /* Check for type 5: <![CDATA[ */ 5721 if(off + 8 < ctx->size) { 5722 if(md_ascii_eq(STR(off), _T("![CDATA["), 8)) 5723 return 5; 5724 } 5725 } 5726 5727 /* Check for type 6: Many possible starting tags listed above. */ 5728 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) { 5729 int slot; 5730 const TAG* tags; 5731 5732 if(CH(off) == _T('/')) 5733 off++; 5734 5735 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a'); 5736 tags = map6[slot]; 5737 5738 for(i = 0; tags[i].name != NULL; i++) { 5739 if(off + tags[i].len <= ctx->size) { 5740 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) { 5741 OFF tmp = off + tags[i].len; 5742 if(tmp >= ctx->size) 5743 return 6; 5744 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>')) 5745 return 6; 5746 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>')) 5747 return 6; 5748 break; 5749 } 5750 } 5751 } 5752 } 5753 5754 /* Check for type 7: any COMPLETE other opening or closing tag. */ 5755 if(off + 1 < ctx->size) { 5756 OFF end; 5757 5758 if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) { 5759 /* Only optional whitespace and new line may follow. */ 5760 while(end < ctx->size && ISWHITESPACE(end)) 5761 end++; 5762 if(end >= ctx->size || ISNEWLINE(end)) 5763 return 7; 5764 } 5765 } 5766 5767 return FALSE; 5768 } 5769 5770 /* Case sensitive check whether there is a substring 'what' between 'beg' 5771 * and end of line. 
*/ 5772 static int 5773 md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end) 5774 { 5775 OFF i; 5776 for(i = beg; i + what_len < ctx->size; i++) { 5777 if(ISNEWLINE(i)) 5778 break; 5779 if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) { 5780 *p_end = i + what_len; 5781 return TRUE; 5782 } 5783 } 5784 5785 *p_end = i; 5786 return FALSE; 5787 } 5788 5789 /* Returns type of HTML block end condition or FALSE if not an end condition. 5790 * 5791 * Note it fills p_end even when it is not end condition as the caller 5792 * does not need to analyze contents of a raw HTML block. 5793 */ 5794 static int 5795 md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end) 5796 { 5797 switch(ctx->html_block_type) { 5798 case 1: 5799 { 5800 OFF off = beg; 5801 5802 while(off < ctx->size && !ISNEWLINE(off)) { 5803 if(CH(off) == _T('<')) { 5804 #define FIND_TAG_END(string, length) \ 5805 if(off + length <= ctx->size && \ 5806 md_ascii_case_eq(STR(off), _T(string), length)) { \ 5807 *p_end = off + length; \ 5808 return TRUE; \ 5809 } 5810 FIND_TAG_END("</script>", 9) 5811 FIND_TAG_END("</style>", 8) 5812 FIND_TAG_END("</pre>", 6) 5813 #undef FIND_TAG_END 5814 } 5815 5816 off++; 5817 } 5818 *p_end = off; 5819 return FALSE; 5820 } 5821 5822 case 2: 5823 return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE); 5824 5825 case 3: 5826 return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE); 5827 5828 case 4: 5829 return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE); 5830 5831 case 5: 5832 return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE); 5833 5834 case 6: /* Pass through */ 5835 case 7: 5836 *p_end = beg; 5837 return (beg >= ctx->size || ISNEWLINE(beg) ? 
ctx->html_block_type : FALSE); 5838 5839 default: 5840 MD_UNREACHABLE(); 5841 } 5842 return FALSE; 5843 } 5844 5845 5846 static int 5847 md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container) 5848 { 5849 /* Block quote has no "items" like lists. */ 5850 if(container->ch == _T('>')) 5851 return FALSE; 5852 5853 if(container->ch != pivot->ch) 5854 return FALSE; 5855 if(container->mark_indent > pivot->contents_indent) 5856 return FALSE; 5857 5858 return TRUE; 5859 } 5860 5861 static int 5862 md_push_container(MD_CTX* ctx, const MD_CONTAINER* container) 5863 { 5864 if(ctx->n_containers >= ctx->alloc_containers) { 5865 MD_CONTAINER* new_containers; 5866 5867 ctx->alloc_containers = (ctx->alloc_containers > 0 5868 ? ctx->alloc_containers + ctx->alloc_containers / 2 5869 : 16); 5870 new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER)); 5871 if(new_containers == NULL) { 5872 MD_LOG("realloc() failed."); 5873 return -1; 5874 } 5875 5876 ctx->containers = new_containers; 5877 } 5878 5879 memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER)); 5880 return 0; 5881 } 5882 5883 static int 5884 md_enter_child_containers(MD_CTX* ctx, int n_children) 5885 { 5886 int i; 5887 int ret = 0; 5888 5889 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) { 5890 MD_CONTAINER* c = &ctx->containers[i]; 5891 int is_ordered_list = FALSE; 5892 5893 switch(c->ch) { 5894 case _T(')'): 5895 case _T('.'): 5896 is_ordered_list = TRUE; 5897 MD_FALLTHROUGH(); 5898 5899 case _T('-'): 5900 case _T('+'): 5901 case _T('*'): 5902 /* Remember offset in ctx->block_bytes so we can revisit the 5903 * block if we detect it is a loose list. */ 5904 md_end_current_block(ctx); 5905 c->block_byte_off = ctx->n_block_bytes; 5906 5907 MD_CHECK(md_push_container_bytes(ctx, 5908 (is_ordered_list ? 
MD_BLOCK_OL : MD_BLOCK_UL), 5909 c->start, c->ch, MD_BLOCK_CONTAINER_OPENER)); 5910 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, 5911 c->task_mark_off, 5912 (c->is_task ? CH(c->task_mark_off) : 0), 5913 MD_BLOCK_CONTAINER_OPENER)); 5914 break; 5915 5916 case _T('>'): 5917 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER)); 5918 break; 5919 5920 default: 5921 MD_UNREACHABLE(); 5922 break; 5923 } 5924 } 5925 5926 abort: 5927 return ret; 5928 } 5929 5930 static int 5931 md_leave_child_containers(MD_CTX* ctx, int n_keep) 5932 { 5933 int ret = 0; 5934 5935 while(ctx->n_containers > n_keep) { 5936 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1]; 5937 int is_ordered_list = FALSE; 5938 5939 switch(c->ch) { 5940 case _T(')'): 5941 case _T('.'): 5942 is_ordered_list = TRUE; 5943 MD_FALLTHROUGH(); 5944 5945 case _T('-'): 5946 case _T('+'): 5947 case _T('*'): 5948 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, 5949 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0), 5950 MD_BLOCK_CONTAINER_CLOSER)); 5951 MD_CHECK(md_push_container_bytes(ctx, 5952 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0, 5953 c->ch, MD_BLOCK_CONTAINER_CLOSER)); 5954 break; 5955 5956 case _T('>'): 5957 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 5958 0, MD_BLOCK_CONTAINER_CLOSER)); 5959 break; 5960 5961 default: 5962 MD_UNREACHABLE(); 5963 break; 5964 } 5965 5966 ctx->n_containers--; 5967 } 5968 5969 abort: 5970 return ret; 5971 } 5972 5973 static int 5974 md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container) 5975 { 5976 OFF off = beg; 5977 OFF max_end; 5978 5979 if(off >= ctx->size || indent >= ctx->code_indent_offset) 5980 return FALSE; 5981 5982 /* Check for block quote mark. 
*/ 5983 if(CH(off) == _T('>')) { 5984 off++; 5985 p_container->ch = _T('>'); 5986 p_container->is_loose = FALSE; 5987 p_container->is_task = FALSE; 5988 p_container->mark_indent = indent; 5989 p_container->contents_indent = indent + 1; 5990 *p_end = off; 5991 return TRUE; 5992 } 5993 5994 /* Check for list item bullet mark. */ 5995 if(ISANYOF(off, _T("-+*")) && (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) { 5996 p_container->ch = CH(off); 5997 p_container->is_loose = FALSE; 5998 p_container->is_task = FALSE; 5999 p_container->mark_indent = indent; 6000 p_container->contents_indent = indent + 1; 6001 *p_end = off+1; 6002 return TRUE; 6003 } 6004 6005 /* Check for ordered list item marks. */ 6006 max_end = off + 9; 6007 if(max_end > ctx->size) 6008 max_end = ctx->size; 6009 p_container->start = 0; 6010 while(off < max_end && ISDIGIT(off)) { 6011 p_container->start = p_container->start * 10 + CH(off) - _T('0'); 6012 off++; 6013 } 6014 if(off > beg && 6015 off < ctx->size && 6016 (CH(off) == _T('.') || CH(off) == _T(')')) && 6017 (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) 6018 { 6019 p_container->ch = CH(off); 6020 p_container->is_loose = FALSE; 6021 p_container->is_task = FALSE; 6022 p_container->mark_indent = indent; 6023 p_container->contents_indent = indent + off - beg + 1; 6024 *p_end = off+1; 6025 return TRUE; 6026 } 6027 6028 return FALSE; 6029 } 6030 6031 static unsigned 6032 md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end) 6033 { 6034 OFF off = beg; 6035 unsigned indent = total_indent; 6036 6037 while(off < ctx->size && ISBLANK(off)) { 6038 if(CH(off) == _T('\t')) 6039 indent = (indent + 4) & ~3; 6040 else 6041 indent++; 6042 off++; 6043 } 6044 6045 *p_end = off; 6046 return indent - total_indent; 6047 } 6048 6049 static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0 }; 6050 6051 /* Analyze type of the line and find some its properties. 
This serves as a 6052 * main input for determining type and boundaries of a block. */ 6053 static int 6054 md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, 6055 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line) 6056 { 6057 unsigned total_indent = 0; 6058 int n_parents = 0; 6059 int n_brothers = 0; 6060 int n_children = 0; 6061 MD_CONTAINER container = { 0 }; 6062 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect; 6063 OFF off = beg; 6064 OFF hr_killer = 0; 6065 int ret = 0; 6066 6067 line->indent = md_line_indentation(ctx, total_indent, off, &off); 6068 total_indent += line->indent; 6069 line->beg = off; 6070 6071 /* Given the indentation and block quote marks '>', determine how many of 6072 * the current containers are our parents. */ 6073 while(n_parents < ctx->n_containers) { 6074 MD_CONTAINER* c = &ctx->containers[n_parents]; 6075 6076 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset && 6077 off < ctx->size && CH(off) == _T('>')) 6078 { 6079 /* Block quote mark. */ 6080 off++; 6081 total_indent++; 6082 line->indent = md_line_indentation(ctx, total_indent, off, &off); 6083 total_indent += line->indent; 6084 6085 /* The optional 1st space after '>' is part of the block quote mark. */ 6086 if(line->indent > 0) 6087 line->indent--; 6088 6089 line->beg = off; 6090 6091 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) { 6092 /* List. */ 6093 line->indent -= c->contents_indent; 6094 } else { 6095 break; 6096 } 6097 6098 n_parents++; 6099 } 6100 6101 if(off >= ctx->size || ISNEWLINE(off)) { 6102 /* Blank line does not need any real indentation to be nested inside 6103 * a list. */ 6104 if(n_brothers + n_children == 0) { 6105 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>')) 6106 n_parents++; 6107 } 6108 } 6109 6110 while(TRUE) { 6111 /* Check whether we are fenced code continuation. 
*/ 6112 if(pivot_line->type == MD_LINE_FENCEDCODE) { 6113 line->beg = off; 6114 6115 /* We are another MD_LINE_FENCEDCODE unless we are closing fence 6116 * which we transform into MD_LINE_BLANK. */ 6117 if(line->indent < ctx->code_indent_offset) { 6118 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) { 6119 line->type = MD_LINE_BLANK; 6120 ctx->last_line_has_list_loosening_effect = FALSE; 6121 break; 6122 } 6123 } 6124 6125 /* Change indentation accordingly to the initial code fence. */ 6126 if(n_parents == ctx->n_containers) { 6127 if(line->indent > pivot_line->indent) 6128 line->indent -= pivot_line->indent; 6129 else 6130 line->indent = 0; 6131 6132 line->type = MD_LINE_FENCEDCODE; 6133 break; 6134 } 6135 } 6136 6137 /* Check whether we are HTML block continuation. */ 6138 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) { 6139 if(n_parents < ctx->n_containers) { 6140 /* HTML block is implicitly ended if the enclosing container 6141 * block ends. */ 6142 ctx->html_block_type = 0; 6143 } else { 6144 int html_block_type; 6145 6146 html_block_type = md_is_html_block_end_condition(ctx, off, &off); 6147 if(html_block_type > 0) { 6148 MD_ASSERT(html_block_type == ctx->html_block_type); 6149 6150 /* Make sure this is the last line of the block. */ 6151 ctx->html_block_type = 0; 6152 6153 /* Some end conditions serve as blank lines at the same time. */ 6154 if(html_block_type == 6 || html_block_type == 7) { 6155 line->type = MD_LINE_BLANK; 6156 line->indent = 0; 6157 break; 6158 } 6159 } 6160 6161 line->type = MD_LINE_HTML; 6162 n_parents = ctx->n_containers; 6163 break; 6164 } 6165 } 6166 6167 /* Check for blank line. 
*/ 6168 if(off >= ctx->size || ISNEWLINE(off)) { 6169 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) { 6170 line->type = MD_LINE_INDENTEDCODE; 6171 if(line->indent > ctx->code_indent_offset) 6172 line->indent -= ctx->code_indent_offset; 6173 else 6174 line->indent = 0; 6175 ctx->last_line_has_list_loosening_effect = FALSE; 6176 } else { 6177 line->type = MD_LINE_BLANK; 6178 ctx->last_line_has_list_loosening_effect = (n_parents > 0 && 6179 n_brothers + n_children == 0 && 6180 ctx->containers[n_parents-1].ch != _T('>')); 6181 6182 #if 1 6183 /* See https://github.com/mity/md4c/issues/6 6184 * 6185 * This ugly checking tests we are in (yet empty) list item but 6186 * not its very first line (i.e. not the line with the list 6187 * item mark). 6188 * 6189 * If we are such a blank line, then any following non-blank 6190 * line which would be part of the list item actually has to 6191 * end the list because according to the specification, "a list 6192 * item can begin with at most one blank line." 6193 */ 6194 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') && 6195 n_brothers + n_children == 0 && ctx->current_block == NULL && 6196 ctx->n_block_bytes > (int) sizeof(MD_BLOCK)) 6197 { 6198 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK)); 6199 if(top_block->type == MD_BLOCK_LI) 6200 ctx->last_list_item_starts_with_two_blank_lines = TRUE; 6201 } 6202 #endif 6203 } 6204 break; 6205 } else { 6206 #if 1 6207 /* This is the 2nd half of the hack. If the flag is set (i.e. there 6208 * was a 2nd blank line at the beginning of the list item) and if 6209 * we would otherwise still belong to the list item, we enforce 6210 * the end of the list. 
*/ 6211 ctx->last_line_has_list_loosening_effect = FALSE; 6212 if(ctx->last_list_item_starts_with_two_blank_lines) { 6213 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') && 6214 n_brothers + n_children == 0 && ctx->current_block == NULL && 6215 ctx->n_block_bytes > (int) sizeof(MD_BLOCK)) 6216 { 6217 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK)); 6218 if(top_block->type == MD_BLOCK_LI) 6219 n_parents--; 6220 } 6221 6222 ctx->last_list_item_starts_with_two_blank_lines = FALSE; 6223 } 6224 #endif 6225 } 6226 6227 /* Check whether we are Setext underline. */ 6228 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT 6229 && off < ctx->size && ISANYOF2(off, _T('='), _T('-')) 6230 && (n_parents == ctx->n_containers)) 6231 { 6232 unsigned level; 6233 6234 if(md_is_setext_underline(ctx, off, &off, &level)) { 6235 line->type = MD_LINE_SETEXTUNDERLINE; 6236 line->data = level; 6237 break; 6238 } 6239 } 6240 6241 /* Check for thematic break line. */ 6242 if(line->indent < ctx->code_indent_offset 6243 && off < ctx->size && off >= hr_killer 6244 && ISANYOF(off, _T("-_*"))) 6245 { 6246 if(md_is_hr_line(ctx, off, &off, &hr_killer)) { 6247 line->type = MD_LINE_HR; 6248 break; 6249 } 6250 } 6251 6252 /* Check for "brother" container. I.e. whether we are another list item 6253 * in already started list. */ 6254 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) { 6255 OFF tmp; 6256 6257 if(md_is_container_mark(ctx, line->indent, off, &tmp, &container) && 6258 md_is_container_compatible(&ctx->containers[n_parents], &container)) 6259 { 6260 pivot_line = &md_dummy_blank_line; 6261 6262 off = tmp; 6263 6264 total_indent += container.contents_indent - container.mark_indent; 6265 line->indent = md_line_indentation(ctx, total_indent, off, &off); 6266 total_indent += line->indent; 6267 line->beg = off; 6268 6269 /* Some of the following whitespace actually still belongs to the mark. 
*/ 6270 if(off >= ctx->size || ISNEWLINE(off)) { 6271 container.contents_indent++; 6272 } else if(line->indent <= ctx->code_indent_offset) { 6273 container.contents_indent += line->indent; 6274 line->indent = 0; 6275 } else { 6276 container.contents_indent += 1; 6277 line->indent--; 6278 } 6279 6280 ctx->containers[n_parents].mark_indent = container.mark_indent; 6281 ctx->containers[n_parents].contents_indent = container.contents_indent; 6282 6283 n_brothers++; 6284 continue; 6285 } 6286 } 6287 6288 /* Check for indented code. 6289 * Note indented code block cannot interrupt a paragraph. */ 6290 if(line->indent >= ctx->code_indent_offset && 6291 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE)) 6292 { 6293 line->type = MD_LINE_INDENTEDCODE; 6294 MD_ASSERT(line->indent >= ctx->code_indent_offset); 6295 line->indent -= ctx->code_indent_offset; 6296 line->data = 0; 6297 break; 6298 } 6299 6300 /* Check for start of a new container block. */ 6301 if(line->indent < ctx->code_indent_offset && 6302 md_is_container_mark(ctx, line->indent, off, &off, &container)) 6303 { 6304 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers && 6305 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>')) 6306 { 6307 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */ 6308 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers && 6309 ISANYOF2_(container.ch, _T('.'), _T(')')) && container.start != 1) 6310 { 6311 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */ 6312 } else { 6313 total_indent += container.contents_indent - container.mark_indent; 6314 line->indent = md_line_indentation(ctx, total_indent, off, &off); 6315 total_indent += line->indent; 6316 6317 line->beg = off; 6318 line->data = container.ch; 6319 6320 /* Some of the following whitespace actually still belongs to the mark. 
*/ 6321 if(off >= ctx->size || ISNEWLINE(off)) { 6322 container.contents_indent++; 6323 } else if(line->indent <= ctx->code_indent_offset) { 6324 container.contents_indent += line->indent; 6325 line->indent = 0; 6326 } else { 6327 container.contents_indent += 1; 6328 line->indent--; 6329 } 6330 6331 if(n_brothers + n_children == 0) 6332 pivot_line = &md_dummy_blank_line; 6333 6334 if(n_children == 0) 6335 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers)); 6336 6337 n_children++; 6338 MD_CHECK(md_push_container(ctx, &container)); 6339 continue; 6340 } 6341 } 6342 6343 /* Check whether we are table continuation. */ 6344 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) { 6345 line->type = MD_LINE_TABLE; 6346 break; 6347 } 6348 6349 /* Check for ATX header. */ 6350 if(line->indent < ctx->code_indent_offset && 6351 off < ctx->size && CH(off) == _T('#')) 6352 { 6353 unsigned level; 6354 6355 if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) { 6356 line->type = MD_LINE_ATXHEADER; 6357 line->data = level; 6358 break; 6359 } 6360 } 6361 6362 /* Check whether we are starting code fence. */ 6363 if(off < ctx->size && ISANYOF2(off, _T('`'), _T('~'))) { 6364 if(md_is_opening_code_fence(ctx, off, &off)) { 6365 line->type = MD_LINE_FENCEDCODE; 6366 line->data = 1; 6367 break; 6368 } 6369 } 6370 6371 /* Check for start of raw HTML block. */ 6372 if(off < ctx->size && CH(off) == _T('<') 6373 && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS)) 6374 { 6375 ctx->html_block_type = md_is_html_block_start_condition(ctx, off); 6376 6377 /* HTML block type 7 cannot interrupt paragraph. */ 6378 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT) 6379 ctx->html_block_type = 0; 6380 6381 if(ctx->html_block_type > 0) { 6382 /* The line itself also may immediately close the block. */ 6383 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) { 6384 /* Make sure this is the last line of the block. 
*/ 6385 ctx->html_block_type = 0; 6386 } 6387 6388 line->type = MD_LINE_HTML; 6389 break; 6390 } 6391 } 6392 6393 /* Check for table underline. */ 6394 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT 6395 && off < ctx->size && ISANYOF3(off, _T('|'), _T('-'), _T(':')) 6396 && n_parents == ctx->n_containers) 6397 { 6398 unsigned col_count; 6399 6400 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 && 6401 md_is_table_underline(ctx, off, &off, &col_count)) 6402 { 6403 line->data = col_count; 6404 line->type = MD_LINE_TABLEUNDERLINE; 6405 break; 6406 } 6407 } 6408 6409 /* By default, we are normal text line. */ 6410 line->type = MD_LINE_TEXT; 6411 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) { 6412 /* Lazy continuation. */ 6413 n_parents = ctx->n_containers; 6414 } 6415 6416 /* Check for task mark. */ 6417 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 && 6418 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)"))) 6419 { 6420 OFF tmp = off; 6421 6422 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp)) 6423 tmp++; 6424 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') && 6425 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') && 6426 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3))) 6427 { 6428 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container); 6429 task_container->is_task = TRUE; 6430 task_container->task_mark_off = tmp + 1; 6431 off = tmp + 3; 6432 while(off < ctx->size && ISWHITESPACE(off)) 6433 off++; 6434 if (off == ctx->size) break; 6435 line->beg = off; 6436 } 6437 } 6438 6439 break; 6440 } 6441 6442 /* Scan for end of the line. 6443 * 6444 * Note this is quite a bottleneck of the parsing as we here iterate almost 6445 * over compete document. 
6446 */ 6447 #if defined __linux__ && !defined MD4C_USE_UTF16 6448 /* Recent glibc versions have superbly optimized strcspn(), even using 6449 * vectorization if available. */ 6450 if(ctx->doc_ends_with_newline && off < ctx->size) { 6451 while(TRUE) { 6452 off += (OFF) strcspn(STR(off), "\r\n"); 6453 6454 /* strcspn() can stop on zero terminator; but that can appear 6455 * anywhere in the Markfown input... */ 6456 if(CH(off) == _T('\0')) 6457 off++; 6458 else 6459 break; 6460 } 6461 } else 6462 #endif 6463 { 6464 /* Optimization: Use some loop unrolling. */ 6465 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1) 6466 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3)) 6467 off += 4; 6468 while(off < ctx->size && !ISNEWLINE(off)) 6469 off++; 6470 } 6471 6472 /* Set end of the line. */ 6473 line->end = off; 6474 6475 /* But for ATX header, we should exclude the optional trailing mark. */ 6476 if(line->type == MD_LINE_ATXHEADER) { 6477 OFF tmp = line->end; 6478 while(tmp > line->beg && CH(tmp-1) == _T(' ')) 6479 tmp--; 6480 while(tmp > line->beg && CH(tmp-1) == _T('#')) 6481 tmp--; 6482 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)) 6483 line->end = tmp; 6484 } 6485 6486 /* Trim trailing spaces. */ 6487 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) { 6488 while(line->end > line->beg && CH(line->end-1) == _T(' ')) 6489 line->end--; 6490 } 6491 6492 /* Eat also the new line. */ 6493 if(off < ctx->size && CH(off) == _T('\r')) 6494 off++; 6495 if(off < ctx->size && CH(off) == _T('\n')) 6496 off++; 6497 6498 *p_end = off; 6499 6500 /* If we belong to a list after seeing a blank line, the list is loose. 
*/ 6501 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) { 6502 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1]; 6503 if(c->ch != _T('>')) { 6504 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off); 6505 block->flags |= MD_BLOCK_LOOSE_LIST; 6506 } 6507 } 6508 6509 /* Leave any containers we are not part of anymore. */ 6510 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers) 6511 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers)); 6512 6513 /* Enter any container we found a mark for. */ 6514 if(n_brothers > 0) { 6515 MD_ASSERT(n_brothers == 1); 6516 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, 6517 ctx->containers[n_parents].task_mark_off, 6518 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0), 6519 MD_BLOCK_CONTAINER_CLOSER)); 6520 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI, 6521 container.task_mark_off, 6522 (container.is_task ? CH(container.task_mark_off) : 0), 6523 MD_BLOCK_CONTAINER_OPENER)); 6524 ctx->containers[n_parents].is_task = container.is_task; 6525 ctx->containers[n_parents].task_mark_off = container.task_mark_off; 6526 } 6527 6528 if(n_children > 0) 6529 MD_CHECK(md_enter_child_containers(ctx, n_children)); 6530 6531 abort: 6532 return ret; 6533 } 6534 6535 static int 6536 md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line) 6537 { 6538 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line; 6539 int ret = 0; 6540 6541 /* Blank line ends current leaf block. */ 6542 if(line->type == MD_LINE_BLANK) { 6543 MD_CHECK(md_end_current_block(ctx)); 6544 *p_pivot_line = &md_dummy_blank_line; 6545 return 0; 6546 } 6547 6548 /* Some line types form block on their own. */ 6549 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) { 6550 MD_CHECK(md_end_current_block(ctx)); 6551 6552 /* Add our single-line block. 
*/ 6553 MD_CHECK(md_start_new_block(ctx, line)); 6554 MD_CHECK(md_add_line_into_current_block(ctx, line)); 6555 MD_CHECK(md_end_current_block(ctx)); 6556 *p_pivot_line = &md_dummy_blank_line; 6557 return 0; 6558 } 6559 6560 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */ 6561 if(line->type == MD_LINE_SETEXTUNDERLINE) { 6562 MD_ASSERT(ctx->current_block != NULL); 6563 ctx->current_block->type = MD_BLOCK_H; 6564 ctx->current_block->data = line->data; 6565 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER; 6566 MD_CHECK(md_add_line_into_current_block(ctx, line)); 6567 MD_CHECK(md_end_current_block(ctx)); 6568 if(ctx->current_block == NULL) { 6569 *p_pivot_line = &md_dummy_blank_line; 6570 } else { 6571 /* This happens if we have consumed all the body as link ref. defs. 6572 * and downgraded the underline into start of a new paragraph block. */ 6573 line->type = MD_LINE_TEXT; 6574 *p_pivot_line = line; 6575 } 6576 return 0; 6577 } 6578 6579 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */ 6580 if(line->type == MD_LINE_TABLEUNDERLINE) { 6581 MD_ASSERT(ctx->current_block != NULL); 6582 MD_ASSERT(ctx->current_block->n_lines == 1); 6583 ctx->current_block->type = MD_BLOCK_TABLE; 6584 ctx->current_block->data = line->data; 6585 MD_ASSERT(pivot_line != &md_dummy_blank_line); 6586 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE; 6587 MD_CHECK(md_add_line_into_current_block(ctx, line)); 6588 return 0; 6589 } 6590 6591 /* The current block also ends if the line has different type. */ 6592 if(line->type != pivot_line->type) 6593 MD_CHECK(md_end_current_block(ctx)); 6594 6595 /* The current line may start a new block. */ 6596 if(ctx->current_block == NULL) { 6597 MD_CHECK(md_start_new_block(ctx, line)); 6598 *p_pivot_line = line; 6599 } 6600 6601 /* In all other cases the line is just a continuation of the current block. 
*/ 6602 MD_CHECK(md_add_line_into_current_block(ctx, line)); 6603 6604 abort: 6605 return ret; 6606 } 6607 6608 static int 6609 md_process_doc(MD_CTX *ctx) 6610 { 6611 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line; 6612 MD_LINE_ANALYSIS line_buf[2]; 6613 MD_LINE_ANALYSIS* line = &line_buf[0]; 6614 OFF off = 0; 6615 int ret = 0; 6616 6617 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL); 6618 6619 while(off < ctx->size) { 6620 if(line == pivot_line) 6621 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]); 6622 6623 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line)); 6624 MD_CHECK(md_process_line(ctx, &pivot_line, line)); 6625 } 6626 6627 md_end_current_block(ctx); 6628 6629 MD_CHECK(md_build_ref_def_hashtable(ctx)); 6630 6631 /* Process all blocks. */ 6632 MD_CHECK(md_leave_child_containers(ctx, 0)); 6633 MD_CHECK(md_process_all_blocks(ctx)); 6634 6635 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL); 6636 6637 abort: 6638 6639 #if 0 6640 /* Output some memory consumption statistics. */ 6641 { 6642 char buffer[256]; 6643 sprintf(buffer, "Alloced %u bytes for block buffer.", 6644 (unsigned)(ctx->alloc_block_bytes)); 6645 MD_LOG(buffer); 6646 6647 sprintf(buffer, "Alloced %u bytes for containers buffer.", 6648 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER))); 6649 MD_LOG(buffer); 6650 6651 sprintf(buffer, "Alloced %u bytes for marks buffer.", 6652 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK))); 6653 MD_LOG(buffer); 6654 6655 sprintf(buffer, "Alloced %u bytes for aux. buffer.", 6656 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR))); 6657 MD_LOG(buffer); 6658 } 6659 #endif 6660 6661 return ret; 6662 } 6663 6664 6665 /******************** 6666 *** Public API *** 6667 ********************/ 6668 6669 int 6670 md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata) 6671 { 6672 MD_CTX ctx = {.text = text, 6673 .size = size, 6674 .userdata = userdata, 6675 .code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? 
(OFF)(-1) : 4, 6676 .doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]))}; 6677 int i; 6678 int ret; 6679 6680 if(parser->abi_version != 0) { 6681 if(parser->debug_log != NULL) 6682 parser->debug_log("Unsupported abi_version.", userdata); 6683 return -1; 6684 } 6685 6686 /* Setup context structure. */ 6687 memcpy(&ctx.parser, parser, sizeof(MD_PARSER)); 6688 md_build_mark_char_map(&ctx); 6689 6690 /* Reset all unresolved opener mark chains. */ 6691 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) { 6692 ctx.mark_chains[i].head = -1; 6693 ctx.mark_chains[i].tail = -1; 6694 } 6695 ctx.unresolved_link_head = -1; 6696 ctx.unresolved_link_tail = -1; 6697 6698 /* All the work. */ 6699 ret = md_process_doc(&ctx); 6700 6701 /* Clean-up. */ 6702 md_free_ref_defs(&ctx); 6703 md_free_ref_def_hashtable(&ctx); 6704 free(ctx.buffer); 6705 free(ctx.marks); 6706 free(ctx.block_bytes); 6707 free(ctx.containers); 6708 6709 return ret; 6710 }