md

cat markdown files with syntax highlighting
git clone https://noulin.net/git/md.git
Log | Files | Refs | README | LICENSE

md4c.h (15511B)


      1 /*
      2  * MD4C: Markdown parser for C
      3  * (http://github.com/mity/md4c)
      4  *
      5  * Copyright (c) 2016-2020 Martin Mitas
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the "Software"),
      9  * to deal in the Software without restriction, including without limitation
     10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     11  * and/or sell copies of the Software, and to permit persons to whom the
     12  * Software is furnished to do so, subject to the following conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be included in
     15  * all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     23  * IN THE SOFTWARE.
     24  */
     25 
     26 #ifndef MD4C_H
     27 #define MD4C_H
     28 
     29 #ifdef __cplusplus
     30     extern "C" {
     31 #endif
     32 
     33 #if defined MD4C_USE_UTF16
     34     /* Magic to support UTF-16. Note that in order to use it, you have to define
     35      * the macro MD4C_USE_UTF16 both when building MD4C and when including this
     36      * header in your code. MD_CHAR is then WCHAR instead of char. */
     37     #ifdef _WIN32
     38         #include <windows.h>
     39         typedef WCHAR       MD_CHAR;
     40     #else
     41         #error MD4C_USE_UTF16 is only supported on Windows.
     42     #endif
     43 #else
     44     typedef char            MD_CHAR;
     45 #endif
     46 
     47 typedef unsigned MD_SIZE;    /* Type of string sizes (e.g. md_parse() 'size', MD_ATTRIBUTE::size). */
     48 typedef unsigned MD_OFFSET;  /* Type of offsets (e.g. MD_ATTRIBUTE::substr_offsets). */
     49 
     50 
     51 /* A block represents a part of the document's hierarchical structure, such as
     52  * a paragraph or a list item.
     53  */
     54 typedef enum MD_BLOCKTYPE {
     55     /* <body>...</body> */
     56     MD_BLOCK_DOC = 0,
     57 
     58     /* <blockquote>...</blockquote> */
     59     MD_BLOCK_QUOTE,
     60 
     61     /* <ul>...</ul>
     62      * Detail: Structure MD_BLOCK_UL_DETAIL. */
     63     MD_BLOCK_UL,
     64 
     65     /* <ol>...</ol>
     66      * Detail: Structure MD_BLOCK_OL_DETAIL. */
     67     MD_BLOCK_OL,
     68 
     69     /* <li>...</li>
     70      * Detail: Structure MD_BLOCK_LI_DETAIL. */
     71     MD_BLOCK_LI,
     72 
     73     /* <hr> */
     74     MD_BLOCK_HR,
     75 
     76     /* <h1>...</h1> (for levels up to 6)
     77      * Detail: Structure MD_BLOCK_H_DETAIL. */
     78     MD_BLOCK_H,
     79 
     80     /* <pre><code>...</code></pre> (Detail: Structure MD_BLOCK_CODE_DETAIL.)
     81      * Note the text lines within code blocks are terminated with '\n'
     82      * instead of explicit MD_TEXT_BR. */
     83     MD_BLOCK_CODE,
     84 
     85     /* Raw HTML block. This itself does not correspond to any particular HTML
     86      * tag. Its contents _are_ raw HTML source intended to be put
     87      * in verbatim form to the HTML output. */
     88     MD_BLOCK_HTML,
     89 
     90     /* <p>...</p> */
     91     MD_BLOCK_P,
     92 
     93     /* <table>...</table> and its contents.
     94      * Detail: Structure MD_BLOCK_TABLE_DETAIL (for MD_BLOCK_TABLE),
     95      *         structure MD_BLOCK_TD_DETAIL (for MD_BLOCK_TH and MD_BLOCK_TD)
     96      * Note all of these are used only if extension MD_FLAG_TABLES is enabled. */
     97     MD_BLOCK_TABLE,
     98     MD_BLOCK_THEAD,
     99     MD_BLOCK_TBODY,
    100     MD_BLOCK_TR,
    101     MD_BLOCK_TH,
    102     MD_BLOCK_TD
    103 } MD_BLOCKTYPE;
    104 
    105 /* A span represents an in-line piece of a document which should be rendered
    106  * with the same font, color and other attributes. A sequence of spans forms a
    107  * block like a paragraph or list item. */
    108 typedef enum MD_SPANTYPE {
    109     /* <em>...</em> */
    110     MD_SPAN_EM,
    111 
    112     /* <strong>...</strong> */
    113     MD_SPAN_STRONG,
    114 
    115     /* <a href="xxx">...</a>
    116      * Detail: Structure MD_SPAN_A_DETAIL. */
    117     MD_SPAN_A,
    118 
    119     /* <img src="xxx"> (the enclosed text is the image alt text)
    120      * Detail: Structure MD_SPAN_IMG_DETAIL.
    121      * Note: Image text can contain nested spans and even nested images.
    122      * If rendered into ALT attribute of HTML <IMG> tag, it is the
    123      * responsibility of the parser to deal with it.
    124      */
    125     MD_SPAN_IMG,
    126 
    127     /* <code>...</code> */
    128     MD_SPAN_CODE,
    129 
    130     /* <del>...</del>
    131      * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
    132      */
    133     MD_SPAN_DEL,
    134 
    135     /* For recognizing inline ($) and display ($$) equations
    136      * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
    137      */
    138     MD_SPAN_LATEXMATH,
    139     MD_SPAN_LATEXMATH_DISPLAY,
    140 
    141     /* Wiki links (Detail: Structure MD_SPAN_WIKILINK_DETAIL.)
    142      * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
    143      */
    144     MD_SPAN_WIKILINK,
    145 
    146     /* <u>...</u> (applies to MD_SPAN_U only)
    147      * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
    148     MD_SPAN_U,
    149     MD_SPAN_FNT,    /* NOTE(review): MD_SPAN_FNT .. MD_SPAN_ANCHOR are undocumented in this header. */
    150     MD_SPAN_INV,
    151     MD_SPAN_COC,
    152     MD_SPAN_BLI,
    153     MD_SPAN_ANCHOR,
    154     /* MD_SPAN_COLOR allows supporting RGB colors, written as a link:
    155      *     [text with colors](#1#13)
    156      * md4c treats colors as MD_SPAN_A and the parsing of the color
    157      * is done by the user. NOTE(review): the original wording said "This
    158      * span type is issued by md4c"; confirm whether "not issued" was meant.
    159      */
    160     MD_SPAN_COLOR,  /* NOTE(review): trailing comma (unlike the other enums here) is rejected by strict C89/C++03. */
    161 } MD_SPANTYPE;
    162 
    163 /* Text is the actual textual contents of a span. */
    164 typedef enum MD_TEXTTYPE {
    165     /* Normal text. */
    166     MD_TEXT_NORMAL = 0,
    167 
    168     /* NULL character. CommonMark requires replacing NULL character with
    169      * the replacement char U+FFFD, so this allows caller to do that easily. */
    170     MD_TEXT_NULLCHAR,
    171 
    172     /* Line breaks.
    173      * Note these are not sent from blocks with verbatim output (MD_BLOCK_CODE
    174      * or MD_BLOCK_HTML). In such cases, '\n' is part of the text itself. */
    175     MD_TEXT_BR,         /* <br> (hard break) */
    176     MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */
    177 
    178     /* Entity.
    179      * (a) Named entity, e.g. &nbsp;
    180      *     (Note MD4C does not have a list of known entities.
    181      *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
    182      *     treated as a named entity.)
    183      * (b) Numerical entity, e.g. &#1234;
    184      * (c) Hexadecimal entity, e.g. &#x12AB;
    185      *
    186      * As MD4C is mostly encoding agnostic, the application gets the verbatim
    187      * entity text passed into the MD_PARSER::text() callback. */
    188     MD_TEXT_ENTITY,
    189 
    190     /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
    191      * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
    192      * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this
    193      * kind of text. */
    194     MD_TEXT_CODE,
    195 
    196     /* Text is raw HTML. If it is the contents of a raw HTML block (i.e. not
    197      * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used.
    198      * The text contains verbatim '\n' for the new lines. */
    199     MD_TEXT_HTML,
    200 
    201     /* Text is inside an equation. This is processed the same way as inlined code
    202      * spans (`code`). */
    203     MD_TEXT_LATEXMATH
    204 } MD_TEXTTYPE;
    205 
    206 
    207 /* Alignment enumeration (type of MD_BLOCK_TD_DETAIL::align). */
    208 typedef enum MD_ALIGN {
    209     MD_ALIGN_DEFAULT = 0,   /* When unspecified. */
    210     MD_ALIGN_LEFT,
    211     MD_ALIGN_CENTER,
    212     MD_ALIGN_RIGHT
    213 } MD_ALIGN;
    214 
    215 
    216 /* String attribute.
    217  *
    218  * This wraps strings which are outside of a normal text flow and which are
    219  * propagated within various detailed structures, but which still may contain
    220  * string portions of different types like e.g. entities.
    221  *
    222  * So, for example, let's consider this image:
    223  *
    224  *     ![image alt text](http://example.org/image.png 'foo &quot; bar')
    225  *
    226  * The image alt text is propagated as a normal text via the MD_PARSER::text()
    227  * callback. However, the image title ('foo &quot; bar') is propagated as
    228  * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
    229  *
    230  * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
    231  *  -- [0]: "foo "   (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
    232  *  -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4)
    233  *  -- [2]: " bar"   (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
    234  *  -- [3]: (n/a)    (n/a                              ; substr_offsets[3] == 14)
    235  *
    236  * Note that these invariants are always guaranteed:
    237  *  -- substr_offsets[0] == 0
    238  *  -- substr_offsets[LAST+1] == size
    239  *  -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
    240  *     substrings can appear. This could change only if the specification
    241  *     changes.
    242  */
    243 typedef struct MD_ATTRIBUTE {
    244     const MD_CHAR* text;                /* The attribute string. */
    245     MD_SIZE size;                       /* Size of the string; equals substr_offsets[LAST+1]. */
    246     const MD_TEXTTYPE* substr_types;    /* Type of each substring (e.g. MD_TEXT_NORMAL, MD_TEXT_ENTITY). */
    247     const MD_OFFSET* substr_offsets;    /* Start offset of each substring, plus one final entry equal to size. */
    248 } MD_ATTRIBUTE;
    249 
    250 
    251 /* Detailed info for MD_BLOCK_UL (<ul> list). */
    252 typedef struct MD_BLOCK_UL_DETAIL {
    253     int is_tight;           /* Non-zero if tight list, zero if loose. */
    254     MD_CHAR mark;           /* Item bullet character in Markdown source of the list, e.g. '-', '+', '*'. */
    255 } MD_BLOCK_UL_DETAIL;
    256 
    257 /* Detailed info for MD_BLOCK_OL (<ol> list). */
    258 typedef struct MD_BLOCK_OL_DETAIL {
    259     unsigned start;         /* Start index of the ordered list. */
    260     int is_tight;           /* Non-zero if tight list, zero if loose. */
    261     MD_CHAR mark_delimiter; /* Character delimiting the item marks in Markdown source, e.g. '.' or ')' */
    262 } MD_BLOCK_OL_DETAIL;
    263 
    264 /* Detailed info for MD_BLOCK_LI (<li> list item). */
    265 typedef struct MD_BLOCK_LI_DETAIL {
    266     int is_task;            /* Non-zero for a task list item. Can be non-zero only with MD_FLAG_TASKLISTS. */
    267     MD_CHAR task_mark;      /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */
    268     MD_OFFSET task_mark_offset;  /* If is_task, then offset in the input of the char between '[' and ']'. */
    269 } MD_BLOCK_LI_DETAIL;
    270 
    271 /* Detailed info for MD_BLOCK_H (heading block). */
    272 typedef struct MD_BLOCK_H_DETAIL {
    273     unsigned level;         /* Header level (1 - 6), as in <h1> ... <h6>. */
    274 } MD_BLOCK_H_DETAIL;
    275 
    276 /* Detailed info for MD_BLOCK_CODE. */
    277 typedef struct MD_BLOCK_CODE_DETAIL {
    278     MD_ATTRIBUTE info;      /* NOTE(review): presumably the info string of a fenced code block -- confirm. */
    279     MD_ATTRIBUTE lang;      /* NOTE(review): presumably the language named in the info string -- confirm. */
    280     MD_CHAR fence_char;     /* The character used for fenced code block; or zero for indented code block. */
    281 } MD_BLOCK_CODE_DETAIL;
    282 
    283 /* Detailed info for MD_BLOCK_TABLE (used only if MD_FLAG_TABLES is enabled). */
    284 typedef struct MD_BLOCK_TABLE_DETAIL {
    285     unsigned col_count;         /* Count of columns in the table. */
    286     unsigned head_row_count;    /* Count of rows in the table header (currently always 1) */
    287     unsigned body_row_count;    /* Count of rows in the table body */
    288 } MD_BLOCK_TABLE_DETAIL;
    289 
    290 /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */
    291 typedef struct MD_BLOCK_TD_DETAIL {
    292     MD_ALIGN align;         /* Alignment; MD_ALIGN_DEFAULT when unspecified. */
    293 } MD_BLOCK_TD_DETAIL;
    294 
    295 /* Detailed info for MD_SPAN_A. */
    296 typedef struct MD_SPAN_A_DETAIL {
    297     MD_ATTRIBUTE href;      /* Link destination (the href="xxx" of <a>). */
    298     MD_ATTRIBUTE title;     /* Link title (presumably analogous to MD_SPAN_IMG_DETAIL::title -- confirm). */
    299 } MD_SPAN_A_DETAIL;
    300 
    301 /* Detailed info for MD_SPAN_IMG. */
    302 typedef struct MD_SPAN_IMG_DETAIL {
    303     MD_ATTRIBUTE src;       /* Image source (the src="xxx" of <img>). */
    304     MD_ATTRIBUTE title;     /* Image title, e.g. 'foo &quot; bar' in ![alt](url 'foo &quot; bar'). */
    305 } MD_SPAN_IMG_DETAIL;
    306 
    307 /* Detailed info for MD_SPAN_WIKILINK. NOTE(review): the struct tag lacks the _DETAIL suffix, unlike the other detail structs. */
    308 typedef struct MD_SPAN_WIKILINK {
    309     MD_ATTRIBUTE target;    /* Target of the wiki link. */
    310 } MD_SPAN_WIKILINK_DETAIL;
    311 
    312 /* Flags specifying extensions/deviations from the CommonMark specification.
    313  * Each flag is a distinct bit to be OR-ed into MD_PARSER::flags (0x0080 is unused).
    314  * By default (when MD_PARSER::flags == 0), we follow the CommonMark specification.
    315  * The following flags may allow some extensions or deviations from it.
    316  */
    317 #define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
    318 #define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
    319 #define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
    320 #define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
    321 #define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
    322 #define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
    323 #define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */
    324 #define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
    325 #define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
    326 #define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
    327 #define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
    328 #define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
    329 #define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
    330 #define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */
    331 
    332 #define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)  /* All three permissive autolink flags. */
    333 #define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)  /* Disable raw HTML blocks and spans. */
    334 
    335 /* Convenient sets of flags corresponding to well-known Markdown dialects.
    336  *
    337  * Note we may only support a subset of the features of the referred dialect.
    338  * The constant just enables those extensions which bring us as close as
    339  * possible given what features we implement.
    340  *
    341  * ABI compatibility note: Meaning of these can change in time as new
    342  * extensions, bringing the dialect closer to the original, are implemented.
    343  */
    344 #define MD_DIALECT_COMMONMARK               0
    345 #define MD_DIALECT_GITHUB                   (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)
    346 
    347 /* Parser structure: dialect flags plus the callbacks invoked by md_parse().
    348  */
    349 typedef struct MD_PARSER {
    350     /* Reserved. Set to zero.
    351      */
    352     unsigned abi_version;
    353 
    354     /* Dialect options. Bitmask of MD_FLAG_xxxx values.
    355      */
    356     unsigned flags;
    357 
    358     /* Caller-provided rendering callbacks.
    359      *
    360      * For some block/span types, more detailed information is provided in a
    361      * type-specific structure pointed to by the argument 'detail'.
    362      *
    363      * The last argument of all callbacks, 'userdata', is just propagated from
    364      * md_parse() and is available for any use by the application.
    365      *
    366      * Note any strings provided to the callbacks as their arguments or as
    367      * members of any detail structure are generally not zero-terminated.
    368      * The application has to take the respective size information into account.
    369      *
    370      * Any rendering callback may abort further parsing of the document by
    371      * returning non-zero.
    372      */
    373     int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    374     int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    375 
    376     int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    377     int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    378 
    379     int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/);
    380 
    381     /* Debug callback. Optional (may be NULL).
    382      *
    383      * If provided and something goes wrong, this function gets called.
    384      * This is intended for debugging and problem diagnosis for developers;
    385      * it is not intended to provide any errors suitable for displaying to an
    386      * end user.
    387      */
    388     void (*debug_log)(const char* /*msg*/, void* /*userdata*/);
    389 
    390     /* Reserved. Set to NULL.
    391      */
    392     void (*syntax)(void);
    393 } MD_PARSER;
    394 
    395 
    396 /* Alias of MD_PARSER kept for backward compatibility. Do not use in new code.
    397  */
    398 typedef MD_PARSER MD_RENDERER;
    399 
    400 
    401 /* Parse the Markdown document stored in the string 'text' of size 'size'.
    402  * The 'parser' structure provides callbacks to be called during the parsing
    403  * so the caller can render the document on the screen or convert the
    404  * Markdown to another format; 'userdata' is propagated to every callback.
    405  *
    406  * Zero is returned on success. If a runtime error occurs (e.g. a memory
    407  * allocation fails), -1 is returned. If the processing is aborted due to any
    408  * callback returning non-zero, the return value of the callback is returned.
    409  */
    410 int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);
    411 
    412 
    413 #ifdef __cplusplus
    414     }  /* extern "C" { */
    415 #endif
    416 
    417 #endif  /* MD4C_H */