md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

render_html.c (16141B)


      1 /*
      2  * MD4C: Markdown parser for C
      3  * (http://github.com/mity/md4c)
      4  *
      5  * Copyright (c) 2016-2017 Martin Mitas
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the "Software"),
      9  * to deal in the Software without restriction, including without limitation
     10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     11  * and/or sell copies of the Software, and to permit persons to whom the
     12  * Software is furnished to do so, subject to the following conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be included in
     15  * all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     23  * IN THE SOFTWARE.
     24  */
     25 
     26 #include <stdio.h>
     27 #include <string.h>
     28 
     29 #include "render_html.h"
     30 #include "entity.h"
     31 
     32 
     33 #ifdef _MSC_VER
     34     /* MSVC does not understand "inline" when building as pure C (not C++).
     35      * However it understands "__inline" */
     36     #ifndef __cplusplus
     37         #define inline __inline
     38     #endif
     39 #endif
     40 
     41 #ifdef _WIN32
     42     #define snprintf _snprintf
     43 #endif
     44 
     45 
     46 
     47 typedef struct MD_RENDER_HTML_tag MD_RENDER_HTML;
     48 struct MD_RENDER_HTML_tag {
     49     void (*process_output)(const MD_CHAR*, MD_SIZE, void*);
     50     void* userdata;
     51     unsigned flags;
     52     int image_nesting_level;
     53 };
     54 
     55 
     56 /*****************************************
     57  ***  HTML rendering helper functions  ***
     58  *****************************************/
     59 
     60 #define ISDIGIT(ch)     ('0' <= (ch) && (ch) <= '9')
     61 #define ISLOWER(ch)     ('a' <= (ch) && (ch) <= 'z')
     62 #define ISUPPER(ch)     ('A' <= (ch) && (ch) <= 'Z')
     63 #define ISALNUM(ch)     (ISLOWER(ch) || ISUPPER(ch) || ISDIGIT(ch))
     64 
     65 
     66 static inline void
     67 render_text(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size)
     68 {
     69     r->process_output(text, size, r->userdata);
     70 }
     71 
     72 #define RENDER_LITERAL(r, literal)    render_text((r), (literal), strlen(literal))
     73 
     74 
     75 static void
     76 render_html_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size)
     77 {
     78     MD_OFFSET beg = 0;
     79     MD_OFFSET off = 0;
     80 
     81     /* Some characters need to be escaped in normal HTML text. */
     82     #define HTML_NEED_ESCAPE(ch)                                            \
     83             ((ch) == '&' || (ch) == '<' || (ch) == '>' || (ch) == '"')
     84 
     85     while(1) {
     86         while(off < size  &&  !HTML_NEED_ESCAPE(data[off]))
     87             off++;
     88         if(off > beg)
     89             render_text(r, data + beg, off - beg);
     90 
     91         if(off < size) {
     92             switch(data[off]) {
     93                 case '&':   RENDER_LITERAL(r, "&amp;"); break;
     94                 case '<':   RENDER_LITERAL(r, "&lt;"); break;
     95                 case '>':   RENDER_LITERAL(r, "&gt;"); break;
     96                 case '"':   RENDER_LITERAL(r, "&quot;"); break;
     97             }
     98             off++;
     99         } else {
    100             break;
    101         }
    102         beg = off;
    103     }
    104 }
    105 
    106 static void
    107 render_url_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size)
    108 {
    109     static const MD_CHAR hex_chars[] = "0123456789ABCDEF";
    110     MD_OFFSET beg = 0;
    111     MD_OFFSET off = 0;
    112 
    113     #define URL_NEED_ESCAPE(ch)                                             \
    114             (!ISALNUM(ch)  &&  strchr("-_.+!*'(),%#@?=;:/,+$", ch) == NULL)
    115 
    116     while(1) {
    117         while(off < size  &&  !URL_NEED_ESCAPE(data[off]))
    118             off++;
    119         if(off > beg)
    120             render_text(r, data + beg, off - beg);
    121 
    122         if(off < size) {
    123             char hex[3];
    124 
    125             switch(data[off]) {
    126                 case '&':   RENDER_LITERAL(r, "&amp;"); break;
    127                 case '\'':  RENDER_LITERAL(r, "&#x27;"); break;
    128                 default:
    129                     hex[0] = '%';
    130                     hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf];
    131                     hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf];
    132                     render_text(r, hex, 3);
    133                     break;
    134             }
    135             off++;
    136         } else {
    137             break;
    138         }
    139 
    140         beg = off;
    141     }
    142 }
    143 
    144 static unsigned
    145 hex_val(char ch)
    146 {
    147     if('0' <= ch && ch <= '9')
    148         return ch - '0';
    149     if('A' <= ch && ch <= 'Z')
    150         return ch - 'A' + 10;
    151     else
    152         return ch - 'a' + 10;
    153 }
    154 
    155 static void
    156 render_utf8_codepoint(MD_RENDER_HTML* r, unsigned codepoint,
    157                       void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
    158 {
    159     static const MD_CHAR utf8_replacement_char[] = { 0xef, 0xbf, 0xbd };
    160 
    161     unsigned char utf8[4];
    162     size_t n;
    163 
    164     if(codepoint <= 0x7f) {
    165         n = 1;
    166         utf8[0] = codepoint;
    167     } else if(codepoint <= 0x7ff) {
    168         n = 2;
    169         utf8[0] = 0xc0 | ((codepoint >>  6) & 0x1f);
    170         utf8[1] = 0x80 + ((codepoint >>  0) & 0x3f);
    171     } else if(codepoint <= 0xffff) {
    172         n = 3;
    173         utf8[0] = 0xe0 | ((codepoint >> 12) & 0xf);
    174         utf8[1] = 0x80 + ((codepoint >>  6) & 0x3f);
    175         utf8[2] = 0x80 + ((codepoint >>  0) & 0x3f);
    176     } else {
    177         n = 4;
    178         utf8[0] = 0xf0 | ((codepoint >> 18) & 0x7);
    179         utf8[1] = 0x80 + ((codepoint >> 12) & 0x3f);
    180         utf8[2] = 0x80 + ((codepoint >>  6) & 0x3f);
    181         utf8[3] = 0x80 + ((codepoint >>  0) & 0x3f);
    182     }
    183 
    184     if(0 < codepoint  &&  codepoint <= 0x10ffff)
    185         fn_append(r, (char*)utf8, n);
    186     else
    187         fn_append(r, utf8_replacement_char, 3);
    188 }
    189 
    190 /* Translate entity to its UTF-8 equivalent, or output the verbatim one
    191  * if such entity is unknown (or if the translation is disabled). */
    192 static void
    193 render_entity(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size,
    194               void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
    195 {
    196     if(r->flags & MD_RENDER_FLAG_VERBATIM_ENTITIES) {
    197         fn_append(r, text, size);
    198         return;
    199     }
    200 
    201     /* We assume UTF-8 output is what is desired. */
    202     if(size > 3 && text[1] == '#') {
    203         unsigned codepoint = 0;
    204 
    205         if(text[2] == 'x' || text[2] == 'X') {
    206             /* Hexadecimal entity (e.g. "&#x1234abcd;")). */
    207             MD_SIZE i;
    208             for(i = 3; i < size-1; i++)
    209                 codepoint = 16 * codepoint + hex_val(text[i]);
    210         } else {
    211             /* Decimal entity (e.g. "&1234;") */
    212             MD_SIZE i;
    213             for(i = 2; i < size-1; i++)
    214                 codepoint = 10 * codepoint + (text[i] - '0');
    215         }
    216 
    217         render_utf8_codepoint(r, codepoint, fn_append);
    218         return;
    219     } else {
    220         /* Named entity (e.g. "&nbsp;"). */
    221         const struct entity* ent;
    222 
    223         ent = entity_lookup(text, size);
    224         if(ent != NULL) {
    225             render_utf8_codepoint(r, ent->codepoints[0], fn_append);
    226             if(ent->codepoints[1])
    227                 render_utf8_codepoint(r, ent->codepoints[1], fn_append);
    228             return;
    229         }
    230     }
    231 
    232     fn_append(r, text, size);
    233 }
    234 
    235 static void
    236 render_attribute(MD_RENDER_HTML* r, const MD_ATTRIBUTE* attr,
    237                  void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
    238 {
    239     int i;
    240 
    241     for(i = 0; attr->substr_offsets[i] < attr->size; i++) {
    242         MD_TEXTTYPE type = attr->substr_types[i];
    243         MD_OFFSET off = attr->substr_offsets[i];
    244         MD_SIZE size = attr->substr_offsets[i+1] - off;
    245         const MD_CHAR* text = attr->text + off;
    246 
    247         switch(type) {
    248             case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_text); break;
    249             case MD_TEXT_ENTITY:    render_entity(r, text, size, fn_append); break;
    250             default:                fn_append(r, text, size); break;
    251         }
    252     }
    253 }
    254 
    255 
    256 static void
    257 render_open_ol_block(MD_RENDER_HTML* r, const MD_BLOCK_OL_DETAIL* det)
    258 {
    259     char buf[64];
    260 
    261     if(det->start == 1) {
    262         RENDER_LITERAL(r, "<ol>\n");
    263         return;
    264     }
    265 
    266     snprintf(buf, sizeof(buf), "<ol start=\"%u\">\n", det->start);
    267     RENDER_LITERAL(r, buf);
    268 }
    269 
    270 static void
    271 render_open_code_block(MD_RENDER_HTML* r, const MD_BLOCK_CODE_DETAIL* det)
    272 {
    273     RENDER_LITERAL(r, "<pre><code");
    274 
    275     /* If known, output the HTML 5 attribute class="language-LANGNAME". */
    276     if(det->lang.text != NULL) {
    277         RENDER_LITERAL(r, " class=\"language-");
    278         render_attribute(r, &det->lang, render_html_escaped);
    279         RENDER_LITERAL(r, "\"");
    280     }
    281 
    282     RENDER_LITERAL(r, ">");
    283 }
    284 
    285 static void
    286 render_open_td_block(MD_RENDER_HTML* r, const MD_CHAR* cell_type, const MD_BLOCK_TD_DETAIL* det)
    287 {
    288     RENDER_LITERAL(r, "<");
    289     RENDER_LITERAL(r, cell_type);
    290 
    291     switch(det->align) {
    292         case MD_ALIGN_LEFT:     RENDER_LITERAL(r, " align=\"left\">"); break;
    293         case MD_ALIGN_CENTER:   RENDER_LITERAL(r, " align=\"center\">"); break;
    294         case MD_ALIGN_RIGHT:    RENDER_LITERAL(r, " align=\"right\">"); break;
    295         default:                RENDER_LITERAL(r, ">"); break;
    296     }
    297 }
    298 
    299 static void
    300 render_open_a_span(MD_RENDER_HTML* r, const MD_SPAN_A_DETAIL* det)
    301 {
    302     RENDER_LITERAL(r, "<a href=\"");
    303     render_attribute(r, &det->href, render_url_escaped);
    304 
    305     if(det->title.text != NULL) {
    306         RENDER_LITERAL(r, "\" title=\"");
    307         render_attribute(r, &det->title, render_html_escaped);
    308     }
    309 
    310     RENDER_LITERAL(r, "\">");
    311 }
    312 
    313 static void
    314 render_open_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
    315 {
    316     RENDER_LITERAL(r, "<img src=\"");
    317     render_attribute(r, &det->src, render_url_escaped);
    318 
    319     RENDER_LITERAL(r, "\" alt=\"");
    320 
    321     r->image_nesting_level++;
    322 }
    323 
    324 static void
    325 render_close_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
    326 {
    327     if(det->title.text != NULL) {
    328         RENDER_LITERAL(r, "\" title=\"");
    329         render_attribute(r, &det->title, render_html_escaped);
    330     }
    331 
    332     RENDER_LITERAL(r, "\">");
    333 
    334     r->image_nesting_level--;
    335 }
    336 
    337 
    338 /**************************************
    339  ***  HTML renderer implementation  ***
    340  **************************************/
    341 
    342 static int
    343 enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
    344 {
    345     static const MD_CHAR* head[6] = { "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>" };
    346     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    347 
    348     switch(type) {
    349         case MD_BLOCK_DOC:      /* noop */ break;
    350         case MD_BLOCK_QUOTE:    RENDER_LITERAL(r, "<blockquote>\n"); break;
    351         case MD_BLOCK_UL:       RENDER_LITERAL(r, "<ul>\n"); break;
    352         case MD_BLOCK_OL:       render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break;
    353         case MD_BLOCK_LI:       RENDER_LITERAL(r, "<li>"); break;
    354         case MD_BLOCK_HR:       RENDER_LITERAL(r, "<hr>\n"); break;
    355         case MD_BLOCK_H:        RENDER_LITERAL(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
    356         case MD_BLOCK_CODE:     render_open_code_block(r, (const MD_BLOCK_CODE_DETAIL*) detail); break;
    357         case MD_BLOCK_HTML:     /* noop */ break;
    358         case MD_BLOCK_P:        RENDER_LITERAL(r, "<p>"); break;
    359         case MD_BLOCK_TABLE:    RENDER_LITERAL(r, "<table>\n"); break;
    360         case MD_BLOCK_THEAD:    RENDER_LITERAL(r, "<thead>\n"); break;
    361         case MD_BLOCK_TBODY:    RENDER_LITERAL(r, "<tbody>\n"); break;
    362         case MD_BLOCK_TR:       RENDER_LITERAL(r, "<tr>\n"); break;
    363         case MD_BLOCK_TH:       render_open_td_block(r, "th", (MD_BLOCK_TD_DETAIL*)detail); break;
    364         case MD_BLOCK_TD:       render_open_td_block(r, "td", (MD_BLOCK_TD_DETAIL*)detail); break;
    365     }
    366 
    367     return 0;
    368 }
    369 
    370 static int
    371 leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
    372 {
    373     static const MD_CHAR* head[6] = { "</h1>\n", "</h2>\n", "</h3>\n", "</h4>\n", "</h5>\n", "</h6>\n" };
    374     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    375 
    376     switch(type) {
    377         case MD_BLOCK_DOC:      /*noop*/ break;
    378         case MD_BLOCK_QUOTE:    RENDER_LITERAL(r, "</blockquote>\n"); break;
    379         case MD_BLOCK_UL:       RENDER_LITERAL(r, "</ul>\n"); break;
    380         case MD_BLOCK_OL:       RENDER_LITERAL(r, "</ol>\n"); break;
    381         case MD_BLOCK_LI:       RENDER_LITERAL(r, "</li>\n"); break;
    382         case MD_BLOCK_HR:       /*noop*/ break;
    383         case MD_BLOCK_H:        RENDER_LITERAL(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
    384         case MD_BLOCK_CODE:     RENDER_LITERAL(r, "</code></pre>\n"); break;
    385         case MD_BLOCK_HTML:     /* noop */ break;
    386         case MD_BLOCK_P:        RENDER_LITERAL(r, "</p>\n"); break;
    387         case MD_BLOCK_TABLE:    RENDER_LITERAL(r, "</table>\n"); break;
    388         case MD_BLOCK_THEAD:    RENDER_LITERAL(r, "</thead>\n"); break;
    389         case MD_BLOCK_TBODY:    RENDER_LITERAL(r, "</tbody>\n"); break;
    390         case MD_BLOCK_TR:       RENDER_LITERAL(r, "</tr>\n"); break;
    391         case MD_BLOCK_TH:       RENDER_LITERAL(r, "</th>\n"); break;
    392         case MD_BLOCK_TD:       RENDER_LITERAL(r, "</td>\n"); break;
    393     }
    394 
    395     return 0;
    396 }
    397 
    398 static int
    399 enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
    400 {
    401     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    402 
    403     if(r->image_nesting_level > 0) {
    404         /* We are inside an image, i.e. rendering the ALT attribute of
    405          * <IMG> tag. */
    406         return 0;
    407     }
    408 
    409     switch(type) {
    410         case MD_SPAN_EM:        RENDER_LITERAL(r, "<em>"); break;
    411         case MD_SPAN_STRONG:    RENDER_LITERAL(r, "<strong>"); break;
    412         case MD_SPAN_A:         render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
    413         case MD_SPAN_IMG:       render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
    414         case MD_SPAN_CODE:      RENDER_LITERAL(r, "<code>"); break;
    415         case MD_SPAN_DEL:       RENDER_LITERAL(r, "<del>"); break;
    416     }
    417 
    418     return 0;
    419 }
    420 
    421 static int
    422 leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
    423 {
    424     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    425 
    426     if(r->image_nesting_level > 0) {
    427         /* We are inside an image, i.e. rendering the ALT attribute of
    428          * <IMG> tag. */
    429         if(r->image_nesting_level == 1  &&  type == MD_SPAN_IMG)
    430             render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail);
    431         return 0;
    432     }
    433 
    434     switch(type) {
    435         case MD_SPAN_EM:        RENDER_LITERAL(r, "</em>"); break;
    436         case MD_SPAN_STRONG:    RENDER_LITERAL(r, "</strong>"); break;
    437         case MD_SPAN_A:         RENDER_LITERAL(r, "</a>"); break;
    438         case MD_SPAN_IMG:       /*noop, handled above*/ break;
    439         case MD_SPAN_CODE:      RENDER_LITERAL(r, "</code>"); break;
    440         case MD_SPAN_DEL:       RENDER_LITERAL(r, "</del>"); break;
    441     }
    442 
    443     return 0;
    444 }
    445 
    446 static int
    447 text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdata)
    448 {
    449     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    450 
    451     switch(type) {
    452         case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_text); break;
    453         case MD_TEXT_BR:        RENDER_LITERAL(r, (r->image_nesting_level == 0 ? "<br>\n" : " ")); break;
    454         case MD_TEXT_SOFTBR:    RENDER_LITERAL(r, (r->image_nesting_level == 0 ? "\n" : " ")); break;
    455         case MD_TEXT_HTML:      render_text(r, text, size); break;
    456         case MD_TEXT_ENTITY:    render_entity(r, text, size, render_html_escaped); break;
    457         default:                render_html_escaped(r, text, size); break;
    458     }
    459 
    460     return 0;
    461 }
    462 
    463 static void
    464 debug_log_callback(const char* msg, void* userdata)
    465 {
    466     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
    467     if(r->flags & MD_RENDER_FLAG_DEBUG)
    468         fprintf(stderr, "MD4C: %s\n", msg);
    469 }
    470 
    471 int
    472 md_render_html(const MD_CHAR* input, MD_SIZE input_size,
    473                void (*process_output)(const MD_CHAR*, MD_SIZE, void*),
    474                void* userdata, unsigned parser_flags, unsigned renderer_flags)
    475 {
    476     MD_RENDER_HTML render = { process_output, userdata, renderer_flags, 0 };
    477 
    478     MD_RENDERER renderer = {
    479         enter_block_callback,
    480         leave_block_callback,
    481         enter_span_callback,
    482         leave_span_callback,
    483         text_callback,
    484         debug_log_callback,
    485         parser_flags
    486     };
    487 
    488     return md_parse(input, input_size, &renderer, (void*) &render);
    489 }
    490