md4c

C Markdown parser. Fast. SAX-like interface. Compliant with the CommonMark specification.
git clone https://noulin.net/git/md4c.git
Log | Files | Refs | README | LICENSE

normalize.py (6506B)


      1 # -*- coding: utf-8 -*-
      2 from html.parser import HTMLParser
      3 import urllib
      4 
      5 try:
      6     from html.parser import HTMLParseError
      7 except ImportError:
      8     # HTMLParseError was removed in Python 3.5. It could never be
      9     # thrown, so we define a placeholder instead.
     10     class HTMLParseError(Exception):
     11         pass
     12 
import html
import re
import sys
import urllib.parse
from html.entities import name2codepoint

try:
    import cgi
except ImportError:
    # The cgi module was removed from the standard library in Python 3.13
    # (PEP 594); tolerate its absence so this module can still be imported.
    cgi = None
     17 
     18 # Normalization code, adapted from
     19 # https://github.com/karlcow/markdown-testsuite/
     20 significant_attrs = ["alt", "href", "src", "title"]
     21 whitespace_re = re.compile('\s+')
     22 class MyHTMLParser(HTMLParser):
     23     def __init__(self):
     24         HTMLParser.__init__(self)
     25         self.convert_charrefs = False
     26         self.last = "starttag"
     27         self.in_pre = False
     28         self.output = ""
     29         self.last_tag = ""
     30     def handle_data(self, data):
     31         after_tag = self.last == "endtag" or self.last == "starttag"
     32         after_block_tag = after_tag and self.is_block_tag(self.last_tag)
     33         if after_tag and self.last_tag == "br":
     34             data = data.lstrip('\n')
     35         if not self.in_pre:
     36             data = whitespace_re.sub(' ', data)
     37         if after_block_tag and not self.in_pre:
     38             if self.last == "starttag":
     39                 data = data.lstrip()
     40             elif self.last == "endtag":
     41                 data = data.strip()
     42         self.output += data
     43         self.last = "data"
     44     def handle_endtag(self, tag):
     45         if tag == "pre":
     46             self.in_pre = False
     47         elif self.is_block_tag(tag):
     48             self.output = self.output.rstrip()
     49         self.output += "</" + tag + ">"
     50         self.last_tag = tag
     51         self.last = "endtag"
     52     def handle_starttag(self, tag, attrs):
     53         if tag == "pre":
     54             self.in_pre = True
     55         if self.is_block_tag(tag):
     56             self.output = self.output.rstrip()
     57         self.output += "<" + tag
     58         # For now we don't strip out 'extra' attributes, because of
     59         # raw HTML test cases.
     60         # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
     61         if attrs:
     62             attrs.sort()
     63             for (k,v) in attrs:
     64                 self.output += " " + k
     65                 if v in ['href','src']:
     66                     self.output += ("=" + '"' +
     67                             urllib.quote(urllib.unquote(v), safe='/') + '"')
     68                 elif v != None:
     69                     self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"')
     70         self.output += ">"
     71         self.last_tag = tag
     72         self.last = "starttag"
     73     def handle_startendtag(self, tag, attrs):
     74         """Ignore closing tag for self-closing """
     75         self.handle_starttag(tag, attrs)
     76         self.last_tag = tag
     77         self.last = "endtag"
     78     def handle_comment(self, data):
     79         self.output += '<!--' + data + '-->'
     80         self.last = "comment"
     81     def handle_decl(self, data):
     82         self.output += '<!' + data + '>'
     83         self.last = "decl"
     84     def unknown_decl(self, data):
     85         self.output += '<!' + data + '>'
     86         self.last = "decl"
     87     def handle_pi(self,data):
     88         self.output += '<?' + data + '>'
     89         self.last = "pi"
     90     def handle_entityref(self, name):
     91         try:
     92             c = chr(name2codepoint[name])
     93         except KeyError:
     94             c = None
     95         self.output_char(c, '&' + name + ';')
     96         self.last = "ref"
     97     def handle_charref(self, name):
     98         try:
     99             if name.startswith("x"):
    100                 c = chr(int(name[1:], 16))
    101             else:
    102                 c = chr(int(name))
    103         except ValueError:
    104                 c = None
    105         self.output_char(c, '&' + name + ';')
    106         self.last = "ref"
    107     # Helpers.
    108     def output_char(self, c, fallback):
    109         if c == '<':
    110             self.output += "&lt;"
    111         elif c == '>':
    112             self.output += "&gt;"
    113         elif c == '&':
    114             self.output += "&amp;"
    115         elif c == '"':
    116             self.output += "&quot;"
    117         elif c == None:
    118             self.output += fallback
    119         else:
    120             self.output += c
    121 
    122     def is_block_tag(self,tag):
    123         return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
    124             'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
    125             'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
    126             'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
    127             'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
    128             'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
    129             'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
    130 
    131 def normalize_html(html):
    132     r"""
    133     Return normalized form of HTML which ignores insignificant output
    134     differences:
    135 
    136     Multiple inner whitespaces are collapsed to a single space (except
    137     in pre tags):
    138 
    139         >>> normalize_html("<p>a  \t b</p>")
    140         '<p>a b</p>'
    141 
    142         >>> normalize_html("<p>a  \t\nb</p>")
    143         '<p>a b</p>'
    144 
    145     * Whitespace surrounding block-level tags is removed.
    146 
    147         >>> normalize_html("<p>a  b</p>")
    148         '<p>a b</p>'
    149 
    150         >>> normalize_html(" <p>a  b</p>")
    151         '<p>a b</p>'
    152 
    153         >>> normalize_html("<p>a  b</p> ")
    154         '<p>a b</p>'
    155 
    156         >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
    157         '<p>a b</p>'
    158 
    159         >>> normalize_html("<i>a  b</i> ")
    160         '<i>a b</i> '
    161 
    162     * Self-closing tags are converted to open tags.
    163 
    164         >>> normalize_html("<br />")
    165         '<br>'
    166 
    167     * Attributes are sorted and lowercased.
    168 
    169         >>> normalize_html('<a title="bar" HREF="foo">x</a>')
    170         '<a href="foo" title="bar">x</a>'
    171 
    172     * References are converted to unicode, except that '<', '>', '&', and
    173       '"' are rendered using entities.
    174 
    175         >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
    176         '\u2200&amp;&gt;&lt;&quot;'
    177 
    178     """
    179     html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
    180     try:
    181         parser = MyHTMLParser()
    182         # We work around HTMLParser's limitations parsing CDATA
    183         # by breaking the input into chunks and passing CDATA chunks
    184         # through verbatim.
    185         for chunk in re.finditer(html_chunk_re, html):
    186             if chunk.group(0)[:8] == "<![CDATA":
    187                 parser.output += chunk.group(0)
    188             else:
    189                 parser.feed(chunk.group(0))
    190         parser.close()
    191         return parser.output
    192     except HTMLParseError as e:
    193         sys.stderr.write("Normalization error: " + e.msg + "\n")
    194         return html  # on error, return unnormalized HTML