askbot/utils/html.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

"""Utilities for working with HTML."""
from bs4 import BeautifulSoup
import html5lib
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import re
import htmlentitydefs
from urlparse import urlparse
from django.core.urlresolvers import reverse
from askbot.conf import settings as askbot_settings

class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """Whitelist configuration on top of html5lib's ``HTMLSanitizerMixin``.

    Only class attributes are defined here; no methods are overridden.
    NOTE(review): presumably the inherited ``sanitize_token`` consults
    ``allowed_elements`` / ``allowed_attributes`` and the ``allowed_*``
    tuples below - confirm against the installed html5lib version.
    """
    # HTML tag names on the whitelist.
    # NOTE(review): 'object' and 'param' are listed (and 'data' is among the
    # attributes below) - confirm that allowing embedded objects is intended.
    acceptable_elements = ('a', 'abbr', 'acronym', 'address', 'b', 'big',
        'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col',
        'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd',
        'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
        'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead',
        'tr', 'tt', 'u', 'ul', 'var', 'object', 'param')

    # Attribute names on the whitelist; note that 'style', 'class' and 'id'
    # are not in this tuple.
    acceptable_attributes = ('abbr', 'align', 'alt', 'axis', 'border',
        'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'cite',
        'cols', 'colspan', 'data', 'datetime', 'dir', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'lang', 'longdesc', 'name', 'nohref',
        'noshade', 'nowrap', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope',
        'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
        'width')

    # The "allowed" names are bound to the very same tuples as the
    # "acceptable" ones defined above.
    allowed_elements = acceptable_elements
    allowed_attributes = acceptable_attributes
    # Empty tuples: nothing is listed for CSS properties, CSS keywords or
    # SVG properties.
    allowed_css_properties = ()
    allowed_css_keywords = ()
    allowed_svg_properties = ()

class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose token stream is filtered through ``sanitize_token``.

    Combines ``tokenizer.HTMLTokenizer`` with the whitelist settings of
    ``HTMLSanitizerMixin``.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        """Hand every argument, unchanged and in the same order, to
        ``tokenizer.HTMLTokenizer.__init__``."""
        positional = (stream, encoding, parseMeta, useChardet,
                      lowercaseElementName, lowercaseAttrName)
        tokenizer.HTMLTokenizer.__init__(self, *positional, **kwargs)

    def __iter__(self):
        """Yield ``self.sanitize_token(token)`` for each token of the base
        tokenizer, skipping results that are falsy."""
        raw_tokens = tokenizer.HTMLTokenizer.__iter__(self)
        for raw_token in raw_tokens:
            clean_token = self.sanitize_token(raw_token)
            if not clean_token:
                continue
            yield clean_token

def absolutize_urls(html):
    """turns relative urls in <img> and <a> tags to absolute,
    starting with the scheme and host of ``APP_URL`` (see ``site_url``)

    Only urls that begin with a single slash are rewritten, in the ``src``
    attribute of ``<img>`` tags and the ``href`` attribute of ``<a>`` tags.
    Rewritten ``<img>`` tags additionally receive an inline
    ``style="max-width:500px;"`` attribute, and rewritten attribute values
    are always emitted in double quotes.

    Protocol-relative urls (``//host/path``) already name their host and
    are left untouched.

    :param html: markup to process
    :returns: the markup with the matching urls made absolute
    """
    base_url = site_url('')#important to have this without the slash

    # Raw strings keep ``\g<name>`` a group reference for ``re`` instead of
    # relying on Python passing the unknown ``\g`` escape through unchanged.
    # The captured url already starts with "/", so nothing is put between it
    # and the base url.
    img_replacement = r'\g<prefix>"%s\g<url>" style="max-width:500px;"' % base_url
    link_replacement = r'\g<prefix>"%s\g<url>"' % base_url

    #temporal fix for bad regex with wysiwyg editor
    # ``/(?!/)`` accepts "/path" but rejects "//host/path": the latter is a
    # protocol-relative url and must not get the site prefix.
    rules = (
        (r'(?P<prefix><img[^<]+src=)"(?P<url>/(?!/)[^"]+)"', img_replacement),
        (r"(?P<prefix><img[^<]+src=)'(?P<url>/(?!/)[^']+)'", img_replacement),
        (r'(?P<prefix><a[^<]+href=)"(?P<url>/(?!/)[^"]+)"', link_replacement),
        (r"(?P<prefix><a[^<]+href=)'(?P<url>/(?!/)[^']+)'", link_replacement),
    )
    for pattern, replacement in rules:
        html = re.compile(pattern, re.I).sub(replacement, html)

    #temporal fix for bad regex with wysiwyg editor
    # collapses "<base_url>//" to "<base_url>/" anywhere in the markup
    return html.replace('%s//' % base_url, '%s/' % base_url)

def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags

    * ``<img>`` tags whose ``src`` is empty or starts with ``http://`` /
      ``https://`` are replaced by text built from ``src`` and ``alt``
    * ``<a>`` tags without any text are removed; those whose ``href`` is
      empty or absolute are replaced by text built from ``href`` and the
      link text
    * tags with other (e.g. relative) urls are left in place

    :param html: markup to process
    :returns: for non-blank input, the rendered contents of the parsed
        ``<body>`` as unicode; blank input is returned unchanged
    """
    # a corner case, handled the same way as in strip_tags(): for blank
    # input there may be no <body> element to render after parsing
    if html.strip() == '':
        return html

    def format_url_replacement(url, text):
        """Return "url (text)", or whichever of the two is non-empty."""
        url = url.strip()
        text = text.strip()
        url_domain = urlparse(url).netloc
        # do not repeat the text when it merely restates the url or its host
        if url and text and url_domain != text and url != text:
            return '%s (%s)' % (url, text)
        return url or text or ''

    soup = BeautifulSoup(html)
    abs_url_re = re.compile(r'^http(s)?://')

    # images go first, so that the text of an enclosing link already
    # contains the replacement text of the image
    for image in soup.find_all('img'):
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or abs_url_re.match(url):
            image.replaceWith(format_url_replacement(url, text))

    for link in soup.find_all('a'):
        url = link.get('href', '')
        text = link.text

        if text == '':#this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or abs_url_re.match(url):
            link.replaceWith(format_url_replacement(url, text))

    return unicode(soup.find('body').renderContents(), 'utf-8')

def strip_tags(html, tags=None):
    """strips tags from given html output

    Every element whose name is in ``tags`` is removed together with its
    contents.

    :param html: markup to process
    :param tags: iterable of tag names to remove; must not be ``None``
        unless ``html`` is blank
    :returns: for non-blank input, the rendered contents of the parsed
        ``<body>`` as unicode; blank input is returned unchanged
    :raises AssertionError: if ``tags`` is ``None`` for non-blank input
    """
    #a corner case
    if html.strip() == '':
        return html

    # kept as an assert so that the exception type seen by callers
    # does not change
    assert tags is not None, 'tags must be an iterable of tag names'

    soup = BeautifulSoup(html)
    for tag in tags:
        # explicit loop: the removal is a side effect and must not depend
        # on map() being evaluated eagerly
        for tag_match in soup.find_all(tag):
            tag_match.replaceWith('')
    return unicode(soup.find('body').renderContents(), 'utf-8')

def sanitize_html(html):
    """Parse ``html`` as a fragment using ``HTMLSanitizer`` as the
    tokenizer and return the re-serialized markup as a unicode string."""
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder)
    fragment = parser.parseFragment(html)

    # walk the resulting DOM fragment and serialize it back to markup
    walk = treewalkers.getTreeWalker("dom")
    token_stream = walk(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))

def site_url(url):
    """Return ``url`` prefixed with the scheme and network location of
    ``settings.APP_URL``; any path component of ``APP_URL`` is not used."""
    from askbot.conf import settings
    app_url = urlparse(settings.APP_URL)
    origin = app_url.scheme + '://' + app_url.netloc
    return origin + url

def site_link(url_name, title):
    """Build the html of an ``<a>`` tag pointing at the absolute url of the
    view named ``url_name``, with ``title`` as the link text.

    ``title`` is interpolated as given, without any escaping.

    todo: may be improved to process url parameters, keyword
    and other arguments
    """
    href = site_url(reverse(url_name))
    link_template = '<a href="%s">%s</a>'
    return link_template % (href, title)

def unescape(text):
    """source: http://effbot.org/zone/re-sub.htm#unescape-html
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and named entities (``&amp;``) are replaced with the
    character they stand for.  Anything that cannot be decoded - an unknown
    entity name, a malformed number or a code point out of range - is left
    in the text exactly as it was.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(match):
        """Return the character for one matched reference, or the
        reference itself when it cannot be decoded."""
        entity = match.group(0)
        try:
            if entity[:3].lower() == "&#x":
                # hexadecimal character reference
                return unichr(int(entity[3:-1], 16))
            if entity[:2] == "&#":
                # decimal character reference
                return unichr(int(entity[2:-1]))
            # named entity
            return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name; ValueError: malformed number
            # or code point out of range; OverflowError: number too large
            # for unichr() to accept
            return entity # leave as is
    return re.sub(r"&#?\w+;", fixup, text)