"""Utilities for working with HTML."""
from bs4 import BeautifulSoup
import html5lib
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import re
import htmlentitydefs
from urlparse import urlparse
from django.core.urlresolvers import reverse
from askbot.conf import settings as askbot_settings
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """Whitelist settings overriding those of html5lib's sanitizer mixin.

    Only tag and attribute names are whitelisted here; the CSS property,
    CSS keyword and SVG property whitelists are deliberately left empty.
    """

    # tag names on the whitelist
    acceptable_elements = (
        'a', 'abbr', 'acronym', 'address', 'b', 'big', 'blockquote', 'br',
        'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del',
        'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font', 'h1', 'h2', 'h3',
        'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'ol', 'p',
        'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub',
        'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt',
        'u', 'ul', 'var', 'object', 'param',
    )

    # attribute names on the whitelist
    acceptable_attributes = (
        'abbr', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'cite', 'cols',
        'colspan', 'data', 'datetime', 'dir', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'lang', 'longdesc', 'name', 'nohref',
        'noshade', 'nowrap', 'rel', 'rev', 'rows', 'rowspan', 'rules',
        'scope', 'span', 'src', 'start', 'summary', 'title', 'type',
        'valign', 'vspace', 'width',
    )

    # the same whitelists, exposed under the ``allowed_*`` names as well
    # (presumably the names the base class reads -- verify against the
    # installed html5lib version)
    allowed_elements = acceptable_elements
    allowed_attributes = acceptable_attributes

    # nothing is whitelisted for css or svg
    allowed_css_properties = ()
    allowed_css_keywords = ()
    allowed_svg_properties = ()
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that runs every token through ``sanitize_token``.

    Tokens for which ``sanitize_token`` returns a false value are dropped
    from the stream.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        """Forward all the arguments, unchanged, to the base tokenizer."""
        tokenizer.HTMLTokenizer.__init__(
            self,
            stream,
            encoding,
            parseMeta,
            useChardet,
            lowercaseElementName,
            lowercaseAttrName,
            **kwargs
        )

    def __iter__(self):
        """Yield the sanitized version of each token of the base tokenizer."""
        for raw_token in tokenizer.HTMLTokenizer.__iter__(self):
            clean_token = self.sanitize_token(raw_token)
            if not clean_token:
                # the sanitizer rejected this token altogether
                continue
            yield clean_token
def absolutize_urls(html):
    """turns relative urls in <img> and <a> tags to absolute,
    starting with the ``askbot_settings.APP_URL``

    Only urls that start with a slash are rewritten. The scheme and host
    to prepend are obtained from ``site_url('')``. Rewritten <img> tags
    additionally receive an inline ``style="max-width:500px;"`` attribute.
    Returns the modified html string.
    """
    # In every pattern ``prefix`` captures the tag text up to and including
    # the attribute name, and ``url`` captures the slash-led url itself.
    # Double- and single-quoted attribute values need separate patterns.
    img_double_re = re.compile(
        r'(?P<prefix><img[^<]+src=)"(?P<url>/[^"]+)"', re.I
    )
    img_single_re = re.compile(
        r"(?P<prefix><img[^<]+src=)'(?P<url>/[^']+)'", re.I
    )
    link_double_re = re.compile(
        r'(?P<prefix><a[^<]+href=)"(?P<url>/[^"]+)"', re.I
    )
    link_single_re = re.compile(
        r"(?P<prefix><a[^<]+href=)'(?P<url>/[^']+)'", re.I
    )

    base_url = site_url('')  # important to have this without the slash

    # raw strings, so that ``\g<name>`` reaches ``re`` as a group reference
    img_replacement = (
        r'\g<prefix>"%s/\g<url>" style="max-width:500px;"' % base_url
    )
    link_replacement = r'\g<prefix>"%s\g<url>"' % base_url

    html = img_double_re.sub(img_replacement, html)
    html = img_single_re.sub(img_replacement, html)
    html = link_double_re.sub(link_replacement, html)
    html = link_single_re.sub(link_replacement, html)

    # temporary fix for bad regex with wysiwyg editor: the <img> replacement
    # inserts its own slash in front of a url that already starts with one,
    # so the doubled slash right after the base url is collapsed here
    return html.replace('%s//' % base_url, '%s/' % base_url)
def replace_links_with_text(html):
    """Replace links and images with a plain text rendition.

    * an <img> whose ``src`` is empty or absolute (http/https) is replaced
      with text built from its ``src`` and ``alt``
    * an <a> that has no text at all is removed
    * an <a> whose ``href`` is empty or absolute is replaced with text
      built from its ``href`` and link text

    Tags with any other kind of url are left alone. Returns the rendered
    contents of the parsed document's <body>, decoded as utf-8.
    """
    absolute_url_pattern = r'^http(s)?://'

    def describe(url, text):
        """Return "url (text)", or just one of them if the other one
        is empty or adds no information"""
        url = url.strip()
        text = text.strip()
        domain = urlparse(url).netloc
        # text that merely repeats the url or its domain is not shown
        text_is_redundant = (not text) or text == domain or text == url
        if not url or text_is_redundant:
            return url or text or ''
        return '%s (%s)' % (url, text)

    soup = BeautifulSoup(html, 'html5lib')

    for image in soup.find_all('img'):
        src = image.get('src', '')
        alt = image.get('alt', '')
        if src != '' and not re.match(absolute_url_pattern, src):
            # relative image urls stay as they are
            continue
        image.replaceWith(describe(src, alt))

    for link in soup.find_all('a'):
        href = link.get('href', '')
        label = ''.join(link.text)
        if label == '':
            # this is due to an issue with url inlining in comments
            link.replaceWith('')
            continue
        if href == '' or re.match(absolute_url_pattern, href):
            link.replaceWith(describe(href, label))

    body = soup.find('body')
    return unicode(body.renderContents(), 'utf-8')
def strip_tags(html, tags=None):
    """strips tags from given html output

    Every element whose name is listed in ``tags`` is replaced with an
    empty string. ``tags`` must be an iterable of tag names; it may only
    be omitted when ``html`` is blank.

    Returns the rendered contents of the parsed document's <body>, decoded
    as utf-8. Blank input is returned unchanged.
    """
    # a corner case: there is nothing to parse, so hand the input back as is
    if html.strip() == '':
        return html

    assert tags is not None

    soup = BeautifulSoup(html, 'html5lib')
    for tag in tags:
        # an explicit loop: map() must not be relied upon for side effects,
        # as a lazy map() would never perform the replacements
        for element in soup.find_all(tag):
            element.replaceWith('')
    return unicode(soup.find('body').renderContents(), 'utf-8')
def sanitize_html(html):
    """Sanitizes an HTML fragment.

    The fragment is parsed into a dom tree with ``HTMLSanitizer`` as the
    tokenizer, then the tree is serialized back into a unicode string
    with optional tags kept and attribute values quoted.
    """
    parser = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom")
    )
    fragment = parser.parseFragment(html)

    walk_dom = treewalkers.getTreeWalker("dom")
    token_stream = walk_dom(fragment)

    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True
    )
    return u''.join(html_serializer.serialize(token_stream))
def site_url(url):
    """Prefix ``url`` with the scheme and host of ``settings.APP_URL``.

    Only the scheme and the network location of ``APP_URL`` are used;
    any path that ``APP_URL`` may have is not included in the result.
    """
    from askbot.conf import settings
    app_url = urlparse(settings.APP_URL)
    site_root = app_url.scheme + '://' + app_url.netloc
    return site_root + url
def site_link(url_name, title):
    """returns html for the link to the given url

    ``url_name`` is resolved with ``reverse`` and made absolute with
    ``site_url``; ``title`` becomes the text of the link.

    todo: may be improved to process url parameters, keyword
    and other arguments
    """
    url = site_url(reverse(url_name))
    # NOTE(review): url and title are interpolated without any escaping --
    # confirm that callers only pass trusted values
    return '<a href="%s">%s</a>' % (url, title)
def unescape(text):
    """source: http://effbot.org/zone/re-sub.htm#unescape-html
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references
    as well as named entities (``&amp;``) are replaced with the characters
    they stand for. Anything that cannot be decoded -- an unknown entity
    name, malformed digits or a code point out of range -- is left as is.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(match):
        entity = match.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3] == "&#x":
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)