summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEvgeny Fadeev <evgeny.fadeev@gmail.com>2013-08-04 14:38:55 -0400
committerEvgeny Fadeev <evgeny.fadeev@gmail.com>2013-08-04 14:38:55 -0400
commit14db3d2a9d9989dfc83296d341c3312186e0cff4 (patch)
tree2317d22b71b0900451449dd85f930cf26e688341
parentca2bf1f30f3c081abd518e541362f91ebdb968ad (diff)
downloadaskbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.gz
askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.bz2
askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.zip
hopefully fixed autolinking issues
-rw-r--r--askbot/tests/markup_test.py106
-rw-r--r--askbot/utils/html.py46
-rw-r--r--askbot/utils/markup.py15
3 files changed, 161 insertions, 6 deletions
diff --git a/askbot/tests/markup_test.py b/askbot/tests/markup_test.py
index 192b108a..9cdc429f 100644
--- a/askbot/tests/markup_test.py
+++ b/askbot/tests/markup_test.py
@@ -1,4 +1,7 @@
+# -*- coding: utf-8 -*-
from django.conf import settings as django_settings
+from django.test import TestCase
+from askbot.utils.markup import markdown_input_converter
from askbot.tests.utils import AskbotTestCase
from askbot.utils import markup
@@ -22,3 +25,106 @@ class MarkupTest(AskbotTestCase):
text = "oh hai @user1 how are you?"
output = markup.extract_mentioned_name_seeds(text)
self.assertEquals(output, set(['user1']))
+
+"""
+More test cases for the future, taken from
+http://daringfireball.net/misc/2010/07/url-matching-regex-test-data.text
+
+Matches the right thing in the following lines:
+
+http://foo.com/blah_blah
+http://foo.com/blah_blah/
+(Something like http://foo.com/blah_blah)
+http://foo.com/blah_blah_(wikipedia)
+http://foo.com/more_(than)_one_(parens)
+(Something like http://foo.com/blah_blah_(wikipedia))
+http://foo.com/blah_(wikipedia)#cite-1
+http://foo.com/blah_(wikipedia)_blah#cite-1
+http://foo.com/unicode_(✪)_in_parens
+http://foo.com/(something)?after=parens
+http://foo.com/blah_blah.
+http://foo.com/blah_blah/.
+<http://foo.com/blah_blah>
+<http://foo.com/blah_blah/>
+http://foo.com/blah_blah,
+http://www.extinguishedscholar.com/wpglob/?p=364.
+http://✪df.ws/1234
+rdar://1234
+rdar:/1234
+x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
+message://%3c330e7f840905021726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e
+http://➡.ws/䨹
+www.c.ws/䨹
+<tag>http://example.com</tag>
+Just a www.example.com link.
+http://example.com/something?with,commas,in,url, but not at end
+What about <mailto:gruber@daringfireball.net?subject=TEST> (including brokets).
+mailto:name@example.com
+bit.ly/foo
+“is.gd/foo/”
+WWW.EXAMPLE.COM
+http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
+http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
+http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))
+
+
+Should fail against:
+ 6:00p
+ filename.txt
+
+
+Known to fail against:
+ http://example.com/quotes-are-“part”
+ ✪df.ws/1234
+ example.com
+ example.com/
+"""
+
+class MarkdownTestCase(TestCase):
+ """tests markdown,
+ todo: add more test cases from above"""
+ def setUp(self):
+ self.conv = markdown_input_converter
+ def test_anchor_stays_untouched(self):
+ text = """text <a href="http://example.com/">link</a> text"""
+ self.assertHTMLEqual(self.conv(text), '<p>' + text + '</p>\n')
+
+ def test_full_link_converts_to_anchor(self):
+ text = """text http://example.com/ text"""
+ expected ="""<p>text <a href="http://example.com">http://example.com</a>/ text</p>\n"""
+ #todo: note there is a weird artifact produced by markdown2 itself
+ #trailing slash after the closing </a> tag
+ #the artifact is produced by _do_auto_links() function
+ self.assertHTMLEqual(self.conv(text), expected)
+
+ def test_protocol_less_link_converts_to_anchor(self):
+ text = """text www.example.com text"""
+ expected ="""<p>text <a href="http://www.example.com">www.example.com</a> text</p>\n"""
+ self.assertHTMLEqual(self.conv(text), expected)
+
+ def test_convert_mixed_text(self):
+ text = """<p>
+some text
+<a href="http://example.com">example</a>
+replace this http://example.com
+replace that example.com
+<code>http://example.com</code>
+</p>
+<pre>http://example.com</pre>
+"""
+ """
+ this is messed up by markdown2
+ <a href="http://example.com"><div>http://example.com</div></a>
+ """
+ expected = """<p>
+some text
+<a href="http://example.com">example</a>
+replace this <a href="http://example.com">http://example.com</a>
+replace that <a href="http://example.com">example.com</a>
+<code>http://example.com</code>
+</p>
+<pre>http://example.com</pre>
+"""
+ """<a href="http://example.com"><div>http://example.com</div></a>
+ """
+ self.assertHTMLEqual(self.conv(text), expected)
diff --git a/askbot/utils/html.py b/askbot/utils/html.py
index d7b321da..72947204 100644
--- a/askbot/utils/html.py
+++ b/askbot/utils/html.py
@@ -1,11 +1,13 @@
"""Utilities for working with HTML."""
from bs4 import BeautifulSoup
+from bs4 import NavigableString
import html5lib
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import re
import htmlentitydefs
from urlparse import urlparse
from django.core.urlresolvers import reverse
+from django.utils.html import urlize
from askbot.conf import settings as askbot_settings
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
@@ -61,6 +63,50 @@ def absolutize_urls(html):
#temporal fix for bad regex with wysiwyg editor
return url_re4.sub(replacement, html).replace('%s//' % base_url, '%s/' % base_url)
+def urlize_html(html):
+ """will urlize html, while ignoring link
+ patterns inside anchors, <pre> and <code> tags
+ """
+ soup = BeautifulSoup(html, 'html5lib')
+ extract_nodes = list()
+ for node in soup.findAll(text=True):
+ parent_tags = [p.name for p in node.parents]
+ skip_tags = ['a', 'img', 'pre', 'code']
+ if set(parent_tags) & set(skip_tags):
+ continue
+
+ #bs4 is weird, so we work around to replace nodes
+ #maybe there is a better way though
+ urlized_text = urlize(node)
+ if unicode(node) == urlized_text:
+ continue
+
+ sub_soup = BeautifulSoup(urlize(node), 'html5lib')
+ contents = sub_soup.find('body').contents
+ num_items = len(contents)
+ for i in range(num_items):
+ #there is a strange thing in bs4, can't iterate
+ #as the tag seemingly can't belong to >1 soup object
+ child = contents[0] #always take first element
+ #ensure that text nodes are sandwiched by spaces
+ have_string = (not hasattr(child, 'name'))
+ if have_string:
+ node.insert_before(soup.new_string(' '))
+ node.insert_before(child)
+ if have_string:
+ node.insert_before(soup.new_string(' '))
+
+ extract_nodes.append(node)
+
+ #extract the nodes that we replaced
+ for node in extract_nodes:
+ node.extract()
+
+ result = unicode(soup.find('body').renderContents(), 'utf8')
+ if html.endswith('\n') and not result.endswith('\n'):
+ result += '\n'
+ return result
+
def replace_links_with_text(html):
"""any absolute links will be replaced with the
url in plain text, same with any img tags
diff --git a/askbot/utils/markup.py b/askbot/utils/markup.py
index 5b6bf3a2..61821bba 100644
--- a/askbot/utils/markup.py
+++ b/askbot/utils/markup.py
@@ -7,10 +7,12 @@ import re
import logging
from askbot import const
from askbot.conf import settings as askbot_settings
-from askbot.utils.html import sanitize_html, strip_tags
+from askbot.utils.html import sanitize_html
+from askbot.utils.html import strip_tags
+from askbot.utils.html import urlize_html
from django.utils.html import urlize
from markdown2 import Markdown
-#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501 by Brian Bothwell
+#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501
URL_RE = re.compile("((?<!(href|.src|data)=['\"])((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&amp;%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&amp;%\$#\=~_\-]+))*))")
def get_parser():
@@ -28,9 +30,10 @@ def get_parser():
#pip install -e git+git://github.com/andryuha/python-markdown2.git
extras.append('video')
- link_patterns = [
- (URL_RE, r'\1'),
- ]
+ #link_patterns = [
+ # (URL_RE, r'\1'),
+ #]
+ link_patterns = []
if askbot_settings.ENABLE_AUTO_LINKING:
pattern_list = askbot_settings.AUTO_LINK_PATTERNS.split('\n')
url_list = askbot_settings.AUTO_LINK_URLS.split('\n')
@@ -198,8 +201,8 @@ def plain_text_input_converter(text):
def markdown_input_converter(text):
"""markdown to html converter"""
- text = urlize(text)
text = get_parser().convert(text)
+ text = urlize_html(text)
return sanitize_html(text)
def tinymce_input_converter(text):