diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2013-08-04 14:38:55 -0400 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2013-08-04 14:38:55 -0400 |
commit | 14db3d2a9d9989dfc83296d341c3312186e0cff4 (patch) | |
tree | 2317d22b71b0900451449dd85f930cf26e688341 | |
parent | ca2bf1f30f3c081abd518e541362f91ebdb968ad (diff) | |
download | askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.gz askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.bz2 askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.zip |
hopefully fixed autolinking issues
-rw-r--r-- | askbot/tests/markup_test.py | 106 | ||||
-rw-r--r-- | askbot/utils/html.py | 46 | ||||
-rw-r--r-- | askbot/utils/markup.py | 15 |
3 files changed, 161 insertions, 6 deletions
diff --git a/askbot/tests/markup_test.py b/askbot/tests/markup_test.py index 192b108a..9cdc429f 100644 --- a/askbot/tests/markup_test.py +++ b/askbot/tests/markup_test.py @@ -1,4 +1,7 @@ +# -*- coding: utf-8 -*- from django.conf import settings as django_settings +from django.test import TestCase +from askbot.utils.markup import markdown_input_converter from askbot.tests.utils import AskbotTestCase from askbot.utils import markup @@ -22,3 +25,106 @@ class MarkupTest(AskbotTestCase): text = "oh hai @user1 how are you?" output = markup.extract_mentioned_name_seeds(text) self.assertEquals(output, set(['user1'])) + +""" +More test cases for the future, taken from +http://daringfireball.net/misc/2010/07/url-matching-regex-test-data.text + +Matches the right thing in the following lines: + +http://foo.com/blah_blah +http://foo.com/blah_blah/ +(Something like http://foo.com/blah_blah) +http://foo.com/blah_blah_(wikipedia) +http://foo.com/more_(than)_one_(parens) +(Something like http://foo.com/blah_blah_(wikipedia)) +http://foo.com/blah_(wikipedia)#cite-1 +http://foo.com/blah_(wikipedia)_blah#cite-1 +http://foo.com/unicode_(✪)_in_parens +http://foo.com/(something)?after=parens +http://foo.com/blah_blah. +http://foo.com/blah_blah/. +<http://foo.com/blah_blah> +<http://foo.com/blah_blah/> +http://foo.com/blah_blah, +http://www.extinguishedscholar.com/wpglob/?p=364. +http://✪df.ws/1234 +rdar://1234 +rdar:/1234 +x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E +message://%3c330e7f840905021726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e +http://➡.ws/䨹 +www.c.ws/䨹 +<tag>http://example.com</tag> +Just a www.example.com link. +http://example.com/something?with,commas,in,url, but not at end +What about <mailto:gruber@daringfireball.net?subject=TEST> (including brokets). 
+mailto:name@example.com +bit.ly/foo +“is.gd/foo/” +WWW.EXAMPLE.COM +http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752 +http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55)) +http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634)) + + +Should fail against: + 6:00p + filename.txt + + +Known to fail against: + http://example.com/quotes-are-“part” + ✪df.ws/1234 + example.com + example.com/ +""" + +class MarkdownTestCase(TestCase): + """tests markdown, + todo: add more test cases from above""" + def setUp(self): + self.conv = markdown_input_converter + def test_anchor_stays_untouched(self): + text = """text <a href="http://example.com/">link</a> text""" + self.assertHTMLEqual(self.conv(text), '<p>' + text + '</p>\n') + + def test_full_link_converts_to_anchor(self): + text = """text http://example.com/ text""" + expected ="""<p>text <a href="http://example.com">http://example.com</a>/ text</p>\n""" + #todo: note there is a weird artifact produced by markdown2 itself + #trailing slash after the closing </a> tag + #the artifact is produced by _do_auto_links() function + self.assertHTMLEqual(self.conv(text), expected) + + def test_protocol_less_link_converts_to_anchor(self): + text = """text www.example.com text""" + expected ="""<p>text <a href="http://www.example.com">www.example.com</a> text</p>\n""" + self.assertHTMLEqual(self.conv(text), expected) + + def test_convert_mixed_text(self): + text = """<p> +some text +<a href="http://example.com">example</a> +replace this http://example.com +replace that example.com +<code>http://example.com</code> +</p> +<pre>http://example.com</pre> +""" + """ + this is messed up by markdown2 + <a href="http://example.com"><div>http://example.com</div></a> + """ + expected = """<p> +some text +<a href="http://example.com">example</a> +replace this <a href="http://example.com">http://example.com</a> +replace that <a href="http://example.com">example.com</a> 
+<code>http://example.com</code> +</p> +<pre>http://example.com</pre> +""" + """<a href="http://example.com"><div>http://example.com</div></a> + """ + self.assertHTMLEqual(self.conv(text), expected) diff --git a/askbot/utils/html.py b/askbot/utils/html.py index d7b321da..72947204 100644 --- a/askbot/utils/html.py +++ b/askbot/utils/html.py @@ -1,11 +1,13 @@ """Utilities for working with HTML.""" from bs4 import BeautifulSoup +from bs4 import NavigableString import html5lib from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers import re import htmlentitydefs from urlparse import urlparse from django.core.urlresolvers import reverse +from django.utils.html import urlize from askbot.conf import settings as askbot_settings class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin): @@ -61,6 +63,50 @@ def absolutize_urls(html): #temporal fix for bad regex with wysiwyg editor return url_re4.sub(replacement, html).replace('%s//' % base_url, '%s/' % base_url) +def urlize_html(html): + """will urlize html, while ignoring link + patterns inside anchors, <pre> and <code> tags + """ + soup = BeautifulSoup(html, 'html5lib') + extract_nodes = list() + for node in soup.findAll(text=True): + parent_tags = [p.name for p in node.parents] + skip_tags = ['a', 'img', 'pre', 'code'] + if set(parent_tags) & set(skip_tags): + continue + + #bs4 is weird, so we work around to replace nodes + #maybe there is a better way though + urlized_text = urlize(node) + if unicode(node) == urlized_text: + continue + + sub_soup = BeautifulSoup(urlize(node), 'html5lib') + contents = sub_soup.find('body').contents + num_items = len(contents) + for i in range(num_items): + #there is strange thing in bs4, can't iterate + #as the tag seemingly can't belong to >1 soup object + child = contents[0] #always take first element + #ensure that text nodes are sandwiched by space + have_string = (not hasattr(child, 'name')) + if have_string: + node.insert_before(soup.new_string(' ')) + 
node.insert_before(child) + if have_string: + node.insert_before(soup.new_string(' ')) + + extract_nodes.append(node) + + #extract the nodes that we replaced + for node in extract_nodes: + node.extract() + + result = unicode(soup.find('body').renderContents(), 'utf8') + if html.endswith('\n') and not result.endswith('\n'): + result += '\n' + return result + def replace_links_with_text(html): """any absolute links will be replaced with the url in plain text, same with any img tags diff --git a/askbot/utils/markup.py b/askbot/utils/markup.py index 5b6bf3a2..61821bba 100644 --- a/askbot/utils/markup.py +++ b/askbot/utils/markup.py @@ -7,10 +7,12 @@ import re import logging from askbot import const from askbot.conf import settings as askbot_settings -from askbot.utils.html import sanitize_html, strip_tags +from askbot.utils.html import sanitize_html +from askbot.utils.html import strip_tags +from askbot.utils.html import urlize_html from django.utils.html import urlize from markdown2 import Markdown -#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501 by Brian Bothwell +#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501 URL_RE = re.compile("((?<!(href|.src|data)=['\"])((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*))") def get_parser(): @@ -28,9 +30,10 @@ def get_parser(): #pip install -e git+git://github.com/andryuha/python-markdown2.git extras.append('video') - link_patterns = [ - (URL_RE, r'\1'), - ] + #link_patterns = [ + # (URL_RE, r'\1'), + #] + link_patterns = [] if 
askbot_settings.ENABLE_AUTO_LINKING: pattern_list = askbot_settings.AUTO_LINK_PATTERNS.split('\n') url_list = askbot_settings.AUTO_LINK_URLS.split('\n') @@ -198,8 +201,8 @@ def plain_text_input_converter(text): def markdown_input_converter(text): """markdown to html converter""" - text = urlize(text) text = get_parser().convert(text) + text = urlize_html(text) return sanitize_html(text) def tinymce_input_converter(text): |