summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEvgeny Fadeev <evgeny.fadeev@gmail.com>2013-08-04 14:38:55 -0400
committerEvgeny Fadeev <evgeny.fadeev@gmail.com>2013-08-04 14:38:55 -0400
commit14db3d2a9d9989dfc83296d341c3312186e0cff4 (patch)
tree2317d22b71b0900451449dd85f930cf26e688341
parentca2bf1f30f3c081abd518e541362f91ebdb968ad (diff)
downloadaskbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.gz
askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.tar.bz2
askbot-14db3d2a9d9989dfc83296d341c3312186e0cff4.zip
hopefully fixed autolinking issues
-rw-r--r--askbot/tests/markup_test.py106
-rw-r--r--askbot/utils/html.py46
-rw-r--r--askbot/utils/markup.py15
3 files changed, 161 insertions, 6 deletions
diff --git a/askbot/tests/markup_test.py b/askbot/tests/markup_test.py
index 192b108a..9cdc429f 100644
--- a/askbot/tests/markup_test.py
+++ b/askbot/tests/markup_test.py
@@ -1,4 +1,7 @@
+# -*- coding: utf-8 -*-
from django.conf import settings as django_settings
+from django.test import TestCase
+from askbot.utils.markup import markdown_input_converter
from askbot.tests.utils import AskbotTestCase
from askbot.utils import markup
@@ -22,3 +25,106 @@ class MarkupTest(AskbotTestCase):
text = "oh hai @user1 how are you?"
output = markup.extract_mentioned_name_seeds(text)
self.assertEquals(output, set(['user1']))
+
+"""
+More test cases for the future, taken from
+http://daringfireball.net/misc/2010/07/url-matching-regex-test-data.text
+
+Matches the right thing in the following lines:
+
+http://foo.com/blah_blah
+http://foo.com/blah_blah/
+(Something like http://foo.com/blah_blah)
+http://foo.com/blah_blah_(wikipedia)
+http://foo.com/more_(than)_one_(parens)
+(Something like http://foo.com/blah_blah_(wikipedia))
+http://foo.com/blah_(wikipedia)#cite-1
+http://foo.com/blah_(wikipedia)_blah#cite-1
+http://foo.com/unicode_(✪)_in_parens
+http://foo.com/(something)?after=parens
+http://foo.com/blah_blah.
+http://foo.com/blah_blah/.
+<http://foo.com/blah_blah>
+<http://foo.com/blah_blah/>
+http://foo.com/blah_blah,
+http://www.extinguishedscholar.com/wpglob/?p=364.
+http://✪df.ws/1234
+rdar://1234
+rdar:/1234
+x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
+message://%3c330e7f840905021726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e
+http://➡.ws/䨹
+www.c.ws/䨹
+<tag>http://example.com</tag>
+Just a www.example.com link.
+http://example.com/something?with,commas,in,url, but not at end
+What about <mailto:gruber@daringfireball.net?subject=TEST> (including brokets).
+mailto:name@example.com
+bit.ly/foo
+“is.gd/foo/”
+WWW.EXAMPLE.COM
+http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
+http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
+http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))
+
+
+Should fail against:
+ 6:00p
+ filename.txt
+
+
+Known to fail against:
+ http://example.com/quotes-are-“part”
+ ✪df.ws/1234
+ example.com
+ example.com/
+"""
+
+class MarkdownTestCase(TestCase):
+ """tests markdown,
+ todo: add more test cases from above"""
+ def setUp(self):
+ self.conv = markdown_input_converter
+ def test_anchor_stays_untouched(self):
+ text = """text <a href="http://example.com/">link</a> text"""
+ self.assertHTMLEqual(self.conv(text), '<p>' + text + '</p>\n')
+
+ def test_full_link_converts_to_anchor(self):
+ text = """text http://example.com/ text"""
+ expected ="""<p>text <a href="http://example.com">http://example.com</a>/ text</p>\n"""
+ #todo: note there is a weird artifact produced by markdown2 itself
+ #trailing slash after the closing </a> tag
+ #the artifact is produced by _do_auto_links() function
+ self.assertHTMLEqual(self.conv(text), expected)
+
+ def test_protocol_less_link_converts_to_anchor(self):
+ text = """text www.example.com text"""
+ expected ="""<p>text <a href="http://www.example.com">www.example.com</a> text</p>\n"""
+ self.assertHTMLEqual(self.conv(text), expected)
+
+ def test_convert_mixed_text(self):
+ text = """<p>
+some text
+<a href="http://example.com">example</a>
+replace this http://example.com
+replace that example.com
+<code>http://example.com</code>
+</p>
+<pre>http://example.com</pre>
+"""
+ """
+ this is messed up by markdown2
+ <a href="http://example.com"><div>http://example.com</div></a>
+ """
+ expected = """<p>
+some text
+<a href="http://example.com">example</a>
+replace this <a href="http://example.com">http://example.com</a>
+replace that <a href="http://example.com">example.com</a>
+<code>http://example.com</code>
+</p>
+<pre>http://example.com</pre>
+"""
+ """<a href="http://example.com"><div>http://example.com</div></a>
+ """
+ self.assertHTMLEqual(self.conv(text), expected)
diff --git a/askbot/utils/html.py b/askbot/utils/html.py
index d7b321da..72947204 100644
--- a/askbot/utils/html.py
+++ b/askbot/utils/html.py
@@ -1,11 +1,13 @@
"""Utilities for working with HTML."""
from bs4 import BeautifulSoup
+from bs4 import NavigableString
import html5lib
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import re
import htmlentitydefs
from urlparse import urlparse
from django.core.urlresolvers import reverse
+from django.utils.html import urlize
from askbot.conf import settings as askbot_settings
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
@@ -61,6 +63,50 @@ def absolutize_urls(html):
#temporal fix for bad regex with wysiwyg editor
return url_re4.sub(replacement, html).replace('%s//' % base_url, '%s/' % base_url)
+def urlize_html(html):
+ """will urlize html, while ignoring link
+ patterns inside anchors, <pre> and <code> tags
+ """
+ soup = BeautifulSoup(html, 'html5lib')
+ extract_nodes = list()
+ for node in soup.findAll(text=True):
+ parent_tags = [p.name for p in node.parents]
+ skip_tags = ['a', 'img', 'pre', 'code']
+ if set(parent_tags) & set(skip_tags):
+ continue
+
+ #bs4 is weird, so we work around to replace nodes
+ #maybe there is a better way though
+ urlized_text = urlize(node)
+ if unicode(node) == urlized_text:
+ continue
+
+ sub_soup = BeautifulSoup(urlize(node), 'html5lib')
+ contents = sub_soup.find('body').contents
+ num_items = len(contents)
+ for i in range(num_items):
+ #there is a strange thing in bs4, can't iterate
+ #as the tag seemingly can't belong to >1 soup object
+ child = contents[0] #always take first element
+ #ensure that text nodes are sandwiched by spaces
+ have_string = (not hasattr(child, 'name'))
+ if have_string:
+ node.insert_before(soup.new_string(' '))
+ node.insert_before(child)
+ if have_string:
+ node.insert_before(soup.new_string(' '))
+
+ extract_nodes.append(node)
+
+ #extract the nodes that we replaced
+ for node in extract_nodes:
+ node.extract()
+
+ result = unicode(soup.find('body').renderContents(), 'utf8')
+ if html.endswith('\n') and not result.endswith('\n'):
+ result += '\n'
+ return result
+
def replace_links_with_text(html):
"""any absolute links will be replaced with the
url in plain text, same with any img tags
diff --git a/askbot/utils/markup.py b/askbot/utils/markup.py
index 5b6bf3a2..61821bba 100644
--- a/askbot/utils/markup.py
+++ b/askbot/utils/markup.py
@@ -7,10 +7,12 @@ import re
import logging
from askbot import const
from askbot.conf import settings as askbot_settings
-from askbot.utils.html import sanitize_html, strip_tags
+from askbot.utils.html import sanitize_html
+from askbot.utils.html import strip_tags
+from askbot.utils.html import urlize_html
from django.utils.html import urlize
from markdown2 import Markdown
-#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501 by Brian Bothwell
+#url taken from http://regexlib.com/REDetails.aspx?regexp_id=501
URL_RE = re.compile("((?<!(href|.src|data)=['\"])((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&amp;%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&amp;%\$#\=~_\-]+))*))")
def get_parser():
@@ -28,9 +30,10 @@ def get_parser():
#pip install -e git+git://github.com/andryuha/python-markdown2.git
extras.append('video')
- link_patterns = [
- (URL_RE, r'\1'),
- ]
+ #link_patterns = [
+ # (URL_RE, r'\1'),
+ #]
+ link_patterns = []
if askbot_settings.ENABLE_AUTO_LINKING:
pattern_list = askbot_settings.AUTO_LINK_PATTERNS.split('\n')
url_list = askbot_settings.AUTO_LINK_URLS.split('\n')
@@ -198,8 +201,8 @@ def plain_text_input_converter(text):
def markdown_input_converter(text):
"""markdown to html converter"""
- text = urlize(text)
text = get_parser().convert(text)
+ text = urlize_html(text)
return sanitize_html(text)
def tinymce_input_converter(text):