Code for stripping email client quote separators, and the corresponding test cases for some of the most common email clients

author: Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-06 23:02:23 -0300
committer: Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-06 23:02:23 -0300
commit: c6105f36c41b39012a2c9edae193de24b0a9a176 (patch)
tree: b0fb670ca71ae0f2e46b35bb70d59a1059c5f094
parent: fc26b08ad98dd427ab882f8ec332dfdfea2ca9b2 (diff)
download: askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.gz
askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.bz2
askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.zip
4 files changed, 134 insertions, 47 deletions
diff --git a/askbot/mail/__init__.py b/askbot/mail/__init__.py
index c9c84f33..a33793b4 100644
--- a/askbot/mail/__init__.py
+++ b/askbot/mail/__init__.py
@@ -1,9 +1,18 @@
 """functions that send email in askbot
 these automatically catch email-related exceptions
 """
+import logging
 import os
+import re
 import smtplib
-import logging
+from askbot import exceptions
+from askbot import const
+from askbot.conf import settings as askbot_settings
+from askbot.mail import parsing
+from askbot.utils import url_utils
+from askbot.utils.file_utils import store_file
+from askbot.utils.html import absolutize_urls
+from bs4 import BeautifulSoup
 from django.core import mail
 from django.conf import settings as django_settings
 from django.core.exceptions import PermissionDenied
@@ -12,14 +21,7 @@ from django.utils.translation import ugettext_lazy as _
 from django.utils.translation import string_concat
 from django.template import Context
 from django.utils.html import strip_tags
-from askbot import exceptions
-from askbot import const
-from askbot.conf import settings as askbot_settings
-from askbot.utils import url_utils
-from askbot.utils.file_utils import store_file
-from askbot.utils.html import absolutize_urls
 
-from bs4 import BeautifulSoup
 #todo: maybe send_mail functions belong to models
 #or the future API
 def prefix_the_subject_line(subject):
@@ -82,7 +84,8 @@ def thread_headers(post, orig_post, update):
     return headers
 
 def clean_html_email(email_body):
-    """needs more clenup might not work for other email templates
+    """returns the content part from an HTML email.
+    todo: needs more clenup might not work for other email templates
     that do not use table layout
     """
     soup = BeautifulSoup(email_body)
@@ -266,16 +269,12 @@ def bounce_email(
 def extract_reply(text):
     """take the part above the separator
     and discard the last line above the separator
+    ``text`` is the input text
     """
-    if const.REPLY_SEPARATOR_REGEX.search(text):
-        text = const.REPLY_SEPARATOR_REGEX.split(text)[0]
-        text_lines = text.splitlines(False)
-        #log last 10 lines of text - to capture email responses
-        logging.debug('reply-border-separator|' + '|'.join(text_lines[-10:]))
-        #here we need code stripping the "On ... somebody wrote:"
-        return '\n'.join(text_lines[:-3])
-    else:
-        return text
+    return parsing.extract_reply_contents(
+                                text,
+                                const.REPLY_SEPARATOR_REGEX
+                            )
 
 def process_attachment(attachment):
     """will save a single
@@ -294,11 +293,11 @@ def process_attachment(attachment):
 def extract_user_signature(text, reply_code):
     """extracts email signature as text trailing
     the reply code"""
-    striped_text = strip_tags(text)
-    if reply_code in striped_text:
+    stripped_text = strip_tags(text)
+    if reply_code in stripped_text:
         #extract the signature
         tail = list()
-        for line in reversed(striped_text.splitlines()):
+        for line in reversed(stripped_text.splitlines()):
             #scan backwards from the end until the magic line
             if reply_code in line:
                 break
@@ -314,10 +313,10 @@ def extract_user_signature(text, reply_code):
         return ''
 
 
-def process_parts(parts, reply_code = None):
-    """Process parts will upload the attachments and parse out the
-    body, if body is multipart. Secondly - links to attachments
-    will be added to the body of the question.
+def process_parts(parts, reply_code=None):
+    """Uploads the attachments and parses out the
+    body, if body is multipart.
+    Links to attachments will be added to the body of the question.
     Returns ready to post body of the message and the list
     of uploaded files.
     """
@@ -366,7 +365,7 @@ def process_emailed_question(
             'subject': subject,
             'body_text': body_text
         }
-        user = User.objects.get(email__iexact = email_address)
+        user = User.objects.get(email__iexact=from_address)
         form = AskByEmailForm(data, user=user)
         if form.is_valid():
             email_address = form.cleaned_data['email']
diff --git a/askbot/mail/parsing.py b/askbot/mail/parsing.py
new file mode 100644
index 00000000..919c3f02
--- /dev/null
+++ b/askbot/mail/parsing.py
@@ -0,0 +1,78 @@
+"""a module for parsing email response text
+this file is a candidate for publishing as an independent module
+"""
+import re
+
+#Regexes for quote separators
+#add more via variables ending with _QUOTE_RE
+#These regexes do not contain any trailing:
+#* newline chars,
+#* lines starting with | or >
+#* lines consisting entirely of empty space
+#expressions are stripped of month and day names
+#to keep them simpler and make the additions of language variants
+#easier.
+GMAIL_QUOTE_RE = r'\n\nOn [^\n]* wrote:\Z'
+YAHOO_QUOTE_RE = r'\n\n\n\n_+\n From: [^\n]+\nTo: [^\n]+\nSent: [^\n]+\nSubject: [^\n]+\Z'
+KMAIL_QUOTE_RE = r'\AOn [^\n]+ you wrote:\s*\n\n'
+OUTLOOK_RTF_QUOTE_RE = r'\n\nSubject: [^\n]+\nFrom: [^\n]+\nTo: [^\n]+\nDate: [^\n]+\Z'
+OUTLOOK_TEXT_QUOTE_RE = r'\n_+\Z'
+
+def compile_quote_regexes():
+    regex_names = filter(
+        lambda v: v.endswith('_QUOTE_RE'),
+        globals().keys()
+    )
+    compiled_regexes = list()
+    for regex_name in regex_names:
+        regex = globals()[regex_name]
+        compiled_regexes.append(
+            re.compile(
+                regex,
+                re.MULTILINE | re.IGNORECASE
+            )
+        )
+    return compiled_regexes
+
+CLIENT_SPECIFIC_QUOTE_REGEXES = compile_quote_regexes()
+
+def strip_trailing_empties_and_quotes(text):
+    #strip empty lines and quote lines starting with | and >
+    return re.sub(r'(([\n\s\xa0])|(\n[\|>][^\n]*))*\Z', '', text)
+
+def strip_leading_empties(text):
+    return re.sub(r'\A[\n\s\xa0]*', '', text)
+
+def strip_email_client_formatting(text):
+    """strips email client formatting from the responses,
+    such as empty lines and quote separators (on ... wrote)
+
+    if one client-specific separator matches, then result
+    is immediately returned
+    """
+    text = strip_trailing_empties_and_quotes(text)
+    for regex in CLIENT_SPECIFIC_QUOTE_REGEXES:
+        if regex.search(text):
+            text = regex.sub('', text)
+            break
+    text = strip_trailing_empties_and_quotes(text)
+    return strip_leading_empties(text)
+
+def extract_reply_contents(text, reply_separator=None):
+    """If reply_separator is given,
+    take the part above the separator.
+    After, strip the email-client-specific text
+
+    ``text`` is the input text
+    ``reply_separator`` is either a string or a regex object
+    """
+    if reply_separator:
+        if isinstance(reply_separator, basestring):
+            text = text.split(reply_separator)[0]
+        else:
+            testre = re.compile('test')
+            if type(testre) == type(reply_separator):
+                text = reply_separator.split(text)[0]
+            else:
+                raise ValueError('reply_separator must be a string or a compiled regex')
+    return strip_email_client_formatting(text)
diff --git a/askbot/tests/email_parsing_tests.py b/askbot/tests/email_parsing_tests.py
index c1cea215..c8f3a057 100644
--- a/askbot/tests/email_parsing_tests.py
+++ b/askbot/tests/email_parsing_tests.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from django.conf import settings as django_settings
 from askbot.skins.loaders import get_template
 from django.template import Context
@@ -5,7 +6,7 @@ from askbot import mail
 from askbot import models
 from askbot.tests import utils
 
-class EmailParseTests(utils.AskbotTestCase):
+class EmailParsingTests(utils.AskbotTestCase):
 
     def setUp(self):
         self.template_name = 'email/welcome_lamson_on.html'
@@ -22,29 +23,38 @@ class EmailParseTests(utils.AskbotTestCase):
         print '=================================================='
         print cleaned_body
         print "CLEANED BODY"
-        self.assertEquals(cleaned_body, self.expected_output)
+        self.assertEqual(cleaned_body, self.expected_output)
 
     def test_gmail_rich_text_response_stripped(self):
         text = u'\n\nthis is my reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, <kp@kp-dev.askbot.com> wrote:\n\n> **\n>            '
-        self.assertEqual(mail.extract_body(text), 'this is my reply!')
+        self.assertEqual(mail.extract_reply(text), 'this is my reply!')
 
-    def test_gmail_plain_text_response_stripped(text):
+    def test_gmail_plain_text_response_stripped(self):
         text = u'\n\nthis is my another reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, <kp@kp-dev.askbot.com> wrote:\n>\n> '
-        self.assertEqual(mail.extract_body(text), 'this is my another reply!')
+        self.assertEqual(mail.extract_reply(text), 'this is my another reply!')
 
     def test_yahoo_mail_response_stripped(self):
         text = u'\n\nthis is my reply!\n\n\n\n________________________________\n From: "kp@kp-dev.askbot.com" <kp@kp-dev.askbot.com>\nTo: fadeev@rocketmail.com \nSent: Wednesday, October 31, 2012 2:41 AM\nSubject: "This is my test question"\n \n\n  \n \n \n'
-        self.assertEqual(mail.extract_body(text), 'this is my reply!')
-
-    def test_kmail_plain_text_response_stripped(text):
-        text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nSecond try, no HTML kmail does weird things.'
-        self.assertEqual(mail.extract_body(text), 'this is my reply!')
-        
-
-u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000'
-
-u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000'
-
-u'Reply from squirremail \n'
-
-"""
+        self.assertEqual(mail.extract_reply(text), 'this is my reply!')
+
+    def test_kmail_plain_text_response_stripped(self):
+        text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nthis is my reply!'
+        self.assertEqual(mail.extract_reply(text), 'this is my reply!')
+
+    def test_outlook_com_with_rtf_response_stripped(self):
+        text = u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000'
+        self.assertEqual(
+            mail.extract_reply(text),
+            'outlook.com (new hotmail) with RTF on'
+        )
+        self.assertEqual(
+            mail.extract_reply(text),
+            'outlook.com (new hotmail) with RTF on'
+        )
+
+    def test_outlook_com_plain_text_response_stripped(self):
+        text = u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000'
+        self.assertEqual(
+            mail.extract_reply(text),
+            u'reply from hotmail without RTF'
+        )
diff --git a/askbot/tests/reply_by_email_tests.py b/askbot/tests/reply_by_email_tests.py
index 30cb48be..5353586c 100644
--- a/askbot/tests/reply_by_email_tests.py
+++ b/askbot/tests/reply_by_email_tests.py
@@ -84,8 +84,8 @@ class ReplyAddressModelTests(AskbotTestCase):
                                     'instruction': 'reply above this line'
                                 }
         msg = MockMessage(
-            "This is a test reply \n\nOn such and such someone"
-            "wrote something \n\n%s\nlorem ipsum " % (reply_separator),
+            "This is a test reply \n\nOn such and such someone "
+            "wrote: \n\n%s\nlorem ipsum " % (reply_separator),
             "user1@domain.com"
         )
         msg['Subject'] = 'test subject'
author	Evgeny Fadeev <evgeny.fadeev@gmail.com>	2012-11-06 23:02:23 -0300
committer	Evgeny Fadeev <evgeny.fadeev@gmail.com>	2012-11-06 23:02:23 -0300
commit	c6105f36c41b39012a2c9edae193de24b0a9a176 (patch)
tree	b0fb670ca71ae0f2e46b35bb70d59a1059c5f094
parent	fc26b08ad98dd427ab882f8ec332dfdfea2ca9b2 (diff)
download	askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.gz askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.bz2 askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.zip