From c6105f36c41b39012a2c9edae193de24b0a9a176 Mon Sep 17 00:00:00 2001 From: Evgeny Fadeev Date: Tue, 6 Nov 2012 23:02:23 -0300 Subject: code for stripping email client quote separator and the corresponding test cases for some most common email clients --- askbot/mail/__init__.py | 51 ++++++++++++----------- askbot/mail/parsing.py | 78 ++++++++++++++++++++++++++++++++++++ askbot/tests/email_parsing_tests.py | 48 +++++++++++++--------- askbot/tests/reply_by_email_tests.py | 4 +- 4 files changed, 134 insertions(+), 47 deletions(-) create mode 100644 askbot/mail/parsing.py diff --git a/askbot/mail/__init__.py b/askbot/mail/__init__.py index c9c84f33..a33793b4 100644 --- a/askbot/mail/__init__.py +++ b/askbot/mail/__init__.py @@ -1,9 +1,18 @@ """functions that send email in askbot these automatically catch email-related exceptions """ +import logging import os +import re import smtplib -import logging +from askbot import exceptions +from askbot import const +from askbot.conf import settings as askbot_settings +from askbot.mail import parsing +from askbot.utils import url_utils +from askbot.utils.file_utils import store_file +from askbot.utils.html import absolutize_urls +from bs4 import BeautifulSoup from django.core import mail from django.conf import settings as django_settings from django.core.exceptions import PermissionDenied @@ -12,14 +21,7 @@ from django.utils.translation import ugettext_lazy as _ from django.utils.translation import string_concat from django.template import Context from django.utils.html import strip_tags -from askbot import exceptions -from askbot import const -from askbot.conf import settings as askbot_settings -from askbot.utils import url_utils -from askbot.utils.file_utils import store_file -from askbot.utils.html import absolutize_urls -from bs4 import BeautifulSoup #todo: maybe send_mail functions belong to models #or the future API def prefix_the_subject_line(subject): @@ -82,7 +84,8 @@ def thread_headers(post, orig_post, update): return headers def clean_html_email(email_body): - """needs more clenup might not work for other email templates + """returns the content part from an HTML email. + todo: needs more clenup might not work for other email templates that do not use table layout """ soup = BeautifulSoup(email_body) @@ -266,16 +269,12 @@ def bounce_email( def extract_reply(text): """take the part above the separator and discard the last line above the separator + ``text`` is the input text """ - if const.REPLY_SEPARATOR_REGEX.search(text): - text = const.REPLY_SEPARATOR_REGEX.split(text)[0] - text_lines = text.splitlines(False) - #log last 10 lines of text - to capture email responses - logging.debug('reply-border-separator|' + '|'.join(text_lines[-10:])) - #here we need code stripping the "On ... somebody wrote:" - return '\n'.join(text_lines[:-3]) - else: - return text + return parsing.extract_reply_contents( + text, + const.REPLY_SEPARATOR_REGEX + ) def process_attachment(attachment): """will save a single @@ -294,11 +293,11 @@ def process_attachment(attachment): def extract_user_signature(text, reply_code): """extracts email signature as text trailing the reply code""" - striped_text = strip_tags(text) - if reply_code in striped_text: + stripped_text = strip_tags(text) + if reply_code in stripped_text: #extract the signature tail = list() - for line in reversed(striped_text.splitlines()): + for line in reversed(stripped_text.splitlines()): #scan backwards from the end until the magic line if reply_code in line: break @@ -314,10 +313,10 @@ def extract_user_signature(text, reply_code): return '' -def process_parts(parts, reply_code = None): - """Process parts will upload the attachments and parse out the - body, if body is multipart. Secondly - links to attachments - will be added to the body of the question. +def process_parts(parts, reply_code=None): + """Uploads the attachments and parses out the + body, if body is multipart. + Links to attachments will be added to the body of the question. Returns ready to post body of the message and the list of uploaded files. """ @@ -366,7 +365,7 @@ def process_emailed_question( 'subject': subject, 'body_text': body_text } - user = User.objects.get(email__iexact = email_address) + user = User.objects.get(email__iexact=from_address) form = AskByEmailForm(data, user=user) if form.is_valid(): email_address = form.cleaned_data['email'] diff --git a/askbot/mail/parsing.py b/askbot/mail/parsing.py new file mode 100644 index 00000000..919c3f02 --- /dev/null +++ b/askbot/mail/parsing.py @@ -0,0 +1,78 @@ +"""a module for parsing email response text +this file is a candidate for publishing as an independent module +""" +import re + +#Regexes for quote separators +#add more via variables ending with _QUOTE_RE +#These regexes do not contain any trailing: +#* newline chars, +#* lines starting with | or > +#* lines consisting entirely of empty space +#expressions are stripped of month and day names +#to keep them simpler and make the additions of language variants +#easier. +GMAIL_QUOTE_RE = r'\n\nOn [^\n]* wrote:\Z' +YAHOO_QUOTE_RE = r'\n\n\n\n_+\n From: [^\n]+\nTo: [^\n]+\nSent: [^\n]+\nSubject: [^\n]+\Z' +KMAIL_QUOTE_RE = r'\AOn [^\n]+ you wrote:\s*\n\n' +OUTLOOK_RTF_QUOTE_RE = r'\n\nSubject: [^\n]+\nFrom: [^\n]+\nTo: [^\n]+\nDate: [^\n]+\Z' +OUTLOOK_TEXT_QUOTE_RE = r'\n_+\Z' + +def compile_quote_regexes(): + regex_names = filter( + lambda v: v.endswith('_QUOTE_RE'), + globals().keys() + ) + compiled_regexes = list() + for regex_name in regex_names: + regex = globals()[regex_name] + compiled_regexes.append( + re.compile( + regex, + re.MULTILINE | re.IGNORECASE + ) + ) + return compiled_regexes + +CLIENT_SPECIFIC_QUOTE_REGEXES = compile_quote_regexes() + +def strip_trailing_empties_and_quotes(text): + #strip empty lines and quote lines starting with | and > + return re.sub(r'(([\n\s\xa0])|(\n[\|>][^\n]*))*\Z', '', text) + +def strip_leading_empties(text): + return re.sub(r'\A[\n\s\xa0]*', '', text) + +def strip_email_client_formatting(text): + """strips email client formatting from the responses, + such as empty lines and quote separators (on ... wrote) + + if one client-specific separator matches, then result + is immediately returned + """ + text = strip_trailing_empties_and_quotes(text) + for regex in CLIENT_SPECIFIC_QUOTE_REGEXES: + if regex.search(text): + text = regex.sub('', text) + break + text = strip_trailing_empties_and_quotes(text) + return strip_leading_empties(text) + +def extract_reply_contents(text, reply_separator=None): + """If reply_separator is given, + take the part above the separator. + After, strip the email-client-specific text + + ``text`` is the input text + ``reply_separator`` is either a string or a regex object + """ + if reply_separator: + if isinstance(reply_separator, basestring): + text = text.split(reply_separator)[0] + else: + testre = re.compile('test') + if type(testre) == type(reply_separator): + text = reply_separator.split(text)[0] + else: + raise ValueError('reply_separator must be a string or a compiled regex') + return strip_email_client_formatting(text) diff --git a/askbot/tests/email_parsing_tests.py b/askbot/tests/email_parsing_tests.py index c1cea215..c8f3a057 100644 --- a/askbot/tests/email_parsing_tests.py +++ b/askbot/tests/email_parsing_tests.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from django.conf import settings as django_settings from askbot.skins.loaders import get_template from django.template import Context @@ -5,7 +6,7 @@ from askbot import mail from askbot import models from askbot.tests import utils -class EmailParseTests(utils.AskbotTestCase): +class EmailParsingTests(utils.AskbotTestCase): def setUp(self): self.template_name = 'email/welcome_lamson_on.html' @@ -22,29 +23,38 @@ class EmailParseTests(utils.AskbotTestCase): print '==================================================' print cleaned_body print "CLEANED BODY" - self.assertEquals(cleaned_body, self.expected_output) + self.assertEqual(cleaned_body, self.expected_output) def test_gmail_rich_text_response_stripped(self): text = u'\n\nthis is my reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, wrote:\n\n> **\n> ' - self.assertEqual(mail.extract_body(text), 'this is my reply!') + self.assertEqual(mail.extract_reply(text), 'this is my reply!') - def test_gmail_plain_text_response_stripped(text): + def test_gmail_plain_text_response_stripped(self): text = u'\n\nthis is my another reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, wrote:\n>\n> ' - self.assertEqual(mail.extract_body(text), 'this is my another reply!') + self.assertEqual(mail.extract_reply(text), 'this is my another reply!') def test_yahoo_mail_response_stripped(self): text = u'\n\nthis is my reply!\n\n\n\n________________________________\n From: "kp@kp-dev.askbot.com" \nTo: fadeev@rocketmail.com \nSent: Wednesday, October 31, 2012 2:41 AM\nSubject: "This is my test question"\n \n\n \n \n \n' - self.assertEqual(mail.extract_body(text), 'this is my reply!') - - def test_kmail_plain_text_response_stripped(text): - text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nSecond try, no HTML kmail does weird things.' - self.assertEqual(mail.extract_body(text), 'this is my reply!') - - -u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000' - -u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000' - -u'Reply from squirremail \n' - -""" + self.assertEqual(mail.extract_reply(text), 'this is my reply!') + + def test_kmail_plain_text_response_stripped(self): + text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nthis is my reply!' + self.assertEqual(mail.extract_reply(text), 'this is my reply!') + + def test_outlook_com_with_rtf_response_stripped(self): + text = u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000' + self.assertEqual( + mail.extract_reply(text), + 'outlook.com (new hotmail) with RTF on' + ) + self.assertEqual( + mail.extract_reply(text), + 'outlook.com (new hotmail) with RTF on' + ) + + def test_outlook_com_plain_text_response_stripped(self): + text = u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000' + self.assertEqual( + mail.extract_reply(text), + u'reply from hotmail without RTF' + ) diff --git a/askbot/tests/reply_by_email_tests.py b/askbot/tests/reply_by_email_tests.py index 30cb48be..5353586c 100644 --- a/askbot/tests/reply_by_email_tests.py +++ b/askbot/tests/reply_by_email_tests.py @@ -84,8 +84,8 @@ class ReplyAddressModelTests(AskbotTestCase): 'instruction': 'reply above this line' } msg = MockMessage( - "This is a test reply \n\nOn such and such someone" - "wrote something \n\n%s\nlorem ipsum " % (reply_separator), + "This is a test reply \n\nOn such and such someone " + "wrote: \n\n%s\nlorem ipsum " % (reply_separator), "user1@domain.com" ) msg['Subject'] = 'test subject' -- cgit v1.2.3-1-g7c22