summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-06 23:02:23 -0300
committer Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-06 23:02:23 -0300
commit c6105f36c41b39012a2c9edae193de24b0a9a176 (patch)
tree b0fb670ca71ae0f2e46b35bb70d59a1059c5f094
parent fc26b08ad98dd427ab882f8ec332dfdfea2ca9b2 (diff)
download askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.gz
askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.tar.bz2
askbot-c6105f36c41b39012a2c9edae193de24b0a9a176.zip
code for stripping email client quote separators, and the corresponding test cases for some of the most common email clients
-rw-r--r-- askbot/mail/__init__.py 51
-rw-r--r-- askbot/mail/parsing.py 78
-rw-r--r-- askbot/tests/email_parsing_tests.py 48
-rw-r--r-- askbot/tests/reply_by_email_tests.py 4
4 files changed, 134 insertions, 47 deletions
diff --git a/askbot/mail/__init__.py b/askbot/mail/__init__.py
index c9c84f33..a33793b4 100644
--- a/askbot/mail/__init__.py
+++ b/askbot/mail/__init__.py
@@ -1,9 +1,18 @@
"""functions that send email in askbot
these automatically catch email-related exceptions
"""
+import logging
import os
+import re
import smtplib
-import logging
+from askbot import exceptions
+from askbot import const
+from askbot.conf import settings as askbot_settings
+from askbot.mail import parsing
+from askbot.utils import url_utils
+from askbot.utils.file_utils import store_file
+from askbot.utils.html import absolutize_urls
+from bs4 import BeautifulSoup
from django.core import mail
from django.conf import settings as django_settings
from django.core.exceptions import PermissionDenied
@@ -12,14 +21,7 @@ from django.utils.translation import ugettext_lazy as _
from django.utils.translation import string_concat
from django.template import Context
from django.utils.html import strip_tags
-from askbot import exceptions
-from askbot import const
-from askbot.conf import settings as askbot_settings
-from askbot.utils import url_utils
-from askbot.utils.file_utils import store_file
-from askbot.utils.html import absolutize_urls
-from bs4 import BeautifulSoup
#todo: maybe send_mail functions belong to models
#or the future API
def prefix_the_subject_line(subject):
@@ -82,7 +84,8 @@ def thread_headers(post, orig_post, update):
return headers
def clean_html_email(email_body):
- """needs more clenup might not work for other email templates
+ """returns the content part from an HTML email.
+ todo: needs more cleanup; might not work for other email templates
that do not use table layout
"""
soup = BeautifulSoup(email_body)
@@ -266,16 +269,12 @@ def bounce_email(
def extract_reply(text):
"""take the part above the separator
and discard the last line above the separator
+ ``text`` is the input text
"""
- if const.REPLY_SEPARATOR_REGEX.search(text):
- text = const.REPLY_SEPARATOR_REGEX.split(text)[0]
- text_lines = text.splitlines(False)
- #log last 10 lines of text - to capture email responses
- logging.debug('reply-border-separator|' + '|'.join(text_lines[-10:]))
- #here we need code stripping the "On ... somebody wrote:"
- return '\n'.join(text_lines[:-3])
- else:
- return text
+ return parsing.extract_reply_contents(
+ text,
+ const.REPLY_SEPARATOR_REGEX
+ )
def process_attachment(attachment):
"""will save a single
@@ -294,11 +293,11 @@ def process_attachment(attachment):
def extract_user_signature(text, reply_code):
"""extracts email signature as text trailing
the reply code"""
- striped_text = strip_tags(text)
- if reply_code in striped_text:
+ stripped_text = strip_tags(text)
+ if reply_code in stripped_text:
#extract the signature
tail = list()
- for line in reversed(striped_text.splitlines()):
+ for line in reversed(stripped_text.splitlines()):
#scan backwards from the end until the magic line
if reply_code in line:
break
@@ -314,10 +313,10 @@ def extract_user_signature(text, reply_code):
return ''
-def process_parts(parts, reply_code = None):
- """Process parts will upload the attachments and parse out the
- body, if body is multipart. Secondly - links to attachments
- will be added to the body of the question.
+def process_parts(parts, reply_code=None):
+ """Uploads the attachments and parses out the
+ body, if body is multipart.
+ Links to attachments will be added to the body of the question.
Returns ready to post body of the message and the list
of uploaded files.
"""
@@ -366,7 +365,7 @@ def process_emailed_question(
'subject': subject,
'body_text': body_text
}
- user = User.objects.get(email__iexact = email_address)
+ user = User.objects.get(email__iexact=from_address)
form = AskByEmailForm(data, user=user)
if form.is_valid():
email_address = form.cleaned_data['email']
diff --git a/askbot/mail/parsing.py b/askbot/mail/parsing.py
new file mode 100644
index 00000000..919c3f02
--- /dev/null
+++ b/askbot/mail/parsing.py
@@ -0,0 +1,78 @@
+"""a module for parsing email response text
+this file is a candidate for publishing as an independent module
+"""
+import re
+
+#Regexes for quote separators
+#add more via variables ending with _QUOTE_RE
+#These regexes do not need to match any trailing:
+#* newline chars,
+#* lines starting with | or >
+#* lines consisting entirely of whitespace
+#because those are stripped before the regexes are applied.
+#The expressions leave out month and day names to stay simple
+#and to make adding language variants easier.
+GMAIL_QUOTE_RE = r'\n\nOn [^\n]* wrote:\Z'
+YAHOO_QUOTE_RE = r'\n\n\n\n_+\n From: [^\n]+\nTo: [^\n]+\nSent: [^\n]+\nSubject: [^\n]+\Z'
+KMAIL_QUOTE_RE = r'\AOn [^\n]+ you wrote:\s*\n\n'
+OUTLOOK_RTF_QUOTE_RE = r'\n\nSubject: [^\n]+\nFrom: [^\n]+\nTo: [^\n]+\nDate: [^\n]+\Z'
+OUTLOOK_TEXT_QUOTE_RE = r'\n_+\Z'
+
+def compile_quote_regexes():
+ regex_names = filter(
+ lambda v: v.endswith('_QUOTE_RE'),
+ globals().keys()
+ )
+ compiled_regexes = list()
+ for regex_name in regex_names:
+ regex = globals()[regex_name]
+ compiled_regexes.append(
+ re.compile(
+ regex,
+ re.MULTILINE | re.IGNORECASE
+ )
+ )
+ return compiled_regexes
+
+CLIENT_SPECIFIC_QUOTE_REGEXES = compile_quote_regexes()
+
+def strip_trailing_empties_and_quotes(text):
+ #strip trailing empty lines and quote lines starting with | or >
+ return re.sub(r'(([\n\s\xa0])|(\n[\|>][^\n]*))*\Z', '', text)
+
+def strip_leading_empties(text):
+ return re.sub(r'\A[\n\s\xa0]*', '', text)
+
+def strip_email_client_formatting(text):
+ """strips email client formatting from the responses,
+ such as empty lines and quote separators (on ... wrote)
+
+ if one client-specific separator matches, it is removed
+ and the remaining separators are not tried
+ """
+ text = strip_trailing_empties_and_quotes(text)
+ for regex in CLIENT_SPECIFIC_QUOTE_REGEXES:
+ if regex.search(text):
+ text = regex.sub('', text)
+ break
+ text = strip_trailing_empties_and_quotes(text)
+ return strip_leading_empties(text)
+
+def extract_reply_contents(text, reply_separator=None):
+ """If reply_separator is given,
+ take the part above the separator.
+ Afterwards, strip the email-client-specific text
+
+ ``text`` is the input text
+ ``reply_separator`` is either a string or a regex object
+ """
+ if reply_separator:
+ if isinstance(reply_separator, basestring):
+ text = text.split(reply_separator)[0]
+ else:
+ testre = re.compile('test')
+ if type(testre) == type(reply_separator):
+ text = reply_separator.split(text)[0]
+ else:
+ raise ValueError('reply_separator must be a string or a compiled regex')
+ return strip_email_client_formatting(text)
diff --git a/askbot/tests/email_parsing_tests.py b/askbot/tests/email_parsing_tests.py
index c1cea215..c8f3a057 100644
--- a/askbot/tests/email_parsing_tests.py
+++ b/askbot/tests/email_parsing_tests.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from django.conf import settings as django_settings
from askbot.skins.loaders import get_template
from django.template import Context
@@ -5,7 +6,7 @@ from askbot import mail
from askbot import models
from askbot.tests import utils
-class EmailParseTests(utils.AskbotTestCase):
+class EmailParsingTests(utils.AskbotTestCase):
def setUp(self):
self.template_name = 'email/welcome_lamson_on.html'
@@ -22,29 +23,38 @@ class EmailParseTests(utils.AskbotTestCase):
print '=================================================='
print cleaned_body
print "CLEANED BODY"
- self.assertEquals(cleaned_body, self.expected_output)
+ self.assertEqual(cleaned_body, self.expected_output)
def test_gmail_rich_text_response_stripped(self):
text = u'\n\nthis is my reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, <kp@kp-dev.askbot.com> wrote:\n\n> **\n> '
- self.assertEqual(mail.extract_body(text), 'this is my reply!')
+ self.assertEqual(mail.extract_reply(text), 'this is my reply!')
- def test_gmail_plain_text_response_stripped(text):
+ def test_gmail_plain_text_response_stripped(self):
text = u'\n\nthis is my another reply!\n\nOn Wed, Oct 31, 2012 at 1:45 AM, <kp@kp-dev.askbot.com> wrote:\n>\n> '
- self.assertEqual(mail.extract_body(text), 'this is my another reply!')
+ self.assertEqual(mail.extract_reply(text), 'this is my another reply!')
def test_yahoo_mail_response_stripped(self):
text = u'\n\nthis is my reply!\n\n\n\n________________________________\n From: "kp@kp-dev.askbot.com" <kp@kp-dev.askbot.com>\nTo: fadeev@rocketmail.com \nSent: Wednesday, October 31, 2012 2:41 AM\nSubject: "This is my test question"\n \n\n \n \n \n'
- self.assertEqual(mail.extract_body(text), 'this is my reply!')
-
- def test_kmail_plain_text_response_stripped(text):
- text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nSecond try, no HTML kmail does weird things.'
- self.assertEqual(mail.extract_body(text), 'this is my reply!')
-
-
-u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000'
-
-u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000'
-
-u'Reply from squirremail \n'
-
-"""
+ self.assertEqual(mail.extract_reply(text), 'this is my reply!')
+
+ def test_kmail_plain_text_response_stripped(self):
+ text = u'On Monday 01 October 2012 21:22:44 you wrote: \n\nthis is my reply!'
+ self.assertEqual(mail.extract_reply(text), 'this is my reply!')
+
+ def test_outlook_com_with_rtf_response_stripped(self):
+ text = u'outlook.com (new hotmail) with RTF on \n\nSubject: "Posting a question by email." \nFrom: kp@kp-dev.askbot.com \nTo: aj_fitoria@hotmail.com \nDate: Thu, 1 Nov 2012 16:30:27 +0000'
+ self.assertEqual(
+ mail.extract_reply(text),
+ 'outlook.com (new hotmail) with RTF on'
+ )
+ self.assertEqual(
+ mail.extract_reply(text),
+ 'outlook.com (new hotmail) with RTF on'
+ )
+
+ def test_outlook_com_plain_text_response_stripped(self):
+ text = u'reply from hotmail without RTF \n________________________________ \n> Subject: "test with recovered signature" \n> From: kp@kp-dev.askbot.com \n> To: aj_fitoria@hotmail.com \n> Date: Thu, 1 Nov 2012 16:44:35 +0000'
+ self.assertEqual(
+ mail.extract_reply(text),
+ u'reply from hotmail without RTF'
+ )
diff --git a/askbot/tests/reply_by_email_tests.py b/askbot/tests/reply_by_email_tests.py
index 30cb48be..5353586c 100644
--- a/askbot/tests/reply_by_email_tests.py
+++ b/askbot/tests/reply_by_email_tests.py
@@ -84,8 +84,8 @@ class ReplyAddressModelTests(AskbotTestCase):
'instruction': 'reply above this line'
}
msg = MockMessage(
- "This is a test reply \n\nOn such and such someone"
- "wrote something \n\n%s\nlorem ipsum " % (reply_separator),
+ "This is a test reply \n\nOn such and such someone "
+ "wrote: \n\n%s\nlorem ipsum " % (reply_separator),
"user1@domain.com"
)
msg['Subject'] = 'test subject'