1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
"""A module for parsing email response text.

This file is a candidate for publishing as an independent module.
"""
import re
import sys
from askbot.conf import settings as askbot_settings
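# Regexes for email-client quote separators.
# To support another client, add a pattern to the QUOTE_REGEXES tuple
# and label it with a comment ending in _QUOTE_RE.
# The patterns do not account for any trailing:
# * newline chars,
# * lines starting with | or >,
# * lines consisting entirely of empty space;
# extract_reply_contents() strips those before the patterns are applied.
# The expressions deliberately leave out month and day names
# to keep them simpler and to make adding language variants
# easier.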
# Uncompiled quote-separator patterns, one per known email client layout.
# The label above each entry names the client the pattern was written for.
QUOTE_REGEXES = (
    # GMAIL_QUOTE_RE: final line of the form "On ... wrote:"
    r'\nOn [^\n]* wrote:\Z',
    # GMAIL_SECOND_QUOTE_RE: final line starting with a yyyy/m/d date
    r'\n\d{4}/\d{1,2}/\d{1,2} [^\n]*\Z',
    # BLACKBERRY: underscore rule, then From/Sent/To/Subject header lines
    r'_+\nFrom:.*?\nSent:.*?\nTo:.*?\nSubject:.*?\Z',
    # OUTLOOK1: dashed banner line, then From/Sent/To/Subject header lines
    r'\n-+[\w -]+\nFrom:.*?\nSent:.*?\nTo:.*?\nSubject:.*?\Z',
    # unknown client: dashed banner line, then From/Date/To/Subject
    r'\n-+[\w -]+\nFrom:.*?\nDate:.*?\nTo:.*?\nSubject:.*?\Z',
    # YAHOO_QUOTE_RE: underscore rule, then From/To/Sent/Subject
    r'\n_+\n\s*From: [^\n]+\nTo: [^\n]+\nSent: [^\n]+\nSubject: [^\n]+\Z',
    # KMAIL_QUOTE_RE: "On ... you wrote:" at the very start of the text
    r'\AOn [^\n]+ you wrote:\s*\n\n',
    # OUTLOOK_RTF_QUOTE_RE: Subject/From/To/Date header lines
    r'\nSubject: [^\n]+\nFrom: [^\n]+\nTo: [^\n]+\nDate: [^\n]+\Z',
    # OUTLOOK_TEXT_QUOTE_RE: a bare underscore rule as the final line
    r'\n_+\Z',
)
# extra samples
"""
-----Original Message-----^M
From: forum@example.com [mailto:forum@example.com] ^M
Sent: Wednesday, August 07, 2013 11:00 AM^M
To: Jane Doe^M
Subject: "One more test question from email."^M
"""
def compile_quote_regexes():
    """Compile every pattern listed in ``QUOTE_REGEXES``.

    Returns a list of compiled regex objects in the same order as
    ``QUOTE_REGEXES``; each one is compiled with the ``re.MULTILINE``
    and ``re.IGNORECASE`` flags.
    """
    flags = re.MULTILINE | re.IGNORECASE
    return [re.compile(pattern, flags) for pattern in QUOTE_REGEXES]
# Compiled once at import time; strip_email_client_quote_separator()
# tries these patterns in this order and uses the first one that matches.
CLIENT_SPECIFIC_QUOTE_REGEXES = compile_quote_regexes()
def strip_trailing_empties_and_quotes(text):
    """Return ``text`` without its trailing blank space and quote lines.

    Removes, from the end of the text, any run made of whitespace
    characters (newlines and non-breaking spaces included) and of lines
    that start with ``|`` or ``>``.
    """
    trailing_junk = re.compile(r'(([\n\s\xa0])|(\n[\|>][^\n]*))*\Z')
    return trailing_junk.sub('', text)
def strip_leading_empties(text):
    """Return ``text`` with leading whitespace removed.

    Newlines, ordinary whitespace and non-breaking spaces at the very
    start of the text are dropped; everything after them is untouched.
    """
    # The pattern can match the empty string, so the match is never None.
    leading_blank = re.match(r'\A[\n\s\xa0]*', text)
    return text[leading_blank.end():]
def strip_trailing_sender_references(text, email_address):
    """Remove the final line of ``text`` if it mentions a known address.

    The final line is dropped when it contains either ``email_address``
    or the server address ``ask@<REPLY_BY_EMAIL_HOSTNAME>`` (hostname
    read from ``askbot_settings``). Matching is case-insensitive.

    ``text`` is the reply text, ``email_address`` is the address to look
    for. Returns the text, with the matching final line removed if any.
    """
    server_email = 'ask@' + askbot_settings.REPLY_BY_EMAIL_HOSTNAME
    # The addresses are literal text: escape them so that characters such
    # as "." or "+" are not interpreted as regex syntax.
    email_pattern = '(%s|%s)' % (
        re.escape(email_address),
        re.escape(server_email),
    )
    pattern = r'\n[^\n]*%s[^\n]*$' % email_pattern
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub() is ``count``, so a positional re.IGNORECASE silently
    # becomes count=2 and the match stays case-sensitive.
    return re.sub(pattern, '', text, flags=re.IGNORECASE)
def strip_email_client_quote_separator(text):
    """Strip the email client's quote separator from a response,
    e.g. a line like "On such date XYZ wrote:".

    The patterns in ``CLIENT_SPECIFIC_QUOTE_REGEXES`` are tried in order;
    as soon as one matches, the text with that pattern's matches removed
    is returned.

    If no pattern matches, the text is logged to ``sys.stderr`` and the
    last three lines are dropped as a guess at where the separator is.
    """
    for regex in CLIENT_SPECIFIC_QUOTE_REGEXES:
        stripped, hits = regex.subn('', text)
        if hits:
            return stripped

    # did not find a quote separator - log it
    log_message = u'\nno matching quote separator: %s\n' % text
    if str is bytes:
        # Python 2: write encoded bytes, as the original code did.
        # On Python 3 sys.stderr is a text stream and rejects bytes.
        log_message = log_message.encode('utf-8')
    sys.stderr.write(log_message)

    text_lines = text.splitlines()
    # Strip 3 lines as a guess. Re-join with newlines so that the
    # remaining lines are not glued together into one.
    return '\n'.join(text_lines[:-3])
try:
    _STRING_TYPES = basestring  # noqa: F821 - Python 2 only
except NameError:
    # Python 3 has no ``basestring``
    _STRING_TYPES = str

# Type of compiled regex objects. Derived from a compiled pattern because
# ``re.Pattern`` is not available on older Python versions.
_REGEX_TYPE = type(re.compile(''))


def extract_reply_contents(text, reply_separator=None):
    """Extract the reply part of an email response.

    If ``reply_separator`` is given, only the part of the text above the
    first occurrence of the separator is kept. After that the trailing
    empty/quoted lines, the email-client-specific quote separator and
    the leading empty space are stripped.

    ``text`` is the input text
    ``reply_separator`` is either a string or a compiled regex object

    Raises ``ValueError`` if ``reply_separator`` is given but is neither
    a string nor a compiled regex.
    """
    if reply_separator:
        if isinstance(reply_separator, _STRING_TYPES):
            text = text.split(reply_separator)[0]
        elif isinstance(reply_separator, _REGEX_TYPE):
            text = reply_separator.split(text)[0]
        else:
            raise ValueError('reply_separator must be a string or a compiled regex')

    # Trailing blanks/quotes are stripped first because the client
    # separator patterns are anchored to the end of the text.
    text = strip_trailing_empties_and_quotes(text)
    text = strip_email_client_quote_separator(text)
    # Removing the separator can expose more trailing blank lines.
    text = strip_trailing_empties_and_quotes(text)
    return strip_leading_empties(text)
|