summaryrefslogtreecommitdiffstats
path: root/askbot/search/state_manager.py
blob: 5c4e0776e1ccf06401c68793384ca68caf9876f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""Search state manager object"""
import re
import urllib
import copy

from django.core import urlresolvers
from django.utils.http import urlencode
from django.utils.encoding import smart_str

import askbot
import askbot.conf
from askbot.conf import settings as askbot_settings
from askbot import const
from askbot.utils.functions import strip_plus


def extract_matching_token(text, regexes):
    """Try each regex against ``text`` in order; stop at the first hit.

    On a match, the entire matched span is removed from ``text`` and a
    tuple of (first regex group, remaining text) is returned, both
    cleaned up with ``strip_plus``. When nothing matches, the result is
    ('', stripped original text).
    """
    for pattern in regexes:
        found = pattern.search(text)
        if found is None:
            continue
        remainder = pattern.sub('', text)
        return (strip_plus(found.group(1)), strip_plus(remainder))
    return ('', text.strip())

def extract_all_matching_tokens(text, regexes):
    """Like ``extract_matching_token``, but exhaustive.

    Every regex is applied; all first-group matches are collected
    (deduplicated via a set) and their spans removed from ``text``.
    Returns a tuple of (list of matched tokens, leftover text), each
    item cleaned up with ``strip_plus``.
    """
    tokens = set()
    for pattern in regexes:
        found = pattern.findall(text)
        if found:
            text = pattern.sub('', text)
            tokens.update(item.strip() for item in found)
    return ([strip_plus(token) for token in tokens], strip_plus(text))


def parse_query(query):
    """takes hand-typed search query string as an argument
    returns a dictionary with keys (and values in parens):
    * stripped_query (query with the items below stripped)
    * query_tags (list of tag names)
    * query_users (list of user names, not validated)
    * query_title (question title)
    Note: the stripped_query is the actual string
    against which global search will be performed
    the original query will still all be shown in the search
    query input box
    """
    # title may be spelled [title:...], title:"..." or title:'...';
    # only the first matching form is used
    title_regexes = (
        re.compile(r'\[title:(.+?)\]'),
        re.compile(r'title:"([^"]+?)"'),
        re.compile(r"title:'([^']+?)'"),
    )
    query_title, query = extract_matching_token(query, title_regexes)

    # tags: [sometag], [tag: sometag] or #sometag - all occurrences count
    tag_regexes = (
        re.compile(r'\[([^:]+?)\]'),
        re.compile(r'\[tag:\s*([\S]+)\s*]'),
        re.compile(r'#(\S+)'),
    )
    query_tags, query = extract_all_matching_tokens(query, tag_regexes)

    # users: [user:...], user:"..."/user:'...' or @name / @"name" / @'name'
    user_regexes = (
        re.compile(r'\[user:([^\]]+?)\]'),
        re.compile(r'user:"([^"]+?)"'),
        re.compile(r"user:'([^']+?)'"),
        re.compile(r"""@([^'"\s]+)"""),
        re.compile(r'@"([^"]+)"'),
        re.compile(r"@'([^']+)'"),
    )
    query_users, stripped_query = extract_all_matching_tokens(query, user_regexes)

    return {
        'stripped_query': stripped_query,
        'query_title': query_title,
        'query_tags': query_tags,
        'query_users': query_users
    }

class SearchState(object):
    """Value object describing the state of the main question-list
    search: scope, sort method, full-text query (with in-query
    tag/user/title tokens parsed out), selected tags, author id and
    page number.

    Knows how to serialize itself back into the ``questions`` URL
    (``query_string()`` / ``full_url()``) and how to produce modified
    copies of itself (``add_tag()``, ``change_scope()``, etc.), which
    leave the original instance untouched.
    """

    @classmethod
    def get_empty(cls):
        # All-None input makes __init__ fall back on every default:
        # default scope and sort, no query/tags/author, page 1.
        return cls(scope=None, sort=None, query=None, tags=None, author=None, page=None, user_logged_in=None)

    def __init__(self, scope, sort, query, tags, author, page, user_logged_in):
        """Normalize and validate raw (usually URL-derived) search
        parameters; every argument may be None.

        * ``scope`` - must be a key from const.POST_SCOPE_LIST; invalid
          values (or 'followed' for an anonymous user) fall back to a
          default that depends on ``user_logged_in``
        * ``sort`` - must be a key from const.POST_SORT_METHODS;
          invalid values, and 'relevance-desc' when it cannot be used,
          fall back to const.DEFAULT_POST_SORT_METHOD
        * ``query`` - raw hand-typed search string, parsed with
          ``parse_query()``
        * ``tags`` - const.TAG_SEP-separated tag names (deduplicated)
        * ``author`` - user id (int or numeric string), or None
        * ``page`` - 1-based page number; falsy values become 1
        """
        # INFO: zip(*[('a', 1), ('b', 2)])[0] == ('a', 'b')
        # (Python 2 idiom: zip() returns a list, so the [0] picks out
        # the tuple of allowed key values from the choices list)

        if (scope not in zip(*const.POST_SCOPE_LIST)[0]) or (scope == 'followed' and not user_logged_in):
            if user_logged_in:
                self.scope = askbot_settings.DEFAULT_SCOPE_AUTHENTICATED
            else:
                self.scope = askbot_settings.DEFAULT_SCOPE_ANONYMOUS
        else:
            self.scope = scope

        self.query = query.strip() if query else None

        if self.query:
            #pull out values of [title:xxx], [user:some one]
            #[tag: sometag], title:'xxx', title:"xxx", @user, @'some user',
            #and  #tag - (hash symbol to delineate the tag
            query_bits = parse_query(self.query)
            self.stripped_query = query_bits['stripped_query']
            self.query_tags = query_bits['query_tags']
            self.query_users = query_bits['query_users']
            self.query_title = query_bits['query_title']
        else:
            self.stripped_query = None
            self.query_tags = None
            self.query_users = None
            self.query_title = None

        # relevance sorting is only meaningful when there is a query and
        # the backend supports it (askbot.conf check)
        if (sort not in zip(*const.POST_SORT_METHODS)[0]) or (sort == 'relevance-desc' and (not self.query or not askbot.conf.should_show_sort_by_relevance())):
            self.sort = const.DEFAULT_POST_SORT_METHOD
        else:
            self.sort = sort

        #patch for empty stripped query, relevance sorting is useless then
        if self.stripped_query in (None, '') and sort == 'relevance-desc':
            self.sort = const.DEFAULT_POST_SORT_METHOD

        # deduplicate tags while preserving their original order
        self.tags = []
        if tags:
            for t in tags.split(const.TAG_SEP):
                tag = t.strip()
                if tag not in self.tags:
                    self.tags.append(tag)

        self.author = int(author) if author else None
        self.page = int(page) if page else 1
        if self.page == 0:  # in case someone likes jokes :)
            self.page = 1

        # cached base URL of the main question list page
        self._questions_url = urlresolvers.reverse('questions')

    def __str__(self):
        """String form is the URL tail produced by ``query_string()``."""
        return self.query_string()

    def full_url(self):
        """Return the absolute path of the question list for this state."""
        return self._questions_url + self.query_string()

    def ask_query_string(self): # TODO: test me
        """returns string to prepopulate title field on the "Ask your question" page"""
        ask_title = self.stripped_query or self.query or ''
        if not ask_title:
            return ''
        return '?' + urlencode({'title': ask_title})

    def full_ask_url(self):
        """Return the 'ask' page URL, title pre-filled from the query."""
        return urlresolvers.reverse('ask') + self.ask_query_string()

    def unified_tags(self):
        "Returns tags both from tag selector and extracted from query"
        return (self.query_tags or []) + (self.tags or [])

    #
    # Safe characters in urlquote() according to http://www.ietf.org/rfc/rfc1738.txt:
    #
    #    Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
    #    reserved characters used for their reserved purposes may be used
    #    unencoded within a URL.
    #
    # Tag separator (const.TAG_SEP) remains unencoded to clearly mark tag boundaries
    # _+.- stay unencoded to keep tags in URL as verbose as possible
    #      (note that urllib.quote() in Python 2.7 treats _.- as safe chars, but let's be explicit)
    # Hash (#) is not safe and has to be encoded, as it's used as the URL hash delimiter
    #
    SAFE_CHARS = const.TAG_SEP + '_+.-'

    def query_string(self):
        """returns part of the url to the main page,
        responsible to display the full text search results,
        taking into account sort method, selected scope
        and search tags"""

        lst = [
            'scope:' + self.scope,
            'sort:' + self.sort
        ]

        """
            ATTN: a copy from urls.py
            r'(%s)?' % r'/scope:(?P<scope>\w+)' +
            r'(%s)?' % r'/sort:(?P<sort>[\w\-]+)' +
            r'(%s)?' % r'/tags:(?P<tags>[\w+.#,-]+)' + # Should match: const.TAG_CHARS + ','; TODO: Is `#` char decoded by the time URLs are processed ??
            r'(%s)?' % r'/author:(?P<author>\d+)' +
            r'(%s)?' % r'/page:(?P<page>\d+)' +
            r'(%s)?' % r'/query:(?P<query>.+)' +  # INFO: query is last, b/c it can contain slash!!!
        """

        #order of items is important!!! (must mirror the URL pattern above)
        if self.tags:
            lst.append('tags:' + urllib.quote(smart_str(const.TAG_SEP.join(self.tags)), safe=self.SAFE_CHARS))
        if self.author:
            lst.append('author:' + str(self.author))
        if self.page:
            lst.append('page:' + str(self.page))
        if self.query:
            lst.append('query:' + urllib.quote(smart_str(self.query), safe=self.SAFE_CHARS))
        return '/'.join(lst) + '/'

    def deepcopy(self): # TODO: test me
        "Used to construct a new SearchState for manipulation, e.g. for adding/removing tags"
        # shallow copy first, then manually clone the mutable list
        # attributes so that mutating the copy never touches self
        ss = copy.copy(self) #SearchState.get_empty()

        #ss.scope = self.scope
        #ss.sort = self.sort
        #ss.query = self.query
        if ss.tags is not None: # it's important to test against None, because empty lists should also be cloned!
            ss.tags = ss.tags[:]  # create a copy
        #ss.author = self.author
        #ss.page = self.page

        #ss.stripped_query = self.stripped_query
        if ss.query_tags: # Here we don't have empty lists, only None
            ss.query_tags = ss.query_tags[:]
        if ss.query_users:
            ss.query_users = ss.query_users[:]
        #ss.query_title = self.query_title

        #ss._questions_url = self._questions_url

        return ss

    def add_tag(self, tag):
        """Return a copy of this state with ``tag`` appended (no-op copy
        if the tag is already selected)."""
        ss = self.deepcopy()
        if tag not in ss.tags:
            ss.tags.append(tag)
            ss.page = 1 # state change causes page reset
        return ss

    def remove_author(self):
        """Return a copy with the author filter cleared, page reset."""
        ss = self.deepcopy()
        ss.author = None
        ss.page = 1
        return ss

    def remove_tags(self, tags = None):
        """Return a copy with the given ``tags`` removed (all tags when
        None), page reset.  Note: set subtraction drops tag order."""
        ss = self.deepcopy()
        if tags:
            ss.tags = list(
                set(ss.tags) - set(tags)
            )
        else:
            ss.tags = []
        ss.page = 1
        return ss

    def change_scope(self, new_scope):
        """Return a copy with ``new_scope`` set, page reset."""
        ss = self.deepcopy()
        ss.scope = new_scope
        ss.page = 1
        return ss

    def change_sort(self, new_sort):
        """Return a copy with ``new_sort`` set, page reset."""
        ss = self.deepcopy()
        ss.sort = new_sort
        ss.page = 1
        return ss

    def change_page(self, new_page):
        """Return a copy positioned at ``new_page``."""
        ss = self.deepcopy()
        ss.page = new_page
        return ss

class DummySearchState(object):
    """Lightweight stand-in for ``SearchState`` used when caching
    question/thread summaries.

    It only remembers the most recently added tag and renders a
    placeholder URL around it; scope changes are ignored.  Unlike the
    real search state, the mutator methods return ``self`` rather than
    a copy.
    """

    def add_tag(self, tag):
        """Remember ``tag`` and return self for chaining."""
        self.tag = tag
        return self

    def change_scope(self, new_scope):
        """No-op: scope is irrelevant to the dummy state."""
        return self

    def full_url(self):
        """Return a placeholder URL built around the last added tag."""
        return '<<<%s>>>' % self.tag