"""Search state manager object"""
import re
import copy

from django.core import urlresolvers
from django.utils.http import urlquote, urlencode

import askbot
import askbot.conf
from askbot import const
from askbot.conf import settings as askbot_settings
from askbot.utils.functions import strip_plus


def extract_matching_token(text, regexes):
    """if text matches any of the regexes,
    * the entire match is removed from text
    * repeating spaces in the remaining string are replaced with one
    * returned is a tuple of: first group from the regex, remaining text
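
    Illustrative doctest (hypothetical input; assumes ``strip_plus``
    collapses repeated whitespace as described above):

    >>> regexes = (re.compile(r'title:"([^"]+?)"'),)
    >>> extract_matching_token('one title:"hello world" two', regexes)
    ('hello world', 'one two')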
    """
    for regex in regexes:
        m = regex.search(text)
        if m:
            text = regex.sub('', text)
            extracted_match = m.group(1)
            return (strip_plus(extracted_match), strip_plus(text))
    return ('', text.strip())

def extract_all_matching_tokens(text, regexes):
    """the same as the ``extract_matching_token``
    but returns a tuple of: list of first group matches from the regexes
    and the remains of the input text
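
    Illustrative doctest (hypothetical input; matches are collected into
    a set, so their order is undefined, hence the ``sorted``):

    >>> tokens, rest = extract_all_matching_tokens(
    ...     'see #python and #django docs', (re.compile(r'#(\S+)'),)
    ... )
    >>> sorted(tokens), rest
    (['django', 'python'], 'see and docs')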
    """
    matching_tokens = set()
    for regex in regexes:
        matches = regex.findall(text)
        if len(matches) > 0:
            text = regex.sub('', text)
            matching_tokens.update([match.strip() for match in matches])
    return ([strip_plus(token) for token in matching_tokens], strip_plus(text))


def parse_query(query):
    """takes hand-typed search query string as an argument
    returns a dictionary with keys (and values in parens):
    * stripped_query (query with the items below stripped)
    * query_tags (list of tag names)
    * query_users (list of user names, not validated)
    * query_title (question title)
    Note: the stripped_query is the actual string
    against which global search will be performed
    the original query will still all be shown in the search
    query input box
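
    Illustrative doctest (hypothetical query):

    >>> bits = parse_query('title:"how to" [tag: python] @alice sorting')
    >>> bits['query_title'], bits['query_tags'], bits['query_users']
    ('how to', ['python'], ['alice'])
    >>> bits['stripped_query']
    'sorting'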
    """
    title_re1 = re.compile(r'\[title:(.+?)\]')
    title_re2 = re.compile(r'title:"([^"]+?)"')
    title_re3 = re.compile(r"title:'([^']+?)'")
    title_regexes = (title_re1, title_re2, title_re3)
    (query_title, query) = extract_matching_token(query, title_regexes)

    tag_re1 = re.compile(r'\[([^:]+?)\]')
    tag_re2 = re.compile(r'\[tag:\s*([\S]+)\s*]')
    tag_re3 = re.compile(r'#(\S+)')
    tag_regexes = (tag_re1, tag_re2, tag_re3)
    (query_tags, query) = extract_all_matching_tokens(query, tag_regexes)

    user_re1 = re.compile(r'\[user:([^\]]+?)\]')
    user_re2 = re.compile(r'user:"([^"]+?)"')
    user_re3 = re.compile(r"user:'([^']+?)'")
    user_re4 = re.compile(r"""@([^'"\s]+)""")
    user_re5 = re.compile(r'@"([^"]+)"')
    user_re6 = re.compile(r"@'([^']+)'")
    user_regexes = (user_re1, user_re2, user_re3, user_re4, user_re5, user_re6)
    (query_users, stripped_query) = extract_all_matching_tokens(query, user_regexes)

    return {
        'stripped_query': stripped_query,
        'query_title': query_title,
        'query_tags': query_tags,
        'query_users': query_users
    }

class SearchState(object):
    """Holds the full state of a question search: scope, sort order,
    free-text query, tags, author and page. The add_*/remove_*/change_*
    methods return modified deep copies instead of mutating in place."""

    @classmethod
    def get_empty(cls):
        return cls(scope=None, sort=None, query=None, tags=None, author=None, page=None, user_logged_in=None)

    def __init__(self, scope, sort, query, tags, author, page, user_logged_in):
        # INFO: zip(*[('a', 1), ('b', 2)])[0] == ('a', 'b')

        if (scope not in zip(*const.POST_SCOPE_LIST)[0]) or (scope == 'favorite' and not user_logged_in):
            self.scope = const.DEFAULT_POST_SCOPE
        else:
            self.scope = scope

        self.query = query.strip() if query else None

        if self.query:
            #pull out values of [title:xxx], [user:some one],
            #[tag: sometag], title:'xxx', title:"xxx", @user, @'some user',
            #and #tag (hash symbol delineates the tag)
            query_bits = parse_query(self.query)
            self.stripped_query = query_bits['stripped_query']
            self.query_tags = query_bits['query_tags']
            self.query_users = query_bits['query_users']
            self.query_title = query_bits['query_title']
        else:
            self.stripped_query = None
            self.query_tags = None
            self.query_users = None
            self.query_title = None

        if sort not in zip(*const.POST_SORT_METHODS)[0] \
            or (sort == 'relevance-desc'
                and (not self.query or not askbot.conf.should_show_sort_by_relevance())):
            self.sort = const.DEFAULT_POST_SORT_METHOD
        else:
            self.sort = sort

        self.tags = [t.strip() for t in tags.split(const.TAG_SEP)] if tags else []
        self.author = int(author) if author else None
        self.page = int(page) if page else 1
        if self.page < 1:  # also catches negative pages, in case someone likes jokes :)
            self.page = 1

    def __str__(self):
        return self.query_string()

    def full_url(self):
        return urlresolvers.reverse('questions') + self.query_string()

    def ask_query_string(self): # TODO: test me
        """returns string to prepopulate title field on the "Ask your question" page"""
        ask_title = self.stripped_query or self.query or ''
        if not ask_title:
            return ''
        return '?' + urlencode({'title': ask_title})

    def full_ask_url(self):
        return urlresolvers.reverse('ask') + self.ask_query_string()

    def unified_tags(self):
        "Returns tags both from tag selector and extracted from query"
        return (self.query_tags or []) + (self.tags or [])

    #
    # Safe characters in urlquote() according to http://www.ietf.org/rfc/rfc1738.txt:
    #
    #    Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
    #    reserved characters used for their reserved purposes may be used
    #    unencoded within a URL.
    #
    # Tag separator (const.TAG_SEP) remains unencoded to clearly mark tag boundaries
    # _+.- stay unencoded to keep tags in the URL as readable as possible
    #      (note that urllib.quote() in Python 2.7 treats _.- as safe chars, but let's be explicit)
    # Hash (#) is not safe and has to be encoded, as it's used as the URL hash (fragment) delimiter
    #
    SAFE_CHARS = const.TAG_SEP + '_+.-'

    def query_string(self):
        """Serializes the state into the 'scope:.../sort:.../' path suffix
        used by question-list URLs"""
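        # Illustrative output (hypothetical state values; assumes
        # const.TAG_SEP is ','):
        #   'scope:all/sort:activity-desc/query:foo/tags:a,b/page:2/'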
        lst = [
            'scope:%s' % self.scope,
            'sort:%s' % self.sort
        ]
        if self.query:
            lst.append('query:%s' % urlquote(self.query, safe=self.SAFE_CHARS))
        if self.tags:
            lst.append('tags:%s' % urlquote(const.TAG_SEP.join(self.tags), safe=self.SAFE_CHARS))
        if self.author:
            lst.append('author:%d' % self.author)
        if self.page:
            lst.append('page:%d' % self.page)
        return '/'.join(lst) + '/'

    def deepcopy(self):
        "Used to contruct a new SearchState for manipulation, e.g. for adding/removing tags"
        return copy.deepcopy(self)
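
    # Illustrative usage of the copy-on-write methods below (hypothetical
    # values; assumes 'unanswered' is a valid scope in this deployment):
    #
    #   new_state = search_state.add_tag('python').change_scope('unanswered')
    #
    # Each call returns a fresh deep copy, resetting the page to 1 where
    # the change would invalidate the current pagination.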

    def add_tag(self, tag):
        ss = self.deepcopy()
        ss.tags.append(tag)
        ss.page = 1 # state change causes page reset
        return ss

    def remove_author(self):
        ss = self.deepcopy()
        ss.author = None
        ss.page = 1
        return ss

    def remove_tags(self):
        ss = self.deepcopy()
        ss.tags = []
        ss.page = 1
        return ss

    def change_scope(self, new_scope):
        ss = self.deepcopy()
        ss.scope = new_scope
        ss.page = 1
        return ss

    def change_sort(self, new_sort):
        ss = self.deepcopy()
        ss.sort = new_sort
        ss.page = 1
        return ss

    def change_page(self, new_page):
        ss = self.deepcopy()
        ss.page = new_page
        return ss