From 336c7e3ae202efad4fa10e242f66f0f35b6c1a07 Mon Sep 17 00:00:00 2001
From: Evgeny Fadeev
Date: Tue, 28 May 2013 02:23:46 -0400
Subject: added management command apply_hinted_tags

---
Review notes: this patch was reflowed from a whitespace-collapsed copy.
The blob ids on the "index" lines predate the review fixes and the
whitespace of the context lines was reconstructed - verify before applying.

 askbot/doc/source/changelog.rst                 |  1 +
 askbot/doc/source/management-commands.rst       |  5 ++
 askbot/management/commands/apply_hinted_tags.py | 69 +++++++++++++++++++++++++
 askbot/models/question.py                       | 66 +++++++++++++++++++++++
 4 files changed, 141 insertions(+)
 create mode 100644 askbot/management/commands/apply_hinted_tags.py

diff --git a/askbot/doc/source/changelog.rst b/askbot/doc/source/changelog.rst
index 85a7015e..48afb5ea 100644
--- a/askbot/doc/source/changelog.rst
+++ b/askbot/doc/source/changelog.rst
@@ -3,6 +3,7 @@ Changes in Askbot
 
 Development version
 -------------------
+* Added management command `apply_hinted_tags` to batch-apply tags from a list
 * Added hovercard on the user's karma display in the header
 * Added option to hide ad blocks from logged in users
 * Applied Askbot templates to the settings control panel
diff --git a/askbot/doc/source/management-commands.rst b/askbot/doc/source/management-commands.rst
index cc5e952f..da93dcb5 100644
--- a/askbot/doc/source/management-commands.rst
+++ b/askbot/doc/source/management-commands.rst
@@ -25,6 +25,11 @@ The bulk of the management commands fall into this group and will probably be th
 | `add_admin <user_id>`           | Turn user into an administrator                             |
 |                                 | `<user_id>` is a numeric user id of the account             |
 +---------------------------------+-------------------------------------------------------------+
+| `apply_hinted_tags              | Apply tags to all questions in batch given the list of tags |
+| --tags-file <file_name>`        | provided with a file. The file must contain tags -          |
+|                                 | one per line. If many tags match - only the most frequent   |
+|                                 | will be selected.                                           |
++---------------------------------+-------------------------------------------------------------+
 | `remove_admin <user_id>`        | Remove admin status from a user account - the opposite of   |
 |                                 | the `add_admin` command                                     |
 +---------------------------------+-------------------------------------------------------------+
diff --git a/askbot/management/commands/apply_hinted_tags.py b/askbot/management/commands/apply_hinted_tags.py
new file mode 100644
index 00000000..94bf2383
--- /dev/null
+++ b/askbot/management/commands/apply_hinted_tags.py
@@ -0,0 +1,69 @@
+import datetime
+from django.core.management.base import BaseCommand
+from django.core.management.base import CommandError
+from optparse import make_option
+from askbot.utils.console import ProgressBar
+from askbot.models import Thread
+from askbot.models import User
+
+class Command(BaseCommand):
+    help = """Adds tags to questions. Tags should be given via a file
+    with one tag per line. The tags will be matched with the words
+    found in the question title and body. Then, most frequently used
+    matching tags will be applied. This command respects the maximum
+    number of tags allowed per question.
+    """
+    option_list = BaseCommand.option_list + (
+        make_option('--tags-file', '-t',
+            action = 'store',
+            type = 'str',
+            dest = 'tags_file',
+            default = None,
+            help = 'file containing tag names, one per line'
+        ),
+    )
+
+    def handle(self, *args, **kwargs):
+        """reads the tags file, parses it,
+        then applies tags to questions by matching them
+        with the question titles and content
+
+        raises ``CommandError`` if the --tags-file parameter is
+        missing, the file cannot be opened, some tag contains
+        whitespace or there are no users in the database
+        """
+        if kwargs['tags_file'] is None:
+            raise CommandError('parameter --tags-file is required')
+        try:
+            #"with" closes the file even when reading fails
+            with open(kwargs['tags_file']) as tags_file:
+                tags_input = tags_file.read()
+        except IOError:
+            raise CommandError('file "%s" not found' % kwargs['tags_file'])
+
+        #one tag per line; blank lines (e.g. a trailing one) are not tags
+        tags_list = [line.strip() for line in tags_input.splitlines()]
+        tags_list = [tag for tag in tags_list if tag]
+
+        #tag names are later joined with spaces into a single string,
+        #so whitespace inside of a tag is not allowed
+        multiword_tags = [tag for tag in tags_list if len(tag.split()) > 1]
+        if multiword_tags:
+            message = 'multiword tags not allowed, have: %s' % ', '.join(multiword_tags)
+            raise CommandError(message)
+
+        threads = Thread.objects.all()
+        count = threads.count()
+        message = 'Applying tags to questions'
+
+        #the retagging is attributed to the user with the highest id
+        try:
+            user = User.objects.all().order_by('-id')[0]
+        except IndexError:
+            raise CommandError('at least one user is required to apply the tags')
+        now = datetime.datetime.now()
+
+        for thread in ProgressBar(threads.iterator(), count, message):
+            thread.apply_hinted_tags(
+                tags_list, user=user, timestamp=now, silent=True
+            )
diff --git a/askbot/models/question.py b/askbot/models/question.py
index 3dd9fc6b..70060eb2 100644
--- a/askbot/models/question.py
+++ b/askbot/models/question.py
@@ -602,6 +602,72 @@ class Thread(models.Model):
         self._question_cache = Post.objects.get(post_type='question', thread=self)
         return self._question_cache
 
+    def apply_hinted_tags(self, hints=None, user=None, timestamp=None, silent=False):
+        """match words in title and body with hints
+        and apply some of the hints as tags,
+        so that total number of tags is no more
+        than the maximum allowed number of tags
+
+        ``hints`` - iterable of candidate tag names, matched
+        case-insensitively with the whitespace-separated words
+        of the title and the body; the most frequent matches win
+
+        ``user``, ``timestamp`` and ``silent`` are passed on to
+        ``retag()``, the timestamp defaults to the current time
+
+        does nothing if there are no hints, no room for more tags,
+        or if no hint matches
+        """
+        if not hints:
+            return
+
+        #1) see how many tags we're missing,
+        #if we don't need more we return
+        existing_tags = self.get_tag_names()
+        need_tags = askbot_settings.MAX_TAGS_PER_POST - len(existing_tags)
+        if need_tags <= 0:
+            return
+
+        #2) get words from title and body, repeats are kept
+        #on purpose - they are counted in step 4
+        post_text = self.title + ' ' + self._question_post().text
+        post_words = post_text.lower().split()
+
+        #3) normalize hints and tags and remember the original hints
+        orig_hints = dict()
+        for hint in hints:
+            orig_hints[hint.lower()] = hint
+        norm_tags = set(tag.lower() for tag in existing_tags)
+
+        #4) count occurrences in the text of the hints
+        #that are not yet used as tags
+        counts = dict()
+        for word in post_words:
+            if word in orig_hints and word not in norm_tags:
+                counts[word] = counts.get(word, 0) + 1
+
+        if not counts:
+            return#no new tags to add
+
+        #5) sort words by count - the most frequent go first,
+        #equal counts are ordered by name to keep result repeatable
+        sorted_words = sorted(counts, key=lambda word: (-counts[word], word))
+
+        #6) extract correct number of most frequently used tags
+        add_tags = [orig_hints[word] for word in sorted_words[:need_tags]]
+
+        tagnames = ' '.join(existing_tags + add_tags)
+
+        if askbot_settings.FORCE_LOWERCASE_TAGS:
+            tagnames = tagnames.lower()
+
+        self.retag(
+            retagged_by=user,
+            retagged_at=timestamp or datetime.datetime.now(),
+            tagnames=tagnames,
+            silent=silent
+        )
+
     def get_absolute_url(self):
         return self._question_post().get_absolute_url(thread = self)
         #question_id = self._question_post().id
-- 
cgit v1.2.3-1-g7c22