diff options
-rwxr-xr-x | forum/auth.py | 15 | ||||
-rwxr-xr-x | forum/const.py | 8 | ||||
-rwxr-xr-x | forum/models/__init__.py | 103 | ||||
-rwxr-xr-x | forum/models/meta.py | 6 | ||||
-rwxr-xr-x | forum/models/repute.py | 27 | ||||
-rwxr-xr-x | forum/views/commands.py | 9 | ||||
-rw-r--r-- | stackexchange/ANOMALIES | 9 | ||||
-rw-r--r-- | stackexchange/README | 34 | ||||
-rw-r--r-- | stackexchange/management/commands/load_stackexchange.py | 266 |
9 files changed, 415 insertions, 62 deletions
diff --git a/forum/auth.py b/forum/auth.py index c5b466f8..e8b97ddd 100755 --- a/forum/auth.py +++ b/forum/auth.py @@ -11,6 +11,8 @@ from django.db import transaction from models import Repute from models import Question from models import Answer +#todo: why can't I import these? +#from models import mark_offensive, delete_post_or_answer from const import TYPE_REPUTATION import logging question_type = ContentType.objects.get_for_model(Question) @@ -252,6 +254,11 @@ def onFlaggedItem(item, post, user, timestamp=None): #post.deleted_at = timestamp #post.deleted_by = Admin post.save() + mark_offensive.send( + sender=post.__class__, + instance=post, + mark_by=user + ) @transaction.commit_on_success @@ -331,7 +338,8 @@ def onUpVoted(vote, post, user, timestamp=None): if not post.wiki: author = post.author - if Repute.objects.get_reputation_by_upvoted_today(author) < int(REPUTATION_RULES['scope_per_day_by_upvotes']): + todays_rep_gain = Repute.objects.get_reputation_by_upvoted_today(author) + if todays_rep_gain < int(REPUTATION_RULES['scope_per_day_by_upvotes']): author.reputation = calculate_reputation(author.reputation, int(REPUTATION_RULES['gain_by_upvoted'])) author.save() @@ -514,3 +522,8 @@ def onDeleted(post, user, timestamp=None): elif isinstance(post, Answer): Question.objects.update_answer_count(post.question) logging.debug('updated answer count to %d' % post.question.answer_count) + delete_post_or_answer.send( + sender=post.__class__, + instance=post, + delete_by=user + ) diff --git a/forum/const.py b/forum/const.py index ce81acb2..39db5ad4 100755 --- a/forum/const.py +++ b/forum/const.py @@ -8,12 +8,12 @@ CLOSE_REASONS = ( (1, _('duplicate question')), (2, _('question is off-topic or not relevant')), (3, _('too subjective and argumentative')), - (4, _('is not an answer to the question')), + (4, _('not a real question')), (5, _('the question is answered, right answer was accepted')), - (6, _('problem is not reproducible or outdated')), - #(7, u'太局部、本地化的问题',) - (7, _('question contains offensive inappropriate, or malicious remarks')), + (6, _('question is not relevant or outdated')), + (7, _('question contains offensive or malicious remarks')), (8, _('spam or advertising')), + (9, _('too localized')), ) TYPE_REPUTATION = ( diff --git a/forum/models/__init__.py b/forum/models/__init__.py index 9b62e4ae..ef591702 100755 --- a/forum/models/__init__.py +++ b/forum/models/__init__.py @@ -8,6 +8,8 @@ import re from base import * import datetime +from forum import auth +from django.contrib.contenttypes.models import ContentType # User extend properties QUESTIONS_PER_PAGE_CHOICES = ( @@ -99,8 +101,19 @@ def get_profile_link(self): #series of methods for user vote-type commands #same call signature func(self, post, timestamp=None, cancel=None) +#note that none of these have business logic checks internally +#these functions are used by the forum app and +#by the data importer jobs from say stackexchange, where internal rules +#may be different +#maybe if we do use business rule checks here - we should add +#some flag allowing to bypass them for things like the data importers def toggle_favorite_question(self, question, timestamp=None, cancel=False): - """cancel has no effect here, but is important + """cancel has no effect here, but is important for the SE loader + it is hoped that toggle will work and data will be consistent + but there is no guarantee, maybe it's better to be more strict + about processing the "cancel" option + another strange thing is that this function unlike others below + returns a value """ try: fave = FavoriteQuestion.objects.get(question=question, user=self) @@ -117,19 +130,95 @@ def toggle_favorite_question(self, question, timestamp=None, cancel=False): Question.objects.update_favorite_count(question) return result +#"private" wrapper function that applies post upvotes/downvotes and cancelations +def _process_vote(user, post, timestamp=None, cancel=False, vote_type=None): + post_type = ContentType.objects.get_for_model(post) + #get or create the vote object + #return with noop in some situations + try: + vote = Vote.objects.get( + user = user, + content_type = post_type, + object_id = post.id, + ) + except Vote.DoesNotExist: + vote = None + if cancel: + if vote == None: + return + elif vote.is_opposite(vote_type): + return + else: + #we would call vote.delete() here + #but for now all that is handled by the + #legacy forum.auth functions + #vote.delete() + pass + else: + if vote == None: + vote = Vote( + user = user, + content_object = post, + vote = vote_type, + voted_at = timestamp, + ) + elif vote.is_opposite(vote_type): + vote.vote = vote_type + else: + return + + #do the actual work + if vote_type == Vote.VOTE_UP: + if cancel: + auth.onUpVotedCanceled(vote, post, user, timestamp) + else: + auth.onUpVoted(vote, post, user, timestamp) + elif vote_type == Vote.VOTE_DOWN: + if cancel: + auth.onDownVotedCanceled(vote, post, user, timestamp) + else: + auth.onDonwVoted(vote, post, user, timestamp) + def upvote(self, post, timestamp=None, cancel=False): - pass + _process_vote( + self,post, + timestamp=timestamp, + cancel=cancel, + vote_type=Vote.VOTE_UP + ) def downvote(self, post, timestamp=None, cancel=False): - pass + _process_vote( + self,post, + timestamp=timestamp, + cancel=cancel, + vote_type=Vote.VOTE_DOWN + ) def accept_answer(self, answer, timestamp=None, cancel=False): - pass + if cancel: + auth.onAnswerAcceptCanceled(answer, self, timestamp=timestamp) + else: + auth.onAnswerAccept(answer, self, timestamp=timestamp) -def flag_post(self, answer, timestamp=None, cancel=False): - pass +def flag_post(self, post, timestamp=None, cancel=False): + if cancel:#todo: can't unflag? + return + if post.flagged_items.filter(user=user).count() > 0: + return + else: + flag = FlaggedItem( + user = self, + content_object = post, + flagged_at = timestamp, + ) + auth.onFlaggedItem(flag, post, user, timestamp=timestamp) User.add_to_class('toggle_favorite_question', toggle_favorite_question) +User.add_to_class('upvote', upvote) +User.add_to_class('downvote', downvote) +User.add_to_class('accept_answer', accept_answer) +User.add_to_class('flag_post', flag_post) User.add_to_class('get_profile_url', get_profile_url) User.add_to_class('get_profile_link', get_profile_link) User.add_to_class('get_messages', get_messages) @@ -395,7 +484,7 @@ __all__ = [ 'AnonymousEmail', 'AuthKeyUserAssociation', - 'User' + 'User', ] diff --git a/forum/models/meta.py b/forum/models/meta.py index 7c3f5d36..6923a932 100755 --- a/forum/models/meta.py +++ b/forum/models/meta.py @@ -48,6 +48,10 @@ class Vote(MetaContent): def is_downvote(self): return self.vote == self.VOTE_DOWN + def is_opposite(self, vote_type): + assert(vote_type in (self.VOTE_UP, self.VOTE_DOWN)) + return self.vote != vote_type + class FlaggedItemManager(models.Manager): def get_flagged_items_count_today(self, user): @@ -86,4 +90,4 @@ class Comment(MetaContent): logging.debug('problem pinging google did you register you sitemap with google?') def __unicode__(self): - return self.comment
\ No newline at end of file + return self.comment diff --git a/forum/models/repute.py b/forum/models/repute.py index f0ee27ed..5e42542f 100755 --- a/forum/models/repute.py +++ b/forum/models/repute.py @@ -81,15 +81,26 @@ class ReputeManager(models.Manager): by upvoted(also substracted from upvoted canceled). This is because we need to prohibit gaming system by upvoting/cancel again and again. """ - if user is not None: - today = datetime.date.today() - sums = self.filter(models.Q(reputation_type=1) | models.Q(reputation_type=-8), - user=user, reputed_at__range=(today, today + datetime.timedelta(1))). \ - agregate(models.Sum('positive'), models.Sum('negative')) - - return sums['positive__sum'] + sums['negative__sum'] - else: + if user is None: return 0 + else: + today = datetime.date.today() + tomorrow = today + datetime.timedelta(1) + rep_types = (1,-8) + sums = self.filter(models.Q(reputation_type__in=(1,-8)), + user=user, + reputed_at__range=(today, tomorrow), + ).aggregate(models.Sum('positive'), models.Sum('negative')) + if sums: + pos = sums['positive__sum'] + neg = sums['negative__sum'] + if pos is None: + pos = 0 + if neg is None: + neg = 0 + return pos + neg + else: + return 0 class Repute(models.Model): """The reputation histories for user""" diff --git a/forum/views/commands.py b/forum/views/commands.py index 7640afb7..ca6569e2 100755 --- a/forum/views/commands.py +++ b/forum/views/commands.py @@ -130,7 +130,13 @@ def vote(request, id):#todo: pretty incomprehensible view used by various ajax c elif not __can_vote(vote_score, request.user): response_data['allowed'] = -2 elif post.votes.filter(user=request.user).count() > 0: + #todo: I think we have a bug here + #we need to instead select vote on that particular post + #not just the latest vote, although it is a good shortcut. + #The problem is that this vote is deleted in one of + #the on...Canceled() functions vote = post.votes.filter(user=request.user)[0] + # get latest vote by the current user # unvote should be less than certain time if (datetime.datetime.now().day - vote.voted_at.day) >= auth.VOTE_RULES['scope_deny_unvote_days']: response_data['status'] = 2 @@ -178,8 +184,6 @@ def vote(request, id):#todo: pretty incomprehensible view used by various ajax c item = FlaggedItem(user=request.user, content_object=post, flagged_at=datetime.datetime.now()) auth.onFlaggedItem(item, post, request.user) response_data['count'] = post.offensive_flag_count - # send signal when question or answer be marked offensive - mark_offensive.send(sender=post.__class__, instance=post, mark_by=request.user) elif vote_type in ['9', '10']: post = question post_id = id @@ -195,7 +199,6 @@ def vote(request, id):#todo: pretty incomprehensible view used by various ajax c response_data['status'] = 1 else: auth.onDeleted(post, request.user) - delete_post_or_answer.send(sender=post.__class__, instance=post, delete_by=request.user) elif vote_type == '11':#subscribe q updates user = request.user if user.is_authenticated(): diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES index 17ead454..05a7dbdb 100644 --- a/stackexchange/ANOMALIES +++ b/stackexchange/ANOMALIES @@ -3,3 +3,12 @@ * users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL) * users with no screen name * users with no email and no screen name (25% in homeschool) +* tag preferences are not stored explicitly (interesting/ignored) + maybe they are in se.User.preferences_raw + but the data there is not marked up and is kind of cryptic +* we don't have Community user. SE has one with id=-1 + this id may break the load script + potential break places are anywhere where is X.get_user() call + issues may happen with larger data sets where activity + of user "Community" is somehow reflected in a way + that load_stackexchange does not take care of diff --git a/stackexchange/README b/stackexchange/README index b2a39e1c..bad11c9f 100644 --- a/stackexchange/README +++ b/stackexchange/README @@ -1,11 +1,12 @@ this app's function will be to: -* install it's own tables <--- done -* read SE xml dump into DjangoDB <--- done -* populate osqa database <-- user accounts and Q&A revisions loaded -* remove SE tables +* install it's own tables (#todo: not yet automated) +* read SE xml dump into DjangoDB (automated) +* populate osqa database (automated) +* remove SE tables (#todo: not done yet) -Current process to load SE data into OSQA: +Current process to load SE data into OSQA is: +============================================== 1) backup database @@ -36,3 +37,26 @@ Current process to load SE data into OSQA: if anything doesn't go right - run 'python manage.py flush' and repeat steps 6 and 7 + +NOTES: +============ + +Here is the load script that I used for the testing +it assumes that SE dump has been unzipped inside the tmp directory + + #!/bin/sh$ + python manage.py flush + #delete all data + mysql -u osqa -p osqa < sql_scripts/badges.sql + python manage.py load_stackexchange tmp + +Untested parts are tagged with comments starting with +#todo: + +The test set did not have all the usage cases of StackExchange represented so +it may break with other sets. + +The job takes some time to run, especially +content revisions and votes - may be optimized + +Some of the fringe cases are described in file stackexchange/ANOMALIES diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py index 5673b9f0..11b0efc9 100644 --- a/stackexchange/management/commands/load_stackexchange.py +++ b/stackexchange/management/commands/load_stackexchange.py @@ -12,6 +12,8 @@ import stackexchange.models as se from forum.forms import EditUserEmailFeedsForm from forum.utils.html import sanitize_html from django.conf import settings +from django.contrib.auth.models import Message as DjangoMessage +from django.utils.translation import ugettext as _ #from markdown2 import Markdown #markdowner = Markdown(html4tags=True) @@ -55,14 +57,15 @@ class X(object):# 'UpMod':'upvote', 'DownMod':'downvote', 'AcceptedByOriginator':'accept_answer', - 'Offensive','flag_post', - 'Favorite','toggle_favorite_question', + 'Offensive':'flag_post', + 'Favorite':'toggle_favorite_question', } #these modes cannot be mixed + #only wiki is assumed to be mixable exclusive_revision_modes = ( - 'initial','edit','lock','unlock', - 'migrate','close','reopen','merge', + 'initial','edit','rollback','lock', + 'migrate','close','merge','delete', ) #badges whose names don't match exactly, but @@ -84,15 +87,49 @@ class X(object):# 'Rollback Body':'rollback', 'Rollback Tags':'rollback', 'Post Closed':'close', - 'Post Reopened':'reopen', + 'Post Reopened':'close', 'Post Deleted':'delete', - 'Post Undeleted':'undelete', + 'Post Undeleted':'delete', 'Post Locked':'lock', - 'Post Unlocked':'unlock', + 'Post Unlocked':'lock', 'Community Owned':'wiki', 'Post Migrated':'migrate', 'Question Merged':'merge', } + + close_reason_map = { + 1:1,#duplicate + 2:2,#off-topic + 3:3,#subjective and argumentative + 4:4,#not a real question + 5:7,#offensive + 6:6,#irrelevant or outdated question + 7:9,#too localized + 10:8,#spam + } + + @classmethod + def get_message_text(cls, se_m): + """try to intelligently translate + SE message to OSQA so that it makese sense in + our context + """ + #todo: properly translate messages + #todo: maybe work through more instances of messages + if se_m.message_type.name == 'Badge Notification': + return se_m.text + else: + if 'you are now an administrator' in se_m.text: + return _('Congratulations, you are now an Administrator') + elif re.search(r'^You have \d+ new',se_m.text): + bits = se_m.text.split('.') + text = bits[0] + if se_m.user.id == -1: + return None + url = cls.get_user(se_m.user).get_profile_url() + return '<a href="%s?sort=responses">%s</a>' % (url,text) + return None + @classmethod def get_post(cls, se_post): #todo: fix this hack - either in-memory id association table @@ -106,6 +143,12 @@ class X(object):# raise Exception('unknown post type %s' % post_type) @classmethod + def get_close_reason(cls, se_reason): + #todo: this is a guess - have not seen real data + se_reason = int(se_reason) + return cls.close_reason_map[se_reason] + + @classmethod def get_user(cls, se_user): #todo: same as get_post return osqa.User.objects.get(id=USER[se_user.id].id) @@ -236,19 +279,76 @@ class Command(BaseCommand): table_name = self.get_table_name(xml) self.load_xml_file(xml_path, table_name) + #this is important so that when we clean up messages + #automatically generated by the procedures below + #we do not delete old messages + #todo: unfortunately this may need to be redone + #when we upgrade to django 1.2 and definitely by 1.4 when + #the current message system will be replaced with the + #django messages framework + self.save_osqa_message_id_list() + #transfer data into OSQA tables + print 'Transferring users...', + sys.stdout.flush() self.transfer_users() + print 'done.' + print 'Transferring content edits...', + sys.stdout.flush() self.transfer_question_and_answer_activity() + print 'done.' + print 'Transferring view counts...', + sys.stdout.flush() self.transfer_question_view_counts() + print 'done.' + print 'Transferring comments...', + sys.stdout.flush() self.transfer_comments() + print 'done.' + print 'Transferring badges and badge awards...', + sys.stdout.flush() self.transfer_badges() - self.transfer_votes() - self.transfer_favorites() - self.transfer_tag_preferences() + print 'done.' + print 'Transferring votes...', + sys.stdout.flush() + self.transfer_votes()#includes favorites, accepts and flags + print 'done.' + + self.cleanup_messages()#delete autogenerated messages + self.transfer_messages() + + #todo: these are not clear how to go about self.transfer_update_subscriptions() - self.transfer_flags() + self.transfer_tag_preferences() self.transfer_meta_pages() + def save_osqa_message_id_list(self): + id_list = list(DjangoMessage.objects.all().values('id')) + self._osqa_message_id_list = id_list + + def cleanup_messages(self): + """deletes messages generated by the load process + """ + id_list = self._osqa_message_id_list + mset = DjangoMessage.objects.all().exclude(id__in=id_list) + mset.delete() + + def transfer_messages(self): + """transfers some messages from + SE to OSQA + """ + for m in se.Message.objects.all(): + if m.is_read: + continue + if m.user.id == -1: + continue + u = X.get_user(m.user) + text = X.get_message_text(m) + if text: + u.message_set.create( + message=text, + ) + def _process_post_initial_revision_group(self, rev_group): title = None @@ -298,7 +398,7 @@ class Command(BaseCommand): def _process_post_edit_revision_group(self, rev_group): #question apply edit - (title, text, tags, wiki) = (None, None, None, False) + (title, text, tags) = (None, None, None) for rev in rev_group: rev_type = rev.post_history_type.name if rev_type == 'Edit Title': @@ -308,7 +408,7 @@ class Command(BaseCommand): elif rev_type == 'Edit Tags': tags = X.clean_tags(rev.text) elif rev_type == 'Community Owned': - wiki = True + pass else: raise Exception('unexpected revision type %s' % rev_type) @@ -327,39 +427,141 @@ class Command(BaseCommand): text = text, comment = comment, tags = tags, - wiki = wiki ) elif post_type == 'Answer': a = ANSWER[rev0.post.id] - #todo: wiki will probably be lost here a.apply_edit( edited_at = edited_at, edited_by = edited_by, text = text, comment = comment, - wiki = wiki ) - def _process_post_action_revision_group(self, rev_group): - #this is odd - there were no edit actions like these - #closed, reopened, etc in homeschoolers sample - print 'Warning: these content revisions were not processed' - print 'please give us your sample and we will write code to import it' - print ';'.join([rev.post_history_type.name for rev in rev_group]) + def _make_post_wiki(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post_history_type.name == 'Community Owned': + p = X.get_post(rev.post) + u = X.get_user(rev.user) + t = rev.creation_date + p.wiki = True + p.wikified_at = t + p.wikified_by = u + self.mark_activity(p,u,t) + p.save() + return + + def mark_activity(self,p,u,t): + """p,u,t - post, user, timestamp + """ + if isinstance(p, osqa.Question): + p.last_activity_by = u + p.last_activity_at = t + elif isinstance(p, osqa.Answer): + p.question.last_activity_by = u + p.question.last_activity_at = t + p.question.save() + + def _process_post_rollback_revision_group(self, rev_group): + #todo: don't know what to do here as there were no + #such data available + pass + + def _process_post_lock_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('ocked'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Locked': + p.locked = True + p.locked_by = u + p.locked_at = t + elif rev_type == 'Post Unlocked': + p.locked = False + p.locked_by = None + p.locked_at = None + else: + return + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_close_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post.post_type.name != 'Question': + return + rev_type = rev.post_history_type.name + if rev_type in ('Post Closed', 'Post Reopened'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Closed': + p.closed = True + p.closed_at = t + p.closed_by = u + p.close_reason = X.get_close_reason(rev.text) + elif rev_type == 'Post Reopened': + p.closed = False + p.closed_at = None + p.closed_by = None + p.close_reason = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_delete_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('eleted'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Deleted': + p.deleted = True + p.deleted_at = t + p.deleted_by = u + elif rev_type == 'Post Undeleted': + p.deleted = False + p.deleted_at = None + p.deleted_by = None + self.mark_activity(p,u,t) + p.save() + return def _process_post_revision_group(self, rev_group): #determine revision type + #'initial','edit','rollback','lock', + #'migrate','close','merge','delete', rev_types = X.get_post_revision_group_types(rev_group) - #initial,edit,lock,unlock, - #migrate,close,reopen,merge,wiki if 'initial' in rev_types: self._process_post_initial_revision_group(rev_group) elif 'edit' in rev_types: self._process_post_edit_revision_group(rev_group) + elif 'rollback' in rev_types: + self._process_post_rollback_revision_group(rev_group) + elif 'lock' in rev_types: + self._process_post_lock_revision_group(rev_group) + elif 'close' in rev_types: + self._process_post_close_revision_group(rev_group) + elif 'delete' in rev_types: + self._process_post_delete_revision_group(rev_group) else: - self._process_post_action_revision_group(rev_group) + pass + #todo: rollback, lock, close and delete are + #not tested + #merge and migrate actions are ignored + #wiki is mixable with other groups, so process it in addition + if 'wiki' in rev_types: + self._make_post_wiki(rev_group) def transfer_tag_preferences(self): + #todo: figure out where these are stored in SE + #maybe in se.User.preferences_raw? pass def transfer_question_and_answer_activity(self): @@ -384,6 +586,8 @@ class Command(BaseCommand): c_group = [] c_group.append(se_rev) c_guid = se_rev.revision_guid + if len(c_group) != 0: + self._process_post_revision_group(c_group) def transfer_comments(self): for se_c in se.PostComment.objects.all(): @@ -477,18 +681,14 @@ class Command(BaseCommand): u = X.get_user(v.user) p = X.get_post(v.post) m = X.vote_actions[vote_type] - vote_method = getattr(osqa.User, m['on']) + vote_method = getattr(osqa.User, m) vote_method(u, p, timestamp = v.creation_date) if v.deletion_date: vote_method(u, p, timestamp = v.deletion_date, cancel=True) - def transfer_favorites(self): - pass - def transfer_update_subscriptions(self): - pass - - def transfer_flags(self): + #todo: not clear where this is stored in SE + #maybe in se.User.preferences_raw? pass def transfer_meta_pages(self): @@ -548,7 +748,7 @@ class Command(BaseCommand): if se_u.open_id is None and se_u.email is None: print 'Warning: SE user %d is not recoverable (no email or openid)' - u.reputation = se_u.reputation + u.reputation = 1#se_u.reputation, it's actually re-computed u.last_seen = se_u.last_access_date u.email = X.get_email(se_u.email) u.location = X.blankable(se_u.location) |