diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2011-06-26 01:09:29 -0400 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2011-06-26 01:09:29 -0400 |
commit | f2bfae711a5b543eb72d37231be67a369ea64d02 (patch) | |
tree | 2e32460f8909dc081d0227df9d0e87f5884b0c01 | |
parent | c2679f7d4921daea8568b35a544e64ccec065098 (diff) | |
download | askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.tar.gz askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.tar.bz2 askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.zip |
fixed some issues in SE import
-rw-r--r-- | askbot/importers/stackexchange/management/commands/load_stackexchange.py | 154 | ||||
-rw-r--r-- | askbot/models/__init__.py | 54 |
2 files changed, 145 insertions, 63 deletions
diff --git a/askbot/importers/stackexchange/management/commands/load_stackexchange.py b/askbot/importers/stackexchange/management/commands/load_stackexchange.py index ddac764e..3d6bee05 100644 --- a/askbot/importers/stackexchange/management/commands/load_stackexchange.py +++ b/askbot/importers/stackexchange/management/commands/load_stackexchange.py @@ -3,10 +3,14 @@ import os import re import sys import zipfile +from datetime import datetime +from guppy import hpy from django.core.management.base import BaseCommand, CommandError import askbot.importers.stackexchange.parse_models as se_parser from xml.etree import ElementTree as et -from django.db import models#, transaction +from django.db.models import fields +from django.db.utils import IntegrityError +from django.db import models, transaction #from askbot.utils import dummy_transaction as transaction import askbot.models as askbot import askbot.deps.django_authopenid.models as askbot_openid @@ -31,7 +35,9 @@ xml_read_order = ( 'Users2Badges','VoteTypes','Users2Votes','MessageTypes', 'Posts','Posts2Votes','PostHistory','PostComments', 'ModeratorMessages','Messages','Comments2Votes', - ) +) + +HEAP = hpy() #association tables SE item id --> ASKBOT item id #table associations are implied @@ -141,18 +147,18 @@ class X(object):# def get_post(cls, se_post): #todo: fix this hack - either in-memory id association table #or use database to store these associations - if isinstance(se_post, se.PostComment): - try: + try: + if isinstance(se_post, se.PostComment): return askbot.Comment.objects.get(id=COMMENT[se_post.id].id) - except KeyError: - return None - post_type = se_post.post_type.name - if post_type == 'Question': - return askbot.Question.objects.get(id=QUESTION[se_post.id].id) - elif post_type == 'Answer': - return askbot.Answer.objects.get(id=ANSWER[se_post.id].id) - else: - raise Exception('unknown post type %s' % post_type) + post_type = se_post.post_type.name + if post_type == 'Question': + return askbot.Question.objects.get(id=QUESTION[se_post.id].id) + elif post_type == 'Answer': + return askbot.Answer.objects.get(id=ANSWER[se_post.id].id) + else: + raise Exception('unknown post type %s' % post_type) + except KeyError: + return None @classmethod def get_close_reason(cls, se_reason): @@ -256,23 +262,13 @@ class X(object):# @classmethod def parse_badge_summary(cls, badge_summary): - (gold,silver,bronze) = (0,0,0) + badge_counts = [0,0,0]#gold, silver and bronze, respectively if badge_summary: - if len(badge_summary) > 3: - print 'warning: guessing that badge summary is comma separated' - print 'have %s' % badge_summary - sys.stdout.flush() - bits = badge_summary.split(',') - else: - bits = [badge_summary] - for bit in bits: - m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit) - if not m: - raise Exception('could not parse badge summary: %s' % badge_summary) - else: - badge_type = cls.badge_type_map[m.groupdict()['type']] - locals()[badge_type] = int(m.groupdict()['count']) - return (gold,silver,bronze) + badge_info_list = badge_summary.split(' ') + for badge_info in badge_info_list: + level, count = badge_info.split('=') + badge_counts[int(level) - 1] = int(count) + return badge_counts @classmethod def get_badge_name(cls, name): @@ -301,8 +297,12 @@ class Command(BaseCommand): self.zipfile = self.open_dump(arg[0]) #read the data into SE tables for item in xml_read_order: + time_before = datetime.now() self.load_xml_file(item) transaction.commit() + time_after = datetime.now() + print time_after - time_before + print HEAP.heap() #this is important so that when we clean up messages #automatically generated by the procedures below @@ -315,11 +315,9 @@ class Command(BaseCommand): #transfer data into ASKBOT tables print 'Transferring users...', - sys.stdout.flush() self.transfer_users() transaction.commit() print 'done.' - sys.stdout.flush() print 'Transferring content edits...', sys.stdout.flush() self.transfer_question_and_answer_activity() @@ -391,9 +389,11 @@ class Command(BaseCommand): """transfers some messages from SE to ASKBOT """ - for m in se.Message.objects.all(): + for m in se.Message.objects.all().iterator(): if m.is_read: continue + if m.user in None: + continue if m.user.id == -1: continue u = X.get_user(m.user) @@ -437,6 +437,8 @@ class Command(BaseCommand): QUESTION[rev_group[0].post.id] = q elif post_type == 'Answer': q = X.get_post(rev_group[0].post.parent) + if q is None: + return a = author.post_answer( question = q, body_text = text, @@ -468,21 +470,26 @@ class Command(BaseCommand): edited_by = USER[rev0.user.id] edited_at = rev0.creation_date comment = ';'.join([rev.comment for rev in rev_group if rev.comment]) + if len(comment) > 300:#truncate to make the db happy + comment = comment[:300] post_type = rev0.post.post_type.name + post = X.get_post(rev0.post) + if post is None: + return if post_type == 'Question': - q = X.get_post(rev0.post) edited_by.edit_question( - question = q, + question = post, title = title, body_text = text, tags = tags, revision_comment = comment, - timestamp = edited_at + timestamp = edited_at, + force = True #avoid insufficient rep issue on imports ) elif post_type == 'Answer': - a = X.get_post(rev0.post) - a.apply_edit( + #todo: why here use "apply_edit" and not "edit answer"? + post.apply_edit( edited_at = edited_at, edited_by = edited_by, text = text, @@ -494,6 +501,8 @@ class Command(BaseCommand): for rev in rev_group: if rev.post_history_type.name == 'Community Owned': p = X.get_post(rev.post) + if p is None: + return u = X.get_user(rev.user) t = rev.creation_date p.wiki = True @@ -527,6 +536,8 @@ class Command(BaseCommand): t = rev.creation_date u = X.get_user(rev.user) p = X.get_post(rev.post) + if p is None: + return if rev_type == 'Post Locked': p.locked = True p.locked_by = u @@ -551,6 +562,8 @@ class Command(BaseCommand): t = rev.creation_date u = X.get_user(rev.user) p = X.get_post(rev.post) + if p is None: + return if rev_type == 'Post Closed': p.closed = True p.closed_at = t @@ -573,6 +586,8 @@ class Command(BaseCommand): t = rev.creation_date u = X.get_user(rev.user) p = X.get_post(rev.post) + if p is None: + return if rev_type == 'Post Deleted': p.deleted = True p.deleted_at = t @@ -589,6 +604,10 @@ class Command(BaseCommand): #determine revision type #'initial','edit','rollback','lock', #'migrate','close','merge','delete', + if rev_group[0].user is None: + #drop userless revisions - those are probably garbage posts + #by the deleted users + return rev_types = X.get_post_revision_group_types(rev_group) if 'initial' in rev_types: self._process_post_initial_revision_group(rev_group) @@ -630,7 +649,7 @@ class Command(BaseCommand): c_group = [] #this loop groups revisions by revision id, then calls process function #for the revision grup (elementary revisions posted at once) - for se_rev in se_revs: + for se_rev in se_revs.iterator(): if se_rev.revision_guid == c_guid: c_group.append(se_rev) else: @@ -638,22 +657,29 @@ class Command(BaseCommand): c_group = [] c_group.append(se_rev) c_guid = se_rev.revision_guid + transaction.commit() if len(c_group) != 0: self._process_post_revision_group(c_group) def transfer_comments(self): - for se_c in se.PostComment.objects.all(): + for se_c in se.PostComment.objects.all().iterator(): if se_c.deletion_date: print 'Warning deleted comment %d dropped' % se_c.id sys.stdout.flush() continue se_post = se_c.post askbot_post = X.get_post(se_post) + if askbot_post is None: + continue + + se_author = se_c.user + if se_author is None: + continue comment = askbot_post.add_comment( comment = se_c.text, added_at = se_c.creation_date, - user = USER[se_c.user.id] + user = USER[se_author.id] ) COMMENT[se_c.id] = comment @@ -676,7 +702,7 @@ class Command(BaseCommand): def _award_badges(self): #note: SE does not keep information on #content-related badges like askbot does - for se_a in se.User2Badge.objects.all(): + for se_a in se.User2Badge.objects.all().iterator(): if se_a.user.id == -1: continue #skip community user u = USER[se_a.user.id] @@ -719,32 +745,50 @@ class Command(BaseCommand): pass def transfer_question_view_counts(self): - for se_q in se.Post.objects.filter(post_type__name='Question'): + for se_q in se.Post.objects.filter(post_type__name='Question').iterator(): q = X.get_post(se_q) + if q is None: + continue q.view_count = se_q.view_count q.save() def transfer_QA_votes(self): - for v in se.Post2Vote.objects.all(): + for v in se.Post2Vote.objects.all().iterator(): vote_type = v.vote_type.name if not vote_type in X.vote_actions: continue + if v.user is None: + continue + u = X.get_user(v.user) p = X.get_post(v.post) + if p is None: + continue m = X.vote_actions[vote_type] vote_method = getattr(askbot.User, m) - vote_method(u, p, timestamp = v.creation_date) + vote_method( + u, p, timestamp = v.creation_date, + force = True + ) if v.deletion_date: - vote_method(u, p, timestamp = v.deletion_date, cancel=True) + vote_method( + u, p, timestamp = v.deletion_date, + cancel=True, + force = True#force to avoid permission errors + ) + transaction.commit() def transfer_comment_votes(self): - for v in se.Comment2Vote.objects.all(): + for v in se.Comment2Vote.objects.all().iterator(): vote_type = v.vote_type.name if vote_type not in ('UpMod', 'Offensive'): continue + if v.user is None: + continue + p = X.get_post(v.post_comment) #could also check deletion date on the Comment2Vote object #instead of making get_post return None on KeyError inside @@ -754,7 +798,11 @@ class Command(BaseCommand): u = X.get_user(v.user) m = X.vote_actions[vote_type] vote_method = getattr(askbot.User, m) - vote_method(u, p, timestamp = v.creation_date) + vote_method( + u, p, timestamp = v.creation_date, + force = True + ) + transaction.commit() def transfer_update_subscriptions(self): @@ -784,10 +832,15 @@ class Command(BaseCommand): i += 1 for col in row.getchildren(): field_name = se_parser.parse_field_name(col.tag) - field_type = model._meta.get_field(field_name) + try: + field_type = model._meta.get_field(field_name) + except fields.FieldDoesNotExist, e: + print u"Warning: %s" % unicode(e) + continue field_value = se_parser.parse_value(col.text, field_type) setattr(model_entry, field_name, field_value) model_entry.save() + #transaction.commit() print '... %d objects saved' % i sys.stdout.flush() @@ -798,7 +851,7 @@ class Command(BaseCommand): return xml_file_basename + '.xml' def transfer_users(self): - for se_u in se.User.objects.all(): + for se_u in se.User.objects.all().iterator(): #if se_u.id == -1:#skip the Community user # continue u = askbot.User() @@ -826,6 +879,9 @@ class Command(BaseCommand): print 'User %s (id=%d) does not have openid' % \ (se_u.display_name, se_u.id) sys.stdout.flush() + except IntegrityError: + print "Warning: have duplicate openid: %s" % se_u.open_id + sys.stdout.flush() if se_u.open_id is None and se_u.email is None: print 'Warning: SE user %d is not recoverable (no email or openid)' diff --git a/askbot/models/__init__.py b/askbot/models/__init__.py index 66992dad..ce1b5cff 100644 --- a/askbot/models/__init__.py +++ b/askbot/models/__init__.py @@ -978,11 +978,20 @@ def user_retag_question( ) @auto_now_timestamp -def user_accept_best_answer(self, answer = None, - timestamp = None, cancel = False): +def user_accept_best_answer( + self, answer = None, + timestamp = None, + cancel = False, + force = False + ): if cancel: - return self.unaccept_best_answer(answer = answer, timestamp = timestamp) - self.assert_can_accept_best_answer(answer) + return self.unaccept_best_answer( + answer = answer, + timestamp = timestamp, + force = force + ) + if force == False: + self.assert_can_accept_best_answer(answer) if answer.accepted == True: return @@ -999,8 +1008,13 @@ def user_accept_best_answer(self, answer = None, ) @auto_now_timestamp -def user_unaccept_best_answer(self, answer = None, timestamp = None): - self.assert_can_unaccept_best_answer(answer) +def user_unaccept_best_answer( + self, answer = None, + timestamp = None, + force = False + ): + if force == False: + self.assert_can_unaccept_best_answer(answer) if answer.accepted == False: return auth.onAnswerAcceptCanceled(answer, self) @@ -1200,8 +1214,10 @@ def user_edit_question( wiki = False, edit_anonymously = False, timestamp = None, + force = False,#if True - bypass the assert ): - self.assert_can_edit_question(question) + if force == False: + self.assert_can_edit_question(question) question.apply_edit( edited_at = timestamp, edited_by = self, @@ -1227,9 +1243,11 @@ def user_edit_answer( body_text = None, revision_comment = None, wiki = False, - timestamp = None + timestamp = None, + force = False#if True - bypass the assert ): - self.assert_can_edit_answer(answer) + if force == False: + self.assert_can_edit_answer(answer) answer.apply_edit( edited_at = timestamp, edited_by = self, @@ -1668,7 +1686,12 @@ def user_get_badge_summary(self): #may be different #maybe if we do use business rule checks here - we should add #some flag allowing to bypass them for things like the data importers -def toggle_favorite_question(self, question, timestamp=None, cancel=False): +def toggle_favorite_question( + self, question, + timestamp = None, + cancel = False, + force = False#this parameter is not used yet + ): """cancel has no effect here, but is important for the SE loader it is hoped that toggle will work and data will be consistent but there is no guarantee, maybe it's better to be more strict @@ -1789,7 +1812,8 @@ def user_is_following_question(user, question): return False -def upvote(self, post, timestamp=None, cancel=False): +def upvote(self, post, timestamp=None, cancel=False, force = False): + #force parameter not used yet return _process_vote( self, post, @@ -1798,7 +1822,8 @@ def upvote(self, post, timestamp=None, cancel=False): vote_type=Vote.VOTE_UP ) -def downvote(self, post, timestamp=None, cancel=False): +def downvote(self, post, timestamp=None, cancel=False, force = False): + #force not used yet return _process_vote( self, post, @@ -1808,11 +1833,12 @@ def downvote(self, post, timestamp=None, cancel=False): ) @auto_now_timestamp -def flag_post(user, post, timestamp=None, cancel=False): +def flag_post(user, post, timestamp=None, cancel=False, force = False): if cancel:#todo: can't unflag? return - user.assert_can_flag_offensive(post = post) + if force == False: + user.assert_can_flag_offensive(post = post) auth.onFlaggedItem(post, user, timestamp=timestamp) award_badges_signal.send(None, event = 'flag_post', |