summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Evgeny Fadeev <evgeny.fadeev@gmail.com> 2011-06-26 01:09:29 -0400
committer Evgeny Fadeev <evgeny.fadeev@gmail.com> 2011-06-26 01:09:29 -0400
commit f2bfae711a5b543eb72d37231be67a369ea64d02 (patch)
tree 2e32460f8909dc081d0227df9d0e87f5884b0c01
parent c2679f7d4921daea8568b35a544e64ccec065098 (diff)
download askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.tar.gz
askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.tar.bz2
askbot-f2bfae711a5b543eb72d37231be67a369ea64d02.zip
fixed some issues in SE import
-rw-r--r-- askbot/importers/stackexchange/management/commands/load_stackexchange.py 154
-rw-r--r-- askbot/models/__init__.py 54
2 files changed, 145 insertions, 63 deletions
diff --git a/askbot/importers/stackexchange/management/commands/load_stackexchange.py b/askbot/importers/stackexchange/management/commands/load_stackexchange.py
index ddac764e..3d6bee05 100644
--- a/askbot/importers/stackexchange/management/commands/load_stackexchange.py
+++ b/askbot/importers/stackexchange/management/commands/load_stackexchange.py
@@ -3,10 +3,14 @@ import os
import re
import sys
import zipfile
+from datetime import datetime
+from guppy import hpy
from django.core.management.base import BaseCommand, CommandError
import askbot.importers.stackexchange.parse_models as se_parser
from xml.etree import ElementTree as et
-from django.db import models#, transaction
+from django.db.models import fields
+from django.db.utils import IntegrityError
+from django.db import models, transaction
#from askbot.utils import dummy_transaction as transaction
import askbot.models as askbot
import askbot.deps.django_authopenid.models as askbot_openid
@@ -31,7 +35,9 @@ xml_read_order = (
'Users2Badges','VoteTypes','Users2Votes','MessageTypes',
'Posts','Posts2Votes','PostHistory','PostComments',
'ModeratorMessages','Messages','Comments2Votes',
- )
+)
+
+HEAP = hpy()
#association tables SE item id --> ASKBOT item id
#table associations are implied
@@ -141,18 +147,18 @@ class X(object):#
def get_post(cls, se_post):
#todo: fix this hack - either in-memory id association table
#or use database to store these associations
- if isinstance(se_post, se.PostComment):
- try:
+ try:
+ if isinstance(se_post, se.PostComment):
return askbot.Comment.objects.get(id=COMMENT[se_post.id].id)
- except KeyError:
- return None
- post_type = se_post.post_type.name
- if post_type == 'Question':
- return askbot.Question.objects.get(id=QUESTION[se_post.id].id)
- elif post_type == 'Answer':
- return askbot.Answer.objects.get(id=ANSWER[se_post.id].id)
- else:
- raise Exception('unknown post type %s' % post_type)
+ post_type = se_post.post_type.name
+ if post_type == 'Question':
+ return askbot.Question.objects.get(id=QUESTION[se_post.id].id)
+ elif post_type == 'Answer':
+ return askbot.Answer.objects.get(id=ANSWER[se_post.id].id)
+ else:
+ raise Exception('unknown post type %s' % post_type)
+ except KeyError:
+ return None
@classmethod
def get_close_reason(cls, se_reason):
@@ -256,23 +262,13 @@ class X(object):#
@classmethod
def parse_badge_summary(cls, badge_summary):
- (gold,silver,bronze) = (0,0,0)
+ badge_counts = [0,0,0]#gold, silver and bronze, respectively
if badge_summary:
- if len(badge_summary) > 3:
- print 'warning: guessing that badge summary is comma separated'
- print 'have %s' % badge_summary
- sys.stdout.flush()
- bits = badge_summary.split(',')
- else:
- bits = [badge_summary]
- for bit in bits:
- m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit)
- if not m:
- raise Exception('could not parse badge summary: %s' % badge_summary)
- else:
- badge_type = cls.badge_type_map[m.groupdict()['type']]
- locals()[badge_type] = int(m.groupdict()['count'])
- return (gold,silver,bronze)
+ badge_info_list = badge_summary.split(' ')
+ for badge_info in badge_info_list:
+ level, count = badge_info.split('=')
+ badge_counts[int(level) - 1] = int(count)
+ return badge_counts
@classmethod
def get_badge_name(cls, name):
@@ -301,8 +297,12 @@ class Command(BaseCommand):
self.zipfile = self.open_dump(arg[0])
#read the data into SE tables
for item in xml_read_order:
+ time_before = datetime.now()
self.load_xml_file(item)
transaction.commit()
+ time_after = datetime.now()
+ print time_after - time_before
+ print HEAP.heap()
#this is important so that when we clean up messages
#automatically generated by the procedures below
@@ -315,11 +315,9 @@ class Command(BaseCommand):
#transfer data into ASKBOT tables
print 'Transferring users...',
- sys.stdout.flush()
self.transfer_users()
transaction.commit()
print 'done.'
- sys.stdout.flush()
print 'Transferring content edits...',
sys.stdout.flush()
self.transfer_question_and_answer_activity()
@@ -391,9 +389,11 @@ class Command(BaseCommand):
"""transfers some messages from
SE to ASKBOT
"""
- for m in se.Message.objects.all():
+ for m in se.Message.objects.all().iterator():
if m.is_read:
continue
+ if m.user is None:
+ continue
if m.user.id == -1:
continue
u = X.get_user(m.user)
@@ -437,6 +437,8 @@ class Command(BaseCommand):
QUESTION[rev_group[0].post.id] = q
elif post_type == 'Answer':
q = X.get_post(rev_group[0].post.parent)
+ if q is None:
+ return
a = author.post_answer(
question = q,
body_text = text,
@@ -468,21 +470,26 @@ class Command(BaseCommand):
edited_by = USER[rev0.user.id]
edited_at = rev0.creation_date
comment = ';'.join([rev.comment for rev in rev_group if rev.comment])
+ if len(comment) > 300:#truncate to make the db happy
+ comment = comment[:300]
post_type = rev0.post.post_type.name
+ post = X.get_post(rev0.post)
+ if post is None:
+ return
if post_type == 'Question':
- q = X.get_post(rev0.post)
edited_by.edit_question(
- question = q,
+ question = post,
title = title,
body_text = text,
tags = tags,
revision_comment = comment,
- timestamp = edited_at
+ timestamp = edited_at,
+ force = True #avoid insufficient rep issue on imports
)
elif post_type == 'Answer':
- a = X.get_post(rev0.post)
- a.apply_edit(
+ #todo: why use "apply_edit" here and not "edit answer"?
+ post.apply_edit(
edited_at = edited_at,
edited_by = edited_by,
text = text,
@@ -494,6 +501,8 @@ class Command(BaseCommand):
for rev in rev_group:
if rev.post_history_type.name == 'Community Owned':
p = X.get_post(rev.post)
+ if p is None:
+ return
u = X.get_user(rev.user)
t = rev.creation_date
p.wiki = True
@@ -527,6 +536,8 @@ class Command(BaseCommand):
t = rev.creation_date
u = X.get_user(rev.user)
p = X.get_post(rev.post)
+ if p is None:
+ return
if rev_type == 'Post Locked':
p.locked = True
p.locked_by = u
@@ -551,6 +562,8 @@ class Command(BaseCommand):
t = rev.creation_date
u = X.get_user(rev.user)
p = X.get_post(rev.post)
+ if p is None:
+ return
if rev_type == 'Post Closed':
p.closed = True
p.closed_at = t
@@ -573,6 +586,8 @@ class Command(BaseCommand):
t = rev.creation_date
u = X.get_user(rev.user)
p = X.get_post(rev.post)
+ if p is None:
+ return
if rev_type == 'Post Deleted':
p.deleted = True
p.deleted_at = t
@@ -589,6 +604,10 @@ class Command(BaseCommand):
#determine revision type
#'initial','edit','rollback','lock',
#'migrate','close','merge','delete',
+ if rev_group[0].user is None:
+ #drop userless revisions - those are probably garbage posts
+ #by the deleted users
+ return
rev_types = X.get_post_revision_group_types(rev_group)
if 'initial' in rev_types:
self._process_post_initial_revision_group(rev_group)
@@ -630,7 +649,7 @@ class Command(BaseCommand):
c_group = []
#this loop groups revisions by revision id, then calls process function
#for the revision grup (elementary revisions posted at once)
- for se_rev in se_revs:
+ for se_rev in se_revs.iterator():
if se_rev.revision_guid == c_guid:
c_group.append(se_rev)
else:
@@ -638,22 +657,29 @@ class Command(BaseCommand):
c_group = []
c_group.append(se_rev)
c_guid = se_rev.revision_guid
+ transaction.commit()
if len(c_group) != 0:
self._process_post_revision_group(c_group)
def transfer_comments(self):
- for se_c in se.PostComment.objects.all():
+ for se_c in se.PostComment.objects.all().iterator():
if se_c.deletion_date:
print 'Warning deleted comment %d dropped' % se_c.id
sys.stdout.flush()
continue
se_post = se_c.post
askbot_post = X.get_post(se_post)
+ if askbot_post is None:
+ continue
+
+ se_author = se_c.user
+ if se_author is None:
+ continue
comment = askbot_post.add_comment(
comment = se_c.text,
added_at = se_c.creation_date,
- user = USER[se_c.user.id]
+ user = USER[se_author.id]
)
COMMENT[se_c.id] = comment
@@ -676,7 +702,7 @@ class Command(BaseCommand):
def _award_badges(self):
#note: SE does not keep information on
#content-related badges like askbot does
- for se_a in se.User2Badge.objects.all():
+ for se_a in se.User2Badge.objects.all().iterator():
if se_a.user.id == -1:
continue #skip community user
u = USER[se_a.user.id]
@@ -719,32 +745,50 @@ class Command(BaseCommand):
pass
def transfer_question_view_counts(self):
- for se_q in se.Post.objects.filter(post_type__name='Question'):
+ for se_q in se.Post.objects.filter(post_type__name='Question').iterator():
q = X.get_post(se_q)
+ if q is None:
+ continue
q.view_count = se_q.view_count
q.save()
def transfer_QA_votes(self):
- for v in se.Post2Vote.objects.all():
+ for v in se.Post2Vote.objects.all().iterator():
vote_type = v.vote_type.name
if not vote_type in X.vote_actions:
continue
+ if v.user is None:
+ continue
+
u = X.get_user(v.user)
p = X.get_post(v.post)
+ if p is None:
+ continue
m = X.vote_actions[vote_type]
vote_method = getattr(askbot.User, m)
- vote_method(u, p, timestamp = v.creation_date)
+ vote_method(
+ u, p, timestamp = v.creation_date,
+ force = True
+ )
if v.deletion_date:
- vote_method(u, p, timestamp = v.deletion_date, cancel=True)
+ vote_method(
+ u, p, timestamp = v.deletion_date,
+ cancel=True,
+ force = True#force to avoid permission errors
+ )
+ transaction.commit()
def transfer_comment_votes(self):
- for v in se.Comment2Vote.objects.all():
+ for v in se.Comment2Vote.objects.all().iterator():
vote_type = v.vote_type.name
if vote_type not in ('UpMod', 'Offensive'):
continue
+ if v.user is None:
+ continue
+
p = X.get_post(v.post_comment)
#could also check deletion date on the Comment2Vote object
#instead of making get_post return None on KeyError inside
@@ -754,7 +798,11 @@ class Command(BaseCommand):
u = X.get_user(v.user)
m = X.vote_actions[vote_type]
vote_method = getattr(askbot.User, m)
- vote_method(u, p, timestamp = v.creation_date)
+ vote_method(
+ u, p, timestamp = v.creation_date,
+ force = True
+ )
+ transaction.commit()
def transfer_update_subscriptions(self):
@@ -784,10 +832,15 @@ class Command(BaseCommand):
i += 1
for col in row.getchildren():
field_name = se_parser.parse_field_name(col.tag)
- field_type = model._meta.get_field(field_name)
+ try:
+ field_type = model._meta.get_field(field_name)
+ except fields.FieldDoesNotExist, e:
+ print u"Warning: %s" % unicode(e)
+ continue
field_value = se_parser.parse_value(col.text, field_type)
setattr(model_entry, field_name, field_value)
model_entry.save()
+ #transaction.commit()
print '... %d objects saved' % i
sys.stdout.flush()
@@ -798,7 +851,7 @@ class Command(BaseCommand):
return xml_file_basename + '.xml'
def transfer_users(self):
- for se_u in se.User.objects.all():
+ for se_u in se.User.objects.all().iterator():
#if se_u.id == -1:#skip the Community user
# continue
u = askbot.User()
@@ -826,6 +879,9 @@ class Command(BaseCommand):
print 'User %s (id=%d) does not have openid' % \
(se_u.display_name, se_u.id)
sys.stdout.flush()
+ except IntegrityError:
+ print "Warning: have duplicate openid: %s" % se_u.open_id
+ sys.stdout.flush()
if se_u.open_id is None and se_u.email is None:
print 'Warning: SE user %d is not recoverable (no email or openid)'
diff --git a/askbot/models/__init__.py b/askbot/models/__init__.py
index 66992dad..ce1b5cff 100644
--- a/askbot/models/__init__.py
+++ b/askbot/models/__init__.py
@@ -978,11 +978,20 @@ def user_retag_question(
)
@auto_now_timestamp
-def user_accept_best_answer(self, answer = None,
- timestamp = None, cancel = False):
+def user_accept_best_answer(
+ self, answer = None,
+ timestamp = None,
+ cancel = False,
+ force = False
+ ):
if cancel:
- return self.unaccept_best_answer(answer = answer, timestamp = timestamp)
- self.assert_can_accept_best_answer(answer)
+ return self.unaccept_best_answer(
+ answer = answer,
+ timestamp = timestamp,
+ force = force
+ )
+ if force == False:
+ self.assert_can_accept_best_answer(answer)
if answer.accepted == True:
return
@@ -999,8 +1008,13 @@ def user_accept_best_answer(self, answer = None,
)
@auto_now_timestamp
-def user_unaccept_best_answer(self, answer = None, timestamp = None):
- self.assert_can_unaccept_best_answer(answer)
+def user_unaccept_best_answer(
+ self, answer = None,
+ timestamp = None,
+ force = False
+ ):
+ if force == False:
+ self.assert_can_unaccept_best_answer(answer)
if answer.accepted == False:
return
auth.onAnswerAcceptCanceled(answer, self)
@@ -1200,8 +1214,10 @@ def user_edit_question(
wiki = False,
edit_anonymously = False,
timestamp = None,
+ force = False,#if True - bypass the assert
):
- self.assert_can_edit_question(question)
+ if force == False:
+ self.assert_can_edit_question(question)
question.apply_edit(
edited_at = timestamp,
edited_by = self,
@@ -1227,9 +1243,11 @@ def user_edit_answer(
body_text = None,
revision_comment = None,
wiki = False,
- timestamp = None
+ timestamp = None,
+ force = False#if True - bypass the assert
):
- self.assert_can_edit_answer(answer)
+ if force == False:
+ self.assert_can_edit_answer(answer)
answer.apply_edit(
edited_at = timestamp,
edited_by = self,
@@ -1668,7 +1686,12 @@ def user_get_badge_summary(self):
#may be different
#maybe if we do use business rule checks here - we should add
#some flag allowing to bypass them for things like the data importers
-def toggle_favorite_question(self, question, timestamp=None, cancel=False):
+def toggle_favorite_question(
+ self, question,
+ timestamp = None,
+ cancel = False,
+ force = False#this parameter is not used yet
+ ):
"""cancel has no effect here, but is important for the SE loader
it is hoped that toggle will work and data will be consistent
but there is no guarantee, maybe it's better to be more strict
@@ -1789,7 +1812,8 @@ def user_is_following_question(user, question):
return False
-def upvote(self, post, timestamp=None, cancel=False):
+def upvote(self, post, timestamp=None, cancel=False, force = False):
+ #force parameter not used yet
return _process_vote(
self,
post,
@@ -1798,7 +1822,8 @@ def upvote(self, post, timestamp=None, cancel=False):
vote_type=Vote.VOTE_UP
)
-def downvote(self, post, timestamp=None, cancel=False):
+def downvote(self, post, timestamp=None, cancel=False, force = False):
+ #force not used yet
return _process_vote(
self,
post,
@@ -1808,11 +1833,12 @@ def downvote(self, post, timestamp=None, cancel=False):
)
@auto_now_timestamp
-def flag_post(user, post, timestamp=None, cancel=False):
+def flag_post(user, post, timestamp=None, cancel=False, force = False):
if cancel:#todo: can't unflag?
return
- user.assert_can_flag_offensive(post = post)
+ if force == False:
+ user.assert_can_flag_offensive(post = post)
auth.onFlaggedItem(post, user, timestamp=timestamp)
award_badges_signal.send(None,
event = 'flag_post',