author    Evgeny Fadeev <evgeny.fadeev@gmail.com>    2010-03-09 22:05:39 -0500
committer Evgeny Fadeev <evgeny.fadeev@gmail.com>    2010-03-09 22:05:39 -0500
commit    020701bcba397d590d284962f3ce5df3134aaa08 (patch)
tree      d152d879fe3396c6d8591762c8c5284e9dcca56a /stackexchange
parent    c813ea591905e90512f2f04c68d251da3eb77eaa (diff)
SE loader seems to work, details are in stackexchange/README
Diffstat (limited to 'stackexchange')
-rw-r--r--  stackexchange/ANOMALIES                                  |   9
-rw-r--r--  stackexchange/README                                     |  34
-rw-r--r--  stackexchange/management/commands/load_stackexchange.py  | 266
3 files changed, 271 insertions(+), 38 deletions(-)
diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES
index 17ead454..05a7dbdb 100644
--- a/stackexchange/ANOMALIES
+++ b/stackexchange/ANOMALIES
@@ -3,3 +3,12 @@
* users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL)
* users with no screen name
* users with no email and no screen name (25% in homeschool)
+* tag preferences are not stored explicitly (interesting/ignored)
+ maybe they are in se.User.preferences_raw
+ but the data there is not marked up and is kind of cryptic
+* we don't have a "Community" user, while SE has one with id=-1
+  this id may break the load script
+  potential break places are anywhere there is an X.get_user() call
+  issues may happen with larger data sets where activity
+  of the "Community" user is reflected in a way
+  that load_stackexchange does not handle
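A possible guard against the missing Community user (a minimal sketch only, not
part of this patch; it reuses the USER map and the osqa import from
load_stackexchange, and callers would then have to skip records that map to
None, as transfer_messages already does for m.user.id == -1):

    @classmethod
    def get_user(cls, se_user):
        #SE ships a synthetic "Community" user with id=-1 that never
        #ends up in the USER map, so treat it as "no user"
        if se_user.id == -1:
            return None
        return osqa.User.objects.get(id=USER[se_user.id].id)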
diff --git a/stackexchange/README b/stackexchange/README
index b2a39e1c..bad11c9f 100644
--- a/stackexchange/README
+++ b/stackexchange/README
@@ -1,11 +1,12 @@
this app's function will be to:
-* install it's own tables <--- done
-* read SE xml dump into DjangoDB <--- done
-* populate osqa database <-- user accounts and Q&A revisions loaded
-* remove SE tables
+* install its own tables (#todo: not yet automated)
+* read SE xml dump into DjangoDB (automated)
+* populate osqa database (automated)
+* remove SE tables (#todo: not done yet)
-Current process to load SE data into OSQA:
+Current process to load SE data into OSQA is:
+==============================================
1) backup database
@@ -36,3 +37,26 @@ Current process to load SE data into OSQA:
if anything doesn't go right - run 'python manage.py flush' and repeat
steps 6 and 7
+
+NOTES:
+============
+
+Here is the load script that I used for testing;
+it assumes that the SE dump has been unzipped inside the tmp directory:
+
+    #!/bin/sh
+    #delete all data
+    python manage.py flush
+    mysql -u osqa -p osqa < sql_scripts/badges.sql
+    python manage.py load_stackexchange tmp
+
+Untested parts are tagged with comments starting with
+#todo:
+
+The test set did not cover all StackExchange usage cases, so the loader
+may break on other data sets.
+
+The job takes some time to run, especially the content revisions and
+votes; these steps could be optimized.
+
+Some of the fringe cases are described in the file stackexchange/ANOMALIES.
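The same driver can also be sketched in Python (an illustration only, assuming
it is run from the project root with DJANGO_SETTINGS_MODULE pointing at the
OSQA settings; the file names are the ones used in the shell script above):

    import subprocess
    from django.core.management import call_command

    #wipe all data, equivalent to 'python manage.py flush' (no confirmation prompt)
    call_command('flush', interactive=False)
    #re-create the badge records that flush removed
    subprocess.check_call('mysql -u osqa -p osqa < sql_scripts/badges.sql', shell=True)
    #run the SE import against the dump unzipped into tmp/
    call_command('load_stackexchange', 'tmp')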
diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py
index 5673b9f0..11b0efc9 100644
--- a/stackexchange/management/commands/load_stackexchange.py
+++ b/stackexchange/management/commands/load_stackexchange.py
@@ -12,6 +12,8 @@ import stackexchange.models as se
from forum.forms import EditUserEmailFeedsForm
from forum.utils.html import sanitize_html
from django.conf import settings
+from django.contrib.auth.models import Message as DjangoMessage
+from django.utils.translation import ugettext as _
#from markdown2 import Markdown
#markdowner = Markdown(html4tags=True)
@@ -55,14 +57,15 @@ class X(object):#
'UpMod':'upvote',
'DownMod':'downvote',
'AcceptedByOriginator':'accept_answer',
- 'Offensive','flag_post',
- 'Favorite','toggle_favorite_question',
+ 'Offensive':'flag_post',
+ 'Favorite':'toggle_favorite_question',
}
#these modes cannot be mixed
+ #only wiki is assumed to be mixable
exclusive_revision_modes = (
- 'initial','edit','lock','unlock',
- 'migrate','close','reopen','merge',
+ 'initial','edit','rollback','lock',
+ 'migrate','close','merge','delete',
)
#badges whose names don't match exactly, but
@@ -84,15 +87,49 @@ class X(object):#
'Rollback Body':'rollback',
'Rollback Tags':'rollback',
'Post Closed':'close',
- 'Post Reopened':'reopen',
+ 'Post Reopened':'close',
'Post Deleted':'delete',
- 'Post Undeleted':'undelete',
+ 'Post Undeleted':'delete',
'Post Locked':'lock',
- 'Post Unlocked':'unlock',
+ 'Post Unlocked':'lock',
'Community Owned':'wiki',
'Post Migrated':'migrate',
'Question Merged':'merge',
}
+
+ close_reason_map = {
+ 1:1,#duplicate
+ 2:2,#off-topic
+ 3:3,#subjective and argumentative
+ 4:4,#not a real question
+ 5:7,#offensive
+ 6:6,#irrelevant or outdated question
+ 7:9,#too localized
+ 10:8,#spam
+ }
+
+ @classmethod
+ def get_message_text(cls, se_m):
+ """try to intelligently translate
+        SE message to OSQA so that it makes sense in
+ our context
+ """
+ #todo: properly translate messages
+ #todo: maybe work through more instances of messages
+ if se_m.message_type.name == 'Badge Notification':
+ return se_m.text
+ else:
+ if 'you are now an administrator' in se_m.text:
+ return _('Congratulations, you are now an Administrator')
+ elif re.search(r'^You have \d+ new',se_m.text):
+ bits = se_m.text.split('.')
+ text = bits[0]
+ if se_m.user.id == -1:
+ return None
+ url = cls.get_user(se_m.user).get_profile_url()
+ return '<a href="%s?sort=responses">%s</a>' % (url,text)
+ return None
+
@classmethod
def get_post(cls, se_post):
#todo: fix this hack - either in-memory id association table
@@ -106,6 +143,12 @@ class X(object):#
raise Exception('unknown post type %s' % post_type)
@classmethod
+ def get_close_reason(cls, se_reason):
+ #todo: this is a guess - have not seen real data
+ se_reason = int(se_reason)
+ return cls.close_reason_map[se_reason]
+
+ @classmethod
def get_user(cls, se_user):
#todo: same as get_post
return osqa.User.objects.get(id=USER[se_user.id].id)
@@ -236,19 +279,76 @@ class Command(BaseCommand):
table_name = self.get_table_name(xml)
self.load_xml_file(xml_path, table_name)
+ #this is important so that when we clean up messages
+ #automatically generated by the procedures below
+ #we do not delete old messages
+ #todo: unfortunately this may need to be redone
+ #when we upgrade to django 1.2 and definitely by 1.4 when
+ #the current message system will be replaced with the
+ #django messages framework
+ self.save_osqa_message_id_list()
+
#transfer data into OSQA tables
+ print 'Transferring users...',
+ sys.stdout.flush()
self.transfer_users()
+ print 'done.'
+ print 'Transferring content edits...',
+ sys.stdout.flush()
self.transfer_question_and_answer_activity()
+ print 'done.'
+ print 'Transferring view counts...',
+ sys.stdout.flush()
self.transfer_question_view_counts()
+ print 'done.'
+ print 'Transferring comments...',
+ sys.stdout.flush()
self.transfer_comments()
+ print 'done.'
+ print 'Transferring badges and badge awards...',
+ sys.stdout.flush()
self.transfer_badges()
- self.transfer_votes()
- self.transfer_favorites()
- self.transfer_tag_preferences()
+ print 'done.'
+ print 'Transferring votes...',
+ sys.stdout.flush()
+ self.transfer_votes()#includes favorites, accepts and flags
+ print 'done.'
+
+ self.cleanup_messages()#delete autogenerated messages
+ self.transfer_messages()
+
+        #todo: it is not yet clear how to handle these
self.transfer_update_subscriptions()
- self.transfer_flags()
+ self.transfer_tag_preferences()
self.transfer_meta_pages()
+ def save_osqa_message_id_list(self):
+        id_list = list(DjangoMessage.objects.all().values_list('id', flat=True))
+ self._osqa_message_id_list = id_list
+
+ def cleanup_messages(self):
+ """deletes messages generated by the load process
+ """
+ id_list = self._osqa_message_id_list
+ mset = DjangoMessage.objects.all().exclude(id__in=id_list)
+ mset.delete()
+
+ def transfer_messages(self):
+ """transfers some messages from
+ SE to OSQA
+ """
+ for m in se.Message.objects.all():
+ if m.is_read:
+ continue
+ if m.user.id == -1:
+ continue
+ u = X.get_user(m.user)
+ text = X.get_message_text(m)
+ if text:
+ u.message_set.create(
+ message=text,
+ )
+
def _process_post_initial_revision_group(self, rev_group):
title = None
@@ -298,7 +398,7 @@ class Command(BaseCommand):
def _process_post_edit_revision_group(self, rev_group):
#question apply edit
- (title, text, tags, wiki) = (None, None, None, False)
+ (title, text, tags) = (None, None, None)
for rev in rev_group:
rev_type = rev.post_history_type.name
if rev_type == 'Edit Title':
@@ -308,7 +408,7 @@ class Command(BaseCommand):
elif rev_type == 'Edit Tags':
tags = X.clean_tags(rev.text)
elif rev_type == 'Community Owned':
- wiki = True
+ pass
else:
raise Exception('unexpected revision type %s' % rev_type)
@@ -327,39 +427,141 @@ class Command(BaseCommand):
text = text,
comment = comment,
tags = tags,
- wiki = wiki
)
elif post_type == 'Answer':
a = ANSWER[rev0.post.id]
- #todo: wiki will probably be lost here
a.apply_edit(
edited_at = edited_at,
edited_by = edited_by,
text = text,
comment = comment,
- wiki = wiki
)
- def _process_post_action_revision_group(self, rev_group):
- #this is odd - there were no edit actions like these
- #closed, reopened, etc in homeschoolers sample
- print 'Warning: these content revisions were not processed'
- print 'please give us your sample and we will write code to import it'
- print ';'.join([rev.post_history_type.name for rev in rev_group])
+ def _make_post_wiki(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post_history_type.name == 'Community Owned':
+ p = X.get_post(rev.post)
+ u = X.get_user(rev.user)
+ t = rev.creation_date
+ p.wiki = True
+ p.wikified_at = t
+ p.wikified_by = u
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def mark_activity(self,p,u,t):
+ """p,u,t - post, user, timestamp
+ """
+ if isinstance(p, osqa.Question):
+ p.last_activity_by = u
+ p.last_activity_at = t
+ elif isinstance(p, osqa.Answer):
+ p.question.last_activity_by = u
+ p.question.last_activity_at = t
+ p.question.save()
+
+ def _process_post_rollback_revision_group(self, rev_group):
+ #todo: don't know what to do here as there were no
+ #such data available
+ pass
+
+ def _process_post_lock_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('ocked'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Locked':
+ p.locked = True
+ p.locked_by = u
+ p.locked_at = t
+ elif rev_type == 'Post Unlocked':
+ p.locked = False
+ p.locked_by = None
+ p.locked_at = None
+ else:
+ return
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_close_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post.post_type.name != 'Question':
+ return
+ rev_type = rev.post_history_type.name
+ if rev_type in ('Post Closed', 'Post Reopened'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Closed':
+ p.closed = True
+ p.closed_at = t
+ p.closed_by = u
+ p.close_reason = X.get_close_reason(rev.text)
+ elif rev_type == 'Post Reopened':
+ p.closed = False
+ p.closed_at = None
+ p.closed_by = None
+ p.close_reason = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_delete_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('eleted'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Deleted':
+ p.deleted = True
+ p.deleted_at = t
+ p.deleted_by = u
+ elif rev_type == 'Post Undeleted':
+ p.deleted = False
+ p.deleted_at = None
+ p.deleted_by = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
def _process_post_revision_group(self, rev_group):
#determine revision type
+ #'initial','edit','rollback','lock',
+ #'migrate','close','merge','delete',
rev_types = X.get_post_revision_group_types(rev_group)
- #initial,edit,lock,unlock,
- #migrate,close,reopen,merge,wiki
if 'initial' in rev_types:
self._process_post_initial_revision_group(rev_group)
elif 'edit' in rev_types:
self._process_post_edit_revision_group(rev_group)
+ elif 'rollback' in rev_types:
+ self._process_post_rollback_revision_group(rev_group)
+ elif 'lock' in rev_types:
+ self._process_post_lock_revision_group(rev_group)
+ elif 'close' in rev_types:
+ self._process_post_close_revision_group(rev_group)
+ elif 'delete' in rev_types:
+ self._process_post_delete_revision_group(rev_group)
else:
- self._process_post_action_revision_group(rev_group)
+ pass
+ #todo: rollback, lock, close and delete are
+ #not tested
+ #merge and migrate actions are ignored
+ #wiki is mixable with other groups, so process it in addition
+ if 'wiki' in rev_types:
+ self._make_post_wiki(rev_group)
def transfer_tag_preferences(self):
+ #todo: figure out where these are stored in SE
+ #maybe in se.User.preferences_raw?
pass
def transfer_question_and_answer_activity(self):
@@ -384,6 +586,8 @@ class Command(BaseCommand):
c_group = []
c_group.append(se_rev)
c_guid = se_rev.revision_guid
+ if len(c_group) != 0:
+ self._process_post_revision_group(c_group)
def transfer_comments(self):
for se_c in se.PostComment.objects.all():
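For context, the two added lines above complete a grouping loop: revisions
sharing a revision_guid are accumulated in c_group and each finished group is
handed to _process_post_revision_group; without the trailing check the last
group would be dropped. A rough reconstruction of that loop (the queryset, the
se.PostHistory model name and the ordering are assumptions; only the
c_group/c_guid names and the trailing flush come from the hunk):

    def transfer_question_and_answer_activity(self):
        c_group = []
        c_guid = None
        #assumed revision source; the actual model and ordering are not shown in the hunk
        for se_rev in se.PostHistory.objects.all().order_by('creation_date'):
            if se_rev.revision_guid != c_guid:
                if len(c_group) != 0:
                    self._process_post_revision_group(c_group)
                c_group = []
            c_group.append(se_rev)
            c_guid = se_rev.revision_guid
        #the fix added in this commit: flush the final group as well
        if len(c_group) != 0:
            self._process_post_revision_group(c_group)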
@@ -477,18 +681,14 @@ class Command(BaseCommand):
u = X.get_user(v.user)
p = X.get_post(v.post)
m = X.vote_actions[vote_type]
- vote_method = getattr(osqa.User, m['on'])
+ vote_method = getattr(osqa.User, m)
vote_method(u, p, timestamp = v.creation_date)
if v.deletion_date:
vote_method(u, p, timestamp = v.deletion_date, cancel=True)
- def transfer_favorites(self):
- pass
-
def transfer_update_subscriptions(self):
- pass
-
- def transfer_flags(self):
+ #todo: not clear where this is stored in SE
+ #maybe in se.User.preferences_raw?
pass
def transfer_meta_pages(self):
@@ -548,7 +748,7 @@ class Command(BaseCommand):
if se_u.open_id is None and se_u.email is None:
            print 'Warning: SE user %d is not recoverable (no email or openid)' % se_u.id
- u.reputation = se_u.reputation
+        u.reputation = 1 #not se_u.reputation - reputation is re-computed during the load
u.last_seen = se_u.last_access_date
u.email = X.get_email(se_u.email)
u.location = X.blankable(se_u.location)