author    Evgeny Fadeev <evgeny.fadeev@gmail.com>    2010-03-09 22:05:39 -0500
committer Evgeny Fadeev <evgeny.fadeev@gmail.com>    2010-03-09 22:05:39 -0500
commit    020701bcba397d590d284962f3ce5df3134aaa08 (patch)
tree      d152d879fe3396c6d8591762c8c5284e9dcca56a /stackexchange
parent    c813ea591905e90512f2f04c68d251da3eb77eaa (diff)
SE loader seems to work, details are in stackexchange/README
Diffstat (limited to 'stackexchange')
-rw-r--r--  stackexchange/ANOMALIES                                  |   9
-rw-r--r--  stackexchange/README                                     |  34
-rw-r--r--  stackexchange/management/commands/load_stackexchange.py  | 266
3 files changed, 271 insertions(+), 38 deletions(-)
diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES
index 17ead454..05a7dbdb 100644
--- a/stackexchange/ANOMALIES
+++ b/stackexchange/ANOMALIES
@@ -3,3 +3,12 @@
* users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL)
* users with no screen name
* users with no email and no screen name (25% in homeschool)
+* tag preferences are not stored explicitly (interesting/ignored)
+ maybe they are in se.User.preferences_raw
+ but the data there is not marked up and is kind of cryptic
+* we don't have a "Community" user, while SE has one with id=-1
+  this id may break the load script
+  potential break places are anywhere there is an X.get_user() call
+  issues may happen with larger data sets where activity
+  of the "Community" user is reflected in a way
+  that load_stackexchange does not handle
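A possible guard against the missing Community user (a minimal sketch only, not
part of this patch; it reuses the USER map and the osqa import from
load_stackexchange, and callers would then have to skip records that map to
None, as transfer_messages already does for m.user.id == -1):

    @classmethod
    def get_user(cls, se_user):
        #SE ships a synthetic "Community" user with id=-1 that never
        #ends up in the USER map, so treat it as "no user"
        if se_user.id == -1:
            return None
        return osqa.User.objects.get(id=USER[se_user.id].id)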
diff --git a/stackexchange/README b/stackexchange/README
index b2a39e1c..bad11c9f 100644
--- a/stackexchange/README
+++ b/stackexchange/README
@@ -1,11 +1,12 @@
this app's function will be to:
-* install it's own tables <--- done
-* read SE xml dump into DjangoDB <--- done
-* populate osqa database <-- user accounts and Q&A revisions loaded
-* remove SE tables
+* install its own tables (#todo: not yet automated)
+* read SE xml dump into DjangoDB (automated)
+* populate osqa database (automated)
+* remove SE tables (#todo: not done yet)
-Current process to load SE data into OSQA:
+Current process to load SE data into OSQA is:
+==============================================
1) backup database
@@ -36,3 +37,26 @@ Current process to load SE data into OSQA:
if anything doesn't go right - run 'python manage.py flush' and repeat
steps 6 and 7
+
+NOTES:
+============
+
+Here is the load script that I used for testing;
+it assumes that the SE dump has been unzipped inside the tmp directory:
+
+    #!/bin/sh
+    #delete all data
+    python manage.py flush
+    mysql -u osqa -p osqa < sql_scripts/badges.sql
+    python manage.py load_stackexchange tmp
+
+Untested parts are tagged with comments starting with
+#todo:
+
+The test set did not cover all StackExchange usage cases, so the loader
+may break on other data sets.
+
+The job takes some time to run, especially the content revisions and
+votes; these steps could be optimized.
+
+Some of the fringe cases are described in the file stackexchange/ANOMALIES.
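The same driver can also be sketched in Python (an illustration only, assuming
it is run from the project root with DJANGO_SETTINGS_MODULE pointing at the
OSQA settings; the file names are the ones used in the shell script above):

    import subprocess
    from django.core.management import call_command

    #wipe all data, equivalent to 'python manage.py flush' (no confirmation prompt)
    call_command('flush', interactive=False)
    #re-create the badge records that flush removed
    subprocess.check_call('mysql -u osqa -p osqa < sql_scripts/badges.sql', shell=True)
    #run the SE import against the dump unzipped into tmp/
    call_command('load_stackexchange', 'tmp')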
diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py
index 5673b9f0..11b0efc9 100644
--- a/stackexchange/management/commands/load_stackexchange.py
+++ b/stackexchange/management/commands/load_stackexchange.py
@@ -12,6 +12,8 @@ import stackexchange.models as se
from forum.forms import EditUserEmailFeedsForm
from forum.utils.html import sanitize_html
from django.conf import settings
+from django.contrib.auth.models import Message as DjangoMessage
+from django.utils.translation import ugettext as _
#from markdown2 import Markdown
#markdowner = Markdown(html4tags=True)
@@ -55,14 +57,15 @@ class X(object):#
'UpMod':'upvote',
'DownMod':'downvote',
'AcceptedByOriginator':'accept_answer',
- 'Offensive','flag_post',
- 'Favorite','toggle_favorite_question',
+ 'Offensive':'flag_post',
+ 'Favorite':'toggle_favorite_question',
}
#these modes cannot be mixed
+ #only wiki is assumed to be mixable
exclusive_revision_modes = (
- 'initial','edit','lock','unlock',
- 'migrate','close','reopen','merge',
+ 'initial','edit','rollback','lock',
+ 'migrate','close','merge','delete',
)
#badges whose names don't match exactly, but
@@ -84,15 +87,49 @@ class X(object):#
'Rollback Body':'rollback',
'Rollback Tags':'rollback',
'Post Closed':'close',
- 'Post Reopened':'reopen',
+ 'Post Reopened':'close',
'Post Deleted':'delete',
- 'Post Undeleted':'undelete',
+ 'Post Undeleted':'delete',
'Post Locked':'lock',
- 'Post Unlocked':'unlock',
+ 'Post Unlocked':'lock',
'Community Owned':'wiki',
'Post Migrated':'migrate',
'Question Merged':'merge',
}
+
+ close_reason_map = {
+ 1:1,#duplicate
+ 2:2,#off-topic
+ 3:3,#subjective and argumentative
+ 4:4,#not a real question
+ 5:7,#offensive
+ 6:6,#irrelevant or outdated question
+ 7:9,#too localized
+ 10:8,#spam
+ }
+
+ @classmethod
+ def get_message_text(cls, se_m):
+ """try to intelligently translate
+        SE message to OSQA so that it makes sense in
+ our context
+ """
+ #todo: properly translate messages
+ #todo: maybe work through more instances of messages
+ if se_m.message_type.name == 'Badge Notification':
+ return se_m.text
+ else:
+ if 'you are now an administrator' in se_m.text:
+ return _('Congratulations, you are now an Administrator')
+ elif re.search(r'^You have \d+ new',se_m.text):
+ bits = se_m.text.split('.')
+ text = bits[0]
+ if se_m.user.id == -1:
+ return None
+ url = cls.get_user(se_m.user).get_profile_url()
+ return '<a href="%s?sort=responses">%s</a>' % (url,text)
+ return None
+
@classmethod
def get_post(cls, se_post):
#todo: fix this hack - either in-memory id association table
@@ -106,6 +143,12 @@ class X(object):#
raise Exception('unknown post type %s' % post_type)
@classmethod
+ def get_close_reason(cls, se_reason):
+ #todo: this is a guess - have not seen real data
+ se_reason = int(se_reason)
+ return cls.close_reason_map[se_reason]
+
+ @classmethod
def get_user(cls, se_user):
#todo: same as get_post
return osqa.User.objects.get(id=USER[se_user.id].id)
@@ -236,19 +279,76 @@ class Command(BaseCommand):
table_name = self.get_table_name(xml)
self.load_xml_file(xml_path, table_name)
+ #this is important so that when we clean up messages
+ #automatically generated by the procedures below
+ #we do not delete old messages
+ #todo: unfortunately this may need to be redone
+ #when we upgrade to django 1.2 and definitely by 1.4 when
+ #the current message system will be replaced with the
+ #django messages framework
+ self.save_osqa_message_id_list()
+
#transfer data into OSQA tables
+ print 'Transferring users...',
+ sys.stdout.flush()
self.transfer_users()
+ print 'done.'
+ print 'Transferring content edits...',
+ sys.stdout.flush()
self.transfer_question_and_answer_activity()
+ print 'done.'
+ print 'Transferring view counts...',
+ sys.stdout.flush()
self.transfer_question_view_counts()
+ print 'done.'
+ print 'Transferring comments...',
+ sys.stdout.flush()
self.transfer_comments()
+ print 'done.'
+ print 'Transferring badges and badge awards...',
+ sys.stdout.flush()
self.transfer_badges()
- self.transfer_votes()
- self.transfer_favorites()
- self.transfer_tag_preferences()
+ print 'done.'
+ print 'Transferring votes...',
+ sys.stdout.flush()
+ self.transfer_votes()#includes favorites, accepts and flags
+ print 'done.'
+
+ self.cleanup_messages()#delete autogenerated messages
+ self.transfer_messages()
+
+        #todo: it is not yet clear how to handle these
self.transfer_update_subscriptions()
- self.transfer_flags()
+ self.transfer_tag_preferences()
self.transfer_meta_pages()
+ def save_osqa_message_id_list(self):
+        id_list = list(DjangoMessage.objects.all().values_list('id', flat=True))
+ self._osqa_message_id_list = id_list
+
+ def cleanup_messages(self):
+ """deletes messages generated by the load process
+ """
+ id_list = self._osqa_message_id_list
+ mset = DjangoMessage.objects.all().exclude(id__in=id_list)
+ mset.delete()
+
+ def transfer_messages(self):
+ """transfers some messages from
+ SE to OSQA
+ """
+ for m in se.Message.objects.all():
+ if m.is_read:
+ continue
+ if m.user.id == -1:
+ continue
+ u = X.get_user(m.user)
+ text = X.get_message_text(m)
+ if text:
+ u.message_set.create(
+ message=text,
+ )
+
def _process_post_initial_revision_group(self, rev_group):
title = None
@@ -298,7 +398,7 @@ class Command(BaseCommand):
def _process_post_edit_revision_group(self, rev_group):
#question apply edit
- (title, text, tags, wiki) = (None, None, None, False)
+ (title, text, tags) = (None, None, None)
for rev in rev_group:
rev_type = rev.post_history_type.name
if rev_type == 'Edit Title':
@@ -308,7 +408,7 @@ class Command(BaseCommand):
elif rev_type == 'Edit Tags':
tags = X.clean_tags(rev.text)
elif rev_type == 'Community Owned':
- wiki = True
+ pass
else:
raise Exception('unexpected revision type %s' % rev_type)
@@ -327,39 +427,141 @@ class Command(BaseCommand):
text = text,
comment = comment,
tags = tags,
- wiki = wiki
)
elif post_type == 'Answer':
a = ANSWER[rev0.post.id]
- #todo: wiki will probably be lost here
a.apply_edit(
edited_at = edited_at,
edited_by = edited_by,
text = text,
comment = comment,
- wiki = wiki
)
- def _process_post_action_revision_group(self, rev_group):
- #this is odd - there were no edit actions like these
- #closed, reopened, etc in homeschoolers sample
- print 'Warning: these content revisions were not processed'
- print 'please give us your sample and we will write code to import it'
- print ';'.join([rev.post_history_type.name for rev in rev_group])
+ def _make_post_wiki(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post_history_type.name == 'Community Owned':
+ p = X.get_post(rev.post)
+ u = X.get_user(rev.user)
+ t = rev.creation_date
+ p.wiki = True
+ p.wikified_at = t
+ p.wikified_by = u
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def mark_activity(self,p,u,t):
+ """p,u,t - post, user, timestamp
+ """
+ if isinstance(p, osqa.Question):
+ p.last_activity_by = u
+ p.last_activity_at = t
+ elif isinstance(p, osqa.Answer):
+ p.question.last_activity_by = u
+ p.question.last_activity_at = t
+ p.question.save()
+
+ def _process_post_rollback_revision_group(self, rev_group):
+ #todo: don't know what to do here as there were no
+ #such data available
+ pass
+
+ def _process_post_lock_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('ocked'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Locked':
+ p.locked = True
+ p.locked_by = u
+ p.locked_at = t
+ elif rev_type == 'Post Unlocked':
+ p.locked = False
+ p.locked_by = None
+ p.locked_at = None
+ else:
+ return
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_close_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post.post_type.name != 'Question':
+ return
+ rev_type = rev.post_history_type.name
+ if rev_type in ('Post Closed', 'Post Reopened'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Closed':
+ p.closed = True
+ p.closed_at = t
+ p.closed_by = u
+ p.close_reason = X.get_close_reason(rev.text)
+ elif rev_type == 'Post Reopened':
+ p.closed = False
+ p.closed_at = None
+ p.closed_by = None
+ p.close_reason = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_delete_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('eleted'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Deleted':
+ p.deleted = True
+ p.deleted_at = t
+ p.deleted_by = u
+ elif rev_type == 'Post Undeleted':
+ p.deleted = False
+ p.deleted_at = None
+ p.deleted_by = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
def _process_post_revision_group(self, rev_group):
#determine revision type
+ #'initial','edit','rollback','lock',
+ #'migrate','close','merge','delete',
rev_types = X.get_post_revision_group_types(rev_group)
- #initial,edit,lock,unlock,
- #migrate,close,reopen,merge,wiki
if 'initial' in rev_types:
self._process_post_initial_revision_group(rev_group)
elif 'edit' in rev_types:
self._process_post_edit_revision_group(rev_group)
+ elif 'rollback' in rev_types:
+ self._process_post_rollback_revision_group(rev_group)
+ elif 'lock' in rev_types:
+ self._process_post_lock_revision_group(rev_group)
+ elif 'close' in rev_types:
+ self._process_post_close_revision_group(rev_group)
+ elif 'delete' in rev_types:
+ self._process_post_delete_revision_group(rev_group)
else:
- self._process_post_action_revision_group(rev_group)
+ pass
+ #todo: rollback, lock, close and delete are
+ #not tested
+ #merge and migrate actions are ignored
+ #wiki is mixable with other groups, so process it in addition
+ if 'wiki' in rev_types:
+ self._make_post_wiki(rev_group)
def transfer_tag_preferences(self):
+ #todo: figure out where these are stored in SE
+ #maybe in se.User.preferences_raw?
pass
def transfer_question_and_answer_activity(self):
@@ -384,6 +586,8 @@ class Command(BaseCommand):
c_group = []
c_group.append(se_rev)
c_guid = se_rev.revision_guid
+ if len(c_group) != 0:
+ self._process_post_revision_group(c_group)
def transfer_comments(self):
for se_c in se.PostComment.objects.all():
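For context, the two added lines above complete a grouping loop: revisions
sharing a revision_guid are accumulated in c_group and each finished group is
handed to _process_post_revision_group; without the trailing check the last
group would be dropped. A rough reconstruction of that loop (the queryset, the
se.PostHistory model name and the ordering are assumptions; only the
c_group/c_guid names and the trailing flush come from the hunk):

    def transfer_question_and_answer_activity(self):
        c_group = []
        c_guid = None
        #assumed revision source; the actual model and ordering are not shown in the hunk
        for se_rev in se.PostHistory.objects.all().order_by('creation_date'):
            if se_rev.revision_guid != c_guid:
                if len(c_group) != 0:
                    self._process_post_revision_group(c_group)
                c_group = []
            c_group.append(se_rev)
            c_guid = se_rev.revision_guid
        #the fix added in this commit: flush the final group as well
        if len(c_group) != 0:
            self._process_post_revision_group(c_group)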
@@ -477,18 +681,14 @@ class Command(BaseCommand):
u = X.get_user(v.user)
p = X.get_post(v.post)
m = X.vote_actions[vote_type]
- vote_method = getattr(osqa.User, m['on'])
+ vote_method = getattr(osqa.User, m)
vote_method(u, p, timestamp = v.creation_date)
if v.deletion_date:
vote_method(u, p, timestamp = v.deletion_date, cancel=True)
- def transfer_favorites(self):
- pass
-
def transfer_update_subscriptions(self):
- pass
-
- def transfer_flags(self):
+ #todo: not clear where this is stored in SE
+ #maybe in se.User.preferences_raw?
pass
def transfer_meta_pages(self):
@@ -548,7 +748,7 @@ class Command(BaseCommand):
if se_u.open_id is None and se_u.email is None:
            print 'Warning: SE user %d is not recoverable (no email or openid)' % se_u.id
- u.reputation = se_u.reputation
+        u.reputation = 1 #not se_u.reputation - reputation is re-computed during the load
u.last_seen = se_u.last_access_date
u.email = X.get_email(se_u.email)
u.location = X.blankable(se_u.location)