From 020701bcba397d590d284962f3ce5df3134aaa08 Mon Sep 17 00:00:00 2001 From: Evgeny Fadeev Date: Tue, 9 Mar 2010 22:05:39 -0500 Subject: SE loader seems to work, details are in stackexchange/README --- stackexchange/ANOMALIES | 9 + stackexchange/README | 34 ++- .../management/commands/load_stackexchange.py | 266 ++++++++++++++++++--- 3 files changed, 271 insertions(+), 38 deletions(-) (limited to 'stackexchange') diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES index 17ead454..05a7dbdb 100644 --- a/stackexchange/ANOMALIES +++ b/stackexchange/ANOMALIES @@ -3,3 +3,12 @@ * users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL) * users with no screen name * users with no email and no screen name (25% in homeschool) +* tag preferences are not stored explicitly (interesting/ignored) + maybe they are in se.User.preferences_raw + but the data there is not marked up and is kind of cryptic +* we don't have a Community user. SE has one with id=-1 + this id may break the load script + potential break places are anywhere there is an X.get_user() call + issues may happen with larger data sets where activity + of user "Community" is somehow reflected in a way + that load_stackexchange does not take care of diff --git a/stackexchange/README b/stackexchange/README index b2a39e1c..bad11c9f 100644 --- a/stackexchange/README +++ b/stackexchange/README @@ -1,11 +1,12 @@ this app's function will be to: -* install it's own tables <--- done -* read SE xml dump into DjangoDB <--- done -* populate osqa database <-- user accounts and Q&A revisions loaded -* remove SE tables +* install its own tables (#todo: not yet automated) +* read SE xml dump into DjangoDB (automated) +* populate osqa database (automated) +* remove SE tables (#todo: not done yet) -Current process to load SE data into OSQA: +Current process to load SE data into OSQA is: +============================================== 1) backup database @@ -36,3 +37,26 @@ Current process to load SE data into 
OSQA: if anything doesn't go right - run 'python manage.py flush' and repeat steps 6 and 7 + +NOTES: +============ + +Here is the load script that I used for the testing +it assumes that SE dump has been unzipped inside the tmp directory + + #!/bin/sh + python manage.py flush + #delete all data + mysql -u osqa -p osqa < sql_scripts/badges.sql + python manage.py load_stackexchange tmp + +Untested parts are tagged with comments starting with +#todo: + +The test set did not have all the use cases of StackExchange represented so +it may break with other sets. + +The job takes some time to run, especially +content revisions and votes - may be optimized + +Some of the fringe cases are described in file stackexchange/ANOMALIES diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py index 5673b9f0..11b0efc9 100644 --- a/stackexchange/management/commands/load_stackexchange.py +++ b/stackexchange/management/commands/load_stackexchange.py @@ -12,6 +12,8 @@ import stackexchange.models as se from forum.forms import EditUserEmailFeedsForm from forum.utils.html import sanitize_html from django.conf import settings +from django.contrib.auth.models import Message as DjangoMessage +from django.utils.translation import ugettext as _ #from markdown2 import Markdown #markdowner = Markdown(html4tags=True) @@ -55,14 +57,15 @@ class X(object):# 'UpMod':'upvote', 'DownMod':'downvote', 'AcceptedByOriginator':'accept_answer', - 'Offensive','flag_post', - 'Favorite','toggle_favorite_question', + 'Offensive':'flag_post', + 'Favorite':'toggle_favorite_question', } #these modes cannot be mixed + #only wiki is assumed to be mixable exclusive_revision_modes = ( - 'initial','edit','lock','unlock', - 'migrate','close','reopen','merge', + 'initial','edit','rollback','lock', + 'migrate','close','merge','delete', ) #badges whose names don't match exactly, but @@ -84,15 +87,49 @@ class X(object):# 'Rollback Body':'rollback', 
'Rollback Tags':'rollback', 'Post Closed':'close', - 'Post Reopened':'reopen', + 'Post Reopened':'close', 'Post Deleted':'delete', - 'Post Undeleted':'undelete', + 'Post Undeleted':'delete', 'Post Locked':'lock', - 'Post Unlocked':'unlock', + 'Post Unlocked':'lock', 'Community Owned':'wiki', 'Post Migrated':'migrate', 'Question Merged':'merge', } + + close_reason_map = { + 1:1,#duplicate + 2:2,#off-topic + 3:3,#subjective and argumentative + 4:4,#not a real question + 5:7,#offensive + 6:6,#irrelevant or outdated question + 7:9,#too localized + 10:8,#spam + } + + @classmethod + def get_message_text(cls, se_m): + """try to intelligently translate + SE message to OSQA so that it makes sense in + our context + """ + #todo: properly translate messages + #todo: maybe work through more instances of messages + if se_m.message_type.name == 'Badge Notification': + return se_m.text + else: + if 'you are now an administrator' in se_m.text: + return _('Congratulations, you are now an Administrator') + elif re.search(r'^You have \d+ new',se_m.text): + bits = se_m.text.split('.') + text = bits[0] + if se_m.user.id == -1: + return None + url = cls.get_user(se_m.user).get_profile_url() + return '<a href="%s">%s</a>' % (url,text) + return None + @classmethod def get_post(cls, se_post): #todo: fix this hack - either in-memory id association table @@ -105,6 +142,12 @@ class X(object):# else: raise Exception('unknown post type %s' % post_type) + @classmethod + def get_close_reason(cls, se_reason): + #todo: this is a guess - have not seen real data + se_reason = int(se_reason) + return cls.close_reason_map[se_reason] + @classmethod def get_user(cls, se_user): #todo: same as get_post @@ -236,19 +279,76 @@ class Command(BaseCommand): table_name = self.get_table_name(xml) self.load_xml_file(xml_path, table_name) + #this is important so that when we clean up messages + #automatically generated by the procedures below + #we do not delete old messages + #todo: unfortunately this may need to be redone + #when 
we upgrade to django 1.2 and definitely by 1.4 when + #the current message system will be replaced with the + #django messages framework + self.save_osqa_message_id_list() + #transfer data into OSQA tables + print 'Transferring users...', + sys.stdout.flush() self.transfer_users() + print 'done.' + print 'Transferring content edits...', + sys.stdout.flush() self.transfer_question_and_answer_activity() + print 'done.' + print 'Transferring view counts...', + sys.stdout.flush() self.transfer_question_view_counts() + print 'done.' + print 'Transferring comments...', + sys.stdout.flush() self.transfer_comments() + print 'done.' + print 'Transferring badges and badge awards...', + sys.stdout.flush() self.transfer_badges() - self.transfer_votes() - self.transfer_favorites() - self.transfer_tag_preferences() + print 'done.' + print 'Transferring votes...', + sys.stdout.flush() + self.transfer_votes()#includes favorites, accepts and flags + print 'done.' + + self.cleanup_messages()#delete autogenerated messages + self.transfer_messages() + + #todo: these are not clear how to go about self.transfer_update_subscriptions() - self.transfer_flags() + self.transfer_tag_preferences() self.transfer_meta_pages() + def save_osqa_message_id_list(self): + id_list = list(DjangoMessage.objects.all().values_list('id', flat=True)) + self._osqa_message_id_list = id_list + + def cleanup_messages(self): + """deletes messages generated by the load process + """ + id_list = self._osqa_message_id_list + mset = DjangoMessage.objects.all().exclude(id__in=id_list) + mset.delete() + + def transfer_messages(self): + """transfers some messages from + SE to OSQA + """ + for m in se.Message.objects.all(): + if m.is_read: + continue + if m.user.id == -1: + continue + u = X.get_user(m.user) + text = X.get_message_text(m) + if text: + u.message_set.create( + message=text, + ) + def _process_post_initial_revision_group(self, rev_group): title = None @@ -298,7 +398,7 @@ class Command(BaseCommand): def 
_process_post_edit_revision_group(self, rev_group): #question apply edit - (title, text, tags, wiki) = (None, None, None, False) + (title, text, tags) = (None, None, None) for rev in rev_group: rev_type = rev.post_history_type.name if rev_type == 'Edit Title': @@ -308,7 +408,7 @@ class Command(BaseCommand): elif rev_type == 'Edit Tags': tags = X.clean_tags(rev.text) elif rev_type == 'Community Owned': - wiki = True + pass else: raise Exception('unexpected revision type %s' % rev_type) @@ -327,39 +427,141 @@ class Command(BaseCommand): text = text, comment = comment, tags = tags, - wiki = wiki ) elif post_type == 'Answer': a = ANSWER[rev0.post.id] - #todo: wiki will probably be lost here a.apply_edit( edited_at = edited_at, edited_by = edited_by, text = text, comment = comment, - wiki = wiki ) - def _process_post_action_revision_group(self, rev_group): - #this is odd - there were no edit actions like these - #closed, reopened, etc in homeschoolers sample - print 'Warning: these content revisions were not processed' - print 'please give us your sample and we will write code to import it' - print ';'.join([rev.post_history_type.name for rev in rev_group]) + def _make_post_wiki(self, rev_group): + #todo: untested - marks the post as wiki for a 'Community Owned' revision and records who/when + for rev in rev_group: + if rev.post_history_type.name == 'Community Owned': + p = X.get_post(rev.post) + u = X.get_user(rev.user) + t = rev.creation_date + p.wiki = True + p.wikified_at = t + p.wikified_by = u + self.mark_activity(p,u,t) + p.save() + return + + def mark_activity(self,p,u,t): + """stamps last activity (by u at t) on a question, or on an answer's question which is saved here; p,u,t - post, user, timestamp + """ + if isinstance(p, osqa.Question): + p.last_activity_by = u + p.last_activity_at = t + elif isinstance(p, osqa.Answer): + p.question.last_activity_by = u + p.question.last_activity_at = t + p.question.save() + + def _process_post_rollback_revision_group(self, rev_group): + #todo: don't know what to do here as there were no + #such data available + pass + + def _process_post_lock_revision_group(self, rev_group): + #todo: untested + 
for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('ocked'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Locked': + p.locked = True + p.locked_by = u + p.locked_at = t + elif rev_type == 'Post Unlocked': + p.locked = False + p.locked_by = None + p.locked_at = None + else: + return + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_close_revision_group(self, rev_group): + #todo: untested - returns without changes when the revision's post is not a Question + for rev in rev_group: + if rev.post.post_type.name != 'Question': + return + rev_type = rev.post_history_type.name + if rev_type in ('Post Closed', 'Post Reopened'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Closed': + p.closed = True + p.closed_at = t + p.closed_by = u + p.close_reason = X.get_close_reason(rev.text) + elif rev_type == 'Post Reopened': + p.closed = False + p.closed_at = None + p.closed_by = None + p.close_reason = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_delete_revision_group(self, rev_group): + #todo: untested - sets or clears deleted, deleted_at and deleted_by on the post + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('eleted'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Deleted': + p.deleted = True + p.deleted_at = t + p.deleted_by = u + elif rev_type == 'Post Undeleted': + p.deleted = False + p.deleted_at = None + p.deleted_by = None + self.mark_activity(p,u,t) + p.save() + return def _process_post_revision_group(self, rev_group): #determine revision type + #'initial','edit','rollback','lock', + #'migrate','close','merge','delete', rev_types = X.get_post_revision_group_types(rev_group) - #initial,edit,lock,unlock, - #migrate,close,reopen,merge,wiki if 'initial' in rev_types: self._process_post_initial_revision_group(rev_group) elif 'edit' in rev_types: self._process_post_edit_revision_group(rev_group) + elif 'rollback' in 
rev_types: + self._process_post_rollback_revision_group(rev_group) + elif 'lock' in rev_types: + self._process_post_lock_revision_group(rev_group) + elif 'close' in rev_types: + self._process_post_close_revision_group(rev_group) + elif 'delete' in rev_types: + self._process_post_delete_revision_group(rev_group) else: - self._process_post_action_revision_group(rev_group) + pass + #todo: rollback, lock, close and delete are + #not tested + #merge and migrate actions are ignored + #wiki is mixable with other groups, so process it in addition + if 'wiki' in rev_types: + self._make_post_wiki(rev_group) def transfer_tag_preferences(self): + #todo: figure out where these are stored in SE + #maybe in se.User.preferences_raw? pass def transfer_question_and_answer_activity(self): @@ -384,6 +586,8 @@ class Command(BaseCommand): c_group = [] c_group.append(se_rev) c_guid = se_rev.revision_guid + if c_group: + self._process_post_revision_group(c_group) def transfer_comments(self): for se_c in se.PostComment.objects.all(): @@ -477,18 +681,14 @@ class Command(BaseCommand): u = X.get_user(v.user) p = X.get_post(v.post) m = X.vote_actions[vote_type] - vote_method = getattr(osqa.User, m['on']) + vote_method = getattr(osqa.User, m) vote_method(u, p, timestamp = v.creation_date) if v.deletion_date: vote_method(u, p, timestamp = v.deletion_date, cancel=True) - def transfer_favorites(self): - pass - def transfer_update_subscriptions(self): - pass - - def transfer_flags(self): + #todo: not clear where this is stored in SE + #maybe in se.User.preferences_raw? pass def transfer_meta_pages(self): @@ -548,7 +748,7 @@ class Command(BaseCommand): if se_u.open_id is None and se_u.email is None: print 'Warning: SE user %d is not recoverable (no email or openid)' - u.reputation = se_u.reputation + u.reputation = 1#se_u.reputation, it's actually re-computed u.last_seen = se_u.last_access_date u.email = X.get_email(se_u.email) u.location = X.blankable(se_u.location) -- cgit v1.2.3-1-g7c22