diff options
Diffstat (limited to 'stackexchange/management/commands/load_stackexchange.py')
-rw-r--r-- | stackexchange/management/commands/load_stackexchange.py | 804 |
1 files changed, 804 insertions, 0 deletions
diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py new file mode 100644 index 00000000..afe4b9ea --- /dev/null +++ b/stackexchange/management/commands/load_stackexchange.py @@ -0,0 +1,804 @@ +from django.core.management.base import BaseCommand +from django.template.defaultfilters import slugify #todo: adopt unicode-aware slugify +#todo: http://stackoverflow.com/questions/837828/how-to-use-a-slug-in-django +import os +import re +import sys +import stackexchange.parse_models as se_parser +from xml.etree import ElementTree as et +from django.db import models +import forum.models as osqa +import stackexchange.models as se +from forum.forms import EditUserEmailFeedsForm +from forum.utils.html import sanitize_html +from django.conf import settings +from django.contrib.auth.models import Message as DjangoMessage +from django.utils.translation import ugettext as _ +#from markdown2 import Markdown +#markdowner = Markdown(html4tags=True) + +xml_read_order = ( + 'VoteTypes','UserTypes','Users','Users2Votes', + 'Badges','Users2Badges','CloseReasons','FlatPages', + 'MessageTypes','PostHistoryTypes','PostTypes','SchemaVersion', + 'Settings','SystemMessages','ThemeResources','ThemeTextResources', + 'ThrottleBucket','UserHistoryTypes','UserHistory', + 'Users2Badges','VoteTypes','Users2Votes','MessageTypes', + 'Posts','Posts2Votes','PostHistory','PostComments', + 'ModeratorMessages','Messages','Comments2Votes', + ) + +#association tables SE item id --> OSQA item id +#table associations are implied +#todo: there is an issue that these may be inconsistent with the database +USER = {}#SE User.id --> django(OSQA) User.id +QUESTION = {} +ANSWER = {} +NAMESAKE_COUNT = {}# number of times user name was used - for X.get_screen_name + +class X(object):# + """class with methods for handling some details + of SE --> OSQA mapping + """ + badge_type_map = {'1':'gold','2':'silver','3':'bronze'} + + osqa_supported_id_providers = ( + 'google','yahoo','aol','myopenid', + 'flickr','technorati', + 'wordpress','blogger','livejournal', + 'claimid','vidoop','verisign', + 'openidurl','facebook','local', + 'twitter' #oauth is not on this list, b/c it has no own url + ) + + #map SE VoteType -> osqa.User vote method + #created methods with the same call structure in osqa.User + #User.<vote_method>(post, timestamp=None, cancel=False) + vote_actions = { + 'UpMod':'upvote', + 'DownMod':'downvote', + 'AcceptedByOriginator':'accept_answer', + 'Offensive':'flag_post', + 'Favorite':'toggle_favorite_question', + } + + #these modes cannot be mixed + #only wiki is assumed to be mixable + exclusive_revision_modes = ( + 'initial','edit','rollback','lock', + 'migrate','close','merge','delete', + ) + + #badges whose names don't match exactly, but + #present in both SE and OSQA + badge_exceptions = {# SE --> OSQA + 'Citizen Patrol':'Citizen patrol',#single #todo: why sentence case? + 'Strunk & White':'Strunk & White',#single + 'Civic Duty':'Civic duty', + } + + revision_type_map = { + 'Initial Title':'initial', + 'Initial Body':'initial', + 'Initial Tags':'initial', + 'Edit Title':'edit', + 'Edit Body':'edit', + 'Edit Tags':'edit', + 'Rollback Title':'rollback', + 'Rollback Body':'rollback', + 'Rollback Tags':'rollback', + 'Post Closed':'close', + 'Post Reopened':'close', + 'Post Deleted':'delete', + 'Post Undeleted':'delete', + 'Post Locked':'lock', + 'Post Unlocked':'lock', + 'Community Owned':'wiki', + 'Post Migrated':'migrate', + 'Question Merged':'merge', + } + + close_reason_map = { + 1:1,#duplicate + 2:2,#off-topic + 3:3,#subjective and argumentative + 4:4,#not a real question + 5:7,#offensive + 6:6,#irrelevant or outdated question + 7:9,#too localized + 10:8,#spam + } + + @classmethod + def get_message_text(cls, se_m): + """try to intelligently translate + SE message to OSQA so that it makese sense in + our context + """ + #todo: properly translate messages + #todo: maybe work through more instances of messages + if se_m.message_type.name == 'Badge Notification': + return se_m.text + else: + if 'you are now an administrator' in se_m.text: + return _('Congratulations, you are now an Administrator') + elif re.search(r'^You have \d+ new',se_m.text): + bits = se_m.text.split('.') + text = bits[0] + if se_m.user.id == -1: + return None + url = cls.get_user(se_m.user).get_profile_url() + return '<a href="%s?sort=responses">%s</a>' % (url,text) + return None + + @classmethod + def get_post(cls, se_post): + #todo: fix this hack - either in-memory id association table + #or use database to store these associations + post_type = se_post.post_type.name + if post_type == 'Question': + return osqa.Question.objects.get(id=QUESTION[se_post.id].id) + elif post_type == 'Answer': + return osqa.Answer.objects.get(id=ANSWER[se_post.id].id) + else: + raise Exception('unknown post type %s' % post_type) + + @classmethod + def get_close_reason(cls, se_reason): + #todo: this is a guess - have not seen real data + se_reason = int(se_reason) + return cls.close_reason_map[se_reason] + + @classmethod + def get_user(cls, se_user): + #todo: same as get_post + return osqa.User.objects.get(id=USER[se_user.id].id) + + @classmethod + def get_post_revision_group_types(cls, rev_group): + rev_types = {} + for rev in rev_group: + rev_type = cls.get_post_revision_type(rev) + rev_types[rev_type] = 1 + rev_types = rev_types.keys() + + #make sure that exclusive rev modes are not mixed + exclusive = cls.exclusive_revision_modes + if len(rev_types) > 1 and all([t in exclusive for t in rev_types]): + tstr = ','.join(rev_types) + gstr = ','.join([str(rev.id) for rev in rev_group]) + msg = 'incompatible revision types %s in PostHistory %s' % (tstr,gstr) + raise Exception(msg) + return rev_types + + @classmethod + def clean_tags(cls, tags): + tags = re.subn(r'\s+',' ',tags.strip())[0] + bits = tags.split(' ') + tags = ' '.join([bit[1:-1] for bit in bits]) + tags = re.subn(r'\xf6','-',tags)[0] + return tags + + @classmethod + def get_screen_name(cls, name): + """always returns unique screen name + even if there are multiple users in SE + with the same exact screen name + """ + if name is None: + name = 'anonymous' + name = name.strip() + name = re.subn(r'\s+',' ',name)[0]#remove repeating spaces + + try: + u = osqa.User.objects.get(username = name) + try: + if u.location: + name += ', %s' % u.location + if name in NAMESAKE_COUNT: + NAMESAKE_COUNT[name] += 1 + name += ' %d' % NAMESAKE_COUNT[name] + else: + NAMESAKE_COUNT[name] = 1 + except osqa.User.DoesNotExist: + pass + except osqa.User.DoesNotExist: + NAMESAKE_COUNT[name] = 1 + return name + + @classmethod + def get_email(cls, email):#todo: fix fringe case - user did not give email! + if email is None: + return settings.ANONYMOUS_USER_EMAIL + else: + assert(email != '') + return email + + @classmethod + def get_post_revision_type(cls, rev): + rev_name = rev.post_history_type.name + rev_type = cls.revision_type_map.get(rev_name, None) + if rev_type is None: + raise Exception('dont understand revision type %s' % rev) + return rev_type + + #crude method of getting id provider name from the url + @classmethod + def get_openid_provider_name(cls, openid_url): + openid_str = str(openid_url) + bits = openid_str.split('/') + base_url = bits[2] #assume this is base url + url_bits = base_url.split('.') + provider_name = url_bits[-2].lower() + if provider_name not in cls.osqa_supported_id_providers: + raise Exception('could not determine login provider for %s' % openid_url) + return provider_name + + @staticmethod + def blankable(input): + if input is None: + return '' + else: + return input + + @classmethod + def parse_badge_summary(cls, badge_summary): + (gold,silver,bronze) = (0,0,0) + if badge_summary: + if len(badge_summary) > 3: + print 'warning: guessing that badge summary is comma separated' + print 'have %s' % badge_summary + bits = badge_summary.split(',') + else: + bits = [badge_summary] + for bit in bits: + m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit) + if not m: + raise Exception('could not parse badge summary: %s' % badge_summary) + else: + badge_type = cls.badge_type_map[m.groupdict()['type']] + locals()[badge_type] = int(m.groupdict()['count']) + return (gold,silver,bronze) + + @classmethod + def get_badge_name(cls, name): + return cls.badge_exceptions.get(name, name) + +class Command(BaseCommand): + help = 'Loads StackExchange data from unzipped directory of XML files into the OSQA database' + args = 'se_dump_dir' + + def handle(self, *arg, **kwarg): + if len(arg) < 1 or not os.path.isdir(arg[0]): + print 'Error: first argument must be a directory with all the SE *.xml files' + sys.exit(1) + + self.dump_path = arg[0] + #read the data into SE tables + for xml in xml_read_order: + xml_path = self.get_xml_path(xml) + table_name = self.get_table_name(xml) + self.load_xml_file(xml_path, table_name) + + #this is important so that when we clean up messages + #automatically generated by the procedures below + #we do not delete old messages + #todo: unfortunately this may need to be redone + #when we upgrade to django 1.2 and definitely by 1.4 when + #the current message system will be replaced with the + #django messages framework + self.save_osqa_message_id_list() + + #transfer data into OSQA tables + print 'Transferring users...', + sys.stdout.flush() + self.transfer_users() + print 'done.' + print 'Transferring content edits...', + sys.stdout.flush() + self.transfer_question_and_answer_activity() + print 'done.' + print 'Transferring view counts...', + sys.stdout.flush() + self.transfer_question_view_counts() + print 'done.' + print 'Transferring comments...', + sys.stdout.flush() + self.transfer_comments() + print 'done.' + print 'Transferring badges and badge awards...', + sys.stdout.flush() + self.transfer_badges() + print 'done.' + print 'Transferring votes...', + sys.stdout.flush() + self.transfer_votes()#includes favorites, accepts and flags + print 'done.' + + self.cleanup_messages()#delete autogenerated messages + self.transfer_messages() + + #todo: these are not clear how to go about + self.transfer_update_subscriptions() + self.transfer_tag_preferences() + self.transfer_meta_pages() + + def save_osqa_message_id_list(self): + id_list = list(DjangoMessage.objects.all().values('id')) + self._osqa_message_id_list = id_list + + def cleanup_messages(self): + """deletes messages generated by the load process + """ + id_list = self._osqa_message_id_list + mset = DjangoMessage.objects.all().exclude(id__in=id_list) + mset.delete() + + def transfer_messages(self): + """transfers some messages from + SE to OSQA + """ + for m in se.Message.objects.all(): + if m.is_read: + continue + if m.user.id == -1: + continue + u = X.get_user(m.user) + text = X.get_message_text(m) + if text: + u.message_set.create( + message=text, + ) + + def _process_post_initial_revision_group(self, rev_group): + + title = None + text = None + tags = None + wiki = False + author = USER[rev_group[0].user.id] + added_at = rev_group[0].creation_date + + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type == 'Initial Title': + title = rev.text + elif rev_type == 'Initial Body': + text = rev.text + elif rev_type == 'Initial Tags': + tags = X.clean_tags(rev.text) + elif rev_type == 'Community Owned': + wiki = True + else: + raise Exception('unexpected revision type %s' % rev_type) + + post_type = rev_group[0].post.post_type.name + if post_type == 'Question': + q = osqa.Question.objects.create_new( + title = title, + author = author, + added_at = added_at, + wiki = wiki, + tagnames = tags, + text = text + ) + QUESTION[rev_group[0].post.id] = q + elif post_type == 'Answer': + q = X.get_post(rev_group[0].post.parent) + a = osqa.Answer.objects.create_new( + question = q, + author = author, + added_at = added_at, + wiki = wiki, + text = text, + ) + ANSWER[rev_group[0].post.id] = a + else: + post_id = rev_group[0].post.id + raise Exception('unknown post type %s for id=%d' % (post_type, post_id)) + + def _process_post_edit_revision_group(self, rev_group): + #question apply edit + (title, text, tags) = (None, None, None) + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type == 'Edit Title': + title = rev.text + elif rev_type == 'Edit Body': + text = rev.text + elif rev_type == 'Edit Tags': + tags = X.clean_tags(rev.text) + elif rev_type == 'Community Owned': + pass + else: + raise Exception('unexpected revision type %s' % rev_type) + + rev0 = rev_group[0] + edited_by = USER[rev0.user.id] + edited_at = rev0.creation_date + comment = ';'.join([rev.comment for rev in rev_group if rev.comment]) + post_type = rev0.post.post_type.name + + if post_type == 'Question': + q = X.get_post(rev0.post) + q.apply_edit( + edited_at = edited_at, + edited_by = edited_by, + title = title, + text = text, + comment = comment, + tags = tags, + ) + elif post_type == 'Answer': + a = X.get_post(rev0.post) + a.apply_edit( + edited_at = edited_at, + edited_by = edited_by, + text = text, + comment = comment, + ) + + def _make_post_wiki(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post_history_type.name == 'Community Owned': + p = X.get_post(rev.post) + u = X.get_user(rev.user) + t = rev.creation_date + p.wiki = True + p.wikified_at = t + p.wikified_by = u + self.mark_activity(p,u,t) + p.save() + return + + def mark_activity(self,p,u,t): + """p,u,t - post, user, timestamp + """ + if isinstance(p, osqa.Question): + p.last_activity_by = u + p.last_activity_at = t + elif isinstance(p, osqa.Answer): + p.question.last_activity_by = u + p.question.last_activity_at = t + p.question.save() + + def _process_post_rollback_revision_group(self, rev_group): + #todo: don't know what to do here as there were no + #such data available + pass + + def _process_post_lock_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('ocked'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Locked': + p.locked = True + p.locked_by = u + p.locked_at = t + elif rev_type == 'Post Unlocked': + p.locked = False + p.locked_by = None + p.locked_at = None + else: + return + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_close_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post.post_type.name != 'Question': + return + rev_type = rev.post_history_type.name + if rev_type in ('Post Closed', 'Post Reopened'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Closed': + p.closed = True + p.closed_at = t + p.closed_by = u + p.close_reason = X.get_close_reason(rev.text) + elif rev_type == 'Post Reopened': + p.closed = False + p.closed_at = None + p.closed_by = None + p.close_reason = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_delete_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('eleted'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Deleted': + p.deleted = True + p.deleted_at = t + p.deleted_by = u + elif rev_type == 'Post Undeleted': + p.deleted = False + p.deleted_at = None + p.deleted_by = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_revision_group(self, rev_group): + #determine revision type + #'initial','edit','rollback','lock', + #'migrate','close','merge','delete', + rev_types = X.get_post_revision_group_types(rev_group) + if 'initial' in rev_types: + self._process_post_initial_revision_group(rev_group) + elif 'edit' in rev_types: + self._process_post_edit_revision_group(rev_group) + elif 'rollback' in rev_types: + self._process_post_rollback_revision_group(rev_group) + elif 'lock' in rev_types: + self._process_post_lock_revision_group(rev_group) + elif 'close' in rev_types: + self._process_post_close_revision_group(rev_group) + elif 'delete' in rev_types: + self._process_post_delete_revision_group(rev_group) + else: + pass + #todo: rollback, lock, close and delete are + #not tested + #merge and migrate actions are ignored + #wiki is mixable with other groups, so process it in addition + if 'wiki' in rev_types: + self._make_post_wiki(rev_group) + + def transfer_tag_preferences(self): + #todo: figure out where these are stored in SE + #maybe in se.User.preferences_raw? + pass + + def transfer_question_and_answer_activity(self): + """transfers all question and answer + edits and related status changes + """ + #assuming that there are only two post types + se_revs = se.PostHistory.objects.all() + #assuming that chronologial order is correct and there + #will be no problems of data integrity upon insertion of records + se_revs = se_revs.order_by('creation_date','revision_guid') + #todo: ignored fringe case - no revisions + c_guid = se_revs[0].revision_guid + c_group = [] + #this loop groups revisions by revision id, then calls process function + #for the revision grup (elementary revisions posted at once) + for se_rev in se_revs: + if se_rev.revision_guid == c_guid: + c_group.append(se_rev) + else: + self._process_post_revision_group(c_group) + c_group = [] + c_group.append(se_rev) + c_guid = se_rev.revision_guid + if len(c_group) != 0: + self._process_post_revision_group(c_group) + + def transfer_comments(self): + for se_c in se.PostComment.objects.all(): + if se_c.deletion_date: + print 'Warning deleted comment %d dropped' % se_c.id + continue + se_post = se_c.post + if se_post.post_type.name == 'Question': + osqa_post = QUESTION[se_post.id] + elif se_post.post_type.name == 'Answer': + osqa_post = ANSWER[se_post.id] + + osqa_post.add_comment( + comment = se_c.text, + added_at = se_c.creation_date, + user = USER[se_c.user.id] + ) + + def _install_missing_badges(self): + self._missing_badges = {} + for se_b in se.Badge.objects.all(): + name = X.get_badge_name(se_b.name) + try: + osqa.Badge.objects.get(name=name) + except: + self._missing_badges[name] = 0 + if len(se_b.description) > 300: + print 'Warning truncated description for badge %d' % se_b.id + osqa.Badge.objects.create( + name = name, + slug = slugify(name), + description = se_b.description, + multiple = (not se_b.single), + type = se_b.class_type + ) + + def _award_badges(self): + #note: SE does not keep information on + #content-related badges like osqa does + for se_a in se.User2Badge.objects.all(): + if se_a.user.id == -1: + continue #skip community user + u = USER[se_a.user.id] + badge_name = X.get_badge_name(se_a.badge.name) + b = osqa.Badge.objects.get(name=badge_name) + if b.multiple == False: + if b.award_badge.count() > 0: + continue + #todo: fake content object here b/c SE does not support this + #todo: but osqa requires related content object + osqa.Award.objects.create( + user=u, + badge=b, + awarded_at=se_a.date, + content_object=u, + ) + if b.name in self._missing_badges: + self._missing_badges[b.name] += 1 + + def _cleanup_badges(self): + d = self._missing_badges + unused = [name for name in d.keys() if d[name] == 0] + osqa.Badge.objects.filter(name__in=unused).delete() + installed = [name for name in d.keys() if d[name] > 0] + print 'Warning - following unsupported badges were installed:' + print ', '.join(installed) + + def transfer_badges(self): + #note: badge level is neglected + #1) install missing badges + self._install_missing_badges() + #2) award badges + self._award_badges() + #3) delete unused newly installed badges + self._cleanup_badges() + pass + + def transfer_question_view_counts(self): + for se_q in se.Post.objects.filter(post_type__name='Question'): + q = X.get_post(se_q) + q.view_count = se_q.view_count + q.save() + + + def transfer_votes(self): + for v in se.Post2Vote.objects.all(): + vote_type = v.vote_type.name + if not vote_type in X.vote_actions: + continue + + u = X.get_user(v.user) + p = X.get_post(v.post) + m = X.vote_actions[vote_type] + vote_method = getattr(osqa.User, m) + vote_method(u, p, timestamp = v.creation_date) + if v.deletion_date: + vote_method(u, p, timestamp = v.deletion_date, cancel=True) + + def transfer_update_subscriptions(self): + #todo: not clear where this is stored in SE + #maybe in se.User.preferences_raw? + pass + + def transfer_meta_pages(self): + #here we actually don't have anything in the database yet + #so we can't do this + pass + + def load_xml_file(self, xml_path, table_name): + tree = et.parse(xml_path) + print 'loading from %s to %s' % (xml_path, table_name) , + model = models.get_model('stackexchange', table_name) + i = 0 + for row in tree.findall('.//row'): + model_entry = model() + i += 1 + for col in row.getchildren(): + field_name = se_parser.parse_field_name(col.tag) + field_type = model._meta.get_field(field_name) + field_value = se_parser.parse_value(col.text, field_type) + setattr(model_entry, field_name, field_value) + model_entry.save() + print '... %d objects saved' % i + + def get_table_name(self,xml): + return se_parser.get_table_name(xml) + + def get_xml_path(self, xml): + xml_path = os.path.join(self.dump_path, xml + '.xml') + if not os.path.isfile(xml_path): + print 'Error: file %s not found' % xml_path + sys.exit(1) + return xml_path + + def transfer_users(self): + for se_u in se.User.objects.all(): + if se_u.id < 1:#skip the Community user + continue + u = osqa.User() + u_type = se_u.user_type.name + if u_type == 'Administrator': + u.is_superuser = True + elif u_type == 'Moderator': + u.is_staff = True + elif u_type not in ('Unregistered', 'Registered'): + raise Exception('unknown user type %s' % u_type) + + #if user is not registered, no association record created + #we do not allow posting by users who are not authenticated + #probably they'll just have to "recover" their account by email + if u_type != 'Unregistered': + assert(se_u.open_id)#everybody must have open_id + u_auth = osqa.AuthKeyUserAssociation() + u_auth.key = se_u.open_id + u_auth.provider = X.get_openid_provider_name(se_u.open_id) + u_auth.added_at = se_u.creation_date + + if se_u.open_id is None and se_u.email is None: + print 'Warning: SE user %d is not recoverable (no email or openid)' + + u.reputation = 1#se_u.reputation, it's actually re-computed + u.last_seen = se_u.last_access_date + u.email = X.get_email(se_u.email) + u.location = X.blankable(se_u.location) + u.date_of_birth = se_u.birthday #dattime -> date + u.website = X.blankable(se_u.website_url) + u.about = X.blankable(se_u.about_me) + u.last_login = se_u.last_login_date + u.date_joined = se_u.creation_date + u.is_active = True #todo: this may not be the case + + u.username = X.get_screen_name(se_u.display_name) + u.real_name = X.blankable(se_u.real_name) + + (gold,silver,bronze) = X.parse_badge_summary(se_u.badge_summary) + u.gold = gold + u.silver = silver + u.bronze = bronze + + #todo: we don't have these fields + #views - number of profile views? + #has_replies + #has_message + #opt_in_recruit + #last_login_ip + #open_id_alt - ?? + #preferences_raw - not clear how to use + #display_name_cleaned - lowercased, srtipped name + #timed_penalty_date + #phone + + #don't know how to handle these - there was no usage example + #password_id + #guid + + #ignored + #last_email_date - this translates directly to EmailFeedSetting.reported_at + + #save the data + u.save() + form = EditUserEmailFeedsForm() + form.reset() + if se_u.opt_in_email == True:#set up daily subscription on "own" items + form.initial['individually_selected'] = 'd' + form.initial['asked_by_me'] = 'd' + form.initial['answered_by_me'] = 'd' + # + form.save(user=u, save_unbound=True) + + if 'u_auth' in locals(): + u_auth.user = u + u_auth.save() + USER[se_u.id] = u |