diff options
Diffstat (limited to 'stackexchange')
-rw-r--r-- | stackexchange/ANOMALIES | 14 | ||||
-rw-r--r-- | stackexchange/README | 62 | ||||
-rw-r--r-- | stackexchange/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/load_stackexchange.py | 804 | ||||
-rw-r--r-- | stackexchange/models.py | 266 | ||||
-rw-r--r-- | stackexchange/parse_models.py | 225 |
8 files changed, 1371 insertions, 0 deletions
diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES new file mode 100644 index 00000000..05a7dbdb --- /dev/null +++ b/stackexchange/ANOMALIES @@ -0,0 +1,14 @@ +* several user accounts with same email +* users with no openid +* users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL) +* users with no screen name +* users with no email and no screen name (25% in homeschool) +* tag preferences are not stored explicitly (interesting/ignored) + maybe they are in se.User.preferences_raw + but the data there is not marked up and is kind of cryptic +* we don't have Community user. SE has one with id=-1 + this id may break the load script + potential break places are anywhere where is X.get_user() call + issues may happen with larger data sets where activity + of user "Community" is somehow reflected in a way + that load_stackexchange does not take care of diff --git a/stackexchange/README b/stackexchange/README new file mode 100644 index 00000000..64d8f5fb --- /dev/null +++ b/stackexchange/README @@ -0,0 +1,62 @@ +this app's function will be to: + +* install it's own tables (#todo: not yet automated) +* read SE xml dump into DjangoDB (automated) +* populate osqa database (automated) +* remove SE tables (#todo: not done yet) + +Current process to load SE data into OSQA is: +============================================== + +1) backup database + +2) unzip SE dump into dump_dir (any directory name) + you may want to make sure that your dump directory in .gitignore file + so that you don't publish it by mistake + +3) enable 'stackexchange' in the list of installed apps (probably aready in settings.py) + +4) (optional - create models.py for SE, which is included anyway) run: + + #a) run in-place removal of xml namspace prefix to make parsing easier + perl -pi -w -e 's/xs://g' $SE_DUMP_PATH/xsd/*.xsd + cd stackexchange + python parse_models.py $SE_DUMP_PATH/xsd/*.xsd > models.py + +5) Install stackexchange models (as well as any other missing models) + python manage.py syncdb + +6) make sure that osqa badges are installed + if not, run (example for mysql): + + mysql -u user -p dbname < sql_scripts/badges.sql + +7) load SE data: + + python manage.py load_stackexchange dump_dir + + if anything doesn't go right - run 'python manage.py flush' and repeat + steps 6 and 7 + +NOTES: +============ + +Here is the load script that I used for the testing +it assumes that SE dump has been unzipped inside the tmp directory + + #!/bin/sh$ + python manage.py flush + #delete all data + mysql -u osqa -p osqa < sql_scripts/badges.sql + python manage.py load_stackexchange tmp + +Untested parts are tagged with comments starting with +#todo: + +The test set did not have all the usage cases of StackExchange represented so +it may break with other sets. + +The job takes some time to run, especially +content revisions and votes - may be optimized + +Some of the fringe cases are described in file stackexchange/ANOMALIES diff --git a/stackexchange/__init__.py b/stackexchange/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/stackexchange/__init__.py diff --git a/stackexchange/management/__init__.py b/stackexchange/management/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/stackexchange/management/__init__.py diff --git a/stackexchange/management/commands/__init__.py b/stackexchange/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/stackexchange/management/commands/__init__.py diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py new file mode 100644 index 00000000..afe4b9ea --- /dev/null +++ b/stackexchange/management/commands/load_stackexchange.py @@ -0,0 +1,804 @@ +from django.core.management.base import BaseCommand +from django.template.defaultfilters import slugify #todo: adopt unicode-aware slugify +#todo: http://stackoverflow.com/questions/837828/how-to-use-a-slug-in-django +import os +import re +import sys +import stackexchange.parse_models as se_parser +from xml.etree import ElementTree as et +from django.db import models +import forum.models as osqa +import stackexchange.models as se +from forum.forms import EditUserEmailFeedsForm +from forum.utils.html import sanitize_html +from django.conf import settings +from django.contrib.auth.models import Message as DjangoMessage +from django.utils.translation import ugettext as _ +#from markdown2 import Markdown +#markdowner = Markdown(html4tags=True) + +xml_read_order = ( + 'VoteTypes','UserTypes','Users','Users2Votes', + 'Badges','Users2Badges','CloseReasons','FlatPages', + 'MessageTypes','PostHistoryTypes','PostTypes','SchemaVersion', + 'Settings','SystemMessages','ThemeResources','ThemeTextResources', + 'ThrottleBucket','UserHistoryTypes','UserHistory', + 'Users2Badges','VoteTypes','Users2Votes','MessageTypes', + 'Posts','Posts2Votes','PostHistory','PostComments', + 'ModeratorMessages','Messages','Comments2Votes', + ) + +#association tables SE item id --> OSQA item id +#table associations are implied +#todo: there is an issue that these may be inconsistent with the database +USER = {}#SE User.id --> django(OSQA) User.id +QUESTION = {} +ANSWER = {} +NAMESAKE_COUNT = {}# number of times user name was used - for X.get_screen_name + +class X(object):# + """class with methods for handling some details + of SE --> OSQA mapping + """ + badge_type_map = {'1':'gold','2':'silver','3':'bronze'} + + osqa_supported_id_providers = ( + 'google','yahoo','aol','myopenid', + 'flickr','technorati', + 'wordpress','blogger','livejournal', + 'claimid','vidoop','verisign', + 'openidurl','facebook','local', + 'twitter' #oauth is not on this list, b/c it has no own url + ) + + #map SE VoteType -> osqa.User vote method + #created methods with the same call structure in osqa.User + #User.<vote_method>(post, timestamp=None, cancel=False) + vote_actions = { + 'UpMod':'upvote', + 'DownMod':'downvote', + 'AcceptedByOriginator':'accept_answer', + 'Offensive':'flag_post', + 'Favorite':'toggle_favorite_question', + } + + #these modes cannot be mixed + #only wiki is assumed to be mixable + exclusive_revision_modes = ( + 'initial','edit','rollback','lock', + 'migrate','close','merge','delete', + ) + + #badges whose names don't match exactly, but + #present in both SE and OSQA + badge_exceptions = {# SE --> OSQA + 'Citizen Patrol':'Citizen patrol',#single #todo: why sentence case? + 'Strunk & White':'Strunk & White',#single + 'Civic Duty':'Civic duty', + } + + revision_type_map = { + 'Initial Title':'initial', + 'Initial Body':'initial', + 'Initial Tags':'initial', + 'Edit Title':'edit', + 'Edit Body':'edit', + 'Edit Tags':'edit', + 'Rollback Title':'rollback', + 'Rollback Body':'rollback', + 'Rollback Tags':'rollback', + 'Post Closed':'close', + 'Post Reopened':'close', + 'Post Deleted':'delete', + 'Post Undeleted':'delete', + 'Post Locked':'lock', + 'Post Unlocked':'lock', + 'Community Owned':'wiki', + 'Post Migrated':'migrate', + 'Question Merged':'merge', + } + + close_reason_map = { + 1:1,#duplicate + 2:2,#off-topic + 3:3,#subjective and argumentative + 4:4,#not a real question + 5:7,#offensive + 6:6,#irrelevant or outdated question + 7:9,#too localized + 10:8,#spam + } + + @classmethod + def get_message_text(cls, se_m): + """try to intelligently translate + SE message to OSQA so that it makese sense in + our context + """ + #todo: properly translate messages + #todo: maybe work through more instances of messages + if se_m.message_type.name == 'Badge Notification': + return se_m.text + else: + if 'you are now an administrator' in se_m.text: + return _('Congratulations, you are now an Administrator') + elif re.search(r'^You have \d+ new',se_m.text): + bits = se_m.text.split('.') + text = bits[0] + if se_m.user.id == -1: + return None + url = cls.get_user(se_m.user).get_profile_url() + return '<a href="%s?sort=responses">%s</a>' % (url,text) + return None + + @classmethod + def get_post(cls, se_post): + #todo: fix this hack - either in-memory id association table + #or use database to store these associations + post_type = se_post.post_type.name + if post_type == 'Question': + return osqa.Question.objects.get(id=QUESTION[se_post.id].id) + elif post_type == 'Answer': + return osqa.Answer.objects.get(id=ANSWER[se_post.id].id) + else: + raise Exception('unknown post type %s' % post_type) + + @classmethod + def get_close_reason(cls, se_reason): + #todo: this is a guess - have not seen real data + se_reason = int(se_reason) + return cls.close_reason_map[se_reason] + + @classmethod + def get_user(cls, se_user): + #todo: same as get_post + return osqa.User.objects.get(id=USER[se_user.id].id) + + @classmethod + def get_post_revision_group_types(cls, rev_group): + rev_types = {} + for rev in rev_group: + rev_type = cls.get_post_revision_type(rev) + rev_types[rev_type] = 1 + rev_types = rev_types.keys() + + #make sure that exclusive rev modes are not mixed + exclusive = cls.exclusive_revision_modes + if len(rev_types) > 1 and all([t in exclusive for t in rev_types]): + tstr = ','.join(rev_types) + gstr = ','.join([str(rev.id) for rev in rev_group]) + msg = 'incompatible revision types %s in PostHistory %s' % (tstr,gstr) + raise Exception(msg) + return rev_types + + @classmethod + def clean_tags(cls, tags): + tags = re.subn(r'\s+',' ',tags.strip())[0] + bits = tags.split(' ') + tags = ' '.join([bit[1:-1] for bit in bits]) + tags = re.subn(r'\xf6','-',tags)[0] + return tags + + @classmethod + def get_screen_name(cls, name): + """always returns unique screen name + even if there are multiple users in SE + with the same exact screen name + """ + if name is None: + name = 'anonymous' + name = name.strip() + name = re.subn(r'\s+',' ',name)[0]#remove repeating spaces + + try: + u = osqa.User.objects.get(username = name) + try: + if u.location: + name += ', %s' % u.location + if name in NAMESAKE_COUNT: + NAMESAKE_COUNT[name] += 1 + name += ' %d' % NAMESAKE_COUNT[name] + else: + NAMESAKE_COUNT[name] = 1 + except osqa.User.DoesNotExist: + pass + except osqa.User.DoesNotExist: + NAMESAKE_COUNT[name] = 1 + return name + + @classmethod + def get_email(cls, email):#todo: fix fringe case - user did not give email! + if email is None: + return settings.ANONYMOUS_USER_EMAIL + else: + assert(email != '') + return email + + @classmethod + def get_post_revision_type(cls, rev): + rev_name = rev.post_history_type.name + rev_type = cls.revision_type_map.get(rev_name, None) + if rev_type is None: + raise Exception('dont understand revision type %s' % rev) + return rev_type + + #crude method of getting id provider name from the url + @classmethod + def get_openid_provider_name(cls, openid_url): + openid_str = str(openid_url) + bits = openid_str.split('/') + base_url = bits[2] #assume this is base url + url_bits = base_url.split('.') + provider_name = url_bits[-2].lower() + if provider_name not in cls.osqa_supported_id_providers: + raise Exception('could not determine login provider for %s' % openid_url) + return provider_name + + @staticmethod + def blankable(input): + if input is None: + return '' + else: + return input + + @classmethod + def parse_badge_summary(cls, badge_summary): + (gold,silver,bronze) = (0,0,0) + if badge_summary: + if len(badge_summary) > 3: + print 'warning: guessing that badge summary is comma separated' + print 'have %s' % badge_summary + bits = badge_summary.split(',') + else: + bits = [badge_summary] + for bit in bits: + m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit) + if not m: + raise Exception('could not parse badge summary: %s' % badge_summary) + else: + badge_type = cls.badge_type_map[m.groupdict()['type']] + locals()[badge_type] = int(m.groupdict()['count']) + return (gold,silver,bronze) + + @classmethod + def get_badge_name(cls, name): + return cls.badge_exceptions.get(name, name) + +class Command(BaseCommand): + help = 'Loads StackExchange data from unzipped directory of XML files into the OSQA database' + args = 'se_dump_dir' + + def handle(self, *arg, **kwarg): + if len(arg) < 1 or not os.path.isdir(arg[0]): + print 'Error: first argument must be a directory with all the SE *.xml files' + sys.exit(1) + + self.dump_path = arg[0] + #read the data into SE tables + for xml in xml_read_order: + xml_path = self.get_xml_path(xml) + table_name = self.get_table_name(xml) + self.load_xml_file(xml_path, table_name) + + #this is important so that when we clean up messages + #automatically generated by the procedures below + #we do not delete old messages + #todo: unfortunately this may need to be redone + #when we upgrade to django 1.2 and definitely by 1.4 when + #the current message system will be replaced with the + #django messages framework + self.save_osqa_message_id_list() + + #transfer data into OSQA tables + print 'Transferring users...', + sys.stdout.flush() + self.transfer_users() + print 'done.' + print 'Transferring content edits...', + sys.stdout.flush() + self.transfer_question_and_answer_activity() + print 'done.' + print 'Transferring view counts...', + sys.stdout.flush() + self.transfer_question_view_counts() + print 'done.' + print 'Transferring comments...', + sys.stdout.flush() + self.transfer_comments() + print 'done.' + print 'Transferring badges and badge awards...', + sys.stdout.flush() + self.transfer_badges() + print 'done.' + print 'Transferring votes...', + sys.stdout.flush() + self.transfer_votes()#includes favorites, accepts and flags + print 'done.' + + self.cleanup_messages()#delete autogenerated messages + self.transfer_messages() + + #todo: these are not clear how to go about + self.transfer_update_subscriptions() + self.transfer_tag_preferences() + self.transfer_meta_pages() + + def save_osqa_message_id_list(self): + id_list = list(DjangoMessage.objects.all().values('id')) + self._osqa_message_id_list = id_list + + def cleanup_messages(self): + """deletes messages generated by the load process + """ + id_list = self._osqa_message_id_list + mset = DjangoMessage.objects.all().exclude(id__in=id_list) + mset.delete() + + def transfer_messages(self): + """transfers some messages from + SE to OSQA + """ + for m in se.Message.objects.all(): + if m.is_read: + continue + if m.user.id == -1: + continue + u = X.get_user(m.user) + text = X.get_message_text(m) + if text: + u.message_set.create( + message=text, + ) + + def _process_post_initial_revision_group(self, rev_group): + + title = None + text = None + tags = None + wiki = False + author = USER[rev_group[0].user.id] + added_at = rev_group[0].creation_date + + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type == 'Initial Title': + title = rev.text + elif rev_type == 'Initial Body': + text = rev.text + elif rev_type == 'Initial Tags': + tags = X.clean_tags(rev.text) + elif rev_type == 'Community Owned': + wiki = True + else: + raise Exception('unexpected revision type %s' % rev_type) + + post_type = rev_group[0].post.post_type.name + if post_type == 'Question': + q = osqa.Question.objects.create_new( + title = title, + author = author, + added_at = added_at, + wiki = wiki, + tagnames = tags, + text = text + ) + QUESTION[rev_group[0].post.id] = q + elif post_type == 'Answer': + q = X.get_post(rev_group[0].post.parent) + a = osqa.Answer.objects.create_new( + question = q, + author = author, + added_at = added_at, + wiki = wiki, + text = text, + ) + ANSWER[rev_group[0].post.id] = a + else: + post_id = rev_group[0].post.id + raise Exception('unknown post type %s for id=%d' % (post_type, post_id)) + + def _process_post_edit_revision_group(self, rev_group): + #question apply edit + (title, text, tags) = (None, None, None) + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type == 'Edit Title': + title = rev.text + elif rev_type == 'Edit Body': + text = rev.text + elif rev_type == 'Edit Tags': + tags = X.clean_tags(rev.text) + elif rev_type == 'Community Owned': + pass + else: + raise Exception('unexpected revision type %s' % rev_type) + + rev0 = rev_group[0] + edited_by = USER[rev0.user.id] + edited_at = rev0.creation_date + comment = ';'.join([rev.comment for rev in rev_group if rev.comment]) + post_type = rev0.post.post_type.name + + if post_type == 'Question': + q = X.get_post(rev0.post) + q.apply_edit( + edited_at = edited_at, + edited_by = edited_by, + title = title, + text = text, + comment = comment, + tags = tags, + ) + elif post_type == 'Answer': + a = X.get_post(rev0.post) + a.apply_edit( + edited_at = edited_at, + edited_by = edited_by, + text = text, + comment = comment, + ) + + def _make_post_wiki(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post_history_type.name == 'Community Owned': + p = X.get_post(rev.post) + u = X.get_user(rev.user) + t = rev.creation_date + p.wiki = True + p.wikified_at = t + p.wikified_by = u + self.mark_activity(p,u,t) + p.save() + return + + def mark_activity(self,p,u,t): + """p,u,t - post, user, timestamp + """ + if isinstance(p, osqa.Question): + p.last_activity_by = u + p.last_activity_at = t + elif isinstance(p, osqa.Answer): + p.question.last_activity_by = u + p.question.last_activity_at = t + p.question.save() + + def _process_post_rollback_revision_group(self, rev_group): + #todo: don't know what to do here as there were no + #such data available + pass + + def _process_post_lock_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('ocked'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Locked': + p.locked = True + p.locked_by = u + p.locked_at = t + elif rev_type == 'Post Unlocked': + p.locked = False + p.locked_by = None + p.locked_at = None + else: + return + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_close_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + if rev.post.post_type.name != 'Question': + return + rev_type = rev.post_history_type.name + if rev_type in ('Post Closed', 'Post Reopened'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Closed': + p.closed = True + p.closed_at = t + p.closed_by = u + p.close_reason = X.get_close_reason(rev.text) + elif rev_type == 'Post Reopened': + p.closed = False + p.closed_at = None + p.closed_by = None + p.close_reason = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_delete_revision_group(self, rev_group): + #todo: untested + for rev in rev_group: + rev_type = rev.post_history_type.name + if rev_type.endswith('eleted'): + t = rev.creation_date + u = X.get_user(rev.user) + p = X.get_post(rev.post) + if rev_type == 'Post Deleted': + p.deleted = True + p.deleted_at = t + p.deleted_by = u + elif rev_type == 'Post Undeleted': + p.deleted = False + p.deleted_at = None + p.deleted_by = None + self.mark_activity(p,u,t) + p.save() + return + + def _process_post_revision_group(self, rev_group): + #determine revision type + #'initial','edit','rollback','lock', + #'migrate','close','merge','delete', + rev_types = X.get_post_revision_group_types(rev_group) + if 'initial' in rev_types: + self._process_post_initial_revision_group(rev_group) + elif 'edit' in rev_types: + self._process_post_edit_revision_group(rev_group) + elif 'rollback' in rev_types: + self._process_post_rollback_revision_group(rev_group) + elif 'lock' in rev_types: + self._process_post_lock_revision_group(rev_group) + elif 'close' in rev_types: + self._process_post_close_revision_group(rev_group) + elif 'delete' in rev_types: + self._process_post_delete_revision_group(rev_group) + else: + pass + #todo: rollback, lock, close and delete are + #not tested + #merge and migrate actions are ignored + #wiki is mixable with other groups, so process it in addition + if 'wiki' in rev_types: + self._make_post_wiki(rev_group) + + def transfer_tag_preferences(self): + #todo: figure out where these are stored in SE + #maybe in se.User.preferences_raw? + pass + + def transfer_question_and_answer_activity(self): + """transfers all question and answer + edits and related status changes + """ + #assuming that there are only two post types + se_revs = se.PostHistory.objects.all() + #assuming that chronologial order is correct and there + #will be no problems of data integrity upon insertion of records + se_revs = se_revs.order_by('creation_date','revision_guid') + #todo: ignored fringe case - no revisions + c_guid = se_revs[0].revision_guid + c_group = [] + #this loop groups revisions by revision id, then calls process function + #for the revision grup (elementary revisions posted at once) + for se_rev in se_revs: + if se_rev.revision_guid == c_guid: + c_group.append(se_rev) + else: + self._process_post_revision_group(c_group) + c_group = [] + c_group.append(se_rev) + c_guid = se_rev.revision_guid + if len(c_group) != 0: + self._process_post_revision_group(c_group) + + def transfer_comments(self): + for se_c in se.PostComment.objects.all(): + if se_c.deletion_date: + print 'Warning deleted comment %d dropped' % se_c.id + continue + se_post = se_c.post + if se_post.post_type.name == 'Question': + osqa_post = QUESTION[se_post.id] + elif se_post.post_type.name == 'Answer': + osqa_post = ANSWER[se_post.id] + + osqa_post.add_comment( + comment = se_c.text, + added_at = se_c.creation_date, + user = USER[se_c.user.id] + ) + + def _install_missing_badges(self): + self._missing_badges = {} + for se_b in se.Badge.objects.all(): + name = X.get_badge_name(se_b.name) + try: + osqa.Badge.objects.get(name=name) + except: + self._missing_badges[name] = 0 + if len(se_b.description) > 300: + print 'Warning truncated description for badge %d' % se_b.id + osqa.Badge.objects.create( + name = name, + slug = slugify(name), + description = se_b.description, + multiple = (not se_b.single), + type = se_b.class_type + ) + + def _award_badges(self): + #note: SE does not keep information on + #content-related badges like osqa does + for se_a in se.User2Badge.objects.all(): + if se_a.user.id == -1: + continue #skip community user + u = USER[se_a.user.id] + badge_name = X.get_badge_name(se_a.badge.name) + b = osqa.Badge.objects.get(name=badge_name) + if b.multiple == False: + if b.award_badge.count() > 0: + continue + #todo: fake content object here b/c SE does not support this + #todo: but osqa requires related content object + osqa.Award.objects.create( + user=u, + badge=b, + awarded_at=se_a.date, + content_object=u, + ) + if b.name in self._missing_badges: + self._missing_badges[b.name] += 1 + + def _cleanup_badges(self): + d = self._missing_badges + unused = [name for name in d.keys() if d[name] == 0] + osqa.Badge.objects.filter(name__in=unused).delete() + installed = [name for name in d.keys() if d[name] > 0] + print 'Warning - following unsupported badges were installed:' + print ', '.join(installed) + + def transfer_badges(self): + #note: badge level is neglected + #1) install missing badges + self._install_missing_badges() + #2) award badges + self._award_badges() + #3) delete unused newly installed badges + self._cleanup_badges() + pass + + def transfer_question_view_counts(self): + for se_q in se.Post.objects.filter(post_type__name='Question'): + q = X.get_post(se_q) + q.view_count = se_q.view_count + q.save() + + + def transfer_votes(self): + for v in se.Post2Vote.objects.all(): + vote_type = v.vote_type.name + if not vote_type in X.vote_actions: + continue + + u = X.get_user(v.user) + p = X.get_post(v.post) + m = X.vote_actions[vote_type] + vote_method = getattr(osqa.User, m) + vote_method(u, p, timestamp = v.creation_date) + if v.deletion_date: + vote_method(u, p, timestamp = v.deletion_date, cancel=True) + + def transfer_update_subscriptions(self): + #todo: not clear where this is stored in SE + #maybe in se.User.preferences_raw? + pass + + def transfer_meta_pages(self): + #here we actually don't have anything in the database yet + #so we can't do this + pass + + def load_xml_file(self, xml_path, table_name): + tree = et.parse(xml_path) + print 'loading from %s to %s' % (xml_path, table_name) , + model = models.get_model('stackexchange', table_name) + i = 0 + for row in tree.findall('.//row'): + model_entry = model() + i += 1 + for col in row.getchildren(): + field_name = se_parser.parse_field_name(col.tag) + field_type = model._meta.get_field(field_name) + field_value = se_parser.parse_value(col.text, field_type) + setattr(model_entry, field_name, field_value) + model_entry.save() + print '... %d objects saved' % i + + def get_table_name(self,xml): + return se_parser.get_table_name(xml) + + def get_xml_path(self, xml): + xml_path = os.path.join(self.dump_path, xml + '.xml') + if not os.path.isfile(xml_path): + print 'Error: file %s not found' % xml_path + sys.exit(1) + return xml_path + + def transfer_users(self): + for se_u in se.User.objects.all(): + if se_u.id < 1:#skip the Community user + continue + u = osqa.User() + u_type = se_u.user_type.name + if u_type == 'Administrator': + u.is_superuser = True + elif u_type == 'Moderator': + u.is_staff = True + elif u_type not in ('Unregistered', 'Registered'): + raise Exception('unknown user type %s' % u_type) + + #if user is not registered, no association record created + #we do not allow posting by users who are not authenticated + #probably they'll just have to "recover" their account by email + if u_type != 'Unregistered': + assert(se_u.open_id)#everybody must have open_id + u_auth = osqa.AuthKeyUserAssociation() + u_auth.key = se_u.open_id + u_auth.provider = X.get_openid_provider_name(se_u.open_id) + u_auth.added_at = se_u.creation_date + + if se_u.open_id is None and se_u.email is None: + print 'Warning: SE user %d is not recoverable (no email or openid)' + + u.reputation = 1#se_u.reputation, it's actually re-computed + u.last_seen = se_u.last_access_date + u.email = X.get_email(se_u.email) + u.location = X.blankable(se_u.location) + u.date_of_birth = se_u.birthday #dattime -> date + u.website = X.blankable(se_u.website_url) + u.about = X.blankable(se_u.about_me) + u.last_login = se_u.last_login_date + u.date_joined = se_u.creation_date + u.is_active = True #todo: this may not be the case + + u.username = X.get_screen_name(se_u.display_name) + u.real_name = X.blankable(se_u.real_name) + + (gold,silver,bronze) = X.parse_badge_summary(se_u.badge_summary) + u.gold = gold + u.silver = silver + u.bronze = bronze + + #todo: we don't have these fields + #views - number of profile views? + #has_replies + #has_message + #opt_in_recruit + #last_login_ip + #open_id_alt - ?? + #preferences_raw - not clear how to use + #display_name_cleaned - lowercased, srtipped name + #timed_penalty_date + #phone + + #don't know how to handle these - there was no usage example + #password_id + #guid + + #ignored + #last_email_date - this translates directly to EmailFeedSetting.reported_at + + #save the data + u.save() + form = EditUserEmailFeedsForm() + form.reset() + if se_u.opt_in_email == True:#set up daily subscription on "own" items + form.initial['individually_selected'] = 'd' + form.initial['asked_by_me'] = 'd' + form.initial['answered_by_me'] = 'd' + # + form.save(user=u, save_unbound=True) + + if 'u_auth' in locals(): + u_auth.user = u + u_auth.save() + USER[se_u.id] = u diff --git a/stackexchange/models.py b/stackexchange/models.py new file mode 100644 index 00000000..a30a9859 --- /dev/null +++ b/stackexchange/models.py @@ -0,0 +1,266 @@ +from django.db import models +class Badge(models.Model): + id = models.IntegerField(primary_key=True) + class_type = models.IntegerField(null=True) + name = models.CharField(max_length=50, null=True) + description = models.TextField(null=True) + single = models.NullBooleanField(null=True) + secret = models.NullBooleanField(null=True) + tag_based = models.NullBooleanField(null=True) + command = models.TextField(null=True) + award_frequency = models.IntegerField(null=True) + +class CloseReason(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=200, null=True) + description = models.CharField(max_length=256, null=True) + display_order = models.IntegerField(null=True) + +class Comment2Vote(models.Model): + id = models.IntegerField(primary_key=True) + post_comment = models.ForeignKey('PostComment', related_name='Comment2Vote_by_post_comment_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='Comment2Vote_by_vote_type_set', null=True) + creation_date = models.DateTimeField(null=True) + user = models.ForeignKey('User', related_name='Comment2Vote_by_user_set', null=True) + ip_address = models.CharField(max_length=40, null=True) + user_display_name = models.CharField(max_length=40, null=True) + deletion_date = models.DateTimeField(null=True) + +class FlatPage(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + url = models.CharField(max_length=128, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + active = models.NullBooleanField(null=True) + use_master = models.NullBooleanField(null=True) + +class Message(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='Message_by_user_set', null=True) + message_type = models.ForeignKey('MessageType', related_name='Message_by_message_type_set', null=True) + is_read = models.NullBooleanField(null=True) + creation_date = models.DateTimeField(null=True) + text = models.TextField(null=True) + post = models.ForeignKey('Post', related_name='Message_by_post_set', null=True) + +class MessageType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class ModeratorMessage(models.Model): + id = models.IntegerField(primary_key=True) + message_type = models.ForeignKey('MessageType', related_name='ModeratorMessage_by_message_type_set', null=True) + creation_date = models.DateTimeField(null=True) + creation_ip_address = models.CharField(max_length=40, null=True) + text = models.TextField(null=True) + user = models.ForeignKey('User', related_name='ModeratorMessage_by_user_set', null=True) + post = models.ForeignKey('Post', related_name='ModeratorMessage_by_post_set', null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='ModeratorMessage_by_deletion_user_set', null=True) + deletion_ip_address = models.CharField(max_length=40, null=True) + user_display_name = models.CharField(max_length=40, null=True) + +class PostComment(models.Model): + id = models.IntegerField(primary_key=True) + post = models.ForeignKey('Post', related_name='PostComment_by_post_set', null=True) + text = models.TextField(null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=15, null=True) + user = models.ForeignKey('User', related_name='PostComment_by_user_set', null=True) + user_display_name = models.CharField(max_length=30, null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='PostComment_by_deletion_user_set', null=True) + score = models.IntegerField(null=True) + +class PostHistoryType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class PostHistory(models.Model): + id = models.IntegerField(primary_key=True) + post_history_type = models.ForeignKey('PostHistoryType', related_name='PostHistory_by_post_history_type_set', null=True) + post = models.ForeignKey('Post', related_name='PostHistory_by_post_set', null=True) + revision_guid = models.CharField(max_length=64, null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + user = models.ForeignKey('User', related_name='PostHistory_by_user_set', null=True) + comment = models.CharField(max_length=400, null=True) + text = models.TextField(null=True) + user_display_name = models.CharField(max_length=40, null=True) + user_email = models.CharField(max_length=100, null=True) + user_website_url = models.CharField(max_length=200, null=True) + +class Post2Vote(models.Model): + id = models.IntegerField(primary_key=True) + post = models.ForeignKey('Post', related_name='Post2Vote_by_post_set', null=True) + user = models.ForeignKey('User', related_name='Post2Vote_by_user_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='Post2Vote_by_vote_type_set', null=True) + creation_date = models.DateTimeField(null=True) + deletion_date = models.DateTimeField(null=True) + target_user = models.ForeignKey('User', related_name='Post2Vote_by_target_user_set', null=True) + target_rep_change = models.IntegerField(null=True) + voter_rep_change = models.IntegerField(null=True) + comment = models.CharField(max_length=150, null=True) + ip_address = models.CharField(max_length=40, null=True) + linked_post = models.ForeignKey('Post', related_name='Post2Vote_by_linked_post_set', null=True) + +class Post(models.Model): + id = models.IntegerField(primary_key=True) + post_type = models.ForeignKey('PostType', related_name='Post_by_post_type_set', null=True) + creation_date = models.DateTimeField(null=True) + score = models.IntegerField(null=True) + view_count = models.IntegerField(null=True) + body = models.TextField(null=True) + owner_user = models.ForeignKey('User', related_name='Post_by_owner_user_set', null=True) + last_editor_user = models.ForeignKey('User', related_name='Post_by_last_editor_user_set', null=True) + last_edit_date = models.DateTimeField(null=True) + last_activity_date = models.DateTimeField(null=True) + last_activity_user = models.ForeignKey('User', related_name='Post_by_last_activity_user_set', null=True) + parent = models.ForeignKey('self', related_name='Post_by_parent_set', null=True) + accepted_answer = models.ForeignKey('self', related_name='Post_by_accepted_answer_set', null=True) + title = models.CharField(max_length=250, null=True) + tags = models.CharField(max_length=150, null=True) + community_owned_date = models.DateTimeField(null=True) + history_summary = models.CharField(max_length=150, null=True) + answer_score = models.IntegerField(null=True) + answer_count = models.IntegerField(null=True) + comment_count = models.IntegerField(null=True) + favorite_count = models.IntegerField(null=True) + deletion_date = models.DateTimeField(null=True) + closed_date = models.DateTimeField(null=True) + locked_date = models.DateTimeField(null=True) + locked_duration = models.IntegerField(null=True) + owner_display_name = models.CharField(max_length=40, null=True) + last_editor_display_name = models.CharField(max_length=40, null=True) + bounty_amount = models.IntegerField(null=True) + bounty_closes = models.DateTimeField(null=True) + bounty_closed = models.DateTimeField(null=True) + last_owner_email_date = models.DateTimeField(null=True) + +class PostType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class SchemaVersion(models.Model): + version = models.IntegerField(null=True) + +class Setting(models.Model): + id = models.IntegerField(primary_key=True) + key = models.CharField(max_length=256, null=True) + value = models.TextField(null=True) + +class SystemMessage(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='SystemMessage_by_user_set', null=True) + creation_date = models.DateTimeField(null=True) + text = models.TextField(null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='SystemMessage_by_deletion_user_set', null=True) + +class Tag(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + count = models.IntegerField(null=True) + user = models.ForeignKey('User', related_name='Tag_by_user_set', null=True) + creation_date = models.DateTimeField(null=True) + is_moderator_only = models.NullBooleanField(null=True) + is_required = models.NullBooleanField(null=True) + aliases = models.CharField(max_length=200, null=True) + +class ThemeResource(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + version = models.CharField(max_length=6, null=True) + +class ThemeTextResource(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + +class ThrottleBucket(models.Model): + id = models.IntegerField(primary_key=True) + type = models.CharField(max_length=256, null=True) + ip_address = models.CharField(max_length=64, null=True) + tokens = models.IntegerField(null=True) + last_update = models.DateTimeField(null=True) + +class UserHistoryType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class UserHistory(models.Model): + id = models.IntegerField(primary_key=True) + user_history_type = models.ForeignKey('UserHistoryType', related_name='UserHistory_by_user_history_type_set', null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + user = models.ForeignKey('User', related_name='UserHistory_by_user_set', null=True) + comment = models.CharField(max_length=400, null=True) + user_display_name = models.CharField(max_length=40, null=True) + moderator_user = models.ForeignKey('User', related_name='UserHistory_by_moderator_user_set', null=True) + reputation = models.IntegerField(null=True) + +class User2Badge(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='User2Badge_by_user_set', null=True) + badge = models.ForeignKey('Badge', related_name='User2Badge_by_badge_set', null=True) + date = models.DateTimeField(null=True) + comment = models.CharField(max_length=50, null=True) + +class User2Vote(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='User2Vote_by_user_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='User2Vote_by_vote_type_set', null=True) + target_user = models.ForeignKey('User', related_name='User2Vote_by_target_user_set', null=True) + creation_date = models.DateTimeField(null=True) + deletion_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + +class User(models.Model): + id = models.IntegerField(primary_key=True) + user_type = models.ForeignKey('UserType', related_name='User_by_user_type_set', null=True) + open_id = models.CharField(max_length=200, null=True) + reputation = models.IntegerField(null=True) + views = models.IntegerField(null=True) + creation_date = models.DateTimeField(null=True) + last_access_date = models.DateTimeField(null=True) + has_replies = models.NullBooleanField(null=True) + has_message = models.NullBooleanField(null=True) + opt_in_email = models.NullBooleanField(null=True) + opt_in_recruit = models.NullBooleanField(null=True) + last_login_date = models.DateTimeField(null=True) + last_email_date = models.DateTimeField(null=True) + last_login_ip = models.CharField(max_length=15, null=True) + open_id_alt = models.CharField(max_length=200, null=True) + email = models.CharField(max_length=100, null=True) + display_name = models.CharField(max_length=40, null=True) + display_name_cleaned = models.CharField(max_length=40, null=True) + website_url = models.CharField(max_length=200, null=True) + real_name = models.CharField(max_length=100, null=True) + location = models.CharField(max_length=100, null=True) + birthday = models.DateTimeField(null=True) + badge_summary = models.CharField(max_length=50, null=True) + about_me = models.TextField(null=True) + preferences_raw = models.TextField(null=True) + timed_penalty_date = models.DateTimeField(null=True) + guid = models.CharField(max_length=64, null=True) + phone = models.CharField(max_length=20, null=True) + password_id = models.IntegerField(null=True) + +class UserType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class VoteType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + diff --git a/stackexchange/parse_models.py b/stackexchange/parse_models.py new file mode 100644 index 00000000..64796e57 --- /dev/null +++ b/stackexchange/parse_models.py @@ -0,0 +1,225 @@ +from xml.etree import ElementTree as et +import sys +import re +import os +if __name__ != '__main__':#hack do not import models if run as script + from django.db import models +from datetime import datetime + +table_prefix = ''#StackExchange or something, if needed +date_time_format = '%Y-%m-%dT%H:%M:%S' #note that fractional part of second is lost +time_re = re.compile(r'(\.[\d]+)?$') +loader_app_name = os.path.dirname(__file__) + +types = { + 'unsignedByte':'models.IntegerField', + 'FK':'models.ForeignKey', + 'PK':'models.IntegerField', + 'string':'models.CharField', + 'text':'models.TextField', + 'int':'models.IntegerField', + 'boolean':'models.NullBooleanField', + 'dateTime':'models.DateTimeField', + 'base64Binary':'models.TextField', + 'double':'models.IntegerField', +} + +def camel_to_python(camel): + """http://stackoverflow.com/questions/1175208/ + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + +def singular(word): + if word.endswith('s'): + return word[:-1] + else: + return word + +def get_table_name(name): + """Determine db table name + from the basename of the .xml file + """ + out = table_prefix + if name.find('2') == -1: + out += singular(name) + else: + bits = name.split('2') + bits = map(singular, bits) + out += '2'.join(bits) + return out + +class DjangoModel(object): + def __init__(self, name): + self.name = get_table_name(name) + self.fields = [] + def add_field(self,field): + field.table = self + self.fields.append(field) + def __str__(self): + out = 'class %s(models.Model):\n' % self.name + for f in self.fields: + out += ' %s\n' % str(f) + return out + +class DjangoField(object): + def __init__(self, name, type, restriction = None): + self.name = camel_to_python(name) + if self.name == 'class': + self.name = 'class_type'#work around python keyword + self.type = type + self.table = None + self.restriction = restriction + self.relation = None + + def __str__(self): + out = '%s = %s(' % (self.name, types[self.type]) + if self.type == 'FK': + out += "'%s'" % self.relation + out += ", related_name='%s_by_%s_set'" % (self.table.name, self.name) + out += ', null=True'#nullable to make life easier + elif self.type == 'PK': + out += 'primary_key=True' + elif self.restriction != -1: + if self.type == 'string': + out += 'max_length=%s' % self.restriction + out += ', null=True' + else: + raise Exception('restriction (max_length) supported only for string type') + else: + out += 'null=True' + out += ')' + return out + + def get_type(self): + return self.type + +class DjangoPK(DjangoField): + def __init__(self): + self.name = 'id' + self.type = 'PK' + +class DjangoFK(DjangoField): + def __init__(self, source_name): + bits = source_name.split('Id') + if len(bits) == 2 and bits[1] == '': + name = bits[0] + super(DjangoFK, self).__init__(name, 'FK') + self.set_relation(name) + + def set_relation(self, name): + """some relations need to be mapped + to actual tables + """ + self.relation = table_prefix + if name.endswith('User'): + self.relation += 'User' + elif name.endswith('Post'): + self.relation += 'Post' + elif name in ('AcceptedAnswer','Parent'): + self.relation = 'self' #self-referential Post model + else: + self.relation += name + def get_relation(self): + return self.relation + +def get_col_type(col): + type = col.get('type') + restriction = -1 + if type == None: + type_e = col.find('.//simpleType/restriction') + type = type_e.get('base') + try: + restriction = int(type_e.getchildren()[0].get('value')) + except: + restriction = -1 + if restriction > 400: + type = 'text' + restriction = -1 + return type, restriction + +def make_field_from_xml_tree(xml_element): + """used by the model parser + here we need to be detailed about field types + because this defines the database schema + """ + name = xml_element.get('name') + if name == 'LinkedVoteId':#not used + return None + if name == 'Id': + field = DjangoPK() + elif name.endswith('Id') and name not in ('OpenId','PasswordId'): + field = DjangoFK(name) + elif name.endswith('GUID'): + field = DjangoField(name, 'string', 64) + else: + type, restriction = get_col_type(xml_element) + field = DjangoField(name, type, restriction) + return field + +def parse_field_name(input): + """used by the data reader + + The problem is that I've scattered + code for determination of field name over three classes: + DjangoField, DjangoPK and DjangoFK + so the function actually cretes fake field objects + many time over + """ + if input == 'Id': + return DjangoPK().name + elif input in ('OpenId', 'PasswordId'): + return DjangoField(input, 'string', 7).name#happy fake field + elif input.endswith('Id'): + return DjangoFK(input).name#real FK field + else: + return DjangoField(input, 'string', 7).name#happy fake field + +def parse_value(input, field_object): + if isinstance(field_object, models.ForeignKey): + try: + id = int(input) + except: + raise Exception('non-numeric foreign key %s' % input) + related_model = field_object.rel.to + try: + return related_model.objects.get(id=id) + except related_model.DoesNotExist: + obj = related_model(id=id) + obj.save()#save fake empty object + return obj + elif isinstance(field_object, models.IntegerField): + try: + return int(input) + except: + raise Exception('expected integer, found %s' % input) + elif isinstance(field_object, models.CharField): + return input + elif isinstance(field_object, models.TextField): + return input + elif isinstance(field_object, models.BooleanField): + try: + return bool(input) + except: + raise Exception('boolean value expected %s found' % input) + elif isinstance(field_object, models.DateTimeField): + input = time_re.sub('', input) + try: + return datetime.strptime(input, date_time_format) + except: + raise Exception('datetime expected "%s" found' % input) + +print 'from django.db import models' +for file in sys.argv: + if '.xsd' in file: + tname = os.path.basename(file).replace('.xsd','') + tree = et.parse(file) + + model = DjangoModel(tname) + + row = tree.find('.//sequence') + for col in row.getchildren(): + field = make_field_from_xml_tree(col) + if field: + model.add_field(field) + print model |