diff options
Diffstat (limited to 'stackexchange')
-rw-r--r-- | stackexchange/ANOMALIES | 14 | ||||
-rw-r--r-- | stackexchange/README | 62 | ||||
-rw-r--r-- | stackexchange/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/load_stackexchange.py | 804 | ||||
-rw-r--r-- | stackexchange/models.py | 266 | ||||
-rw-r--r-- | stackexchange/parse_models.py | 225 |
8 files changed, 0 insertions, 1371 deletions
diff --git a/stackexchange/ANOMALIES b/stackexchange/ANOMALIES deleted file mode 100644 index 05a7dbdb..00000000 --- a/stackexchange/ANOMALIES +++ /dev/null @@ -1,14 +0,0 @@ -* several user accounts with same email -* users with no openid -* users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL) -* users with no screen name -* users with no email and no screen name (25% in homeschool) -* tag preferences are not stored explicitly (interesting/ignored) - maybe they are in se.User.preferences_raw - but the data there is not marked up and is kind of cryptic -* we don't have Community user. SE has one with id=-1 - this id may break the load script - potential break places are anywhere where is X.get_user() call - issues may happen with larger data sets where activity - of user "Community" is somehow reflected in a way - that load_stackexchange does not take care of diff --git a/stackexchange/README b/stackexchange/README deleted file mode 100644 index 64d8f5fb..00000000 --- a/stackexchange/README +++ /dev/null @@ -1,62 +0,0 @@ -this app's function will be to: - -* install it's own tables (#todo: not yet automated) -* read SE xml dump into DjangoDB (automated) -* populate osqa database (automated) -* remove SE tables (#todo: not done yet) - -Current process to load SE data into OSQA is: -============================================== - -1) backup database - -2) unzip SE dump into dump_dir (any directory name) - you may want to make sure that your dump directory in .gitignore file - so that you don't publish it by mistake - -3) enable 'stackexchange' in the list of installed apps (probably aready in settings.py) - -4) (optional - create models.py for SE, which is included anyway) run: - - #a) run in-place removal of xml namspace prefix to make parsing easier - perl -pi -w -e 's/xs://g' $SE_DUMP_PATH/xsd/*.xsd - cd stackexchange - python parse_models.py $SE_DUMP_PATH/xsd/*.xsd > models.py - -5) Install stackexchange models (as well as any other missing models) - python manage.py syncdb - -6) make sure that osqa badges are installed - if not, run (example for mysql): - - mysql -u user -p dbname < sql_scripts/badges.sql - -7) load SE data: - - python manage.py load_stackexchange dump_dir - - if anything doesn't go right - run 'python manage.py flush' and repeat - steps 6 and 7 - -NOTES: -============ - -Here is the load script that I used for the testing -it assumes that SE dump has been unzipped inside the tmp directory - - #!/bin/sh$ - python manage.py flush - #delete all data - mysql -u osqa -p osqa < sql_scripts/badges.sql - python manage.py load_stackexchange tmp - -Untested parts are tagged with comments starting with -#todo: - -The test set did not have all the usage cases of StackExchange represented so -it may break with other sets. - -The job takes some time to run, especially -content revisions and votes - may be optimized - -Some of the fringe cases are described in file stackexchange/ANOMALIES diff --git a/stackexchange/__init__.py b/stackexchange/__init__.py deleted file mode 100644 index e69de29b..00000000 --- a/stackexchange/__init__.py +++ /dev/null diff --git a/stackexchange/management/__init__.py b/stackexchange/management/__init__.py deleted file mode 100644 index e69de29b..00000000 --- a/stackexchange/management/__init__.py +++ /dev/null diff --git a/stackexchange/management/commands/__init__.py b/stackexchange/management/commands/__init__.py deleted file mode 100644 index e69de29b..00000000 --- a/stackexchange/management/commands/__init__.py +++ /dev/null diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py deleted file mode 100644 index 9cfc7192..00000000 --- a/stackexchange/management/commands/load_stackexchange.py +++ /dev/null @@ -1,804 +0,0 @@ -from django.core.management.base import BaseCommand -from django.template.defaultfilters import slugify #todo: adopt unicode-aware slugify -#todo: http://stackoverflow.com/questions/837828/how-to-use-a-slug-in-django -import os -import re -import sys -import stackexchange.parse_models as se_parser -from xml.etree import ElementTree as et -from django.db import models -import forum.models as askbot -import stackexchange.models as se -from forum.forms import EditUserEmailFeedsForm -from forum.utils.html import sanitize_html -from django.conf import settings -from django.contrib.auth.models import Message as DjangoMessage -from django.utils.translation import ugettext as _ -#from markdown2 import Markdown -#markdowner = Markdown(html4tags=True) - -xml_read_order = ( - 'VoteTypes','UserTypes','Users','Users2Votes', - 'Badges','Users2Badges','CloseReasons','FlatPages', - 'MessageTypes','PostHistoryTypes','PostTypes','SchemaVersion', - 'Settings','SystemMessages','ThemeResources','ThemeTextResources', - 'ThrottleBucket','UserHistoryTypes','UserHistory', - 'Users2Badges','VoteTypes','Users2Votes','MessageTypes', - 'Posts','Posts2Votes','PostHistory','PostComments', - 'ModeratorMessages','Messages','Comments2Votes', - ) - -#association tables SE item id --> ASKBOT item id -#table associations are implied -#todo: there is an issue that these may be inconsistent with the database -USER = {}#SE User.id --> django(ASKBOT) User.id -QUESTION = {} -ANSWER = {} -NAMESAKE_COUNT = {}# number of times user name was used - for X.get_screen_name - -class X(object):# - """class with methods for handling some details - of SE --> ASKBOT mapping - """ - badge_type_map = {'1':'gold','2':'silver','3':'bronze'} - - askbot_supported_id_providers = ( - 'google','yahoo','aol','myopenid', - 'flickr','technorati', - 'wordpress','blogger','livejournal', - 'claimid','vidoop','verisign', - 'openidurl','facebook','local', - 'twitter' #oauth is not on this list, b/c it has no own url - ) - - #map SE VoteType -> askbot.User vote method - #created methods with the same call structure in askbot.User - #User.<vote_method>(post, timestamp=None, cancel=False) - vote_actions = { - 'UpMod':'upvote', - 'DownMod':'downvote', - 'AcceptedByOriginator':'accept_answer', - 'Offensive':'flag_post', - 'Favorite':'toggle_favorite_question', - } - - #these modes cannot be mixed - #only wiki is assumed to be mixable - exclusive_revision_modes = ( - 'initial','edit','rollback','lock', - 'migrate','close','merge','delete', - ) - - #badges whose names don't match exactly, but - #present in both SE and ASKBOT - badge_exceptions = {# SE --> ASKBOT - 'Citizen Patrol':'Citizen patrol',#single #todo: why sentence case? - 'Strunk & White':'Strunk & White',#single - 'Civic Duty':'Civic duty', - } - - revision_type_map = { - 'Initial Title':'initial', - 'Initial Body':'initial', - 'Initial Tags':'initial', - 'Edit Title':'edit', - 'Edit Body':'edit', - 'Edit Tags':'edit', - 'Rollback Title':'rollback', - 'Rollback Body':'rollback', - 'Rollback Tags':'rollback', - 'Post Closed':'close', - 'Post Reopened':'close', - 'Post Deleted':'delete', - 'Post Undeleted':'delete', - 'Post Locked':'lock', - 'Post Unlocked':'lock', - 'Community Owned':'wiki', - 'Post Migrated':'migrate', - 'Question Merged':'merge', - } - - close_reason_map = { - 1:1,#duplicate - 2:2,#off-topic - 3:3,#subjective and argumentative - 4:4,#not a real question - 5:7,#offensive - 6:6,#irrelevant or outdated question - 7:9,#too localized - 10:8,#spam - } - - @classmethod - def get_message_text(cls, se_m): - """try to intelligently translate - SE message to ASKBOT so that it makese sense in - our context - """ - #todo: properly translate messages - #todo: maybe work through more instances of messages - if se_m.message_type.name == 'Badge Notification': - return se_m.text - else: - if 'you are now an administrator' in se_m.text: - return _('Congratulations, you are now an Administrator') - elif re.search(r'^You have \d+ new',se_m.text): - bits = se_m.text.split('.') - text = bits[0] - if se_m.user.id == -1: - return None - url = cls.get_user(se_m.user).get_profile_url() - return '<a href="%s?sort=responses">%s</a>' % (url,text) - return None - - @classmethod - def get_post(cls, se_post): - #todo: fix this hack - either in-memory id association table - #or use database to store these associations - post_type = se_post.post_type.name - if post_type == 'Question': - return askbot.Question.objects.get(id=QUESTION[se_post.id].id) - elif post_type == 'Answer': - return askbot.Answer.objects.get(id=ANSWER[se_post.id].id) - else: - raise Exception('unknown post type %s' % post_type) - - @classmethod - def get_close_reason(cls, se_reason): - #todo: this is a guess - have not seen real data - se_reason = int(se_reason) - return cls.close_reason_map[se_reason] - - @classmethod - def get_user(cls, se_user): - #todo: same as get_post - return askbot.User.objects.get(id=USER[se_user.id].id) - - @classmethod - def get_post_revision_group_types(cls, rev_group): - rev_types = {} - for rev in rev_group: - rev_type = cls.get_post_revision_type(rev) - rev_types[rev_type] = 1 - rev_types = rev_types.keys() - - #make sure that exclusive rev modes are not mixed - exclusive = cls.exclusive_revision_modes - if len(rev_types) > 1 and all([t in exclusive for t in rev_types]): - tstr = ','.join(rev_types) - gstr = ','.join([str(rev.id) for rev in rev_group]) - msg = 'incompatible revision types %s in PostHistory %s' % (tstr,gstr) - raise Exception(msg) - return rev_types - - @classmethod - def clean_tags(cls, tags): - tags = re.subn(r'\s+',' ',tags.strip())[0] - bits = tags.split(' ') - tags = ' '.join([bit[1:-1] for bit in bits]) - tags = re.subn(r'\xf6','-',tags)[0] - return tags - - @classmethod - def get_screen_name(cls, name): - """always returns unique screen name - even if there are multiple users in SE - with the same exact screen name - """ - if name is None: - name = 'anonymous' - name = name.strip() - name = re.subn(r'\s+',' ',name)[0]#remove repeating spaces - - try: - u = askbot.User.objects.get(username = name) - try: - if u.location: - name += ', %s' % u.location - if name in NAMESAKE_COUNT: - NAMESAKE_COUNT[name] += 1 - name += ' %d' % NAMESAKE_COUNT[name] - else: - NAMESAKE_COUNT[name] = 1 - except askbot.User.DoesNotExist: - pass - except askbot.User.DoesNotExist: - NAMESAKE_COUNT[name] = 1 - return name - - @classmethod - def get_email(cls, email):#todo: fix fringe case - user did not give email! - if email is None: - return settings.ANONYMOUS_USER_EMAIL - else: - assert(email != '') - return email - - @classmethod - def get_post_revision_type(cls, rev): - rev_name = rev.post_history_type.name - rev_type = cls.revision_type_map.get(rev_name, None) - if rev_type is None: - raise Exception('dont understand revision type %s' % rev) - return rev_type - - #crude method of getting id provider name from the url - @classmethod - def get_openid_provider_name(cls, openid_url): - openid_str = str(openid_url) - bits = openid_str.split('/') - base_url = bits[2] #assume this is base url - url_bits = base_url.split('.') - provider_name = url_bits[-2].lower() - if provider_name not in cls.askbot_supported_id_providers: - raise Exception('could not determine login provider for %s' % openid_url) - return provider_name - - @staticmethod - def blankable(input): - if input is None: - return '' - else: - return input - - @classmethod - def parse_badge_summary(cls, badge_summary): - (gold,silver,bronze) = (0,0,0) - if badge_summary: - if len(badge_summary) > 3: - print 'warning: guessing that badge summary is comma separated' - print 'have %s' % badge_summary - bits = badge_summary.split(',') - else: - bits = [badge_summary] - for bit in bits: - m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit) - if not m: - raise Exception('could not parse badge summary: %s' % badge_summary) - else: - badge_type = cls.badge_type_map[m.groupdict()['type']] - locals()[badge_type] = int(m.groupdict()['count']) - return (gold,silver,bronze) - - @classmethod - def get_badge_name(cls, name): - return cls.badge_exceptions.get(name, name) - -class Command(BaseCommand): - help = 'Loads StackExchange data from unzipped directory of XML files into the ASKBOT database' - args = 'se_dump_dir' - - def handle(self, *arg, **kwarg): - if len(arg) < 1 or not os.path.isdir(arg[0]): - print 'Error: first argument must be a directory with all the SE *.xml files' - sys.exit(1) - - self.dump_path = arg[0] - #read the data into SE tables - for xml in xml_read_order: - xml_path = self.get_xml_path(xml) - table_name = self.get_table_name(xml) - self.load_xml_file(xml_path, table_name) - - #this is important so that when we clean up messages - #automatically generated by the procedures below - #we do not delete old messages - #todo: unfortunately this may need to be redone - #when we upgrade to django 1.2 and definitely by 1.4 when - #the current message system will be replaced with the - #django messages framework - self.save_askbot_message_id_list() - - #transfer data into ASKBOT tables - print 'Transferring users...', - sys.stdout.flush() - self.transfer_users() - print 'done.' - print 'Transferring content edits...', - sys.stdout.flush() - self.transfer_question_and_answer_activity() - print 'done.' - print 'Transferring view counts...', - sys.stdout.flush() - self.transfer_question_view_counts() - print 'done.' - print 'Transferring comments...', - sys.stdout.flush() - self.transfer_comments() - print 'done.' - print 'Transferring badges and badge awards...', - sys.stdout.flush() - self.transfer_badges() - print 'done.' - print 'Transferring votes...', - sys.stdout.flush() - self.transfer_votes()#includes favorites, accepts and flags - print 'done.' - - self.cleanup_messages()#delete autogenerated messages - self.transfer_messages() - - #todo: these are not clear how to go about - self.transfer_update_subscriptions() - self.transfer_tag_preferences() - self.transfer_meta_pages() - - def save_askbot_message_id_list(self): - id_list = list(DjangoMessage.objects.all().values('id')) - self._askbot_message_id_list = id_list - - def cleanup_messages(self): - """deletes messages generated by the load process - """ - id_list = self._askbot_message_id_list - mset = DjangoMessage.objects.all().exclude(id__in=id_list) - mset.delete() - - def transfer_messages(self): - """transfers some messages from - SE to ASKBOT - """ - for m in se.Message.objects.all(): - if m.is_read: - continue - if m.user.id == -1: - continue - u = X.get_user(m.user) - text = X.get_message_text(m) - if text: - u.message_set.create( - message=text, - ) - - def _process_post_initial_revision_group(self, rev_group): - - title = None - text = None - tags = None - wiki = False - author = USER[rev_group[0].user.id] - added_at = rev_group[0].creation_date - - for rev in rev_group: - rev_type = rev.post_history_type.name - if rev_type == 'Initial Title': - title = rev.text - elif rev_type == 'Initial Body': - text = rev.text - elif rev_type == 'Initial Tags': - tags = X.clean_tags(rev.text) - elif rev_type == 'Community Owned': - wiki = True - else: - raise Exception('unexpected revision type %s' % rev_type) - - post_type = rev_group[0].post.post_type.name - if post_type == 'Question': - q = askbot.Question.objects.create_new( - title = title, - author = author, - added_at = added_at, - wiki = wiki, - tagnames = tags, - text = text - ) - QUESTION[rev_group[0].post.id] = q - elif post_type == 'Answer': - q = X.get_post(rev_group[0].post.parent) - a = askbot.Answer.objects.create_new( - question = q, - author = author, - added_at = added_at, - wiki = wiki, - text = text, - ) - ANSWER[rev_group[0].post.id] = a - else: - post_id = rev_group[0].post.id - raise Exception('unknown post type %s for id=%d' % (post_type, post_id)) - - def _process_post_edit_revision_group(self, rev_group): - #question apply edit - (title, text, tags) = (None, None, None) - for rev in rev_group: - rev_type = rev.post_history_type.name - if rev_type == 'Edit Title': - title = rev.text - elif rev_type == 'Edit Body': - text = rev.text - elif rev_type == 'Edit Tags': - tags = X.clean_tags(rev.text) - elif rev_type == 'Community Owned': - pass - else: - raise Exception('unexpected revision type %s' % rev_type) - - rev0 = rev_group[0] - edited_by = USER[rev0.user.id] - edited_at = rev0.creation_date - comment = ';'.join([rev.comment for rev in rev_group if rev.comment]) - post_type = rev0.post.post_type.name - - if post_type == 'Question': - q = X.get_post(rev0.post) - q.apply_edit( - edited_at = edited_at, - edited_by = edited_by, - title = title, - text = text, - comment = comment, - tags = tags, - ) - elif post_type == 'Answer': - a = X.get_post(rev0.post) - a.apply_edit( - edited_at = edited_at, - edited_by = edited_by, - text = text, - comment = comment, - ) - - def _make_post_wiki(self, rev_group): - #todo: untested - for rev in rev_group: - if rev.post_history_type.name == 'Community Owned': - p = X.get_post(rev.post) - u = X.get_user(rev.user) - t = rev.creation_date - p.wiki = True - p.wikified_at = t - p.wikified_by = u - self.mark_activity(p,u,t) - p.save() - return - - def mark_activity(self,p,u,t): - """p,u,t - post, user, timestamp - """ - if isinstance(p, askbot.Question): - p.last_activity_by = u - p.last_activity_at = t - elif isinstance(p, askbot.Answer): - p.question.last_activity_by = u - p.question.last_activity_at = t - p.question.save() - - def _process_post_rollback_revision_group(self, rev_group): - #todo: don't know what to do here as there were no - #such data available - pass - - def _process_post_lock_revision_group(self, rev_group): - #todo: untested - for rev in rev_group: - rev_type = rev.post_history_type.name - if rev_type.endswith('ocked'): - t = rev.creation_date - u = X.get_user(rev.user) - p = X.get_post(rev.post) - if rev_type == 'Post Locked': - p.locked = True - p.locked_by = u - p.locked_at = t - elif rev_type == 'Post Unlocked': - p.locked = False - p.locked_by = None - p.locked_at = None - else: - return - self.mark_activity(p,u,t) - p.save() - return - - def _process_post_close_revision_group(self, rev_group): - #todo: untested - for rev in rev_group: - if rev.post.post_type.name != 'Question': - return - rev_type = rev.post_history_type.name - if rev_type in ('Post Closed', 'Post Reopened'): - t = rev.creation_date - u = X.get_user(rev.user) - p = X.get_post(rev.post) - if rev_type == 'Post Closed': - p.closed = True - p.closed_at = t - p.closed_by = u - p.close_reason = X.get_close_reason(rev.text) - elif rev_type == 'Post Reopened': - p.closed = False - p.closed_at = None - p.closed_by = None - p.close_reason = None - self.mark_activity(p,u,t) - p.save() - return - - def _process_post_delete_revision_group(self, rev_group): - #todo: untested - for rev in rev_group: - rev_type = rev.post_history_type.name - if rev_type.endswith('eleted'): - t = rev.creation_date - u = X.get_user(rev.user) - p = X.get_post(rev.post) - if rev_type == 'Post Deleted': - p.deleted = True - p.deleted_at = t - p.deleted_by = u - elif rev_type == 'Post Undeleted': - p.deleted = False - p.deleted_at = None - p.deleted_by = None - self.mark_activity(p,u,t) - p.save() - return - - def _process_post_revision_group(self, rev_group): - #determine revision type - #'initial','edit','rollback','lock', - #'migrate','close','merge','delete', - rev_types = X.get_post_revision_group_types(rev_group) - if 'initial' in rev_types: - self._process_post_initial_revision_group(rev_group) - elif 'edit' in rev_types: - self._process_post_edit_revision_group(rev_group) - elif 'rollback' in rev_types: - self._process_post_rollback_revision_group(rev_group) - elif 'lock' in rev_types: - self._process_post_lock_revision_group(rev_group) - elif 'close' in rev_types: - self._process_post_close_revision_group(rev_group) - elif 'delete' in rev_types: - self._process_post_delete_revision_group(rev_group) - else: - pass - #todo: rollback, lock, close and delete are - #not tested - #merge and migrate actions are ignored - #wiki is mixable with other groups, so process it in addition - if 'wiki' in rev_types: - self._make_post_wiki(rev_group) - - def transfer_tag_preferences(self): - #todo: figure out where these are stored in SE - #maybe in se.User.preferences_raw? - pass - - def transfer_question_and_answer_activity(self): - """transfers all question and answer - edits and related status changes - """ - #assuming that there are only two post types - se_revs = se.PostHistory.objects.all() - #assuming that chronologial order is correct and there - #will be no problems of data integrity upon insertion of records - se_revs = se_revs.order_by('creation_date','revision_guid') - #todo: ignored fringe case - no revisions - c_guid = se_revs[0].revision_guid - c_group = [] - #this loop groups revisions by revision id, then calls process function - #for the revision grup (elementary revisions posted at once) - for se_rev in se_revs: - if se_rev.revision_guid == c_guid: - c_group.append(se_rev) - else: - self._process_post_revision_group(c_group) - c_group = [] - c_group.append(se_rev) - c_guid = se_rev.revision_guid - if len(c_group) != 0: - self._process_post_revision_group(c_group) - - def transfer_comments(self): - for se_c in se.PostComment.objects.all(): - if se_c.deletion_date: - print 'Warning deleted comment %d dropped' % se_c.id - continue - se_post = se_c.post - if se_post.post_type.name == 'Question': - askbot_post = QUESTION[se_post.id] - elif se_post.post_type.name == 'Answer': - askbot_post = ANSWER[se_post.id] - - askbot_post.add_comment( - comment = se_c.text, - added_at = se_c.creation_date, - user = USER[se_c.user.id] - ) - - def _install_missing_badges(self): - self._missing_badges = {} - for se_b in se.Badge.objects.all(): - name = X.get_badge_name(se_b.name) - try: - askbot.Badge.objects.get(name=name) - except: - self._missing_badges[name] = 0 - if len(se_b.description) > 300: - print 'Warning truncated description for badge %d' % se_b.id - askbot.Badge.objects.create( - name = name, - slug = slugify(name), - description = se_b.description, - multiple = (not se_b.single), - type = se_b.class_type - ) - - def _award_badges(self): - #note: SE does not keep information on - #content-related badges like askbot does - for se_a in se.User2Badge.objects.all(): - if se_a.user.id == -1: - continue #skip community user - u = USER[se_a.user.id] - badge_name = X.get_badge_name(se_a.badge.name) - b = askbot.Badge.objects.get(name=badge_name) - if b.multiple == False: - if b.award_badge.count() > 0: - continue - #todo: fake content object here b/c SE does not support this - #todo: but askbot requires related content object - askbot.Award.objects.create( - user=u, - badge=b, - awarded_at=se_a.date, - content_object=u, - ) - if b.name in self._missing_badges: - self._missing_badges[b.name] += 1 - - def _cleanup_badges(self): - d = self._missing_badges - unused = [name for name in d.keys() if d[name] == 0] - askbot.Badge.objects.filter(name__in=unused).delete() - installed = [name for name in d.keys() if d[name] > 0] - print 'Warning - following unsupported badges were installed:' - print ', '.join(installed) - - def transfer_badges(self): - #note: badge level is neglected - #1) install missing badges - self._install_missing_badges() - #2) award badges - self._award_badges() - #3) delete unused newly installed badges - self._cleanup_badges() - pass - - def transfer_question_view_counts(self): - for se_q in se.Post.objects.filter(post_type__name='Question'): - q = X.get_post(se_q) - q.view_count = se_q.view_count - q.save() - - - def transfer_votes(self): - for v in se.Post2Vote.objects.all(): - vote_type = v.vote_type.name - if not vote_type in X.vote_actions: - continue - - u = X.get_user(v.user) - p = X.get_post(v.post) - m = X.vote_actions[vote_type] - vote_method = getattr(askbot.User, m) - vote_method(u, p, timestamp = v.creation_date) - if v.deletion_date: - vote_method(u, p, timestamp = v.deletion_date, cancel=True) - - def transfer_update_subscriptions(self): - #todo: not clear where this is stored in SE - #maybe in se.User.preferences_raw? - pass - - def transfer_meta_pages(self): - #here we actually don't have anything in the database yet - #so we can't do this - pass - - def load_xml_file(self, xml_path, table_name): - tree = et.parse(xml_path) - print 'loading from %s to %s' % (xml_path, table_name) , - model = models.get_model('stackexchange', table_name) - i = 0 - for row in tree.findall('.//row'): - model_entry = model() - i += 1 - for col in row.getchildren(): - field_name = se_parser.parse_field_name(col.tag) - field_type = model._meta.get_field(field_name) - field_value = se_parser.parse_value(col.text, field_type) - setattr(model_entry, field_name, field_value) - model_entry.save() - print '... %d objects saved' % i - - def get_table_name(self,xml): - return se_parser.get_table_name(xml) - - def get_xml_path(self, xml): - xml_path = os.path.join(self.dump_path, xml + '.xml') - if not os.path.isfile(xml_path): - print 'Error: file %s not found' % xml_path - sys.exit(1) - return xml_path - - def transfer_users(self): - for se_u in se.User.objects.all(): - if se_u.id < 1:#skip the Community user - continue - u = askbot.User() - u_type = se_u.user_type.name - if u_type == 'Administrator': - u.is_superuser = True - elif u_type == 'Moderator': - u.is_staff = True - elif u_type not in ('Unregistered', 'Registered'): - raise Exception('unknown user type %s' % u_type) - - #if user is not registered, no association record created - #we do not allow posting by users who are not authenticated - #probably they'll just have to "recover" their account by email - if u_type != 'Unregistered': - assert(se_u.open_id)#everybody must have open_id - u_auth = askbot.AuthKeyUserAssociation() - u_auth.key = se_u.open_id - u_auth.provider = X.get_openid_provider_name(se_u.open_id) - u_auth.added_at = se_u.creation_date - - if se_u.open_id is None and se_u.email is None: - print 'Warning: SE user %d is not recoverable (no email or openid)' - - u.reputation = 1#se_u.reputation, it's actually re-computed - u.last_seen = se_u.last_access_date - u.email = X.get_email(se_u.email) - u.location = X.blankable(se_u.location) - u.date_of_birth = se_u.birthday #dattime -> date - u.website = X.blankable(se_u.website_url) - u.about = X.blankable(se_u.about_me) - u.last_login = se_u.last_login_date - u.date_joined = se_u.creation_date - u.is_active = True #todo: this may not be the case - - u.username = X.get_screen_name(se_u.display_name) - u.real_name = X.blankable(se_u.real_name) - - (gold,silver,bronze) = X.parse_badge_summary(se_u.badge_summary) - u.gold = gold - u.silver = silver - u.bronze = bronze - - #todo: we don't have these fields - #views - number of profile views? - #has_replies - #has_message - #opt_in_recruit - #last_login_ip - #open_id_alt - ?? - #preferences_raw - not clear how to use - #display_name_cleaned - lowercased, srtipped name - #timed_penalty_date - #phone - - #don't know how to handle these - there was no usage example - #password_id - #guid - - #ignored - #last_email_date - this translates directly to EmailFeedSetting.reported_at - - #save the data - u.save() - form = EditUserEmailFeedsForm() - form.reset() - if se_u.opt_in_email == True:#set up daily subscription on "own" items - form.initial['individually_selected'] = 'd' - form.initial['asked_by_me'] = 'd' - form.initial['answered_by_me'] = 'd' - # - form.save(user=u, save_unbound=True) - - if 'u_auth' in locals(): - u_auth.user = u - u_auth.save() - USER[se_u.id] = u diff --git a/stackexchange/models.py b/stackexchange/models.py deleted file mode 100644 index a30a9859..00000000 --- a/stackexchange/models.py +++ /dev/null @@ -1,266 +0,0 @@ -from django.db import models -class Badge(models.Model): - id = models.IntegerField(primary_key=True) - class_type = models.IntegerField(null=True) - name = models.CharField(max_length=50, null=True) - description = models.TextField(null=True) - single = models.NullBooleanField(null=True) - secret = models.NullBooleanField(null=True) - tag_based = models.NullBooleanField(null=True) - command = models.TextField(null=True) - award_frequency = models.IntegerField(null=True) - -class CloseReason(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=200, null=True) - description = models.CharField(max_length=256, null=True) - display_order = models.IntegerField(null=True) - -class Comment2Vote(models.Model): - id = models.IntegerField(primary_key=True) - post_comment = models.ForeignKey('PostComment', related_name='Comment2Vote_by_post_comment_set', null=True) - vote_type = models.ForeignKey('VoteType', related_name='Comment2Vote_by_vote_type_set', null=True) - creation_date = models.DateTimeField(null=True) - user = models.ForeignKey('User', related_name='Comment2Vote_by_user_set', null=True) - ip_address = models.CharField(max_length=40, null=True) - user_display_name = models.CharField(max_length=40, null=True) - deletion_date = models.DateTimeField(null=True) - -class FlatPage(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - url = models.CharField(max_length=128, null=True) - value = models.TextField(null=True) - content_type = models.CharField(max_length=50, null=True) - active = models.NullBooleanField(null=True) - use_master = models.NullBooleanField(null=True) - -class Message(models.Model): - id = models.IntegerField(primary_key=True) - user = models.ForeignKey('User', related_name='Message_by_user_set', null=True) - message_type = models.ForeignKey('MessageType', related_name='Message_by_message_type_set', null=True) - is_read = models.NullBooleanField(null=True) - creation_date = models.DateTimeField(null=True) - text = models.TextField(null=True) - post = models.ForeignKey('Post', related_name='Message_by_post_set', null=True) - -class MessageType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - -class ModeratorMessage(models.Model): - id = models.IntegerField(primary_key=True) - message_type = models.ForeignKey('MessageType', related_name='ModeratorMessage_by_message_type_set', null=True) - creation_date = models.DateTimeField(null=True) - creation_ip_address = models.CharField(max_length=40, null=True) - text = models.TextField(null=True) - user = models.ForeignKey('User', related_name='ModeratorMessage_by_user_set', null=True) - post = models.ForeignKey('Post', related_name='ModeratorMessage_by_post_set', null=True) - deletion_date = models.DateTimeField(null=True) - deletion_user = models.ForeignKey('User', related_name='ModeratorMessage_by_deletion_user_set', null=True) - deletion_ip_address = models.CharField(max_length=40, null=True) - user_display_name = models.CharField(max_length=40, null=True) - -class PostComment(models.Model): - id = models.IntegerField(primary_key=True) - post = models.ForeignKey('Post', related_name='PostComment_by_post_set', null=True) - text = models.TextField(null=True) - creation_date = models.DateTimeField(null=True) - ip_address = models.CharField(max_length=15, null=True) - user = models.ForeignKey('User', related_name='PostComment_by_user_set', null=True) - user_display_name = models.CharField(max_length=30, null=True) - deletion_date = models.DateTimeField(null=True) - deletion_user = models.ForeignKey('User', related_name='PostComment_by_deletion_user_set', null=True) - score = models.IntegerField(null=True) - -class PostHistoryType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - -class PostHistory(models.Model): - id = models.IntegerField(primary_key=True) - post_history_type = models.ForeignKey('PostHistoryType', related_name='PostHistory_by_post_history_type_set', null=True) - post = models.ForeignKey('Post', related_name='PostHistory_by_post_set', null=True) - revision_guid = models.CharField(max_length=64, null=True) - creation_date = models.DateTimeField(null=True) - ip_address = models.CharField(max_length=40, null=True) - user = models.ForeignKey('User', related_name='PostHistory_by_user_set', null=True) - comment = models.CharField(max_length=400, null=True) - text = models.TextField(null=True) - user_display_name = models.CharField(max_length=40, null=True) - user_email = models.CharField(max_length=100, null=True) - user_website_url = models.CharField(max_length=200, null=True) - -class Post2Vote(models.Model): - id = models.IntegerField(primary_key=True) - post = models.ForeignKey('Post', related_name='Post2Vote_by_post_set', null=True) - user = models.ForeignKey('User', related_name='Post2Vote_by_user_set', null=True) - vote_type = models.ForeignKey('VoteType', related_name='Post2Vote_by_vote_type_set', null=True) - creation_date = models.DateTimeField(null=True) - deletion_date = models.DateTimeField(null=True) - target_user = models.ForeignKey('User', related_name='Post2Vote_by_target_user_set', null=True) - target_rep_change = models.IntegerField(null=True) - voter_rep_change = models.IntegerField(null=True) - comment = models.CharField(max_length=150, null=True) - ip_address = models.CharField(max_length=40, null=True) - linked_post = models.ForeignKey('Post', related_name='Post2Vote_by_linked_post_set', null=True) - -class Post(models.Model): - id = models.IntegerField(primary_key=True) - post_type = models.ForeignKey('PostType', related_name='Post_by_post_type_set', null=True) - creation_date = models.DateTimeField(null=True) - score = models.IntegerField(null=True) - view_count = models.IntegerField(null=True) - body = models.TextField(null=True) - owner_user = models.ForeignKey('User', related_name='Post_by_owner_user_set', null=True) - last_editor_user = models.ForeignKey('User', related_name='Post_by_last_editor_user_set', null=True) - last_edit_date = models.DateTimeField(null=True) - last_activity_date = models.DateTimeField(null=True) - last_activity_user = models.ForeignKey('User', related_name='Post_by_last_activity_user_set', null=True) - parent = models.ForeignKey('self', related_name='Post_by_parent_set', null=True) - accepted_answer = models.ForeignKey('self', related_name='Post_by_accepted_answer_set', null=True) - title = models.CharField(max_length=250, null=True) - tags = models.CharField(max_length=150, null=True) - community_owned_date = models.DateTimeField(null=True) - history_summary = models.CharField(max_length=150, null=True) - answer_score = models.IntegerField(null=True) - answer_count = models.IntegerField(null=True) - comment_count = models.IntegerField(null=True) - favorite_count = models.IntegerField(null=True) - deletion_date = models.DateTimeField(null=True) - closed_date = models.DateTimeField(null=True) - locked_date = models.DateTimeField(null=True) - locked_duration = models.IntegerField(null=True) - owner_display_name = models.CharField(max_length=40, null=True) - last_editor_display_name = models.CharField(max_length=40, null=True) - bounty_amount = models.IntegerField(null=True) - bounty_closes = models.DateTimeField(null=True) - bounty_closed = models.DateTimeField(null=True) - last_owner_email_date = models.DateTimeField(null=True) - -class PostType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - -class SchemaVersion(models.Model): - version = models.IntegerField(null=True) - -class Setting(models.Model): - id = models.IntegerField(primary_key=True) - key = models.CharField(max_length=256, null=True) - value = models.TextField(null=True) - -class SystemMessage(models.Model): - id = models.IntegerField(primary_key=True) - user = models.ForeignKey('User', related_name='SystemMessage_by_user_set', null=True) - creation_date = models.DateTimeField(null=True) - text = models.TextField(null=True) - deletion_date = models.DateTimeField(null=True) - deletion_user = models.ForeignKey('User', related_name='SystemMessage_by_deletion_user_set', null=True) - -class Tag(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - count = models.IntegerField(null=True) - user = models.ForeignKey('User', related_name='Tag_by_user_set', null=True) - creation_date = models.DateTimeField(null=True) - is_moderator_only = models.NullBooleanField(null=True) - is_required = models.NullBooleanField(null=True) - aliases = models.CharField(max_length=200, null=True) - -class ThemeResource(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - value = models.TextField(null=True) - content_type = models.CharField(max_length=50, null=True) - version = models.CharField(max_length=6, null=True) - -class ThemeTextResource(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - value = models.TextField(null=True) - content_type = models.CharField(max_length=50, null=True) - -class ThrottleBucket(models.Model): - id = models.IntegerField(primary_key=True) - type = models.CharField(max_length=256, null=True) - ip_address = models.CharField(max_length=64, null=True) - tokens = models.IntegerField(null=True) - last_update = models.DateTimeField(null=True) - -class UserHistoryType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - -class UserHistory(models.Model): - id = models.IntegerField(primary_key=True) - user_history_type = models.ForeignKey('UserHistoryType', related_name='UserHistory_by_user_history_type_set', null=True) - creation_date = models.DateTimeField(null=True) - ip_address = models.CharField(max_length=40, null=True) - user = models.ForeignKey('User', related_name='UserHistory_by_user_set', null=True) - comment = models.CharField(max_length=400, null=True) - user_display_name = models.CharField(max_length=40, null=True) - moderator_user = models.ForeignKey('User', related_name='UserHistory_by_moderator_user_set', null=True) - reputation = models.IntegerField(null=True) - -class User2Badge(models.Model): - id = models.IntegerField(primary_key=True) - user = models.ForeignKey('User', related_name='User2Badge_by_user_set', null=True) - badge = models.ForeignKey('Badge', related_name='User2Badge_by_badge_set', null=True) - date = models.DateTimeField(null=True) - comment = models.CharField(max_length=50, null=True) - -class User2Vote(models.Model): - id = models.IntegerField(primary_key=True) - user = models.ForeignKey('User', related_name='User2Vote_by_user_set', null=True) - vote_type = models.ForeignKey('VoteType', related_name='User2Vote_by_vote_type_set', null=True) - target_user = models.ForeignKey('User', related_name='User2Vote_by_target_user_set', null=True) - creation_date = models.DateTimeField(null=True) - deletion_date = models.DateTimeField(null=True) - ip_address = models.CharField(max_length=40, null=True) - -class User(models.Model): - id = models.IntegerField(primary_key=True) - user_type = models.ForeignKey('UserType', related_name='User_by_user_type_set', null=True) - open_id = models.CharField(max_length=200, null=True) - reputation = models.IntegerField(null=True) - views = models.IntegerField(null=True) - creation_date = models.DateTimeField(null=True) - last_access_date = models.DateTimeField(null=True) - has_replies = models.NullBooleanField(null=True) - has_message = models.NullBooleanField(null=True) - opt_in_email = models.NullBooleanField(null=True) - opt_in_recruit = models.NullBooleanField(null=True) - last_login_date = models.DateTimeField(null=True) - last_email_date = models.DateTimeField(null=True) - last_login_ip = models.CharField(max_length=15, null=True) - open_id_alt = models.CharField(max_length=200, null=True) - email = models.CharField(max_length=100, null=True) - display_name = models.CharField(max_length=40, null=True) - display_name_cleaned = models.CharField(max_length=40, null=True) - website_url = models.CharField(max_length=200, null=True) - real_name = models.CharField(max_length=100, null=True) - location = models.CharField(max_length=100, null=True) - birthday = models.DateTimeField(null=True) - badge_summary = models.CharField(max_length=50, null=True) - about_me = models.TextField(null=True) - preferences_raw = models.TextField(null=True) - timed_penalty_date = models.DateTimeField(null=True) - guid = models.CharField(max_length=64, null=True) - phone = models.CharField(max_length=20, null=True) - password_id = models.IntegerField(null=True) - -class UserType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - -class VoteType(models.Model): - id = models.IntegerField(primary_key=True) - name = models.CharField(max_length=50, null=True) - description = models.CharField(max_length=300, null=True) - diff --git a/stackexchange/parse_models.py b/stackexchange/parse_models.py deleted file mode 100644 index 64796e57..00000000 --- a/stackexchange/parse_models.py +++ /dev/null @@ -1,225 +0,0 @@ -from xml.etree import ElementTree as et -import sys -import re -import os -if __name__ != '__main__':#hack do not import models if run as script - from django.db import models -from datetime import datetime - -table_prefix = ''#StackExchange or something, if needed -date_time_format = '%Y-%m-%dT%H:%M:%S' #note that fractional part of second is lost -time_re = re.compile(r'(\.[\d]+)?$') -loader_app_name = os.path.dirname(__file__) - -types = { - 'unsignedByte':'models.IntegerField', - 'FK':'models.ForeignKey', - 'PK':'models.IntegerField', - 'string':'models.CharField', - 'text':'models.TextField', - 'int':'models.IntegerField', - 'boolean':'models.NullBooleanField', - 'dateTime':'models.DateTimeField', - 'base64Binary':'models.TextField', - 'double':'models.IntegerField', -} - -def camel_to_python(camel): - """http://stackoverflow.com/questions/1175208/ - """ - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - -def singular(word): - if word.endswith('s'): - return word[:-1] - else: - return word - -def get_table_name(name): - """Determine db table name - from the basename of the .xml file - """ - out = table_prefix - if name.find('2') == -1: - out += singular(name) - else: - bits = name.split('2') - bits = map(singular, bits) - out += '2'.join(bits) - return out - -class DjangoModel(object): - def __init__(self, name): - self.name = get_table_name(name) - self.fields = [] - def add_field(self,field): - field.table = self - self.fields.append(field) - def __str__(self): - out = 'class %s(models.Model):\n' % self.name - for f in self.fields: - out += ' %s\n' % str(f) - return out - -class DjangoField(object): - def __init__(self, name, type, restriction = None): - self.name = camel_to_python(name) - if self.name == 'class': - self.name = 'class_type'#work around python keyword - self.type = type - self.table = None - self.restriction = restriction - self.relation = None - - def __str__(self): - out = '%s = %s(' % (self.name, types[self.type]) - if self.type == 'FK': - out += "'%s'" % self.relation - out += ", related_name='%s_by_%s_set'" % (self.table.name, self.name) - out += ', null=True'#nullable to make life easier - elif self.type == 'PK': - out += 'primary_key=True' - elif self.restriction != -1: - if self.type == 'string': - out += 'max_length=%s' % self.restriction - out += ', null=True' - else: - raise Exception('restriction (max_length) supported only for string type') - else: - out += 'null=True' - out += ')' - return out - - def get_type(self): - return self.type - -class DjangoPK(DjangoField): - def __init__(self): - self.name = 'id' - self.type = 'PK' - -class DjangoFK(DjangoField): - def __init__(self, source_name): - bits = source_name.split('Id') - if len(bits) == 2 and bits[1] == '': - name = bits[0] - super(DjangoFK, self).__init__(name, 'FK') - self.set_relation(name) - - def set_relation(self, name): - """some relations need to be mapped - to actual tables - """ - self.relation = table_prefix - if name.endswith('User'): - self.relation += 'User' - elif name.endswith('Post'): - self.relation += 'Post' - elif name in ('AcceptedAnswer','Parent'): - self.relation = 'self' #self-referential Post model - else: - self.relation += name - def get_relation(self): - return self.relation - -def get_col_type(col): - type = col.get('type') - restriction = -1 - if type == None: - type_e = col.find('.//simpleType/restriction') - type = type_e.get('base') - try: - restriction = int(type_e.getchildren()[0].get('value')) - except: - restriction = -1 - if restriction > 400: - type = 'text' - restriction = -1 - return type, restriction - -def make_field_from_xml_tree(xml_element): - """used by the model parser - here we need to be detailed about field types - because this defines the database schema - """ - name = xml_element.get('name') - if name == 'LinkedVoteId':#not used - return None - if name == 'Id': - field = DjangoPK() - elif name.endswith('Id') and name not in ('OpenId','PasswordId'): - field = DjangoFK(name) - elif name.endswith('GUID'): - field = DjangoField(name, 'string', 64) - else: - type, restriction = get_col_type(xml_element) - field = DjangoField(name, type, restriction) - return field - -def parse_field_name(input): - """used by the data reader - - The problem is that I've scattered - code for determination of field name over three classes: - DjangoField, DjangoPK and DjangoFK - so the function actually cretes fake field objects - many time over - """ - if input == 'Id': - return DjangoPK().name - elif input in ('OpenId', 'PasswordId'): - return DjangoField(input, 'string', 7).name#happy fake field - elif input.endswith('Id'): - return DjangoFK(input).name#real FK field - else: - return DjangoField(input, 'string', 7).name#happy fake field - -def parse_value(input, field_object): - if isinstance(field_object, models.ForeignKey): - try: - id = int(input) - except: - raise Exception('non-numeric foreign key %s' % input) - related_model = field_object.rel.to - try: - return related_model.objects.get(id=id) - except related_model.DoesNotExist: - obj = related_model(id=id) - obj.save()#save fake empty object - return obj - elif isinstance(field_object, models.IntegerField): - try: - return int(input) - except: - raise Exception('expected integer, found %s' % input) - elif isinstance(field_object, models.CharField): - return input - elif isinstance(field_object, models.TextField): - return input - elif isinstance(field_object, models.BooleanField): - try: - return bool(input) - except: - raise Exception('boolean value expected %s found' % input) - elif isinstance(field_object, models.DateTimeField): - input = time_re.sub('', input) - try: - return datetime.strptime(input, date_time_format) - except: - raise Exception('datetime expected "%s" found' % input) - -print 'from django.db import models' -for file in sys.argv: - if '.xsd' in file: - tname = os.path.basename(file).replace('.xsd','') - tree = et.parse(file) - - model = DjangoModel(tname) - - row = tree.find('.//sequence') - for col in row.getchildren(): - field = make_field_from_xml_tree(col) - if field: - model.add_field(field) - print model |