summaryrefslogtreecommitdiffstats
path: root/forum/importers
diff options
context:
space:
mode:
Diffstat (limited to 'forum/importers')
-rw-r--r--forum/importers/__init__.py0
-rw-r--r--forum/importers/stackexchange/ANOMALIES14
-rw-r--r--forum/importers/stackexchange/README62
-rw-r--r--forum/importers/stackexchange/__init__.py0
-rw-r--r--forum/importers/stackexchange/management/__init__.py0
-rw-r--r--forum/importers/stackexchange/management/commands/__init__.py0
-rw-r--r--forum/importers/stackexchange/management/commands/load_stackexchange.py804
-rw-r--r--forum/importers/stackexchange/models.py266
-rw-r--r--forum/importers/stackexchange/parse_models.py225
9 files changed, 1371 insertions, 0 deletions
diff --git a/forum/importers/__init__.py b/forum/importers/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/forum/importers/__init__.py
diff --git a/forum/importers/stackexchange/ANOMALIES b/forum/importers/stackexchange/ANOMALIES
new file mode 100644
index 00000000..05a7dbdb
--- /dev/null
+++ b/forum/importers/stackexchange/ANOMALIES
@@ -0,0 +1,14 @@
+* several user accounts with same email
+* users with no openid
+* users with no email (hack: gravatar set to settings.ANONYMOUS_USER_EMAIL)
+* users with no screen name
+* users with no email and no screen name (25% in homeschool)
+* tag preferences are not stored explicitly (interesting/ignored)
+ maybe they are in se.User.preferences_raw
+ but the data there is not marked up and is kind of cryptic
+* we don't have Community user. SE has one with id=-1
+ this id may break the load script
+ potential break places are anywhere where is X.get_user() call
+ issues may happen with larger data sets where activity
+ of user "Community" is somehow reflected in a way
+ that load_stackexchange does not take care of
diff --git a/forum/importers/stackexchange/README b/forum/importers/stackexchange/README
new file mode 100644
index 00000000..64d8f5fb
--- /dev/null
+++ b/forum/importers/stackexchange/README
@@ -0,0 +1,62 @@
+this app's function will be to:
+
+* install it's own tables (#todo: not yet automated)
+* read SE xml dump into DjangoDB (automated)
+* populate osqa database (automated)
+* remove SE tables (#todo: not done yet)
+
+Current process to load SE data into OSQA is:
+==============================================
+
+1) backup database
+
+2) unzip SE dump into dump_dir (any directory name)
+ you may want to make sure that your dump directory in .gitignore file
+ so that you don't publish it by mistake
+
+3) enable 'stackexchange' in the list of installed apps (probably aready in settings.py)
+
+4) (optional - create models.py for SE, which is included anyway) run:
+
+ #a) run in-place removal of xml namspace prefix to make parsing easier
+ perl -pi -w -e 's/xs://g' $SE_DUMP_PATH/xsd/*.xsd
+ cd stackexchange
+ python parse_models.py $SE_DUMP_PATH/xsd/*.xsd > models.py
+
+5) Install stackexchange models (as well as any other missing models)
+ python manage.py syncdb
+
+6) make sure that osqa badges are installed
+ if not, run (example for mysql):
+
+ mysql -u user -p dbname < sql_scripts/badges.sql
+
+7) load SE data:
+
+ python manage.py load_stackexchange dump_dir
+
+ if anything doesn't go right - run 'python manage.py flush' and repeat
+ steps 6 and 7
+
+NOTES:
+============
+
+Here is the load script that I used for the testing
+it assumes that SE dump has been unzipped inside the tmp directory
+
+ #!/bin/sh$
+ python manage.py flush
+ #delete all data
+ mysql -u osqa -p osqa < sql_scripts/badges.sql
+ python manage.py load_stackexchange tmp
+
+Untested parts are tagged with comments starting with
+#todo:
+
+The test set did not have all the usage cases of StackExchange represented so
+it may break with other sets.
+
+The job takes some time to run, especially
+content revisions and votes - may be optimized
+
+Some of the fringe cases are described in file stackexchange/ANOMALIES
diff --git a/forum/importers/stackexchange/__init__.py b/forum/importers/stackexchange/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/forum/importers/stackexchange/__init__.py
diff --git a/forum/importers/stackexchange/management/__init__.py b/forum/importers/stackexchange/management/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/forum/importers/stackexchange/management/__init__.py
diff --git a/forum/importers/stackexchange/management/commands/__init__.py b/forum/importers/stackexchange/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/forum/importers/stackexchange/management/commands/__init__.py
diff --git a/forum/importers/stackexchange/management/commands/load_stackexchange.py b/forum/importers/stackexchange/management/commands/load_stackexchange.py
new file mode 100644
index 00000000..ee22e33a
--- /dev/null
+++ b/forum/importers/stackexchange/management/commands/load_stackexchange.py
@@ -0,0 +1,804 @@
+from django.core.management.base import BaseCommand
+from django.template.defaultfilters import slugify #todo: adopt unicode-aware slugify
+#todo: http://stackoverflow.com/questions/837828/how-to-use-a-slug-in-django
+import os
+import re
+import sys
+import forum.importers.stackexchange.parse_models as se_parser
+from xml.etree import ElementTree as et
+from django.db import models
+import forum.models as askbot
+import forum.importers.stackexchange.models as se
+from forum.forms import EditUserEmailFeedsForm
+from forum.utils.html import sanitize_html
+from django.conf import settings
+from django.contrib.auth.models import Message as DjangoMessage
+from django.utils.translation import ugettext as _
+#from markdown2 import Markdown
+#markdowner = Markdown(html4tags=True)
+
+xml_read_order = (
+ 'VoteTypes','UserTypes','Users','Users2Votes',
+ 'Badges','Users2Badges','CloseReasons','FlatPages',
+ 'MessageTypes','PostHistoryTypes','PostTypes','SchemaVersion',
+ 'Settings','SystemMessages','ThemeResources','ThemeTextResources',
+ 'ThrottleBucket','UserHistoryTypes','UserHistory',
+ 'Users2Badges','VoteTypes','Users2Votes','MessageTypes',
+ 'Posts','Posts2Votes','PostHistory','PostComments',
+ 'ModeratorMessages','Messages','Comments2Votes',
+ )
+
+#association tables SE item id --> ASKBOT item id
+#table associations are implied
+#todo: there is an issue that these may be inconsistent with the database
+USER = {}#SE User.id --> django(ASKBOT) User.id
+QUESTION = {}
+ANSWER = {}
+NAMESAKE_COUNT = {}# number of times user name was used - for X.get_screen_name
+
+class X(object):#
+ """class with methods for handling some details
+ of SE --> ASKBOT mapping
+ """
+ badge_type_map = {'1':'gold','2':'silver','3':'bronze'}
+
+ askbot_supported_id_providers = (
+ 'google','yahoo','aol','myopenid',
+ 'flickr','technorati',
+ 'wordpress','blogger','livejournal',
+ 'claimid','vidoop','verisign',
+ 'openidurl','facebook','local',
+ 'twitter' #oauth is not on this list, b/c it has no own url
+ )
+
+ #map SE VoteType -> askbot.User vote method
+ #created methods with the same call structure in askbot.User
+ #User.<vote_method>(post, timestamp=None, cancel=False)
+ vote_actions = {
+ 'UpMod':'upvote',
+ 'DownMod':'downvote',
+ 'AcceptedByOriginator':'accept_answer',
+ 'Offensive':'flag_post',
+ 'Favorite':'toggle_favorite_question',
+ }
+
+ #these modes cannot be mixed
+ #only wiki is assumed to be mixable
+ exclusive_revision_modes = (
+ 'initial','edit','rollback','lock',
+ 'migrate','close','merge','delete',
+ )
+
+ #badges whose names don't match exactly, but
+ #present in both SE and ASKBOT
+ badge_exceptions = {# SE --> ASKBOT
+ 'Citizen Patrol':'Citizen patrol',#single #todo: why sentence case?
+ 'Strunk &amp; White':'Strunk & White',#single
+ 'Civic Duty':'Civic duty',
+ }
+
+ revision_type_map = {
+ 'Initial Title':'initial',
+ 'Initial Body':'initial',
+ 'Initial Tags':'initial',
+ 'Edit Title':'edit',
+ 'Edit Body':'edit',
+ 'Edit Tags':'edit',
+ 'Rollback Title':'rollback',
+ 'Rollback Body':'rollback',
+ 'Rollback Tags':'rollback',
+ 'Post Closed':'close',
+ 'Post Reopened':'close',
+ 'Post Deleted':'delete',
+ 'Post Undeleted':'delete',
+ 'Post Locked':'lock',
+ 'Post Unlocked':'lock',
+ 'Community Owned':'wiki',
+ 'Post Migrated':'migrate',
+ 'Question Merged':'merge',
+ }
+
+ close_reason_map = {
+ 1:1,#duplicate
+ 2:2,#off-topic
+ 3:3,#subjective and argumentative
+ 4:4,#not a real question
+ 5:7,#offensive
+ 6:6,#irrelevant or outdated question
+ 7:9,#too localized
+ 10:8,#spam
+ }
+
+ @classmethod
+ def get_message_text(cls, se_m):
+ """try to intelligently translate
+ SE message to ASKBOT so that it makese sense in
+ our context
+ """
+ #todo: properly translate messages
+ #todo: maybe work through more instances of messages
+ if se_m.message_type.name == 'Badge Notification':
+ return se_m.text
+ else:
+ if 'you are now an administrator' in se_m.text:
+ return _('Congratulations, you are now an Administrator')
+ elif re.search(r'^You have \d+ new',se_m.text):
+ bits = se_m.text.split('.')
+ text = bits[0]
+ if se_m.user.id == -1:
+ return None
+ url = cls.get_user(se_m.user).get_profile_url()
+ return '<a href="%s?sort=responses">%s</a>' % (url,text)
+ return None
+
+ @classmethod
+ def get_post(cls, se_post):
+ #todo: fix this hack - either in-memory id association table
+ #or use database to store these associations
+ post_type = se_post.post_type.name
+ if post_type == 'Question':
+ return askbot.Question.objects.get(id=QUESTION[se_post.id].id)
+ elif post_type == 'Answer':
+ return askbot.Answer.objects.get(id=ANSWER[se_post.id].id)
+ else:
+ raise Exception('unknown post type %s' % post_type)
+
+ @classmethod
+ def get_close_reason(cls, se_reason):
+ #todo: this is a guess - have not seen real data
+ se_reason = int(se_reason)
+ return cls.close_reason_map[se_reason]
+
+ @classmethod
+ def get_user(cls, se_user):
+ #todo: same as get_post
+ return askbot.User.objects.get(id=USER[se_user.id].id)
+
+ @classmethod
+ def get_post_revision_group_types(cls, rev_group):
+ rev_types = {}
+ for rev in rev_group:
+ rev_type = cls.get_post_revision_type(rev)
+ rev_types[rev_type] = 1
+ rev_types = rev_types.keys()
+
+ #make sure that exclusive rev modes are not mixed
+ exclusive = cls.exclusive_revision_modes
+ if len(rev_types) > 1 and all([t in exclusive for t in rev_types]):
+ tstr = ','.join(rev_types)
+ gstr = ','.join([str(rev.id) for rev in rev_group])
+ msg = 'incompatible revision types %s in PostHistory %s' % (tstr,gstr)
+ raise Exception(msg)
+ return rev_types
+
+ @classmethod
+ def clean_tags(cls, tags):
+ tags = re.subn(r'\s+',' ',tags.strip())[0]
+ bits = tags.split(' ')
+ tags = ' '.join([bit[1:-1] for bit in bits])
+ tags = re.subn(r'\xf6','-',tags)[0]
+ return tags
+
+ @classmethod
+ def get_screen_name(cls, name):
+ """always returns unique screen name
+ even if there are multiple users in SE
+ with the same exact screen name
+ """
+ if name is None:
+ name = 'anonymous'
+ name = name.strip()
+ name = re.subn(r'\s+',' ',name)[0]#remove repeating spaces
+
+ try:
+ u = askbot.User.objects.get(username = name)
+ try:
+ if u.location:
+ name += ', %s' % u.location
+ if name in NAMESAKE_COUNT:
+ NAMESAKE_COUNT[name] += 1
+ name += ' %d' % NAMESAKE_COUNT[name]
+ else:
+ NAMESAKE_COUNT[name] = 1
+ except askbot.User.DoesNotExist:
+ pass
+ except askbot.User.DoesNotExist:
+ NAMESAKE_COUNT[name] = 1
+ return name
+
+ @classmethod
+ def get_email(cls, email):#todo: fix fringe case - user did not give email!
+ if email is None:
+ return settings.ANONYMOUS_USER_EMAIL
+ else:
+ assert(email != '')
+ return email
+
+ @classmethod
+ def get_post_revision_type(cls, rev):
+ rev_name = rev.post_history_type.name
+ rev_type = cls.revision_type_map.get(rev_name, None)
+ if rev_type is None:
+ raise Exception('dont understand revision type %s' % rev)
+ return rev_type
+
+ #crude method of getting id provider name from the url
+ @classmethod
+ def get_openid_provider_name(cls, openid_url):
+ openid_str = str(openid_url)
+ bits = openid_str.split('/')
+ base_url = bits[2] #assume this is base url
+ url_bits = base_url.split('.')
+ provider_name = url_bits[-2].lower()
+ if provider_name not in cls.askbot_supported_id_providers:
+ raise Exception('could not determine login provider for %s' % openid_url)
+ return provider_name
+
+ @staticmethod
+ def blankable(input):
+ if input is None:
+ return ''
+ else:
+ return input
+
+ @classmethod
+ def parse_badge_summary(cls, badge_summary):
+ (gold,silver,bronze) = (0,0,0)
+ if badge_summary:
+ if len(badge_summary) > 3:
+ print 'warning: guessing that badge summary is comma separated'
+ print 'have %s' % badge_summary
+ bits = badge_summary.split(',')
+ else:
+ bits = [badge_summary]
+ for bit in bits:
+ m = re.search(r'^(?P<type>[1-3])=(?P<count>\d+)$', bit)
+ if not m:
+ raise Exception('could not parse badge summary: %s' % badge_summary)
+ else:
+ badge_type = cls.badge_type_map[m.groupdict()['type']]
+ locals()[badge_type] = int(m.groupdict()['count'])
+ return (gold,silver,bronze)
+
+ @classmethod
+ def get_badge_name(cls, name):
+ return cls.badge_exceptions.get(name, name)
+
+class Command(BaseCommand):
+ help = 'Loads StackExchange data from unzipped directory of XML files into the ASKBOT database'
+ args = 'se_dump_dir'
+
+ def handle(self, *arg, **kwarg):
+ if len(arg) < 1 or not os.path.isdir(arg[0]):
+ print 'Error: first argument must be a directory with all the SE *.xml files'
+ sys.exit(1)
+
+ self.dump_path = arg[0]
+ #read the data into SE tables
+ for xml in xml_read_order:
+ xml_path = self.get_xml_path(xml)
+ table_name = self.get_table_name(xml)
+ self.load_xml_file(xml_path, table_name)
+
+ #this is important so that when we clean up messages
+ #automatically generated by the procedures below
+ #we do not delete old messages
+ #todo: unfortunately this may need to be redone
+ #when we upgrade to django 1.2 and definitely by 1.4 when
+ #the current message system will be replaced with the
+ #django messages framework
+ self.save_askbot_message_id_list()
+
+ #transfer data into ASKBOT tables
+ print 'Transferring users...',
+ sys.stdout.flush()
+ self.transfer_users()
+ print 'done.'
+ print 'Transferring content edits...',
+ sys.stdout.flush()
+ self.transfer_question_and_answer_activity()
+ print 'done.'
+ print 'Transferring view counts...',
+ sys.stdout.flush()
+ self.transfer_question_view_counts()
+ print 'done.'
+ print 'Transferring comments...',
+ sys.stdout.flush()
+ self.transfer_comments()
+ print 'done.'
+ print 'Transferring badges and badge awards...',
+ sys.stdout.flush()
+ self.transfer_badges()
+ print 'done.'
+ print 'Transferring votes...',
+ sys.stdout.flush()
+ self.transfer_votes()#includes favorites, accepts and flags
+ print 'done.'
+
+ self.cleanup_messages()#delete autogenerated messages
+ self.transfer_messages()
+
+ #todo: these are not clear how to go about
+ self.transfer_update_subscriptions()
+ self.transfer_tag_preferences()
+ self.transfer_meta_pages()
+
+ def save_askbot_message_id_list(self):
+ id_list = list(DjangoMessage.objects.all().values('id'))
+ self._askbot_message_id_list = id_list
+
+ def cleanup_messages(self):
+ """deletes messages generated by the load process
+ """
+ id_list = self._askbot_message_id_list
+ mset = DjangoMessage.objects.all().exclude(id__in=id_list)
+ mset.delete()
+
+ def transfer_messages(self):
+ """transfers some messages from
+ SE to ASKBOT
+ """
+ for m in se.Message.objects.all():
+ if m.is_read:
+ continue
+ if m.user.id == -1:
+ continue
+ u = X.get_user(m.user)
+ text = X.get_message_text(m)
+ if text:
+ u.message_set.create(
+ message=text,
+ )
+
+ def _process_post_initial_revision_group(self, rev_group):
+
+ title = None
+ text = None
+ tags = None
+ wiki = False
+ author = USER[rev_group[0].user.id]
+ added_at = rev_group[0].creation_date
+
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type == 'Initial Title':
+ title = rev.text
+ elif rev_type == 'Initial Body':
+ text = rev.text
+ elif rev_type == 'Initial Tags':
+ tags = X.clean_tags(rev.text)
+ elif rev_type == 'Community Owned':
+ wiki = True
+ else:
+ raise Exception('unexpected revision type %s' % rev_type)
+
+ post_type = rev_group[0].post.post_type.name
+ if post_type == 'Question':
+ q = askbot.Question.objects.create_new(
+ title = title,
+ author = author,
+ added_at = added_at,
+ wiki = wiki,
+ tagnames = tags,
+ text = text
+ )
+ QUESTION[rev_group[0].post.id] = q
+ elif post_type == 'Answer':
+ q = X.get_post(rev_group[0].post.parent)
+ a = askbot.Answer.objects.create_new(
+ question = q,
+ author = author,
+ added_at = added_at,
+ wiki = wiki,
+ text = text,
+ )
+ ANSWER[rev_group[0].post.id] = a
+ else:
+ post_id = rev_group[0].post.id
+ raise Exception('unknown post type %s for id=%d' % (post_type, post_id))
+
+ def _process_post_edit_revision_group(self, rev_group):
+ #question apply edit
+ (title, text, tags) = (None, None, None)
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type == 'Edit Title':
+ title = rev.text
+ elif rev_type == 'Edit Body':
+ text = rev.text
+ elif rev_type == 'Edit Tags':
+ tags = X.clean_tags(rev.text)
+ elif rev_type == 'Community Owned':
+ pass
+ else:
+ raise Exception('unexpected revision type %s' % rev_type)
+
+ rev0 = rev_group[0]
+ edited_by = USER[rev0.user.id]
+ edited_at = rev0.creation_date
+ comment = ';'.join([rev.comment for rev in rev_group if rev.comment])
+ post_type = rev0.post.post_type.name
+
+ if post_type == 'Question':
+ q = X.get_post(rev0.post)
+ q.apply_edit(
+ edited_at = edited_at,
+ edited_by = edited_by,
+ title = title,
+ text = text,
+ comment = comment,
+ tags = tags,
+ )
+ elif post_type == 'Answer':
+ a = X.get_post(rev0.post)
+ a.apply_edit(
+ edited_at = edited_at,
+ edited_by = edited_by,
+ text = text,
+ comment = comment,
+ )
+
+ def _make_post_wiki(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post_history_type.name == 'Community Owned':
+ p = X.get_post(rev.post)
+ u = X.get_user(rev.user)
+ t = rev.creation_date
+ p.wiki = True
+ p.wikified_at = t
+ p.wikified_by = u
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def mark_activity(self,p,u,t):
+ """p,u,t - post, user, timestamp
+ """
+ if isinstance(p, askbot.Question):
+ p.last_activity_by = u
+ p.last_activity_at = t
+ elif isinstance(p, askbot.Answer):
+ p.question.last_activity_by = u
+ p.question.last_activity_at = t
+ p.question.save()
+
+ def _process_post_rollback_revision_group(self, rev_group):
+ #todo: don't know what to do here as there were no
+ #such data available
+ pass
+
+ def _process_post_lock_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('ocked'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Locked':
+ p.locked = True
+ p.locked_by = u
+ p.locked_at = t
+ elif rev_type == 'Post Unlocked':
+ p.locked = False
+ p.locked_by = None
+ p.locked_at = None
+ else:
+ return
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_close_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ if rev.post.post_type.name != 'Question':
+ return
+ rev_type = rev.post_history_type.name
+ if rev_type in ('Post Closed', 'Post Reopened'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Closed':
+ p.closed = True
+ p.closed_at = t
+ p.closed_by = u
+ p.close_reason = X.get_close_reason(rev.text)
+ elif rev_type == 'Post Reopened':
+ p.closed = False
+ p.closed_at = None
+ p.closed_by = None
+ p.close_reason = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_delete_revision_group(self, rev_group):
+ #todo: untested
+ for rev in rev_group:
+ rev_type = rev.post_history_type.name
+ if rev_type.endswith('eleted'):
+ t = rev.creation_date
+ u = X.get_user(rev.user)
+ p = X.get_post(rev.post)
+ if rev_type == 'Post Deleted':
+ p.deleted = True
+ p.deleted_at = t
+ p.deleted_by = u
+ elif rev_type == 'Post Undeleted':
+ p.deleted = False
+ p.deleted_at = None
+ p.deleted_by = None
+ self.mark_activity(p,u,t)
+ p.save()
+ return
+
+ def _process_post_revision_group(self, rev_group):
+ #determine revision type
+ #'initial','edit','rollback','lock',
+ #'migrate','close','merge','delete',
+ rev_types = X.get_post_revision_group_types(rev_group)
+ if 'initial' in rev_types:
+ self._process_post_initial_revision_group(rev_group)
+ elif 'edit' in rev_types:
+ self._process_post_edit_revision_group(rev_group)
+ elif 'rollback' in rev_types:
+ self._process_post_rollback_revision_group(rev_group)
+ elif 'lock' in rev_types:
+ self._process_post_lock_revision_group(rev_group)
+ elif 'close' in rev_types:
+ self._process_post_close_revision_group(rev_group)
+ elif 'delete' in rev_types:
+ self._process_post_delete_revision_group(rev_group)
+ else:
+ pass
+ #todo: rollback, lock, close and delete are
+ #not tested
+ #merge and migrate actions are ignored
+ #wiki is mixable with other groups, so process it in addition
+ if 'wiki' in rev_types:
+ self._make_post_wiki(rev_group)
+
+ def transfer_tag_preferences(self):
+ #todo: figure out where these are stored in SE
+ #maybe in se.User.preferences_raw?
+ pass
+
+ def transfer_question_and_answer_activity(self):
+ """transfers all question and answer
+ edits and related status changes
+ """
+ #assuming that there are only two post types
+ se_revs = se.PostHistory.objects.all()
+ #assuming that chronologial order is correct and there
+ #will be no problems of data integrity upon insertion of records
+ se_revs = se_revs.order_by('creation_date','revision_guid')
+ #todo: ignored fringe case - no revisions
+ c_guid = se_revs[0].revision_guid
+ c_group = []
+ #this loop groups revisions by revision id, then calls process function
+ #for the revision grup (elementary revisions posted at once)
+ for se_rev in se_revs:
+ if se_rev.revision_guid == c_guid:
+ c_group.append(se_rev)
+ else:
+ self._process_post_revision_group(c_group)
+ c_group = []
+ c_group.append(se_rev)
+ c_guid = se_rev.revision_guid
+ if len(c_group) != 0:
+ self._process_post_revision_group(c_group)
+
+ def transfer_comments(self):
+ for se_c in se.PostComment.objects.all():
+ if se_c.deletion_date:
+ print 'Warning deleted comment %d dropped' % se_c.id
+ continue
+ se_post = se_c.post
+ if se_post.post_type.name == 'Question':
+ askbot_post = QUESTION[se_post.id]
+ elif se_post.post_type.name == 'Answer':
+ askbot_post = ANSWER[se_post.id]
+
+ askbot_post.add_comment(
+ comment = se_c.text,
+ added_at = se_c.creation_date,
+ user = USER[se_c.user.id]
+ )
+
+ def _install_missing_badges(self):
+ self._missing_badges = {}
+ for se_b in se.Badge.objects.all():
+ name = X.get_badge_name(se_b.name)
+ try:
+ askbot.Badge.objects.get(name=name)
+ except:
+ self._missing_badges[name] = 0
+ if len(se_b.description) > 300:
+ print 'Warning truncated description for badge %d' % se_b.id
+ askbot.Badge.objects.create(
+ name = name,
+ slug = slugify(name),
+ description = se_b.description,
+ multiple = (not se_b.single),
+ type = se_b.class_type
+ )
+
+ def _award_badges(self):
+ #note: SE does not keep information on
+ #content-related badges like askbot does
+ for se_a in se.User2Badge.objects.all():
+ if se_a.user.id == -1:
+ continue #skip community user
+ u = USER[se_a.user.id]
+ badge_name = X.get_badge_name(se_a.badge.name)
+ b = askbot.Badge.objects.get(name=badge_name)
+ if b.multiple == False:
+ if b.award_badge.count() > 0:
+ continue
+ #todo: fake content object here b/c SE does not support this
+ #todo: but askbot requires related content object
+ askbot.Award.objects.create(
+ user=u,
+ badge=b,
+ awarded_at=se_a.date,
+ content_object=u,
+ )
+ if b.name in self._missing_badges:
+ self._missing_badges[b.name] += 1
+
+ def _cleanup_badges(self):
+ d = self._missing_badges
+ unused = [name for name in d.keys() if d[name] == 0]
+ askbot.Badge.objects.filter(name__in=unused).delete()
+ installed = [name for name in d.keys() if d[name] > 0]
+ print 'Warning - following unsupported badges were installed:'
+ print ', '.join(installed)
+
+ def transfer_badges(self):
+ #note: badge level is neglected
+ #1) install missing badges
+ self._install_missing_badges()
+ #2) award badges
+ self._award_badges()
+ #3) delete unused newly installed badges
+ self._cleanup_badges()
+ pass
+
+ def transfer_question_view_counts(self):
+ for se_q in se.Post.objects.filter(post_type__name='Question'):
+ q = X.get_post(se_q)
+ q.view_count = se_q.view_count
+ q.save()
+
+
+ def transfer_votes(self):
+ for v in se.Post2Vote.objects.all():
+ vote_type = v.vote_type.name
+ if not vote_type in X.vote_actions:
+ continue
+
+ u = X.get_user(v.user)
+ p = X.get_post(v.post)
+ m = X.vote_actions[vote_type]
+ vote_method = getattr(askbot.User, m)
+ vote_method(u, p, timestamp = v.creation_date)
+ if v.deletion_date:
+ vote_method(u, p, timestamp = v.deletion_date, cancel=True)
+
+ def transfer_update_subscriptions(self):
+ #todo: not clear where this is stored in SE
+ #maybe in se.User.preferences_raw?
+ pass
+
+ def transfer_meta_pages(self):
+ #here we actually don't have anything in the database yet
+ #so we can't do this
+ pass
+
+ def load_xml_file(self, xml_path, table_name):
+ tree = et.parse(xml_path)
+ print 'loading from %s to %s' % (xml_path, table_name) ,
+ model = models.get_model('stackexchange', table_name)
+ i = 0
+ for row in tree.findall('.//row'):
+ model_entry = model()
+ i += 1
+ for col in row.getchildren():
+ field_name = se_parser.parse_field_name(col.tag)
+ field_type = model._meta.get_field(field_name)
+ field_value = se_parser.parse_value(col.text, field_type)
+ setattr(model_entry, field_name, field_value)
+ model_entry.save()
+ print '... %d objects saved' % i
+
+ def get_table_name(self,xml):
+ return se_parser.get_table_name(xml)
+
+ def get_xml_path(self, xml):
+ xml_path = os.path.join(self.dump_path, xml + '.xml')
+ if not os.path.isfile(xml_path):
+ print 'Error: file %s not found' % xml_path
+ sys.exit(1)
+ return xml_path
+
+ def transfer_users(self):
+ for se_u in se.User.objects.all():
+ if se_u.id < 1:#skip the Community user
+ continue
+ u = askbot.User()
+ u_type = se_u.user_type.name
+ if u_type == 'Administrator':
+ u.is_superuser = True
+ elif u_type == 'Moderator':
+ u.is_staff = True
+ elif u_type not in ('Unregistered', 'Registered'):
+ raise Exception('unknown user type %s' % u_type)
+
+ #if user is not registered, no association record created
+ #we do not allow posting by users who are not authenticated
+ #probably they'll just have to "recover" their account by email
+ if u_type != 'Unregistered':
+ assert(se_u.open_id)#everybody must have open_id
+ u_auth = askbot.AuthKeyUserAssociation()
+ u_auth.key = se_u.open_id
+ u_auth.provider = X.get_openid_provider_name(se_u.open_id)
+ u_auth.added_at = se_u.creation_date
+
+ if se_u.open_id is None and se_u.email is None:
+ print 'Warning: SE user %d is not recoverable (no email or openid)'
+
+ u.reputation = 1#se_u.reputation, it's actually re-computed
+ u.last_seen = se_u.last_access_date
+ u.email = X.get_email(se_u.email)
+ u.location = X.blankable(se_u.location)
+ u.date_of_birth = se_u.birthday #dattime -> date
+ u.website = X.blankable(se_u.website_url)
+ u.about = X.blankable(se_u.about_me)
+ u.last_login = se_u.last_login_date
+ u.date_joined = se_u.creation_date
+ u.is_active = True #todo: this may not be the case
+
+ u.username = X.get_screen_name(se_u.display_name)
+ u.real_name = X.blankable(se_u.real_name)
+
+ (gold,silver,bronze) = X.parse_badge_summary(se_u.badge_summary)
+ u.gold = gold
+ u.silver = silver
+ u.bronze = bronze
+
+ #todo: we don't have these fields
+ #views - number of profile views?
+ #has_replies
+ #has_message
+ #opt_in_recruit
+ #last_login_ip
+ #open_id_alt - ??
+ #preferences_raw - not clear how to use
+ #display_name_cleaned - lowercased, srtipped name
+ #timed_penalty_date
+ #phone
+
+ #don't know how to handle these - there was no usage example
+ #password_id
+ #guid
+
+ #ignored
+ #last_email_date - this translates directly to EmailFeedSetting.reported_at
+
+ #save the data
+ u.save()
+ form = EditUserEmailFeedsForm()
+ form.reset()
+ if se_u.opt_in_email == True:#set up daily subscription on "own" items
+ form.initial['individually_selected'] = 'd'
+ form.initial['asked_by_me'] = 'd'
+ form.initial['answered_by_me'] = 'd'
+ #
+ form.save(user=u, save_unbound=True)
+
+ if 'u_auth' in locals():
+ u_auth.user = u
+ u_auth.save()
+ USER[se_u.id] = u
diff --git a/forum/importers/stackexchange/models.py b/forum/importers/stackexchange/models.py
new file mode 100644
index 00000000..a30a9859
--- /dev/null
+++ b/forum/importers/stackexchange/models.py
@@ -0,0 +1,266 @@
+from django.db import models
+class Badge(models.Model):
+ id = models.IntegerField(primary_key=True)
+ class_type = models.IntegerField(null=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.TextField(null=True)
+ single = models.NullBooleanField(null=True)
+ secret = models.NullBooleanField(null=True)
+ tag_based = models.NullBooleanField(null=True)
+ command = models.TextField(null=True)
+ award_frequency = models.IntegerField(null=True)
+
+class CloseReason(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=200, null=True)
+ description = models.CharField(max_length=256, null=True)
+ display_order = models.IntegerField(null=True)
+
+class Comment2Vote(models.Model):
+ id = models.IntegerField(primary_key=True)
+ post_comment = models.ForeignKey('PostComment', related_name='Comment2Vote_by_post_comment_set', null=True)
+ vote_type = models.ForeignKey('VoteType', related_name='Comment2Vote_by_vote_type_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ user = models.ForeignKey('User', related_name='Comment2Vote_by_user_set', null=True)
+ ip_address = models.CharField(max_length=40, null=True)
+ user_display_name = models.CharField(max_length=40, null=True)
+ deletion_date = models.DateTimeField(null=True)
+
+class FlatPage(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ url = models.CharField(max_length=128, null=True)
+ value = models.TextField(null=True)
+ content_type = models.CharField(max_length=50, null=True)
+ active = models.NullBooleanField(null=True)
+ use_master = models.NullBooleanField(null=True)
+
+class Message(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user = models.ForeignKey('User', related_name='Message_by_user_set', null=True)
+ message_type = models.ForeignKey('MessageType', related_name='Message_by_message_type_set', null=True)
+ is_read = models.NullBooleanField(null=True)
+ creation_date = models.DateTimeField(null=True)
+ text = models.TextField(null=True)
+ post = models.ForeignKey('Post', related_name='Message_by_post_set', null=True)
+
+class MessageType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
+class ModeratorMessage(models.Model):
+ id = models.IntegerField(primary_key=True)
+ message_type = models.ForeignKey('MessageType', related_name='ModeratorMessage_by_message_type_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ creation_ip_address = models.CharField(max_length=40, null=True)
+ text = models.TextField(null=True)
+ user = models.ForeignKey('User', related_name='ModeratorMessage_by_user_set', null=True)
+ post = models.ForeignKey('Post', related_name='ModeratorMessage_by_post_set', null=True)
+ deletion_date = models.DateTimeField(null=True)
+ deletion_user = models.ForeignKey('User', related_name='ModeratorMessage_by_deletion_user_set', null=True)
+ deletion_ip_address = models.CharField(max_length=40, null=True)
+ user_display_name = models.CharField(max_length=40, null=True)
+
+class PostComment(models.Model):
+ id = models.IntegerField(primary_key=True)
+ post = models.ForeignKey('Post', related_name='PostComment_by_post_set', null=True)
+ text = models.TextField(null=True)
+ creation_date = models.DateTimeField(null=True)
+ ip_address = models.CharField(max_length=15, null=True)
+ user = models.ForeignKey('User', related_name='PostComment_by_user_set', null=True)
+ user_display_name = models.CharField(max_length=30, null=True)
+ deletion_date = models.DateTimeField(null=True)
+ deletion_user = models.ForeignKey('User', related_name='PostComment_by_deletion_user_set', null=True)
+ score = models.IntegerField(null=True)
+
+class PostHistoryType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
+class PostHistory(models.Model):
+ id = models.IntegerField(primary_key=True)
+ post_history_type = models.ForeignKey('PostHistoryType', related_name='PostHistory_by_post_history_type_set', null=True)
+ post = models.ForeignKey('Post', related_name='PostHistory_by_post_set', null=True)
+ revision_guid = models.CharField(max_length=64, null=True)
+ creation_date = models.DateTimeField(null=True)
+ ip_address = models.CharField(max_length=40, null=True)
+ user = models.ForeignKey('User', related_name='PostHistory_by_user_set', null=True)
+ comment = models.CharField(max_length=400, null=True)
+ text = models.TextField(null=True)
+ user_display_name = models.CharField(max_length=40, null=True)
+ user_email = models.CharField(max_length=100, null=True)
+ user_website_url = models.CharField(max_length=200, null=True)
+
+class Post2Vote(models.Model):
+ id = models.IntegerField(primary_key=True)
+ post = models.ForeignKey('Post', related_name='Post2Vote_by_post_set', null=True)
+ user = models.ForeignKey('User', related_name='Post2Vote_by_user_set', null=True)
+ vote_type = models.ForeignKey('VoteType', related_name='Post2Vote_by_vote_type_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ deletion_date = models.DateTimeField(null=True)
+ target_user = models.ForeignKey('User', related_name='Post2Vote_by_target_user_set', null=True)
+ target_rep_change = models.IntegerField(null=True)
+ voter_rep_change = models.IntegerField(null=True)
+ comment = models.CharField(max_length=150, null=True)
+ ip_address = models.CharField(max_length=40, null=True)
+ linked_post = models.ForeignKey('Post', related_name='Post2Vote_by_linked_post_set', null=True)
+
+class Post(models.Model):
+ id = models.IntegerField(primary_key=True)
+ post_type = models.ForeignKey('PostType', related_name='Post_by_post_type_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ score = models.IntegerField(null=True)
+ view_count = models.IntegerField(null=True)
+ body = models.TextField(null=True)
+ owner_user = models.ForeignKey('User', related_name='Post_by_owner_user_set', null=True)
+ last_editor_user = models.ForeignKey('User', related_name='Post_by_last_editor_user_set', null=True)
+ last_edit_date = models.DateTimeField(null=True)
+ last_activity_date = models.DateTimeField(null=True)
+ last_activity_user = models.ForeignKey('User', related_name='Post_by_last_activity_user_set', null=True)
+ parent = models.ForeignKey('self', related_name='Post_by_parent_set', null=True)
+ accepted_answer = models.ForeignKey('self', related_name='Post_by_accepted_answer_set', null=True)
+ title = models.CharField(max_length=250, null=True)
+ tags = models.CharField(max_length=150, null=True)
+ community_owned_date = models.DateTimeField(null=True)
+ history_summary = models.CharField(max_length=150, null=True)
+ answer_score = models.IntegerField(null=True)
+ answer_count = models.IntegerField(null=True)
+ comment_count = models.IntegerField(null=True)
+ favorite_count = models.IntegerField(null=True)
+ deletion_date = models.DateTimeField(null=True)
+ closed_date = models.DateTimeField(null=True)
+ locked_date = models.DateTimeField(null=True)
+ locked_duration = models.IntegerField(null=True)
+ owner_display_name = models.CharField(max_length=40, null=True)
+ last_editor_display_name = models.CharField(max_length=40, null=True)
+ bounty_amount = models.IntegerField(null=True)
+ bounty_closes = models.DateTimeField(null=True)
+ bounty_closed = models.DateTimeField(null=True)
+ last_owner_email_date = models.DateTimeField(null=True)
+
+class PostType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
+class SchemaVersion(models.Model):
+ version = models.IntegerField(null=True)
+
+class Setting(models.Model):
+ id = models.IntegerField(primary_key=True)
+ key = models.CharField(max_length=256, null=True)
+ value = models.TextField(null=True)
+
+class SystemMessage(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user = models.ForeignKey('User', related_name='SystemMessage_by_user_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ text = models.TextField(null=True)
+ deletion_date = models.DateTimeField(null=True)
+ deletion_user = models.ForeignKey('User', related_name='SystemMessage_by_deletion_user_set', null=True)
+
+class Tag(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ count = models.IntegerField(null=True)
+ user = models.ForeignKey('User', related_name='Tag_by_user_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ is_moderator_only = models.NullBooleanField(null=True)
+ is_required = models.NullBooleanField(null=True)
+ aliases = models.CharField(max_length=200, null=True)
+
+class ThemeResource(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ value = models.TextField(null=True)
+ content_type = models.CharField(max_length=50, null=True)
+ version = models.CharField(max_length=6, null=True)
+
+class ThemeTextResource(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ value = models.TextField(null=True)
+ content_type = models.CharField(max_length=50, null=True)
+
+class ThrottleBucket(models.Model):
+ id = models.IntegerField(primary_key=True)
+ type = models.CharField(max_length=256, null=True)
+ ip_address = models.CharField(max_length=64, null=True)
+ tokens = models.IntegerField(null=True)
+ last_update = models.DateTimeField(null=True)
+
+class UserHistoryType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
+class UserHistory(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user_history_type = models.ForeignKey('UserHistoryType', related_name='UserHistory_by_user_history_type_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ ip_address = models.CharField(max_length=40, null=True)
+ user = models.ForeignKey('User', related_name='UserHistory_by_user_set', null=True)
+ comment = models.CharField(max_length=400, null=True)
+ user_display_name = models.CharField(max_length=40, null=True)
+ moderator_user = models.ForeignKey('User', related_name='UserHistory_by_moderator_user_set', null=True)
+ reputation = models.IntegerField(null=True)
+
+class User2Badge(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user = models.ForeignKey('User', related_name='User2Badge_by_user_set', null=True)
+ badge = models.ForeignKey('Badge', related_name='User2Badge_by_badge_set', null=True)
+ date = models.DateTimeField(null=True)
+ comment = models.CharField(max_length=50, null=True)
+
+class User2Vote(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user = models.ForeignKey('User', related_name='User2Vote_by_user_set', null=True)
+ vote_type = models.ForeignKey('VoteType', related_name='User2Vote_by_vote_type_set', null=True)
+ target_user = models.ForeignKey('User', related_name='User2Vote_by_target_user_set', null=True)
+ creation_date = models.DateTimeField(null=True)
+ deletion_date = models.DateTimeField(null=True)
+ ip_address = models.CharField(max_length=40, null=True)
+
+class User(models.Model):
+ id = models.IntegerField(primary_key=True)
+ user_type = models.ForeignKey('UserType', related_name='User_by_user_type_set', null=True)
+ open_id = models.CharField(max_length=200, null=True)
+ reputation = models.IntegerField(null=True)
+ views = models.IntegerField(null=True)
+ creation_date = models.DateTimeField(null=True)
+ last_access_date = models.DateTimeField(null=True)
+ has_replies = models.NullBooleanField(null=True)
+ has_message = models.NullBooleanField(null=True)
+ opt_in_email = models.NullBooleanField(null=True)
+ opt_in_recruit = models.NullBooleanField(null=True)
+ last_login_date = models.DateTimeField(null=True)
+ last_email_date = models.DateTimeField(null=True)
+ last_login_ip = models.CharField(max_length=15, null=True)
+ open_id_alt = models.CharField(max_length=200, null=True)
+ email = models.CharField(max_length=100, null=True)
+ display_name = models.CharField(max_length=40, null=True)
+ display_name_cleaned = models.CharField(max_length=40, null=True)
+ website_url = models.CharField(max_length=200, null=True)
+ real_name = models.CharField(max_length=100, null=True)
+ location = models.CharField(max_length=100, null=True)
+ birthday = models.DateTimeField(null=True)
+ badge_summary = models.CharField(max_length=50, null=True)
+ about_me = models.TextField(null=True)
+ preferences_raw = models.TextField(null=True)
+ timed_penalty_date = models.DateTimeField(null=True)
+ guid = models.CharField(max_length=64, null=True)
+ phone = models.CharField(max_length=20, null=True)
+ password_id = models.IntegerField(null=True)
+
+class UserType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
+class VoteType(models.Model):
+ id = models.IntegerField(primary_key=True)
+ name = models.CharField(max_length=50, null=True)
+ description = models.CharField(max_length=300, null=True)
+
diff --git a/forum/importers/stackexchange/parse_models.py b/forum/importers/stackexchange/parse_models.py
new file mode 100644
index 00000000..64796e57
--- /dev/null
+++ b/forum/importers/stackexchange/parse_models.py
@@ -0,0 +1,225 @@
+from xml.etree import ElementTree as et
+import sys
+import re
+import os
+if __name__ != '__main__':#hack do not import models if run as script
+ from django.db import models
+from datetime import datetime
+
+table_prefix = ''#StackExchange or something, if needed
+date_time_format = '%Y-%m-%dT%H:%M:%S' #note that fractional part of second is lost
+time_re = re.compile(r'(\.[\d]+)?$')
+loader_app_name = os.path.dirname(__file__)
+
+types = {
+ 'unsignedByte':'models.IntegerField',
+ 'FK':'models.ForeignKey',
+ 'PK':'models.IntegerField',
+ 'string':'models.CharField',
+ 'text':'models.TextField',
+ 'int':'models.IntegerField',
+ 'boolean':'models.NullBooleanField',
+ 'dateTime':'models.DateTimeField',
+ 'base64Binary':'models.TextField',
+ 'double':'models.IntegerField',
+}
+
+def camel_to_python(camel):
+ """http://stackoverflow.com/questions/1175208/
+ """
+ s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel)
+ return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+def singular(word):
+ if word.endswith('s'):
+ return word[:-1]
+ else:
+ return word
+
+def get_table_name(name):
+ """Determine db table name
+ from the basename of the .xml file
+ """
+ out = table_prefix
+ if name.find('2') == -1:
+ out += singular(name)
+ else:
+ bits = name.split('2')
+ bits = map(singular, bits)
+ out += '2'.join(bits)
+ return out
+
+class DjangoModel(object):
+ def __init__(self, name):
+ self.name = get_table_name(name)
+ self.fields = []
+ def add_field(self,field):
+ field.table = self
+ self.fields.append(field)
+ def __str__(self):
+ out = 'class %s(models.Model):\n' % self.name
+ for f in self.fields:
+ out += ' %s\n' % str(f)
+ return out
+
+class DjangoField(object):
+ def __init__(self, name, type, restriction = None):
+ self.name = camel_to_python(name)
+ if self.name == 'class':
+ self.name = 'class_type'#work around python keyword
+ self.type = type
+ self.table = None
+ self.restriction = restriction
+ self.relation = None
+
+ def __str__(self):
+ out = '%s = %s(' % (self.name, types[self.type])
+ if self.type == 'FK':
+ out += "'%s'" % self.relation
+ out += ", related_name='%s_by_%s_set'" % (self.table.name, self.name)
+ out += ', null=True'#nullable to make life easier
+ elif self.type == 'PK':
+ out += 'primary_key=True'
+ elif self.restriction != -1:
+ if self.type == 'string':
+ out += 'max_length=%s' % self.restriction
+ out += ', null=True'
+ else:
+ raise Exception('restriction (max_length) supported only for string type')
+ else:
+ out += 'null=True'
+ out += ')'
+ return out
+
+ def get_type(self):
+ return self.type
+
+class DjangoPK(DjangoField):
+ def __init__(self):
+ self.name = 'id'
+ self.type = 'PK'
+
+class DjangoFK(DjangoField):
+ def __init__(self, source_name):
+ bits = source_name.split('Id')
+ if len(bits) == 2 and bits[1] == '':
+ name = bits[0]
+ super(DjangoFK, self).__init__(name, 'FK')
+ self.set_relation(name)
+
+ def set_relation(self, name):
+ """some relations need to be mapped
+ to actual tables
+ """
+ self.relation = table_prefix
+ if name.endswith('User'):
+ self.relation += 'User'
+ elif name.endswith('Post'):
+ self.relation += 'Post'
+ elif name in ('AcceptedAnswer','Parent'):
+ self.relation = 'self' #self-referential Post model
+ else:
+ self.relation += name
+ def get_relation(self):
+ return self.relation
+
+def get_col_type(col):
+ type = col.get('type')
+ restriction = -1
+ if type == None:
+ type_e = col.find('.//simpleType/restriction')
+ type = type_e.get('base')
+ try:
+ restriction = int(type_e.getchildren()[0].get('value'))
+ except:
+ restriction = -1
+ if restriction > 400:
+ type = 'text'
+ restriction = -1
+ return type, restriction
+
+def make_field_from_xml_tree(xml_element):
+ """used by the model parser
+ here we need to be detailed about field types
+ because this defines the database schema
+ """
+ name = xml_element.get('name')
+ if name == 'LinkedVoteId':#not used
+ return None
+ if name == 'Id':
+ field = DjangoPK()
+ elif name.endswith('Id') and name not in ('OpenId','PasswordId'):
+ field = DjangoFK(name)
+ elif name.endswith('GUID'):
+ field = DjangoField(name, 'string', 64)
+ else:
+ type, restriction = get_col_type(xml_element)
+ field = DjangoField(name, type, restriction)
+ return field
+
+def parse_field_name(input):
+ """used by the data reader
+
+ The problem is that I've scattered
+ code for determination of field name over three classes:
+ DjangoField, DjangoPK and DjangoFK
+ so the function actually cretes fake field objects
+ many time over
+ """
+ if input == 'Id':
+ return DjangoPK().name
+ elif input in ('OpenId', 'PasswordId'):
+ return DjangoField(input, 'string', 7).name#happy fake field
+ elif input.endswith('Id'):
+ return DjangoFK(input).name#real FK field
+ else:
+ return DjangoField(input, 'string', 7).name#happy fake field
+
+def parse_value(input, field_object):
+ if isinstance(field_object, models.ForeignKey):
+ try:
+ id = int(input)
+ except:
+ raise Exception('non-numeric foreign key %s' % input)
+ related_model = field_object.rel.to
+ try:
+ return related_model.objects.get(id=id)
+ except related_model.DoesNotExist:
+ obj = related_model(id=id)
+ obj.save()#save fake empty object
+ return obj
+ elif isinstance(field_object, models.IntegerField):
+ try:
+ return int(input)
+ except:
+ raise Exception('expected integer, found %s' % input)
+ elif isinstance(field_object, models.CharField):
+ return input
+ elif isinstance(field_object, models.TextField):
+ return input
+ elif isinstance(field_object, models.BooleanField):
+ try:
+ return bool(input)
+ except:
+ raise Exception('boolean value expected %s found' % input)
+ elif isinstance(field_object, models.DateTimeField):
+ input = time_re.sub('', input)
+ try:
+ return datetime.strptime(input, date_time_format)
+ except:
+ raise Exception('datetime expected "%s" found' % input)
+
+print 'from django.db import models'
+for file in sys.argv:
+ if '.xsd' in file:
+ tname = os.path.basename(file).replace('.xsd','')
+ tree = et.parse(file)
+
+ model = DjangoModel(tname)
+
+ row = tree.find('.//sequence')
+ for col in row.getchildren():
+ field = make_field_from_xml_tree(col)
+ if field:
+ model.add_field(field)
+ print model