diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2010-03-04 21:47:03 -0500 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2010-03-04 21:47:03 -0500 |
commit | b807b27c8c3ca1a27c1467ca54885f04ed4d6fa0 (patch) | |
tree | dbaab41324014da414d831adad48455b3c26608a /stackexchange | |
parent | 153fe986ea1c25d6c09b89368d71103aa76f0f23 (diff) | |
download | askbot-b807b27c8c3ca1a27c1467ca54885f04ed4d6fa0.tar.gz askbot-b807b27c8c3ca1a27c1467ca54885f04ed4d6fa0.tar.bz2 askbot-b807b27c8c3ca1a27c1467ca54885f04ed4d6fa0.zip |
SE data seems to load into stackexchange tables
Diffstat (limited to 'stackexchange')
-rw-r--r-- | stackexchange/README | 10 | ||||
-rw-r--r-- | stackexchange/management/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/__init__.py | 0 | ||||
-rw-r--r-- | stackexchange/management/commands/load_stackexchange.py | 59 | ||||
-rw-r--r-- | stackexchange/models.py | 502 | ||||
-rw-r--r-- | stackexchange/parse_models.py | 145 |
6 files changed, 450 insertions, 266 deletions
diff --git a/stackexchange/README b/stackexchange/README index 3eb1fea9..f842e891 100644 --- a/stackexchange/README +++ b/stackexchange/README @@ -1,14 +1,14 @@ this app's function will be to: * install it's own tables <--- done -* read SE xml dump into DjangoDB +* read SE xml dump into DjangoDB <--- done * populate osqa database * remove SE tables So far models are automatically created via: 1) add 'stackexchange' to the list of installed apps -2) type commands +2) run: #in-place removal of xml namspace prefix to make parsing easier perl -pi -w -e 's/xs://g' $SE_DUMP_PATH/xsd/*.xsd @@ -17,3 +17,9 @@ So far models are automatically created via: python parse_models.py $SE_DUMP_PATH/xsd/*.xsd > models.py cd .. python manage.py syncdb + +3) run: + + python manage.py load_stackexchange + + does not yet populate final data diff --git a/stackexchange/management/__init__.py b/stackexchange/management/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/stackexchange/management/__init__.py diff --git a/stackexchange/management/commands/__init__.py b/stackexchange/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/stackexchange/management/commands/__init__.py diff --git a/stackexchange/management/commands/load_stackexchange.py b/stackexchange/management/commands/load_stackexchange.py new file mode 100644 index 00000000..42ccc9f9 --- /dev/null +++ b/stackexchange/management/commands/load_stackexchange.py @@ -0,0 +1,59 @@ +from django.core.management.base import BaseCommand +import os +import sys +import stackexchange.parse_models as se_parser +from xml.etree import ElementTree as et +from django.db import models + +xml_read_order = ( + 'VoteTypes','UserTypes','Users','Users2Votes', + 'Badges','Users2Badges','CloseReasons','FlatPages', + 'MessageTypes','PostHistoryTypes','PostTypes','SchemaVersion', + 'Settings','SystemMessages','ThemeResources','ThemeTextResources', + 
'ThrottleBucket','UserHistoryTypes','UserHistory', + 'Users2Badges','VoteTypes','Users2Votes','MessageTypes', + 'Posts','Posts2Votes','PostHistory','PostComments', + 'ModeratorMessages','Messages','Comments2Votes', + ) + + +class Command(BaseCommand): + help = 'Loads StackExchange data from unzipped directory of XML files into the OSQA database' + args = 'se_dump_dir' + + def handle(self, *arg, **kwarg): + if len(arg) < 1 or not os.path.isdir(arg[0]): + print 'Error: first argument must be a directory with all the SE *.xml files' + sys.exit(1) + + self.dump_path = arg[0] + for xml in xml_read_order: + xml_path = self.get_xml_path(xml) + table_name = self.get_table_name(xml) + self.load_xml_file(xml_path, table_name) + + def load_xml_file(self, xml_path, table_name): + tree = et.parse(xml_path) + print 'loading from %s to %s' % (xml_path, table_name) , + model = models.get_model('stackexchange', table_name) + i = 0 + for row in tree.findall('.//row'): + model_entry = model() + i += 1 + for col in row.getchildren(): + field_name = se_parser.parse_field_name(col.tag) + field_type = model._meta.get_field(field_name) + field_value = se_parser.parse_value(col.text, field_type) + setattr(model_entry, field_name, field_value) + model_entry.save() + print '... 
%d objects saved' % i + + def get_table_name(self,xml): + return se_parser.get_table_name(xml) + + def get_xml_path(self, xml): + xml_path = os.path.join(self.dump_path, xml + '.xml') + if not os.path.isfile(xml_path): + print 'Error: file %s not found' % xml_path + sys.exit(1) + return xml_path diff --git a/stackexchange/models.py b/stackexchange/models.py index 28b2dda6..a30a9859 100644 --- a/stackexchange/models.py +++ b/stackexchange/models.py @@ -1,240 +1,266 @@ from django.db import models -class StackExchangeBadge(models.Model): - class_type = models.IntegerField() - name = models.CharField(max_length=50) - description = models.TextField() - single = models.BooleanField() - secret = models.BooleanField() - tag_based = models.BooleanField() - command = models.TextField() - award_frequency = models.IntegerField() - -class StackExchangeCloseReason(models.Model): - name = models.CharField(max_length=200) - description = models.CharField(max_length=256) - display_order = models.IntegerField() - -class StackExchangeComment2Vote(models.Model): - post_comment = models.ForeignKey('StackExchangePostComment', related_name='StackExchangeComment2Vote_post_comment_set', null=True) - vote_type = models.ForeignKey('StackExchangeVoteType', related_name='StackExchangeComment2Vote_vote_type_set', null=True) - creation_date = models.DateTimeField() - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeComment2Vote_user_set', null=True) - ip_address = models.CharField(max_length=40) - user_display_name = models.CharField(max_length=40) - deletion_date = models.DateTimeField() - -class StackExchangeFlatPage(models.Model): - name = models.CharField(max_length=50) - url = models.CharField(max_length=128) - value = models.TextField() - content_type = models.CharField(max_length=50) - active = models.BooleanField() - use_master = models.BooleanField() - -class StackExchangeMessage(models.Model): - user = models.ForeignKey('StackExchangeUser', 
related_name='StackExchangeMessage_user_set', null=True) - message_type = models.ForeignKey('StackExchangeMessageType', related_name='StackExchangeMessage_message_type_set', null=True) - is_read = models.BooleanField() - creation_date = models.DateTimeField() - text = models.TextField() - post = models.ForeignKey('StackExchangePost', related_name='StackExchangeMessage_post_set', null=True) - -class StackExchangeMessageType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) - -class StackExchangeModeratorMessage(models.Model): - message_type = models.ForeignKey('StackExchangeMessageType', related_name='StackExchangeModeratorMessage_message_type_set', null=True) - creation_date = models.DateTimeField() - creation_ip_address = models.CharField(max_length=40) - text = models.TextField() - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeModeratorMessage_user_set', null=True) - post = models.ForeignKey('StackExchangePost', related_name='StackExchangeModeratorMessage_post_set', null=True) - deletion_date = models.DateTimeField() - deletion_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeModeratorMessage_deletion_user_set', null=True) - deletion_ip_address = models.CharField(max_length=40) - user_display_name = models.CharField(max_length=40) - -class StackExchangePostComment(models.Model): - post = models.ForeignKey('StackExchangePost', related_name='StackExchangePostComment_post_set', null=True) - text = models.TextField() - creation_date = models.DateTimeField() - ip_address = models.CharField(max_length=15) - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePostComment_user_set', null=True) - user_display_name = models.CharField(max_length=30) - deletion_date = models.DateTimeField() - deletion_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePostComment_deletion_user_set', null=True) - score = models.IntegerField() - 
-class StackExchangePostHistoryType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) - -class StackExchangePostHistory(models.Model): - post_history_type = models.ForeignKey('StackExchangePostHistoryType', related_name='StackExchangePostHistory_post_history_type_set', null=True) - post = models.ForeignKey('StackExchangePost', related_name='StackExchangePostHistory_post_set', null=True) - revision_guid = models.CharField(max_length=64) - creation_date = models.DateTimeField() - ip_address = models.CharField(max_length=40) - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePostHistory_user_set', null=True) - comment = models.CharField(max_length=400) - text = models.TextField() - user_display_name = models.CharField(max_length=40) - user_email = models.CharField(max_length=100) - user_website_url = models.CharField(max_length=200) - -class StackExchangePost2Vote(models.Model): - post = models.ForeignKey('StackExchangePost', related_name='StackExchangePost2Vote_post_set', null=True) - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePost2Vote_user_set', null=True) - vote_type = models.ForeignKey('StackExchangeVoteType', related_name='StackExchangePost2Vote_vote_type_set', null=True) - creation_date = models.DateTimeField() - deletion_date = models.DateTimeField() - target_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePost2Vote_target_user_set', null=True) - target_rep_change = models.IntegerField() - voter_rep_change = models.IntegerField() - comment = models.CharField(max_length=150) - ip_address = models.CharField(max_length=40) - linked_post = models.ForeignKey('StackExchangePost', related_name='StackExchangePost2Vote_linked_post_set', null=True) - -class StackExchangePost(models.Model): - post_type = models.ForeignKey('StackExchangePostType', related_name='StackExchangePost_post_type_set', null=True) - creation_date = 
models.DateTimeField() - score = models.IntegerField() - view_count = models.IntegerField() - body = models.TextField() - owner_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePost_owner_user_set', null=True) - last_editor_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePost_last_editor_user_set', null=True) - last_edit_date = models.DateTimeField() - last_activity_date = models.DateTimeField() - last_activity_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangePost_last_activity_user_set', null=True) - parent = models.ForeignKey('self', related_name='StackExchangePost_parent_set', null=True) - accepted_answer = models.ForeignKey('self', related_name='StackExchangePost_accepted_answer_set', null=True) - title = models.CharField(max_length=250) - tags = models.CharField(max_length=150) - community_owned_date = models.DateTimeField() - history_summary = models.CharField(max_length=150) - answer_score = models.IntegerField() - answer_count = models.IntegerField() - comment_count = models.IntegerField() - favorite_count = models.IntegerField() - deletion_date = models.DateTimeField() - closed_date = models.DateTimeField() - locked_date = models.DateTimeField() - locked_duration = models.IntegerField() - owner_display_name = models.CharField(max_length=40) - last_editor_display_name = models.CharField(max_length=40) - bounty_amount = models.IntegerField() - bounty_closes = models.DateTimeField() - bounty_closed = models.DateTimeField() - last_owner_email_date = models.DateTimeField() - -class StackExchangePostType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) - -class StackExchangeSchemaVersion(models.Model): - version = models.IntegerField() - -class StackExchangeSetting(models.Model): - key = models.CharField(max_length=256) - value = models.TextField() - -class StackExchangeSystemMessage(models.Model): - user = 
models.ForeignKey('StackExchangeUser', related_name='StackExchangeSystemMessage_user_set', null=True) - creation_date = models.DateTimeField() - text = models.TextField() - deletion_date = models.DateTimeField() - deletion_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeSystemMessage_deletion_user_set', null=True) - -class StackExchangeTag(models.Model): - name = models.CharField(max_length=50) - count = models.IntegerField() - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeTag_user_set', null=True) - creation_date = models.DateTimeField() - is_moderator_only = models.BooleanField() - is_required = models.BooleanField() - aliases = models.CharField(max_length=200) - -class StackExchangeThemeResource(models.Model): - name = models.CharField(max_length=50) - value = models.TextField() - content_type = models.CharField(max_length=50) - version = models.CharField(max_length=6) - -class StackExchangeThemeTextResource(models.Model): - name = models.CharField(max_length=50) - value = models.TextField() - content_type = models.CharField(max_length=50) - -class StackExchangeThrottleBucket(models.Model): - type = models.CharField(max_length=256) - ip_address = models.CharField(max_length=64) - tokens = models.IntegerField() - last_update = models.DateTimeField() - -class StackExchangeUserHistoryType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) - -class StackExchangeUserHistory(models.Model): - user_history_type = models.ForeignKey('StackExchangeUserHistoryType', related_name='StackExchangeUserHistory_user_history_type_set', null=True) - creation_date = models.DateTimeField() - ip_address = models.CharField(max_length=40) - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeUserHistory_user_set', null=True) - comment = models.CharField(max_length=400) - user_display_name = models.CharField(max_length=40) - moderator_user = 
models.ForeignKey('StackExchangeUser', related_name='StackExchangeUserHistory_moderator_user_set', null=True) - reputation = models.IntegerField() - -class StackExchangeUser2Badge(models.Model): - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeUser2Badge_user_set', null=True) - badge = models.ForeignKey('StackExchangeBadge', related_name='StackExchangeUser2Badge_badge_set', null=True) - date = models.DateTimeField() - comment = models.CharField(max_length=50) - -class StackExchangeUser2Vote(models.Model): - user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeUser2Vote_user_set', null=True) - vote_type = models.ForeignKey('StackExchangeVoteType', related_name='StackExchangeUser2Vote_vote_type_set', null=True) - target_user = models.ForeignKey('StackExchangeUser', related_name='StackExchangeUser2Vote_target_user_set', null=True) - creation_date = models.DateTimeField() - deletion_date = models.DateTimeField() - ip_address = models.CharField(max_length=40) - -class StackExchangeUser(models.Model): - user_type = models.ForeignKey('StackExchangeUserType', related_name='StackExchangeUser_user_type_set', null=True) - open_id = models.CharField(max_length=200) - reputation = models.IntegerField() - views = models.IntegerField() - creation_date = models.DateTimeField() - last_access_date = models.DateTimeField() - has_replies = models.BooleanField() - has_message = models.BooleanField() - opt_in_email = models.BooleanField() - opt_in_recruit = models.BooleanField() - last_login_date = models.DateTimeField() - last_email_date = models.DateTimeField() - last_login_ip = models.CharField(max_length=15) - open_id_alt = models.CharField(max_length=200) - email = models.CharField(max_length=100) - display_name = models.CharField(max_length=40) - display_name_cleaned = models.CharField(max_length=40) - website_url = models.CharField(max_length=200) - real_name = models.CharField(max_length=100) - location = 
models.CharField(max_length=100) - birthday = models.DateTimeField() - badge_summary = models.CharField(max_length=50) - about_me = models.TextField() - preferences_raw = models.TextField() - timed_penalty_date = models.DateTimeField() - guid = models.CharField(max_length=64) - phone = models.CharField(max_length=20) - password_id = models.IntegerField() - -class StackExchangeUserType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) - -class StackExchangeVoteType(models.Model): - name = models.CharField(max_length=50) - description = models.CharField(max_length=300) +class Badge(models.Model): + id = models.IntegerField(primary_key=True) + class_type = models.IntegerField(null=True) + name = models.CharField(max_length=50, null=True) + description = models.TextField(null=True) + single = models.NullBooleanField(null=True) + secret = models.NullBooleanField(null=True) + tag_based = models.NullBooleanField(null=True) + command = models.TextField(null=True) + award_frequency = models.IntegerField(null=True) + +class CloseReason(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=200, null=True) + description = models.CharField(max_length=256, null=True) + display_order = models.IntegerField(null=True) + +class Comment2Vote(models.Model): + id = models.IntegerField(primary_key=True) + post_comment = models.ForeignKey('PostComment', related_name='Comment2Vote_by_post_comment_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='Comment2Vote_by_vote_type_set', null=True) + creation_date = models.DateTimeField(null=True) + user = models.ForeignKey('User', related_name='Comment2Vote_by_user_set', null=True) + ip_address = models.CharField(max_length=40, null=True) + user_display_name = models.CharField(max_length=40, null=True) + deletion_date = models.DateTimeField(null=True) + +class FlatPage(models.Model): + id = models.IntegerField(primary_key=True) 
+ name = models.CharField(max_length=50, null=True) + url = models.CharField(max_length=128, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + active = models.NullBooleanField(null=True) + use_master = models.NullBooleanField(null=True) + +class Message(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='Message_by_user_set', null=True) + message_type = models.ForeignKey('MessageType', related_name='Message_by_message_type_set', null=True) + is_read = models.NullBooleanField(null=True) + creation_date = models.DateTimeField(null=True) + text = models.TextField(null=True) + post = models.ForeignKey('Post', related_name='Message_by_post_set', null=True) + +class MessageType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class ModeratorMessage(models.Model): + id = models.IntegerField(primary_key=True) + message_type = models.ForeignKey('MessageType', related_name='ModeratorMessage_by_message_type_set', null=True) + creation_date = models.DateTimeField(null=True) + creation_ip_address = models.CharField(max_length=40, null=True) + text = models.TextField(null=True) + user = models.ForeignKey('User', related_name='ModeratorMessage_by_user_set', null=True) + post = models.ForeignKey('Post', related_name='ModeratorMessage_by_post_set', null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='ModeratorMessage_by_deletion_user_set', null=True) + deletion_ip_address = models.CharField(max_length=40, null=True) + user_display_name = models.CharField(max_length=40, null=True) + +class PostComment(models.Model): + id = models.IntegerField(primary_key=True) + post = models.ForeignKey('Post', related_name='PostComment_by_post_set', null=True) + text = 
models.TextField(null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=15, null=True) + user = models.ForeignKey('User', related_name='PostComment_by_user_set', null=True) + user_display_name = models.CharField(max_length=30, null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='PostComment_by_deletion_user_set', null=True) + score = models.IntegerField(null=True) + +class PostHistoryType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class PostHistory(models.Model): + id = models.IntegerField(primary_key=True) + post_history_type = models.ForeignKey('PostHistoryType', related_name='PostHistory_by_post_history_type_set', null=True) + post = models.ForeignKey('Post', related_name='PostHistory_by_post_set', null=True) + revision_guid = models.CharField(max_length=64, null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + user = models.ForeignKey('User', related_name='PostHistory_by_user_set', null=True) + comment = models.CharField(max_length=400, null=True) + text = models.TextField(null=True) + user_display_name = models.CharField(max_length=40, null=True) + user_email = models.CharField(max_length=100, null=True) + user_website_url = models.CharField(max_length=200, null=True) + +class Post2Vote(models.Model): + id = models.IntegerField(primary_key=True) + post = models.ForeignKey('Post', related_name='Post2Vote_by_post_set', null=True) + user = models.ForeignKey('User', related_name='Post2Vote_by_user_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='Post2Vote_by_vote_type_set', null=True) + creation_date = models.DateTimeField(null=True) + deletion_date = models.DateTimeField(null=True) + target_user = models.ForeignKey('User', 
related_name='Post2Vote_by_target_user_set', null=True) + target_rep_change = models.IntegerField(null=True) + voter_rep_change = models.IntegerField(null=True) + comment = models.CharField(max_length=150, null=True) + ip_address = models.CharField(max_length=40, null=True) + linked_post = models.ForeignKey('Post', related_name='Post2Vote_by_linked_post_set', null=True) + +class Post(models.Model): + id = models.IntegerField(primary_key=True) + post_type = models.ForeignKey('PostType', related_name='Post_by_post_type_set', null=True) + creation_date = models.DateTimeField(null=True) + score = models.IntegerField(null=True) + view_count = models.IntegerField(null=True) + body = models.TextField(null=True) + owner_user = models.ForeignKey('User', related_name='Post_by_owner_user_set', null=True) + last_editor_user = models.ForeignKey('User', related_name='Post_by_last_editor_user_set', null=True) + last_edit_date = models.DateTimeField(null=True) + last_activity_date = models.DateTimeField(null=True) + last_activity_user = models.ForeignKey('User', related_name='Post_by_last_activity_user_set', null=True) + parent = models.ForeignKey('self', related_name='Post_by_parent_set', null=True) + accepted_answer = models.ForeignKey('self', related_name='Post_by_accepted_answer_set', null=True) + title = models.CharField(max_length=250, null=True) + tags = models.CharField(max_length=150, null=True) + community_owned_date = models.DateTimeField(null=True) + history_summary = models.CharField(max_length=150, null=True) + answer_score = models.IntegerField(null=True) + answer_count = models.IntegerField(null=True) + comment_count = models.IntegerField(null=True) + favorite_count = models.IntegerField(null=True) + deletion_date = models.DateTimeField(null=True) + closed_date = models.DateTimeField(null=True) + locked_date = models.DateTimeField(null=True) + locked_duration = models.IntegerField(null=True) + owner_display_name = models.CharField(max_length=40, null=True) + 
last_editor_display_name = models.CharField(max_length=40, null=True) + bounty_amount = models.IntegerField(null=True) + bounty_closes = models.DateTimeField(null=True) + bounty_closed = models.DateTimeField(null=True) + last_owner_email_date = models.DateTimeField(null=True) + +class PostType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class SchemaVersion(models.Model): + version = models.IntegerField(null=True) + +class Setting(models.Model): + id = models.IntegerField(primary_key=True) + key = models.CharField(max_length=256, null=True) + value = models.TextField(null=True) + +class SystemMessage(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='SystemMessage_by_user_set', null=True) + creation_date = models.DateTimeField(null=True) + text = models.TextField(null=True) + deletion_date = models.DateTimeField(null=True) + deletion_user = models.ForeignKey('User', related_name='SystemMessage_by_deletion_user_set', null=True) + +class Tag(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + count = models.IntegerField(null=True) + user = models.ForeignKey('User', related_name='Tag_by_user_set', null=True) + creation_date = models.DateTimeField(null=True) + is_moderator_only = models.NullBooleanField(null=True) + is_required = models.NullBooleanField(null=True) + aliases = models.CharField(max_length=200, null=True) + +class ThemeResource(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + version = models.CharField(max_length=6, null=True) + +class ThemeTextResource(models.Model): + id = models.IntegerField(primary_key=True) + name = 
models.CharField(max_length=50, null=True) + value = models.TextField(null=True) + content_type = models.CharField(max_length=50, null=True) + +class ThrottleBucket(models.Model): + id = models.IntegerField(primary_key=True) + type = models.CharField(max_length=256, null=True) + ip_address = models.CharField(max_length=64, null=True) + tokens = models.IntegerField(null=True) + last_update = models.DateTimeField(null=True) + +class UserHistoryType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class UserHistory(models.Model): + id = models.IntegerField(primary_key=True) + user_history_type = models.ForeignKey('UserHistoryType', related_name='UserHistory_by_user_history_type_set', null=True) + creation_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + user = models.ForeignKey('User', related_name='UserHistory_by_user_set', null=True) + comment = models.CharField(max_length=400, null=True) + user_display_name = models.CharField(max_length=40, null=True) + moderator_user = models.ForeignKey('User', related_name='UserHistory_by_moderator_user_set', null=True) + reputation = models.IntegerField(null=True) + +class User2Badge(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='User2Badge_by_user_set', null=True) + badge = models.ForeignKey('Badge', related_name='User2Badge_by_badge_set', null=True) + date = models.DateTimeField(null=True) + comment = models.CharField(max_length=50, null=True) + +class User2Vote(models.Model): + id = models.IntegerField(primary_key=True) + user = models.ForeignKey('User', related_name='User2Vote_by_user_set', null=True) + vote_type = models.ForeignKey('VoteType', related_name='User2Vote_by_vote_type_set', null=True) + target_user = models.ForeignKey('User', related_name='User2Vote_by_target_user_set', 
null=True) + creation_date = models.DateTimeField(null=True) + deletion_date = models.DateTimeField(null=True) + ip_address = models.CharField(max_length=40, null=True) + +class User(models.Model): + id = models.IntegerField(primary_key=True) + user_type = models.ForeignKey('UserType', related_name='User_by_user_type_set', null=True) + open_id = models.CharField(max_length=200, null=True) + reputation = models.IntegerField(null=True) + views = models.IntegerField(null=True) + creation_date = models.DateTimeField(null=True) + last_access_date = models.DateTimeField(null=True) + has_replies = models.NullBooleanField(null=True) + has_message = models.NullBooleanField(null=True) + opt_in_email = models.NullBooleanField(null=True) + opt_in_recruit = models.NullBooleanField(null=True) + last_login_date = models.DateTimeField(null=True) + last_email_date = models.DateTimeField(null=True) + last_login_ip = models.CharField(max_length=15, null=True) + open_id_alt = models.CharField(max_length=200, null=True) + email = models.CharField(max_length=100, null=True) + display_name = models.CharField(max_length=40, null=True) + display_name_cleaned = models.CharField(max_length=40, null=True) + website_url = models.CharField(max_length=200, null=True) + real_name = models.CharField(max_length=100, null=True) + location = models.CharField(max_length=100, null=True) + birthday = models.DateTimeField(null=True) + badge_summary = models.CharField(max_length=50, null=True) + about_me = models.TextField(null=True) + preferences_raw = models.TextField(null=True) + timed_penalty_date = models.DateTimeField(null=True) + guid = models.CharField(max_length=64, null=True) + phone = models.CharField(max_length=20, null=True) + password_id = models.IntegerField(null=True) + +class UserType(models.Model): + id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) + +class VoteType(models.Model): + 
id = models.IntegerField(primary_key=True) + name = models.CharField(max_length=50, null=True) + description = models.CharField(max_length=300, null=True) diff --git a/stackexchange/parse_models.py b/stackexchange/parse_models.py index e83ca0d5..64796e57 100644 --- a/stackexchange/parse_models.py +++ b/stackexchange/parse_models.py @@ -1,14 +1,24 @@ from xml.etree import ElementTree as et import sys import re +import os +if __name__ != '__main__':#hack do not import models if run as script + from django.db import models +from datetime import datetime + +table_prefix = ''#StackExchange or something, if needed +date_time_format = '%Y-%m-%dT%H:%M:%S' #note that fractional part of second is lost +time_re = re.compile(r'(\.[\d]+)?$') +loader_app_name = os.path.dirname(__file__) types = { 'unsignedByte':'models.IntegerField', 'FK':'models.ForeignKey', + 'PK':'models.IntegerField', 'string':'models.CharField', 'text':'models.TextField', 'int':'models.IntegerField', - 'boolean':'models.BooleanField', + 'boolean':'models.NullBooleanField', 'dateTime':'models.DateTimeField', 'base64Binary':'models.TextField', 'double':'models.IntegerField', @@ -26,15 +36,22 @@ def singular(word): else: return word +def get_table_name(name): + """Determine db table name + from the basename of the .xml file + """ + out = table_prefix + if name.find('2') == -1: + out += singular(name) + else: + bits = name.split('2') + bits = map(singular, bits) + out += '2'.join(bits) + return out + class DjangoModel(object): def __init__(self, name): - self.name = 'StackExchange' - if name.find('2') == -1: - self.name += singular(name) - else: - bits = name.split('2') - bits = map(singular, bits) - self.name += '2'.join(bits) + self.name = get_table_name(name) self.fields = [] def add_field(self,field): field.table = self @@ -57,20 +74,31 @@ class DjangoField(object): def __str__(self): out = '%s = %s(' % (self.name, types[self.type]) - if self.relation and self.restriction: - raise Exception('impossible') - 
elif self.relation: + if self.type == 'FK': out += "'%s'" % self.relation - out += ", related_name='%s_%s_set'" % (self.table.name, self.name) + out += ", related_name='%s_by_%s_set'" % (self.table.name, self.name) out += ', null=True'#nullable to make life easier + elif self.type == 'PK': + out += 'primary_key=True' elif self.restriction != -1: if self.type == 'string': out += 'max_length=%s' % self.restriction + out += ', null=True' else: - raise Exception('only max_length restriction is supported') + raise Exception('restriction (max_length) supported only for string type') + else: + out += 'null=True' out += ')' return out + def get_type(self): + return self.type + +class DjangoPK(DjangoField): + def __init__(self): + self.name = 'id' + self.type = 'PK' + class DjangoFK(DjangoField): def __init__(self, source_name): bits = source_name.split('Id') @@ -83,7 +111,7 @@ class DjangoFK(DjangoField): """some relations need to be mapped to actual tables """ - self.relation = 'StackExchange' + self.relation = table_prefix if name.endswith('User'): self.relation += 'User' elif name.endswith('Post'): @@ -92,6 +120,8 @@ class DjangoFK(DjangoField): self.relation = 'self' #self-referential Post model else: self.relation += name + def get_relation(self): + return self.relation def get_col_type(col): type = col.get('type') @@ -108,25 +138,88 @@ def get_col_type(col): restriction = -1 return type, restriction +def make_field_from_xml_tree(xml_element): + """used by the model parser + here we need to be detailed about field types + because this defines the database schema + """ + name = xml_element.get('name') + if name == 'LinkedVoteId':#not used + return None + if name == 'Id': + field = DjangoPK() + elif name.endswith('Id') and name not in ('OpenId','PasswordId'): + field = DjangoFK(name) + elif name.endswith('GUID'): + field = DjangoField(name, 'string', 64) + else: + type, restriction = get_col_type(xml_element) + field = DjangoField(name, type, restriction) + return field 
+ +def parse_field_name(input): + """used by the data reader + + The problem is that I've scattered + code for determination of field name over three classes: + DjangoField, DjangoPK and DjangoFK + so the function actually creates fake field objects + many times over + """ + if input == 'Id': + return DjangoPK().name + elif input in ('OpenId', 'PasswordId'): + return DjangoField(input, 'string', 7).name#happy fake field + elif input.endswith('Id'): + return DjangoFK(input).name#real FK field + else: + return DjangoField(input, 'string', 7).name#happy fake field + +def parse_value(input, field_object): + if isinstance(field_object, models.ForeignKey): + try: + id = int(input) + except: + raise Exception('non-numeric foreign key %s' % input) + related_model = field_object.rel.to + try: + return related_model.objects.get(id=id) + except related_model.DoesNotExist: + obj = related_model(id=id) + obj.save()#save fake empty object + return obj + elif isinstance(field_object, models.IntegerField): + try: + return int(input) + except: + raise Exception('expected integer, found %s' % input) + elif isinstance(field_object, models.CharField): + return input + elif isinstance(field_object, models.TextField): + return input + elif isinstance(field_object, models.BooleanField): + try: + return bool(input) + except: + raise Exception('boolean value expected %s found' % input) + elif isinstance(field_object, models.DateTimeField): + input = time_re.sub('', input) + try: + return datetime.strptime(input, date_time_format) + except: + raise Exception('datetime expected "%s" found' % input) + print 'from django.db import models' for file in sys.argv: if '.xsd' in file: - tname = file.replace('.xsd','') + tname = os.path.basename(file).replace('.xsd','') tree = et.parse(file) model = DjangoModel(tname) row = tree.find('.//sequence') for col in row.getchildren(): - name = col.get('name') - if name in ('Id', 'LinkedVoteId'):#second one is not used - continue - elif name.endswith('Id') and 
name not in ('OpenId','PasswordId'): - field = DjangoFK(name) - elif name.endswith('GUID'): - field = DjangoField(name, 'string', 64) - else: - type, restriction = get_col_type(col) - field = DjangoField(name, type, restriction) - model.add_field(field) + field = make_field_from_xml_tree(col) + if field: + model.add_field(field) print model |