diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2011-01-24 22:38:48 -0500 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2011-01-24 22:38:48 -0500 |
commit | 3403e5b66988bdc940fcdc6397231544f49c9be4 (patch) | |
tree | 398ca5dd9b8c2c8f386de9044e4d04671f78bda1 /askbot/importers | |
parent | 9e5c5f860ed0f0e58dd82075cb813e920b8f3e31 (diff) | |
download | askbot-3403e5b66988bdc940fcdc6397231544f49c9be4.tar.gz askbot-3403e5b66988bdc940fcdc6397231544f49c9be4.tar.bz2 askbot-3403e5b66988bdc940fcdc6397231544f49c9be4.zip |
SE import works through the web interface, but still needs visual enhancement
Diffstat (limited to 'askbot/importers')
-rw-r--r-- | askbot/importers/stackexchange/management/__init__.py | 11 | ||||
-rw-r--r-- | askbot/importers/stackexchange/management/commands/load_stackexchange.py | 67 |
2 files changed, 59 insertions, 19 deletions
diff --git a/askbot/importers/stackexchange/management/__init__.py b/askbot/importers/stackexchange/management/__init__.py index e69de29b..66103e27 100644 --- a/askbot/importers/stackexchange/management/__init__.py +++ b/askbot/importers/stackexchange/management/__init__.py @@ -0,0 +1,11 @@ +import threading +from django.core import management +import logging + +class ImporterThread(threading.Thread): + def __init__(self, dump_file = None): + self.dump_file = dump_file + super(ImporterThread, self).__init__() + + def run(self): + management.call_command('load_stackexchange', self.dump_file) diff --git a/askbot/importers/stackexchange/management/commands/load_stackexchange.py b/askbot/importers/stackexchange/management/commands/load_stackexchange.py index 52ed9167..48de02e5 100644 --- a/askbot/importers/stackexchange/management/commands/load_stackexchange.py +++ b/askbot/importers/stackexchange/management/commands/load_stackexchange.py @@ -1,8 +1,9 @@ -from django.core.management.base import BaseCommand #todo: http://stackoverflow.com/questions/837828/how-to-use-a-slug-in-django import os import re import sys +import zipfile +from django.core.management.base import BaseCommand, CommandError import askbot.importers.stackexchange.parse_models as se_parser from xml.etree import ElementTree as et from django.db import models, transaction @@ -252,6 +253,7 @@ class X(object):# if len(badge_summary) > 3: print 'warning: guessing that badge summary is comma separated' print 'have %s' % badge_summary + sys.stdout.flush() bits = badge_summary.split(',') else: bits = [badge_summary] @@ -277,16 +279,19 @@ class Command(BaseCommand): award_badges_signal.disconnect(award_badges) - if len(arg) < 1 or not os.path.isdir(arg[0]): - print 'Error: first argument must be a directory with all the SE *.xml files' - sys.exit(1) + import logging + logging.critical(str(arg)) - self.dump_path = arg[0] + logging.critical(os.path.isfile(arg[0])) + logging.critical(os.path.exists(arg[0])) + + if 
len(arg) < 1 or not os.path.isfile(arg[0]): + raise CommandError('Error: first argument must be a zip file with the SE forum data') + + self.zipfile = self.open_dump(arg[0]) #read the data into SE tables - for xml in xml_read_order: - xml_path = self.get_xml_path(xml) - table_name = self.get_table_name(xml) - self.load_xml_file(xml_path, table_name) + for item in xml_read_order: + self.load_xml_file(item) transaction.commit() #this is important so that when we clean up messages @@ -304,6 +309,7 @@ class Command(BaseCommand): self.transfer_users() transaction.commit() print 'done.' + sys.stdout.flush() print 'Transferring content edits...', sys.stdout.flush() self.transfer_question_and_answer_activity() @@ -343,6 +349,19 @@ class Command(BaseCommand): self.transfer_meta_pages() transaction.commit() + def open_dump(self, path): + """open the zipfile, raise error if it + does not exist or does not contain files with expected names""" + if not zipfile.is_zipfile(path): + raise CommandError('%s is not a zip file' % path) + dump = zipfile.ZipFile(path) + filenames = [item.filename for item in dump.infolist()] + for component in xml_read_order: + expected_file = component + '.xml' + if expected_file not in filenames: + raise CommandError('file %s not found in the archive' % expected_file) + return dump + def save_askbot_message_id_list(self): id_list = list(DjangoMessage.objects.all().values('id')) self._askbot_message_id_list = id_list @@ -612,6 +631,7 @@ class Command(BaseCommand): for se_c in se.PostComment.objects.all(): if se_c.deletion_date: print 'Warning deleted comment %d dropped' % se_c.id + sys.stdout.flush() continue se_post = se_c.post askbot_post = X.get_post(se_post) @@ -636,6 +656,7 @@ class Command(BaseCommand): self._missing_badges[name] = 0 if len(se_b.description) > 300: print 'Warning truncated description for badge %d' % se_b.id + sys.stdout.flush() def _award_badges(self): #note: SE does not keep information on @@ -670,6 +691,7 @@ class 
Command(BaseCommand): dropped = [name for name in d.keys() if d[name] > 0] print 'Warning - following unsupported badges were dropped:' print ', '.join(dropped) + sys.stdout.flush() def transfer_badges(self): #note: badge level is neglected @@ -712,8 +734,15 @@ class Command(BaseCommand): #so we can't do this pass - def load_xml_file(self, xml_path, table_name): - tree = et.parse(xml_path) + def load_xml_file(self, item): + """read data from the zip file for the item + """ + xml_path = self.get_xml_path(item) + table_name = self.get_table_name(item) + + xml_data = self.zipfile.read(xml_path) + + tree = et.fromstring(xml_data) print 'loading from %s to %s' % (xml_path, table_name) , model = models.get_model('stackexchange', table_name) i = 0 @@ -727,16 +756,13 @@ class Command(BaseCommand): setattr(model_entry, field_name, field_value) model_entry.save() print '... %d objects saved' % i + sys.stdout.flush() - def get_table_name(self,xml): - return se_parser.get_table_name(xml) + def get_table_name(self, xml_file_basename): + return se_parser.get_table_name(xml_file_basename) - def get_xml_path(self, xml): - xml_path = os.path.join(self.dump_path, xml + '.xml') - if not os.path.isfile(xml_path): - print 'Error: file %s not found' % xml_path - sys.exit(1) - return xml_path + def get_xml_path(self, xml_file_basename): + return xml_file_basename + '.xml' def transfer_users(self): for se_u in se.User.objects.all(): @@ -766,9 +792,11 @@ class Command(BaseCommand): except AssertionError: print 'User %s (id=%d) does not have openid' % \ (se_u.display_name, se_u.id) + sys.stdout.flush() if se_u.open_id is None and se_u.email is None: print 'Warning: SE user %d is not recoverable (no email or openid)' + sys.stdout.flush() u.reputation = 1#se_u.reputation, it's actually re-computed u.last_seen = se_u.last_access_date @@ -815,6 +843,7 @@ class Command(BaseCommand): try: other = askbot.User.objects.get(username = u.username) print 'alert - have a second user with name %s' % 
u.username + sys.stdout.flush() except askbot.User.DoesNotExist: pass u.save() |