diff options
author | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2012-11-13 23:41:39 -0300 |
---|---|---|
committer | Evgeny Fadeev <evgeny.fadeev@gmail.com> | 2012-11-13 23:41:39 -0300 |
commit | 5c02a9b8fc82a5b017ff64ccdfff30cbc2191845 (patch) | |
tree | fb1d942e80774a021a89201579d6f94051294a9d /askbot/importers | |
parent | 74fef791632982f52c902ccb1c47e707da9e284e (diff) | |
download | askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.tar.gz askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.tar.bz2 askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.zip |
hopefully fixed the stackexchange importer
Diffstat (limited to 'askbot/importers')
-rw-r--r-- | askbot/importers/stackexchange/management/commands/load_stackexchange.py | 71 |
1 files changed, 58 insertions, 13 deletions
diff --git a/askbot/importers/stackexchange/management/commands/load_stackexchange.py b/askbot/importers/stackexchange/management/commands/load_stackexchange.py index 600d00b1..313bab13 100644 --- a/askbot/importers/stackexchange/management/commands/load_stackexchange.py +++ b/askbot/importers/stackexchange/management/commands/load_stackexchange.py @@ -6,6 +6,7 @@ import sys from unidecode import unidecode import zipfile from datetime import datetime +from django.conf import settings as django_settings from django.core.management.base import BaseCommand, CommandError import askbot.importers.stackexchange.parse_models as se_parser from xml.etree import ElementTree as et @@ -24,9 +25,11 @@ except ImportError: from askbot.models.message import Message as DjangoMessage from django.utils.translation import ugettext as _ +from askbot.utils.console import ProgressBar from askbot.utils.slug import slugify from askbot.models.badges import award_badges_signal, award_badges from askbot.importers.stackexchange.management import is_ready as importer_is_ready +from optparse import make_option #from markdown2 import Markdown #markdowner = Markdown(html4tags=True) @@ -285,12 +288,43 @@ class X(object):# return slugify(cls.badge_exceptions.get(name, name).lower()) class Command(BaseCommand): - help = 'Loads StackExchange data from unzipped directory of XML files into the ASKBOT database' + help = """Loads StackExchange data from SE dump .zip file +it may be helpful to split this procedure in two:\n +* read the dump (with option --read-se-dump) +* transfer data to askbot (with option --process-data) +""" args = 'se_dump_dir' + option_list = BaseCommand.option_list + ( + make_option('-r', '--read-dump', + action='store_true', + dest='read_dump', + default=False, + help='Only read the the dump' + ), + make_option('-p', '--process-data', + action='store_true', + dest='process_data', + default=False, + help='Only process the data, assuming that the dump is loaded' + ) + ) + @transaction.commit_manually def handle(self, *arg, **kwarg): + if django_settings.DEBUG: + raise CommandError( + 'Please set DEBUG to False in the settings.py to reduce RAM usage' + ) + + #process the command line arguments, if given + if kwarg['read_dump'] is False and kwarg['process_data'] is False: + #make them both true as a hack to simulate a condition where + #no flags selected means the same as both are indeed selected + kwarg['read_dump'] = True + kwarg['process_data'] = True + askbot_settings.update('LIMIT_ONE_ANSWER_PER_USER', False) if not importer_is_ready(): @@ -306,16 +340,23 @@ class Command(BaseCommand): if len(arg) < 1 or not os.path.isfile(arg[0]): raise CommandError('Error: first argument must be a zip file with the SE forum data') - self.zipfile = self.open_dump(arg[0]) - #read the data into SE tables - for item in xml_read_order: - time_before = datetime.now() - self.load_xml_file(item) - transaction.commit() - time_after = datetime.now() - if DEBUGME == True: - print time_after - time_before - print HEAP.heap() + if kwarg['read_dump']: + self.zipfile = self.open_dump(arg[0]) + #read the data into SE tables + for item in xml_read_order: + time_before = datetime.now() + self.load_xml_file(item) + transaction.commit() + time_after = datetime.now() + if DEBUGME == True: + print time_after - time_before + print HEAP.heap() + + if kwarg['process_data'] is False: + #that means we just wanted to load the xml dump to + #do the second step in another go in order to have + #more ram for the transfer of data from SE to Askbot databases + return #this is important so that when we clean up messages #automatically generated by the procedures below @@ -651,7 +692,9 @@ class Command(BaseCommand): c_group = [] #this loop groups revisions by revision id, then calls process function #for the revision grup (elementary revisions posted at once) - for se_rev in se_revs.iterator(): + message = 'Processing revisions' + count = se_revs.count() + for se_rev in ProgressBar(se_revs.iterator(), count, message): if se_rev.revision_guid == c_guid: c_group.append(se_rev) else: @@ -853,7 +896,9 @@ class Command(BaseCommand): return xml_file_basename + '.xml' def transfer_users(self): - for se_u in se.User.objects.all().iterator(): + se_users = se.User.objects.all() + count = se_users.count() + for se_u in ProgressBar(se_users.iterator(), count): #if se_u.id == -1:#skip the Community user # continue u = askbot.User() |