summary | refs | log | tree | commit | diff | stats
path: root/askbot/importers
diff options
context:
space:
mode:
author: Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-13 23:41:39 -0300
committer: Evgeny Fadeev <evgeny.fadeev@gmail.com> 2012-11-13 23:41:39 -0300
commit: 5c02a9b8fc82a5b017ff64ccdfff30cbc2191845 (patch)
tree: fb1d942e80774a021a89201579d6f94051294a9d /askbot/importers
parent: 74fef791632982f52c902ccb1c47e707da9e284e (diff)
download: askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.tar.gz
askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.tar.bz2
askbot-5c02a9b8fc82a5b017ff64ccdfff30cbc2191845.zip
hopefully fixed the stackexchange importer
Diffstat (limited to 'askbot/importers')
-rw-r--r-- askbot/importers/stackexchange/management/commands/load_stackexchange.py | 71
1 files changed, 58 insertions, 13 deletions
diff --git a/askbot/importers/stackexchange/management/commands/load_stackexchange.py b/askbot/importers/stackexchange/management/commands/load_stackexchange.py
index 600d00b1..313bab13 100644
--- a/askbot/importers/stackexchange/management/commands/load_stackexchange.py
+++ b/askbot/importers/stackexchange/management/commands/load_stackexchange.py
@@ -6,6 +6,7 @@ import sys
from unidecode import unidecode
import zipfile
from datetime import datetime
+from django.conf import settings as django_settings
from django.core.management.base import BaseCommand, CommandError
import askbot.importers.stackexchange.parse_models as se_parser
from xml.etree import ElementTree as et
@@ -24,9 +25,11 @@ except ImportError:
from askbot.models.message import Message as DjangoMessage
from django.utils.translation import ugettext as _
+from askbot.utils.console import ProgressBar
from askbot.utils.slug import slugify
from askbot.models.badges import award_badges_signal, award_badges
from askbot.importers.stackexchange.management import is_ready as importer_is_ready
+from optparse import make_option
#from markdown2 import Markdown
#markdowner = Markdown(html4tags=True)
@@ -285,12 +288,43 @@ class X(object):#
return slugify(cls.badge_exceptions.get(name, name).lower())
class Command(BaseCommand):
- help = 'Loads StackExchange data from unzipped directory of XML files into the ASKBOT database'
+ help = """Loads StackExchange data from SE dump .zip file
+it may be helpful to split this procedure in two:\n
+* read the dump (with option --read-se-dump)
+* transfer data to askbot (with option --process-data)
+"""
args = 'se_dump_dir'
+ option_list = BaseCommand.option_list + (
+ make_option('-r', '--read-dump',
+ action='store_true',
+ dest='read_dump',
+ default=False,
+ help='Only read the the dump'
+ ),
+ make_option('-p', '--process-data',
+ action='store_true',
+ dest='process_data',
+ default=False,
+ help='Only process the data, assuming that the dump is loaded'
+ )
+ )
+
@transaction.commit_manually
def handle(self, *arg, **kwarg):
+ if django_settings.DEBUG:
+ raise CommandError(
+ 'Please set DEBUG to False in the settings.py to reduce RAM usage'
+ )
+
+ #process the command line arguments, if given
+ if kwarg['read_dump'] is False and kwarg['process_data'] is False:
+ #make them both true as a hack to simulate a condition where
+ #no flags selected means the same as both are indeed selected
+ kwarg['read_dump'] = True
+ kwarg['process_data'] = True
+
askbot_settings.update('LIMIT_ONE_ANSWER_PER_USER', False)
if not importer_is_ready():
@@ -306,16 +340,23 @@ class Command(BaseCommand):
if len(arg) < 1 or not os.path.isfile(arg[0]):
raise CommandError('Error: first argument must be a zip file with the SE forum data')
- self.zipfile = self.open_dump(arg[0])
- #read the data into SE tables
- for item in xml_read_order:
- time_before = datetime.now()
- self.load_xml_file(item)
- transaction.commit()
- time_after = datetime.now()
- if DEBUGME == True:
- print time_after - time_before
- print HEAP.heap()
+ if kwarg['read_dump']:
+ self.zipfile = self.open_dump(arg[0])
+ #read the data into SE tables
+ for item in xml_read_order:
+ time_before = datetime.now()
+ self.load_xml_file(item)
+ transaction.commit()
+ time_after = datetime.now()
+ if DEBUGME == True:
+ print time_after - time_before
+ print HEAP.heap()
+
+ if kwarg['process_data'] is False:
+ #that means we just wanted to load the xml dump to
+ #do the second step in another go in order to have
+ #more ram for the transfer of data from SE to Askbot databases
+ return
#this is important so that when we clean up messages
#automatically generated by the procedures below
@@ -651,7 +692,9 @@ class Command(BaseCommand):
c_group = []
#this loop groups revisions by revision id, then calls process function
#for the revision grup (elementary revisions posted at once)
- for se_rev in se_revs.iterator():
+ message = 'Processing revisions'
+ count = se_revs.count()
+ for se_rev in ProgressBar(se_revs.iterator(), count, message):
if se_rev.revision_guid == c_guid:
c_group.append(se_rev)
else:
@@ -853,7 +896,9 @@ class Command(BaseCommand):
return xml_file_basename + '.xml'
def transfer_users(self):
- for se_u in se.User.objects.all().iterator():
+ se_users = se.User.objects.all()
+ count = se_users.count()
+ for se_u in ProgressBar(se_users.iterator(), count):
#if se_u.id == -1:#skip the Community user
# continue
u = askbot.User()