summaryrefslogtreecommitdiffstats
path: root/stackexchange/parse_models.py
blob: 64796e57dd4b86da35cbd7e526bcde7cbd1832d6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
from xml.etree import ElementTree as et
import sys
import re
import os
if __name__ != '__main__':#hack do not import models if run as script
    from django.db import models
from datetime import datetime

# Prefix prepended to every generated model name and to foreign key
# targets (e.g. 'StackExchange'); the empty string means "no prefix".
table_prefix = ''
# strptime() format used to parse dateTime values. It has no field for
# fractional seconds, so that part of the timestamp is lost.
date_time_format = '%Y-%m-%dT%H:%M:%S'
# Matches an optional trailing fractional-seconds part (".123") so that it
# can be stripped before the value is parsed with date_time_format.
time_re = re.compile(r'(\.[\d]+)?$')
# Directory portion of this module's path.
# NOTE(review): presumably consumed by the data loader as the app name -
# it is not used anywhere in this file; confirm against the caller.
loader_app_name = os.path.dirname(__file__)

# Maps a column type to the Django field class written into the generated
# model source. Keys are the type names read from the .xsd files plus the
# kinds assigned by this module itself: 'FK' and 'PK' (set by DjangoFK and
# DjangoPK) and 'text' (set by get_col_type for long strings).
types = {
   'unsignedByte':'models.IntegerField',
   'FK':'models.ForeignKey',
   'PK':'models.IntegerField',
   'string':'models.CharField',
   'text':'models.TextField',
   'int':'models.IntegerField',
   'boolean':'models.NullBooleanField',
   'dateTime':'models.DateTimeField',
   'base64Binary':'models.TextField',
   # NOTE(review): 'double' is stored in an integer field, so any
   # fractional part cannot be represented - confirm this is intended.
   'double':'models.IntegerField',
}

def camel_to_python(camel):
    """Convert a CamelCase identifier to lower-case snake_case.

    For example 'OwnerUserId' becomes 'owner_user_id'.
    Recipe from http://stackoverflow.com/questions/1175208/
    """
    # Pass 1: put an underscore in front of every capitalised word
    # (an upper-case letter followed by lower-case letters).
    words_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel)
    # Pass 2: split the remaining lower-case/digit -> upper-case boundaries,
    # which also separates trailing bits like "Id" and runs of capitals.
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', words_split)
    return fully_split.lower()

def singular(word):
    """Return word without a single trailing 's', if it has one.

    This is a plain suffix strip, not real singularisation:
    'Posts' -> 'Post', but also 'Address' -> 'Addres'.
    """
    return word[:-1] if word.endswith('s') else word

def get_table_name(name):
    """Build the model/table name from the basename of a schema file.

    The name is cut at every '2', each piece loses one trailing 's'
    (see singular), the pieces are glued back together with '2' and
    table_prefix is put in front. With an empty prefix:
    'Posts' -> 'Post', 'Posts2Votes' -> 'Post2Vote'.
    """
    # A name without any '2' splits into a single piece, so the plain
    # case and the "A2B" case are handled by the same expression.
    pieces = [singular(piece) for piece in name.split('2')]
    return table_prefix + '2'.join(pieces)

class DjangoModel(object):
    """One Django model class that will be written out as source code.

    str(model) yields the "class ...(models.Model):" header followed by
    one indented line per field, in the order the fields were added.
    """
    def __init__(self, name):
        # name is the schema file basename; get_table_name turns it into
        # the class name used in the generated source.
        self.fields = []
        self.name = get_table_name(name)

    def add_field(self, field):
        """Attach field to this model and record the back-reference
        (field.table) that the field needs when rendering itself.
        """
        field.table = self
        self.fields.append(field)

    def __str__(self):
        pieces = ['class %s(models.Model):\n' % self.name]
        pieces.extend('    %s\n' % str(item) for item in self.fields)
        return ''.join(pieces)

class DjangoField(object):
    """One model field, rendered by str() as a line of Django model source
    such as "title = models.CharField(max_length=250, null=True)".
    """
    def __init__(self, name, type, restriction = None):
        """name        -- column name from the schema; stored converted to
                          snake_case ('class' is renamed to 'class_type')
        type        -- a key of the module-level ``types`` mapping
        restriction -- max length for 'string' fields; None or -1 both
                       mean "no restriction"
        """
        self.name = camel_to_python(name)
        if self.name == 'class':
            self.name = 'class_type'#work around python keyword
        self.type = type
        self.table = None  # set by DjangoModel.add_field
        self.restriction = restriction
        self.relation = None  # only filled in by DjangoFK

    def __str__(self):
        """Return the "<name> = <FieldClass>(<args>)" source line.

        Raises KeyError for a type missing from ``types`` and Exception
        when a length restriction is given for a non-string type.
        """
        out  = '%s = %s(' % (self.name, types[self.type])
        if self.type == 'FK':
            out += "'%s'" % self.relation
            # related_name is built from the owning model and the field
            # name, so the field must already be attached to a model.
            out += ", related_name='%s_by_%s_set'" % (self.table.name, self.name)
            out += ', null=True'#nullable to make life easier
        elif self.type == 'PK':
            out += 'primary_key=True'
        elif self.restriction is not None and self.restriction != -1:
            # The constructor default (None) used to fall into this branch
            # and produced "max_length=None" or a spurious exception; it
            # is now treated the same as the explicit -1 marker.
            if self.type != 'string':
                raise Exception('restriction (max_length) supported only for string type')
            out += 'max_length=%s' % self.restriction
            out += ', null=True'
        else:
            out += 'null=True'
        out += ')'
        return out

    def get_type(self):
        """Return the type key this field was created with."""
        return self.type

class DjangoPK(DjangoField):
    """Primary key field: always named 'id' and of type 'PK'."""
    def __init__(self):
        # Run the base initialiser instead of setting name/type by hand, so
        # that table, restriction and relation are defined on PK instances
        # too. 'id' is already snake_case, so the stored name stays 'id'.
        super(DjangoPK, self).__init__('id', 'PK')

class DjangoFK(DjangoField):
    """Foreign key field built from a column name such as 'OwnerUserId'."""
    def __init__(self, source_name):
        """source_name -- schema column name; it must end with 'Id'.

        The field name and the relation target are both derived from the
        part in front of that suffix. Raises ValueError when the suffix
        is missing.
        """
        if not source_name.endswith('Id'):
            raise ValueError(
                'foreign key column name must end with "Id": %r' % source_name
            )
        # Strip only the trailing suffix. Splitting on every 'Id' left the
        # name undefined for columns with 'Id' elsewhere in the name
        # (e.g. 'IdentityId').
        name = source_name[:-len('Id')]
        super(DjangoFK, self).__init__(name, 'FK')
        self.set_relation(name)

    def set_relation(self, name):
        """Work out the model this key points to and store it in
        self.relation: any name ending in 'User' or 'Post' maps to that
        model, 'AcceptedAnswer' and 'Parent' are self-references, and
        everything else maps to a model of the same name.
        """
        if name in ('AcceptedAnswer','Parent'):
            self.relation = 'self' #self-referential Post model
        elif name.endswith('User'):
            self.relation = table_prefix + 'User'
        elif name.endswith('Post'):
            self.relation = table_prefix + 'Post'
        else:
            self.relation = table_prefix + name

    def get_relation(self):
        """Return the relation target computed by set_relation."""
        return self.relation

def get_col_type(col):
    """Return (type, restriction) for one schema column element.

    The type comes from the element's ``type`` attribute or, if that is
    absent, from the ``base`` attribute of its nested
    simpleType/restriction element. restriction is the integer ``value``
    of the first child of that restriction element, or -1 when there is
    none. Strings restricted to more than 400 characters are reported as
    ('text', -1).

    Raises ValueError when the element has neither a ``type`` attribute
    nor a nested restriction element.
    """
    col_type = col.get('type')
    restriction = -1
    if col_type is None:
        type_e = col.find('.//simpleType/restriction')
        if type_e is None:
            raise ValueError(
                'cannot determine type of column %r' % col.get('name')
            )
        col_type = type_e.get('base')
        try:
            # Index the element directly instead of calling getchildren(),
            # which no longer exists in current ElementTree versions.
            restriction = int(type_e[0].get('value'))
        except (IndexError, TypeError, ValueError):
            # no child element, no "value" attribute, or a non-numeric one
            restriction = -1
        if restriction > 400:
            # too long for a CharField - use a TextField instead
            col_type = 'text'
            restriction = -1
    return col_type, restriction

def make_field_from_xml_tree(xml_element):
    """Build the field object for one column element of the schema.

    Used by the model parser. The field kind decides the generated
    database schema, so the checks below go from most to least specific.
    Returns None for columns that are deliberately skipped.
    """
    name = xml_element.get('name')

    if name == 'LinkedVoteId':
        # not used
        return None
    if name == 'Id':
        return DjangoPK()
    # Every other "...Id" column is a foreign key, except these two which
    # merely happen to end in "Id".
    if name.endswith('Id') and name not in ('OpenId','PasswordId'):
        return DjangoFK(name)
    if name.endswith('GUID'):
        return DjangoField(name, 'string', 64)

    col_type, max_length = get_col_type(xml_element)
    return DjangoField(name, col_type, max_length)

def parse_field_name(input):
    """Translate a schema column name into the model field name.

    Used by the data reader. The naming rules are spread over three
    classes (DjangoField, DjangoPK and DjangoFK), so this function
    creates a throw-away field object on every call just to read the
    name it computes.
    """
    if input == 'Id':
        return DjangoPK().name
    # 'OpenId' and 'PasswordId' only look like foreign keys; they are
    # named like any ordinary column.
    if input.endswith('Id') and input not in ('OpenId', 'PasswordId'):
        return DjangoFK(input).name
    # Type and length are placeholders here - only the name is read.
    return DjangoField(input, 'string', 7).name

def parse_value(input, field_object):
    """Convert a raw text value into the Python value for a model field.

    input        -- the value as read from the data file
    field_object -- the Django model field the value is destined for

    Returns, depending on the field class:
      ForeignKey    -- the related object with that id; if none exists an
                       otherwise empty placeholder is saved and returned
      IntegerField  -- int
      CharField / TextField -- the input unchanged
      BooleanField  -- bool
      DateTimeField -- datetime, with fractional seconds dropped
    and None for any other field class.

    Raises Exception when the text cannot be converted.
    """
    if isinstance(field_object, models.ForeignKey):
        try:
            pk = int(input)
        except (TypeError, ValueError):
            raise Exception('non-numeric foreign key %s' % input)
        related_model = field_object.rel.to
        try:
            return related_model.objects.get(id=pk)
        except related_model.DoesNotExist:
            placeholder = related_model(id=pk)
            placeholder.save()#save fake empty object
            return placeholder
    elif isinstance(field_object, models.IntegerField):
        try:
            return int(input)
        except (TypeError, ValueError):
            raise Exception('expected integer, found %s' % input)
    elif isinstance(field_object, (models.CharField, models.TextField)):
        return input
    elif isinstance(field_object, models.BooleanField):
        # NOTE(review): the ``types`` table generates NullBooleanField for
        # boolean columns - confirm such fields actually reach this branch.
        if isinstance(input, bool):
            return input
        # bool() of a string is True for any non-empty text, including
        # "false", so the text has to be interpreted explicitly.
        try:
            text = input.strip().lower()
        except AttributeError:
            raise Exception('boolean value expected %s found' % input)
        if text in ('true', '1'):
            return True
        if text in ('false', '0', ''):
            return False
        raise Exception('boolean value expected %s found' % input)
    elif isinstance(field_object, models.DateTimeField):
        # date_time_format has no fractional seconds, so strip them first
        input = time_re.sub('', input)
        try:
            return datetime.strptime(input, date_time_format)
        except (TypeError, ValueError):
            raise Exception('datetime expected "%s" found' % input)
    # unsupported field classes yield no value
    return None

# Script part: print the source of a Django models module to stdout, one
# model class for every command line argument that names an .xsd file.
# NOTE(review): nothing guards this with "if __name__ == '__main__'", so it
# also runs whenever the module is imported - confirm that is intended.
# print is called with one parenthesised argument throughout, which the
# Python 2 print statement treats exactly like the bare form.
print('from django.db import models')
for xsd_path in sys.argv:
    if '.xsd' not in xsd_path:
        continue

    # the model is named after the file, e.g. "Posts.xsd" -> "Posts"
    tname = os.path.basename(xsd_path).replace('.xsd','')
    tree = et.parse(xsd_path)
    model = DjangoModel(tname)

    # every child of the first <sequence> element describes one column
    row = tree.find('.//sequence')
    for col in row:
        field = make_field_from_xml_tree(col)
        if field is not None:
            model.add_field(field)
    print(model)