created file parse_schema_v3

updated parser to accommodate version 3; updated parser to use topics only in loading
Goodly · Oct 16, 2017 · 9b10bb1 · 9b10bb1
1 parent 108a360
commit 9b10bb1
Show file tree

Hide file tree

Showing 10 changed files with 773 additions and 221 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -8,3 +8,11 @@ end_of_line = lf
 charset = utf-8
 trim_trailing_whitespace = true
 insert_final_newline = true
+
+[*.py]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/data/load_data.py b/data/load_data.py
@@ -29,30 +29,25 @@
                              ArticleHighlight, HighlightGroup,
                              Contributor, ParserError)
 
-ANALYSIS_TYPES = {}
 HIGH_ID = 20000
 
 class TopicsSchemaParser(object):
     """
     Parses a json schema of topics and questions and populates the database
     """
-    def __init__(self, topic_obj, schema, dependencies):
+    def __init__(self, schema):
         """
         topic_obj: The Topic object that is the parent of subtopics in schema
         schema: A json schema as a string or loaded json with subtopics
         dependencies: The list of answers that point to another question
         """
-        self.topic_obj = topic_obj
         # if the schema is a string, tries to load it as json, otherwise,
         # assumes it's already json
         if isinstance(schema, str) or isinstance(schema, unicode):
             self.schema_json = json.loads(schema)
         else:
             self.schema_json = schema
-        # ensure that the analysis_type is valid
-        if not isinstance(topic_obj, Topic):
-            raise ValueError("schema must be an instance of Topic model")
-        self.dep = dependencies
+        self.topic_obj = None
 
     def load_answers(self, answers, question):
         """
@@ -110,20 +105,38 @@ def load_topics(self):
             questions = topic_args.pop('questions')
             # Change id to order
             topic_args['order'] = topic_args.pop('id')
-            # Set reference to parent
-            topic_args['parent'] = self.topic_obj
+            # now the topics each have their own dependencies
+            # topics should already have their own glossary and instructions
             # Create the topic with the values in topic_args
             topic = Topic.objects.create(**topic_args)
+            if self.topic_obj is None:
+                try:
+                    topic.save()
+                except ValidationError:
+                    # we've already loaded this schema, pull it into memory.
+                    print "Schema already exists. It will be overwritten"
+                    curr_schema_obj = Topic.objects.get(name=self.schema_json['title'])
+                    # We can't just delete the object because this will delete all TUAs associated with it.
+                    # Instead, we update the Analysis Type and delete all the topics associated with it.
+                    # When the id is set, django automatically knows to update instead of creating a new entry.
+                    topic.id = curr_schema_obj.id
+                    # Save the updated object
+                    topic.save()
+                    # delete all topics associated with this Analysis Type
+                    # This will CASCADE DELETE all questions and answers as well
+                    Topic.objects.filter(parent=topic).delete()
+                self.topic_obj = topic
             self.load_questions(questions, topic)
 
-    def load_dependencies(self):
+    def load_dependencies(self, dependencies):
         """
         Loads dependencies into targeted answers.
         """
         # Report as many errors as possible to aid someone in
         # debugging a schema. Don't bail on first error.
-        for dep in self.dep:
+        for dep in dependencies:
             try:
+                # we will not have parent topics anymore
                 topic_obj = Topic.objects.get(parent=self.topic_obj,
                                               order=dep.topic)
             except Topic.DoesNotExist:
@@ -164,48 +177,10 @@ def load_dependencies(self):
                 answer_obj.save()
 
 def load_schema(schema):
-    schema_name = schema['title']
-    # old schemas don't have a 'parent' for schemas
-    if 'parent' in schema:
-        schema_parent = schema['parent']
-        if schema_parent:
-            parent = Topic.objects.get(name=schema_parent)
-        else:
-            parent = None
-    else:
-        parent = None
-    schema_obj = Topic.objects.create(
-        parent=parent,
-        name=schema_name,
-        instructions=schema['instructions'],
-        glossary=schema['glossary'],
-        order=0   # Give root topics order of 0 so they sort ahead of their subtopics
-    )
-    try:
-        schema_obj.save()
-    except ValidationError:
-        # we've already loaded this schema, pull it into memory.
-        print "Schema already exists. It will be overwritten"
-        curr_schema_obj = Topic.objects.get(name=schema_name)
-        # We can't just delete the object because this will delete all TUAs associated with it.
-        # Instead, we update the Analysis Type and delete all the topics associated with it.
-        # When the id is set, django automatically knows to update instead of creating a new entry.
-        schema_obj.id = curr_schema_obj.id
-        # Save the updated object
-        schema_obj.save()
-        # delete all topics associated with this Analysis Type
-        # This will CASCADE DELETE all questions and answers as well
-        Topic.objects.filter(parent=schema_obj).delete()
-
-    ANALYSIS_TYPES[schema_name] = schema_obj
-
     # Load the topics, questions and answers of the schema
-    schema_parser = TopicsSchemaParser(topic_obj=schema_obj,
-                                       schema=schema['topics'],
-                                       dependencies=schema['dependencies'])
+    schema_parser = TopicsSchemaParser(schema=schema['topics'])
     schema_parser.load_topics()
-    schema_parser.load_dependencies()
-    return schema_obj.id
+    schema_parser.load_dependencies(schema['dependencies'])
 
 def load_article(article):
     new_id = int(article['metadata']['article_number'])
@@ -257,16 +232,13 @@ def load_annotations(article, article_obj):
     for tua_type, tuas in article['tuas'].iteritems():
         try:
             topic = Topic.objects.filter(name=tua_type)[0]
-            #analysis_type = (ANALYSIS_TYPES.get(tua_type) or
-            #                 Topic.objects.get(name=tua_type))
         except IndexError:
             # No analysis type loaded--create a dummy type.
             topic = Topic.objects.create(
                 name=tua_type,
                 instructions='',
                 glossary='',
             )
-            ANALYSIS_TYPES[tua_type] = topic
             print("made a dummy topic: %s" % tua_type)
 #           raise ValueError("No TUA type '" + tua_type +
 #                            "' registered. Have you loaded the schemas?")

diff --git a/data/parse_schema.py b/data/parse_schema.py
@@ -12,17 +12,20 @@
 GLOSSARY_ID = 'glossary:'
 DEPENDENCY_ID = 'if'
 DEPENDENCY_TARGET = 'then'
+VERSION_ID = 'version:'
+VERSION_NUM = 'v3'
 
 QUESTION_TYPES = {'mc' : 'RADIO',
                   'dd' : 'RADIO', # old label
                   'cl' : 'CHECKBOX',
                   'tx' : 'TEXT',
                   'tb' : 'TEXT', # old label
                   'dt' : 'DATE',
-                  'tm' : 'TIME'}
+                  'tm' : 'TIME',
+                  'st' : 'SUBTOPIC'}
 
 Dependency = namedtuple('Dependency',
-    ['topic', 'question', 'answer', 'next_question'])
+    ['topic', 'question', 'answer', 'next_question', 'next_topic'])
 
 class ParseSchemaException(Exception):
 
@@ -50,6 +53,9 @@ def parse_schema(schema_file):
     load_defaults(parsed_schema)
     with open(schema_file, 'r') as f:
         linecount = 1
+        version3 = False
+        first_line = True
+        curr_topic_id = -1
         for line in f:
             raw_line = line.strip()
 
@@ -77,15 +83,18 @@ def parse_schema(schema_file):
             # Infer the line type and parse accordingly
             type_id, data = raw_line.split(None, 1)
             if type_id.lower() == TITLE_ID:
-                parse_title(data, parsed_schema)
+                parse_title(data, parsed_schema, version3)
             elif type_id.lower() == INSTRUCTIONS_ID:
-                parse_instructions(data, parsed_schema)
+                parse_instructions(data, parsed_schema, curr_topic_id)
             elif type_id.lower() == GLOSSARY_ID:
-                parse_glossary(data, parsed_schema)
+                parse_glossary(data, parsed_schema, curr_topic_id)
             elif type_id.lower() == DEPENDENCY_ID:
                 parse_dependency(data, parsed_schema)
             elif unicode(type_id[0]).isnumeric():
-                parse_question_entry(type_id, data, parsed_schema)
+                curr_topic_id = parse_question_entry(type_id, data, parsed_schema)
+            elif type_id.lower() == VERSION_ID and first_line:
+                version3 = data.strip() == VERSION_NUM
+                first_line = False
             else:
                 # type_id is wrong or split lines returned wrong stuffs
                 msg = "Invalid type_id {}".format(type_id)
@@ -98,17 +107,30 @@ def parse_schema(schema_file):
 
     return parsed_schema
 
-def parse_title(title, output):
-    output['title'] = title
+def parse_title(title, output, version3):
+    # only put in a title for the first title (that will be the root topic)
+    if 'title' not in output:
+        output['title'] = title
+    if version3:
+        if 'topics' not in output:
+            output['topics'] = []
+        # id should take on the value of the topic_id in the question block below
+        output['topics'].append({
+            'id': None,
+            'name': title,
+            'questions': [],
+        })
 
-def parse_instructions(instructions, output):
-    output['instructions'] = instructions
+def parse_instructions(instructions, output, curr_topic_id):
+    ind = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] == curr_topic_id][0]
+    output['topics'][ind]['instructions'] = instructions
 
-def parse_glossary(glossary_entry, output):
-    if 'glossary' not in output:
-        output['glossary'] = {}
+def parse_glossary(glossary_entry, output, curr_topic_id):
+    ind = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] == curr_topic_id][0]
+    if 'glossary' not in output['topics'][ind]:
+        output['topics'][ind]['glossary'] = {}
     term, definition = glossary_entry.split(':', 1)
-    output['glossary'][term.strip()] = definition.strip()
+    output['topics'][ind]['glossary'][term.strip()] = definition.strip()
 
 def parse_dependency(dependency, output):
 
@@ -117,18 +139,23 @@ def parse_dependency(dependency, output):
     target_phrase = splitted_dependency[1].split(' ')[1]
     source_topic_id, source_question_id, source_answer_id = (
         source_phrase.split('.'))
-    target_question = target_phrase.split('.')[1]
+    target_dependency = target_phrase.split('.')
+    # -1 if there is no target_question. find a better null value?
+    target_question = target_dependency[1] if len(target_dependency) > 1 else -1
+    target_topic = target_dependency[0]
 
     source_topic_id = int(source_topic_id)
     source_question_id = int(source_question_id)
     target_question = int(target_question)
+    target_topic = int(target_topic)
 
     # Do not convert source_answer_id to int, because value might be 'any'
     # source_answer_id = int(source_answer_id)
     output['dependencies'].append(Dependency(source_topic_id,
                                              source_question_id,
                                              source_answer_id,
-                                             target_question))
+                                             target_question,
+                                             target_topic))
 
 def infer_hint_type(question):
     match = re.search("WHERE|WHO|HOW MANY|WHEN", question, re.IGNORECASE)
@@ -155,6 +182,9 @@ def parse_question_entry(entry_id, data, output):
         })
     elif num_bits == 2:
         topic_id, question_id = type_bits
+        ind_list = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] is None]
+        if len(ind_list) > 0:
+            output['topics'][ind_list[0]]['id'] = topic_id
         question_id = type_bits[1]
         topic = [t for t in output['topics'] if t['id'] == topic_id][0]
         question_type, question_text = data.split(None, 1)
@@ -171,12 +201,16 @@ def parse_question_entry(entry_id, data, output):
         })
     else:
         topic_id, question_id, answer_id = type_bits
+        ind_list = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] is None]
+        if len(ind_list) > 0:
+            output['topics'][ind_list[0]]['id'] = topic_id
         topic = [t for t in output['topics'] if t['id'] == topic_id][0]
         question = [q for q in topic['questions'] if q['question_number'] == question_id][0]
         question['answers'].append({
             'answer_number': answer_id,
             'answer_content': data,
         })
+    return topic_id
 
 def print_data(output):
     print "Here's the current parsed data:"