Skip to content

Commit

Permalink
created file parse_schema_v3
Browse files Browse the repository at this point in the history
updated parser to accommodate version 3

updated parser to use topics only in loading
  • Loading branch information
JasmineDeng authored and normangilmore committed Oct 16, 2017
1 parent 108a360 commit 9b10bb1
Show file tree
Hide file tree
Showing 10 changed files with 773 additions and 221 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,11 @@ end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.py]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
80 changes: 26 additions & 54 deletions data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,25 @@
ArticleHighlight, HighlightGroup,
Contributor, ParserError)

ANALYSIS_TYPES = {}
HIGH_ID = 20000

class TopicsSchemaParser(object):
"""
Parses a json schema of topics and questions and populates the database
"""
def __init__(self, topic_obj, schema, dependencies):
def __init__(self, schema):
"""
topic_obj: The Topic object that is the parent of subtopics in schema
schema: A json schema as a string or loaded json with subtopics
dependencies: The list of answers that point to another question
"""
self.topic_obj = topic_obj
# if the schema is a string, tries to load it as json, otherwise,
# assumes it's already json
if isinstance(schema, str) or isinstance(schema, unicode):
self.schema_json = json.loads(schema)
else:
self.schema_json = schema
# ensure that the analysis_type is valid
if not isinstance(topic_obj, Topic):
raise ValueError("schema must be an instance of Topic model")
self.dep = dependencies
self.topic_obj = None

def load_answers(self, answers, question):
"""
Expand Down Expand Up @@ -110,20 +105,38 @@ def load_topics(self):
questions = topic_args.pop('questions')
# Change id to order
topic_args['order'] = topic_args.pop('id')
# Set reference to parent
topic_args['parent'] = self.topic_obj
# now the topics each have their own dependencies
# topics should already have their own glossary and instructions
# Create the topic with the values in topic_args
topic = Topic.objects.create(**topic_args)
if self.topic_obj is None:
try:
topic.save()
except ValidationError:
# we've already loaded this schema, pull it into memory.
print "Schema already exists. It will be overwritten"
curr_schema_obj = Topic.objects.get(name=self.schema_json['title'])
# We can't just delete the object because this will delete all TUAs associated with it.
# Instead, we update the Analysis Type and delete all the topics associated with it.
# When the id is set, django automatically knows to update instead of creating a new entry.
topic.id = curr_schema_obj.id
# Save the updated object
topic.save()
# delete all topics associated with this Analysis Type
# This will CASCADE DELETE all questions and answers as well
Topic.objects.filter(parent=topic).delete()
self.topic_obj = topic
self.load_questions(questions, topic)

def load_dependencies(self):
def load_dependencies(self, dependencies):
"""
Loads dependencies into targeted answers.
"""
# Report as many errors as possible to aid someone in
# debugging a schema. Don't bail on first error.
for dep in self.dep:
for dep in dependencies:
try:
# we will not have parent topics anymore
topic_obj = Topic.objects.get(parent=self.topic_obj,
order=dep.topic)
except Topic.DoesNotExist:
Expand Down Expand Up @@ -164,48 +177,10 @@ def load_dependencies(self):
answer_obj.save()

def load_schema(schema):
schema_name = schema['title']
# old schemas don't have a 'parent' for schemas
if 'parent' in schema:
schema_parent = schema['parent']
if schema_parent:
parent = Topic.objects.get(name=schema_parent)
else:
parent = None
else:
parent = None
schema_obj = Topic.objects.create(
parent=parent,
name=schema_name,
instructions=schema['instructions'],
glossary=schema['glossary'],
order=0 # Give root topics order of 0 so they sort ahead of their subtopics
)
try:
schema_obj.save()
except ValidationError:
# we've already loaded this schema, pull it into memory.
print "Schema already exists. It will be overwritten"
curr_schema_obj = Topic.objects.get(name=schema_name)
# We can't just delete the object because this will delete all TUAs associated with it.
# Instead, we update the Analysis Type and delete all the topics associated with it.
# When the id is set, django automatically knows to update instead of creating a new entry.
schema_obj.id = curr_schema_obj.id
# Save the updated object
schema_obj.save()
# delete all topics associated with this Analysis Type
# This will CASCADE DELETE all questions and answers as well
Topic.objects.filter(parent=schema_obj).delete()

ANALYSIS_TYPES[schema_name] = schema_obj

# Load the topics, questions and answers of the schema
schema_parser = TopicsSchemaParser(topic_obj=schema_obj,
schema=schema['topics'],
dependencies=schema['dependencies'])
schema_parser = TopicsSchemaParser(schema=schema['topics'])
schema_parser.load_topics()
schema_parser.load_dependencies()
return schema_obj.id
schema_parser.load_dependencies(schema['dependencies'])

def load_article(article):
new_id = int(article['metadata']['article_number'])
Expand Down Expand Up @@ -257,16 +232,13 @@ def load_annotations(article, article_obj):
for tua_type, tuas in article['tuas'].iteritems():
try:
topic = Topic.objects.filter(name=tua_type)[0]
#analysis_type = (ANALYSIS_TYPES.get(tua_type) or
# Topic.objects.get(name=tua_type))
except IndexError:
# No analysis type loaded--create a dummy type.
topic = Topic.objects.create(
name=tua_type,
instructions='',
glossary='',
)
ANALYSIS_TYPES[tua_type] = topic
print("made a dummy topic: %s" % tua_type)
# raise ValueError("No TUA type '" + tua_type +
# "' registered. Have you loaded the schemas?")
Expand Down
66 changes: 50 additions & 16 deletions data/parse_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@
GLOSSARY_ID = 'glossary:'
DEPENDENCY_ID = 'if'
DEPENDENCY_TARGET = 'then'
VERSION_ID = 'version:'
VERSION_NUM = 'v3'

QUESTION_TYPES = {'mc' : 'RADIO',
'dd' : 'RADIO', # old label
'cl' : 'CHECKBOX',
'tx' : 'TEXT',
'tb' : 'TEXT', # old label
'dt' : 'DATE',
'tm' : 'TIME'}
'tm' : 'TIME',
'st' : 'SUBTOPIC'}

Dependency = namedtuple('Dependency',
['topic', 'question', 'answer', 'next_question'])
['topic', 'question', 'answer', 'next_question', 'next_topic'])

class ParseSchemaException(Exception):

Expand Down Expand Up @@ -50,6 +53,9 @@ def parse_schema(schema_file):
load_defaults(parsed_schema)
with open(schema_file, 'r') as f:
linecount = 1
version3 = False
first_line = True
curr_topic_id = -1
for line in f:
raw_line = line.strip()

Expand Down Expand Up @@ -77,15 +83,18 @@ def parse_schema(schema_file):
# Infer the line type and parse accordingly
type_id, data = raw_line.split(None, 1)
if type_id.lower() == TITLE_ID:
parse_title(data, parsed_schema)
parse_title(data, parsed_schema, version3)
elif type_id.lower() == INSTRUCTIONS_ID:
parse_instructions(data, parsed_schema)
parse_instructions(data, parsed_schema, curr_topic_id)
elif type_id.lower() == GLOSSARY_ID:
parse_glossary(data, parsed_schema)
parse_glossary(data, parsed_schema, curr_topic_id)
elif type_id.lower() == DEPENDENCY_ID:
parse_dependency(data, parsed_schema)
elif unicode(type_id[0]).isnumeric():
parse_question_entry(type_id, data, parsed_schema)
curr_topic_id = parse_question_entry(type_id, data, parsed_schema)
elif type_id.lower() == VERSION_ID and first_line:
version3 = data.strip() == VERSION_NUM
first_line = False
else:
# type_id is not a recognized line type, or splitting the line produced unexpected pieces
msg = "Invalid type_id {}".format(type_id)
Expand All @@ -98,17 +107,30 @@ def parse_schema(schema_file):

return parsed_schema

def parse_title(title, output):
output['title'] = title
def parse_title(title, output, version3):
# only record the first title encountered (that will be the root topic)
if 'title' not in output:
output['title'] = title
if version3:
if 'topics' not in output:
output['topics'] = []
# id should take on the value of the topic_id in the question block below
output['topics'].append({
'id': None,
'name': title,
'questions': [],
})

def parse_instructions(instructions, output):
output['instructions'] = instructions
def parse_instructions(instructions, output, curr_topic_id):
ind = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] == curr_topic_id][0]
output['topics'][ind]['instructions'] = instructions

def parse_glossary(glossary_entry, output):
if 'glossary' not in output:
output['glossary'] = {}
def parse_glossary(glossary_entry, output, curr_topic_id):
ind = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] == curr_topic_id][0]
if 'glossary' not in output['topics'][ind]:
output['topics'][ind]['glossary'] = {}
term, definition = glossary_entry.split(':', 1)
output['glossary'][term.strip()] = definition.strip()
output['topics'][ind]['glossary'][term.strip()] = definition.strip()

def parse_dependency(dependency, output):

Expand All @@ -117,18 +139,23 @@ def parse_dependency(dependency, output):
target_phrase = splitted_dependency[1].split(' ')[1]
source_topic_id, source_question_id, source_answer_id = (
source_phrase.split('.'))
target_question = target_phrase.split('.')[1]
target_dependency = target_phrase.split('.')
# -1 if there is no target_question. find a better null value?
target_question = target_dependency[1] if len(target_dependency) > 1 else -1
target_topic = target_dependency[0]

source_topic_id = int(source_topic_id)
source_question_id = int(source_question_id)
target_question = int(target_question)
target_topic = int(target_topic)

# Do not convert source_answer_id to int, because value might be 'any'
# source_answer_id = int(source_answer_id)
output['dependencies'].append(Dependency(source_topic_id,
source_question_id,
source_answer_id,
target_question))
target_question,
target_topic))

def infer_hint_type(question):
match = re.search("WHERE|WHO|HOW MANY|WHEN", question, re.IGNORECASE)
Expand All @@ -155,6 +182,9 @@ def parse_question_entry(entry_id, data, output):
})
elif num_bits == 2:
topic_id, question_id = type_bits
ind_list = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] is None]
if len(ind_list) > 0:
output['topics'][ind_list[0]]['id'] = topic_id
question_id = type_bits[1]
topic = [t for t in output['topics'] if t['id'] == topic_id][0]
question_type, question_text = data.split(None, 1)
Expand All @@ -171,12 +201,16 @@ def parse_question_entry(entry_id, data, output):
})
else:
topic_id, question_id, answer_id = type_bits
ind_list = [i for i in range(len(output['topics'])) if output['topics'][i]['id'] is None]
if len(ind_list) > 0:
output['topics'][ind_list[0]]['id'] = topic_id
topic = [t for t in output['topics'] if t['id'] == topic_id][0]
question = [q for q in topic['questions'] if q['question_number'] == question_id][0]
question['answers'].append({
'answer_number': answer_id,
'answer_content': data,
})
return topic_id

def print_data(output):
print "Here's the current parsed data:"
Expand Down
Loading

0 comments on commit 9b10bb1

Please sign in to comment.