Skip to content

Commit

Permalink
Capture article parse exceptions and continue loading.
Browse files Browse the repository at this point in the history
Improve article parse error logging.

DRY out article loading with a load_article_atomic function.
  • Loading branch information
normangilmore committed Nov 29, 2017
1 parent 3a39503 commit b591938
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 49 deletions.
15 changes: 3 additions & 12 deletions data/document_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from django.db import transaction

from thresher.models import UserProfile
from data.load_data import (load_article, parse_batch_name,
load_annotations, highlight_all)
from data.parse_document import parse_article
from data.load_data import load_article_atomic, parse_batch_name


def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
try:
Expand All @@ -29,12 +28,4 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=F
@django_rq.job('file_importer', timeout=60, result_ttl=24*3600)
def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
with transaction.atomic():
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
return article_obj.id
load_article_atomic(batch_name, raw_bytes, filename, with_annotations)
35 changes: 25 additions & 10 deletions data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from django.db import transaction

from data import init_defaults
from data.parse_document import parse_document
from data.parse_document import parse_article, ArticleParseError
from data.parse_schema import parse_schema, ParseSchemaException, OPTION_TYPES

from thresher.models import (Article, Topic, Question, Answer,
Expand Down Expand Up @@ -325,8 +325,8 @@ def load_article(batch_name, article):
metadata=article['metadata']
)
article_obj.save()
print "article id %d numbered %s" % (article_obj.id,
article_obj.article_number)
print ("{} article number db id {}"
.format(article_obj.article_number, article_obj.id))
return article_obj

def load_annotations(article, article_obj):
Expand Down Expand Up @@ -436,19 +436,34 @@ def highlight_all(parsed_article):
tuas[topic_name] = tua
parsed_article['tuas'] = tuas

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in os.listdir(dirpath):
if os.path.splitext(article_filename)[1] != '.txt':
continue
fullpath = os.path.join(dirpath, article_filename)
def load_article_atomic(batch_name, raw_bytes, orig_filename, with_annotations):
try:
with transaction.atomic():
annotated_article = parse_document(fullpath, article_filename)
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, orig_filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
if article_obj:
return article_obj.id
except ArticleParseError as e:
if e.error_type == ArticleParseError.HEADER_ERROR:
logger.warn(orig_filename + " " + e.message)
else:
logger.warn(e.message)
return None

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in sorted(os.listdir(dirpath)):
if os.path.splitext(article_filename)[1] != '.txt':
continue
fullpath = os.path.join(dirpath, article_filename)
with open(fullpath) as f:
raw_bytes = f.read()
load_article_atomic(batch_name, raw_bytes, article_filename, with_annotations)

def load_args():
parser = argparse.ArgumentParser()
Expand Down
48 changes: 21 additions & 27 deletions data/parse_document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import logging
logger = logging.getLogger(__name__)

import codecs
import json
import os
Expand Down Expand Up @@ -112,27 +115,30 @@ def parse_nickML(raw_text, filename):
if tua_id not in tuas[tua_type]:
tuas[tua_type][tua_id] = []

tua_text = clean_text[tua_span[0]:tua_span[1]].strip()
if (tua_text.lower() != tua_body.strip().lower()):
raise ArticleParseError(
"Reconstructed clean text didn't match TUA body!",
ArticleParseError.TEXT_ERROR)
tua_text = clean_text[tua_span[0]:tua_span[1]]
if (tua_text.lower().strip() != tua_body.lower().strip()):
message = (u"{} Offset error! '{}' <> '{}'"
.format(article_number, tua_text, tua_body))
raise ArticleParseError(message, ArticleParseError.TEXT_ERROR)

tua_content = list(tua_span)
tua_content.append(tua_text)
tuas[tua_type][tua_id].append(tua_content)

# If the only tua_type is 'Useless', this document is likely a duplicate.
# If the only tua_type is 'Useless', we don't need to load it.
if len(tuas.keys()) == 1 and 'Useless' in tuas.keys():
print "Possibly useless:", article_number
raise ArticleParseError("Only found Useless tuas!",
ArticleParseError.DUPLICATE_ERROR)
message = u"{} Only found Useless tuas!".format(article_number)
raise ArticleParseError(message, ArticleParseError.DUPLICATE_ERROR)

# Is it bad if angle brackets are left? Probably?
if '<' in clean_text or '>' in clean_text:
message = u"{} <> Brackets remain in clean text!".format(article_number)
# raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING)

# Warning: brackets left over are usually bad news.
if '[' in clean_text or ']' in clean_text:
print "Unparsed brackets left in article:", article_number
# raise ArticleParseError("Brackets remain in clean text!",
# ArticleParseError.BRACKET_WARNING)
message = u"{} [] Brackets remain in clean text!".format(article_number)
raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING)

# print out our data.
metadata = {
Expand All @@ -154,18 +160,6 @@ def parse_nickML(raw_text, filename):
'parser': 'nickML',
}

# print "final clean text:", clean_text
# import pprint; pprint.pprint(tuas)
# print "annotators:", annotators
# print "version:", version
# print "date published:", date_published
# print "article number:", article_number
# print "city:", city
# print "state:", state
# print "periodical:", periodical
# print "periodical code:", periodical_code
# print "\n\n\n"

def parse_header(raw_text):
# expected header format:
#
Expand Down Expand Up @@ -193,8 +187,8 @@ def parse_header(raw_text):
elif header_rownum == 3:
annotators, version = parse_annotator_line(line)
if not annotators and not version:
raise ArticleParseError("Unexpected header line 3: " + line,
ArticleParseError.HEADER_ERROR)
message = "Unexpected header line 3 has {}: ".format(line)
raise ArticleParseError(message, ArticleParseError.HEADER_ERROR)
break

if header_rownum != 3:
Expand Down Expand Up @@ -401,7 +395,7 @@ def parse_filename(filename):
return match.group('article_number', 'city', 'state', 'periodical',
'periodical_code')
else:
raise ArticleParseError('Bad File Name: ' + raw_name,
raise ArticleParseError(raw_name + 'Bad File Name',
ArticleParseError.FILENAME_ERROR)

def parse_documents(directory_path, error_directory_paths):
Expand Down

0 comments on commit b591938

Please sign in to comment.