Skip to content

Commit

Permalink
Capture article parse exceptions and continue loading.
Browse files Browse the repository at this point in the history
Improve article parse error logging.

DRY out article loading with a load_article_atomic function.
  • Loading branch information
normangilmore committed Nov 29, 2017
1 parent 3a39503 commit b591938
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 49 deletions.
15 changes: 3 additions & 12 deletions data/document_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from django.db import transaction

from thresher.models import UserProfile
from data.load_data import (load_article, parse_batch_name,
load_annotations, highlight_all)
from data.parse_document import parse_article
from data.load_data import load_article_atomic, parse_batch_name


def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
try:
Expand All @@ -29,12 +28,4 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=F
@django_rq.job('file_importer', timeout=60, result_ttl=24*3600)
def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
with transaction.atomic():
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
return article_obj.id
load_article_atomic(batch_name, raw_bytes, filename, with_annotations)
35 changes: 25 additions & 10 deletions data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from django.db import transaction

from data import init_defaults
from data.parse_document import parse_document
from data.parse_document import parse_article, ArticleParseError
from data.parse_schema import parse_schema, ParseSchemaException, OPTION_TYPES

from thresher.models import (Article, Topic, Question, Answer,
Expand Down Expand Up @@ -325,8 +325,8 @@ def load_article(batch_name, article):
metadata=article['metadata']
)
article_obj.save()
print "article id %d numbered %s" % (article_obj.id,
article_obj.article_number)
print ("{} article number db id {}"
.format(article_obj.article_number, article_obj.id))
return article_obj

def load_annotations(article, article_obj):
Expand Down Expand Up @@ -436,19 +436,34 @@ def highlight_all(parsed_article):
tuas[topic_name] = tua
parsed_article['tuas'] = tuas

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in os.listdir(dirpath):
if os.path.splitext(article_filename)[1] != '.txt':
continue
fullpath = os.path.join(dirpath, article_filename)
def load_article_atomic(batch_name, raw_bytes, orig_filename, with_annotations):
try:
with transaction.atomic():
annotated_article = parse_document(fullpath, article_filename)
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, orig_filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
if article_obj:
return article_obj.id
except ArticleParseError as e:
if e.error_type == ArticleParseError.HEADER_ERROR:
logger.warn(orig_filename + " " + e.message)
else:
logger.warn(e.message)
return None

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in sorted(os.listdir(dirpath)):
if os.path.splitext(article_filename)[1] != '.txt':
continue
fullpath = os.path.join(dirpath, article_filename)
with open(fullpath) as f:
raw_bytes = f.read()
load_article_atomic(batch_name, raw_bytes, article_filename, with_annotations)

def load_args():
parser = argparse.ArgumentParser()
Expand Down
48 changes: 21 additions & 27 deletions data/parse_document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import logging
logger = logging.getLogger(__name__)

import codecs
import json
import os
Expand Down Expand Up @@ -112,27 +115,30 @@ def parse_nickML(raw_text, filename):
if tua_id not in tuas[tua_type]:
tuas[tua_type][tua_id] = []

tua_text = clean_text[tua_span[0]:tua_span[1]].strip()
if (tua_text.lower() != tua_body.strip().lower()):
raise ArticleParseError(
"Reconstructed clean text didn't match TUA body!",
ArticleParseError.TEXT_ERROR)
tua_text = clean_text[tua_span[0]:tua_span[1]]
if (tua_text.lower().strip() != tua_body.lower().strip()):
message = (u"{} Offset error! '{}' <> '{}'"
.format(article_number, tua_text, tua_body))
raise ArticleParseError(message, ArticleParseError.TEXT_ERROR)

tua_content = list(tua_span)
tua_content.append(tua_text)
tuas[tua_type][tua_id].append(tua_content)

# If the only tua_type is 'Useless', this document is likely a duplicate.
# If the only tua_type is 'Useless', we don't need to load it.
if len(tuas.keys()) == 1 and 'Useless' in tuas.keys():
print "Possibly useless:", article_number
raise ArticleParseError("Only found Useless tuas!",
ArticleParseError.DUPLICATE_ERROR)
message = u"{} Only found Useless tuas!".format(article_number)
raise ArticleParseError(message, ArticleParseError.DUPLICATE_ERROR)

# Is it bad if angle brackets are left? Probably?
if '<' in clean_text or '>' in clean_text:
message = u"{} <> Brackets remain in clean text!".format(article_number)
# raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING)

# Warning: brackets left over are usually bad news.
if '[' in clean_text or ']' in clean_text:
print "Unparsed brackets left in article:", article_number
# raise ArticleParseError("Brackets remain in clean text!",
# ArticleParseError.BRACKET_WARNING)
message = u"{} [] Brackets remain in clean text!".format(article_number)
raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING)

# print out our data.
metadata = {
Expand All @@ -154,18 +160,6 @@ def parse_nickML(raw_text, filename):
'parser': 'nickML',
}

# print "final clean text:", clean_text
# import pprint; pprint.pprint(tuas)
# print "annotators:", annotators
# print "version:", version
# print "date published:", date_published
# print "article number:", article_number
# print "city:", city
# print "state:", state
# print "periodical:", periodical
# print "periodical code:", periodical_code
# print "\n\n\n"

def parse_header(raw_text):
# expected header format:
#
Expand Down Expand Up @@ -193,8 +187,8 @@ def parse_header(raw_text):
elif header_rownum == 3:
annotators, version = parse_annotator_line(line)
if not annotators and not version:
raise ArticleParseError("Unexpected header line 3: " + line,
ArticleParseError.HEADER_ERROR)
message = "Unexpected header line 3 has {}: ".format(line)
raise ArticleParseError(message, ArticleParseError.HEADER_ERROR)
break

if header_rownum != 3:
Expand Down Expand Up @@ -401,7 +395,7 @@ def parse_filename(filename):
return match.group('article_number', 'city', 'state', 'periodical',
'periodical_code')
else:
raise ArticleParseError('Bad File Name: ' + raw_name,
raise ArticleParseError(raw_name + 'Bad File Name',
ArticleParseError.FILENAME_ERROR)

def parse_documents(directory_path, error_directory_paths):
Expand Down

0 comments on commit b591938

Please sign in to comment.