diff --git a/data/document_importer.py b/data/document_importer.py index b7579e9..5bf1fb5 100644 --- a/data/document_importer.py +++ b/data/document_importer.py @@ -7,9 +7,8 @@ from django.db import transaction from thresher.models import UserProfile -from data.load_data import (load_article, parse_batch_name, - load_annotations, highlight_all) -from data.parse_document import parse_article +from data.load_data import load_article_atomic, parse_batch_name + def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False): try: @@ -29,12 +28,4 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=F @django_rq.job('file_importer', timeout=60, result_ttl=24*3600) def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations): owner_profile = UserProfile.objects.get(pk=owner_profile_id) - with transaction.atomic(): - # n.b. conversion to UTF-8 happens in parse_article - annotated_article = parse_article(raw_bytes, filename) - article_obj = load_article(batch_name, annotated_article) - if article_obj and with_annotations: - if annotated_article['parser'] == 'generic': - highlight_all(annotated_article) - load_annotations(annotated_article, article_obj) - return article_obj.id + return load_article_atomic(batch_name, raw_bytes, filename, with_annotations) diff --git a/data/load_data.py b/data/load_data.py index 21e366a..4c08f0e 100644 --- a/data/load_data.py +++ b/data/load_data.py @@ -22,7 +22,7 @@ from django.db import transaction from data import init_defaults -from data.parse_document import parse_document +from data.parse_document import parse_article, ArticleParseError from data.parse_schema import parse_schema, ParseSchemaException, OPTION_TYPES from thresher.models import (Article, Topic, Question, Answer, @@ -325,8 +325,8 @@ def load_article(batch_name, article): metadata=article['metadata'] ) article_obj.save() - print "article id %d numbered %s" % (article_obj.id, - article_obj.article_number) 
+ print ("{} article number db id {}" + .format(article_obj.article_number, article_obj.id)) return article_obj def load_annotations(article, article_obj): @@ -436,19 +436,34 @@ def highlight_all(parsed_article): tuas[topic_name] = tua parsed_article['tuas'] = tuas -def load_article_dir(dirpath, with_annotations=False): - batch_name = parse_batch_name(dirpath) - for article_filename in os.listdir(dirpath): - if os.path.splitext(article_filename)[1] != '.txt': - continue - fullpath = os.path.join(dirpath, article_filename) +def load_article_atomic(batch_name, raw_bytes, orig_filename, with_annotations): + try: with transaction.atomic(): - annotated_article = parse_document(fullpath, article_filename) + # n.b. conversion to UTF-8 happens in parse_article + annotated_article = parse_article(raw_bytes, orig_filename) article_obj = load_article(batch_name, annotated_article) if article_obj and with_annotations: if annotated_article['parser'] == 'generic': highlight_all(annotated_article) load_annotations(annotated_article, article_obj) + if article_obj: + return article_obj.id + except ArticleParseError as e: + if e.error_type == ArticleParseError.HEADER_ERROR: + logger.warn(orig_filename + " " + e.message) + else: + logger.warn(e.message) + return None + +def load_article_dir(dirpath, with_annotations=False): + batch_name = parse_batch_name(dirpath) + for article_filename in sorted(os.listdir(dirpath)): + if os.path.splitext(article_filename)[1] != '.txt': + continue + fullpath = os.path.join(dirpath, article_filename) + with open(fullpath) as f: + raw_bytes = f.read() + load_article_atomic(batch_name, raw_bytes, article_filename, with_annotations) def load_args(): parser = argparse.ArgumentParser() diff --git a/data/parse_document.py b/data/parse_document.py index e592d43..563bd68 100644 --- a/data/parse_document.py +++ b/data/parse_document.py @@ -1,3 +1,6 @@ +import logging +logger = logging.getLogger(__name__) + import codecs import json import os @@ -112,27 
+115,30 @@ def parse_nickML(raw_text, filename): if tua_id not in tuas[tua_type]: tuas[tua_type][tua_id] = [] - tua_text = clean_text[tua_span[0]:tua_span[1]].strip() - if (tua_text.lower() != tua_body.strip().lower()): - raise ArticleParseError( - "Reconstructed clean text didn't match TUA body!", - ArticleParseError.TEXT_ERROR) + tua_text = clean_text[tua_span[0]:tua_span[1]] + if (tua_text.lower().strip() != tua_body.lower().strip()): + message = (u"{} Offset error! '{}' <> '{}'" + .format(article_number, tua_text, tua_body)) + raise ArticleParseError(message, ArticleParseError.TEXT_ERROR) tua_content = list(tua_span) tua_content.append(tua_text) tuas[tua_type][tua_id].append(tua_content) - # If the only tua_type is 'Useless', this document is likely a duplicate. + # If the only tua_type is 'Useless', we don't need to load it. if len(tuas.keys()) == 1 and 'Useless' in tuas.keys(): - print "Possibly useless:", article_number - raise ArticleParseError("Only found Useless tuas!", - ArticleParseError.DUPLICATE_ERROR) + message = u"{} Only found Useless tuas!".format(article_number) + raise ArticleParseError(message, ArticleParseError.DUPLICATE_ERROR) + + # Is it bad if angle brackets are left? Probably? + if '<' in clean_text or '>' in clean_text: + message = u"{} <> Brackets remain in clean text!".format(article_number) + # raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING) # Warning: brackets left over are usually bad news. if '[' in clean_text or ']' in clean_text: - print "Unparsed brackets left in article:", article_number -# raise ArticleParseError("Brackets remain in clean text!", -# ArticleParseError.BRACKET_WARNING) + message = u"{} [] Brackets remain in clean text!".format(article_number) + raise ArticleParseError(message, ArticleParseError.BRACKET_WARNING) # print out our data. 
metadata = { @@ -154,18 +160,6 @@ def parse_nickML(raw_text, filename): 'parser': 'nickML', } -# print "final clean text:", clean_text -# import pprint; pprint.pprint(tuas) -# print "annotators:", annotators -# print "version:", version -# print "date published:", date_published -# print "article number:", article_number -# print "city:", city -# print "state:", state -# print "periodical:", periodical -# print "periodical code:", periodical_code -# print "\n\n\n" - def parse_header(raw_text): # expected header format: # @@ -193,8 +187,8 @@ def parse_header(raw_text): elif header_rownum == 3: annotators, version = parse_annotator_line(line) if not annotators and not version: - raise ArticleParseError("Unexpected header line 3: " + line, - ArticleParseError.HEADER_ERROR) + message = "Unexpected header line 3: {}".format(line) + raise ArticleParseError(message, ArticleParseError.HEADER_ERROR) break if header_rownum != 3: @@ -401,7 +395,7 @@ def parse_filename(filename): return match.group('article_number', 'city', 'state', 'periodical', 'periodical_code') else: - raise ArticleParseError('Bad File Name: ' + raw_name, + raise ArticleParseError(raw_name + ' Bad File Name', ArticleParseError.FILENAME_ERROR) def parse_documents(directory_path, error_directory_paths):