diff --git a/data/document_importer.py b/data/document_importer.py
index debf205..b7579e9 100644
--- a/data/document_importer.py
+++ b/data/document_importer.py
@@ -7,7 +7,8 @@
 from django.db import transaction
 
 from thresher.models import UserProfile
-from data.load_data import load_article, load_annotations, parse_batch_name
+from data.load_data import (load_article, parse_batch_name,
+                            load_annotations, highlight_all)
 from data.parse_document import parse_article
 
 def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
@@ -18,21 +19,23 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=F
                    if af.isfile() and fnmatch.fnmatch(af.name, "*.txt")]
         logger.info("articles found %d" % len(members))
         for member in members:
-            article = tar.extractfile(member).read()
+            raw_bytes = tar.extractfile(member).read()
             article_filename = os.path.basename(member.name)
-            import_article.delay(batch_name, article, article_filename,
+            import_article.delay(batch_name, raw_bytes, article_filename,
                                  owner_profile_id, with_annotations)
     finally:
         os.remove(filename)
 
 @django_rq.job('file_importer', timeout=60, result_ttl=24*3600)
-def import_article(batch_name, article, filename, owner_profile_id, with_annotations):
+def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations):
     owner_profile = UserProfile.objects.get(pk=owner_profile_id)
     with transaction.atomic():
-        annotated_article = parse_article(article, filename)
-        article_obj = load_article(annotated_article)
-        article_obj.batch_name = batch_name
-        article_obj.save()
+        # n.b. conversion to UTF-8 happens in parse_article
+        annotated_article = parse_article(raw_bytes, filename)
+        article_obj = load_article(batch_name, annotated_article)
         if article_obj and with_annotations:
+            if annotated_article['parser'] == 'generic':
+                highlight_all(annotated_article)
             load_annotations(annotated_article, article_obj)
-    return article_obj.id
+    # load_article returns None for duplicate articles
+    return article_obj.id if article_obj else None
diff --git a/data/highlight_all.py b/data/highlight_all.py
new file mode 100644
index 0000000..f2d70e4
--- /dev/null
+++ b/data/highlight_all.py
@@ -0,0 +1,46 @@
+import os
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "thresher_backend.settings")
+
+import logging
+logger = logging.getLogger(__name__)
+
+import django
+django.setup()
+
+from thresher.models import Article, Topic
+from thresher.models import Contributor, ArticleHighlight, HighlightGroup
+
+GOLDUSERNAME = u"Full Text Highlighted"
+
+# Create a set of highlights that covers the entire article for
+# every root topic, so the Researcher can send these directly to the Quiz
+def highlightArticles(root_topics, article_set, contributor):
+    topic_list = root_topics.all().values_list("name", flat=True)
+    topic_names = ", ".join(topic_list)
+    logger.info(u"Annotating entire articles with the following topics: {}"
+                .format(topic_names))
+
+    for article_obj in article_set:
+        taskruns = article_obj.highlight_taskruns
+        if taskruns.filter(contributor__username=GOLDUSERNAME).count() == 0:
+            logger.info("Adding highlights to article number: {}"
+                        .format(article_obj.article_number))
+            article_highlight = ArticleHighlight.objects.create(
+                article=article_obj,
+                contributor=contributor
+            )
+            offset_list = [[0, len(article_obj.text), article_obj.text]]
+            case_number = 0
+            for topic_obj in root_topics.all():
+                HighlightGroup.objects.create(offsets=offset_list,
+                                              case_number=case_number,
+                                              topic=topic_obj,
+                                              article_highlight=article_highlight)
+
+if __name__ == '__main__':
+    # PE schemas start with topic_number 1
+    root_topics = Topic.objects.filter(topic_number=1)
+    article_set = Article.objects.all()
+    (contributor, created) = Contributor.objects.get_or_create(
+        username=GOLDUSERNAME
+    )
+    highlightArticles(root_topics, article_set, contributor)
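Side note on the document_importer.py changes above: django_rq's @job decorator leaves the decorated function callable, so import_article can be driven synchronously (for instance in a test) instead of via .delay(). A minimal sketch, with an invented file path, batch name, and profile id:

    # Sketch only: exercise the new import path without a worker.
    from data.document_importer import import_article

    with open("articles/0001_sample.txt", "rb") as f:   # hypothetical file
        raw_bytes = f.read()  # bytes in; decoding happens inside parse_article

    # .delay() would enqueue this on the 'file_importer' queue; calling the
    # function directly runs the same code in-process.
    article_id = import_article("sample_batch", raw_bytes, "0001_sample.txt",
                                owner_profile_id=1, with_annotations=True)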
diff --git a/data/load_data.py b/data/load_data.py
index b7e6926..32530a1 100644
--- a/data/load_data.py
+++ b/data/load_data.py
@@ -293,7 +293,7 @@ def load_schema(schema):
     load_dependencies(schema)
     load_options(schema, root_topic)
 
-def load_article(article):
+def load_article(batch_name, article):
     new_id = int(article['metadata']['article_number'])
     try:
         # Catch duplicate article ids and assign new ids.
@@ -306,18 +306,21 @@
             logger.warn("Article ID {} already assigned. New id is {}. "
                         "Recommend fixing source data".format(old_id, new_id))
         else:
-            # we've already loaded this article, so don't process its TUAs.
-            return
+            # already loaded this article; return None so TUAs are not reloaded
+            return None
     except Article.DoesNotExist:
         # Not a duplicate.
         pass
 
-    date_published=article['metadata']['date_published']
-    if isinstance(date_published, date):
-        # JSON Serializer doesn't like dates
-        article['metadata']['date_published']=date_published.isoformat()
+    if 'date_published' in article['metadata']:
+        date_published = article['metadata']['date_published']
+        if isinstance(date_published, date):
+            # JSON serializer doesn't like dates
+            article['metadata']['date_published'] = date_published.isoformat()
+
     article_obj = Article(
         article_number=new_id,
+        batch_name=batch_name,
         text=article['text'],
         metadata=article['metadata']
     )
@@ -327,14 +330,17 @@
     return article_obj
 
 def load_annotations(article, article_obj):
-    # In future usage, the articles being imported will not be highlighted
-    # already and thus won't have 'annotators'.
-    annotators = ','.join(article['metadata']['annotators'])
-    if annotators == "":
-        annotators = "Unknown annotator"
+    # If the articles being imported are not already highlighted,
+    # they won't have 'tuas'.
+    if 'tuas' not in article:
+        return
+
+    contributor_name = "Unknown annotator"
+    if 'contributor' in article:
+        contributor_name = article['contributor']
 
     (contributor, created) = Contributor.objects.get_or_create(
-        username="Gold Standard"
+        username=contributor_name
    )
 
     article_highlight = ArticleHighlight.objects.create(article=article_obj,
@@ -412,6 +418,24 @@
     basename = os.path.basename(orig_filename)
     return os.path.splitext(basename)[0]
 
+topic_name_cache = None
+
+# Create a set of highlights that covers the entire article for
+# every root topic, so the Researcher can send these directly to the Quiz
+def highlight_all(parsed_article):
+    global topic_name_cache
+    if topic_name_cache is None:
+        # PE schemas start with topic_number 1
+        topic_name_cache = (Topic.objects.filter(topic_number=1)
+                            .values_list("name", flat=True))
+    text = parsed_article['text']
+    offset_list = [[0, len(text), text]]
+    tuas = {}
+    for topic_name in topic_name_cache:
+        tua = {0: offset_list}
+        tuas[topic_name] = tua
+    parsed_article['tuas'] = tuas
+
 def load_article_dir(dirpath, with_annotations=False):
     batch_name = parse_batch_name(dirpath)
     for article_filename in os.listdir(dirpath):
@@ -420,10 +444,10 @@
             fullpath = os.path.join(dirpath, article_filename)
             with transaction.atomic():
                 annotated_article = parse_document(fullpath, article_filename)
-                article_obj = load_article(annotated_article)
-                article_obj.batch_name = batch_name
-                article_obj.save()
-                if with_annotations:
+                article_obj = load_article(batch_name, annotated_article)
+                if article_obj and with_annotations:
+                    if annotated_article['parser'] == 'generic':
+                        highlight_all(annotated_article)
                     load_annotations(annotated_article, article_obj)
 
 def load_args():
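For reference, the 'tuas' structure that highlight_all builds above (and that load_annotations consumes) maps each root-topic name to case numbers, and each case number to a list of [start, end, text] offset triples covering the whole article. A small illustration of the shape, with invented topic names:

    # Shape of parsed_article['tuas'] after highlight_all (topic names invented)
    text = "Police dispersed the crowd at dawn."
    offset_list = [[0, len(text), text]]  # one highlight spanning the full text

    parsed_article = {
        'text': text,
        'tuas': {
            u"Protester": {0: offset_list},  # case 0 covers the whole article
            u"Police": {0: offset_list},
        },
    }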
diff --git a/data/parse_document.py b/data/parse_document.py
index 201ce3c..e592d43 100644
--- a/data/parse_document.py
+++ b/data/parse_document.py
@@ -61,13 +61,41 @@ def __init__(self, message, error_type):
 
 def parse_document(fullpath, filename):
     with open(fullpath, 'r') as f:
-        raw_text = f.read()
-    return parse_article(raw_text, filename)
+        raw_bytes = f.read()
+    return parse_article(raw_bytes, filename)
 
-def parse_article(raw_text, filename):
+def parse_article(raw_bytes, filename):
     # Convert to UTF-8, removing the Windows BOM if present
-    raw_text = raw_text.decode('utf-8-sig', errors='strict')
+    raw_text = raw_bytes.decode('utf-8-sig', errors='strict')
+
+    # Allowing for initial blank lines and blank space, look for
+    # the nickML marker +*+*
+    if re.match(r'(\s*\r\n?|\s*\n)*\s*\+\*\+\*', raw_text):
+        return parse_nickML(raw_text, filename)
+    else:
+        return parse_generic(raw_text, filename)
+
+def parse_generic(raw_text, filename):
+    basename = os.path.basename(filename)
+    match = re.search(r'^(?P<article_number>\d+)', basename)
+    if match:
+        article_number = match.group('article_number')
+    else:
+        raise ArticleParseError('No article number starting filename: ' + basename,
+                                ArticleParseError.FILENAME_ERROR)
+    metadata = {
+        'article_number': article_number,
+        'filename': filename,
+    }
+    return {
+        'metadata': metadata,
+        'text': raw_text,
+        'contributor': u"Full Text Highlighted",
+        'parser': 'generic',
+    }
+
+def parse_nickML(raw_text, filename):
     # extract info from the file name
     article_number, city, state, periodical, periodical_code = parse_filename(filename)
@@ -107,7 +135,6 @@
 #                                   ArticleParseError.BRACKET_WARNING)
 
     # print out our data.
-    # TODO: store this somewhere.
     metadata = {
         'annotators': annotators,
         'version': version,
@@ -122,7 +149,9 @@
     return {
         'metadata': metadata,
         'text': clean_text,
-        'tuas': tuas
+        'tuas': tuas,
+        'contributor': u"Gold Standard DF",
+        'parser': 'nickML',
     }
 
     # print "final clean text:", clean_text
@@ -384,7 +413,7 @@
         print "PROCCESING FILE:", file_path, "..."
 
         try:
-            data.append(parse_document(full_path))
+            data.append(parse_document(full_path, os.path.basename(full_path)))
         except ArticleParseError as e:
             new_path = os.path.join(error_directory_paths[e.error_type],
                                     file_path)
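To make the new dispatch rule concrete: a file is routed to parse_nickML only when its first non-blank characters are the +*+* marker; anything else falls through to parse_generic, which in turn insists that the file name begin with digits. A self-contained check using the same regular expressions as above (sample strings invented):

    import re

    # Same patterns as parse_article / parse_generic above.
    NICKML_MARKER = re.compile(r'(\s*\r\n?|\s*\n)*\s*\+\*\+\*')
    ARTICLE_NUMBER = re.compile(r'^(?P<article_number>\d+)')

    assert NICKML_MARKER.match("\n  +*+*TITLE ...")           # routed to nickML
    assert NICKML_MARKER.match("Plain article text") is None  # routed to generic

    match = ARTICLE_NUMBER.search("0042_oakland_tribune.txt")
    assert match and match.group('article_number') == "0042"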
diff --git a/docker/thresher_api/highlight_all.sh b/docker/thresher_api/highlight_all.sh
new file mode 100755
index 0000000..94a86f6
--- /dev/null
+++ b/docker/thresher_api/highlight_all.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+cd /home/thresher
+export PYTHONPATH=/home/thresher
+python data/highlight_all.py
diff --git a/highlight_all.sh b/highlight_all.sh
new file mode 100755
index 0000000..63683fb
--- /dev/null
+++ b/highlight_all.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# MSYS_NO_PATHCONV fixes paths for Docker Toolbox on Windows using Git Bash / MinGW.
+# Harmless everywhere else.
+export MSYS_NO_PATHCONV=1
+# DB updates can use either 'run' or 'exec'
+docker-compose exec thresher_api sh /home/thresher/docker/thresher_api/highlight_all.sh
diff --git a/thresher/models.py b/thresher/models.py
index 941a74d..15c8b49 100644
--- a/thresher/models.py
+++ b/thresher/models.py
@@ -47,11 +47,12 @@ class Meta:
     )
 
     def __unicode__(self):
-        return "id {} username {} pybossa user id {}".format(
-            self.id,
-            self.username,
-            self.pybossa_user_id
-        )
+        result = u"id {}".format(self.id)
+        if self.username != "":
+            result += u" " + self.username
+        if self.pybossa_user_id:
+            result += u" pybossa id: {}".format(self.pybossa_user_id)
+        return result
 
 TASK_TYPE = (
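Usage note: with the containers up, the backfill is meant to be run from the repository root via the new wrapper, e.g. ./highlight_all.sh, which shells into the thresher_api service and executes data/highlight_all.py under the Django settings configured at the top of that module. With the reworked __unicode__, a contributor with only an id renders as "id 7", while one with a username and a PyBossa id renders as "id 7 alice pybossa id: 42" (values invented for illustration).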