Commit

Generic article loader with option to highlight all articles with all root topics (#178)

* Streamline saving article batch_name.

Use variable 'raw_bytes' until article text converted to UTF-8.

Improve code that handles attempts to load duplicate articles.

* Add generic article loader.

Adjust article loader to make metadata attributes and tuas optional.

* Write command line utility to highlight all articles with all root topics.

* Make highlight all utility into callable function with informative logging.

* Improve formatting of Contributor default representation.

* Integrate new generic loader and highlight-all annotator with existing Upload Articles.

Add command line script to highlight all articles with all root topics.
normangilmore authored Nov 22, 2017
1 parent 0cc4d3d commit 997cc35
Showing 7 changed files with 150 additions and 37 deletions.
18 changes: 10 additions & 8 deletions data/document_importer.py
@@ -7,7 +7,8 @@
from django.db import transaction

from thresher.models import UserProfile
from data.load_data import load_article, load_annotations, parse_batch_name
from data.load_data import (load_article, parse_batch_name,
load_annotations, highlight_all)
from data.parse_document import parse_article

def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
@@ -18,21 +19,22 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
if af.isfile() and fnmatch.fnmatch(af.name, "*.txt")]
logger.info("articles found %d" % len(members))
for member in members:
article = tar.extractfile(member).read()
raw_bytes = tar.extractfile(member).read()
article_filename = os.path.basename(member.name)
import_article.delay(batch_name, article, article_filename,
import_article.delay(batch_name, raw_bytes, article_filename,
owner_profile_id, with_annotations)
finally:
os.remove(filename)

@django_rq.job('file_importer', timeout=60, result_ttl=24*3600)
def import_article(batch_name, article, filename, owner_profile_id, with_annotations):
def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
with transaction.atomic():
annotated_article = parse_article(article, filename)
article_obj = load_article(annotated_article)
article_obj.batch_name = batch_name
article_obj.save()
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
return article_obj.id
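
For illustration only (not part of the diff), the reworked import path for a single plain-text article behaves roughly as sketched below; the filename, batch name, and path are hypothetical.

    # Sketch mirroring import_article above, under the assumptions noted.
    from data.parse_document import parse_article
    from data.load_data import load_article, load_annotations, highlight_all

    raw_bytes = open("articles/0042-town-report.txt", "rb").read()
    annotated = parse_article(raw_bytes, "0042-town-report.txt")  # decodes UTF-8, picks nickML or generic parser
    article_obj = load_article("my_batch", annotated)             # returns None if the article was already loaded
    with_annotations = True
    if article_obj and with_annotations:
        if annotated['parser'] == 'generic':
            highlight_all(annotated)    # attaches full-text 'tuas' for every root topic
        load_annotations(annotated, article_obj)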
46 changes: 46 additions & 0 deletions data/highlight_all.py
@@ -0,0 +1,46 @@
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "thresher_backend.settings")

import logging
logger = logging.getLogger(__name__)

import django
django.setup()

from thresher.models import Article, Topic
from thresher.models import Contributor, ArticleHighlight, HighlightGroup

GOLDUSERNAME=u"Full Text Highlighted"

# Create a set of highlights that covers the entire article for
# every root topic, so the Researcher can send these directly to the Quiz
def highlightArticles(root_topics, article_set, contributor):
topic_list = root_topics.all().values_list("name", flat=True)
topic_names = ", ".join(topic_list)
logger.info(u"Annotating entire articles with following topics: {}"
.format(topic_names))

for article_obj in article_set:
taskruns = article_obj.highlight_taskruns
if taskruns.filter(contributor__username=GOLDUSERNAME).count() == 0:
logger.info("Adding highlights to article number: {}"
.format(article_obj.article_number))
article_highlight = ArticleHighlight.objects.create(
article=article_obj,
contributor=contributor
)
offset_list = [[0, len(article_obj.text), article_obj.text]]
case_number = 0
for topic_obj in root_topics.all():
highlight = HighlightGroup.objects.create(offsets=offset_list,
case_number=case_number,
topic=topic_obj,
article_highlight=article_highlight)
if __name__ == '__main__':
# PE schemas start with topic_number 1
root_topics = Topic.objects.filter(topic_number=1)
article_set = Article.objects.all()
(contributor, created) = Contributor.objects.get_or_create(
username=GOLDUSERNAME
)
highlightArticles(root_topics, article_set, contributor)
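
As a usage note (not part of the diff), the function can also be called for a narrower queryset; a minimal sketch, assuming the Django setup done at the top of this module, a hypothetical batch name, and the batch_name field used elsewhere in this commit:

    # Sketch: highlight only the articles loaded from one batch.
    root_topics = Topic.objects.filter(topic_number=1)            # PE schemas start with topic_number 1
    article_set = Article.objects.filter(batch_name="my_batch")   # "my_batch" is hypothetical
    contributor, _ = Contributor.objects.get_or_create(username=GOLDUSERNAME)
    highlightArticles(root_topics, article_set, contributor)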
58 changes: 41 additions & 17 deletions data/load_data.py
@@ -293,7 +293,7 @@ def load_schema(schema):
load_dependencies(schema)
load_options(schema, root_topic)

def load_article(article):
def load_article(batch_name, article):
new_id = int(article['metadata']['article_number'])

try: # Catch duplicate article ids and assign new ids.
@@ -306,18 +306,21 @@ def load_article(article):
logger.warn("Article ID {} already assigned. New id is {}. "
"Recommend fixing source data".format(old_id, new_id))
else:
# we've already loaded this article, so don't process its TUAs.
return
# already loaded this article, return None so TUAs not reloaded
return None

except Article.DoesNotExist: # Not a duplicate.
pass

date_published=article['metadata']['date_published']
if isinstance(date_published, date):
# JSON Serializer doesn't like dates
article['metadata']['date_published']=date_published.isoformat()
if 'date_published' in article['metadata']:
date_published=article['metadata']['date_published']
if isinstance(date_published, date):
# JSON Serializer doesn't like dates
article['metadata']['date_published']=date_published.isoformat()

article_obj = Article(
article_number=new_id,
batch_name=batch_name,
text=article['text'],
metadata=article['metadata']
)
@@ -327,14 +330,17 @@ def load_article(article):
return article_obj

def load_annotations(article, article_obj):
# In future usage, the articles being imported will not be highlighted
# already and thus won't have 'annotators'.
annotators = ','.join(article['metadata']['annotators'])
if annotators == "":
annotators = "Unknown annotator"
# If articles being imported are not highlighted
# already they won't have 'tuas'.
if 'tuas' not in article:
return

contributor_name = "Unknown annotator"
if 'contributor' in article:
contributor_name = article['contributor']

(contributor, created) = Contributor.objects.get_or_create(
username="Gold Standard"
username=contributor_name
)

article_highlight = ArticleHighlight.objects.create(article=article_obj,
@@ -412,6 +418,24 @@ def parse_batch_name(orig_filename):
basename = os.path.basename(orig_filename)
return os.path.splitext(basename)[0]

topic_name_cache = None

# Create a set of highlights that covers the entire article for
# every root topic, so the Researcher can send these directly to the Quiz
def highlight_all(parsed_article):
global topic_name_cache
if topic_name_cache is None:
# PE schemas start with topic_number 1
topic_name_cache = (Topic.objects.filter(topic_number=1)
.values_list("name", flat=True))
text = parsed_article['text']
offset_list = [[0, len(text), text]]
tuas = {}
for topic_name in topic_name_cache:
tua = { 0: offset_list }
tuas[topic_name] = tua
parsed_article['tuas'] = tuas

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in os.listdir(dirpath):
@@ -420,10 +444,10 @@ def load_article_dir(dirpath, with_annotations=False):
fullpath = os.path.join(dirpath, article_filename)
with transaction.atomic():
annotated_article = parse_document(fullpath, article_filename)
article_obj = load_article(annotated_article)
article_obj.batch_name = batch_name
article_obj.save()
if with_annotations:
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)

def load_args():
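
To make the new highlight_all() output concrete, here is a sketch (not part of the diff) of the structure it attaches to parsed_article, assuming two root topics whose names are hypothetical:

    # Each root topic gets case number 0 mapped to a single offset spanning the full text.
    text = parsed_article['text']
    parsed_article['tuas'] = {
        u"Protest": {0: [[0, len(text), text]]},   # topic names are hypothetical
        u"Arrests": {0: [[0, len(text), text]]},
    }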
43 changes: 36 additions & 7 deletions data/parse_document.py
@@ -61,13 +61,41 @@ def __init__(self, message, error_type):

def parse_document(fullpath, filename):
with open(fullpath, 'r') as f:
raw_text = f.read()
return parse_article(raw_text, filename)
raw_bytes = f.read()
return parse_article(raw_bytes, filename)

def parse_article(raw_text, filename):
def parse_article(raw_bytes, filename):

# Convert to UTF-8, removing the Windows BOM if present
raw_text = raw_text.decode('utf-8-sig', errors='strict')
raw_text = raw_bytes.decode('utf-8-sig', errors='strict')

# Allowing for initial blank lines and blank space, look for
# nickML marker +*+*
if re.match(r'(\s*\r\n?|\s*\n)*\s*\+\*\+\*', raw_text):
return parse_nickML(raw_text, filename)
else:
return parse_generic(raw_text, filename)

def parse_generic(raw_text, filename):
basename = os.path.basename(filename)
match = re.search(r'^(?P<article_number>\d+)', basename)
if match:
article_number = match.group('article_number')
else:
raise ArticleParseError('No article number starting filename: ' + basename,
ArticleParseError.FILENAME_ERROR)
metadata = {
'article_number': article_number,
'filename': filename,
}
return {
'metadata': metadata,
'text': raw_text,
'contributor': u"Full Text Highlighted",
'parser': 'generic',
}

def parse_nickML(raw_text, filename):
# extract info from the file name
article_number, city, state, periodical, periodical_code = parse_filename(filename)

@@ -107,7 +135,6 @@ def parse_article(raw_text, filename):
# ArticleParseError.BRACKET_WARNING)

# print out our data.
# TODO: store this somewhere.
metadata = {
'annotators': annotators,
'version': version,
Expand All @@ -122,7 +149,9 @@ def parse_article(raw_text, filename):
return {
'metadata': metadata,
'text': clean_text,
'tuas': tuas
'tuas': tuas,
'contributor': u"Gold Standard DF",
'parser': 'nickML',
}

# print "final clean text:", clean_text
@@ -384,7 +413,7 @@ def parse_documents(directory_path, error_directory_paths):
print "PROCCESING FILE:", file_path, "..."

try:
data.append(parse_document(full_path))
data.append(parse_document(full_path, os.path.basename(full_path)))
except ArticleParseError as e:
new_path = os.path.join(error_directory_paths[e.error_type],
file_path)
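
For illustration (not part of the diff), parse_generic() would return roughly the following for a hypothetical file named 0042-town-report.txt:

    # Sketch of parse_generic() output; the filename and text are hypothetical.
    {
        'metadata': {
            'article_number': '0042',              # digits taken from the start of the basename
            'filename': '0042-town-report.txt',
        },
        'text': u'Decoded UTF-8 article text ...',
        'contributor': u"Full Text Highlighted",   # matches GOLDUSERNAME in data/highlight_all.py
        'parser': 'generic',
    }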
5 changes: 5 additions & 0 deletions docker/thresher_api/highlight_all.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -e
cd /home/thresher
export PYTHONPATH=/home/thresher
python data/highlight_all.py
6 changes: 6 additions & 0 deletions highlight_all.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# MSYS_NO_PATHCONV fixes paths on Docker Toolbox on Windows using Git Bash / Mingw
# Harmless everywhere else.
export MSYS_NO_PATHCONV=1
# DB updates can use either 'run' or 'exec'
docker-compose exec thresher_api sh /home/thresher/docker/thresher_api/highlight_all.sh
11 changes: 6 additions & 5 deletions thresher/models.py
@@ -47,11 +47,12 @@ class Meta:
)

def __unicode__(self):
return "id {} username {} pybossa user id {}".format(
self.id,
self.username,
self.pybossa_user_id
)
result = u"id {}".format(self.id)
if self.username != "":
result += u" " + self.username
if self.pybossa_user_id:
result += u" pybossa id: {}".format(self.pybossa_user_id)
return result


TASK_TYPE = (
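
For context (not part of the diff), hedged examples of the new Contributor representation with hypothetical values:

    # Contributor(id=3, username=u"Gold Standard DF", pybossa_user_id=None)
    #   -> u"id 3 Gold Standard DF"
    # Contributor(id=4, username=u"", pybossa_user_id=17)
    #   -> u"id 4 pybossa id: 17"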
