Skip to content

Commit

Permalink
Optional annotation import and fix file upload queue (#77)
Browse files Browse the repository at this point in the history
* Add command-line and web UI flags to make importing existing article annotations optional.

* Queue articles individually to allow import processing to run in separate container.

* Schema contents queued to allow import processing to run in separate container.
  • Loading branch information
normangilmore authored Mar 13, 2017
1 parent 494d316 commit 3d6a6b2
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 28 deletions.
30 changes: 18 additions & 12 deletions data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import django
django.setup()
from django.conf import settings

import argparse
import json
Expand Down Expand Up @@ -161,7 +160,7 @@ def load_dependencies(self):
answers = answers.filter(answer_number=dep.answer)
next_question_id = next_question.id
for answer in answers:
answer.next_questions = (answer.next_questions[:-1] + "," +
answer.next_questions = (answer.next_questions[:-1] + "," +
str(next_question_id) + "]")
answer.save()

Expand Down Expand Up @@ -237,7 +236,9 @@ def load_article(article, created_by):
article_obj.save()
print "article id %d numbered %s" % (article_obj.id,
article_obj.article_number)
return article_obj

def load_annotations(article, article_obj, created_by):
# In future usage, the articles being imported will not be highlighted
# already and thus won't have 'annotators'.
# The field annotators on Article logically maps to the
Expand Down Expand Up @@ -283,19 +284,19 @@ def load_schema_dir(dirpath):
for schema_file in schema_files:
load_schema(parse_schema(os.path.join(dirpath, schema_file)))

def load_article_dir(dirpath, created_by):
    # Import every .txt article found in dirpath, attributing each to created_by.
    # Non-.txt entries are ignored.
    txt_names = (name for name in os.listdir(dirpath)
                 if os.path.splitext(name)[1] == '.txt')
    for name in txt_names:
        parsed = parse_document(os.path.join(dirpath, name))
        load_article(parsed, created_by)

def load_old_schema_dir(dirpath):
    # Load every legacy *.txt schema in dirpath, in sorted order so that
    # schema numbering is deterministic across runs.
    matching = fnmatch.filter(os.listdir(dirpath), '*.txt')
    for filename in sorted(matching):
        path = os.path.join(dirpath, filename)
        load_schema(old_parse_schema(path))

# To load old schemas:
# PYTHONPATH=/home/thresher python data/load_data.py -o data/DF-schema/
def load_article_dir(dirpath, created_by, with_annotations=False):
    # Import each .txt article in dirpath; when with_annotations is True,
    # also import the highlight markup embedded in the parsed document.
    for name in os.listdir(dirpath):
        if os.path.splitext(name)[1] != '.txt':
            continue
        parsed = parse_document(os.path.join(dirpath, name))
        saved = load_article(parsed, created_by)
        if with_annotations:
            load_annotations(parsed, saved, created_by)

def load_args():
parser = argparse.ArgumentParser()
Expand All @@ -307,7 +308,12 @@ def load_args():
help='The directory holding old schema files')
parser.add_argument(
'-d', '--article-dir',
help='The directory holding raw article files for the TUA types')
help='directory with articles to load')
parser.add_argument(
'-a', '--with-annotations',
default=False,
action='store_true',
help='import article annotations and add any missing topics')
return parser.parse_args()

if __name__ == '__main__':
Expand All @@ -324,4 +330,4 @@ def load_args():
load_old_schema_dir(args.old_schema_dir)
print "Finished loading schemas"
if args.article_dir:
load_article_dir(args.article_dir, created_by)
load_article_dir(args.article_dir, created_by, args.with_annotations)
3 changes: 3 additions & 0 deletions docker/thresher_api/load_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ cd /home/thresher
export PYTHONPATH=/home/thresher
#python data/load_data.py --schema-dir=data/sample/schema
python data/load_data.py --old-schema-dir=data/DF-schema
# --with-annotations imports any ArticleHighlight, HighlightGroup markup,
# and any topics that don't already exist.
#python data/load_data.py --article-dir=data/sample/article --with-annotations
python data/load_data.py --article-dir=data/sample/article
4 changes: 4 additions & 0 deletions researcher/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@

from thresher.models import Project, Topic

# Help text shown beside the "Import annotations" checkbox on the upload page.
help_with_annotations = "Check this box to import any existing annotations and topics embedded in the articles."
class UploadArticlesForm(forms.Form):
    """Upload form for an article archive, with optional annotation import."""
    # The uploaded tar archive of article .txt files; empty uploads are rejected.
    article_archive_file = forms.FileField(allow_empty_file=False)
    # required=False because an unchecked checkbox is simply absent from POST data.
    with_annotations = forms.BooleanField(required=False,
                                          label="Import annotations",
                                          help_text=help_with_annotations)

class UploadSchemaForm(forms.Form):
schema_file = forms.FileField(allow_empty_file=False)
Expand Down
39 changes: 23 additions & 16 deletions researcher/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from researcher.forms import UploadArticlesForm, UploadSchemaForm
from researcher.forms import SendTasksForm
from data.load_data import load_article, load_schema
from data.load_data import load_article, load_annotations, load_schema
from data.parse_document import parse_article
from data.parse_schema import parse_schema
from data.legacy.parse_schema import parse_schema as old_parse_schema
Expand All @@ -35,26 +35,33 @@ def get(self, request):
{'projects': Project.objects.filter(pybossa_id__isnull=False).order_by('name')}
)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_article(filename, owner_profile_id):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
def import_archive(filename, owner_profile_id, with_annotations=False):
    """Expand a tar archive of articles and queue one import job per member.

    filename is a temp file written by the upload view; it is removed in the
    finally clause even if the archive cannot be opened. Each member's raw
    bytes are passed to the import_article job so the parsing and DB writes
    can run in a separate worker container.
    """
    try:
        with tarfile.open(filename) as tar:
            # Only regular files matching *.txt are treated as articles.
            members = [ af for af in tar.getmembers()
                if af.isfile() and fnmatch.fnmatch(af.name, "*.txt")]
            logger.info("articles found %d" % len(members))
            for member in members:
                article = tar.extractfile(member).read()
                # Queue asynchronously; the worker loads (and optionally
                # annotates) each article individually.
                import_article.delay(article, member.name, owner_profile_id, with_annotations)
    finally:
        # The uploaded temp file is no longer needed once members are queued.
        os.remove(filename)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_schema(filename, owner_profile_id):
try:
load_schema(old_parse_schema(filename))
finally:
os.remove(filename)
def import_article(article, filename, owner_profile_id, with_annotations):
    # Worker-side import of a single article's raw bytes; the profile id is
    # passed (not the object) because arguments travel through the job queue.
    profile = UserProfile.objects.get(pk=owner_profile_id)
    parsed = parse_article(article, filename)
    saved = load_article(parsed, profile)
    if with_annotations:
        load_annotations(parsed, saved, profile)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_schema(schema_contents, owner_profile_id):
    # Worker-side schema import: the raw bytes travel through the job queue,
    # so write them to a self-deleting temp file for the legacy path-based
    # parser, then load the parsed schema.
    logger.info("Received %d schema file bytes" % len(schema_contents))
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        tmp.write(schema_contents)
        tmp.flush()
        parsed = old_parse_schema(tmp.name)
        load_schema(parsed)

class UploadArticlesView(PermissionRequiredMixin, View):
form_class = UploadArticlesForm
Expand All @@ -74,15 +81,15 @@ def post(self, request):
bound_form = self.form_class(request.POST, request.FILES)
if bound_form.is_valid():
f = request.FILES['article_archive_file']
with_annotations = bound_form.cleaned_data["with_annotations"]
logger.info("Request to import article archive %s, length %d" % (f.name, f.size))
with tempfile.NamedTemporaryFile(delete=False) as archive_file:
for chunk in f.chunks():
archive_file.write(chunk)
archive_file.flush()
logger.info("Archive copied to temp file %s: tar file format: %s"
% (archive_file.name, tarfile.is_tarfile(archive_file.name)))
# Async job must delete temp file when done
import_article.delay(archive_file.name, request.user.userprofile.id)
import_archive(archive_file.name, request.user.userprofile.id, with_annotations)

return redirect('/admin/thresher/article/')
else:
Expand Down Expand Up @@ -115,13 +122,13 @@ def post(self, request):
if bound_form.is_valid():
f = request.FILES['schema_file']
logger.info("Request to import schema %s, length %d" % (f.name, f.size))
with tempfile.NamedTemporaryFile(delete=False) as schema_file:
with tempfile.NamedTemporaryFile(delete=True) as schema_file:
for chunk in f.chunks():
schema_file.write(chunk)
schema_file.flush()
logger.info("Schema copied to temp file %s" % schema_file.name)
# Async job must delete temp file when done
import_schema.delay(schema_file.name, request.user.userprofile.id)
schema_file.seek(0)
schema_contents = schema_file.read()
import_schema.delay(schema_contents, request.user.userprofile.id)

return redirect('/admin/thresher/topic/')
else:
Expand Down

0 comments on commit 3d6a6b2

Please sign in to comment.