Skip to content

Commit

Permalink
Optional annotation import and fix file upload queue (#77)
Browse files Browse the repository at this point in the history
* Add command-line and web UI flags to make importing existing article annotations optional.

* Queue articles individually to allow import processing to run in separate container.

* Schema contents queued to allow import processing to run in separate container.
  • Loading branch information
normangilmore authored Mar 13, 2017
1 parent 494d316 commit 3d6a6b2
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 28 deletions.
30 changes: 18 additions & 12 deletions data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import django
django.setup()
from django.conf import settings

import argparse
import json
Expand Down Expand Up @@ -161,7 +160,7 @@ def load_dependencies(self):
answers = answers.filter(answer_number=dep.answer)
next_question_id = next_question.id
for answer in answers:
answer.next_questions = (answer.next_questions[:-1] + "," +
answer.next_questions = (answer.next_questions[:-1] + "," +
str(next_question_id) + "]")
answer.save()

Expand Down Expand Up @@ -237,7 +236,9 @@ def load_article(article, created_by):
article_obj.save()
print "article id %d numbered %s" % (article_obj.id,
article_obj.article_number)
return article_obj

def load_annotations(article, article_obj, created_by):
# In future usage, the articles being imported will not be highlighted
# already and thus won't have 'annotators'.
# The field annotators on Article logically maps to the
Expand Down Expand Up @@ -283,19 +284,19 @@ def load_schema_dir(dirpath):
for schema_file in schema_files:
load_schema(parse_schema(os.path.join(dirpath, schema_file)))

def load_article_dir(dirpath, created_by):
    # Import every .txt article found in dirpath, attributing each to created_by.
    # Non-.txt entries are ignored.
    txt_names = (name for name in os.listdir(dirpath)
                 if os.path.splitext(name)[1] == '.txt')
    for name in txt_names:
        parsed = parse_document(os.path.join(dirpath, name))
        load_article(parsed, created_by)

def load_old_schema_dir(dirpath):
    # Load every legacy *.txt schema in dirpath, in sorted order so that
    # schema numbering is deterministic across runs.
    matching = fnmatch.filter(os.listdir(dirpath), '*.txt')
    for filename in sorted(matching):
        path = os.path.join(dirpath, filename)
        load_schema(old_parse_schema(path))

# To load old schemas:
# PYTHONPATH=/home/thresher python data/load_data.py -o data/DF-schema/
def load_article_dir(dirpath, created_by, with_annotations=False):
    # Import each .txt article in dirpath; when with_annotations is True,
    # also import the highlight markup embedded in the parsed document.
    for name in os.listdir(dirpath):
        if os.path.splitext(name)[1] != '.txt':
            continue
        parsed = parse_document(os.path.join(dirpath, name))
        saved = load_article(parsed, created_by)
        if with_annotations:
            load_annotations(parsed, saved, created_by)

def load_args():
parser = argparse.ArgumentParser()
Expand All @@ -307,7 +308,12 @@ def load_args():
help='The directory holding old schema files')
parser.add_argument(
'-d', '--article-dir',
help='The directory holding raw article files for the TUA types')
help='directory with articles to load')
parser.add_argument(
'-a', '--with-annotations',
default=False,
action='store_true',
help='import article annotations and add any missing topics')
return parser.parse_args()

if __name__ == '__main__':
Expand All @@ -324,4 +330,4 @@ def load_args():
load_old_schema_dir(args.old_schema_dir)
print "Finished loading schemas"
if args.article_dir:
load_article_dir(args.article_dir, created_by)
load_article_dir(args.article_dir, created_by, args.with_annotations)
3 changes: 3 additions & 0 deletions docker/thresher_api/load_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ cd /home/thresher
export PYTHONPATH=/home/thresher
#python data/load_data.py --schema-dir=data/sample/schema
python data/load_data.py --old-schema-dir=data/DF-schema
# --with-annotations imports any ArticleHighlight, HighlightGroup markup,
# and any topics that don't already exist.
#python data/load_data.py --article-dir=data/sample/article --with-annotations
python data/load_data.py --article-dir=data/sample/article
4 changes: 4 additions & 0 deletions researcher/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@

from thresher.models import Project, Topic

# Help text shown beside the "Import annotations" checkbox on the upload page.
help_with_annotations = "Check this box to import any existing annotations and topics embedded in the articles."
class UploadArticlesForm(forms.Form):
    """Upload form for an article archive, with optional annotation import."""
    # The uploaded tar archive of article .txt files; empty uploads are rejected.
    article_archive_file = forms.FileField(allow_empty_file=False)
    # required=False because an unchecked checkbox is simply absent from POST data.
    with_annotations = forms.BooleanField(required=False,
                                          label="Import annotations",
                                          help_text=help_with_annotations)

class UploadSchemaForm(forms.Form):
schema_file = forms.FileField(allow_empty_file=False)
Expand Down
39 changes: 23 additions & 16 deletions researcher/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from researcher.forms import UploadArticlesForm, UploadSchemaForm
from researcher.forms import SendTasksForm
from data.load_data import load_article, load_schema
from data.load_data import load_article, load_annotations, load_schema
from data.parse_document import parse_article
from data.parse_schema import parse_schema
from data.legacy.parse_schema import parse_schema as old_parse_schema
Expand All @@ -35,26 +35,33 @@ def get(self, request):
{'projects': Project.objects.filter(pybossa_id__isnull=False).order_by('name')}
)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_article(filename, owner_profile_id):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
def import_archive(filename, owner_profile_id, with_annotations=False):
    """Expand a tar archive of articles and queue one import job per member.

    filename is a temp file written by the upload view; it is removed in the
    finally clause even if the archive cannot be opened. Each member's raw
    bytes are passed to the import_article job so the parsing and DB writes
    can run in a separate worker container.
    """
    try:
        with tarfile.open(filename) as tar:
            # Only regular files matching *.txt are treated as articles.
            members = [ af for af in tar.getmembers()
                if af.isfile() and fnmatch.fnmatch(af.name, "*.txt")]
            logger.info("articles found %d" % len(members))
            for member in members:
                article = tar.extractfile(member).read()
                # Queue asynchronously; the worker loads (and optionally
                # annotates) each article individually.
                import_article.delay(article, member.name, owner_profile_id, with_annotations)
    finally:
        # The uploaded temp file is no longer needed once members are queued.
        os.remove(filename)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_schema(filename, owner_profile_id):
try:
load_schema(old_parse_schema(filename))
finally:
os.remove(filename)
def import_article(article, filename, owner_profile_id, with_annotations):
    # Worker-side import of a single article's raw bytes; the profile id is
    # passed (not the object) because arguments travel through the job queue.
    profile = UserProfile.objects.get(pk=owner_profile_id)
    parsed = parse_article(article, filename)
    saved = load_article(parsed, profile)
    if with_annotations:
        load_annotations(parsed, saved, profile)

@django_rq.job('default', timeout=60, result_ttl=24*3600)
def import_schema(schema_contents, owner_profile_id):
    # Worker-side schema import: the raw bytes travel through the job queue,
    # so write them to a self-deleting temp file for the legacy path-based
    # parser, then load the parsed schema.
    logger.info("Received %d schema file bytes" % len(schema_contents))
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        tmp.write(schema_contents)
        tmp.flush()
        parsed = old_parse_schema(tmp.name)
        load_schema(parsed)

class UploadArticlesView(PermissionRequiredMixin, View):
form_class = UploadArticlesForm
Expand All @@ -74,15 +81,15 @@ def post(self, request):
bound_form = self.form_class(request.POST, request.FILES)
if bound_form.is_valid():
f = request.FILES['article_archive_file']
with_annotations = bound_form.cleaned_data["with_annotations"]
logger.info("Request to import article archive %s, length %d" % (f.name, f.size))
with tempfile.NamedTemporaryFile(delete=False) as archive_file:
for chunk in f.chunks():
archive_file.write(chunk)
archive_file.flush()
logger.info("Archive copied to temp file %s: tar file format: %s"
% (archive_file.name, tarfile.is_tarfile(archive_file.name)))
# Async job must delete temp file when done
import_article.delay(archive_file.name, request.user.userprofile.id)
import_archive(archive_file.name, request.user.userprofile.id, with_annotations)

return redirect('/admin/thresher/article/')
else:
Expand Down Expand Up @@ -115,13 +122,13 @@ def post(self, request):
if bound_form.is_valid():
f = request.FILES['schema_file']
logger.info("Request to import schema %s, length %d" % (f.name, f.size))
with tempfile.NamedTemporaryFile(delete=False) as schema_file:
with tempfile.NamedTemporaryFile(delete=True) as schema_file:
for chunk in f.chunks():
schema_file.write(chunk)
schema_file.flush()
logger.info("Schema copied to temp file %s" % schema_file.name)
# Async job must delete temp file when done
import_schema.delay(schema_file.name, request.user.userprofile.id)
schema_file.seek(0)
schema_contents = schema_file.read()
import_schema.delay(schema_contents, request.user.userprofile.id)

return redirect('/admin/thresher/topic/')
else:
Expand Down

0 comments on commit 3d6a6b2

Please sign in to comment.