Commit

Generic article loader with option to highlight all articles with all root topics (#178)

* Streamline saving article batch_name.

Use variable 'raw_bytes' until article text converted to UTF-8.

Improve code that handles attempts to load duplicate articles.

* Add generic article loader.

Adjust article loader to make metadata attributes and tuas optional.

* Write command line utility to highlight all articles with all root topics.

* Make highlight all utility into callable function with informative logging.

* Improve formatting of Contributor default representation.

* Integrate new generic loader and highlight-all annotator with existing Upload Articles.

Add command line script to highlight all articles with all root topics.
normangilmore authored Nov 22, 2017
1 parent 0cc4d3d commit 997cc35
Showing 7 changed files with 150 additions and 37 deletions.
18 changes: 10 additions & 8 deletions data/document_importer.py
@@ -7,7 +7,8 @@
from django.db import transaction

from thresher.models import UserProfile
from data.load_data import load_article, load_annotations, parse_batch_name
from data.load_data import (load_article, parse_batch_name,
load_annotations, highlight_all)
from data.parse_document import parse_article

def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
@@ -18,21 +19,22 @@ def import_archive(orig_filename, filename, owner_profile_id, with_annotations=False):
if af.isfile() and fnmatch.fnmatch(af.name, "*.txt")]
logger.info("articles found %d" % len(members))
for member in members:
article = tar.extractfile(member).read()
raw_bytes = tar.extractfile(member).read()
article_filename = os.path.basename(member.name)
import_article.delay(batch_name, article, article_filename,
import_article.delay(batch_name, raw_bytes, article_filename,
owner_profile_id, with_annotations)
finally:
os.remove(filename)

@django_rq.job('file_importer', timeout=60, result_ttl=24*3600)
def import_article(batch_name, article, filename, owner_profile_id, with_annotations):
def import_article(batch_name, raw_bytes, filename, owner_profile_id, with_annotations):
owner_profile = UserProfile.objects.get(pk=owner_profile_id)
with transaction.atomic():
annotated_article = parse_article(article, filename)
article_obj = load_article(annotated_article)
article_obj.batch_name = batch_name
article_obj.save()
# n.b. conversion to UTF-8 happens in parse_article
annotated_article = parse_article(raw_bytes, filename)
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)
return article_obj.id
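
For illustration only (not part of the diff), the reworked import path for a single plain-text article behaves roughly as sketched below; the filename, batch name, and path are hypothetical.

    # Sketch mirroring import_article above, under the assumptions noted.
    from data.parse_document import parse_article
    from data.load_data import load_article, load_annotations, highlight_all

    raw_bytes = open("articles/0042-town-report.txt", "rb").read()
    annotated = parse_article(raw_bytes, "0042-town-report.txt")  # decodes UTF-8, picks nickML or generic parser
    article_obj = load_article("my_batch", annotated)             # returns None if the article was already loaded
    with_annotations = True
    if article_obj and with_annotations:
        if annotated['parser'] == 'generic':
            highlight_all(annotated)    # attaches full-text 'tuas' for every root topic
        load_annotations(annotated, article_obj)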
46 changes: 46 additions & 0 deletions data/highlight_all.py
@@ -0,0 +1,46 @@
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "thresher_backend.settings")

import logging
logger = logging.getLogger(__name__)

import django
django.setup()

from thresher.models import Article, Topic
from thresher.models import Contributor, ArticleHighlight, HighlightGroup

GOLDUSERNAME=u"Full Text Highlighted"

# Create a set of highlights that covers the entire article for
# every root topic, so the Researcher can send these directly to the Quiz
def highlightArticles(root_topics, article_set, contributor):
topic_list = root_topics.all().values_list("name", flat=True)
topic_names = ", ".join(topic_list)
logger.info(u"Annotating entire articles with following topics: {}"
.format(topic_names))

for article_obj in article_set:
taskruns = article_obj.highlight_taskruns
if taskruns.filter(contributor__username=GOLDUSERNAME).count() == 0:
logger.info("Adding highlights to article number: {}"
.format(article_obj.article_number))
article_highlight = ArticleHighlight.objects.create(
article=article_obj,
contributor=contributor
)
offset_list = [[0, len(article_obj.text), article_obj.text]]
case_number = 0
for topic_obj in root_topics.all():
highlight = HighlightGroup.objects.create(offsets=offset_list,
case_number=case_number,
topic=topic_obj,
article_highlight=article_highlight)
if __name__ == '__main__':
# PE schemas start with topic_number 1
root_topics = Topic.objects.filter(topic_number=1)
article_set = Article.objects.all()
(contributor, created) = Contributor.objects.get_or_create(
username=GOLDUSERNAME
)
highlightArticles(root_topics, article_set, contributor)
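
As a usage note (not part of the diff), the function can also be called for a narrower queryset; a minimal sketch, assuming the Django setup done at the top of this module, a hypothetical batch name, and the batch_name field used elsewhere in this commit:

    # Sketch: highlight only the articles loaded from one batch.
    root_topics = Topic.objects.filter(topic_number=1)            # PE schemas start with topic_number 1
    article_set = Article.objects.filter(batch_name="my_batch")   # "my_batch" is hypothetical
    contributor, _ = Contributor.objects.get_or_create(username=GOLDUSERNAME)
    highlightArticles(root_topics, article_set, contributor)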
58 changes: 41 additions & 17 deletions data/load_data.py
@@ -293,7 +293,7 @@ def load_schema(schema):
load_dependencies(schema)
load_options(schema, root_topic)

def load_article(article):
def load_article(batch_name, article):
new_id = int(article['metadata']['article_number'])

try: # Catch duplicate article ids and assign new ids.
@@ -306,18 +306,21 @@ def load_article(article):
logger.warn("Article ID {} already assigned. New id is {}. "
"Recommend fixing source data".format(old_id, new_id))
else:
# we've already loaded this article, so don't process its TUAs.
return
# already loaded this article, return None so TUAs not reloaded
return None

except Article.DoesNotExist: # Not a duplicate.
pass

date_published=article['metadata']['date_published']
if isinstance(date_published, date):
# JSON Serializer doesn't like dates
article['metadata']['date_published']=date_published.isoformat()
if 'date_published' in article['metadata']:
date_published=article['metadata']['date_published']
if isinstance(date_published, date):
# JSON Serializer doesn't like dates
article['metadata']['date_published']=date_published.isoformat()

article_obj = Article(
article_number=new_id,
batch_name=batch_name,
text=article['text'],
metadata=article['metadata']
)
@@ -327,14 +330,17 @@ def load_article(article):
return article_obj

def load_annotations(article, article_obj):
# In future usage, the articles being imported will not be highlighted
# already and thus won't have 'annotators'.
annotators = ','.join(article['metadata']['annotators'])
if annotators == "":
annotators = "Unknown annotator"
# If articles being imported are not highlighted
# already they won't have 'tuas'.
if 'tuas' not in article:
return

contributor_name = "Unknown annotator"
if 'contributor' in article:
contributor_name = article['contributor']

(contributor, created) = Contributor.objects.get_or_create(
username="Gold Standard"
username=contributor_name
)

article_highlight = ArticleHighlight.objects.create(article=article_obj,
@@ -412,6 +418,24 @@ def parse_batch_name(orig_filename):
basename = os.path.basename(orig_filename)
return os.path.splitext(basename)[0]

topic_name_cache = None

# Create a set of highlights that covers the entire article for
# every root topic, so the Researcher can send these directly to the Quiz
def highlight_all(parsed_article):
global topic_name_cache
if topic_name_cache is None:
# PE schemas start with topic_number 1
topic_name_cache = (Topic.objects.filter(topic_number=1)
.values_list("name", flat=True))
text = parsed_article['text']
offset_list = [[0, len(text), text]]
tuas = {}
for topic_name in topic_name_cache:
tua = { 0: offset_list }
tuas[topic_name] = tua
parsed_article['tuas'] = tuas

def load_article_dir(dirpath, with_annotations=False):
batch_name = parse_batch_name(dirpath)
for article_filename in os.listdir(dirpath):
@@ -420,10 +444,10 @@ def load_article_dir(dirpath, with_annotations=False):
fullpath = os.path.join(dirpath, article_filename)
with transaction.atomic():
annotated_article = parse_document(fullpath, article_filename)
article_obj = load_article(annotated_article)
article_obj.batch_name = batch_name
article_obj.save()
if with_annotations:
article_obj = load_article(batch_name, annotated_article)
if article_obj and with_annotations:
if annotated_article['parser'] == 'generic':
highlight_all(annotated_article)
load_annotations(annotated_article, article_obj)

def load_args():
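
To make the new highlight_all() output concrete, here is a sketch (not part of the diff) of the structure it attaches to parsed_article, assuming two root topics whose names are hypothetical:

    # Each root topic gets case number 0 mapped to a single offset spanning the full text.
    text = parsed_article['text']
    parsed_article['tuas'] = {
        u"Protest": {0: [[0, len(text), text]]},   # topic names are hypothetical
        u"Arrests": {0: [[0, len(text), text]]},
    }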
43 changes: 36 additions & 7 deletions data/parse_document.py
@@ -61,13 +61,41 @@ def __init__(self, message, error_type):

def parse_document(fullpath, filename):
with open(fullpath, 'r') as f:
raw_text = f.read()
return parse_article(raw_text, filename)
raw_bytes = f.read()
return parse_article(raw_bytes, filename)

def parse_article(raw_text, filename):
def parse_article(raw_bytes, filename):

# Convert to UTF-8, removing the Windows BOM if present
raw_text = raw_text.decode('utf-8-sig', errors='strict')
raw_text = raw_bytes.decode('utf-8-sig', errors='strict')

# Allowing for initial blank lines and blank space, look for
# nickML marker +*+*
if re.match(r'(\s*\r\n?|\s*\n)*\s*\+\*\+\*', raw_text):
return parse_nickML(raw_text, filename)
else:
return parse_generic(raw_text, filename)

def parse_generic(raw_text, filename):
basename = os.path.basename(filename)
match = re.search(r'^(?P<article_number>\d+)', basename)
if match:
article_number = match.group('article_number')
else:
raise ArticleParseError('No article number starting filename: ' + basename,
ArticleParseError.FILENAME_ERROR)
metadata = {
'article_number': article_number,
'filename': filename,
}
return {
'metadata': metadata,
'text': raw_text,
'contributor': u"Full Text Highlighted",
'parser': 'generic',
}

def parse_nickML(raw_text, filename):
# extract info from the file name
article_number, city, state, periodical, periodical_code = parse_filename(filename)

@@ -107,7 +135,6 @@ def parse_article(raw_text, filename):
# ArticleParseError.BRACKET_WARNING)

# print out our data.
# TODO: store this somewhere.
metadata = {
'annotators': annotators,
'version': version,
Expand All @@ -122,7 +149,9 @@ def parse_article(raw_text, filename):
return {
'metadata': metadata,
'text': clean_text,
'tuas': tuas
'tuas': tuas,
'contributor': u"Gold Standard DF",
'parser': 'nickML',
}

# print "final clean text:", clean_text
@@ -384,7 +413,7 @@ def parse_documents(directory_path, error_directory_paths):
print "PROCCESING FILE:", file_path, "..."

try:
data.append(parse_document(full_path))
data.append(parse_document(full_path, os.path.basename(full_path)))
except ArticleParseError as e:
new_path = os.path.join(error_directory_paths[e.error_type],
file_path)
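
For illustration (not part of the diff), parse_generic() would return roughly the following for a hypothetical file named 0042-town-report.txt:

    # Sketch of parse_generic() output; the filename and text are hypothetical.
    {
        'metadata': {
            'article_number': '0042',              # digits taken from the start of the basename
            'filename': '0042-town-report.txt',
        },
        'text': u'Decoded UTF-8 article text ...',
        'contributor': u"Full Text Highlighted",   # matches GOLDUSERNAME in data/highlight_all.py
        'parser': 'generic',
    }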
5 changes: 5 additions & 0 deletions docker/thresher_api/highlight_all.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -e
cd /home/thresher
export PYTHONPATH=/home/thresher
python data/highlight_all.py
6 changes: 6 additions & 0 deletions highlight_all.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# MSYS_NO_PATHCONV fixes paths on Docker Toolbox on Windows using Git Bash / Mingw
# Harmless everywhere else.
export MSYS_NO_PATHCONV=1
# DB updates can use either 'run' or 'exec'
docker-compose exec thresher_api sh /home/thresher/docker/thresher_api/highlight_all.sh
11 changes: 6 additions & 5 deletions thresher/models.py
@@ -47,11 +47,12 @@ class Meta:
)

def __unicode__(self):
return "id {} username {} pybossa user id {}".format(
self.id,
self.username,
self.pybossa_user_id
)
result = u"id {}".format(self.id)
if self.username != "":
result += u" " + self.username
if self.pybossa_user_id:
result += u" pybossa id: {}".format(self.pybossa_user_id)
return result


TASK_TYPE = (
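
For context (not part of the diff), hedged examples of the new Contributor representation with hypothetical values:

    # Contributor(id=3, username=u"Gold Standard DF", pybossa_user_id=None)
    #   -> u"id 3 Gold Standard DF"
    # Contributor(id=4, username=u"", pybossa_user_id=17)
    #   -> u"id 4 pybossa id: 17"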
