fix #34

fhamborg · May 20, 2019 · b9de435 · b9de435
1 parent ac503f6
commit b9de435
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 7 deletions.
diff --git a/.idea/Giveme5W1H.iml b/.idea/Giveme5W1H.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/Giveme5W1H/examples/extracting/parse_documents.py b/Giveme5W1H/examples/extracting/parse_documents.py
@@ -14,7 +14,7 @@
 """
 
 # don't forget to start up core_nlp_host
-# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
+# giveme5w1h-corenlp
 
 if __name__ == '__main__':
     # helper to setup a correct path

diff --git a/Giveme5W1H/examples/extracting/parse_from_newsplease.py b/Giveme5W1H/examples/extracting/parse_from_newsplease.py
@@ -0,0 +1,28 @@
+"""
+This is a simple example of how to use the extractor in combination with news-please, a news crawler and scraper (https://github.com/fhamborg/news-please).
+
+- Nothing is cached
+
+"""
+
+# don't forget to start up core_nlp_host
+# giveme5w1h-corenlp
+
+from newsplease import NewsPlease
+
+from Giveme5W1H.extractor.document import Document
+from Giveme5W1H.extractor.extractor import MasterExtractor
+
+extractor = MasterExtractor()
+
+
+def main():
+    article = NewsPlease.from_url(
+        'https://www.foxnews.com/politics/house-democrat-subpoenas-mnuchin-irs-for-trumps-tax-returns')
+    doc = Document.from_newsplease(article)
+    doc = extractor.parse(doc)
+    answers = doc.get_top_answer('who').get_parts_as_text()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/Giveme5W1H/examples/extracting/parse_single_from_code.py b/Giveme5W1H/examples/extracting/parse_single_from_code.py
@@ -11,8 +11,9 @@
 """
 
 # don't forget to start up core_nlp_host
-# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
+# giveme5w1h-corenlp
 
+titleshort = "Barack Obama was born in Hawaii.  He is the president. Obama was elected in 2008."
 
 title = "Taliban attacks German consulate in northern Afghan city of Mazar-i-Sharif with truck bomb"
 lead = "The death toll from a powerful Taliban truck bombing at the German consulate in Afghanistan's Mazar-i-Sharif city rose to at least six Friday, with more than 100 others wounded in a major militant assault."
@@ -74,8 +75,8 @@
 
     # giveme5w setup - with defaults
     extractor = MasterExtractor()
-    Document()
-    doc = Document(title, lead, text, date_publish)
+    doc = Document.from_text(titleshort, date_publish)
+
     doc = extractor.parse(doc)
 
     top_who_answer = doc.get_top_answer('who').get_parts_as_text()

diff --git a/Giveme5W1H/extractor/document.py b/Giveme5W1H/extractor/document.py
@@ -53,6 +53,14 @@ def __init__(self, title='', desc='', text='', date=None, raw_data=None):
     def from_text(cls, text, date=None, raw_data=None):
         return cls(title=text, date=date, raw_data=raw_data)
 
+    @classmethod
+    def from_newsplease(cls, article):
+        if article.date_publish:
+            tmp_date = str(article.date_publish)
+        else:
+            tmp_date = None
+        return cls(title=article.title, text=article.text, desc=article.description, date=tmp_date, raw_data=None)
+
     def is_preprocessed(self, preprocessed=None):
         if preprocessed is True or preprocessed is False:
             self._preprocessed = preprocessed

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='giveme5w1h',
-      version='1.0.15',
+      version='1.0.16',
       description="Extraction of the journalistic five W and one H questions (5W1H) from news articles.",
       long_description="""Giveme5W1H is an open source, easy-to-use system to that extracts phrases answering the journalist 5W1H questions to describe an article's main event: who did what, when, where, why, and how?""",
       classifiers=[