diff --git a/cliche/celery.py b/cliche/celery.py
index f9d9b9f..028ed18 100644
--- a/cliche/celery.py
+++ b/cliche/celery.py
@@ -227,6 +227,7 @@ def setup_raven_logging(conf=None, **kwargs):
 
 
 @task_failure.connect
-def report_task_failure(task_id, exception, args, kwargs, traceback, einfo):
+def report_task_failure(task_id, exception, args, kwargs,
+                        traceback, einfo, sender):
     client = get_raven_client()
     client.captureException(einfo.exc_info)
diff --git a/cliche/services/tvtropes/crawler.py b/cliche/services/tvtropes/crawler.py
index 2997ee5..3cefeed 100644
--- a/cliche/services/tvtropes/crawler.py
+++ b/cliche/services/tvtropes/crawler.py
@@ -132,23 +132,22 @@ def fetch_link(url, session, *, log_prefix=''):
         return False, None, None, None, final_url
     tree = document_fromstring(r.text)
     try:
-        namespace = tree.xpath('//div[@class="pagetitle"]')[0] \
-                        .text.strip()[:-1]
+        name = tree.find_class('article_title')[0].text_content()
     except (AttributeError, AssertionError, IndexError):
         logger.warning('%sWarning on url %s: '
                        'There is no pagetitle on this page. Ignoring.',
                        log_prefix, url)
         return False, tree, None, None, final_url
-    if namespace == '':
-        namespace = 'Main'
-    name = tree.xpath('//div[@class="pagetitle"]/span')[0].text.strip()
-
-    type = determine_type(namespace)
-    if type == 'Administrivia':
-        return False, tree, namespace, name, final_url
-    upsert_entity(session, namespace, name, type, final_url)
-    process_redirections(session, url, final_url, namespace, name)
-    return True, tree, namespace, name, final_url
+    else:
+        *namespace, name = name.split(':')
+        name = name.strip()
+        namespace = 'Main' if not namespace else namespace[0]
+        type = determine_type(namespace)
+        if type == 'Administrivia':
+            return False, tree, namespace, name, final_url
+        upsert_entity(session, namespace, name, type, final_url)
+        process_redirections(session, url, final_url, namespace, name)
+        return True, tree, namespace, name, final_url
 
 
 def recently_crawled(current_time, url, session):
diff --git a/tests/tvtropes_crawler_test.py b/tests/tvtropes_crawler_test.py
new file mode 100644
index 0000000..7979332
--- /dev/null
+++ b/tests/tvtropes_crawler_test.py
@@ -0,0 +1,21 @@
+import requests
+
+from cliche.services.tvtropes.crawler import fetch_link
+
+
+def test_fetch_link(monkeypatch, fx_session, fx_celery_app):
+
+    url = 'http://tvtropes.org/pmwiki/pmwiki.php/Main/GodJob'
+    text = '<html><body>' \
+           '<div class="article_title"> God Job </div></body></html>'
+
+    def mockreturn(path):
+        req = requests.Request()
+        req.url = url
+        req.text = text
+        return req
+
+    monkeypatch.setattr(requests, "get", mockreturn)
+
+    result = fetch_link(url, fx_session)
+    assert result[-3:] == ('Main', 'God Job', url)
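
Below is a quick, standalone sketch (not part of the patch) of the namespace handling that the new fetch_link branch implements: the title text taken from the article_title element is split on ':' with extended unpacking, and titles without a namespace prefix fall back to 'Main'. The parse_title helper is hypothetical, introduced here only for illustration.

    def parse_title(title):
        """Split a page title like 'Film: Casablanca' into (namespace, name)."""
        *namespace, name = title.split(':')
        name = name.strip()
        namespace = 'Main' if not namespace else namespace[0]
        return namespace, name

    # A title with no prefix falls back to the 'Main' namespace.
    assert parse_title(' God Job ') == ('Main', 'God Job')
    # A prefixed title keeps its namespace, which determine_type() then classifies.
    assert parse_title('Administrivia: Pages To Merge') == \
        ('Administrivia', 'Pages To Merge')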