Skip to content

#93 Change xpath based on 'changed tvtropes' #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 9, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cliche/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def setup_raven_logging(conf=None, **kwargs):


@task_failure.connect
def report_task_failure(task_id, exception, args, kwargs, traceback, einfo):
def report_task_failure(task_id, exception, args, kwargs,
traceback, einfo, sender):
client = get_raven_client()
client.captureException(einfo.exc_info)
23 changes: 11 additions & 12 deletions cliche/services/tvtropes/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,23 +132,22 @@ def fetch_link(url, session, *, log_prefix=''):
return False, None, None, None, final_url
tree = document_fromstring(r.text)
try:
namespace = tree.xpath('//div[@class="pagetitle"]')[0] \
.text.strip()[:-1]
name = tree.find_class('article_title')[0].text_content()
except (AttributeError, AssertionError, IndexError):
logger.warning('%sWarning on url %s: '
'There is no pagetitle on this page. Ignoring.',
log_prefix, url)
return False, tree, None, None, final_url
if namespace == '':
namespace = 'Main'
name = tree.xpath('//div[@class="pagetitle"]/span')[0].text.strip()

type = determine_type(namespace)
if type == 'Administrivia':
return False, tree, namespace, name, final_url
upsert_entity(session, namespace, name, type, final_url)
process_redirections(session, url, final_url, namespace, name)
return True, tree, namespace, name, final_url
else:
*namespace, name = name.split(':')
name = name.strip()
namespace = 'Main' if not namespace else namespace[0]
type = determine_type(namespace)
if type == 'Administrivia':
return False, tree, namespace, name, final_url
upsert_entity(session, namespace, name, type, final_url)
process_redirections(session, url, final_url, namespace, name)
return True, tree, namespace, name, final_url


def recently_crawled(current_time, url, session):
Expand Down
21 changes: 21 additions & 0 deletions tests/tvtropes_crawler_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import requests

from cliche.services.tvtropes.crawler import fetch_link


def test_fetch_link(monkeypatch, fx_session, fx_celery_app):

url = 'http://tvtropes.org/pmwiki/pmwiki.php/Main/GodJob'
text = '<div class="pagetitle"><div class="article_title"><h1>' \
'<span>God Job</span></h1></div></div>'

def mockreturn(path):
req = requests.Request()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not very familiar with monkeypatch, so I'm just asking: is `requests.Request()` appropriate in this context (rather than `requests.Response()`)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

monkeypatch replaces the "request" so that the test can fetch a pre-defined "request".
mockreturn defines that 'request', so the test doesn't need any external network access.

It doesn't need a `response`, because crawling doesn't need to change the response.

req.url = url
req.text = text
return req

monkeypatch.setattr(requests, "get", mockreturn)

result = fetch_link(url, fx_session)
assert result[-3:] == ('Main', 'God Job', url)