Skip to content

Commit

Permalink
Allow custom PageMethod callbacks (#318)
Browse files Browse the repository at this point in the history
* Allow custom PageMethod callbacks

* Add test for callable page methods

* Adjust typing for PageMethod

* Remove trailing commas (thank you pylint)

* Update docstring

* Update docs, tests & types

* Remove unused import

---------

Co-authored-by: Eugenio Lacuesta <[email protected]>
  • Loading branch information
jdemaeyer and elacuesta authored Nov 6, 2024
1 parent a28631a commit 5500a6e
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 6 deletions.
32 changes: 30 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -844,10 +844,12 @@ down or clicking links) and you want to handle only the final result in your cal

### `PageMethod` class

#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`:
#### `scrapy_playwright.page.PageMethod(method: str | callable, *args, **kwargs)`:

Represents a method to be called (and awaited if necessary) on a
`playwright.page.Page` object (e.g. "click", "screenshot", "evaluate", etc).
It's also possible to pass a callable object, which will be invoked as a
callback and receive the Playwright `Page` as its first argument.
`method` is the name of the method (or the callable itself); `*args` and
`**kwargs` are passed when calling it. The return value
will be stored in the `PageMethod.result` attribute.
Expand Down Expand Up @@ -885,8 +887,34 @@ async def parse(self, response, **kwargs):
await page.close()
```

### Passing callable objects

### Supported methods
If a `PageMethod` receives a callable object as its first argument, it will be
called with the page as its first argument. Any additional arguments are passed
to the callable after the page.

```python
async def scroll_page(page: Page) -> str:
    """Scroll to the bottom of the page once and return the page URL.

    Meant to be wrapped in a ``PageMethod``, which calls it with the
    Playwright page as its first argument.
    """
    # Make sure at least one quote is present before scrolling.
    await page.wait_for_selector(selector="div.quote")
    await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
    # Wait until an 11th quote is present after the scroll.
    await page.wait_for_selector(selector="div.quote:nth-child(11)")
    return page.url


class MySpyder(scrapy.Spider):
    """Example spider that runs ``scroll_page`` as a page method callback."""

    name = "scroll"

    def start_requests(self):
        """Yield a single Playwright-enabled request for the scrolling page."""
        yield Request(
            url="https://quotes.toscrape.com/scroll",
            meta={
                "playwright": True,
                # A callable (rather than a method name) is invoked with the page.
                "playwright_page_methods": [PageMethod(scroll_page)],
            },
        )
```

### Supported Playwright methods

Refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
to see available methods.
Expand Down
6 changes: 5 additions & 1 deletion scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import warnings
from contextlib import suppress
from dataclasses import dataclass, field as dataclass_field
from functools import partial
from ipaddress import ip_address
from time import time
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
Expand Down Expand Up @@ -607,7 +608,10 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider
for pm in page_methods:
if isinstance(pm, PageMethod):
try:
method = getattr(page, pm.method)
if callable(pm.method):
method = partial(pm.method, page)
else:
method = getattr(page, pm.method)
except AttributeError as ex:
logger.warning(
"Ignoring %r: could not find method",
Expand Down
9 changes: 6 additions & 3 deletions scrapy_playwright/page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Callable, Union


__all__ = ["PageMethod"]
Expand All @@ -8,10 +8,13 @@ class PageMethod:
"""
Represents a method to be called (and awaited if necessary) on a
Playwright page, such as "click", "screenshot", "evaluate", etc.
If a callable is received, it will be called with the page as its first argument.
Any additional arguments are passed to the callable after the page.
"""

def __init__(self, method: str, *args, **kwargs) -> None:
self.method: str = method
def __init__(self, method: Union[str, Callable], *args, **kwargs) -> None:
self.method: Union[str, Callable] = method
self.args: tuple = args
self.kwargs: dict = kwargs
self.result: Any = None
Expand Down
29 changes: 29 additions & 0 deletions tests/tests_asyncio/test_page_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from scrapy import Spider, Request
from scrapy.http.response.html import HtmlResponse

from playwright.async_api import Page
from scrapy_playwright.page import PageMethod

from tests import allow_windows, make_handler, assert_correct_response
Expand Down Expand Up @@ -186,6 +187,34 @@ async def test_page_method_pdf(self):
if platform.system() != "Windows":
assert get_mimetype(pdf_file) == "application/pdf"

@allow_windows
async def test_page_method_callable(self):
    """Check that a callable given to PageMethod is run against the page.

    The callable scrolls the page twice; the test then verifies the number
    of quotes in the response and that the callable's return value was
    stored in ``PageMethod.result``.
    """

    async def scroll_page(page: Page) -> str:
        # Each scroll is followed by a wait for the next batch of quotes.
        await page.wait_for_selector(selector="div.quote")
        await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
        await page.wait_for_selector(selector="div.quote:nth-child(11)")
        await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
        await page.wait_for_selector(selector="div.quote:nth-child(21)")
        return page.url

    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/scroll.html"),
                meta={
                    "playwright": True,
                    "playwright_page_methods": {
                        "callable": PageMethod(scroll_page),
                    },
                },
            )
            resp = await handler._download_request(req, Spider("foo"))

        assert_correct_response(resp, req)
        assert len(resp.css("div.quote")) == 30
        # The value returned by the callable is kept on the PageMethod object.
        assert resp.meta["playwright_page_methods"]["callable"].result == resp.url


class TestPageMethodChromium(IsolatedAsyncioTestCase, MixinPageMethodTestCase):
browser_type = "chromium"
Expand Down

0 comments on commit 5500a6e

Please sign in to comment.