Allow custom PageMethod callbacks (#318)

* Allow custom PageMethod callbacks
* Add test for callable page methods
* Adjust typing for PageMethod
* Remove trailing commas (thank you pylint)
* Update docstring
* Update docs, tests & types
* Remove unused import

---------

Co-authored-by: Eugenio Lacuesta <[email protected]>
scrapy-plugins · Nov 6, 2024 · 5500a6e · 5500a6e
1 parent a28631a
commit 5500a6e
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -844,10 +844,12 @@ down or clicking links) and you want to handle only the final result in your cal
 
 ### `PageMethod` class
 
-#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`:
+#### `scrapy_playwright.page.PageMethod(method: str | Callable, *args, **kwargs)`:
 
 Represents a method to be called (and awaited if necessary) on a
 `playwright.page.Page` object (e.g. "click", "screenshot", "evaluate", etc).
+It is also possible to pass a callable object instead of a method name: it will
+be invoked as a callback and receive the Playwright `Page` as its first argument.
 `method` is the name of the method, `*args` and `**kwargs`
 are passed when calling such method. The return value
 will be stored in the `PageMethod.result` attribute.
@@ -885,8 +887,34 @@ async def parse(self, response, **kwargs):
     await page.close()
 ```
 
+### Passing callable objects
 
-### Supported methods
+If a `PageMethod` receives a callable object as its first argument, that callable
+will be invoked with the page as its first argument. Any additional arguments
+given to `PageMethod` are passed to the callable after the page.
+
+```python
+async def scroll_page(page: Page) -> str:
+    await page.wait_for_selector(selector="div.quote")
+    await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
+    await page.wait_for_selector(selector="div.quote:nth-child(11)")
+    return page.url
+
+
+class MySpider(scrapy.Spider):
+    name = "scroll"
+
+    def start_requests(self):
+        yield Request(
+            url="https://quotes.toscrape.com/scroll",
+            meta={
+                "playwright": True,
+                "playwright_page_methods": [PageMethod(scroll_page)],
+            },
+        )
+```
+
+### Supported Playwright methods
 
 Refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
 to see available methods.

diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
@@ -5,6 +5,7 @@
 import warnings
 from contextlib import suppress
 from dataclasses import dataclass, field as dataclass_field
+from functools import partial
 from ipaddress import ip_address
 from time import time
 from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
@@ -607,7 +608,10 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider
         for pm in page_methods:
             if isinstance(pm, PageMethod):
                 try:
-                    method = getattr(page, pm.method)
+                    if callable(pm.method):
+                        method = partial(pm.method, page)
+                    else:
+                        method = getattr(page, pm.method)
                 except AttributeError as ex:
                     logger.warning(
                         "Ignoring %r: could not find method",

diff --git a/scrapy_playwright/page.py b/scrapy_playwright/page.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Callable, Union
 
 
 __all__ = ["PageMethod"]
@@ -8,10 +8,13 @@ class PageMethod:
     """
     Represents a method to be called (and awaited if necessary) on a
     Playwright page, such as "click", "screenshot", "evaluate", etc.
+
+    If a callable is received, it will be called with the page as its first argument.
+    Any additional arguments are passed to the callable after the page.
     """
 
-    def __init__(self, method: str, *args, **kwargs) -> None:
-        self.method: str = method
+    def __init__(self, method: Union[str, Callable], *args, **kwargs) -> None:
+        self.method: Union[str, Callable] = method
         self.args: tuple = args
         self.kwargs: dict = kwargs
         self.result: Any = None

diff --git a/tests/tests_asyncio/test_page_methods.py b/tests/tests_asyncio/test_page_methods.py
@@ -8,6 +8,7 @@
 from scrapy import Spider, Request
 from scrapy.http.response.html import HtmlResponse
 
+from playwright.async_api import Page
 from scrapy_playwright.page import PageMethod
 
 from tests import allow_windows, make_handler, assert_correct_response
@@ -186,6 +187,34 @@ async def test_page_method_pdf(self):
                 if platform.system() != "Windows":
                     assert get_mimetype(pdf_file) == "application/pdf"
 
+    @allow_windows
+    async def test_page_method_callable(self):
+
+        async def scroll_page(page: Page) -> str:
+            await page.wait_for_selector(selector="div.quote")
+            await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
+            await page.wait_for_selector(selector="div.quote:nth-child(11)")
+            await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
+            await page.wait_for_selector(selector="div.quote:nth-child(21)")
+            return page.url
+
+        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
+            with StaticMockServer() as server:
+                req = Request(
+                    url=server.urljoin("/scroll.html"),
+                    meta={
+                        "playwright": True,
+                        "playwright_page_methods": {
+                            "callable": PageMethod(scroll_page),
+                        },
+                    },
+                )
+                resp = await handler._download_request(req, Spider("foo"))
+
+            assert_correct_response(resp, req)
+            assert len(resp.css("div.quote")) == 30
+            assert resp.meta["playwright_page_methods"]["callable"].result == resp.url
+
 
 class TestPageMethodChromium(IsolatedAsyncioTestCase, MixinPageMethodTestCase):
     browser_type = "chromium"