diff --git a/README.md b/README.md index 19046bb..17f4265 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ - event-driven programming is not always advisable. - Selenium is slow - webdrivers often come with memory leak. -- In desperate need of a stable toolkit to communicate with Chrome browser +- In desperate need of a stable toolkit to communicate with Chrome browser (or other Blink-based browsers like Chromium) - fast http & websocket connections (based on aiohttp) for **asyncio** environment - **ichrome.debugger** is a sync tool and depends on the `ichrome.async_utils` - a choice for debugging interactively. diff --git a/ichrome/__init__.py b/ichrome/__init__.py index f3fe030..2558df4 100644 --- a/ichrome/__init__.py +++ b/ichrome/__init__.py @@ -6,7 +6,7 @@ from .logs import logger from .sync_utils import Chrome, Tab -__version__ = "2.3.6" +__version__ = "2.3.7" __tips__ = "[github]: https://github.com/ClericPy/ichrome\n[cdp]: https://chromedevtools.github.io/devtools-protocol/\n[cmd args]: https://peter.sh/experiments/chromium-command-line-switches/" __all__ = [ 'Chrome', 'ChromeDaemon', 'Tab', 'Tag', 'AsyncChrome', 'AsyncTab', 'logger', diff --git a/ichrome/__main__.py b/ichrome/__main__.py index 4b526a5..77dda9f 100644 --- a/ichrome/__main__.py +++ b/ichrome/__main__.py @@ -6,6 +6,7 @@ from pathlib import Path from ichrome import ChromeDaemon, ChromeWorkers, __version__, logger +from ichrome.base import get_readable_dir_size, install_chromium def main(): @@ -156,10 +157,16 @@ def main(): help="killall chrome launched local with --remote-debugging-port", default=False, action="store_true") + parser.add_argument("--install", + help="download chromium and unzip it to given path", + default="") args, extra_config = parser.parse_known_args() + if args.version: print(__version__) return + if args.install: + return install_chromium(args.install) if args.config: path = Path(args.config) if not (path.is_file() and path.exists()): @@ -233,7 +240,16 @@ def 
main(): elif args.clear_cache: from .debugger import clear_cache_handler kwargs['headless'] = getattr(args, 'headless', True) + port = kwargs.get('port') or 9222 + main_user_dir = ChromeDaemon._ensure_user_dir(kwargs['user_data_dir']) + port_user_dir = main_user_dir / f"chrome_{port}" + print( + f'Clearing cache(port={port}): {get_readable_dir_size(port_user_dir)}' + ) asyncio.run(clear_cache_handler(**kwargs)) + print( + f'Cleared cache(port={port}): {get_readable_dir_size(port_user_dir)}' + ) else: start_port = getattr(args, 'port', 9222) asyncio.run( diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index 0c5bfa5..b5e3e19 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -250,7 +250,6 @@ def __init__(self, :type default_recv_callback: Callable, optional :param _recv_daemon_break_callback: like the tab_close_callback. sync/async function only accept 1 arg of self while _recv_daemon break, defaults to None :type _recv_daemon_break_callback: Callable, optional - :raises ValueError: [description] """ tab_id = tab_id or kwargs.pop('id') if not tab_id: @@ -561,8 +560,7 @@ async def delete_cookies(self, timeout=NotSet): """deleteCookies by name, with url / domain / path.""" if not any((url, domain)): - raise ValueError( - 'At least one of the url and domain needs to be specified') + raise ValueError('URL and domain should not be null at the same time.') return await self.send("Network.deleteCookies", name=name, url=url, @@ -606,8 +604,7 @@ async def set_cookie(self, sameSite [CookieSameSite] Cookie SameSite type. 
expires [TimeSinceEpoch] Cookie expiration date, session cookie if not set""" if not any((url, domain)): - raise ValueError( - 'At least one of the url and domain needs to be specified') + raise ValueError('URL and domain should not be null at the same time.') kwargs: Dict[str, Any] = dict(name=name, value=value, url=url, diff --git a/ichrome/base.py b/ichrome/base.py index 8552dcf..d4b3b0a 100644 --- a/ichrome/base.py +++ b/ichrome/base.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re import time +from asyncio import get_running_loop from pathlib import Path from typing import List @@ -79,7 +80,7 @@ def get_proc_by_regex(regex, proc_names=None): if (not proc_names or proc.name() in proc_names) and re.search( regex, ' '.join(proc.cmdline())): procs.append(proc) - except psutil.Error: + except (psutil.Error, OSError): pass return procs @@ -114,9 +115,10 @@ def clear_chrome_process(port=None, timeout=None, max_deaths=1, interval=0.5): while 1: procs = get_proc(port) for proc in procs: - logger.debug( - f"[Killing] {proc}, port: {port}. {' '.join(proc.cmdline())}") try: + logger.debug( + f"[Killing] {proc}, port: {port}. 
def install_chromium(path, platform_name=None, x64=True, max_threads=5):
    """Download the latest Chromium snapshot and unzip it into ``path``.

    The newest revision number is read from the ``LAST_CHANGE`` object of the
    chromium-browser-snapshots bucket, the matching zip is fetched with HTTP
    range requests, held in memory and extracted. Progress and the final
    result are reported with ``print``.

    :param path: directory the archive is extracted into.
    :param platform_name: ``Linux`` / ``Windows`` / ``Darwin`` (aliases
        ``Win`` / ``Mac``); defaults to ``platform.system()``.
    :param x64: use the ``_x64`` snapshot folder on Linux and Windows.
    :param max_threads: size of the ``tPool`` used for the requests.
    :raises KeyError: if ``platform_name`` is not a supported platform.
    :raises ValueError: if one of the range requests is not ok.
    :return: always ``None``.
    """
    import os
    import platform
    import time
    import zipfile
    from io import BytesIO
    from pathlib import Path

    from torequests import tPool
    from torequests.utils import get_readable_size

    def slice_content_length(total, chunk=1 * 1024 * 1024):
        """Yield inclusive ``(first, last)`` byte offsets covering ``total`` bytes."""
        start = 0
        while start < total:
            # HTTP byte ranges are inclusive, so the last valid offset is
            # total - 1; never ask for an offset past the end of the object.
            end = min(start + chunk, total - 1)
            yield (start, end)
            start = end + 1

    # https://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html
    # https://storage.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2FLAST_CHANGE?alt=media
    # https://storage.googleapis.com/chromium-browser-snapshots/Linux_x64/798492/chrome-linux.zip
    req = tPool(max_threads)
    proxy = (os.getenv('HTTPS_PROXY') or os.getenv('https_proxy') or
             os.getenv('http_proxy') or os.getenv('HTTP_PROXY'))
    # one mapping shared by every request; the original repeated the 'https'
    # key, so the 'http' scheme was never configured
    proxies = {'http': proxy, 'https': proxy}
    platform_name = platform_name or platform.system()
    # value layout: [bucket prefix, arch suffix, zip/folder name, executable]
    platform_map = {
        'Linux': ['Linux', '_x64' if x64 else '', 'chrome-linux', 'chrome'],
        'Windows': ['Win', '_x64' if x64 else '', 'chrome-win', 'chrome.exe'],
        'Darwin': ['Mac', '', 'chrome-mac', 'chrome.app'],
    }
    # alias names
    platform_map['Mac'] = platform_map['Darwin']
    platform_map['Win'] = platform_map['Windows']
    _platform_name, _x64, zip_file_name, chrome_runner_name = platform_map[
        platform_name]
    version_api = f'https://storage.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/{_platform_name}{_x64}%2FLAST_CHANGE?alt=media'
    version_resp = req.get(version_api, timeout=3, retry=1, proxies=proxies)
    revision = version_resp.text
    if not revision.isdigit():
        print(f'check your network connect to {version_api}')
        return
    download_url = f'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/{_platform_name}{_x64}%2F{revision}%2F{zip_file_name}.zip?alt=media'
    print('Downloading zip file from:', download_url)
    with BytesIO() as buffer:
        head_resp = req.head(download_url, retry=1, proxies=proxies)
        total = int(head_resp.headers['Content-Length'])
        start_time = time.time()
        # every range request is issued through the pool before any body is
        # read; the chunks are then written to the buffer in order
        responses = [
            req.get(
                download_url,
                proxies=proxies,
                retry=3,
                headers={'Range': f'bytes={range_start}-{range_end}'},
            ) for range_start, range_end in slice_content_length(
                total, 1 * 1024 * 1024)
        ]
        total_mb = round(total / 1024 / 1024, 2)
        downloaded = 0
        for resp in responses:
            if not resp.ok:
                raise ValueError(f'Bad request {resp!r}')
            chunk_bytes = resp.content
            buffer.write(chunk_bytes)
            downloaded += len(chunk_bytes)
            print(
                f'{round(downloaded / total * 100): >3}% | {round(downloaded / 1024 / 1024, 2)}mb / {total_mb}mb | {get_readable_size(downloaded/(time.time()-start_time+0.001), rounded=0)}/s'
            )
        print('Downloading is finished, will unzip it to:', path)
        # the archive must be read while the in-memory buffer is still open
        with zipfile.ZipFile(buffer) as zf:
            zf.extractall(path)
    install_folder_path = Path(path) / zip_file_name
    if _platform_name == 'Mac' and install_folder_path.is_dir():
        print('Install succeeded, check your folder:',
              install_folder_path.absolute())
        return
    chrome_path = install_folder_path / chrome_runner_name
    if chrome_path.is_file():
        chrome_abs_path = chrome_path.absolute()
        print('chrome_path:', chrome_abs_path)
        if _platform_name == 'Linux':
            print(f'chmod 755 {chrome_abs_path}')
            # the mode has to be octal: decimal 755 is 0o1363, not rwxr-xr-x
            os.chmod(chrome_path, 0o755)
        print(f'check chromium version:\n{chrome_abs_path} --version')
        print('Install succeeded.')
    else:
        print('Mission failed.')


def async_run(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` in the running loop's default executor.

    Has to be called while an event loop is running, because the loop is
    looked up with ``get_running_loop()``.

    :return: the awaitable returned by ``loop.run_in_executor``; awaiting it
        yields the return value of ``func``.
    """

    def function():
        # run_in_executor only forwards positional arguments, so the call is
        # wrapped in a closure to carry the keyword arguments as well
        return func(*args, **kwargs)

    return get_running_loop().run_in_executor(None, function)
d6ce1e1..a0e3242 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -7,14 +7,15 @@ import threading import time from getpass import getuser +from inspect import isawaitable from pathlib import Path from torequests import tPool from torequests.aiohttp_dummy import Requests from torequests.utils import timepass, ttime -from .base import (clear_chrome_process, get_dir_size, get_memory_by_port, - get_proc, get_readable_dir_size) +from .base import (async_run, clear_chrome_process, get_dir_size, + get_memory_by_port, get_proc, get_readable_dir_size) from .logs import logger """ Sync / block operations for launching chrome processes. @@ -109,45 +110,54 @@ def __init__( self.ready = False self.proc = None self.host = host - if port is None: - self.port = self.get_free_port(host=self.host) - else: - self.port = port - self.server = f"http://{self.host}:{self.port}" + self.port = port self.chrome_path = chrome_path self.UA = user_agent self.headless = headless self.proxy = proxy self.disable_image = disable_image - if '--user-data-dir=' in str(extra_config): - # ignore custom user_data_dir by ichrome - self.user_data_dir = None - else: - self._wrap_user_data_dir(user_data_dir) + self.user_data_dir = user_data_dir self.start_url = start_url - if extra_config and isinstance(extra_config, str): - extra_config = [extra_config] - self.extra_config = extra_config or ["--disable-gpu", "--no-first-run"] - if '--no-sandbox' not in str(self.extra_config) and getuser() == 'root': - self.extra_config.append('--no-sandbox') - if not isinstance(self.extra_config, list): - raise TypeError("extra_config type should be list.") - self.chrome_proc_start_time = time.time() + self.extra_config = extra_config self.proc_check_interval = proc_check_interval self.on_startup = on_startup self.on_shutdown = on_shutdown - self.init(block) + self._block = block + self.init() + + def init(self): + self._init_chrome_daemon() - def init(self, block): - self.chrome_path = self.chrome_path or 
def _init_chrome_daemon(self):
    """Finish the configuration, launch chrome and start the optional daemon.

    Runs, in order: extra_config normalisation, port resolution, user data
    dir wrapping, default ``chrome_path`` lookup (only when none was given),
    the port-free check, request pool creation and the chrome launch. When
    ``_use_daemon`` is set the daemon is started with ``run_forever``, and
    finally ``on_startup(self)`` is called if a callback was given.
    """
    # extra_config goes first: _wrap_user_data_dir inspects it for a custom
    # --user-data-dir switch
    self._init_extra_config()
    self._init_port()
    self._wrap_user_data_dir()
    if not self.chrome_path:
        self.chrome_path = self._get_default_path()
    self._ensure_port_free()
    self.req = tPool()
    self.launch_chrome()
    if self._use_daemon:
        self._daemon_thread = self.run_forever(block=self._block)
    if self.on_startup:
        self.on_startup(self)

def _init_extra_config(self):
    """Normalise ``self.extra_config`` into a list owned by this instance.

    A non-empty string becomes a one-item list and an empty / ``None`` value
    becomes the default switches. ``--no-sandbox`` is appended when the
    current user is root and the switch is not present yet.

    :raises TypeError: if extra_config is neither a str, a list nor empty.
    """
    config = self.extra_config
    if config and isinstance(config, str):
        config = [config]
    config = config or ["--disable-gpu", "--no-first-run"]
    # validate before touching the value, so a tuple or set fails with the
    # intended TypeError instead of an AttributeError from .append
    if not isinstance(config, list):
        raise TypeError("extra_config type should be list.")
    # work on a copy: the caller's list must not grow when --no-sandbox is added
    config = list(config)
    if '--no-sandbox' not in str(config) and getuser() == 'root':
        config.append('--no-sandbox')
    self.extra_config = config

def _init_port(self):
    """Pick a free port when none was given, then build ``self.server``."""
    if self.port is None:
        # NOTE(review): a subclass in this file overrides get_free_port as a
        # coroutine and still runs this method through async_run; in that
        # case a coroutine object would be stored here - confirm.
        self.port = self.get_free_port(host=self.host)
    self.server = f"http://{self.host}:{self.port}"
`uss` is slower than `rss` but useful.""" return get_memory_by_port(port=self.port, attr=attr, unit=unit) @@ -190,8 +200,13 @@ def _ensure_user_dir(cls, user_data_dir): # valid path string return Path(user_data_dir) - def _wrap_user_data_dir(self, user_data_dir): - main_user_dir = self._ensure_user_dir(user_data_dir) + def _wrap_user_data_dir(self): + if '--user-data-dir=' in str(self.extra_config): + # ignore custom user_data_dir by ichrome + self.user_data_dir = None + return + # user_data_dir = self.user_data_dir + main_user_dir = self._ensure_user_dir(self.user_data_dir) if main_user_dir is None: self.user_data_dir = None return @@ -212,6 +227,7 @@ def _wrap_user_data_dir(self, user_data_dir): def clear_user_dir(cls, user_data_dir, port=None): main_user_dir = cls._ensure_user_dir(user_data_dir) if port: + # clear port dir if port is not None port_user_dir = main_user_dir / f"chrome_{port}" logger.debug( f'Clearing only port dir: {port_user_dir} => {get_readable_dir_size(port_user_dir)} / {get_readable_dir_size(main_user_dir)}' @@ -221,6 +237,7 @@ def clear_user_dir(cls, user_data_dir, port=None): f'Cleared only port dir: {port_user_dir} => {get_readable_dir_size(port_user_dir)} / {get_readable_dir_size(main_user_dir)}' ) else: + # clear whole ichrome dir if port is None logger.debug( f'Clearing total user dir: {main_user_dir} => {get_readable_dir_size(main_user_dir)} / {get_readable_dir_size(main_user_dir)}' ) @@ -229,17 +246,30 @@ def clear_user_dir(cls, user_data_dir, port=None): f'Cleared total user dir: {main_user_dir} => {get_readable_dir_size(main_user_dir)} / {get_readable_dir_size(main_user_dir)}' ) + def _clear_user_dir(self): + # clear self user dir + self.shutdown('_clear_user_dir') + return self.clear_dir_with_shutil(self.user_data_dir) + @staticmethod def clear_dir_with_shutil(dir_path): + errors = [] + + def onerror(*args): + errors.append(args[2][1]) + dir_path = Path(dir_path) if not dir_path.is_dir(): logger.warning(f'{dir_path} is not exists, 
ignore.') return import shutil - shutil.rmtree(dir_path) + shutil.rmtree(dir_path, onerror=onerror) + if errors: + logger.error(f'clear_dir_with_shutil({dir_path}) error: {errors}') @classmethod def clear_dir(cls, dir_path): + # please use clear_dir_with_shutil dir_path = Path(dir_path) if not dir_path.is_dir(): logger.warning(f'{dir_path} not exists, ignore.') @@ -261,6 +291,9 @@ def ok(self): @property def proc_ok(self): + return self._proc_ok() + + def _proc_ok(self): if self.proc and self.proc.poll() is None: return True return False @@ -268,11 +301,16 @@ def proc_ok(self): @property def connection_ok(self): url = self.server + "/json" - for _ in range(2): - r = self.req.head(url, timeout=self._timeout) + for _ in range(3): + timeout = self._timeout + _ + r = self.req.head(url, timeout=timeout) if r.x and r.ok: self.ready = True self.port_in_using.add(self.port) + if self._timeout != timeout: + logger.warning( + f'timeout has been reset: {self._timeout} -> {timeout}') + self._timeout = timeout return True time.sleep(1) return False @@ -313,6 +351,7 @@ def cmd_args(self): return kwargs def _start_chrome_process(self): + self.chrome_proc_start_time = time.time() self.proc = subprocess.Popen(**self.cmd_args) def launch_chrome(self): @@ -467,7 +506,7 @@ def kill(self, force=False): if self.proc: self.proc.terminate() try: - self.proc.wait(1) + self.proc.wait(self._timeout) except subprocess.TimeoutExpired: self.proc.kill() if force: @@ -574,10 +613,9 @@ def __init__( on_shutdown=on_shutdown, ) - def init(self, block): + def init(self): # Please init AsyncChromeDaemon in a running loop with `async with` self._req = None - self._block = block # please use AsyncChromeDaemon in `async with` self._init_coro = self._init_chrome_daemon() @@ -588,32 +626,33 @@ def req(self): return self._req async def _init_chrome_daemon(self): - await self.loop.run_in_executor(None, self._ensure_port_free) + await async_run(self._init_extra_config) + await async_run(self._init_port) + 
async def check_connection(self):
    """Probe ``<server>/json`` up to three times and report whether it answers.

    Every attempt allows one more second than the previous one. On the first
    ok response the instance is marked ready, its port is added to
    ``port_in_using`` and, if a longer timeout was needed, ``self._timeout``
    is raised to that value. A failed attempt is followed by a one second
    sleep.

    :return: ``True`` as soon as a response is ok, ``False`` after three
        failed attempts.
    """
    endpoint = self.server + "/json"
    for extra_seconds in range(3):
        timeout = self._timeout + extra_seconds
        response = await self.req.head(endpoint, timeout=timeout)
        if not (response and response.ok):
            await asyncio.sleep(1)
            continue
        self.ready = True
        self.port_in_using.add(self.port)
        if self._timeout != timeout:
            # keep the timeout that finally worked for later requests
            logger.warning(
                f'timeout has been reset: {self._timeout} -> {timeout}')
            self._timeout = timeout
        return True
    return False
async def __aexit__(self, *args, **kwargs):
    """Shut the daemon down when the ``async with`` block is left."""
    if not self._shutdown:
        await self.shutdown('__aexit__')

async def shutdown(self, reason=None):
    """Record the shutdown, run the ``on_shutdown`` hook and kill chrome.

    Does nothing when the daemon has already been shut down.

    :param reason: optional text appended to the debug log entry.
    """
    if self._shutdown:
        return
    reason = f' for {reason}' if reason else ''
    logger.debug(
        f"{self} shutting down{reason}, start-up: {ttime(self.start_time)}, duration: {timepass(time.time() - self.start_time, accuracy=3, format=1)}."
    )
    await self.update_shutdown_time()
    # kill is a blocking call, so it runs in the executor
    await async_run(self.kill, True)

async def update_shutdown_time(self):
    """Store the shutdown timestamp and call ``on_shutdown(self)`` if set.

    The callback may be a plain function or a coroutine function; an
    awaitable result is awaited.
    """
    self._shutdown = time.time()
    if self.on_shutdown:
        _coro = self.on_shutdown(self)
        if isawaitable(_coro):
            await _coro

async def _clear_user_dir(self):
    """Shut down, then delete this daemon's user data dir.

    :return: the result of ``clear_dir_with_shutil`` once the removal has
        finished.
    """
    # clear self user dir
    await self.shutdown('_clear_user_dir')
    # await the executor job: returning the bare future let callers continue
    # while the directory was still being removed
    return await async_run(self.clear_dir_with_shutil, self.user_data_dir)