Skip to content

Commit 316fda6

Browse files
authored
Merge pull request #32 from ClericPy/dev
2.3.3
2 parents b785860 + 4e1fdad commit 316fda6

5 files changed

Lines changed: 66 additions & 30 deletions

File tree

ichrome/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from .logs import logger
77
from .sync_utils import Chrome, Tab
88

9-
__version__ = "2.3.2"
9+
__version__ = "2.3.3"
1010
__tips__ = "[github]: https://github.com/ClericPy/ichrome\n[cdp]: https://chromedevtools.github.io/devtools-protocol/\n[cmd args]: https://peter.sh/experiments/chromium-command-line-switches/"
1111
__all__ = [
1212
'Chrome', 'ChromeDaemon', 'Tab', 'Tag', 'AsyncChrome', 'AsyncTab', 'logger',

ichrome/__main__.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ def main():
4646
"-cp",
4747
"--chrome-path",
4848
"--chrome_path",
49-
help=
50-
"chrome executable file path, default to null for automatic searching",
49+
help="chrome executable file path, default to null(automatic searching)",
5150
default="")
5251
parser.add_argument("-H",
5352
"--host",
@@ -211,17 +210,26 @@ def main():
211210
log_level = getattr(args, 'log_level', None)
212211
if log_level:
213212
logger.setLevel(log_level)
214-
if args.start_url == 'about:blank' or not args.start_url:
213+
if kwargs['start_url'] == 'about:blank' or not kwargs['start_url']:
214+
# reset start_url from extra_config
215215
for config in kwargs['extra_config']:
216216
if re.match('^https?://', config):
217217
kwargs['start_url'] = config
218218
kwargs['extra_config'].remove(config)
219219
break
220+
220221
if '--dump-dom' in extra_config or args.crawl:
221222
logger.setLevel(60)
222223
from .debugger import crawl_once
224+
if kwargs['start_url'] == 'about:blank' or not kwargs['start_url']:
225+
kwargs['start_url'] = kwargs['extra_config'].pop(0)
226+
if kwargs['start_url'] != 'about:blank' and not kwargs[
227+
'start_url'].startswith('http'):
228+
kwargs['start_url'] = 'http://' + kwargs['start_url']
223229
kwargs['headless'] = getattr(args, 'headless', True)
224-
asyncio.run(crawl_once(**kwargs))
230+
kwargs['disable_image'] = True
231+
kwargs['timeout'] = max([5, args.timeout])
232+
print(asyncio.run(crawl_once(**kwargs)), flush=True)
225233
elif args.clear_cache:
226234
from .debugger import clear_cache_handler
227235
kwargs['headless'] = getattr(args, 'headless', True)

ichrome/async_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,23 @@ async def reset_history(self, timeout=NotSet) -> bool:
993993
result = await self.send('Page.resetNavigationHistory', timeout=timeout)
994994
return self.check_error('reset_history', result)
995995

996+
async def setBlockedURLs(self, urls: List[str], timeout=NotSet):
997+
"""(Network.setBlockedURLs) Blocks URLs from loading. [EXPERIMENTAL].
998+
:param urls: URL patterns to block. Wildcards ('*') are allowed.
999+
:type urls: List[str]
1000+
1001+
Demo::
1002+
1003+
await tab.setBlockedURLs(urls=['*.jpg', '*.png'])
1004+
1005+
WARNING: This method is EXPERIMENTAL. The official suggestion is to use Fetch.enable instead (Fetch is also EXPERIMENTAL) and wait for its events to control the requests (continue / abort / modify), especially when blocking URLs by resourceType: Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Other.
1006+
https://chromedevtools.github.io/devtools-protocol/tot/Fetch/#method-enable
1007+
1008+
1009+
"""
1010+
await self.enable('Network')
1011+
return await self.send('Network.setBlockedURLs', urls=urls, timeout=timeout)
1012+
9961013
async def set_url(self,
9971014
url: Optional[str] = None,
9981015
referrer: Optional[str] = None,

ichrome/daemon.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ class ChromeDaemon(object):
3030
host="127.0.0.1", --remote-debugging-address, default to 127.0.0.1
3131
port, --remote-debugging-port, default to 9222
3232
headless, --headless and --hide-scrollbars, default to False
33-
user_agent, --user-agent, default to 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
33+
user_agent, --user-agent, default to None (with the original UA)
3434
proxy, --proxy-server, default to None
35-
user_data_dir, user_data_dir to save the user data, default to ~/ichrome_user_data. These string will ignore user_data_dir arg: {'null', 'None', '/dev/null', "''", '""'}
35+
user_data_dir, user_data_dir to save the user data, default to ~/ichrome_user_data. These strings will ignore user_data_dir arg: {'null', 'None', '/dev/null', "''", '""'}
3636
disable_image, disable image for loading performance, default to False
37-
start_url, start url while launching chrome, default to about:blank
38-
max_deaths, max deaths in 5 secs, auto restart `max_deaths` times if crash fast in 5 secs. default to 1 for without auto-restart
37+
start_url, start url while launching chrome, default to None
38+
max_deaths, max deaths in 5 secs, auto restart `max_deaths` times if crash fast in 5 secs. default to 1 (without auto-restart)
3939
timeout, timeout to connect the remote server, default to 1 for localhost
4040
debug, set logger level to DEBUG
4141
proc_check_interval, check chrome process alive every interval seconds
@@ -67,7 +67,7 @@ class ChromeDaemon(object):
6767
"""
6868

6969
port_in_using: set = set()
70-
PC_UA = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Mobile Safari/537.36"
70+
PC_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
7171
MAC_OS_UA = (
7272
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) Version/8.0.1a Safari/728.28.19"
7373
)
@@ -86,7 +86,7 @@ def __init__(
8686
proxy=None,
8787
user_data_dir=None,
8888
disable_image=False,
89-
start_url="about:blank",
89+
start_url="",
9090
extra_config=None,
9191
max_deaths=1,
9292
daemon=True,
@@ -109,10 +109,13 @@ def __init__(
109109
self.ready = False
110110
self.proc = None
111111
self.host = host
112-
self.port = port
112+
if port is None:
113+
self.port = self.get_free_port(host=self.host)
114+
else:
115+
self.port = port
113116
self.server = f"http://{self.host}:{self.port}"
114117
self.chrome_path = chrome_path
115-
self.UA = self.PC_UA if user_agent is None else user_agent
118+
self.UA = user_agent
116119
self.headless = headless
117120
self.proxy = proxy
118121
self.disable_image = disable_image
@@ -265,7 +268,7 @@ def proc_ok(self):
265268
def connection_ok(self):
266269
url = self.server + "/json"
267270
for _ in range(2):
268-
r = self.req.get(url, timeout=self._timeout)
271+
r = self.req.head(url, timeout=self._timeout)
269272
if r.x and r.ok:
270273
self.ready = True
271274
self.port_in_using.add(self.port)
@@ -531,7 +534,7 @@ def __init__(
531534
proxy=None,
532535
user_data_dir=None,
533536
disable_image=False,
534-
start_url="about:blank",
537+
start_url="",
535538
extra_config=None,
536539
max_deaths=1,
537540
daemon=True,
@@ -604,7 +607,7 @@ async def launch_chrome(self):
604607
async def check_connection(self):
605608
url = self.server + "/json"
606609
for _ in range(int(self._timeout) + 1):
607-
r = await self.req.get(url, timeout=self._timeout)
610+
r = await self.req.head(url, timeout=self._timeout)
608611
if r and r.ok:
609612
self.ready = True
610613
self.port_in_using.add(self.port)
@@ -719,7 +722,7 @@ async def __aenter__(self):
719722
async def create_chrome_workers(self):
720723
for port in range(self.start_port, self.start_port + self.workers):
721724
logger.debug("ChromeDaemon cmd args: port=%s, %s" %
722-
(port, self.kwargs))
725+
(port, self.kwargs))
723726
self.daemons.append(await
724727
AsyncChromeDaemon(port=port,
725728
daemon=True,

ichrome/debugger.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import asyncio
33
import atexit
4+
import os
45
from functools import wraps
56
from inspect import isawaitable
67
from typing import Set
@@ -68,8 +69,6 @@ def __getattr__(self, name):
6869
def quit_while_daemon_missing(daemon):
6970
# quit the whole program when the daemon process is missing (for the daemon debugger)
7071
if not daemon.get_proc(daemon.port):
71-
import os
72-
print(f'{daemon} missing process, exit.')
7372
os._exit(1)
7473

7574

@@ -210,17 +209,27 @@ def launch(*args, **kwargs):
210209
return Daemon(*args, **kwargs)
211210

212211

213-
def get_a_tab(host='127.0.0.1', port=None) -> AsyncTab:
212+
def connect_a_chrome(host='127.0.0.1', port=None, **daemon_kwargs) -> Chrome:
214213
if not port:
215214
for port in ChromeDaemon.port_in_using:
216-
return Chrome(port=port).get_tab()
217-
port = 9222
215+
return Chrome(host=host, port=port)
216+
port = ChromeDaemon.get_free_port(host=host)
218217
try:
219-
return Chrome(host=host, port=port).get_tab()
218+
return Chrome(host=host, port=port)
220219
except RuntimeError:
221220
# nothing is listening on this port: launch a new chrome, and auto quit if the chrome process goes missing.
222-
launch()
223-
return Chrome(host=host, port=port).get_tab()
221+
d = launch(host=host, port=port, **daemon_kwargs)
222+
return Chrome(host=host, port=d.port)
223+
224+
225+
def get_a_tab(host='127.0.0.1', port=None, **daemon_kwargs) -> AsyncTab:
226+
chrome = connect_a_chrome(host=host, port=port, **daemon_kwargs)
227+
return chrome.get_tab()
228+
229+
230+
def get_a_new_tab(host='127.0.0.1', port=None, **daemon_kwargs) -> AsyncTab:
231+
chrome = connect_a_chrome(host=host, port=port, **daemon_kwargs)
232+
return chrome.new_tab()
224233

225234

226235
def show_all_log():
@@ -235,14 +244,13 @@ def mute_all_log():
235244

236245
def stop_all_daemons():
237246
if Daemon.daemons:
238-
logger.info(f'auto shutdown {Daemon.daemons}')
247+
logger.debug(f'auto shutdown {Daemon.daemons}')
239248
for daemon in Daemon.daemons:
240249
daemon.stop()
241250

242251

243252
def shutdown():
244253
stop_all_daemons()
245-
import os
246254
os._exit(0)
247255

248256

@@ -257,7 +265,7 @@ def _filter_function(r):
257265
indent=2)
258266
req_type = get_data_value(r, 'params.type')
259267
doc_url = get_data_value(r, 'params.documentURL')
260-
print(f'{doc_url} - {req_type}\n{req}', end=f'\n{"="*40}\n')
268+
print(f'{doc_url} - {req_type}\n{req}', end=f'\n{"="*40}\n', flush=True)
261269

262270
# listen network flow in 60 s
263271
timeout = timeout
@@ -276,13 +284,13 @@ async def crawl_once(**kwargs):
276284
async with AsyncChromeDaemon(**kwargs) as cd:
277285
async with AsyncChrome(
278286
host=kwargs.get('host', '127.0.0.1'),
279-
port=kwargs.get('port', 9222),
287+
port=cd.port,
280288
timeout=cd._timeout or 2,
281289
) as chrome:
282290
async with chrome.connect_tab(0, auto_close=True) as tab:
283291
await tab.set_url(url, timeout=cd._timeout)
284292
html = await tab.get_html(timeout=cd._timeout)
285-
print(html)
293+
return html
286294

287295

288296
async def clear_cache_handler(**kwargs):

0 commit comments

Comments
 (0)