Skip to content

Commit c27643a

Browse files
vringargridl0ck
andauthored
Storage watchdog (#1056)
* OpenWPM StorageWatchdog complete * Revised tmp_profile_dir member to use the tempfile.gettempdir function for increased compatibility * Restored version changes added watchdog 3.0.0 as a requirement. * Implemented changes as requested, with significant alteration to the StorageWatchdog backend. * refactor(storage-watchdog): adjust storage watchdog implementation * fix(dependencies): remove watchdog * docs(storage-watchdog): align wording * fix(mypy): refactor type annotation on test * refactor(demo.py): show off maximum profile size --------- Co-authored-by: Jalen Morgan <[email protected]> Co-authored-by: Jalen Morgan <[email protected]>
1 parent 761e46d commit c27643a

File tree

11 files changed

+276
-58
lines changed

11 files changed

+276
-58
lines changed

demo.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
# browser_param.callstack_instrument = True
5555
# Record DNS resolution
5656
browser_param.dns_instrument = True
57+
# Set this value as appropriate for the size of your temp directory
58+
# if you are running out of space
59+
# 50 MB expressed in bytes (1 MB = 2**20 bytes). The previous 10**20 factor
# produced a limit of roughly 5 ZB, so the size check could never trigger.
browser_param.maximum_profile_size = 50 * (2**20)  # 50 MB = 50 * 2^20 Bytes
5760

5861
# Update TaskManager configuration (use this for crawl-wide settings)
5962
manager_params.data_directory = Path("./datadir/")

environment.yaml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,39 +3,40 @@ channels:
33
- main
44
dependencies:
55
- beautifulsoup4=4.12.2
6-
- black=23.7.0
6+
- black=23.9.1
77
- click=8.1.7
88
- codecov=2.1.13
99
- dill=0.3.7
10+
- dill=0.3.7
1011
- easyprocess=1.1
11-
- gcsfs=2023.9.0
12+
- gcsfs=2023.9.1
1213
- geckodriver=0.33.0
1314
- ipython=8.15.0
1415
- isort=5.12.0
1516
- leveldb=1.23
1617
- multiprocess=0.70.15
1718
- mypy=1.5.1
18-
- nodejs=20.6.0
19+
- nodejs=20.7.0
1920
- pandas=2.1.0
20-
- pillow=10.0.0
21+
- pillow=10.0.1
2122
- pip=23.2.1
2223
- pre-commit=3.4.0
2324
- psutil=5.9.5
2425
- pyarrow=13.0.0
2526
- pytest-asyncio=0.21.1
2627
- pytest-cov=4.1.0
27-
- pytest=7.4.1
28+
- pytest=7.4.2
2829
- python=3.11.5
2930
- pyvirtualdisplay=3.0
3031
- recommonmark=0.7.1
3132
- redis-py=5.0.0
32-
- s3fs=2023.9.0
33+
- s3fs=2023.9.1
3334
- selenium=4.12.0
34-
- sentry-sdk=1.30.0
35+
- sentry-sdk=1.31.0
3536
- sphinx-markdown-tables=0.0.17
36-
- sphinx=7.2.5
37+
- sphinx=7.2.6
3738
- tabulate=0.9.0
38-
- tblib=1.7.0
39+
- tblib=2.0.0
3940
- wget=1.20.3
4041
- pip:
4142
- dataclasses-json==0.6.0
@@ -44,6 +45,6 @@ dependencies:
4445
- plyvel==1.5.0
4546
- tranco==0.6
4647
- types-pyyaml==6.0.12.11
47-
- types-redis==4.6.0.5
48+
- types-redis==4.6.0.6
4849
- types-tabulate==0.9.0.3
4950
name: openwpm

openwpm/browser_manager.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
kill_process_and_children,
3535
parse_traceback_for_sentry,
3636
)
37+
from .utilities.storage_watchdog import profile_size_exceeds_max_size
3738

3839
pickling_support.install()
3940

@@ -42,7 +43,7 @@
4243

4344

4445
class BrowserManagerHandle:
45-
"""The BrowserManagerHandle class is responsible for holding all of the
46+
"""The BrowserManagerHandle class is responsible for holding all the
4647
configuration and status information on BrowserManager process
4748
it corresponds to. It also includes a set of methods for managing
4849
the BrowserManager process and its child processes/threads.
@@ -501,6 +502,16 @@ def execute_command_sequence(
501502
if task_manager.closing:
502503
return
503504

505+
# Allow StorageWatchdog to utilize built-in browser reset functionality
506+
# which results in a graceful restart of the browser instance
507+
if self.browser_params.maximum_profile_size:
508+
assert self.current_profile_path is not None
509+
510+
reset = profile_size_exceeds_max_size(
511+
self.current_profile_path,
512+
self.browser_params.maximum_profile_size,
513+
)
514+
504515
if self.restart_required or reset:
505516
success = self.restart_browser_manager(clear_profile=reset)
506517
if not success:
@@ -564,7 +575,11 @@ def kill_browser_manager(self):
564575
"type %s" % (self.browser_id, str(self.display_pid))
565576
)
566577
if self.display_port is not None: # xvfb display lock
567-
lockfile = "/tmp/.X%s-lock" % self.display_port
578+
# lockfile = "/tmp/.X%s-lock" % self.display_port
579+
lockfile = os.path.join(
580+
self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock"
581+
)
582+
568583
try:
569584
os.remove(lockfile)
570585
except OSError:

openwpm/config.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
import tempfile
22
from dataclasses import dataclass, field
33
from json import JSONEncoder
44
from pathlib import Path
@@ -99,6 +99,47 @@ class BrowserParams(DataClassJsonMixin):
9999
profile_archive_dir: Optional[Path] = field(
100100
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
101101
)
102+
103+
tmp_profile_dir: Path = field(
104+
default=Path(tempfile.gettempdir()),
105+
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
106+
)
107+
"""
108+
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
109+
browser profiles and residual files are stored.
110+
"""
111+
112+
maximum_profile_size: Optional[int] = None
113+
"""
114+
The total amount of on disk space the generated
115+
browser profiles and residual files are allowed to consume in bytes.
116+
If this option is not set, no checks will be performed
117+
118+
Rationale
119+
---------
120+
This option can serve as a happy medium between killing a browser after each
121+
crawl and allowing the application to still perform quickly.
122+
123+
Used as a way to save space
124+
in a limited environment with minimal detriment to speed.
125+
126+
If the maximum_profile_size is exceeded after a CommandSequence
127+
is completed, the browser will be shut down and a new one will
128+
be created. **Even with this setting you may temporarily have
129+
more disk usage than the sum of all maximum_profile_sizes**
130+
However, this will also ensure that a CommandSequence is
131+
allowed to complete without undue interruptions.
132+
133+
Sample values
134+
-------------
135+
* 1073741824: 1GB
136+
* 20971520: 20MB - for testing purposes
137+
* 52428800: 50MB
138+
* 73400320: 70MB
139+
* 104857600: 100MB - IDEAL for 10+ browsers
140+
141+
"""
142+
102143
recovery_tar: Optional[Path] = None
103144
donottrack: bool = False
104145
tracking_protection: bool = False
@@ -133,8 +174,11 @@ class ManagerParams(DataClassJsonMixin):
133174
"""A watchdog that tries to ensure that no Firefox instance takes up too much memory.
134175
It is mostly useful for long running cloud crawls"""
135176
process_watchdog: bool = False
136-
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
137-
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server)."""
177+
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`)
178+
instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
179+
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).
180+
"""
181+
138182
num_browsers: int = 1
139183
_failure_limit: Optional[int] = None
140184
"""The number of command failures the platform will tolerate before raising a

openwpm/deploy_browsers/deploy_firefox.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ def deploy_firefox(
3535

3636
root_dir = os.path.dirname(__file__) # directory of this file
3737

38-
browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
38+
browser_profile_path = Path(
39+
tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir)
40+
)
3941
status_queue.put(("STATUS", "Profile Created", browser_profile_path))
4042

4143
# Use Options instead of FirefoxProfile to set preferences since the
@@ -167,8 +169,6 @@ def deploy_firefox(
167169
# Get browser process pid
168170
if hasattr(driver, "service") and hasattr(driver.service, "process"):
169171
pid = driver.service.process.pid
170-
elif hasattr(driver, "binary") and hasattr(driver.options.binary, "process"):
171-
pid = driver.options.binary.process.pid
172172
else:
173173
raise RuntimeError("Unable to identify Firefox process ID.")
174174

openwpm/task_manager.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pickle
44
import threading
55
import time
6+
from functools import reduce
67
from types import TracebackType
78
from typing import Any, Dict, List, Optional, Set, Type
89

@@ -29,6 +30,7 @@
2930
)
3031
from .utilities.multiprocess_utils import kill_process_and_children
3132
from .utilities.platform_utils import get_configuration_string, get_version
33+
from .utilities.storage_watchdog import StorageLogger
3234

3335
tblib.pickling_support.install()
3436

@@ -79,8 +81,8 @@ def __init__(
7981

8082
manager_params.source_dump_path = manager_params.data_directory / "sources"
8183

82-
self.manager_params = manager_params
83-
self.browser_params = browser_params
84+
self.manager_params: ManagerParamsInternal = manager_params
85+
self.browser_params: List[BrowserParamsInternal] = browser_params
8486
self._logger_kwargs = logger_kwargs
8587

8688
# Create data directories if they do not exist
@@ -108,7 +110,7 @@ def __init__(
108110
self.logging_server = MPLogger(
109111
self.manager_params.log_path,
110112
str(structured_storage_provider),
111-
**self._logger_kwargs
113+
**self._logger_kwargs,
112114
)
113115
self.manager_params.logger_address = self.logging_server.logger_address
114116
self.logger = logging.getLogger("openwpm")
@@ -128,6 +130,20 @@ def __init__(
128130
thread.name = "OpenWPM-watchdog"
129131
thread.start()
130132

133+
# Start the StorageLogger if a maximum storage value has been specified for any browser
134+
if reduce(
135+
lambda x, y: x or y,
136+
map(lambda p: p.maximum_profile_size is not None, self.browser_params),
137+
False,
138+
):
139+
storage_logger = StorageLogger(
140+
self.browser_params[0].tmp_profile_dir,
141+
)
142+
143+
storage_logger.daemon = True
144+
storage_logger.name = "OpenWPM-storage-logger"
145+
146+
storage_logger.start()
131147
# Save crawl config information to database
132148
openwpm_v, browser_v = get_version()
133149
self.storage_controller_handle.save_configuration(
@@ -363,6 +379,7 @@ def _start_thread(
363379
# Start command execution thread
364380
args = (self, command_sequence)
365381
thread = threading.Thread(target=browser.execute_command_sequence, args=args)
382+
thread.name = f"BrowserManagerHandle-{browser.browser_id}"
366383
browser.command_thread = thread
367384
thread.daemon = True
368385
thread.start()

openwpm/utilities/storage_watchdog.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import logging
2+
import math
3+
import os
4+
import subprocess
5+
import time
6+
from pathlib import Path
7+
from threading import Thread
8+
from typing import Optional
9+
10+
# Small helper to prettify the output: takes a number of bytes and returns the
11+
# corresponding size expressed in the largest unit it can be converted to.
12+
13+
14+
def convert_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string in the largest fitting unit.

    Units are powers of 1024 ("B", "KB", "MB", ... "YB").

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        ``"0B"`` for zero; otherwise the value rounded to two decimals, a
        space, and the unit name (e.g. ``"50.0 MB"``). Values larger than the
        biggest known unit are expressed in ``YB`` instead of failing.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Every unit is 2**10 times the previous one, so the unit index follows
    # exactly from the integer bit length. This avoids the float rounding that
    # math.log can exhibit close to unit boundaries. The index is clamped so
    # that very large values cannot run past the last unit name.
    exponent = (int(size_bytes).bit_length() - 1) // 10
    i = max(0, min(exponent, len(size_name) - 1))
    s = round(size_bytes / (1024**i), 2)
    return "%s %s" % (s, size_name[i])
22+
23+
24+
def _du_size_bytes(path: str) -> int:
    """Return the size of ``path`` in bytes as reported by ``du -s -b``, or 0 if it cannot be measured."""
    try:
        output = subprocess.check_output(
            ["du", "-s", "-b", path], stderr=subprocess.DEVNULL
        )
        return int(output.split()[0].decode("utf-8"))
    except (subprocess.CalledProcessError, OSError, ValueError, IndexError):
        # Best effort: entries that are unreadable, vanish while being
        # measured, or yield unparsable output simply do not count.
        return 0


def total_folder_size(startup: bool = False, root_dir: str = "/tmp") -> str:
    """Generates a human-readable message about the current size of the directory

    Args:
        startup (bool, optional): If True, every entry directly inside
            ``root_dir`` is measured. If False, only entries whose name
            contains "firefox", ".xpi", "owpm" or "Temp" are measured.
        root_dir (str, optional): The root directory whose entries are
            measured (each entry recursively, via ``du``).

    Returns:
        A log-ready sentence containing the summed size, formatted with
        ``convert_size``.
    """
    # Name fragments used to pick out browser-related entries when not in
    # startup mode.
    markers = ("firefox", ".xpi", "owpm", "Temp")

    running_total: int = 0
    for file in os.listdir(root_dir):
        if not startup and not any(marker in file for marker in markers):
            continue
        running_total += _du_size_bytes(os.path.join(root_dir, file))

    if not startup:
        return f"Currently using: {convert_size(running_total)} of storage on disk..."
    return f"Readable files in {root_dir} folder take up {convert_size(running_total)} of storage on disk at start time..."
61+
62+
63+
class StorageLogger(Thread):
    """Logs the total amount of storage used in the supplied_dir

    The thread logs one measurement of the whole directory at start and then
    a measurement every five minutes until the thread ends.
    """

    def __init__(self, supplied_dir: Optional[Path] = None) -> None:
        """Create the logger thread.

        Args:
            supplied_dir: Directory to measure. If None, ``run`` logs a
                message and returns immediately.
        """
        super().__init__()
        self.dir_to_watch = supplied_dir

    def run(self) -> None:
        """Log the storage used in ``dir_to_watch`` now and then every five minutes."""
        logger = logging.getLogger("openwpm")
        # Without a directory there is nothing to measure, so exit right away.
        if self.dir_to_watch is None:
            logger.info("No dir_to_watch specified. StorageLogger shutting down")
            return

        # Measure the configured directory rather than total_folder_size's
        # "/tmp" default, so a custom temp directory is reported correctly.
        root_dir = str(self.dir_to_watch)

        logger.info("Starting the StorageLogger...")
        try:
            logger.info(total_folder_size(startup=True, root_dir=root_dir))
            while True:
                time.sleep(300)  # Give storage updates every 5 minutes
                logger.info(total_folder_size(root_dir=root_dir))
        except Exception:
            # Record the cause in the crawl log instead of printing an
            # uninformative message to stdout.
            logger.exception("StorageLogger stopped due to an unexpected error")
87+
88+
89+
def profile_size_exceeds_max_size(
    profile_path: Path,
    max_dir_size: int,
) -> bool:
    """Check whether a browser profile directory has reached its size limit.

    The directory size is obtained from ``du -s -b``.

    Sample values for ``max_dir_size``:
        * 1073741824: 1GB
        * 20971520: 20MB - for testing purposes
        * 52428800: 50MB
        * 73400320: 70MB
        * 104857600: 100MB - IDEAL for 10+ browsers

    Args:
        profile_path: Directory of the browser profile to measure.
        max_dir_size: Maximum allowed size in bytes.

    Returns:
        True if the measured size is greater than or equal to
        ``max_dir_size``; False if it is smaller or could not be measured.
    """
    logger = logging.getLogger("openwpm")
    readable_max_dir_size = convert_size(max_dir_size)

    try:
        du_output = subprocess.check_output(["du", "-s", "-b", profile_path])
    except subprocess.CalledProcessError as err:
        # NOTE(review): du is expected to still print the summary line when it
        # exits non-zero (e.g. files removed by the running browser during the
        # scan) - confirm against the du implementation in use.
        du_output = err.output or b""
    except OSError as err:
        logger.warning(f"Unable to run du on {profile_path}: {err}")
        return False

    try:
        dir_size = int(du_output.split()[0].decode("utf-8"))
    except (IndexError, ValueError):
        # A failed measurement must not abort the caller; skip this check.
        logger.warning(
            f"Unable to determine the size of browser profile directory {profile_path}"
        )
        return False
    readable_dir_size = convert_size(dir_size)

    if dir_size < max_dir_size:
        logger.info(
            f"Current browser profile directory {profile_path} size is less than {readable_max_dir_size}: {readable_dir_size}"
        )
        return False

    logger.info(
        f"{profile_path}: Folder scheduled to be deleted and recover {readable_dir_size} of storage."
    )
    return True
119+
120+
121+
if __name__ == "__main__":
    # Manual smoke test: print the start-up size report for the default directory.
    print("---Testing the StorageWatchdog folder size function---")
    print(total_folder_size(startup=True))

0 commit comments

Comments
 (0)