Skip to content

Commit e0bf86e

Browse files
committed
[#181] Add a new checkpoint_dir argument to SparkConnection()
Previously, we always set the checkpoint directory to the same path as spark.local.dir, which we call "tmp_dir". However, this is incorrect: tmp_dir should be on a disk local to each executor, whereas the checkpoint directory must be on shared storage to work correctly.
1 parent 1f99c93 commit e0bf86e

File tree

5 files changed

+16
-2
lines changed

5 files changed

+16
-2
lines changed

hlink/scripts/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,14 @@ def _parse_args():
153153
def _get_spark(run_name: str, args: argparse.Namespace) -> SparkSession:
154154
derby_dir = HLINK_DIR / "derby" / run_name
155155
warehouse_dir = HLINK_DIR / "warehouse" / run_name
156+
checkpoint_dir = HLINK_DIR / "checkpoint" / run_name
156157
tmp_dir = HLINK_DIR / "tmp" / run_name
157158
python = sys.executable
158159

159160
spark_connection = SparkConnection(
160161
derby_dir=derby_dir,
161162
warehouse_dir=warehouse_dir,
163+
checkpoint_dir=checkpoint_dir,
162164
tmp_dir=tmp_dir,
163165
python=python,
164166
db_name="linking",

hlink/spark/factory.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def create(self):
7878
spark_conn = SparkConnection(
7979
str(self.derby_dir),
8080
str(self.warehouse_dir),
81+
"checkpoint",
8182
str(self.tmp_dir),
8283
self.python,
8384
self.db_name,

hlink/spark/session.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ class SparkConnection:
3333
"""Handles initialization of spark session and connection to local cluster."""
3434

3535
def __init__(
36-
self, derby_dir, warehouse_dir, tmp_dir, python, db_name, app_name="linking"
36+
self,
37+
derby_dir,
38+
warehouse_dir,
39+
checkpoint_dir,
40+
tmp_dir,
41+
python,
42+
db_name,
43+
app_name="linking",
3744
):
3845
self.derby_dir = derby_dir
3946
self.warehouse_dir = warehouse_dir

hlink/tests/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def spark(tmpdir_factory):
3535
spark_connection = SparkConnection(
3636
tmpdir_factory.mktemp("derby"),
3737
tmpdir_factory.mktemp("warehouse"),
38+
tmpdir_factory.mktemp("checkpoint"),
3839
tmpdir_factory.mktemp("spark_tmp_dir"),
3940
sys.executable,
4041
"linking",

hlink/tests/spark_connection_test.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
def test_app_name_defaults_to_linking(tmp_path: Path) -> None:
88
derby_dir = tmp_path / "derby"
99
warehouse_dir = tmp_path / "warehouse"
10+
checkpoint_dir = tmp_path / "checkpoint"
1011
tmp_dir = tmp_path / "tmp"
1112
connection = SparkConnection(
12-
derby_dir, warehouse_dir, tmp_dir, sys.executable, "test"
13+
derby_dir, warehouse_dir, checkpoint_dir, tmp_dir, sys.executable, "test"
1314
)
1415
spark = connection.local(cores=1, executor_memory="1G")
1516
app_name = spark.conf.get("spark.app.name")
@@ -19,10 +20,12 @@ def test_app_name_defaults_to_linking(tmp_path: Path) -> None:
1920
def test_app_name_argument(tmp_path: Path) -> None:
2021
derby_dir = tmp_path / "derby"
2122
warehouse_dir = tmp_path / "warehouse"
23+
checkpoint_dir = tmp_path / "checkpoint_dir"
2224
tmp_dir = tmp_path / "tmp"
2325
connection = SparkConnection(
2426
derby_dir,
2527
warehouse_dir,
28+
checkpoint_dir,
2629
tmp_dir,
2730
sys.executable,
2831
"test",

0 commit comments

Comments
 (0)