Skip to content

Commit 1d35a98

Browse files
authored
A script for managing sparse checkouts (MystenLabs#20796)
By using sparse checkouts, you can make rust-analyzer much faster, since it doesn't need to process every crate in the repo. In a decent-sized diff, in which the following crates were checked out: crates/sui-node, crates/sui-single-node-benchmark, crates/sui-benchmark, crates/sui-core, crates/sui-types, crates/sui, crates/mysten-common, rust-analyzer for vscode was much faster both for startup and for running cargo check after a single file was edited. Startup speed (until cargo check stopped on its own): - before: 1m6s at best, some runs took many minutes for some reason - after: 50s on average. Cargo check after editing a single file, no errors: - before: 2m16s - after: 45s. Cargo check, time from introducing a simple error to finishing: - before: 12s - after: 7s. Of course, the editor is responsive before these processes fully complete, but faster is still better than slower. This also makes it far faster to run various commands from the root of the repo, such as `cargo check`, `clippy`, `nextest run` and so on. These can often be sped up with `-p` or `--test` as well, but that can be tricky to use correctly in some cases, and it's very convenient to be able to quickly run only the tests in crates you've edited.
1 parent 139a2b9 commit 1d35a98

File tree

3 files changed

+334
-0
lines changed

3 files changed

+334
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,5 @@ docs/content/references/sui-api/sui-graphql/*
7373
docs/content/references/framework/**
7474

7575
lcov.info
76+
77+
.sparse

notes

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
vscode startup: 48s vs 1m06s
2+
3+
cargo check after modifying one file: 45s vs 2m16s

scripts/sparse-checkout.py

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Mysten Labs, Inc.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""
6+
This manages a git sparse checkout in a Rust project environment.
7+
It dynamically updates the checked-out directories based on a configuration file,
8+
ignores specific files from being tracked by git, and adjusts the project's Cargo.toml
9+
to match the sparse checkout state.
10+
11+
Usage:
12+
13+
0. Run the script from an existing git checkout to create a new git worktree as a sparse checkout.
14+
1. Edit your .sparse file to include the set of directories you wish to check out, or
15+
use the `auto` subcommand to generate this file automatically based on which files
16+
you have edited since the merge base of the current commit and origin/main.
17+
2. Run the script to update the sparse checkout, and modify Cargo.toml as needed.
18+
3. Always re-run the script after modifying your .sparse file.
19+
20+
When in a sparse checkout, changes to Cargo.toml and Cargo.lock are ignored by git. If you need
21+
to edit these files from within a sparse checkout, use the `reset` subcommand to un-ignore them.
22+
Then edit them, check in the changes, and run this script again.
23+
"""
24+
25+
import os
26+
import subprocess
27+
import sys
28+
29+
try:
30+
import toml
31+
except ImportError:
32+
print("This script requires the 'toml' Python package. Install via: pip install toml")
33+
sys.exit(1)
34+
35+
36+
def read_sparse_config(sparse_file=".sparse"):
    """
    Parse the sparse config file into the list of directories to check out.

    Blank lines and lines starting with `#` are skipped, and a single trailing
    slash is removed from each remaining entry. Returns None when the file does
    not exist; exits with status 1 when the file exists but lists nothing.
    """
    if not os.path.isfile(sparse_file):
        return None

    with open(sparse_file, "r") as config:
        entries = [raw.strip() for raw in config]

    crates = [
        strip_trailing_slash(entry)
        for entry in entries
        if entry and not entry.startswith("#")
    ]

    # an existing but empty configuration is treated as an error
    if not crates:
        print(f"No crates found in {sparse_file}. Exiting.")
        sys.exit(1)

    # sui-benchmark tests use sui-surfer which requires move sources to be checked out
    if "crates/sui-benchmark" in crates and "crates/sui-surfer" not in crates:
        crates.append("crates/sui-surfer")

    return crates
59+
60+
61+
def update_git_sparse_checkout(crates_to_checkout):
    """
    Configure git's cone-mode sparse checkout for the given crates.

    The requested crates are combined with a fixed list of directories that are
    always checked out, then `git sparse-checkout init/set` and `git checkout`
    are run in that order.
    """
    # Directories that are checked out regardless of what .sparse lists
    always_included = ["scripts", ".cargo", ".changeset", ".config", ".github", "examples"]

    # without sui-framework itself, the move sources are probably still needed
    # in order for tests to run
    if "crates/sui-framework" not in crates_to_checkout:
        always_included.append("crates/sui-framework/packages")

    run = subprocess.check_call

    # 1) make sure sparse checkout is initialised
    run(["git", "sparse-checkout", "init", "--cone"])

    # 2) select the paths to materialise
    sparse_paths = crates_to_checkout + always_included
    run(["git", "sparse-checkout", "set"] + sparse_paths)

    # 3) refresh the working tree
    run(["git", "checkout"])
83+
84+
def load_cargo_toml(cargo_toml_path="Cargo.toml"):
    """
    Return the parsed TOML of `cargo_toml_path` as committed at git HEAD.

    The content is read with `git show` rather than from the filesystem.
    Exits with status 1 if git cannot produce the file.
    """
    git_object = f"HEAD:{cargo_toml_path}"
    try:
        raw_toml = subprocess.check_output(["git", "show", git_object])
        return toml.loads(raw_toml.decode())
    except subprocess.CalledProcessError:
        print(f"Could not retrieve {cargo_toml_path} from git. Exiting.")
        sys.exit(1)
96+
97+
def strip_trailing_slash(s):
    """Return `s` without one trailing `/` (if present); None is passed through."""
    if s is None:
        return None
    if s.endswith("/"):
        return s[:-1]
    return s
101+
102+
def get_path(dep_spec):
    """
    Return the `path` of a dependency spec with one trailing slash removed.

    Returns None when the spec is not a dict or has no `path` key.
    """
    if isinstance(dep_spec, dict):
        return strip_trailing_slash(dep_spec.get("path"))
    return None
106+
107+
def modify_cargo_toml(crates_to_checkout, cargo_toml_path="Cargo.toml"):
    """
    Rewrite Cargo.toml so the workspace only refers to crates that are checked out.

    The manifest is loaded from git HEAD (via load_cargo_toml), edited in memory and
    written to `cargo_toml_path` in the working tree:

    - [workspace.members] is reduced to the members listed in crates_to_checkout.
    - Every [workspace.dependencies] entry whose `path` is a checked-out workspace
      directory (member or exclude) gets a [patch.<repo url>] entry pointing at that
      local path.
    - Every [workspace.dependencies] entry whose `path` is a workspace directory that
      is NOT checked out is converted from a path dependency to a git dependency
      (`git` = origin URL, `rev` = merge base of HEAD and origin/main), keeping its
      TOML key. Missing directories with no matching dependency are left alone.

    Exits with status 1 if cargo_toml_path does not exist on disk.
    """
    if not os.path.isfile(cargo_toml_path):
        print(f"Could not find {cargo_toml_path}. Exiting.")
        sys.exit(1)

    cargo_data = load_cargo_toml(cargo_toml_path)

    # Make sure we have a workspace table in the top-level Cargo.toml
    workspace = cargo_data.setdefault("workspace", {})
    members = workspace.get("members", [])
    excluded = workspace.get("exclude", [])
    all_directories = members + excluded

    # Sets keep the membership tests below cheap.
    wanted = set(crates_to_checkout)
    # workspace directories that are not in .sparse
    missing_crates = {d for d in all_directories if d not in wanted}
    # all kept crates including the ones in exclude
    kept_crates = {d for d in all_directories if d in wanted}

    # Update workspace.members
    workspace["members"] = [m for m in members if m in wanted]

    # Get the merge base of the current commit and origin/main
    commit_sha = subprocess.check_output(["git", "merge-base", "HEAD", "origin/main"]).decode().strip()

    # Get the git remote URL (assuming 'origin' is the correct remote)
    try:
        repo_url = subprocess.check_output(["git", "remote", "get-url", "origin"]).decode().strip()
        # Convert an SSH URL to HTTPS, e.g.
        # git@github.com:User/Repo.git -> https://github.com/User/Repo.git
        # (the ':' must be replaced first, before "https://" introduces a new one)
        if repo_url.startswith("git@"):
            repo_url = repo_url.replace(":", "/").replace("git@", "https://")
    except subprocess.CalledProcessError:
        # No 'origin' remote: fall back to a placeholder URL
        repo_url = "https://unknown-repo-url"

    # Ensure [workspace.dependencies] and [patch] exist
    workspace_deps = workspace.setdefault("dependencies", {})
    patch = cargo_data.setdefault("patch", {})

    for dep_name, dep_spec in workspace_deps.items():
        path = get_path(dep_spec)
        if path is None:
            # not a table, or not a path dependency
            continue

        if path in kept_crates:
            # checked-out crate: patch the git source back to the local path
            patch.setdefault(repo_url, {})[dep_name] = {"path": path}
        elif path in missing_crates:
            # crate not checked out: swap the path dep for a pinned git dep.
            # Every alias of the path is converted, so no dependency is left
            # pointing at a directory that does not exist in the sparse checkout.
            del dep_spec["path"]
            dep_spec["git"] = repo_url
            dep_spec["rev"] = commit_sha

    # Write updated Cargo.toml back
    with open(cargo_toml_path, "w") as f:
        toml.dump(cargo_data, f)

    print("Successfully updated Cargo.toml")
188+
189+
def get_ignored_files():
    """
    Return the paths whose `git ls-files -v` status tag starts with `h`.

    Each entry printed by `git ls-files -v` is a status tag, a space, then the path.
    The `-z` flag makes git terminate entries with NUL and print paths unquoted, and
    the entry is split only once, so paths containing spaces or special characters
    are returned intact.
    """
    listing = subprocess.check_output(["git", "ls-files", "-v", "-z"]).decode()
    return [
        entry.split(" ", 1)[1]
        for entry in listing.split("\0")
        if entry.startswith("h")
    ]
193+
194+
def ignore_cargo_changes():
    """
    Mark Cargo.toml and Cargo.lock with --assume-unchanged, and clear that flag
    from every other file reported by get_ignored_files().
    """
    leftover = get_ignored_files()

    # after this loop, `leftover` holds only files other than the two Cargo files
    for cargo_file in ("Cargo.toml", "Cargo.lock"):
        if cargo_file in leftover:
            leftover.remove(cargo_file)
        else:
            print(f"Ignoring changes to {cargo_file}")
            subprocess.check_call(["git", "update-index", "--assume-unchanged", cargo_file])

    # un-ignore any remaining files
    for path in leftover:
        print(f"Un-ignoring {path}")
        subprocess.check_call(["git", "update-index", "--no-assume-unchanged", path])
218+
219+
def create_sparse_checkout_worktree():
    """
    Interactively create a new git worktree to be used as a sparse checkout.

    Asks for confirmation and a directory name (default `../<repo dir>-sparse`),
    runs `git worktree add --no-checkout <dir> main`, changes into the new worktree,
    writes a default .sparse file and opens it in $EDITOR (falling back to `vi`).

    Returns the result of read_sparse_config(".sparse") for the edited file.
    Exits with status 0 if the user declines to create a worktree.
    """
    import shlex  # local import: only needed to split $EDITOR into argv

    # if .sparse is not found, offer to create a new sparse worktree
    print("No crates found in .sparse (or file not present).")
    print("Would you like to create a new sparse worktree? (Y/n)")
    # strip so that stray whitespace around the answer is not read as "no"
    choice = input().strip().lower()
    if choice not in ("", "y", "yes"):
        print("Exiting.")
        sys.exit(0)

    # move to git repo root
    os.chdir(subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode().strip())
    # the default worktree name is derived from the repo directory's basename
    repo_dir_name = os.path.basename(os.getcwd())
    sparse_dir = f"../{repo_dir_name}-sparse"

    # ask if they would like to use this name or a different one
    print(f"Would you like to use the directory name '{sparse_dir}' for the sparse worktree? (Y/n)")
    choice = input().strip().lower()
    if choice in ("n", "no"):
        print("Enter the name for the sparse worktree:")
        entered = input().strip()
        # an empty answer keeps the default instead of producing the path "../"
        if entered:
            # add ../ if not already present
            sparse_dir = entered if entered.startswith("../") else f"../{entered}"

    print(f"Creating a new sparse worktree at {sparse_dir}")
    subprocess.check_call(["git", "worktree", "add", "--no-checkout", sparse_dir, "main"])

    # move to the sparse worktree
    os.chdir(sparse_dir)

    # write the default contents of .sparse before handing it to the editor
    with open(".sparse", "w") as f:
        f.write("# Directories to include in the sparse checkout\n")
        f.write("crates/sui-core\n")

    # $EDITOR may carry arguments (e.g. "code --wait") or be set but empty;
    # split it into argv and fall back to vi when nothing usable remains.
    editor_cmd = shlex.split(os.environ.get("EDITOR", "")) or ["vi"]
    subprocess.check_call(editor_cmd + [".sparse"])

    return read_sparse_config(".sparse")
259+
260+
def reset_index():
    """
    Restore Cargo.toml and Cargo.lock from git and stop ignoring changes to them.

    Both files must currently be reported by get_ignored_files(); otherwise a
    message is printed and the script exits with status 1. On success the
    assume-unchanged flag is cleared from both files and they are checked out
    again, so later edits to them are visible to git.
    """
    cargo_files = ["Cargo.toml", "Cargo.lock"]
    ignored_files = get_ignored_files()

    # check that Cargo.toml and Cargo.lock are ignored
    if any(name not in ignored_files for name in cargo_files):
        print("Cargo.toml and/or Cargo.lock are not ignored. Reset them manually or check in your changes")
        sys.exit(1)

    # `git checkout` alone leaves the assume-unchanged bit set, which would keep
    # hiding edits to these files; clear it explicitly so they can be checked in.
    subprocess.check_call(["git", "update-index", "--no-assume-unchanged"] + cargo_files)

    # `--` makes git treat the names as paths, never as a branch or revision
    subprocess.check_call(["git", "checkout", "--"] + cargo_files)
269+
270+
def auto_update_config():
    """
    Generate .sparse from the files changed since the merge base with origin/main.

    Every directory in [workspace.members] and [workspace.exclude] of the Cargo.toml
    at HEAD that contains at least one file reported by
    `git diff --name-only <merge base>` is written to .sparse, one per line, in the
    order the directories appear in Cargo.toml. Any existing .sparse is overwritten.
    """
    # Get the merge base of the current commit and origin/main
    commit_sha = subprocess.check_output(["git", "merge-base", "HEAD", "origin/main"]).decode().strip()

    # Get the list of files that differ from the merge base
    diff_output = subprocess.check_output(["git", "diff", "--name-only", commit_sha]).decode()
    changed_files = [name for name in diff_output.split("\n") if name]

    workspace = load_cargo_toml("Cargo.toml")["workspace"]
    candidates = workspace["members"] + workspace.get("exclude", [])

    directories_to_checkout = []
    seen = set()
    for directory in candidates:
        if directory in seen:
            continue
        # Match on a full path component: a bare prefix test would make
        # "crates/sui" match changes under "crates/sui-core/".
        prefix = directory.rstrip("/") + "/"
        if any(changed.startswith(prefix) for changed in changed_files):
            seen.add(directory)
            directories_to_checkout.append(directory)

    # write the list of directories to .sparse
    with open(".sparse", "w") as f:
        f.write("# Directories to include in the sparse checkout\n")
        f.writelines(f"{directory}\n" for directory in directories_to_checkout)
301+
302+
def main():
    """
    Script entry point.

    With `reset` or `auto` as the first argument, run the matching subcommand and
    exit with status 0. Otherwise read .sparse (offering to create a sparse
    worktree when it is absent), update the sparse checkout, ignore changes to
    Cargo.toml / Cargo.lock, and rewrite Cargo.toml.
    """
    subcommands = {"reset": reset_index, "auto": auto_update_config}
    if len(sys.argv) > 1 and sys.argv[1] in subcommands:
        subcommands[sys.argv[1]]()
        sys.exit(0)

    # 1. Read the crates to include from .sparse
    crates_to_checkout = read_sparse_config(".sparse")
    if crates_to_checkout is None:
        crates_to_checkout = create_sparse_checkout_worktree()
    assert crates_to_checkout is not None

    # 2. Update git sparse checkout
    update_git_sparse_checkout(crates_to_checkout)

    # 3. Ignore changes to Cargo.toml and Cargo.lock
    ignore_cargo_changes()

    # 4. Modify Cargo.toml
    modify_cargo_toml(crates_to_checkout, "Cargo.toml")
326+
327+
328+
if __name__ == "__main__":
329+
main()

0 commit comments

Comments
 (0)