Skip to content

Commit 64992dd

Browse files
committed
Add link checking workflow and script
- Introduced a GitHub Actions workflow (`check-links.yml`) to automate the checking of absolute and relative links in markdown files upon push and pull request events.
- Added a new script (`check_relative_links.py`) to verify that relative links in markdown files point to existing files in the repository, enhancing documentation integrity.
- The script scans markdown files, extracts relative links, and checks their validity without making HTTP requests, providing detailed output for any broken links found.

This addition aims to improve the reliability of documentation links and streamline the review process for changes in markdown files.
1 parent fb38db3 commit 64992dd

File tree

2 files changed

+276
-0
lines changed

2 files changed

+276
-0
lines changed

.github/workflows/check-links.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
---
2+
name: Check Links
3+
on:
4+
workflow_dispatch:
5+
push:
6+
pull_request:
7+
types: [opened, synchronize, reopened]
8+
concurrency:
9+
# New commit on branch cancels running workflows of the same branch
10+
group: ${{ github.workflow }}-${{ github.ref }}
11+
cancel-in-progress: true
12+
jobs:
13+
check:
14+
if: github.event.pull_request.draft == false
15+
runs-on: ubuntu-latest
16+
steps:
17+
- name: Checkout Repository
18+
uses: actions/checkout@v4
19+
- name: Set up Python
20+
uses: actions/setup-python@v4
21+
with:
22+
python-version: "3.11"
23+
- name: Install dependencies
24+
run: pip install requests
25+
- name: Check Absolute Links
26+
run: |
27+
# Only check absolute links (http/https), not relative links
28+
python docs/link_checker.py --dir docs/book --substring "http" --validate-links --timeout 15
29+
- name: Check Relative Links
30+
run: |-
31+
# Check if relative links resolve within the repository
32+
python scripts/check_relative_links.py --dir docs/book

scripts/check_relative_links.py

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
#!/usr/bin/env python3
2+
"""Script to verify that relative links in markdown files point to existing files in the repository.
3+
4+
This script scans markdown files for relative links and checks if they resolve to
5+
actual files in the repository structure, without making any HTTP requests.
6+
7+
Usage:
8+
python check_relative_links.py --dir docs/book
9+
"""
10+
11+
import argparse
12+
import os
13+
import re
14+
import sys
15+
from typing import List, Tuple
16+
17+
18+
def find_markdown_files(directory: str) -> List[str]:
    """Recursively collect the paths of all ``.md`` files under *directory*."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith(".md")
    ]
26+
27+
28+
def extract_relative_links(file_path: str) -> List[Tuple[str, int, str]]:
    """Extract all relative links from a markdown file along with line numbers.

    Scans inline links ``[text](url)``, reference definitions ``[id]: url``,
    and HTML anchors ``<a href="url">``.

    Args:
        file_path: Path of the markdown file to scan.

    Returns:
        List of tuples ``(link, line_num, full_line)`` for every relative
        link found, in document order.
    """
    links: List[Tuple[str, int, str]] = []
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Only match relative links, i.e. those not starting with a scheme that
    # points outside the repository. `mailto:`/`tel:` are excluded too —
    # they are not filesystem paths and previously produced false positives.
    scheme = r"(?!https?:|ftp:|mailto:|tel:)"
    inline_link_pattern = re.compile(r"\[(?:[^\]]+)\]\((" + scheme + r"[^)]+)\)")
    reference_link_def_pattern = re.compile(r"^\s*\[(?:[^\]]+)\]:\s*(" + scheme + r"\S+)")
    html_link_pattern = re.compile(
        r'<a\s+(?:[^>]*?)href=["\'](' + scheme + r'.*?)["\']', re.IGNORECASE
    )

    for line_num, line in enumerate(lines, 1):
        # Keep the original per-line order: inline, then reference, then HTML.
        for pattern in (
            inline_link_pattern,
            reference_link_def_pattern,
            html_link_pattern,
        ):
            for match in pattern.finditer(line):
                # split() drops an optional title suffix ('path "Title"').
                parts = match.group(1).split()
                if not parts:
                    # e.g. href="" — nothing to check (the old code raised
                    # IndexError here on "".split()[0]).
                    continue
                links.append((parts[0], line_num, line.strip()))

    return links
66+
67+
68+
def resolve_relative_path(base_file: str, rel_path: str) -> str:
    """Resolve *rel_path* against the directory containing *base_file*.

    For instance, if base_file is 'docs/book/user-guide/a.md' and rel_path
    is '../b.md', this returns 'docs/book/b.md'. Any ``#fragment`` and/or
    ``?query`` suffix found on the link is re-attached to the result.
    """
    # Peel off an anchor (file.md#section) before touching the filesystem path.
    path_part, _, anchor = rel_path.partition("#")
    # Then peel off query parameters from what remains.
    path_part, _, params = path_part.partition("?")

    # Join with the base file's directory and collapse any '..' components.
    resolved = os.path.normpath(os.path.join(os.path.dirname(base_file), path_part))

    # Re-attach the pieces in the same order the original link carried them.
    if anchor:
        resolved += f"#{anchor}"
    if params:
        resolved += f"?{params}"
    return resolved
98+
99+
100+
# Substrings marking a link as an asset/image that we do not validate.
_IGNORED_SUBSTRINGS = (
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    "assets",
    ".gitbook",
)


def _collect_valid_paths(markdown_files: List[str]) -> set:
    """Build the set of normalized paths that count as valid link targets.

    Includes each markdown file, its extension-less form, and — for README
    files — the containing directory itself.
    """
    all_md_files = set()
    for file_path in markdown_files:
        all_md_files.add(os.path.normpath(file_path))
        # Links may omit the .md extension.
        if file_path.endswith(".md"):
            all_md_files.add(os.path.normpath(file_path[:-3]))

    # A link to a directory is valid when that directory holds a README.
    # Compare on the basename (the old `endswith("/README.md")` check broke
    # on Windows separators and missed a repository-root README).
    readme_alternatives = set()
    for file_path in all_md_files:
        if os.path.basename(file_path) in ("README.md", "README"):
            readme_alternatives.add(os.path.normpath(os.path.dirname(file_path)))

    return all_md_files.union(readme_alternatives)


def _link_target_exists(resolved_path: str, all_valid_paths: set) -> bool:
    """Return True if *resolved_path* points at an existing link target."""
    # Strip fragments and queries before checking the filesystem.
    check_path = resolved_path.split("#")[0].split("?")[0]
    check_path = os.path.normpath(check_path)

    # Direct hit on the filesystem.
    if os.path.exists(check_path):
        return True
    # Markdown links frequently omit the .md extension.
    if not check_path.endswith(".md") and os.path.exists(f"{check_path}.md"):
        return True
    # A directory link is valid when the directory contains a README.md.
    if os.path.isdir(check_path) and os.path.exists(
        os.path.join(check_path, "README.md")
    ):
        return True
    # Finally, consult the pre-computed set of valid markdown targets.
    return check_path in all_valid_paths


def check_relative_links(dir_path: str) -> bool:
    """
    Check if all relative links in markdown files actually point to existing files.

    Args:
        dir_path: Directory scanned recursively for ``.md`` files.

    Returns:
        True if all links are valid, False otherwise.
    """
    markdown_files = find_markdown_files(dir_path)
    print(
        f"Found {len(markdown_files)} markdown files in directory: {dir_path}"
    )

    broken_links = []
    valid_links_count = 0

    # Keep track of all checked links to avoid duplicates.
    checked_links = set()

    all_valid_paths = _collect_valid_paths(markdown_files)

    for file_path in markdown_files:
        file_broken_links = []

        for link, line_num, line in extract_relative_links(file_path):
            # Skip (file, link) pairs we've already checked.
            link_check_key = f"{file_path}:{link}"
            if link_check_key in checked_links:
                continue
            checked_links.add(link_check_key)

            # Ignore links to assets, images, etc.
            if any(ignore in link for ignore in _IGNORED_SUBSTRINGS):
                continue

            resolved_path = resolve_relative_path(file_path, link)
            if _link_target_exists(resolved_path, all_valid_paths):
                valid_links_count += 1
            else:
                file_broken_links.append((link, line_num, resolved_path, line))

        # Print details about broken links in this file.
        if file_broken_links:
            print(f"\n{file_path}:")
            for link, line_num, resolved_path, line in file_broken_links:
                print(f"  Line {line_num}: {link}")
                print(f"    Resolves to: {resolved_path}")
                print(f"    Context: {line}")
                broken_links.append((file_path, line_num, link, resolved_path))

    # Summary
    total_links = valid_links_count + len(broken_links)
    print(f"\nChecked {total_links} relative links:")
    print(f"  ✅ {valid_links_count} valid links")
    print(f"  ❌ {len(broken_links)} broken links")

    return len(broken_links) == 0
220+
221+
222+
def main():
    """Parse command-line arguments and run the relative-link check.

    Returns the process exit code: 0 when every relative link resolves,
    1 when at least one broken link was found.
    """
    parser = argparse.ArgumentParser(
        description="Check if relative links in markdown files resolve to existing files"
    )
    parser.add_argument(
        "--dir", required=True, help="Directory to scan for links"
    )
    args = parser.parse_args()

    if check_relative_links(args.dir):
        print("\nAll relative links are valid!")
        return 0

    print(
        "\nFound broken relative links. Please fix them before proceeding."
    )
    return 1
241+
242+
243+
if __name__ == "__main__":
    # Propagate main()'s return value (0 or 1) as the process exit status.
    sys.exit(main())

0 commit comments

Comments
 (0)