Skip to content

Commit ffb663b

Browse files
committed
Handle backslashes in docs better
1 parent ab0d1be commit ffb663b

File tree

1 file changed

+55
-7
lines changed

1 file changed

+55
-7
lines changed

docs/link_checker.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ def extract_links_from_markdown(
111111
url = match.group(1).split()[0]
112112
start_pos = match.start(1)
113113
end_pos = start_pos + len(url)
114+
115+
# URLs are recorded exactly as written (Markdown escapes intact); they are cleaned later, at filter/validation time
116+
# We preserve the original position for proper replacement later
114117
links.append((url, line_num, line, start_pos, end_pos))
115118

116119
# Find reference link definitions [id]: url
@@ -137,6 +140,31 @@ def extract_links_from_markdown(
137140
return links
138141

139142

143+
def clean_url(url: str) -> str:
    """Strip Markdown backslash escapes from a URL.

    Markdown sources sometimes escape punctuation inside links by putting
    a backslash in front of it. This removes that backslash for the
    characters ``_ - . # ( )`` and leaves every other character,
    including any other backslash, untouched.

    Args:
        url: The URL as it appears in the Markdown file.

    Returns:
        The URL with the escape sequences listed above replaced by the
        bare character.
    """
    # Characters whose backslash-escaped form is unescaped, in the order
    # the replacements are applied.
    escapable_chars = ("_", "-", ".", "#", "(", ")")

    result = url
    for char in escapable_chars:
        result = result.replace("\\" + char, char)
    return result
166+
167+
140168
def check_links_with_substring(
141169
file_path: str, substring: str
142170
) -> List[Tuple[str, int, str, int, int]]:
@@ -158,7 +186,10 @@ def check_links_with_substring(
158186
internal_paths = ["how-to", "user-guide", "component-guide", "book"]
159187

160188
def should_include_link(link: str) -> bool:
161-
if substring not in link:
189+
# Clean the link to properly handle escaped characters
190+
cleaned_link = clean_url(link)
191+
192+
if substring not in cleaned_link and substring not in link:
162193
return False
163194

164195
# For internal documentation paths, only include relative links
@@ -211,12 +242,16 @@ def check_link_validity(
211242
if not HAS_REQUESTS:
212243
return url, False, "requests module not installed", None
213244

245+
# Clean up escaped characters in URLs
246+
# This helps with Markdown URLs that have escaped underscores, etc.
247+
cleaned_url = clean_url(url)
248+
214249
# Skip non-HTTP links
215-
if not url.startswith(("http://", "https://")):
250+
if not cleaned_url.startswith(("http://", "https://")):
216251
return url, True, None, None
217252

218253
# Skip local development URLs
219-
if is_local_development_url(url):
254+
if is_local_development_url(cleaned_url):
220255
return url, True, None, None
221256

222257
# Configure session with retries
@@ -232,20 +267,24 @@ def check_link_validity(
232267

233268
try:
234269
# First try with HEAD request
235-
response = session.head(url, timeout=timeout, allow_redirects=True)
270+
response = session.head(
271+
cleaned_url, timeout=timeout, allow_redirects=True
272+
)
236273

237274
# If HEAD fails, try GET
238275
if response.status_code >= 400:
239-
response = session.get(url, timeout=timeout, allow_redirects=True)
276+
response = session.get(
277+
cleaned_url, timeout=timeout, allow_redirects=True
278+
)
240279

241280
is_valid = response.status_code < 400
242281

243282
# Additional check for Gitbook URLs that return 200 for non-existent pages
244-
if is_valid and "docs.zenml.io" in url:
283+
if is_valid and "docs.zenml.io" in cleaned_url:
245284
# We need to check for "noindex" meta tag which indicates a 404 page in Gitbook
246285
try:
247286
# Use GET to fetch the page content
248-
content_response = session.get(url, timeout=timeout)
287+
content_response = session.get(cleaned_url, timeout=timeout)
249288
content = content_response.text.lower()
250289

251290
# Look for the "noindex" meta tag which indicates a 404 page
@@ -848,13 +887,22 @@ def main():
848887
# Show the original link in the output, but we validated the transformed one
849888
print(f" Line {line_num}: {original_link}")
850889
print(f" ↳ ❌ {status_info}")
890+
891+
# When removing escapes changes the link, also show the escape-free URL
892+
cleaned_url = clean_url(original_link)
893+
if cleaned_url != original_link:
894+
print(
895+
f" ↳ URL with escapes removed: {cleaned_url}"
896+
)
851897
if (
852898
original_link != transformed_link
899+
and cleaned_url != transformed_link
853900
and not args.ci_mode
854901
):
855902
print(
856903
f" ↳ Validated as: {transformed_link}"
857904
)
905+
858906
clickable_path = get_clickable_path(
859907
file_path, line_num
860908
)

0 commit comments

Comments
 (0)