@@ -111,6 +111,9 @@ def extract_links_from_markdown(
             url = match.group(1).split()[0]
             start_pos = match.start(1)
             end_pos = start_pos + len(url)
+
+            # Clean URLs with common escape sequences
+            # We preserve the original position for proper replacement later
             links.append((url, line_num, line, start_pos, end_pos))
 
     # Find reference link definitions [id]: url
@@ -137,6 +140,31 @@ def extract_links_from_markdown(
     return links
 
 
+def clean_url(url: str) -> str:
+    """
+    Clean up escaped characters in URLs from markdown files.
+
+    Args:
+        url: The URL to clean
+
+    Returns:
+        Cleaned URL with escape sequences properly handled
+    """
+    # Replace escaped underscores with actual underscores
+    cleaned = url.replace("\\_", "_")
+
+    # Replace escaped hyphens with actual hyphens
+    cleaned = cleaned.replace("\\-", "-")
+
+    # Handle other common escapes in Markdown
+    cleaned = cleaned.replace("\\.", ".")
+    cleaned = cleaned.replace("\\#", "#")
+    cleaned = cleaned.replace("\\(", "(")
+    cleaned = cleaned.replace("\\)", ")")
+
+    return cleaned
+
+
 def check_links_with_substring(
     file_path: str, substring: str
 ) -> List[Tuple[str, int, str, int, int]]:
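
As a quick illustration of what the new helper does (a sketch only: it assumes clean_url from the hunk above is in scope, and the URL itself is hypothetical):

# Markdown sources often escape "_" and "#" inside link targets; clean_url
# strips those backslashes so the checker requests the real URL.
escaped = "https://docs.zenml.io/how-to/model\\_registry\\#usage"
assert clean_url(escaped) == "https://docs.zenml.io/how-to/model_registry#usage"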
@@ -158,7 +186,10 @@ def check_links_with_substring(
     internal_paths = ["how-to", "user-guide", "component-guide", "book"]
 
     def should_include_link(link: str) -> bool:
-        if substring not in link:
+        # Clean the link to properly handle escaped characters
+        cleaned_link = clean_url(link)
+
+        if substring not in cleaned_link and substring not in link:
             return False
 
         # For internal documentation paths, only include relative links
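
To see why the filter now checks both the raw and the cleaned form: a link whose hyphens are escaped in the markdown source does not contain the internal-path substring until the escapes are removed. A small sketch (the path is hypothetical; clean_url from above is assumed to be in scope):

raw_link = "../user\\-guide/starter\\-guide/pipelines.md"  # as written in the .md file
assert "user-guide" not in raw_link         # the escaped "\-" hides the match
assert "user-guide" in clean_url(raw_link)  # the cleaned form matches as intended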
@@ -211,12 +242,16 @@ def check_link_validity(
     if not HAS_REQUESTS:
         return url, False, "requests module not installed", None
 
+    # Clean up escaped characters in URLs
+    # This helps with Markdown URLs that have escaped underscores, etc.
+    cleaned_url = clean_url(url)
+
     # Skip non-HTTP links
-    if not url.startswith(("http://", "https://")):
+    if not cleaned_url.startswith(("http://", "https://")):
         return url, True, None, None
 
     # Skip local development URLs
-    if is_local_development_url(url):
+    if is_local_development_url(cleaned_url):
         return url, True, None, None
 
     # Configure session with retries
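
The trailing context line only hints at the retry setup; the hunk does not show it. A session configured roughly like the sketch below, using the standard requests/urllib3 retry hooks, is the usual pattern (an assumption about the surrounding script, not code from this commit):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry transient failures (connection resets, 429/5xx) with a short backoff.
retries = Retry(total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)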
@@ -232,20 +267,24 @@ def check_link_validity(
 
     try:
         # First try with HEAD request
-        response = session.head(url, timeout=timeout, allow_redirects=True)
+        response = session.head(
+            cleaned_url, timeout=timeout, allow_redirects=True
+        )
 
         # If HEAD fails, try GET
         if response.status_code >= 400:
-            response = session.get(url, timeout=timeout, allow_redirects=True)
+            response = session.get(
+                cleaned_url, timeout=timeout, allow_redirects=True
+            )
 
         is_valid = response.status_code < 400
 
         # Additional check for Gitbook URLs that return 200 for non-existent pages
-        if is_valid and "docs.zenml.io" in url:
+        if is_valid and "docs.zenml.io" in cleaned_url:
             # We need to check for "noindex" meta tag which indicates a 404 page in Gitbook
             try:
                 # Use GET to fetch the page content
-                content_response = session.get(url, timeout=timeout)
+                content_response = session.get(cleaned_url, timeout=timeout)
                 content = content_response.text.lower()
 
                 # Look for the "noindex" meta tag which indicates a 404 page
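
The hunk cuts off just before the actual "noindex" test. A check along these lines would catch Gitbook's soft-404 pages, which return HTTP 200 but mark themselves as non-indexable (a sketch of the idea, not necessarily the exact condition the script uses; content is already lowercased above):

# Treat a 200 response as invalid if the page carries a noindex robots tag.
if 'name="robots"' in content and "noindex" in content:
    is_valid = False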
@@ -848,13 +887,22 @@ def main():
                 # Show the original link in the output, but we validated the transformed one
                 print(f"  Line {line_num}: {original_link}")
                 print(f"    ↳ ❌ {status_info}")
+
+                # Always show what URL was actually validated
+                cleaned_url = clean_url(original_link)
+                if cleaned_url != original_link:
+                    print(
+                        f"    ↳ URL with escapes removed: {cleaned_url}"
+                    )
                 if (
                     original_link != transformed_link
+                    and cleaned_url != transformed_link
                     and not args.ci_mode
                 ):
                     print(
                         f"    ↳ Validated as: {transformed_link}"
                     )
+
                 clickable_path = get_clickable_path(
                     file_path, line_num
                 )