4848# source -
4949# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
5050__url_pattern = re .compile (
51- r"^(?:http|s? ftp)s? ://" # http:// or https://
51+ r"^(?:http|https|sftp| ftp)://" # http:// or https://
5252 r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)"
5353 r"+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # ...domain
5454 r"localhost|" # localhost...
@@ -601,31 +601,51 @@ def is_valid_url(url: str, raise_error: bool = True) -> bool:
601601 return is_url
602602
603603
604- def is_valid_source_code_package (
def _raise_or_log_failure(log: "Logger", raise_error: bool, failure_msg: str) -> bool:
    """Report a validation failure by raising or by logging.

    Args:
        log: logger whose ``debug`` method receives the message when the
            failure is not raised.
        raise_error: when true, raise instead of logging.
        failure_msg: text describing what went wrong.

    Returns:
        Always ``False`` (only reached when ``raise_error`` is false), so
        callers can return or short-circuit on the result directly.

    Raises:
        LisaException: carrying ``failure_msg`` when ``raise_error`` is true.
    """
    if raise_error:
        raise LisaException(failure_msg)

    # non-fatal mode: leave a trace in the debug log and signal failure
    log.debug(failure_msg)
    return False
610+
611+
612+ # big function to check the parts of a url
613+ # allow raising exceptions or log and return a bool
614+ # allows checks for:
615+ # expected domains
616+ # protocols (require https, sftp, etc)
617+ # filenames (pattern matching)
def check_url(
    log: "Logger",
    source_url: str,
    expected_filename_pattern: Optional[Pattern[str]] = None,
    allowed_protocols: Optional[List[str]] = None,
    expected_domains: Optional[List[str]] = None,
    raise_error: bool = False,
) -> bool:
    """Validate the protocol, net location and filename of ``source_url``.

    Checks run in order and stop at the first failure. Each failure is
    reported through ``_raise_or_log_failure``: raised when ``raise_error``
    is true, otherwise written to ``log.debug`` with ``False`` returned.

    Args:
        log: logger used to record failures when they are not raised.
        source_url: the URL to check.
        expected_filename_pattern: optional compiled pattern that the last
            path segment of the URL must match in full. Skipped when omitted.
        allowed_protocols: schemes the URL may use; defaults to ``["https"]``
            when omitted or empty.
        expected_domains: optional list of acceptable domains, compared
            against the URL's net location. Skipped when omitted or empty.
        raise_error: raise on failure instead of logging and returning
            ``False``.

    Returns:
        ``True`` when every requested check passes, otherwise ``False``.

    Raises:
        LisaException: on the first failed check when ``raise_error`` is true.
    """
    # avoid using a mutable default parameter
    if not allowed_protocols:
        allowed_protocols = ["https"]

    # first, check if it's a url.
    if not is_valid_url(url=source_url, raise_error=False):
        return _raise_or_log_failure(
            log,
            raise_error,
            f"{source_url} is not a valid URL, check your arguments.",
        )

    # NOTE: urllib might not work as you'd expect.
    # It doesn't throw on lots of things you wouldn't expect to be urls.
    # You must verify the parts on your own, some of them may be empty, some null.
    # check: https://docs.python.org/3/library/urllib.parse.html#url-parsing
    try:
        parts = urlparse(source_url)
    except ValueError:
        return _raise_or_log_failure(
            log,
            raise_error,
            f"urlparse failed to parse url {source_url}, check your arguments.",
        )

    # ex: from https://www.com/path/to/file.tar
    # scheme : https
    # netloc : www.com
    # path   : /path/to/file.tar

    # get the filename from the path portion of the url
    file_path = parts.path.split("/")[-1]
    full_match = None
    # check we can match against the filename
    if expected_filename_pattern:
        full_match = expected_filename_pattern.match(file_path)
        if full_match is None:
            return _raise_or_log_failure(
                log,
                raise_error,
                f"File at {source_url} did not match pattern "
                f"{expected_filename_pattern.pattern}.",
            )

    # check the expected domain is correct if present.
    # The message is only built inside this branch: joining the domains
    # unconditionally would fail when expected_domains is None.
    # NOTE(review): this tests whether an expected domain ends with the URL's
    # net location, not the other way round, so a subdomain of an expected
    # domain is rejected while a bare suffix of one is accepted. Kept as-is
    # to preserve current behavior; confirm the intended direction.
    if expected_domains and not any(
        domain.endswith(parts.netloc) for domain in expected_domains
    ):
        return _raise_or_log_failure(
            log,
            raise_error,
            f"net location of url {source_url} did not match "
            f"expected domains {','.join(expected_domains)}",
        )

    # Check the protocol (aka scheme) in the url
    # default is check access is via https
    if parts.scheme not in allowed_protocols or parts.netloc == "":
        return _raise_or_log_failure(
            log,
            raise_error,
            f"URL {source_url} uses an invalid protocol "
            "or net location! Check url argument.",
        )

    # finally verify the full match we found matches the actual filename
    # avoids an accidental partial match
    if (
        expected_filename_pattern
        and full_match is not None
        and full_match.group(0) != file_path
    ):
        return _raise_or_log_failure(
            log,
            raise_error,
            f"File at url {source_url} failed to match"
            f" pattern {expected_filename_pattern.pattern}.",
        )

    return True
654704
655705
656706def filter_ansi_escape (content : str ) -> str :
0 commit comments