-
-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve URL boundary with quotations and parentheses #254
Changes from all commits
6c6bce5
0e54956
20b3aa5
5115f04
1a81f93
7536ed9
8d22840
725a113
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,9 +112,12 @@ | |
url_scheme = one_of(schemes, caseless=True) | ||
port = Word(":", nums, min=2) | ||
url_authority = Combine(Or([complete_email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port")) | ||
# although the ":" character is not valid in url paths, | ||
# The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt... | ||
# (of particular interest is "Appendix A. Collected ABNF for URI") | ||
|
||
# Although the ":" character is not valid in url paths, | ||
# some urls are written with the ":" unencoded so we include it below | ||
url_path_word = Word(alphanums + "-._~!$&'()*+,;:=%") | ||
url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Switch order of characters to align w/ RFC. |
||
url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")]))) | ||
url_query = Word(printables, excludeChars="#\"']") | ||
url_fragment = Word(printables, excludeChars="?\"']") | ||
|
@@ -281,9 +284,7 @@ def hasBothOrNeitherAngleBrackets(string): | |
alphanum_word_start | ||
# we use `Or([Literal("pub-")...` instead of something like `CaselessLiteral("pub-")` b/c... | ||
# we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations) | ||
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action( | ||
pyparsing_common.downcase_tokens | ||
) | ||
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
+ alphanum_word_end | ||
) | ||
|
||
|
@@ -324,9 +325,7 @@ def hasBothOrNeitherAngleBrackets(string): | |
|
||
# the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions | ||
# handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx | ||
mac_address_16_bit_section = Combine( | ||
(Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2) | ||
) | ||
mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
# handles xxxx.xxxx.xxxx | ||
mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4)) | ||
mac_address_word_start = WordStart(wordChars=alphanums + ":-.") | ||
|
@@ -414,7 +413,6 @@ def hasBothOrNeitherAngleBrackets(string): | |
+ Combine( | ||
one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens) | ||
+ Optional(attack_sub_technique) | ||
|
||
) | ||
+ alphanum_word_end | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -192,11 +192,13 @@ def test_ipv4_cidr_parsing(): | |
def test_registry_key_parsing(): | ||
s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows" | ||
iocs = find_iocs(s) | ||
assert sorted([ | ||
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows", | ||
r"HKLM\Software\Microsoft\Windows", | ||
r"HKCC\Software\Microsoft\Windows", | ||
]) == sorted(iocs["registry_key_paths"]) | ||
assert sorted( | ||
[ | ||
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows", | ||
r"HKLM\Software\Microsoft\Windows", | ||
r"HKCC\Software\Microsoft\Windows", | ||
] | ||
) == sorted(iocs["registry_key_paths"]) | ||
Comment on lines
+195
to
+201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
|
||
|
||
def test_adsense_publisher_id_parsing(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Three changes to note:
)
before rstriping quotation marks so cases likehttp://example.com/foo')
are properly parsed ashttp://example.com/foo
url.rstrip(")")
to remove a closing paren. at the end of a URL; now we dourl = url.split(")")[0]
to remove the paren and everything after it.