-
-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve URL boundary with quotations and parentheses #254
Changes from 4 commits
6c6bce5
0e54956
20b3aa5
5115f04
1a81f93
7536ed9
8d22840
725a113
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,9 +112,12 @@ | |
url_scheme = one_of(schemes, caseless=True) | ||
port = Word(":", nums, min=2) | ||
url_authority = Combine(Or([complete_email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port")) | ||
# although the ":" character is not valid in url paths, | ||
# The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt... | ||
# (of particular interest is "Appendix A. Collected ABNF for URI") | ||
|
||
# Although the ":" character is not valid in url paths, | ||
# some urls are written with the ":" unencoded so we include it below | ||
url_path_word = Word(alphanums + "-._~!$&'()*+,;:=%") | ||
url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%") | ||
url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")]))) | ||
url_query = Word(printables, excludeChars="#\"']") | ||
url_fragment = Word(printables, excludeChars="?\"']") | ||
|
@@ -281,9 +284,7 @@ def hasBothOrNeitherAngleBrackets(string): | |
alphanum_word_start | ||
# we use `Or([Literal("pub-")...` instead of something like `CaselessLiteral("pub-")` b/c... | ||
# we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations) | ||
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action( | ||
pyparsing_common.downcase_tokens | ||
) | ||
+ Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
+ alphanum_word_end | ||
) | ||
|
||
|
@@ -324,9 +325,7 @@ def hasBothOrNeitherAngleBrackets(string): | |
|
||
# the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions | ||
# handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx | ||
mac_address_16_bit_section = Combine( | ||
(Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2) | ||
) | ||
mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
# handles xxxx.xxxx.xxxx | ||
mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4)) | ||
mac_address_word_start = WordStart(wordChars=alphanums + ":-.") | ||
|
@@ -414,7 +413,6 @@ def hasBothOrNeitherAngleBrackets(string): | |
+ Combine( | ||
one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens) | ||
+ Optional(attack_sub_technique) | ||
|
||
) | ||
+ alphanum_word_end | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,4 +10,36 @@ | |
{}, | ||
id="URL and domains parsed", | ||
), | ||
param( | ||
"Foo https://citizenlab.ca/about/), bar", | ||
{ | ||
"urls": ["https://citizenlab.ca/about/"], | ||
}, | ||
{"parse_domain_from_url": False}, | ||
id="URL boundary w/ ) handled properly", | ||
), | ||
fhightower marked this conversation as resolved.
Show resolved
Hide resolved
|
||
param( | ||
"DownloadString('https://example[.]com/rdp.ps1');g $I DownloadString(\"https://example[.]com/rdp.ps2\");g $I", | ||
{ | ||
"urls": ["https://example.com/rdp.ps1", "https://example.com/rdp.ps2"], | ||
}, | ||
{"parse_domain_from_url": False}, | ||
id="URL boundary w/ single or double quotes handled properly", | ||
), | ||
param( | ||
"url,https://example.com/g/foo,Malicious Google Groups discussion", | ||
{ | ||
"urls": ["https://example.com/g/foo"], | ||
}, | ||
{"parse_domain_from_url": False}, | ||
id="URL boundary w/ comma handled properly", | ||
), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To resolve these two failing cases, my approach will be to:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will not handle cases like #261 where the url is not preceded by a comma (it is only postceded by one). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am comfortable removing punctuation from the end of a URL or it the URL is preceded by the same character, but other than that, we would have to break significantly with the RFC (which is something I need to think about more). |
||
param( | ||
"https://example.com/g//foo", | ||
{ | ||
"urls": ["https://example.com/g//foo"], | ||
}, | ||
{"parse_domain_from_url": False}, | ||
id="Consecutive slashes handled properly", | ||
), | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -192,11 +192,13 @@ def test_ipv4_cidr_parsing(): | |
def test_registry_key_parsing(): | ||
s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows" | ||
iocs = find_iocs(s) | ||
assert sorted([ | ||
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows", | ||
r"HKLM\Software\Microsoft\Windows", | ||
r"HKCC\Software\Microsoft\Windows", | ||
]) == sorted(iocs["registry_key_paths"]) | ||
assert sorted( | ||
[ | ||
r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows", | ||
r"HKLM\Software\Microsoft\Windows", | ||
r"HKCC\Software\Microsoft\Windows", | ||
] | ||
) == sorted(iocs["registry_key_paths"]) | ||
Comment on lines
+195
to
+201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lint update |
||
|
||
|
||
def test_adsense_publisher_id_parsing(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Switch order of characters to align w/ RFC.