#!/usr/bin/env python3
"""
Validates all links in the specified page URL.
Usage:
  html-anchor-validation.py {url}
  html-anchor-validation.py -vt {url}
  html-anchor-validation.py https://cs.co/ise-guides
"""
__author__ = "Thomas Howard"
__email__ = "[email protected]"
__license__ = "MIT - https://mit-license.org/"
from bs4 import BeautifulSoup
import argparse
import asyncio
import aiohttp
import aiohttp_client_cache
import datetime
import os.path # file paths & existence
import pandas as pd
import requests
import sys
import time
import urllib.parse
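
# None of the third-party imports above (bs4, aiohttp, aiohttp_client_cache,
# pandas, requests) are in the standard library. Assuming a typical pip
# environment, they can be installed with:
#   pip install beautifulsoup4 aiohttp aiohttp-client-cache pandas requests
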
CACHE_DIR = './.cache'
CACHE_EXPIRATION = datetime.timedelta(days=30) # -1 = Never, 0 = no write, N seconds, timedelta, datetime

ICONS = {
    'CACHE': '⏯',
    'INFO': 'ⓘ',
    'NEW': '🆕',
    'GOOD': '✅',
    'FAIL': '❌',
    'WARN': '🔺',
    'TODO': '🚧',
    'MISSING': '👻',
    'UNKNOWN': '❓',
    'BUG': '🐛',
    'REDIR': '➥',  # alternates: ⤷ ⤸ ⤹
}

HTTP_STATUS_ICONS = {
    # Informational responses (100–199)
    100 : '',   # Continue
    101 : '',   # Switching Protocols
    102 : '',   # Processing (WebDAV)
    103 : '',   # Early Hints
    # Successful responses (200–299)
    200 : '✅', # OK
    201 : '🆕', # Created
    202 : '🚧', # Accepted
    203 : '',   # Non-Authoritative Information
    204 : '👻', # No Content
    205 : '⟲', # Reset Content
    206 : '⬒', # Partial Content
    207 : '',   # Multi-Status (WebDAV)
    208 : '',   # Already Reported (WebDAV)
    226 : '',   # IM Used (HTTP Delta encoding)
    # Redirection messages (300–399)
    300 : '✶', # Multiple Choices
    301 : '➥', # Moved Permanently
    302 : '↪', # Found
    303 : '',   # See Other
    304 : '',   # Not Modified
    305 : '➦', # Use Proxy (deprecated)
    306 : '',   # unused
    307 : '➥', # Temporary Redirect
    308 : '➥', # Permanent Redirect
    # Client error responses (400–499)
    400 : '❌', # Bad Request
    401 : '🔒', # Unauthorized
    402 : '💰', # Payment Required (experimental)
    403 : '⛔', # Forbidden
    404 : '👻', # Not Found
    405 : '⛔', # Method Not Allowed
    406 : '⛔', # Not Acceptable
    407 : '',   # Proxy Authentication Required
    408 : '⏳', # Request Timeout
    409 : '',   # Conflict
    410 : '👻', # Gone
    411 : '►', # Length Required
    412 : '',   # Precondition Failed
    413 : '►', # Payload Too Large
    414 : '⬳', # URI Too Long
    415 : '💾', # Unsupported Media Type
    416 : '⧰', # Range Not Satisfiable
    417 : '',   # Expectation Failed
    418 : '🚫', # I'm a teapot
    421 : '➧', # Misdirected Request
    422 : '',   # Unprocessable Content (WebDAV)
    423 : '🔒', # Locked (WebDAV)
    424 : '',   # Failed Dependency (WebDAV)
    425 : '🚧', # Too Early (experimental)
    426 : '⬆', # Upgrade Required
    428 : '❗', # Precondition Required
    429 : '✶', # Too Many Requests
    431 : '►', # Request Header Fields Too Large
    451 : '⚖', # Unavailable For Legal Reasons
    # Server error responses (500–599)
    500 : '💥', # Internal Server Error
    501 : '🚧', # Not Implemented
    502 : '😢', # Bad Gateway
    503 : '😢', # Service Unavailable
    504 : '⏳', # Gateway Timeout
    505 : '',   # HTTP Version Not Supported
    506 : '',   # Variant Also Negotiates
    507 : '💾', # Insufficient Storage (WebDAV)
    508 : '',   # Loop Detected (WebDAV)
    510 : '',   # Not Extended
    511 : '⚿', # Network Authentication Required
}

def bs4ff_a_has_href_and_target_not_self (tag) -> bool:
    """
    BS4 tag filter function to ignore self-referential links.
    Returns True if the tag matches, False otherwise.
    :param tag (bs4.element.Tag) : a BeautifulSoup4 tag.
    """
    return tag.has_attr('href') and tag.has_attr('target') and tag['target'] != "_self"
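
# Illustrative note: BeautifulSoup's find_all() accepts a callable filter and calls
# it on every tag in the parse tree, keeping those for which it returns True, so
# soup.find_all(bs4ff_a_has_href_and_target_not_self) returns only tags with an
# href and a target other than "_self" (in practice, anchor tags).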

async def get_all_bs4_tags (url:str=None, filter=None) -> list:
    """
    Returns a list of BeautifulSoup Tags (`bs4.element.Tag`).
    :param url (str) : a string representing a URL.
    :param filter (callable) : a BeautifulSoup4 tag filter function(tag)->bool.
    """
    with requests.Session() as session:
        response = session.get(url, allow_redirects=True)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.find_all(filter)  # tag filter function

async def get_unique_urls_from_anchor_tags (url:str, tags:list=[]) -> dict:
    """
    Returns a dict mapping each href string to its anchor attributes:
      'Name'  : the anchor tag inner text (link name)
      'URL'   : the anchor tag `href` attribute
      'Target': the anchor tag `target` attribute
    :param url (str) : the base URL used to resolve relative links.
    :param tags (list[bs4.Tag]) : a list of BeautifulSoup4 anchor tags.
    """
    urls = {}
    for tag in tags:
        # Fix missing URL elements (scheme/host) using the base URL
        parsed_url = urllib.parse.urlsplit(tag.get('href'), allow_fragments=False)
        if parsed_url.scheme == '' or parsed_url.netloc == '':
            joined_url = urllib.parse.urljoin(url, tag.get('href'))
            print(f"🔗 Fix URL: {tag.get('href')} ==> {joined_url}", file=sys.stderr)
            tag['href'] = joined_url
        # Save extracted links to the urls dict
        urls[tag.get('href')] = {
            # 'Tag'   : tag.name,
            'Name'  : tag.text.strip(),
            # 'Title' : tag.get('title', ''),
            'URL'   : tag.get('href'),
            # 'Class' : tag.get('class', ''),
            # 'ID'    : tag.get('id', ''),
            'Target': tag.get('target', ''),
            # 'Rel'   : tag.get('rel', ''),
            # 'Style' : tag.get('style', ''),
        }
    return urls
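
# Illustrative example (hypothetical page): given base URL https://example.com and
# an anchor <a href="/docs" target="_blank">Docs</a>, the returned dict would contain
#   {'https://example.com/docs': {'Name': 'Docs', 'URL': 'https://example.com/docs', 'Target': '_blank'}}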

async def get_unique_tag_attrs (tags:list=[]) -> set:
    """
    Returns the set of unique attribute names found across the specified tags.
    This may help discover other interesting, available attributes.
    :param tags ([bs4.Tag]) : a list of bs4.Tag objects.
    """
    unique_tag_attrs = set()
    for tag in tags:
        unique_tag_attrs.update(tag.attrs.keys())
    return unique_tag_attrs
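
# Illustrative example (hypothetical tags): for typical anchors this returns
# something like {'href', 'target', 'class', 'rel'}.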

async def get_url_data (session=None, urlq:asyncio.Queue=None):
    """
    Asyncio task handler that updates each `url_data` item from the urlq with
    information from an HTTP GET request.
    :param session (aiohttp.ClientSession) : an aiohttp.ClientSession to use.
    :param urlq (asyncio.Queue) : the queue of `url_data` dicts to process.
    """
    while True:
        url_data = await urlq.get()  # Get an item from the queue
        # print(f"🧵 q:{urlq.qsize()} | {url_data['URL']}", file=sys.stderr)
        try:
            response = await session.get(url_data['URL'], allow_redirects=True)
            print(f"{ICONS['GOOD'] if response.ok else ICONS['FAIL']} | {response.status} | {url_data['Name']} | {response.url}", file=sys.stdout)
            # print(f"{ICONS['INFO']}: {response.from_cache}, {response.created_at}, {response.expires}, {response.is_expired}", file=sys.stderr)
            if response.history:  # one or more redirects occurred
                url_data['Redir'] = response.url
            url_data['Icon'] = HTTP_STATUS_ICONS.get(response.status, ICONS['UNKNOWN'])
            url_data['Status'] = response.status
            url_data['Content-Type'] = response.headers.get('Content-Type', '')
            # url_data['Pragma'] = response.headers.get('Pragma', '')
        except Exception as e:
            print(f"{ICONS['FAIL']}: {e}", file=sys.stderr)
            url_data['Icon'] = ICONS['BUG']
            url_data['Status'] = 666  # sentinel status for failed requests
            url_data['Error'] = e
        urlq.task_done()  # Notify the queue that the item is processed
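
# Note: each worker loops forever pulling items from the queue; urlq.task_done()
# lets url_data_queue.join() in main() return once every queued item has been
# processed, after which main() cancels the workers.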

async def main ():
    """
    Run script with async functions.
    """
    argp = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    argp.add_argument('url', action='store', type=str, help='the URL to validate')
    argp.add_argument('-f','--force', action='store_true', default=False, help='Force a fetch, ignoring the cache', required=False)
    argp.add_argument('-t','--timer', action='store_true', default=False, help='Time the process', required=False)
    argp.add_argument('-v','--verbosity', action='count', default=0, help='Verbosity')
    args = argp.parse_args()
    if args.verbosity: print(f"ⓘ args: {args}", file=sys.stderr)
    if args.timer: start_time = time.time()
    # Map { href : {dict} } for tracking URL attributes
    urls_data = {}
    # Extract anchor tags from the document
    response = requests.get(args.url, allow_redirects=True)  # resolve any base URL redirects
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.find_all(bs4ff_a_has_href_and_target_not_self)  # tag filter function
    urls_data = await get_unique_urls_from_anchor_tags(response.url, tags)  # unique-ify and resolve relative URLs
    # Summarize document tag findings
    if args.verbosity: print(f"ⓘ Found {len(urls_data)} unique URLs from {len(tags)} tags", file=sys.stderr)
    if args.verbosity: print(f"ⓘ Unique tag attributes: {sorted(await get_unique_tag_attrs(tags))}", file=sys.stderr)
    # Create worker tasks to process the queue concurrently and update urls_data
    url_data_queue = asyncio.Queue()  # Create a queue for the URL workload
    cache = aiohttp_client_cache.FileBackend(cache_name=CACHE_DIR, use_temp=False)
    async with aiohttp_client_cache.CachedSession(cache=cache, expire_after=CACHE_EXPIRATION) as session:  # cache only valid for this session run
        if args.force: await session.cache.clear()
        tasks = [asyncio.create_task(get_url_data(session, url_data_queue)) for ii in range(1,6)]
        for url_data in urls_data.values():
            url_data_queue.put_nowait(url_data)
        await url_data_queue.join()  # process the queue until finished
        for task in tasks:  # workers loop forever; cancel them once the queue is drained
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)  # reap the cancelled workers
    # Load urls_data into a DataFrame for easy CSV conversion
    df = pd.DataFrame(urls_data.values())
    if args.verbosity: print(df.columns)
    df['Status'] = df['Status'].astype('Int64')
    if args.verbosity: print(df.dtypes)
    df = df.sort_values(['Status','URL'], ascending=[False,True])
    df = df.reindex(columns=['Icon','Status','Name','URL','Target','Redir','Content-Type','Error'])  # Re-order columns; any missing column is filled with NaN
    df.to_csv('urls.csv', index=False)
    if args.verbosity: print(df, file=sys.stdout)
    print(df[['Icon','Status','URL']].groupby('Status').count().sort_values(['Status'], ascending=False).rename({'URL':'Count'}, axis='columns'))
    if args.timer: print(f"⏲ {time.time() - start_time:.3f} seconds", file=sys.stderr)

if __name__ == '__main__':
    """
    Run as local script from command line.
    """
    asyncio.run(main())
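
# Example invocation (assuming the script is run from a directory where it is executable):
#   ./html-anchor-validation.py -vt https://cs.co/ise-guides
# Per-URL results are printed to stdout and the full table is written to urls.csv
# in the working directory.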