1
- import requests
2
- import xml .etree .ElementTree as ET
3
- from scripts .link import parse_links
4
- import re
5
-
6
- def parse_sitemap (url ):
7
- response = requests .get (url )
8
- root = ET .fromstring (response .content )
9
-
10
- urls = []
11
- for element in root .iter ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' ):
12
- for loc in element .iter ('{http://www.sitemaps.org/schemas/sitemap/0.9}loc' ):
13
- if not has_extension_to_ignore (loc .text ):
14
- urls .append (loc .text )
15
- else :
16
- print (f"Skipping filetype: { loc .text } " )
17
-
18
- return urls
19
-
20
- # Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
21
- def sitemap ():
22
- sitemap_url = input ("Enter the URL of the sitemap: " )
23
-
24
- if (len (sitemap_url ) == 0 ):
25
- print ("No valid sitemap provided!" )
26
- exit (1 )
27
-
28
- url_array = parse_sitemap (sitemap_url )
29
-
30
- #parse links from array
31
- parse_links (url_array )
32
-
33
- def has_extension_to_ignore (string ):
34
- image_extensions = ['.jpg' , '.jpeg' , '.png' , '.gif' , '.bmp' , '.pdf' ]
35
-
36
- pattern = r'\b(' + '|' .join (re .escape (ext ) for ext in image_extensions ) + r')\b'
37
- match = re .search (pattern , string , re .IGNORECASE )
38
-
1
+ import requests
2
+ import xml .etree .ElementTree as ET
3
+ from scripts .link import parse_links
4
+ import re
5
+
6
def parse_sitemap(url, timeout=30):
    """Download a sitemap and return the page URLs it lists.

    Only ``<loc>`` elements nested inside ``<url>`` elements of the
    sitemaps.org 0.9 namespace are considered. URLs for which
    ``has_extension_to_ignore`` returns true are reported on stdout and
    left out of the result.

    Args:
        url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled server.

    Returns:
        A list of URL strings in document order.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        xml.etree.ElementTree.ParseError: The response body is not
            well-formed XML.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    namespace = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    urls = []
    for element in root.iter(namespace + 'url'):
        for loc in element.iter(namespace + 'loc'):
            # An empty <loc/> has text None, and pretty-printed sitemaps
            # often pad the URL with newlines and indentation.
            location = (loc.text or '').strip()
            if not location:
                continue
            if has_extension_to_ignore(location):
                print(f"Skipping filetype: {location}")
            else:
                urls.append(location)

    return urls
19
+
20
+ # Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
21
def sitemap():
    """Prompt for a sitemap URL and pass the URLs it lists to ``parse_links``.

    Reads the sitemap address from standard input. If nothing (or only
    whitespace) is entered, prints an error message and terminates the
    process with exit status 1. Otherwise the sitemap is fetched with
    ``parse_sitemap`` and the resulting list is handed to ``parse_links``.

    Raises:
        SystemExit: No sitemap URL was entered.
    """
    # Imported locally so this function does not rely on the module's
    # import block. sys.exit is used instead of the bare exit() helper,
    # which is injected by the `site` module and is not always available
    # (for example under `python -S` or in frozen builds).
    import sys

    # Strip so that a stray space or newline is not treated as a URL.
    sitemap_url = input("Enter the URL of the sitemap: ").strip()

    if not sitemap_url:
        print("No valid sitemap provided!")
        sys.exit(1)

    url_array = parse_sitemap(sitemap_url)

    # Parse links from the collected URLs.
    parse_links(url_array)
32
+
33
def has_extension_to_ignore(string):
    """Return True if the URL points at a file type that should be skipped.

    The check looks only at the path component of the URL and only at its
    end, case-insensitively. Query strings and fragments are ignored, so
    ``/doc.pdf?download=1`` is still recognised. Extension-like text in
    the host name or in the middle of the path (``/guide.pdf-viewer/``)
    does not count.

    Args:
        string: URL or path to examine. ``None`` and empty strings are
            accepted and never match.

    Returns:
        True when the path ends in one of the ignored extensions,
        otherwise False. A URL that cannot be split into components is
        treated as not ignorable.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlsplit

    ignored_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

    if not string:
        return False

    try:
        path = urlsplit(string.strip()).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a bad IPv6 literal);
        # without a reliable path there is nothing to match against.
        return False

    # str.endswith accepts a tuple, so this is one suffix test for all
    # extensions.
    return path.lower().endswith(ignored_extensions)
0 commit comments