
docs: improved the readability of code by adding docstrings #11

Merged · 2 commits · Mar 11, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.vscode/
29 changes: 29 additions & 0 deletions core/crawler.py
@@ -5,13 +5,32 @@
class LinkParser(HTMLParser):

    def handle_starttag(self, tag, attrs):
        """Process HTML start tags to find anchor tags and extract href URLs.

        Args:
            tag (str): HTML tag name
            attrs (list): List of (attribute, value) tuples
        """

        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        """Fetch webpage and extract all links found.

        Args:
            url (str): URL to fetch and parse

        Returns:
            tuple: (html_content, links_found)
                - html_content (str): Page HTML content
                - links_found (list): List of absolute URLs found
        """

        self.links = []
        self.baseUrl = url
        response = urlopen(url)
@@ -24,6 +43,16 @@ def getLinks(self, url):
return "",[]

def spider(url, maxPages):
    """Web crawler that visits pages and collects links.

    Args:
        url (str): Starting URL to begin crawl
        maxPages (int): Maximum number of pages to visit

    Returns:
        list: All unique links discovered during crawl
    """

    links = []
    pagesToVisit = [url]
    numberVisited = 0
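Reviewer note: a minimal usage sketch, not part of this diff, showing how the documented spider() and LinkParser.getLinks() are expected to be called (the import path and seed URL are illustrative assumptions):

# Hypothetical usage sketch: exercise the API described by the new docstrings.
from core.crawler import spider, LinkParser  # import path assumed from the repo layout

# Crawl up to 10 pages starting from a seed URL and print every link found.
for link in spider('https://example.com', maxPages=10):
    print(link)

# Or parse a single page directly with LinkParser.
parser = LinkParser()
html, links = parser.getLinks('https://example.com')
print(len(links), 'links found on the seed page')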
16 changes: 16 additions & 0 deletions core/extractor.py
@@ -2,6 +2,22 @@


def param_extract(response, level, black_list, placeholder):
    """
    Extract URL parameters from response text and replace values with placeholders.

    Args:
        response (str): HTML/text response to parse for URLs with parameters
        level (str): Depth of parameter extraction ('high' extracts nested params)
        black_list (list): List of strings to exclude from results
        placeholder (str): String to replace parameter values with

    Returns:
        list: Unique URLs with parameter values replaced by placeholder

    Uses regex patterns:
        - r'.*?:\/\/.*\?.*\=[^$]' : Matches URLs with at least one parameter
        - r'.*?:\/\/.*\?.*\=' : Basic URL parameter pattern
    """

    '''
    regexp : r'.*?:\/\/.*\?.*\=[^$]'
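For context on the regex the docstring quotes, here is a standalone sketch of the idea it describes: find parameterized URLs in a response and swap each value for a placeholder. This is not the code in core/extractor.py; the function name and the 'FUZZ' placeholder are illustrative assumptions.

import re
from urllib.parse import urlsplit, parse_qsl, urlencode, urlunsplit

def extract_params_sketch(response, placeholder='FUZZ'):
    # Pattern quoted in the docstring: URLs that carry at least one parameter.
    urls = set(re.findall(r'.*?:\/\/.*\?.*\=[^$]', response))
    results = set()
    for url in urls:
        scheme, netloc, path, query, frag = urlsplit(url.strip())
        # Replace every parameter value with the placeholder, keeping the names.
        fuzzed = urlencode([(key, placeholder) for key, _ in parse_qsl(query)])
        results.add(urlunsplit((scheme, netloc, path, fuzzed, frag)))
    return sorted(results)

print(extract_params_sketch('https://example.com/page?id=1&q=test'))
# ['https://example.com/page?id=FUZZ&q=FUZZ']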
18 changes: 17 additions & 1 deletion core/requester.py
@@ -3,9 +3,25 @@


def connector(url):
    """
    Make an HTTP GET request to a URL with a random user agent.

    Args:
        url (str): The target URL to send the GET request to

    Returns:
        str: The response text from the server, or False if the connection fails

    Raises:
        ConnectionError: If a connection to the server cannot be established
        TimeoutError: If the request times out (30s)
        AttributeError: If there's an error in the HTTP request
        KeyboardInterrupt: If the user interrupts the process
        RuntimeError: For other unexpected exceptions
    """
    result = False
    user_agent_list = [
        # Chrome
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
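A short usage sketch for the documented connector(), not part of this diff; the import path and target URL are assumptions:

# Hypothetical usage sketch: per the docstring above, connector() returns the
# response text on success, or False when the request fails.
from core.requester import connector

body = connector('https://example.com')
if body is False:
    print('request failed')
else:
    print('received', len(body), 'characters of HTML')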