
docs: improved the readability of code by adding docstrings #11

Merged · 2 commits · Mar 11, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.vscode/
29 changes: 29 additions & 0 deletions core/crawler.py
@@ -5,13 +5,32 @@
class LinkParser(HTMLParser):

    def handle_starttag(self, tag, attrs):
        """Process HTML start tags to find anchor tags and extract href URLs.

        Args:
            tag (str): HTML tag name
            attrs (list): List of (attribute, value) tuples
        """

        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        """Fetch webpage and extract all links found.

        Args:
            url (str): URL to fetch and parse

        Returns:
            tuple: (html_content, links_found)
                - html_content (str): Page HTML content
                - links_found (list): List of absolute URLs found
        """

        self.links = []
        self.baseUrl = url
        response = urlopen(url)
@@ -24,6 +43,16 @@ def getLinks(self, url):
return "",[]

def spider(url, maxPages):
    """Web crawler that visits pages and collects links.

    Args:
        url (str): Starting URL to begin crawl
        maxPages (int): Maximum number of pages to visit

    Returns:
        list: All unique links discovered during crawl
    """

    links = []
    pagesToVisit = [url]
    numberVisited = 0
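Reviewer note: a minimal usage sketch, not part of this diff, showing how the documented spider() and LinkParser.getLinks() are expected to be called (the import path and seed URL are illustrative assumptions):

# Hypothetical usage sketch: exercise the API described by the new docstrings.
from core.crawler import spider, LinkParser  # import path assumed from the repo layout

# Crawl up to 10 pages starting from a seed URL and print every link found.
for link in spider('https://example.com', maxPages=10):
    print(link)

# Or parse a single page directly with LinkParser.
parser = LinkParser()
html, links = parser.getLinks('https://example.com')
print(len(links), 'links found on the seed page')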
16 changes: 16 additions & 0 deletions core/extractor.py
@@ -2,6 +2,22 @@


def param_extract(response, level, black_list, placeholder):
    """
    Extract URL parameters from response text and replace values with placeholders.

    Args:
        response (str): HTML/text response to parse for URLs with parameters
        level (str): Depth of parameter extraction ('high' extracts nested params)
        black_list (list): List of strings to exclude from results
        placeholder (str): String to replace parameter values with

    Returns:
        list: Unique URLs with parameter values replaced by placeholder

    Uses regex patterns:
        - r'.*?:\/\/.*\?.*\=[^$]' : Matches URLs with at least one parameter
        - r'.*?:\/\/.*\?.*\=' : Basic URL parameter pattern
    """

    '''
    regexp : r'.*?:\/\/.*\?.*\=[^$]'
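For context on the regex the docstring quotes, here is a standalone sketch of the idea it describes: find parameterized URLs in a response and swap each value for a placeholder. This is not the code in core/extractor.py; the function name and the 'FUZZ' placeholder are illustrative assumptions.

import re
from urllib.parse import urlsplit, parse_qsl, urlencode, urlunsplit

def extract_params_sketch(response, placeholder='FUZZ'):
    # Pattern quoted in the docstring: URLs that carry at least one parameter.
    urls = set(re.findall(r'.*?:\/\/.*\?.*\=[^$]', response))
    results = set()
    for url in urls:
        scheme, netloc, path, query, frag = urlsplit(url.strip())
        # Replace every parameter value with the placeholder, keeping the names.
        fuzzed = urlencode([(key, placeholder) for key, _ in parse_qsl(query)])
        results.add(urlunsplit((scheme, netloc, path, fuzzed, frag)))
    return sorted(results)

print(extract_params_sketch('https://example.com/page?id=1&q=test'))
# ['https://example.com/page?id=FUZZ&q=FUZZ']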
18 changes: 17 additions & 1 deletion core/requester.py
@@ -3,9 +3,25 @@


def connector(url):
    """
    Make an HTTP GET request to a URL with a random user agent.

    Args:
        url (str): The target URL to send the GET request to

    Returns:
        str: The response text from the server, or False if the connection fails

    Raises:
        ConnectionError: If a connection to the server cannot be established
        TimeoutError: If the request times out (30s)
        AttributeError: If there's an error in the HTTP request
        KeyboardInterrupt: If the user interrupts the process
        RuntimeError: For other unexpected exceptions
    """
    result = False
    user_agent_list = [
        # Chrome
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
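A short usage sketch for the documented connector(), not part of this diff; the import path and target URL are assumptions:

# Hypothetical usage sketch: per the docstring above, connector() returns the
# response text on success, or False when the request fails.
from core.requester import connector

body = connector('https://example.com')
if body is False:
    print('request failed')
else:
    print('received', len(body), 'characters of HTML')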