Refactored sites module, updated documentation (#1918)

soxoj · Dec 1, 2024 · 2f93963
1 parent 5073cef
commit 2f93963
Show file tree

Hide file tree

Showing 8 changed files with 191 additions and 86 deletions.
diff --git a/Makefile b/Makefile
@@ -16,10 +16,10 @@ lint:
 	flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES}
 
 	@echo 'mypy'
-	mypy ${LINT_FILES}
+	mypy --check-untyped-defs ${LINT_FILES}
 
 speed:
-	time python3 ./maigret.py --version
+	time python3 -m maigret --version
 	python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))"
 	python3 -X importtime -c "import maigret" 2> maigret-import.log
 	python3 -m tuna maigret-import.log

diff --git a/docs/source/development.rst b/docs/source/development.rst
@@ -33,7 +33,7 @@ Install test requirements:
 
 .. code-block:: console
 
-  pip install -r test-requirements.txt
+  poetry install --with dev
 
 
 Use the following commands to check Maigret:
@@ -54,6 +54,9 @@ Use the following commands to check Maigret:
   # open html report
   open htmlcov/index.html
 
+  # get flamechart of imports to estimate startup time
+  make speed
+
 
 How to fix false-positives
 -----------------------------------------------

diff --git a/maigret/checking.py b/maigret/checking.py
@@ -11,7 +11,6 @@
 
 # Third party imports
 import aiodns
-import alive_progress
 from alive_progress import alive_bar
 from aiohttp import ClientSession, TCPConnector, http_exceptions
 from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError
@@ -127,7 +126,7 @@ async def check(self) -> Tuple[str, int, Optional[CheckError]]:
         async with ClientSession(
             connector=connector,
             trust_env=True,
-            cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None
+            cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None,
         ) as session:
             html_text, status_code, error = await self._make_request(
                 session,

diff --git a/maigret/sites.py b/maigret/sites.py
@@ -21,6 +21,7 @@ def json(self):
 
 
 class MaigretSite:
+    # Fields that should not be serialized when converting site to JSON
     NOT_SERIALIZABLE_FIELDS = [
         "name",
         "engineData",
@@ -31,37 +32,65 @@ class MaigretSite:
         "urlRegexp",
     ]
 
+    # Username known to exist on the site
     username_claimed = ""
+    # Username known to not exist on the site
     username_unclaimed = ""
+    # Additional URL path component, e.g. /forum in https://example.com/forum/users/{username}
     url_subpath = ""
+    # Main site URL (the main page)
     url_main = ""
+    # Full URL pattern for username page, e.g. https://example.com/forum/users/{username}
     url = ""
+    # Whether site is disabled. Not used by Maigret without --use-disabled argument
     disabled = False
+    # Whether a positive result indicates accounts with similar usernames rather than exact matches
     similar_search = False
+    # Whether to ignore 403 status codes
     ignore403 = False
+    # Site category tags
     tags: List[str] = []
 
+    # Type of identifier (username, gaia_id etc); see SUPPORTED_IDS in checking.py
     type = "username"
+    # Custom HTTP headers
     headers: Dict[str, str] = {}
+    # Error message substrings
     errors: Dict[str, str] = {}
+    # Site activation requirements
     activation: Dict[str, Any] = {}
+    # Regular expression for username validation
     regex_check = None
+    # URL to probe site status
     url_probe = None
+    # Type of check to perform
     check_type = ""
+    # Whether to only send HEAD requests (GET by default)
     request_head_only = ""
+    # GET parameters to include in requests
     get_params: Dict[str, Any] = {}
 
+    # Substrings in HTML response that indicate profile exists
     presense_strs: List[str] = []
+    # Substrings in HTML response that indicate profile doesn't exist
     absence_strs: List[str] = []
+    # Site statistics
     stats: Dict[str, Any] = {}
 
+    # Site engine name
     engine = None
+    # Engine-specific configuration
     engine_data: Dict[str, Any] = {}
+    # Engine instance
     engine_obj: Optional["MaigretEngine"] = None
+    # Future for async requests
     request_future = None
+    # Alexa traffic rank
     alexa_rank = None
+    # Source (in case a site is a mirror of another site)
     source = None
 
+    # URL protocol (http/https)
     protocol = ''
 
     def __init__(self, name, information):
@@ -96,20 +125,21 @@ def __is_equal_by_url_or_name(self, url_or_name_str: str):
     def __eq__(self, other):
         if isinstance(other, MaigretSite):
             # Compare only relevant attributes, not internal state like request_future
-            attrs_to_compare = ['name', 'url_main', 'url_subpath', 'type', 'headers',
-                              'errors', 'activation', 'regex_check', 'url_probe',
-                              'check_type', 'request_head_only', 'get_params',
-                              'presense_strs', 'absence_strs', 'stats', 'engine',
-                              'engine_data', 'alexa_rank', 'source', 'protocol']
+            attrs_to_compare = [
+                'name', 'url_main', 'url_subpath', 'type', 'headers',
+                'errors', 'activation', 'regex_check', 'url_probe',
+                'check_type', 'request_head_only', 'get_params',
+                'presense_strs', 'absence_strs', 'stats', 'engine',
+                'engine_data', 'alexa_rank', 'source', 'protocol'
+            ]
 
             return all(getattr(self, attr) == getattr(other, attr)
-                      for attr in attrs_to_compare)
+                         for attr in attrs_to_compare)
         elif isinstance(other, str):
             # Compare only by name (exactly) or url_main (partial similarity)
             return self.__is_equal_by_url_or_name(other)
         return False
 
-
     def update_detectors(self):
         if "url" in self.__dict__:
             url = self.url
@@ -474,78 +504,64 @@ def extract_ids_from_url(self, url: str) -> dict:
         return results
 
     def get_db_stats(self, is_markdown=False):
+        # Initialize counters
         sites_dict = self.sites_dict
-
         urls = {}
         tags = {}
-        output = ""
         disabled_count = 0
-        total_count = len(sites_dict)
-
-        message_checks = 0
         message_checks_one_factor = 0
-
         status_checks = 0
 
-        for _, site in sites_dict.items():
+        # Collect statistics
+        for site in sites_dict.values():
+            # Count disabled sites
             if site.disabled:
                 disabled_count += 1
 
+            # Count URL types
             url_type = site.get_url_template()
             urls[url_type] = urls.get(url_type, 0) + 1
 
-            if site.check_type == 'message' and not site.disabled:
-                message_checks += 1
-                if site.absence_strs and site.presense_strs:
-                    continue
-                message_checks_one_factor += 1
-
-            if site.check_type == 'status_code':
-                status_checks += 1
+            # Count check types for enabled sites
+            if not site.disabled:
+                if site.check_type == 'message':
+                    if not (site.absence_strs and site.presense_strs):
+                        message_checks_one_factor += 1
+                elif site.check_type == 'status_code':
+                    status_checks += 1
 
+            # Count tags
             if not site.tags:
                 tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
-
             for tag in filter(lambda x: not is_country_tag(x), site.tags):
                 tags[tag] = tags.get(tag, 0) + 1
 
+        # Calculate percentages
+        total_count = len(sites_dict)
         enabled_count = total_count - disabled_count
         enabled_perc = round(100 * enabled_count / total_count, 2)
-        output += (
-            f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%\n\n"
-        )
-
         checks_perc = round(100 * message_checks_one_factor / enabled_count, 2)
-        output += f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)\n\n"
-
         status_checks_perc = round(100 * status_checks / enabled_count, 2)
-        output += f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)\n\n"
 
-        output += (
-            f"False positive risk (total): {checks_perc+status_checks_perc:.2f}%\n\n"
-        )
-
-        top_urls_count = 20
-        output += f"Top {top_urls_count} profile URLs:\n"
-        for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[
-            :top_urls_count
-        ]:
+        # Format output
+        separator = "\n\n"
+        output = [
+            f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%",
+            f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)",
+            f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)",
+            f"False positive risk (total): {checks_perc + status_checks_perc:.2f}%",
+            self._format_top_items("profile URLs", urls, 20, is_markdown),
+            self._format_top_items("tags", tags, 20, is_markdown, self._tags),
+        ]
+
+        return separator.join(output)
+
+    def _format_top_items(self, title, items_dict, limit, is_markdown, valid_items=None):
+        """Helper method to format top items lists"""
+        output = f"Top {limit} {title}:\n"
+        for item, count in sorted(items_dict.items(), key=lambda x: x[1], reverse=True)[:limit]:
             if count == 1:
                 break
-            output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n"
-
-        top_tags_count = 20
-        output += f"\nTop {top_tags_count} tags:\n"
-        for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[
-            :top_tags_count
-        ]:
-            mark = ""
-            if tag not in self._tags:
-                mark = " (non-standard)"
-            output += (
-                f"- ({count})\t`{tag}`{mark}\n"
-                if is_markdown
-                else f"{count}\t{tag}{mark}\n"
-            )
-
+            mark = " (non-standard)" if valid_items is not None and item not in valid_items else ""
+            output += f"- ({count})\t`{item}`{mark}\n" if is_markdown else f"{count}\t{item}{mark}\n"
         return output
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,11 @@ classifiers = [
 "Bug Tracker" = "https://github.com/soxoj/maigret/issues"
 
 [tool.poetry.dependencies]
+# poetry install
+# Install only production dependencies:
+# poetry install --without dev
+# Install with dev dependencies:
+# poetry install --with dev
 python = "^3.10"
 aiodns = "^3.0.0"
 aiohttp = "^3.11.8"
@@ -68,13 +73,18 @@ cloudscraper = "^1.2.71"
 
 
 [tool.poetry.group.dev.dependencies]
+# How to add a new dev dependency: poetry add black --group dev
+# Install dev dependencies with: poetry install --with dev
 flake8 = "^7.1.1"
 pytest = "^7.2.0"
 pytest-asyncio = "^0.23.8"
 pytest-cov = "^6.0.0"
 pytest-httpserver = "^1.0.0"
 pytest-rerunfailures = "^15.0"
 reportlab = "^4.2.0"
+mypy = "^1.13.0"
+tuna = "^0.5.11"
 
 [tool.poetry.scripts]
+# Run with: poetry run maigret <username>
 maigret = "maigret.maigret:run"