From 2f93963a0a7f1fb667170f39027012bf18bf54b1 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sun, 1 Dec 2024 11:41:41 +0100 Subject: [PATCH] Refactored sites module, updated documentation (#1918) --- Makefile | 4 +- docs/source/development.rst | 5 +- maigret/checking.py | 3 +- maigret/sites.py | 126 ++++++++++++++++++++---------------- poetry.lock | 79 +++++++++++++++++++++- pyproject.toml | 10 +++ sites.md | 48 +++++++------- utils/update_site_data.py | 2 +- 8 files changed, 191 insertions(+), 86 deletions(-) diff --git a/Makefile b/Makefile index d091c83a..534ddf9b 100644 --- a/Makefile +++ b/Makefile @@ -16,10 +16,10 @@ lint: flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES} @echo 'mypy' - mypy ${LINT_FILES} + mypy --check-untyped-defs ${LINT_FILES} speed: - time python3 ./maigret.py --version + time python3 -m maigret --version python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))" python3 -X importtime -c "import maigret" 2> maigret-import.log python3 -m tuna maigret-import.log diff --git a/docs/source/development.rst b/docs/source/development.rst index cffcfe18..2ee6a40e 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -33,7 +33,7 @@ Install test requirements: .. code-block:: console - pip install -r test-requirements.txt + poetry install --with dev Use the following commands to check Maigret: @@ -54,6 +54,9 @@ Use the following commands to check Maigret: # open html report open htmlcov/index.html + # get flamechart of imports to estimate startup time + make speed + How to fix false-positives ----------------------------------------------- diff --git a/maigret/checking.py b/maigret/checking.py index 1583c027..ea9aa0e0 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -11,7 +11,6 @@ # Third party imports import aiodns -import alive_progress from alive_progress import alive_bar from aiohttp import ClientSession, TCPConnector, http_exceptions from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError @@ -127,7 +126,7 @@ async def check(self) -> Tuple[str, int, Optional[CheckError]]: async with ClientSession( connector=connector, trust_env=True, - cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None + cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None, ) as session: html_text, status_code, error = await self._make_request( session, diff --git a/maigret/sites.py b/maigret/sites.py index dc4cb50e..9e8dfabe 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -21,6 +21,7 @@ def json(self): class MaigretSite: + # Fields that should not be serialized when converting site to JSON NOT_SERIALIZABLE_FIELDS = [ "name", "engineData", @@ -31,37 +32,65 @@ class MaigretSite: "urlRegexp", ] + # Username known to exist on the site username_claimed = "" + # Username known to not exist on the site username_unclaimed = "" + # Additional URL path component, e.g. /forum in https://example.com/forum/users/{username} url_subpath = "" + # Main site URL (the main page) url_main = "" + # Full URL pattern for username page, e.g. https://example.com/forum/users/{username} url = "" + # Whether site is disabled. Not used by Maigret without --use-disabled argument disabled = False + # Whether a positive result indicates accounts with similar usernames rather than exact matches similar_search = False + # Whether to ignore 403 status codes ignore403 = False + # Site category tags tags: List[str] = [] + # Type of identifier (username, gaia_id etc); see SUPPORTED_IDS in checking.py type = "username" + # Custom HTTP headers headers: Dict[str, str] = {} + # Error message substrings errors: Dict[str, str] = {} + # Site activation requirements activation: Dict[str, Any] = {} + # Regular expression for username validation regex_check = None + # URL to probe site status url_probe = None + # Type of check to perform check_type = "" + # Whether to only send HEAD requests (GET by default) request_head_only = "" + # GET parameters to include in requests get_params: Dict[str, Any] = {} + # Substrings in HTML response that indicate profile exists presense_strs: List[str] = [] + # Substrings in HTML response that indicate profile doesn't exist absence_strs: List[str] = [] + # Site statistics stats: Dict[str, Any] = {} + # Site engine name engine = None + # Engine-specific configuration engine_data: Dict[str, Any] = {} + # Engine instance engine_obj: Optional["MaigretEngine"] = None + # Future for async requests request_future = None + # Alexa traffic rank alexa_rank = None + # Source (in case a site is a mirror of another site) source = None + # URL protocol (http/https) protocol = '' def __init__(self, name, information): @@ -96,20 +125,21 @@ def __is_equal_by_url_or_name(self, url_or_name_str: str): def __eq__(self, other): if isinstance(other, MaigretSite): # Compare only relevant attributes, not internal state like request_future - attrs_to_compare = ['name', 'url_main', 'url_subpath', 'type', 'headers', - 'errors', 'activation', 'regex_check', 'url_probe', - 'check_type', 'request_head_only', 'get_params', - 'presense_strs', 'absence_strs', 'stats', 'engine', - 'engine_data', 'alexa_rank', 'source', 'protocol'] + attrs_to_compare = [ + 'name', 'url_main', 'url_subpath', 'type', 'headers', + 'errors', 'activation', 'regex_check', 'url_probe', + 'check_type', 'request_head_only', 'get_params', + 'presense_strs', 'absence_strs', 'stats', 'engine', + 'engine_data', 'alexa_rank', 'source', 'protocol' + ] return all(getattr(self, attr) == getattr(other, attr) - for attr in attrs_to_compare) + for attr in attrs_to_compare) elif isinstance(other, str): # Compare only by name (exactly) or url_main (partial similarity) return self.__is_equal_by_url_or_name(other) return False - def update_detectors(self): if "url" in self.__dict__: url = self.url @@ -474,78 +504,64 @@ def extract_ids_from_url(self, url: str) -> dict: return results def get_db_stats(self, is_markdown=False): + # Initialize counters sites_dict = self.sites_dict - urls = {} tags = {} - output = "" disabled_count = 0 - total_count = len(sites_dict) - - message_checks = 0 message_checks_one_factor = 0 - status_checks = 0 - for _, site in sites_dict.items(): + # Collect statistics + for site in sites_dict.values(): + # Count disabled sites if site.disabled: disabled_count += 1 + # Count URL types url_type = site.get_url_template() urls[url_type] = urls.get(url_type, 0) + 1 - if site.check_type == 'message' and not site.disabled: - message_checks += 1 - if site.absence_strs and site.presense_strs: - continue - message_checks_one_factor += 1 - - if site.check_type == 'status_code': - status_checks += 1 + # Count check types for enabled sites + if not site.disabled: + if site.check_type == 'message': + if not (site.absence_strs and site.presense_strs): + message_checks_one_factor += 1 + elif site.check_type == 'status_code': + status_checks += 1 + # Count tags if not site.tags: tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1 - for tag in filter(lambda x: not is_country_tag(x), site.tags): tags[tag] = tags.get(tag, 0) + 1 + # Calculate percentages + total_count = len(sites_dict) enabled_count = total_count - disabled_count enabled_perc = round(100 * enabled_count / total_count, 2) - output += ( - f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%\n\n" - ) - checks_perc = round(100 * message_checks_one_factor / enabled_count, 2) - output += f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)\n\n" - status_checks_perc = round(100 * status_checks / enabled_count, 2) - output += f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)\n\n" - output += ( - f"False positive risk (total): {checks_perc+status_checks_perc:.2f}%\n\n" - ) - - top_urls_count = 20 - output += f"Top {top_urls_count} profile URLs:\n" - for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[ - :top_urls_count - ]: + # Format output + separator = "\n\n" + output = [ + f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%", + f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)", + f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)", + f"False positive risk (total): {checks_perc + status_checks_perc:.2f}%", + self._format_top_items("profile URLs", urls, 20, is_markdown), + self._format_top_items("tags", tags, 20, is_markdown, self._tags), + ] + + return separator.join(output) + + def _format_top_items(self, title, items_dict, limit, is_markdown, valid_items=None): + """Helper method to format top items lists""" + output = f"Top {limit} {title}:\n" + for item, count in sorted(items_dict.items(), key=lambda x: x[1], reverse=True)[:limit]: if count == 1: break - output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n" - - top_tags_count = 20 - output += f"\nTop {top_tags_count} tags:\n" - for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[ - :top_tags_count - ]: - mark = "" - if tag not in self._tags: - mark = " (non-standard)" - output += ( - f"- ({count})\t`{tag}`{mark}\n" - if is_markdown - else f"{count}\t{tag}{mark}\n" - ) - + mark = " (non-standard)" if valid_items is not None and item not in valid_items else "" + output += f"- ({count})\t`{item}`{mark}\n" if is_markdown else f"{count}\t{item}{mark}\n" return output diff --git a/poetry.lock b/poetry.lock index 3c228b3a..09bedace 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "about-time" @@ -1377,6 +1377,70 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} +[[package]] +name = "mypy" +version = "1.13.0" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, +] + +[package.dependencies] +mypy-extensions = ">=1.0.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=4.6.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] +install-types = ["pip"] +mypyc = ["setuptools (>=50)"] +reports = ["lxml"] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + [[package]] name = "networkx" version = "2.8.8" @@ -2546,6 +2610,17 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "tuna" +version = "0.5.11" +description = "Visualize Python performance profiles" +optional = false +python-versions = ">=3.6" +files = [ + {file = "tuna-0.5.11-py3-none-any.whl", hash = "sha256:ab352a6d836014ace585ecd882148f1f7c68be9ea4bf9e9298b7127594dab2ef"}, + {file = "tuna-0.5.11.tar.gz", hash = "sha256:d47f3e39e80af961c8df016ac97d1643c3c60b5eb451299da0ab5fe411d8866c"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -2791,4 +2866,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "988c92f025a627b4c9394bf17872e6c9e506b8fa8070d51f830e92abf48aa530" +content-hash = "e521713c426ae38d26d975fbd47ad5159e393d6532be845af169e03b7421fd40" diff --git a/pyproject.toml b/pyproject.toml index 2ef6243d..8bc34e2c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,11 @@ classifiers = [ "Bug Tracker" = "https://github.com/soxoj/maigret/issues" [tool.poetry.dependencies] +# poetry install +# Install only production dependencies: +# poetry install --without dev +# Install with dev dependencies: +# poetry install --with dev python = "^3.10" aiodns = "^3.0.0" aiohttp = "^3.11.8" @@ -68,6 +73,8 @@ cloudscraper = "^1.2.71" [tool.poetry.group.dev.dependencies] +# How to add a new dev dependency: poetry add black --group dev +# Install dev dependencies with: poetry install --with dev flake8 = "^7.1.1" pytest = "^7.2.0" pytest-asyncio = "^0.23.8" @@ -75,6 +82,9 @@ pytest-cov = "^6.0.0" pytest-httpserver = "^1.0.0" pytest-rerunfailures = "^15.0" reportlab = "^4.2.0" +mypy = "^1.13.0" +tuna = "^0.5.11" [tool.poetry.scripts] +# Run with: poetry run maigret maigret = "maigret.maigret:run" diff --git a/sites.md b/sites.md index 0f900576..bc34463f 100644 --- a/sites.md +++ b/sites.md @@ -3130,16 +3130,17 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://mynickname.com) [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M* -The list was updated at (2024-11-29 UTC) +The list was updated at (2024-11-30) + ## Statistics Enabled/total sites: 2693/3126 = 86.15% Incomplete message checks: 404/2693 = 15.0% (false positive risks) -Status code checks: 720/2693 = 26.74% (false positive risks) +Status code checks: 618/2694 = 22.94% (false positive risks) -False positive risk (total): 41.74% +False positive risk (total): 37.97% Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` @@ -3163,24 +3164,25 @@ Top 20 profile URLs: - (17) `/forum/members/?username={username}` - (17) `/search.php?keywords=&terms=all&author={username}` + Top 20 tags: -- (327) `NO_TAGS` (non-standard) -- (307) `forum` -- (50) `gaming` -- (26) `coding` -- (21) `photo` -- (20) `blog` -- (19) `news` -- (15) `music` -- (14) `tech` -- (12) `freelance` -- (12) `finance` -- (11) `sharing` -- (10) `dating` -- (10) `art` -- (10) `shopping` -- (10) `movies` -- (8) `hobby` -- (8) `crypto` -- (7) `sport` -- (7) `hacking` +- (1104) `NO_TAGS` (non-standard) +- (735) `forum` +- (80) `gaming` +- (48) `photo` +- (41) `coding` +- (30) `tech` +- (29) `news` +- (27) `blog` +- (23) `music` +- (18) `finance` +- (18) `crypto` +- (17) `sharing` +- (16) `freelance` +- (15) `art` +- (15) `shopping` +- (13) `sport` +- (13) `business` +- (12) `movies` +- (11) `hobby` +- (11) `education` diff --git a/utils/update_site_data.py b/utils/update_site_data.py index a4f5a054..dd275fef 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -137,7 +137,7 @@ def get_readable_rank(r): site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n') db.update_site(site) - site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()} UTC)\n') + site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n') db.save_to_file(args.base_file) statistics_text = db.get_db_stats(is_markdown=True)