Skip to content

Commit

Permalink
Refactored sites module, updated documentation (#1918)
Browse files Browse the repository at this point in the history
  • Loading branch information
soxoj authored Dec 1, 2024
1 parent 5073cef commit 2f93963
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 86 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ lint:
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES}

@echo 'mypy'
mypy ${LINT_FILES}
mypy --check-untyped-defs ${LINT_FILES}

speed:
time python3 ./maigret.py --version
time python3 -m maigret --version
python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))"
python3 -X importtime -c "import maigret" 2> maigret-import.log
python3 -m tuna maigret-import.log
Expand Down
5 changes: 4 additions & 1 deletion docs/source/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Install test requirements:

.. code-block:: console
pip install -r test-requirements.txt
poetry install --with dev
Use the following commands to check Maigret:
Expand All @@ -54,6 +54,9 @@ Use the following commands to check Maigret:
# open html report
open htmlcov/index.html
# get flamechart of imports to estimate startup time
make speed
How to fix false-positives
-----------------------------------------------
Expand Down
3 changes: 1 addition & 2 deletions maigret/checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

# Third party imports
import aiodns
import alive_progress
from alive_progress import alive_bar
from aiohttp import ClientSession, TCPConnector, http_exceptions
from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError
Expand Down Expand Up @@ -127,7 +126,7 @@ async def check(self) -> Tuple[str, int, Optional[CheckError]]:
async with ClientSession(
connector=connector,
trust_env=True,
cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None
cookie_jar=self.cookie_jar.copy() if self.cookie_jar else None,
) as session:
html_text, status_code, error = await self._make_request(
session,
Expand Down
126 changes: 71 additions & 55 deletions maigret/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def json(self):


class MaigretSite:
# Fields that should not be serialized when converting site to JSON
NOT_SERIALIZABLE_FIELDS = [
"name",
"engineData",
Expand All @@ -31,37 +32,65 @@ class MaigretSite:
"urlRegexp",
]

# Username known to exist on the site
username_claimed = ""
# Username known to not exist on the site
username_unclaimed = ""
# Additional URL path component, e.g. /forum in https://example.com/forum/users/{username}
url_subpath = ""
# Main site URL (the main page)
url_main = ""
# Full URL pattern for username page, e.g. https://example.com/forum/users/{username}
url = ""
# Whether site is disabled. Not used by Maigret without --use-disabled argument
disabled = False
# Whether a positive result indicates accounts with similar usernames rather than exact matches
similar_search = False
# Whether to ignore 403 status codes
ignore403 = False
# Site category tags
tags: List[str] = []

# Type of identifier (username, gaia_id etc); see SUPPORTED_IDS in checking.py
type = "username"
# Custom HTTP headers
headers: Dict[str, str] = {}
# Error message substrings
errors: Dict[str, str] = {}
# Site activation requirements
activation: Dict[str, Any] = {}
# Regular expression for username validation
regex_check = None
# URL to probe site status
url_probe = None
# Type of check to perform
check_type = ""
# Whether to only send HEAD requests (GET by default)
request_head_only = ""
# GET parameters to include in requests
get_params: Dict[str, Any] = {}

# Substrings in HTML response that indicate profile exists
presense_strs: List[str] = []
# Substrings in HTML response that indicate profile doesn't exist
absence_strs: List[str] = []
# Site statistics
stats: Dict[str, Any] = {}

# Site engine name
engine = None
# Engine-specific configuration
engine_data: Dict[str, Any] = {}
# Engine instance
engine_obj: Optional["MaigretEngine"] = None
# Future for async requests
request_future = None
# Alexa traffic rank
alexa_rank = None
# Source (in case a site is a mirror of another site)
source = None

# URL protocol (http/https)
protocol = ''

def __init__(self, name, information):
Expand Down Expand Up @@ -96,20 +125,21 @@ def __is_equal_by_url_or_name(self, url_or_name_str: str):
def __eq__(self, other):
if isinstance(other, MaigretSite):
# Compare only relevant attributes, not internal state like request_future
attrs_to_compare = ['name', 'url_main', 'url_subpath', 'type', 'headers',
'errors', 'activation', 'regex_check', 'url_probe',
'check_type', 'request_head_only', 'get_params',
'presense_strs', 'absence_strs', 'stats', 'engine',
'engine_data', 'alexa_rank', 'source', 'protocol']
attrs_to_compare = [
'name', 'url_main', 'url_subpath', 'type', 'headers',
'errors', 'activation', 'regex_check', 'url_probe',
'check_type', 'request_head_only', 'get_params',
'presense_strs', 'absence_strs', 'stats', 'engine',
'engine_data', 'alexa_rank', 'source', 'protocol'
]

return all(getattr(self, attr) == getattr(other, attr)
for attr in attrs_to_compare)
for attr in attrs_to_compare)
elif isinstance(other, str):
# Compare only by name (exactly) or url_main (partial similarity)
return self.__is_equal_by_url_or_name(other)
return False


def update_detectors(self):
if "url" in self.__dict__:
url = self.url
Expand Down Expand Up @@ -474,78 +504,64 @@ def extract_ids_from_url(self, url: str) -> dict:
return results

def get_db_stats(self, is_markdown=False):
# Initialize counters
sites_dict = self.sites_dict

urls = {}
tags = {}
output = ""
disabled_count = 0
total_count = len(sites_dict)

message_checks = 0
message_checks_one_factor = 0

status_checks = 0

for _, site in sites_dict.items():
# Collect statistics
for site in sites_dict.values():
# Count disabled sites
if site.disabled:
disabled_count += 1

# Count URL types
url_type = site.get_url_template()
urls[url_type] = urls.get(url_type, 0) + 1

if site.check_type == 'message' and not site.disabled:
message_checks += 1
if site.absence_strs and site.presense_strs:
continue
message_checks_one_factor += 1

if site.check_type == 'status_code':
status_checks += 1
# Count check types for enabled sites
if not site.disabled:
if site.check_type == 'message':
if not (site.absence_strs and site.presense_strs):
message_checks_one_factor += 1
elif site.check_type == 'status_code':
status_checks += 1

# Count tags
if not site.tags:
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1

for tag in filter(lambda x: not is_country_tag(x), site.tags):
tags[tag] = tags.get(tag, 0) + 1

# Calculate percentages
total_count = len(sites_dict)
enabled_count = total_count - disabled_count
enabled_perc = round(100 * enabled_count / total_count, 2)
output += (
f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%\n\n"
)

checks_perc = round(100 * message_checks_one_factor / enabled_count, 2)
output += f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)\n\n"

status_checks_perc = round(100 * status_checks / enabled_count, 2)
output += f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)\n\n"

output += (
f"False positive risk (total): {checks_perc+status_checks_perc:.2f}%\n\n"
)

top_urls_count = 20
output += f"Top {top_urls_count} profile URLs:\n"
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[
:top_urls_count
]:
# Format output
separator = "\n\n"
output = [
f"Enabled/total sites: {enabled_count}/{total_count} = {enabled_perc}%",
f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)",
f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)",
f"False positive risk (total): {checks_perc + status_checks_perc:.2f}%",
self._format_top_items("profile URLs", urls, 20, is_markdown),
self._format_top_items("tags", tags, 20, is_markdown, self._tags),
]

return separator.join(output)

def _format_top_items(self, title, items_dict, limit, is_markdown, valid_items=None):
"""Helper method to format top items lists"""
output = f"Top {limit} {title}:\n"
for item, count in sorted(items_dict.items(), key=lambda x: x[1], reverse=True)[:limit]:
if count == 1:
break
output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n"

top_tags_count = 20
output += f"\nTop {top_tags_count} tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[
:top_tags_count
]:
mark = ""
if tag not in self._tags:
mark = " (non-standard)"
output += (
f"- ({count})\t`{tag}`{mark}\n"
if is_markdown
else f"{count}\t{tag}{mark}\n"
)

mark = " (non-standard)" if valid_items is not None and item not in valid_items else ""
output += f"- ({count})\t`{item}`{mark}\n" if is_markdown else f"{count}\t{item}{mark}\n"
return output
79 changes: 77 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ classifiers = [
"Bug Tracker" = "https://github.com/soxoj/maigret/issues"

[tool.poetry.dependencies]
# poetry install
# Install only production dependencies:
# poetry install --without dev
# Install with dev dependencies:
# poetry install --with dev
python = "^3.10"
aiodns = "^3.0.0"
aiohttp = "^3.11.8"
Expand Down Expand Up @@ -68,13 +73,18 @@ cloudscraper = "^1.2.71"


[tool.poetry.group.dev.dependencies]
# How to add a new dev dependency: poetry add black --group dev
# Install dev dependencies with: poetry install --with dev
flake8 = "^7.1.1"
pytest = "^7.2.0"
pytest-asyncio = "^0.23.8"
pytest-cov = "^6.0.0"
pytest-httpserver = "^1.0.0"
pytest-rerunfailures = "^15.0"
reportlab = "^4.2.0"
mypy = "^1.13.0"
tuna = "^0.5.11"

[tool.poetry.scripts]
# Run with: poetry run maigret <username>
maigret = "maigret.maigret:run"
Loading

0 comments on commit 2f93963

Please sign in to comment.