
Commit

Added a couple of sites, fixed false positives (#286)
soxoj authored Jan 2, 2022
1 parent 8801f7e commit ecabf88
Showing 4 changed files with 189 additions and 24 deletions.
2 changes: 1 addition & 1 deletion maigret/maigret.py
@@ -536,7 +536,7 @@ async def main():
site_data = get_top_sites_for_id(args.id_type)

if args.new_site_to_submit:
- submitter = Submitter(db=db, logger=logger, settings=settings)
+ submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
if is_submitted:
db.save_to_file(db_file)
152 changes: 152 additions & 0 deletions maigret/resources/data.json
@@ -1833,6 +1833,7 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Bestfantasybooks": {
"disabled": true,
"tags": [
"us"
],
@@ -4432,6 +4433,7 @@
]
},
"Facenama": {
"disabled": true,
"tags": [
"ir"
],
@@ -28440,6 +28442,156 @@
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "message",
"alexaRank": 6859
+ },
+ "Worldis.me": {
+   "absenceStrs": [
+     "user_password",
+     "send_email"
+   ],
+   "presenseStrs": [
+     "my_profile",
+     "profile_upi",
+     "UserInfo"
+   ],
+   "url": "http://en.worldis.me/{username}",
+   "urlMain": "http://en.worldis.me",
+   "usernameClaimed": "admin",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 3233509,
+   "tags": [
+     "ru"
+   ]
+ },
+ "photoshop-kopona.com": {
+   "absenceStrs": [
+     "<title>noonewouldeverusethis7 &raquo; \u0420\u0435\u0441\u0443\u0440\u0441\u044b \u0434\u043b\u044f \u0424\u043e\u0442\u043e\u0448\u043e\u043f\u0430</title>"
+   ],
+   "presenseStrs": [
+     "offline",
+     "uspusertitle"
+   ],
+   "url": "https://photoshop-kopona.com/ru/user/{username}/",
+   "urlMain": "https://photoshop-kopona.com",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 44106,
+   "tags": [
+     "ru"
+   ]
+ },
+ "dumskaya.net": {
+   "absenceStrs": [
+     "><img class=nobo src=/banner/ps2_/ alt="
+   ],
+   "presenseStrs": [
+     "><img class=nobo src=/banner/prague_/ alt="
+   ],
+   "url": "https://dumskaya.net/user/{username}/",
+   "urlMain": "https://dumskaya.net",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 73617,
+   "tags": [
+     "ru"
+   ]
+ },
+ "rblx.trade": {
+   "absenceStrs": [
+     "isRblxTradeException"
+   ],
+   "presenseStrs": [
+     "userId"
+   ],
+   "url": "https://rblx.trade/p/{username}",
+   "urlMain": "https://rblx.trade",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 362185,
+   "tags": [
+     "gaming"
+   ]
+ },
+ "monitoringminecraft.ru": {
+   "absenceStrs": [
+     "shadowi"
+   ],
+   "presenseStrs": [
+     "small"
+   ],
+   "url": "https://monitoringminecraft.ru/player/{username}",
+   "urlMain": "https://monitoringminecraft.ru",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 115209,
+   "tags": [
+     "gaming"
+   ]
+ },
+ "profi.ru": {
+   "absenceStrs": [
+     "page-404__paragraph"
+   ],
+   "presenseStrs": [
+     "PROFILE",
+     "profiles",
+     "profileOIO",
+     "fullProfile",
+     "profileUGC2"
+   ],
+   "url": "https://profi.ru/profile/{username}/",
+   "urlMain": "https://profi.ru",
+   "usernameClaimed": "EgorovRV",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 12037,
+   "tags": [
+     "freelance"
+   ]
+ },
+ "app.airnfts.com": {
+   "absenceStrs": [
+     "user-not-found-div"
+   ],
+   "presenseStrs": [
+     "username",
+     "ownerUsername",
+     "creatorUsername",
+     "name",
+     "user"
+   ],
+   "url": "https://app.airnfts.com/creators/{username}",
+   "urlMain": "https://app.airnfts.com",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 30223
+ },
+ "xgm.guru": {
+   "absenceStrs": [
+     ">Username:</label>"
+   ],
+   "presenseStrs": [
+     "email",
+     "usernamereg",
+     "username-top",
+     "\u041e\u043f\u044b\u0442 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f",
+     "check-username"
+   ],
+   "url": "https://xgm.guru/user/{username}",
+   "urlMain": "https://xgm.guru",
+   "usernameClaimed": "test",
+   "usernameUnclaimed": "noonewouldeverusethis7",
+   "checkType": "message",
+   "alexaRank": 692341,
+   "tags": [
+     "forum",
+     "gaming"
+   ]
}
},
"engines": {
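
Each of the new entries uses the "message" check type: maigret fetches the profile URL and inspects the response body for marker strings. A simplified sketch of that decision, assuming dict-shaped entries like the ones above (the real checker in maigret/checking.py also weighs status codes, redirects, and account activation):

    def is_username_claimed(html_text: str, site: dict) -> bool:
        # Any "absence" marker in the page means the profile does not exist.
        if any(s in html_text for s in site.get("absenceStrs", [])):
            return False
        # Otherwise, any "presence" marker suggests an existing profile.
        return any(s in html_text for s in site.get("presenseStrs", []))

The same mechanism explains the false-positive fixes above: rather than retuning the marker strings for Bestfantasybooks and Facenama, the commit sets "disabled": true so those checks are skipped entirely.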
57 changes: 35 additions & 22 deletions maigret/submit.py
@@ -3,6 +3,7 @@
import re
from typing import List
import xml.etree.ElementTree as ET
+ from aiohttp import TCPConnector, ClientSession
import requests

from .activation import import_aiohttp_cookies
@@ -24,11 +25,24 @@ class Submitter:
TOP_FEATURES = 5
URL_RE = re.compile(r"https?://(www\.)?")

- def __init__(self, db: MaigretDatabase, settings: Settings, logger):
+ def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
self.settings = settings
+ self.args = args
self.db = db
self.logger = logger

+ from aiohttp_socks import ProxyConnector
+ proxy = self.args.proxy
+ cookie_jar = None
+ if args.cookie_file:
+     cookie_jar = import_aiohttp_cookies(args.cookie_file)
+
+ connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
+ connector.verify_ssl = False
+ self.session = ClientSession(
+     connector=connector, trust_env=True, cookie_jar=cookie_jar
+ )
+
@staticmethod
def get_alexa_rank(site_url_main):
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
@@ -63,6 +77,7 @@ async def site_self_check(self, site, semaphore, silent=False):
results_dict = await maigret(
username=username,
site_dict={site.name: site},
+ proxy=self.args.proxy,
logger=self.logger,
timeout=30,
id_type=site.type,
@@ -126,20 +141,22 @@ def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
return fields

async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
+ resp_text = ''
try:
- r = requests.get(url_mainpage)
- self.logger.debug(r.text)
+ r = await self.session.get(url_mainpage)
+ resp_text = await r.text()
+ self.logger.debug(resp_text)
except Exception as e:
self.logger.warning(e)
print("Some error while checking main page")
return []

for engine in self.db.engines:
strs_to_check = engine.__dict__.get("presenseStrs")
- if strs_to_check and r and r.text:
+ if strs_to_check and resp_text:
all_strs_in_response = True
for s in strs_to_check:
- if s not in r.text:
+ if s not in resp_text:
all_strs_in_response = False
sites = []
if all_strs_in_response:
@@ -209,32 +226,28 @@ async def check_features_manually(
headers = dict(self.HEADERS)
headers.update(custom_headers)

- # cookies
- cookie_dict = None
- if cookie_file:
-     self.logger.info(f'Use {cookie_file} for cookies')
-     cookie_jar = import_aiohttp_cookies(cookie_file)
-     cookie_dict = {c.key: c.value for c in cookie_jar}
-
- exists_resp = requests.get(
-     url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
+ exists_resp = await self.session.get(
+     url_exists,
+     headers=headers,
+     allow_redirects=redirects,
)
+ exists_resp_text = await exists_resp.text()
self.logger.debug(url_exists)
- self.logger.debug(exists_resp.status_code)
- self.logger.debug(exists_resp.text)
+ self.logger.debug(exists_resp.status)
+ self.logger.debug(exists_resp_text)

- non_exists_resp = requests.get(
+ non_exists_resp = await self.session.get(
url_not_exists,
- cookies=cookie_dict,
headers=headers,
allow_redirects=redirects,
)
+ non_exists_resp_text = await non_exists_resp.text()
self.logger.debug(url_not_exists)
- self.logger.debug(non_exists_resp.status_code)
- self.logger.debug(non_exists_resp.text)
+ self.logger.debug(non_exists_resp.status)
+ self.logger.debug(non_exists_resp_text)

- a = exists_resp.text
- b = non_exists_resp.text
+ a = exists_resp_text
+ b = non_exists_resp_text

tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
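
The submit.py changes above replace blocking requests calls with the shared aiohttp session created in __init__, so proxies and cookie jars are honored everywhere. A minimal standalone illustration of the same before/after pattern (the URL is only an example):

    import asyncio
    from aiohttp import ClientSession, TCPConnector

    async def fetch(url: str):
        # A single reusable session, as Submitter now keeps on self.session
        async with ClientSession(connector=TCPConnector(ssl=False)) as session:
            resp = await session.get(url)
            text = await resp.text()  # aiohttp: .text() is a coroutine
            return resp.status, text  # aiohttp: .status, not .status_code

    status, body = asyncio.run(fetch("https://example.com"))

Note the two renames the diff applies throughout: response.status_code becomes response.status, and the response.text property becomes await response.text().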
2 changes: 1 addition & 1 deletion utils/update_site_data.py
@@ -25,7 +25,7 @@
'100000000': '100M',
})

- SEMAPHORE = threading.Semaphore(10)
+ SEMAPHORE = threading.Semaphore(20)

def get_rank(domain_to_query, site, print_errors=True):
with SEMAPHORE:
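
The bump from Semaphore(10) to Semaphore(20) doubles how many rank lookups may run at once. A small self-contained sketch of how threading.Semaphore enforces that bound (time.sleep stands in for the HTTP request):

    import threading
    import time

    SEMAPHORE = threading.Semaphore(20)  # at most 20 threads inside at once

    def get_rank(domain_to_query):
        with SEMAPHORE:      # the 21st caller blocks until a slot frees up
            time.sleep(0.1)  # stand-in for the network call

    threads = [threading.Thread(target=get_rank, args=(f"site{i}.com",))
               for i in range(100)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()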
