From e44ef43531df00431c5d31ef4e0c6350e5364636 Mon Sep 17 00:00:00 2001 From: AnsahMohammad Date: Wed, 5 Feb 2025 22:46:17 +0530 Subject: [PATCH] refactor: improved code optimization --- core/crawler.py | 9 ++++----- core/extractor.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/core/crawler.py b/core/crawler.py index 19feb24..719a399 100644 --- a/core/crawler.py +++ b/core/crawler.py @@ -9,7 +9,7 @@ def handle_starttag(self, tag, attrs): for (key, value) in attrs: if key == 'href': newUrl = parse.urljoin(self.baseUrl, value) - self.links = self.links + [newUrl] + self.links.append(newUrl) def getLinks(self, url): self.links = [] @@ -28,10 +28,9 @@ def spider(url, maxPages): pagesToVisit = [url] numberVisited = 0 foundWord = False - while numberVisited < maxPages and pagesToVisit != [] and not foundWord: - numberVisited = numberVisited +1 - url = pagesToVisit[0] - pagesToVisit = pagesToVisit[1:] + while numberVisited < maxPages and pagesToVisit and not foundWord: + numberVisited += 1 + url = pagesToVisit.pop(0) try: parser = LinkParser() data, links = parser.getLinks(url) diff --git a/core/extractor.py b/core/extractor.py index 02255d3..e8bd3ea 100644 --- a/core/extractor.py +++ b/core/extractor.py @@ -13,7 +13,7 @@ def param_extract(response, level, black_list, placeholder): for i in parsed: delim = i.find('=') - second_delim = i.find('=', i.find('=') + 1) + second_delim = i.find('=', delim + 1) if len(black_list) > 0: words_re = re.compile("|".join(black_list)) if not words_re.search(i):