Skip to content

Commit 0d76122

Browse files
committed
badlinks filtering
1 parent ad8cf56 commit 0d76122

File tree

3 files changed

+7
-1
lines changed

3 files changed

+7
-1
lines changed

musiccrawler/config/badlinks.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
http://www.dwmp3.com/wp-content;http://www.dwmp3.com/xmlrpc;http://www.dwmp3.com/wp-content;http://www.dwmp3.com/wp-includes;http://www.dwmp3.com/feed;http://www.dwmp3.com/dmca;http://www.dwmp3.com/2013;http://www.dwmp3.com/tag

musiccrawler/pipelines.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,15 @@ def process_item(self, item, spider):
8484
return item
8585

8686
class BadFilesPipeline(object):
    """Item pipeline that drops items whose URL is an image or is blacklisted.

    The blacklist is loaded once, at construction time, from the
    ``badlinks.cfg`` resource of the ``musiccrawler.config`` package. That
    file holds URLs separated by ``;``.
    """

    def __init__(self):
        # resource_string() returns the *contents* of the resource, not a
        # filesystem path, so it must not be handed to open().
        raw = pkg_resources.resource_string('musiccrawler.config', "badlinks.cfg")
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        # Strip surrounding whitespace (e.g. the file's trailing newline) and
        # skip empty entries so that every stored URL can actually match.
        self.badlinks = set(
            entry.strip() for entry in raw.split(";") if entry.strip()
        )

    def process_item(self, item, spider):
        """Return *item* unchanged, or raise ``DropItem`` for unwanted URLs.

        An item is dropped when its ``url`` ends with ``.jpg``, ``.png`` or
        ``.gif``, or when the URL is equal to one of the blacklist entries.
        """
        url = str(item['url'])
        if url.endswith((".jpg", ".png", ".gif")):
            log.msg(("Bad Item:" + str(item)), level=log.DEBUG)
            raise DropItem("Bad Link-URL found: %s" % item['url'])
        # NOTE(review): this is an exact-match test. The entries in
        # badlinks.cfg look like URL prefixes -- confirm whether a
        # startswith() comparison was intended.
        if url in self.badlinks:
            raise DropItem("Bad Link-URL found: %s" % item['url'])
        return item
9398

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
name='musiccrawler',
77
version='1.0',
88
packages=find_packages(),
9-
package_data={'musiccrawler.config': ['*.json']},
9+
package_data={'musiccrawler.config': ['*.json','*.cfg']},
1010
entry_points={'scrapy': ['settings = musiccrawler.settings']},
1111
zip_safe=False
1212
)

0 commit comments

Comments
 (0)