Skip to content

Commit 7a2603b

Browse files
committed
work in progress
1 parent 961745d commit 7a2603b

File tree

8 files changed

+182
-13
lines changed

8 files changed

+182
-13
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,7 @@ settings.py
2121
scrapy.cfg
2222
config
2323
*.cfg
24+
25+
local_settings.py
26+
27+
local_settings.py

musiccrawler/exporters.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ def __init__(self):
1010
connection = pymongo.Connection(musiccrawler.settings.MONGODB_SERVER, musiccrawler.settings.MONGODB_PORT, tz_aware=True)
1111
self.db = connection[musiccrawler.settings.MONGODB_DB]
1212
log.msg("Authenticating to MongoDB", level=log.DEBUG)
13-
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
13+
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD') and musiccrawler.settings.MONGODB_USER is not None:
14+
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
1415
self.collection = self.db[musiccrawler.settings.MONGODB_COLLECTION]
1516
if self.__get_uniq_key() is not None:
1617
self.collection.create_index(self.__get_uniq_key(), unique=True)

musiccrawler/items.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,26 @@ class DownloadLinkItem(Item):
2222
metainfo = Field()
2323
aid = Field()
2424
oid = Field()
25+
youtube_name = Field()
26+
youtube_likes = Field()
27+
youtube_comments = Field()
28+
youtube_views = Field()
29+
youtube_dislikes = Field()
30+
youtube_favorites = Field()
31+
youtube_date_published = Field()
32+
hypem_likes = Field()
33+
hypem_posts = Field()
34+
hypem_date_published = Field()
35+
hypem_name = Field()
36+
hypem_artwork_url = Field()
37+
soundcloud_likes = Field()
38+
soundcloud_comments = Field()
39+
soundcloud_downloads = Field()
40+
soundcloud_playbacks = Field()
41+
soundcloud_genre = Field()
42+
soundcloud_name = Field()
43+
soundcloud_date_created = Field()
44+
soundcloud_artwork_url = Field()
45+
facebook_shares = Field()
46+
name_routing = Field()
47+
facebook_shares = Field()

musiccrawler/pipelines.py

Lines changed: 148 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,158 @@
33
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
44
# See: http://doc.scrapy.org/topics/item-pipeline.html
55

6-
from musiccrawler.exporters import MongoDBExporter
6+
from datetime import datetime
7+
import dateutil.parser
8+
import json
9+
from operator import attrgetter
10+
import pkg_resources
11+
from pytz import timezone
12+
import re
13+
import traceback
14+
from urllib2 import HTTPError
15+
16+
from apiclient.discovery import build
17+
from apiclient.errors import HttpError
18+
import facebook
19+
import hypem
720
from py4j.java_gateway import JavaGateway, logging
821
from py4j.protocol import Py4JError
922
from scrapy import log, signals
1023
from scrapy.exceptions import DropItem
11-
import json
12-
import re
13-
import traceback
14-
import pkg_resources
24+
import soundcloud
25+
26+
from musiccrawler.exporters import MongoDBExporter
27+
import musiccrawler.settings
28+
29+
30+
class GetMusicDownloadLinkStatisticsPipeline(object):
    """Scrapy item pipeline that attaches popularity statistics to an item.

    For every item that carries a ``name`` the pipeline searches YouTube,
    hypem and SoundCloud for that name, sums the per-result counters into the
    item's ``youtube_*`` / ``hypem_*`` / ``soundcloud_*`` fields and adds the
    Facebook share (or like) counts of the matched track URLs to
    ``facebook_shares``.
    """

    # Fields that are accumulated with ``+=``. A Scrapy Item raises KeyError
    # when an unset field is read, so they must exist before the first add.
    _COUNTER_FIELDS = (
        "youtube_comments",
        "youtube_views",
        "youtube_favorites",
        "youtube_dislikes",
        "youtube_likes",
        "hypem_likes",
        "hypem_posts",
        "soundcloud_comments",
        "soundcloud_downloads",
        "soundcloud_likes",
        "soundcloud_playbacks",
        "facebook_shares",
    )

    # (item field, key inside a YouTube video's "statistics" object)
    _YOUTUBE_COUNTERS = (
        ("youtube_comments", "commentCount"),
        ("youtube_views", "viewCount"),
        ("youtube_favorites", "favoriteCount"),
        ("youtube_dislikes", "dislikeCount"),
        ("youtube_likes", "likeCount"),
    )

    def __init__(self):
        """Create the API clients used by ``process_item``.

        API identifiers and keys are read from ``musiccrawler.settings``.
        """
        settings = musiccrawler.settings
        # hypem ``dateposted`` timestamps are localized to this zone.
        self.tz = timezone("Europe/Berlin")
        self.facebookGraphAPI = facebook.GraphAPI()
        self.soundcloudAPI = soundcloud.Client(client_id=settings.SOUNDCLOUD_APP_ID)
        self.youtubeDataAPI = build(
            settings.GOOGLE_API_SERVICE_NAME,
            settings.GOOGLE_API_VERSION,
            developerKey=settings.GOOGLE_API_KEY)

    def process_item(self, item, spider):
        """Collect statistics for ``item`` and hand it to the next pipeline.

        Args:
            item: the scraped item; must provide a non-None ``name``.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, with the statistics fields filled in.

        Raises:
            DropItem: if the item is None or has no ``name``.
        """
        if item is None or item.get("name", None) is None:
            log.msg(("Received empty itemp (corrupted)"), level=log.DEBUG)
            raise DropItem("Dropped empty item (corrupted)")

        # Start every counter at zero unless the item already carries a value.
        for field in self._COUNTER_FIELDS:
            if item.get(field) is None:
                item[field] = 0

        self._collect_youtube_statistics(item)
        self._collect_hypem_statistics(item)
        self._collect_soundcloud_statistics(item)

        # A pipeline stage has to return the item, otherwise the following
        # stages receive None.
        return item

    @staticmethod
    def _to_count(value):
        """Return ``value`` as an int, treating a missing (None) value as 0."""
        return 0 if value is None else int(value)

    def _add_facebook_shares(self, item, url):
        """Add the Facebook share count of ``url`` (or its likes) to the item."""
        graph_object = self.facebookGraphAPI.get_object(url)
        amount = graph_object.get('shares', None)
        if amount is None:
            # Fall back to likes only when no share count is present.
            amount = graph_object.get('likes', None)
        if amount is not None:
            item["facebook_shares"] += amount

    def _collect_youtube_statistics(self, item):
        """Sum the statistics of the YouTube videos found for the item name."""
        limit = musiccrawler.settings.STATISTICS_ITEM_BASE
        try:
            found = self.youtubeDataAPI.search().list(
                q=item["name"],
                part="id",
                maxResults=limit,
                type="video"
            ).execute()

            video_ids = []
            for result in found.get("items", []):
                video_id = (result.get("id") or {}).get("videoId")
                if video_id is not None:
                    video_ids.append(video_id)
            if not video_ids:
                # Nothing matched; do not issue a lookup with an empty id list.
                return

            videos = self.youtubeDataAPI.videos().list(
                id=",".join(video_ids),
                part="statistics,snippet,id",
                maxResults=limit
            ).execute().get("items", [])
        except HttpError as e:
            log.msg("An HTTP error occured: %s" % e, level=log.WARNING)
            return

        if not videos:
            return

        item["youtube_name"] = videos[0]["snippet"]["title"]
        item["youtube_date_published"] = dateutil.parser.parse(
            videos[0]["snippet"]["publishedAt"])

        for video in videos:
            statistics = video.get("statistics")
            snippet = video.get("snippet")
            if statistics is None or snippet is None:
                continue

            # Individual counters can be absent from the statistics object;
            # count those as zero instead of failing the whole item.
            for field, key in self._YOUTUBE_COUNTERS:
                item[field] += self._to_count(statistics.get(key))

            # Keep the most recent publication date among the results.
            published = dateutil.parser.parse(snippet["publishedAt"])
            if item["youtube_date_published"] < published:
                item["youtube_date_published"] = published

    def _collect_hypem_statistics(self, item):
        """Sum the statistics of the hypem tracks found for the item name."""
        limit = musiccrawler.settings.STATISTICS_ITEM_BASE
        try:
            tracks = hypem.search(item["name"], 1)
            if not tracks:
                return
            # Newest post first, so the first entry provides name and artwork.
            tracks = sorted(tracks, key=attrgetter('dateposted'), reverse=True)

            newest = tracks[0]
            item["hypem_name"] = newest.artist + " - " + newest.title
            item["hypem_date_published"] = self.tz.localize(
                datetime.fromtimestamp(newest.dateposted))
            item["hypem_artwork_url"] = newest.thumb_url_medium

            for track in tracks[:limit]:
                item["hypem_likes"] += self._to_count(track.loved_count)
                item["hypem_posts"] += self._to_count(track.posted_count)

                if item["hypem_artwork_url"] is None:
                    item["hypem_artwork_url"] = track.thumb_url_medium

                itunes_link = getattr(track, 'itunes_link', None)
                if itunes_link is not None:
                    self._add_facebook_shares(item, itunes_link)

                posted = self.tz.localize(datetime.fromtimestamp(track.dateposted))
                if item["hypem_date_published"] < posted:
                    item["hypem_date_published"] = posted
        except ValueError as e:
            log.msg("Corrupt JSON data from hypem: %s" % e, level=log.WARNING)
        except (HttpError, HTTPError) as e:
            # NOTE(review): the original only caught apiclient's HttpError
            # here, while the module also imports urllib2's HTTPError without
            # using it. Both are caught on the assumption that hypem reports
            # HTTP failures through urllib2 -- confirm against the hypem client.
            log.msg("An HTTP error occured: %s" % e, level=log.WARNING)

    def _collect_soundcloud_statistics(self, item):
        """Sum the statistics of the SoundCloud tracks found for the item name."""
        tracks = sorted(
            self.soundcloudAPI.get(
                '/tracks',
                q=item["name"],
                limit=musiccrawler.settings.STATISTICS_ITEM_BASE,
                filter='public'),
            key=attrgetter('created_at'),
            reverse=True)
        if not tracks:
            return

        newest = tracks[0]
        item["soundcloud_name"] = newest.title
        item["soundcloud_date_created"] = dateutil.parser.parse(newest.created_at)
        item["name_routing"] = newest.permalink
        item["soundcloud_genre"] = newest.genre
        item["soundcloud_artwork_url"] = newest.artwork_url

        for track in tracks:
            for url_attribute in ('permalink_url', 'video_url'):
                url = getattr(track, url_attribute, None)
                if url is not None:
                    self._add_facebook_shares(item, url)

            if item["soundcloud_artwork_url"] is None:
                item["soundcloud_artwork_url"] = track.artwork_url
            if item["soundcloud_genre"] is None:
                # TODO: also take the VK genres into account
                item["soundcloud_genre"] = track.genre

            item["soundcloud_comments"] += self._to_count(
                getattr(track, 'comment_count', None))
            item["soundcloud_downloads"] += self._to_count(
                getattr(track, 'download_count', None))
            item["soundcloud_likes"] += self._to_count(
                getattr(track, 'favoritings_count', None))
            item["soundcloud_playbacks"] += self._to_count(
                getattr(track, 'playback_count', None))

            # Compare parsed datetimes only. The original first overwrote the
            # field with the raw ``created_at`` string, which made the
            # comparison below a str-vs-datetime comparison.
            created = dateutil.parser.parse(track.created_at)
            if item["soundcloud_date_created"] < created:
                item["soundcloud_date_created"] = created
157+
15158

16159
class CheckMusicDownloadLinkPipeline(object):
17160
urlregex = re.compile(

musiccrawler/spiders/cleanupspider.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,12 @@ def __init__(self, **kwargs):
1515
self.start_urls = ['http://soundcloud.com']
1616
connection = pymongo.Connection(musiccrawler.settings.MONGODB_SERVER, musiccrawler.settings.MONGODB_PORT, tz_aware=True)
1717
self.db = connection[musiccrawler.settings.MONGODB_DB]
18-
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD'):
18+
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD') and musiccrawler.settings.MONGODB_USER is not None:
1919
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
2020
self.links = self.db['links']
2121
log.msg("Removing " + str(self.links.find({'status': 'off', 'date_published': {'$lte': (datetime.now()-timedelta(days=90))}}).count()) + " links from Database that are OFFLINE OR UNKNOWN and older than 90 days.", level=log.INFO)
2222
self.links.remove({'status': 'off', 'date_published': {'$lte': (datetime.now()-timedelta(days=90))}},False)
2323
self.links.remove({'status': 'unknown', 'date_published': {'$lte': (datetime.now()-timedelta(days=90))}},False)
24-
log.msg("Removing " + str(self.links.find({'date_published': {'$lte': (datetime.now()-timedelta(days=365))}}).count()) + " links from Database that are older than one year.", level=log.INFO)
25-
self.links.remove({'date_published': {'$lte': (datetime.now()-timedelta(days=365))}},False)
2624
self.unknownlinks = list(self.links.find({'$query':{"status": 'unknown',"url": { "$not": re.compile(".*dwmp3\.com.*") } },'$orderby': {"url": 1}}))
2725
log.msg("Received " + str(len(self.unknownlinks)) + " UNKNOWN links from Database", level=log.INFO)
2826

musiccrawler/spiders/facebookgroupspider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def __init__(self, **kwargs):
2121
dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
2222
connection = pymongo.Connection(musiccrawler.settings.MONGODB_SERVER, musiccrawler.settings.MONGODB_PORT, tz_aware=True)
2323
self.db = connection[musiccrawler.settings.MONGODB_DB]
24-
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD'):
24+
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD') and musiccrawler.settings.MONGODB_USER is not None:
2525
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
2626
self.collection = self.db['sites']
2727
self.site = self.collection.find_one({"feedurl": kwargs.get('feedurl')})

musiccrawler/spiders/feedspider.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(self, **kwargs):
3535
dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
3636
connection = pymongo.Connection(musiccrawler.settings.MONGODB_SERVER, musiccrawler.settings.MONGODB_PORT, tz_aware=True)
3737
self.db = connection[musiccrawler.settings.MONGODB_DB]
38-
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD'):
38+
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD') and musiccrawler.settings.MONGODB_USER is not None:
3939
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
4040
self.collection = self.db['sites']
4141
self.site = self.collection.find_one({"feedurl": kwargs.get('feedurl')})
@@ -205,7 +205,7 @@ def parse_entry_html(self, response):
205205
request.meta['date_published'] = response.meta['date_published']
206206
request.meta['entry_title'] = response.meta['entry_title']
207207
yield request
208-
208+
209209
for regexpr in self.regexes:
210210
iterator = regexpr.finditer(response.body)
211211
for match in iterator:

musiccrawler/spiders/vkontaktespider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(self, **kwargs):
3535
dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
3636
connection = pymongo.Connection(musiccrawler.settings.MONGODB_SERVER, musiccrawler.settings.MONGODB_PORT, tz_aware=True)
3737
self.db = connection[musiccrawler.settings.MONGODB_DB]
38-
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD'):
38+
if musiccrawler.settings.__dict__.has_key('MONGODB_USER') and musiccrawler.settings.__dict__.has_key('MONGODB_PASSWORD') and musiccrawler.settings.MONGODB_USER is not None:
3939
self.db.authenticate(musiccrawler.settings.MONGODB_USER, musiccrawler.settings.MONGODB_PASSWORD)
4040
self.collection = self.db['sites']
4141
self.site = self.collection.find_one({"feedurl": kwargs.get('feedurl')})

0 commit comments

Comments
 (0)