 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/topics/item-pipeline.html

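 # Registration sketch for settings.py (the module path below is an assumption
 # based on this package's layout, not taken from this commit; Scrapy releases
 # of this era take a list of class paths):
 #
 #   ITEM_PIPELINES = [
 #       'musiccrawler.pipelines.GetMusicDownloadLinkStatisticsPipeline',
 #   ]
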
-from musiccrawler.exporters import MongoDBExporter
+from datetime import datetime
+import dateutil.parser
+import json
+from operator import attrgetter
+import pkg_resources
+from pytz import timezone
+import re
+import traceback
+from urllib2 import HTTPError
+
+from apiclient.discovery import build
+from apiclient.errors import HttpError
+import facebook
+import hypem
 from py4j.java_gateway import JavaGateway, logging
 from py4j.protocol import Py4JError
 from scrapy import log, signals
 from scrapy.exceptions import DropItem
-import json
-import re
-import traceback
-import pkg_resources
+import soundcloud
+
+from musiccrawler.exporters import MongoDBExporter
+import musiccrawler.settings
+
+
+class GetMusicDownloadLinkStatisticsPipeline(object):
+    # Enriches each item with statistics gathered from YouTube,
+    # Hype Machine, SoundCloud and Facebook.
+    def __init__(self):
+        self.tz = timezone("Europe/Berlin")
+        self.facebookGraphAPI = facebook.GraphAPI()
+        self.soundcloudAPI = soundcloud.Client(client_id=musiccrawler.settings.SOUNDCLOUD_APP_ID)
+
+        self.youtubeDataAPI = build(musiccrawler.settings.GOOGLE_API_SERVICE_NAME,
+                                    musiccrawler.settings.GOOGLE_API_VERSION,
+                                    developerKey=musiccrawler.settings.GOOGLE_API_KEY)
+
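+    # Scrapy calls process_item() for every item the spiders yield; a pipeline
+    # either returns the (possibly modified) item or raises DropItem to discard it.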
+    def process_item(self, item, spider):
+        videoids = []
+
+        if item is None or item.get("name", None) is None:
+            log.msg("Received empty item (corrupted)", level=log.DEBUG)
+            raise DropItem("Dropped empty item (corrupted)")
+        else:
+            try:
+                search_response_ids = self.youtubeDataAPI.search().list(
+                    q=item["name"],
+                    part="id",
+                    maxResults=musiccrawler.settings.STATISTICS_ITEM_BASE,
+                    type="video"
+                ).execute()
+
+                for search_result in search_response_ids.get("items", []):
+                    if search_result["id"] is not None:
+                        videoids.append(search_result["id"]["videoId"])
+
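+                # search().list with part="id" returns only video IDs; the
+                # batched videos().list call below fetches snippet and statistics.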
+                try:
+                    search_response_videos = self.youtubeDataAPI.videos().list(
+                        id=",".join(videoids),
+                        part="statistics,snippet,id",
+                        maxResults=musiccrawler.settings.STATISTICS_ITEM_BASE
+                    ).execute()
+
+                    if len(search_response_videos.get("items", [])) >= 1:
+                        item["youtube_name"] = search_response_videos.get("items", [])[0]["snippet"]["title"]
+                        item["youtube_date_published"] = dateutil.parser.parse(search_response_videos.get("items", [])[0]["snippet"]["publishedAt"])
+
+                    for search_result in search_response_videos.get("items", []):
+                        if search_result["statistics"] is not None and search_result["snippet"] is not None:
+                            item["youtube_comments"] += int(search_result["statistics"]["commentCount"])
+                            item["youtube_views"] += int(search_result["statistics"]["viewCount"])
+                            item["youtube_favorites"] += int(search_result["statistics"]["favoriteCount"])
+                            item["youtube_dislikes"] += int(search_result["statistics"]["dislikeCount"])
+                            item["youtube_likes"] += int(search_result["statistics"]["likeCount"])
+
+                            # Keep the most recent publishedAt across all matched videos.
+                            if item["youtube_date_published"] < dateutil.parser.parse(search_result["snippet"]["publishedAt"]):
+                                item["youtube_date_published"] = dateutil.parser.parse(search_result["snippet"]["publishedAt"])
+
+                except HttpError, e:
+                    print "An HTTP error occurred while fetching YouTube video data"
+            except HttpError, e:
+                print "An HTTP error occurred while searching YouTube"
+
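+            # Hype Machine results expose dateposted as a Unix timestamp, so it
+            # is converted with datetime.fromtimestamp() and localized to Berlin time.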
+            try:
+                searchresults = hypem.search(item["name"], 1)
+
+                if searchresults is not None and len(searchresults) >= 1:
+                    try:
+                        # Newest post first, so index 0 is the most recent one.
+                        searchresults = sorted(searchresults, key=attrgetter('dateposted'), reverse=True)
+
+                        if len(searchresults) >= 1:
+                            item["hypem_name"] = searchresults[0].artist + " - " + searchresults[0].title
+                            item["hypem_date_published"] = self.tz.localize(datetime.fromtimestamp(searchresults[0].dateposted))
+                            item["hypem_artwork_url"] = searchresults[0].thumb_url_medium
+
+                            for track in searchresults[:musiccrawler.settings.STATISTICS_ITEM_BASE]:
+                                item["hypem_likes"] += track.loved_count
+                                item["hypem_posts"] += track.posted_count
+
+                                if item["hypem_artwork_url"] is None:
+                                    item["hypem_artwork_url"] = track.thumb_url_medium
+
+                                # The Graph API reports 'shares' for some objects
+                                # and 'likes' for others, so both are checked.
+                                if hasattr(track, 'itunes_link'):
+                                    facebook_shares = self.facebookGraphAPI.get_object(track.itunes_link)
+                                    if facebook_shares.get('shares', None) is not None:
+                                        item["facebook_shares"] += facebook_shares['shares']
+                                    elif facebook_shares.get('likes', None) is not None:
+                                        item["facebook_shares"] += facebook_shares['likes']
+
+                                if item["hypem_date_published"] < self.tz.localize(datetime.fromtimestamp(track.dateposted)):
+                                    item["hypem_date_published"] = self.tz.localize(datetime.fromtimestamp(track.dateposted))
+                    except ValueError, e:
+                        print "Corrupt JSON data from hypem"
+                    except HttpError, e:
+                        print "An HTTP error occurred while resolving Facebook shares"
+            except HttpError, e:
+                print "An HTTP error occurred while searching hypem"
+
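+            # SoundCloud /tracks search, sorted newest-first on created_at so
+            # that index 0 is the most recently uploaded match.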
+            searchresults = sorted(self.soundcloudAPI.get('/tracks', q=item["name"], limit=musiccrawler.settings.STATISTICS_ITEM_BASE, filter='public'), key=attrgetter('created_at'), reverse=True)
+
+            if len(searchresults) >= 1:
+                item["soundcloud_name"] = searchresults[0].title
+                item["soundcloud_date_created"] = dateutil.parser.parse(searchresults[0].created_at)
+                item["name_routing"] = searchresults[0].permalink
+                item["soundcloud_genre"] = searchresults[0].genre
+                item["soundcloud_artwork_url"] = searchresults[0].artwork_url
+
+                for track in searchresults:
+                    if hasattr(track, 'permalink_url') and track.permalink_url is not None:
+                        facebook_shares = self.facebookGraphAPI.get_object(track.permalink_url)
+                        if facebook_shares.get('shares', None) is not None:
+                            item["facebook_shares"] += facebook_shares['shares']
+                        elif facebook_shares.get('likes', None) is not None:
+                            item["facebook_shares"] += facebook_shares['likes']
+
+                    if hasattr(track, 'video_url') and track.video_url is not None:
+                        facebook_shares = self.facebookGraphAPI.get_object(track.video_url)
+                        if facebook_shares.get('shares', None) is not None:
+                            item["facebook_shares"] += facebook_shares['shares']
+                        elif facebook_shares.get('likes', None) is not None:
+                            item["facebook_shares"] += facebook_shares['likes']
+
+                    if item["soundcloud_artwork_url"] is None:
+                        item["soundcloud_artwork_url"] = track.artwork_url
+                    if item["soundcloud_genre"] is None:
+                        item["soundcloud_genre"] = track.genre  # TODO: also pick up VK genres
+
+                    item["soundcloud_comments"] += track.comment_count
+                    item["soundcloud_downloads"] += track.download_count
+                    item["soundcloud_likes"] += track.favoritings_count
+                    item["soundcloud_playbacks"] += track.playback_count
+
+                    # Keep the most recent created_at across all matched tracks.
+                    if item["soundcloud_date_created"] < dateutil.parser.parse(track.created_at):
+                        item["soundcloud_date_created"] = dateutil.parser.parse(track.created_at)
+
 
 class CheckMusicDownloadLinkPipeline(object):
     urlregex = re.compile(