-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathgather.py
executable file
·112 lines (84 loc) · 2.99 KB
/
gather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
__author__ = 'Tony Beltramelli www.tonybeltramelli.com - 09/07/2016'
import argparse
import os
import urllib2
import re
import codecs
from threading import Thread
from HTMLParser import HTMLParser
# Host name (with trailing slash) of the lyrics site. Used both to build
# request URLs and to recognise/strip the site prefix in scraped hrefs.
DOMAIN = "songmeanings.com/"
# Path under DOMAIN to which an artist identifier is appended when requesting
# an artist page (presumably the artist's song listing - verify against site).
ARTIST_PATH = 'artist/view/songs/'
def start_new_thread(task, arg):
    """Start a new thread that calls ``task(arg)``.

    The thread is started immediately. No handle is returned, so the caller
    cannot join it through this function.
    """
    Thread(target=task, args=(arg,)).start()
def write_to_file(path, data):
    """Append ``data`` and a trailing newline to the UTF-8 file at ``path``.

    Args:
        path: File to append to; it is created if it does not exist.
        data: Text to write. A byte string is accepted and is decoded as
            UTF-8 first.

    Raises:
        UnicodeDecodeError: If ``data`` is a byte string that is not valid
            UTF-8.
        IOError/OSError: If the file cannot be opened or written.
    """
    if isinstance(data, bytes):
        # The codecs writer does the encoding itself, so hand it text.
        # Passing already-encoded bytes makes it encode a second time,
        # which fails for anything outside ASCII.
        data = data.decode('utf-8')

    # ``with`` closes the file even if the write raises.
    with codecs.open(path, 'a', 'utf_8') as output_file:
        output_file.write(data + u"\n")
def get_url(path, arg = ""):
    """Build an absolute ``http://`` URL on DOMAIN.

    Args:
        path: Path relative to DOMAIN.
        arg: Optional suffix appended verbatim after ``path``.

    Returns:
        The concatenation ``'http://' + DOMAIN + path + arg``.
    """
    site_root = 'http://' + DOMAIN
    return site_root + path + arg
def get_page_content(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Absolute URL to fetch.

    Returns:
        Whatever ``response.read()`` yields for the opened URL.

    Raises:
        Any error raised by ``urllib2.urlopen`` or by reading the response
        is propagated unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if reading fails; the response object
        # is not used as a context manager here, so close it explicitly.
        response.close()
class SongPageParser(HTMLParser):
    """Extract the lyric text of a song page and append it to a file.

    Recording of text nodes is switched on by a start tag whose ``class``
    contains ``lyric-box`` and switched off by a start tag whose ``id``
    contains ``lyrics-edit``. When recording is switched off, the text
    collected so far is appended to ``output_path`` via ``write_to_file``
    and the buffer is cleared.
    """

    # True while text nodes are being collected.
    record = False
    # Text collected so far, one cleaned text node per line.
    lyrics = ""
    # Destination file; expected to be assigned before ``feed`` is called.
    output_path = ""

    def handle_starttag(self, tag, attrs):
        """Toggle recording based on the ``class``/``id`` of a start tag."""
        for name, value in attrs:
            if value is None:
                # Attributes written without a value (``<br class>``) are
                # reported as None and cannot match either marker.
                continue

            if name == "class" and 'lyric-box' in value:
                self.record = True

            if name == "id" and 'lyrics-edit' in value:
                self.record = False
                write_to_file(self.output_path, self.lyrics)
                self.lyrics = ""

    def handle_data(self, data):
        """Append a text node to the buffer while recording is on.

        Leading whitespace is stripped, every run of non-ASCII characters is
        replaced by a single apostrophe, and a newline is appended.
        """
        if self.record:
            self.lyrics += re.sub(r'[^\x00-\x7F]+', '\'', data.lstrip()) + "\n"
class ArtistPageParser(HTMLParser):
    """Find song links on an artist page and scrape each linked song.

    ``match`` counts, since the last end tag, the start-tag attributes that
    look like a song entry: an ``id`` containing ``lyric-`` and an ``href``
    containing DOMAIN. Once more than one has been seen, the next text node
    is taken as the song title, the song page is downloaded, and a
    ``SongPageParser`` is fed with it on a new thread.
    """

    # Number of matching ``id``/``href`` attributes seen since the last end tag.
    match = 0
    # Site-relative path of the most recently matched song link.
    url = ""
    # Text of the most recently matched song link.
    title = ""
    # File the song parsers append to; expected to be assigned before ``feed``.
    output_path = ""

    def handle_starttag(self, tag, attrs):
        """Count matching attributes and remember the song link, if any."""
        href = None
        for name, value in attrs:
            if value is None:
                # Valueless attributes (``<a href>``) are reported as None
                # and can never match.
                continue

            if name == "id" and 'lyric-' in value:
                self.match += 1

            if name == "href" and DOMAIN in value:
                self.match += 1
                href = value

        if self.match > 1 and href is not None:
            # Keep only what follows the first occurrence of DOMAIN.
            self.url = href.partition(DOMAIN)[2]

    def handle_endtag(self, tag):
        """Reset the match counter at every end tag."""
        self.match = 0

    def handle_data(self, data):
        """Treat text following a matched link as a title and scrape the song.

        The song page is downloaded synchronously here; only the parsing
        (and the resulting file write) runs on the new thread.
        """
        if self.match > 1:
            self.title = data
            html = get_page_content(get_url(self.url))
            song_parser = SongPageParser()
            song_parser.output_path = self.output_path
            start_new_thread(song_parser.feed, html)
def main():
    """Command-line entry point: scrape lyrics for the requested artists.

    Arguments (both required):
        --output_file: File the lyrics are appended to. An existing file is
            removed first so each run starts from scratch.
        --artists: Comma-separated artist identifiers; spaces are removed
            and empty entries are ignored.
    """
    # Local import: the file only imports ``Thread`` from this module.
    import threading

    parser = argparse.ArgumentParser()
    parser.add_argument('--output_file', type=str, required=True)
    parser.add_argument('--artists', type=str, required=True)
    args = parser.parse_args()

    output_file = args.output_file
    # Drop empty names ("a,,b" or a trailing comma) so no request is made
    # for a bare artist path.
    artists = [name for name in args.artists.replace(' ', '').split(',') if name]

    try:
        os.remove(output_file)
    except OSError:
        print("The output file doesn't exist, creating it")

    print("Gathering lyrics...")

    for i, artist in enumerate(artists):
        html = get_page_content(get_url(ARTIST_PATH, artist))
        artist_parser = ArtistPageParser()
        artist_parser.output_path = output_file
        artist_parser.feed(html)
        # Floor division keeps the percentage an integer on every Python version.
        print("Progress: {}%".format(((i + 1) * 100) // len(artists)))

    # Song pages are parsed and written on background threads; wait for them
    # so the final message is only printed once the file is complete.
    current = threading.current_thread()
    for worker in threading.enumerate():
        if worker is not current and not worker.daemon:
            worker.join()

    print("Lyrics saved in {}".format(output_file))


if __name__ == "__main__":
    main()