Skip to content

Commit 19cae2f

Browse files
committed
blergh
1 parent 9becb68 commit 19cae2f

File tree

4 files changed

+156
-3
lines changed

4 files changed

+156
-3
lines changed

Lyrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def get_lyrics2(song):
6767
.replace(' t ', 't ')\
6868
.replace(' ll ', 'll ')\
6969
.replace('-', '')\
70+
.replace('#', '')\
7071
.replace(".", "")\
7172
.replace("& ", "")\
7273
.replace('?', '')\

god_frame.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121

2222
df = pd.DataFrame(rows)
2323
df['date'] = pd.to_datetime(df['date'])
24-
df['raw'] = df['raw'].astype(int)
25-
df['comp'] = df['comp'].astype(int)
24+
# Blargh. Can't do this with nullable col. http://stackoverflow.com/a/21290084/262271
25+
#df['raw'] = df['raw'].astype(int)
26+
#df['comp'] = df['comp'].astype(int)
2627
print "Saving god frame with shape {}".format(df.shape)
2728
df.to_pickle(common.OMNI_PICKLE_NAME)

notebook_helpers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import common
55

66
BIAS_ADJUSTED_RATIO = 1
7-
BIAS = 34.775287769
7+
#BIAS = 34.775287769
8+
BIAS = 12
89

910
def get_frame(having_lyrics=False):
1011
om = common.get_omnisong()

retry_lyrics_scrape.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
# TODO: if you rerun this later, make sure you check against filenames in "bad_lyrics" dir,
2+
# to avoid scraping them twice
3+
import pickle
4+
import time
5+
import os
6+
import re
7+
8+
import common
9+
import Lyrics
10+
11+
# Pickled song database that the retry pass loads at startup.
PICKLE_NAME = 'hot-100.pickle'

# Directory that recovered lyrics are written into, and the extension that is
# appended to a song key to form the lyrics filename.
LYRICS_DIR = 'lyrics'
EXT = '.txt'

# Value passed to time.sleep() after every lyrics lookup attempt.
SLEEPYTIME = 1
15+
16+
def unicode_unfuck(s):
    """Rebuild *s* character by character through ``chr(ord(c))``.

    This is the fallback applied to lyrics whose write raised
    UnicodeEncodeError.
    NOTE(review): on Python 2 ``chr`` only accepts code points below 256, so
    this presumably assumes every character of *s* is really a single byte --
    confirm against the scraper's output.
    """
    return ''.join(chr(ord(ch)) for ch in s)
18+
19+
def load_extant(d):
    """Return the set of entry names in directory *d* that end with EXT.

    The EXT suffix is removed from each name; entries without the suffix are
    ignored.
    """
    return {name[:-len(EXT)] for name in os.listdir(d) if name.endswith(EXT)}
25+
26+
class FakeSong(object):
    """Bare artist/title pair used to try alternate spellings of a song."""

    def __init__(self, artist, title):
        """Store *artist* and *title* unchanged as attributes."""
        self.title = title
        self.artist = artist

    def __str__(self):
        """Render the song as "<artist> - <title>"."""
        fields = (self.artist, self.title)
        return '{} - {}'.format(*fields)
33+
34+
# Artists whose name contains a joining "&" / "And"; the retry spelling for
# each is derived below by removing the joiner.
andy_artists = [
    'Peter And Gordon',
    'Blood, Sweat & Tears',
    'Captain & Tennille',
    'Crosby, Stills & Nash',
]

# Artist name as stored in the song database -> spelling to retry the lookup
# with.
artists_renamed = {
    'Beyonce': 'Beyonce Knowles',
    'Janet': 'Janet Jackson',
    'India.Arie': 'India Arie',
    'James Brown And The Famous Flames': 'James Brown',
    "Go-Go's": "The Gogos",
}

for andy in andy_artists:
    # Collapse " & " and " And " to a single space.
    canon = andy.replace(' & ', ' ').replace(' And ', ' ')
    artists_renamed[andy] = canon
50+
51+
# Artists that are retried under exactly one fixed alternate spelling.
_EXACT_ARTIST_FIXES = {
    'Earth, Wind & Fire': 'Earth Wind Fire',
    'Big & Rich': 'Big Rich',
    'Peaches & Herb': 'Peaches Herb',
    'Maroon5': 'Maroon 5',
}

# Greedy on purpose: matches from the first '(' through the last ')'.
_PARENTHETICAL_RE = re.compile(r'\(.*\)')


def transformed_songs(song):
    """Yield alternate artist/title spellings of *song* to retry a lookup with.

    Every rule below that applies produces one candidate, in the order the
    rules are written. The '#' rule yields *song* itself; every other rule
    yields a new FakeSong that differs from *song* in exactly one field.
    *song* is never modified, and nothing is yielded when no rule applies.

    Args:
        song: object exposing string ``artist`` and ``title`` attributes.

    Yields:
        *song* or FakeSong instances, one per matching rule.
    """
    artist = song.artist
    title = song.title

    # Songs containing '#' are retried completely unchanged.
    # NOTE(review): presumably useful because the lyrics lookup now strips
    # '#' itself -- confirm against Lyrics.get_lyrics2.
    if '#' in artist or '#' in title:
        yield song

    # Drop a leading article: "The Foo" -> "Foo".
    if artist.startswith('The '):
        yield FakeSong(artist[len('The '):], title)

    # Anything after "Gladys Knight " in the artist name is dropped.
    if artist.startswith('Gladys Knight '):
        yield FakeSong('Gladys Knight', title)

    if artist in _EXACT_ARTIST_FIXES:
        yield FakeSong(_EXACT_ARTIST_FIXES[artist], title)

    # Undo the asterisk censoring of the title.
    if 'B****' in title:
        yield FakeSong(artist, title.replace('B****', 'Bitch'))

    if artist in artists_renamed:
        yield FakeSong(artists_renamed[artist], title)

    # "Orphaned" because the title ends up with no parens. Surrounding
    # whitespace is left as-is.
    orphaned = _PARENTHETICAL_RE.sub('', title)
    if orphaned != title:
        yield FakeSong(artist, orphaned)

    # Re-attach a dangling possessive/plural: "Foo s" -> "Foos".
    if artist.endswith(' s'):
        yield FakeSong(artist[:-2] + 's', title)
101+
102+
# Load the song database. Pickle data is binary, so the file is opened in
# binary mode.
# NOTE(review): pickle.load can execute arbitrary code -- only run this
# against a pickle file you produced yourself.
with open(PICKLE_NAME, 'rb') as f:
    db = pickle.load(f)

# load_extant(LYRICS_DIR) is deliberately not called: the author decided that
# checking os.path.exists per song would be good enough, since the scrape has
# to sleep between requests anyway.
# NOTE(review): no existence check is actually performed below.
malencoded = 0

# The song key is the last tab-separated field of each line of the 404 list.
with open('song_404s.txt') as to_retry:
    bad_keys = {line.split('\t')[-1].strip() for line in to_retry}

with open('still_404s.txt', 'w') as skips_file:
    for artist in db:
        for orig_song in db[artist].itervalues():
            k = common.song_key(orig_song)
            if k not in bad_keys:
                continue

            # Try each alternate spelling until one returns usable lyrics.
            for song in transformed_songs(orig_song):
                try:
                    lyrics, url = Lyrics.get_lyrics2(song)
                except Lyrics.LyricsNotFoundException:
                    time.sleep(SLEEPYTIME)
                    continue
                # Sleep after successful lookups as well as failed ones.
                time.sleep(SLEEPYTIME)
                # Results shorter than 5 characters are treated as a miss.
                if len(lyrics) >= 5:
                    break
            else:
                # No spelling worked: record the song so it can be retried.
                try:
                    skips_file.write('\t'.join([orig_song.artist, orig_song.title, k]) + '\n')
                except UnicodeEncodeError:
                    # The record could not be written; only count it.
                    malencoded += 1
                continue

            print("Success! {}".format(orig_song))
            path = os.path.join(LYRICS_DIR, k + EXT)
            with open(path, 'w') as f:
                try:
                    f.write(lyrics)
                except UnicodeEncodeError:
                    # Somehow multi-byte code points in the lyrics can arrive
                    # with each byte encoded separately rather than the code
                    # point as a whole.
                    # TODO: should probably file a bug on... someone
                    lyrics = unicode_unfuck(lyrics)
                    f.write(lyrics)

print("Skipped {} malencoded songs".format(malencoded))

0 commit comments

Comments
 (0)