-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_conglomerator.py
88 lines (65 loc) · 2.99 KB
/
data_conglomerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
''' data_conglomerator.py
This program takes the pitch/timbre/etc. data from each file, combines them all into one dataset,
and then runs the dirichlet process on everything '''
import hdf5_getters #not on adroit
import msd_utils #not on adroit
from __future__ import division
import os
import sys
import re
import time
import json
import glob
import sklearn.mixture
import math
import numpy as np
import collections
from string import ascii_uppercase
import ast
import matplotlib.pyplot as plt
import operator
scratch_dir = './../../../../scratch/network/mssilver/mssilver/'
edm_textfile_everything = scratch_dir + 'msd_EDM_dataraw_EVERYTHING.txt'
edm_textfile_everything_sorted = scratch_dir + 'msd_EDM_dataraw_EVERYTHING_SORTED.txt'
# first put everything in the same text file
with open(edm_textfile_everything,'a') as f:
f.write('{')
for l1 in ascii_uppercase:
for l2 in ascii_uppercase:
directory = l1 + l2
edm_textfile = scratch_dir + 'msd_dataraw_' + directory + '.txt'
print 'looking at file {0} at time {1}'.format(edm_textfile,time.time())
with open(edm_textfile,'r') as f2:
json_contents = f2.read().strip()[1:-1]
f.write(json_contents)
#don't insert comma after last file
if not directory is 'ZZ':
f.write(', ')
f.write('}')
print 'file merging complete at time {0}'.format(time.time())
# then sort the songs in the mega text file by year
print 'reading all data and appending to a data structure at time {0}'.format(time.time())
json_contents = open(edm_textfile_everything,'r').read()
all_song_data = []
time_start = time.time()
count = 0
for json_object_str in re.finditer('{\'title.*?}',json_contents):
json_object_str = str(json_object_str.group(0))
json_object_str = json_object_str.replace('\'title\':', '\"title\":')
json_object_str = json_object_str.replace('\'timbre\':', '\"timbre\":')
json_object_str = json_object_str.replace('\'pitches\':', '\"pitches\":')
json_object_str = json_object_str.replace('\'artist_name\':', '\"artist_name\":')
json_object_str = json_object_str.replace('\'year\':', '\"year\":')
json_object_str = json_object_str.replace('\'duration\':', '\"duration\":')
# json_object_str = re.sub('(\')(title)(\'): (\')(.*)(\'),', r'"\2": "\5",', json_object_str)
# json_object_str = re.sub('(\')(artist_name)(\'): (\')(.*)(\'),', r'"\2": "\5",', json_object_str)
# json_object = json.loads(json_object_str)
json_object = ast.literal_eval(json_object_str)
all_song_data.append(json_object)
count += 1
print 'appended song {0} at {1} seconds after start'.format(count,time.time() - time_start)
print 'beginning sorting of song data at time {0}'.format(time.time())
all_song_data_sorted = sorted(all_song_data, key=lambda k: k['year'], reverse=True)
with open(edm_textfile_everything_sorted, 'w') as f:
f.write(str(all_song_data_sorted))
# then feed it all into the dirichlet process