-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_Examples_Toy_Datasets.py
191 lines (140 loc) · 7.01 KB
/
3_Examples_Toy_Datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Pandas
import pandas as pd
import numpy as np
# Seagull
from src.Seagull.Seagull import Seagull
# Separator lines used by the console banners (kept as literals so the
# printed output is unchanged).
_WIDE_RULE = "---------------------------------------------------------------"
_NARROW_RULE = "--------------------------------"


def _print_banner(title, rule):
    """Print *title* framed above and below by the separator line *rule*."""
    print(rule)
    print(title)
    print(rule)


def _show_iris():
    """Load the Iris toy dataset into a Seagull object and print its overview.

    Iris is the "single dataframe" flavor of toy dataset: nothing special
    is done with the data, it is just loaded and shown.
    """
    _print_banner(" 1.- Loading the Iris dataset", _WIDE_RULE)

    iris = Seagull()
    iris.set_iris()
    print(iris.str_overview())


def _show_spotify():
    """Load the Spotify toy datasets and print the overview of each one.

    Spotify is the "multiple dataframes" flavor of toy dataset. No Seagull
    instance is created beforehand: ``get_spotify_datasets`` is called on
    the class itself and its result is indexed as artists [0], songs [1]
    and composers [2].

    Notes carried over from the original example about how the data was
    prepared (none of this is verifiable from this file alone):

    * The original data is a single CSV file where everything is mixed;
      it was divided so that it follows the proper BCNF rules.
    * Songs keep the year, month and day as separate integer columns, and
      also carry a date-typed column in YYYY-MM-DD format.
    * The count of composers can be inferred from the composers dataset,
      but it is also stored in the songs dataset so it is faster to load.
    * Data cleaning: the song "Love Grows (Where My Rosemary Goes)" had a
      string mixing the BPM, type of song, etc. instead of an integer
      stream count, so its stream count was set to 0.
    """
    _print_banner(" 2.- Loading the Spotify dataset", _WIDE_RULE)

    datasets = Seagull.get_spotify_datasets()

    # All three datasets are pulled out before anything is printed, same as
    # the original, so a short result fails before any section is shown.
    sections = (
        (" Artists dataset", datasets[0]),
        (" Songs dataset", datasets[1]),
        (" Composers dataset", datasets[2]),
    )
    for title, dataset in sections:
        _print_banner(title, _NARROW_RULE)
        print()
        print(dataset.str_overview())


def _plot_top_artists_heatmap(heatmap_cls, saving_folder):
    """Disabled walkthrough: heatmap of release month for the top 20 artists.

    NOTE(review): this helper is intentionally NOT called from ``main``.
    The code originally lived inside ``main`` under ``if(False):`` and used
    the names ``Heatmap`` and ``SAVING_FOLDER``, neither of which is
    imported or defined in this file. They are now explicit parameters so
    the module has no unresolved names. The Seagull / heatmap methods
    called below are reproduced from the original and have not been
    checked against the current API -- confirm before enabling.

    Args:
        heatmap_cls: Callable used as ``heatmap_cls(saving_folder)`` to
            build the heatmap object (the original used ``Heatmap``).
        saving_folder: Value handed to ``heatmap_cls`` (the original used
            ``SAVING_FOLDER``); presumably an output folder -- TODO confirm.
    """
    # ------------------------------------------------------------------
    # Load the spotify data into the three different datasets
    # ------------------------------------------------------------------
    spotify_instances = Seagull.get_spotify_datasets()
    artists = spotify_instances[0]
    artists.print_overview()
    songs = spotify_instances[1]
    songs.print_overview()
    composers = spotify_instances[2]
    composers.print_overview()

    # ------------------------------------------------------------------
    # Prepare the data for the plot:
    #   - get the top 20 artists
    #   - count how many songs each one released each month
    #   - normalize the data from 0 to 1
    # ------------------------------------------------------------------
    total_top_artists = 20

    # One row per artist; one name column plus twelve month columns.
    heatmap_data = Seagull(total_top_artists, 13)
    heatmap_data.renameColumns(
        ["Artist", "January", "February", "March", "April", "May", "June",
         "July", "August", "September", "October", "November", "December"]
    )

    top_artists = artists.copy()
    top_artists.keepColumnTopValues(2, topValues=total_top_artists)
    top_artists.print_overview()

    # First column holds the artist names ...
    heatmap_data[:, 0] = top_artists[:, 1]
    # ... and the twelve month columns start as integer zeros.
    for month_column in range(1, 13):
        heatmap_data.setColumnZeroes(month_column)
    heatmap_data.print_overview()

    # Fetch the underlying frames once instead of on every iteration.
    composers_frame = composers.getPanda()
    songs_frame = songs.getPanda()

    for row in range(total_top_artists):
        artist_id = top_artists[row, 0]

        # Song IDs (column 0) of the composer rows whose column 1 matches
        # this artist ID.
        by_this_artist = composers_frame.iloc[:, 1].values == artist_id
        song_ids = composers_frame.iloc[by_this_artist, 0].values

        for song_id in song_ids:
            is_this_song = songs_frame.iloc[:, 0].values == song_id
            current_song = songs_frame.iloc[is_this_song, :]
            # Column 4 is read as the release month and used directly as
            # the heatmap column index -- presumably 1..12, TODO confirm.
            current_month = current_song.iloc[0, 4]
            heatmap_data[row, current_month] = (
                heatmap_data[row, current_month] + 1
            )

    heatmap_data.print_overview()

    # Normalize by rows: the interest is the share of each artist's songs
    # released in each month, e.g. to spot a trend of summer releases.
    heatmap_data.normalize(column=False, avoidFirstColumn=True)
    heatmap_data.print_overview()

    # The data is ready; build, label, show and save the plot.
    heatmap = heatmap_cls(saving_folder)
    heatmap.update_from_seagull(heatmap_data)

    heatmap.set_name("Top_20_artists_by_month")
    heatmap.set_title(
        "Top 20 artists and the month of the year they release their songs"
    )
    heatmap.set_x_label("Month")
    heatmap.set_y_label("Top 20 artists")

    # Opens a window at runtime.
    heatmap.show()
    # String representation of the plot, for the terminal.
    print(heatmap)
    heatmap.save()


def main():
    """Run the toy-dataset examples and print each dataset's overview.

    Toy datasets come in two flavors: datasets with a unique dataframe
    (Iris) and datasets with multiple dataframes (Spotify). Both are
    loaded through Seagull and their overviews are printed to stdout.
    """
    _print_banner(" LOADING COMMON DATASETS ", _WIDE_RULE)
    _show_iris()
    _show_spotify()
# Script entry point: run the examples only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()