-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path100_Multidistributions copy.py
129 lines (97 loc) · 5.45 KB
/
100_Multidistributions copy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Pandas
import pandas as pd
# Constants
from src import constants
# Seagull
from src.Seagull.Seagull import Seagull
# Plots
from src.Plot.V2_Plot.V2_Num_Cat.Plots.V2_Density import Distributions_plot
def _print_banner(*titles):
    """Print each title line framed by a dashed rule above and below."""
    rule = "---------------------------------------------------------------"
    print(rule)
    for title in titles:
        print(title)
    print(rule)


def main():
    """Run the multi-distribution (density) plot examples.

    Three plots are created and saved into ``constants.DENSITY_PLOTS_PATH``:

    1. a plot built with no dataset (the plot class supplies its own data),
    2. sepal length by species from the Iris dataset,
    3. Spotify playlist counts per artist for the artists with most songs.

    Returns:
        int: always ``0``.
    """
    # Where do you want to save the plots
    # Otherwise they will go to cwd()
    save_folder = constants.DENSITY_PLOTS_PATH

    _print_banner(" LOADING DATASETS ")

    # Load the Iris dataset
    iris_df = Seagull()
    iris_df.set_iris()

    # Load the Spotify datasets (artists, songs, and the artist<->song table)
    spotify_instances = Seagull.get_spotify_datasets()
    spotify_artists_df = spotify_instances[0]
    spotify_songs_df = spotify_instances[1]
    spotify_composers_df = spotify_instances[2]

    _print_banner(" Create a plot with random data and save it ")

    # Create the plot with random data and save it in the given folder
    my_plot = Distributions_plot(folder_path=save_folder)
    my_plot.save()  # "Multidensity_Plot.png"

    _print_banner(" Create a plot and init from Seagull ")

    my_plot = Distributions_plot(iris_df, 'sepal length (cm)', 'species', save_folder)
    my_plot.set_title("Comparing lengths")
    my_plot.set_subtitle("")
    my_plot.save()  # "Multidensity_Plot_Iris dataset_0_1"

    _print_banner(" Compare the spotify playlist for a few selected artists ")

    # Get the top 5 artists (the ones with the most songs)
    total_top_artists = 5
    top_artists_df = spotify_artists_df.keep_column_top_values(
        "ArtistTotalSongs", topValues=total_top_artists
    )
    top_artists_ids = top_artists_df[0].to_list()

    # From the top 5 artists, get their songs
    # mask() gives you the row mask, not the data; use it as you please later on
    row_mask = spotify_composers_df.mask("ArtistID", top_artists_ids)
    top_artists_songs_df = spotify_composers_df[row_mask, :]
    target_songs_ids = top_artists_songs_df["SongID"].to_list()

    # From the songs database, get the songs that are in the top 5 artists
    # inside() gives you the Seagull with the data, not the mask
    target_songs_df = spotify_songs_df.inside("SongID", target_songs_ids)

    # We only care about the Song ID and the number of playlists, so drop everything else
    target_songs_df.keep_columns(["SongID", "in_spotify_playlists"], inplace=True)

    # We need one row per song with its artist name and its playlist count.
    #
    # Filling a pre-allocated table row by row (looking up the artist ID and
    # then the artist name with one inside() call each, for every song) is
    # really slow. Never do this! Join the three tables on their shared keys
    # instead:
    first_join = spotify_composers_df.get_data().merge(
        target_songs_df.get_data(), on='SongID', how='inner'
    )
    second_join = first_join.merge(
        spotify_artists_df.get_data(), on='ArtistID', how='inner'
    )

    # Only keep the Playlists and the Artist Name. Selecting by label (rather
    # than by position) keeps this correct if the source tables gain or
    # reorder columns; the order matches what the plot below expects.
    second_join = second_join[["in_spotify_playlists", "ArtistName"]]
    final_df = Seagull.from_pandasDF(second_join)

    print(final_df.str_overview())
    print(final_df.get_column_type(1))
    print(constants.SOFT_CATEGORIES)
    print(final_df.get_column_type(1) in constants.SOFT_CATEGORIES)

    # The given column is not categorical, but the
    # plot doesn't care and will treat it as such
    my_plot = Distributions_plot(final_df, 'in_spotify_playlists', 'ArtistName', save_folder)
    my_plot.set_title("Comparing playlists by artists")
    my_plot.set_subtitle("")
    my_plot.save()

    # Placeholder section: only the heading is printed for now.
    _print_banner(
        " Compare the spotify playlist for a few selected artists ",
        " Compare top artists by number of songs (total) with top artists by average playlist songs (boxplot and absolute count each)",
    )

    return 0
if __name__ == "__main__":
    # main() returns an integer status; hand it to the interpreter as the
    # process exit code instead of discarding it.
    raise SystemExit(main())