-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrangle.py
150 lines (130 loc) · 4.66 KB
/
wrangle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# The Office Wrangle.py File
# imports
import pandas as pd
# color function
def set_color(ratings):
if ratings < 7.4:
return 'red'
elif (ratings >= 7.4) & (ratings < 8.2):
return 'yellow'
elif (ratings >= 8.2) & (ratings < 9.0):
return 'lightgreen'
elif (ratings >= 9.0):
return 'darkgreen'
# The Office Function
def the_office():
'''
Function to read the office series csv file,
and drop null values in the data,
rename columns for readability,
convert column names to lowercase
and save the new df to a csv file
'''
# reading the office series data from a csv file
df = pd.read_csv('the_office_series.csv')
# dropping the GuestStars column, too many null values
df = df.drop(columns='GuestStars')
# renaming the columns for readability
df = df.rename(columns={"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title"})
# convert column names to lowercase
df.columns = [col.lower() for col in df]
# adding a color column to the df
df['color'] = df['ratings'].apply(set_color)
# saving the office data to a csv
df.to_csv('the_office.csv')
return df
# Guest Office_fuction
def guest_office():
'''
Function to create a df where there are guest stars on the episode,
drop the null values in the data,
rename columns for readability,
convert column names to lowercase
and save the df to a csv file
'''
# reading the office series data from a csv file
df = pd.read_csv('the_office_series.csv')
# dropping the null values in the GuestStars column
df = df[df.GuestStars.notna()]
# renaming the columns for readability
df = df.rename(columns={"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title", "GuestStars": "Guest_Stars"})
# convert column names to lowercase
df.columns = [col.lower() for col in df]
# adding a color column to the df
df['color'] = df['ratings'].apply(set_color)
# saving the office data to a csv
df.to_csv('the_office_guest.csv')
return df
# Number of episodes per season
def season_episodes():
'''
function that takes the office df and
makes a new df with the season and
number of episodes per season
'''
# getting the office data
df = the_office()
# creating a new df with seasons and episodes
df_episodes = df.groupby('season').size().reset_index(name='episodes')
return df_episodes
def max_views():
'''
function that takes the office data
and findes the episode with the most views
'''
# getting the office data
df = the_office()
# getting the episode data with the most views
max_views = df.loc[df.viewership.idxmax()]
return max_views
def cluster_df():
'''
function to make a dataframe for clustering
takes the office data and drops columns
then changes the dataframe to integers
'''
# getting the office data
df = the_office()
# dropping columns that are not needed for clustering
df.drop(columns=['episode_title', 'about', 'date', 'director', 'writers', 'color', 'votes', 'duration', 'viewership', 'season'], inplace=True)
# changing the dataframe type to integers
#df = df.astype(int)
return df
# time series function
def office_time():
'''
fucntion to take the office data and convert the date column
to datetime, then add year, month, day, weekday columns
set a new index to the date column, save a csv file and
print the date range and shape of df
'''
# getting the office data
df = the_office()
# converting the date column to datetime
df.date = pd.to_datetime(df.date)
# adding columns for year, month, day, and weekday
df['year'] = df.date.dt.year
df['month'] = df.date.dt.month
df['day'] = df.date.dt.day
df['weekday'] = df.date.dt.day_name()
# setting the df index to the date column
df = df.set_index('date').sort_index()
# saving the office data to a csv
df.to_csv('office_time.csv')
# printing out the date range and shape of the df
print('Date Range:', df.index.min(), 'to', df.index.max())
print('Shape:', df.shape)
return df
# making a function
def office_time_model():
'''
this function takes the office time data and then
drops columns not needed for modeling and
returns a model new df
'''
df = office_time()
# dropping columns that are not needed for modeling purposes
df = df.drop(columns= {'episode_title', 'episode', 'season', 'about', 'director', 'writers',
'color', 'year', 'month', 'day', 'weekday', 'votes', 'duration'})
print('Model DF Shape:', df.shape)
return df