-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnalytical Procedures
310 lines (226 loc) · 8.41 KB
/
Analytical Procedures
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#1. What is the seasonality of the sales throughout a year in different regions, if any?
queryCmd = """SELECT o.orderdate, count(o.order_number) as number, s.region
FROM orders o, salesteam s
WHERE s.region = 'Midwest'
GROUP BY orderdate, s.region
ORDER BY orderdate DESC;
"""
cur.execute(queryCmd)
conn.commit()
results = connection.execute(queryCmd).fetchall()
column_names = results[0].keys()
df1 = pd.DataFrame(results, columns = column_names)
df1
queryCmd2 = """SELECT o.orderdate,
count(o.order_number) as number, s.region
FROM orders o, salesteam s
WHERE s.region = 'Northeast'
GROUP BY orderdate, s.region
ORDER BY orderdate DESC;
"""
cur.execute(queryCmd2)
conn.commit()
results = connection.execute(queryCmd2).fetchall()
column_names = results[0].keys()
df2 = pd.DataFrame(results, columns = column_names)
df2
queryCmd3 = """SELECT o.orderdate, count(o.order_number) as number, s.region
FROM orders o, salesteam s
WHERE s.region = 'West'
GROUP BY orderdate, s.region
ORDER BY orderdate DESC;
"""
cur.execute(queryCmd3)
conn.commit()
results = connection.execute(queryCmd3).fetchall()
column_names = results[0].keys()
df3 = pd.DataFrame(results, columns = column_names)
df3
queryCmd4 = """SELECT o.orderdate, count(o.order_number) as number, s.region
FROM orders o, salesteam s
WHERE s.region = 'South'
GROUP BY orderdate, s.region
ORDER BY orderdate DESC;
"""
cur.execute(queryCmd4)
conn.commit()
results = connection.execute(queryCmd4).fetchall()
column_names = results[0].keys()
df4 = pd.DataFrame(results, columns = column_names)
df4
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2, 2)
axs[0, 0].plot(df1["orderdate"],df1["number"])
axs[0, 0].set_title('Time Series Plot of Midwest')
axs[0, 1].plot(df2["orderdate"],df2["number"], 'tab:orange')
axs[0, 1].set_title('Time Series Plot of Northeast')
axs[1, 0].plot(df3["orderdate"],df3["number"], 'tab:green')
axs[1, 0].set_title('Time Series Plot of West')
axs[1, 1].plot(df4["orderdate"],df4["number"], 'tab:red')
axs[1, 1].set_title('Time Series Plot of South')
for ax in axs.flat:
ax.set(xlabel='orderdate', ylabel='ordernumbers')
# Hide x labels and tick labels for top plots and y ticks for right plots.
for ax in axs.flat:
ax.label_outer()
#2. What are the top ten best selling products (in terms of order quantity) in each year?
#2018:
stmt = """ select orders.order_number, orders.orderDate,
product.product_name
from orders, product
where EXTRACT(YEAR FROM orders.orderdate)='2018';
"""
#Execute the statement and get the results from the database
results = connection.execute(stmt).fetchall()
#Extract column names
column_names = results[0].keys()
#Store results from the database in a panda DataFrame
df3 = pd.DataFrame(results, columns=column_names)
df3.groupby('product_name').size()
#2019:
stmt = """ select orders.order_number, orders.orderDate,
product.product_name
from orders, product
where EXTRACT(YEAR FROM orders.orderdate)='2019';
"""
#Execute the statement and get the results from the database
results = connection.execute(stmt).fetchall()
#Extract column names
column_names = results[0].keys()
#Store results from the database in a panda DataFrame
df4 = pd.DataFrame(results, columns=column_names)
df4.groupby('product_name').size()
#2020
stmt = """ select orders.order_number, orders.orderDate,
product.product_name
from orders, product
where EXTRACT(YEAR FROM orders.orderdate)='2020';
"""
#Execute the statement and get the results from the database
results = connection.execute(stmt).fetchall()
df5.groupby('product_name').size()
#3. What proportion of the customers ordered over 165?
SELECT COUNT(DISTINCT customer_id) FROM orders
#50
WITH a AS
(SELECT customer_id AS loyalist FROM orders GROUP BY customer_id HAVING COUNT(order_number) > 165)
SELECT ROUND(COUNT(loyalist)/(1.00*50), 2) AS proportion
FROM a
#0.3
#4. Which sales team performs the best (involved in most sales) in each year?
-2018
SELECT s.sales_teamid, s.salesteam, COUNT(o.order_number) AS number_sales, o.orderdate
FROM salesteam s, orders o
WHERE date_part('year',orderdate) = 2018
GROUP BY s.sales_teamid,o.orderdate
ORDER BY number_sales DESC, s.sales_teamid ASC;
#sales_teamid salesteam number_sales orderdate
0 1 Adam Hernandez 21 2018-08-04
1 10 Jonathan Hawkins 21 2018-08-04
2 11 Joshua Little 21 2018-08-04
3 12 Carl Nguyen 21 2018-08-04
4 13 Todd Roberts 21 2018-08-04
-2019
SELECT s.sales_teamid, s.salesteam, COUNT(o.order_number) AS number_sales, o.orderdate
FROM salesteam s, orders o
WHERE date_part('year',orderdate) = 2019
GROUP by s.sales_teamid,o.orderdate
ORDER BY number_sales DESC, s.sales_teamid ASC;
sales_teamid salesteam number_sales orderdate
0 1 Adam Hernandez 20 2019-05-04
1 10 Jonathan Hawkins 20 2019-05-04
2 11 Joshua Little 20 2019-05-04
3 12 Carl Nguyen 20 2019-05-04
4 13 Todd Roberts 20 2019-05-04
-2020
SELECT s.sales_teamid, s.salesteam, COUNT(o.order_number) AS number_sales, o.orderdate
FROM salesteam s, orders o
WHERE date_part('year',orderdate) = 2020
GROUP BY s.sales_teamid,o.orderdate
ORDER BY number_sales DESC, s.sales_teamid ASC;
#sales_teamid salesteam number_sales orderdate
0 1 Adam Hernandez 16 2020-09-21
1 10 Jonathan Hawkins 16 2020-09-21
2 11 Joshua Little 16 2020-09-21
3 12 Carl Nguyen 16 2020-09-21
4 13 Todd Roberts 16 2020-09-21
#5. What are the five store locations that have the most orders each year?
SELECT COUNT (order_number) AS total_order, s.storeid,
FROM orders
JOIN order_product using (order_number)
JOIN product using (product_id)
JOIN product_storelocation using (product_id)
JOIN store_location s using (storeid)
GROUP BY s.storeid
ORDER BY total_order DESC
LIMIT 5;
# total_order storeid
0 4645 262
1 4601 21
2 4561 26
3 4489 128
4 4414 245
#6. Which sales channel has generated the most profits?
stmt = """
SELECT sales_channel, unitprice, unitcost
FROM orders;
"""
#Execute the statement and get the results from the database
results = connection.execute(stmt).fetchall()
#Extract column names
column_names = results[0].keys()
#Store results from the database in a panda DataFrame
df = pd.DataFrame(results, columns=column_names)
df['profit']=df['unitprice']-df['unitcost']
df.head()
df.groupby('sales_channel').sum().sort_values(['profit'],ascending=False)
#In-store has generated the most profit
unitprice unitcost profit
sales_channel
In-Store 7487169.60000 4698862.02300 2788307.57700
Online 5544846.30000 3479394.42000 2065451.88000
Distributor 3201762.50000 2002160.33000 1199602.17000
Wholesale 2021952.80000 1261984.46300 759968.33700
#7. Which state has the most stores?
WITH a AS (SELECT COUNT(storeID) AS number_of_stores, StateCode
FROM store_location
GROUP BY StateCode)
SELECT StateCode, number_of_stores
FROM a
WHERE number_of_stores = (SELECT MAX(number_of_stores) FROM a);
"CA" 74
#8. Which region has generated the most profits?
stmt = """
SELECT orders.unitprice, orders.unitcost, salesteam.region
FROM orders
JOIN salesteam ON orders.sales_teamid=salesteam.sales_teamid;
"""
#Execute the statement and get the results from the database
results = connection.execute(stmt).fetchall()
#Extract column names
column_names = results[0].keys()
#Store results from the database in a panda DataFrame
df2 = pd.DataFrame(results, columns=column_names)
df2.head()
#9. What are the proportions of sales channels accounted for total orders?
WITH total AS
(SELECT sales_channel, COUNT(order_number) n
FROM orders
GROUP BY sales_channel),
tn AS (SELECT COUNT(order_number) t FROM orders)
SELECT sales_channel, round((1.00*n)/(1.00*t),2) AS percentage
FROM total, tn
"Online" 0.30
"Wholesale" 0.11
"In-Store" 0.41
"Distributor" 0.17
#10. Which product is the most popular among all the records?
SELECT COUNT (order_number) AS total_order, p.product_id, p.product_name
FROM orders
JOIN order_product using (order_number)
JOIN product p using (product_id)
GROUP BY p.product_id
ORDER BY total_order DESC
LIMIT 1;
total_order product_id product_name
0 200 37 Platters