-
Notifications
You must be signed in to change notification settings - Fork 0
/
case_study_movie_ticket.py
1325 lines (973 loc) · 41.7 KB
/
case_study_movie_ticket.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""Case study_Movie ticket
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1M5WsDJU8k04xQOH3FlGqFAI0BJbT7qXB
# 1. Load data
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df_campaign = pd.read_csv('/content/drive/MyDrive/Dataset/Movie 2/campaign.csv')
df_customer = pd.read_csv('/content/drive/MyDrive/Dataset/Movie 2/customer.csv')
df_device = pd.read_csv('/content/drive/MyDrive/Dataset/Movie 2/device_detail.csv')
df_status = pd.read_csv('/content/drive/MyDrive/Dataset/Movie 2/status_detail.csv')
df_ticket = pd.read_csv('/content/drive/MyDrive/Dataset/Movie 2/ticket_history.csv')
"""#2. Data cleaning
## 2.1 Data type, NULL values, Duplicate values
"""
df_customer.info()
#Phan tich: non-null count deu bang voi so dong cua bang => ok
#Customer_id la dang int => ok
#usergender la dang object (dang text) => ok
#dob dang object => chua chinh xac vi day la ngay sinh thi phai la dang datetime
#Chuyen doi type cua dob
from datetime import datetime #buoc dau tien la import thu vien datetime
df_customer['dob'] = pd.to_datetime(df_customer['dob']) #chuyen doi type cua dob
df_customer.info()
#kiem tra duplicate values
df_customer['customer_id'].nunique()
# =>con so nay trung voi so dong cua bang customer => ok
df_campaign.info()
df_campaign.head(2)
df_campaign['campaign_id'].nunique()
df_device.info()
#model co so luong non-null it hon so dong cua bang => kiem tra xem so luong null chiem bao nhieu % => define ra mot function moi
def calc_null_rate(df):
newdf = df.isnull().sum().to_frame('null_count')
newdf[['null_rate']] = newdf[['null_count']] / len(df)
return newdf.sort_values(by=['null_rate'], ascending=False)
calc_null_rate(df_device)
#ti le null cua model chiem 5% => ko qua lon nhung chung ta nen thay the bang gia tri unknown
#null cua device chi co 1 thi ta se xoa luon
df_device = df_device.fillna({'model' : 'unknown'})
df_device = df_device[df_device['device_number'].notna()]
calc_null_rate(df_device)
df_status.info()
df_status.head()
#co mot NULL tuy nhien vi day la null cua gia tri successful nen hop ly => ko can lam gi
df_ticket.info()
#cot time o dang text => thay doi thanh dang datatime
df_ticket['time']= pd.to_datetime(df_ticket['time'])
df_ticket.info()
#kiem tra duplicate => dem primary cua bang
df_ticket['ticket_id'].nunique()
#Co nhieu dong bi duplicate => ko xoa voi ma phai kiem tra xem chung co bi trung that khong
df_duplicate = df_ticket[df_ticket.duplicated(keep= False)]
df_duplicate.head()
#cac dong bi duplicate giong het nhau ve moi du lieu => co the xoa chung di duoc roi
df_ticket.drop_duplicates(inplace=True)
df_ticket['ticket_id'].nunique()
df_ticket.info()
#Ket qua cac dong da bang nhau => Ok
#Kiem tra buoc dau da xong, bh se kiem tra buoc 2
#tuy nhien chung ta co the join tat ca cac bang de kiem tra 1 lan cho nhanh
"""## 2.2 Join tables"""
df_join_customer = pd.merge(df_ticket, df_customer, how ='left', on = 'customer_id')
df_join_campaign = pd.merge(df_join_customer, df_campaign, how ='left', on = 'campaign_id')
df_join_status = pd.merge(df_join_campaign, df_status, how ='left', on = 'status_id')
df_join_all = pd.merge(df_join_status, df_device, how ='left', on = 'device_number')
#Kiem tra xem so dong co bi thay doi ko
df_join_all.count()
calc_null_rate(df_join_all)
#cot error group bi null khi don hang giao dich thanh cong
#campaign type bi null khi don hang ko co khuyen mai
#=> thay the cac gia tri null bang unknown
df_join_all = df_join_all.fillna('unknown')
calc_null_rate(df_join_all)
"""## 2.3 View all values of each columns"""
df_join_all.nunique().sort_values(ascending =False)
#Tao mot function de liet ke tat ca cac gia tri trong 1 column
specific_cols = ['movie_name','description','paying_method','campaign_type','usergender','platform', 'error_group']
for col in specific_cols:
print(col + ' : ', np.sort(df_join_all[col].unique().astype(str)))
print('\r')
print('--------------------------')
print('\r')
#Check loi chinh ta, loi duplicate => ok ko sai gi nen ko can chinh sua nua
"""# 3. Analyze
## 3.1 Customer portrait analysis
### 3.1.1 Age and gender distribution
"""
df_join_all.head(2)
#Tinh so tuoi
current_year = datetime.now()
df_join_all['age_days'] = (current_year - df_join_all['dob']).dt.days
df_join_all.head(2)
df_join_all['age'] = df_join_all['age_days'] / 365.25
df_join_all['age'] = df_join_all['age'].astype(int)
df_join_all.head(2)
#Lay ra danh sach KH kem theo do tuoi va gioi tinh
df_cus = df_join_all.drop_duplicates(subset=['customer_id']) [['customer_id', 'dob', 'age', 'usergender']]
df_cus.count()
#so sanh voi count danh sach unique o tren kia thi da bang nhau ve so luong customer_id => ok
#Kiem tra phan bo => Histogram
plt.figure(figsize=(8,4))
df_cus['age'].hist(bins = 30, color = 'cornflowerblue', grid = False)
plt.xlabel('ages')
plt.ylabel('#customers')
plt.title('Age distribution')
plt.show()
#co nguoi tren 100 tuoi => outlier
#co nguoi tren 80 di xem phim va dat ve => co binh thuong ko
#bins o do tuoi 5x cao bat thuong
# Phân bổ độ tuổi theo nhóm giới tính :
plt.figure(figsize=(8,4))
## data
male_age = df_cus[df_cus['usergender'] == 'Male']['age']
female_age = df_cus[df_cus['usergender'] == 'Female']['age']
unknown_age = df_cus[df_cus['usergender'] == 'Not verify']['age']
## plot
plt.hist(male_age, bins=30, alpha = 0.3, color = 'cornflowerblue', label = 'Male')
plt.hist(female_age, bins=30, alpha = 0.3, color = 'salmon', label = 'Female')
plt.hist(unknown_age, bins=30, alpha = 0.3, color = 'limegreen', label = 'Not verify')
#alpha la thong so phan tram cua mau sac hien thi
## edit
plt.title('Age distribution')
plt.xlabel('ages')
plt.ylabel('#customers')
plt.legend()
plt.show()
#nhin hai bang ta thay => nhom do tuoi 5x o bang tuoi la nhom not verify gioi tinh
#Can kiem tra xem doan mau xanh not verify nay nhu the nao
#Danh gia chi tiet nhom verify
df_gen = df_cus.groupby('usergender').agg(total_cus = ('customer_id', 'count')).sort_values(by = 'total_cus', ascending = False).reset_index()
df_gen
#Bieu do tron
plt.pie(df_gen['total_cus'], labels = df_gen['usergender'],
colors = ['cornflowerblue', 'lightsteelblue', 'slategrey'],
autopct = '%1.0f%%',
startangle = 90)
plt.show()
""">**Notes** :
>- Nhom KH chua verify chiem hon 10% dan den hai truong hop:
>>- Neu ho da nhap dob thi se co data
>>- Neu ho ko nhap thi he thong se autofill la 1970 => 54 tuoi
"""
df_cus[df_cus['usergender'] == 'Not verify'].groupby('age').agg(number = ('customer_id', 'count')).reset_index().sort_values(by='number', ascending=False)
"""### 3.1.2 Age generation distribution
"""
#Logic phan loai X, Y, Z, Baby boomers => dua vao nam sinh
df_cus['age_generation'] = df_cus['dob'].apply(lambda x: 'Baby boomers' if x.year < 1965
else 'Gen X' if x.year < 1981
else 'Gen Y' if x.year < 1997
else 'Gen Z')
df_cus.head(2)
df_gen_group = (
df_cus[df_cus['usergender'] != 'Not verify']
.groupby('age_generation')
.agg(total = ('customer_id', 'count'))
.sort_values(by='total', ascending = False)
.reset_index()
)
df_gen_group
#Ket hop hai bieu do cung luc
plt.figure(figsize=(13,4))
#plot 1
ax1 = plt.subplot(1, 2, 1) #Chi 1 dong, co 2 cot, vi tri o thu 1
df_cus[df_cus['usergender'] != 'Not verify']['age'].hist(bins = 30, color = 'cornflowerblue', grid = False)
plt.xlabel('ages')
plt.ylabel('#customers')
plt.title('Age distribution')
#plot 2
ax2 = plt.subplot(1, 2, 2)
plt.pie(df_gen_group['total'], labels = df_gen_group['age_generation'],
colors = ['cornflowerblue', 'lightsteelblue', 'slategrey', 'lightskyblue'],
autopct = '%1.0f%%',
startangle = 90)
plt.show()
"""## 3.2 Time series data - When did customers buy tickets ?
### 3.2.1 Trend by month
"""
df_join_all['month'] = pd.to_datetime(df_join_all['time']).dt.month #lay ra cot thang
df_join_all['name_day'] = pd.to_datetime(df_join_all['time']).dt.day_name() #lay ra cot ngay
df_join_all['hour'] = pd.to_datetime(df_join_all['time']).dt.hour #Lay ra cot gio
df_join_all['year_month'] = df_join_all['time'].dt.strftime('%Y-%m') #Lay ra cot nam va thang
df_join_all.head(2)
#Thong ke theo thang
df_time_month = (
df_join_all
.groupby(['year_month'])
.agg(total_ticket = ('ticket_id', 'count'))
.reset_index()
)
df_time_month.head()
#Ve bieu do mien theo thang
plt.figure(figsize=(13,4))
plt.fill_between(df_time_month['year_month'], df_time_month['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
plt.xticks(rotation = 90)
plt.show()
#Giai thich : giai doan covid dien ra => ko di xem phim duoc
#Quan sat : bi thieu mot so thang trong nam ----> can 1 bang DIM thoi gian theo thang (FULL)
#Tao bang DIM ve thoi gian de JOIN voi data ticket va ve lai chart
# Tạo bảng dimension thời gian:
# Xác định khoảng thời gian
start_date = '2019-01-01'
end_date = '2022-12-31'
# Tạo ra range thời gian từ 2 mốc start và end
date_range = pd.date_range(start=start_date, end=end_date, freq='MS')
# Lấy ra list phần tử thời gian tương ứng:
list_month = date_range.month
list_month_name = date_range.strftime('%B')
list_year = date_range.year
list_year_month = date_range.strftime('%Y-%m')
# # Khởi tạo dataframe
dim_time = pd.DataFrame({
'month_number': list_month,
'month_name': list_month_name,
'year': list_year,
'year_month': list_year_month
})
dim_time.head()
#JOIN voi bang join_all de co du data ve thoi gian
df_time_month_dim = (
pd.merge(dim_time, df_join_all, how='left', on='year_month')
.groupby(['year_month'])
.agg(total_ticket = ('ticket_id', 'count'))
.reset_index()
)
df_time_month_dim.head()
plt.figure(figsize=(13,4))
plt.fill_between(df_time_month_dim['year_month'], df_time_month_dim['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
plt.xticks(rotation = 90)
plt.show()
#Quan sat : mac du da du gia tri date nhung nhung thang ko co du lieu van hien len mau => co the gay hieu lam
# ----> thay gia tri 0 bang gia tri NULL
df_time_month_dim.replace(0, np.nan, inplace=True)
plt.figure(figsize=(13,4))
plt.fill_between(df_time_month_dim['year_month'], df_time_month_dim['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
plt.xticks(rotation = 90)
plt.show()
"""### 3.2.2 Trend by week days"""
#Thong ke theo ngay trong tuan
df_week_day = (
df_join_all
.groupby(['name_day'])
.agg(total_ticket = ('ticket_id', 'count'))
.reset_index()
)
df_week_day.head()
plt.figure(figsize=(13,4))
plt.fill_between(df_week_day['name_day'], df_week_day['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
plt.xticks(rotation = 90)
plt.show()
#Quan sat => cac thu dang bi lon xon
# ----> can sap xep lai theo thu tu cac ngay trong tuan
#Dinh nghia thu tu cac ngay trong tuan
week_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
#Sap xep theo thu tu cac ngay trong tuan
df_week_day['day_order'] = pd.Categorical(df_week_day['name_day'], categories=week_order, ordered=True)
df_week_day.sort_values('day_order', inplace=True)
df_week_day
plt.figure(figsize=(13,4))
plt.fill_between(df_week_day['name_day'], df_week_day['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
plt.xticks(rotation = 90)
plt.show()
#Giai thich : mn di xem nhieu vao cuoi tuan
"""### 3.2.3 Trend by hours"""
#Thong ke theo gio trong ngay
df_hour = (
df_join_all
.groupby(['hour'])
.agg(total_ticket = ('ticket_id', 'count'))
.reset_index()
)
df_hour.head()
plt.figure(figsize=(13,4))
plt.fill_between(df_hour['hour'], df_hour['total_ticket'], color = 'cornflowerblue', alpha = 0.7)
x_values = [i for i in range(24)]
plt.xticks(x_values)
plt.show()
# Vẽ chung trên 1 frame :
plt.figure(figsize=(13, 8))
# chart 1: Tháng
ax1 = plt.subplot(2, 1, 1)
plt.fill_between(df_time_month_dim['year_month'], df_time_month_dim['total_ticket'], color = 'cornflowerblue', alpha=0.7)
plt.title('#ticket by months')
plt.xticks(rotation=90)
# chart 2: ngày
ax2 = plt.subplot(2, 2, 3) #hai hang, hai cot, vi tri thu 3
plt.fill_between(df_week_day['name_day'], df_week_day['total_ticket'], color = 'cornflowerblue', alpha=0.7)
plt.title('#ticket by week day')
plt.xticks(rotation=90)
# chart 3: giờ
ax3 = plt.subplot(2, 2, 4)
plt.fill_between(df_hour['hour'], df_hour['total_ticket'], color = 'cornflowerblue', alpha=0.7)
x_values = [i for i in range(24)]
plt.xticks(x_values)
plt.title('#ticket by hour')
plt.subplots_adjust( hspace = 0.5, top=0.8)
"""## 3.3 Factor related to the customer's purchasing process
### 3.3.1 Payment platform
"""
df_platform = (
df_join_all[df_join_all['platform'] != 'unknown']
.groupby(['platform'])
.agg(total_ticket = ('ticket_id', 'count'))
.reset_index()
)
df_platform
# Biểu đồ cột ngang :
plt.figure(figsize=(8, 4))
plt.barh(
df_platform['platform'], df_platform['total_ticket'],
color = df_platform['platform'].replace({ 'mobile': 'lightskyblue', 'website': 'tomato'})
)
for index,value in enumerate(df_platform['total_ticket']):
plt.text(value,index,str(value))
plt.title('#ticket by platform')
#Bieu do tron :
plt.figure(figsize=(8, 4))
plt.pie(
df_platform['total_ticket'],
labels = df_platform['platform'],
colors = df_platform['platform'].replace({ 'mobile': 'lightskyblue', 'website': 'tomato'}),
autopct = '%.1f%%',
startangle=90
)
plt.show()
#Theo thoi gian :
df_platform_time = (
df_join_all[df_join_all['platform'] != 'unknown']
.groupby(['year_month', 'platform'])
.agg(total_ticket = ('ticket_id', 'count'))
.sort_values(by='year_month', ascending=True)
.reset_index()
)
df_platform_time.head()
#Ve bieu do line cho mobile va website
plt.figure(figsize=(13, 4))
df_mobile_line = df_platform_time[df_platform_time['platform'] == 'mobile']
plt.plot(df_mobile_line['year_month'], df_mobile_line['total_ticket'],
label = 'mobile', color = 'tomato',
marker = 0, linewidth = 2, markersize = 4)
df_website_line = df_platform_time[df_platform_time['platform'] == 'website']
plt.plot(df_website_line['year_month'], df_website_line['total_ticket'],
label = 'website', color = 'lightskyblue',
marker = 0, linewidth = 2, markersize = 4)
plt.legend()
plt.xticks(rotation = 90)
plt.show()
#Phan tich : theo bieu do tron => payment qua mobile chiem da so
#theo bieu do line => payment qua website moi xuat hien va cung chi chiem mot so it %
"""### 3.3.2 OS version"""
#Phan loai thiet bi OS version thanh cac nhom : android, ios, unknown, browser
df_join_all['os_version'] = df_join_all['model'].apply(
lambda
x: 'ios' if ('iPhone' in x or 'iPod' in x)
else 'browser' if x == 'browser'
else 'unknown' if ('devicemodel' in x or 'unknown' in x)
else 'android & other'
)
df_join_all['os_version'].unique()
#Group by de thong ke
df_os = (
df_join_all
.groupby(['os_version'])
.agg(total_ticket = ('ticket_id', 'count'))
.sort_values(by='total_ticket', ascending=True)
.reset_index()
)
df_os
# Biểu đồ cột ngang :
plt.figure(figsize=(12, 3))
ax1 = plt.subplot(1,2,1)
plt.barh(
df_os['os_version'], df_os['total_ticket'],
color = df_os['os_version'].replace({ 'browser': 'lightsteelblue', 'android & other': 'lightskyblue', 'ios': 'cornflowerblue', 'unknown': 'steelblue'})
)
for index,value in enumerate(df_os['total_ticket']):
plt.text(value,index,str(value))
plt.title('#ticket by os')
ax2 = plt.subplot(1,2,2)
plt.pie(df_os['total_ticket'], labels= df_os['os_version'],
colors=df_os['os_version'].replace({ 'browser': 'lightsteelblue', 'android & other': 'lightskyblue', 'ios': 'cornflowerblue', 'unknown': 'steelblue'}),
autopct='%1.0f%%',
startangle=90)
plt.show()
df_os_time = (
df_join_all
.groupby(['year_month', 'os_version'])
.agg(total_ticket = ('ticket_id', 'count'))
.sort_values(by='year_month', ascending=True)
.reset_index()
)
df_os_time.head(10)
#Ve bieu do mien ---> xu ly data dang PIVOT
df_os_time = (
df_join_all
.pivot_table( index= 'year_month', columns ='os_version', aggfunc='count', values='ticket_id')
.reset_index()
)
df_os_time.head(10)
#Ve bieu do mien chong theo thoi gian
plt.figure(figsize=(13,4))
plt.fill_between(df_os_time['year_month'], df_os_time['ios'], color ='cornflowerblue', alpha = 0.5, label = 'ios')
plt.fill_between(df_os_time['year_month'], df_os_time['android & other'], color ='lightskyblue', alpha = 0.5, label = 'android & other')
plt.fill_between(df_os_time['year_month'], df_os_time['browser'], color ='lightsteelblue', alpha = 0.5, label = 'browser')
plt.fill_between(df_os_time['year_month'], df_os_time['unknown'], color ='steelblue', alpha = 0.5, label = 'unknown')
plt.xticks(rotation =90)
plt.show()
#Cot thoi gian van bi thieu range ve thoi gian => join voi bang thoi gian o tren de co day du thoi gian
#Bieu dien chung mot frame
plt.figure(figsize=(12, 8))
ax1 = plt.subplot(2,2,1)
plt.barh(
df_os['os_version'], df_os['total_ticket'],
color = df_os['os_version'].replace({ 'browser': 'lightsteelblue', 'android & other': 'lightskyblue', 'ios': 'cornflowerblue', 'unknown': 'steelblue'})
)
for index,value in enumerate(df_os['total_ticket']):
plt.text(value,index,str(value))
plt.title('#ticket by os')
ax2 = plt.subplot(2,2,2)
plt.pie(df_os['total_ticket'], labels= df_os['os_version'],
colors=df_os['os_version'].replace({ 'browser': 'lightsteelblue', 'android & other': 'lightskyblue', 'ios': 'cornflowerblue', 'unknown': 'steelblue'}),
autopct='%1.0f%%',
startangle=90)
ax3 = plt.subplot(2,1,2)
plt.fill_between(df_os_time['year_month'], df_os_time['ios'], color ='cornflowerblue', alpha = 0.5, label = 'ios')
plt.fill_between(df_os_time['year_month'], df_os_time['android & other'], color ='lightskyblue', alpha = 0.5, label = 'android & other')
plt.fill_between(df_os_time['year_month'], df_os_time['browser'], color ='lightsteelblue', alpha = 0.5, label = 'browser')
plt.fill_between(df_os_time['year_month'], df_os_time['unknown'], color ='steelblue', alpha = 0.5, label = 'unknown')
plt.xticks(rotation =90)
plt.show()
#Mac du unknown chiem kha nhieu nhung chi moi xuat hien gan day => co the la do thay doi cau truc du lieu nen ko co du lieu ghi nhan
# ---> danh gia mot van de theo nhieu goc nhieu : so luong thuc te, ti trong %, theo thoi gian
"""### 3.3.3 Payment method"""
df_method = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['paying_method'] != 'other') ]
.groupby('paying_method')
.agg(total_ticket = ('ticket_id', 'count'))
.sort_values(by='total_ticket', ascending=True)
.reset_index()
)
df_method
#Xu ly data dang Pivot de ve bieu do mien :
df_method_time = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['paying_method'] != 'other') ]
.pivot_table( index= 'year_month', columns ='paying_method', aggfunc='count', values='ticket_id')
.reset_index()
)
df_method_time.head(10)
#Ve bieu do mien 100%
df_method_time = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['paying_method'] != 'other') ]
.pivot_table( index= 'year_month', columns ='paying_method', aggfunc='count', values='ticket_id')
.reset_index()
)
df_method_time_pct = df_method_time.copy()
df_method_time_pct = df_method_time_pct.fillna(0)
df_method_time_pct['total'] = df_method_time_pct.iloc[:, 1:].sum(axis=1)
for i in df_method_time_pct.columns[1:5]:
df_method_time_pct[i+'_pct'] = df_method_time_pct[i]/df_method_time_pct['total']
df_method_time_pct.head(10)
# biểu diễn chung 1 frame
plt.figure(figsize=(12, 8))
ax1 = plt.subplot(3,2,1)
plt.barh(
df_method['paying_method'], df_method['total_ticket'],
color = df_method['paying_method'].replace({ 'bank account': 'lightsteelblue', 'credit card': 'lightskyblue', 'debit card': 'cornflowerblue', 'money in app': 'steelblue'})
)
for index,value in enumerate(df_method['total_ticket']):
plt.text(value,index,str(value))
plt.title('#ticket by method')
ax2 = plt.subplot(3,2,2)
plt.pie(df_method['total_ticket'], labels= df_method['paying_method'],
colors=df_method['paying_method'].replace({ 'bank account': 'lightsteelblue', 'credit card': 'lightskyblue', 'debit card': 'cornflowerblue', 'money in app': 'steelblue'}),
autopct='%1.0f%%',
startangle=90)
ax3 = plt.subplot(3,1,2)
plt.fill_between(df_method_time['year_month'], df_method_time['bank account'], color='cornflowerblue', alpha=0.5, label='bank account')
plt.fill_between(df_method_time['year_month'], df_method_time['credit card'], color='lightskyblue', alpha=0.5, label='credit card')
plt.fill_between(df_method_time['year_month'], df_method_time['debit card'], color='lightsteelblue', alpha=0.5, label='debit')
plt.fill_between(df_method_time['year_month'], df_method_time['money in app'], color='steelblue', alpha=0.5, label='money in app')
plt.title('#ticket of method by time')
# plt.xlabel('Month')
plt.ylabel('#ticket')
plt.legend(loc='upper left')
plt.xticks(rotation=90)
ax4 = plt.subplot(3,1,3)
# vẽ biểu đồ miền 100%
plt.stackplot(df_method_time_pct['year_month'], df_method_time_pct["money in app_pct"], df_method_time_pct['debit card_pct'], df_method_time_pct['credit card_pct'], df_method_time_pct['bank account_pct']
, labels=['money in app', 'debit card', 'credit card', 'bank account'], colors=['royalblue', 'slategrey', 'lightsteelblue', 'cornflowerblue'], alpha=0.7)
plt.title('#ticket of method by time')
# plt.xlabel('Month')
plt.ylabel('#ticket')
plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
plt.xticks(rotation=90)
plt.subplots_adjust(hspace = 0.7, top = 0.9)
#Phan tich : money in app luon chiem ti trong lon, nhat la tu nhung nam gan day
"""### 3.3.4 Promotion"""
df_join_all['campaign_type'].unique()
df_join_all['type']= df_join_all['campaign_type'].apply(
lambda x: 'non-promotion' if x == 'unknown'
else 'promotion'
)
df_join_all.head(2)
df_type = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['paying_method'] != 'other') ]
.groupby('type')
.agg(total_ticket = ('ticket_id', 'count'))
.sort_values(by='total_ticket', ascending=True)
.reset_index()
)
df_type
#xu ly data dang PIVOT de ve bieu do mien :
df_type_time = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['paying_method'] != 'other') ]
.pivot_table( index= 'year_month', columns ='type', aggfunc='count', values='ticket_id')
.reset_index()
)
df_type_time_pct = df_type_time.copy()
df_type_time_pct = df_type_time_pct.fillna(0)
df_type_time_pct['total'] = df_type_time_pct.iloc[:, 1:].sum(axis=1)
for i in df_type_time_pct.columns[1:3]:
df_type_time_pct[i+'_pct'] = df_type_time_pct[i]/df_type_time_pct['total']
df_type_time_pct.head()
# biểu diễn chung 1 frame
plt.figure(figsize=(12, 8))
ax1 = plt.subplot(3,2,1)
plt.barh(
df_type['type'], df_type['total_ticket'],
color = df_type['type'].replace({ 'non-promotion': 'tomato', 'promotion': 'lightskyblue'})
)
for index,value in enumerate(df_type['total_ticket']):
plt.text(value,index,str(value))
plt.title('#ticket by type')
ax2 = plt.subplot(3,2,2)
plt.pie(df_type['total_ticket'], labels= df_type['type'],
colors=df_type['type'].replace({ 'non-promotion': 'tomato', 'promotion': 'lightskyblue'}),
autopct='%1.0f%%',
startangle=90)
ax3 = plt.subplot(3,1,2)
plt.plot(df_type_time['year_month'], df_type_time['non-promotion'],
label = 'non-promotion', color = 'tomato',
marker = 0, linewidth = 2, markersize = 4)
plt.plot(df_type_time['year_month'], df_type_time['promotion'],
label = 'promotion', color = 'lightskyblue',
marker = 0, linewidth = 2, markersize = 4)
plt.title('#ticket of type by time')
# plt.xlabel('Month')
plt.ylabel('#ticket')
plt.legend(loc='upper left')
plt.xticks(rotation=90)
ax4 = plt.subplot(3,1,3)
# vẽ biểu đồ miền 100%
plt.stackplot(df_type_time_pct['year_month'], df_type_time_pct['non-promotion_pct'], df_type_time_pct['promotion_pct']
, labels=['non-promotion', 'promotion'], colors=['tomato', 'lightsteelblue'], alpha=0.7)
plt.title('#ticket of type by time')
# plt.xlabel('Month')
plt.ylabel('#ticket')
plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
plt.xticks(rotation=90)
plt.subplots_adjust(hspace = 0.7, top = 0.9)
#Nhan xet : giai doan sau nay, promotion tang rat nhang
"""### 3.3.5 Which movie they watched ?"""
df_film_sum = (
df_join_all[df_join_all['status_id'] == 1]
.groupby('movie_name')
.agg(total_ticket = ('ticket_id', 'count'),
total_customer = ('customer_id', 'nunique'),
revue = ('final_price', 'sum'))
.sort_values(by='total_ticket', ascending=False)
.reset_index()
)
df_film_sum.head()
#danh sach phim qua dai nen ko the lay duoc het => phai loc ra
list_film = df_film_sum[df_film_sum['total_ticket'] > 1000] ['movie_name'].unique()
list_selected_film = list(list_film)
list_selected_film
df_movie_time_pivot = (
df_join_all[(df_join_all['status_id'] == 1) & (df_join_all['movie_name'].isin(list_selected_film))]
.pivot_table( index= 'year_month', columns ='movie_name', aggfunc='count', values='ticket_id')
.reset_index()
)
df_movie_time_pivot.head(5)
#Biểu đồ cột chồng
ax = df_movie_time_pivot.plot(x = 'year_month', kind='bar', stacked=True, figsize=(15, 6), width=0.8, alpha = 0.7)
# Set the title and labels
ax.set_title('Movie name trend')
# ax.set_xlabel('Month')
ax.set_ylabel('Number of Tickets')
# Add a legend
plt.legend(title='Movies', loc='upper right', bbox_to_anchor=(1.35, 1))
# Show the plot
plt.show()
"""## 3.4 Customer value dimension
Muc tieu : Phan tich cac chi so ve gia tri ma mot KH mang lai
* Frequency : count, day, month
* Monetary : total, money, total_discount
* Success rate : number_success /total
* Promotion rate = number_promotion / total_success
* Discount rate = sum_discount / sum_money
"""
#khi su dung head ma bi ba cham (...) vi nhieu data qua thi phai su dung them hai cau lenh sau de in ra day du
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
df_join_all.head(2)
#Tinh tat ca cac chi so tren
#Tinh chi so cho nhung don hang thanh cong :
def calculate_n_promotion(x):
return (x == 'promotion').sum()
df_success_metric = (
df_join_all[df_join_all['status_id'] == 1]
.assign(date = pd.to_datetime(df_join_all['time']).dt.date) #tao ra them cot date de tinh toan
.groupby('customer_id')
.agg(
n_success = ('ticket_id', 'count'),
s_money = ('original_price', 'sum'),
s_discount = ('discount_value', 'sum'),
n_days = ('date', 'nunique'),
n_month = ('year_month', 'nunique'),
n_promotion = ('type', calculate_n_promotion)
)
.reset_index()
)
df_success_metric.head(2)
#Tinh cac chi so : total va cac giao dich loi
def calculate_n_failed(x):
return (x != 1).sum() #bien status_id != 1
df_failed_metric = (
df_join_all
.groupby('customer_id')
.agg(
n_total = ('ticket_id', 'count'),
n_failed = ('status_id', calculate_n_failed)
)
.reset_index()
)
df_failed_metric.head(2)
#Join hai bang
df_customer_value = (
pd.merge(df_success_metric, df_failed_metric, how = 'left', on='customer_id')
.fillna(0)
)
df_customer_value.head(2)
df_customer_value['success_rate'] = df_customer_value['n_success'] / df_customer_value['n_total']
df_customer_value['promotion_rate'] = df_customer_value['n_promotion'] / df_customer_value['n_success']
df_customer_value['discount_rate'] = df_customer_value['s_discount'] / df_customer_value['s_money']
df_customer_value.head(2)
#Visualize tat ca cac chi so bang Histogram :
df_customer_value.iloc[:, 1:].hist(figsize = (12, 9), grid = False, color = 'cornflowerblue', bins = 20)
plt.show()
""">- **Note:**
>>- **n_total**: Hau het cac KH mua ve rat it (1-2 lan) nhung co top nhung nguoi mua ve rat nhieu (250 ve) => check nhung nguoi nay
>>- **succes_rate**: co khoang 10% success rate = 0, tuc la ko the thanh toan thanh cong => kiem tra loi thanh toan
>>- **promotion_rate**: co hon 60000 KH chi tham gia promotion (rate = 100%)
>>- **n_promotion** : 60000 KH chi huong promotion mot lan (co lien quan gi den nhom o promotion rate = 100% khong? )
### Frequency & anomaly behavior
"""
#Voi n_total : gom nhom lai vi voi bang cu du lieu bi tap trung lech ve trai qua
df_customer_value['n_order_dis'] = df_customer_value['n_success'].apply(lambda x: 'more than 10' if x >=10 else str(x))
df_customer_value.head(2)
df_n_dis = df_customer_value.groupby('n_order_dis').agg(total_cus = ('customer_id', 'count')).reset_index()
df_n_dis
# Biểu đồ cột ngang :
plt.figure(figsize=(8, 4))
plt.barh(
df_n_dis['n_order_dis'], df_n_dis['total_cus']
)
for index,value in enumerate(df_n_dis['total_cus']):
plt.text(value,index,str(value))
#hau het mn chi mua ve 1 lan
#Voi nhung ng mua nhieu : Neu ho mua cung 1 luc => bat thuong. Neu ho mua dan trai => binh thuong
df_customer_value.sort_values(by='n_success', ascending= False).head(20)
#Can xem tan suat giao dich cua nhung nguoi mua tren 30 ve
#Lay ra danh sach nhung nguoi nay
#Join voi bang join_all => tim duoc lich su mua ve cua nhung nguoi nay
List_customer_massive = list(df_customer_value[df_customer_value['n_success'] > 30 ]['customer_id'].unique())
List_customer_massive
df_customer_massvie_pivot = (
df_join_all[df_join_all['customer_id'].isin(List_customer_massive) & (df_join_all['status_id'] == 1)]
.pivot_table(index= 'year_month', columns ='customer_id', aggfunc='count', values ='ticket_id')
.reset_index()
)
df_customer_massvie_pivot.head()
#Biểu đồ cột chồng
ax = df_customer_massvie_pivot.plot(x = 'year_month', kind='bar', stacked=True, figsize=(15, 6), width=0.8, alpha = 0.7)
# Set the title and labels
ax.set_title('Massive customer trend')
# ax.set_xlabel('Month')
ax.set_ylabel('Number of Tickets')
# Add a legend
plt.legend(title='customer_id', loc='upper right', bbox_to_anchor=(1.35, 1))
# Show the plot
plt.show()
#Moi nguoi mua hau het deu rai rac. Co mau xanh duong mua nhieu nhat thi moi thang mua khoang 40 ve => chua co gi bat thuong
#Neu muon tim hieu ki hon thi phan tich ki profil cua nhung nguoi nay
"""### Massive promotion"""
df_customer_value['n_promo_dis'] = df_customer_value['n_promotion'].apply(lambda x: 'more than 10' if x >=10 else str(x))
df_customer_value.head(2)
df_promo_dis = df_customer_value.groupby('n_promo_dis').agg(total_cus = ('customer_id', 'count')).reset_index()
df_promo_dis
plt.figure(figsize=(8, 4))
plt.barh(
df_promo_dis['n_promo_dis'], df_promo_dis['total_cus']
)
for index,value in enumerate(df_promo_dis['total_cus']):
plt.text(value,index,str(value))
#Gan 60% join cac ctrinh KM. Trong khi do 90% la chi huong KM 1 lan
#Vay thi: 1. KH den mot lan roi thoi. 2. Cac ctrinh KM chi cho 1 nguoi dung 1 lan (chi danh cho KH moi) ???
#Minh co thong tin ve promotion => kiem tra thong tin nay
df_type_group = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['type'] == 'promotion')]
.groupby('campaign_type')
.agg(total = ('ticket_id', 'count'))
.reset_index()
)
df_type_group
## Tinh ti le loai Km cho tung nhom KH (nhom 1, 2, 3 KM,...)
df_n_success = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['type'] == 'promotion')]
.groupby('customer_id')
.agg(n_promotion = ('ticket_id', 'count'))
)
df_n_pivot = (
df_join_all[ (df_join_all['status_id'] == 1) & (df_join_all['type'] == 'promotion')]
.pivot_table(index= 'customer_id', columns ='campaign_type', aggfunc='count', values ='ticket_id')
.reset_index()
)