-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmp_skin_care_recommender_system.py
251 lines (186 loc) · 9.16 KB
/
mp_skin_care_recommender_system.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# -*- coding: utf-8 -*-
"""MP-Skin Care Recommender System.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1DeJftzXh0OkKHUFylvDFyM1vmcyYvJz8
# Skin Care Product Recommender System Based on Skin Problem
This project is a *skincare* product recommendation system based on the facial problems experienced.
This dataset was scrapped by me from ecommerce or official websites of the product with details :
| Feature Name | Description |
| --- | --- |
|**product_href** | Product URL link |
|**product_name** | Product name |
|**product_type** |Type of product (Facial wash, Toner, Serum, Moisturizer, Sunscreen) |
|**brand** | Product brand |
|**notable_effects** | What it's good for |
|**skintype** | The suitable type of skin for the product (Normal, Dry, Oily, Combination, Sensitive) |
|**price** | Product price (in IDR Rp) |
|**description** | Product description |
|**picture_src** | Product image URL link |
# Import Libraries
"""
# Commented out IPython magic to ensure Python compatibility.
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
import seaborn as sns
import missingno as mno
from sklearn.feature_extraction.text import TfidfVectorizer
"""# Data Exploration"""
skincare = pd.read_csv('https://drive.google.com/uc?export=download&id=19LD3liEpL9ZRi77evf5cK-icR0w_qXCx', low_memory=False)
skincare.info()
skincare.head()
# Check duplicated rows
skincare.duplicated().sum()
# Describe the features
cat = ['product_type', 'brand', 'notable_effects', 'skintype']
skincare[cat].describe()
"""**SUMMARY**
1. In total, there are 1224 products scrapped from websites.
2. Because this data was scrapped and arranged by me, it looks neater and cleaner. There's no null value.
3. Unfortunately, there are 14 duplicate rows. Need to be removed.
4. Of 5 types of products, serum is more hype than others.
5. SOMETHINC is the top brand, which means they have many kinds of products.
6. From many pairs of notable effects, 150 products are good for pore care, brightening, and anti-aging all in one product.
7. Looks like many skin care products suitable for oily skin.
## Exploratory Data Analysis (EDA)
"""
counts_brand = skincare['brand'].value_counts()
count_percentage = skincare['brand'].value_counts(1)*100
counts_dfbrand = pd.DataFrame({'Brand':counts_brand.index,'Counts':counts_brand.values,'Percent%':np.round(count_percentage.values,2)})
top_10_brands = counts_dfbrand.head(10)
top_10_brands
# Brand
plt.figure(figsize=(10, 5))
sns.set(style='white')
ax = sns.barplot(x='Brand', y='Counts', width = 0.6, data=top_10_brands, palette='magma')
ax.set_title('Total Products of Top 10 Brands', fontsize=15, fontweight='bold')
ax.set_xlabel('Brand', fontsize=12, fontweight='medium')
ax.set_ylabel('Total Products', fontsize=12, fontweight='medium')
for label in ax.containers:
ax.bar_label(label, fontweight='medium', fontsize=10)
plt.xticks(rotation = 15, fontsize=10)
plt.show()
# Checking product type
pd_type = pd.DataFrame()
pd_type["Count"] = skincare["product_type"].value_counts()
pd_type["Count%"] = skincare["product_type"].value_counts()/skincare.shape[0]*100
pd_type
fig = px.pie(pd_type, values = 'Count', color = pd_type.index, names = pd_type.index,
color_discrete_sequence = ['#003049','#D62828', '#F77F00', '#FCBF49', '#E9D8A6'], hole = 0.5)
fig.update_traces(textposition = 'outside', textfont = dict(color = 'dark blue', size = 15), textinfo = 'label+percent', pull = [0.2,0,0,0], rotation = 10)
fig.add_annotation (text = 'Product Type', showarrow = False, font = dict(size = 12, color='dark blue'))
fig.update_layout(title={'text':'Skin Care Product Type', 'y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
# Skin Type That Suitable For the Product
counts = skincare['skintype'].value_counts()
count_percentage = skincare['skintype'].value_counts(1)*100
counts_df = pd.DataFrame({'Skin_type':counts.index,'Counts':counts.values,'Percent%':np.round(count_percentage.values,2)})
counts_df
# Skin Type
top_7 = counts_df.head(7)
sns.set(style='white')
fig = px.bar(data_frame=top_7, x='Skin_type', y='Counts',
color='Counts', color_continuous_scale='Inferno_r',
width=800, height=600,
text_auto=True, title=f'Count of Skin Type That Suitable For The Product')
fig.update_layout(plot_bgcolor='#FFFCF2', xaxis_tickangle = 15)
fig.show()
# Creating numerical labels of notable_effects due to many effects shown by the diagram above
skincare['notable_effects']=skincare['notable_effects'].astype('category')
skincare['labels']=skincare['notable_effects'].cat.codes
skincare.head()
# Skin Type That Suitable For the Product
counts_effect = skincare['notable_effects'].value_counts()
count_percentage = skincare['notable_effects'].value_counts(1)*100
counts_effect_df = pd.DataFrame({'Notable_Effects':counts_effect.index,'Counts':counts_effect.values,'Percent%':np.round(count_percentage.values,2)})
counts_effect_df.head(10)
top_5 = counts_effect_df.head(5)
fig = px.pie(top_5, values = 'Counts', color = 'Notable_Effects', names = 'Notable_Effects',
color_discrete_sequence = ['#99582A','#F4ACB7', '#FFCCD5', '#FDFCDC', '#C6AC8F'])
fig.update_traces(textposition = 'inside', textfont = dict(color = 'black', size = 15), textinfo = 'percent')
fig.update_layout(title={'text':'Skin Care Product Notable Effects', 'y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
"""# Data Cleaning"""
# Remove duplicated rows (14 in total)
skincare.drop_duplicates(inplace=True)
# Re-checking the existence of duplicated rows
skincare.duplicated().sum()
"""# Data Preparation
**Encoding Each Skintypes**
"""
# Separate all skintype into one list, considering comma + space as separators
skintype = skincare['skintype'].str.split(', ').tolist()
# Flatten the list
flat_skintype = [item for sublist in skintype for item in sublist]
# Convert to a set to make unique
set_skintype = set(flat_skintype)
# Back to list
unique_skintype = list(set_skintype)
# Create columns by each unique skintype
skincare = skincare.reindex(skincare.columns.tolist() + unique_skintype, axis=1, fill_value=0)
# For each value inside column, update the dummy
for index, row in skincare.iterrows():
for val in row.skintype.split(', '):
if val != 'NA':
skincare.loc[index, val] = 1
skincare.head(5)
"""## Exporting the DataFrame as a CSV file
The next stage is to extract the cleaned and prepared dataset into .csv format, to enter the modeling and deployment stage in the home.py file
"""
skincare.to_csv("export_skincare.csv")
"""# Modelling
### TF-IDF Vectorizer
The TF-IDF Vectorizer will be used in the recommendation system to find a representation of the important features of each notable_effects category. We will use the tfidfvectorizer() function from the sklearn library.
"""
# Modeling with Content Based Filtering
# Initializing TfidfVectorizer
tf = TfidfVectorizer()
# Perform IDF calculation on 'notable_efects' data
tf.fit(skincare['notable_effects'])
# Mapping array from integer index feature to name feature
tf.get_feature_names_out()
# Doing fit then transformed to matrix form
tfidf_matrix = tf.fit_transform(skincare['notable_effects'])
# Viewing matrix size TF IDF
shape = tfidf_matrix.shape
shape
# Convert TF-IDF vector in matrix form with todense() function
tfidf_matrix.todense()
# Making dataframe to see TF-IDF matrix
pd.DataFrame(
tfidf_matrix.todense(),
columns=tf.get_feature_names_out(),
index=skincare.product_name
).sample(shape[1], axis=1).sample(10, axis=0)
"""### Cosine Similarity"""
# Calculating Cosine Similarity on the TF-IDF matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim
# Creating a dataframe from the cosine_similarity variable with rows and columns in the form of product names
cosine_sim_df = pd.DataFrame(cosine_sim, index=skincare['product_name'], columns=skincare['product_name'])
# See the similarity matrix for each product
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)
"""### Getting Recommendation"""
def skincare_recommendations(nama, similarity_data=cosine_sim_df, items=skincare[['product_name', 'notable_effects']], k=5):
# Retrieve data by using argpartition to partition indirectly along a given axis
# Dataframe converted to be numpy
# Range(start, stop, step)
index = similarity_data.loc[:,nama].to_numpy().argpartition(
range(-1, -k, -1))
# Retrieve data with the greatest similarity from the existing index
closest = similarity_data.columns[index[-1:-(k+2):-1]]
# Drop a name so that the name of the product we are looking for doesnt' appear in the list of recommendations
closest = closest.drop(nama, errors='ignore')
return pd.DataFrame(closest).merge(items).head(k)
skincare[skincare.product_name.eq('ELSHE SKIN Radiant Supple Serum')].head()
"""Trying to test the model by getting recommendation"""
# Getting skin care product recommendation which similar to Wardah Renew You Anti Aging Day Cream
skincare_recommendations("ELSHE SKIN Radiant Supple Serum")