Skip to content

Commit 633b425

Browse files
Add files via upload
1 parent 362a776 commit 633b425

File tree

4 files changed

+1872
-0
lines changed

4 files changed

+1872
-0
lines changed

__pycache__/func_file.cpython-310.pyc

3.22 KB
Binary file not shown.

app.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import streamlit as st
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from func_file import *
from string import punctuation
from nltk.corpus import stopwords

##### Data preparation (runs on every Streamlit rerun) #####
df = pd.read_csv('prog_book.csv')
import nltk
nltk.download('stopwords')
stop = stopwords.words('english')
stop = set(stop)

# Cleaned text columns (clean_text comes from func_file).
df['clean_Book_title'] = df['Book_title'].apply(clean_text)
df['clean_Description'] = df['Description'].apply(clean_text)

# TF-IDF over cleaned titles; lowercase=False because clean_text
# already lowercased the text.
vectorizer = TfidfVectorizer(analyzer='word', lowercase=False)
X = vectorizer.fit_transform(df['clean_Book_title'])
title_vectors = X.toarray()

searched = False  # set True once the user presses "Search"
st.title('Book Recommendation System')

col1, col2 = st.columns(2)
# Taking book name as input (used to locate the target row)
with col1:
    book = st.text_input('Enter book name that you liked : ')

# Taking the field to show in the recommendation table
with col2:
    feat = st.selectbox("Select Mode : ", ['Book_title', 'Rating', 'Price'])
if st.button('Search'):
    searched = True

if searched:
    st.success('Recommending books similar to ' + book)
    # BUG FIX: the original created an st.empty() placeholder and immediately
    # rebound the variable to st.dataframe(...), leaving an unused empty
    # element in the page; the placeholder is removed.
    st.dataframe(get_recommendations(book, 'Book_title', df, title_vectors, feat),
                 width=700, height=76)

func_file.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# %%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# %%
# Dataset of top-rated programming books; the CSV must be in the working
# directory (originally the Kaggle dataset referenced below).
df = pd.read_csv('prog_book.csv')
#/kaggle/input/top-270-rated-computer-science-programing-books/

# %% [markdown]
# Let's take a look at our programming books dataset:

# %%
df.head()

# %% [markdown]
# We can use "Book_title" and "Description" columns to find books similar to each other.
# %% [markdown]
# # Text preprocessing

# %%
import nltk
# NOTE: runs a network download of the stopword corpus at import time.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Set of stopwords to remove — a set gives O(1) membership tests
# in remove_stopwords below.
stop = set(stop)

# Set of punctuation signs to remove
from string import punctuation

# %% [markdown]
# We'll be using this small set of functions for text preprocessing:

# %%
import re
def lower(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
64+
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character deleted."""
    # Map each character in string.punctuation to None in one C-level pass.
    table = str.maketrans('', '', punctuation)
    return text.translate(table)
67+
def remove_stopwords(text):
    """Return *text* with every word found in the module-level `stop` set removed.

    Words are split on whitespace and rejoined with single spaces, so runs of
    whitespace are collapsed as a side effect.
    """
    kept = (word for word in str(text).split() if word not in stop)
    return " ".join(kept)
70+
# Strip contiguous runs of digits.  NOTE(review): despite the original
# comment, this does NOT remove whole words containing digits —
# "abc123" becomes "abc", only standalone digit runs vanish entirely.
_digit_run = re.compile(r'\d+')

def remove_digits(text):
    """Return *text* with every contiguous run of digits deleted."""
    return _digit_run.sub('', text)
74+
# One function to clean it all
def clean_text(text):
    """Run the full cleaning pipeline over *text*.

    Applies, in order: lowercasing, punctuation removal, stopword removal,
    digit-run removal, and returns the cleaned string.
    """
    for step in (lower, remove_punctuation, remove_stopwords, remove_digits):
        text = step(text)
    return text
82+
# %% [markdown]
# And then, we'll create new columns with cleaned "Book_title" and "Description" texts:

# %%
# New column: cleaned titles (lowercased, punctuation/stopwords/digits removed).
df['clean_Book_title']=df['Book_title'].apply(clean_text)
df.head()


# %%
# New column: cleaned descriptions, same pipeline.
df['clean_Description']=df['Description'].apply(clean_text)
df.head()

# %% [markdown]
# # Creating features
# Now, we need to transform text from "Book_title" to vectors array:

# %%
# Initializing vectorizer.
# lowercase=False because clean_text already lowercased the text.
vectorizer = TfidfVectorizer(analyzer='word', lowercase=False)

# Applying vectorizer to clean text
X = vectorizer.fit_transform(df['clean_Book_title'])

# Getting array with vectorized titles (dense, one row per book)
title_vectors = X.toarray()
title_vectors

# %% [markdown]
# Let's do the same with "Description" column:

# %%
# A separate vectorizer: descriptions have their own vocabulary,
# so title and description vectors live in different spaces.
desc_vectorizer = TfidfVectorizer(analyzer='word', lowercase=False)
Y = desc_vectorizer.fit_transform(df['clean_Description'])
desc_vectors = Y.toarray()
desc_vectors

# %% [markdown]
# And now we have two arrays of vectors ready for work.

# %%
# List of titles for use
# df['Book_title'].tolist()

# %% [markdown]
# # Recommendation system
#

# %%
def get_recommendations(value_of_element, feature_locate, df, vectors_array, feature_show):
    """Return a one-row DataFrame with the target's *feature_show* value and the
    same feature of up to five objects most similar to it.

    value_of_element - unique value of target object
    feature_locate - name of the feature which this unique value belongs to
    df - DataFrame with features (assumed to have a default RangeIndex)
    vectors_array - array of vectorized text used to find similarity
    feature_show - feature that will be shown in final DataFrame

    Raises IndexError if *value_of_element* is not present in df[feature_locate].
    """
    # Locating target element by its specific value
    index_of_element = df[df[feature_locate] == value_of_element].index.values[0]

    # Finding its value to show
    show_value_of_element = df.iloc[index_of_element][feature_show]

    # Dropping target element from df so it cannot recommend itself
    df_without = df.drop(index_of_element).reset_index(drop=True)

    # Dropping target element from vectors array (positional, like the
    # original list.pop)
    vectors_array = np.asarray(vectors_array)
    target = vectors_array[index_of_element].reshape(1, -1)
    others = np.delete(vectors_array, index_of_element, axis=0)

    # Cosine similarity between target and every remaining vector, computed
    # with numpy: dot(u, v) / (|u| * |v|).  Equivalent to sklearn's
    # cosine_similarity; zero-norm vectors are guarded to yield similarity 0.
    norms = np.linalg.norm(others, axis=1) * np.linalg.norm(target)
    norms = np.where(norms == 0, 1.0, norms)
    most_similar = (others @ target.ravel()) / norms

    # Sorting coefs in desc order (best match first)
    idx = (-most_similar).argsort()

    # BUG FIX: the original looped `for index in idx:` and reassigned
    # `simular = all_values.values[idx]` on every iteration — the same
    # assignment len(idx) times; a single assignment suffices.
    simular = df_without[[feature_show]].values[idx]

    # BUG FIX: the original indexed simular[0]..simular[4] unconditionally and
    # crashed (IndexError) with fewer than 6 books; cap at 5 recommendations.
    row = {feature_show: show_value_of_element}
    for rank, value in enumerate(simular[:5], start=1):
        row["rec_" + str(rank)] = value[0]

    recommendations_df = pd.DataFrame(row, index=[0])
    return recommendations_df
174+
175+
# %% [markdown]
# Ok, let's find books similar to "Algorithms" book based on the title:

# %%
get_recommendations("Algorithms", 'Book_title', df, title_vectors, 'Book_title')

# %% [markdown]
# We can also look at their prices:

# %%
get_recommendations("Algorithms", 'Book_title', df, title_vectors, 'Price')

# %% [markdown]
# Or ratings:

# %%
get_recommendations("Algorithms", 'Book_title', df, title_vectors, 'Rating')

# %% [markdown]
# Now, let's find books similar to "Algorithms" book based on the description:

# %%
get_recommendations("Algorithms", 'Book_title', df, desc_vectors, 'Book_title')

# %% [markdown]
# As you can see, recommendations based on description are different from title-based recommendations in some ways.

# %%
get_recommendations("Unity in Action", 'Book_title', df, desc_vectors, 'Book_title')


# %%
get_recommendations("Unity in Action", 'Book_title', df, title_vectors, 'Book_title')

# %% [markdown]
# We can also access some book by any unique value, for example, by number of reviews (or, more logically, ID of the book, if there's some):

# %%
# NOTE(review): "1,406" is a value from the Reviews column (stored as a string
# with a thousands comma); the lookup only works if it is unique there.
get_recommendations("1,406", 'Reviews', df, title_vectors, 'Book_title')


# %%
get_recommendations("The Information: A History, a Theory, a Flood", 'Book_title', df, title_vectors, 'Book_title')


# %%

0 commit comments

Comments
 (0)