Description
when i am executing the below code ,i am getting
ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer error:
vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])
Python Code:
"""
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher
#Loading Query Text for Corpus Building
qt=pd.read_csv('C:/Demo/Query_Text.csv')
qt.shape
#Loading QueryText for comparing
convid =pd.read_csv('C:/Demo/ConvId_May06.csv')
convid.shape
sentences = qt
convid = convid.sort_values(['User_PUID','EventInfo_Time'], ascending=[True,True])
convid = convid.reset_index()
convid['FastTextResult'] =float()
convid['Tfidf'] = float()
convid['TfidfWc'] = float()
model_ted = FastText(qt, window=1, min_count=1, sg=0)
for i in range(len(convid['Query_Text'])):
print('i',i)
if(i == len(convid)-1):
break
print("The FastText Output")
ft=model_ted.wv.similarity(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower())
ft=round(ft,4)
print(ft)
#convid.loc[convid['FastTextResult'][i+1]]=ft
convid['FastTextResult'][i+1]=ft
print("vector output ")
vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])
product=(tfidf *tfidf.T).A
pro = product[0,1]
pro = round(pro,4)
print(pro)
convid['Tfidf'][i+1]=pro
print("widlchar")
wildchar_value = SequenceMatcher(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()).ratio()
wildchar_value = round(wildchar_value,4)
convid['TfidfWc'][i+1]=wildchar_value
print(wildchar_value)
convid.to_csv('C:/Demo/ConvIdOutput_May061.csv')