Skip to content

ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer #365

Open
@DSKaarthick

Description

@DSKaarthick

when i am executing the below code ,i am getting
ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer error:

vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])

Python Code:
"""

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher

#Loading Query Text for Corpus Building
qt=pd.read_csv('C:/Demo/Query_Text.csv')
qt.shape

#Loading QueryText for comparing

convid =pd.read_csv('C:/Demo/ConvId_May06.csv')

convid.shape

sentences = qt

convid = convid.sort_values(['User_PUID','EventInfo_Time'], ascending=[True,True])
convid = convid.reset_index()
convid['FastTextResult'] =float()
convid['Tfidf'] = float()
convid['TfidfWc'] = float()

model_ted = FastText(qt, window=1, min_count=1, sg=0)

for i in range(len(convid['Query_Text'])):
print('i',i)
if(i == len(convid)-1):
break

print("The FastText Output")

ft=model_ted.wv.similarity(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower())
ft=round(ft,4)
print(ft)
#convid.loc[convid['FastTextResult'][i+1]]=ft
convid['FastTextResult'][i+1]=ft

print("vector output ")

vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])
product=(tfidf *tfidf.T).A
pro = product[0,1]
pro = round(pro,4)
print(pro)
convid['Tfidf'][i+1]=pro

print("widlchar")
wildchar_value = SequenceMatcher(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()).ratio()

wildchar_value = round(wildchar_value,4)

convid['TfidfWc'][i+1]=wildchar_value
print(wildchar_value)

convid.to_csv('C:/Demo/ConvIdOutput_May061.csv')

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions