ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer




when i am executing the below code ,i am getting 
ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer error:

 vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
    tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])


Python Code:
"""

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim 
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher

#Loading Query Text for Corpus Building
qt=pd.read_csv('C:/Demo/Query_Text.csv')
qt.shape

#Loading QueryText for comparing

convid =pd.read_csv('C:/Demo/ConvId_May06.csv')

convid.shape


sentences = qt


convid = convid.sort_values(['User_PUID','EventInfo_Time'], ascending=[True,True])
convid = convid.reset_index()
convid['FastTextResult'] =float()
convid['Tfidf'] = float()
convid['TfidfWc'] = float()


model_ted = FastText(qt,  window=1, min_count=1, sg=0)
 


for i in range(len(convid['Query_Text'])):
    print('i',i)
    if(i == len(convid)-1):
            break
    
    print("The FastText Output")
    
    ft=model_ted.wv.similarity(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower())
    ft=round(ft,4)
    print(ft)
    #convid.loc[convid['FastTextResult'][i+1]]=ft
    convid['FastTextResult'][i+1]=ft
    
    print("vector output ")
    
    vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
    tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])
    product=(tfidf *tfidf.T).A
    pro = product[0,1]
    pro = round(pro,4)
    print(pro)
    convid['Tfidf'][i+1]=pro
    
    print("widlchar")
    wildchar_value = SequenceMatcher(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()).ratio()
    
    wildchar_value = round(wildchar_value,4)

    convid['TfidfWc'][i+1]=wildchar_value
    print(wildchar_value)

convid.to_csv('C:/Demo/ConvIdOutput_May061.csv')



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer #365

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer #365

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions