mccalum/naivebayes.py

#!/sw/bin/python

import math
import sys
import glob
import pickle
from dicts import DefaultDict

# In the documentation and variable names below "class" is the same
# as "category"

def naivebayes (dirs):
    """Train and return a naive Bayes classifier.  
    The datastructure returned is an array of tuples, one tuple per
    class; each tuple contains the class name (same as dir name)
    and the multinomial distribution over words associated with
    the class"""
    classes = []
    for dir in dirs:
	print dir
	countdict = files2countdict(glob.glob(dir+"/*"))
	# Here turn the "countdict" dictionary of word counts into
	# into a dictionary of smoothed word probabilities
	classes.append((dir,countdict))
    return classes

def classify (classes, filename):
    """Given a trained naive Bayes classifier returned by naivebayes(), and
    the filename of a test document, d, return an array of tuples, each
    containing a class label; the array is sorted by log-probability 
    of the class, log p(c|d)"""
    answers = []
    print 'Classifying', filename
    for c in classes:
	score = 0
	for word in open(filename).read().split():
	    word = word.lower()
	    score += math.log(c[1].get(word,1))
	answers.append((score,c[0]))
    answers.sort()
    return answers

def files2countdict (files):
    """Given an array of filenames, return a dictionary with keys
    being the space-separated, lower-cased words, and the values being
    the number of times that word occurred in the files."""
    d = DefaultDict(0)
    for file in files:
	for word in open(file).read().split():
	    d[word.lower()] += 1
    return d
	

if __name__ == '__main__':
    print 'argv', sys.argv
    print "Usage:", sys.argv[0], "classdir1 classdir2 [classdir3...] testfile"
    dirs = sys.argv[1:-1]
    testfile = sys.argv[-1]
    nb = naivebayes (dirs)
    print classify(nb, testfile)
    pickle.dump(nb, open("classifier.pickle",'w'))