algorithm - An issue of a SPAM filter with Naive Bayes Classification in python -
i'm trying write spam filter naive bayes classification program in python(3.6). code has 4 parts showed below:
- import same necessaire package, import tranning dataset , test dataset , use sparse command transfer datasets format can handle better
- define trainning function function
- define testfunction
- call 2 function
for first 3 parts of code, can runned in jupyter notebook well, stuck in execution of test function
first part code
#import packages import numpy np import pandas pd scipy import sparse scipy import stats #transfer trainning dataset data = pd.read_csv("emails-train-features.txt",' ',header=none) maillabel = pd.read_csv("emails-train-labels.txt",' ',header=none) maillabel.columns = ['category'] row = data.iloc[:,0] row = np.array(row) col = data.iloc[:,1] col = np.array(col) worddata = data.iloc[:,2] worddata = np.array(worddata) data = sparse.coo_matrix((worddata,(row,col)),shape=(701,2501)) data = data.toarray() c = pd.dataframe(data).iloc[1:,1:] c = c.join(maillabel) data = c data['category'] = data['category'].replace(0,'nospam') data['category'] = data['category'].replace(1,'spam') #transfer test dataset spam_test = pd.read_csv("emails-test-features.txt",' ',header=none) row = spam_test.iloc[:,0] row = np.array(row) col = spam_test.iloc[:,1] col = np.array(col) worddata = spam_test.iloc[:,2] worddata = np.array(worddata) spam_test = sparse.coo_matrix((worddata,(row,col)),shape=(701,2501)) spam_test = spam_test.toarray() d = pd.dataframe(spam_test).iloc[1:,1:] spam_test = d second part code
classes=['spam', 'nospam'] # data structure class emaildata: def __init__(self,c,p,m,s): self.classes=c self.prior=p self.means=m self.stdev=s # training phase def spamfiltertrain(data,classes): means = dict() stdev = dict() prior = dict() c in classes: means[c]=data[data.category==c].mean() stdev[c]=data[data.category==c].std() prior[c]=len(data[data.category==c])/len(data) return emaildata(classes, prior, means, stdev) third part code
#prediction phase def spamfiltertest(classifier,x): logscores = dict() bestscore = -float(np.inf) bestclass = classifier.classes[0] finalscore = np.ones(x.shape[0], dtype=np.float64) finalclass = list(range(x.shape[0])) in range(x.shape[0]): c in classes: logscores[c] = np.log(classifier.prior[c]) var in range(x.shape[1]): logscores[c] += np.log(stats.norm(classifier.means[c][var],classifier.stdev[c][var]).pdf(x.ioc[i,var])) if (logscores[c] > 0): bestscore = logscores[c] bestclass = c finalscore[i] = bestscore finalclass[i] = bestclass return (finalclass,finalscore) fourth part code
myspam = spamfiltertrain(data, classes) spamfiltertest(myspam,spam_test) when run code spamfiltertest(myspam,spam_test), generates issue don't understand below.
issue
keyerror traceback (most recent call last) <ipython-input-19-b8cb97dbd2a1> in <module>() ----> 1 spamfiltertest(myspam,range(3)) 2 #spamfiltertest(myspam,spam_test) <ipython-input-17-db644fe629ee> in spamfiltertest(classifier, x) 44 #p(s) 45 var in range(len(x)): ---> 46 logscores[c] += np.log(stats.norm(classifier.means[c][var],classifier.stdev[c][var]).pdf(x[var])) 47 if (logscores[c]>0): 48 bestscore=logscores[c] /users/yuanhaoran/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key) 601 key = com._apply_if_callable(key, self) 602 try: --> 603 result = self.index.get_value(self, key) 604 605 if not is_scalar(result): /users/yuanhaoran/anaconda/lib/python3.6/site-packages/pandas/indexes/base.py in get_value(self, series, key) 2167 try: 2168 return self._engine.get_value(s, k, -> 2169 tz=getattr(series.dtype, 'tz', none)) 2170 except keyerror e1: 2171 if len(self) > 0 , self.inferred_type in ['integer', 'boolean']: pandas/index.pyx in pandas.index.indexengine.get_value (pandas/index.c:3557)() pandas/index.pyx in pandas.index.indexengine.get_value (pandas/index.c:3240)() pandas/index.pyx in pandas.index.indexengine.get_loc (pandas/index.c:4279)() pandas/src/hashtable_class_helper.pxi in pandas.hashtable.pyobjecthashtable.get_item (pandas/hashtable.c:13742)() pandas/src/hashtable_class_helper.pxi in pandas.hashtable.pyobjecthashtable.get_item (pandas/hashtable.c:13696)() keyerror: 0
Comments
Post a Comment