A Python Implementation of Incremental Naive Bayes Learning

The code comes first; a fuller explanation will be added later.
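One note before the code: it relies on a thin wrapper around the NLPIR/ICTCLAS Chinese word segmenter (from nlpir import seg), and all it assumes is that segmentor.seg(text).split() returns a list of tokens. If you only want to run the English toy example in the __main__ block without NLPIR installed, a hypothetical whitespace stand-in such as the following can be pasted in place of the "from nlpir import seg" line; it is a sketch for experimentation, not part of the real NLPIR API.

[python]
# Hypothetical stand-in for the NLPIR wrapper: paste this in place of
# "from nlpir import seg".  It echoes the text back unchanged, which is
# enough because NaiveBayes only calls segmentor.seg(text).split() and the
# toy strings are already whitespace-separated English.
class _WhitespaceSeg(object):
    def seg(self, text):
        return text

class seg(object):          # mimic the interface of the nlpir "seg" module
    Seg = _WhitespaceSeg
[/python]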

[python]
# -*- coding:utf-8 -*-
'''
Created on 2014-04-28

@author: Howard
'''
import sys

sys.path.append("../")
from nlpir import seg
import numpy
import cPickle
import os


class NaiveBayes:
    def __init__(self):
        self.segmentor = seg.Seg()

    def createVocabList(self, dataSet):
        "Build the vocabulary list."
        # (1) start from an empty set
        self.vocabSet = set([])
        for document in dataSet:
            # (2) take the union with the words of each document
            self.vocabSet = self.vocabSet | set(document)
        self.vocabList = list(self.vocabSet)

    '''
    def addATrainDoc(self, doc):
        self.vocabSet = self.vocabSet | set(doc)
        self.vocabList = list(self.vocabSet)
    '''

    def bagOfWords2VecMN(self, inputSet):
        # Each slot holds the count of the corresponding vocabulary word,
        # so the components of the vector sum to the document's word count.
        returnVec = [0] * len(self.vocabList)
        for word in inputSet:
            if word in self.vocabList:
                returnVec[self.vocabList.index(word)] += 1
        return returnVec

    def trainNB0(self, trainMatrix, trainCategory):
        numTrainDocs = len(trainMatrix)

        # keep the total document count; needed later for incremental updates
        self.documentNum = numTrainDocs

        numWords = len(trainMatrix[0])
        # spam documents are labelled 1, so summing the labels and dividing by
        # the number of documents gives the spam prior
        p1 = sum(trainCategory) / float(numTrainDocs)
        # (1) initialise the counts with ones: add-one (Laplace) smoothing
        p0Num = numpy.ones(numWords)
        p1Num = numpy.ones(numWords)
        p0Denom = 0.0
        p1Denom = 0.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                # (2) accumulate the word-count vectors per class
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])

        # keep the per-class word totals; needed later for incremental updates
        self.p1wordNum = p1Denom
        self.p0wordNum = p0Denom

        # add-one smoothing: the denominator is the class word total plus |V|
        p1Vect = numpy.log(p1Num / (p1Denom + len(self.vocabList)))
        p0Vect = numpy.log(p0Num / (p0Denom + len(self.vocabList)))
        #self.p1VectDict = dict(zip(self.vocabList, p1Vect))  # per-word log probabilities in class 1
        #self.p0VectDict = dict(zip(self.vocabList, p0Vect))  # per-word log probabilities in class 0
        self.vectDict = {}
        for i in range(len(self.vocabList)):
            self.vectDict[self.vocabList[i]] = {'p1': p1Vect[i], 'p0': p0Vect[i]}
        self.p1 = p1
        self.p0 = 1 - p1
        return p0Vect, p1Vect, p1

    def classifyNB(self, words):
        """
        words: the token list of the document to classify.
        Returns (label, log-score of class 1, log-score of class 0).
        """
        # (1) work in log space: summing log probabilities replaces multiplying them
        p1 = numpy.log(self.p1)
        p0 = numpy.log(self.p0)
        # words unseen in training get the add-one probability 1 / (class word total + |V|)
        v1Values = [self.vectDict.get(word, {'p1': numpy.log(1.0 / (self.p1wordNum + len(self.vectDict)))})['p1']
                    for word in words]
        v0Values = [self.vectDict.get(word, {'p0': numpy.log(1.0 / (self.p0wordNum + len(self.vectDict)))})['p0']
                    for word in words]
        p1 += numpy.sum(v1Values)
        p0 += numpy.sum(v0Values)
        if p1 > p0:
            return 1, p1, p0
        else:
            return 0, p1, p0

    def _calcMostFreq(self, fullText):
        # return the 30 most frequent vocabulary words in fullText
        import operator

        freqDict = {}
        for token in self.vocabList:
            freqDict[token] = fullText.count(token)
        sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
        return sortedFreq[:30]

    def saveModel(self, path):
        # pickle the model parameters; binary mode for portability
        outputfile = open(path, "wb")
        cPickle.dump(self.p0, outputfile)
        cPickle.dump(self.p1, outputfile)
        cPickle.dump(self.vectDict, outputfile)
        cPickle.dump(self.documentNum, outputfile)
        cPickle.dump(self.p1wordNum, outputfile)
        cPickle.dump(self.p0wordNum, outputfile)
        outputfile.close()

    def loadModel(self, path):
        if os.path.isfile(path):
            inputfile = open(path, 'rb')
            self.p0 = cPickle.load(inputfile)
            self.p1 = cPickle.load(inputfile)
            self.vectDict = cPickle.load(inputfile)
            self.documentNum = cPickle.load(inputfile)
            self.p1wordNum = cPickle.load(inputfile)
            self.p0wordNum = cPickle.load(inputfile)
            inputfile.close()
            return True
        else:
            return False

    def increase(self, text, lable):
        '''
        Incremental Bayesian learning: fold one labelled document into the model.
        '''
        words = self.segmentor.seg(text).split()
        wordslength = len(words)
        if lable == 1:
            p1wordslength = wordslength
            p0wordslength = 0
        else:
            p1wordslength = 0
            p0wordslength = wordslength
        # update the class priors, step 1: rescale both priors by N / (N + 1)
        self.p0 = self.documentNum * self.p0 / (self.documentNum + 1)
        self.p1 = self.documentNum * self.p1 / (self.documentNum + 1)

        # number of distinct, previously unseen words in this document
        newwordscount = len([w for w in set(words) if w not in self.vectDict])
        dictlength = len(self.vectDict)

        def updateFrequency1(key):
            # the word does not occur in the new document: only the smoothing
            # denominator grows, so rescale the stored class-1 probability
            self.vectDict[key]['p1'] += numpy.log(
                (self.p1wordNum + dictlength) / (self.p1wordNum + (dictlength + newwordscount) + p1wordslength))

        def updateFrequency0(key):
            self.vectDict[key]['p0'] += numpy.log(
                (self.p0wordNum + dictlength) / (self.p0wordNum + (dictlength + newwordscount) + p0wordslength))

        if lable == 1:
            self.p1 += 1.0 / (self.documentNum + 1)  # update the class priors, step 2
            # rescale every vocabulary word that does not occur in the new document
            map(updateFrequency1, filter(lambda key: key not in words, self.vectDict.keys()))
            if newwordscount != 0:  # new words enlarge |V|, so the other class must be rescaled too
                map(updateFrequency0, self.vectDict.keys())

            # update the words that do occur in the new document
            for word in set(words):
                if word in self.vectDict:
                    self.vectDict[word]['p1'] = numpy.log(
                        ((self.p1wordNum + dictlength) * numpy.exp(self.vectDict[word]['p1']) + words.count(word)) / (
                            self.p1wordNum + (dictlength + newwordscount) + p1wordslength))
                else:
                    self.vectDict[word] = {}
                    # the +1 keeps the estimate consistent with the add-one smoothing used in trainNB0
                    self.vectDict[word]['p1'] = numpy.log((words.count(word) + 1.0) / (
                        self.p1wordNum + (dictlength + newwordscount) + p1wordslength))
                    self.vectDict[word]['p0'] = numpy.log(
                        1.0 / (self.p0wordNum + (dictlength + newwordscount) + p0wordslength))
            self.p1wordNum += wordslength

        else:
            self.p0 += 1.0 / (self.documentNum + 1)  # update the class priors, step 2
            # rescale every vocabulary word that does not occur in the new document
            map(updateFrequency0, filter(lambda key: key not in words, self.vectDict.keys()))
            if newwordscount != 0:  # new words enlarge |V|, so the other class must be rescaled too
                map(updateFrequency1, self.vectDict.keys())

            # update the words that do occur in the new document
            for word in set(words):
                if word in self.vectDict:
                    self.vectDict[word]['p0'] = numpy.log(
                        ((self.p0wordNum + dictlength) * numpy.exp(self.vectDict[word]['p0']) + words.count(word)) / (
                            self.p0wordNum + (dictlength + newwordscount) + p0wordslength))
                else:
                    self.vectDict[word] = {}
                    # the +1 keeps the estimate consistent with the add-one smoothing used in trainNB0
                    self.vectDict[word]['p0'] = numpy.log((words.count(word) + 1.0) / (
                        self.p0wordNum + (dictlength + newwordscount) + p0wordslength))
                    self.vectDict[word]['p1'] = numpy.log(
                        1.0 / (self.p1wordNum + (dictlength + newwordscount) + p1wordslength))
            self.p0wordNum += wordslength
        self.documentNum += 1

    def train(self, texts, lables):
        wordMatrix = []
        fullText = []
        for text in texts:
            words = self.segmentor.seg(text).split()
            wordMatrix.append(words)
            fullText.extend(words)
        self.createVocabList(wordMatrix)
        '''
        top30Words = self._calcMostFreq(fullText)
        for pairW in top30Words:
            if pairW[0] in self.vocabList: self.vocabList.remove(pairW[0])
        '''
        trainMatrix = []
        for docs in wordMatrix:
            trainMatrix.append(self.bagOfWords2VecMN(docs))
        self.trainNB0(numpy.array(trainMatrix), numpy.array(lables))

    def classify(self, text):
        words = self.segmentor.seg(text).split()
        return self.classifyNB(words)

if __name__ == '__main__':
    texts = [u'i am spam 1', u'i am not spam 2', u'i am spam 2', u'i am not spam 3']
    #texts = [u'i am spam 1', u'i am not spam 1', u'', u'']
    lables = [1, 0, 1, 0]

    # bayes1: ordinary batch training on all four documents
    bayes1 = NaiveBayes()
    bayes1.train(texts, lables)
    print 'testing spam'
    psum = 0
    for (k, v) in bayes1.vectDict.items():
        psum += numpy.exp(v['p1'])
        print k, v['p1']
    print psum
    print 'testing common'
    psum = 0
    for (k, v) in bayes1.vectDict.items():
        psum += numpy.exp(v['p0'])
        print k, v['p0']
    print psum

    # bayes2: train on the first two documents, then add the other two
    # incrementally; its word probabilities should match those of bayes1
    bayes2 = NaiveBayes()
    bayes2.train(texts[:2], lables[:2])
    bayes2.increase(u'i am spam 2', 1)
    bayes2.increase(u'i am not spam 3', 0)
    #bayes2.increase(u'', 1)
    #bayes2.increase(u'', 0)
    print 'testing spam'
    psum = 0
    for (k, v) in bayes2.vectDict.items():
        psum += numpy.exp(v['p1'])
        print k, v['p1']
    print psum
    print 'testing common'
    psum = 0
    for (k, v) in bayes2.vectDict.items():
        psum += numpy.exp(v['p0'])
        print k, v['p0']
    print psum
[/python]
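In the meantime, here is a short sketch of the update that increase() implements; the notation is introduced only for this note. Let N be the number of documents seen so far, N_c the total word count of class c (stored as p0wordNum and p1wordNum), |V| the vocabulary size, and n_c(w) the number of times word w has occurred in class c. trainNB0 stores the add-one smoothed estimates

\[ P(c) = \frac{\#\{\text{documents of class } c\}}{N}, \qquad P(w \mid c) = \frac{n_c(w) + 1}{N_c + |V|}. \]

When a new document with label c and L tokens arrives, of which V_+ distinct words are previously unseen, the class priors are updated in two steps (rescale, then add the new document's contribution):

\[ P_{\text{new}}(c') = \frac{N \, P_{\text{old}}(c')}{N + 1} + \frac{[\,c' = c\,]}{N + 1}. \]

Every stored word probability is then rescaled to the enlarged denominator, and the counts n_d(w) of words occurring in the new document are added on the labelled side:

\[ P_{\text{new}}(w \mid c) = \frac{(N_c + |V|) \, P_{\text{old}}(w \mid c) + n_d(w)}{N_c + L + |V| + V_+}, \qquad P_{\text{new}}(w \mid \bar{c}) = \frac{(N_{\bar{c}} + |V|) \, P_{\text{old}}(w \mid \bar{c})}{N_{\bar{c}} + |V| + V_+}. \]

A word never seen before gets the numerator n_d(w) + 1 in class c and 1 in the other class, matching the add-one convention. Since (N_c + |V|) P_old(w|c) is exactly n_c(w) + 1, the updated model coincides with what trainNB0 would produce if retrained from scratch on the enlarged corpus; the __main__ block checks this by printing the word probabilities of bayes1 (batch training on four documents) next to those of bayes2 (batch training on two documents followed by two incremental updates). All probabilities are kept in log space, which is why the code takes exp() of the stored values before renormalising and log() afterwards.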

Reference: http://www.ituring.com.cn/article/32338
