一. The principle of using Bayes' theorem for classification:
Bayes' theorem turns class-conditional probabilities into posterior probabilities: given a feature vector X, we compute P(c|X) for every class c and assign X to the class with the largest posterior.
二. The two assumptions of naive Bayes:
Statistics tells us that if estimating one feature requires N samples, then 10 features require N^10 samples, and a vocabulary of 1000 features requires N^1000 samples. The number of samples needed grows exponentially with the number of features.
If the features are mutually independent, the required sample count drops from N^1000 to 1000*N. Independence here means that the probability of one feature (word) appearing has nothing to do with which other words appear next to it. This is one assumption of naive Bayes; the other is that every feature (word) is equally important.
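To see where this saving comes from, a sketch of the standard factorization (my addition, not in the original text): under the independence assumption the joint likelihood splits into a product,
P(w1, w2, ..., w1000 | c) = P(w1|c) * P(w2|c) * ... * P(w1000|c)
so instead of estimating one 1000-dimensional joint distribution, we estimate 1000 one-dimensional distributions, each of which only needs on the order of N samples.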
A naive Bayes classifier can be implemented in two ways: 1. based on the Bernoulli model, 2. based on the multinomial model.
(The Bernoulli approach does not consider how many times a word occurs in a document, only whether it occurs at all, so in that sense it treats every word as equally weighted.)
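For reference, both variants ship with scikit-learn; a minimal sketch (scikit-learn is my addition here, the original post builds the Bernoulli model by hand):

# Minimal sketch, assuming scikit-learn is installed; BernoulliNB only looks at
# whether a count is non-zero, MultinomialNB uses the counts themselves.
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

X = [[2, 0, 1], [0, 3, 0], [1, 1, 4]]  # toy document-word count matrix
y = [0, 1, 0]                          # toy labels
bnb = BernoulliNB().fit(X, y)          # binarizes counts to presence/absence
mnb = MultinomialNB().fit(X, y)        # keeps the raw counts
print(bnb.predict([[1, 0, 0]]), mnb.predict([[1, 0, 0]]))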
三. Bernoulli model implementation:
1. Building word vectors:
def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]  # 1: abusive, 0: normal
    return postingList,classVec

def createVocabList(dataSet):
    vocabSet = set()  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets, deduplicating
    return list(vocabSet)

def setOfWords2Vec(vocabList,inputSet):  # convert a document into a simple word vector
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

listPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listPosts)
print(myVocabList)
print(setOfWords2Vec(myVocabList,listPosts[0]))
print(setOfWords2Vec(myVocabList,listPosts[1]))
print(setOfWords2Vec(myVocabList,listPosts[2]))
print(setOfWords2Vec(myVocabList,listPosts[3]))
print(setOfWords2Vec(myVocabList,listPosts[4]))
print(setOfWords2Vec(myVocabList,listPosts[5]))
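The six posts contain 32 distinct words in total, so the vocabulary has length 32 and so does every word vector; a quick sanity check (my addition, not in the original):

# Each vector is as long as the vocabulary, and its 1s mark exactly the
# distinct vocabulary words present in the post.
assert len(myVocabList) == 32
vec0 = setOfWords2Vec(myVocabList, listPosts[0])
assert len(vec0) == len(myVocabList)
assert sum(vec0) == len(set(listPosts[0]))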
2. Training the algorithm:
import numpy as np

def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]  # 1: abusive, 0: normal
    return postingList,classVec

def createVocabList(dataSet):
    vocabSet = set()  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets, deduplicating
    return list(vocabSet)

def setOfWords2Vec(vocabList,inputSet):  # convert a document into a simple word vector
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

# training function
def trainNB0(trainMatrix,trainCategory):
    # trainMatrix : document matrix of 0/1 word vectors
    # trainCategory : vector of class labels
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # P(c1)
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    # len(trainMatrix) == len(trainCategory)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p0Vect = p0Num / p0Denom  # P(wi|c0) in vector form
    p1Vect = p1Num / p1Denom  # P(wi|c1) in vector form
    return p0Vect,p1Vect,pAbusive

listPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listPosts)
trainMat = []
for postinDoc in listPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
print(trainMat)
p0V,p1V,pAb = trainNB0(trainMat,listClasses)
print("p0V: \n",p0V)
print("p1V: \n",p1V)
print("pAb: \n",pAb)
3. Complete code:
import numpy as np
from math import log

def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]  # 1: abusive, 0: normal
    return postingList,classVec

def createVocabList(dataSet):
    vocabSet = set()  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets, deduplicating
    return list(vocabSet)

def setOfWords2Vec(vocabList,inputSet):  # convert a document into a simple word vector
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

# training function
def trainNB0(trainMatrix,trainCategory):
    # trainMatrix : document matrix of 0/1 word vectors
    # trainCategory : vector of class labels
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # P(c1)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)  # changed to ones()
    p0Denom = 2.0              # changed to 2.0
    p1Denom = 2.0              # so no conditional probability is ever exactly 0
    # len(trainMatrix) == len(trainCategory)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p0Vect = np.log(p0Num / p0Denom)
    p1Vect = np.log(p1Num / p1Denom)  # np.log works elementwise on arrays (math.log
                                      # does not), and the natural log matches the
                                      # log(pClass1) term used in classifyNB below
    return p0Vect,p1Vect,pAbusive  # taking logs prevents floating-point underflow

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    listPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listPosts)
    trainMat = []
    for postinDoc in listPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

testingNB()
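With this training set, testingNB() should classify ['love', 'my', 'dalmation'] as 0 (normal) and ['stupid', 'garbage'] as 1 (abusive), since 'stupid' and 'garbage' only ever occur in the abusive posts.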
Up to now we have only recorded whether a word appears in a document or not; this is called the set-of-words model. We can instead count how many times each word appears, which is called the bag-of-words model.
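A minimal bag-of-words counterpart to setOfWords2Vec, following the conventions of the code above (the name bagOfWords2Vec is my choice; it is not defined in this post):

def bagOfWords2Vec(vocabList, inputSet):  # bag-of-words: count occurrences
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # += 1 instead of = 1
    return returnVec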
四. Mathematical background:
1. The naive Bayes inference process:
2. Parameter estimation for naive Bayes:
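A sketch of the Laplace-smoothed estimates that the code in subsection 4 implements (standard formulas, written in the same plain style as the rest of this post; N is the number of training emails and K=2 the number of classes):
P(Y=ck) = (count(Y=ck) + 1) / (N + K)
P(Xj=1 | Y=ck) = (count(Xj=1 and Y=ck) + 1) / (count(Y=ck) + 2)
The +1 terms in the numerators and the +K / +2 terms in the denominators are the Laplace smoothing.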
3. The naive Bayes algorithm flow:
4. The principle of spam classification:
The naive Bayes formula: P(Y|X) = P(X|Y)*P(Y)/P(X)
We treat Y as the email's classification result (0: not spam, 1: spam) and X as the words in the email.
P(Y) is then the prior probability of each email class in the training set, estimated by the law of large numbers as the number of emails of each class divided by the total number of emails. Strictly, the law of large numbers does not apply when the training sample is small; since this article aims at a deep understanding of how NB works, we use this estimate anyway even though our training sample is small.
P(X) is the probability of the given words appearing in an email, and P(X|Y) is the class-conditional probability (likelihood) that the words X appear in an email of class Y.
P(Y|X) is the posterior probability that an email containing the words X belongs to class Y; it is the output of the spam filter, and we assign a test sample X to the class Y that maximizes P(Y|X).
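A toy numerical illustration (the numbers here are hypothetical, chosen only to show the mechanics): suppose P(Y=1)=0.4 and, for some email, P(X|Y=1)=0.02 while P(X|Y=0)=0.005. Then P(Y=1)*P(X|Y=1) = 0.4*0.02 = 0.008 and P(Y=0)*P(X|Y=0) = 0.6*0.005 = 0.003, so the email is classified as spam. P(X) appears in both posteriors, so it can be dropped when comparing them.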
Code implementation:
1. Use the training set to estimate the probability parameters (with Laplace smoothing) [similar to mnb.fit()]
# Assumes pandas is imported as pd, and that X_train_count (a CountVectorizer-style
# count matrix) and Y_train were produced earlier in the pipeline.
import pandas as pd

# first combine the training-set contents and labels into one DataFrame
d = {"content":X_train_count.toarray().tolist(),"label":Y_train}
emails_train = pd.DataFrame(data=d)
# split into two subsets: normal emails (Y=0) and spam emails (Y=1)
normal = emails_train[emails_train.label==0]
normal.reset_index(inplace=True,drop=True)
# reset normal's index in place, dropping the old index
spam = emails_train[emails_train.label==1]
spam.reset_index(inplace=True,drop=True)
# reset spam's index in place, dropping the old index
"""compute the class (prior) probabilities of Y_train = 0 and 1 (Laplace smoothing)"""
Py0 = (len(normal)+1)/(len(emails_train)+2)
Py1 = (len(spam)+1)/(len(emails_train)+2)
"""compute the smoothed probability of each X_train feature taking each value (Laplace smoothing)"""
"""first, the conditional probabilities of the features in spam emails"""
vd = len(spam.content[0])
# dimension of the feature vectors
spam_count_dict = {}
# holds the column-wise sums of the content feature vectors
spam_count_prob = {}
# holds the probability of each feature appearing in spam emails
# sum the content feature vectors column by column, which gives how often each feature appears in the training set
for i in range(len(spam)):
    for j in range(vd):
        spam_count_dict[j] = spam_count_dict.get(j,0)+spam.content[i][j]
# the column-wise sums count how often each feature appears in spam emails
for j in range(vd):
    spam_count_prob[j] = (spam_count_dict.get(j,0)+1)/(len(spam)+2)
# probability of each feature appearing in spam emails (Laplace smoothing)
"""then, the conditional probabilities of the features in normal emails"""
normal_count_dict = {}
# holds the column-wise sums of the content feature vectors
normal_count_prob = {}
# holds the probability of each feature appearing in normal emails
for i in range(len(normal)):
    for j in range(vd):
        normal_count_dict[j] = normal_count_dict.get(j,0)+normal.content[i][j]
# the column-wise sums count how often each feature appears in normal emails
for j in range(vd):
    normal_count_prob[j] = (normal_count_dict.get(j,0)+1)/(len(normal)+2)
# probability of each feature appearing in normal emails (Laplace smoothing)
2. Plug each test sample's feature values into the trained probability parameters, compute the posterior probabilities, and take the class Y=ck with the largest posterior as the sample's classification [similar to mnb.predict() and mnb.predict_proba()]
"""计算各测试样本的后验概率"""
test_classify={} #用于保存测试集各样本的后验概率 P(Y|X)=P(Y)*P(X|Y)/P(X)
Px_spam={} #用于保存测试集各样本在垃圾邮件下的先验概率 P(X|Y)
Px_normal={} #用于保存测试集各样本在正常邮件下的先验概率 P(X|Y)
for i in range(X_test_array.shape[0]):
for j in range(X_test_array.shape[1]):
if X_test_array[i][j]!=0:
Px_spam[i]=Px_spam.get(i,1)*spam_count_prob.get(j)
#计算垃圾邮件下,各测试样本的后验概率
Px_normal[i]=Px_normal.get(i,1)*normal_count_prob.get(j)
#计算正常邮件下,各测试样本的后验概率
test_classify[i]=Py0*Px_normal.get(i,0),Py1*Px_spam.get(i,0) #后验概率P(Y|X)=P(Y)*P(X|Y)/P(X)
#比较各样本属于不同分类时(正常/垃圾)的后验概率,去后验概率大的为样本分类结果
results={} #用于存放邮件判定结果
for key,value in test_classify.items():
if value[0]<=value[1]:
results[key]=1
else:
results[key]=0
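To check the verdicts against the true labels, a quick sketch, assuming Y_test is the test-set label sequence aligned with X_test_array (it is not defined in the excerpt above):

# Assumes Y_test holds the true labels in the same row order as X_test_array.
correct = sum(1 for key,value in results.items() if value == Y_test[key])
print("accuracy: %.3f" % (correct / len(results)))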
Why a smoothing algorithm is needed: without smoothing, any word that never occurs in some class of the training set gets an estimated conditional probability of 0, and a single such word drives the entire product P(X|Y) to 0, wiping out the evidence from every other word (this is why the Bernoulli code above changed the counters to ones() and the denominators to 2.0). Laplace smoothing adds 1 to every count and adjusts the denominator accordingly, so no estimated probability is ever exactly 0.
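A tiny illustration of the failure mode that smoothing prevents (my addition; the probabilities are made up):

# One unseen word (probability 0) zeroes out the whole likelihood product.
probs_unsmoothed = [0.30, 0.25, 0.0]   # last word never seen in this class
likelihood = 1.0
for p in probs_unsmoothed:
    likelihood *= p
print(likelihood)  # 0.0 -- the evidence from the other words is wiped out
# With Laplace smoothing the last factor would be small but non-zero,
# so the other words still influence the result.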