Goal:

Use a naive Bayes classifier to decide whether a piece of text is a government work report or a Harry Potter novel.
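
For reference, the decision rule the code below implements is the standard multinomial naive Bayes comparison of log-posteriors:

$$\log P(c \mid d) \;\propto\; \log P(c) + \sum_{w \in d} n_w \log P(w \mid c)$$

where $n_w$ is the count of token $w$ in document $d$; the text is labeled 1 when the class-1 score is the larger of the two.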

Data source:

Downloaded from the internet. The government work reports go in the spam folder and are labeled 1; the Harry Potter text goes in the ham folder and is labeled 0. The test texts go in the test folder and are unlabeled.
Share link: https://pan.baidu.com/s/1fjbQO19StRy8UspZFJsdAQ
Extraction code: pypy
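
Based on the paths used in the code, the download should unpack into this layout:

my_file/report/
    spam/   1.txt ... 10.txt   (government work reports, label 1)
    ham/    1.txt ... 10.txt   (Harry Potter chapters, label 0)
    test/   1.txt ... 3.txt    (unlabeled test texts)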

Code

Project: tell a government work report from a Harry Potter novel

# Import the naive Bayes helpers from bayes.py (listed below).
# Note: the star import also re-exports numpy names such as array and random.
from bayes import *

# Load the labeled data: spam = government work reports (class 1),
# ham = Harry Potter chapters (class 0)
docList=[]; classList = []; fullText =[]
for i in range(1,10+1):
    wordList = textParse2(open('my_file/report/spam/%d.txt' % i,encoding='UTF-8',errors='ignore').read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(1)
    wordList = textParse2(open('my_file/report/ham/%d.txt' % i,encoding='UTF-8',errors='ignore').read())
    docList.append(wordList)
    fullText.extend(wordList)
    classList.append(0)

print(len(docList))
#20
# Also read the unlabeled test samples into docList so their words make it
# into the vocabulary (otherwise classifying them later would fail on words
# the model has never seen)
for i in range(1,3+1):
    wordList = textParse2(open('my_file/report/test/%d.txt' % i,encoding='UTF-8',errors='ignore').read())
    docList.append(wordList)
    fullText.extend(wordList)

print(len(docList))
#23
# Sanity-check the loaded data
print(docList[0][:10])
print(classList[:10])
#['政府', '工作', '报告', '2020', '22', '第十三届', '全国人民代表大会', '第三次', '会议', '国务院']
#[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
# Split the 20 labeled documents into a training set and a test set
vocabList = createVocabList(docList)        # vocabulary over all 23 docs, test texts included
trainingSet = list(range(20)); testSet = [] # indices 0-19 are the labeled docs
for i in range(4):                          # hold out 4 documents at random
    randIndex = int(random.uniform(0,len(trainingSet)))
    testSet.append(trainingSet[randIndex])
    del(trainingSet[randIndex])
    
print(testSet)
print(trainingSet)
#[16, 0, 4, 10]
#[1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19]
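
A side note on the split: the bare `random` above is actually numpy.random, pulled in by the star import from bayes.py. An equivalent split using the standard library instead, as a sketch:

# Hedged alternative: draw 4 distinct test indices with the stdlib.
import random as pyrandom
testSet = pyrandom.sample(range(20), 4)
trainingSet = [i for i in range(20) if i not in testSet]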

# Train the naive Bayes model on the training documents
trainMat=[]; trainClasses = []
for docIndex in trainingSet:
    trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))  # bag-of-words count vector
    trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))        # log P(w|c) vectors and the class-1 prior

# Inspect the trained parameters
print(p0V)
print(p1V)
print(pSpam)
#[-10.17888194 -10.17888194 -9.08026965 ... -10.17888194 -9.48573476 -10.17888194]
#[-10.61304927 -9.91990209 -10.61304927 ... -9.91990209 -10.61304927 -9.91990209]
#0.375
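
p0V and p1V hold log P(w|c) for every vocabulary token, and pSpam is the prior P(c=1); it comes out to 0.375 here because all four held-out documents happened to be reports, leaving 6 of the 16 training docs in class 1. To see which tokens the model finds most discriminative, one can sort the per-token log-odds; a minimal sketch, reusing the variables above:

# Sketch: tokens with the largest log-odds toward each class.
logOdds = p1V - p0V                     # log P(w|1) - log P(w|0), per token
order = argsort(logOdds)                # numpy argsort, ascending
print('report-like:', [vocabList[i] for i in order[-10:]])  # strongest class-1 indicators
print('novel-like: ', [vocabList[i] for i in order[:10]])   # strongest class-0 indicators
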
# Evaluate the classifier on the held-out documents
# (a single random hold-out split rather than full cross-validation)
errorCount = 0
for docIndex in testSet:
    wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
    predicted = classifyNB(array(wordVector),p0V,p1V,pSpam)
    if predicted != classList[docIndex]:
        errorCount += 1
        print ("classification error",docList[docIndex])
    else:
        print("classification:",predicted,"real:",classList[docIndex])
print ('the error rate is: ',float(errorCount)/len(testSet))
#classification: 1 real: 1
#classification: 1 real: 1
#classification: 1 real: 1
#classification: 1 real: 1
#the error rate is: 0.0
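
With only four test documents, a single split is a noisy estimate of accuracy. A hedged sketch that averages the error rate over several random splits, using only the functions already defined above (holdOutError is a hypothetical helper, not from the original):

# Sketch: repeated random hold-out, averaging the error rate.
def holdOutError(docList, classList, numTrials=10, numTest=4):
    vocabList = createVocabList(docList)
    totalError = 0.0
    for _ in range(numTrials):
        trainingSet = list(range(20)); testSet = []
        for _ in range(numTest):                     # draw numTest distinct test indices
            randIndex = int(random.uniform(0,len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del(trainingSet[randIndex])
        trainMat = [bagOfWords2VecMN(vocabList, docList[i]) for i in trainingSet]
        trainClasses = [classList[i] for i in trainingSet]
        p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
        errors = sum(classifyNB(array(bagOfWords2VecMN(vocabList, docList[i])),
                                p0V,p1V,pSpam) != classList[i]
                     for i in testSet)
        totalError += errors/float(numTest)
    return totalError/numTrials

print('average error over 10 splits:', holdOutError(docList, classList))
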
# Classify a new, unlabeled document with the trained model
def classification_test(file,docList,p0V,p1V,pSpam):
    wordList = textParse2(open(file,encoding='UTF-8',errors='ignore').read())
    # Rebuild the vocabulary from the same docList used in training, so the
    # word vector's columns line up with p0V and p1V
    vocabList = createVocabList(docList)
    wordVector = bagOfWords2VecMN(vocabList, wordList)
    output = classifyNB(array(wordVector),p0V,p1V,pSpam)
    print(file,'classified as',output)
    return output

# Test the classifier on new document 1
classification_test('my_file/report/test/1.txt',docList,p0V,p1V,pSpam)
#my_file/report/test/1.txt classified as 1

# Test the classifier on new document 2
classification_test('my_file/report/test/2.txt',docList,p0V,p1V,pSpam)
#my_file/report/test/2.txt classified as 0

# Test the classifier on new document 3
classification_test('my_file/report/test/3.txt',docList,p0V,p1V,pSpam)
#my_file/report/test/3.txt classified as 0
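
Since classification_test rebuilds the vocabulary on every call, classifying many files this way is wasteful. A hedged variant (classify_file is a hypothetical name, not from the original) that builds vocabList once and reuses it:

# Sketch: reuse a prebuilt vocabulary instead of rebuilding per call.
def classify_file(file, vocabList, p0V, p1V, pSpam):
    words = textParse2(open(file,encoding='UTF-8',errors='ignore').read())
    return classifyNB(array(bagOfWords2VecMN(vocabList, words)), p0V, p1V, pSpam)

vocabList = createVocabList(docList)
for i in range(1,3+1):
    f = 'my_file/report/test/%d.txt' % i
    print(f, 'classified as', classify_file(f, vocabList, p0V, p1V, pSpam))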

bayes.py


from numpy import *
                 
def createVocabList(dataSet):
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):   # set-of-words variant (0/1 presence); unused above
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print ("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)   # prior P(c=1)
    p0Num = ones(numWords); p1Num = ones(numWords)      # Laplace smoothing: start all counts at 1
    p0Denom = 2.0; p1Denom = 2.0                        # ...and the denominators at 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)          # log P(w|c=1); logs avoid floating-point underflow
    p0Vect = log(p0Num/p0Denom)          # log P(w|c=0)
    return p0Vect,p1Vect,pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0
    
def bagOfWords2VecMN(vocabList, inputSet):   # bag-of-words: count occurrences, not just presence
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

    
def textParse2(bigString):    # input: one big string; output: a list of tokens
    import jieba
    listOfTokens = jieba.cut(bigString,cut_all=False)       # precise-mode Chinese word segmentation
    return [tok for tok in listOfTokens if len(tok) >= 2]   # keep tokens of 2+ chars; drops single characters and most punctuation
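
A quick way to see what textParse2 produces (assumes jieba is installed; the exact segmentation depends on jieba's dictionary):

# Hedged example: exercise the tokenizer on a short string.
from bayes import textParse2
print(textParse2('政府工作报告'))
# ['政府', '工作', '报告']  (matches the sanity check earlier in the post)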
