特征预处理后,训练集的特征(feature)如下图所示:
(3) 建模
# Train a Bernoulli naive Bayes classifier on the one-hot encoded
# day-of-week and police-district features, timing the fit and scoring
# the held-out set with multi-class log loss.
# (train_test_split, log_loss, np and trainData are assumed to be
# imported/defined earlier in the notebook — TODO confirm.)
from sklearn.naive_bayes import BernoulliNB
import time

features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE',
            'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN',
            'TARAVAL', 'TENDERLOIN']

# 60/40 random train/test split on the preprocessed training data.
X_train, X_test, y_train, y_test = train_test_split(
    trainData[features], trainData['crime'], train_size=0.6)

NB = BernoulliNB()
nbStart = time.time()
NB.fit(X_train, y_train)
nbCostTime = time.time() - nbStart

# X_test is 263415*17; predict_proba assigns each of the 263415 samples
# a probability for each of the 39 crime categories.
propa = NB.predict_proba(X_test)
print("朴素贝叶斯建模%.2f秒" % (nbCostTime))
# predict_proba already returns an ndarray, so the original
# predicted = np.array(propa) copy was redundant.
logLoss = log_loss(y_test, propa)
print("朴素贝叶斯的log损失为:%.6f" % logLoss)
# 输出:
朴素贝叶斯建模0.55秒
朴素贝叶斯的log损失为:2.582561
例3 文本分类——垃圾邮件过滤
收集数据:提供文本文件
准备数据:将文本文件解析成词条向量
分析数据:检查词条确保解析的正确性
训练算法:使用之前建立的trainNB0()函数
测试算法:使用classifyNB(),并且构建一个新的测试函数来计算文档集的错误率
使用算法:构建一个完整的程序对一组文档进行分类,将错分的文档输出到屏幕上
准备数据:切分文本
使用正则表达式切分,其中分隔符是除字母、数字、下划线外的任意字符(即非单词字符)
import re

# Sample sentence to tokenize.
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

# Split on runs of non-word characters. BUG FIX: the original pattern
# '\W*' can match the empty string, which yields empty tokens and is a
# DeprecationWarning / ValueError on modern Python; '\W+' is correct.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)

# Drop zero-length tokens and lower-case the rest.
tokens = [tok.lower() for tok in listOfTokens if len(tok) > 0]
# [out] ['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python',
#        'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
# 切分邮件 (Tokenizing an email:)
# Tokenize a whole email file with the same compiled regex. Use a
# context manager so the file handle is always closed (the original
# leaked it), and pin the encoding rather than relying on the platform
# default — NOTE(review): some emails in this dataset contain non-UTF-8
# bytes; confirm the encoding against the actual files.
with open('email/ham/6.txt', encoding='utf-8') as fh:
    emailText = fh.read()
listOfTokens = regEx.split(emailText)
# 测试算法:使用朴素贝叶斯进行交叉验证
# (Testing the algorithm: cross-validation with naive Bayes)
import random
import re


def textParse(bigString):
    """Split a raw email string into lowercase tokens longer than 2 chars.

    BUG FIX: the original pattern r'\W*' can match the empty string,
    producing empty tokens (and an error on modern Python); r'\W+' is
    the correct "split on runs of non-word characters" pattern.
    """
    listOfTokens = re.split(r'\W+', bigString)
    # Keep only tokens longer than 2 characters, lower-cased.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Train and evaluate a naive Bayes spam filter.

    Reads 25 spam and 25 ham emails, randomly holds out 10 of the 50
    documents as a test set, trains on the rest, and prints each
    misclassified document plus the overall error rate.

    Relies on createVocabList / setOfWords2Vec / trainNB0 / classifyNB
    and numpy (np) defined earlier in the chapter.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Spam examples -> label 1.
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # Ham examples -> label 0.
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Vocabulary over all documents.
    vocabList = createVocabList(docList)
    # BUG FIX: range(50) is not a list in Python 3 and does not support
    # del; materialize it so chosen test indices can be removed.
    trainingSet = list(range(50))
    testSet = []
    # Randomly move 10 indices from the training set into the test set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    # Build the training matrix of set-of-words vectors.
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Train the naive Bayes model.
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    # Evaluate on the held-out documents.
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Python 3 print function (original used Python 2 syntax).
            print('classification error', docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))

# Because spamTest() builds its training and test sets randomly, the
# classification results may differ from run to run. When an error
# occurs, the function prints the misclassified document's word list so
# you can see exactly which document failed; the error observed here
# was a spam email misclassified as ham.
# Run the cross-validated spam test twice. spamTest() (defined above)
# uses a random hold-out split, so the error rate varies between runs.
# (The original line duplicated a garbled copy of textParse/spamTest;
# only the invocations and their recorded output are kept here.)
spamTest()
# [out] classification error ['benoit', 'mandelbrot', '1924', '2010',
#   'benoit', 'mandelbrot', '1924', '2010', 'wilmott', 'team', 'benoit',
#   'mandelbrot', 'the', 'mathematician', 'the', 'father', 'fractal',
#   'mathematics', 'and', 'advocate', 'more', 'sophisticated',
#   'modelling', 'quantitative', 'finance', 'died', '14th', 'october',
#   '2010', 'aged', 'wilmott', 'magazine', 'has', 'often', 'featured',
#   'mandelbrot', 'his', 'ideas', 'and', 'the', 'work', 'others',
#   'inspired', 'his', 'fundamental', 'insights', 'you', 'must',
#   'logged', 'view', 'these', 'articles', 'from', 'past', 'issues',
#   'wilmott', 'magazine']
# the error rate is:  0.1
spamTest()
# [out] the error rate is:  0.0
# 参考文献: (References:)