老实来讲,这门课我一头雾水,满脑袋问号。
import math
from collections import Counter

import numpy as np
counttime = 0  # number of seperate() calls so far; the summary prints only while it is 0


def seperate(filename):
    """Collect per-class document counts and word frequencies from a training file.

    Every non-blank line is split on whitespace. The first token is the
    label ('1' = ham / good review, '0' = spam / bad review) and the
    remaining tokens are the words of that sample. Lines carrying any
    other label are still counted in the total but feed neither class.
    Blank or whitespace-only lines are skipped entirely.

    The first call also prints a short summary of the training data; later
    calls stay silent (tracked through the module-level ``counttime``).

    Args:
        filename: Path of the training file. It is decoded as gb18030 and
            undecodable bytes are ignored.

    Returns:
        A tuple ``(hamcnt, spamcnt, totalNum, hamNum, spamNum)``:
        word-frequency ``Counter`` for ham, word-frequency ``Counter`` for
        spam, number of non-blank lines, number of ham lines and number of
        spam lines.
    """
    global counttime

    hamcnt = Counter()   # word -> occurrences in ham samples
    spamcnt = Counter()  # word -> occurrences in spam samples
    totalNum = 0         # all non-blank samples
    hamNum = 0           # samples labelled '1'
    spamNum = 0          # samples labelled '0'

    # The context manager guarantees the file is closed even on error.
    with open(filename, encoding='gb18030', errors='ignore') as file:
        for line in file:
            # split() with no argument splits on any whitespace run
            # (spaces, tabs, newlines) and never yields empty tokens.
            new = line.split()
            if not new:
                # Blank line: there is no label to read, so skip it
                # instead of failing on new[0].
                continue
            totalNum += 1
            label, words = new[0], new[1:]
            if label == '1':
                hamNum += 1
                hamcnt.update(words)
            elif label == '0':
                spamNum += 1
                spamcnt.update(words)

    if counttime == 0:
        print('训练样本的总行数:%s' % totalNum)
        print('ham 样本:%s' % hamNum)
        print('spam 样本:%s' % spamNum)
    counttime += 1
    return hamcnt, spamcnt, totalNum, hamNum, spamNum
def _class_log_score(word_counts, doc_count, total_docs, words):
    """Return log(prior * likelihood) of *words* for a single class.

    The likelihood uses add-one smoothing with the class's own vocabulary
    size in the denominator: ``(count + 1) / (class_word_total + class_vocab)``.
    A class with no training samples, or with no word statistics when there
    are words to explain, scores negative infinity.
    """
    if doc_count == 0:
        return float('-inf')
    score = math.log(doc_count / total_docs)  # log prior
    if not words:
        return score
    denominator = sum(word_counts.values()) + len(word_counts)
    if denominator == 0:
        return float('-inf')
    for word in words:
        # .get() yields 0 for unseen words, so smoothing gives them 1/denominator.
        score += math.log((word_counts.get(word, 0) + 1) / denominator)
    return score


def train(filename, preData):
    """Classify one text as ham or spam with a naive Bayes model.

    The model statistics are gathered by ``seperate(filename)`` on every
    call, so the training file is re-read each time this function runs.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities: the product of many small per-word probabilities
    underflows to 0.0 for longer texts, which would make both classes tie
    at zero and every such text fall into the "bad" branch.

    Args:
        filename: Path of the labelled training file passed to ``seperate``.
        preData: The text to classify; it is split on whitespace.

    Returns:
        0 when the ham score is strictly greater than the spam score,
        otherwise 1. A verdict line is printed in either case.
        NOTE(review): the training labels use '1' for ham and '0' for spam,
        i.e. the opposite encoding of this return value; confirm which one
        consumers expect.

    Raises:
        ValueError: If the training file contains no samples.
    """
    hamcnt, spamcnt, totalNum, hamNum, spamNum = seperate(filename)
    if totalNum == 0:
        raise ValueError('training file %r contains no samples' % (filename,))

    words = preData.split()
    ham_log = _class_log_score(hamcnt, hamNum, totalNum, words)
    spam_log = _class_log_score(spamcnt, spamNum, totalNum, words)

    # Plain probabilities are kept for display only; they may print as 0.0
    # when they underflow, but the decision below is made on the log scores.
    res1 = math.exp(ham_log)    # prior * likelihood for ham
    res2 = math.exp(spam_log)   # prior * likelihood for spam

    if ham_log == spam_log:
        print('res1', res1, 'res2', res2)
    if ham_log > spam_log:
        print('好的评论!', 'ham概率:', res1, ' spam概率:', res2)
        return 0
    print('糟糕评论!', 'ham概率:', res1, ' spam概率:', res2)
    return 1
filename = 'kaggle_training.txt'  # training corpus

# Classify every line of the test file; train() prints its verdict per line.
# The context manager closes the test file once the loop is done.
with open("kaggle_test.txt", encoding='gb18030', errors='ignore') as test_file:
    for line in test_file:
        res = train(filename, line)
不管别的
- 如何让部分代码只执行一次:在函数外声明一次 counttime,在函数内用 global 声明后再使用。参考:https://www.cnblogs.com/fendou-999/p/3822028.html
- byte string:https://blog.csdn.net/lqzdreamer/article/details/76549256 ;split:这个地方跟给的代码不一样,区别见 https://blog.csdn.net/weixin_40283816/article/details/83591582
(忘记我改的是什么了,最后看到上面有对应的操作:encoding='gb18030', errors='ignore')
- 想让打印输出的那堆结果有固定的格式,于是参考了:https://blog.csdn.net/liuweiyuxiang/article/details/100574386 ,但是都不可以,因为数值四舍五入之后全成 0 了。