本文實例講述了Python實現的樸素貝葉斯分類器。分享給大家供大家參考,具體如下:
因工作中需要,自己寫了一個樸素貝葉斯分類器。
對於未出現的屬性,採取了拉普拉斯平滑,避免未出現的屬性的概率為零導致整個條件概率都為零的情況出現。
樸素貝葉斯的基本原理網上很容易查到,這裡不再敘述,直接附上代碼:
因工作中需要,自己寫了一個樸素貝葉斯分類器。對於未出現的屬性,採取了拉普拉斯平滑,避免未出現的屬性的概率為零導致整個條件概率都為零的情況出現。
class NBClassify(object):  def __init__(self, fillNa = 1):    self.fillNa = 1    pass  def train(self, trainSet):    # 計算每種類別的概率    # 保存所有tag的所有種類,及它們出現的頻次    dictTag = {}    for subTuple in trainSet:      dictTag[str(subTuple[1])] = 1 if str(subTuple[1]) not in dictTag.keys() else dictTag[str(subTuple[1])] + 1    # 保存每個tag本身的概率    tagProbablity = {}    totalFreq = sum([value for value in dictTag.values()])    for key, value in dictTag.items():      tagProbablity[key] = value / totalFreq    # print(tagProbablity)    self.tagProbablity = tagProbablity    ##############################################################################    # 計算特征的條件概率    # 保存特征屬性基本信息{特征1:{值1:出現5次, 值2:出現1次}, 特征2:{值1:出現1次, 值2:出現5次}}    dictFeaturesBase = {}    for subTuple in trainSet:      for key, value in subTuple[0].items():        if key not in dictFeaturesBase.keys():          dictFeaturesBase[key] = {value:1}        else:          if value not in dictFeaturesBase[key].keys():            dictFeaturesBase[key][value] = 1          else:            dictFeaturesBase[key][value] += 1    # dictFeaturesBase = {      # '職業': {'農夫': 1, '教師': 2, '建筑工人': 2, '護士': 1},      # '癥狀': {'打噴嚏': 3, '頭痛': 3}      # }    dictFeatures = {}.fromkeys([key for key in dictTag])    for key in dictFeatures.keys():      dictFeatures[key] = {}.fromkeys([key for key in dictFeaturesBase])    for key, value in dictFeatures.items():      for subkey in value.keys():        value[subkey] = {}.fromkeys([x for x in dictFeaturesBase[subkey].keys()])    # dictFeatures = {    #  '感冒 ': {'癥狀': {'打噴嚏': None, '頭痛': None}, '職業': {'護士': None, '農夫': None, '建筑工人': None, '教師': None}},    #  '腦震蕩': {'癥狀': {'打噴嚏': None, '頭痛': None}, '職業': {'護士': None, '農夫': None, '建筑工人': None, '教師': None}},    #  '過敏 ': {'癥狀': {'打噴嚏': None, '頭痛': None}, '職業': {'護士': None, '農夫': None, '建筑工人': None, '教師': None}}    #  }    # initialise dictFeatures    for subTuple in trainSet:      for key, value in subTuple[0].items():        
dictFeatures[subTuple[1]][key][value] = 1 if dictFeatures[subTuple[1]][key][value] == None else dictFeatures[subTuple[1]][key][value] + 1    # print(dictFeatures)    # 將馴良樣本中沒有的項目,由None改為一個非常小的數值,表示其概率極小而并非是零    for tag, featuresDict in dictFeatures.items():      for featureName, fetureValueDict in featuresDict.items():        for featureKey, featureValues in fetureValueDict.items():          if featureValues == None:            fetureValueDict[featureKey] = 1    # 由特征頻率計算特征的條件概率P(feature|tag)    for tag, featuresDict in dictFeatures.items():      for featureName, fetureValueDict in featuresDict.items():        totalCount = sum([x for x in fetureValueDict.values() if x != None])        for featureKey, featureValues in fetureValueDict.items():          fetureValueDict[featureKey] = featureValues/totalCount if featureValues != None else None    self.featuresProbablity = dictFeatures    ##############################################################################  def classify(self, featureDict):    resultDict = {}    # 計算每個tag的條件概率    for key, value in self.tagProbablity.items():      iNumList = []      for f, v in featureDict.items():        if self.featuresProbablity[key][f][v]:          iNumList.append(self.featuresProbablity[key][f][v])      conditionPr = 1      for iNum in iNumList:        conditionPr *= iNum      resultDict[key] = value * conditionPr    # 對比每個tag的條件概率的大小    resultList = sorted(resultDict.items(), key=lambda x:x[1], reverse=True)    return resultList[0][0]if __name__ == '__main__':  trainSet = [    ({"癥狀":"打噴嚏", "職業":"護士"}, "感冒 "),    ({"癥狀":"打噴嚏", "職業":"農夫"}, "過敏 "),    ({"癥狀":"頭痛", "職業":"建筑工人"}, "腦震蕩"),    ({"癥狀":"頭痛", "職業":"建筑工人"}, "感冒 "),    ({"癥狀":"打噴嚏", "職業":"教師"}, "感冒 "),    ({"癥狀":"頭痛", "職業":"教師"}, "腦震蕩"),  ]  monitor = NBClassify()  # trainSet is something like that [(featureDict, tag), ]  monitor.train(trainSet)  # 打噴嚏的建筑工人  # 請問他患上感冒的概率有多大?  result = monitor.classify({"癥狀":"打噴嚏", "職業":"建筑工人"})  print(result)            
新聞熱點
疑難解答