国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 編程 > Python > 正文

基于ID3決策樹算法的實現(Python版)

2020-01-04 17:15:25
字體:
來源:轉載
供稿:網友

實例如下:

# -*- coding:utf-8 -*-from numpy import *import numpy as npimport pandas as pdfrom math import logimport operator#計算數據集的香農熵def calcShannonEnt(dataSet):  numEntries=len(dataSet)  labelCounts={}  #給所有可能分類創建字典  for featVec in dataSet:    currentLabel=featVec[-1]    if currentLabel not in labelCounts.keys():      labelCounts[currentLabel]=0    labelCounts[currentLabel]+=1  shannonEnt=0.0  #以2為底數計算香農熵  for key in labelCounts:    prob = float(labelCounts[key])/numEntries    shannonEnt-=prob*log(prob,2)  return shannonEnt#對離散變量劃分數據集,取出該特征取值為value的所有樣本def splitDataSet(dataSet,axis,value):  retDataSet=[]  for featVec in dataSet:    if featVec[axis]==value:      reducedFeatVec=featVec[:axis]      reducedFeatVec.extend(featVec[axis+1:])      retDataSet.append(reducedFeatVec)  return retDataSet#對連續變量劃分數據集,direction規定劃分的方向,#決定是劃分出小于value的數據樣本還是大于value的數據樣本集def splitContinuousDataSet(dataSet,axis,value,direction):  retDataSet=[]  for featVec in dataSet:    if direction==0:      if featVec[axis]>value:        reducedFeatVec=featVec[:axis]        reducedFeatVec.extend(featVec[axis+1:])        retDataSet.append(reducedFeatVec)    else:      if featVec[axis]<=value:        reducedFeatVec=featVec[:axis]        reducedFeatVec.extend(featVec[axis+1:])        retDataSet.append(reducedFeatVec)  return retDataSet#選擇最好的數據集劃分方式def chooseBestFeatureToSplit(dataSet,labels):  numFeatures=len(dataSet[0])-1  baseEntropy=calcShannonEnt(dataSet)  bestInfoGain=0.0  bestFeature=-1  bestSplitDict={}  for i in range(numFeatures):    featList=[example[i] for example in dataSet]    #對連續型特征進行處理    if type(featList[0]).__name__=='float' or type(featList[0]).__name__=='int':      #產生n-1個候選劃分點      sortfeatList=sorted(featList)      splitList=[]      for j in range(len(sortfeatList)-1):        splitList.append((sortfeatList[j]+sortfeatList[j+1])/2.0)      bestSplitEntropy=10000      slen=len(splitList)      #求用第j個候選劃分點劃分時,得到的信息熵,并記錄最佳劃分點      for j in range(slen):        value=splitList[j]        newEntropy=0.0        subDataSet0=splitContinuousDataSet(dataSet,i,value,0)        subDataSet1=splitContinuousDataSet(dataSet,i,value,1)        prob0=len(subDataSet0)/float(len(dataSet))        newEntropy+=prob0*calcShannonEnt(subDataSet0)        prob1=len(subDataSet1)/float(len(dataSet))        newEntropy+=prob1*calcShannonEnt(subDataSet1)        if newEntropy<bestSplitEntropy:          bestSplitEntropy=newEntropy          bestSplit=j      #用字典記錄當前特征的最佳劃分點      bestSplitDict[labels[i]]=splitList[bestSplit]      infoGain=baseEntropy-bestSplitEntropy    #對離散型特征進行處理    else:      uniqueVals=set(featList)      newEntropy=0.0      #計算該特征下每種劃分的信息熵      for value in uniqueVals:        subDataSet=splitDataSet(dataSet,i,value)        prob=len(subDataSet)/float(len(dataSet))        newEntropy+=prob*calcShannonEnt(subDataSet)      infoGain=baseEntropy-newEntropy    if infoGain>bestInfoGain:      bestInfoGain=infoGain      bestFeature=i  #若當前節點的最佳劃分特征為連續特征,則將其以之前記錄的劃分點為界進行二值化處理  #即是否小于等于bestSplitValue  if type(dataSet[0][bestFeature]).__name__=='float' or type(dataSet[0][bestFeature]).__name__=='int':    bestSplitValue=bestSplitDict[labels[bestFeature]]    labels[bestFeature]=labels[bestFeature]+'<='+str(bestSplitValue)    for i in range(shape(dataSet)[0]):      if dataSet[i][bestFeature]<=bestSplitValue:        dataSet[i][bestFeature]=1      else:        dataSet[i][bestFeature]=0  return bestFeature#特征若已經劃分完,節點下的樣本還沒有統一取值,則需要進行投票def majorityCnt(classList):  classCount={}  for vote in classList:    if vote not in classCount.keys():      classCount[vote]=0    classCount[vote]+=1  return max(classCount)#主程序,遞歸產生決策樹def createTree(dataSet,labels,data_full,labels_full):  classList=[example[-1] for example in dataSet]  if classList.count(classList[0])==len(classList):    return classList[0]  if len(dataSet[0])==1:    return majorityCnt(classList)  bestFeat=chooseBestFeatureToSplit(dataSet,labels)  bestFeatLabel=labels[bestFeat]  myTree={bestFeatLabel:{}}  featValues=[example[bestFeat] for example in dataSet]  uniqueVals=set(featValues)  if type(dataSet[0][bestFeat]).__name__=='str':    currentlabel=labels_full.index(labels[bestFeat])    featValuesFull=[example[currentlabel] for example in data_full]    uniqueValsFull=set(featValuesFull)  del(labels[bestFeat])  #針對bestFeat的每個取值,劃分出一個子樹。  for value in uniqueVals:    subLabels=labels[:]    if type(dataSet[0][bestFeat]).__name__=='str':      uniqueValsFull.remove(value)    myTree[bestFeatLabel][value]=createTree(splitDataSet/     (dataSet,bestFeat,value),subLabels,data_full,labels_full)  if type(dataSet[0][bestFeat]).__name__=='str':    for value in uniqueValsFull:      myTree[bestFeatLabel][value]=majorityCnt(classList)  return myTreeimport matplotlib.pyplot as pltdecisionNode=dict(boxstyle="sawtooth",fc="0.8")leafNode=dict(boxstyle="round4",fc="0.8")arrow_args=dict(arrowstyle="<-")#計算樹的葉子節點數量def getNumLeafs(myTree):  numLeafs=0  firstSides = list(myTree.keys())  firstStr=firstSides[0]  secondDict=myTree[firstStr]  for key in secondDict.keys():    if type(secondDict[key]).__name__=='dict':      numLeafs+=getNumLeafs(secondDict[key])    else: numLeafs+=1  return numLeafs#計算樹的最大深度def getTreeDepth(myTree):  maxDepth=0  firstSides = list(myTree.keys())  firstStr=firstSides[0]  secondDict=myTree[firstStr]  for key in secondDict.keys():    if type(secondDict[key]).__name__=='dict':      thisDepth=1+getTreeDepth(secondDict[key])    else: thisDepth=1    if thisDepth>maxDepth:      maxDepth=thisDepth  return maxDepth#畫節點def plotNode(nodeTxt,centerPt,parentPt,nodeType):  createPlot.ax1.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',/  xytext=centerPt,textcoords='axes fraction',va="center", ha="center",/  bbox=nodeType,arrowprops=arrow_args)#畫箭頭上的文字def plotMidText(cntrPt,parentPt,txtString):  lens=len(txtString)  xMid=(parentPt[0]+cntrPt[0])/2.0-lens*0.002  yMid=(parentPt[1]+cntrPt[1])/2.0  createPlot.ax1.text(xMid,yMid,txtString)def plotTree(myTree,parentPt,nodeTxt):  numLeafs=getNumLeafs(myTree)  depth=getTreeDepth(myTree)  firstSides = list(myTree.keys())  firstStr=firstSides[0]  cntrPt=(plotTree.x0ff+(1.0+float(numLeafs))/2.0/plotTree.totalW,plotTree.y0ff)  plotMidText(cntrPt,parentPt,nodeTxt)  plotNode(firstStr,cntrPt,parentPt,decisionNode)  secondDict=myTree[firstStr]  plotTree.y0ff=plotTree.y0ff-1.0/plotTree.totalD  for key in secondDict.keys():    if type(secondDict[key]).__name__=='dict':      plotTree(secondDict[key],cntrPt,str(key))    else:      plotTree.x0ff=plotTree.x0ff+1.0/plotTree.totalW      plotNode(secondDict[key],(plotTree.x0ff,plotTree.y0ff),cntrPt,leafNode)      plotMidText((plotTree.x0ff,plotTree.y0ff),cntrPt,str(key))  plotTree.y0ff=plotTree.y0ff+1.0/plotTree.totalDdef createPlot(inTree):  fig=plt.figure(1,facecolor='white')  fig.clf()  axprops=dict(xticks=[],yticks=[])  createPlot.ax1=plt.subplot(111,frameon=False,**axprops)  plotTree.totalW=float(getNumLeafs(inTree))  plotTree.totalD=float(getTreeDepth(inTree))  plotTree.x0ff=-0.5/plotTree.totalW  plotTree.y0ff=1.0  plotTree(inTree,(0.5,1.0),'')  plt.show()df=pd.read_csv('watermelon_4_3.csv')data=df.values[:,1:].tolist()data_full=data[:]labels=df.columns.values[1:-1].tolist()labels_full=labels[:]myTree=createTree(data,labels,data_full,labels_full)print(myTree)createPlot(myTree)

最終結果如下:

{'texture': {'blur': 0, 'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}}, 'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}

得到的決策樹如下:

決策樹算法,Python

參考資料:

《機器學習實戰》

《機器學習》周志華著

以上這篇基于ID3決策樹算法的實現(Python版)就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支持VEVB武林網。

發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 南通市| 鱼台县| 固始县| 普洱| 金昌市| 屯留县| 长宁县| 邳州市| 高安市| 新泰市| 锡林郭勒盟| 塔河县| 来宾市| 绥德县| 台东市| 甘德县| 平阳县| 浦江县| 德令哈市| 晴隆县| 宜川县| 南召县| 治多县| 青田县| 六枝特区| 会宁县| 浮山县| 雷波县| 凤冈县| 通江县| 临潭县| 太白县| 镇平县| 鹤岗市| 枞阳县| 罗平县| 汉源县| 綦江县| 金堂县| 根河市| 巴彦淖尔市|