代码行数 230, 由于每次执行代码选取的训练集不同, 所以每次执行得到的正确率也不同, 最好的情况是正确率达到 83%.
特征值离散化的思路:
既然最终的分类是分成三种, 那我猜测每个特征的取值也可以分成三个区间, 那也就是求两个分割值. 求分割值用双层 for 循环找使得信息熵最小的下标 i 和 j.
代码整体思路:
1 . 先处理数据, shuffle 函数随机抽取 80% 样本做训练集.
2 . 特征值离散化
3 . 用信息熵来递归地构造树
4 . 用构造好的树来判断剩下 20% 的测试集, 求算法做分类的正确率
- # coding: utf-8
- # In[1]:
- from sklearn import datasets
- import math
- import numpy as np
- # In[69]:
def getInformationEntropy(arr, leng):
    """Return the Shannon entropy (natural log) of a vector of class counts.

    Parameters:
        arr: sequence of per-class sample counts (any number of classes).
        leng: total number of samples the counts are normalised by.

    Returns:
        -sum(p * ln(p)) over the classes with a positive count, where
        p = count / leng.  Non-positive counts contribute nothing.  An empty
        segment (leng <= 0) has entropy 0.0 instead of dividing by zero.
    """
    if leng <= 0:
        return 0.0
    total = 0.0
    for count in arr:
        if count > 0:
            share = count / leng
            total += share * math.log(share)
    return -total
- #informationEntropy = getInformationEntropy(num,length)
- #print(informationEntropy)
- # In[105]:
- # 离散化特征一的值
def discretization(index):
    """Find two cut values that split feature `index` into three intervals.

    Reads the module-level `iris` dataset: column `index` of `iris.data` is
    paired with `iris.target`, the pairs are sorted by feature value, and every
    pair of sorted positions (i, j) with i < j < last is tried.  The pair that
    minimises the size-weighted sum of the entropies of the three resulting
    segments [0..i], [i+1..j], [j+1..last] wins.

    Parameters:
        index: column of `iris.data` to discretise.

    Returns:
        [low, high]: the feature values found at the two winning positions.

    Raises:
        ValueError: if there are fewer than three samples, so that no split
            into three non-empty segments exists.

    Side effects:
        Prints the winning positions, their cost and the two cut values.
    """
    # Pair each feature value with its class label, then order by feature value.
    samples = np.array([iris.data[:, index], iris.target]).T
    samples = samples[samples[:, 0].argsort()]
    labels = samples[:, 1].astype(int)
    total = len(labels)
    if total < 3:
        raise ValueError("need at least three samples to form three intervals")

    # Class totals are taken from the labels themselves rather than from a
    # module-level tally, so the counts of the last segment are always correct.
    # Three classes (labels 0, 1, 2) are assumed, as in the rest of this file.
    class_totals = np.array([np.sum(labels == c) for c in range(3)])

    best_cost = float("inf")
    best_cut = None
    low_counts = np.array([0, 0, 0])  # class counts of samples[0..i]
    for i in range(total):
        low_counts[labels[i]] += 1
        upto_j = np.copy(low_counts)  # class counts of samples[0..j]
        # j stops before the last position so the third segment is never empty.
        for j in range(i + 1, total - 1):
            upto_j[labels[j]] += 1
            cost = (
                (i + 1) * getInformationEntropy(low_counts, i + 1)
                + (j - i) * getInformationEntropy(upto_j - low_counts, j - i)
                + (total - j - 1)
                * getInformationEntropy(class_totals - upto_j, total - j - 1)
            )
            if cost < best_cost:
                best_cost = cost
                best_cut = np.array([i, j])

    res_value = [samples[best_cut[0], 0], samples[best_cut[1], 0]]
    print(best_cut, best_cost, res_value)
    return res_value
- # In[122]:
- # 求合适的分割值
def getRazors():
    """Compute the pair of cut values for every iris feature.

    Walks over the entries of the module-level `iris.feature_names`, prints
    each feature index, and asks `discretization` for that feature's cuts.

    Returns:
        A NumPy array with one row of cut values per feature.
    """
    cut_table = []
    for feature_idx, _ in enumerate(iris.feature_names):
        print(feature_idx)
        cuts = discretization(feature_idx)
        cut_table.append(cuts)
    return np.array(cut_table)
- # In[326]:
- # 随机抽取 80% 的训练集和 20% 的测试集
def divideData():
    """Shuffle the labelled iris samples and split them 80% / 20%.

    Uses the module-level `iris` dataset and `length` (the sample count).
    The label is appended to each feature row as the last column before
    shuffling.

    Returns:
        [trainData, testData]: the first int(length * 0.8) shuffled rows and
        the remaining rows.
    """
    labelled = np.c_[iris.data, iris.target.T]
    np.random.shuffle(labelled)  # reorders the rows in place
    cut = int(length * 0.8)
    train_rows = range(cut)
    test_rows = range(cut, length)
    return [labelled[train_rows, :], labelled[test_rows, :]]
- # In[213]:
def getEntropy(counter):
    """Return the Shannon entropy (natural log) of a vector of class counts.

    Parameters:
        counter: sequence of per-class counts; their sum is the normaliser.

    Returns:
        0 when the counts sum to zero, otherwise -sum(p * ln(p)) over the
        non-zero entries with p = count / total.
    """
    total = np.sum(counter)
    if total == 0:
        return 0
    accumulated = 0
    for count in counter:
        if count != 0:
            share = count / total
            # The log argument falls back to 1 (a zero contribution) unless
            # both the count and the total are positive.
            log_arg = share if (count > 0 and total > 0) else 1
            accumulated += share * math.log(log_arg)
    return -accumulated
- # In[262]:
def findMaxIndex(dataSet):
    """Return the position of the first largest entry of `dataSet`.

    Only entries greater than -1 can be selected; if there are none (or the
    input is empty) the result is 0.
    """
    best_pos, best_val = 0, -1
    for pos, candidate in enumerate(dataSet):
        if candidate > best_val:
            best_pos, best_val = pos, candidate
    return best_pos
- # In[308]:
def recursion(featureSet, dataSet, counterSet):
    """Recursively build a three-way decision tree over discretised features.

    Relies on the module-level `iris` (for class names) and `razors` (the
    [low, high] cut values per feature).

    Parameters:
        featureSet: list of feature indices still available for splitting.
            It is not modified.
        dataSet: samples reaching this node; each sample's last element is its
            class label.
        counterSet: per-class counts of the samples in `dataSet`.

    Returns:
        - a class name from `iris.target_names` when the node is pure, or
          (majority class) when no features are left;
        - [] when there are no samples;
        - otherwise [feature, low_child, middle_child, high_child], where the
          children cover value < low, low <= value <= high and value > high.
    """
    # Leaf: exactly one class is present.
    present = [label for label, count in enumerate(counterSet) if count != 0]
    if len(present) == 1:
        return iris.target_names[present[0]]
    # Leaf: nothing left to split on, so vote by majority.
    if len(featureSet) == 0:
        return iris.target_names[findMaxIndex(counterSet)]
    if len(dataSet) == 0:
        return []

    class_count = len(counterSet)
    best = None  # (weighted entropy, feature, subsets, counters)
    for feature in featureSet:
        low = razors[feature][0]
        high = razors[feature][1]
        subsets = ([], [], [])
        counters = tuple([0] * class_count for _ in range(3))
        for data in dataSet:
            label = int(data[-1])
            value = data[feature]
            if value < low:
                branch = 0
            elif value <= high:
                branch = 1
            else:
                branch = 2
            subsets[branch].append(data)
            counters[branch][label] += 1
        weighted = sum(
            len(subset) * getEntropy(counter)
            for subset, counter in zip(subsets, counters)
        ) / len(dataSet)
        # Strict '<' keeps the earliest feature on ties.  The partition is
        # remembered together with the feature so that the children are built
        # from the winning split, not from the last feature examined.
        if best is None or weighted < best[0]:
            best = (weighted, feature, subsets, counters)

    _, final, subsets, counters = best
    # Build a new list instead of removing in place: sibling subtrees (and the
    # caller) must each see the same remaining features.
    remaining = [feature for feature in featureSet if feature != final]
    child = [final]
    for subset, counter in zip(subsets, counters):
        child.append(recursion(remaining, subset, counter))
    return child
- # In[322]:
def _is_empty_branch(node):
    """Return True when `node` is an empty (no-data) subtree, not a class name."""
    return not isinstance(node, str) and len(node) == 0


def judge(data, tree):
    """Classify one sample by walking a tree produced by `recursion`.

    Relies on the module-level `razors` for each feature's [low, high] cuts.

    Parameters:
        data: one sample; `data[feature]` is the value of that feature.
        tree: a class name, an empty list, or
            [feature, low_child, middle_child, high_child].

    Returns:
        The class name of the leaf that is reached, or "unknow" when no
        non-empty branch leads to a leaf.
    """
    node = tree
    while True:
        if isinstance(node, str):
            return node
        if _is_empty_branch(node):
            return "unknow"
        feature = node[0]
        if isinstance(feature, str):
            return feature
        if not isinstance(feature, (int, np.integer)):
            # Malformed node: stop instead of looping forever.
            return "unknow"
        value = data[feature]
        low = razors[feature][0]
        high = razors[feature][1]
        # Preferred branch first, then fall back to the other branches so an
        # empty child never ends the walk with a feature index as the answer.
        if value < low:
            order = (1, 2, 3)
        elif value <= high:
            order = (2, 3, 1)
        else:
            order = (3, 2, 1)
        for branch in order:
            if not _is_empty_branch(node[branch]):
                node = node[branch]
                break
        else:
            return "unknow"
- # In[327]:
if __name__ == '__main__':
    # Script entry point: load iris, split it, build the tree on the training
    # part and report the accuracy on the held-out part.
    iris = datasets.load_iris()
    # Per-class sample counts.  They are tallied from the labels in
    # iris.target; the rows of iris.data contain feature values only.
    num = [0, 0, 0]
    for label in iris.target:
        num[int(label)] += 1
    length = len(iris.target)
    [trainData, testData] = divideData()
    # NOTE(review): discretization() reads all of iris.data, so the cut values
    # are computed from test samples as well as training samples.
    razors = getRazors()
    tree = recursion(
        list(range(len(iris.feature_names))),
        trainData,
        [np.sum(trainData[:, -1] == label) for label in range(3)],
    )
    print("本次选取的训练集构建出的树:", tree)
    right = 0
    for sample in testData:
        result = judge(sample, tree)
        truth = iris.target_names[int(sample[-1])]
        print("result is", result, "truth is", truth)
        if result == truth:
            right = right + 1
    print("正确率 :", right / len(testData))
来源: http://www.jianshu.com/p/87ca7cbf1e0c