ID3演算法-有解無憂

ID3演算法最核心的思想是采用資訊增益來選擇特征，決策樹類的演算法最大的不同就是特征的選擇標準不同，C4.5采用資訊增益比，用于減少ID3演算法的局限（在訓練集中，某個屬性所取的不同值的個數越多，那么越有可能拿它來作為分裂屬性，而這樣做有時候是沒有意義的）,CART演算法采用gini系數，不僅可以用來分類，也可以解決回歸問題，

我這里的資料集采用mnist（二進制檔案），使用ID3來對圖片集進行分類，將每一個像素作為一個特征，為了提高預測的精確率，需要對圖片進行二值化處理，

資訊增益計算公式：

以下是ID3的源代碼，

# encoding=utf-8

import cv2
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# 二值化
def binaryzation(img):
    cv_img = img.astype(np.uint8) # astype用于改變資料的型別   uint8是無符號八位（bit）整型，表示范圍是[0, 255]的整數
    # 原圖片，閾值，最大值，劃分時采用的演算法  大于50的點設定為0，其他的為最大值1
    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img) 
    return cv_img

def binaryzation_features(trainset):
    features = []
    for img in trainset:
        img = np.reshape(img,(28,28)) # 28行*28列，每一個元素代表一個像素值，元素的值是8位無符號整型
        cv_img = img.astype(np.uint8)
        img_b = binaryzation(cv_img)  # 此為上面自定義的函式
        # hog_feature = np.transpose(hog_feature)
        features.append(img_b)
    features = np.array(features) # 將list轉化為array
    features = np.reshape(features,(-1,feature_len)) # 我不知道可以分成多少行，但是我的需要是分成feature_len列(feature_len = 784) 
    return features


import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind)
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8)) # 一個I代表4個位元組，所以一共有8位元組的頭部，分別存入變數magic和n中
        labels = np.fromfile(lbpath, dtype=np.uint8) # 一個位元組一讀，并轉化為8位無符號整型
    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        images = binaryzation_features(np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784))
        # images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784) # 除去16個位元組的頭部之后，剩下的資料轉變為8位無符號整型
    return images, labels
    # 訓練集一共有9912422位元組，訓練集一共有60000個樣本，通過樣本數無法推得有那么多的位元組，應該是經過壓縮了的



# 決策樹的定義
class Tree(object):
    def __init__(self,node_type,Class = None, feature = None):
        self.node_type = node_type  # 節點型別（internal或leaf）
        self.dict = {}              # dict的鍵表示特征Ag的可能值ai，值表示根據ai得到的子樹，{特征值：子樹}，采用字典的形式來描述樹型結構
        self.Class = Class          # 葉節點表示的類標記，若是內部節點則為none
        self.feature = feature      # 表示當前的樹即將由第feature個特征劃分（即第feature特征是使得當前樹中資訊增益最大的特征）

    def add_tree(self,key,tree):    # key代表的特征值，不同的key用于區分不同的子樹
        self.dict[key] = tree       # 一個key就是一條樹的分支

    def predict(self,features):     # features為當前待預測的一個樣本
        # 當樣本中采樣不全，結果在測驗集中出現了一個在樣本中從未出現過特征值，則搜索會在內部節點就直接終止，self.class便是None
        if self.node_type == 'leaf' or (features[self.feature] not in self.dict): 
            return self.Class       #  predict最侄訓傳的是樣本對應的類標記，
        tree = self.dict.get(features[self.feature]) # 找出第feature個特征所對應的特征值
        return tree.predict(features)  # 遞回搜索


# 計算資料集x的經驗熵H(x)——每一個類的樣本數量在所有類樣本資料中所占的比重
def calc_ent(x):  # 類列
    # 樣本中有那些類（樣本中的類可能不全），x的型別是array，先將其轉變為list，再變為set
    x_value_list = set([x[i] for i in range(x.shape[0])]) # 形成不重復的類集合
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]  # 當前類的數量 / 總的類的數量
        logp = np.log2(p)
        ent -= p * logp # 計算熵，熵越大，隨機變數的不確定性就越大
    return ent


# 計算條件熵H(y/x)
def calc_condition_ent(x, y): # 特征列，類列
    x_value_list = set([x[i] for i in range(x.shape[0])]) # 某一特征中的所有不重復的特征值
    ent = 0.0
    for x_value in x_value_list:
        # 從x這一array陣列中依次遍歷，挑出其中與值x_value相同的，由于train_test_split的影響，其結果會直接對應y中的對應類
        sub_y = y[x == x_value] 
        temp_ent = calc_ent(sub_y) # sub_y是擁有相同特征值的不同的類的串列
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent


# 計算資訊增益
def calc_ent_grap(x,y):  # 特征列，類列
    base_ent = calc_ent(y) # 計算資料集x的經驗熵H(x)
    condition_ent = calc_condition_ent(x, y) # 計算條件熵H(y/x)
    ent_grap = base_ent - condition_ent
    return ent_grap


# ID3演算法
def recurse_train(train_set, train_label, features): # train_features, train_labels. 訓練集以及其對應的類串列  features為特征串列 
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # 步驟1——如果訓練集train_set中的所有實體都屬于同一類Ck
    label_set = set(train_label)  # set可以理解為資料不重復的串列，且元素為不可變物件
    if len(label_set) == 1:
        return Tree(LEAF,Class = label_set.pop())

    # 步驟2——處理特征集為空時的情況
        # filter() 函式用于過濾序列，過濾掉不符合條件的元素，回傳由符合條件元素組成的迭代器物件，
        # lambda x:x==i,train_label，x指向train_label，依次搜索train_label中的每一個值，如果與i相等，則將其放入list中
        # 計算每一個類出現的個數，并用元組存放結果[(類標號，類的數量), ...]
    class_len = [(i,len(list(  filter(lambda x:x==i,train_label)  )))   for i in range(class_num)] 
    (max_class, max_len) = max(class_len, key = lambda x:x[1]) # 回傳串列中元組中的類數量最多的那個元組   x[1]類所對應的數量
        # 類， 類的數量 
    if len(features) == 0:  # features為特征集合
        return Tree(LEAF,Class = max_class)

    # 步驟3——計算資訊增益,并選擇資訊增益最大的特征
    max_feature = 0  # 能產生最大資訊增益的特征
    max_gda = 0      # 資訊增益的最大值
    D = train_label  # train_labels,訓練集對應的類串列
    for feature in features: # 從784個特征中依次選取特征計算器資訊增益
        A = np.array(train_set[:,feature].flat) # 選擇訓練集中的第feature列的所有特征值，.flat是將陣列轉換為1-D的迭代器
        gda=calc_ent_grap(A,D) # 計算資訊增益
        if gda > max_gda:
            max_gda,max_feature = gda,feature
    '''
    如果要寫C4.5演算法，那就可以直接在ID3演算法的基礎上，將用于特征選擇的資訊增益改為資訊增益比，即添加以下代碼：
        ent = calc_ent(A)
        if ent != 0:
            gad /= ent
    不過有意思的是，我曾將ID3改為C4.5來對MNIST進行學習分類，但是會報記憶體溢位錯誤，一直不明白這是怎么回事，因為我只是在ID3的基礎上多呼叫了一個calc_ent(A)函式，而且函式運行完畢也會將記憶體釋放的，怎么就會導致記憶體溢位呢？
    '''

    # 步驟4——資訊增益小于閾值，這里采用先剪枝演算法，防止過擬合
    if max_gda < epsilon:
        return Tree(LEAF,Class = max_class)

    # 步驟5——構建非空子集
    sub_features = list(filter(lambda x:x!=max_feature,features)) # 將已經使用過的特征從特征集中洗掉
    tree = Tree(INTERNAL,feature=max_feature) # 當前樹節點是內部節點，將由feature特征繼續劃分樣本集
    max_feature_col = np.array(train_set[:,max_feature].flat) # .flat是將陣列轉換為1-D的迭代器
        # 保存資訊增益最大的特征可能的取值 (shape[0]表示計算行數)
        # y.shape 回傳的一個元組，代表 y 資料集的資訊如（行，列） y.shape[0], 意思是：回傳 y 中行的總數，
        # 這個值在 y 是單特征的情況下 和 len(y) 是等價的，即資料集中資料點的總數，
    # 依據特征值的不同將樣本分到不同的子樹
    feature_value_list = set([max_feature_col[i] for i in range(max_feature_col.shape[0])]) 
    for feature_value in feature_value_list:
        index = []
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]     # 將特征值同是feature_value的樣本放到一起，形成一個新的子樹樣本
        sub_train_label = train_label[index] # 將樣本對應的類也封裝到一起，一起傳入子樹中
        sub_tree = recurse_train(sub_train_set,sub_train_label,sub_features) # 遞回函式
        tree.add_tree(feature_value,sub_tree)      
    return tree

def train(train_set,train_label,features):
    return recurse_train(train_set,train_label,features)

def predict(test_set,tree):
    result = []
    for features in test_set:
        tmp_predict = tree.predict(features)
        result.append(tmp_predict) # result存盤測驗集中的樣本對應的測驗得到的類
    return np.array(result) # 將list陣列轉變為array陣列



class_num = 10     # MINST資料集有10種labels，分別是“0,1,2,3,4,5,6,7,8,9”
feature_len = 784  # MINST資料集每個image有28*28=784個特征（pixels）
epsilon = 0.01

if __name__ == '__main__':
    print("ID3")
    print("Start read data...")
    t1 = time.time()
    # basedir = os.path.dirname(__file__) 
    train_features,train_labels = load_mnist("data" , kind='train')
    test_features,test_labels = load_mnist("data", kind='t10k')
    t2 = time.time()
    print("讀取資料用時：" + str((t2-t1)))
    
    print('Start training...')
    tree = train(train_features, train_labels, list(range(feature_len))) # 0-783的一個陣列，特征集
    t3 = time.time()
    print("訓練資料用時：" + str((t3-t2)))

    print('Start predicting...')
    test_predict = predict(test_features,tree)  # test_features測驗集，tree為訓練好的模型，函式回傳的型別為np.array
    t4 = time.time()
    print("預測結果用時：" + str((t4-t3)))

    r = 0
    for i in range(len(test_predict)):
        if test_predict[i] == None: # 在樹的某一個分支處沒有其對應的特征值
            test_predict[i] = 10 # 最終結果是不匹配，但是由于需要比較，所以得將None變成數字，不存在10這個手寫數字
        else:
            if test_predict[i] == test_labels[i]:
                r = r + 1
    score = float(r)/float(len(test_predict))
    print("The accruacy score is %f" % score)

參考鏈接：

1.https://github.com/Dod-o/Statistical-Learning-Method_Code

2.https://www.cnblogs.com/kuaizifeng/p/9110157.html

3.https://blog.csdn.net/thxiong1234/article/details/79920526

我在這里重新寫一遍只是為了整理自己學過的東西并加深自己的理解，方便以后回顧復習，對于初學的朋友，建議直接閱讀以上的鏈接，

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/117872.html

標籤：其他

上一篇：查找演算法

下一篇：LeetCode刷題--1.兩數之和（簡單）