// RF_C2.cpp : 定義控制臺應用程式的入口點。
//
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <sstream>
#include "random_forest.h"
using namespace std;
vector<decision_tree*> alltrees; // 森林(決策樹集合)
vector<TupleData> trainAll,train,test; // 樣本集
vector<int> attributes; // 屬性集(元素為屬性序號)
int trainAllNum = 0;
int testAllNum = 0;
int MaxAttr; // 屬性總數
int *ArrtNum={0}; // 屬性個數集(元素為屬性最大值)
unsigned int F;
int tree_num = 10; // 決策樹個數
const int leafattrnum = -1; // 葉子節點的屬性序號
// 讀入資料
void init(char * trainname, char * testname)
{
trainAllNum = readData(trainAll, trainname);
testAllNum = readData(test, testname);
calculate_attributes();
double temp = (double)trainAllNum;
temp = log(temp)/log(2.0);
F = (unsigned int)floor(temp+0.5)+1;
if(F>MaxAttr) F = MaxAttr;
}
// 初始化訓練樣本子集
void sub_init()
{
// 選取決策樹的訓練樣本集合
RandomSelectData(trainAll, train);
// 計算樣本屬性個數
calculate_ArrtNum();
}
// 讀資料
int readData(vector<TupleData> &data, const char* fileName)
{
ifstream fin;
fin.open(fileName);
string line;
int datanum=0;
// 每行資料作為一個樣本
while(getline(fin,line))
{
TupleData d;
istringstream stream(line);
string str;
// 設定每個樣本的標簽和內容
while(stream>>str)
{
if(str.find('a')==0)
{
d.label='a';
}
else if(str.find('b')==0)
{
d.label='b';
}
else if(str.find('c')==0)
{
d.label='c';
}
else if(str.find('d')==0)
{
d.label='d';
}
else if(str.find('e')==0)
{
d.label='e';
}
else if(str.find('f')==0)
{
d.label='f';
}
else if(str.find('g')==0)
{
d.label='g';
}
else if(str.find('h')==0)
{
d.label='h';
}
else if(str.find('i')==0)
{
d.label='i';
}
else if(str.find('j')==0)
{
d.label='j';
}
else
{
int j=stringtoint(str);
d.A.push_back(j);
}
}
data.push_back(d);
datanum++;
}
fin.close();
return datanum;
}
// 生成根節點的訓練樣本子集
void RandomSelectData(vector<TupleData> &data, vector<TupleData> &subdata)
{
int index;
subdata.clear();
int d = 0;
while (d < trainAllNum)
{
index = rand() % trainAllNum;
subdata.push_back(data.at(index));
d++;
}
}
// 計算屬性序列
void calculate_attributes()
{
// 每個樣本必須具有相同的屬性個數
TupleData d = trainAll.at(0);
MaxAttr = d.A.size();
attributes.clear();
// 建立屬性集合attributes,元素為屬性序號
for (int i = 0; i < MaxAttr; i++)
{
attributes.push_back(i);
}
// 初始化屬性最大值序列,元素為屬性最大值
ArrtNum = new int[MaxAttr];
}
// 字串轉化為int
int stringtoint(string s)
{
int sum=0;
for(int i=0; s[i]!='\0';i++)
{
int j=int(s[i])-48;
sum=sum*10+j;
}
return sum;
}
// 計算ArrtNum元素值
void calculate_ArrtNum()
{
for(int i = 0; i < MaxAttr; i++) ArrtNum[i] = 0;
// ArrtNum元素值為屬性最大值
for (vector<TupleData>::const_iterator it = train.begin(); it != train.end(); it++)
{
int i = 0;
for (vector<int>::const_iterator intt=(*it).A.begin(); intt!=(*it).A.end();intt++)
{
int valuemax=(*intt)+1;
if(valuemax>ArrtNum[i]) ArrtNum[i]=valuemax;
i++;
}
}
}
// 計算熵
double Entropy(double p, double s)
{
double n = s - p;
double result = 0;
if (n != 0)
result += - double(n) / s * log(double(n) / s) / log(2.0);
if (p != 0)
result += double(-p) / s * log(double(p) / s) / log(2.0);
return result;
}
// 訓練一棵決策樹
int creat_classifier(decision_tree *&p, const vector<TupleData> &samples, vector<int> &attributes)
{
if (p == NULL)
p = new decision_tree();
// 根據樣本真實類別,輸出葉子節點類別
if (Allthesame(samples, 'a'))
{
p->node.label = 'a';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'b'))
{
p->node.label = 'b';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'c'))
{
p->node.label = 'c';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'd'))
{
p->node.label = 'd';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'e'))
{
p->node.label = 'e';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'f'))
{
p->node.label = 'f';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'g'))
{
p->node.label = 'g';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'h'))
{
p->node.label = 'h';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'i'))
{
p->node.label = 'i';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, 'j'))
{
p->node.label = 'j';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// 如果屬性序列為空,當前節點就為葉子節點
if (attributes.size() == 0)
{
p->node.label = Majorityclass(samples);
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// 計算當前節點的最優屬性
p->node.attrNum = BestGainArrt(samples, attributes);
// 中間節點無標簽
p->node.label = ' ';
// 計算子節點候選屬性集合,候選集合元素越來越少
vector<int> newAttributes;
for (vector<int>::iterator it = attributes.begin(); it != attributes.end(); it++)
if ((*it) != p->node.attrNum)
newAttributes.push_back((*it));
// 初始化樣本子集,建立maxvalue個樣本子集,也就說明該節點有maxvalue個子節點
// 為什么不建立一個閾值,進行二分類?
int maxvalue = ArrtNum[p->node.attrNum];
vector<TupleData>* subSamples = new vector<TupleData>[maxvalue];
for (int i = 0; i < maxvalue; i++)
subSamples[i].clear();
// 將樣本集合分為樣本子集
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
{
// 對樣本進行分類,分別分到maxvalue個子節點中
// p->node.attrNum是當前節點的最優屬性序號
// (*it).A.at(p->node.attrNum)正是子節點的序號
// 基于當前節點最優屬性,計算當前樣本的歸類
subSamples[(*it).A.at(p->node.attrNum)].push_back((*it));
}
decision_tree *child;
for (int i = 0; i < maxvalue; i++)
{
child = new decision_tree;
if (subSamples[i].size() == 0)
child->node.label = Majorityclass(samples);
else
creat_classifier(child, subSamples[i], newAttributes);
p->childs.push_back(child);
}
delete[] subSamples;
return 0;
}
uj5u.com熱心網友回復:
// 計算節點處的資訊增益
int BestGainArrt(const vector<TupleData> &samples, vector<int> &attributes)
{
int attr,
bestAttr = 0,
s = (int)samples.size();
int z[10]={0};
// 計算正樣本個數
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
{
if ((*it).label == 'a')
z[0]++;
if ((*it).label == 'b')
z[1]++;
if ((*it).label == 'c')
z[2]++;
if ((*it).label == 'd')
z[3]++;
if ((*it).label == 'e')
z[4]++;
if ((*it).label == 'f')
z[5]++;
if ((*it).label == 'g')
z[6]++;
if ((*it).label == 'h')
z[7]++;
if ((*it).label == 'i')
z[8]++;
if ((*it).label == 'j')
z[9]++;
}
double infoD = 0;
double bestResult = 0;
// 計算初始熵
for(int i=0;i<10;i++)
{
infoD += Entropy(z[i], s);
}
vector<int> m_attributes;
// 隨機確定候選屬性集
RandomSelectAttr(attributes, m_attributes);
// 遍歷屬性(即主題),通過資訊增益篩選最優屬性
for (vector<int>::iterator it = m_attributes.begin(); it != m_attributes.end(); it++)
{
attr = (*it);
double result = infoD;
// 第attr個屬性的最大屬性值
int maxvalue = ArrtNum[attr];
// 正負樣本集
int* subP1 = new int[maxvalue];
int* subP2 = new int[maxvalue];
int* subP3 = new int[maxvalue];
int* subP4 = new int[maxvalue];
int* subP5 = new int[maxvalue];
int* subP6 = new int[maxvalue];
int* subP7 = new int[maxvalue];
int* subP8 = new int[maxvalue];
int* subP9 = new int[maxvalue];
int* subP0 = new int[maxvalue];
int* sub = new int[maxvalue];
for (int i = 0; i < maxvalue; i++)
{
subP1[i] = 0;
subP2[i] = 0;
subP3[i] = 0;
subP4[i] = 0;
subP5[i] = 0;
subP6[i] = 0;
subP7[i] = 0;
subP8[i] = 0;
subP9[i] = 0;
subP0[i] = 0;
sub[i] = 0;
}
// 基于特定屬性,對當前訓練樣本進行分類
// 屬性計算這一步的確沒有,屬性值直接存盤在樣本中
for (vector<TupleData>::const_iterator jt = samples.begin(); jt != samples.end(); jt++)
{
switch ((*jt).label)
{
case 'a': subP1[(*jt).A.at(attr)]++;break;
case 'b': subP2[(*jt).A.at(attr)]++;break;
case 'c': subP3[(*jt).A.at(attr)]++;break;
case 'd': subP4[(*jt).A.at(attr)]++;break;
case 'e': subP5[(*jt).A.at(attr)]++;break;
case 'f': subP6[(*jt).A.at(attr)]++;break;
case 'g': subP7[(*jt).A.at(attr)]++;break;
case 'h': subP8[(*jt).A.at(attr)]++;break;
case 'i': subP9[(*jt).A.at(attr)]++;break;
case 'j': subP0[(*jt).A.at(attr)]++;break;
default: break;
}
sub[(*jt).A.at(attr)]++;
}
// 計算特定屬性下資訊增益(相對熵)
double SplitInfo = 0;
for(int i = 0; i < maxvalue; i++)
{
double partsplitinfo;
partsplitinfo = -double(sub[i])/s*log(double(sub[i])/s)/log(2.0);
SplitInfo = SplitInfo+partsplitinfo;
}
double infoattr = 0;
for (int i = 0; i < maxvalue; i++)
{
int temp=subP1[i]+subP2[i]+subP3[i]+subP4[i]+subP5[i]+subP6[i]+subP7[i]+subP8[i]+subP9[i]+subP0[i];
double partentropy=0;
partentropy += Entropy(subP1[i],temp );
partentropy += Entropy(subP2[i],temp);
partentropy += Entropy(subP3[i],temp);
partentropy += Entropy(subP4[i],temp);
partentropy += Entropy(subP5[i],temp);
partentropy += Entropy(subP6[i],temp);
partentropy += Entropy(subP7[i],temp);
partentropy += Entropy(subP8[i],temp);
partentropy += Entropy(subP9[i],temp);
partentropy += Entropy(subP0[i],temp);
infoattr = infoattr+((double)(temp)/(double)(s))*partentropy;
}
result = result - infoattr;
result = result / SplitInfo;
// 尋找最優屬性
if (result > bestResult)
{
bestResult = result;
bestAttr = attr;
}
delete[] subP1;
delete[] subP2;
delete[] subP3;
delete[] subP4;
delete[] subP5;
delete[] subP6;
delete[] subP7;
delete[] subP8;
delete[] subP9;
delete[] subP0;
delete[] sub;
}
if (bestResult == 0)
{
bestAttr=attributes.at(0);
}
return bestAttr;
}
void RandomSelectAttr(vector<int> &data, vector<int> &subdata)
{
int index;
unsigned int dataNum=data.size();
subdata.clear();
if(dataNum<=F)
{
for (vector<int>::iterator it = data.begin(); it != data.end(); it++)
{
int attr = (*it);
subdata.push_back(attr);
}
}
else
{
set<int> AttrSet;
AttrSet.clear();
while (AttrSet.size() < F)
{
index = rand() % dataNum;
if (AttrSet.count(index) == 0)
{
AttrSet.insert(index);
subdata.push_back(data.at(index));
}
}
}
}
bool Allthesame(const vector<TupleData> &samples, char ch)
{
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
if ((*it).label != ch)
return false;
return true;
}
// 確定節點中哪個類別樣本個數最多
char Majorityclass(const vector<TupleData> &samples)
{
int n = 0,temp;
int pp[10]={0};
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
{
if ((*it).label == 'a')
pp[0]++;
else if ((*it).label == 'b')
pp[1]++;
else if ((*it).label == 'c')
pp[2]++;
else if ((*it).label == 'd')
pp[3]++;
else if ((*it).label == 'e')
pp[4]++;
else if ((*it).label == 'f')
pp[5]++;
else if ((*it).label == 'g')
pp[6]++;
else if ((*it).label == 'h')
pp[7]++;
else if ((*it).label == 'i')
pp[8]++;
else if ((*it).label == 'j')
pp[9]++;
}
for(int i=0;i<10;i++)
{
if(n<pp[i])
{
n=pp[i];
temp=i;
}
}
if (temp==0)
return 'a';
else if(temp==1)
return 'b';
else if(temp==2)
return 'c';
else if(temp==3)
return 'd';
else if(temp==4)
return 'e';
else if(temp==5)
return 'f';
else if(temp==6)
return 'g';
else if(temp==7)
return 'h';
else if(temp==8)
return 'i';
else if(temp==9)
return 'j';
}
// 測驗階段
char testClassifier(decision_tree *p, TupleData d)
{
// 抵達葉子節點
if (p->node.label != ' ')
return p->node.label;
// 節點處最優屬性
int attrNum = p->node.attrNum;
// 錯誤樣本
if (d.A.at(attrNum) < 0)
return ' ';
// 確定分支
return testClassifier(p->childs.at(d.A.at(attrNum)), d);
}
void testData()
{
for (vector<TupleData>::iterator it = test.begin(); it != test.end(); it++)
{
printf("新樣本\n");
int s[10]={0,0,0,0,0,0,0,0,0,0};
for(int i = 0; i < tree_num; i++)
{
printf("第%d棵樹\n",i);
char m;
m=testClassifier(alltrees.at(i), (*it));
printf("over");
switch (m)
{
case 'a': s[0]++;break;
case 'b': s[1]++;break;
case 'c': s[2]++;break;
case 'd': s[3]++;break;
case 'e': s[4]++;break;
case 'f': s[5]++;break;
case 'g': s[6]++;break;
case 'h': s[7]++;break;
case 'i': s[8]++;break;
case 'j': s[9]++;break;
default: break;
}
for(int j=0;j<10;j++)
{
printf("p的第%d個為:%d\n",j,s[j]);
}
}
int tt=0,oo=0;
for(int j=0;j<10;j++)
{
if(oo<s[j])
{
oo=s[j];
tt=j;
}
}
if (tt==0)
printf("預測為a");
else if(tt==1)
printf("預測為b");
else if(tt==2)
printf("預測為c");
else if(tt==3)
printf("預測為d");
else if(tt==4)
printf("預測為e");
else if(tt==5)
printf("預測為f");
else if(tt==6)
printf("預測為g");
else if(tt==7)
printf("預測為h");
else if(tt==8)
printf("預測為i");
else if(tt==9)
printf("預測為j");
printf("\n");
printf("實際為%c\n",(*it).label);
}
}
void freeClassifier(decision_tree *p)
{
if (p == NULL)
return;
for (vector<decision_tree*>::iterator it = p->childs.begin(); it != p->childs.end(); it++)
{
freeClassifier(*it);
}
delete p;
}
void freeArrtNum()
{
delete[] ArrtNum;
}
//void showResult()
//{
// cout << "Train size: "<< trainAllNum<<endl;
// cout << "Test size: "<<testAllNum<<endl;
// cout << "True positive: " << TP << endl;
// cout << "False negative: "<< FN<<endl;
// cout << "False positive: "<<FP<<endl;
// cout << "True negative: "<<TN<<endl;
//}
int main( )
{
srand((unsigned)time(NULL));
// 初始化樣本
init("F:/c.txt", "F:/d.txt");
// 訓練階段
for(int i = 0; i < tree_num; i++)
{
printf("第 %d 棵決策樹訓練開始\n", i);
// 每棵樹的訓練樣本子集
sub_init();
// 訓練每棵決策樹
decision_tree * root=NULL;
creat_classifier(root, train, attributes);
// 建立森林
alltrees.push_back(root);
printf("第 %d 棵決策樹訓練完畢\n", i);
}
// 測驗階段
testData();
for (vector<decision_tree *>::const_iterator it = alltrees.begin(); it != alltrees.end(); it++)
{
freeClassifier((*it));
}
freeArrtNum();
//showResult();
system("pause");
return 0;
}
uj5u.com熱心網友回復:
這個程式是在http://www.it165.net/pro/html/201503/37220.html這基礎上改的,想變成一個多分類,一直出現_THROW(invalid_argument, "invalid vector<T> argument");的錯誤,不一定在第幾棵樹就卡死了,幫忙看看呀,謝謝啦
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/94388.html
標籤:基礎類
上一篇:元旦散分
下一篇:久違了!
