具体算法步骤如下; 1创建节点N 2如果训练集为空,在返回节点N标记为Failure 3如果训练集中的所有记录都属于同一个类别,则以该类别标记节点N 4如果候选属性为空,则返回N作为叶节点,标记为训练集中最普通的类; 5for each 候选属性 attribute_list 6if 候选属性是联系的then 7对该属性进行离散化 8选择候选属性attribute_list中具有最高信息增益的属性D 9标记节点N为属性D 10for each 属性D的一致值d 11由节点N长出一个条件为D=d的分支 12设s是训练集中D=d的训练样本的集合 13if s为空 14加上一个树叶,标记为训练集中最普通的类 15else加上一个有C4.5(R - {D},C,s)返回的点
创新互联-专业网站定制、快速模板网站建设、高性价比龙州网站开发、企业建站全套包干低至880元,成熟完善的模板库,直接使用。一站式龙州网站制作公司更省心,省钱,快速模板网站建设找我们,业务覆盖龙州地区。费用合理售后完善,十多年实体公司更值得信赖。
C++代码你可以参考下
C4.5算法源代码(C++)
// C4.5_test.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include stdio.h
#include math.h
#include "malloc.h"
#include stdlib.h
const int MAX = 10;
int** iInput;
int i = 0;//列数
int j = 0;//行数
void build_tree(FILE *fp, int* iSamples, int* iAttribute,int ilevel);//输出规则
int choose_attribute(int* iSamples, int* iAttribute);//通过计算信息增益率选出test_attribute
double info(double dTrue,double dFalse);//计算期望信息
double entropy(double dTrue, double dFalse, double dAll);//求熵
double splitinfo(int* list,double dAll);
int check_samples(int *iSamples);//检查samples是否都在同一个类里
int check_ordinary(int *iSamples);//检查最普通的类
int check_attribute_null(int *iAttribute);//检查attribute是否为空
void get_attributes(int *iSamples,int *iAttributeValue,int iAttribute);
int _tmain(int argc, _TCHAR* argv[])
{
FILE *fp;
FILE *fp1;
char iGet;
int a = 0;
int b = 0;//a,b是循环变量
int* iSamples;
int* iAttribute;
fp = fopen("c:\\input.txt","r");
if (NULL == fp)
{
printf("error\n");
return 0;
}
iGet = getc(fp);
while (('\n' != iGet)(EOF != iGet))
{
if (',' == iGet)
{
i++;
}
iGet = getc(fp);
}
i++;
iAttribute = (int *)malloc(sizeof(int)*i);
for (int k = 0; ki; k++)
{
iAttribute[k] = (int)malloc(sizeof(int));
iAttribute[k] = 1;
}
while (EOF != iGet)
{
if ('\n' == iGet)
{
j++;
}
iGet = getc(fp);
}
j++;
iInput = (int **)malloc(sizeof(int*)*j);
iSamples = (int *)malloc(sizeof(int)*j);
for (a = 0;a j;a++)
{
iInput[a] = (int *)malloc(sizeof(int)*i);
iSamples[a] = (int)malloc(sizeof(int));
iSamples[a] = a;
}
a = 0;
fclose(fp);
fp=fopen("c:\\input.txt","r");
iGet = getc(fp);
while(EOF != iGet)
{
if ((',' != iGet)('\n' != iGet))
{
iInput[a][b] = iGet - 48;
b++;
}
if (b == i)
{
a++;
b = 0;
}
iGet = getc(fp);
}
fp1 = fopen("d:\\output.txt","w");
build_tree(fp1,iSamples,iAttribute,0);
fclose(fp);
return 0;
}
void build_tree(FILE * fp, int* iSamples, int* iAttribute,int level)//
{
int iTest_Attribute = 0;
int iAttributeValue[MAX];
int k = 0;
int l = 0;
int m = 0;
int *iSamples1;
for (k = 0; kMAX; k++)
{
iAttributeValue[k] = -1;
}
if (0 == check_samples(iSamples))
{
fprintf(fp,"result: %d\n",iInput[iSamples[0]][i-1]);
return;
}
if (1 == check_attribute_null(iAttribute))
{
fprintf(fp,"result: %d\n",check_ordinary(iSamples));
return;
}
iTest_Attribute = choose_attribute(iSamples,iAttribute);
iAttribute[iTest_Attribute] = -1;
get_attributes(iSamples,iAttributeValue,iTest_Attribute);
k = 0;
while ((-1 != iAttributeValue[k])(k MAX))
{
l = 0;
m = 0;
while ((-1 != iSamples[l])(l j))
{
if (iInput[iSamples[l]][iTest_Attribute] == iAttributeValue[k])
{
m++;
}
l++;
}
iSamples1 = (int *)malloc(sizeof(int)*(m+1));
l = 0;
m = 0;
while ((-1 != iSamples[l])(l j))
{
if (iInput[iSamples[l]][iTest_Attribute] == iAttributeValue[k])
{
iSamples1[m] = iSamples[l];
m++;
}
l++;
}
iSamples1[m] = -1;
if (-1 == iSamples1[0])
{
fprintf(fp,"result: %d\n",check_ordinary(iSamples));
return;
}
fprintf(fp,"level%d: %d = %d\n",level,iTest_Attribute,iAttributeValue[k]);
build_tree(fp,iSamples1,iAttribute,level+1);
k++;
}
}
int choose_attribute(int* iSamples, int* iAttribute)
{
int iTestAttribute = -1;
int k = 0;
int l = 0;
int m = 0;
int n = 0;
int iTrue = 0;
int iFalse = 0;
int iTrue1 = 0;
int iFalse1 = 0;
int iDepart[MAX];
int iRecord[MAX];
double dEntropy = 0.0;
double dGainratio = 0.0;
double test = 0.0;
for (k = 0;kMAX;k++)
{
iDepart[k] = -1;
iRecord[k] = 0;
}
k = 0;
while ((l!=2)(k(i - 1)))
{
if (iAttribute[k] == -1)
{
l++;
}
k++;
}
if (l == 1)
{
for (k = 0;k(k-1);k++)
{
if (iAttribute[k] == -1)
{
return iAttribute[k];
}
}
}
for (k = 0;k (i-1);k++)
{
l = 0;
iTrue = 0;
iFalse = 0;
if (iAttribute[k] != -1)
{
while ((-1 != iSamples[l])(l j))
{
if (0 == iInput[iSamples[l]][i-1])
{
iFalse++;
}
if (1 == iInput[iSamples[l]][i-1])
{
iTrue++;
}
l++;
}
for (n = 0;nl;n++)//计算该属性有多少不同的值并记录
{
m = 0;
while((iDepart[m]!=-1)(m!=MAX))
{
if (iInput[iSamples[n]][iAttribute[k]] == iDepart[m])
{
break;
}
m++;
}
if (-1 == iDepart[m])
{
iDepart[m] = iInput[iSamples[n]][iAttribute[k]];
}
}
while ((iDepart[m] != -1)(m!=MAX))
{
for (n = 0;nl;n++)
{
if (iInput[iSamples[n]][iAttribute[k]] == iDepart[m])
{
if (1 == iInput[iSamples[n]][i-1])
{
iTrue1++;
}
if (0 == iInput[iSamples[n]][i-1])
{
iFalse1++;
}
iRecord[m]++;
}
}
dEntropy += entropy((double)iTrue1,(double)iFalse1,(double)l);
iTrue1 = 0;
iFalse1 = 0;
m++;
}
double dSplitinfo = splitinfo(iRecord,(double)l);
if (-1 == iTestAttribute)
{
iTestAttribute = k;
dGainratio = (info((double)iTrue,(double)iFalse)-dEntropy)/dSplitinfo;
}
else
{
test = (info((double)iTrue,(double)iFalse)-dEntropy)/dSplitinfo;
if (dGainratio test)
{
iTestAttribute = k;
dGainratio = test;
}
}
}
}
return iTestAttribute;
}
double info(double dTrue,double dFalse)
{
double dInfo = 0.0;
dInfo = ((dTrue/(dTrue+dFalse))*(log(dTrue/(dTrue+dFalse))/log(2.0))+(dFalse/(dTrue+dFalse))*(log(dFalse/(dTrue+dFalse))/log(2.0)))*(-1);
return dInfo;
}
double entropy(double dTrue, double dFalse, double dAll)
{
double dEntropy = 0.0;
dEntropy = (dTrue + dFalse)*info(dTrue,dFalse)/dAll;
return dEntropy;
}
double splitinfo(int* list,double dAll)
{
int k = 0;
double dSplitinfo = 0.0;
while (0!=list[k])
{
dSplitinfo -= ((double)list[k]/(double)dAll)*(log((double)list[k]/(double)dAll));
k++;
}
return dSplitinfo;
}
int check_samples(int *iSamples)
{
int k = 0;
int b = 0;
while ((-1 != iSamples[k])(k j-1))
{
if (iInput[k][i-1] != iInput[k+1][i-1])
{
b = 1;
break;
}
k++;
}
return b;
}
int check_ordinary(int *iSamples)
{
int k = 0;
int iTrue = 0;
int iFalse = 0;
while ((-1 != iSamples[k])(k i))
{
if (0 == iInput[iSamples[k]][i-1])
{
iFalse++;
}
else
{
iTrue++;
}
k++;
}
if (iTrue = iFalse)
{
return 1;
}
else
{
return 0;
}
}
int check_attribute_null(int *iAttribute)
{
int k = 0;
while (k (i-1))
{
if (-1 != iAttribute[k])
{
return 0;
}
k++;
}
return 1;
}
void get_attributes(int *iSamples,int *iAttributeValue,int iAttribute)
{
int k = 0;
int l = 0;
while ((-1 != iSamples[k])(k j))
{
l = 0;
while (-1 != iAttributeValue[l])
{
if (iInput[iSamples[k]][iAttribute] == iAttributeValue[l])
{
break;
}
l++;
}
if (-1 == iAttributeValue[l])
{
iAttributeValue[l] = iInput[iSamples[k]][iAttribute];
}
k++;
}
}
1、熵是描述区域的随机程度的,P=ΣC*logC,C是灰度概率值,当图像均匀时,各灰度值的概率基本相等,熵可以达到最大
2、例程:
#includeiostream.h
#includemath.h
int i,j;
double rowsum(double table[][4],int nrow)//定义第i行的边际概率函数
{
for(i=0;inrow;i++)
{
for( j=1;j4;j++)
table[i][0]+=table[i][j];
}
return 0;
}
double liesum(double table[4][4],int nlie)//定义第j列的边际概率函数
{ for( j=0;jnlie;j++)
{
for( i=1;i4;i++)
table[0][j]+=table[i][j];
}
return 0;
}
void main()
{
double p[4][4]={{1.0/8.0,1.0/16.0,1.0/32.0,1.0/32.0},{1.0/16.0,1.0/8.0,1.0/32.0,1.0/32.0},
{1.0/16.0,1.0/16.0,1.0/16.0,1.0/16.0},{1.0/4.0,0.0,0.0,0.0}};
for ( i=0;i4;i++)//输出概率矩阵
{
for ( j=0;j4;j++)
coutp[i][j]" ";
coutendl;
}coutendl;
rowsum(p,4);//调用函数输出第i行的边际概率
for (i =0;i4;i++)
{cout"第"i"行的边际概率p""["i"]""是"p[i][0]endl;}coutendl;
liesum(p,4);//调用函数输出第j列的边际概率
for ( j =0;j4;j++)
{cout"第"j"列的条件概率p""["j"]""是"p[0][j]endl;}coutendl;
// double p[4][4];
double H1=0.0;
for( i=0;i4;i++)
{H1+=p[i][0]*(log((1.0/p[i][0])/log(2.0)));}
double H2=0.0;
for( j=0;j4;j++)
{H2+=p[0][j]*(log((1.0/p[0][j])/log(2.0)));}
double H3=0.0;
for(i=0;i3;i++)
for(j=0;j4;j++)
{H3+=p[i][j]*(log(1.0/p[i][j])/log(2.0));}
H3+=p[4][1]*(log(1.0/p[4][1])/log(2.0));
cout"X的熵:H(X)="H1endl;
cout"Y的熵:H(Y)="H2endl;
cout"(X,Y)的熵:H(X,Y)="H3endl;
coutendl;
cout"条件熵:H(X|Y)="H3-H2endl;
cout"条件熵:H(Y|X)="H3-H1endl;
cout"互信息:I(X;Y)="H1+H2-H3endl;
int size=4;//定义联合概率p为维数组
double *p;
p=new double[size];
for ( i=0;i4;i++)//联合概率计算
{
for ( j=0;j4;j++)
{
/*int nSize;
scanf( "%d", nSize );
int *p = ( int* )malloc( sizeof( int ) * nSize );
for( int i = 0; i nSize; i++ )
p[ i ] = 0;
double table[4][4];
p[i]=pp[0][i]*table[i][j];
cout"联合概率""p""["i"]""["j"]""是"p[i]endl;
}
}
for ( i=0;i4;i++)//联合熵的计算
{
for ( j=0;j4;j++)
{
// H+=p[i][j]*log(1.0/p[i][j]);
H+=p[i]*(log((1.0/p[i])/log(2.0)));
}
}
cout"联合H(x,y)熵为"Hendl;
delete []p; */
}
#include stdio.h
#include string.h
#includemath.h
int main()//是少了main函数,程序里面一定要有main函数的
{
double p[100];//每个信源的概率
int n;//信源个数
int i;
double sum=0;
scanf("%d",n);
for(i=0;in;i++)
{
scanf("%lf",p[i]);
sum+=-p[i]*(log(p[i])/log(2.0));
}
printf("%lf\n",sum);
return 0;
}