/*
 * ID3 algorithm source program (C language), 2005-3-25
 *
 * FILE: id3.c (preceded here by the contents of id3.h and proto.h)
 * Author: Andrew Colin
 * DISCLAIMER: No liability is assumed by the author for any use made
 * of this program.
 * DISTRIBUTION: Any use may be made of this program, as long as the
 * clear acknowledgment is made to the author in code and runtime
 * executables
 *
 * The listing is shown as a single translation unit: the type
 * definitions (id3.h) come first, then the prototypes (proto.h), then
 * the implementation (id3.c).
 */

/* NOTE(review): the original system #include list was lost; these are the
 * standard headers the visible code needs -- confirm against the original. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ------------------------------- id3.h -------------------------------- */

typedef unsigned int  UINT;
typedef unsigned long ULONG;
typedef char          CHAR;
typedef unsigned char BOOL;
typedef double        REAL;

typedef struct node {
    UINT idx;             /* ID code for attribute */
    REAL threshold;       /* Numerical threshold for attribute test */
    struct node *on;      /* Address of 'on' node */
    struct node *off;     /* Address of 'off' node */
    struct node *parent;  /* Address of parent node */
} NODE;

typedef struct ne_struct {
    REAL ne;              /* Negentropy value returned by negentropy() */
    UINT status;          /* INACTIVE, OFF or ON (see the enum below) */
} NEGENTROPY;

typedef struct matrix {
    UINT width;           /* Number of columns in each row */
    UINT height;          /* Number of rows */
    REAL **data;          /* data[row][column] */
} MATRIX;

/* The tag name UINT lives in the tag namespace and does not clash with the
 * typedef above; it is kept as in the original header. */
enum UINT { INACTIVE, OFF, ON };

#define LN_2 0.693147180559945309417

/* x * log2(x) for x > 0, otherwise 0. The argument is evaluated more than
 * once, so it must not have side effects. */
#define entropy(x) ((x) > 0 ? (x) * log(x) / LN_2 : 0.0)

/* ------------------------------ proto.h ------------------------------- */

NEGENTROPY negentropy(REAL **, UINT, NODE *, UINT);
void print_tree(NODE *, CHAR **);
void free_tree(NODE *);
NODE *ID3(MATRIX *, NODE *, UINT, UINT);
void err_exit(CHAR *, UINT);
MATRIX *build_matrix(UINT, UINT);
void free_matrix(MATRIX *);
void read_matrix(CHAR *, MATRIX *);
void file_size(CHAR *, UINT *, UINT *);
CHAR **read_tags(CHAR *, UINT);
void free_tags(CHAR **, UINT);

/* ------------------------------- id3.c -------------------------------- */

/*-------------------------------------------------------------------------*/

/*
 * Allocates a MATRIX with 'height' rows of 'width' REALs each.
 * Rows are zero-filled. Any allocation failure is fatal (err_exit).
 * The result is released with free_matrix().
 */
MATRIX *build_matrix(UINT width, UINT height)
{
    MATRIX *_matrix;
    UINT i;

    _matrix = malloc(sizeof *_matrix);
    if (_matrix == NULL)
        err_exit(__FILE__, __LINE__);

    _matrix->width = width;
    _matrix->height = height;

    /* calloc may legitimately return NULL for a zero-sized request, so a
     * NULL result is only an error when something was actually asked for. */
    _matrix->data = calloc(height, sizeof *_matrix->data);
    if (_matrix->data == NULL && height > 0)
        err_exit(__FILE__, __LINE__);

    for (i = 0; i < height; i++) {
        _matrix->data[i] = calloc(width, sizeof *_matrix->data[i]);
        if (_matrix->data[i] == NULL && width > 0)
            err_exit(__FILE__, __LINE__);
    }

    return _matrix;
}

/*-------------------------------------------------------------------------*/

/*
 * Standard error handler function.
 * Reports the source file and line of the failure on stderr and terminates
 * the program with a failure status. Does not return.
 */
void err_exit(CHAR *file, UINT line)
{
    fprintf(stderr, "\n Fatal error in file %s, line %u\n", file, line);
    exit(EXIT_FAILURE);
}

/*-------------------------------------------------------------------------*/

/*
 * Given the name of a file of numeric data, this routine counts
 * the numbers of rows and columns. It's assumed that the number
 * of entries is the same in each row, and an error is flagged if this
 * is not the case.
 *
 * file_name  path of the data file
 * width      receives the number of entries found in the first row
 * height     receives the number of non-empty rows
 *
 * A missing file, an allocation failure or a row whose entry count differs
 * from the first row is fatal (err_exit). Lines are read in pieces of at
 * most 254 characters; a longer line is counted as several rows.
 */
void file_size(CHAR *file_name, UINT *width, UINT *height)
{
    /* NOTE(review): the delimiter literal was lost in extraction; space and
     * comma is the reconstruction -- confirm against read_matrix(). */
    static const CHAR delimiters[] = " ,";
    FILE *f;
    UINT buf_size = 0xFF, _width = 0;
    CHAR *buffer, *ptr;

    *width = *height = 0;

    buffer = malloc(buf_size * sizeof *buffer);
    if (buffer == NULL)
        err_exit(__FILE__, __LINE__);

    /* Open data file - abort if filename invalid */
    f = fopen(file_name, "r");
    if (f == NULL) {
        fprintf(stderr, "\n File not found : %s\n", file_name);
        err_exit(__FILE__, __LINE__);
    }

    /* Get number of entries in first row */
    if (fgets(buffer, (int) buf_size, f) != NULL) {
        ++*height;
        for (ptr = strtok(buffer, delimiters); ptr != NULL;
             ptr = strtok(NULL, delimiters))
            ++*width;
    }

    /* Count numbers of subsequent rows. Looping on fgets() itself stops
     * cleanly at end-of-file or on a read error. */
    while (fgets(buffer, (int) buf_size, f) != NULL) {
        /* Skip lines that hold nothing but a newline character */
        if (strlen(buffer) <= strlen("\n"))
            continue;

        ++*height;
        _width = 0;
        for (ptr = strtok(buffer, delimiters); ptr != NULL;
             ptr = strtok(NULL, delimiters))
            ++_width;

        if (*width != _width) {
            fprintf(stderr, "\n Number of entries in file %s did not agree",
                    file_name);
            err_exit(__FILE__, __LINE__);
        }
    }

    fclose(f);
    free(buffer);
}

/*-------------------------------------------------------------------------*/

/*
 * Releases a matrix created by build_matrix(): every row, the row table
 * and the MATRIX itself. A NULL argument is ignored.
 */
void free_matrix(MATRIX *_matrix)
{
    UINT i;

    if (_matrix == NULL)
        return;

    if (_matrix->data != NULL) {
        for (i = 0; i < _matrix->height; i++)
            free(_matrix->data[i]);
    }
    free(_matrix->data);
    free(_matrix);
}

/*-------------------------------------------------------------------------*/

/*
 * Releases 'width' tag strings and the table that holds them.
 *
 * NOTE(review): the body of this function was lost in extraction; it is
 * reconstructed by analogy with free_matrix() -- confirm that read_tags()
 * allocates both the table and each string on the heap.
 */
void free_tags(CHAR **varname, UINT width)
{
    UINT i;

    if (varname == NULL)
        return;

    for (i = 0; i < width; i++)
        free(varname[i]);
    free(varname);
}

/*-------------------------------------------------------------------------*/

/*
 * Recursively releases a decision tree: both sub-trees first, then the
 * node itself. A NULL node ends the recursion.
 *
 * NOTE(review): the function header and NULL test were lost in extraction
 * and are reconstructed from the prototype and the surviving recursive calls.
 */
void free_tree(NODE *node)
{
    if (node == NULL)
        return;

    free_tree(node->on);
    free_tree(node->off);
    free(node);
}

/*-------------------------------------------------------------------------*/

/*
 * Routine to build a decision tree, based on Quinlan's ID3 algorithm.
 *
 * matrix  sample data, one row per sample and one column per attribute
 * parent  node this one hangs from, or NULL for the root
 * target  column index of the classification target
 * state   ON or OFF: which branch of 'parent' this node is; ignored when
 *         'parent' is NULL
 *
 * Returns the newly allocated node, already linked into 'parent'. The tree
 * is released with free_tree(). Allocation failure, or a matrix that offers
 * no candidate split at all, is fatal (err_exit).
 */
NODE *ID3(MATRIX *matrix, NODE *parent, UINT target, UINT state)
{
    NEGENTROPY negentropy_struct;
    NODE *node;
    UINT n_vars = matrix->width, n_samples = matrix->height, i, j;
    UINT split = 0;
    REAL **data = matrix->data;
    REAL best_threshold = 0.0, min_negentropy = 1.0, _negentropy;
    int evaluated = 0;    /* set once at least one candidate was scored */

    negentropy_struct.ne = 0.0;
    negentropy_struct.status = INACTIVE;

    /* Allocate memory for this node */
    node = malloc(sizeof *node);
    if (node == NULL)
        err_exit(__FILE__, __LINE__);

    /* Set up links in decision tree */
    node->parent = parent;    /* Set address of parent node */
    node->on = node->off = NULL;

    if (parent != NULL) {
        /* parent to child; not relevant for root node */
        /* Pass address of this node to the parent node */
        if (state == ON)
            parent->on = node;
        else if (state == OFF)
            parent->off = node;
    }

    /*
     * Select attribute with lowest negentropy for splitting. Scan through
     * ALL attributes (except the target) and ALL data samples. This is
     * pretty inefficient for data sets with repeated values, but will do
     * for illustrative purposes
     */
    for (i = 0; i < n_vars; i++) {
        if (i == target)
            continue;

        for (j = 0; j < n_samples; j++) {
            /* Try this attribute/threshold pair on the node... */
            node->idx = i;
            node->threshold = data[j][i];

            /* ...and calculate the negentropy of this partition */
            negentropy_struct = negentropy(data, n_samples, node, target);
            _negentropy = negentropy_struct.ne;

            /*
             * If this negentropy is lower than any other, retain the
             * index and threshold for future use. The first candidate is
             * always retained, so 'split' and 'best_threshold' are defined
             * even when no candidate scores below the initial bound of 1.0.
             */
            if (!evaluated || _negentropy < min_negentropy) {
                min_negentropy = _negentropy;
                split = i;
                best_threshold = data[j][i];
                evaluated = 1;
            }
        }
    }

    /* No samples, or no attribute besides the target: nothing to split on */
    if (!evaluated)
        err_exit(__FILE__, __LINE__);

    /* Save the combination of best attribute and threshold value */
    node->idx = split;
    node->threshold = best_threshold;

    /*
     * If the negentropy routine found itself at an end-of-branch
     * for the decision tree, the 'status' flag in 'negentropy_struct'
     * is set to ON or OFF and the node labelled accordingly. Otherwise,
     * ID3 continues to call itself until all end-of-branch nodes are
     * found.
     *
     * NOTE(review): the status tested here belongs to the LAST negentropy()
     * call of the scan, not to the call that produced the best split --
     * presumably status depends only on the samples reaching this node;
     * confirm against negentropy().
     */
    if (negentropy_struct.status != INACTIVE) {
        node->on = node->off = NULL;
        node->idx = negentropy_struct.status;  /* leaf: idx holds the label */
    } else {
        node->on = ID3(matrix, node, target, ON);
        node->off = ID3(matrix, node, target, OFF);
    }

    return node;
}

/*-------------------------------------------------------------------------*/

/*
 * Returns a newly allocated string holding 'stem' followed by 'ext'.
 * Sized from its inputs, so a long file name cannot overrun a fixed buffer.
 * Allocation failure is fatal (err_exit). The caller frees the result.
 */
static CHAR *make_file_name(const CHAR *stem, const CHAR *ext)
{
    size_t stem_len = strlen(stem);
    size_t ext_len = strlen(ext);
    CHAR *name = malloc(stem_len + ext_len + 1);

    if (name == NULL)
        err_exit(__FILE__, __LINE__);

    memcpy(name, stem, stem_len);
    memcpy(name + stem_len, ext, ext_len + 1);  /* +1 copies the terminator */
    return name;
}

/*
 * Program entry point.
 *
 * Takes one argument, the base name of the data set: "<name>.dat" supplies
 * the samples and "<name>.tag" the column labels. Builds the decision tree
 * with the last column as classification target, prints it and releases
 * all storage. Returns EXIT_SUCCESS, or EXIT_FAILURE on a usage error or
 * when the data file holds too little data to classify.
 */
int main(int argc, char *argv[])
{
    MATRIX *matrix;
    NODE *node;
    UINT target, n_vars, n_samples;
    CHAR *data_file, *tag_file;
    CHAR **tag_names;

    /* Set up file names */
    if (argc != 2) {
        printf("\nUsage: id3 [datafile]");
        return EXIT_FAILURE;
    }

    printf("\nWelcome to ID3");
    printf("\nLast compiled on %s, %s", __DATE__, __TIME__);
    printf("\n");

    data_file = make_file_name(argv[1], ".dat");
    tag_file = make_file_name(argv[1], ".tag");

    /* Read dimensions of data file */
    file_size(data_file, &n_vars, &n_samples);

    /* At least one attribute plus the target column, and one sample, are
     * needed; this also keeps 'n_vars - 1' below from wrapping around. */
    if (n_vars < 2 || n_samples == 0) {
        fprintf(stderr, "\n Not enough data in file %s\n", data_file);
        free(tag_file);
        free(data_file);
        return EXIT_FAILURE;
    }

    /* Read labels for columns of data */
    tag_names = read_tags(tag_file, n_vars);

    /* Allocate storage for data... */
    matrix = build_matrix(n_vars, n_samples);

    /* ...and read it from disk */
    read_matrix(data_file, matrix);

    /* Classification target is last column */
    target = n_vars - 1;

    /* Return root of decision tree - ID3 continues to call itself
     * recursively. The state argument is ignored for the root node. */
    node = ID3(matrix, NULL, target, INACTIVE);

    print_tree(node, tag_names);
    printf("\n");

    free_tags(tag_names, n_vars);
    free_matrix(matrix);
    free_tree(node);
    free(tag_file);
    free(data_file);

    return EXIT_SUCCESS;
}

/*-------------------------------------------------------------------------*/

/*
 * NEGENTROPY negentropy(REAL **data, UINT n_samples, NODE *local,
 *                       UINT target)
 *
 * The listing is cut off just after the start of this function's header
 * comment ("Calculates the entropy of classif..."). Its definition, and
 * those of print_tree(), read_matrix() and read_tags(), are not part of
 * this excerpt.
 */
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1