From d611fb61fc5f31778ddb9c974d516df0e61ee13d Mon Sep 17 00:00:00 2001
From: "P. Druzhkov"
Date: Wed, 13 Oct 2010 20:18:12 +0000
Subject: [PATCH] Gradient Boosting Trees (CvGBTrees) added to the OpenCV ML
 module. A test for all CvGBTrees public methods added.

---
 modules/ml/include/opencv2/ml/ml.hpp |  529 +++++++++++++
 modules/ml/src/gbt.cpp               | 1044 ++++++++++++++++++++++++++
 tests/ml/src/gbttest.cpp             |  271 +++++++
 3 files changed, 1844 insertions(+)
 create mode 100644 modules/ml/src/gbt.cpp
 create mode 100644 tests/ml/src/gbttest.cpp

diff --git a/modules/ml/include/opencv2/ml/ml.hpp b/modules/ml/include/opencv2/ml/ml.hpp
index 1768a2d0b9..ceaf08ff5c 100644
--- a/modules/ml/include/opencv2/ml/ml.hpp
+++ b/modules/ml/include/opencv2/ml/ml.hpp
@@ -183,6 +183,7 @@ CV_INLINE CvParamLattice cvDefaultParamLattice( void )
 #define CV_TYPE_NAME_ML_ANN_MLP     "opencv-ml-ann-mlp"
 #define CV_TYPE_NAME_ML_CNN         "opencv-ml-cnn"
 #define CV_TYPE_NAME_ML_RTREES      "opencv-ml-random-trees"
+#define CV_TYPE_NAME_ML_GBT         "opencv-ml-gradient-boosting-trees"

 #define CV_TRAIN_ERROR  0
 #define CV_TEST_ERROR   1
@@ -1359,6 +1360,532 @@ protected:
 };


+/****************************************************************************************\
+*                              Gradient Boosted Trees                                    *
+\****************************************************************************************/
+
+// DataType: STRUCT CvGBTreesParams
+// Parameters of the GBT (Gradient Boosted Trees) model, including the
+// settings of the single trees and of the ensemble as a whole.
+//
+// weak_count         - the number of trees in the ensemble
+// loss_function_type - the loss function used for ensemble training
+// subsample_portion  - the portion of the whole training set used for
+//                      training each single tree. Its value lies in
+//                      (0.0, 1.0]; subsample_portion == 1.0 means the whole
+//                      dataset is used on each step. The number of samples
+//                      used on each step is computed as
+//                      int(total_samples_count * subsample_portion).
+// shrinkage          - regularization parameter. Each tree prediction is
+//                      multiplied by the shrinkage value.
+
+
+struct CV_EXPORTS CvGBTreesParams : public CvDTreeParams
+{
+    int weak_count;
+    int loss_function_type;
+    float subsample_portion;
+    float shrinkage;
+
+    CvGBTreesParams();
+    CvGBTreesParams( int loss_function_type, int weak_count, float shrinkage,
+        float subsample_portion, int max_depth, bool use_surrogates );
+};
+
+// DataType: CLASS CvGBTrees
+// Gradient Boosting Trees (GBT) algorithm implementation.
+//
+// data             - the training dataset
+// params           - parameters of the CvGBTrees model
+// weak             - array[0..(class_count-1)] of CvSeq,
+//                    storing the tree ensembles
+// orig_response    - original responses of the training set samples
+// sum_response     - predictions of the current model on the training
+//                    dataset; this matrix is updated on every iteration
+// sum_response_tmp - predictions of the model on the training set on the
+//                    next step. On every iteration the values of
+//                    sum_response_tmp are computed from the sum_response
+//                    values. When the current step is complete, sum_response
+//                    takes the values of sum_response_tmp.
+// sample_idx       - indices of the samples used for training the ensemble.
+//                    The CvGBTrees training procedure takes a set of samples
+//                    (train_data) and a set of responses (responses).
+//                    Only the pairs (train_data[i], responses[i]) with i in
+//                    sample_idx are used for training the ensemble.
+// subsample_train  - indices of the samples used for training a single
+//                    decision tree on the current step. These indices are
+//                    counted relative to sample_idx, so that the pairs
+//                    (train_data[sample_idx[i]], responses[sample_idx[i]])
+//                    are used for training a decision tree. On every
+//                    iteration the training set is randomly split into two
+//                    parts (subsample_train and subsample_test) according to
+//                    the subsample_portion parameter.
+// subsample_test   - relative indices of the samples from the training set
+//                    that are not used for training a tree on the current
+//                    step.
+// missing          - mask of the missing values in the training set. This
+//                    matrix has the same size as train_data: 1 - missing
+//                    value, 0 - not a missing value.
+// class_labels     - output class labels map.
+// rng              - random number generator, used for splitting the
+//                    training set.
+// class_count      - the number of output classes: class_count == 1 in the
+//                    case of regression, class_count > 1 in the case of
+//                    classification.
+// delta            - Huber loss function parameter.
+// base_value       - starting point of the gradient descent procedure.
+//                    The model prediction is
+//                    f(x) = f_0 + sum_{i=1..weak_count-1}(f_i(x)),
+//                    where f_0 is the base value.
+
+
+
+class CV_EXPORTS CvGBTrees : public CvStatModel
+{
+public:
+
+    /*
+    // DataType: ENUM
+    // Loss functions implemented in CvGBTrees.
+    //
+    // SQUARED_LOSS
+    //     problem: regression
+    //     loss = (x - x')^2
+    //
+    // ABSOLUTE_LOSS
+    //     problem: regression
+    //     loss = abs(x - x')
+    //
+    // HUBER_LOSS
+    //     problem: regression
+    //     loss = delta*(abs(x - x') - delta/2),  if abs(x - x') > delta
+    //            1/2*(x - x')^2,                 if abs(x - x') <= delta,
+    //     where delta is the alpha-quantile of the pseudo responses of the
+    //     training set.
+    //
+    // DEVIANCE_LOSS
+    //     problem: classification
+    //
+    */
+    enum {SQUARED_LOSS=0, ABSOLUTE_LOSS, HUBER_LOSS=3, DEVIANCE_LOSS};
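+
+    /*
+    // For reference, the pseudo-responses (negative gradients) that the
+    // trees are fitted to on every boosting step, as implemented by
+    // find_gradient() in gbt.cpp:
+    //
+    //   SQUARED_LOSS  : -grad = y - f(x)
+    //   ABSOLUTE_LOSS : -grad = sign(y - f(x))
+    //   HUBER_LOSS    : -grad = r              if abs(r) <= delta,
+    //                           delta*sign(r)  otherwise,
+    //                   where r = y - f(x) and delta is the alpha-quantile
+    //                   (alpha = 0.2 in the implementation) of abs(r) over
+    //                   the current subsample
+    //   DEVIANCE_LOSS : -grad = 1{y == k} - p_k(x), i.e. the indicator of
+    //                   class k minus its softmax probability, computed
+    //                   separately for each per-class ensemble k
+    */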
+
+
+    /*
+    // Default constructor. Creates a model only (without training).
+    // Should be followed by one of the train(...) functions.
+    //
+    // API
+    // CvGBTrees();
+
+    // INPUT
+    // OUTPUT
+    // RESULT
+    */
+    CvGBTrees();
+
+
+    /*
+    // Full-form constructor. Creates a gradient boosting model and trains it.
+    //
+    // API
+    // CvGBTrees( const CvMat* _train_data, int _tflag,
+             const CvMat* _responses, const CvMat* _var_idx=0,
+             const CvMat* _sample_idx=0, const CvMat* _var_type=0,
+             const CvMat* _missing_mask=0,
+             CvGBTreesParams params=CvGBTreesParams() );
+
+    // INPUT
+    // _train_data   - a set of input feature vectors.
+    //                 The matrix size is <sample_count> x <var_count> or
+    //                 <var_count> x <sample_count>, depending on the _tflag
+    //                 parameter. Matrix values are float.
+    // _tflag        - a flag showing how the samples are stored in the
+    //                 _train_data matrix: row by row (_tflag=CV_ROW_SAMPLE)
+    //                 or column by column (_tflag=CV_COL_SAMPLE).
+    // _responses    - a vector of responses corresponding to the samples
+    //                 in _train_data.
+    // _var_idx      - indices of the used variables. A zero value means
+    //                 that all variables are active.
+    // _sample_idx   - indices of the used samples. A zero value means that
+    //                 all samples from _train_data are in the training set.
+    // _var_type     - a vector of length <var_count> that gives each
+    //                 variable a type: CV_VAR_CATEGORICAL or CV_VAR_ORDERED.
+    //                 _var_type = 0 means all variables are ordered
+    //                 (numerical).
+    // _missing_mask - a mask of missing values in _train_data.
+    //                 _missing_mask = 0 means that there are no missing
+    //                 values.
+    // params        - parameters of the GBT algorithm.
+    // OUTPUT
+    // RESULT
+    */
+    CvGBTrees( const CvMat* _train_data, int _tflag,
+             const CvMat* _responses, const CvMat* _var_idx=0,
+             const CvMat* _sample_idx=0, const CvMat* _var_type=0,
+             const CvMat* _missing_mask=0,
+             CvGBTreesParams params=CvGBTreesParams() );
+
+
+    /*
+    // Destructor.
+    */
+    virtual ~CvGBTrees();
+
+
+    /*
+    // Gradient Boosting Trees model training.
+    //
+    // API
+    // virtual bool train( const CvMat* _train_data, int _tflag,
+             const CvMat* _responses, const CvMat* _var_idx=0,
+             const CvMat* _sample_idx=0, const CvMat* _var_type=0,
+             const CvMat* _missing_mask=0,
+             CvGBTreesParams params=CvGBTreesParams(),
+             bool update=false );
+
+    // INPUT
+    // _train_data .. params - same meaning as in the full-form constructor
+    //                 above.
+    // update        - not supported yet. (!)
+    // OUTPUT
+    // RESULT
+    // Error state.
+    */
+    virtual bool train( const CvMat* _train_data, int _tflag,
+             const CvMat* _responses, const CvMat* _var_idx=0,
+             const CvMat* _sample_idx=0, const CvMat* _var_type=0,
+             const CvMat* _missing_mask=0,
+             CvGBTreesParams params=CvGBTreesParams(),
+             bool update=false );
+
+
+    /*
+    // Gradient Boosting Trees model training.
+    //
+    // API
+    // virtual bool train( CvMLData* data,
+             CvGBTreesParams params=CvGBTreesParams(),
+             bool update=false );
+
+    // INPUT
+    // data          - the training set.
+    // params        - parameters of the GBT algorithm.
+    // update        - not supported yet. (!)
+    // OUTPUT
+    // RESULT
+    // Error state.
+    */
+    virtual bool train( CvMLData* data,
+             CvGBTreesParams params=CvGBTreesParams(),
+             bool update=false );
+
+
+    /*
+    // Response value prediction.
+    //
+    // API
+    // virtual float predict( const CvMat* _sample, const CvMat* _missing=0,
+             CvMat* weak_responses=0, CvSlice slice = CV_WHOLE_SEQ,
+             int k=-1 ) const;
+
+    // INPUT
+    // _sample        - an input sample of the same type as in the training
+    //                  set.
+    // _missing       - the missing values mask. _missing=0 if there are no
+    //                  missing values in the _sample vector.
+    // weak_responses - predictions of all of the trees.
+    //                  Not implemented yet. (!)
+    // slice          - the part of the ensemble used for prediction.
+    //                  slice = CV_WHOLE_SEQ when all trees are used.
+    // k              - the index of the tree ensemble to use;
+    //                  k is in {-1,0,1,..,<class_count>-1}. In the case of a
+    //                  classification problem <class_count> ensembles are
+    //                  built, one per class. If k = -1 the ordinary combined
+    //                  prediction is returned; otherwise the function gives
+    //                  the prediction of the k-th ensemble only.
+    // OUTPUT
+    // RESULT
+    // The predicted value.
+    */
+    virtual float predict( const CvMat* _sample, const CvMat* _missing=0,
+            CvMat* weak_responses=0, CvSlice slice = CV_WHOLE_SEQ,
+            int k=-1 ) const;
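+
+    /*
+    // A minimal prediction sketch (illustrative only; the 'model' object
+    // and the filled-in matrices are hypothetical):
+    //
+    //   CvGBTrees model( train_data, CV_ROW_SAMPLE, responses,
+    //                    0, 0, 0, 0, CvGBTreesParams() );
+    //   float y_full = model.predict( sample );               // whole ensemble
+    //   float y_head = model.predict( sample, 0, 0,
+    //                                 cvSlice( 0, 50 ) );     // first 50 trees
+    */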
+
+    /*
+    // Deletes all temporary data.
+    //
+    // API
+    // virtual void clear();
+
+    // INPUT
+    // OUTPUT
+    // Deletes data, weak, orig_response, sum_response,
+    // weak_eval, subsample_train, subsample_test,
+    // sample_idx, missing, class_labels.
+    // Sets delta = 0.0.
+    // RESULT
+    */
+    virtual void clear();
+
+    /*
+    // Computes the error on the train/test set.
+    //
+    // API
+    // virtual float calc_error( CvMLData* _data, int type,
+    //        std::vector<float> *resp = 0 );
+    //
+    // INPUT
+    // data - the dataset.
+    // type - defines which error to compute: train (CV_TRAIN_ERROR) or
+    //        test (CV_TEST_ERROR).
+    // OUTPUT
+    // resp - vector of predictions.
+    // RESULT
+    // The error value.
+    */
+    virtual float calc_error( CvMLData* _data, int type,
+            std::vector<float> *resp = 0 );
+
+
+    /*
+    //
+    // Writes the parameters of the GBT model and data. Writes the learned
+    // model.
+    //
+    // API
+    // virtual void write( CvFileStorage* fs, const char* name ) const;
+    //
+    // INPUT
+    // fs   - the file storage to write the model to.
+    // name - the model name.
+    // OUTPUT
+    // RESULT
+    */
+    virtual void write( CvFileStorage* fs, const char* name ) const;
+
+
+    /*
+    //
+    // Reads the parameters of the GBT model and data. Reads the learned
+    // model.
+    //
+    // API
+    // virtual void read( CvFileStorage* fs, CvFileNode* node );
+    //
+    // INPUT
+    // fs   - the file storage to read the model from.
+    // node - the file node.
+    // OUTPUT
+    // RESULT
+    */
+    virtual void read( CvFileStorage* fs, CvFileNode* node );
+
+
+protected:
+
+    /*
+    // Computes the gradient vector components.
+    //
+    // API
+    // virtual void find_gradient( const int k = 0 );
+
+    // INPUT
+    // k - used in the classification problem to determine the current
+    //     tree ensemble.
+    // OUTPUT
+    // Changes the components of data->responses that correspond to the
+    // samples used for training on the current step.
+    // RESULT
+    */
+    virtual void find_gradient( const int k = 0 );
+
+
+    /*
+    //
+    // Changes the values in the tree leaves according to the used loss
+    // function.
+    //
+    // API
+    // virtual void change_values(CvDTree* tree, const int k = 0);
+    //
+    // INPUT
+    // tree - the decision tree to change.
+    // k    - used in the classification problem to determine the current
+    //        tree ensemble.
+    // OUTPUT
+    // Changes the 'value' fields of the tree's leaves.
+    // Changes sum_response_tmp.
+    // RESULT
+    */
+    virtual void change_values(CvDTree* tree, const int k = 0);
+
+
+    /*
+    //
+    // Finds the optimal constant prediction value according to the used
+    // loss function. The goal is to find a constant which gives the minimal
+    // total loss on the _Idx samples.
+    //
+    // API
+    // virtual float find_optimal_value( const CvMat* _Idx );
+    //
+    // INPUT
+    // _Idx - indices of the samples from the training set.
+    // OUTPUT
+    // RESULT
+    // The optimal constant value.
+    */
+    virtual float find_optimal_value( const CvMat* _Idx );
+
+
+    /*
+    //
+    // Randomly splits the whole training set into two parts according
+    // to params.subsample_portion.
+    //
+    // API
+    // virtual void do_subsample();
+    //
+    // INPUT
+    // OUTPUT
+    // subsample_train - indices of the samples used for training.
+    // subsample_test  - indices of the samples used for testing.
+    // RESULT
+    */
+    virtual void do_subsample();
+
+
+    /*
+    //
+    // Internal recursive function collecting an array of the subtree leaves.
+    //
+    // API
+    // void leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node );
+    //
+    // INPUT
+    // node   - the current node of the subtree.
+    // OUTPUT
+    // count  - the number of leaves in the subtree.
+    // leaves - the array of pointers to the leaves.
+    // RESULT
+    */
+    void leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node );
+
+
+    /*
+    //
+    // Get leaves of the tree.
+ // + // API + // CvDTreeNode** GetLeaves( const CvDTree* dtree, int& len ); + // + // INPUT + // dtree - decision tree. + // OUTPUT + // len - count of the leaves. + // RESULT + // CvDTreeNode** - array of pointers to leaves. + */ + CvDTreeNode** GetLeaves( const CvDTree* dtree, int& len ); + + + /* + // + // Is it a regression or a classification. + // + // API + // bool problem_type(); + // + // INPUT + // OUTPUT + // RESULT + // false if it is a classification problem, + // true - if regression. + */ + virtual bool problem_type() const; + + + /* + // + // Write parameters of the gtb model. + // + // API + // virtual void write_params( CvFileStorage* fs ) const; + // + // INPUT + // fs - file storage to write parameters to. + // OUTPUT + // RESULT + */ + virtual void write_params( CvFileStorage* fs ) const; + + + /* + // + // Read parameters of the gtb model and data. + // + // API + // virtual void read_params( CvFileStorage* fs ); + // + // INPUT + // fs - file storage to read parameters from. + // OUTPUT + // params - parameters of the gtb model. + // data - contains information about the structure + // of the data set (count of variables, + // their types, etc.). + // class_labels - output class labels map. + // RESULT + */ + virtual void read_params( CvFileStorage* fs, CvFileNode* fnode ); + + + CvDTreeTrainData* data; + CvGBTreesParams params; + + CvSeq** weak; + CvMat* orig_response; + CvMat* sum_response; + CvMat* sum_response_tmp; + CvMat* weak_eval; + CvMat* sample_idx; + CvMat* subsample_train; + CvMat* subsample_test; + CvMat* missing; + CvMat* class_labels; + + CvRNG rng; + + int class_count; + float delta; + float base_value; + +}; + + + /****************************************************************************************\ * Artificial Neural Networks (ANN) * \****************************************************************************************/ @@ -1936,6 +2463,8 @@ typedef CvBoostTree BoostTree; typedef CvBoost Boost; typedef CvANN_MLP_TrainParams ANN_MLP_TrainParams; typedef CvANN_MLP NeuralNet_MLP; +typedef CvGBTreesParams GradientBoostingTreesParams; +typedef CvGBTrees GradientBoostingTrees; } diff --git a/modules/ml/src/gbt.cpp b/modules/ml/src/gbt.cpp new file mode 100644 index 0000000000..2d4259b82b --- /dev/null +++ b/modules/ml/src/gbt.cpp @@ -0,0 +1,1044 @@ + +#include "precomp.hpp" +#include +#include + +using namespace std; + +#define pCvSeq CvSeq* +#define pCvDTreeNode CvDTreeNode* + +#define CV_CMP_FLOAT(a,b) ((a) < (b)) +static CV_IMPLEMENT_QSORT_EX( icvSortFloat, float, CV_CMP_FLOAT, float) + + +//=========================================================================== +string ToString(int i) +{ + stringstream tmp; + tmp << i; + + return tmp.str(); +} + +//=========================================================================== +int get_len(const CvMat* mat) +{ + return (mat->cols > mat->rows) ? 
mat->cols : mat->rows; +} + +//=========================================================================== +//----------------------------- CvGBTreesParams ----------------------------- +//=========================================================================== + +CvGBTreesParams::CvGBTreesParams() + : CvDTreeParams( 3, 10, 0, true, 10, 0, false, false, 0 ) +{ + weak_count = 50; + loss_function_type = CvGBTrees::SQUARED_LOSS; + subsample_portion = 1.0f; + shrinkage = 1.0f; +} + +//=========================================================================== + +CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count, + float _shrinkage, float _subsample_portion, + int _max_depth, bool _use_surrogates ) + : CvDTreeParams( 3, 10, 0, true, 10, 0, false, false, 0 ) +{ + loss_function_type = _loss_function_type; + weak_count = _weak_count; + shrinkage = _shrinkage; + subsample_portion = _subsample_portion; + max_depth = _max_depth; + use_surrogates = _use_surrogates; +} + +//=========================================================================== +//------------------------------- CvGBTrees --------------------------------- +//=========================================================================== + +CvGBTrees::CvGBTrees() +{ + data = 0; + weak = 0; + default_model_name = "my_boost_tree"; + orig_response = sum_response = sum_response_tmp = 0; + weak_eval = subsample_train = subsample_test = 0; + missing = sample_idx = 0; + class_labels = 0; + class_count = 1; + delta = 0.0f; + + clear(); +} + +//=========================================================================== + +void CvGBTrees::clear() +{ + if( weak ) + { + CvSeqReader reader; + CvSlice slice = CV_WHOLE_SEQ; + int weak_count = cvSliceLength( slice, weak[class_count-1] ); + CvDTree* tree; + + //data->shared = false; + for (int i=0; iclear(); + delete tree; + tree = 0; + } + } + } + for (int i=0; istorage) ); + delete[] weak; + } + if (data) + { + data->shared = false; + delete data; + } + weak = 0; + data = 0; + delta = 0.0f; + cvReleaseMat( &orig_response ); + cvReleaseMat( &sum_response ); + cvReleaseMat( &sum_response_tmp ); + cvReleaseMat( &weak_eval ); + cvReleaseMat( &subsample_train ); + cvReleaseMat( &subsample_test ); + cvReleaseMat( &sample_idx ); + cvReleaseMat( &missing ); + cvReleaseMat( &class_labels ); +} + +//=========================================================================== + +CvGBTrees::~CvGBTrees() +{ + clear(); +} + +//=========================================================================== + +CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag, + const CvMat* _responses, const CvMat* _var_idx, + const CvMat* _sample_idx, const CvMat* _var_type, + const CvMat* _missing_mask, CvGBTreesParams _params ) +{ + weak = 0; + data = 0; + default_model_name = "my_boost_tree"; + orig_response = sum_response = sum_response_tmp = 0; + weak_eval = subsample_train = subsample_test = 0; + missing = sample_idx = 0; + class_labels = 0; + class_count = 1; + delta = 0.0f; + + train( _train_data, _tflag, _responses, _var_idx, _sample_idx, + _var_type, _missing_mask, _params ); +} + +//=========================================================================== + +bool CvGBTrees::problem_type() const +{ + switch (params.loss_function_type) + { + case DEVIANCE_LOSS: return false; + default: return true; + } +} + +//=========================================================================== + +bool +CvGBTrees::train( CvMLData* data, CvGBTreesParams params, bool update ) +{ + bool result; + 
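+    // This overload just unpacks the CvMLData wrapper and forwards its
+    // matrices to the main train() below; samples are taken row by row
+    // (CV_ROW_SAMPLE), and the 'update' flag is passed through even though
+    // incremental training is not implemented yet.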
result = train ( data->get_values(), CV_ROW_SAMPLE, + data->get_responses(), data->get_var_idx(), + data->get_train_sample_idx(), data->get_var_types(), + data->get_missing(), params, update); + //update is not supported + return result; +} + +//=========================================================================== + + +bool +CvGBTrees::train( const CvMat* _train_data, int _tflag, + const CvMat* _responses, const CvMat* _var_idx, + const CvMat* _sample_idx, const CvMat* _var_type, + const CvMat* _missing_mask, + CvGBTreesParams _params, bool _update ) //update is not supported +{ + CvMemStorage* storage = 0; + + params = _params; + bool is_regression = problem_type(); + + clear(); + int len = get_len(_responses); + + CvMat* new_responses = cvCreateMat( len, 1, CV_32F); + cvZero(new_responses); + + data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx, + _sample_idx, _var_type, _missing_mask, _params, true, true ); + if (_missing_mask) + { + missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols, + _missing_mask->type); + cvCopy( _missing_mask, missing); + } + + orig_response = cvCreateMat( _responses->rows, _responses->cols, + _responses->type ); + cvCopy( _responses, orig_response); + orig_response->step = CV_ELEM_SIZE(_responses->type); + + if (!is_regression) + { + int max_label = -1; + for (int i=0; idata.fl[i]) + max_label = int(orig_response->data.fl[i]); + max_label++; + class_labels = cvCreateMat(1, max_label, CV_32S); + cvZero(class_labels); + for (int i=0; idata.i[int(orig_response->data.fl[i])] = 1; + class_count = 0; + for (int i=0; idata.i[i]) + class_labels->data.i[i] = ++class_count; + } + + data->is_classifier = false; + + if (_sample_idx) + { + sample_idx = cvCreateMat( _sample_idx->rows, _sample_idx->cols, + _sample_idx->type ); + cvCopy( _sample_idx, sample_idx); + icvSortFloat(sample_idx->data.fl, get_len(sample_idx), 0); + } + else + { + int n = (_tflag == CV_ROW_SAMPLE) ? _train_data->rows + : _train_data->cols; + sample_idx = cvCreateMat( 1, n, CV_32S ); + for (int i=0; idata.i[i] = i; + } + + sum_response = cvCreateMat(class_count, len, CV_32F); + sum_response_tmp = cvCreateMat(class_count, len, CV_32F); + cvZero(sum_response); + + delta = 0.0f; + if (is_regression) base_value = find_optimal_value(sample_idx); + else base_value = 0.0f; + cvSet( sum_response, cvScalar(base_value) ); + + weak = new pCvSeq[class_count]; + for (int i=0; i 1) params.subsample_portion = 1; + //if ( params.subsample_portion < 0) params.subsample_portion = 1; + params.subsample_portion = params.subsample_portion <= FLT_EPSILON || + 1 - params.subsample_portion <= FLT_EPSILON + ? 
1 : params.subsample_portion; + int train_sample_count = cvFloor(params.subsample_portion * samples_count); + if (train_sample_count == 0) + train_sample_count = samples_count; + int test_sample_count = samples_count - train_sample_count; + int* idx_data = new int[samples_count]; + subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 ); + *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data ); + if (test_sample_count) + { + subsample_test = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 ); + *subsample_test = cvMat( 1, test_sample_count, CV_32SC1, + idx_data + train_sample_count ); + } + + + // training procedure + + for ( int i=0; i < params.weak_count; ++i ) + { + for ( int m=0; m < class_count; ++m ) + { + do_subsample(); + find_gradient(m); + CvDTree* tree = new CvDTree; + tree->train( data, subsample_train ); + change_values(tree, m); + + if (subsample_test) + { + CvMat x; + CvMat x_miss; + int* sample_data = sample_idx->data.i; + int* subsample_data = subsample_test->data.i; + int s_step = (sample_idx->cols > sample_idx->rows) ? 1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + for (int j=0; jtrain_data, &x, idx); + if (missing) + { + cvGetRow( missing, &x_miss, idx); + res = (float)tree->predict(&x, &x_miss)->value; + } + else + { + res = (float)tree->predict(&x)->value; + } + sum_response_tmp->data.fl[idx + k*len] = + sum_response->data.fl[idx + k*len] + + params.shrinkage * res; + } + } + } + + cvSeqPush( weak[m], &tree ); + tree = 0; + } // m=0..class_count + CvMat* tmp; + tmp = sum_response_tmp; + sum_response_tmp = sum_response; + sum_response = tmp; + tmp = 0; + } // i=0..params.weak_count + + delete[] idx_data; + cvReleaseMat(&new_responses); + data->free_train_data(); + return true; + +} // CvGBTrees::train(...) + +//=========================================================================== + +float Sign(float x) + { + if (x<0.0f) return -1.0f; + else if (x>0.0f) return 1.0f; + return 0.0f; + } + +//=========================================================================== + +void CvGBTrees::find_gradient(const int k) +{ + int* sample_data = sample_idx->data.i; + int* subsample_data = subsample_train->data.i; + float* grad_data = data->responses->data.fl; + float* resp_data = orig_response->data.fl; + float* current_data = sum_response->data.fl; + + switch (params.loss_function_type) + // loss_function_type in + // {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS} + { + case SQUARED_LOSS: + { + for (int i=0; icols > sample_idx->rows) ? 1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + int idx = *(sample_data + subsample_data[i]*s_step); + grad_data[idx] = resp_data[idx] - current_data[idx]; + } + }; break; + + case ABSOLUTE_LOSS: + { + for (int i=0; icols > sample_idx->rows) ? 1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + int idx = *(sample_data + subsample_data[i]*s_step); + grad_data[idx] = Sign(resp_data[idx] - current_data[idx]); + } + }; break; + + case HUBER_LOSS: + { + float alpha = 0.2f; + int n = get_len(subsample_train); + int s_step = (sample_idx->cols > sample_idx->rows) ? 1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + + float* residuals = new float[n]; + for (int i=0; i delta) ? delta*Sign(r) : r; + } + delete[] residuals; + + }; break; + + case DEVIANCE_LOSS: + { + for (int i=0; icols > sample_idx->rows) ? 
1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + int idx = *(sample_data + subsample_data[i]*s_step); + + for (int j=0; jcols]; + res = expl(res); + if (j == k) exp_fk = res; + exp_sfi += res; + } + int orig_label = int(resp_data[idx]); + grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) - + (float)(exp_fk / exp_sfi); + } + }; break; + + default: break; + } + +} // CvGBTrees::find_gradient(...) + +//=========================================================================== + +void CvGBTrees::change_values(CvDTree* tree, const int _k) +{ + CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)]; + + int* sample_data = sample_idx->data.i; + int* subsample_data = subsample_train->data.i; + int s_step = (sample_idx->cols > sample_idx->rows) ? 1 + : sample_idx->step/CV_ELEM_SIZE(sample_idx->type); + + CvMat x; + CvMat miss_x; + + for (int i=0; itrain_data, &x, idx); + if (missing) + { + cvGetRow( missing, &miss_x, idx); + predictions[i] = tree->predict(&x, &miss_x); + } + else + predictions[i] = tree->predict(&x); + } + + CvDTreeNode** leaves; + int leaves_count = 0; + leaves = GetLeaves( tree, leaves_count); + + for (int i=0; ivalue = 0.0; + continue; + } + + CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S); + int* leaf_idx_data = leaf_idx->data.i; + + for (int j=0; jvalue = value; + + leaf_idx_data = leaf_idx->data.i; + + int len = sum_response_tmp->cols; + for (int j=0; jdata.fl[idx + _k*len] = + sum_response->data.fl[idx + _k*len] + + params.shrinkage * value; + } + leaf_idx_data = 0; + cvReleaseMat(&leaf_idx); + } + + // releasing the memory + for (int i=0; isample_count; + int* leaf_idx_data = new int[n]; + data->get_sample_indices(leaves[i], leaf_idx_data); + CvMat* leaf_idx = 0; + cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data); + + float value = find_optimal_value(leaf_idx); + leaves[i]->value = value; + + int len = sum_response_tmp->cols; + for (int j=0; jdata.fl[idx] = sum_response->data.fl[idx] + + params.shrinkage * value; + } + leaf_idx_data = 0; + cvReleaseMat(&leaf_idx); + } + + // releasing the memory + for (int i=0; idata.i; + float* resp_data = orig_response->data.fl; + float* cur_data = sum_response->data.fl; + int n = get_len(_Idx); + + switch (params.loss_function_type) + // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4 + { + case SQUARED_LOSS: + { + for (int i=0; i> 1; + float r_median = (n == n_half<<1) ? 
+ (residuals[n_half-1] + residuals[n_half]) / 2.0f : + residuals[n_half]; + + for (int i=0; iresponses->data.fl; + long double tmp1 = 0; + long double tmp2 = 0; + long double tmp = 0; + for (int i=0; ileft != NULL) leaves_get(leaves, count, node->left); + if (node->right != NULL) leaves_get(leaves, count, node->right); + if ((node->left == NULL) && (node->right == NULL)) + leaves[count++] = node; +} + +//--------------------------------------------------------------------------- + +CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len ) +{ + len = 0; + CvDTreeNode** leaves = new pCvDTreeNode[1 << params.max_depth]; + leaves_get(leaves, len, const_cast(dtree->get_root())); + return leaves; +} + +//=========================================================================== + +void CvGBTrees::do_subsample() +{ + + int n = get_len(sample_idx); + int* idx = subsample_train->data.i; + + for (int i = 0; i < n; i++ ) + idx[i] = i; + + if (subsample_test) + for (int i = 0; i < n; i++) + { + int a = cvRandInt( &rng ) % n; + int b = cvRandInt( &rng ) % n; + int t; + CV_SWAP( idx[a], idx[b], t ); + } + +/* + int n = get_len(sample_idx); + if (subsample_train == 0) + subsample_train = cvCreateMat(1, n, CV_32S); + int* subsample_data = subsample_train->data.i; + for (int i=0; ipredict(_sample, _missing)->value); + } + } + } + + if (class_count == 1) + { + result = sum[0]; + delete[] sum; + return result; + } + + if ((k>=0) && (k max) + { + max = sum[i]; + class_label = i; + } + + delete[] sum; + + int orig_class_label = -1; + for (int i=0; idata.i[i] == class_label+1) + orig_class_label = i; + + return float(orig_class_label); +} + +//=========================================================================== + +void CvGBTrees::write_params( CvFileStorage* fs ) const +{ + CV_FUNCNAME( "CvGBTrees::write_params" ); + __BEGIN__; + + const char* loss_function_type_str = + params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" : + params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" : + params.loss_function_type == HUBER_LOSS ? "HuberLoss" : + params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0; + + + if( loss_function_type_str ) + cvWriteString( fs, "loss_function", loss_function_type_str ); + else + cvWriteInt( fs, "loss_function", params.loss_function_type ); + + cvWriteInt( fs, "ensemble_length", params.weak_count ); + cvWriteReal( fs, "shrinkage", params.shrinkage ); + cvWriteReal( fs, "subsample_portion", params.subsample_portion ); + //cvWriteInt( fs, "max_tree_depth", params.max_depth ); + //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? 
"true" : "false"); + if (class_labels) cvWrite( fs, "class_labels", class_labels); + + data->is_classifier = !problem_type(); + data->write_params( fs ); + data->is_classifier = 0; + + __END__; +} + + +//=========================================================================== + +void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode ) +{ + CV_FUNCNAME( "CvGBTrees::read_params" ); + __BEGIN__; + + + CvFileNode* temp; + + if( !fnode || !CV_NODE_IS_MAP(fnode->tag) ) + return; + + data = new CvDTreeTrainData(); + CV_CALL( data->read_params(fs, fnode)); + data->shared = true; + + params.max_depth = data->params.max_depth; + params.min_sample_count = data->params.min_sample_count; + params.max_categories = data->params.max_categories; + params.priors = data->params.priors; + params.regression_accuracy = data->params.regression_accuracy; + params.use_surrogates = data->params.use_surrogates; + + temp = cvGetFileNodeByName( fs, fnode, "loss_function" ); + if( !temp ) + EXIT; + + if( temp && CV_NODE_IS_STRING(temp->tag) ) + { + const char* loss_function_type_str = cvReadString( temp, "" ); + params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS : + strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS : + strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS : + strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1; + } + else + params.loss_function_type = cvReadInt( temp, -1 ); + + + if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2) + CV_ERROR( CV_StsBadArg, "Unknown loss function" ); + + params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" ); + params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 ); + params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 ); + + if (data->is_classifier) + { + class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" ); + if( class_labels && !CV_IS_MAT(class_labels)) + CV_ERROR( CV_StsParseError, "class_labels must stored as a matrix"); + } + data->is_classifier = 0; + + __END__; +} + + + + +void CvGBTrees::write( CvFileStorage* fs, const char* name ) const +{ + CV_FUNCNAME( "CvGBTrees::write" ); + + __BEGIN__; + + CvSeqReader reader; + int i; + std::string s; + + cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT ); + + if( !weak ) + CV_ERROR( CV_StsBadArg, "The model has not been trained yet" ); + + write_params( fs ); + cvWriteReal( fs, "base_value", base_value); + cvWriteInt( fs, "class_count", class_count); + + for ( int j=0; j < class_count; ++j ) + { + s = "trees_"; + s += ToString(j); + cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ ); + + cvStartReadSeq( weak[j], &reader ); + + for( i = 0; i < weak[j]->total; i++ ) + { + CvDTree* tree; + CV_READ_SEQ_ELEM( tree, reader ); + cvStartWriteStruct( fs, 0, CV_NODE_MAP ); + tree->write( fs ); + cvEndWriteStruct( fs ); + } + + cvEndWriteStruct( fs ); + } + + cvEndWriteStruct( fs ); + + __END__; +} + + +//=========================================================================== + + +void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node ) +{ + + CV_FUNCNAME( "CvGBTrees::read" ); + + __BEGIN__; + + CvSeqReader reader; + CvFileNode* trees_fnode; + CvMemStorage* storage; + int i, ntrees; + std::string s; + + clear(); + read_params( fs, node ); + + if( !data ) + EXIT; + + base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 ); + class_count = 
cvReadIntByName( fs, node, "class_count", 1 ); + + weak = new pCvSeq[class_count]; + + + for (int j=0; jtag) ) + CV_ERROR( CV_StsParseError, " tag is missing" ); + + cvStartReadSeq( trees_fnode->data.seq, &reader ); + ntrees = trees_fnode->data.seq->total; + + if( ntrees != params.weak_count ) + CV_ERROR( CV_StsUnmatchedSizes, + "The number of trees stored does not match tag value" ); + + CV_CALL( storage = cvCreateMemStorage() ); + weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage ); + + for( i = 0; i < ntrees; i++ ) + { + CvDTree* tree = new CvDTree(); + CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data )); + CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader ); + cvSeqPush( weak[j], &tree ); + } + } + + __END__; +} + +//=========================================================================== + +// type in {CV_TRAIN_ERROR, CV_TEST_ERROR} +float +CvGBTrees::calc_error( CvMLData* _data, int type, std::vector *resp ) +{ + float err = 0; + const CvMat* values = _data->get_values(); + const CvMat* response = _data->get_responses(); + const CvMat* missing = _data->get_missing(); + const CvMat* sample_idx = (type == CV_TEST_ERROR) ? + _data->get_test_sample_idx() : + _data->get_train_sample_idx(); + //const CvMat* var_types = _data->get_var_types(); + int* sidx = sample_idx ? sample_idx->data.i : 0; + int r_step = CV_IS_MAT_CONT(response->type) ? + 1 : response->step / CV_ELEM_SIZE(response->type); + //bool is_classifier = + // var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL; + int sample_count = sample_idx ? sample_idx->cols : 0; + sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? + values->rows : + sample_count; + float* pred_resp = 0; + if( resp && (sample_count > 0) ) + { + resp->resize( sample_count ); + pred_resp = &((*resp)[0]); + } + if ( !problem_type() ) + { + for( int i = 0; i < sample_count; i++ ) + { + CvMat sample, miss; + int si = sidx ? sidx[i] : i; + cvGetRow( values, &sample, si ); + if( missing ) + cvGetRow( missing, &miss, si ); + float r = (float)predict( &sample, missing ? &miss : 0 ); + if( pred_resp ) + pred_resp[i] = r; + int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1; + err += d; + } + err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX; + } + else + { + for( int i = 0; i < sample_count; i++ ) + { + CvMat sample, miss; + int si = sidx ? sidx[i] : i; + cvGetRow( values, &sample, si ); + if( missing ) + cvGetRow( missing, &miss, si ); + float r = (float)predict( &sample, missing ? &miss : 0 ); + if( pred_resp ) + pred_resp[i] = r; + float d = r - response->data.fl[si*r_step]; + err += d*d; + } + err = sample_count ? 
err / (float)sample_count : -FLT_MAX;
+    }
+    return err;
+
+} // CvGBTrees::calc_error(...)
diff --git a/tests/ml/src/gbttest.cpp b/tests/ml/src/gbttest.cpp
new file mode 100644
index 0000000000..8331a02092
--- /dev/null
+++ b/tests/ml/src/gbttest.cpp
@@ -0,0 +1,271 @@
+
+#include "mltest.h"
+#include <string>
+#include <fstream>
+#include <cstdio>
+#include <cmath>
+
+using namespace std;
+
+
+class CV_GBTreesTest : public CvTest
+{
+public:
+    CV_GBTreesTest();
+    ~CV_GBTreesTest();
+
+protected:
+    void run(int);
+
+    int TestTrainPredict(int test_num);
+    int TestSaveLoad();
+
+    int checkPredictError(int test_num);
+    int checkLoadSave();
+
+    //string model_file_name1;
+    //string model_file_name2;
+    char model_file_name1[50];
+    char model_file_name2[50];
+    string* datasets;
+    string data_path;
+
+    CvMLData* data;
+    CvGBTrees* gtb;
+
+    vector<float> test_resps1;
+    vector<float> test_resps2;
+};
+
+
+int _get_len(const CvMat* mat)
+{
+    return (mat->cols > mat->rows) ? mat->cols : mat->rows;
+}
+
+
+CV_GBTreesTest::CV_GBTreesTest() :
+    CvTest( "CvGBTrees_test",
+            "all public methods (train, predict, save, load)" )
+{
+    datasets = 0;
+    data = 0;
+    gtb = 0;
+}
+
+CV_GBTreesTest::~CV_GBTreesTest()
+{
+    if (data)
+        delete data;
+    delete[] datasets;
+}
+
+
+int CV_GBTreesTest::TestTrainPredict(int test_num)
+{
+    int code = CvTS::OK;
+
+    int weak_count = 200;
+    float shrinkage = 0.1f;
+    float subsample_portion = 0.5f;
+    int max_depth = 5;
+    bool use_surrogates = true;
+    int loss_function_type = 0;
+    switch (test_num)
+    {
+        case (1) : loss_function_type = CvGBTrees::SQUARED_LOSS; break;
+        case (2) : loss_function_type = CvGBTrees::ABSOLUTE_LOSS; break;
+        case (3) : loss_function_type = CvGBTrees::HUBER_LOSS; break;
+        case (0) : loss_function_type = CvGBTrees::DEVIANCE_LOSS; break;
+        default  :
+            {
+                ts->printf( CvTS::LOG, "Bad test_num value in CV_GBTreesTest::TestTrainPredict(..) function." );
+                return CvTS::FAIL_BAD_ARG_CHECK;
+            }
+    }
+
+    int dataset_num = test_num == 0 ? 0 : 1;
+    if (!data)
+    {
+        data = new CvMLData();
+        data->set_delimiter(',');
+
+        if (data->read_csv(datasets[dataset_num].c_str()))
+        {
+            ts->printf( CvTS::LOG, "File reading error." );
+            return CvTS::FAIL_INVALID_TEST_DATA;
+        }
+
+        if (test_num == 0)
+        {
+            data->set_response_idx(57);
+            data->set_var_types("ord[0-56],cat[57]");
+        }
+        else
+        {
+            data->set_response_idx(13);
+            data->set_var_types("ord[0-2,4-13],cat[3]");
+            subsample_portion = 0.7f;
+        }
+
+        int train_sample_count = cvFloor(_get_len(data->get_responses())*0.5f);
+        CvTrainTestSplit spl( train_sample_count );
+        data->set_train_test_split( &spl );
+    }
+
+    data->mix_train_and_test_idx();
+
+
+    if (gtb) delete gtb;
+    gtb = new CvGBTrees();
+    bool tmp_code = gtb->train(data, CvGBTreesParams(loss_function_type, weak_count,
+                               shrinkage, subsample_portion,
+                               max_depth, use_surrogates));
+
+    if (!tmp_code)
+    {
+        ts->printf( CvTS::LOG, "Model training failed.");
+        return CvTS::FAIL_INVALID_OUTPUT;
+    }
+
+    code = checkPredictError(test_num);
+
+    return code;
+
+}
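+
+// mean[] and sigma[] below are apparently precomputed test-error statistics
+// for the four loss functions; the check accepts any error within a 6*sigma
+// band around the stored mean. The patch itself does not document how these
+// constants were obtained.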
+
+int CV_GBTreesTest::checkPredictError(int test_num)
+{
+    if (!gtb)
+        return CvTS::FAIL_GENERIC;
+
+    float mean[]  = {5.3555f, 11.2241f, 11.9212f, 12.0848f};
+    float sigma[] = {0.362127f, 3.4906f, 3.4906f, 3.64994f};
+
+    float current_error = gtb->calc_error(data, CV_TEST_ERROR);
+
+    if ( fabs( current_error - mean[test_num]) > 6*sigma[test_num] )
+    {
+        ts->printf( CvTS::LOG, "Test error is out of range:\n"
+                    "abs(%f/*curEr*/ - %f/*mean*/) > %f/*6*sigma*/",
+                    current_error, mean[test_num], 6*sigma[test_num] );
+        return CvTS::FAIL_BAD_ACCURACY;
+    }
+
+    return CvTS::OK;
+
+}
+
+
+int CV_GBTreesTest::TestSaveLoad()
+{
+    if (!gtb)
+        return CvTS::FAIL_GENERIC;
+
+    tmpnam(model_file_name1);
+    tmpnam(model_file_name2);
+
+    gtb->save(model_file_name1);
+    gtb->calc_error(data, CV_TEST_ERROR, &test_resps1);
+    gtb->load(model_file_name1);
+    gtb->calc_error(data, CV_TEST_ERROR, &test_resps2);
+    gtb->save(model_file_name2);
+
+    return checkLoadSave();
+
+}
+
+
+
+int CV_GBTreesTest::checkLoadSave()
+{
+    int code = CvTS::OK;
+
+    // 1. compare files
+    ifstream f1( model_file_name1 ), f2( model_file_name2 );
+    string s1, s2;
+    int lineIdx = 0;
+    CV_Assert( f1.is_open() && f2.is_open() );
+    for( ; !f1.eof() && !f2.eof(); lineIdx++ )
+    {
+        getline( f1, s1 );
+        getline( f2, s2 );
+        if( s1.compare(s2) )
+        {
+            ts->printf( CvTS::LOG, "first and second saved files differ in line %d; first file line: %s; second file line: %s",
+                        lineIdx, s1.c_str(), s2.c_str() );
+            code = CvTS::FAIL_INVALID_OUTPUT;
+        }
+    }
+    if( !f1.eof() || !f2.eof() )
+    {
+        ts->printf( CvTS::LOG, "First and second saved files differ in line %d; first file line: %s; second file line: %s",
+                    lineIdx, s1.c_str(), s2.c_str() );
+        code = CvTS::FAIL_INVALID_OUTPUT;
+    }
+    f1.close();
+    f2.close();
+    // delete temporary files
+    remove( model_file_name1 );
+    remove( model_file_name2 );
+
+    // 2.
compare responses + CV_Assert( test_resps1.size() == test_resps2.size() ); + vector::const_iterator it1 = test_resps1.begin(), it2 = test_resps2.begin(); + for( ; it1 != test_resps1.end(); ++it1, ++it2 ) + { + if( fabs(*it1 - *it2) > FLT_EPSILON ) + { + ts->printf( CvTS::LOG, "Responses predicted before saving and after loading are different" ); + code = CvTS::FAIL_INVALID_OUTPUT; + } + } + return code; +} + + + +void CV_GBTreesTest::run(int) +{ + + string data_path = string(ts->get_data_path()); + datasets = new string[2]; + datasets[0] = data_path + string("spambase.data"); /*string("dataset_classification.csv");*/ + datasets[1] = data_path + string("housing_.data"); /*string("dataset_regression.csv");*/ + + int code = CvTS::OK; + + for (int i = 0; i < 4; i++) + { + + int temp_code = TestTrainPredict(i); + if (temp_code != CvTS::OK) + { + code = temp_code; + break; + } + + else if (i==0) + { + temp_code = TestSaveLoad(); + if (temp_code != CvTS::OK) + code = temp_code; + delete data; + data = 0; + } + + delete gtb; + gtb = 0; + } + delete data; + data = 0; + + ts->set_failed_test_info( code ); +} + +///////////////////////////////////////////////////////////////////////////// +//////////////////// test registration ///////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +CV_GBTreesTest gbtrees_test;
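
For reference, a minimal end-to-end usage sketch of the new API (not part of the patch). It assumes row-wise float training data filled in elsewhere; the matrix sizes, file name, and parameter values are illustrative only.

#include "opencv2/ml/ml.hpp"

int main()
{
    // 100 samples x 5 ordered features, stored row by row, one response each.
    CvMat* train_data = cvCreateMat( 100, 5, CV_32F );
    CvMat* responses  = cvCreateMat( 100, 1, CV_32F );
    // ... fill train_data and responses here ...

    // SQUARED_LOSS selects the regression setting: 200 trees of depth 3,
    // shrinkage 0.1, and 80% of the training set subsampled on every step.
    CvGBTreesParams params( CvGBTrees::SQUARED_LOSS, 200, 0.1f, 0.8f, 3, false );

    // The full-form constructor trains the model immediately.
    CvGBTrees gbt( train_data, CV_ROW_SAMPLE, responses, 0, 0, 0, 0, params );

    CvMat* sample = cvCreateMat( 1, 5, CV_32F );
    // ... fill sample here ...
    // f(x) = base_value + shrinkage-scaled sum of the tree outputs.
    float prediction = gbt.predict( sample );
    (void)prediction;

    gbt.save( "gbt_model.yml" );   // CvStatModel::save -> CvGBTrees::write

    cvReleaseMat( &sample );
    cvReleaseMat( &responses );
    cvReleaseMat( &train_data );
    return 0;
}

For a classification run one would instead mark the response column CV_VAR_CATEGORICAL via _var_type and pick CvGBTrees::DEVIANCE_LOSS, which is what the test above does with set_var_types("ord[0-56],cat[57]").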