From 793e7c0d9fdb70d7b9f758322d1a59ee585bdc93 Mon Sep 17 00:00:00 2001
From: pemmanuelviel <p.emmanuel.viel@gmail.com>
Date: Mon, 3 Aug 2020 20:29:57 +0200
Subject: [PATCH] Merge pull request #18019 from
 pemmanuelviel:pev--multiple-kmeans-trees

* Possibility to set more than one tree for the hierarchical KMeans (default is still 1 tree).

This particularly improves NN retrieval results with binary vectors, allowing better quality
compared to LSH for similar processing time when speed is the criterion.

* Add explanations of FLANN's hierarchical KMeans for binary data.
---
 modules/flann/include/opencv2/flann.hpp       |  32 ++-
 .../include/opencv2/flann/kmeans_index.h      | 195 +++++++++++++-----
 2 files changed, 179 insertions(+), 48 deletions(-)

diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp
index 674e6583c5..e8ee91a3ec 100644
--- a/modules/flann/include/opencv2/flann.hpp
+++ b/modules/flann/include/opencv2/flann.hpp
@@ -191,8 +191,28 @@ public:
             KDTreeIndexParams( int trees = 4 );
         };
         @endcode
+        - **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+        will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+        are picked among the points without further refinement of their position.
+        This algorithm is suitable for floating-point, integer and binary vectors. :
+        @code
+        struct HierarchicalClusteringIndexParams : public IndexParams
+        {
+            HierarchicalClusteringIndexParams(
+                int branching = 32,
+                flann_centers_init_t centers_init = CENTERS_RANDOM,
+                int trees = 4,
+                int leaf_size = 100);
+
+        };
+        @endcode
         - **KMeansIndexParams** When passing an object of this type the index constructed will be a
-        hierarchical k-means tree. :
+        hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+        whose barycenters are refined iteratively.
+        Note that this algorithm has been extended to support binary vectors, as an alternative
+        to LSH when knn search speed is the criterion. It will also outperform LSH when processing
+        directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+        for most of the dimensions. It is recommended to set more than one tree with binary data. :
         @code
         struct KMeansIndexParams : public IndexParams
         {
@@ -201,6 +221,13 @@ public:
                 int iterations = 11,
                 flann_centers_init_t centers_init = CENTERS_RANDOM,
                 float cb_index = 0.2 );
+
+            KMeansIndexParams(
+                int branching,
+                int iterations,
+                flann_centers_init_t centers_init,
+                float cb_index,
+                int trees );
         };
         @endcode
         - **CompositeIndexParams** When using a parameters object of this type the index created
@@ -219,7 +246,8 @@ public:
         - **LshIndexParams** When using a parameters object of this type the index created uses
         multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
         by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
-        International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) :
+        International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+        This algorithm is designed for binary vectors. :
         @code
         struct LshIndexParams : public IndexParams
         {
diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h
index a50e0cdf8d..a823986e09 100644
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -57,8 +57,8 @@ namespace cvflann
 
 struct KMeansIndexParams : public IndexParams
 {
-    KMeansIndexParams(int branching = 32, int iterations = 11,
-                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    void indexParams(int branching, int iterations,
+                     flann_centers_init_t centers_init, float cb_index, int trees)
     {
         (*this)["algorithm"] = FLANN_INDEX_KMEANS;
         // branching factor
@@ -69,6 +69,20 @@ struct KMeansIndexParams : public IndexParams
         (*this)["centers_init"] = centers_init;
         // cluster boundary index. Used when searching the kmeans tree
         (*this)["cb_index"] = cb_index;
+        // number of kmeans trees to search in
+        (*this)["trees"] = trees;
+    }
+
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        indexParams(branching, iterations, centers_init, cb_index, 1);
+    }
+
+    KMeansIndexParams(int branching, int iterations,
+                      flann_centers_init_t centers_init, float cb_index, int trees)
+    {
+        indexParams(branching, iterations, centers_init, cb_index, trees);
     }
 };
 
@@ -347,6 +361,7 @@ public:
         veclen_ = dataset_.cols;
 
         branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
         iterations_ = get_param(params,"iterations",11);
         if (iterations_<0) {
             iterations_ = (std::numeric_limits<int>::max)();
@@ -367,6 +382,13 @@ public:
         }
         cb_index_ = 0.4f;
 
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
     }
 
 
@@ -382,9 +404,11 @@ public:
     virtual ~KMeansIndex()
     {
         if (root_ != NULL) {
-            free_centers(root_);
+            free_centers();
+            delete[] root_;
         }
         if (indices_!=NULL) {
+            free_indices();
             delete[] indices_;
         }
     }
@@ -429,23 +453,24 @@ public:
             throw FLANNException("Branching factor must be at least 2");
         }
 
-        indices_ = new int[size_];
-        for (size_t i=0; i<size_; ++i) {
-            indices_[i] = int(i);
-        }
+        free_indices();
 
-        root_ = pool_.allocate<KMeansNode>();
-        std::memset(root_, 0, sizeof(KMeansNode));
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices_[i][j] = int(j);
+            }
+            root_[i] = pool_.allocate<KMeansNode>();
+            std::memset(root_[i], 0, sizeof(KMeansNode));
 
-        if(is_kdtree_distance::val || is_vector_space_distance::val)
-        {
-            computeNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeClustering(root_, indices_, (int)size_, branching_,0);
-        }
-        else
-        {
-            computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
+            if(is_kdtree_distance::val || is_vector_space_distance::val) {
+                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
+            else {
+                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
         }
     }
 
@@ -456,35 +481,43 @@ public:
         save_value(stream, iterations_);
         save_value(stream, memoryCounter_);
         save_value(stream, cb_index_);
-        save_value(stream, *indices_, (int)size_);
-
-        save_tree(stream, root_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);
+            save_tree(stream, root_[i], i);
+        }
     }
 
 
     void loadIndex(FILE* stream) CV_OVERRIDE
     {
+        if (indices_!=NULL) {
+            free_indices();
+            delete[] indices_;
+        }
+        if (root_!=NULL) {
+            free_centers();
+        }
+
         load_value(stream, branching_);
         load_value(stream, iterations_);
         load_value(stream, memoryCounter_);
         load_value(stream, cb_index_);
-        if (indices_!=NULL) {
-            delete[] indices_;
-        }
-        indices_ = new int[size_];
-        load_value(stream, *indices_, size_);
+        load_value(stream, trees_);
 
-        if (root_!=NULL) {
-            free_centers(root_);
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
         }
-        load_tree(stream, root_);
 
         index_params_["algorithm"] = getType();
         index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
         index_params_["iterations"] = iterations_;
         index_params_["centers_init"] = centers_init_;
         index_params_["cb_index"] = cb_index_;
-
     }
 
 
@@ -500,17 +533,21 @@ public:
     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
     {
 
-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);
 
         if (maxChecks==FLANN_CHECKS_UNLIMITED) {
-            findExactNN(root_, result, vec);
+            findExactNN(root_[0], result, vec);
         }
         else {
             // Priority queue storing intermediate branches in the best-bin-first search
             Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
 
             int checks = 0;
-            findNN(root_, result, vec, checks, maxChecks, heap);
+            for (int i=0; i<trees_; ++i) {
+                findNN(root_[i], result, vec, checks, maxChecks, heap);
+                if ((checks >= maxChecks) && result.full())
+                    break;
+            }
 
             BranchSt branch;
             while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
@@ -521,7 +558,6 @@ public:
 
             CV_Assert(result.full());
         }
-
     }
 
     /**
@@ -541,7 +577,7 @@ public:
         DistanceType variance;
         KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];
 
-        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);
+        int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance);
 
         Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
 
@@ -611,23 +647,23 @@ private:
 
 
 
-    void save_tree(FILE* stream, KMeansNodePtr node)
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
     {
         save_value(stream, *node);
         save_value(stream, *(node->pivot), (int)veclen_);
         if (node->childs==NULL) {
-            int indices_offset = (int)(node->indices - indices_);
+            int indices_offset = (int)(node->indices - indices_[num]);
             save_value(stream, indices_offset);
         }
         else {
             for(int i=0; i<branching_; ++i) {
-                save_tree(stream, node->childs[i]);
+                save_tree(stream, node->childs[i], num);
             }
         }
     }
 
 
-    void load_tree(FILE* stream, KMeansNodePtr& node)
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
     {
         node = pool_.allocate<KMeansNode>();
         load_value(stream, *node);
@@ -636,12 +672,12 @@ private:
         if (node->childs==NULL) {
             int indices_offset;
             load_value(stream, indices_offset);
-            node->indices = indices_ + indices_offset;
+            node->indices = indices_[num] + indices_offset;
         }
         else {
             node->childs = pool_.allocate<KMeansNodePtr>(branching_);
             for(int i=0; i<branching_; ++i) {
-                load_tree(stream, node->childs[i]);
+                load_tree(stream, node->childs[i], num);
             }
         }
     }
@@ -660,6 +696,32 @@ private:
         }
     }
 
+    void free_centers()
+    {
+       if (root_ != NULL) {
+           for(int i=0; i<trees_; ++i) {
+               if (root_[i] != NULL) {
+                   free_centers(root_[i]);
+               }
+           }
+       }
+    }
+
+    /**
+     * Release the inner elements of indices[]
+     */
+    void free_indices()
+    {
+        if (indices_!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices_[i]!=NULL) {
+                    delete[] indices_[i];
+                    indices_[i] = NULL;
+                }
+            }
+        }
+    }
+
     /**
      * Computes the statistics of a node (mean, radius, variance).
      *
@@ -960,7 +1022,45 @@ private:
     }
 
 
-
+    /**
+     * The method responsible for doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is assigned to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     * sum   _____
+     * of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as seeding algorithm should be fine for getting convergence ("iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     */
     void computeBitfieldClustering(KMeansNodePtr node, int* indices,
                                    int indices_length, int branching, int level)
     {
@@ -1195,8 +1295,8 @@ private:
         }
 
         if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
             }
             checks += node->size;
             for (int i=0; i<node->size; ++i) {
@@ -1397,6 +1497,9 @@ private:
     /** The branching factor used in the hierarchical k-means clustering */
     int branching_;
 
+    /** Number of kmeans trees (default is one) */
+    int trees_;
+
     /** Maximum number of iterations to use when performing k-means clustering */
     int iterations_;
 
@@ -1432,12 +1535,12 @@ private:
     /**
      * The root node in the tree.
      */
-    KMeansNodePtr root_;
+    KMeansNodePtr* root_;
 
     /**
      *  Array of indices to vectors in the dataset.
      */
-    int* indices_;
+    int** indices_;
 
     /**
      * The distance