From 98de57c6c459765a085f50c06982a1cb60b36505 Mon Sep 17 00:00:00 2001
From: Pierre-Emmanuel Viel
Date: Fri, 26 Jun 2020 23:08:04 +0200
Subject: [PATCH] Refactoring to prepare for other vector types while
factoring out shared methods
---
.../include/opencv2/flann/kmeans_index.h | 781 ++++++++++--------
1 file changed, 414 insertions(+), 367 deletions(-)
diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h
index a823986e09..98ec68a87b 100644
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -463,14 +463,10 @@ public:
root_[i] = pool_.allocate();
std::memset(root_[i], 0, sizeof(KMeansNode));
- if(is_kdtree_distance::val || is_vector_space_distance::val) {
- computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
- computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
- }
- else {
- computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
- computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
- }
+ Distance* dummy = NULL;
+ computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_, dummy);
+
+ computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
}
}
@@ -829,6 +825,413 @@ private:
}
+ template
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const DistType* identifier)
+ {
+ (void)identifier;
+ computeNodeStatistics(node, indices, indices_length);
+ }
+
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const cvflann::HammingLUT* identifier)
+ {
+ (void)identifier;
+ computeBitfieldNodeStatistics(node, indices, indices_length);
+ }
+
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const cvflann::Hamming* identifier)
+ {
+ (void)identifier;
+ computeBitfieldNodeStatistics(node, indices, indices_length);
+ }
+
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const cvflann::Hamming2* identifier)
+ {
+ (void)identifier;
+ computeBitfieldNodeStatistics(node, indices, indices_length);
+ }
+
+
+ void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
+ std::vector& radiuses, int* belongs_to, int* count)
+ {
+ cv::AutoBuffer dcenters_buf(branching*veclen_);
+ Matrix dcenters(dcenters_buf.data(), branching, veclen_);
+
+ bool converged = false;
+ int iteration = 0;
+ while (!converged && iteration new_centroids(indices_length);
+ std::vector sq_dists(indices_length);
+
+ // reassign points to clusters
+ KMeansDistanceComputer > invoker(
+ distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
+ parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+ for (int i=0; i < (int)indices_length; ++i) {
+ DistanceType sq_dist(sq_dists[i]);
+ int new_centroid(new_centroids[i]);
+ if (sq_dist > radiuses[new_centroid]) {
+ radiuses[new_centroid] = sq_dist;
+ }
+ if (new_centroid != belongs_to[i]) {
+ count[belongs_to[i]]--;
+ count[new_centroid]++;
+ belongs_to[i] = new_centroid;
+ converged = false;
+ }
+ }
+
+ for (int i=0; i& radiuses, int* belongs_to, int* count)
+ {
+ for (int i=0; i(
+ veclen_*sizeof(ElementType)*BITS_PER_CHAR);
+ cv::AutoBuffer dcenters_buf(branching*accumulator_veclen);
+ Matrix dcenters(dcenters_buf.data(), branching, accumulator_veclen);
+
+ bool converged = false;
+ int iteration = 0;
+ while (!converged && iteration>1) & 0x01;
+ dcenter[k+2] += (vec[l]>>2) & 0x01;
+ dcenter[k+3] += (vec[l]>>3) & 0x01;
+ dcenter[k+4] += (vec[l]>>4) & 0x01;
+ dcenter[k+5] += (vec[l]>>5) & 0x01;
+ dcenter[k+6] += (vec[l]>>6) & 0x01;
+ dcenter[k+7] += (vec[l]>>7) & 0x01;
+ }
+ }
+ for (int i=0; i(count[i]);
+ unsigned int* dcenter = dcenters[i];
+ unsigned char* charCenter = (unsigned char*)centers[i];
+ for (size_t k=0, l=0; k(
+ (((int)(0.5 + (double)(dcenter[k]) / cnt)))
+ | (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
+ | (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
+ | (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
+ | (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
+ | (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
+ | (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
+ | (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
+ }
+ }
+
+ std::vector new_centroids(indices_length);
+ std::vector dists(indices_length);
+
+ // reassign points to clusters
+ KMeansDistanceComputer invoker(
+ distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
+ parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+ for (int i=0; i < indices_length; ++i) {
+ DistanceType dist(dists[i]);
+ int new_centroid(new_centroids[i]);
+ if (dist > radiuses[new_centroid]) {
+ radiuses[new_centroid] = dist;
+ }
+ if (new_centroid != belongs_to[i]) {
+ count[belongs_to[i]]--;
+ count[new_centroid]++;
+ belongs_to[i] = new_centroid;
+ converged = false;
+ }
+ }
+
+ for (int i=0; i& radiuses, int* belongs_to, int* count)
+ {
+ // compute kmeans clustering for each of the resulting clusters
+ node->childs = pool_.allocate(branching);
+ int start = 0;
+ int end = start;
+ for (int c=0; c(), veclen_);
+ variance += d;
+ mean_radius += static_cast( sqrt(d) );
+ std::swap(indices[i],indices[end]);
+ std::swap(belongs_to[i],belongs_to[end]);
+ end++;
+ }
+ }
+ variance /= s;
+ mean_radius /= s;
+ variance -= distance_(centers[c], ZeroIterator(), veclen_);
+
+ node->childs[c] = pool_.allocate();
+ std::memset(node->childs[c], 0, sizeof(KMeansNode));
+ node->childs[c]->radius = radiuses[c];
+ node->childs[c]->pivot = centers[c];
+ node->childs[c]->variance = variance;
+ node->childs[c]->mean_radius = mean_radius;
+ computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
+ start=end;
+ }
+ }
+
+
+ void computeAnyBitfieldSubClustering(KMeansNodePtr node, int* indices, int indices_length,
+ int branching, int level, CentersType** centers,
+ std::vector& radiuses, int* belongs_to, int* count)
+ {
+ // compute kmeans clustering for each of the resulting clusters
+ node->childs = pool_.allocate(branching);
+ int start = 0;
+ int end = start;
+ for (int c=0; c(), veclen_);
+ variance += static_cast( ensureSquareDistance(d) );
+ mean_radius += ensureSimpleDistance(d);
+ std::swap(indices[i],indices[end]);
+ std::swap(belongs_to[i],belongs_to[end]);
+ end++;
+ }
+ }
+ mean_radius = static_cast(
+ 0.5f + static_cast(mean_radius) / static_cast(s));
+ variance = static_cast(
+ 0.5 + static_cast(variance) / static_cast(s));
+ variance -= static_cast(
+ ensureSquareDistance(
+ distance_(centers[c], ZeroIterator(), veclen_)));
+
+ node->childs[c] = pool_.allocate();
+ std::memset(node->childs[c], 0, sizeof(KMeansNode));
+ node->childs[c]->radius = radiuses[c];
+ node->childs[c]->pivot = centers[c];
+ node->childs[c]->variance = static_cast(variance);
+ node->childs[c]->mean_radius = mean_radius;
+ computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
+ start=end;
+ }
+ }
+
+
+ template
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const DistType* identifier)
+ {
+ (void)identifier;
+ refineClustering(indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
+
+ /**
+ * The methods responsible with doing the recursive hierarchical clustering on
+ * binary vectors.
+ * As some might have heard that KMeans on binary data doesn't make sense,
+ * it's worth a little explanation of why it actually works fairly well. As
+ * with the Hierarchical Clustering algorithm, we seed several centers for the
+ * current node by picking some of its points. Then in a first pass each point
+ * of the node is then related to its closest center. Now let's have a look at
+ * the 5 central dimensions of the 9 following points:
+ *
+ * xxxxxx11100xxxxx (1)
+ * xxxxxx11010xxxxx (2)
+ * xxxxxx11001xxxxx (3)
+ * xxxxxx10110xxxxx (4)
+ * xxxxxx10101xxxxx (5)
+ * xxxxxx10011xxxxx (6)
+ * xxxxxx01110xxxxx (7)
+ * xxxxxx01101xxxxx (8)
+ * xxxxxx01011xxxxx (9)
+ * sum _____
+ * of 1: 66555
+ *
+ * Even if the barycenter notion doesn't apply, we can set a center
+ * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+ * on for these points.
+ *
+ * Note that convergence isn't ensured anymore. In practice, using Gonzales
+ * as seeding algorithm should be fine for getting convergence ("iterations"
+ * value can be set to -1). But with KMeans++ seeding you should definitely
+ * set a maximum number of iterations (but make it higher than the "iterations"
+ * default value of 11).
+ *
+ * Params:
+ * node = the node to cluster
+ * indices = indices of the points belonging to the current node
+ * indices_length = number of points in the current node
+ * branching = the branching factor to use in the clustering
+ * level = 0 for the root node, it increases with the subdivision levels
+ * centers = clusters centers to compute
+ * radiuses = radiuses of clusters
+ * belongs_to = LookUp Table returning, for a given index id, the center id it belongs to
+ * count = array storing the number of indices for a given center id
+ * identifier = dummy pointer on an instance of Distance (used to branch correctly among templates)
+ */
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const cvflann::HammingLUT* identifier)
+ {
+ (void)identifier;
+ refineBitfieldClustering(
+ indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
+
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const cvflann::Hamming* identifier)
+ {
+ (void)identifier;
+ refineBitfieldClustering(
+ indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
+
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const cvflann::Hamming2* identifier)
+ {
+ (void)identifier;
+ refineBitfieldClustering(
+ indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
/**
* The method responsible with actually doing the recursive hierarchical
@@ -893,372 +1296,16 @@ private:
count[belongs_to[i]]++;
}
- cv::AutoBuffer dcenters_buf(branching*veclen_);
- Matrix dcenters(dcenters_buf.data(), branching, veclen_);
- for (int i=0; i new_centroids(indices_length);
- std::vector sq_dists(indices_length);
-
- // reassign points to clusters
- KMeansDistanceComputer > invoker(distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
- parallel_for_(cv::Range(0, (int)indices_length), invoker);
-
- for (int i=0; i < (int)indices_length; ++i) {
- DistanceType sq_dist(sq_dists[i]);
- int new_centroid(new_centroids[i]);
- if (sq_dist > radiuses[new_centroid]) {
- radiuses[new_centroid] = sq_dist;
- }
- if (new_centroid != belongs_to[i]) {
- count[belongs_to[i]]--;
- count[new_centroid]++;
- belongs_to[i] = new_centroid;
- converged = false;
- }
- }
-
- for (int i=0; ichilds = pool_.allocate(branching);
- int start = 0;
- int end = start;
- for (int c=0; c(), veclen_);
- variance += d;
- mean_radius += static_cast( sqrt(d) );
- std::swap(indices[i],indices[end]);
- std::swap(belongs_to[i],belongs_to[end]);
- end++;
- }
- }
- variance /= s;
- mean_radius /= s;
- variance -= distance_(centers[c], ZeroIterator(), veclen_);
-
- node->childs[c] = pool_.allocate();
- std::memset(node->childs[c], 0, sizeof(KMeansNode));
- node->childs[c]->radius = radiuses[c];
- node->childs[c]->pivot = centers[c];
- node->childs[c]->variance = variance;
- node->childs[c]->mean_radius = mean_radius;
- computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
- start=end;
- }
+ Distance* dummy = NULL;
+ refineAndSplitClustering(node, indices, indices_length, branching, level,
+ centers, radiuses, belongs_to, count, dummy);
delete[] centers;
}
- /**
- * The method responsible with doing the recursive hierarchical clustering on
- * binary vectors.
- * As some might have heared that KMeans on binary data doesn't make sense,
- * it's worth a little explanation why it actually fairly works. As
- * with the Hierarchical Clustering algortihm, we seed several centers for the
- * current node by picking some of its points. Then in a first pass each point
- * of the node is then related to its closest center. Now let's have a look at
- * the 5 central dimensions of the 9 following points:
- *
- * xxxxxx11100xxxxx (1)
- * xxxxxx11010xxxxx (2)
- * xxxxxx11001xxxxx (3)
- * xxxxxx10110xxxxx (4)
- * xxxxxx10101xxxxx (5)
- * xxxxxx10011xxxxx (6)
- * xxxxxx01110xxxxx (7)
- * xxxxxx01101xxxxx (8)
- * xxxxxx01011xxxxx (9)
- * sum _____
- * of 1: 66555
- *
- * Even if the barycenter notion doesn't apply, we can set a center
- * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
- * on for these points.
- *
- * Note that convergence isn't ensured anymore. In practice, using Gonzales
- * as seeding algorithm should be fine for getting convergence ("iterations"
- * value can be set to -1). But with KMeans++ seeding you should definitely
- * set a maximum number of iterations (but make it higher than the "iterations"
- * default value of 11).
- *
- * Params:
- * node = the node to cluster
- * indices = indices of the points belonging to the current node
- * indices_length = number of points in the current node
- * branching = the branching factor to use in the clustering
- * level = 0 for the root node, it increases with the subdivision levels
- */
- void computeBitfieldClustering(KMeansNodePtr node, int* indices,
- int indices_length, int branching, int level)
- {
- node->size = indices_length;
- node->level = level;
-
- if (indices_length < branching) {
- node->indices = indices;
- std::sort(node->indices,node->indices+indices_length);
- node->childs = NULL;
- return;
- }
-
- cv::AutoBuffer centers_idx_buf(branching);
- int* centers_idx = centers_idx_buf.data();
- int centers_length;
- (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);
-
- if (centers_lengthindices = indices;
- std::sort(node->indices,node->indices+indices_length);
- node->childs = NULL;
- return;
- }
-
- const unsigned int accumulator_veclen = static_cast(
- veclen_*sizeof(ElementType)*BITS_PER_CHAR);
- cv::AutoBuffer dcenters_buf(branching*accumulator_veclen);
- Matrix dcenters(dcenters_buf.data(), branching, accumulator_veclen);
-
- CentersType** centers = new CentersType*[branching];
-
- for (int i=0; i radiuses(branching);
- cv::AutoBuffer count_buf(branching);
- int* count = count_buf.data();
- for (int i=0; i belongs_to_buf(indices_length);
- int* belongs_to = belongs_to_buf.data();
- for (int i=0; inew_dist) {
- belongs_to[i] = j;
- dist = new_dist;
- }
- }
- if (dist>radiuses[belongs_to[i]]) {
- radiuses[belongs_to[i]] = dist;
- }
- count[belongs_to[i]]++;
- }
-
- bool converged = false;
- int iteration = 0;
- while (!converged && iteration>1) & 0x01;
- dcenter[k+2] += (vec[l]>>2) & 0x01;
- dcenter[k+3] += (vec[l]>>3) & 0x01;
- dcenter[k+4] += (vec[l]>>4) & 0x01;
- dcenter[k+5] += (vec[l]>>5) & 0x01;
- dcenter[k+6] += (vec[l]>>6) & 0x01;
- dcenter[k+7] += (vec[l]>>7) & 0x01;
- }
- }
- for (int i=0; i(count[i]);
- unsigned int* dcenter = dcenters[i];
- unsigned char* charCenter = (unsigned char*)centers[i];
- for (size_t k=0, l=0; k(
- (((int)(0.5 + (double)(dcenter[k]) / cnt)))
- | (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
- | (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
- | (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
- | (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
- | (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
- | (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
- | (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
- }
- }
-
- std::vector new_centroids(indices_length);
- std::vector dists(indices_length);
-
- // reassign points to clusters
- KMeansDistanceComputer invoker(distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
- parallel_for_(cv::Range(0, (int)indices_length), invoker);
-
- for (int i=0; i < indices_length; ++i) {
- DistanceType dist(dists[i]);
- int new_centroid(new_centroids[i]);
- if (dist > radiuses[new_centroid]) {
- radiuses[new_centroid] = dist;
- }
- if (new_centroid != belongs_to[i]) {
- count[belongs_to[i]]--;
- count[new_centroid]++;
- belongs_to[i] = new_centroid;
- converged = false;
- }
- }
-
- for (int i=0; ichilds = pool_.allocate(branching);
- int start = 0;
- int end = start;
- for (int c=0; c(), veclen_);
- variance += static_cast( ensureSquareDistance(d) );
- mean_radius += ensureSimpleDistance(d);
- std::swap(indices[i],indices[end]);
- std::swap(belongs_to[i],belongs_to[end]);
- end++;
- }
- }
- mean_radius = static_cast(
- 0.5f + static_cast(mean_radius) / static_cast(s));
- variance = static_cast(
- 0.5 + static_cast(variance) / static_cast(s));
- variance -= static_cast(
- ensureSquareDistance(
- distance_(centers[c], ZeroIterator(), veclen_)));
-
- node->childs[c] = pool_.allocate();
- std::memset(node->childs[c], 0, sizeof(KMeansNode));
- node->childs[c]->radius = radiuses[c];
- node->childs[c]->pivot = centers[c];
- node->childs[c]->variance = static_cast(variance);
- node->childs[c]->mean_radius = mean_radius;
- computeBitfieldClustering(node->childs[c],indices+start, end-start, branching, level+1);
- start=end;
- }
-
- delete[] centers;
- }
-
-
-
-
/**
* Performs one descent in the hierarchical k-means tree. The branches not
* visited are stored in a priority queue.