From 05fbd1e5bc4500cf3b96319a41d3298534d72070 Mon Sep 17 00:00:00 2001
From: Pierre-Emmanuel Viel
Date: Mon, 22 Jun 2020 22:53:05 +0200
Subject: [PATCH] DNA mode: add the distance computations
---
modules/flann/include/opencv2/flann.hpp | 10 ++
modules/flann/include/opencv2/flann/defines.h | 1 +
modules/flann/include/opencv2/flann/dist.h | 151 ++++++++++++++++++
3 files changed, 162 insertions(+)
diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp
index e8ee91a3ec..5c52bfeeb5 100644
--- a/modules/flann/include/opencv2/flann.hpp
+++ b/modules/flann/include/opencv2/flann.hpp
@@ -95,6 +95,8 @@ using ::cvflann::MaxDistance;
using ::cvflann::HammingLUT;
using ::cvflann::Hamming;
using ::cvflann::Hamming2;
+using ::cvflann::DNAmmingLUT;
+using ::cvflann::DNAmming2;
using ::cvflann::HistIntersectionDistance;
using ::cvflann::HellingerDistance;
using ::cvflann::ChiSquareDistance;
@@ -131,6 +133,14 @@ performed using library calls, if available. Lookup table implementation is used
cv::flann::Hamming2 - %Hamming distance functor. Population count is
implemented in 12 arithmetic operations (one of which is multiplication).
+cv::flann::DNAmmingLUT - %Adaptation of the Hamming distance functor to DNA comparison.
+As the four bases A, C, G, T of the DNA (or A, G, C, U for RNA) can be coded on 2 bits,
+it counts the bits pairs differences between two sequences using a lookup table implementation.
+
+cv::flann::DNAmming2 - %Adaptation of the Hamming distance functor to DNA comparison.
+Bases differences count are vectorised thanks to arithmetic operations using standard
+registers (AVX2 and AVX-512 should come in a near future).
+
cv::flann::HistIntersectionDistance - The histogram
intersection distance functor.
diff --git a/modules/flann/include/opencv2/flann/defines.h b/modules/flann/include/opencv2/flann/defines.h
index 884c6004d4..8ab83293ff 100644
--- a/modules/flann/include/opencv2/flann/defines.h
+++ b/modules/flann/include/opencv2/flann/defines.h
@@ -128,6 +128,7 @@ enum flann_distance_t
FLANN_DIST_KULLBACK_LEIBLER = 8,
FLANN_DIST_KL = 8,
FLANN_DIST_HAMMING = 9,
+ FLANN_DIST_DNAMMING = 10,
// deprecated constants, should use the FLANN_DIST_* ones instead
EUCLIDEAN = 1,
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index e41b994d7e..97daa55cd9 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -744,6 +744,157 @@ private:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct DNAmmingLUT
+{
+ typedef False is_kdtree_distance;
+ typedef False is_vector_space_distance;
+
+ typedef unsigned char ElementType;
+ typedef int ResultType;
+ typedef ElementType CentersType;
+
+ /** this will count the bits in a ^ b
+ */
+ template
+ ResultType operator()(const unsigned char* a, const Iterator2 b, size_t size) const
+ {
+ static const uchar popCountTable[] =
+ {
+ 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+ };
+ ResultType result = 0;
+ const unsigned char* b2 = reinterpret_cast (b);
+ for (size_t i = 0; i < size; i++) {
+ result += popCountTable[a[i] ^ b2[i]];
+ }
+ return result;
+ }
+
+
+ ResultType operator()(const unsigned char* a, const ZeroIterator b, size_t size) const
+ {
+ (void)b;
+ static const uchar popCountTable[] =
+ {
+ 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+ 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+ };
+ ResultType result = 0;
+ for (size_t i = 0; i < size; i++) {
+ result += popCountTable[a[i]];
+ }
+ return result;
+ }
+};
+
+
+template
+struct DNAmming2
+{
+ typedef False is_kdtree_distance;
+ typedef False is_vector_space_distance;
+
+ typedef T ElementType;
+ typedef int ResultType;
+ typedef ElementType CentersType;
+
+ /** This is popcount_3() from:
+ * http://en.wikipedia.org/wiki/Hamming_weight */
+ unsigned int popcnt32(uint32_t n) const
+ {
+ n = ((n >> 1) | n) & 0x55555555;
+ n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
+ return (((n + (n >> 4))& 0x0F0F0F0F)* 0x01010101) >> 24;
+ }
+
+#ifdef FLANN_PLATFORM_64_BIT
+ unsigned int popcnt64(uint64_t n) const
+ {
+ n = ((n >> 1) | n) & 0x5555555555555555;
+ n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333);
+ return (((n + (n >> 4))& 0x0f0f0f0f0f0f0f0f)* 0x0101010101010101) >> 56;
+ }
+#endif
+
+ template
+ ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+ {
+ CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+
+#ifdef FLANN_PLATFORM_64_BIT
+ const uint64_t* pa = reinterpret_cast(a);
+ const uint64_t* pb = reinterpret_cast(b);
+ ResultType result = 0;
+ size /= long_word_size_;
+ for(size_t i = 0; i < size; ++i ) {
+ result += popcnt64(*pa ^ *pb);
+ ++pa;
+ ++pb;
+ }
+#else
+ const uint32_t* pa = reinterpret_cast(a);
+ const uint32_t* pb = reinterpret_cast(b);
+ ResultType result = 0;
+ size /= long_word_size_;
+ for(size_t i = 0; i < size; ++i ) {
+ result += popcnt32(*pa ^ *pb);
+ ++pa;
+ ++pb;
+ }
+#endif
+ return result;
+ }
+
+
+ template
+ ResultType operator()(const Iterator1 a, ZeroIterator b, size_t size, ResultType /*worst_dist*/ = -1) const
+ {
+ CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+
+ (void)b;
+#ifdef FLANN_PLATFORM_64_BIT
+ const uint64_t* pa = reinterpret_cast(a);
+ ResultType result = 0;
+ size /= long_word_size_;
+ for(size_t i = 0; i < size; ++i ) {
+ result += popcnt64(*pa);
+ ++pa;
+ }
+#else
+ const uint32_t* pa = reinterpret_cast(a);
+ ResultType result = 0;
+ size /= long_word_size_;
+ for(size_t i = 0; i < size; ++i ) {
+ result += popcnt32(*pa);
+ ++pa;
+ }
+#endif
+ return result;
+ }
+
+private:
+#ifdef FLANN_PLATFORM_64_BIT
+ static const size_t long_word_size_= sizeof(uint64_t)/sizeof(unsigned char);
+#else
+ static const size_t long_word_size_= sizeof(uint32_t)/sizeof(unsigned char);
+#endif
+};
+
+
+
template
struct HistIntersectionDistance
{