From 05fbd1e5bc4500cf3b96319a41d3298534d72070 Mon Sep 17 00:00:00 2001 From: Pierre-Emmanuel Viel Date: Mon, 22 Jun 2020 22:53:05 +0200 Subject: [PATCH] DNA mode: add the distance computations --- modules/flann/include/opencv2/flann.hpp | 10 ++ modules/flann/include/opencv2/flann/defines.h | 1 + modules/flann/include/opencv2/flann/dist.h | 151 ++++++++++++++++++ 3 files changed, 162 insertions(+) diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp index e8ee91a3ec..5c52bfeeb5 100644 --- a/modules/flann/include/opencv2/flann.hpp +++ b/modules/flann/include/opencv2/flann.hpp @@ -95,6 +95,8 @@ using ::cvflann::MaxDistance; using ::cvflann::HammingLUT; using ::cvflann::Hamming; using ::cvflann::Hamming2; +using ::cvflann::DNAmmingLUT; +using ::cvflann::DNAmming2; using ::cvflann::HistIntersectionDistance; using ::cvflann::HellingerDistance; using ::cvflann::ChiSquareDistance; @@ -131,6 +133,14 @@ performed using library calls, if available. Lookup table implementation is used cv::flann::Hamming2 - %Hamming distance functor. Population count is implemented in 12 arithmetic operations (one of which is multiplication). +cv::flann::DNAmmingLUT - %Adaptation of the Hamming distance functor to DNA comparison. +As the four bases A, C, G, T of the DNA (or A, G, C, U for RNA) can be coded on 2 bits, +it counts the bits pairs differences between two sequences using a lookup table implementation. + +cv::flann::DNAmming2 - %Adaptation of the Hamming distance functor to DNA comparison. +Bases differences count are vectorised thanks to arithmetic operations using standard +registers (AVX2 and AVX-512 should come in a near future). + cv::flann::HistIntersectionDistance - The histogram intersection distance functor. diff --git a/modules/flann/include/opencv2/flann/defines.h b/modules/flann/include/opencv2/flann/defines.h index 884c6004d4..8ab83293ff 100644 --- a/modules/flann/include/opencv2/flann/defines.h +++ b/modules/flann/include/opencv2/flann/defines.h @@ -128,6 +128,7 @@ enum flann_distance_t FLANN_DIST_KULLBACK_LEIBLER = 8, FLANN_DIST_KL = 8, FLANN_DIST_HAMMING = 9, + FLANN_DIST_DNAMMING = 10, // deprecated constants, should use the FLANN_DIST_* ones instead EUCLIDEAN = 1, diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h index e41b994d7e..97daa55cd9 100644 --- a/modules/flann/include/opencv2/flann/dist.h +++ b/modules/flann/include/opencv2/flann/dist.h @@ -744,6 +744,157 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct DNAmmingLUT +{ + typedef False is_kdtree_distance; + typedef False is_vector_space_distance; + + typedef unsigned char ElementType; + typedef int ResultType; + typedef ElementType CentersType; + + /** this will count the bits in a ^ b + */ + template + ResultType operator()(const unsigned char* a, const Iterator2 b, size_t size) const + { + static const uchar popCountTable[] = + { + 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4 + }; + ResultType result = 0; + const unsigned char* b2 = reinterpret_cast (b); + for (size_t i = 0; i < size; i++) { + result += popCountTable[a[i] ^ b2[i]]; + } + return result; + } + + + ResultType operator()(const unsigned char* a, const ZeroIterator b, size_t size) const + { + (void)b; + static const uchar popCountTable[] = + { + 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, + 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4 + }; + ResultType result = 0; + for (size_t i = 0; i < size; i++) { + result += popCountTable[a[i]]; + } + return result; + } +}; + + +template +struct DNAmming2 +{ + typedef False is_kdtree_distance; + typedef False is_vector_space_distance; + + typedef T ElementType; + typedef int ResultType; + typedef ElementType CentersType; + + /** This is popcount_3() from: + * http://en.wikipedia.org/wiki/Hamming_weight */ + unsigned int popcnt32(uint32_t n) const + { + n = ((n >> 1) | n) & 0x55555555; + n = (n & 0x33333333) + ((n >> 2) & 0x33333333); + return (((n + (n >> 4))& 0x0F0F0F0F)* 0x01010101) >> 24; + } + +#ifdef FLANN_PLATFORM_64_BIT + unsigned int popcnt64(uint64_t n) const + { + n = ((n >> 1) | n) & 0x5555555555555555; + n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333); + return (((n + (n >> 4))& 0x0f0f0f0f0f0f0f0f)* 0x0101010101010101) >> 56; + } +#endif + + template + ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const + { + CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)"); + +#ifdef FLANN_PLATFORM_64_BIT + const uint64_t* pa = reinterpret_cast(a); + const uint64_t* pb = reinterpret_cast(b); + ResultType result = 0; + size /= long_word_size_; + for(size_t i = 0; i < size; ++i ) { + result += popcnt64(*pa ^ *pb); + ++pa; + ++pb; + } +#else + const uint32_t* pa = reinterpret_cast(a); + const uint32_t* pb = reinterpret_cast(b); + ResultType result = 0; + size /= long_word_size_; + for(size_t i = 0; i < size; ++i ) { + result += popcnt32(*pa ^ *pb); + ++pa; + ++pb; + } +#endif + return result; + } + + + template + ResultType operator()(const Iterator1 a, ZeroIterator b, size_t size, ResultType /*worst_dist*/ = -1) const + { + CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)"); + + (void)b; +#ifdef FLANN_PLATFORM_64_BIT + const uint64_t* pa = reinterpret_cast(a); + ResultType result = 0; + size /= long_word_size_; + for(size_t i = 0; i < size; ++i ) { + result += popcnt64(*pa); + ++pa; + } +#else + const uint32_t* pa = reinterpret_cast(a); + ResultType result = 0; + size /= long_word_size_; + for(size_t i = 0; i < size; ++i ) { + result += popcnt32(*pa); + ++pa; + } +#endif + return result; + } + +private: +#ifdef FLANN_PLATFORM_64_BIT + static const size_t long_word_size_= sizeof(uint64_t)/sizeof(unsigned char); +#else + static const size_t long_word_size_= sizeof(uint32_t)/sizeof(unsigned char); +#endif +}; + + + template struct HistIntersectionDistance {