Revision history [back]

test NEON-optimized cv::threshold() on mobile device

I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.

However, I do not have a device to test it on, so I am looking for volunteers to give me a little help. If that motivates you more, I am planning to push it to the main OpenCV repo - and hope you guys will accept it

I am interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.

If this patch format is a nonsense for you, the post here may help you more

Patch 1. Update checkHardwareSupport()

From 5129091430a7423e5c07a4f3c845033adb8ccefe Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 09:59:36 +0300
Subject: [PATCH 1/2] Update checkHardwareSupport and HWFeatures to support
 ARM NEON

---
 modules/core/include/opencv2/core/core_c.h |  3 +++
 modules/core/src/system.cpp                | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index df763ab..bdfb879 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,9 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
+
+#define CV_CPU_ARM_NEON 100
+
 #define CV_HARDWARE_MAX_FEATURE 255

 CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b8a4661..eeb2a58 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -176,7 +176,22 @@ struct HWFeatures
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
         }
-
+       
+       // Android check
+       #if defined ANDROID
+       if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+            (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
+        {
+            f.have[CV_CPU_ARM_NEON] = 1;
+        }
+       #endif
+       // iOS check. Automatically activated by GCC/LLVM compiler option
+       #if define TARGET_OS_IPHONE 
+       #if define __ARM_NEON__
+       f.have[CV_CPU_ARM_NEON] = 1;
+       #endif
+       #endif
+       
         return f;
     }

-- 
1.7.11

Patch 2. threshold() optimizations

From 9f9e6e0a382b0ec2b5ddb3eedd27bcdf95af9763 Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 10:02:35 +0300
Subject: [PATCH 2/2] NEON-accelerated threshold()

---
 modules/imgproc/src/thresh.cpp | 140 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 1fb4847..fb01852 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#if defined CV_USE_NEON
+#include <arm_neon.h>
+#endif
+
 namespace cv
 {

@@ -226,6 +230,142 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     }
 #endif

+#if CV_USE_NEON
+    if( checkHardwareSupport(CV_CPU_ARM_NEON) )
+    {
+       uint8x16_t thresh_u = vdupq_n_u8(thresh);
+        uint8x16_t maxval_ = vdupq_n_u8(maxval);
+       
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcgtq_u8 ( v0, thresh_u );
+                   v1 = vcgtq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcgt_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+
+            case THRESH_BINARY_INV:            
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcleq_u8 ( v0, thresh_u );
+                   v1 = vcleq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcle_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TRUNC:
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vminq_u8 ( v0, thresh_u );
+                   v1 = vminq_u8 ( v1, thresh_u );                 
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vmin_u8  ( v2, vget_low_s8 ( thresh_u ) );                 
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TOZERO:            
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_s8(thresh_u) ), vmax_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+                               
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcle_u8 ( v2, vget_low_s8(thresh_u) ), vmin_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+            }
+        }
+    }
+#endif
+
+
     if( j_scalar < roi.width )
     {
         for( i = 0; i < roi.height; i++ )
-- 
1.7.11

test NEON-optimized cv::threshold() on mobile device

I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.

I am interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.

If this patch format is a nonsense for you, the post here may help you more

Patch 1. 1/3. Update checkHardwareSupport()

From 5129091430a7423e5c07a4f3c845033adb8ccefe Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 09:59:36 +0300
Subject: [PATCH 1/2] Update checkHardwareSupport and HWFeatures to support
 ARM NEON

---
 modules/core/include/opencv2/core/core_c.h |  3 +++
 modules/core/src/system.cpp                | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index df763ab..bdfb879 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,9 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
+
+#define CV_CPU_ARM_NEON 100
+
 #define CV_HARDWARE_MAX_FEATURE 255

 CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b8a4661..eeb2a58 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -176,7 +176,22 @@ struct HWFeatures
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
         }
-
+       
+       // Android check
+       #if defined ANDROID
+       if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+            (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
+        {
+            f.have[CV_CPU_ARM_NEON] = 1;
+        }
+       #endif
+       // iOS check. Automatically activated by GCC/LLVM compiler option
+       #if define TARGET_OS_IPHONE 
+       #if define __ARM_NEON__
+       f.have[CV_CPU_ARM_NEON] = 1;
+       #endif
+       #endif
+       
         return f;
     }

-- 
1.7.11

Patch 2. 2/3. threshold() optimizations

From 9f9e6e0a382b0ec2b5ddb3eedd27bcdf95af9763 Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 10:02:35 +0300
Subject: [PATCH 2/2] NEON-accelerated threshold()

---
 modules/imgproc/src/thresh.cpp | 140 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 1fb4847..fb01852 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#if defined CV_USE_NEON
+#include <arm_neon.h>
+#endif
+
 namespace cv
 {

@@ -226,6 +230,142 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     }
 #endif

+#if CV_USE_NEON
+    if( checkHardwareSupport(CV_CPU_ARM_NEON) )
+    {
+       uint8x16_t thresh_u = vdupq_n_u8(thresh);
+        uint8x16_t maxval_ = vdupq_n_u8(maxval);
+       
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcgtq_u8 ( v0, thresh_u );
+                   v1 = vcgtq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcgt_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+
+            case THRESH_BINARY_INV:            
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcleq_u8 ( v0, thresh_u );
+                   v1 = vcleq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcle_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TRUNC:
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vminq_u8 ( v0, thresh_u );
+                   v1 = vminq_u8 ( v1, thresh_u );                 
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vmin_u8  ( v2, vget_low_s8 ( thresh_u ) );                 
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TOZERO:            
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_s8(thresh_u) ), vmax_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+                               
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcle_u8 ( v2, vget_low_s8(thresh_u) ), vmin_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+            }
+        }
+    }
+#endif
+
+
     if( j_scalar < roi.width )
     {
         for( i = 0; i < roi.height; i++ )
-- 
1.7.11

Patch 3/3. Fix omitted header

From e827b63d7e070c98ca33f390c8016bdea37c19d9 Mon Sep 17 00:00:00 2001 From: sammy [email protected] Date: Tue, 31 Jul 2012 10:34:50 +0300 Subject: [PATCH] Fix checkCpuFeatures() header omission

modules/core/src/system.cpp | 1 + 1 file changed, 1 insertion(+)

diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index eeb2a58..e1de270 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -91,6 +91,7 @@ #include <sys types.h=""> #if defined ANDROID #include <sys sysconf.h=""> +#include <cpu-features.h> #else #include <sys sysctl.h="">

#endif

1.7.11

test NEON-optimized cv::threshold() on mobile device

I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.

I am interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.

If this patch format is a nonsense for you, the post here may help you more

Edit

I forgot to mention that you have to activate it by adding #define CV_USE_NEON somewhere at the top of the thresh.cpp function or as compile flag -DCV_USE_NEON

Patch 1/3. Update checkHardwareSupport()

From 5129091430a7423e5c07a4f3c845033adb8ccefe Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 09:59:36 +0300
Subject: [PATCH 1/2] Update checkHardwareSupport and HWFeatures to support
 ARM NEON

---
 modules/core/include/opencv2/core/core_c.h |  3 +++
 modules/core/src/system.cpp                | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index df763ab..bdfb879 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,9 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
+
+#define CV_CPU_ARM_NEON 100
+
 #define CV_HARDWARE_MAX_FEATURE 255

 CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b8a4661..eeb2a58 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -176,7 +176,22 @@ struct HWFeatures
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
         }
-
+       
+       // Android check
+       #if defined ANDROID
+       if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+            (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
+        {
+            f.have[CV_CPU_ARM_NEON] = 1;
+        }
+       #endif
+       // iOS check. Automatically activated by GCC/LLVM compiler option
+       #if define TARGET_OS_IPHONE 
+       #if define __ARM_NEON__
+       f.have[CV_CPU_ARM_NEON] = 1;
+       #endif
+       #endif
+       
         return f;
     }

-- 
1.7.11

Patch 2/3. threshold() optimizations

From 9f9e6e0a382b0ec2b5ddb3eedd27bcdf95af9763 Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 10:02:35 +0300
Subject: [PATCH 2/2] NEON-accelerated threshold()

---
 modules/imgproc/src/thresh.cpp | 140 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 1fb4847..fb01852 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#if defined CV_USE_NEON
+#include <arm_neon.h>
+#endif
+
 namespace cv
 {

@@ -226,6 +230,142 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     }
 #endif

+#if CV_USE_NEON
+    if( checkHardwareSupport(CV_CPU_ARM_NEON) )
+    {
+       uint8x16_t thresh_u = vdupq_n_u8(thresh);
+        uint8x16_t maxval_ = vdupq_n_u8(maxval);
+       
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcgtq_u8 ( v0, thresh_u );
+                   v1 = vcgtq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcgt_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+
+            case THRESH_BINARY_INV:            
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcleq_u8 ( v0, thresh_u );
+                   v1 = vcleq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcle_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TRUNC:
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vminq_u8 ( v0, thresh_u );
+                   v1 = vminq_u8 ( v1, thresh_u );                 
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vmin_u8  ( v2, vget_low_s8 ( thresh_u ) );                 
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TOZERO:            
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_s8(thresh_u) ), vmax_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+                               
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcle_u8 ( v2, vget_low_s8(thresh_u) ), vmin_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+            }
+        }
+    }
+#endif
+
+
     if( j_scalar < roi.width )
     {
         for( i = 0; i < roi.height; i++ )
-- 
1.7.11

Patch 3/3. Fix omitted header

From e827b63d7e070c98ca33f390c8016bdea37c19d9 Mon Sep 17 00:00:00 2001 From: sammy [email protected] Date: Tue, 31 Jul 2012 10:34:50 +0300 Subject: [PATCH] Fix checkCpuFeatures() header omission

modules/core/src/system.cpp | 1 + 1 file changed, 1 insertion(+)

#endif

1.7.11

test NEON-optimized cv::threshold() on mobile device

I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.

I am interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.

If this patch format is a nonsense for you, the post here may help you more

Edit

I forgot to mention that you have to activate it by adding #define CV_USE_NEON somewhere at the top of the thresh.cpp function or as compile flag -DCV_USE_NEON

Patch 1/3. Update checkHardwareSupport()

From 5129091430a7423e5c07a4f3c845033adb8ccefe Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 09:59:36 +0300
Subject: [PATCH 1/2] Update checkHardwareSupport and HWFeatures to support
 ARM NEON

---
 modules/core/include/opencv2/core/core_c.h |  3 +++
 modules/core/src/system.cpp                | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index df763ab..bdfb879 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,9 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
+
+#define CV_CPU_ARM_NEON 100
+
 #define CV_HARDWARE_MAX_FEATURE 255

 CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b8a4661..eeb2a58 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -176,7 +176,22 @@ struct HWFeatures
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
         }
-
+       
+       // Android check
+       #if defined ANDROID
+       if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+            (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
+        {
+            f.have[CV_CPU_ARM_NEON] = 1;
+        }
+       #endif
+       // iOS check. Automatically activated by GCC/LLVM compiler option
+       #if define TARGET_OS_IPHONE 
+       #if define __ARM_NEON__
+       f.have[CV_CPU_ARM_NEON] = 1;
+       #endif
+       #endif
+       
         return f;
     }

-- 
1.7.11

Patch 2/3. threshold() optimizations

From 9f9e6e0a382b0ec2b5ddb3eedd27bcdf95af9763 Mon Sep 17 00:00:00 2001
From: sammy <[email protected]>
Date: Tue, 31 Jul 2012 10:02:35 +0300
Subject: [PATCH 2/2] NEON-accelerated threshold()

---
 modules/imgproc/src/thresh.cpp | 140 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 1fb4847..fb01852 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#if defined CV_USE_NEON
+#include <arm_neon.h>
+#endif
+
 namespace cv
 {

@@ -226,6 +230,142 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     }
 #endif

+#if CV_USE_NEON
+    if( checkHardwareSupport(CV_CPU_ARM_NEON) )
+    {
+       uint8x16_t thresh_u = vdupq_n_u8(thresh);
+        uint8x16_t maxval_ = vdupq_n_u8(maxval);
+       
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcgtq_u8 ( v0, thresh_u );
+                   v1 = vcgtq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcgt_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+
+            case THRESH_BINARY_INV:            
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vcleq_u8 ( v0, thresh_u );
+                   v1 = vcleq_u8 ( v1, thresh_u );
+                   v0 = vandq_u8 ( v0, maxval_ );
+                   v1 = vandq_u8 ( v1, maxval_ );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vcle_u8 ( v2, vget_low_s8 ( thresh_u ) );
+                   v2 = vand_u8 ( v2, vget_low_s8 ( maxval_ ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TRUNC:
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );
+                   v0 = vminq_u8 ( v0, thresh_u );
+                   v1 = vminq_u8 ( v1, thresh_u );                 
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8( src + j );
+                   v2 = vmin_u8  ( v2, vget_low_s8 ( thresh_u ) );                 
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+               
+            case THRESH_TOZERO:            
+               for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_s8(thresh_u) ), vmax_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+                               
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                   uint8x16_t v0, v1;
+                   v0 = vld1q_u8 ( src + j );
+                    v1 = vld1q_u8 ( src + j + 16 );                
+                   v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
+                   v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
+                   vst1q_u8 ( dst + j, v0 );
+                   vst1q_u8 ( dst + j + 16, v1 );
+                }
+
+               
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                   uint8x8_t v2;
+                   v2 = vld1_u8 ( src + j );                    
+                   v2 = vand_u8 ( vcle_u8 ( v2, vget_low_s8(thresh_u) ), vmin_u8 ( v2, vget_low_s8(thresh_u) ) );
+                   vst1_u8 ( dst + j, v2 );                    
+                }
+                break;
+            }
+        }
+    }
+#endif
+
+
     if( j_scalar < roi.width )
     {
         for( i = 0; i < roi.height; i++ )
-- 
1.7.11

Patch 3/3. Fix omitted header

From e827b63d7e070c98ca33f390c8016bdea37c19d9 Mon Sep 17 00:00:00 2001 From: sammy [email protected] <[email protected]> Date: Tue, 31 Jul 2012 10:34:50 +0300 Subject: [PATCH] Fix checkCpuFeatures() header omission omission --- modules/core/src/system.cpp | 1 + 1 file changed, 1 insertion(+) insertion(+) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index eeb2a58..e1de270 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -91,6 +91,7 @@ #include <sys types.h=""> <sys/types.h> #if defined ANDROID #include <sys sysconf.h=""> <sys/sysconf.h> +#include <cpu-features.h> #else #include <sys sysctl.h=""> #endif 1.7.11<sys/sysctl.h> #endif -- 1.7.11