Hello there. I'd like to know if there's any reason for using a generic template
// Element-wise product of two 2D arrays with an optional scale factor.
//
//   src1, src2        - first rows of the two inputs
//   dst               - first row of the output
//   step1, step2, step - row strides; each one is divided by the element size
//                       before being used as a per-row pointer increment, so
//                       they are taken to be expressed in bytes
//   size              - size.height rows of size.width elements are processed
//   scale             - when it compares equal to 1 the operands are multiplied
//                       directly; otherwise the first operand is converted to
//                       WT and multiplied by scale before the second operand
//
// Every result is passed through saturate_cast<T> before it is stored.
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, WT scale )
{
    // Turn the strides into element counts.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    if( scale == (WT)1. )
    {
        // Unscaled path: plain product of the two operands.
        while( size.height-- )
        {
            int x = 0;
        #if CV_ENABLE_UNROLLED
            // Four elements per iteration, handled as two load/store pairs.
            for( ; x <= size.width - 4; x += 4 )
            {
                T lo = saturate_cast<T>(src1[x  ] * src2[x  ]);
                T hi = saturate_cast<T>(src1[x+1] * src2[x+1]);
                dst[x  ] = lo;
                dst[x+1] = hi;

                lo = saturate_cast<T>(src1[x+2] * src2[x+2]);
                hi = saturate_cast<T>(src1[x+3] * src2[x+3]);
                dst[x+2] = lo;
                dst[x+3] = hi;
            }
        #endif
            // Remaining elements (or the whole row when unrolling is off).
            while( x < size.width )
            {
                dst[x] = saturate_cast<T>(src1[x] * src2[x]);
                x++;
            }

            src1 += step1;
            src2 += step2;
            dst += step;
        }
        return;
    }

    // Scaled path: scale * (WT)a * b, evaluated left to right.
    while( size.height-- )
    {
        int x = 0;
    #if CV_ENABLE_UNROLLED
        // Four elements per iteration, handled as two load/store pairs.
        for( ; x <= size.width - 4; x += 4 )
        {
            T lo = saturate_cast<T>(scale*(WT)src1[x]*src2[x]);
            T hi = saturate_cast<T>(scale*(WT)src1[x+1]*src2[x+1]);
            dst[x] = lo;
            dst[x+1] = hi;

            lo = saturate_cast<T>(scale*(WT)src1[x+2]*src2[x+2]);
            hi = saturate_cast<T>(scale*(WT)src1[x+3]*src2[x+3]);
            dst[x+2] = lo;
            dst[x+3] = hi;
        }
    #endif
        // Remaining elements (or the whole row when unrolling is off).
        while( x < size.width )
        {
            dst[x] = saturate_cast<T>(scale*(WT)src1[x]*src2[x]);
            x++;
        }

        src1 += step1;
        src2 += step2;
        dst += step;
    }
}
and not having partially specialized versions of this code that use SIMD intrinsics? As far as I understand, this code (at least the part inside CV_ENABLE_UNROLLED) should be optimized by the compiler, but e.g. MSVC 12.0 is unable to produce optimized code for T=uchar because of the saturation code that is used.