Ask Your Question

Revision history [back]

Assisting the compiler into generating better code

I found a set of routines in the imgcodecs module of OpenCV where a fairly simple code change improves performance by 2x on the Power platform but has little to no effect on x64. My question is: should changes like the one below be made to help the compiler? I realize that the compiler could eventually be made to generate better code by itself.

/**
 * Converts one row of interleaved colour pixels to single-channel gray.
 *
 * Each output element is descale(weighted sum of source elements 0..2, SCALE),
 * using the weights cB, cG and cR (declared elsewhere in this file).
 *
 * @param rgb       First element of the source row. Only the local copy of
 *                  this pointer is advanced; the caller's pointer is untouched.
 * @param gray      Destination row; size.width elements are written.
 * @param size      Only size.width is read here.
 * @param ncn       Number of source elements to step over per pixel.
 * @param _swap_rb  Non-zero: element 0 is weighted by cR and element 2 by cB.
 *                  Zero: element 0 is weighted by cB and element 2 by cR.
 *                  (The disabled reference variant uses the value as an index
 *                  and therefore expects exactly 0 or 2.)
 */
template<class dataType>
inline void cvtBGR2Gray( const dataType* rgb, dataType* gray,
                         Size& size, int ncn, int _swap_rb )
{
#if 0
    // Reference formulation: the channel order is picked through run-time
    // indices (_swap_rb and _swap_rb^2) inside a single loop.
    for( int i = 0; i < size.width; i++, rgb += ncn )
    {
        int t = descale( rgb[_swap_rb]*cB + rgb[1]*cG + rgb[_swap_rb^2]*cR, SCALE );
        gray[i] = (dataType)t;
    }
#else
    // Branch once on the channel order so that each loop body only uses
    // constant element offsets.
    const int width = size.width;
    const dataType* px = rgb;

    if( _swap_rb != 0 )
    {
        for( int x = 0; x < width; ++x, px += ncn )
        {
            const int value = descale( px[0]*cR + px[1]*cG + px[2]*cB, SCALE );
            gray[x] = static_cast<dataType>( value );
        }
        return;
    }

    for( int x = 0; x < width; ++x, px += ncn )
    {
        const int value = descale( px[0]*cB + px[1]*cG + px[2]*cR, SCALE );
        gray[x] = static_cast<dataType>( value );
    }
#endif
}

/**
 * Converts a 4-channel 8-bit image to a single-channel gray image, one row
 * at a time, by delegating each row to cvtBGR2Gray<uchar>.
 *
 * @param rgba       First element of the first source row.
 * @param rgba_step  Distance between the starts of consecutive source rows,
 *                   in uchar elements.
 * @param gray       First element of the first destination row.
 * @param gray_step  Distance between the starts of consecutive destination
 *                   rows, in uchar elements.
 * @param size       Image dimensions; size.height rows of size.width pixels
 *                   are processed.
 * @param _swap_rb   Non-zero selects the swapped weighting of elements 0 and 2
 *                   in the row helper.
 */
void icvCvt_BGRA2Gray_8u_C4C1R( const uchar* rgba, int rgba_step,
                                 uchar* gray, int gray_step,
                                 Size size, int _swap_rb )
{
    // Normalise the flag to 0 or 2: the row helper only tests for non-zero,
    // but its index-based reference formulation expects exactly 0 or 2.
    _swap_rb = _swap_rb ? 2 : 0;

    // cvtBGR2Gray receives `rgba` by value, so the pointer seen here is NOT
    // advanced across the row. Step by the full row stride; adding only
    // `rgba_step - size.width*4` (the padding) would leave the source pointer
    // stuck on the first row for tightly packed images.
    for( ; size.height--; gray += gray_step, rgba += rgba_step )
    {
        cvtBGR2Gray<uchar>(rgba, gray, size, 4, _swap_rb);
    }
}

// Similar changes were made to icvCvt_BGR2Gray_8u_C3C1R and icvCvt_BGRA2Gray_16u_CnC1R.