the short answer is: you should not worry at all about this.

all your functions will be calling gemm() one way or another, and the only "overhead" is the allocation cost of the return value, which is negligible compared to the cost of a full matrix multiplication.
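for example (a minimal sketch, just to illustrate the point), the two ways of writing the product below do the same work; the operator* version only adds a MatExpr wrapper and the allocation of the result:

#include <opencv2/core.hpp>
using namespace cv;

int main()
{
    Mat A(500,500,CV_32F), B(500,500,CV_32F);
    randu(A, 0, 1);
    randu(B, 0, 1);

    Mat C1 = A * B;                       // operator* builds a MatExpr, evaluated through gemm()
    Mat C2;
    gemm(A, B, 1.0, noArray(), 0.0, C2);  // explicit call, same multiplication, you manage C2
    return 0;
}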

what you should care about instead is building your opencv libs with all available optimizations (TBB, IPP, OpenCL, BLAS, and such), as the small benchmark below shows (timings in seconds, for 500 multiplications of two 500x500 CV_32F matrices; see the snippet at the very end for how to check what your build has enabled):

Mat  *               191.494
Mat  gemm, fresh out 193.061
Mat  gemm, prealloc  190.75
UMat gemm, fresh out  63.9974  * opencv3
UMat gemm, prealloc   65.2547  * opencv3

and here's the code:

#include <opencv2/core.hpp>
#include <iostream>
using namespace cv;
using namespace std;

int main()
{
    // two random 500x500 float input matrices
    Mat A(500,500,CV_32F);
    Mat B(500,500,CV_32F);
    randu(A, 0, 1);
    randu(B, 0, 1);

    // 1) operator*, result allocated in every iteration
    int64 t0 = getTickCount();
    for (int i=0; i<500; i++) {
        Mat C = A * B;
    }
    int64 t1 = getTickCount();
    cerr << "Mat  *               " << (t1-t0)/getTickFrequency() << endl;

    // 2) gemm() into an empty Mat, so the output is (re)allocated every iteration
    for (int i=0; i<500; i++) {
        Mat C;
        gemm(A,B,1,noArray(),0,C);
    }
    int64 t2 = getTickCount();
    cerr << "Mat  gemm, fresh out " << (t2-t1)/getTickFrequency() << endl;

    // 3) gemm() into a preallocated Mat, no allocation inside the loop
    Mat C(500,500,CV_32F);
    for (int i=0; i<500; i++) {
        gemm(A,B,1,noArray(),0,C);
    }
    int64 t3 = getTickCount();
    cerr << "Mat  gemm, prealloc  " << (t3-t2)/getTickFrequency() << endl;

    // 4) same with UMat (transparent OpenCL, opencv3), fresh output every iteration
    UMat D(500,500,CV_32F);
    UMat E(500,500,CV_32F);
    A.copyTo(D);
    B.copyTo(E);
    int64 t4 = getTickCount();
    for (int i=0; i<500; i++) {
        UMat F;
        gemm(D,E,1,noArray(),0,F);
    }
    int64 t5 = getTickCount();
    cerr << "UMat gemm, fresh out " << (t5-t4)/getTickFrequency() << endl;

    // 5) UMat with a preallocated output
    UMat F(500,500,CV_32F);
    for (int i=0; i<500; i++) {
        gemm(D,E,1,noArray(),0,F);
    }
    int64 t6 = getTickCount();
    cerr << "UMat gemm, prealloc  " << (t6-t5)/getTickFrequency() << endl;
    return 0;
}