optimization: rows of a transposed image are processed instead of cols

This commit is contained in:
Dmitry Kokorin 2018-09-05 23:36:00 +03:00
parent 196957f3d0
commit cccd0f1641

View file

@ -48,19 +48,22 @@ Mat integral_image_openmp(const Mat &image, int thread_number)
} }
} }
//This loop is likely to have lots of cache misses that can probably be avoided by transposing data, processing data //It is more cache-friendly to accumulate data row-wise, so here we transpose the matrix,
//in a way similar to the previous loop, and then transposing data again. //then process it, and then transpose it again to restore the original matrix shape
//TODO: benchmark
result = result.t();
#pragma omp parallel for #pragma omp parallel for
for (int col = 0; col < result.cols; ++col) { for (int row = 0; row < result.rows; ++row) {
for (int row = 1; row < result.rows; ++row) { for (int col = 1; col < result.cols; ++col) {
result[row][col] += result[row - 1][col]; result[row][col] += result[row][col - 1];
} }
} }
result = result.t();
return result; return result;
} }