diff --git a/integral_image.cpp b/integral_image.cpp index 1494d57..0a53ff8 100644 --- a/integral_image.cpp +++ b/integral_image.cpp @@ -48,19 +48,22 @@ Mat integral_image_openmp(const Mat &image, int thread_number) } } - //This loop is likely to have lots of cache misses that can probably be avoided by transposing data, processing data - //in a way similar to the previous loop, and than transposing data again. - //TODO: benchmark + //It is more cache-friendly to accumulate data row-wise, so here we transpose the matrix, + //then process it, and then transpose it again to restore the original matrix shape + + result = result.t(); #pragma omp parallel for - for (int col = 0; col < result.cols; ++col) { + for (int row = 0; row < result.rows; ++row) { - for (int row = 1; row < result.rows; ++row) { + for (int col = 1; col < result.cols; ++col) { - result[row][col] += result[row - 1][col]; + result[row][col] += result[row][col - 1]; } } + result = result.t(); + return result; }