optimization: rows of a transposed image are processed instead of cols

This commit is contained in:
Dmitry Kokorin 2018-09-05 23:36:00 +03:00
parent 196957f3d0
commit cccd0f1641

View file

@ -48,19 +48,22 @@ Mat integral_image_openmp(const Mat &image, int thread_number)
}
}
//This loop is likely to have lots of cache misses that can probably be avoided by transposing data, processing data
//in a way similar to the previous loop, and then transposing data again.
//TODO: benchmark
//It is more cache-friendly to accumulate data row-wise, so here we transpose the matrix,
//then process it, and then transpose it again to restore the original matrix shape
result = result.t();
#pragma omp parallel for
for (int col = 0; col < result.cols; ++col) {
for (int row = 0; row < result.rows; ++row) {
for (int row = 1; row < result.rows; ++row) {
for (int col = 1; col < result.cols; ++col) {
result[row][col] += result[row - 1][col];
result[row][col] += result[row][col - 1];
}
}
result = result.t();
return result;
}