#include "integral_image.h"

#include <omp.h>

namespace integral_image {

// Computes the integral image (summed-area table) of `image` on one thread.
//
// The sums are accumulated in the object returned by image.clone(); on return,
// result[r][c] holds the sum of image[i][j] over all i <= r and j <= c.
//
// Returns a default-constructed Mat when `image` has zero rows or zero columns.
Mat integral_image_serial(const Mat &image)
{
    if (image.cols == 0 || image.rows == 0)
        return Mat();

    Mat result = image.clone();

    // First column: each cell only has a neighbour above it.
    for (size_t row = 1; row < result.rows; ++row)
        result[row][0] += result[row - 1][0];

    // First row: each cell only has a neighbour to its left.
    for (size_t col = 1; col < result.cols; ++col)
        result[0][col] += result[0][col - 1];

    // Interior: add the sums above and to the left, then subtract the
    // upper-left sum, which was counted by both of them.
    for (size_t row = 1; row < result.rows; ++row)
        for (size_t col = 1; col < result.cols; ++col)
            result[row][col] += result[row - 1][col] + result[row][col - 1]
                                - result[row - 1][col - 1];

    return result;
}

// Computes the integral image (summed-area table) of `image` using OpenMP.
//
// The work is done as two passes over a clone of the input: a running sum
// along every row, followed by a running sum down every column. Within a pass
// the rows (respectively columns) do not depend on each other, so each pass is
// a single `parallel for`.
//
// thread_number: number of OpenMP threads to request. A positive value turns
//                dynamic thread adjustment off and is passed to
//                omp_set_num_threads(); these settings are not restored before
//                returning. Zero or a negative value leaves the current OpenMP
//                settings untouched (omp_set_num_threads() requires a positive
//                argument, so negative values are deliberately not forwarded).
//
// Returns a default-constructed Mat when `image` has zero rows or zero columns.
Mat integral_image_openmp(const Mat &image, int thread_number)
{
    if (image.cols == 0 || image.rows == 0)
        return Mat();

    if (thread_number > 0)
    {
        omp_set_dynamic(0);
        omp_set_num_threads(thread_number);
    }

    Mat result = image.clone();

    // Pass 1: prefix sums along each row; rows are independent of one another.
#pragma omp parallel for
    for (int row = 0; row < result.rows; ++row)
    {
        for (int col = 1; col < result.cols; ++col)
        {
            result[row][col] += result[row][col - 1];
        }
    }

    // Pass 2: prefix sums down each column; columns are independent of one another.
    //
    // This loop is likely to have lots of cache misses that can probably be avoided
    // by transposing the data, processing it in a way similar to the previous loop,
    // and then transposing the data again.
    // TODO: benchmark
#pragma omp parallel for
    for (int col = 0; col < result.cols; ++col)
    {
        for (int row = 1; row < result.rows; ++row)
        {
            result[row][col] += result[row - 1][col];
        }
    }

    return result;
}

} // namespace integral_image