diff --git a/integral_image.cpp b/integral_image.cpp
index 1494d57..0a53ff8 100644
--- a/integral_image.cpp
+++ b/integral_image.cpp
@@ -48,19 +48,22 @@ Mat integral_image_openmp(const Mat &image, int thread_number)
         }
     }
 
-    //This loop is likely to have lots of cache misses that can probably be avoided by transposing data, processing data
-    //in a way similar to the previous loop, and than transposing data again.
-    //TODO: benchmark
+    //It is more cache-friendly to accumulate data row-wise, so here we transpose the matrix,
+    //then process it, and then transpose it again to restore the original matrix shape
+
+    result = result.t();
 
     #pragma omp parallel for
-    for (int col = 0; col < result.cols; ++col) {
+    for (int row = 0; row < result.rows; ++row) {
 
-        for (int row = 1; row < result.rows; ++row) {
+        for (int col = 1; col < result.cols; ++col) {
 
-            result[row][col] += result[row - 1][col];
+            result[row][col] += result[row][col - 1];
         }
     }
 
+    result = result.t();
+
     return result;
 }