parallel101 · GeLee-Q · Mar 14, 2022 · archibate · Mar 17, 2022
diff --git a/ANSWER.md b/ANSWER.md
@@ -1,36 +1,66 @@
 # 改进前
 
 ```
-这里贴改进前的运行结果。
-main: 100s
+134.008686s
 ```
 
 # 改进后
 
+开启openMP
+
+```c++
+49.100373s        加速：2.72x
 ```
-这里贴改进后的运行结果。
-main: 0.01s
+
+参考03/00.cpp 指针数组来表示
+
+```c++
+OpenMP  29.86s    加速：4.48x
+tbb 	17.48s    加速：7.66x
 ```
 
-# 加速比
+参考04/04.cpp 指针数组加上spin_mutex
 
-main: 10000x
+```c++
+ openMP 33.47s    加速：4.06x
+ tbb    17.76s	  加速：7.54x
+```
 
-> 如果记录了多种优化方法，可以做表格比较
+# 加速比
+
+> 7.5x
 
 # 实现方法
 
-你是如何封装稀疏网格的 Grid 类的？
+**如何封装稀疏网格的 Grid 类？**
+
+- 参考03/00.cpp 使用了指针的数组封装稀疏的Grid表格
+- 参考04/04.cpp 在上一个的基础上使用了spin_mutex自旋锁
+- 如果使用hash().pointer().dense的话，在WSL上会出现 out of memory
+
+**有无位运算量化减轻内存带宽？**
+
+- &   替代   %，   >>    替代   /，    |   替代   +
+
+
+
+ **Grid 是否可以并行访问？OpenMP 还是 TBB？**
+
+- 使用OpenMP进行并行访问。
+
+
+
+**有没有用访问者模式缓存坐标，避免重复上锁？**
+
+- 没有使用，在WSL里使用这个，会莫名出现 segmentation fault
+
 
-有没有用位运算量化减轻内存带宽？
 
-你封装的 Grid 是否可以并行访问？OpenMP 还是 TBB？
+**对于 `step()` 中这种插桩你是如何优化的？用了老师的什么知识点？**
 
-有没有用访问者模式缓存坐标，避免重复上锁？
+- 将step()改造成了tbb并行。
 
-对于 `step()` 中这种插桩你是如何优化的？用了老师的什么知识点？
 
-> 请回答。
 
 # 我的创新点
 

diff --git a/main.cpp b/main.cpp
@@ -4,12 +4,83 @@
 #include <cstdlib>
 #include <cstring>
 #include "ticktock.h"
+#include <omp.h>
+// #include "snode.h"
+#include <memory>
+#include "bate.h"
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
+#include <tbb/spin_mutex.h>
+#include <mutex>
+
+#include "bate.h"
+#include <tbb/spin_mutex.h>
+#include <mutex>
+#include <atomic>
+
+
+
+// Sparse 2D byte grid: a B1 x B1 table of lazily allocated B x B blocks, addressed by shifts and masks.
+struct Grid {
+    static constexpr int Bshift = 8;  // log2 of the block edge length B
+    static constexpr int B = 1 << Bshift;
+    static constexpr int Bmask = B - 1;
+    static constexpr int B1shift = 11;  // log2 of the block-table edge length B1
+    static constexpr int B1 = 1 << B1shift;
+    static constexpr int B1mask = B1 - 1;
+    struct Block { char m_block[B][B]; };
+    tbb::spin_mutex m_mtx[B1][B1];          // m_mtx[i][j] is held while m_data[i][j] is allocated
+    std::unique_ptr<Block> m_data[B1][B1];  // B1 * B1 pointers: 32 MiB at 8 bytes per pointer
+
+    // Returns the cell at (x, y), or 0 when its block has not been allocated.
+    char read(int x, int y) const {
+        auto const &block = m_data[(x >> Bshift) & B1mask][(y >> Bshift) & B1mask];
+        return block ? block->m_block[x & Bmask][y & Bmask] : 0;
+    }
+
+    // Stores value at (x, y); a block is allocated on the first non-zero write to it.
+    // NOTE(review): the first null check below is an unlocked read of a non-atomic pointer.
+    void write(int x, int y, char value) {
+        int const bx = (x >> Bshift) & B1mask;
+        int const by = (y >> Bshift) & B1mask;
+        auto &block = m_data[bx][by];
+        if (!block) {
+            if (!value)
+                return;  // read() already reports 0 here, so do not allocate for zeros
+            std::lock_guard lock(m_mtx[bx][by]);
+            if (!block)  // re-check under the lock before allocating
+                block = std::make_unique<Block>();  // value-initialized: all cells start at 0
+        }
+        block->m_block[x & Bmask][y & Bmask] = value;
+    }
+
+    // Calls func(x, y, cell) for each cell of each allocated block from an OpenMP parallel loop.
+    template <class Func>
+    void foreach(Func const &func) {
+#pragma omp parallel for collapse(2)
+        for (int x1 = 0; x1 < B1; x1++) {
+            for (int y1 = 0; y1 < B1; y1++) {
+                auto const &block = m_data[x1][y1];
+                if (!block)
+                    continue;
+                int const xb = x1 << Bshift;  // block origin: blocks are B cells wide
+                int const yb = y1 << Bshift;
+                for (int dx = 0; dx < B; dx++)
+                    for (int dy = 0; dy < B; dy++)
+                        func(xb | dx, yb | dy, block->m_block[dx][dy]);
+            }
+        }
+    }
+};
 
 constexpr int N = 1<<12;
 
 // 任务: 改造成小彭老师说的稀疏数据结构
-std::vector<bool> cells(N * N);
-std::vector<bool> outcells(N * N);
+// Double-buffered sparse grids; they replace the dense std::vector<bool>(N * N) buffers.
+// step() reads from `cells`, writes into `outcells`, then swaps the two pointers.
+Grid * cells = new Grid{};
+Grid * outcells = new Grid{};
+
 
 const char gundata[] = R"(125b3o$125bobo$125b3o6bo$125b3o5b3o$125b3o$125b3o$125bobo5b3o$125b3o$
 133bobo$133bobo2$133b3o$111b2o6b2o$110bo2bo4bo2bo$110bo2bo4bo2bo11b3o$
@@ -36,6 +107,7 @@ o4bo2bo$99bo2bo4bo2bo$99bo2bo4bo2bo$100b2o6b2o$114b3o2$113bo3bo$113bo
 3bo2$114b3o3$114b3o2$113bo3bo$113bo3bo2$114b3o!)";
 
 void init(int bx, int by) {
+
     int acc = 0;
     int x = bx;
     int y = by;
@@ -50,12 +122,12 @@ void init(int bx, int by) {
         if (!acc) acc = 1;
         if (c == 'b') {
             for (int o = 0; o < acc; o++) {
-                cells[x * N + y++] = 0;
+               cells->write(x, y++, 0);
             }
         }
         if (c == 'o') {
             for (int o = 0; o < acc; o++) {
-                cells[x * N + y++] = 1;
+                cells->write(x, y++, 1);
             }
         }
         if (c == '$') {
@@ -66,45 +138,95 @@ void init(int bx, int by) {
     }
 }
 
+
 void step() {
+
+    tbb::parallel_for(tbb::blocked_range2d<int>(1,N-1,1,N-1),
+    [&](tbb::blocked_range2d<int> r){
+        for(int y = r.cols().begin() ; y < r.cols().end(); y++){
+            for(int x = r.rows().begin(); x < r.rows().end(); x++){
+                int neigh = 0;
+
+            neigh += cells->read(x , y+1);
+            neigh += cells->read(x , y-1);
+
+            neigh += cells->read((x + 1) , (y + 1));
+            neigh += cells->read((x + 1) , y);
+            neigh += cells->read((x + 1) , y-1);
+            neigh += cells->read((x - 1) , y+1);
+            neigh += cells->read((x - 1) , y);
+            neigh += cells->read((x - 1) , y-1);
+
+            if (cells->read(x , y)) {
+                if (neigh == 2 || neigh == 3) {
+                    outcells->write(x, y, 1);
+
+                } else {
+                    outcells->write(x, y, 0);
+
+                }
+            } else {
+                if (neigh == 3) {
+                    outcells->write(x, y, 1);
+
+                } else {
+                    outcells->write(x, y, 0);
+
+                }
+            }
+
+            }
+        }
+    });
+    std::swap(cells, outcells);
+}
+// OpenMP counterpart of step() with the same update rule; main() calls step(), not this.
+void step1() {
 #pragma omp parallel for collapse(2)
     for (int y = 1; y < N-1; y++) {
         for (int x = 1; x < N-1; x++) {
             int neigh = 0;
-            neigh += cells[x * N + (y + 1)];
-            neigh += cells[x * N + (y - 1)];
-            neigh += cells[(x + 1) * N + (y + 1)];
-            neigh += cells[(x + 1) * N + y];
-            neigh += cells[(x + 1) * N + (y - 1)];
-            neigh += cells[(x - 1) * N + (y + 1)];
-            neigh += cells[(x - 1) * N + y];
-            neigh += cells[(x - 1) * N + (y - 1)];
-            if (cells[x * N + y]) {
+            // Sum the eight neighbours of (x, y) read from the current grid.
+            neigh += cells->read(x , y+1);
+            neigh += cells->read(x , y-1);
+
+            neigh += cells->read((x + 1) , (y + 1));
+            neigh += cells->read((x + 1) , y);
+            neigh += cells->read((x + 1) , y-1);
+            neigh += cells->read((x - 1) , y+1);
+            neigh += cells->read((x - 1) , y);
+            neigh += cells->read((x - 1) , y-1);
+            // A live cell stays live with 2 or 3 neighbours; a dead cell becomes live with exactly 3.
+            if (cells->read(x , y)) {
                 if (neigh == 2 || neigh == 3) {
-                    outcells[x * N + y] = 1;
+                    outcells->write(x, y, 1);
                 } else {
-                    outcells[x * N + y] = 0;
+                    outcells->write(x, y, 0);
                 }
             } else {
                 if (neigh == 3) {
-                    outcells[x * N + y] = 1;
+                    outcells->write(x, y, 1);
                 } else {
-                    outcells[x * N + y] = 0;
+                    outcells->write(x, y, 0);
                 }
             }
         }
     }
     std::swap(cells, outcells);
+    // After the swap, `cells` points at the generation just computed.
+
 }
 
+
+
 void showinfo() {
     int rightbound = std::numeric_limits<int>::min();
     int leftbound = std::numeric_limits<int>::max();
     int count = 0;
 #pragma omp parallel for collapse(2) reduction(max:rightbound) reduction(min:leftbound) reduction(+:count)
     for (int x = 0; x < N; x++) {
         for (int y = 0; y < N; y++) {
-            if (cells[x * N + y]) {
+            if (cells->read(x , y)) {
                 rightbound = std::max(rightbound, y);
                 leftbound = std::min(leftbound, y);
                 count++;
@@ -123,14 +245,16 @@ int main() {
     init(N / 2 + 500, N / 2 + 500);
     init(N / 2 - 1000, N / 2 - 1000);
     init(N / 2 + 1000, N / 2 + 1000);
+    printf("init is ok\n");
     for (int times = 0; times < 800; times++) {
         printf("step %d\n", times);
         if (times % 100 == 0)
             showinfo();
+
         step();
     }
     showinfo();
 
     TOCK(main);
     return 0;
-}
+}