Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hw10 Gelee #1

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions ANSWER.md
Original file line number Diff line number Diff line change
@@ -1,36 +1,66 @@
# 改进前

```
这里贴改进前的运行结果。
main: 100s
134.008686s
```

# 改进后

开启openMP

```c++
49.100373s 加速:2.72x
```
这里贴改进后的运行结果。
main: 0.01s

参考03/00.cpp 指针数组来表示

```c++
OpenMP 29.86s 加速:4.48x
tbb 17.48s 加速:7.66x
```

# 加速比
参考04/04.cpp 指针数组加上spin_mutex

main: 10000x
```c++
openMP 33.47s 加速:4.06x
tbb 17.76s 加速:7.54x
```

> 如果记录了多种优化方法,可以做表格比较
# 加速比

> 7.5x

# 实现方法

你是如何封装稀疏网格的 Grid 类的?
**封装稀疏网格的 Grid 类的?**

- 参考03/00.cpp 使用了指针的数组封装稀疏的Grid表格
- 参考04/04.cpp 在上一个的基础上使用了spin_mutex自旋锁
- 如果使用 hash().pointer().dense 的话,在 WSL 上会出现 out of memory

**有无位运算量化减轻内存带宽?**

- & 替代 %, >> 替代 /, | 替代 +



**Grid 是否可以并行访问?OpenMP 还是 TBB?**

- 使用OpenMP进行并行访问。



**有没有用访问者模式缓存坐标,避免重复上锁?**

- 没有使用。在 WSL 里使用访问者模式会莫名出现 segmentation fault


有没有用位运算量化减轻内存带宽?

你封装的 Grid 是否可以并行访问?OpenMP 还是 TBB?
**对于 `step()` 中这种插桩你是如何优化的?用了老师的什么知识点?**

有没有用访问者模式缓存坐标,避免重复上锁?
- 将step()改造成了tbb并行。

对于 `step()` 中这种插桩你是如何优化的?用了老师的什么知识点?

> 请回答。

# 我的创新点

Expand Down
162 changes: 143 additions & 19 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,83 @@
#include <cstdlib>
#include <cstring>
#include "ticktock.h"
#include <omp.h>
// #include "snode.h"
#include <memory>
#include "bate.h"
#include <tbb/parallel_for.h>
#include <tbb/blocked_range2d.h>
#include <tbb/spin_mutex.h>
#include <mutex>

#include "bate.h"
#include <tbb/spin_mutex.h>
#include <mutex>
#include <atomic>



/// Two-level sparse grid of one-byte cells.
///
/// The top level is a fixed B1 x B1 table of block pointers; each block is a
/// dense B x B tile of `char` (64 KiB) that is allocated lazily the first time
/// a non-zero value is written into it. Cells in unallocated blocks read as 0.
///
/// Coordinates are decomposed with bit operations only:
///   block index = (coord >> Bshift) & B1mask,  in-block index = coord & Bmask,
/// so coordinates wrap modulo B * B1 in each dimension.
///
/// Concurrency: block pointers are published through std::atomic, so several
/// threads may call write() at the same time even when they race to create the
/// same block. Concurrent writes to *different* cells are safe; concurrent
/// access to the *same* cell with at least one writer is not synchronised.
struct Grid {
    static constexpr int Bshift = 8;
    static constexpr int B = 1 << Bshift;
    static constexpr int Bmask = B - 1;

    static constexpr int B1shift = 11;
    static constexpr int B1 = 1 << B1shift;
    static constexpr int B1mask = B1 - 1;

    struct Block {
        char m_block[B][B];
    };

    // B1 * B1 = 4Mi pointer slots (32 MiB with 8-byte pointers), so a Grid
    // must live on the heap or in static storage, never on the stack.
    std::atomic<Block *> m_data[B1][B1];

    /// Creates an empty grid: every block slot starts out null.
    Grid() {
        for (auto &row : m_data)
            for (auto &slot : row)
                slot.store(nullptr, std::memory_order_relaxed);
    }

    /// Frees every block that was allocated by write().
    ~Grid() {
        for (auto &row : m_data)
            for (auto &slot : row)
                delete slot.load(std::memory_order_relaxed);
    }

    // The grid owns its blocks through raw atomic pointers, so copying is
    // disabled to keep ownership unique.
    Grid(Grid const &) = delete;
    Grid &operator=(Grid const &) = delete;

    /// Returns the value stored at (x, y), or 0 when the enclosing block has
    /// never been allocated.
    char read(int x, int y) const {
        // TODO(review): a reviewer suggested packing 8 cells into one char
        // (bit-packing) to reduce memory traffic further.
        Block const *block =
            m_data[(x >> Bshift) & B1mask][(y >> Bshift) & B1mask]
                .load(std::memory_order_acquire);
        if (!block)
            return 0;
        return block->m_block[x & Bmask][y & Bmask];
    }

    /// Stores `value` at (x, y), allocating the enclosing block on demand.
    ///
    /// Writing 0 into a block that does not exist is a no-op: such a cell
    /// already reads as 0, and skipping the allocation keeps the grid sparse.
    void write(int x, int y, char value) {
        auto &slot = m_data[(x >> Bshift) & B1mask][(y >> Bshift) & B1mask];
        Block *block = slot.load(std::memory_order_acquire);
        if (!block) {
            if (value == 0)
                return;
            // Build a zero-filled block and try to publish it. If another
            // thread published one first, use theirs; ours is released by
            // the unique_ptr when it goes out of scope.
            auto fresh = std::make_unique<Block>();
            Block *expected = nullptr;
            if (slot.compare_exchange_strong(expected, fresh.get(),
                                             std::memory_order_acq_rel,
                                             std::memory_order_acquire)) {
                block = fresh.release();
            } else {
                block = expected;
            }
        }
        block->m_block[x & Bmask][y & Bmask] = value;
    }

    /// Calls func(x, y, cell) for every cell of every allocated block.
    ///
    /// Blocks are visited in parallel when OpenMP is enabled, so `func` must
    /// be safe to call concurrently. Cells of unallocated blocks are skipped.
    template <class Func>
    void foreach(Func const &func) {
#pragma omp parallel for collapse(2)
        for (int x1 = 0; x1 < B1; x1++) {
            for (int y1 = 0; y1 < B1; y1++) {
                Block *block = m_data[x1][y1].load(std::memory_order_acquire);
                if (!block)
                    continue;
                // A block index is scaled by the block size (Bshift), not by
                // the table size (B1shift), to recover the cell coordinate.
                int xb = x1 << Bshift;
                int yb = y1 << Bshift;
                for (int dx = 0; dx < B; dx++) {
                    for (int dy = 0; dy < B; dy++) {
                        func(xb | dx, yb | dy, block->m_block[dx][dy]);
                    }
                }
            }
        }
    }
};

// Side length of the simulated square domain, in cells.
constexpr int N = 1<<12;

// Task: rework the storage into the sparse data structure taught in the course.
// The original dense buffers are kept below, commented out, for reference;
// they must not be defined alongside the Grid-based globals because the names
// would clash.
// std::vector<bool> cells(N * N);
// std::vector<bool> outcells(N * N);

// `cells` holds the generation being read and `outcells` the one being
// written; step() swaps the two pointers after each generation.
// NOTE(review): nothing in the visible code deletes these, so they appear to
// live for the whole run of the program.
Grid * cells = new Grid{};
Grid * outcells = new Grid{};


const char gundata[] = R"(125b3o$125bobo$125b3o6bo$125b3o5b3o$125b3o$125b3o$125bobo5b3o$125b3o$
133bobo$133bobo2$133b3o$111b2o6b2o$110bo2bo4bo2bo$110bo2bo4bo2bo11b3o$
Expand All @@ -36,6 +107,7 @@ o4bo2bo$99bo2bo4bo2bo$99bo2bo4bo2bo$100b2o6b2o$114b3o2$113bo3bo$113bo
3bo2$114b3o3$114b3o2$113bo3bo$113bo3bo2$114b3o!)";

void init(int bx, int by) {

int acc = 0;
int x = bx;
int y = by;
Expand All @@ -50,12 +122,12 @@ void init(int bx, int by) {
if (!acc) acc = 1;
if (c == 'b') {
for (int o = 0; o < acc; o++) {
cells[x * N + y++] = 0;
cells->write(x, y++, 0);
}
}
if (c == 'o') {
for (int o = 0; o < acc; o++) {
cells[x * N + y++] = 1;
cells->write(x, y++, 1);
}
}
if (c == '$') {
Expand All @@ -66,45 +138,95 @@ void init(int bx, int by) {
}
}


void step() {

tbb::parallel_for(tbb::blocked_range2d<int>(1,N-1,1,N-1),
[&](tbb::blocked_range2d<int> r){
for(int y = r.cols().begin() ; y < r.cols().end(); y++){
for(int x = r.rows().begin(); x < r.rows().end(); x++){
int neigh = 0;

neigh += cells->read(x , y+1);
neigh += cells->read(x , y-1);

neigh += cells->read((x + 1) , (y + 1));
neigh += cells->read((x + 1) , y);
neigh += cells->read((x + 1) , y-1);
neigh += cells->read((x - 1) , y+1);
neigh += cells->read((x - 1) , y);
neigh += cells->read((x - 1) , y-1);

if (cells->read(x , y)) {
if (neigh == 2 || neigh == 3) {
outcells->write(x, y, 1);

} else {
outcells->write(x, y, 0);

}
} else {
if (neigh == 3) {
outcells->write(x, y, 1);

} else {
outcells->write(x, y, 0);

}
}

}
}
});
std::swap(cells, outcells);
}

// OpenMP variant of the generation update (main() currently calls step()).
//
// For every interior cell (1 <= x, y < N-1) the eight neighbours are read from
// `cells`; a live cell survives with 2 or 3 live neighbours and a dead cell is
// born with exactly 3. The new state goes to `outcells`, and the two grids are
// swapped after the loop. The outermost ring of cells is never written here.
//
// NOTE(review): threads write to `outcells` concurrently, so this relies on
// Grid::write tolerating concurrent calls for different cells - confirm.
void step1() {
#pragma omp parallel for collapse(2)
    for (int y = 1; y < N-1; y++) {
        for (int x = 1; x < N-1; x++) {
            int neigh = 0;

            neigh += cells->read(x, y + 1);
            neigh += cells->read(x, y - 1);

            neigh += cells->read(x + 1, y + 1);
            neigh += cells->read(x + 1, y);
            neigh += cells->read(x + 1, y - 1);
            neigh += cells->read(x - 1, y + 1);
            neigh += cells->read(x - 1, y);
            neigh += cells->read(x - 1, y - 1);

            if (cells->read(x, y)) {
                // Live cell: survives only with 2 or 3 live neighbours.
                if (neigh == 2 || neigh == 3) {
                    outcells->write(x, y, 1);
                } else {
                    outcells->write(x, y, 0);
                }
            } else {
                // Dead cell: born only with exactly 3 live neighbours.
                if (neigh == 3) {
                    outcells->write(x, y, 1);
                } else {
                    outcells->write(x, y, 0);
                }
            }
        }
    }
    std::swap(cells, outcells);
}



void showinfo() {
int rightbound = std::numeric_limits<int>::min();
int leftbound = std::numeric_limits<int>::max();
int count = 0;
#pragma omp parallel for collapse(2) reduction(max:rightbound) reduction(min:leftbound) reduction(+:count)
for (int x = 0; x < N; x++) {
for (int y = 0; y < N; y++) {
if (cells[x * N + y]) {
if (cells->read(x , y)) {
rightbound = std::max(rightbound, y);
leftbound = std::min(leftbound, y);
count++;
Expand All @@ -123,14 +245,16 @@ int main() {
init(N / 2 + 500, N / 2 + 500);
init(N / 2 - 1000, N / 2 - 1000);
init(N / 2 + 1000, N / 2 + 1000);
printf("init is ok\n");
for (int times = 0; times < 800; times++) {
printf("step %d\n", times);
if (times % 100 == 0)
showinfo();

step();
}
showinfo();

TOCK(main);
return 0;
}
}