-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathmain.cpp
124 lines (110 loc) · 3.74 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// 这是第07课的回家作业,主题是访存优化
// 录播见:https://www.bilibili.com/video/BV1gu41117bW
// 作业的回答推荐写在 ANSWER.md 方便老师批阅,也可在 PR 描述中
// 请晒出程序被你优化前后的运行结果(ticktock 的用时统计)
// 可以比较采用了不同的优化手段后,加速了多少倍,做成表格
// 如能同时贴出 CPU 核心数量,缓存大小等就最好了(lscpu 命令)
// 作业中有很多个问句,请通过注释回答问题,并改进其代码,以使其更快
// 并行可以用 OpenMP 也可以用 TBB
#include <iostream>
//#include <x86intrin.h> // _mm 系列指令都来自这个头文件
//#include <xmmintrin.h> // 如果上面那个不行,试试这个
#include "ndarray.h"
#include "wangsrng.h"
#include "ticktock.h"
// Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x]
using Matrix = ndarray<2, float>;
// 注意:默认对齐到 64 字节,如需 4096 字节,请用 ndarray<2, float, AlignedAllocator<4096, float>>
static void matrix_randomize(Matrix &out) {
TICK(matrix_randomize);
size_t nx = out.shape(0);
size_t ny = out.shape(1);
// 这个循环为什么不够高效?如何优化? 10 分
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
float val = wangsrng(x, y).next_float();
out(x, y) = val;
}
}
TOCK(matrix_randomize);
}
static void matrix_transpose(Matrix &out, Matrix const &in) {
TICK(matrix_transpose);
size_t nx = in.shape(0);
size_t ny = in.shape(1);
out.reshape(ny, nx);
// 这个循环为什么不够高效?如何优化? 15 分
#pragma omp parallel for collapse(2)
for (int x = 0; x < nx; x++) {
for (int y = 0; y < ny; y++) {
out(y, x) = in(x, y);
}
}
TOCK(matrix_transpose);
}
static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
TICK(matrix_multiply);
size_t nx = lhs.shape(0);
size_t nt = lhs.shape(1);
size_t ny = rhs.shape(1);
if (rhs.shape(0) != nt) {
std::cerr << "matrix_multiply: shape mismatch" << std::endl;
throw;
}
out.reshape(nx, ny);
// 这个循环为什么不够高效?如何优化? 15 分
#pragma omp parallel for collapse(2)
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
out(x, y) = 0; // 有没有必要手动初始化? 5 分
for (int t = 0; t < nt; t++) {
out(x, y) += lhs(x, t) * rhs(t, y);
}
}
}
TOCK(matrix_multiply);
}
// 求出 R^T A R
static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
TICK(matrix_RtAR);
// 这两个是临时变量,有什么可以优化的? 5 分
Matrix Rt, RtA;
matrix_transpose(Rt, R);
matrix_multiply(RtA, Rt, A);
matrix_multiply(RtAR, RtA, R);
TOCK(matrix_RtAR);
}
static float matrix_trace(Matrix const &in) {
TICK(matrix_trace);
float res = 0;
size_t nt = std::min(in.shape(0), in.shape(1));
#pragma omp parallel for reduction(+:res)
for (int t = 0; t < nt; t++) {
res += in(t, t);
}
TOCK(matrix_trace);
return res;
}
static void test_func(size_t n) {
TICK(test_func);
Matrix R(n, n);
matrix_randomize(R);
Matrix A(n, n);
matrix_randomize(A);
Matrix RtAR;
matrix_RtAR(RtAR, R, A);
std::cout << matrix_trace(RtAR) << std::endl;
TOCK(test_func);
}
int main() {
wangsrng rng;
TICK(overall);
for (int t = 0; t < 4; t++) {
size_t n = 32 * (rng.next_uint64() % 16 + 24);
std::cout << "t=" << t << ": n=" << n << std::endl;
test_func(n);
}
TOCK(overall);
return 0;
}