Skip to content

complete hw08 using exclusive scan but does not optimize bank conflict #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
cmake_minimum_required(VERSION 3.12)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# 如果需要指定显卡版本号的话:
# set(CMAKE_CUDA_ARCHITECTURES 52)
set(CMAKE_CUDA_ARCHITECTURES 70)

project(hellocmake LANGUAGES CXX CUDA)

Expand Down
14 changes: 7 additions & 7 deletions include/CudaAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ struct CudaAllocator {

template <class ...Args>
void construct(T *p, Args &&...args) {
if constexpr (!(sizeof...(Args) == 0 && std::is_pod_v<T>))
if (!(sizeof...(Args) == 0 ))
::new((void *)p) T(std::forward<Args>(args)...);
}

//**** CIHOU SHABI WENDOUS ****
template<class _Other>
constexpr CudaAllocator(const CudaAllocator<_Other>&) noexcept {}
// //**** CIHOU SHABI WENDOUS ****
// template<class _Other>
// constexpr CudaAllocator(const CudaAllocator<_Other>&) noexcept {}

constexpr bool operator==(CudaAllocator<T> const &other) const {
return this == &other;
}
// constexpr bool operator==(CudaAllocator<T> const &other) const {
// return this == &other;
// }
};
25 changes: 25 additions & 0 deletions kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#define THREADS_PER_BLOCK 512

template<class T,int thread_per_block=THREADS_PER_BLOCK>
void scan(T *d_in,T *d_out,int length);

template<class T,int thread_per_block=THREADS_PER_BLOCK>
__global__ void singleBlockScan(T *d_in,T *d_out,int length);

template<class T,int thread_per_block=THREADS_PER_BLOCK>
void largeDataScan(T *d_in,T *d_out,int length);

template<class T,int thread_per_block=THREADS_PER_BLOCK>
__global__ void preScan(T *d_in,T *d_out,T *block_offset);

template<class T,int thread_per_block=THREADS_PER_BLOCK>
__global__ void postScan(T *d_out,T *block_offset);

template<class T>
__global__ void procLeft(T *d_out,T *num1,T *num2, int length);

template<class T>
__global__ void filter(T *arr,int *isPos,int n);

template<class T>
__global__ void Compact(T *arr,T *res,int *isPos,int *index,int n);
214 changes: 195 additions & 19 deletions main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,227 @@
#include "helper_cuda.h"
#include <cmath>
#include <vector>
#include "ticktock.h"
#include "kernel.cuh"
// #include <thrust/device_vector.h> // 如果想用 thrust 也是没问题的

#define ELEMENTS_PER_BLOCK (thread_per_block*2)

template<class T,int thread_per_block>
void scan(T *d_in,T *d_out,int length){
if(length<=ELEMENTS_PER_BLOCK)
singleBlockScan<T><<<1,thread_per_block>>>(d_in,d_out,length);
else{
largeDataScan<T>(d_in,d_out,length);
}
}

template<class T,int thread_per_block>
__global__ void singleBlockScan(T *d_in,T *d_out,int length){
int tid=threadIdx.x;
__shared__ T bls_presum[ELEMENTS_PER_BLOCK];
bls_presum[tid*2]=(tid*2)<length?d_in[tid*2]:0;
bls_presum[tid*2+1]=(tid*2+1)<length?d_in[tid*2+1]:0;
__syncthreads();
for (int stride=1;stride<=thread_per_block;stride*=2){
int index=(tid+1)*2*stride-1;
if (index<ELEMENTS_PER_BLOCK)
bls_presum[index]+=bls_presum[index-stride];
__syncthreads();
}
if (tid==0){
bls_presum[ELEMENTS_PER_BLOCK-1]=0;
}
int tmp;
for (int stride=thread_per_block;stride>0;stride/=2){
int index=(tid+1)*2*stride-1;
if (index<ELEMENTS_PER_BLOCK){
tmp=bls_presum[index];
bls_presum[index]+=bls_presum[index-stride];
bls_presum[index-stride]=tmp;
}
__syncthreads();
}
if (tid*2<length)
d_out[tid*2]=bls_presum[tid*2];
if (tid*2+1<length)
d_out[tid*2+1]=bls_presum[tid*2+1];
}

template<class T,int thread_per_block>
void largeDataScan(T *d_in,T *d_out,int length){

//计算超出ELEMENTS_PER_BLOCK倍数部分的元素数量
int leftover=length%ELEMENTS_PER_BLOCK;

//实际上preScan扫描的元素数量,剩余的部分使用singleBlockScan扫描计算
int real_length=length-leftover;

//cuda核函数网格维度
int gridDim=real_length/ELEMENTS_PER_BLOCK;
constexpr int blockDim=thread_per_block;

//每个block内部元素的和,作为post_scan每个block内部的偏移量
// std::vector<int, CudaAllocator<int>> block_offset(gridDim);
int *dev_block_offset;
checkCudaErrors(cudaMalloc((void **)&dev_block_offset,gridDim*sizeof(int)));

//将large data分成一个个block*2(ELEMENTS_PER_BLOCK)大小的块,每个block内部进行scan
preScan<T,blockDim><<<gridDim,blockDim>>>(d_in,d_out,dev_block_offset);
//对block_offset进行exlusive_scan
scan<T,blockDim>(dev_block_offset,dev_block_offset,gridDim);
//将large data每个block局部scan的结果加上block_offset
postScan<T,blockDim><<<gridDim*2,blockDim>>>(d_out,dev_block_offset);
//同步等待上述核函数执行完毕
checkCudaErrors(cudaDeviceSynchronize());
//释放指针
checkCudaErrors(cudaFree(dev_block_offset));

//如果不是ELEMENTS_PER_BLOCK的整数倍,需要处理剩余的部分
if(leftover){
singleBlockScan<T,blockDim><<<1,blockDim>>>(&d_in[real_length],&d_out[real_length],leftover);
//对d_out剩余部分进行
procLeft<T><<<1,blockDim>>>(&d_out[real_length],&d_out[real_length-1],&d_in[real_length-1],leftover);
}
}

template<class T,int thread_per_block>
__global__ void preScan(T *d_in,T *d_out,T *block_offset){
int tid=threadIdx.x;
int gid=threadIdx.x+thread_per_block*blockIdx.x;
__shared__ T bls_presum[ELEMENTS_PER_BLOCK];
bls_presum[tid*2]=d_in[gid*2];
bls_presum[tid*2+1]=d_in[gid*2+1];
__syncthreads();
for (int stride=1;stride<=thread_per_block;stride*=2){
int index=(tid+1)*2*stride-1;
if (index<ELEMENTS_PER_BLOCK)
bls_presum[index]+=bls_presum[index-stride];
__syncthreads();
}
if (tid==0){
block_offset[blockIdx.x]=bls_presum[ELEMENTS_PER_BLOCK-1];
bls_presum[ELEMENTS_PER_BLOCK-1]=0;
}
T tmp;
for (int stride=thread_per_block;stride>0;stride/=2){
int index=(tid+1)*2*stride-1;
if (index<ELEMENTS_PER_BLOCK){
tmp=bls_presum[index];
bls_presum[index]+=bls_presum[index-stride];
bls_presum[index-stride]=tmp;
}
__syncthreads();
}
d_out[gid*2]=bls_presum[tid*2];
d_out[gid*2+1]=bls_presum[tid*2+1];
}

template<class T,int thread_per_block>
__global__ void postScan(T *d_out,T *block_offset){
int gid=threadIdx.x+thread_per_block*blockIdx.x;
d_out[gid]+=block_offset[gid/ELEMENTS_PER_BLOCK];

}

template<class T>
__global__ void procLeft(T *d_out,T *num1,T *num2, int length){
int tid=threadIdx.x;
if(tid < length)
d_out[tid]+=num1[0]+num2[0];
}

template<class T>
__global__ void filter(T *arr,int *isPos,int n){
int gid=threadIdx.x+blockDim.x*blockIdx.x;
if (gid>=n) return;
isPos[gid]=(arr[gid]>=0?1:0);
}

template<class T>
__global__ void Compact(T *arr,T *res,int *isPos,int *index,int n){
int gid=threadIdx.x+blockDim.x*blockIdx.x;
if (gid>=n) return;
if (isPos[gid])
res[index[gid]]=arr[gid];
}

template<class T,int thread_per_block>
void filter_positive(int *counter,T *dev_res,T *dev_arr,int n){
// std::vector<int, CudaAllocator<int>> isPos(n);
// std::vector<int, CudaAllocator<int>> index(n);
int *dev_isPos;
int *dev_index;
checkCudaErrors(cudaMalloc((void **)(&dev_isPos),n*sizeof(int)));
checkCudaErrors(cudaMalloc((void **)(&dev_index),n*sizeof(int)));
filter<T><<<(n+thread_per_block-1)/thread_per_block,thread_per_block>>>(dev_arr,dev_isPos,n);
scan<int,thread_per_block>(dev_isPos,dev_index,n);
Compact<T><<<(n+thread_per_block-1)/thread_per_block,thread_per_block>>>(dev_arr,dev_res,dev_isPos,dev_index,n);
checkCudaErrors(cudaMemcpy(counter,&dev_index[n-1],sizeof(int),cudaMemcpyDeviceToHost));
}

// 这是基于“边角料法”的,请把他改成基于“网格跨步循环”的:10 分
__global__ void fill_sin(int *arr, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) return;
arr[i] = sinf(i);
__global__ void fill_sin(float *arr, int n) {
for (int i= threadIdx.x+ blockIdx.x* blockDim.x;i<n;i+= gridDim.x* blockDim.x){
arr[i]=sinf(i);
// printf("%f\n",arr[i]);
}
}

__global__ void filter_positive(int *counter, int *res, int const *arr, int n) {
__global__ void filter_positive_atomic_add(int *counter, float *res, float const *arr, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) return;
if (i > n) return;
if (arr[i] >= 0) {
// 这里有什么问题?请改正:10 分
int loc = *counter;
*counter += 1;
res[loc] = n;
int loc = atomicAdd(counter,1);
res[loc]=arr[i];
}
}

int main() {
constexpr int n = 1<<24;
std::vector<int, CudaAllocator<int>> arr(n);
std::vector<int, CudaAllocator<int>> res(n);
std::vector<int, CudaAllocator<int>> counter(1);
constexpr int n = 1<<24;//64MB数据
std::vector<float, CudaAllocator<float>> arr(n);
std::vector<float, CudaAllocator<float>> res(n);
std::vector<int, CudaAllocator<int>> counter_ad(1);

//===========直接在核函数内部使用原子指令对index进行递增============
TICK(atomic_add);
fill_sin<<<n / 32 /1024, 1024>>>(arr.data(), n);
filter_positive_atomic_add<<< (n+1023) / 1024, 1024>>>(counter_ad.data(), res.data(), arr.data(), n);
checkCudaErrors(cudaDeviceSynchronize());
TOCK(atomic_add);

float *dev_arr;
float *dev_res;
float *h_res=(float *)malloc(n*sizeof(float));
int counter[1];

//===========手写exclusive_scan_and_compact方法进行filter===========
TICK(exclusive_scan_and_compact);
checkCudaErrors(cudaMalloc((void **)(&dev_arr),n*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)(&dev_res),n*sizeof(float)));
// fill_sin 改成“网格跨步循环”以后,这里三重尖括号里的参数如何调整?10 分
fill_sin<<<n / 1024, 1024>>>(arr.data(), n);
fill_sin<<<n / 32 /1024, 1024>>>(dev_arr, n);

// 这里的“边角料法”对于不是 1024 整数倍的 n 会出错,为什么?请修复:10 分
filter_positive<<<n / 1024, 1024>>>(counter.data(), res.data(), arr.data(), n);

// filter_positive<n,1024><<< (n+1023) / 1024, 1024>>>(counter.data(), res.data(), dev_arr);
filter_positive<float,THREADS_PER_BLOCK>(counter, dev_res, dev_arr,n);
// 这里 CPU 访问数据前漏了一步什么操作?请补上:10 分
checkCudaErrors(cudaMemcpy(h_res,dev_res,n*sizeof(float),cudaMemcpyDeviceToHost));
TOCK(exclusive_scan_and_compact);

if (counter[0] <= n / 50) {
printf("Result too short! %d <= %d\n", counter[0], n / 50);
return -1;
}
for (int i = 0; i < counter[0]; i++) {
if (res[i] < 0) {
printf("Wrong At %d: %f < 0\n", i, res[i]);
if (h_res[i] < 0) {
printf("Wrong At %d: %f < 0\n", i, h_res[i]);
return -1; // 突然想起了ICPC有一年队名叫“蓝翔WA掘机”的,笑不活了:)
}
}

std::cout<<counter[0]<<std::endl;
std::cout<<counter_ad[0]<<std::endl;
printf("All Correct!\n"); // 还有个队名叫“AC自动机”的,和隔壁“WAWA大哭”对标是吧:)
return 0;
}
6 changes: 5 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/bin/sh
#!/bin/bash -x
if [[ $1 = "clean" && -e build ]];then
rm -rf build
fi
export CUDA_VISIBLE_DEVICES=0
set -e
cmake -B build
cmake --build build
Expand Down
7 changes: 7 additions & 0 deletions score.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
+ build/main
atomic_add: 0.015539s
exclusive_scan_and_compact: 0.023504s
8388608
8388608
All Correct!
[liuf@gpu3 hw08]